Source code for eradiate.test_tools.regression

from __future__ import annotations

import os
from abc import ABC, abstractmethod
from io import StringIO
from pathlib import Path
from typing import TYPE_CHECKING, ClassVar

import attrs
import matplotlib.pyplot as plt
import mitsuba as mi
import numpy as np
import scipy.stats as spstats
import xarray as xr
from numpy.typing import ArrayLike
from robot.api import logger

from ..attrs import define, documented
from ..typing import PathLike
from ..util.misc import summary_repr

if TYPE_CHECKING:
    from matplotlib.axes import Axes
    from matplotlib.figure import Figure


def regression_test_plots(
    ref: ArrayLike,
    result: ArrayLike,
    vza: ArrayLike,
    metric: tuple[str, float],
    ref_var: ArrayLike | None = None,
    result_var: ArrayLike | None = None,
    xlabel: str | None = None,
    ylabel: str | None = None,
) -> tuple[Figure, list[list[Axes]]]:
    """
    Create regression test report plots. Plot errorbars if both ref_var and
    result_var are set.

    Parameters
    ----------
    ref : array-like
        Variable values for the reference data

    result : array-like
        Variable values for the simulation result

    vza : array-like
        VZA values for plotting

    metric : tuple
        A tuple of the form (metric name, value) to be added to the plots.

    ref_var : array-like, optional
        Variable variance for the reference data.

    result_var : array-like, optional
        Variable variance for the simulation result.

    xlabel, ylabel : str or None
        Labels applied to the x and y axes of the plot.

    Returns
    -------
    figure: Figure
        Pyplot Figure containing the report charts

    axes: list
        2x2 array of Axes included in the report Figure
    """
    fig, axes = plt.subplots(2, 2, figsize=(8, 6), layout="constrained")

    ax = axes[0][0]
    if ref_var is None:
        ax.plot(vza, ref, label="reference")
    else:
        ax.errorbar(vza, ref, yerr=np.sqrt(ref_var), label="reference")

    if result_var is None:
        ax.plot(vza, result, label="result")
    else:
        ax.errorbar(vza, result, yerr=np.sqrt(result_var), label="result")

    ax.set_title("Reference and test result")
    handles, labels = ax.get_legend_handles_labels()

    ax = axes[1][0]
    ax.plot(vza, result - ref)
    ax.set_title("Absolute difference")

    ax = axes[1][1]
    ax.plot(vza, (result - ref) / ref)
    ax.set_title("Relative difference")

    ax = axes[0][1]
    ax.set_axis_off()
    ax.legend(handles=handles, labels=labels, loc="upper center")

    if metric[1] is None:
        ax.text(
            0.5,
            0.5,
            f'Metric "{metric[0]}" is not available',
            horizontalalignment="center",
        )
    else:
        ax.text(0.5, 0.5, f"{metric[0]}: {metric[1]:.4}", horizontalalignment="center")

    for i, j in [[0, 0], [1, 0], [1, 1]]:
        ax = axes[i][j]
        if xlabel is not None:
            ax.set_xlabel(xlabel)
        if ylabel is not None:
            ax.set_ylabel(ylabel)

    return fig, axes


def figure_to_html(fig: plt.Figure) -> str:
    """
    Render a figure in HTML format

    Returns a string containing the rendered HTML. The root tag is a <svg> one.

    Parameters
    ----------
    fig : plt.Figure
        Matplotlib figure to render in HTML.

    Returns
    -------
    str
        Rendered HTML <svg> tag with styling.
    """

    str_i = StringIO()
    fig.savefig(str_i, format="svg", transparent=True, bbox_inches="tight")
    fig.canvas.draw_idle()
    svg = str_i.getvalue()

    # Include some CSS in the SVG to render nicely in Robot report's dark and
    # light modes
    return "\n".join(
        [
            "<svg",
            'version="1.1"',
            'baseProfile="full"',
            'width="810" height="540" viewBox="0 0 810 540"'
            'xmlns="http://www.w3.org/2000/svg">',
            "<style>",
            "    path {",
            "        fill: var(--text-color);",
            "        stroke: var(--text-color);",
            "    }",
            "</style>",
            svg,
            "</svg>",
        ]
    )


def reference_converter(value: PathLike | xr.Dataset | None) -> xr.Dataset | None:
    """
    A converter for handling the reference data attribute.

    Parameters
    ----------
    value : path-like or Dataset or None
        Path to the reference dataset file or dataset identifier or Dataset or
        None.

    Returns
    -------
    xr.Dataset or None
        The reference dataset.

    Raises
    ------
    ValueError
        If the reference data is not a valid dataset.

    Notes
    -----
    The ``value`` argument is processed as follows:

    * ``None`` and datasets are passed through.
    * If ``value`` is a path, resolve it with the path resolver. If the path
      points to a non-existing location, return ``None``. Otherwise, try to load
      it as a Dataset.

    Anything else raises a :class:`ValueError`.
    """
    if value is None:
        return value

    if isinstance(value, xr.Dataset):
        return value

    if isinstance(value, (str, os.PathLike, bytes)):
        logger.info(f'Looking up "{str(value)}" on disk', also_console=True)
        from .. import fresolver

        fname = fresolver.resolve(value)
        logger.info(f"Resolved path: {fname}", also_console=True)

        if not fname.exists():
            return None

        return xr.load_dataset(fname)

    raise ValueError(
        "Reference must be provided as a Dataset, a file path or None. "
        f"Got a {type(value).__name__}"
    )



[docs]
@define
class RegressionTest(ABC):
    """
    Common interface for tests based on the comparison of a result array against
    reference values.
    """

    # Name used for the reference metric. Must be set be subclasses.
    METRIC_NAME: ClassVar[str | None] = None

    name: str = documented(
        attrs.field(validator=attrs.validators.instance_of(str)),
        doc="Test case name.",
        type="str",
        init_type="str",
    )

    value: xr.Dataset = documented(
        attrs.field(
            validator=attrs.validators.instance_of(xr.Dataset),
            repr=summary_repr,
        ),
        doc="Simulation result. Must be specified as a dataset.",
        type=":class:`xarray.Dataset`",
        init_type=":class:`xarray.Dataset`",
    )

    reference: xr.Dataset | None = documented(
        attrs.field(
            default=None,
            converter=reference_converter,
            validator=attrs.validators.optional(
                attrs.validators.instance_of(xr.Dataset)
            ),
        ),
        doc="Reference data. Can be specified as an xarray dataset, a path to a "
        "NetCDF file or a path to a resource.",
        type=":class:`xarray.Dataset` or None",
        init_type=":class:`xarray.Dataset` or path-like, optional",
        default="None",
    )

    variable: str = documented(
        attrs.field(kw_only=True, default="brf_srf"),
        doc="Tested variable",
        type="str",
        init_type="str",
        default="brf_srf",
    )

    threshold: float = documented(
        attrs.field(kw_only=True),
        doc="Test metric threshold",
        type="float",
        init_type="float",
    )

    archive_dir: Path = documented(
        attrs.field(kw_only=True, converter=lambda x: Path(x).resolve()),
        doc="Path to output artefact storage directory. Relative paths are "
        "interpreted with respect to the current working directory.",
        type=":class:`pathlib.Path`",
        init_type="path-like",
    )

    plot: bool = documented(
        attrs.field(kw_only=True, converter=bool),
        doc="Enable pyplot charts",
        type="bool",
        init_type="bool",
    )

    def __attrs_pre_init__(self):
        if self.METRIC_NAME is None:
            raise TypeError(f"Unsupported test type {type(self).__name__}")

    def __attrs_post_init__(self):
        if self.plot:
            if (
                "w" in self.reference[self.variable]
                and self.reference[self.variable].w.size > 1
            ):
                raise ValueError(
                    "Regression charts are implemented for single a wavelength, "
                    "but the reference dataset has a spectral dimension. "
                    "Please disable the 'plot' option."
                )
            if (
                "w" in self.value[self.variable]
                and self.value[self.variable].w.size > 1
            ):
                raise ValueError(
                    "Regression charts are implemented for single a wavelength, "
                    "but the tested dataset has a spectral dimension. "
                    "Please disable the 'plot' option."
                )


[docs]
    def run(self, diagnostic=False) -> bool:
        """
        This method controls the execution steps of the regression test:

        * handle missing reference data
        * catch errors during text evaluation
        * create the appropriate plots and data archives

        Returns
        -------
        bool
            Result of the test criterion comparison.
        """

        logger.info(f"Regression test {self.name} results:", also_console=True)

        fname = self.name
        ext = ".nc"
        archive_dir = self.archive_dir

        fname_reference = archive_dir / f"{fname}-ref{ext}"
        fname_result = archive_dir / f"{fname}-result{ext}"

        # if no valid reference is found, store the results as new ref and fail
        # the test
        if not self.reference:
            logger.info(
                "No reference data found. Storing test results to "
                f"{fname_reference}. This can be the new reference.",
                also_console=True,
            )
            self._archive(self.value, fname_reference)
            self._plot(metric_value=None, noref=True)
            return False

        # else (we have a reference value), evaluate the test metric
        try:
            passed, metric_value = self._evaluate(diagnostic)
            msg = "\n".join(
                [
                    "Test passed" if passed else "Test did not pass",
                    f"Metric value: {self.METRIC_NAME} = {metric_value}",
                    f"Metric threshold: {self.threshold}",
                    f"Variable: {self.variable}",
                ]
            )
            logger.info(msg, also_console=True)

        except Exception as e:
            logger.info(
                "An exception occurred during test evaluation!", also_console=True
            )
            self._plot(noref=False, metric_value=None)
            raise e

        # we got a metric: report the results in the archive directory
        logger.info(
            f"Saving current output dataset to {fname_result}", also_console=True
        )
        self._archive(self.value, fname_result)
        logger.info(
            f"Saving reference dataset locally to {fname_reference}", also_console=True
        )
        self._archive(self.reference, fname_reference)
        self._plot(noref=False, metric_value=metric_value)

        return passed


    @abstractmethod
    def _evaluate(self, diagnostic_chart: bool = False) -> tuple[bool, float]:
        """
        Evaluate the test results and perform a comparison to the reference
        based on the criterion defined in the specialized class.

        Parameters
        ----------
        diagnostic_chart : bool, optional
            If ``True``, append a diagnostic chart to the test report.

        Returns
        -------
        passed : bool
            ``True`` iff the test passed.

        metric_value : float
            The value of the test metric.
        """
        pass

    def _archive(self, dataset: xr.Dataset, fname_output: PathLike) -> None:
        """
        Create an archive file for test result and reference storage
        """
        os.makedirs(os.path.dirname(fname_output), exist_ok=True)
        dataset.to_netcdf(fname_output)

    def _plot(self, metric_value: float | None, noref: bool) -> None:
        """
        Create a plot to visualize the results of the test.
        If the ``reference only`` parameter is set, create only a simple plot
        visualizing the new reference data. Otherwise, create the more complex
        comparison plots for the regression test.

        Parameters
        ----------
        metric_value : float or None
            The numerical value of the test metric.

        noref : bool
            If ``True``, create only a simple visualization of the computed
            data.
        """

        if not self.plot:
            return

        fname = self.name
        ext = ".png"

        archive_dir = self.archive_dir
        fname_plot = archive_dir / f"{fname}{ext}"
        fname_plot.parent.mkdir(parents=True, exist_ok=True)

        if noref:
            figure, _ = self._plot_noref()
        else:
            figure, _ = self._plot_ref(metric_value)

        html_svg = figure_to_html(figure)
        logger.info(html_svg, html=True, also_console=False)
        logger.info(f"Saving PNG report chart to {fname_plot}", also_console=True)

        plt.savefig(fname_plot)
        plt.close()

    def _plot_noref(self):
        """
        Draw a simple plot when no reference data is available.
        """

        if not self.plot:
            return

        vza = np.squeeze(self.value.vza.values)
        val = np.squeeze(self.value[self.variable].values)

        fig, ax = plt.subplots(1, 1, figsize=(8, 6))
        ax.plot(vza, val)
        ax.set_xlabel("VZA [deg]")
        ax.set_ylabel(self.variable)
        ax.set_title("Simulation result, can be used as new reference")

        return fig, ax

    def _plot_ref(self, metric_value: float | None = None):
        """
        Draw a comparison plot with reference and test data displayed together.
        """

        if not self.plot:
            return

        vza = np.squeeze(self.value.vza.values)
        val = np.squeeze(self.value[self.variable].values)
        ref = np.squeeze(self.reference[self.variable].values)

        return regression_test_plots(
            ref,
            val,
            vza,
            (self.METRIC_NAME, metric_value),
            xlabel="VZA [deg]",
            ylabel=self.variable,
        )

    def _plot_diagnostic(self, **diagnostic_info) -> None:
        """
        Create an additional plot to display more technical information about
        the test metrics and decision process. The diagnostic plot can help the
        user debug a failing test, or to assess the test power and significance.
        This plot is output directly to the robotframework report.

        Parameters:
        -----------
        **diagnostic_info: dict
            Variadic keyword arguments for the subclasses implementation
        """

        raise NotImplementedError(
            f"{type(self)} does not implement a diagnostic plot method"
        )




[docs]
@define
class RMSETest(RegressionTest):
    """
    This class implements a simple test based on the root mean squared
    error (RMSE) of a result array with respect to the reference data.

    The test will pass if the computed root mean squared deviation between
    the result and reference is smaller or equal to the given threshold.
    """

    METRIC_NAME = "rmse"

    def _evaluate(self, diagnostic_chart=False) -> tuple[bool, float]:
        value_np = self.value[self.variable].values
        ref_np = self.reference[self.variable].values
        if np.shape(value_np) != np.shape(ref_np):
            raise ValueError(
                f"Result and reference do not have the same shape! "
                f"Got: {np.shape(value_np)}, {np.shape(ref_np)}"
            )

        result_flat = np.array(value_np).flatten()
        ref_flat = np.array(ref_np).flatten()

        rmse = np.linalg.norm(result_flat - ref_flat) / np.sqrt(len(ref_flat))
        return rmse <= self.threshold, rmse




[docs]
@define
class Chi2Test(RegressionTest):
    """
    This class implements a statistical test for the regression testing
    campaign, based on Pearson's Chi-squared test.
    https://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test

    It determines the probability for the reference and the test result
    following the same distribution.

    This test will pass if the computed p-value is strictly larger than the
    given threshold.
    """

    # The algorithm is adapted from Mitsuba's testing framework.

    METRIC_NAME = "X² p-value"

    def _evaluate(self, diagnostic_chart=False) -> tuple[bool, float]:
        ref_np = self.reference[self.variable].values

        result_np = self.value[self.variable].values
        histo_bins = np.linspace(ref_np.min(), ref_np.max(), 20)
        histo_ref = np.histogram(ref_np, histo_bins)[0]
        histo_res = np.histogram(result_np, histo_bins)[0]

        # sorting both histograms following the ascending frequencies in
        # the reference. Algorithm from:
        # https://stackoverflow.com/questions/9764298/how-to-sort-two-lists-which-reference-each-other-in-the-exact-same-way
        histo_ref_sorted, histo_res_sorted = zip(
            *sorted(zip(histo_ref, histo_res), key=lambda x: x[0])
        )

        from mitsuba.math_py import rlgamma

        chi2val, dof, pooled_in, pooled_out = mi.math.chi2(
            histo_res_sorted, histo_ref_sorted, 5
        )
        p_value = 1.0 - rlgamma(dof / 2.0, chi2val / 2.0)

        return p_value > self.threshold, p_value




[docs]
@define
class AbstractStudentTTest(RegressionTest):
    """
    Abstract Student's T-Test
    =========================

    Implement diagnostic chart common to subclassing T-test implementations.
    """

    def _plot_diagnostic(self, dof=None, t_prim=None) -> None:
        """
        Diagnostic chart for an Independent Student's T-test

        Parameters:
        -----------
        dof: int
            Degrees of Freedom
        t_prim: float
            t' statistic issued from the test
        """

        if not self.plot:
            return

        fig, ax = plt.subplots()
        ax.grid()
        ax2 = ax.twinx()

        start, end = spstats.t.ppf(0.0001, dof), spstats.t.ppf(0.9999, dof)

        if (t_prim > start) and (t_prim < end):
            fx = np.linspace(-np.abs(t_prim), np.abs(t_prim), 100)
            fy = spstats.t(dof).pdf(fx)
            ax.fill_between(np.zeros((100,)), fy)
            ax.axvline(spstats.t.ppf(-self.threshold / 2.0, dof), color="red")
            ax.axvline(spstats.t.ppf(self.threshold / 2.0, dof), color="red")
        else:
            ax.axvline(t_prim, label="T value")

        ax.axvline(0.0, color="red", linestyle="--")
        ax.set_title("T-statistic")

        x = np.linspace(start, end, 100)
        y = spstats.t(dof).pdf(x)

        ax2.plot(x, y, label="target T distribution form", color="black")
        ax2.legend(loc="upper right")
        ax2.set_ylim([0.0, max(y) * 1.1])

        chart = figure_to_html(fig)
        plt.close(fig)

        logger.info(chart, html=True)




[docs]
@define
class IndependentStudentTTest(AbstractStudentTTest):
    """
    Independent Student's T-test
    ============================

    This implementation of a Student's T-test is following the assumption of
    independance of the two groups that are tested. The bias of the mean values
    of the two groups is assumed to be the result of chance under the null
    hypothesis. It is a two-tailed test.

    It is less sensitive to outliers than the paired Student's T-test.
    """

    METRIC_NAME = "T-test p-value"

    def _evaluate(self, diagnostic_chart=False) -> tuple[bool, float]:
        variable_var = self.variable + "_var"

        if variable_var not in self.reference:
            raise ValueError(
                "The reference data for this T-test does not contain expected "
                "appropriate variance values, could not find data variable "
                f"'{variable_var}'"
            )

        if variable_var not in self.value:
            raise ValueError(
                "The tested data for this T-test does not contain expected "
                "appropriate variance values, could not find data variable "
                f"'{variable_var}'"
            )

        ref_np = self.reference[self.variable].values.ravel()
        result_np = self.value[self.variable].values.ravel()

        var_ref_np = self.reference[variable_var].values.ravel()
        var_res_np = self.value[variable_var].values.ravel()

        # Calculate mean values over observations and associated variances
        R_res = np.mean(result_np)
        R_ref = np.mean(ref_np)
        var_R_res = np.sum(var_res_np) / var_res_np.size**2
        var_R_ref = np.sum(var_ref_np) / var_ref_np.size**2
        bias_mean = R_res - R_ref

        # Calculate T-statistic and associated degree of freedom of its
        # T-distribution using a pooled standard deviation

        s_p = np.sqrt(
            ((var_res_np.size - 1.0) * var_R_res + (var_ref_np.size - 1.0) * var_R_ref)
            / (var_res_np.size + var_ref_np.size - 2)
        )
        t_prim = bias_mean / (
            s_p * np.sqrt(1.0 / var_res_np.size + 1.0 / var_ref_np.size)
        )

        dof = (var_res_np.size + var_ref_np.size) - 2

        assert dof > 0

        # Calculate p-value of the two-tailed t-test using the T distribution
        # survival function for the null hypothesis.
        p_value = spstats.t.sf(np.abs(t_prim), dof) * 2

        passed = p_value > self.threshold

        if diagnostic_chart:
            self._plot_diagnostic(dof=dof, t_prim=t_prim)

        logger.info(f"bias    = {bias_mean}", also_console=True)
        logger.info(f"s_p     = {s_p}", also_console=True)
        logger.info(f"t'      = {t_prim}", also_console=True)
        logger.info(f"dof     = {dof}", also_console=True)
        logger.info(f"p-value = {p_value}", also_console=True)
        logger.info(f"alpha   = {self.threshold}", also_console=True)

        return passed, p_value




[docs]
@define
class PairedStudentTTest(AbstractStudentTTest):
    """
    Paired Student's T-test
    =======================

    This implementation of a Student's T-test is following the assumption of
    paired samples within two groups that are tested. The mean of the bias
    between the paired values is assumed to be the result of chance under the
    null hypothesis. It is a two-tailed test.

    The paired test allow to introduce a covariance factor between the pairs.
    By default, this covariance is equal to zero, thus assuming independence of
    the two variables.

    Contrary to the independent Student's T-test, this paired version of the
    test requires an equal degree of freedom of the two groups.
    """

    METRIC_NAME = "paired T-test p-value"

    cov: np.ndarray | float = documented(
        attrs.field(kw_only=True, default=0.0),
        doc="Covariance between observation, defaults to zero",
        type="ndarray or float",
        init_type="array-like or float",
    )

    def _evaluate(self, diagnostic_chart=False) -> tuple[bool, float]:
        variable_var = self.variable + "_var"

        if variable_var not in self.reference:
            raise ValueError(
                "The reference data for this T-test does not contain expected "
                "appropriate variance values, could not find data variable "
                f"'{variable_var}'"
            )

        if variable_var not in self.value:
            raise ValueError(
                "The tested data for this T-test does not contain expected "
                "appropriate variance values, could not find data variable "
                f"'{variable_var}'"
            )

        ref_np = self.reference[self.variable].values.ravel()
        result_np = self.value[self.variable].values.ravel()

        var_ref_np = self.reference[variable_var].values.ravel()
        var_res_np = self.value[variable_var].values.ravel()

        assert ref_np.shape == result_np.shape
        assert ref_np.shape == var_ref_np.shape
        assert ref_np.shape == var_res_np.shape

        # Calculate paired mean value and associated variance
        D_mean = np.mean(result_np - ref_np)
        var_D = (var_res_np + var_ref_np) - 2 * self.cov
        var_D_mean = np.sum(var_D) / var_D.size**2

        # Calculate T-statistic and associated degree of freedom of its
        # T-distribution
        t_prim = D_mean / (var_D_mean / np.sqrt(var_D.size))
        dof = var_D.size - 1

        assert dof > 0

        # Calculate p-value of the two-tailed t-test using the T distribution
        # survival function for the null hypothesis.
        p_value = spstats.t.sf(np.abs(t_prim), dof) * 2

        passed = p_value > self.threshold

        if diagnostic_chart:
            self._plot_diagnostic(dof=dof, t_prim=t_prim)

        logger.info(f"bias     = {D_mean}", also_console=True)
        logger.info(f"var mean = {var_D_mean}", also_console=True)
        logger.info(f"t'       = {t_prim}", also_console=True)
        logger.info(f"dof      = {dof}", also_console=True)
        logger.info(f"p-value  = {p_value}", also_console=True)
        logger.info(f"alpha    = {self.threshold}", also_console=True)

        return passed, p_value




[docs]
@define
class ZTest(RegressionTest):
    """
    Z-Test with Šidák correction factor
    ===================================

    Implement a Z-test, testing the significance of paired differences between
    a set of observations and a set of references. It considers the observations
    variance.

    Paired tests are aggregated into one p-value using a Šidák correction. The
    test passes if the null hypothesis is accepted for at least 99.75% of the
    paired Z-tests

    This paired Z-test requires an equal degree of freedom of the two groups.
    """

    METRIC_NAME = "Z-test p-value"

    def _plot_diagnostic(self, z=None) -> None:
        """
        Diagnostic chart for a Z-test

        Parameters:
        -----------
        z: array-like
            Z-statistic for each pair of measurements
        """

        if not self.plot:
            return

        fig, ax = plt.subplots()
        ax.grid()
        ax2 = ax.twinx()

        ax.hist(z, bins=50, label="Z values")
        ax.axvline(0.0, color="red", linestyle="--")
        ax.set_title("Z-statistic")
        ax.legend(loc="upper left")

        x = np.linspace(-4.0, 4.0, 100)
        y = spstats.norm.pdf(x, 0.0, 1.0)
        ax2.plot(x, y, label="target Z distribution form", color="black")
        ax2.legend(loc="upper right")
        ax2.set_ylim([0.0, max(y) * 1.1])

        chart = figure_to_html(fig)
        plt.close(fig)

        logger.info(chart, html=True)

    def _evaluate(self, diagnostic_chart=False) -> tuple[bool, float]:
        variable_var = self.variable + "_var"

        if variable_var not in self.value:
            raise ValueError(
                "The reference data for this Z-test does not contain expected "
                "appropriate variance values, could not find data variable "
                f"'{variable_var}'"
            )

        ref_np = self.reference[self.variable].values.ravel()
        result_np = self.value[self.variable].values.ravel()

        var_res_np = self.value[variable_var].values.ravel()

        assert ref_np.shape == result_np.shape
        assert ref_np.shape == var_res_np.shape

        # Calculate Z-statistic
        z = (result_np - ref_np) / np.sqrt(var_res_np)

        # Calculate p-value of the two-tailed z-test null hypothesis
        p_values = spstats.norm.sf(np.abs(z)) * 2

        alpha_0 = 1.0 - (1.0 - self.threshold) ** (1.0 / result_np.size)
        accept_null = p_values > alpha_0

        passed = np.count_nonzero(accept_null) >= int(0.9975 * result_np.size)

        if diagnostic_chart:
            self._plot_diagnostic(z=z)

        logger.info(f"min p-value = {min(p_values)}", also_console=True)
        logger.info(f"max p-value = {max(p_values)}", also_console=True)
        logger.info(
            f"n passed    = {np.count_nonzero(accept_null)}/{int(0.9975 * result_np.size)}",
            also_console=True,
        )
        logger.info(f"alpha_1     = {self.threshold}", also_console=True)
        logger.info(f"alpha_0     = {alpha_0}", also_console=True)

        return passed, min(p_values)

    def _plot_ref(self, metric_value: float | None = None):
        """
        Draw a comparison plot with reference and test data displayed together.
        """
        vza = np.squeeze(self.value.vza.values)
        result = np.squeeze(self.value[self.variable].values)
        result_var = np.squeeze(self.value[f"{self.variable}_var"].values)
        ref = np.squeeze(self.reference[self.variable].values)

        return regression_test_plots(
            ref,
            result,
            vza,
            (self.METRIC_NAME, metric_value),
            result_var=result_var,
            xlabel="VZA [deg]",
            ylabel=self.variable,
        )




[docs]
@define
class SidakTTest(RegressionTest):
    """
    T-Test with Šidák correction factor
    ===================================

    Implement a T-test, testing the significance of paired differences between
    a set of observations and a set of references. It considers both the
    observations and reference variance.

    Paired tests are aggregated into one p-value using a Šidák correction. The
    test passes if the null hypothesis is accepted for at least 99.75% of the
    paired T-tests
    """

    METRIC_NAME = "Sidak T-test p-value"

    def _plot_diagnostic(self, t_prim=None) -> None:
        """
        Diagnostic chart for the T-test

        Parameters:
        -----------
        t_prim: array-like
            T-statistic for each pair of measurements
        """

        fig, ax = plt.subplots()
        ax.grid()
        ax2 = ax.twinx()

        start, end = spstats.norm.ppf(0.0001), spstats.norm.ppf(0.9999)

        ax.hist(t_prim, bins=50, label="T values")
        ax.axvline(0.0, color="red", linestyle="--")
        ax.set_title("T-statistic")
        ax.legend(loc="upper left")

        x = np.linspace(start, end, 100)
        y = spstats.norm.pdf(x)
        ax2.plot(x, y, label="target T distribution form", color="black")
        ax2.legend(loc="upper right")
        ax2.set_ylim([0.0, max(y) * 1.1])

        chart = figure_to_html(fig)
        plt.close(fig)

        logger.info(chart, html=True)

    def _evaluate(self, diagnostic_chart=False) -> tuple[bool, float]:
        variable_var = self.variable + "_var"

        if variable_var not in self.reference:
            raise ValueError(
                "The reference data for this T-test does not contain expected "
                "appropriate variance values, could not find data variable "
                f"'{variable_var}'"
            )

        ref_np = self.reference[self.variable].values.ravel()
        result_np = self.value[self.variable].values.ravel()

        assert ref_np.shape == result_np.shape

        var_ref_np = self.reference[variable_var].values.ravel()
        var_res_np = self.value[variable_var].values.ravel()

        assert var_ref_np.shape == var_res_np.shape

        # Calculate T-statistic
        t_prim = (result_np - ref_np) / np.sqrt(var_res_np + var_ref_np)

        # Calculate p-value of the two-tailed t-test using the T distribution
        # survival function for the null hypothesis that there is no difference
        # between the two mean distributions. It is assumed that the sample size
        # is large enough for the T distribution to converge to a normal one.
        p_values = spstats.norm.sf(np.abs(t_prim)) * 2

        # Calculate the Šidák correction
        alpha_0 = 1.0 - (1.0 - self.threshold) ** (1.0 / result_np.size)
        accept_null = p_values > alpha_0

        passed = np.count_nonzero(accept_null) >= int(0.9975 * result_np.size)

        if diagnostic_chart:
            self._plot_diagnostic(t_prim=t_prim)

        logger.info(f"min p-value = {min(p_values)}", also_console=True)
        logger.info(f"max p-value = {max(p_values)}", also_console=True)
        logger.info(
            f"n passed    = {np.count_nonzero(accept_null)}/{int(0.9975 * result_np.size)}",
            also_console=True,
        )
        logger.info(f"alpha_1     = {self.threshold}", also_console=True)
        logger.info(f"alpha_0     = {alpha_0}", also_console=True)

        return passed, min(p_values)