Source code for unistat.continuous

import warnings
from typing import Literal
from collections import namedtuple
from itertools import combinations
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype, is_categorical_dtype
from scipy import stats
from statsmodels.stats.multitest import multipletests
from .exceptions import SeriesNameCollisionError, SeriesNameCollisionWarning
from ._types import PCorrectionMethod



[docs]
class CorrStats:
    r"""Compute correlation statistics between two continuous series.

    Supports Pearson (parametric) or Spearman (nonparametric) correlation.

    Parameters
    ----------
    x : pd.Series
        First continuous variable.
    y : pd.Series
        Second continuous variable.
    parametric : bool, optional
        If True, compute Pearson correlation; otherwise, Spearman. Defaults to True.

    Attributes
    ----------
    x : pd.Series
        First variable after cleaning.
    y : pd.Series
        Second variable after cleaning.
    parametric : bool
        Whether to use parametric (Pearson) or nonparametric (Spearman) method.
    """
    def __init__(self, x: pd.Series, y: pd.Series, parametric: bool = True):
        self._df = pd.concat([x, y], axis='columns').dropna(axis='index')
        self.x = self._df[x.name]
        self.y = self._df[y.name]
        self.parametric = parametric

    def __str__(self):
        """String representation of correlation results.

        Returns
        -------
        str
            Formatted string with correlation coefficient, p-value, and sample
            size.
        """
        if self.parametric:
            print_str = (
                f"{self.x.name} vs. {self.y.name} Pearson's r:\n"
                f"r = {self.stat:.4f}\n"
                f"p = {self.p:.4f}\n"
                f"n = {self.n}\n"
            )
        else:
            print_str = (
                f"{self.x.name} vs. {self.y.name} Spearman's r:\n"
                f"rho = {self.stat:.4f}\n"
                f"p = {self.p:.4f}\n"
                f"n = {self.n}\n"
            )
        return print_str

    @property
    def result(self):
        r"""Compute correlation test result.

        Returns
        -------
        statistic : float
            Test statistic (:math:`r` or :math:`\rho`).
        pvalue : float
            *P*-value for the test.
        """
        if self.parametric:
            return stats.pearsonr(self.x, self.y, alternative='two-sided')
        else:
            return stats.spearmanr(self.x, self.y, alternative='two-sided')

    @property
    def n(self):
        r"""Sample size after dropping NaNs.

        Returns
        -------
        int
            Number of observations.
        """
        return len(self._df.index)

    @property
    def stat(self):
        r"""Correlation coefficient (``r`` for Pearson, ``rho`` for Spearman).

        Returns
        -------
        float
            Correlation test statistic.
        """
        return self.result.statistic

    @property
    def p(self):
        r"""*P*-value of the correlation test.

        Returns
        -------
        float
            *P*-value.
        """
        return self.result.pvalue




[docs]
class TwoSeriesStats:
    r"""Compare 2 continuous series using parametric or nonparametric tests.

    Supports asymptotic t-tests (parametric) or Mann-Whitney U-tests
    (nonparametric).

    Parameters
    ----------
    test : pd.Series
        Test group data.
    control : pd.Series
        Control group data.
    parametric : bool, optional
        If True, use t-test; otherwise, Mann-Whitney U. Defaults to True.
    alpha_level : float, optional
        Significance level for confidence intervals and tests. Defaults to 0.05.

    Attributes
    ----------
    test : pd.Series
        Cleaned test group data.
    control : pd.Series
        Cleaned control group data.
    parametric : bool
        Whether to use parametric or nonparametric methods.
    alpha : float
        Significance level.

    See Also
    --------
    TwoSampleStats :
        Same tests, starting from a Boolean group column (x) and a numeric
        outcome column (y).
    TwoSeriesPermutation :
        Permutation hypothesis tests for 2-sample statistics, including t-test &
        Mann-Whitney U-test.
    """

    def __init__(self,
                 test: pd.Series,
                 control: pd.Series,
                 parametric: bool = True,
                 alpha_level: float = .05):
        if test.name == control.name:
            warnings.warn(
                SeriesNameCollisionWarning(
                    '`test` and `control` have the same column name.'
                ),
                stacklevel=2
            )
            test = test.rename(f'{test.name}_TEST')
            control = control.rename(f'{control.name}_CTRL')
        self.test = test.dropna().reset_index(drop=True)
        self.control = control.dropna().reset_index(drop=True)
        self.parametric = parametric
        self.alpha = alpha_level

    def __str__(self):
        r"""String representation of summary statistics and test results.

        Returns
        -------
        str
            Formatted summary and test results (t-test or Mann-Whitney U)."""
        if self.parametric:
            return (f'{self.parametric_summ_stats()}\n'
                    f'{self.t_test()}')
        else:
            return (f'{self.nonparametric_summ_stats()}\n'
                    f'{self.mwu_test()}')


[docs]
    def conf_int(self,
                 pct_ci: float = None,
                 dist: Literal['t', 'normal', 'z'] = 't'):
        r"""Calculate CI of mean of test and control groups, based on SEM.

        ``pct_ci`` can be used to custom-define a CI width. By default,
        ``1 - alpha`` is used, as set in the :class:`TwoSeriesStats` object.

        Parameters
        ----------
        pct_ci : float, optional
            Confidence level (e.g., ``0.95`` for 95%). Defaults to
            ``1 - alpha``.
        dist: {'t', 'normal'}, optional
            Parent distribution to use when calculating SEM. Defaults to
            ``'t'``.

        Returns
        -------
        ControlTestStats
            Named tuple with confidence intervals for control and test groups.

        See Also
        --------
        TwoSeriesBootstrap :
            Bootstrapped CIs. Useful for finding CI around a statistic other
            than the sample mean (may be useful for median, as in a skewed
            sample), or for cases of small N.

        Notes
        -----

        SEM is always calculated using sample SD (1 DoF). CI can be calculated
        using either the normal (Z) distribution or Student's t-distribution;
        the latter will give slightly wider CI, all else being equal.

        For small N, Gurland & Trepathi (1971) report that CI is too narrow by
        25% for N=2, and ~5% for N=6, and provide an equation for calculating CI
        underestimation. Sokal & Rohlf (1981) give a formula for a correction
        factor to calculate unbiased CI for N < 20, which is not implemented
        in this method.

        For N > 100, Student's t-distribution and Gaussian normal distribution
        are approximately equivalent. Nonetheless, t-distribution is the default
        form used here.

        Rather than considering correction factors, our recommendation for small
        N, or to define CI about a measure of central tendency other than the
        sample mean, is to use a bootstrapped CI, as implemented in the
        :doc:`resampling <resampling>` module.
        """
        if pct_ci is None:
            pct_ci = 1 - self.alpha

        if dist == 't':
            test_ci = stats.t.interval(
                pct_ci,
                df=self.test.count() - 1,
                loc=self.test.mean(),
                scale=self.test.std(ddof=1) / np.sqrt(self.test.count())
            )
            control_ci = stats.t.interval(
                pct_ci,
                df=self.control.count() - 1,
                loc=self.control.mean(),
                scale=self.control.std(ddof=1) / np.sqrt(self.control.count())
            )

        elif dist in ('normal', 'z'):
            test_ci = stats.norm.interval(
                pct_ci,
                loc=self.test.mean(),
                scale=self.test.std(ddof=1) / np.sqrt(self.test.count())
            )
            control_ci = stats.norm.interval(
                pct_ci,
                loc=self.control.mean(),
                scale=self.control.std(ddof=1) / np.sqrt(self.control.count())
            )

        else:
            raise ValueError("`dist` can be: {'t', 'normal', 'z'}.")

        return ControlTestStats(control=control_ci, test=test_ci)



[docs]
    def parametric_summ_stats(self, alpha_level: float = None):
        r"""Compute parametric summary statistics (mean, std, CI, etc.).

        Parameters
        ----------
        alpha_level : float, optional
            Significance level for CI. Defaults to :attr:`alpha`.

        Returns
        -------
        pd.DataFrame
            Summary statistics for test and control groups.
        """
        if alpha_level is None:
            alpha_level = self.alpha

        ci_res = self.conf_int(1 - alpha_level)

        summ_stats = pd.DataFrame(
            data={
                self.test.name: {
                    'n': self.test.count(),
                    'mean': self.test.mean(),
                    'std': self.test.std(),
                    'min': self.test.min(),
                    'max': self.test.max(),
                    f'{100 * (1 - alpha_level):.0f}%_CI_lower': ci_res.test[0],
                    f'{100 * (1 - alpha_level):.0f}%_CI_upper': ci_res.test[1],
                },

                self.control.name: {
                    'n': self.control.count(),
                    'mean': self.control.mean(),
                    'std': self.control.std(),
                    'min': self.control.min(),
                    'max': self.control.max(),
                    f'{100 * (1 - alpha_level):.0f}%_CI_lower': (
                        ci_res.control[0]
                    ),
                    f'{100 * (1 - alpha_level):.0f}%_CI_upper': (
                        ci_res.control[1]
                    ),
                },
            },
        )
        return summ_stats



[docs]
    def t_test(self, equal_var: bool = False):
        r"""Perform 2-independent-samples t-test (Welch's or Student's).

        Defaults to Welch's t-test for heteroskedastic samples. Delacre et al.
        (2017) recommends routine use of Welch's test for unequal variance over
        Student's method, rather than selective use of Welch's test. The loss in
        power from Welch's vs. Student's test in cases of equal variance is
        minimal, whereas the reduction in Type I error rate from Welch's in
        cases of unequal variance is substantial; a strong determination of
        homoskedasticity is often not simple.

        Parameters
        ----------
        equal_var : bool, optional
            If True, assume equal variances (Student's t-test); otherwise, use
            Welch's. Defaults to False.

        Returns
        -------
        pd.Series
            Test results including t-statistic, degrees of freedom, and p-value.
        """
        result = stats.ttest_ind(
            a=self.test,
            b=self.control,
            equal_var=equal_var,
        )

        output = pd.Series(
            {
                't-statistic': result.statistic,
                'welch_dof': result.df,
                'student_dof': (
                    (self.test.count() - 1) + (self.control.count() - 1)
                ),
                'p-value': result.pvalue,
            }
        )

        return output



[docs]
    def nonparametric_summ_stats(self, alpha_level: float = None):
        r"""Compute nonparametric summary statistics (quantiles, IQR).

        Parameters
        ----------
        alpha_level : float, optional
            Significance level for hypothesis direction. Defaults to
            :attr:`alpha`.

        Returns
        -------
        pd.DataFrame
            Summary statistics with quantiles and IQR, including hypothesis
            direction.
        """
        if alpha_level is None:
            alpha_level = self.alpha

        q_test = self.test.quantile([0, 0.25, 0.5, 0.75, 1])
        q_control = self.control.quantile([0, 0.25, 0.5, 0.75, 1])

        summ_stats = pd.DataFrame(
            data={
                self.test.name: {
                    'n': self.test.count(),
                    'min': q_test.loc[0],
                    'q1': q_test.loc[0.25],
                    'median': q_test.loc[0.5],
                    'q3': q_test.loc[0.75],
                    'max': q_test.loc[1],
                    'iqr': q_test.loc[0.75] - q_test.loc[0.25],
                },

                'Ha': '=',

                self.control.name: {
                    'n': self.control.count(),
                    'min': q_control.loc[0],
                    'q1': q_control.loc[0.25],
                    'median': q_control.loc[0.5],
                    'q3': q_control.loc[0.75],
                    'max': q_control.loc[1],
                    'iqr': q_control.loc[0.75] - q_control.loc[0.25],
                },
            },
        )

        if self.mwu_test().at['p-value'] < alpha_level:
            comparator = stats.mannwhitneyu(x=self.test, y=self.control,
                                            alternative='greater')
            if comparator.pvalue < alpha_level:
                summ_stats['Ha'] = '>'
            elif comparator.pvalue > alpha_level:
                comparator = stats.mannwhitneyu(x=self.test, y=self.control,
                                                alternative='less')
                if comparator.pvalue < alpha_level:
                    summ_stats['Ha'] = '<'
                else:
                    summ_stats['Ha'] = '!='
            else:
                summ_stats['Ha'] = '!='

        return summ_stats



[docs]
    def mwu_test(self):
        r"""Perform Mann-Whitney U test.

        Returns
        -------
        pd.Series
            Test results with ``index=['U-statistic', 'p-value']``.
        """
        result = stats.mannwhitneyu(x=self.test, y=self.control)

        desc_stats = pd.Series(
            {
                'U-statistic': result.statistic,
                'p-value': result.pvalue,
            }
        )
        return desc_stats





[docs]
class TwoSampleStats(TwoSeriesStats):
    r"""Compare continuous outcome across 2 groups defined by a boolean variable.

    Extends TwoSeriesStats to split a continuous series by a boolean grouping
    variable.

    Parameters
    ----------
    bool_x : pd.Series
        Boolean variable defining groups.
    num_y : pd.Series
        Continuous outcome variable.
    parametric : bool, optional
        If True, use t-test; otherwise, Mann-Whitney U. Defaults to True.
    alpha_level : float, optional
        Significance level for tests and CIs. Defaults to 0.05.
    x_test_lvl : bool, optional
        Boolean level defining the test group. Defaults to True.

    Attributes
    ----------
    x : pd.Series
        Boolean grouping variable.
    y : pd.Series
        Continuous outcome variable.

    Raises
    ------
    SeriesNameCollisionError
        If bool_x and num_y have the same name.
    ValueError
        If test or control group is empty after splitting.

    See Also
    --------
    TwoSeriesStats :
        Same tests, starting from 2 separate numeric outcome columns.
    TwoSamplePermutation :
        Permutation hypothesis tests for 2-sample statistics, including t-test &
        Mann-Whitney U-test.
    """
    def __init__(self,
                 bool_x: pd.Series, num_y: pd.Series,
                 parametric: bool = True,
                 alpha_level: float = .05,
                 x_test_lvl: bool = True):
        # Ensure names exist and are not the same
        bool_x = bool_x.rename(bool_x.name or 'x')
        num_y = num_y.rename(num_y.name or 'y')
        if bool_x.name == num_y.name:
            raise SeriesNameCollisionError('Both bool_x and num_y '
                                           'cannot have same names.')

        # Concat into a df in order to .dropna, then define x & y series
        self._df = (
            pd.concat(
                [
                    bool_x.astype('boolean'),
                    num_y,
                ],
                axis='columns'
            )
            .dropna(axis='index')
        )
        self.x = self._df[bool_x.name]
        self.y = self._df[num_y.name]

        # Build mask to slice test/control base on x_test_lvl
        self._test_x = bool(x_test_lvl)  # Ensure is type bool
        mask = (self.x == self._test_x).astype(bool)  # ensure safe masking

        # self.test is value of y where x == x_test_lvl
        test = (
            self.y[mask]
            .rename(f'{self.x.name}_{str(self._test_x).upper()}')
        )

        control = (
            self.y[~mask]
            .rename(f'{self.x.name}_{str((not self._test_x)).upper()}')
        )

        if test.empty or control.empty:
            raise ValueError('After cleaning/splitting, at least one group is '
                             'empty. Check x_test_lvl and values in bool_x.')

        super().__init__(test=test,
                         control=control,
                         parametric=parametric,
                         alpha_level=alpha_level)

    def __str__(self):
        """String representation of grouped summary statistics and test results.

        Returns
        -------
        str
            Formatted outcome name, summary, and test results.
        """
        if self.parametric:
            return (f'Outcome: {self.y.name}\n'
                    f'{self.parametric_summ_stats()}\n'
                    f'{self.t_test()}')
        else:
            return (f'Outcome: {self.y.name}\n'
                    f'{self.nonparametric_summ_stats()}\n'
                    f'{self.mwu_test()}')



ControlTestStats = namedtuple('ControlTestStats',
                              ['control', 'test'])



[docs]
class MultiSeries1WayBGStats:
    r"""Compare 3+ continuous series using parametric or nonparametric tests.

    Class is intended for data formatted as 3+ ``pd.Series`` objects, with a
    Series of continuous dependent variable (DV) data for each level of the
    categorical independent variable (IV). This is commonly encountered when
    there are separate DataFrames for each group/category, each of which has
    a column for the same numeric outcome.

    Supports asymptotic analysis of variance (Welch ANOVA) tests (parametric),
    or Kruskal-Wallis tests (nonparametric).

    Also implements post hoc testing with automatic *p*-value correction for
    multiple comparisons (Holm-Bonferroni method by default).

    * parametric post hoc tests via pairwise Welch t-tests.
    * nonparametric post hoc tests via pairwise Mann-Whitney U-tests.

    Parameters
    ----------
    *data : pd.Series | pd.DataFrame
        Series of outcome data is passed for each category/level. Series will be
        converted to DataFrame columns in the order they are passed. If present,
        Series names will be appended with an index value (``__0``, ``__1``,
        etc.); unnamed Series will be named with their equivalent integer index.
        A **single** DataFrame can also be passed as an argument, in which case
        column names will be used as Series names.
    parametric : bool, default True
        If True, use ANOVA; otherwise, Kruskal-Wallis.
    alpha_level : float, default 0.05
        Significance level for confidence intervals and tests.
    **named_data : pd.Series | pd.DataFrame
        Keyword arguments may be used to override automatic, index-based
        Series names (e.g., ``new_name1=series1, new_name2=series2``, etc.).
        ``data`` Is a protected keyword, reserved for passing a single DataFrame
        as ``data=df``.

    Attributes
    ----------
    data : pd.DataFrame
        All passed Series, concatenated as columns, with reassigned unique
        column names.
    parametric : bool
        Whether to use parametric or nonparametric methods.
    alpha : float
        Significance level.

    See Also
    --------
    MultiSample1WayBGStats :
        Same tests, starting from a categorical grouping column (x) and a
        numeric outcome column (y). Useful when all data is derived from a
        single DataFrame.

    Notes
    -----
    Series may be passed **either** as positional arguments (``*data``)**, or as
    named keyword arguments (``**named_data``), **but not both**. If all Series
    are contained as a single DataFrame, the DataFrame may be passed either
    as a ``data=df`` keyword argument (preferred for clarity), or as the lone
    ``*data`` argument. For this reason, ``data`` is a protected name for
    ``**named_data`` keyword arguments.
    """
    def __init__(self,
                 *data: pd.Series | pd.DataFrame,
                 parametric: bool = True,
                 alpha_level: float = .05,
                 **named_data: pd.Series | pd.DataFrame):
        self._input_data: tuple = data
        self._input_named_data: dict = named_data
        self._series_list: list[pd.Series] = self._parse_input_data()
        self._check_duplicate_series_name()  # Error checking, no returns

        self._series_list = [series.dropna().reset_index(drop=True)
                             for series in self._series_list]

        self.parametric: bool = parametric
        self.alpha: float = alpha_level

    @property
    def data(self) -> pd.DataFrame:
        return pd.concat(
            objs=self._series_list,
            axis='columns',
        )

    def __str__(self):
        with pd.option_context('display.max_colwidth', None,
                               'max_colwidth', None,
                               'display.width', 256):
            if self.parametric:
                output_str = (
                    f'{self.parametric_summ_stats().to_string()}\n'
                    f'{self.anova().to_string()}'
                )
                if self._omnibus_sig:
                    output_str += f'\n{self.pairwise_t().to_string()}'
            else:
                output_str = (
                    f'{self.nonparametric_summ_stats().to_string()}\n'
                    f'{self.kruskal_wallis().to_string()}'
                )
                if self._omnibus_sig:
                    output_str += f'\n{self.pairwise_mwu().to_string()}'

            return output_str


[docs]
    def conf_int(self,
                 pct_ci: float = None,
                 dist: Literal['t', 'normal', 'z'] = 't') -> pd.DataFrame:
        r"""Calculate CI of mean of all columns in `data`, based on SEM.

        ``pct_ci`` can be used to custom-define a CI width. By default,
        ``1 - alpha`` is used, as set in the TwoSeriesStats object.

        Parameters
        ----------
        pct_ci : float, optional
            Confidence level (e.g., ``0.95`` for 95%). Defaults to
            ``1 - alpha``.
        dist: {'t', 'normal'}, optional
            Parent distribution to use when calculating SEM. Defaults to
            ``'t'``.

        Returns
        -------
        pd.DataFrame
            Column names match ``data.columns``;
            ``index=['ci_lower', 'ci_upper']``.

        Notes
        -----

        SEM is always calculated using sample SD (1 DoF). CI can be calculated
        using either the normal (Z) distribution or Student's t-distribution;
        the latter will give slightly wider CI, all else being equal.

        For small N, Gurland & Trepathi (1971) report that CI is too narrow by
        25% for N=2, and ~5% for N=6, and provide an equation for calculating CI
        underestimation. Sokal & Rohlf (1981) give a formula for a correction
        factor to calculate unbiased CI for N < 20, which is not implemented
        in this method.

        For N > 100, Student's t-distribution and Gaussian normal distribution
        are approximately equivalent. Nonetheless, t-distribution is the default
        form used here.

        Rather than considering correction factors, our recommendation for small
        N, or to define CI about a measure of central tendency other than the
        sample mean, is to use a bootstrapped CI, as implemented in the
        :doc:`resampling <resampling>` module. However, a bootstrapping class
        has not yet been implemented for multiclass (3+ level) cases.
        """
        if pct_ci is None:
            pct_ci = 1 - self.alpha

        res_dict = {}
        if dist == 't':
            for col in self.data.columns:
                res_dict[col] = stats.t.interval(
                    pct_ci,
                    df=self.data[col].count() - 1,
                    loc=self.data[col].mean(),
                    scale=(
                        self.data[col].std(ddof=1)
                        / np.sqrt(self.data[col].count())
                    )
                )

        elif dist in ('normal', 'z'):
            for col in self.data.columns:
                res_dict[col] = stats.norm.interval(
                    pct_ci,
                    loc=self.data[col].mean(),
                    scale=(
                        self.data[col].std(ddof=1)
                        / np.sqrt(self.data[col].count())
                    )
                )

        else:
            raise ValueError("`dist` can be: {'t', 'normal', 'z'}.")

        return pd.DataFrame(data=res_dict, index=['ci_lower', 'ci_upper'])



[docs]
    def parametric_summ_stats(self, alpha_level: float = None) -> pd.DataFrame:
        r"""Compute parametric summary statistics (mean, std, CI, etc.).

        Parameters
        ----------
        alpha_level : float, optional
            Significance level for CI. Defaults to :attr:`alpha`.

        Returns
        -------
        pd.DataFrame
            Summary statistics for each column in :attr:`data`.
        """
        if alpha_level is None:
            alpha_level = self.alpha

        ci_res: pd.DataFrame = self.conf_int(1 - alpha_level)

        summ_stats = pd.DataFrame(
            data={
                col: {
                    'n': self.data[col].count(),
                    'mean': self.data[col].mean(),
                    'std': self.data[col].std(),
                    'min': self.data[col].min(),
                    'max': self.data[col].max(),
                    f'{100 * (1 - alpha_level):.0f}%_CI_lower': (
                        ci_res.at['ci_lower', col]
                    ),
                    f'{100 * (1 - alpha_level):.0f}%_CI_upper': (
                        ci_res.at['ci_upper', col]
                    ),
                } for col in self.data.columns.tolist()
            },
        ).astype('Float64')

        return summ_stats



[docs]
    def anova(self, equal_var: bool = False) -> pd.Series:
        r"""Perform 1-way between-groups analysis of variance (ANOVA).

        Defaults to Welch's ANOVA for heteroskedastic samples. Delacre et al.
        (2019) recommends routine use of Welch's F-test for unequal variance
        over Student's (Fisher's) method, rather than selective use of Welch's
        test. The loss in power from Welch's vs. Student's test in cases of
        equal variance is minimal, whereas the reduction in Type I error rate
        from Welch's in cases of unequal variance is substantial; a strong
        determination of homoskedasticity is often not simple.

        Parameters
        ----------
        equal_var : bool, default False
            If True, assume equal variances (Fisher's ANOVA); otherwise, use
            Welch's.

        Returns
        -------
        pd.Series
            Test results including F-statistic, degrees of freedom, and p-value.
        """
        result = stats.f_oneway(
            *[self.data[col].astype(float)
              for col in self.data.columns],
            equal_var=equal_var,
            nan_policy='omit',
            axis=0
        )

        output = pd.Series(
            {
                'F-statistic': result.statistic,
                'dof_B': self.data.shape[1] - 1,
                'dof_E': self.data.shape[0] - self.data.shape[1],
                'dof_T': self.data.shape[0] - 1,
                'p-value': result.pvalue,
            }
        ).astype('Float64')

        return output



[docs]
    def nonparametric_summ_stats(self,
                                 alpha_level: float = None) -> pd.DataFrame:
        r"""Compute nonparametric summary statistics (quantiles, IQR).

        Parameters
        ----------
        alpha_level : float, optional
            Significance level for hypothesis direction. Defaults to self.alpha.

        Returns
        -------
        pd.DataFrame
            Summary statistics with quantiles and IQR for each column in
            :attr:`data`.
        """
        if alpha_level is None:
            alpha_level = self.alpha

        quantiles = [0, 0.25, 0.5, 0.75, 1]

        summ_stats = pd.DataFrame(
            data={
                col: {
                    'n': self.data[col].count(),
                    'min': self.data[col].quantile(quantiles).loc[0],
                    'q1': self.data[col].quantile(quantiles).loc[0.25],
                    'median': self.data[col].quantile(quantiles).loc[0.5],
                    'q3': self.data[col].quantile(quantiles).loc[0.75],
                    'max': self.data[col].quantile(quantiles).loc[1],
                    'iqr': (self.data[col].quantile(quantiles).loc[0.75]
                            - self.data[col].quantile(quantiles).loc[0.25]),
                } for col in self.data.columns.tolist()
            },
        ).astype('Float64')

        return summ_stats



[docs]
    def kruskal_wallis(self) -> pd.Series:
        r"""Perform 1-way between-groups Kruskal-Wallis test.

        Returns
        -------
        pd.Series
            Test results including H-statistic, degrees of freedom, and p-value.
        """
        result = stats.kruskal(
            *[self.data[col].astype(float)
              for col in self.data.columns],
            nan_policy='omit',
            axis=0,
        )

        output = pd.Series(
            {
                'H-statistic': result.statistic,
                'dof': self.data.shape[1] - 1,
                'p-value': result.pvalue,
            }
        ).astype('Float64')

        return output


    def _parse_input_data(self) -> list[pd.Series]:
        output_list: list[pd.Series] = []

        def parse_input_df(input_df: pd.DataFrame) -> list[pd.Series]:
            df_list: list[pd.Series] = []
            # Ensure at least 2 columns, then assign columns to series_list
            if input_df.shape[1] >= 2:
                for col in input_df.columns:
                    df_list.append(input_df[col])
                return df_list

            else:
                raise ValueError(
                    'If passing a single DataFrame as a positional `data` '
                    'arg or `data=df`kwarg, the DataFrame must have at '
                    'least 2 columns.'
                )

        # Ensure exactly 1 paradigm used
        if self._input_data and self._input_named_data:
            raise ValueError('Cannot pass both `data` & `named_data`.')

        # If passing `data` as a kwarg: `data=pd.DataFrame()`
        elif 'data' in self._input_named_data:
            df: pd.DataFrame = self._input_named_data['data']

            # Multiple named_data
            if len(self._input_named_data) > 1:
                ValueError(
                    'When using `data=df` argument to pass a DataFrame, '
                    '`*data` positional args must be omitted, and '
                    'no extra `**named_data` kwargs are permitted.'
                )
            elif isinstance(df, pd.DataFrame):
                output_list = parse_input_df(df)
            elif isinstance(df, pd.Series):
                raise ValueError('Pass Series data via `*data` positional '
                                 'args or via `**named_data` kwargs.')
            else:
                raise ValueError('`data=` must be pd.DataFrame or pd.Series.')

        # If passing a single DataFrame to `data` as arg
        elif (
                # pd.DataFrame arg
                (len(self._input_data) == 1)
                and isinstance(self._input_data[0], pd.DataFrame)
        ):
            output_list = parse_input_df(self._input_data[0])

        # If using `data` args
        elif self._input_data:
            # Ensure at least 2 args
            if len(self._input_data) >= 2:
                for enum, series in enumerate(self._input_data):
                    # If `data` includes a DataFrame, ensure it only has 1 col
                    if isinstance(series, pd.DataFrame):
                        if series.shape[1] == 1:
                            series = series.iloc[:, 0]
                        else:
                            raise ValueError(
                                'When passing multiple args to `data`, '
                                'args must be pd.Series or single-column '
                                'pd.DataFrame.'
                            )

                    # Assign name/number to each series
                    if hasattr(series, 'name'):
                        series.name += f'__{enum}'
                    else:
                        series.name = enum
                    output_list.append(series)

            else:
                raise ValueError('When passing individual Series args to '
                                 '`data`, at least 2 args must be passed.')

        # If using `named_data` kwargs
        elif self._input_named_data:
            # Ensure at least 2 kwargs
            if len(self._input_named_data) >= 2:
                for name, series in self._input_named_data.items():
                    # If `data` includes a DataFrame, ensure only has 1 col
                    if isinstance(series, pd.DataFrame):
                        if series.shape[1] == 1:
                            series = series.iloc[:, 0]
                        else:
                            raise ValueError(
                                'When passing multiple kwargs to '
                                '`named_data`, args must be a Series or '
                                'single-column DataFrame.'
                            )

                    series.name = name
                    output_list.append(series)
            else:
                raise ValueError('When using `named_data`, at least 2 '
                                 'kwargs must be passed.')

        # If `data` & `named_data` left blank
        else:
            raise ValueError('Must pass either `data` or `named_data`.')

        return output_list

    def _check_duplicate_series_name(self) -> None:
        series_names: list[str] = [series.name for series in self._series_list]

        # Set will be shorter than list if & only if duplicate column names
        if len(set(series_names)) < len(series_names):

            duplicates = series_names.copy()
            for name in set(series_names):
                duplicates.remove(name)
            duplicates = str(set(duplicates))[1:-1]  # Remove {} wrapper

            raise SeriesNameCollisionError(
                '`data` cannot have duplicate column names.\n'
                f'Duplicates: {duplicates}'
            )

    @property
    def _omnibus_sig(self) -> bool:
        if self.parametric:
            p = self.anova()['p-value']
        else:
            p = self.kruskal_wallis()['p-value']

        if p <= self.alpha:
            return True
        else:
            return False

    @property
    def _pairwise_columns(self) -> list[tuple[str, str]]:
        col_names: list[str] = self.data.columns.tolist()
        return list(combinations(col_names, 2))


[docs]
    def pairwise_t(self,
                   equal_var: bool = False,
                   p_corr_method: PCorrectionMethod = 'holm') -> pd.DataFrame:
        r"""Perform post hoc pairwise t-tests.

        Uses pairwise combinations of input Series, and calculates t-test for
        each pair. By default, uses Welch t-test rather than Student t-test (see
        :meth:`TwoSeriesStats.t_test()` for rationale).

        *p*-Values for all pairwise tests then undergo correction for multiple
        comparisons. By default, Holm-Bonferroni correction is used, but all
        correction methods supported by
        `statsmodels.stats.multitest.multipletests
        <https://www.statsmodels.org/dev/generated/statsmodels.stats.multitest.multipletests.html>`_
        are supported.

        Parameters
        ----------
        equal_var : bool, default False
            If ``False``, use Welch t-test; otherwise, use Student t-test.
        p_corr_method : str, default 'holm'
            *P*-Value correction method. Cannot be ``None`` since *P*-value
            correction is always indicated for multiple pairwise comparisons.

        Returns
        -------
        pd.DataFrame
            DataFrame with a ``pd.MultiIndex`` in ``(control, test)`` format.
            Columns give *t*-statistic, Student DoF, calculated Welch DoF,
            and uncorrected and corrected *P*-values.
        """
        t_results: list[pd.Series] = []

        # Run t-test for each column pair
        for col_pair in self._pairwise_columns:
            t_obj = TwoSeriesStats(
                control=self.data[col_pair[0]],
                test=self.data[col_pair[1]],
                alpha_level=self.alpha,
                parametric=True
            )
            # Add columns for control/test names
            t_result: pd.Series = t_obj.t_test()
            t_results.append(t_result)

        t_results_df: pd.DataFrame = pd.concat(
            objs=t_results,
            axis='columns',
        ).T
        t_results_df.index = pd.MultiIndex.from_tuples(
            self._pairwise_columns,
            names=['control', 'test'],
        )

        # Add column of corrected p-values
        t_results_df['p_corr'] = multipletests(
            pvals=t_results_df['p-value'],
            alpha=self.alpha,
            method=p_corr_method,
        )[1]

        return t_results_df



[docs]
    def pairwise_mwu(self,
                     p_corr_method: PCorrectionMethod = 'holm') -> pd.DataFrame:
        r"""Perform post hoc pairwise Mann-Whitney U-tests.

        Uses pairwise combinations of input Series, and calculates Mann-Whitney
        U-test for each pair.

        *p*-Values for all pairwise tests then undergo correction for multiple
        comparisons. By default, Holm-Bonferroni correction is used, but all
        correction methods supported by
        `statsmodels.stats.multitest.multipletests
        <https://www.statsmodels.org/dev/generated/statsmodels.stats.multitest.multipletests.html>`_
        are supported.

        Parameters
        ----------
        p_corr_method : str, default 'holm'
            *P*-Value correction method. Cannot be None since *P*-value
            correction is always indicated for multiple pairwise comparisons.

        Returns
        -------
        pd.DataFrame
            DataFrame with a ``pd.MultiIndex`` in ``(control, test)`` format.
            Since *P*-value can reach significance even if medians are the same
            between groups, ``'Ha'`` column denotes the direction of effect
            when uncorrected *p* is significant.
            Columns also give *U*-statistic, DoF, and uncorrected and corrected
            *{*-values.
        """
        mwu_results: list[pd.Series] = []
        mwu_ha_map: dict[str, str] = {
            '<': 'control > test',
            '>': 'control < test',
            '!=': 'control != test',
            '=': 'control = test',
        }

        # Run t-test for each column pair
        for col_pair in self._pairwise_columns:
            mwu_obj = TwoSeriesStats(
                control=self.data[col_pair[0]],
                test=self.data[col_pair[1]],
                alpha_level=self.alpha,
                parametric=False
            )

            # Add column to show direction of effect
            mwu_ha: str = mwu_ha_map[
                mwu_obj.nonparametric_summ_stats().at['n', 'Ha']
            ]
            mwu_result: pd.Series = mwu_obj.mwu_test()
            mwu_result = pd.concat(
                objs=[
                    pd.Series({'Ha': mwu_ha}),
                    mwu_result,
                ],
                axis='rows',
            )

            mwu_results.append(mwu_result)

        mwu_results_df: pd.DataFrame = pd.concat(
            objs=mwu_results,
            axis='columns',
        ).T
        mwu_results_df.index = pd.MultiIndex.from_tuples(
            self._pairwise_columns,
            names=['control', 'test'],
        )

        # Add column of corrected p-values
        mwu_results_df['p_corr'] = multipletests(
            pvals=mwu_results_df['p-value'],
            alpha=self.alpha,
            method=p_corr_method,
        )[1]

        return mwu_results_df





[docs]
class MultiSample1WayBGStats(MultiSeries1WayBGStats):
    r"""Compare continuous outcome across 3 groups defined by a categorical.

    Extends :class:`MultiSeries1WayBGStats` to split a continuous Series by a
    categorical grouping variable.

    Parameters
    ----------
    cat_x : pd.Series
        Categorical variable defining groups. ``cat_x.dtype`` should be either
        ``pd.Categorical``, or convertible to ``pd.Categorical``. Predefined
        categorical ordering will be retained automatically.
    num_y : pd.Series
        Continuous outcome variable.
    cat_order : list[str], optional
        If no categorical ordering is predefined in ``cat_x``, but this is
        desired in the output. Should be a list of strings of all unique values
        that appear in ``cat_x``.
    parametric : bool, default True
        If True, use ANOVA; otherwise, use Kruskal-Wallus.
    alpha_level : float, default 0.05
        Significance level for tests and CIs.

    Attributes
    ----------
    data : pd.DataFrame
        Resulting DataFrame after removal of observations with nulls.
    parametric : bool
        Whether to use parametric or nonparametric methods.
    alpha : float
        Significance level.

    See Also
    --------
    MultiSeries1WayBGStats :
        Same tests, input style is simpler if data is formatted as separate
        Series of numeric outcome data (for each group/level)  without a
        categorical grouping column.
    """
    def __init__(self,
                 cat_x: pd.Series,
                 num_y: pd.Series,
                 cat_order: list[str] = None,
                 parametric: bool = True,
                 alpha_level: float = .05):
        # Combine into a df
        df = (
            pd.concat([cat_x, num_y], axis='columns')
            .dropna(how='any', axis='rows')
        )

        if len(cat_x.unique()) < 2:
            raise ValueError('`cat_x` must contain at least 2 categories.')

        # If not passing categorical ordering
        if cat_order is None:
            # Want to preserve native categorical ordering if cat_x was already
            # an ordered categorical
            if not is_categorical_dtype(df[cat_x.name].dtype):
                df[cat_x.name] = df[cat_x.name].astype('category')
            categories = df[cat_x.name].cat.categories.tolist()

        # If passing categorical ordering
        else:
            # Ensure all used categories appear in cat_order
            missing = (
                    set(df[cat_x.name].unique()) - set(cat_order)
                    - {float('nan'), None, pd.NA}
            )
            if missing:
                raise ValueError(f'Categories in cat_x not found in cat_order: '
                                 f'{missing}')
            df[cat_x.name] = (
                df[cat_x.name]
                .astype(CategoricalDtype(categories=cat_order, ordered=True))
            )
            categories = cat_order

        # Pivot to wide: columns = categories, values = num_y
        wide_df = df.pivot(columns=cat_x.name, values=num_y.name)

        # Warn/omit empty groups
        observed = wide_df.columns
        missing = set(categories) - set(observed)
        if missing:
            warnings.warn(f'Categories with no data after dropna '
                          f'(omitted): {missing}')

        super().__init__(data=wide_df,
                         parametric=parametric,
                         alpha_level=alpha_level)