import warnings
from typing import Literal
from collections import namedtuple
from itertools import combinations
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype, is_categorical_dtype
from scipy import stats
from statsmodels.stats.multitest import multipletests
from .exceptions import SeriesNameCollisionError, SeriesNameCollisionWarning
from ._types import PCorrectionMethod
[docs]
class CorrStats:
r"""Compute correlation statistics between two continuous series.
Supports Pearson (parametric) or Spearman (nonparametric) correlation.
Parameters
----------
x : pd.Series
First continuous variable.
y : pd.Series
Second continuous variable.
parametric : bool, optional
If True, compute Pearson correlation; otherwise, Spearman. Defaults to True.
Attributes
----------
x : pd.Series
First variable after cleaning.
y : pd.Series
Second variable after cleaning.
parametric : bool
Whether to use parametric (Pearson) or nonparametric (Spearman) method.
"""
def __init__(self, x: pd.Series, y: pd.Series, parametric: bool = True):
self._df = pd.concat([x, y], axis='columns').dropna(axis='index')
self.x = self._df[x.name]
self.y = self._df[y.name]
self.parametric = parametric
def __str__(self):
"""String representation of correlation results.
Returns
-------
str
Formatted string with correlation coefficient, p-value, and sample
size.
"""
if self.parametric:
print_str = (
f"{self.x.name} vs. {self.y.name} Pearson's r:\n"
f"r = {self.stat:.4f}\n"
f"p = {self.p:.4f}\n"
f"n = {self.n}\n"
)
else:
print_str = (
f"{self.x.name} vs. {self.y.name} Spearman's r:\n"
f"rho = {self.stat:.4f}\n"
f"p = {self.p:.4f}\n"
f"n = {self.n}\n"
)
return print_str
@property
def result(self):
r"""Compute correlation test result.
Returns
-------
statistic : float
Test statistic (:math:`r` or :math:`\rho`).
pvalue : float
*P*-value for the test.
"""
if self.parametric:
return stats.pearsonr(self.x, self.y, alternative='two-sided')
else:
return stats.spearmanr(self.x, self.y, alternative='two-sided')
@property
def n(self):
r"""Sample size after dropping NaNs.
Returns
-------
int
Number of observations.
"""
return len(self._df.index)
@property
def stat(self):
r"""Correlation coefficient (``r`` for Pearson, ``rho`` for Spearman).
Returns
-------
float
Correlation test statistic.
"""
return self.result.statistic
@property
def p(self):
r"""*P*-value of the correlation test.
Returns
-------
float
*P*-value.
"""
return self.result.pvalue
[docs]
class TwoSeriesStats:
r"""Compare 2 continuous series using parametric or nonparametric tests.
Supports asymptotic t-tests (parametric) or Mann-Whitney U-tests
(nonparametric).
Parameters
----------
test : pd.Series
Test group data.
control : pd.Series
Control group data.
parametric : bool, optional
If True, use t-test; otherwise, Mann-Whitney U. Defaults to True.
alpha_level : float, optional
Significance level for confidence intervals and tests. Defaults to 0.05.
Attributes
----------
test : pd.Series
Cleaned test group data.
control : pd.Series
Cleaned control group data.
parametric : bool
Whether to use parametric or nonparametric methods.
alpha : float
Significance level.
See Also
--------
TwoSampleStats :
Same tests, starting from a Boolean group column (x) and a numeric
outcome column (y).
TwoSeriesPermutation :
Permutation hypothesis tests for 2-sample statistics, including t-test &
Mann-Whitney U-test.
"""
def __init__(self,
test: pd.Series,
control: pd.Series,
parametric: bool = True,
alpha_level: float = .05):
if test.name == control.name:
warnings.warn(
SeriesNameCollisionWarning(
'`test` and `control` have the same column name.'
),
stacklevel=2
)
test = test.rename(f'{test.name}_TEST')
control = control.rename(f'{control.name}_CTRL')
self.test = test.dropna().reset_index(drop=True)
self.control = control.dropna().reset_index(drop=True)
self.parametric = parametric
self.alpha = alpha_level
def __str__(self):
r"""String representation of summary statistics and test results.
Returns
-------
str
Formatted summary and test results (t-test or Mann-Whitney U)."""
if self.parametric:
return (f'{self.parametric_summ_stats()}\n'
f'{self.t_test()}')
else:
return (f'{self.nonparametric_summ_stats()}\n'
f'{self.mwu_test()}')
[docs]
def conf_int(self,
pct_ci: float = None,
dist: Literal['t', 'normal', 'z'] = 't'):
r"""Calculate CI of mean of test and control groups, based on SEM.
``pct_ci`` can be used to custom-define a CI width. By default,
``1 - alpha`` is used, as set in the :class:`TwoSeriesStats` object.
Parameters
----------
pct_ci : float, optional
Confidence level (e.g., ``0.95`` for 95%). Defaults to
``1 - alpha``.
dist: {'t', 'normal'}, optional
Parent distribution to use when calculating SEM. Defaults to
``'t'``.
Returns
-------
ControlTestStats
Named tuple with confidence intervals for control and test groups.
See Also
--------
TwoSeriesBootstrap :
Bootstrapped CIs. Useful for finding CI around a statistic other
than the sample mean (may be useful for median, as in a skewed
sample), or for cases of small N.
Notes
-----
SEM is always calculated using sample SD (1 DoF). CI can be calculated
using either the normal (Z) distribution or Student's t-distribution;
the latter will give slightly wider CI, all else being equal.
For small N, Gurland & Trepathi (1971) report that CI is too narrow by
25% for N=2, and ~5% for N=6, and provide an equation for calculating CI
underestimation. Sokal & Rohlf (1981) give a formula for a correction
factor to calculate unbiased CI for N < 20, which is not implemented
in this method.
For N > 100, Student's t-distribution and Gaussian normal distribution
are approximately equivalent. Nonetheless, t-distribution is the default
form used here.
Rather than considering correction factors, our recommendation for small
N, or to define CI about a measure of central tendency other than the
sample mean, is to use a bootstrapped CI, as implemented in the
:doc:`resampling <resampling>` module.
"""
if pct_ci is None:
pct_ci = 1 - self.alpha
if dist == 't':
test_ci = stats.t.interval(
pct_ci,
df=self.test.count() - 1,
loc=self.test.mean(),
scale=self.test.std(ddof=1) / np.sqrt(self.test.count())
)
control_ci = stats.t.interval(
pct_ci,
df=self.control.count() - 1,
loc=self.control.mean(),
scale=self.control.std(ddof=1) / np.sqrt(self.control.count())
)
elif dist in ('normal', 'z'):
test_ci = stats.norm.interval(
pct_ci,
loc=self.test.mean(),
scale=self.test.std(ddof=1) / np.sqrt(self.test.count())
)
control_ci = stats.norm.interval(
pct_ci,
loc=self.control.mean(),
scale=self.control.std(ddof=1) / np.sqrt(self.control.count())
)
else:
raise ValueError("`dist` can be: {'t', 'normal', 'z'}.")
return ControlTestStats(control=control_ci, test=test_ci)
[docs]
def parametric_summ_stats(self, alpha_level: float = None):
r"""Compute parametric summary statistics (mean, std, CI, etc.).
Parameters
----------
alpha_level : float, optional
Significance level for CI. Defaults to :attr:`alpha`.
Returns
-------
pd.DataFrame
Summary statistics for test and control groups.
"""
if alpha_level is None:
alpha_level = self.alpha
ci_res = self.conf_int(1 - alpha_level)
summ_stats = pd.DataFrame(
data={
self.test.name: {
'n': self.test.count(),
'mean': self.test.mean(),
'std': self.test.std(),
'min': self.test.min(),
'max': self.test.max(),
f'{100 * (1 - alpha_level):.0f}%_CI_lower': ci_res.test[0],
f'{100 * (1 - alpha_level):.0f}%_CI_upper': ci_res.test[1],
},
self.control.name: {
'n': self.control.count(),
'mean': self.control.mean(),
'std': self.control.std(),
'min': self.control.min(),
'max': self.control.max(),
f'{100 * (1 - alpha_level):.0f}%_CI_lower': (
ci_res.control[0]
),
f'{100 * (1 - alpha_level):.0f}%_CI_upper': (
ci_res.control[1]
),
},
},
)
return summ_stats
[docs]
def t_test(self, equal_var: bool = False):
r"""Perform 2-independent-samples t-test (Welch's or Student's).
Defaults to Welch's t-test for heteroskedastic samples. Delacre et al.
(2017) recommends routine use of Welch's test for unequal variance over
Student's method, rather than selective use of Welch's test. The loss in
power from Welch's vs. Student's test in cases of equal variance is
minimal, whereas the reduction in Type I error rate from Welch's in
cases of unequal variance is substantial; a strong determination of
homoskedasticity is often not simple.
Parameters
----------
equal_var : bool, optional
If True, assume equal variances (Student's t-test); otherwise, use
Welch's. Defaults to False.
Returns
-------
pd.Series
Test results including t-statistic, degrees of freedom, and p-value.
"""
result = stats.ttest_ind(
a=self.test,
b=self.control,
equal_var=equal_var,
)
output = pd.Series(
{
't-statistic': result.statistic,
'welch_dof': result.df,
'student_dof': (
(self.test.count() - 1) + (self.control.count() - 1)
),
'p-value': result.pvalue,
}
)
return output
[docs]
def nonparametric_summ_stats(self, alpha_level: float = None):
r"""Compute nonparametric summary statistics (quantiles, IQR).
Parameters
----------
alpha_level : float, optional
Significance level for hypothesis direction. Defaults to
:attr:`alpha`.
Returns
-------
pd.DataFrame
Summary statistics with quantiles and IQR, including hypothesis
direction.
"""
if alpha_level is None:
alpha_level = self.alpha
q_test = self.test.quantile([0, 0.25, 0.5, 0.75, 1])
q_control = self.control.quantile([0, 0.25, 0.5, 0.75, 1])
summ_stats = pd.DataFrame(
data={
self.test.name: {
'n': self.test.count(),
'min': q_test.loc[0],
'q1': q_test.loc[0.25],
'median': q_test.loc[0.5],
'q3': q_test.loc[0.75],
'max': q_test.loc[1],
'iqr': q_test.loc[0.75] - q_test.loc[0.25],
},
'Ha': '=',
self.control.name: {
'n': self.control.count(),
'min': q_control.loc[0],
'q1': q_control.loc[0.25],
'median': q_control.loc[0.5],
'q3': q_control.loc[0.75],
'max': q_control.loc[1],
'iqr': q_control.loc[0.75] - q_control.loc[0.25],
},
},
)
if self.mwu_test().at['p-value'] < alpha_level:
comparator = stats.mannwhitneyu(x=self.test, y=self.control,
alternative='greater')
if comparator.pvalue < alpha_level:
summ_stats['Ha'] = '>'
elif comparator.pvalue > alpha_level:
comparator = stats.mannwhitneyu(x=self.test, y=self.control,
alternative='less')
if comparator.pvalue < alpha_level:
summ_stats['Ha'] = '<'
else:
summ_stats['Ha'] = '!='
else:
summ_stats['Ha'] = '!='
return summ_stats
[docs]
def mwu_test(self):
r"""Perform Mann-Whitney U test.
Returns
-------
pd.Series
Test results with ``index=['U-statistic', 'p-value']``.
"""
result = stats.mannwhitneyu(x=self.test, y=self.control)
desc_stats = pd.Series(
{
'U-statistic': result.statistic,
'p-value': result.pvalue,
}
)
return desc_stats
[docs]
class TwoSampleStats(TwoSeriesStats):
r"""Compare continuous outcome across 2 groups defined by a boolean variable.
Extends TwoSeriesStats to split a continuous series by a boolean grouping
variable.
Parameters
----------
bool_x : pd.Series
Boolean variable defining groups.
num_y : pd.Series
Continuous outcome variable.
parametric : bool, optional
If True, use t-test; otherwise, Mann-Whitney U. Defaults to True.
alpha_level : float, optional
Significance level for tests and CIs. Defaults to 0.05.
x_test_lvl : bool, optional
Boolean level defining the test group. Defaults to True.
Attributes
----------
x : pd.Series
Boolean grouping variable.
y : pd.Series
Continuous outcome variable.
Raises
------
SeriesNameCollisionError
If bool_x and num_y have the same name.
ValueError
If test or control group is empty after splitting.
See Also
--------
TwoSeriesStats :
Same tests, starting from 2 separate numeric outcome columns.
TwoSamplePermutation :
Permutation hypothesis tests for 2-sample statistics, including t-test &
Mann-Whitney U-test.
"""
def __init__(self,
bool_x: pd.Series, num_y: pd.Series,
parametric: bool = True,
alpha_level: float = .05,
x_test_lvl: bool = True):
# Ensure names exist and are not the same
bool_x = bool_x.rename(bool_x.name or 'x')
num_y = num_y.rename(num_y.name or 'y')
if bool_x.name == num_y.name:
raise SeriesNameCollisionError('Both bool_x and num_y '
'cannot have same names.')
# Concat into a df in order to .dropna, then define x & y series
self._df = (
pd.concat(
[
bool_x.astype('boolean'),
num_y,
],
axis='columns'
)
.dropna(axis='index')
)
self.x = self._df[bool_x.name]
self.y = self._df[num_y.name]
# Build mask to slice test/control base on x_test_lvl
self._test_x = bool(x_test_lvl) # Ensure is type bool
mask = (self.x == self._test_x).astype(bool) # ensure safe masking
# self.test is value of y where x == x_test_lvl
test = (
self.y[mask]
.rename(f'{self.x.name}_{str(self._test_x).upper()}')
)
control = (
self.y[~mask]
.rename(f'{self.x.name}_{str((not self._test_x)).upper()}')
)
if test.empty or control.empty:
raise ValueError('After cleaning/splitting, at least one group is '
'empty. Check x_test_lvl and values in bool_x.')
super().__init__(test=test,
control=control,
parametric=parametric,
alpha_level=alpha_level)
def __str__(self):
"""String representation of grouped summary statistics and test results.
Returns
-------
str
Formatted outcome name, summary, and test results.
"""
if self.parametric:
return (f'Outcome: {self.y.name}\n'
f'{self.parametric_summ_stats()}\n'
f'{self.t_test()}')
else:
return (f'Outcome: {self.y.name}\n'
f'{self.nonparametric_summ_stats()}\n'
f'{self.mwu_test()}')
ControlTestStats = namedtuple('ControlTestStats',
['control', 'test'])
[docs]
class MultiSeries1WayBGStats:
r"""Compare 3+ continuous series using parametric or nonparametric tests.
Class is intended for data formatted as 3+ ``pd.Series`` objects, with a
Series of continuous dependent variable (DV) data for each level of the
categorical independent variable (IV). This is commonly encountered when
there are separate DataFrames for each group/category, each of which has
a column for the same numeric outcome.
Supports asymptotic analysis of variance (Welch ANOVA) tests (parametric),
or Kruskal-Wallis tests (nonparametric).
Also implements post hoc testing with automatic *p*-value correction for
multiple comparisons (Holm-Bonferroni method by default).
* parametric post hoc tests via pairwise Welch t-tests.
* nonparametric post hoc tests via pairwise Mann-Whitney U-tests.
Parameters
----------
*data : pd.Series | pd.DataFrame
Series of outcome data is passed for each category/level. Series will be
converted to DataFrame columns in the order they are passed. If present,
Series names will be appended with an index value (``__0``, ``__1``,
etc.); unnamed Series will be named with their equivalent integer index.
A **single** DataFrame can also be passed as an argument, in which case
column names will be used as Series names.
parametric : bool, default True
If True, use ANOVA; otherwise, Kruskal-Wallis.
alpha_level : float, default 0.05
Significance level for confidence intervals and tests.
**named_data : pd.Series | pd.DataFrame
Keyword arguments may be used to override automatic, index-based
Series names (e.g., ``new_name1=series1, new_name2=series2``, etc.).
``data`` Is a protected keyword, reserved for passing a single DataFrame
as ``data=df``.
Attributes
----------
data : pd.DataFrame
All passed Series, concatenated as columns, with reassigned unique
column names.
parametric : bool
Whether to use parametric or nonparametric methods.
alpha : float
Significance level.
See Also
--------
MultiSample1WayBGStats :
Same tests, starting from a categorical grouping column (x) and a
numeric outcome column (y). Useful when all data is derived from a
single DataFrame.
Notes
-----
Series may be passed **either** as positional arguments (``*data``)**, or as
named keyword arguments (``**named_data``), **but not both**. If all Series
are contained as a single DataFrame, the DataFrame may be passed either
as a ``data=df`` keyword argument (preferred for clarity), or as the lone
``*data`` argument. For this reason, ``data`` is a protected name for
``**named_data`` keyword arguments.
"""
def __init__(self,
*data: pd.Series | pd.DataFrame,
parametric: bool = True,
alpha_level: float = .05,
**named_data: pd.Series | pd.DataFrame):
self._input_data: tuple = data
self._input_named_data: dict = named_data
self._series_list: list[pd.Series] = self._parse_input_data()
self._check_duplicate_series_name() # Error checking, no returns
self._series_list = [series.dropna().reset_index(drop=True)
for series in self._series_list]
self.parametric: bool = parametric
self.alpha: float = alpha_level
@property
def data(self) -> pd.DataFrame:
return pd.concat(
objs=self._series_list,
axis='columns',
)
def __str__(self):
with pd.option_context('display.max_colwidth', None,
'max_colwidth', None,
'display.width', 256):
if self.parametric:
output_str = (
f'{self.parametric_summ_stats().to_string()}\n'
f'{self.anova().to_string()}'
)
if self._omnibus_sig:
output_str += f'\n{self.pairwise_t().to_string()}'
else:
output_str = (
f'{self.nonparametric_summ_stats().to_string()}\n'
f'{self.kruskal_wallis().to_string()}'
)
if self._omnibus_sig:
output_str += f'\n{self.pairwise_mwu().to_string()}'
return output_str
[docs]
def conf_int(self,
pct_ci: float = None,
dist: Literal['t', 'normal', 'z'] = 't') -> pd.DataFrame:
r"""Calculate CI of mean of all columns in `data`, based on SEM.
``pct_ci`` can be used to custom-define a CI width. By default,
``1 - alpha`` is used, as set in the TwoSeriesStats object.
Parameters
----------
pct_ci : float, optional
Confidence level (e.g., ``0.95`` for 95%). Defaults to
``1 - alpha``.
dist: {'t', 'normal'}, optional
Parent distribution to use when calculating SEM. Defaults to
``'t'``.
Returns
-------
pd.DataFrame
Column names match ``data.columns``;
``index=['ci_lower', 'ci_upper']``.
Notes
-----
SEM is always calculated using sample SD (1 DoF). CI can be calculated
using either the normal (Z) distribution or Student's t-distribution;
the latter will give slightly wider CI, all else being equal.
For small N, Gurland & Trepathi (1971) report that CI is too narrow by
25% for N=2, and ~5% for N=6, and provide an equation for calculating CI
underestimation. Sokal & Rohlf (1981) give a formula for a correction
factor to calculate unbiased CI for N < 20, which is not implemented
in this method.
For N > 100, Student's t-distribution and Gaussian normal distribution
are approximately equivalent. Nonetheless, t-distribution is the default
form used here.
Rather than considering correction factors, our recommendation for small
N, or to define CI about a measure of central tendency other than the
sample mean, is to use a bootstrapped CI, as implemented in the
:doc:`resampling <resampling>` module. However, a bootstrapping class
has not yet been implemented for multiclass (3+ level) cases.
"""
if pct_ci is None:
pct_ci = 1 - self.alpha
res_dict = {}
if dist == 't':
for col in self.data.columns:
res_dict[col] = stats.t.interval(
pct_ci,
df=self.data[col].count() - 1,
loc=self.data[col].mean(),
scale=(
self.data[col].std(ddof=1)
/ np.sqrt(self.data[col].count())
)
)
elif dist in ('normal', 'z'):
for col in self.data.columns:
res_dict[col] = stats.norm.interval(
pct_ci,
loc=self.data[col].mean(),
scale=(
self.data[col].std(ddof=1)
/ np.sqrt(self.data[col].count())
)
)
else:
raise ValueError("`dist` can be: {'t', 'normal', 'z'}.")
return pd.DataFrame(data=res_dict, index=['ci_lower', 'ci_upper'])
[docs]
def parametric_summ_stats(self, alpha_level: float = None) -> pd.DataFrame:
r"""Compute parametric summary statistics (mean, std, CI, etc.).
Parameters
----------
alpha_level : float, optional
Significance level for CI. Defaults to :attr:`alpha`.
Returns
-------
pd.DataFrame
Summary statistics for each column in :attr:`data`.
"""
if alpha_level is None:
alpha_level = self.alpha
ci_res: pd.DataFrame = self.conf_int(1 - alpha_level)
summ_stats = pd.DataFrame(
data={
col: {
'n': self.data[col].count(),
'mean': self.data[col].mean(),
'std': self.data[col].std(),
'min': self.data[col].min(),
'max': self.data[col].max(),
f'{100 * (1 - alpha_level):.0f}%_CI_lower': (
ci_res.at['ci_lower', col]
),
f'{100 * (1 - alpha_level):.0f}%_CI_upper': (
ci_res.at['ci_upper', col]
),
} for col in self.data.columns.tolist()
},
).astype('Float64')
return summ_stats
[docs]
def anova(self, equal_var: bool = False) -> pd.Series:
r"""Perform 1-way between-groups analysis of variance (ANOVA).
Defaults to Welch's ANOVA for heteroskedastic samples. Delacre et al.
(2019) recommends routine use of Welch's F-test for unequal variance
over Student's (Fisher's) method, rather than selective use of Welch's
test. The loss in power from Welch's vs. Student's test in cases of
equal variance is minimal, whereas the reduction in Type I error rate
from Welch's in cases of unequal variance is substantial; a strong
determination of homoskedasticity is often not simple.
Parameters
----------
equal_var : bool, default False
If True, assume equal variances (Fisher's ANOVA); otherwise, use
Welch's.
Returns
-------
pd.Series
Test results including F-statistic, degrees of freedom, and p-value.
"""
result = stats.f_oneway(
*[self.data[col].astype(float)
for col in self.data.columns],
equal_var=equal_var,
nan_policy='omit',
axis=0
)
output = pd.Series(
{
'F-statistic': result.statistic,
'dof_B': self.data.shape[1] - 1,
'dof_E': self.data.shape[0] - self.data.shape[1],
'dof_T': self.data.shape[0] - 1,
'p-value': result.pvalue,
}
).astype('Float64')
return output
[docs]
def nonparametric_summ_stats(self,
alpha_level: float = None) -> pd.DataFrame:
r"""Compute nonparametric summary statistics (quantiles, IQR).
Parameters
----------
alpha_level : float, optional
Significance level for hypothesis direction. Defaults to self.alpha.
Returns
-------
pd.DataFrame
Summary statistics with quantiles and IQR for each column in
:attr:`data`.
"""
if alpha_level is None:
alpha_level = self.alpha
quantiles = [0, 0.25, 0.5, 0.75, 1]
summ_stats = pd.DataFrame(
data={
col: {
'n': self.data[col].count(),
'min': self.data[col].quantile(quantiles).loc[0],
'q1': self.data[col].quantile(quantiles).loc[0.25],
'median': self.data[col].quantile(quantiles).loc[0.5],
'q3': self.data[col].quantile(quantiles).loc[0.75],
'max': self.data[col].quantile(quantiles).loc[1],
'iqr': (self.data[col].quantile(quantiles).loc[0.75]
- self.data[col].quantile(quantiles).loc[0.25]),
} for col in self.data.columns.tolist()
},
).astype('Float64')
return summ_stats
[docs]
def kruskal_wallis(self) -> pd.Series:
r"""Perform 1-way between-groups Kruskal-Wallis test.
Returns
-------
pd.Series
Test results including H-statistic, degrees of freedom, and p-value.
"""
result = stats.kruskal(
*[self.data[col].astype(float)
for col in self.data.columns],
nan_policy='omit',
axis=0,
)
output = pd.Series(
{
'H-statistic': result.statistic,
'dof': self.data.shape[1] - 1,
'p-value': result.pvalue,
}
).astype('Float64')
return output
def _parse_input_data(self) -> list[pd.Series]:
output_list: list[pd.Series] = []
def parse_input_df(input_df: pd.DataFrame) -> list[pd.Series]:
df_list: list[pd.Series] = []
# Ensure at least 2 columns, then assign columns to series_list
if input_df.shape[1] >= 2:
for col in input_df.columns:
df_list.append(input_df[col])
return df_list
else:
raise ValueError(
'If passing a single DataFrame as a positional `data` '
'arg or `data=df`kwarg, the DataFrame must have at '
'least 2 columns.'
)
# Ensure exactly 1 paradigm used
if self._input_data and self._input_named_data:
raise ValueError('Cannot pass both `data` & `named_data`.')
# If passing `data` as a kwarg: `data=pd.DataFrame()`
elif 'data' in self._input_named_data:
df: pd.DataFrame = self._input_named_data['data']
# Multiple named_data
if len(self._input_named_data) > 1:
ValueError(
'When using `data=df` argument to pass a DataFrame, '
'`*data` positional args must be omitted, and '
'no extra `**named_data` kwargs are permitted.'
)
elif isinstance(df, pd.DataFrame):
output_list = parse_input_df(df)
elif isinstance(df, pd.Series):
raise ValueError('Pass Series data via `*data` positional '
'args or via `**named_data` kwargs.')
else:
raise ValueError('`data=` must be pd.DataFrame or pd.Series.')
# If passing a single DataFrame to `data` as arg
elif (
# pd.DataFrame arg
(len(self._input_data) == 1)
and isinstance(self._input_data[0], pd.DataFrame)
):
output_list = parse_input_df(self._input_data[0])
# If using `data` args
elif self._input_data:
# Ensure at least 2 args
if len(self._input_data) >= 2:
for enum, series in enumerate(self._input_data):
# If `data` includes a DataFrame, ensure it only has 1 col
if isinstance(series, pd.DataFrame):
if series.shape[1] == 1:
series = series.iloc[:, 0]
else:
raise ValueError(
'When passing multiple args to `data`, '
'args must be pd.Series or single-column '
'pd.DataFrame.'
)
# Assign name/number to each series
if hasattr(series, 'name'):
series.name += f'__{enum}'
else:
series.name = enum
output_list.append(series)
else:
raise ValueError('When passing individual Series args to '
'`data`, at least 2 args must be passed.')
# If using `named_data` kwargs
elif self._input_named_data:
# Ensure at least 2 kwargs
if len(self._input_named_data) >= 2:
for name, series in self._input_named_data.items():
# If `data` includes a DataFrame, ensure only has 1 col
if isinstance(series, pd.DataFrame):
if series.shape[1] == 1:
series = series.iloc[:, 0]
else:
raise ValueError(
'When passing multiple kwargs to '
'`named_data`, args must be a Series or '
'single-column DataFrame.'
)
series.name = name
output_list.append(series)
else:
raise ValueError('When using `named_data`, at least 2 '
'kwargs must be passed.')
# If `data` & `named_data` left blank
else:
raise ValueError('Must pass either `data` or `named_data`.')
return output_list
def _check_duplicate_series_name(self) -> None:
series_names: list[str] = [series.name for series in self._series_list]
# Set will be shorter than list if & only if duplicate column names
if len(set(series_names)) < len(series_names):
duplicates = series_names.copy()
for name in set(series_names):
duplicates.remove(name)
duplicates = str(set(duplicates))[1:-1] # Remove {} wrapper
raise SeriesNameCollisionError(
'`data` cannot have duplicate column names.\n'
f'Duplicates: {duplicates}'
)
@property
def _omnibus_sig(self) -> bool:
if self.parametric:
p = self.anova()['p-value']
else:
p = self.kruskal_wallis()['p-value']
if p <= self.alpha:
return True
else:
return False
@property
def _pairwise_columns(self) -> list[tuple[str, str]]:
col_names: list[str] = self.data.columns.tolist()
return list(combinations(col_names, 2))
[docs]
def pairwise_t(self,
equal_var: bool = False,
p_corr_method: PCorrectionMethod = 'holm') -> pd.DataFrame:
r"""Perform post hoc pairwise t-tests.
Uses pairwise combinations of input Series, and calculates t-test for
each pair. By default, uses Welch t-test rather than Student t-test (see
:meth:`TwoSeriesStats.t_test()` for rationale).
*p*-Values for all pairwise tests then undergo correction for multiple
comparisons. By default, Holm-Bonferroni correction is used, but all
correction methods supported by
`statsmodels.stats.multitest.multipletests
<https://www.statsmodels.org/dev/generated/statsmodels.stats.multitest.multipletests.html>`_
are supported.
Parameters
----------
equal_var : bool, default False
If ``False``, use Welch t-test; otherwise, use Student t-test.
p_corr_method : str, default 'holm'
*P*-Value correction method. Cannot be ``None`` since *P*-value
correction is always indicated for multiple pairwise comparisons.
Returns
-------
pd.DataFrame
DataFrame with a ``pd.MultiIndex`` in ``(control, test)`` format.
Columns give *t*-statistic, Student DoF, calculated Welch DoF,
and uncorrected and corrected *P*-values.
"""
t_results: list[pd.Series] = []
# Run t-test for each column pair
for col_pair in self._pairwise_columns:
t_obj = TwoSeriesStats(
control=self.data[col_pair[0]],
test=self.data[col_pair[1]],
alpha_level=self.alpha,
parametric=True
)
# Add columns for control/test names
t_result: pd.Series = t_obj.t_test()
t_results.append(t_result)
t_results_df: pd.DataFrame = pd.concat(
objs=t_results,
axis='columns',
).T
t_results_df.index = pd.MultiIndex.from_tuples(
self._pairwise_columns,
names=['control', 'test'],
)
# Add column of corrected p-values
t_results_df['p_corr'] = multipletests(
pvals=t_results_df['p-value'],
alpha=self.alpha,
method=p_corr_method,
)[1]
return t_results_df
[docs]
def pairwise_mwu(self,
p_corr_method: PCorrectionMethod = 'holm') -> pd.DataFrame:
r"""Perform post hoc pairwise Mann-Whitney U-tests.
Uses pairwise combinations of input Series, and calculates Mann-Whitney
U-test for each pair.
*p*-Values for all pairwise tests then undergo correction for multiple
comparisons. By default, Holm-Bonferroni correction is used, but all
correction methods supported by
`statsmodels.stats.multitest.multipletests
<https://www.statsmodels.org/dev/generated/statsmodels.stats.multitest.multipletests.html>`_
are supported.
Parameters
----------
p_corr_method : str, default 'holm'
*P*-Value correction method. Cannot be None since *P*-value
correction is always indicated for multiple pairwise comparisons.
Returns
-------
pd.DataFrame
DataFrame with a ``pd.MultiIndex`` in ``(control, test)`` format.
Since *P*-value can reach significance even if medians are the same
between groups, ``'Ha'`` column denotes the direction of effect
when uncorrected *p* is significant.
Columns also give *U*-statistic, DoF, and uncorrected and corrected
*{*-values.
"""
mwu_results: list[pd.Series] = []
mwu_ha_map: dict[str, str] = {
'<': 'control > test',
'>': 'control < test',
'!=': 'control != test',
'=': 'control = test',
}
# Run t-test for each column pair
for col_pair in self._pairwise_columns:
mwu_obj = TwoSeriesStats(
control=self.data[col_pair[0]],
test=self.data[col_pair[1]],
alpha_level=self.alpha,
parametric=False
)
# Add column to show direction of effect
mwu_ha: str = mwu_ha_map[
mwu_obj.nonparametric_summ_stats().at['n', 'Ha']
]
mwu_result: pd.Series = mwu_obj.mwu_test()
mwu_result = pd.concat(
objs=[
pd.Series({'Ha': mwu_ha}),
mwu_result,
],
axis='rows',
)
mwu_results.append(mwu_result)
mwu_results_df: pd.DataFrame = pd.concat(
objs=mwu_results,
axis='columns',
).T
mwu_results_df.index = pd.MultiIndex.from_tuples(
self._pairwise_columns,
names=['control', 'test'],
)
# Add column of corrected p-values
mwu_results_df['p_corr'] = multipletests(
pvals=mwu_results_df['p-value'],
alpha=self.alpha,
method=p_corr_method,
)[1]
return mwu_results_df
[docs]
class MultiSample1WayBGStats(MultiSeries1WayBGStats):
r"""Compare continuous outcome across 3 groups defined by a categorical.
Extends :class:`MultiSeries1WayBGStats` to split a continuous Series by a
categorical grouping variable.
Parameters
----------
cat_x : pd.Series
Categorical variable defining groups. ``cat_x.dtype`` should be either
``pd.Categorical``, or convertible to ``pd.Categorical``. Predefined
categorical ordering will be retained automatically.
num_y : pd.Series
Continuous outcome variable.
cat_order : list[str], optional
If no categorical ordering is predefined in ``cat_x``, but this is
desired in the output. Should be a list of strings of all unique values
that appear in ``cat_x``.
parametric : bool, default True
If True, use ANOVA; otherwise, use Kruskal-Wallus.
alpha_level : float, default 0.05
Significance level for tests and CIs.
Attributes
----------
data : pd.DataFrame
Resulting DataFrame after removal of observations with nulls.
parametric : bool
Whether to use parametric or nonparametric methods.
alpha : float
Significance level.
See Also
--------
MultiSeries1WayBGStats :
Same tests, input style is simpler if data is formatted as separate
Series of numeric outcome data (for each group/level) without a
categorical grouping column.
"""
def __init__(self,
cat_x: pd.Series,
num_y: pd.Series,
cat_order: list[str] = None,
parametric: bool = True,
alpha_level: float = .05):
# Combine into a df
df = (
pd.concat([cat_x, num_y], axis='columns')
.dropna(how='any', axis='rows')
)
if len(cat_x.unique()) < 2:
raise ValueError('`cat_x` must contain at least 2 categories.')
# If not passing categorical ordering
if cat_order is None:
# Want to preserve native categorical ordering if cat_x was already
# an ordered categorical
if not is_categorical_dtype(df[cat_x.name].dtype):
df[cat_x.name] = df[cat_x.name].astype('category')
categories = df[cat_x.name].cat.categories.tolist()
# If passing categorical ordering
else:
# Ensure all used categories appear in cat_order
missing = (
set(df[cat_x.name].unique()) - set(cat_order)
- {float('nan'), None, pd.NA}
)
if missing:
raise ValueError(f'Categories in cat_x not found in cat_order: '
f'{missing}')
df[cat_x.name] = (
df[cat_x.name]
.astype(CategoricalDtype(categories=cat_order, ordered=True))
)
categories = cat_order
# Pivot to wide: columns = categories, values = num_y
wide_df = df.pivot(columns=cat_x.name, values=num_y.name)
# Warn/omit empty groups
observed = wide_df.columns
missing = set(categories) - set(observed)
if missing:
warnings.warn(f'Categories with no data after dropna '
f'(omitted): {missing}')
super().__init__(data=wide_df,
parametric=parametric,
alpha_level=alpha_level)