from typing import Optional, Literal
from collections.abc import Callable
from dataclasses import dataclass
import warnings
import numpy as np
import pandas as pd
from scipy import stats
from ._types import VectorLike
from .exceptions import (
ExperimentalWarning, SeriesNameCollisionError, SeriesNameCollisionWarning
)
################################################################################
# Bootstraps
################################################################################
type _BootstrapTestTypes = Literal['means', 'medians']
[docs]
@dataclass
class BootResult:
r"""Dataclass to store bootstrap resampling results.
Attributes
----------
obs_stat : float or int
Observed test statistic (difference of means or medians).
boot_mean : float or int
Mean of the bootstrap distribution.
boot_sem : float or int
Standard error of the mean from the bootstrap distribution.
boot_ci_lo : float or int
Lower bound of the bias-corrected accelerated (BCa) confidence interval.
boot_ci_hi : float or int
Upper bound of the BCa confidence interval.
p : float
P-value from bootstrap hypothesis test (experimental).
boot_ci_pct : float
Confidence level percentage for the CI.
boot_dist : np.ndarray
Bootstrap distribution array.
"""
obs_stat: float | int
boot_mean: float | int
boot_sem: float | int
boot_ci_lo: float | int
boot_ci_hi: float | int
p: float
boot_ci_pct: float
boot_dist: np.ndarray
def __str__(self):
r"""String representation of bootstrap results.
Returns
-------
str
Formatted string with observed statistic, bootstrap mean, SEM, CI,
and p-value.
"""
return f'''
Observed Test Stat: {self.obs_stat:.4f}
Mean of Bootstrap Dist.: {self.boot_mean:.4f}
SEM of Bootstrap Dist.: {self.boot_sem:.4f}
BCa {self.boot_ci_pct:.0%} CI: {self.boot_ci_lo:.4f} to {self.boot_ci_hi:.4f}
{f'P = {self.p:.5f}' if self.p is not None else 'No hypothesis testing performed.'}
'''
[docs]
class TwoSeriesBootstrap:
r"""Class for bootstrapping CI & hypothesis tests on two continuous series.
Supports difference of means or medians. Provides bootstrap distribution,
CIs, and optional experimental p-value from a bootstrapped hypothesis test.
Currently, by default, no hypothesis tests are performed nor p-values given.
Parameters
----------
test : pd.Series
Test group data.
control : pd.Series
Control group data.
test_type : {'means', 'medians'}, default 'means'
Type of test statistic.
alpha_level : float, default 0.05
Significance level for CI.
n_resamples : int, default 10_000
Number of bootstrap resamples.
rng : int, default 519
Random seed for reproducibility.
boot_p_value : bool, default False
If True, compute experimental bootstrap P-value.
Attributes
----------
test : pd.Series
Cleaned test data.
control : pd.Series
Cleaned control data.
alpha : float
Significance level.
n_resamples : int
Number of resamples.
rng : int
Random seed.
boot_result : BootResult or None
Stored bootstrap results.
Raises
------
ValueError
If test_type is invalid.
Notes
-----
Regarding the current WIP bootstrap hypothesis test, P-values are not
currently displayed by default, as controlled by the ``boot_p_value``
Boolean parameter.
Under Frequentist statistical philosophy, a P-value represents the
probability of getting results as-or-more-extreme than those observed, if
the null hypothesis is true (this is even the assumption for classical
calculated asymptotic hypothesis tests). In the context of bootstrap
statistics, this would mean that a bootstrap distribution should represent
the distribution of test statistics (mean, median, t-stat, U-stat) that
would be observed *under the null hypothesis (H0)*, and the p-value would be
the fraction of test statistic values in that bootstrapped distribution
which are at least as extreme as what was actually observed.
In contrast, when creating a bootstrap distribution to estimate CI or SEM
(the primary use of bootstrapping), one creates a bootstrap distribution
*under the assumption that the* **alternative hypothesis (Ha)** *is true*.
Looking at how these bootstrap classes display results, this is apparent:
we display bootstrapped means/CIs for the test statistic (difference in
group means/medians) between test and control groups. Another way of
phrasing a statistically significant difference in means (at the
:math:`\alpha = .05` significance level), for example, would be to say that
the 95% CI of difference in the means of the test and control samples does
NOT include zero. Since the null hypothesis (at least
in cases relevant to typical biostats) is that the means or medians are
*equal* in the test and control groups (or that the difference in the means/
medians is zero), the fact that the bootstrapped difference in means is not
zero here shows that these bootstrap distributions for CI are NOT bootstraps
under the null hypothesis. Indeed, when first making this module, that
mistake was made, and it was found that even when there was an obvious
difference between test vs. control groups, bootstrapped "p-values" taken
from the same distribution as bootstrapped CIs returns a p-value near 0.50;
to put it another way, this mistaken approach found that the observed
difference in means was typically almost exactly the same as the average
bootstrapped difference in means -- exactly what would be expected in a
distribution representing the alternative hypothesis.
While checking for overlapping CIs does indicate statistical significance in
a Boolean fashion, it cannot give p-values. Instead, to calculate p-values
via a bootstrapped methodology, in addition to the bootstrapped distribution
under the Ha (used to estimate CIs), one must also use some method to
bootstrap the null distribution in order to calculate p-value. Boos &
Stefanski (2013) :cite:`boos_11_6` suggest 2 methods, as summarized in this
`lecture by Alex Kaizer <https://www.alexkaizer.com/bios_6618/files/bios6618/W16/bootstrap_pvalue.pdf>`_:
1. Combine all test and control observations (e.g. Hgb value for both blunt
& penetrating trauma patients), then sample with replacement for the N in
both test and control groups (number of blunt/penetrating patients) to
create bootstrapped test and control samples.
* On average, this approach will create equal means in each group
* However, the intermingling of observations in each group means that if
observed groups do not have equal variance, they will end up with
equal variances during resampling
* This is the same issue encountered in Welch vs. Student t-tests
* This risks inflating Type I error rate if variances are unequal
2. Create separate *observed* distributions for test & control, where each
value represents how much the original value differed from its group mean.
Each groups' bootstrap sample is sampled with replacement from its own
group's mean-shifted sample.
* Both test and control groups now have identical group means of 0
(since the mean variation from the mean in a sample is 0, duh)
* This also preserves unequal variances
* This seems like a superior strategy, and will ultimately be
implemented for this module
* This method does require performing completely separate bootstrap
samples for both test & control groups, if and only if the goal is
to perform a hypothesis test and derive a p-value
Due to coding challenges, a hybrid approach has been taken in the interim
here, which will later be changed.
* We take the bootstrap-under-Ha distribution (as used for CIs), and then
shift it to be centered about the mean of the bootstrap distribution.
* From this, we find how many bootstrap samples had a mean at least as
extreme as what was observed. This is the p-value.
* Since the bootstrap samples still allow admixture of test & control
observations, this method is accurate, but still destroys any unequal
variances between test & control groups
* For this reason, this approach is not robust for deployment
In contrast to bootstrapping, permutation tests (sampling without
replacement, and then randomly assigning to test or control) generates
a distribution under the null hypothesis. For this reason, permutation
methods more easily generate a p-value, but are not (easily) fit to
generating a CI for the test statistic, in the way bootstrapping is. While
the acceptability of using both methods (i.e. bootstrap a CI and then
permute a p-value), it doesn't immediately seem like an objectionable choice
if both values are truly needed. This would probably be preferred even once
bootstrapped hypothesis tests are fully implemented, since even that would
require an entire separate resampling run. The only time when bootstrapped
hypothesis tests are truly necessary is when the permutation test assumption
(that test & control groups are exchangeable; that is, they have the same
distributions if H0 is true) is violated. The proposed bootstrapped
hypothesis testing method (#2 above) does not make this assumption.
"""
_use_parametric_summ_stats = {'means': True, 'medians': False}
def __init__(self,
test: pd.Series,
control: pd.Series,
test_type: _BootstrapTestTypes = 'means',
alpha_level: float = .05,
n_resamples: int = 10_000,
rng: Optional[int] = 519,
boot_p_value: bool = False):
# Rename test & control if column names are same
if test.name == control.name:
warnings.warn(
SeriesNameCollisionWarning(
'`test` and `control` have the same column name.'
),
stacklevel=2
)
test = test.rename(f'{test.name}_TEST')
control = control.rename(f'{test.name}_CTRL')
self.test = test.dropna().reset_index(drop=True)
self.control = control.dropna().reset_index(drop=True)
if test_type in self._use_parametric_summ_stats.keys():
self._test_type = test_type
self._test_means: bool = self._use_parametric_summ_stats[test_type]
else:
raise ValueError(
f"`test_type` must be one of "
f"{self._use_parametric_summ_stats.keys()}"
)
self.alpha = alpha_level
self._alt_hypothesis = '='
self.n_resamples = n_resamples
self.rng = rng
self.boot_result: Optional[BootResult] = None
# Can remove this attribute & argument once a robust bootstrap p-value
# method has been implemented. For now, here to reduce confusion unless
# p-value is truly desired
self._calculate_boot_p = boot_p_value
if self._calculate_boot_p:
# Warn about current bootstrap p-value method
warnings.warn(
ExperimentalWarning(
feature='Bootstrapped p-value',
message='''
Currently, the bootstrapped hypothesis test p-value for mean &
median uses the assumption of approximate translation
equivariance in order to create a bootstrapped null distribution
of the test stat. This method pools variance (like Student's
t-test) in test & control. When samples have unequal variance,
Type I error can somewhat inflate (i.e. p-value is overly
optimistic), especially in small or skewed null distributions.
Of note, this does NOT affect mean/median permutation tests,
and, if bootstrapped Welch's t or Mann-Whitney U-tests were
implemented, would not be affected either under current methods.
The current bootstrap p-value method is valid but suboptimal for
hypothesis testing, though all other bootstrapped stats
including CI follow best practices.
In summary, use bootstrapping methods to calculate estimated
distribution stats (pop. mean, SEM, CIs), but prefer permutation
tests for hypothesis tests (has more distribution assumptions,
so mean/SEM/CIs are difficult and not worth calculating. I have
not checked for if it's frowned upon to both bootstrap (for
distribution stats) & permute (for p-value); I see no obvious
issue, but no clear benefit beyond more numbers to throw in a
manuscript.
Further explanation & references for will be available in
TwoSeriesBootstrap.bootstrap docstring via:
`instance_name.bootstrap.__doc__`
- or -
`help(TwoSeriesBootstrap.bootstrap)`
'''
),
stacklevel=2
)
@property
def results(self):
# Ensure bootstrapping has been run before printing
if self.boot_result is None:
self.bootstrap()
return (
f'{str(self.summ_stats)}\n'
f'''
Bootstrapped Hypothesis Test
* Test Stat: Difference of {self._test_type.capitalize()}
* Test Group: {self.test.name}
* Control Group: {self.control.name}
{str(self.boot_result)}
'''
)
@property
def summ_stats(self):
if self._test_means:
return self._parametric_summ_stats()
else:
return self._nonparametric_summ_stats()
def _parametric_summ_stats(self):
summ_stats = pd.DataFrame(
data={
self.test.name: {
'n': self.test.count(),
'mean': self.test.mean(),
'std': self.test.std(),
'min': self.test.min(),
'max': self.test.max(),
},
'Ha': self._alt_hypothesis,
self.control.name: {
'n': self.control.count(),
'mean': self.control.mean(),
'std': self.control.std(),
'min': self.control.min(),
'max': self.control.max(),
},
},
)
return summ_stats
def _nonparametric_summ_stats(self):
q_test = self.test.quantile([0, 0.25, 0.5, 0.75, 1])
q_control = self.control.quantile([0, 0.25, 0.5, 0.75, 1])
summ_stats = pd.DataFrame(
data={
self.test.name: {
'n': self.test.count(),
'min': q_test.loc[0],
'q1': q_test.loc[0.25],
'median': q_test.loc[0.5],
'q3': q_test.loc[0.75],
'max': q_test.loc[1],
'iqr': q_test.loc[0.75] - q_test.loc[0.25],
},
'Ha': self._alt_hypothesis,
self.control.name: {
'n': self.control.count(),
'min': q_control.loc[0],
'q1': q_control.loc[0.25],
'median': q_control.loc[0.5],
'q3': q_control.loc[0.75],
'max': q_control.loc[1],
'iqr': q_control.loc[0.75] - q_control.loc[0.25],
},
},
)
return summ_stats
[docs]
def bootstrap(self, n_resamples: int = None):
r"""Perform bootstrap resampling.
Parameters
----------
n_resamples : int, optional
Number of resamples. Defaults to self.n_resamples.
Returns
-------
BootResult
Bootstrap results.
Notes
-----
Uses BCa method for confidence intervals. P-value is experimental.
Updates hypothesis direction in summary stats if p < alpha.
"""
# Default to using self.n_resamples
n_resamples = n_resamples or self.n_resamples
# Use correct test statistic function
if self._test_means:
test_stat_func = self.mean_diff
else:
test_stat_func = self.median_diff
boot_result = stats.bootstrap(
data=(self.test, self.control),
statistic=test_stat_func,
n_resamples=n_resamples,
vectorized=True,
axis=0,
paired=False,
confidence_level=1 - self.alpha,
alternative='two-sided',
method='BCa',
rng=self.rng
)
# .item() ensures test stat returned 1 and only 1 value
obs_stat = test_stat_func(self.test, self.control)
# Store returned bootstrap distribution
boot_dist = boot_result.bootstrap_distribution
# Mean & SEM of bootstrap distribution
boot_mean = boot_dist.mean()
boot_sem = boot_result.standard_error
# confidence_level CI of the bootstrap distribution
boot_ci_lo, boot_ci_hi = boot_result.confidence_interval
if self._calculate_boot_p:
p_val = (
# Count of bootstrap stats >= positive value of observed stat
((boot_dist - boot_dist.mean()) >= np.abs(obs_stat)).sum()
# Count of bootstrap stats <= negative value of observed stat
+ ((boot_dist - boot_dist.mean()) <= -np.abs(obs_stat)).sum()
# +1 to numerator & denominator makes p-value bit more conservative
+ 1
) / (n_resamples + 1)
# Add results to stored attribute
self.boot_result = BootResult(
obs_stat=obs_stat,
boot_mean=boot_mean,
boot_sem=boot_sem,
boot_ci_lo=boot_ci_lo,
boot_ci_hi=boot_ci_hi,
p=p_val if self._calculate_boot_p else None,
boot_ci_pct=1 - self.alpha,
boot_dist=boot_dist,
)
# If p-value is significant, indicate direction in summ_stats
if self._calculate_boot_p and (self.boot_result.p is not None):
if self.boot_result.p < self.alpha:
if self.boot_result.obs_stat > 0:
self._alt_hypothesis = '>'
elif self.boot_result.obs_stat < 0:
self._alt_hypothesis = '<'
else:
warnings.warn(
'Significant P value but no between-groups difference in '
'test statistic observed. Verify data/results.'
)
self._alt_hypothesis = '='
else:
self._alt_hypothesis = '='
return self.boot_result
[docs]
@staticmethod
def mean_diff(
sample_a: VectorLike,
sample_b: VectorLike,
axis: Literal[0, 1, 'rows', 'columns'] = 0
) -> float | np.float64:
r"""Compute difference of means.
Parameters
----------
sample_a : VectorLike
First sample.
sample_b : VectorLike
Second sample.
axis : {0, 1, 'rows', 'columns'}, optional
Axis to compute along. Defaults to 0.
Returns
-------
float or np.float64
Difference of means.
"""
axis = {'rows': 0, 'columns': 1}.get(axis, axis)
# Convert inputs once (this fixes the 'unused variable' issue)
a = np.asarray(sample_a)
b = np.asarray(sample_b)
# Calculate absolute difference in means
return a.mean(axis=axis) - b.mean(axis=axis)
[docs]
class TwoSampleBootstrap(TwoSeriesBootstrap):
r"""Class for bootstrapping with a boolean grouping variable.
Extends TwoSeriesBootstrap to split a continuous series by a boolean group.
Parameters
----------
bool_x : pd.Series
Boolean grouping variable.
num_y : pd.Series
Continuous outcome variable.
test_type : {'means', 'medians'}, optional
Test statistic type. Defaults to 'means'.
alpha_level : float, optional
Significance level. Defaults to 0.05.
n_resamples : int, optional
Number of resamples. Defaults to 10,000.
rng : int, optional
Random seed. Defaults to 519.
x_test_lvl : bool, optional
Boolean level for test group. Defaults to True.
boot_p_value : bool, optional
Compute experimental p-value. Defaults to False.
Attributes
----------
x : pd.Series
Grouping variable.
y : pd.Series
Outcome variable.
Raises
------
ValueError
If group names collide or groups are empty after splitting.
Notes
-----
Regarding the current WIP bootstrap hypothesis test, p-values are not
currently displayed by default, as controlled by the ``boot_p_value``
Boolean parameter.
Under Frequentist statistical philosophy, a p-value represents the
probability of getting results as-or-more-extreme than those observed, if
the null hypothesis is true (this is even the assumption for classical
calculated asymptotic hypothesis tests). In the context of bootstrap
statistics, this would mean that a bootstrap distribution should represent
the distribution of test statistics (mean, median, t-stat, U-stat) that
would be observed *under the null hypothesis (H0)*, and the p-value would be
the fraction of test statistic values in that bootstrapped distribution
which are at least as extreme as what was actually observed.
In contrast, when creating a bootstrap distribution to estimate CI or SEM
(the primary use of bootstrapping), one creates a bootstrap distribution
*under the assumption that the* **alternative hypothesis (Ha)** *is true*.
Looking at how these bootstrap classes display results, this is apparent:
we display bootstrapped means/CIs for the test statistic (difference in
group means/medians) between test and control groups. Another way of
phrasing a statistically significant difference in means (at the
:math:`\alpha = .05` significance level), for example, would be to say that
the 95% CI of difference in the means of the test and control samples does
NOT include zero. Since the null hypothesis (at least
in cases relevant to typical biostats) is that the means or medians are
*equal* in the test and control groups (or that the difference in the means/
medians is zero), the fact that the bootstrapped difference in means is not
zero here shows that these bootstrap distributions for CI are NOT bootstraps
under the null hypothesis. Indeed, when first making this module, that
mistake was made, and it was found that even when there was an obvious
difference between test vs. control groups, bootstrapped "p-values" taken
from the same distribution as bootstrapped CIs returns a p-value near 0.50;
to put it another way, this mistaken approach found that the observed
difference in means was typically almost exactly the same as the average
bootstrapped difference in means -- exactly what would be expected in a
distribution representing the alternative hypothesis.
While checking for overlapping CIs does indicate statistical significance in
a Boolean fashion, it cannot give p-values. Instead, to calculate p-values
via a bootstrapped methodology, in addition to the bootstrapped distribution
under the Ha (used to estimate CIs), one must also use some method to
bootstrap the null distribution in order to calculate p-value. Boos &
Stefanski (2013) :cite:`boos_11_6` suggest 2 methods, as summarized in this
`lecture by Alex Kaizer <https://www.alexkaizer.com/bios_6618/files/bios6618/W16/bootstrap_pvalue.pdf>`_:
1. Combine all test and control observations (e.g. Hgb value for both blunt
& penetrating trauma patients), then sample with replacement for the N in
both test and control groups (number of blunt/penetrating patients) to
create bootstrapped test and control samples.
* On average, this approach will create equal means in each group.
* However, the intermingling of observations in each group means that if
observed groups do not have equal variance, they will end up with
equal variances during resampling.
* This is the same issue encountered in Welch vs. Student t-tests.
* This risks inflating Type I error rate if variances are unequal.
2. Create separate *observed* distributions for test & control, where each
value represents how much the original value differed from its group mean.
Each groups' bootstrap sample is sampled with replacement from its own
group's mean-shifted sample.
* Both test and control groups now have identical group means of 0
(since the mean variation from the mean in a sample is 0).
* This also preserves unequal variances.
* This seems like a superior strategy, and will ultimately be
implemented for this module.
* This method does require performing completely separate bootstrap
samples for both test & control groups, if and only if the goal is
to perform a hypothesis test and derive a p-value.
Due to coding challenges, a hybrid approach has been taken in the interim
here, which will later be changed.
* We take the bootstrap-under-Ha distribution (as used for CIs), and then
shift it to be centered about the mean of the bootstrap distribution.
* From this, we find how many bootstrap samples had a mean at least as
extreme as what was observed. This is the p-value.
* Since the bootstrap samples still allow admixture of test & control
observations, this method is accurate, but still destroys any unequal
variances between test & control groups.
* For this reason, this approach is not robust for deployment.
In contrast to bootstrapping, permutation tests (sampling without
replacement, and then randomly assigning to test or control) generates
a distribution under the null hypothesis. For this reason, permutation
methods more easily generate a p-value, but are not (easily) fit to
generating a CI for the test statistic, in the way bootstrapping is. While
the acceptability of using both methods (i.e. bootstrap a CI and then
permute a p-value), it doesn't immediately seem like an objectionable choice
if both values are truly needed. This would probably be preferred even once
bootstrapped hypothesis tests are fully implemented, since even that would
require an entire separate resampling run. The only time when bootstrapped
hypothesis tests are truly necessary is when the permutation test assumption
(that test & control groups are exchangeable; that is, they have the same
distributions if H0 is true) is violated. The proposed bootstrapped
hypothesis testing method (#2 above) does not make this assumption.
"""
def __init__(self,
bool_x: pd.Series, num_y: pd.Series,
test_type: _BootstrapTestTypes = 'means',
alpha_level: float = .05,
n_resamples: int = 10_000,
rng: Optional[int] = 519,
x_test_lvl: bool = True,
boot_p_value: bool = False):
# Ensure names exist and are not the same
bool_x = bool_x.rename(bool_x.name or 'x')
num_y = num_y.rename(num_y.name or 'y')
if bool_x.name == num_y.name:
raise ValueError('Both bool_x and num_y cannot have same names.')
# Concat into a df in order to .dropna, then define x & y series
self._df = (
pd.concat(
[
bool_x.astype('boolean'),
num_y,
],
axis='columns'
)
.dropna(axis='index')
)
self.x = self._df[bool_x.name]
self.y = self._df[num_y.name]
# Build mask to slice test/control base on x_test_lvl
self._test_x = bool(x_test_lvl) # Ensure is type bool
mask = (self.x == self._test_x).astype(bool) # ensure safe masking
# self.test is value of y where x == x_test_lvl
test = (
self.y[mask]
.rename(f'{self.x.name}_{str(self._test_x).upper()}')
)
control = (
self.y[~mask]
.rename(f'{self.x.name}_{str((not self._test_x)).upper()}')
)
if test.empty or control.empty:
raise ValueError('After cleaning/splitting, at least one group is '
'empty. Check x_test_lvl and values in bool_x.')
super().__init__(test=test,
control=control,
test_type=test_type,
alpha_level=alpha_level,
n_resamples=n_resamples,
rng=rng,
boot_p_value=boot_p_value)
def __str__(self):
r"""String representation of bootstrap results with outcome name.
Returns
-------
str
Formatted outcome, summary stats, and bootstrap results.
"""
# Ensure bootstrapping has been run before printing
if self.boot_result is None:
self.bootstrap()
return (
f'Outcome: {self.y.name}\n\n'
f'{str(self.summ_stats)}\n'
f'''
Bootstrapped Hypothesis Test
* Test Stat: Difference of {self._test_type.capitalize()}
* Test Group: {self.test.name}
* Control Group: {self.control.name}
{str(self.boot_result)}
'''
)
################################################################################
# Permutations
################################################################################
type _PermutationTestTypes = Literal['means', 'medians', 't_welch', 'mwu']
[docs]
@dataclass
class PermResult:
r"""Dataclass to store permutation test results.
Attributes
----------
obs_stat : float or int
Observed test statistic.
perm_dist : np.ndarray
Permutation null distribution.
p : float
P-value from permutation test.
"""
obs_stat: float | int
p: float
perm_dist: np.ndarray
def __str__(self):
r"""String representation of permutation results.
Returns
-------
str
Formatted observed statistic, mean/SEM of permutation distribution,
and p-value.
"""
return f'''
Observed Test Stat: {self.obs_stat:.4f}
P = {self.p:.5f}
'''
[docs]
class TwoSeriesPermutation:
r"""Class for permutation hypothesis tests on two continuous series.
Supports difference of means/medians, Welch's t, or Mann-Whitney U.
Parameters
----------
test : pd.Series
Test group data.
control : pd.Series
Control group data.
test_type : {'means', 'medians', 't_welch', 'mwu'}, optional
Test statistic type. Defaults to 'means'.
alpha_level : float, optional
Significance level. Defaults to 0.05.
n_resamples : int, optional
Number of permutations. Defaults to 10,000.
rng : int, default 519
Random seed.
Attributes
----------
test : pd.Series
Cleaned test data.
control : pd.Series
Cleaned control data.
test_type : str
Selected test type.
alpha : float
Significance level.
n_resamples : int
Number of resamples.
rng : int
Random seed.
perm_result : PermResult or None
Stored permutation results.
Raises
------
ValueError
If test_type is invalid.
"""
_test_display_names = {
'means': 'Difference of means',
'medians': 'Difference of medians',
't_welch': 'Welch t-statistic',
'mwu': 'M-W U-statistic',
}
_tests_using_parametric_summ_stats = ['means', 't_welch']
_tests_using_nonparametric_summ_stats = ['medians', 'mwu']
def __init__(self,
test: pd.Series,
control: pd.Series,
test_type: _PermutationTestTypes = 'means',
alpha_level: float = .05,
n_resamples: int = 10_000,
rng: Optional[int] = 519):
# Rename test & control if column names are same
if test.name == control.name:
warnings.warn(
SeriesNameCollisionWarning(
'`test` and `control` have the same column name.'
),
stacklevel=2
)
test = test.rename(f'{test.name}_TEST')
control = control.rename(f'{test.name}_CTRL')
self.test = test.dropna().reset_index(drop=True)
self.control = control.dropna().reset_index(drop=True)
if test_type in self._test_display_names.keys():
self.test_type = test_type
else:
raise ValueError(
f"`test_type` must be one of {self._test_display_names.keys()}."
)
self.alpha = alpha_level
self._alt_hypothesis = '='
self.n_resamples = n_resamples
self.rng = rng
self.perm_result: Optional[PermResult] = None
@property
def summ_stats(self):
r"""Get summary statistics based on test type.
Returns
-------
pd.DataFrame
Parametric or nonparametric summary.
"""
if self.test_type in self._tests_using_parametric_summ_stats:
return self._parametric_summ_stats()
# Explicitly want to default to nonparametric summary stats
else:
return self._nonparametric_summ_stats()
@property
def test_stat_func(self) -> Callable:
r"""Get the test statistic function based on test_type.
Returns
-------
Callable
Function to compute test statistic.
"""
func_map = {
'means': self.mean_diff,
'medians': self.median_diff,
't_welch': self.t_welch,
'mwu': self.mwu,
}
return func_map[self.test_type]
def _parametric_summ_stats(self):
r"""Compute parametric summary statistics.
Returns
-------
pd.DataFrame
Summary with n, mean, std, min, max, Ha indicator.
"""
summ_stats = pd.DataFrame(
data={
self.test.name: {
'n': self.test.count(),
'mean': self.test.mean(),
'std': self.test.std(),
'min': self.test.min(),
'max': self.test.max(),
},
'Ha': self._alt_hypothesis,
self.control.name: {
'n': self.control.count(),
'mean': self.control.mean(),
'std': self.control.std(),
'min': self.control.min(),
'max': self.control.max(),
},
},
)
return summ_stats
def _nonparametric_summ_stats(self):
r"""Compute nonparametric summary statistics.
Returns
-------
pd.DataFrame
Summary with n, min, q1, median, q3, max, iqr, Ha indicator.
"""
q_test = self.test.quantile([0, 0.25, 0.5, 0.75, 1])
q_control = self.control.quantile([0, 0.25, 0.5, 0.75, 1])
summ_stats = pd.DataFrame(
data={
self.test.name: {
'n': self.test.count(),
'min': q_test.loc[0],
'q1': q_test.loc[0.25],
'median': q_test.loc[0.5],
'q3': q_test.loc[0.75],
'max': q_test.loc[1],
'iqr': q_test.loc[0.75] - q_test.loc[0.25],
},
'Ha': self._alt_hypothesis,
self.control.name: {
'n': self.control.count(),
'min': q_control.loc[0],
'q1': q_control.loc[0.25],
'median': q_control.loc[0.5],
'q3': q_control.loc[0.75],
'max': q_control.loc[1],
'iqr': q_control.loc[0.75] - q_control.loc[0.25],
},
},
)
return summ_stats
[docs]
def permute(self, n_resamples: int = None):
r"""Perform permutation test.
Parameters
----------
n_resamples : int, optional
Number of permutations. Defaults to self.n_resamples.
Returns
-------
PermResult
Permutation results.
Notes
-----
Uses 'independent' permutation type, and does not support paired
samples. Updates Ha indicator in summ_stats if p < alpha.
"""
# Default to using self.n_resamples
n_resamples = n_resamples or self.n_resamples
perm_result = stats.permutation_test(
data=(self.test, self.control),
statistic=self.test_stat_func,
permutation_type='independent',
n_resamples=n_resamples,
vectorized=True,
axis=0,
alternative='two-sided',
rng=self.rng
)
# Save output to attribute
self.perm_result = PermResult(
obs_stat=perm_result.statistic.item(),
perm_dist=perm_result.null_distribution,
p=perm_result.pvalue.item()
)
# If p-value is significant, indicate direction in summ_stats
if self.perm_result.p < self.alpha:
if self.perm_result.obs_stat > 0:
self._alt_hypothesis = '>'
elif self.perm_result.obs_stat < 0:
self._alt_hypothesis = '<'
else:
warnings.warn(
'Significant P value but no between-groups difference in '
'test statistic observed. Verify data/results.'
)
self._alt_hypothesis = '='
else:
self._alt_hypothesis = '='
return self.perm_result
[docs]
@staticmethod
def mean_diff(
sample_a: VectorLike,
sample_b: VectorLike,
axis: Literal[0, 1, 'rows', 'columns'] = 0
):
r"""Compute difference of means for permutation.
Parameters
----------
sample_a : VectorLike
First permuted sample.
sample_b : VectorLike
Second permuted sample.
axis : {0, 1, 'rows', 'columns'}, optional
Axis to compute. Defaults to 0.
Returns
-------
float or np.float64
Difference of means.
"""
axis = {'rows': 0, 'columns': 1}.get(axis, axis)
# Convert inputs once (this fixes the 'unused variable' issue)
a = np.asarray(sample_a)
b = np.asarray(sample_b)
# Calculate absolute difference in means
return a.mean(axis=axis) - b.mean(axis=axis)
[docs]
@staticmethod
def t_welch(
sample_a: VectorLike,
sample_b: VectorLike,
axis: Literal[0, 1, 'rows', 'columns'] = 0
):
r"""Calculate Welch's 2-independent t-stat for a single permuted sample.
Takes in observations that have been randomly permuted into test &
control samples, called sample_a & sample_b, respectively (to
distinguish from actual observed test/control), and returns the
t-statistic.
Parameters
----------
sample_a, sample_b : pd.Series or array_like
A permuted set of observations for test or control sample,
respectively.
axis : {0, 1, 'rows', 'columns'}, default=0
Axis of sample_a & sample_b on which to perform t-test; passed to
`stats.ttest_ind()`.
Returns
-------
float or NumPy float
Welch's t-statistic for the permuted sample.
Notes
-----
Computes an asymptotic Welch's t-test (`equal_var=False`; correction for
heteroskedasticity), which is repeated for each permuted sample. Since
unequal-varianced samples violates the exchangeability assumption for
permutation testing, the permuted p-value is not exact for finite N,
thus Type I error is not guaranteed. However, Janssen & Pauls (2005)
Monte Carlo simulation study found that in samples with unequal
variance, permuted Welch t-tests still control Type I error better than
both standard asymptotic Welch's t-test, and better than permuted
Student's t-test.
Of note, passed `PermutationMethod` instance to `method` would use
`stats.permutation_test` to compute identical results, since
`self.permute()` just runs `stats.permutation_test` using
`self.t_welch()` as the test statistic. In later releases, permutation
t- & U-tests will be implemented using native SciPy method, but for now,
all permutation stats methods are being kept housed in this class, and
implementing the native SciPy method would be complex and temporary.
"""
axis = {'rows': 0, 'columns': 1}.get(axis, axis)
# Convert inputs once (this fixes the 'unused variable' issue)
a = np.asarray(sample_a)
b = np.asarray(sample_b)
# Run Welch's t-test & extract t-statistic
test = stats.ttest_ind(
a=a, b=b,
axis=axis,
equal_var=False,
alternative='two-sided',
)
return test.statistic
[docs]
@staticmethod
def mwu(
sample_a: VectorLike,
sample_b: VectorLike,
axis: Literal[0, 1, 'rows', 'columns'] = 0
):
r"""Calculate Mann-Whitney U test for a single permuted sample.
Takes in observations that have been randomly permuted into test &
control samples, called sample_a & sample_b, respectively (to
distinguish from actual observed test/control), and returns the Mann-
Whitney U-statistic.
Parameters
----------
sample_a : VectorLike
permuted observations assigned to test sample
sample_b : VectorLike
permuted observations assigned to control sample
axis : {0, 1, 'rows', 'columns'}, default=0
axis to compute MWU test; passed to stats.mannwhitneyu
Returns
-------
float or NumPy float
U-statistic for the permuted sample.
Notes
-----
Computes a standard non-resampling M-W U-test, which is repeated for
each permuted sample. Per SciPy docs, `method='auto'` computes an
exact test if 8 or fewer n_obs in either sample_a or sample_b.
Of note, passed `PermutationMethod` instance to `method` would use
`stats.permutation_test` to compute identical results, since
`self.permute()` just runs `stats.permutation_test` using `self.mwu()`
as the test statistic. In later releases, permutation t- & U-tests will
be implemented using native SciPy method, but for now, all permutation
stats methods are being kept housed in this class, and implementing
the native SciPy method here would be complex and temporary.
"""
axis = {'rows': 0, 'columns': 1}.get(axis, axis)
# Convert to NDArray)
a = np.asarray(sample_a)
b = np.asarray(sample_b)
# Run Mann-Whitney U-test
test = stats.mannwhitneyu(
x=a, y=b,
use_continuity=True,
alternative='two-sided',
axis=axis,
method='auto',
)
return test.statistic
def __str__(self):
r"""String representation of permutation results.
Runs permute if not done.
Returns
-------
str
Formatted summary and permutation results.
"""
# Ensure permutation has been run before printing
if self.perm_result is None:
self.permute()
return (
f'{str(self.summ_stats)}\n'
f'''
Permutation Hypothesis Test
* Test Stat: {self._test_display_names[self.test_type]}
* Test Group: {self.test.name}
* Control Group: {self.control.name}
{str(self.perm_result)}
'''
)
[docs]
class TwoSamplePermutation(TwoSeriesPermutation):
r"""Class for permutation tests with a boolean grouping variable.
Extends TwoSeriesPermutation to split continuous series by boolean group.
Parameters
----------
bool_x : pd.Series
Boolean grouping.
num_y : pd.Series
Continuous outcome.
test_type : {'means', 'medians', 't_welch', 'mwu'}, default 'means'
Test type.
alpha_level : float, default 0.05
Significance level.
n_resamples : int, default 10_000
Permutations..
rng : int, default 519
Random seed.
x_test_lvl : bool, default True
Test level.
Attributes
----------
x : pd.Series
Grouping.
y : pd.Series
Outcome.
Raises
------
SeriesNameCollisionError
If names collide.
ValueError
If groups empty.
"""
def __init__(self,
bool_x: pd.Series, num_y: pd.Series,
test_type: _PermutationTestTypes = 'means',
alpha_level: float = .05,
n_resamples: int = 10_000,
rng: Optional[int] = 519,
x_test_lvl: bool = True):
# Ensure names exist and are not the same
bool_x = bool_x.rename(bool_x.name or 'x')
num_y = num_y.rename(num_y.name or 'y')
if bool_x.name == num_y.name:
raise SeriesNameCollisionError('Both bool_x and num_y cannot '
'have same names.')
# Concat into a df in order to .dropna, then define x & y series
self._df = (
pd.concat(
[
bool_x.astype('boolean'),
num_y,
],
axis='columns'
)
.dropna(axis='index')
)
self.x = self._df[bool_x.name]
self.y = self._df[num_y.name]
# Build mask to slice test/control base on x_test_lvl
self._test_x = bool(x_test_lvl) # Ensure is type bool
mask = (self.x == self._test_x).astype(bool) # ensure safe masking
# self.test is value of y where x == x_test_lvl
test = (
self.y[mask]
.rename(f'{self.x.name}_{str(self._test_x).upper()}')
)
control = (
self.y[~mask]
.rename(f'{self.x.name}_{str((not self._test_x)).upper()}')
)
if test.empty or control.empty:
raise ValueError('After cleaning/splitting, at least one group is '
'empty. Check x_test_lvl and values in bool_x.')
super().__init__(test=test,
control=control,
test_type=test_type,
alpha_level=alpha_level,
n_resamples=n_resamples,
rng=rng)
def __str__(self):
r"""String representation with outcome name.
Returns
-------
str
Formatted outcome, summary, permutation results.
"""
# Ensure bootstrapping has been run before printing
if self.perm_result is None:
self.permute()
return (
f'Outcome: {self.y.name}\n\n'
f'{str(self.summ_stats)}\n'
f'''
Permutation Hypothesis Test
* Test Stat: {self._test_display_names[self.test_type]}
* Test Group: {self.test.name}
* Control Group: {self.control.name}
{str(self.perm_result)}
'''
)