Source code for unistat.resampling

from typing import Optional, Literal
from collections.abc import Callable
from dataclasses import dataclass
import warnings
import numpy as np
import pandas as pd
from scipy import stats
from ._types import VectorLike
from .exceptions import (
    ExperimentalWarning, SeriesNameCollisionError, SeriesNameCollisionWarning
)

################################################################################
# Bootstraps
################################################################################

type _BootstrapTestTypes = Literal['means', 'medians']


[docs] @dataclass class BootResult: r"""Dataclass to store bootstrap resampling results. Attributes ---------- obs_stat : float or int Observed test statistic (difference of means or medians). boot_mean : float or int Mean of the bootstrap distribution. boot_sem : float or int Standard error of the mean from the bootstrap distribution. boot_ci_lo : float or int Lower bound of the bias-corrected accelerated (BCa) confidence interval. boot_ci_hi : float or int Upper bound of the BCa confidence interval. p : float P-value from bootstrap hypothesis test (experimental). boot_ci_pct : float Confidence level percentage for the CI. boot_dist : np.ndarray Bootstrap distribution array. """ obs_stat: float | int boot_mean: float | int boot_sem: float | int boot_ci_lo: float | int boot_ci_hi: float | int p: float boot_ci_pct: float boot_dist: np.ndarray def __str__(self): r"""String representation of bootstrap results. Returns ------- str Formatted string with observed statistic, bootstrap mean, SEM, CI, and p-value. """ return f''' Observed Test Stat: {self.obs_stat:.4f} Mean of Bootstrap Dist.: {self.boot_mean:.4f} SEM of Bootstrap Dist.: {self.boot_sem:.4f} BCa {self.boot_ci_pct:.0%} CI: {self.boot_ci_lo:.4f} to {self.boot_ci_hi:.4f} {f'P = {self.p:.5f}' if self.p is not None else 'No hypothesis testing performed.'} '''
[docs] class TwoSeriesBootstrap: r"""Class for bootstrapping CI & hypothesis tests on two continuous series. Supports difference of means or medians. Provides bootstrap distribution, CIs, and optional experimental p-value from a bootstrapped hypothesis test. Currently, by default, no hypothesis tests are performed nor p-values given. Parameters ---------- test : pd.Series Test group data. control : pd.Series Control group data. test_type : {'means', 'medians'}, default 'means' Type of test statistic. alpha_level : float, default 0.05 Significance level for CI. n_resamples : int, default 10_000 Number of bootstrap resamples. rng : int, default 519 Random seed for reproducibility. boot_p_value : bool, default False If True, compute experimental bootstrap P-value. Attributes ---------- test : pd.Series Cleaned test data. control : pd.Series Cleaned control data. alpha : float Significance level. n_resamples : int Number of resamples. rng : int Random seed. boot_result : BootResult or None Stored bootstrap results. Raises ------ ValueError If test_type is invalid. Notes ----- Regarding the current WIP bootstrap hypothesis test, P-values are not currently displayed by default, as controlled by the ``boot_p_value`` Boolean parameter. Under Frequentist statistical philosophy, a P-value represents the probability of getting results as-or-more-extreme than those observed, if the null hypothesis is true (this is even the assumption for classical calculated asymptotic hypothesis tests). In the context of bootstrap statistics, this would mean that a bootstrap distribution should represent the distribution of test statistics (mean, median, t-stat, U-stat) that would be observed *under the null hypothesis (H0)*, and the p-value would be the fraction of test statistic values in that bootstrapped distribution which are at least as extreme as what was actually observed. In contrast, when creating a bootstrap distribution to estimate CI or SEM (the primary use of bootstrapping), one creates a bootstrap distribution *under the assumption that the* **alternative hypothesis (Ha)** *is true*. Looking at how these bootstrap classes display results, this is apparent: we display bootstrapped means/CIs for the test statistic (difference in group means/medians) between test and control groups. Another way of phrasing a statistically significant difference in means (at the :math:`\alpha = .05` significance level), for example, would be to say that the 95% CI of difference in the means of the test and control samples does NOT include zero. Since the null hypothesis (at least in cases relevant to typical biostats) is that the means or medians are *equal* in the test and control groups (or that the difference in the means/ medians is zero), the fact that the bootstrapped difference in means is not zero here shows that these bootstrap distributions for CI are NOT bootstraps under the null hypothesis. Indeed, when first making this module, that mistake was made, and it was found that even when there was an obvious difference between test vs. control groups, bootstrapped "p-values" taken from the same distribution as bootstrapped CIs returns a p-value near 0.50; to put it another way, this mistaken approach found that the observed difference in means was typically almost exactly the same as the average bootstrapped difference in means -- exactly what would be expected in a distribution representing the alternative hypothesis. While checking for overlapping CIs does indicate statistical significance in a Boolean fashion, it cannot give p-values. Instead, to calculate p-values via a bootstrapped methodology, in addition to the bootstrapped distribution under the Ha (used to estimate CIs), one must also use some method to bootstrap the null distribution in order to calculate p-value. Boos & Stefanski (2013) :cite:`boos_11_6` suggest 2 methods, as summarized in this `lecture by Alex Kaizer <https://www.alexkaizer.com/bios_6618/files/bios6618/W16/bootstrap_pvalue.pdf>`_: 1. Combine all test and control observations (e.g. Hgb value for both blunt & penetrating trauma patients), then sample with replacement for the N in both test and control groups (number of blunt/penetrating patients) to create bootstrapped test and control samples. * On average, this approach will create equal means in each group * However, the intermingling of observations in each group means that if observed groups do not have equal variance, they will end up with equal variances during resampling * This is the same issue encountered in Welch vs. Student t-tests * This risks inflating Type I error rate if variances are unequal 2. Create separate *observed* distributions for test & control, where each value represents how much the original value differed from its group mean. Each groups' bootstrap sample is sampled with replacement from its own group's mean-shifted sample. * Both test and control groups now have identical group means of 0 (since the mean variation from the mean in a sample is 0, duh) * This also preserves unequal variances * This seems like a superior strategy, and will ultimately be implemented for this module * This method does require performing completely separate bootstrap samples for both test & control groups, if and only if the goal is to perform a hypothesis test and derive a p-value Due to coding challenges, a hybrid approach has been taken in the interim here, which will later be changed. * We take the bootstrap-under-Ha distribution (as used for CIs), and then shift it to be centered about the mean of the bootstrap distribution. * From this, we find how many bootstrap samples had a mean at least as extreme as what was observed. This is the p-value. * Since the bootstrap samples still allow admixture of test & control observations, this method is accurate, but still destroys any unequal variances between test & control groups * For this reason, this approach is not robust for deployment In contrast to bootstrapping, permutation tests (sampling without replacement, and then randomly assigning to test or control) generates a distribution under the null hypothesis. For this reason, permutation methods more easily generate a p-value, but are not (easily) fit to generating a CI for the test statistic, in the way bootstrapping is. While the acceptability of using both methods (i.e. bootstrap a CI and then permute a p-value), it doesn't immediately seem like an objectionable choice if both values are truly needed. This would probably be preferred even once bootstrapped hypothesis tests are fully implemented, since even that would require an entire separate resampling run. The only time when bootstrapped hypothesis tests are truly necessary is when the permutation test assumption (that test & control groups are exchangeable; that is, they have the same distributions if H0 is true) is violated. The proposed bootstrapped hypothesis testing method (#2 above) does not make this assumption. """ _use_parametric_summ_stats = {'means': True, 'medians': False} def __init__(self, test: pd.Series, control: pd.Series, test_type: _BootstrapTestTypes = 'means', alpha_level: float = .05, n_resamples: int = 10_000, rng: Optional[int] = 519, boot_p_value: bool = False): # Rename test & control if column names are same if test.name == control.name: warnings.warn( SeriesNameCollisionWarning( '`test` and `control` have the same column name.' ), stacklevel=2 ) test = test.rename(f'{test.name}_TEST') control = control.rename(f'{test.name}_CTRL') self.test = test.dropna().reset_index(drop=True) self.control = control.dropna().reset_index(drop=True) if test_type in self._use_parametric_summ_stats.keys(): self._test_type = test_type self._test_means: bool = self._use_parametric_summ_stats[test_type] else: raise ValueError( f"`test_type` must be one of " f"{self._use_parametric_summ_stats.keys()}" ) self.alpha = alpha_level self._alt_hypothesis = '=' self.n_resamples = n_resamples self.rng = rng self.boot_result: Optional[BootResult] = None # Can remove this attribute & argument once a robust bootstrap p-value # method has been implemented. For now, here to reduce confusion unless # p-value is truly desired self._calculate_boot_p = boot_p_value if self._calculate_boot_p: # Warn about current bootstrap p-value method warnings.warn( ExperimentalWarning( feature='Bootstrapped p-value', message=''' Currently, the bootstrapped hypothesis test p-value for mean & median uses the assumption of approximate translation equivariance in order to create a bootstrapped null distribution of the test stat. This method pools variance (like Student's t-test) in test & control. When samples have unequal variance, Type I error can somewhat inflate (i.e. p-value is overly optimistic), especially in small or skewed null distributions. Of note, this does NOT affect mean/median permutation tests, and, if bootstrapped Welch's t or Mann-Whitney U-tests were implemented, would not be affected either under current methods. The current bootstrap p-value method is valid but suboptimal for hypothesis testing, though all other bootstrapped stats including CI follow best practices. In summary, use bootstrapping methods to calculate estimated distribution stats (pop. mean, SEM, CIs), but prefer permutation tests for hypothesis tests (has more distribution assumptions, so mean/SEM/CIs are difficult and not worth calculating. I have not checked for if it's frowned upon to both bootstrap (for distribution stats) & permute (for p-value); I see no obvious issue, but no clear benefit beyond more numbers to throw in a manuscript. Further explanation & references for will be available in TwoSeriesBootstrap.bootstrap docstring via: `instance_name.bootstrap.__doc__` - or - `help(TwoSeriesBootstrap.bootstrap)` ''' ), stacklevel=2 ) @property def results(self): # Ensure bootstrapping has been run before printing if self.boot_result is None: self.bootstrap() return ( f'{str(self.summ_stats)}\n' f''' Bootstrapped Hypothesis Test * Test Stat: Difference of {self._test_type.capitalize()} * Test Group: {self.test.name} * Control Group: {self.control.name} {str(self.boot_result)} ''' ) @property def summ_stats(self): if self._test_means: return self._parametric_summ_stats() else: return self._nonparametric_summ_stats() def _parametric_summ_stats(self): summ_stats = pd.DataFrame( data={ self.test.name: { 'n': self.test.count(), 'mean': self.test.mean(), 'std': self.test.std(), 'min': self.test.min(), 'max': self.test.max(), }, 'Ha': self._alt_hypothesis, self.control.name: { 'n': self.control.count(), 'mean': self.control.mean(), 'std': self.control.std(), 'min': self.control.min(), 'max': self.control.max(), }, }, ) return summ_stats def _nonparametric_summ_stats(self): q_test = self.test.quantile([0, 0.25, 0.5, 0.75, 1]) q_control = self.control.quantile([0, 0.25, 0.5, 0.75, 1]) summ_stats = pd.DataFrame( data={ self.test.name: { 'n': self.test.count(), 'min': q_test.loc[0], 'q1': q_test.loc[0.25], 'median': q_test.loc[0.5], 'q3': q_test.loc[0.75], 'max': q_test.loc[1], 'iqr': q_test.loc[0.75] - q_test.loc[0.25], }, 'Ha': self._alt_hypothesis, self.control.name: { 'n': self.control.count(), 'min': q_control.loc[0], 'q1': q_control.loc[0.25], 'median': q_control.loc[0.5], 'q3': q_control.loc[0.75], 'max': q_control.loc[1], 'iqr': q_control.loc[0.75] - q_control.loc[0.25], }, }, ) return summ_stats
[docs] def bootstrap(self, n_resamples: int = None): r"""Perform bootstrap resampling. Parameters ---------- n_resamples : int, optional Number of resamples. Defaults to self.n_resamples. Returns ------- BootResult Bootstrap results. Notes ----- Uses BCa method for confidence intervals. P-value is experimental. Updates hypothesis direction in summary stats if p < alpha. """ # Default to using self.n_resamples n_resamples = n_resamples or self.n_resamples # Use correct test statistic function if self._test_means: test_stat_func = self.mean_diff else: test_stat_func = self.median_diff boot_result = stats.bootstrap( data=(self.test, self.control), statistic=test_stat_func, n_resamples=n_resamples, vectorized=True, axis=0, paired=False, confidence_level=1 - self.alpha, alternative='two-sided', method='BCa', rng=self.rng ) # .item() ensures test stat returned 1 and only 1 value obs_stat = test_stat_func(self.test, self.control) # Store returned bootstrap distribution boot_dist = boot_result.bootstrap_distribution # Mean & SEM of bootstrap distribution boot_mean = boot_dist.mean() boot_sem = boot_result.standard_error # confidence_level CI of the bootstrap distribution boot_ci_lo, boot_ci_hi = boot_result.confidence_interval if self._calculate_boot_p: p_val = ( # Count of bootstrap stats >= positive value of observed stat ((boot_dist - boot_dist.mean()) >= np.abs(obs_stat)).sum() # Count of bootstrap stats <= negative value of observed stat + ((boot_dist - boot_dist.mean()) <= -np.abs(obs_stat)).sum() # +1 to numerator & denominator makes p-value bit more conservative + 1 ) / (n_resamples + 1) # Add results to stored attribute self.boot_result = BootResult( obs_stat=obs_stat, boot_mean=boot_mean, boot_sem=boot_sem, boot_ci_lo=boot_ci_lo, boot_ci_hi=boot_ci_hi, p=p_val if self._calculate_boot_p else None, boot_ci_pct=1 - self.alpha, boot_dist=boot_dist, ) # If p-value is significant, indicate direction in summ_stats if self._calculate_boot_p and (self.boot_result.p is not None): if self.boot_result.p < self.alpha: if self.boot_result.obs_stat > 0: self._alt_hypothesis = '>' elif self.boot_result.obs_stat < 0: self._alt_hypothesis = '<' else: warnings.warn( 'Significant P value but no between-groups difference in ' 'test statistic observed. Verify data/results.' ) self._alt_hypothesis = '=' else: self._alt_hypothesis = '=' return self.boot_result
[docs] @staticmethod def mean_diff( sample_a: VectorLike, sample_b: VectorLike, axis: Literal[0, 1, 'rows', 'columns'] = 0 ) -> float | np.float64: r"""Compute difference of means. Parameters ---------- sample_a : VectorLike First sample. sample_b : VectorLike Second sample. axis : {0, 1, 'rows', 'columns'}, optional Axis to compute along. Defaults to 0. Returns ------- float or np.float64 Difference of means. """ axis = {'rows': 0, 'columns': 1}.get(axis, axis) # Convert inputs once (this fixes the 'unused variable' issue) a = np.asarray(sample_a) b = np.asarray(sample_b) # Calculate absolute difference in means return a.mean(axis=axis) - b.mean(axis=axis)
[docs] @staticmethod def median_diff( sample_a: VectorLike, sample_b: VectorLike, axis: Literal[0, 1, 'rows', 'columns'] = 0 ) -> float | np.float64: r"""Compute difference of medians. Parameters ---------- sample_a : VectorLike First sample. sample_b : VectorLike Second sample. axis : {0, 1, 'rows', 'columns'}, optional Axis to compute along. Defaults to 0. Returns ------- float or np.float64 Difference of medians. """ axis = {'rows': 0, 'columns': 1}.get(axis, axis) # Convert inputs once (this fixes the 'unused variable' issue) a = np.asarray(sample_a) b = np.asarray(sample_b) # Calculate absolute difference in means return np.median(a, axis=axis) - np.median(b, axis=axis)
[docs] class TwoSampleBootstrap(TwoSeriesBootstrap): r"""Class for bootstrapping with a boolean grouping variable. Extends TwoSeriesBootstrap to split a continuous series by a boolean group. Parameters ---------- bool_x : pd.Series Boolean grouping variable. num_y : pd.Series Continuous outcome variable. test_type : {'means', 'medians'}, optional Test statistic type. Defaults to 'means'. alpha_level : float, optional Significance level. Defaults to 0.05. n_resamples : int, optional Number of resamples. Defaults to 10,000. rng : int, optional Random seed. Defaults to 519. x_test_lvl : bool, optional Boolean level for test group. Defaults to True. boot_p_value : bool, optional Compute experimental p-value. Defaults to False. Attributes ---------- x : pd.Series Grouping variable. y : pd.Series Outcome variable. Raises ------ ValueError If group names collide or groups are empty after splitting. Notes ----- Regarding the current WIP bootstrap hypothesis test, p-values are not currently displayed by default, as controlled by the ``boot_p_value`` Boolean parameter. Under Frequentist statistical philosophy, a p-value represents the probability of getting results as-or-more-extreme than those observed, if the null hypothesis is true (this is even the assumption for classical calculated asymptotic hypothesis tests). In the context of bootstrap statistics, this would mean that a bootstrap distribution should represent the distribution of test statistics (mean, median, t-stat, U-stat) that would be observed *under the null hypothesis (H0)*, and the p-value would be the fraction of test statistic values in that bootstrapped distribution which are at least as extreme as what was actually observed. In contrast, when creating a bootstrap distribution to estimate CI or SEM (the primary use of bootstrapping), one creates a bootstrap distribution *under the assumption that the* **alternative hypothesis (Ha)** *is true*. Looking at how these bootstrap classes display results, this is apparent: we display bootstrapped means/CIs for the test statistic (difference in group means/medians) between test and control groups. Another way of phrasing a statistically significant difference in means (at the :math:`\alpha = .05` significance level), for example, would be to say that the 95% CI of difference in the means of the test and control samples does NOT include zero. Since the null hypothesis (at least in cases relevant to typical biostats) is that the means or medians are *equal* in the test and control groups (or that the difference in the means/ medians is zero), the fact that the bootstrapped difference in means is not zero here shows that these bootstrap distributions for CI are NOT bootstraps under the null hypothesis. Indeed, when first making this module, that mistake was made, and it was found that even when there was an obvious difference between test vs. control groups, bootstrapped "p-values" taken from the same distribution as bootstrapped CIs returns a p-value near 0.50; to put it another way, this mistaken approach found that the observed difference in means was typically almost exactly the same as the average bootstrapped difference in means -- exactly what would be expected in a distribution representing the alternative hypothesis. While checking for overlapping CIs does indicate statistical significance in a Boolean fashion, it cannot give p-values. Instead, to calculate p-values via a bootstrapped methodology, in addition to the bootstrapped distribution under the Ha (used to estimate CIs), one must also use some method to bootstrap the null distribution in order to calculate p-value. Boos & Stefanski (2013) :cite:`boos_11_6` suggest 2 methods, as summarized in this `lecture by Alex Kaizer <https://www.alexkaizer.com/bios_6618/files/bios6618/W16/bootstrap_pvalue.pdf>`_: 1. Combine all test and control observations (e.g. Hgb value for both blunt & penetrating trauma patients), then sample with replacement for the N in both test and control groups (number of blunt/penetrating patients) to create bootstrapped test and control samples. * On average, this approach will create equal means in each group. * However, the intermingling of observations in each group means that if observed groups do not have equal variance, they will end up with equal variances during resampling. * This is the same issue encountered in Welch vs. Student t-tests. * This risks inflating Type I error rate if variances are unequal. 2. Create separate *observed* distributions for test & control, where each value represents how much the original value differed from its group mean. Each groups' bootstrap sample is sampled with replacement from its own group's mean-shifted sample. * Both test and control groups now have identical group means of 0 (since the mean variation from the mean in a sample is 0). * This also preserves unequal variances. * This seems like a superior strategy, and will ultimately be implemented for this module. * This method does require performing completely separate bootstrap samples for both test & control groups, if and only if the goal is to perform a hypothesis test and derive a p-value. Due to coding challenges, a hybrid approach has been taken in the interim here, which will later be changed. * We take the bootstrap-under-Ha distribution (as used for CIs), and then shift it to be centered about the mean of the bootstrap distribution. * From this, we find how many bootstrap samples had a mean at least as extreme as what was observed. This is the p-value. * Since the bootstrap samples still allow admixture of test & control observations, this method is accurate, but still destroys any unequal variances between test & control groups. * For this reason, this approach is not robust for deployment. In contrast to bootstrapping, permutation tests (sampling without replacement, and then randomly assigning to test or control) generates a distribution under the null hypothesis. For this reason, permutation methods more easily generate a p-value, but are not (easily) fit to generating a CI for the test statistic, in the way bootstrapping is. While the acceptability of using both methods (i.e. bootstrap a CI and then permute a p-value), it doesn't immediately seem like an objectionable choice if both values are truly needed. This would probably be preferred even once bootstrapped hypothesis tests are fully implemented, since even that would require an entire separate resampling run. The only time when bootstrapped hypothesis tests are truly necessary is when the permutation test assumption (that test & control groups are exchangeable; that is, they have the same distributions if H0 is true) is violated. The proposed bootstrapped hypothesis testing method (#2 above) does not make this assumption. """ def __init__(self, bool_x: pd.Series, num_y: pd.Series, test_type: _BootstrapTestTypes = 'means', alpha_level: float = .05, n_resamples: int = 10_000, rng: Optional[int] = 519, x_test_lvl: bool = True, boot_p_value: bool = False): # Ensure names exist and are not the same bool_x = bool_x.rename(bool_x.name or 'x') num_y = num_y.rename(num_y.name or 'y') if bool_x.name == num_y.name: raise ValueError('Both bool_x and num_y cannot have same names.') # Concat into a df in order to .dropna, then define x & y series self._df = ( pd.concat( [ bool_x.astype('boolean'), num_y, ], axis='columns' ) .dropna(axis='index') ) self.x = self._df[bool_x.name] self.y = self._df[num_y.name] # Build mask to slice test/control base on x_test_lvl self._test_x = bool(x_test_lvl) # Ensure is type bool mask = (self.x == self._test_x).astype(bool) # ensure safe masking # self.test is value of y where x == x_test_lvl test = ( self.y[mask] .rename(f'{self.x.name}_{str(self._test_x).upper()}') ) control = ( self.y[~mask] .rename(f'{self.x.name}_{str((not self._test_x)).upper()}') ) if test.empty or control.empty: raise ValueError('After cleaning/splitting, at least one group is ' 'empty. Check x_test_lvl and values in bool_x.') super().__init__(test=test, control=control, test_type=test_type, alpha_level=alpha_level, n_resamples=n_resamples, rng=rng, boot_p_value=boot_p_value) def __str__(self): r"""String representation of bootstrap results with outcome name. Returns ------- str Formatted outcome, summary stats, and bootstrap results. """ # Ensure bootstrapping has been run before printing if self.boot_result is None: self.bootstrap() return ( f'Outcome: {self.y.name}\n\n' f'{str(self.summ_stats)}\n' f''' Bootstrapped Hypothesis Test * Test Stat: Difference of {self._test_type.capitalize()} * Test Group: {self.test.name} * Control Group: {self.control.name} {str(self.boot_result)} ''' )
################################################################################ # Permutations ################################################################################ type _PermutationTestTypes = Literal['means', 'medians', 't_welch', 'mwu']
[docs] @dataclass class PermResult: r"""Dataclass to store permutation test results. Attributes ---------- obs_stat : float or int Observed test statistic. perm_dist : np.ndarray Permutation null distribution. p : float P-value from permutation test. """ obs_stat: float | int p: float perm_dist: np.ndarray def __str__(self): r"""String representation of permutation results. Returns ------- str Formatted observed statistic, mean/SEM of permutation distribution, and p-value. """ return f''' Observed Test Stat: {self.obs_stat:.4f} P = {self.p:.5f} '''
[docs] class TwoSeriesPermutation: r"""Class for permutation hypothesis tests on two continuous series. Supports difference of means/medians, Welch's t, or Mann-Whitney U. Parameters ---------- test : pd.Series Test group data. control : pd.Series Control group data. test_type : {'means', 'medians', 't_welch', 'mwu'}, optional Test statistic type. Defaults to 'means'. alpha_level : float, optional Significance level. Defaults to 0.05. n_resamples : int, optional Number of permutations. Defaults to 10,000. rng : int, default 519 Random seed. Attributes ---------- test : pd.Series Cleaned test data. control : pd.Series Cleaned control data. test_type : str Selected test type. alpha : float Significance level. n_resamples : int Number of resamples. rng : int Random seed. perm_result : PermResult or None Stored permutation results. Raises ------ ValueError If test_type is invalid. """ _test_display_names = { 'means': 'Difference of means', 'medians': 'Difference of medians', 't_welch': 'Welch t-statistic', 'mwu': 'M-W U-statistic', } _tests_using_parametric_summ_stats = ['means', 't_welch'] _tests_using_nonparametric_summ_stats = ['medians', 'mwu'] def __init__(self, test: pd.Series, control: pd.Series, test_type: _PermutationTestTypes = 'means', alpha_level: float = .05, n_resamples: int = 10_000, rng: Optional[int] = 519): # Rename test & control if column names are same if test.name == control.name: warnings.warn( SeriesNameCollisionWarning( '`test` and `control` have the same column name.' ), stacklevel=2 ) test = test.rename(f'{test.name}_TEST') control = control.rename(f'{test.name}_CTRL') self.test = test.dropna().reset_index(drop=True) self.control = control.dropna().reset_index(drop=True) if test_type in self._test_display_names.keys(): self.test_type = test_type else: raise ValueError( f"`test_type` must be one of {self._test_display_names.keys()}." ) self.alpha = alpha_level self._alt_hypothesis = '=' self.n_resamples = n_resamples self.rng = rng self.perm_result: Optional[PermResult] = None @property def summ_stats(self): r"""Get summary statistics based on test type. Returns ------- pd.DataFrame Parametric or nonparametric summary. """ if self.test_type in self._tests_using_parametric_summ_stats: return self._parametric_summ_stats() # Explicitly want to default to nonparametric summary stats else: return self._nonparametric_summ_stats() @property def test_stat_func(self) -> Callable: r"""Get the test statistic function based on test_type. Returns ------- Callable Function to compute test statistic. """ func_map = { 'means': self.mean_diff, 'medians': self.median_diff, 't_welch': self.t_welch, 'mwu': self.mwu, } return func_map[self.test_type] def _parametric_summ_stats(self): r"""Compute parametric summary statistics. Returns ------- pd.DataFrame Summary with n, mean, std, min, max, Ha indicator. """ summ_stats = pd.DataFrame( data={ self.test.name: { 'n': self.test.count(), 'mean': self.test.mean(), 'std': self.test.std(), 'min': self.test.min(), 'max': self.test.max(), }, 'Ha': self._alt_hypothesis, self.control.name: { 'n': self.control.count(), 'mean': self.control.mean(), 'std': self.control.std(), 'min': self.control.min(), 'max': self.control.max(), }, }, ) return summ_stats def _nonparametric_summ_stats(self): r"""Compute nonparametric summary statistics. Returns ------- pd.DataFrame Summary with n, min, q1, median, q3, max, iqr, Ha indicator. """ q_test = self.test.quantile([0, 0.25, 0.5, 0.75, 1]) q_control = self.control.quantile([0, 0.25, 0.5, 0.75, 1]) summ_stats = pd.DataFrame( data={ self.test.name: { 'n': self.test.count(), 'min': q_test.loc[0], 'q1': q_test.loc[0.25], 'median': q_test.loc[0.5], 'q3': q_test.loc[0.75], 'max': q_test.loc[1], 'iqr': q_test.loc[0.75] - q_test.loc[0.25], }, 'Ha': self._alt_hypothesis, self.control.name: { 'n': self.control.count(), 'min': q_control.loc[0], 'q1': q_control.loc[0.25], 'median': q_control.loc[0.5], 'q3': q_control.loc[0.75], 'max': q_control.loc[1], 'iqr': q_control.loc[0.75] - q_control.loc[0.25], }, }, ) return summ_stats
[docs] def permute(self, n_resamples: int = None): r"""Perform permutation test. Parameters ---------- n_resamples : int, optional Number of permutations. Defaults to self.n_resamples. Returns ------- PermResult Permutation results. Notes ----- Uses 'independent' permutation type, and does not support paired samples. Updates Ha indicator in summ_stats if p < alpha. """ # Default to using self.n_resamples n_resamples = n_resamples or self.n_resamples perm_result = stats.permutation_test( data=(self.test, self.control), statistic=self.test_stat_func, permutation_type='independent', n_resamples=n_resamples, vectorized=True, axis=0, alternative='two-sided', rng=self.rng ) # Save output to attribute self.perm_result = PermResult( obs_stat=perm_result.statistic.item(), perm_dist=perm_result.null_distribution, p=perm_result.pvalue.item() ) # If p-value is significant, indicate direction in summ_stats if self.perm_result.p < self.alpha: if self.perm_result.obs_stat > 0: self._alt_hypothesis = '>' elif self.perm_result.obs_stat < 0: self._alt_hypothesis = '<' else: warnings.warn( 'Significant P value but no between-groups difference in ' 'test statistic observed. Verify data/results.' ) self._alt_hypothesis = '=' else: self._alt_hypothesis = '=' return self.perm_result
[docs] @staticmethod def mean_diff( sample_a: VectorLike, sample_b: VectorLike, axis: Literal[0, 1, 'rows', 'columns'] = 0 ): r"""Compute difference of means for permutation. Parameters ---------- sample_a : VectorLike First permuted sample. sample_b : VectorLike Second permuted sample. axis : {0, 1, 'rows', 'columns'}, optional Axis to compute. Defaults to 0. Returns ------- float or np.float64 Difference of means. """ axis = {'rows': 0, 'columns': 1}.get(axis, axis) # Convert inputs once (this fixes the 'unused variable' issue) a = np.asarray(sample_a) b = np.asarray(sample_b) # Calculate absolute difference in means return a.mean(axis=axis) - b.mean(axis=axis)
[docs] @staticmethod def median_diff( sample_a: VectorLike, sample_b: VectorLike, axis: Literal[0, 1, 'rows', 'columns'] = 0 ): r"""Compute difference of medians for permutation. Parameters ---------- sample_a : VectorLike First permuted sample. sample_b : VectorLike Second permuted sample. axis : {0, 1, 'rows', 'columns'}, optional Axis to compute. Defaults to 0. Returns ------- float or np.float64 Difference of medians. """ axis = {'rows': 0, 'columns': 1}.get(axis, axis) # Convert inputs once (this fixes the 'unused variable' issue) a = np.asarray(sample_a) b = np.asarray(sample_b) # Calculate absolute difference in means return np.median(a, axis=axis) - np.median(b, axis=axis)
[docs] @staticmethod def t_welch( sample_a: VectorLike, sample_b: VectorLike, axis: Literal[0, 1, 'rows', 'columns'] = 0 ): r"""Calculate Welch's 2-independent t-stat for a single permuted sample. Takes in observations that have been randomly permuted into test & control samples, called sample_a & sample_b, respectively (to distinguish from actual observed test/control), and returns the t-statistic. Parameters ---------- sample_a, sample_b : pd.Series or array_like A permuted set of observations for test or control sample, respectively. axis : {0, 1, 'rows', 'columns'}, default=0 Axis of sample_a & sample_b on which to perform t-test; passed to `stats.ttest_ind()`. Returns ------- float or NumPy float Welch's t-statistic for the permuted sample. Notes ----- Computes an asymptotic Welch's t-test (`equal_var=False`; correction for heteroskedasticity), which is repeated for each permuted sample. Since unequal-varianced samples violates the exchangeability assumption for permutation testing, the permuted p-value is not exact for finite N, thus Type I error is not guaranteed. However, Janssen & Pauls (2005) Monte Carlo simulation study found that in samples with unequal variance, permuted Welch t-tests still control Type I error better than both standard asymptotic Welch's t-test, and better than permuted Student's t-test. Of note, passed `PermutationMethod` instance to `method` would use `stats.permutation_test` to compute identical results, since `self.permute()` just runs `stats.permutation_test` using `self.t_welch()` as the test statistic. In later releases, permutation t- & U-tests will be implemented using native SciPy method, but for now, all permutation stats methods are being kept housed in this class, and implementing the native SciPy method would be complex and temporary. """ axis = {'rows': 0, 'columns': 1}.get(axis, axis) # Convert inputs once (this fixes the 'unused variable' issue) a = np.asarray(sample_a) b = np.asarray(sample_b) # Run Welch's t-test & extract t-statistic test = stats.ttest_ind( a=a, b=b, axis=axis, equal_var=False, alternative='two-sided', ) return test.statistic
[docs] @staticmethod def mwu( sample_a: VectorLike, sample_b: VectorLike, axis: Literal[0, 1, 'rows', 'columns'] = 0 ): r"""Calculate Mann-Whitney U test for a single permuted sample. Takes in observations that have been randomly permuted into test & control samples, called sample_a & sample_b, respectively (to distinguish from actual observed test/control), and returns the Mann- Whitney U-statistic. Parameters ---------- sample_a : VectorLike permuted observations assigned to test sample sample_b : VectorLike permuted observations assigned to control sample axis : {0, 1, 'rows', 'columns'}, default=0 axis to compute MWU test; passed to stats.mannwhitneyu Returns ------- float or NumPy float U-statistic for the permuted sample. Notes ----- Computes a standard non-resampling M-W U-test, which is repeated for each permuted sample. Per SciPy docs, `method='auto'` computes an exact test if 8 or fewer n_obs in either sample_a or sample_b. Of note, passed `PermutationMethod` instance to `method` would use `stats.permutation_test` to compute identical results, since `self.permute()` just runs `stats.permutation_test` using `self.mwu()` as the test statistic. In later releases, permutation t- & U-tests will be implemented using native SciPy method, but for now, all permutation stats methods are being kept housed in this class, and implementing the native SciPy method here would be complex and temporary. """ axis = {'rows': 0, 'columns': 1}.get(axis, axis) # Convert to NDArray) a = np.asarray(sample_a) b = np.asarray(sample_b) # Run Mann-Whitney U-test test = stats.mannwhitneyu( x=a, y=b, use_continuity=True, alternative='two-sided', axis=axis, method='auto', ) return test.statistic
def __str__(self): r"""String representation of permutation results. Runs permute if not done. Returns ------- str Formatted summary and permutation results. """ # Ensure permutation has been run before printing if self.perm_result is None: self.permute() return ( f'{str(self.summ_stats)}\n' f''' Permutation Hypothesis Test * Test Stat: {self._test_display_names[self.test_type]} * Test Group: {self.test.name} * Control Group: {self.control.name} {str(self.perm_result)} ''' )
[docs] class TwoSamplePermutation(TwoSeriesPermutation): r"""Class for permutation tests with a boolean grouping variable. Extends TwoSeriesPermutation to split continuous series by boolean group. Parameters ---------- bool_x : pd.Series Boolean grouping. num_y : pd.Series Continuous outcome. test_type : {'means', 'medians', 't_welch', 'mwu'}, default 'means' Test type. alpha_level : float, default 0.05 Significance level. n_resamples : int, default 10_000 Permutations.. rng : int, default 519 Random seed. x_test_lvl : bool, default True Test level. Attributes ---------- x : pd.Series Grouping. y : pd.Series Outcome. Raises ------ SeriesNameCollisionError If names collide. ValueError If groups empty. """ def __init__(self, bool_x: pd.Series, num_y: pd.Series, test_type: _PermutationTestTypes = 'means', alpha_level: float = .05, n_resamples: int = 10_000, rng: Optional[int] = 519, x_test_lvl: bool = True): # Ensure names exist and are not the same bool_x = bool_x.rename(bool_x.name or 'x') num_y = num_y.rename(num_y.name or 'y') if bool_x.name == num_y.name: raise SeriesNameCollisionError('Both bool_x and num_y cannot ' 'have same names.') # Concat into a df in order to .dropna, then define x & y series self._df = ( pd.concat( [ bool_x.astype('boolean'), num_y, ], axis='columns' ) .dropna(axis='index') ) self.x = self._df[bool_x.name] self.y = self._df[num_y.name] # Build mask to slice test/control base on x_test_lvl self._test_x = bool(x_test_lvl) # Ensure is type bool mask = (self.x == self._test_x).astype(bool) # ensure safe masking # self.test is value of y where x == x_test_lvl test = ( self.y[mask] .rename(f'{self.x.name}_{str(self._test_x).upper()}') ) control = ( self.y[~mask] .rename(f'{self.x.name}_{str((not self._test_x)).upper()}') ) if test.empty or control.empty: raise ValueError('After cleaning/splitting, at least one group is ' 'empty. Check x_test_lvl and values in bool_x.') super().__init__(test=test, control=control, test_type=test_type, alpha_level=alpha_level, n_resamples=n_resamples, rng=rng) def __str__(self): r"""String representation with outcome name. Returns ------- str Formatted outcome, summary, permutation results. """ # Ensure bootstrapping has been run before printing if self.perm_result is None: self.permute() return ( f'Outcome: {self.y.name}\n\n' f'{str(self.summ_stats)}\n' f''' Permutation Hypothesis Test * Test Stat: {self._test_display_names[self.test_type]} * Test Group: {self.test.name} * Control Group: {self.control.name} {str(self.perm_result)} ''' )