from abc import ABC, abstractmethod
import numpy as np
import pandas as pd
import patsy
from scipy import stats
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from .exceptions import warn_experimental
[docs]
class RegressionStats(ABC):
r"""Abstract base class for regression statistics.
Provides common functionality for regression models, including data
preparation, standardization, VIF calculation, and properties for
regression results.
Parameters
----------
X : pd.DataFrame or pd.Series
Independent variables (features).
y : pd.Series
Dependent variable (target).
bool_col_names : list[str] or str or None, optional
Names of boolean columns in X to exclude from standardization.
Attributes
----------
bool_cols : list[str] or None
List of boolean column names.
X : pd.DataFrame
Feature DataFrame. `NaN` values are removed and all columns are
converted to `float64`.
y : pd.DataFrame
Target DataFrame. `NaN` values are removed and all columns are converted
to `float64`.
reg : statsmodels regression result
Fitted regression model.
X_std : pd.DataFrame
`X`, with all non-Boolean columns transformed to Z-scores.
std_reg : statsmodels regression result
Fitted standardized regression model.
Notes
-----
Observations with any missing data in either ``X`` *or* ``y`` are dropped.
"""
def __init__(self, X, y, bool_col_names: list | str | None = None):
self._df = self._concat_xy(X, y)
self.bool_cols = bool_col_names
self.X = (
self._df.drop(columns=[y.name])
.apply(pd.to_numeric, errors='coerce')
.astype('float64')
)
self.y = (
self._df[[y.name]]
.apply(pd.to_numeric, errors='coerce')
.astype('float64')
)
self.reg = None
self.std_reg = None
def __str__(self):
"""
String representation of the regression results.
Returns
-------
str
Formatted string with VIF (if applicable) and regression summaries.
"""
if len(self.X.columns) >= 2:
print_string = (f'{str(self.vif_matrix())}\n'
f'{self.reg.summary2().as_text()}\n')
else:
print_string = f'{self.reg.summary2().as_text()}\n'
if not self._all_bool_cols:
print_string += f'{self.std_reg.summary2().as_text()}\n'
return print_string
@property
def _all_bool_cols(self):
"""Check if all columns in X are boolean.
Returns
-------
bool
True if all columns are boolean, False otherwise.
"""
if self.bool_cols is None:
return False
else:
return len(self.bool_cols) == self.X.shape[1]
@property
def bool_cols(self):
"""Get the list of boolean column names.
Returns
-------
list[str] or None
List of boolean columns.
"""
return self._bool_cols
@bool_cols.setter
def bool_cols(self, bool_col_names):
if isinstance(bool_col_names, str):
self._bool_cols = [bool_col_names]
elif isinstance(bool_col_names, list):
self._bool_cols = bool_col_names
elif bool_col_names is None:
self._bool_cols = None
else:
raise ValueError('bool_col_names must be a str or list of str')
@property
def X_std(self):
"""Standardized version of ``X`` (Z-scores), excluding boolean columns.
Returns
-------
pd.DataFrame
Standardized features.
"""
# Create a standardized version of feature column (z-score)
if self.bool_cols is not None:
X_num = self.X[[col for col in self.X.columns
if col not in self.bool_cols]]
X_num_std = X_num.apply(stats.zscore, axis='index', ddof=1,
nan_policy='omit')
X_num_std.columns = [f'{col}_std' for col in X_num_std.columns]
X_std = pd.concat(
[
X_num_std,
self.X[self.bool_cols]
],
axis='columns'
)
else:
X_std = self.X.apply(stats.zscore, axis='index', ddof=1,
nan_policy='omit')
X_std.columns = [f'{col}_std' for col in X_std.columns]
return X_std
@abstractmethod
def _run_regression(self, standardize: bool = False):
"""Abstract method to run the regression model.
Parameters
----------
standardize : bool, optional
Whether to use standardized features. Defaults to False.
Returns
-------
statsmodels regression result
Fitted model.
"""
if not standardize:
exog = self.X
else:
exog = self.X_std
endog = self.y
reg = sm.OLS(
endog=endog,
exog=sm.add_constant(exog),
missing='drop'
).fit()
return reg
@property
def reg(self):
"""Get the fitted regression model.
Returns
-------
statsmodels regression result
Fitted model.
"""
return self._reg
@reg.setter
def reg(self, result):
if result is None:
self._reg = self._run_regression()
else:
self._reg = result
@property
def std_reg(self):
"""Get the fitted standardized regression model.
Returns
-------
statsmodels regression result
Fitted standardized model.
"""
return self._std_reg
@std_reg.setter
def std_reg(self, result):
if result is None:
self._std_reg = self._run_regression(standardize=True)
else:
self._std_reg = result
[docs]
def vif_matrix(self):
"""Calculate variance inflation factors (VIF) for feature DataFrame.
Returns
-------
pd.Series
VIF values for each feature.
Raises
------
ValueError
If fewer than 2 columns in X.
"""
if len(self.X.columns) < 2:
raise ValueError('VIF requires at least 2 columns')
X = self.X.dropna().astype(float)
vif_matrix = pd.Series(
[variance_inflation_factor(X.values, i)
for i in range(X.shape[1])],
name='vif',
index=X.columns,
)
return vif_matrix
@staticmethod
def _concat_xy(X, y):
"""Concatenate X and y, handle data types, drop NaNs.
Parameters
----------
X : pd.DataFrame or pd.Series
Features.
y : pd.Series
Target.
Returns
-------
pd.DataFrame
Processed DataFrame.
"""
df = pd.concat([X, y], axis='columns').dropna(axis='index')
for col in df.columns:
if df[col].dtype == pd.BooleanDtype():
df[col] = df[col].astype(int)
elif df[col].dtype == bool:
df[col] = df[col].astype(int)
elif df[col].dtype == pd.Int64Dtype():
df[col] = df[col].astype(int)
return df
[docs]
class LogitStats(RegressionStats):
r"""Class for logistic regression statistics.
Extends RegressionStats for logistic regression using Logit model.
Parameters
----------
X : pd.DataFrame or pd.Series
Predictor observations.
y : pd.Series
Binary outcome observations.
bool_col_names : list[str] or str or None, optional
String names of Boolean columns to exclude from standardization.
"""
def __init__(self, X, y, bool_col_names: list | str | None = None):
super().__init__(X, y, bool_col_names)
def __str__(self):
"""String representation with VIF, summary, and odds ratios.
Returns
-------
str
Formatted results.
"""
if len(self.X.columns) >= 2:
print_string = (
f'{str(self.vif_matrix())}\n'
f'{self.reg.summary2().as_text()}\n'
f'{self.logit_or()}\n'
)
else:
print_string = (
f'{self.reg.summary2().as_text()}\n'
f'{self.logit_or()}\n'
)
if not self._all_bool_cols:
print_string += (
f'{self.std_reg.summary2().as_text()}\n'
f'{self.logit_or(standardize=True)}\n'
)
return print_string
def _run_regression(self, standardize: bool = False):
"""Fit logistic regression.
Parameters
----------
standardize : bool, optional
Use standardized features. Defaults to False.
Returns
-------
statsmodels.discrete.discrete_model.LogitResults
Fitted model.
"""
if not standardize:
exog = self.X
else:
if self._all_bool_cols:
return None
else:
exog = self.X_std
endog = self.y
logit = sm.Logit(
endog=endog,
exog=sm.add_constant(exog),
missing='drop'
).fit()
return logit
[docs]
def logit_or(self, standardize: bool = False) -> pd.DataFrame:
"""Calculate odds ratios by predictor, with 95% confidence intervals.
Parameters
----------
standardize : bool, optional
Use standardized model. Defaults to False.
Returns
-------
pd.DataFrame
Odds ratios with 95% CI.
Raises
------
ValueError
If all columns are boolean and standardize is True.
"""
if standardize:
if not self._all_bool_cols:
model = self.std_reg
else:
raise ValueError('Standardized logit cannot be run when all '
'columns are boolean.')
else:
model = self.reg
output = pd.DataFrame(model.params, columns=['OR'])
output[['95% CI lower', '95% CI upper']] = model.conf_int()
return output.map(np.exp)
[docs]
def pretty_print_or(self,
standardize: bool = False,
label: bool = True) -> None:
"""Print formatted ORs & 95% CIs for easy copy-pasting.
Parameters
----------
standardize : bool, optional
Use standardized model. Defaults to False.
label : bool, optional
Include parameter labels. Defaults to True.
Raises
------
ValueError
If all columns are boolean and standardize is True.
"""
if standardize:
if self._all_bool_cols:
raise ValueError('Standardized logit cannot be run when all '
'columns are boolean.')
else:
ratios = self.logit_or(standardize=True)
else:
ratios = self.logit_or()
for idx in ratios.index:
row = ratios.loc[idx, :]
print_string = (
f'OR {row['OR']:.2f} '
f'({row['95% CI lower']:.2f} - {row['95% CI upper']:.2f})'
)
if label:
print_string = f'{idx}: ' + print_string
print(print_string)
[docs]
class LinRegStats(RegressionStats):
r"""Class for linear regression statistics.
Extends RegressionStats for ordinary least squares (OLS) regression.
Parameters
----------
X : pd.DataFrame or pd.Series
Predictor observations.
y : pd.Series
Numeric outcome observations.
bool_col_names : list[str] or str or None, optional
Boolean columns to exclude from standardization.
Notes
-----
``unistat`` does NOT standardize the values of ``y`` for linear regression.
Typically, "standardized regression" refers to a transformation of
:math:`y \sim X` such that
:math:`\text{SD}\left( y \right) \sim \text{SD}\left( X \right)`; a
coefficient :math:`\beta` is thus interpreted as a 1-S.D. increase in
:math:`X` conferring a :math:`\beta` S.D. increase in :math:`y`. We find
this to be difficult to interpret, with no benefit beyond adherence to
convention.
Instead, ``unistat`` opts for "X-standardized regression". That is, since
only :math:`X` is Z-scored, :math:`y \sim X` is transformed such that
:math:`y \sim \text{SD}\left( X \right)`. Here, a coefficient :math:`\beta`
is interpreted as a 1-S.D. increase in :math:`X` conferring an absolute
increase of :math:`\beta` units in :math:`y`. This is more easily
interpretable, while still allowing comparison of the relative strengths of
all :math:`X` predictors.
"""
def __init__(self, X, y, bool_col_names: list | str | None = None):
super().__init__(X, y, bool_col_names)
def _run_regression(self, standardize: bool = False):
"""Run OLS linear regression.
Parameters
----------
standardize : bool, optional
Use standardized features. Defaults to False.
Returns
-------
statsmodels.regression.linear_model.RegressionResults
Fitted model.
"""
if not standardize:
exog = self.X
else:
if self._all_bool_cols:
return None
else:
exog = self.X_std
endog = self.y
reg = sm.OLS(
endog=endog,
exog=sm.add_constant(exog),
missing='drop'
).fit()
return reg
[docs]
def pretty_print_coefs(self,
standardize: bool = False,
label: bool = True) -> None:
r"""Print formatted regression coefficients for easy copy-pasting.
Parameters
----------
standardize : bool, Default False
Use standardized model.
label : bool, Default True
Include parameter labels.
Raises
------
ValueError
If all columns are boolean and standardize is True.
"""
# Use standardized regression results as necessary
if standardize:
if self._all_bool_cols:
raise ValueError('Standardized LinReg cannot be run when all '
'columns are boolean.')
else:
reg = self.std_reg
else:
reg = self.reg
# Make df of coefs & 95% CIs
coef_df = pd.DataFrame(
data={
'Coef': reg.params,
'95% CI lower': reg.conf_int()[0],
'95% CI upper': reg.conf_int()[1],
}
)
for idx in coef_df.index:
row = coef_df.loc[idx, :]
print_string = (
f'{row['Coef']:.3f} '
f'({row['95% CI lower']:.3f} - {row['95% CI upper']:.3f})'
)
if label:
print_string = f'{idx}: ' + print_string
print(print_string)
[docs]
class LogBinStats(RegressionStats):
r"""Class for log-binomial regression statistics (experimental).
Extends RegressionStats for generalized linear model with binomial family
and log link.
Parameters
----------
X : pd.DataFrame or pd.Series
Independent variables.
y : pd.Series
Dependent variable (binary).
bool_col_names : list[str] or str or None, optional
Boolean columns to exclude from standardization.
Warnings
--------
This class is experimental; use with caution and verify results.
"""
def __init__(self, X, y, bool_col_names: list | str | None = None):
warn_experimental('''
This feature is experimental and has not been fully tested nor
optimized; calculations may be incorrect, and/or errors may occur.
In critical applications, output should be verified for accuracy
and/or LogitStats preferred instead.
''')
super().__init__(X, y, bool_col_names)
def __str__(self):
"""String representation with VIF, summary, and risk ratios.
Returns
-------
str
Formatted results.
"""
if len(self.X.columns) >= 2:
print_string = (
f'{str(self.vif_matrix())}\n'
f'{self.reg.summary2().as_text()}\n'
f'{self.logbin_rr()}\n'
)
else:
print_string = (
f'{self.reg.summary2().as_text()}\n'
f'{self.logbin_rr()}\n'
)
if not self._all_bool_cols:
print_string += (
f'{self.std_reg.summary2().as_text()}\n'
f'{self.logbin_rr(standardize=True)}\n'
)
return print_string
def _run_regression(self, standardize: bool = False):
"""Run log-binomial regression.
Parameters
----------
standardize : bool, optional
Use standardized features. Defaults to False.
Returns
-------
statsmodels.genmod.generalized_linear_model.GLMResults
Fitted model.
"""
if not standardize:
exog = self.X
else:
if self._all_bool_cols:
return None
else:
exog = self.X_std
endog = self.y
logbin = sm.GLM(
endog=endog,
exog=sm.add_constant(exog),
family=sm.families.Binomial(link=sm.families.links.Log()),
missing='drop'
).fit(
start_params=(
[np.log(0.5)]
+ [np.float64(0) for i in range(0, len(self.X.columns))]
),
method='irls',
maxiter=1000
)
return logbin
[docs]
def logbin_rr(self, standardize: bool = False) -> pd.DataFrame:
"""Calculate risk ratios by predictor with 95% confidence intervals.
Parameters
----------
standardize : bool, optional
Use standardized model. Defaults to False.
Returns
-------
pd.DataFrame
Risk ratios with 95% CI.
Raises
------
ValueError
If all columns are boolean and standardize is True.
"""
if standardize:
if not self._all_bool_cols:
model = self.std_reg
else:
raise ValueError('Standardized logit cannot be run when all '
'columns are boolean.')
else:
model = self.reg
output = pd.DataFrame(np.exp(model.params), columns=['RR'])
output[['95% CI lower', '95% CI upper']] = model.conf_int()
return output.map(np.exp)
[docs]
def pretty_print_rr(self,
standardize: bool = False,
label: bool = True) -> None:
"""Print formatted risk ratios w/ 95% CIs for easy copy-pasting.
Parameters
----------
standardize : bool, optional
Use standardized model. Defaults to False.
label : bool, optional
Include parameter labels. Defaults to True.
Raises
------
ValueError
If all columns are boolean and standardize is True.
"""
if standardize:
if self._all_bool_cols:
raise ValueError('Standardized logit cannot be run when all '
'columns are boolean.')
else:
ratios = self.logbin_rr(standardize=True)
else:
ratios = self.logbin_rr()
for idx in ratios.index:
row = ratios.loc[idx, :]
print_string = (
f'OR {row['RR']:.2f} '
f'({row['95% CI lower']:.2f} - {row['95% CI upper']:.2f})'
)
if label:
print_string = f'{idx}: ' + print_string
print(print_string)