Source code for unistat.regression

from abc import ABC, abstractmethod
import numpy as np
import pandas as pd
import patsy
from scipy import stats
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from .exceptions import warn_experimental



[docs]
class RegressionStats(ABC):
    r"""Abstract base class for regression statistics.

    Provides common functionality for regression models, including data
    preparation, standardization, VIF calculation, and properties for
    regression results.

    Parameters
    ----------
    X : pd.DataFrame or pd.Series
        Independent variables (features).
    y : pd.Series
        Dependent variable (target).
    bool_col_names : list[str] or str or None, optional
        Names of boolean columns in X to exclude from standardization.

    Attributes
    ----------
    bool_cols : list[str] or None
        List of boolean column names.
    X : pd.DataFrame
        Feature DataFrame. `NaN` values are removed and all columns are
        converted to `float64`.
    y : pd.DataFrame
        Target DataFrame. `NaN` values are removed and all columns are converted
        to `float64`.
    reg : statsmodels regression result
        Fitted regression model.
    X_std : pd.DataFrame
        `X`, with all non-Boolean columns transformed to Z-scores.
    std_reg : statsmodels regression result
        Fitted standardized regression model.

    Notes
    -----
    Observations with any missing data in either ``X`` *or* ``y`` are dropped.
    """

    def __init__(self, X, y, bool_col_names: list | str | None = None):
        self._df = self._concat_xy(X, y)
        self.bool_cols = bool_col_names
        self.X = (
            self._df.drop(columns=[y.name])
            .apply(pd.to_numeric, errors='coerce')
            .astype('float64')
        )
        self.y = (
            self._df[[y.name]]
            .apply(pd.to_numeric, errors='coerce')
            .astype('float64')
        )
        self.reg = None
        self.std_reg = None

    def __str__(self):
        """
        String representation of the regression results.

        Returns
        -------
        str
            Formatted string with VIF (if applicable) and regression summaries.
        """
        if len(self.X.columns) >= 2:
            print_string = (f'{str(self.vif_matrix())}\n'
                            f'{self.reg.summary2().as_text()}\n')
        else:
            print_string = f'{self.reg.summary2().as_text()}\n'

        if not self._all_bool_cols:
            print_string += f'{self.std_reg.summary2().as_text()}\n'

        return print_string

    @property
    def _all_bool_cols(self):
        """Check if all columns in X are boolean.

        Returns
        -------
        bool
            True if all columns are boolean, False otherwise.
        """
        if self.bool_cols is None:
            return False
        else:
            return len(self.bool_cols) == self.X.shape[1]

    @property
    def bool_cols(self):
        """Get the list of boolean column names.

        Returns
        -------
        list[str] or None
            List of boolean columns.
        """
        return self._bool_cols

    @bool_cols.setter
    def bool_cols(self, bool_col_names):
        if isinstance(bool_col_names, str):
            self._bool_cols = [bool_col_names]
        elif isinstance(bool_col_names, list):
            self._bool_cols = bool_col_names
        elif bool_col_names is None:
            self._bool_cols = None
        else:
            raise ValueError('bool_col_names must be a str or list of str')

    @property
    def X_std(self):
        """Standardized version of ``X`` (Z-scores), excluding boolean columns.

        Returns
        -------
        pd.DataFrame
            Standardized features.
        """
        # Create a standardized version of feature column (z-score)
        if self.bool_cols is not None:
            X_num = self.X[[col for col in self.X.columns
                            if col not in self.bool_cols]]
            X_num_std = X_num.apply(stats.zscore, axis='index', ddof=1,
                                    nan_policy='omit')
            X_num_std.columns = [f'{col}_std' for col in X_num_std.columns]
            X_std = pd.concat(
                [
                    X_num_std,
                    self.X[self.bool_cols]
                ],
                axis='columns'
            )
        else:
            X_std = self.X.apply(stats.zscore, axis='index', ddof=1,
                                 nan_policy='omit')
            X_std.columns = [f'{col}_std' for col in X_std.columns]
        return X_std

    @abstractmethod
    def _run_regression(self, standardize: bool = False):
        """Abstract method to run the regression model.

        Parameters
        ----------
        standardize : bool, optional
            Whether to use standardized features. Defaults to False.

        Returns
        -------
        statsmodels regression result
            Fitted model.
        """
        if not standardize:
            exog = self.X
        else:
            exog = self.X_std
        endog = self.y

        reg = sm.OLS(
            endog=endog,
            exog=sm.add_constant(exog),
            missing='drop'
        ).fit()

        return reg

    @property
    def reg(self):
        """Get the fitted regression model.

        Returns
        -------
        statsmodels regression result
            Fitted model.
        """
        return self._reg

    @reg.setter
    def reg(self, result):
        if result is None:
            self._reg = self._run_regression()
        else:
            self._reg = result

    @property
    def std_reg(self):
        """Get the fitted standardized regression model.

        Returns
        -------
        statsmodels regression result
            Fitted standardized model.
        """
        return self._std_reg

    @std_reg.setter
    def std_reg(self, result):
        if result is None:
            self._std_reg = self._run_regression(standardize=True)
        else:
            self._std_reg = result


[docs]
    def vif_matrix(self):
        """Calculate variance inflation factors (VIF) for feature DataFrame.

        Returns
        -------
        pd.Series
            VIF values for each feature.

        Raises
        ------
        ValueError
            If fewer than 2 columns in X.
        """
        if len(self.X.columns) < 2:
            raise ValueError('VIF requires at least 2 columns')
        X = self.X.dropna().astype(float)

        vif_matrix = pd.Series(
            [variance_inflation_factor(X.values, i)
             for i in range(X.shape[1])],
            name='vif',
            index=X.columns,
        )

        return vif_matrix


    @staticmethod
    def _concat_xy(X, y):
        """Concatenate X and y, handle data types, drop NaNs.

        Parameters
        ----------
        X : pd.DataFrame or pd.Series
            Features.
        y : pd.Series
            Target.

        Returns
        -------
        pd.DataFrame
            Processed DataFrame.
        """
        df = pd.concat([X, y], axis='columns').dropna(axis='index')
        for col in df.columns:
            if df[col].dtype == pd.BooleanDtype():
                df[col] = df[col].astype(int)
            elif df[col].dtype == bool:
                df[col] = df[col].astype(int)
            elif df[col].dtype == pd.Int64Dtype():
                df[col] = df[col].astype(int)
        return df




[docs]
class LogitStats(RegressionStats):
    r"""Class for logistic regression statistics.

    Extends RegressionStats for logistic regression using Logit model.

    Parameters
    ----------
    X : pd.DataFrame or pd.Series
        Predictor observations.
    y : pd.Series
        Binary outcome observations.
    bool_col_names : list[str] or str or None, optional
        String names of Boolean columns to exclude from standardization.
    """
    def __init__(self, X, y, bool_col_names: list | str | None = None):
        super().__init__(X, y, bool_col_names)

    def __str__(self):
        """String representation with VIF, summary, and odds ratios.

        Returns
        -------
        str
            Formatted results.
        """
        if len(self.X.columns) >= 2:
            print_string = (
                f'{str(self.vif_matrix())}\n'
                f'{self.reg.summary2().as_text()}\n'
                f'{self.logit_or()}\n'
            )
        else:
            print_string = (
                f'{self.reg.summary2().as_text()}\n'
                f'{self.logit_or()}\n'
            )

        if not self._all_bool_cols:
            print_string += (
                f'{self.std_reg.summary2().as_text()}\n'
                f'{self.logit_or(standardize=True)}\n'
            )

        return print_string

    def _run_regression(self, standardize: bool = False):
        """Fit logistic regression.

        Parameters
        ----------
        standardize : bool, optional
            Use standardized features. Defaults to False.

        Returns
        -------
        statsmodels.discrete.discrete_model.LogitResults
            Fitted model.
        """
        if not standardize:
            exog = self.X
        else:
            if self._all_bool_cols:
                return None
            else:
                exog = self.X_std
        endog = self.y

        logit = sm.Logit(
            endog=endog,
            exog=sm.add_constant(exog),
            missing='drop'
        ).fit()

        return logit


[docs]
    def logit_or(self, standardize: bool = False) -> pd.DataFrame:
        """Calculate odds ratios by predictor, with 95% confidence intervals.

        Parameters
        ----------
        standardize : bool, optional
            Use standardized model. Defaults to False.

        Returns
        -------
        pd.DataFrame
            Odds ratios with 95% CI.

        Raises
        ------
        ValueError
            If all columns are boolean and standardize is True.
        """
        if standardize:
            if not self._all_bool_cols:
                model = self.std_reg
            else:
                raise ValueError('Standardized logit cannot be run when all '
                                 'columns are boolean.')
        else:
            model = self.reg

        output = pd.DataFrame(model.params, columns=['OR'])
        output[['95% CI lower', '95% CI upper']] = model.conf_int()
        return output.map(np.exp)



[docs]
    def pretty_print_or(self,
                        standardize: bool = False,
                        label: bool = True) -> None:
        """Print formatted ORs & 95% CIs for easy copy-pasting.

        Parameters
        ----------
        standardize : bool, optional
            Use standardized model. Defaults to False.
        label : bool, optional
            Include parameter labels. Defaults to True.

        Raises
        ------
        ValueError
            If all columns are boolean and standardize is True.
        """
        if standardize:
            if self._all_bool_cols:
                raise ValueError('Standardized logit cannot be run when all '
                                 'columns are boolean.')
            else:
                ratios = self.logit_or(standardize=True)
        else:
            ratios = self.logit_or()

        for idx in ratios.index:
            row = ratios.loc[idx, :]

            print_string = (
                f'OR {row['OR']:.2f} '
                f'({row['95% CI lower']:.2f} - {row['95% CI upper']:.2f})'
            )
            if label:
                print_string = f'{idx}: ' + print_string

            print(print_string)





[docs]
class LinRegStats(RegressionStats):
    r"""Class for linear regression statistics.

    Extends RegressionStats for ordinary least squares (OLS) regression.

    Parameters
    ----------
    X : pd.DataFrame or pd.Series
        Predictor observations.
    y : pd.Series
        Numeric outcome observations.
    bool_col_names : list[str] or str or None, optional
        Boolean columns to exclude from standardization.

    Notes
    -----
    ``unistat`` does NOT standardize the values of ``y`` for linear regression.
    Typically, "standardized regression" refers to a transformation of
    :math:`y \sim X` such that
    :math:`\text{SD}\left( y \right) \sim \text{SD}\left( X \right)`; a
    coefficient :math:`\beta` is thus interpreted as a 1-S.D. increase in
    :math:`X` conferring a :math:`\beta` S.D. increase in :math:`y`. We find
    this to be difficult to interpret, with no benefit beyond adherence to
    convention.

    Instead, ``unistat`` opts for "X-standardized regression". That is, since
    only :math:`X` is Z-scored, :math:`y \sim X` is transformed such that
    :math:`y \sim \text{SD}\left( X \right)`. Here, a coefficient :math:`\beta`
    is interpreted as a 1-S.D. increase in :math:`X` conferring an absolute
    increase of :math:`\beta` units in :math:`y`. This is more easily
    interpretable, while still allowing comparison of the relative strengths of
    all :math:`X` predictors.
    """

    def __init__(self, X, y, bool_col_names: list | str | None = None):
        super().__init__(X, y, bool_col_names)

    def _run_regression(self, standardize: bool = False):
        """Run OLS linear regression.

        Parameters
        ----------
        standardize : bool, optional
            Use standardized features. Defaults to False.

        Returns
        -------
        statsmodels.regression.linear_model.RegressionResults
            Fitted model.
        """
        if not standardize:
            exog = self.X
        else:
            if self._all_bool_cols:
                return None
            else:
                exog = self.X_std
        endog = self.y

        reg = sm.OLS(
            endog=endog,
            exog=sm.add_constant(exog),
            missing='drop'
        ).fit()

        return reg


[docs]
    def pretty_print_coefs(self,
                           standardize: bool = False,
                           label: bool = True) -> None:
        r"""Print formatted regression coefficients for easy copy-pasting.

        Parameters
        ----------
        standardize : bool, Default False
            Use standardized model.
        label : bool, Default True
            Include parameter labels.

        Raises
        ------
        ValueError
            If all columns are boolean and standardize is True.
        """
        # Use standardized regression results as necessary
        if standardize:
            if self._all_bool_cols:
                raise ValueError('Standardized LinReg cannot be run when all '
                                 'columns are boolean.')
            else:
                reg = self.std_reg
        else:
            reg = self.reg

        # Make df of coefs & 95% CIs
        coef_df = pd.DataFrame(
            data={
                'Coef': reg.params,
                '95% CI lower': reg.conf_int()[0],
                '95% CI upper': reg.conf_int()[1],
            }
        )

        for idx in coef_df.index:
            row = coef_df.loc[idx, :]

            print_string = (
                f'{row['Coef']:.3f} '
                f'({row['95% CI lower']:.3f} - {row['95% CI upper']:.3f})'
            )
            if label:
                print_string = f'{idx}: ' + print_string

            print(print_string)





[docs]
class LogBinStats(RegressionStats):
    r"""Class for log-binomial regression statistics (experimental).

    Extends RegressionStats for generalized linear model with binomial family
    and log link.

    Parameters
    ----------
    X : pd.DataFrame or pd.Series
        Independent variables.
    y : pd.Series
        Dependent variable (binary).
    bool_col_names : list[str] or str or None, optional
        Boolean columns to exclude from standardization.

    Warnings
    --------
    This class is experimental; use with caution and verify results.
    """

    def __init__(self, X, y, bool_col_names: list | str | None = None):
        warn_experimental('''
            This feature is experimental and has not been fully tested nor 
            optimized; calculations may be incorrect, and/or errors may occur. 
            In critical applications, output should be verified for accuracy 
            and/or LogitStats preferred instead.
        ''')
        super().__init__(X, y, bool_col_names)

    def __str__(self):
        """String representation with VIF, summary, and risk ratios.

        Returns
        -------
        str
            Formatted results.
        """
        if len(self.X.columns) >= 2:
            print_string = (
                f'{str(self.vif_matrix())}\n'
                f'{self.reg.summary2().as_text()}\n'
                f'{self.logbin_rr()}\n'
            )
        else:
            print_string = (
                f'{self.reg.summary2().as_text()}\n'
                f'{self.logbin_rr()}\n'
            )

        if not self._all_bool_cols:
            print_string += (
                f'{self.std_reg.summary2().as_text()}\n'
                f'{self.logbin_rr(standardize=True)}\n'
            )

        return print_string

    def _run_regression(self, standardize: bool = False):
        """Run log-binomial regression.

        Parameters
        ----------
        standardize : bool, optional
            Use standardized features. Defaults to False.

        Returns
        -------
        statsmodels.genmod.generalized_linear_model.GLMResults
            Fitted model.
        """
        if not standardize:
            exog = self.X
        else:
            if self._all_bool_cols:
                return None
            else:
                exog = self.X_std
        endog = self.y

        logbin = sm.GLM(
            endog=endog,
            exog=sm.add_constant(exog),
            family=sm.families.Binomial(link=sm.families.links.Log()),
            missing='drop'
        ).fit(
            start_params=(
                [np.log(0.5)]
                + [np.float64(0) for i in range(0, len(self.X.columns))]
            ),
            method='irls',
            maxiter=1000
        )

        return logbin


[docs]
    def logbin_rr(self, standardize: bool = False) -> pd.DataFrame:
        """Calculate risk ratios by predictor with 95% confidence intervals.

        Parameters
        ----------
        standardize : bool, optional
            Use standardized model. Defaults to False.

        Returns
        -------
        pd.DataFrame
            Risk ratios with 95% CI.

        Raises
        ------
        ValueError
            If all columns are boolean and standardize is True.
        """
        if standardize:
            if not self._all_bool_cols:
                model = self.std_reg
            else:
                raise ValueError('Standardized logit cannot be run when all '
                                 'columns are boolean.')
        else:
            model = self.reg

        output = pd.DataFrame(np.exp(model.params), columns=['RR'])
        output[['95% CI lower', '95% CI upper']] = model.conf_int()
        return output.map(np.exp)



[docs]
    def pretty_print_rr(self,
                        standardize: bool = False,
                        label: bool = True) -> None:
        """Print formatted risk ratios w/ 95% CIs for easy copy-pasting.

        Parameters
        ----------
        standardize : bool, optional
            Use standardized model. Defaults to False.
        label : bool, optional
            Include parameter labels. Defaults to True.

        Raises
        ------
        ValueError
            If all columns are boolean and standardize is True.
        """
        if standardize:
            if self._all_bool_cols:
                raise ValueError('Standardized logit cannot be run when all '
                                 'columns are boolean.')
            else:
                ratios = self.logbin_rr(standardize=True)
        else:
            ratios = self.logbin_rr()

        for idx in ratios.index:
            row = ratios.loc[idx, :]

            print_string = (
                f'OR {row['RR']:.2f} '
                f'({row['95% CI lower']:.2f} - {row['95% CI upper']:.2f})'
            )
            if label:
                print_string = f'{idx}: ' + print_string

            print(print_string)