Source code for mne_nirs.statistics._statsmodels

# Authors: Robert Luke <mail@robertluke.net>
#
# License: BSD (3-clause)

from io import StringIO

import pandas as pd
import numpy as np


def summary_to_dataframe(summary):
    '''Convert statsmodels summary to pandas dataframe.

    .. warning:: The summary has precision issues, use the numerical values
                 from it with caution.
    '''
    results = summary.tables[1]
    if type(results) is not pd.core.frame.DataFrame:
        results = StringIO(summary.tables[1].as_html())
        results = pd.read_html(results, header=0, index_col=0)[0]
    return results


def expand_summary_dataframe(summary):
    '''Expand dataframe index column in to individual columns'''

    # Determine new columns
    new_cols = summary.index[0].split(':')
    col_names = []
    for col in new_cols:
        col_name = col.split('[')[0]
        summary[col_name] = 'NaN'
        col_names.append(col_name)

    # Fill in values
    if 'Group Var' in summary.index:
        summary = summary[:-1]
    summary = summary.copy(deep=True)
    indices = summary.index
    for row_idx, row in enumerate(indices):
        col_vals = row.split(':')
        for col_idx, col in enumerate(col_names):
            if "]" in col_vals[col_idx]:
                val = col_vals[col_idx].split('[')[1].split(']')[0]
            else:
                val = col
            summary.at[row, col] = val

    summary = summary.copy()  # Copies required to suppress .loc warnings
    sum_copy = summary.copy(deep=True)
    key = 'P>|t|' if 'P>|t|' in summary.columns else 'P>|z|'
    float_p = [float(p) for p in sum_copy[key]]
    summary.loc[:, key] = float_p
    summary.loc[:, "Significant"] = False
    summary.loc[summary[key] < 0.05, 'Significant'] = True

    # Standardise returned column name, it seems to vary per test
    if 'Coef.' in summary.columns:
        summary.loc[:, "Coef."] = [float(c) for c in summary["Coef."]]
    elif 'coef' in summary.columns:
        summary = summary.rename(columns={"coef": "Coef."})

    return summary


_REPLACEMENTS = (
    ('P>|z|', 'pvalues'),
    ('Coef.', 'fe_params'),
    ('z', 'tvalues'),
    ('P>|t|', 'pvalues'),
)


[docs] def statsmodels_to_results(model, order=None): """ Convert statsmodels summary to a dataframe. Parameters ---------- model : statsmodels model output The output of a statsmodels analysis. For example rlm or mixedlm. order : array of str Requested order of the channels. Returns ------- df : Pandas dataframe. Data frame with the results from the stats model. """ from statsmodels.regression.mixed_linear_model import MixedLMResultsWrapper from scipy.stats.distributions import norm df = summary_to_dataframe(model.summary()) # deal with numerical precision loss in at least some of the values for col, attr in _REPLACEMENTS: if col in df.columns: df[col] = getattr(model, attr, df[col]) # This one messes up the standard error and quartiles, too if isinstance(model, MixedLMResultsWrapper): sl = slice(model.k_fe) mu = np.asarray(df.iloc[sl, df.columns == 'Coef.'])[:, 0] # Adapted from statsmodels, see # https://github.com/statsmodels/statsmodels/blob/master/statsmodels/regression/mixed_linear_model.py#L2710-L2736 # noqa: E501 stderr = np.sqrt(np.diag(model.cov_params()[sl])) df.iloc[sl, df.columns == 'Std.Err.'] = stderr # Confidence intervals qm = -norm.ppf(0.05 / 2) df.iloc[sl, df.columns == '[0.025'] = mu - qm * stderr df.iloc[sl, df.columns == '0.975]'] = mu + qm * stderr # All random effects variances and covariances sdf = np.zeros((model.k_re2 + model.k_vc, 2)) jj = 0 for i in range(model.k_re): for j in range(i + 1): sdf[jj, 0] = np.asarray(model.cov_re)[i, j] sdf[jj, 1] = np.sqrt(model.scale) * \ model.bse.iloc[model.k_fe + jj] jj += 1 # Variance components for i in range(model.k_vc): sdf[jj, 0] = model.vcomp[i] sdf[jj, 1] = np.sqrt(model.scale) * model.bse[model.k_fe + jj] jj += 1 df.iloc[model.k_fe:, df.columns == 'Coef.'] = sdf[:, 0] df.iloc[model.k_fe:, df.columns == 'Std.Err.'] = sdf[:, 1] df = expand_summary_dataframe(df) if order is not None: df['old_index'] = df.index df = df.set_index('ch_name') df = df.loc[order, :] df['ch_name'] = df.index df.index = df['old_index'] df.drop(columns='old_index', inplace=True) df.rename_axis(None, inplace=True) return df