Source code for mne_nirs.statistics._statsmodels

# Authors: Robert Luke <mail@robertluke.net>
#
# License: BSD (3-clause)

from io import StringIO

import numpy as np
import pandas as pd


def summary_to_dataframe(summary):
    """Convert statsmodels summary to pandas dataframe.

    .. warning:: The summary has precision issues, use the numerical values
                 from it with caution.
    """
    results = summary.tables[1]
    if type(results) is not pd.core.frame.DataFrame:
        results = StringIO(summary.tables[1].as_html())
        results = pd.read_html(results, header=0, index_col=0)[0]
    return results


def expand_summary_dataframe(summary):
    """Expand dataframe index column in to individual columns."""
    # Determine new columns
    new_cols = summary.index[0].split(":")
    col_names = []
    for col in new_cols:
        col_name = col.split("[")[0]
        summary[col_name] = "NaN"
        col_names.append(col_name)

    # Fill in values
    if "Group Var" in summary.index:
        summary = summary[:-1]
    summary = summary.copy(deep=True)
    indices = summary.index
    for row_idx, row in enumerate(indices):
        col_vals = row.split(":")
        for col_idx, col in enumerate(col_names):
            if "]" in col_vals[col_idx]:
                val = col_vals[col_idx].split("[")[1].split("]")[0]
            else:
                val = col
            summary.at[row, col] = val

    summary = summary.copy()  # Copies required to suppress .loc warnings
    sum_copy = summary.copy(deep=True)
    key = "P>|t|" if "P>|t|" in summary.columns else "P>|z|"
    float_p = [float(p) for p in sum_copy[key]]
    summary.loc[:, key] = float_p
    summary.loc[:, "Significant"] = False
    summary.loc[summary[key] < 0.05, "Significant"] = True

    # Standardise returned column name, it seems to vary per test
    if "Coef." in summary.columns:
        summary.loc[:, "Coef."] = [float(c) for c in summary["Coef."]]
    elif "coef" in summary.columns:
        summary = summary.rename(columns={"coef": "Coef."})

    return summary


_REPLACEMENTS = (
    ("P>|z|", "pvalues"),
    ("Coef.", "fe_params"),
    ("z", "tvalues"),
    ("P>|t|", "pvalues"),
)



[docs]
def statsmodels_to_results(model, order=None):
    """
    Convert statsmodels summary to a dataframe.

    Parameters
    ----------
    model : statsmodels model output
        The output of a statsmodels analysis. For example rlm or mixedlm.
    order : array of str
        Requested order of the channels.

    Returns
    -------
    df : Pandas dataframe.
        Data frame with the results from the stats model.
    """
    from scipy.stats.distributions import norm
    from statsmodels.regression.mixed_linear_model import MixedLMResultsWrapper

    df = summary_to_dataframe(model.summary())
    # deal with numerical precision loss in at least some of the values
    for col, attr in _REPLACEMENTS:
        if col in df.columns:
            df[col] = getattr(model, attr, df[col])

    # This one messes up the standard error and quartiles, too
    if isinstance(model, MixedLMResultsWrapper):
        sl = slice(model.k_fe)
        mu = np.asarray(df.iloc[sl, df.columns == "Coef."])[:, 0]
        # Adapted from statsmodels, see
        # https://github.com/statsmodels/statsmodels/blob/master/statsmodels/regression/mixed_linear_model.py#L2710-L2736  # noqa: E501
        stderr = np.sqrt(np.diag(model.cov_params()[sl]))
        df.iloc[sl, df.columns == "Std.Err."] = stderr
        # Confidence intervals
        qm = -norm.ppf(0.05 / 2)
        df.iloc[sl, df.columns == "[0.025"] = mu - qm * stderr
        df.iloc[sl, df.columns == "0.975]"] = mu + qm * stderr
        # All random effects variances and covariances
        sdf = np.zeros((model.k_re2 + model.k_vc, 2))
        jj = 0
        for i in range(model.k_re):
            for j in range(i + 1):
                sdf[jj, 0] = np.asarray(model.cov_re)[i, j]
                sdf[jj, 1] = np.sqrt(model.scale) * model.bse.iloc[model.k_fe + jj]
                jj += 1

        # Variance components
        for i in range(model.k_vc):
            sdf[jj, 0] = model.vcomp[i]
            sdf[jj, 1] = np.sqrt(model.scale) * model.bse[model.k_fe + jj]
            jj += 1

        df.iloc[model.k_fe :, df.columns == "Coef."] = sdf[:, 0]
        df.iloc[model.k_fe :, df.columns == "Std.Err."] = sdf[:, 1]

    df = expand_summary_dataframe(df)

    if order is not None:
        df["old_index"] = df.index
        df = df.set_index("ch_name")
        df = df.loc[order, :]
        df["ch_name"] = df.index
        df.index = df["old_index"]
        df.drop(columns="old_index", inplace=True)
        df.rename_axis(None, inplace=True)

    return df