Source code for regtools.ext_statsmodels

import numpy as np
import pandas as pd

from scipy import stats

from statsmodels.iolib.summary2 import summary_params
from statsmodels.iolib.table import SimpleTable
from statsmodels.iolib.tableformatting import fmt_params

from statsmodels.iolib.summary2 import (
    lrange,
    lzip,
    reduce,
    _col_params,
    _make_unique,
    _col_info,
    Summary,
)


[docs]def summary_col( results, float_format="%.4f", model_names=[], stars=False, info_dict=None, regressor_order=[], ): """ Summarize multiple results instances side-by-side (coefs and SEs) Parameters ---------- results : statsmodels results instance or list of result instances float_format : string float format for coefficients and standard errors Default : '%.4f' model_names : list of strings of length len(results) if the names are not unique, a roman number will be appended to all model names stars : bool print significance stars info_dict : dict dict of lambda functions to be applied to results instances to retrieve model info. To use specific information for different models, add a (nested) info_dict with model name as the key. Example: `info_dict = {"N":..., "R2": ..., "OLS":{"R2":...}}` would only show `R2` for OLS regression models, but additionally `N` for all other results. Default : None (use the info_dict specified in result.default_model_infos, if this property exists) regressor_order : list of strings list of names of the regressors in the desired order. All regressors not specified will be appended to the end of the list. """ # TODO [#1]: replace summary_col with an import from statsmodels # # the purpose of recreating the summary_col function is becuase of issue # #3767 in statsmodels: # https://github.com/statsmodels/statsmodels/issues/3767 # Which can cause coefficients to become mismatched when using regressor_order. # I have patched the issue by replacing np.unique with pd.unique. if not isinstance(results, list): results = [results] cols = [_col_params(x, stars=stars, float_format=float_format) for x in results] # Unique column names (pandas has problems merging otherwise) if model_names: colnames = _make_unique(model_names) else: colnames = _make_unique([x.columns[0] for x in cols]) for i in range(len(cols)): cols[i].columns = [colnames[i]] merg = lambda x, y: x.merge(y, how="outer", right_index=True, left_index=True) summ = reduce(merg, cols) if regressor_order: varnames = summ.index.get_level_values(0).tolist() ordered = [x for x in regressor_order if x in varnames] unordered = [x for x in varnames if x not in regressor_order + [""]] order = ordered + list(np.unique(unordered)) f = lambda idx: sum([[x + "coef", x + "stde"] for x in idx], []) summ.index = f(pd.unique(varnames)) summ = summ.reindex(f(order)) summ.index = [x[:-4] for x in summ.index] idx = pd.Series(lrange(summ.shape[0])) % 2 == 1 summ.index = np.where(idx, "", summ.index.get_level_values(0)) # add infos about the models. if info_dict: cols = [ _col_info(x, info_dict.get(x.model.__class__.__name__, info_dict)) for x in results ] else: cols = [_col_info(x, getattr(x, "default_model_infos", None)) for x in results] # use unique column names, otherwise the merge will not succeed for df, name in zip(cols, _make_unique([df.columns[0] for df in cols])): df.columns = [name] merg = lambda x, y: x.merge(y, how="outer", right_index=True, left_index=True) info = reduce(merg, cols) dat = pd.DataFrame(np.vstack([summ, info])) # pd.concat better, but error dat.columns = summ.columns dat.index = pd.Index(summ.index.tolist() + info.index.tolist()) summ = dat summ = summ.fillna("") smry = Summary() smry.add_df(summ, header=True, align="l") smry.add_text("Standard errors in parentheses.") if stars: smry.add_text("* p<.1, ** p<.05, ***p<.01") return smry
[docs]def update_statsmodel_result_with_new_cov_matrix(result, cov_matrix: pd.DataFrame): """ Note: inplace Statsmodels results have caching going on. Need to update all the properties which depend on the covariance matrix """ result.cov_params = lambda: cov_matrix result.bse = pd.Series( np.sqrt(np.diag(result.cov_params())), index=result.model.exog_names ) result.tvalues = result.params / result.bse if result.use_t: df_resid = getattr(result, "df_resid_inference", result.df_resid) result.pvalues = stats.t.sf(np.abs(result.tvalues), df_resid) * 2 else: result.pvalues = stats.norm.sf(np.abs(result.tvalues)) * 2 _update_statsmodel_result_summary_after_cov_matrix_changed(result)
def _update_statsmodel_result_summary_after_cov_matrix_changed(result): """ Note: inplace """ # Create new param/stderr section of summary new_param_stderr = summary_params(result) new_table = SimpleTable( new_param_stderr.values, headers=list(new_param_stderr.columns), stubs=list(new_param_stderr.index), txt_fmt=fmt_params, ) # Create summary object with param/stderr table replaced summ = result.summary() summ.tables[1] = new_table # Assign summary method of result to return this summary object result.summary = lambda: summ # Repeat steps for summary2, which only requires df and not SimpleTable summ2 = result.summary2() summ2.tables[1] = new_param_stderr result.summary2 = lambda: summ2