Source code for regtools.summarize

from typing import Sequence, Optional

from regtools.ext_statsmodels import summary_col
import pandas as pd

from regtools.fe.output import add_fixed_effects_rows
from regtools.fe.tools import extract_all_dummy_cols_from_dummy_cols_dict_list, \
    extract_all_fe_names_from_dummy_cols_dict_list
from regtools.controls import suppress_controls_in_summary_df
from regtools.cluster.output import add_cluster_rows
from regtools.summarize.split import get_var_df_and_non_var_df
from regtools.summarize.yesno import col_boolean_dict_from_list_of_lists_of_columns
from regtools.summarize.tstat import replace_stderr_with_t_stat_in_summary_df


[docs]def produce_summary(reg_list, stderr: bool = False, t_stats: bool = False, float_format: str = '%0.1f',
                    regressor_order: Sequence[str] = tuple(),
                    suppress_other_regressors: bool = False, model_names: Optional[Sequence[str]] = None):
    """
    Produce a summary from a list of regression results

    :param reg_list: list of statsmodels regression results
    :param stderr: set to True to keep rows for standard errors below coefficient estimates
    :param t_stats: set to True to keep rows for standard errors below coefficient estimates and convert them to t-stats
    :param float_format: format string for how to format results in summary
    :param regressor_order: sequence of column names to put first in the regression results
    :param suppress_other_regressors: True for when using regressor_order to suppress coefficients
        that are not in regressor_order into "Controls: Yes". False to keep coefficients
    :param model_names: If a collection is passed, will be used as column names in summary table.
    :return: a regression summary
    :rtype:
    """

    if isinstance(regressor_order, tuple):
        regressor_order = list(regressor_order)

    _check_produce_summary_inputs(
        regressor_order,
        suppress_other_regressors,
        model_names,
        len(reg_list),
        stderr,
        t_stats
    )

    info_dict = {'N': lambda x: "{0:d}".format(int(x.nobs))}

    # Grab proper r-squared. For OLS, it's adjusted r-squared, for probit and logit, it's Pseudo r-squared
    if _result_has_adjusted_r2(reg_list[0]):
        info_dict.update({
            'Adj-R2': lambda x: "{:.2f}".format(x.rsquared_adj)
        })
    elif _result_has_pseudo_r2(reg_list[0]):
        info_dict.update({
            'Pseudo-R2': lambda x: "{:.2f}".format(x.prsquared)
        })

    summ = summary_col(reg_list, stars=True, float_format=float_format,
                       regressor_order=regressor_order,
                       info_dict=info_dict)
    split_rows = [var for var in info_dict]

    # Convert stderrs to t-stats if necessary
    if t_stats:
        summ.tables[0] = replace_stderr_with_t_stat_in_summary_df(summ.tables[0], split_rows, reg_list)

    # Handle fe - remove individual fe cols and replace with e.g. Industry Fixed Effects No, Yes, Yes
    dummy_col_dicts = [result.dummy_cols_dict for result in reg_list]
    if any([dummy_col_dict is not None for dummy_col_dict in dummy_col_dicts]): #if fixed effects
        _remove_fe_cols_replace_with_fixed_effect_yes_no_lines(summ, dummy_col_dicts, split_rows)

    # Handle dropping of unimportant coefficients and replacing with Controls: Yes or No
    if suppress_other_regressors:
        summ.tables[0] = suppress_controls_in_summary_df(summ.tables[0], regressor_order, dummy_col_dicts,
                                                         info_dict)

    # Add Yes and No for each cluster variable
    _add_cluster_yes_no_lines(summ, reg_list, split_rows)

    if not stderr and not t_stats:
        summ.tables[0].drop('', axis=0, inplace=True)  # drops the rows containing standard errors

    # Change const to Intercept in output
    summ.tables[0].index = [col if col != 'const' else 'Intercept' for col in summ.tables[0].index]

    if model_names:
        summ.tables[0].columns = model_names

    return summ


def _add_cluster_yes_no_lines(summ, reg_list, split_rows):
    cluster_list_of_lists = _get_cluster_list_of_lists(reg_list)
    if not any([cluster is not None for cluster in cluster_list_of_lists]):
        return

    cluster_col_boolean_dict = col_boolean_dict_from_list_of_lists_of_columns(cluster_list_of_lists)
    var_df, split_df = get_var_df_and_non_var_df(summ.tables[0], split_rows=split_rows)

    var_df = add_cluster_rows(var_df, cluster_col_boolean_dict)

    # Recombine with n, R^2, etc. and
    summ.tables[0] = pd.concat([var_df, split_df], axis=0)

def _get_cluster_list_of_lists(reg_list):
    cluster_list_of_lists = []
    for result in reg_list:
        if hasattr(result, 'cluster_variables'):
            cluster_list_of_lists.append(result.cluster_variables)
        else:
            cluster_list_of_lists.append(None)
    return cluster_list_of_lists


def _remove_fe_cols_replace_with_fixed_effect_yes_no_lines(summ, dummy_col_dicts, split_rows):
    """
    Note: inplace
    """
    # split into dataframe of variables and dataframe of N, R^2, etc.
    var_df, split_df = get_var_df_and_non_var_df(summ.tables[0], split_rows=split_rows)
    # get name of all fixed effect variables
    all_cols_to_remove = extract_all_dummy_cols_from_dummy_cols_dict_list(dummy_col_dicts)
    # remove fixed effect coefs and stderrs
    var_df = _drop_variables_from_reg_summary_df(var_df, all_cols_to_remove)

    # construct a single dict where the keys are names of fixed effects and the values are lists of booleans for
    # whether the fixed effect was used
    fe_dict = _multiple_model_fe_dict_from_dummy_col_dict_list(dummy_col_dicts)

    # Add yes no row
    var_df = add_fixed_effects_rows(var_df, fe_dict)

    # Recombine with n, R^2, etc. and
    summ.tables[0] = pd.concat([var_df, split_df], axis=0)

def _extract_result_list_and_dummy_dicts(result_sets):
    plain_results = []
    dummy_dicts = []
    for ambiguous_result in result_sets:
        # This is the case where fe has been passed, and we have a dummy_col_dict
        if isinstance(ambiguous_result, tuple):
            plain_results.append(ambiguous_result[0])
            dummy_dicts.append(ambiguous_result[1])
        # No fe passed, just plain result
        else:
            plain_results.append(ambiguous_result)
            dummy_dicts.append(None)  # keep order by appending None

    return plain_results, dummy_dicts


def _multiple_model_fe_dict_from_dummy_col_dict_list(dummy_col_dict_list):
    fixed_effect_rows = extract_all_fe_names_from_dummy_cols_dict_list(dummy_col_dict_list)
    out_dict = {fe_name: [] for fe_name in fixed_effect_rows}
    for dummy_col_dict in dummy_col_dict_list:
        for fe_name in fixed_effect_rows:
            if (not dummy_col_dict) or (fe_name not in dummy_col_dict):
                out_dict[fe_name].append(False)
            else:
                out_dict[fe_name].append(True)

    return out_dict


def _drop_variables_from_reg_summary_df(df, dropvars):
    # Find variables to be kept
    keepvars = [var for var in df.index if var not in dropvars and var != '']

    # Create column identifying row as an estimate or standard error
    df['type'] = ['estimate', 'stderr'] * int(len(df.index) / 2)

    # Create column identifying variable name of row (no spaces)
    df['regressor'] = [i for sublist in [[j] * 2 for j in df.index[0::2]] for i in sublist]

    # Create a column of the original location of the row (will be sorting index, need to get original
    # order back later)
    df['idx'] = [i for i in range(len(df))]

    # Create the multi-index for slicing
    df = df.set_index(['regressor', 'type'])
    df = df.sort_index()

    # Slice on the chosen regressors, reset the index to delete a column later
    df = df.loc[keepvars].reset_index()
    df = df.sort_values('idx')

    # Set value of index back to original - which had blanks for stderrs
    df.loc[df['type'] == 'stderr', 'regressor'] = ''

    # Delete the type column
    df.drop(['type', 'idx'], axis=1, inplace=True)

    # Reindex the dataframe on the regressor
    df = df.set_index(['regressor'])

    # Get rid of name on index
    df.index.name = None

    return df


def _check_produce_summary_inputs(regressor_order, supress_other_regressors, model_names, num_models,
                                  stderr: bool, t_stats: bool):
    if (regressor_order == []) & (supress_other_regressors):
        raise ValueError('must pass regressors to regressor_order to suppress other regressors')

    if model_names and (len(model_names) != num_models):
        raise ValueError(f'must pass model_names of equal length to num models. Have {len(model_names)} names and {num_models} models.')

    if stderr and t_stats:
        raise ValueError(f'cannot pass both stderr and t stats, pick one of the two or neither')

def _result_has_adjusted_r2(result):
    return hasattr(result, 'rsquared_adj')

def _result_has_pseudo_r2(result):
    return hasattr(result, 'prsquared')