Source code for pd_utils.filldata

import pandas as pd
import numpy as np
from functools import partial
from itertools import product
from typing import List, Optional

from pd_utils.utils import _to_list_if_str, _to_series_if_str, _to_name_if_series


def fillna_by_groups_and_keep_one_per_group(
    df, byvars, exclude_cols=None, str_vars="first", num_vars="mean"
):
    """
    Fills missing values by group, with different handling for string variables
    versus numeric, then keeps one observation per group.

    WARNING: do not use if index is important, it will be dropped
    """
    byvars = _to_list_if_str(byvars)
    if exclude_cols:
        exclude_cols = _to_list_if_str(exclude_cols)

    df = fillna_by_groups(
        df, byvars, exclude_cols=exclude_cols, str_vars=str_vars, num_vars=num_vars
    )
    _drop_duplicates(df, byvars)

    return df

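
# --- Illustrative usage sketch (editorial addition, not part of the original module) ---
# A minimal, hypothetical example of fillna_by_groups_and_keep_one_per_group:
# each "id" group is filled and then collapsed to one row. The toy frame and
# the _example_* helper name are assumptions for illustration only.
def _example_fillna_by_groups_and_keep_one_per_group():
    raw = pd.DataFrame(
        {
            "id": ["a", "a", "b", "b"],
            "label": ["x", np.nan, np.nan, "y"],
            "value": [1.0, np.nan, 2.0, 4.0],
        }
    )
    # Expected: one row per id, ("a", "x", 1.0) and ("b", "y", 2.0)
    return fillna_by_groups_and_keep_one_per_group(raw, byvars="id")
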
def fillna_by_groups(df, byvars, exclude_cols=None, str_vars="first", num_vars="mean"):
    """
    Fills missing values by group, with different handling for string variables
    versus numeric

    WARNING: do not use if index is important, it will be dropped
    """
    byvars = _to_list_if_str(byvars)
    if exclude_cols:
        cols_to_fill = [
            col
            for col in df.columns
            if (col not in exclude_cols) and (col not in byvars)
        ]
        concat_vars = byvars + exclude_cols
    else:
        cols_to_fill = [col for col in df.columns if col not in byvars]
        concat_vars = byvars

    _fill_data = partial(_fill_data_for_series, str_vars=str_vars, num_vars=num_vars)

    out_dfs = []
    for group, group_df in df[byvars + cols_to_fill].groupby(byvars, as_index=False):
        out_dfs.append(group_df.apply(_fill_data, axis=0))
    filled = pd.concat(out_dfs, axis=0).reset_index(drop=True)

    filled = _restore_nans_after_fill(
        filled
    )  # _fill_data places -999.999 in place of nans, now convert back

    return filled

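
# --- Illustrative usage sketch (editorial addition, not part of the original module) ---
# A hypothetical example of fillna_by_groups with the default handling: string
# columns take the group's first non-missing value, numeric columns take the
# group mean. The toy frame and helper name are illustrative only.
def _example_fillna_by_groups():
    raw = pd.DataFrame(
        {
            "id": ["a", "a", "a", "b", "b"],
            "label": [np.nan, "x", np.nan, "y", np.nan],
            "value": [1.0, np.nan, 3.0, np.nan, 5.0],
        }
    )
    # Within group "a": label becomes "x" everywhere and the missing value
    # becomes the group mean (1.0 + 3.0) / 2 = 2.0. Within group "b": label
    # becomes "y" and the missing value becomes 5.0.
    return fillna_by_groups(raw, byvars="id")
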
def add_missing_group_rows(
    df,
    group_id_cols: List[str],
    non_group_id_cols: List[str],
    fill_method: Optional[str] = "ffill",
    fill_limit: Optional[int] = None,
):
    """
    Adds rows so that each group has all non group IDs, optionally filling values
    by a pandas fill method

    :param df:
    :param group_id_cols: typically entity ids. these ids represent groups in the
        data. data will not be forward/back filled across differences in these ids.
    :param non_group_id_cols: typically date or time ids. data will be forward/back
        filled across differences in these ids
    :param fill_method: pandas fill method, None to not fill
    :param fill_limit: pandas fill limit
    :return:
    """
    fill_id_cols = group_id_cols + non_group_id_cols
    fill_ids = [df[fill_id_col].unique() for fill_id_col in fill_id_cols]
    index_df = pd.DataFrame([i for i in product(*fill_ids)], columns=fill_id_cols)
    merged = index_df.merge(df, how="left", on=fill_id_cols)

    # Newly created rows will have missing values. Sort and fill
    merged.sort_values(fill_id_cols, inplace=True)

    # TODO [#3]: Update add_missing_group_rows to not fill nans in existing data
    #
    # this method can still fill nans in existing data, not just created rows

    # if fill_method is None, don't call fillna at all, return with NaNs
    if fill_method is not None:
        merged = merged.groupby(group_id_cols, as_index=False).fillna(
            method=fill_method, limit=fill_limit
        )

    return merged

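
# --- Illustrative usage sketch (editorial addition, not part of the original module) ---
# A hypothetical example of add_missing_group_rows on a small panel: entity "b"
# has no row for date 2, so one is created and forward-filled from its date 1
# row. The toy frame and helper name are illustrative only.
def _example_add_missing_group_rows():
    panel = pd.DataFrame(
        {
            "id": ["a", "a", "b"],
            "date": [1, 2, 1],
            "value": [10.0, 11.0, 20.0],
        }
    )
    # The created ("b", 2) row gets value 20.0, filled only within the "b" group
    return add_missing_group_rows(panel, group_id_cols=["id"], non_group_id_cols=["date"])
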
def drop_missing_group_rows(df, fill_id_cols):
    """
    Drops rows in which all columns other than fill_id_cols are missing
    """
    drop_subset = [col for col in df.columns if col not in fill_id_cols]
    return df.dropna(subset=drop_subset, how="all")

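
# --- Illustrative usage sketch (editorial addition, not part of the original module) ---
# A hypothetical example of drop_missing_group_rows: rows that are entirely
# missing outside the id columns (e.g. rows created by add_missing_group_rows
# that were never filled) are dropped. Toy data for illustration only.
def _example_drop_missing_group_rows():
    expanded = pd.DataFrame(
        {"id": ["a", "b"], "date": [1, 1], "value": [10.0, np.nan]}
    )
    # The "b" row has no data outside the id columns, so it is dropped
    return drop_missing_group_rows(expanded, fill_id_cols=["id", "date"])
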
def _fill_data_for_series(series, str_vars="first", num_vars="mean"):
    index = _get_non_nan_value_index(series, str_vars)

    if index is None:
        # All nans, so there is nothing to fill from.
        # But transform ignores nans in the output and then complains when the
        # sizes don't match. So instead, put a placeholder of -999.999
        return pd.Series([-999.999 for i in range(len(series))])

    # handle numeric
    if series.dtype in (np.float64, np.int64):
        if num_vars in ("first", "last"):
            # Overwrite index with that of num vars if not using the same value as for str vars
            if num_vars != str_vars:
                index = _get_non_nan_value_index(series, num_vars)
            return _fill_data_for_str_series(series, non_nan_index=index)
        return _fill_data_for_numeric_series(series, fill_function=num_vars)
    # handle strs
    else:
        return _fill_data_for_str_series(series, non_nan_index=index)


def _fill_data_for_numeric_series(series, fill_function="mean"):
    return series.fillna(series.agg(fill_function))


def _fill_data_for_str_series(series, non_nan_index):
    fill_value = series.loc[non_nan_index]
    return series.fillna(fill_value)


def _get_non_nan_value_index(series, first_or_last):
    if first_or_last == "first":
        return series.first_valid_index()
    elif first_or_last == "last":
        return series.last_valid_index()
    else:
        raise ValueError("Did not pass 'first' or 'last'")


def _restore_nans_after_fill(df):
    """
    -999.999 was used as a missing value representation as pandas cannot handle
    nans in transform. Convert back to nan now
    """
    return df.applymap(lambda x: np.nan if x == -999.999 else x)


def _drop_duplicates(df, byvars):
    """
    Note: inplace
    """
    df.drop_duplicates(subset=byvars, inplace=True)

def fill_excluded_rows(df, byvars, fillvars=None, **fillna_kwargs):
    """
    Takes a dataframe which does not contain all possible combinations of byvars as rows.
    Creates those rows. If fillna_kwargs are passed, calls fillna using fillna_kwargs
    for fillvars

    :param df:
    :param byvars: variables on which dataset should be expanded to product. Can pass a
        str, list of strs, or a list of pd.Series.
    :param fillvars: optional variables to apply fillna to
    :param fillna_kwargs: See pandas.DataFrame.fillna for kwargs, value=0 is common
    :return:

    :Example:

    An example::

        df:

                     date      id  var
            0  2003-06-09  42223C    1
            1  2003-06-10  09255G    2

        with fill_excluded_rows(df, byvars=['date', 'id'], fillvars='var', value=0) becomes:

                     date      id  var
            0  2003-06-09  42223C    1
            1  2003-06-10  42223C    0
            2  2003-06-09  09255G    0
            3  2003-06-10  09255G    2
    """
    byvars, fillvars = [
        _to_list_if_str(v) for v in [byvars, fillvars]
    ]  # convert to lists
    multiindex = [_to_series_if_str(df, i).dropna().unique() for i in byvars]
    byvars = [_to_name_if_series(i) for i in byvars]  # get name of any series

    all_df = pd.DataFrame(index=pd.MultiIndex.from_product(multiindex)).reset_index()
    all_df.columns = byvars
    merged = all_df.merge(df, how="left", on=byvars)

    if fillna_kwargs:
        fillna_kwargs.update({"inplace": False})
        merged[fillvars] = merged[fillvars].fillna(**fillna_kwargs)

    return merged

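
# --- Illustrative usage sketch (editorial addition, not part of the original module) ---
# A runnable version of the docstring example for fill_excluded_rows: the frame
# is expanded to every date/id combination and the created "var" cells are
# filled with 0. The _example_* helper name is illustrative only.
def _example_fill_excluded_rows():
    df = pd.DataFrame(
        {
            "date": ["2003-06-09", "2003-06-10"],
            "id": ["42223C", "09255G"],
            "var": [1, 2],
        }
    )
    return fill_excluded_rows(df, byvars=["date", "id"], fillvars="var", value=0)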