Source code for pd_utils.filldata

import pandas as pd
import numpy as np
from functools import partial
from itertools import product
from typing import List, Optional

from pd_utils.utils import _to_list_if_str, _to_series_if_str, _to_name_if_series


[docs]def fillna_by_groups_and_keep_one_per_group(
    df, byvars, exclude_cols=None, str_vars="first", num_vars="mean"
):
    """
    Fills missing values by group, with different handling for string variables versus numeric,
    then keeps one observation per group.

    WARNING: do not use if index is important, it will be dropped
    """
    byvars = _to_list_if_str(byvars)
    if exclude_cols:
        exclude_cols = _to_list_if_str(exclude_cols)

    df = fillna_by_groups(
        df, byvars, exclude_cols=exclude_cols, str_vars=str_vars, num_vars=num_vars
    )
    _drop_duplicates(df, byvars)

    return df


[docs]def fillna_by_groups(df, byvars, exclude_cols=None, str_vars="first", num_vars="mean"):
    """
    Fills missing values by group, with different handling for string variables versus numeric

    WARNING: do not use if index is important, it will be dropped
    """
    byvars = _to_list_if_str(byvars)

    if exclude_cols:
        cols_to_fill = [
            col
            for col in df.columns
            if (col not in exclude_cols) and (col not in byvars)
        ]
        concat_vars = byvars + exclude_cols
    else:
        cols_to_fill = [col for col in df.columns if col not in byvars]
        concat_vars = byvars

    _fill_data = partial(_fill_data_for_series, str_vars=str_vars, num_vars=num_vars)

    out_dfs = []
    for group, group_df in df[byvars + cols_to_fill].groupby(byvars, as_index=False):
        out_dfs.append(group_df.apply(_fill_data, axis=0))

    filled = pd.concat(out_dfs, axis=0).reset_index(drop=True)

    filled = _restore_nans_after_fill(
        filled
    )  # _fill_data places -999.999 in place of nans, now convert back

    return filled


[docs]def add_missing_group_rows(
    df,
    group_id_cols: List[str],
    non_group_id_cols: List[str],
    fill_method: Optional[str] = "ffill",
    fill_limit: Optional[int] = None,
):
    """
    Adds rows so that each group has all non group IDs, optionally filling values by a pandas fill method

    :param df:
    :param group_id_cols: typically entity ids. these ids represents groups in the data. data will not be
            forward/back filled across differences in these ids.
    :param non_group_id_cols: typically date or time ids. data will be forward/back filled across differences in these ids
    :param fill_method: pandas fill methods, None to not fill
    :param fill_limit: pandas fill limit
    :return:
    """
    fill_id_cols = group_id_cols + non_group_id_cols
    fill_ids = [df[fill_id_col].unique() for fill_id_col in fill_id_cols]
    index_df = pd.DataFrame([i for i in product(*fill_ids)], columns=fill_id_cols)

    merged = index_df.merge(df, how="left", on=fill_id_cols)

    # Newly created rows will have missing values. Sort and fill
    merged.sort_values(fill_id_cols, inplace=True)
    # TODO [#3]: Update add_missing_group_rows to not fill nans in existing data
    #
    # this method can still fill nans in existing data, not just created rows

    # if fill_method is None, don't call fillna at all, return with NaNs
    if fill_method is not None:
        merged = merged.groupby(group_id_cols, as_index=False).fillna(
            method=fill_method, limit=fill_limit
        )

    return merged


[docs]def drop_missing_group_rows(df, fill_id_cols):
    drop_subset = [col for col in df.columns if col not in fill_id_cols]
    return df.dropna(subset=drop_subset, how="all")


def _fill_data_for_series(series, str_vars="first", num_vars="mean"):
    index = _get_non_nan_value_index(series, str_vars)
    if index is None:
        # All nans, can't do anything but return back nothing
        # But transform ignores nans in the output and then complains when the sizes don't match.
        # So instead, put a placeholder of -999.999
        return pd.Series([-999.999 for i in range(len(series))])
    # handle numeric
    if series.dtype in (np.float64, np.int64):
        if num_vars in ("first", "last"):
            # Overwrite index for that of num vars if not using the same value as for str vars
            if num_vars != str_vars:
                index = _get_non_nan_value_index(series, num_vars)
            return _fill_data_for_str_series(series, non_nan_index=index)
        return _fill_data_for_numeric_series(series, fill_function=num_vars)
    # handle strs
    else:
        return _fill_data_for_str_series(series, non_nan_index=index)


def _fill_data_for_numeric_series(series, fill_function="mean"):
    return series.fillna(series.agg(fill_function))


def _fill_data_for_str_series(series, non_nan_index):
    fill_value = series.loc[non_nan_index]

    return series.fillna(fill_value)


def _get_non_nan_value_index(series, first_or_last):
    if first_or_last == "first":
        return series.first_valid_index()
    elif first_or_last == "last":
        return series.last_valid_index()
    else:
        raise ValueError("Did not pass 'first' or 'last'")


def _restore_nans_after_fill(df):
    """
    -999.999 was used as a missing representation as pandas can not handle nans in transform.
    Convert back to nan now
    """
    return df.applymap(lambda x: np.nan if x == -999.999 else x)


def _drop_duplicates(df, byvars):
    """
    Note: inplace
    """
    df.drop_duplicates(subset=byvars, inplace=True)


[docs]def fill_excluded_rows(df, byvars, fillvars=None, **fillna_kwargs):
    """
    Takes a dataframe which does not contain all possible combinations of byvars as rows. Creates
    those rows if fillna_kwargs are passed, calls fillna using fillna_kwargs for fillvars

    :param df:
    :param byvars: variables on which dataset should be expanded to product. Can pass a str, list of
            strs, or a list of pd.Series.
    :param fillvars: optional variables to apply fillna to
    :param fillna_kwargs: See pandas.DataFrame.fillna for kwargs, value=0 is common
    :return:

    :Example:

    An example::

        df:
                     date     id  var
            0  2003-06-09 42223C    1
            1  2003-06-10 09255G    2

        with fillna_for_excluded_rows(df, byvars=['date','id'], fillvars='var', value=0) becomes:

                      date     id  var
            0  2003-06-09 42223C    1
            1  2003-06-10 42223C    0
            2  2003-06-09 09255G    0
            3  2003-06-10 09255G    2
    """
    byvars, fillvars = [
        _to_list_if_str(v) for v in [byvars, fillvars]
    ]  # convert to lists

    #     multiindex = [df[i].dropna().unique() for i in byvars]
    multiindex = [_to_series_if_str(df, i).dropna().unique() for i in byvars]
    byvars = [_to_name_if_series(i) for i in byvars]  # get name of any series

    all_df = pd.DataFrame(index=pd.MultiIndex.from_product(multiindex)).reset_index()
    all_df.columns = byvars
    merged = all_df.merge(df, how="left", on=byvars)

    if fillna_kwargs:
        fillna_kwargs.update({"inplace": False})
        merged[fillvars] = merged[fillvars].fillna(**fillna_kwargs)
    return merged