Source code for pd_utils.optimize.dftypes

from typing import List, Union

import pandas as pd
from pd_utils.optimize.typing import PdDTypeQuadTuple, StrDict


[docs]def optimized_df(df: pd.DataFrame) -> pd.DataFrame: columns = [col for col in df.columns] type_dfs: PdDTypeQuadTuple = _optimized_dtype_dfs(df) return pd.concat(type_dfs, axis=1)[columns]
def _optimized_dtype_dfs(df: pd.DataFrame) -> PdDTypeQuadTuple: obj_df = df.select_dtypes(include=["object"]) if not obj_df.empty: obj_df = obj_df.astype("category") int_df = df.select_dtypes(include=["int"]) if not int_df.empty: int_df = int_df.apply(pd.to_numeric, downcast="unsigned") float_df = df.select_dtypes(include=["float"]) if not float_df.empty: float_df = float_df.apply(pd.to_numeric, downcast="float") type_dfs = (obj_df, int_df, float_df) optimized_columns: List[Union[str, float, int]] = [] for type_df in type_dfs: optimized_columns += [col for col in type_df.columns] # Excluded dtype should just be 'datetime', which does not need conversion excluded_columns = [col for col in df.columns if col not in optimized_columns] return type_dfs + (df[excluded_columns],)
[docs]def df_types_dict(df: pd.DataFrame, remove_dates=True) -> StrDict: df_types_dict = _df_types_dict(df) if remove_dates: return { col_name: dtype for col_name, dtype in df_types_dict.items() if "date" not in dtype } else: return df_types_dict
def _df_types_dict(df: pd.DataFrame) -> StrDict: return df.dtypes.apply(lambda x: str(x)).to_dict()