Source code for dstream_excel.dstream.combine

import argparse
import os
import re

import pandas as pd

from dstream_excel.tools.ext_pandas import _get_outpath_and_df_of_headers, _append_df_to_csv
from dstream_excel.tracker.files import FileProcessTracker
from dstream_excel.dstream.workbook.filename import _original_symbol_from_filename_symbol


[docs]def combine_all_datastream_xlsx(infolder: str, outpath: str = 'all_data.csv', restart: bool = True): """ Combine populated data in XLSX files into single csv file. :param infolder: Folder containing populated XLSX files :param outpath: Full file path to output csv to :param restart: Whether to force a restart of combining files. If the process is stopped, it will be continued where it left off if restart=False, and start from the beginning if restart=True. :return: None """ file_tracker = FileProcessTracker(folder=infolder, restart=restart, file_types=('xlsx',)) outpath, df_of_headers = _get_outpath_and_df_of_headers(outpath) all_columns = [col for col in df_of_headers.columns] for file in file_tracker.file_generator(): df_of_headers, all_columns = _append_datastream_xlsx_to_csv(file, outpath, df_of_headers, all_columns)
def _append_datastream_xlsx_to_csv(file, outpath, df_of_headers, all_columns): df_for_append = pd.read_excel(file) # load new data _reformat_datastream_df_for_append(df_for_append, file) df_of_headers, all_columns = _append_df_to_csv(df_for_append, df_of_headers, outpath, all_columns) return df_of_headers, all_columns def _reformat_datastream_df_for_append(df, file): """ Note: inplace """ _clean_datastream_df(df) df['Ticker'] = _datastream_filepath_to_iq_id(file) return df def _clean_datastream_df(df): """ Note: inplace """ _drop_unneded_data(df) _rename_datastream_cols(df) def _drop_unneded_data(df): """ Note: inplace """ error_cols = [col for col in df.columns if '#ERROR' in col] df.drop(error_cols, axis=1, inplace=True) data_cols = [col for col in df.columns if '-' in col] df.dropna(subset=data_cols, inplace=True) def _rename_datastream_cols(df): """ Note: inplace """ rename_dict = _rename_dict_for_datastream_cols(df) df.rename(columns=rename_dict, inplace=True) def _rename_dict_for_datastream_cols(df): rename_dict = { 'Name': 'Date' } data_cols = [col for col in df.columns if '-' in col] data_rename = {col: _new_name_for_datastream_column(col) for col in data_cols} rename_dict.update(data_rename) return rename_dict def _new_name_for_datastream_column(col): return col.split('-')[-1].strip() def _datastream_filepath_to_iq_id(filepath): filename = os.path.basename(filepath) #strips folders, etc. pattern = re.compile(r'([\S\s]*)([.]xlsx)') ds_filename_id = pattern.match(filename).group(1) ds_id = _original_symbol_from_filename_symbol(ds_filename_id) return ds_id if __name__ == '__main__': parser = argparse.ArgumentParser( description='Populate data in XLSX files containing datastream functions') parser.add_argument('-f', '--folder', required=False, default=r'C:\Users\derobertisna.UFAD\Dropbox (Personal)\UF\Andy\ETF Project\Data\Datastream\inprogress') parser.add_argument('-o', '--outpath', required=False, default=r'C:\Users\derobertisna.UFAD\Dropbox (Personal)\UF\Andy\ETF Project\Data\Datastream\all datastream data.csv') # Default is to restart creation of data parser.add_argument('-n', '--no-restart', action='store_true', default=False) args = parser.parse_args() restart = (not args.no_restart) combine_all_datastream_xlsx(args.folder, args.outpath, restart=restart)