Source code for capiq_excel.combine
import os
import re
import tempfile
import math
import traceback
import pandas as pd
from capiq_excel.tools.ext_pandas import _get_outpath_and_df_of_headers, _append_df_to_csv, append_csv_to_csv
from processfiles.files import FileProcessTracker
[docs]def combine_all_capiq_xlsx(infolder, outpath, restart=True, num_parts=100):
file_tracker = FileProcessTracker(folder=infolder, restart=restart, file_types=('xlsx',))
outpath, df_of_headers = _get_outpath_and_df_of_headers(outpath)
all_columns = [col for col in df_of_headers.columns]
# TODO: cleanup
# Set up appending to many files to speed up process. Then the part files will be combined at the end
num_files_per_part = math.ceil(len(file_tracker.process_list) / num_parts)
file_num = 0
with tempfile.TemporaryDirectory() as temp_dir:
print(f'Creating temporary directory {temp_dir}')
print(f'Running first pass of append. Will create {num_parts} files to be used in the final append.')
for i, file in enumerate(file_tracker.file_generator()):
# Every time we process num_files_per_part number of files, increment the output file
if i % num_files_per_part == 0:
file_num += 1
temp_outpath = os.path.join(temp_dir, f'{file_num}.csv')
df_of_headers, all_columns = _append_capiq_xlsx_to_csv(file, temp_outpath, df_of_headers, all_columns)
# Now append created parts to output file
print('Running second pass of append. Using in part files to create output file.')
file_tracker = FileProcessTracker(folder=temp_dir, restart=True, file_types=('csv',))
outpath, df_of_headers = _get_outpath_and_df_of_headers(outpath)
all_columns = [col for col in df_of_headers.columns]
for file in file_tracker.file_generator():
df_for_append = pd.read_csv(file) # load new data
df_of_headers, all_columns = _append_df_to_csv(df_for_append, df_of_headers, outpath, all_columns)
def _append_capiq_xlsx_to_csv(file, outpath, df_of_headers, all_columns):
df_for_append = pd.read_excel(file) # load new data
if _filepath_has_date(file):
id_, date = _capiq_filepath_to_iq_id_and_date(file)
df_for_append['CQID'] = id_
df_for_append['Date'] = date
else:
df_for_append['CQID'] = _capiq_filepath_to_iq_id(file)
df_of_headers, all_columns = _append_df_to_csv(df_for_append, df_of_headers, outpath, all_columns)
return df_of_headers, all_columns
def _filepath_has_date(filepath):
filename = os.path.basename(filepath) # strips folders, etc.
pattern = re.compile(r'(IQ\d+) ([\d-]+)([.]xlsx)')
return True if pattern.match(filename) else False
def _capiq_filepath_to_iq_id(filepath):
filename = os.path.basename(filepath) #strips folders, etc.
pattern = re.compile(r'(IQ\d+)([.]xlsx)')
return pattern.match(filename).group(1)
def _capiq_filepath_to_iq_id_and_date(filepath):
filename = os.path.basename(filepath) # strips folders, etc.
pattern = re.compile(r'(IQ\d+) ([\d-]+)([.]xlsx)')
match = pattern.match(filename)
return match.group(1), match.group(2)