48 lines
2.6 KiB
Python
48 lines
2.6 KiB
Python
import pandas as pd
|
|
from logline_leviathan.exporter.export_constructor import generate_dataframe
|
|
|
|
def ensure_utf8(s):
|
|
if isinstance(s, str):
|
|
return s.encode('utf-8', errors='replace').decode('utf-8')
|
|
return s
|
|
|
|
def generate_xlsx_file(output_file_path, db_session, checkboxes, files, context_selection, only_crossmatches, start_date=None, end_date=None, include_flagged=False, only_flagged=False, only_unflagged=False):
|
|
# Fetch data using the new DataFrame constructor
|
|
df = generate_dataframe(db_session, checkboxes, files, context_selection, only_crossmatches, start_date, end_date, include_flagged, only_flagged, only_unflagged)
|
|
# Process context field
|
|
if 'Context' in df.columns:
|
|
df['Context'] = df['Context'].str.strip() # Trim whitespaces
|
|
df['Context'] = df['Context'].str.replace(r'[^\x00-\x7F]+', '', regex=True) # Remove non-ASCII characters
|
|
df['Context'] = df['Context'].apply(lambda x: x[:32767] if isinstance(x, str) else x) # Truncate to 32767 characters (Excel limit)
|
|
|
|
# Reorder columns based on whether 'Sources' or 'Source File' and 'Line Number' columns are in the DataFrame
|
|
if 'Sources' in df.columns:
|
|
df = df[["Entity Type", "Entity", "Occurrences", "Timestamp", "Sources", "Context"]]
|
|
elif 'Source File' in df.columns and 'Line Number' in df.columns:
|
|
df = df[["Entity Type", "Entity", "Occurrences", "Timestamp", "Source File", "Line Number", "Context"]]
|
|
|
|
# Apply ensure_utf8 to all string columns in df
|
|
for col in df.select_dtypes(include=[object]):
|
|
df[col] = df[col].apply(ensure_utf8)
|
|
|
|
# Using pandas.ExcelWriter
|
|
with pd.ExcelWriter(output_file_path, engine='openpyxl') as writer:
|
|
for entity_type in df['Entity Type'].unique():
|
|
df_filtered = df[df['Entity Type'] == entity_type]
|
|
df_filtered.to_excel(writer, sheet_name=entity_type, index=False)
|
|
|
|
# Get the xlsxwriter workbook and worksheet objects.
|
|
worksheet = writer.sheets[entity_type]
|
|
|
|
# Set column width and enable text wrapping
|
|
for idx, col in enumerate(df_filtered.columns):
|
|
# Adjust the column width if necessary
|
|
worksheet.column_dimensions[chr(65 + idx)].width = 20 # 65 is ASCII for 'A'
|
|
|
|
# Set alignment if needed
|
|
# for row in worksheet.iter_rows(min_row=2, max_col=len(df_filtered.columns), max_row=len(df_filtered) + 1):
|
|
# for cell in row:
|
|
# cell.alignment = Alignment(wrap_text=True)
|
|
|
|
# The file is saved automatically using the with context
|