2025-09-03 13:20:23 +02:00

48 lines
2.6 KiB
Python

import pandas as pd
from logline_leviathan.exporter.export_constructor import generate_dataframe
def ensure_utf8(s):
if isinstance(s, str):
return s.encode('utf-8', errors='replace').decode('utf-8')
return s
def generate_xlsx_file(output_file_path, db_session, checkboxes, files, context_selection, only_crossmatches, start_date=None, end_date=None, include_flagged=False, only_flagged=False, only_unflagged=False):
# Fetch data using the new DataFrame constructor
df = generate_dataframe(db_session, checkboxes, files, context_selection, only_crossmatches, start_date, end_date, include_flagged, only_flagged, only_unflagged)
# Process context field
if 'Context' in df.columns:
df['Context'] = df['Context'].str.strip() # Trim whitespaces
df['Context'] = df['Context'].str.replace(r'[^\x00-\x7F]+', '', regex=True) # Remove non-ASCII characters
df['Context'] = df['Context'].apply(lambda x: x[:32767] if isinstance(x, str) else x) # Truncate to 32767 characters (Excel limit)
# Reorder columns based on whether 'Sources' or 'Source File' and 'Line Number' columns are in the DataFrame
if 'Sources' in df.columns:
df = df[["Entity Type", "Entity", "Occurrences", "Timestamp", "Sources", "Context"]]
elif 'Source File' in df.columns and 'Line Number' in df.columns:
df = df[["Entity Type", "Entity", "Occurrences", "Timestamp", "Source File", "Line Number", "Context"]]
# Apply ensure_utf8 to all string columns in df
for col in df.select_dtypes(include=[object]):
df[col] = df[col].apply(ensure_utf8)
# Using pandas.ExcelWriter
with pd.ExcelWriter(output_file_path, engine='openpyxl') as writer:
for entity_type in df['Entity Type'].unique():
df_filtered = df[df['Entity Type'] == entity_type]
df_filtered.to_excel(writer, sheet_name=entity_type, index=False)
# Get the xlsxwriter workbook and worksheet objects.
worksheet = writer.sheets[entity_type]
# Set column width and enable text wrapping
for idx, col in enumerate(df_filtered.columns):
# Adjust the column width if necessary
worksheet.column_dimensions[chr(65 + idx)].width = 20 # 65 is ASCII for 'A'
# Set alignment if needed
# for row in worksheet.iter_rows(min_row=2, max_col=len(df_filtered.columns), max_row=len(df_filtered) + 1):
# for cell in row:
# cell.alignment = Alignment(wrap_text=True)
# The file is saved automatically using the with context