LoglineLeviathan/logline_leviathan/exporter/xlsx_export.py

import pandas as pd
from logline_leviathan.exporter.export_constructor import generate_dataframe

def ensure_utf8(s):
    if isinstance(s, str):
        return s.encode('utf-8', errors='replace').decode('utf-8')
    return s

def generate_xlsx_file(output_file_path, db_session, checkboxes, files, context_selection, only_crossmatches, start_date=None, end_date=None, include_flagged=False, only_flagged=False, only_unflagged=False):
    # Fetch data using the new DataFrame constructor
    df = generate_dataframe(db_session, checkboxes, files, context_selection, only_crossmatches, start_date, end_date, include_flagged, only_flagged, only_unflagged)
    # Process context field
    if 'Context' in df.columns:
        df['Context'] = df['Context'].str.strip()  # Trim whitespaces
        df['Context'] = df['Context'].str.replace(r'[^\x00-\x7F]+', '', regex=True)  # Remove non-ASCII characters
        df['Context'] = df['Context'].apply(lambda x: x[:32767] if isinstance(x, str) else x)  # Truncate to 32767 characters (Excel limit)

    # Reorder columns based on whether 'Sources' or 'Source File' and 'Line Number' columns are in the DataFrame
    if 'Sources' in df.columns:
        df = df[["Entity Type", "Entity", "Occurrences", "Timestamp", "Sources", "Context"]]
    elif 'Source File' in df.columns and 'Line Number' in df.columns:
        df = df[["Entity Type", "Entity", "Occurrences", "Timestamp", "Source File", "Line Number", "Context"]]

    # Apply ensure_utf8 to all string columns in df
    for col in df.select_dtypes(include=[object]):
        df[col] = df[col].apply(ensure_utf8)

    # Using pandas.ExcelWriter
    with pd.ExcelWriter(output_file_path, engine='openpyxl') as writer:
        for entity_type in df['Entity Type'].unique():
            df_filtered = df[df['Entity Type'] == entity_type]
            df_filtered.to_excel(writer, sheet_name=entity_type, index=False)

            # Get the xlsxwriter workbook and worksheet objects.
            worksheet = writer.sheets[entity_type]

            # Set column width and enable text wrapping
            for idx, col in enumerate(df_filtered.columns):
                # Adjust the column width if necessary
                worksheet.column_dimensions[chr(65 + idx)].width = 20  # 65 is ASCII for 'A'

                # Set alignment if needed
                # for row in worksheet.iter_rows(min_row=2, max_col=len(df_filtered.columns), max_row=len(df_filtered) + 1):
                #     for cell in row:
                #         cell.alignment = Alignment(wrap_text=True)

    # The file is saved automatically using the with context