import pandas as pd from logline_leviathan.exporter.export_constructor import generate_dataframe def ensure_utf8(s): if isinstance(s, str): return s.encode('utf-8', errors='replace').decode('utf-8') return s def generate_xlsx_file(output_file_path, db_session, checkboxes, files, context_selection, only_crossmatches, start_date=None, end_date=None, include_flagged=False, only_flagged=False, only_unflagged=False): # Fetch data using the new DataFrame constructor df = generate_dataframe(db_session, checkboxes, files, context_selection, only_crossmatches, start_date, end_date, include_flagged, only_flagged, only_unflagged) # Process context field if 'Context' in df.columns: df['Context'] = df['Context'].str.strip() # Trim whitespaces df['Context'] = df['Context'].str.replace(r'[^\x00-\x7F]+', '', regex=True) # Remove non-ASCII characters df['Context'] = df['Context'].apply(lambda x: x[:32767] if isinstance(x, str) else x) # Truncate to 32767 characters (Excel limit) # Reorder columns based on whether 'Sources' or 'Source File' and 'Line Number' columns are in the DataFrame if 'Sources' in df.columns: df = df[["Entity Type", "Entity", "Occurrences", "Timestamp", "Sources", "Context"]] elif 'Source File' in df.columns and 'Line Number' in df.columns: df = df[["Entity Type", "Entity", "Occurrences", "Timestamp", "Source File", "Line Number", "Context"]] # Apply ensure_utf8 to all string columns in df for col in df.select_dtypes(include=[object]): df[col] = df[col].apply(ensure_utf8) # Using pandas.ExcelWriter with pd.ExcelWriter(output_file_path, engine='openpyxl') as writer: for entity_type in df['Entity Type'].unique(): df_filtered = df[df['Entity Type'] == entity_type] df_filtered.to_excel(writer, sheet_name=entity_type, index=False) # Get the xlsxwriter workbook and worksheet objects. worksheet = writer.sheets[entity_type] # Set column width and enable text wrapping for idx, col in enumerate(df_filtered.columns): # Adjust the column width if necessary worksheet.column_dimensions[chr(65 + idx)].width = 20 # 65 is ASCII for 'A' # Set alignment if needed # for row in worksheet.iter_rows(min_row=2, max_col=len(df_filtered.columns), max_row=len(df_filtered) + 1): # for cell in row: # cell.alignment = Alignment(wrap_text=True) # The file is saved automatically using the with context