import logging
from datetime import datetime as dt

import pandas as pd
from PyQt5.QtCore import Qt
from sqlalchemy import func, cast, String, distinct

from logline_leviathan.database.database_manager import (
    ContextTable, EntityTypesTable, DistinctEntitiesTable, EntitiesTable, FileMetadata)


def generate_dataframe(db_session, tree_items, file_items, context_selection,
                       only_crossmatches=False, start_date=None, end_date=None,
                       include_flagged=False, only_flagged=False, only_unflagged=False):
    """Build a pandas DataFrame of entity matches for the selected entity types,
    files, context level, date range, and flag filters."""
    if not db_session:
        raise ValueError("Database session is None")

    all_data = []  # List to accumulate data from all entity types

    # Extract entity_type from selected (checked) tree items
    selected_entity_types = [item.entity_type for item in tree_items if item.checkState(0) == Qt.Checked]
    checked_files = [item for item in file_items.getCheckedFiles()]
    logging.debug(f"Generating dataframe, selected entity types: {selected_entity_types}, "
                  f"passed timestamp range: {start_date} - {end_date}")

    # Map the UI context selection to the corresponding ContextTable column;
    # the compact summary carries no context column at all
    context_field = {
        'Kompakte Zusammenfassung ohne Kontext': None,
        'Kontext - gleiche Zeile': ContextTable.context_small,
        'Kontext - mittelgroß': ContextTable.context_medium,
        'Kontext - umfangreich': ContextTable.context_large
    }.get(context_selection)

    # Convert start_date and end_date to datetime objects if they are not None
    if start_date and end_date:
        start_datetime = dt.combine(start_date, dt.min.time())
        end_datetime = dt.combine(end_date, dt.max.time())

    # Subquery counting the distinct files each entity occurs in; with
    # only_crossmatches, keep only entities that appear in more than one file
    file_count_subquery = db_session.query(
        EntitiesTable.distinct_entities_id,
        func.count(distinct(EntitiesTable.file_id)).label('file_count')
    ).group_by(EntitiesTable.distinct_entities_id)
    if only_crossmatches:
        file_count_subquery = file_count_subquery.having(func.count(distinct(EntitiesTable.file_id)) > 1)
    file_count_subquery = file_count_subquery.subquery()

    for entity_type in selected_entity_types:
        if context_selection == 'Kompakte Zusammenfassung ohne Kontext':
            # Compact summary: one row per distinct entity, with sources and
            # timestamps aggregated via group_concat
            query = db_session.query(
                EntityTypesTable.entity_type,
                DistinctEntitiesTable.distinct_entity,
                func.count(EntitiesTable.entities_id).label('occurrences'),
                func.group_concat(
                    FileMetadata.file_name + ':line' + cast(EntitiesTable.line_number, String)
                ).label('sources'),
                func.group_concat(
                    cast(EntitiesTable.entry_timestamp, String)
                ).label('timestamps')
            ).join(EntityTypesTable,
                   DistinctEntitiesTable.entity_types_id == EntityTypesTable.entity_type_id
            ).join(EntitiesTable,
                   DistinctEntitiesTable.distinct_entities_id == EntitiesTable.distinct_entities_id
            ).join(FileMetadata,
                   EntitiesTable.file_id == FileMetadata.file_id
            ).join(file_count_subquery,
                   DistinctEntitiesTable.distinct_entities_id == file_count_subquery.c.distinct_entities_id
            ).filter(EntityTypesTable.entity_type == entity_type
            ).group_by(DistinctEntitiesTable.distinct_entity)

            # Apply timestamp filter if start_date and end_date are provided
            if start_date and end_date:
                query = query.filter(EntitiesTable.entry_timestamp.between(start_datetime, end_datetime))

            if checked_files:
                query = query.filter(FileMetadata.file_name.in_(checked_files))

            if include_flagged:
                if only_flagged:
                    query = query.filter(EntitiesTable.flag == True)
                elif only_unflagged:
                    query = query.filter(EntitiesTable.flag == False)

            for row in query.all():
                # group_concat joins with commas; render them as ' // ' for readability
                sources = row[3].replace(',', ' // ') if row[3] is not None else ''
                timestamps = row[4].replace(',', ' // ') if row[4] is not None else ''
                all_data.append([row[0], row[1], row[2], timestamps, sources, ''])
        else:
            # Detailed view: one row per occurrence; the per-entity occurrence
            # count is computed with a window function partitioned by entity
            query = db_session.query(
                EntityTypesTable.entity_type,
                DistinctEntitiesTable.distinct_entity,
                func.count(EntitiesTable.entities_id)
                    .over(partition_by=DistinctEntitiesTable.distinct_entity)
                    .label('occurrences'),
                FileMetadata.file_name,
                EntitiesTable.line_number,
                context_field,
                EntitiesTable.entry_timestamp
            ).select_from(EntitiesTable
            ).join(DistinctEntitiesTable,
                   EntitiesTable.distinct_entities_id == DistinctEntitiesTable.distinct_entities_id
            ).join(EntityTypesTable,
                   DistinctEntitiesTable.entity_types_id == EntityTypesTable.entity_type_id
            ).join(FileMetadata,
                   EntitiesTable.file_id == FileMetadata.file_id
            ).outerjoin(ContextTable,
                        EntitiesTable.entities_id == ContextTable.entities_id
            ).join(file_count_subquery,
                   DistinctEntitiesTable.distinct_entities_id == file_count_subquery.c.distinct_entities_id
            ).filter(EntityTypesTable.entity_type == entity_type)

            # Apply timestamp filter if start_date and end_date are provided
            if start_date and end_date:
                query = query.filter(EntitiesTable.entry_timestamp.between(start_datetime, end_datetime))

            if checked_files:
                query = query.filter(FileMetadata.file_name.in_(checked_files))

            if include_flagged:
                if only_flagged:
                    query = query.filter(EntitiesTable.flag == True)
                elif only_unflagged:
                    query = query.filter(EntitiesTable.flag == False)

            for row in query.all():
                file_name = row[3]
                line_number = row[4]
                entry_timestamp = row[6].strftime('%Y-%m-%d %H:%M:%S') if row[6] is not None else ''
                context_info = row[5] if row[5] is not None else ''
                all_data.append([row[0], row[1], row[2], entry_timestamp, file_name, line_number, context_info])

    # Define the columns for the DataFrame based on context_selection
    columns = (["Entity Type", "Entity", "Occurrences", "Timestamp", "Sources", "Context"]
               if context_selection == 'Kompakte Zusammenfassung ohne Kontext'
               else ["Entity Type", "Entity", "Occurrences", "Timestamp", "Source File", "Line Number", "Context"])

    # Construct and return the DataFrame from all accumulated data
    return pd.DataFrame(all_data, columns=columns)
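

# --- Usage sketch ------------------------------------------------------------
# A minimal sketch of driving generate_dataframe outside the GUI, under stated
# assumptions: the 'sqlite:///entities.db' path, the 'ipv4' entity type name,
# and the _TreeItem/_CheckedFiles stand-ins are illustrative placeholders, not
# names from this repo (the real application passes checked Qt widget items).
if __name__ == '__main__':
    from datetime import date
    from sqlalchemy import create_engine
    from sqlalchemy.orm import sessionmaker

    class _TreeItem:
        # Mimics the two members generate_dataframe reads from tree items
        def __init__(self, entity_type):
            self.entity_type = entity_type

        def checkState(self, _column):
            return Qt.Checked

    class _CheckedFiles:
        # Mimics the file list widget: getCheckedFiles() yields file names
        def getCheckedFiles(self):
            return ['auth.log']  # assumed file name for illustration

    engine = create_engine('sqlite:///entities.db')  # assumed database path
    session = sessionmaker(bind=engine)()
    try:
        df = generate_dataframe(
            session,
            tree_items=[_TreeItem('ipv4')],
            file_items=_CheckedFiles(),
            context_selection='Kontext - gleiche Zeile',
            only_crossmatches=True,  # keep only entities seen in more than one file
            start_date=date(2024, 1, 1),
            end_date=date(2024, 1, 31),
        )
        print(df.head())
    finally:
        session.close()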