import logging
from datetime import datetime as dt

import pandas as pd
from PyQt5.QtCore import Qt
from sqlalchemy import func, cast, String, distinct

from logline_leviathan.database.database_manager import (
    ContextTable, EntityTypesTable, DistinctEntitiesTable, EntitiesTable, FileMetadata
)


def generate_dataframe(db_session, tree_items, file_items, context_selection,
                       only_crossmatches=False, start_date=None, end_date=None,
                       include_flagged=False, only_flagged=False,
                       only_unflagged=False):
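    """Build a pandas DataFrame of extracted entities for export.

    Depending on ``context_selection``, the result is either a compact
    per-entity summary (occurrence counts with concatenated source and
    timestamp lists) or a detailed per-occurrence listing with source file,
    line number, context snippet, and timestamp. The optional arguments
    restrict the result to a timestamp range, to checked files, to cross-file
    matches, and to flagged or unflagged entries.
    """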
    if not db_session:
        raise ValueError("Database session is None")

    all_data = []  # Accumulates result rows from all selected entity types

    # Extract entity_type from the tree items that are checked in the UI
    selected_entity_types = [item.entity_type for item in tree_items if item.checkState(0) == Qt.Checked]
    checked_files = list(file_items.getCheckedFiles())
    logging.debug(f"Generating dataframe, selected entity types: {selected_entity_types}, passed timestamp range: {start_date} - {end_date}")
    context_field = {
        'Kompakte Zusammenfassung ohne Kontext': None,
        'Kontext - gleiche Zeile': ContextTable.context_small,
        'Kontext - mittelgroß': ContextTable.context_medium,
        'Kontext - umfangreich': ContextTable.context_large
    }.get(context_selection)

    # Convert start_date and end_date to inclusive day bounds if both are provided
    if start_date and end_date:
        start_datetime = dt.combine(start_date, dt.min.time())
        end_datetime = dt.combine(end_date, dt.max.time())

    # Subquery counting distinct file IDs per distinct entity; with
    # only_crossmatches, keep only entities that occur in more than one file
    file_count_subquery = db_session.query(
        EntitiesTable.distinct_entities_id,
        func.count(distinct(EntitiesTable.file_id)).label('file_count')
    ).group_by(EntitiesTable.distinct_entities_id)
    if only_crossmatches:
        file_count_subquery = file_count_subquery.having(func.count(distinct(EntitiesTable.file_id)) > 1)
    file_count_subquery = file_count_subquery.subquery()
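
    # For each selected entity type, build one of two query shapes: a compact
    # per-entity summary, or a per-occurrence detail view with context.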
    for entity_type in selected_entity_types:
        if context_selection == 'Kompakte Zusammenfassung ohne Kontext':
            # Compact summary: one row per distinct entity, with an occurrence
            # count plus group_concat'ed source and timestamp lists
            # (group_concat is available in SQLite and MySQL)
            query = db_session.query(
                EntityTypesTable.entity_type,
                DistinctEntitiesTable.distinct_entity,
                func.count(EntitiesTable.entities_id).label('occurrences'),
                func.group_concat(
                    FileMetadata.file_name + ':line' + cast(EntitiesTable.line_number, String)
                ).label('sources'),
                func.group_concat(
                    cast(EntitiesTable.entry_timestamp, String)
                ).label('timestamps')
            ).join(EntityTypesTable, DistinctEntitiesTable.entity_types_id == EntityTypesTable.entity_type_id
            ).join(EntitiesTable, DistinctEntitiesTable.distinct_entities_id == EntitiesTable.distinct_entities_id
            ).join(FileMetadata, EntitiesTable.file_id == FileMetadata.file_id
            ).join(file_count_subquery, DistinctEntitiesTable.distinct_entities_id == file_count_subquery.c.distinct_entities_id
            ).filter(EntityTypesTable.entity_type == entity_type
            ).group_by(DistinctEntitiesTable.distinct_entity)

            # Apply the timestamp filter if start_date and end_date are provided
            if start_date and end_date:
                query = query.filter(EntitiesTable.entry_timestamp.between(start_datetime, end_datetime))
            if checked_files:
                query = query.filter(FileMetadata.file_name.in_(checked_files))
            # Flag filters only take effect when flagged entries are included
            if include_flagged:
                if only_flagged:
                    query = query.filter(EntitiesTable.flag == True)
                elif only_unflagged:
                    query = query.filter(EntitiesTable.flag == False)

            for row in query.all():
                # group_concat joins values with commas; swap in a readable separator
                sources = row[3].replace(',', ' // ') if row[3] is not None else ''
                timestamps = row[4].replace(',', ' // ') if row[4] is not None else ''
                all_data.append([row[0], row[1], row[2], timestamps, sources, ''])
        else:
            # Detailed view: one row per occurrence; the window function counts
            # occurrences per distinct entity without collapsing rows
            query = db_session.query(
                EntityTypesTable.entity_type,
                DistinctEntitiesTable.distinct_entity,
                func.count(EntitiesTable.entities_id).over(partition_by=DistinctEntitiesTable.distinct_entity).label('occurrences'),
                FileMetadata.file_name,
                EntitiesTable.line_number,
                context_field,
                EntitiesTable.entry_timestamp
            ).select_from(EntitiesTable
            ).join(DistinctEntitiesTable, EntitiesTable.distinct_entities_id == DistinctEntitiesTable.distinct_entities_id
            ).join(EntityTypesTable, DistinctEntitiesTable.entity_types_id == EntityTypesTable.entity_type_id
            ).join(FileMetadata, EntitiesTable.file_id == FileMetadata.file_id
            ).outerjoin(ContextTable, EntitiesTable.entities_id == ContextTable.entities_id
            ).join(file_count_subquery, DistinctEntitiesTable.distinct_entities_id == file_count_subquery.c.distinct_entities_id
            ).filter(EntityTypesTable.entity_type == entity_type)

            # Apply the timestamp filter if start_date and end_date are provided
            if start_date and end_date:
                query = query.filter(EntitiesTable.entry_timestamp.between(start_datetime, end_datetime))
            if checked_files:
                query = query.filter(FileMetadata.file_name.in_(checked_files))
            if include_flagged:
                if only_flagged:
                    query = query.filter(EntitiesTable.flag == True)
                elif only_unflagged:
                    query = query.filter(EntitiesTable.flag == False)

            for row in query.all():
                file_name = row[3]
                line_number = row[4]
                entry_timestamp = row[6].strftime('%Y-%m-%d %H:%M:%S') if row[6] is not None else ''
                context_info = row[5] if row[5] is not None else ''
                all_data.append([row[0], row[1], row[2], entry_timestamp, file_name, line_number, context_info])

    # Define the DataFrame columns based on context_selection: the compact
    # summary has no per-occurrence file/line columns
    if context_selection == 'Kompakte Zusammenfassung ohne Kontext':
        columns = ["Entity Type", "Entity", "Occurrences", "Timestamp", "Sources", "Context"]
    else:
        columns = ["Entity Type", "Entity", "Occurrences", "Timestamp", "Source File", "Line Number", "Context"]

    # Construct and return the DataFrame from all accumulated data
    return pd.DataFrame(all_data, columns=columns)
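

if __name__ == '__main__':
    # Minimal usage sketch, illustrative only. The SQLite path and the
    # _NoFileFilter stub below are assumptions, not part of the real
    # logline_leviathan GUI, which supplies its own session and widgets.
    from sqlalchemy import create_engine
    from sqlalchemy.orm import sessionmaker

    engine = create_engine('sqlite:///entities.db')  # assumed database location
    session = sessionmaker(bind=engine)()

    class _NoFileFilter:
        # Stand-in for the file-list widget: no files checked, so no file filter
        def getCheckedFiles(self):
            return []

    # With no checked tree items, no entity types are selected, so this
    # returns an empty DataFrame with the compact-summary columns.
    df = generate_dataframe(session, [], _NoFileFilter(),
                            'Kompakte Zusammenfassung ohne Kontext')
    print(df)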