import logging
from datetime import datetime as dt

import pandas as pd
from PyQt5.QtCore import Qt
from sqlalchemy import func, cast, String, distinct

from logline_leviathan.database.database_manager import (
    ContextTable, EntityTypesTable, DistinctEntitiesTable, EntitiesTable, FileMetadata
)


def generate_dataframe(db_session, tree_items, file_items, context_selection,
                       only_crossmatches=False, start_date=None, end_date=None,
                       include_flagged=False, only_flagged=False,
                       only_unflagged=False):
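    """Build a pandas DataFrame of extracted entities for export.

    Depending on ``context_selection``, the result is either a compact
    per-entity summary (occurrence counts with concatenated source and
    timestamp lists) or a detailed per-occurrence listing with source file,
    line number, context snippet, and timestamp. The optional arguments
    restrict the result to a timestamp range, to checked files, to cross-file
    matches, and to flagged or unflagged entries.
    """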
    if not db_session:
        raise ValueError("Database session is None")

    all_data = []  # Accumulates result rows from all selected entity types

    # Extract entity_type from the tree items that are checked in the UI
    selected_entity_types = [item.entity_type for item in tree_items if item.checkState(0) == Qt.Checked]
    checked_files = list(file_items.getCheckedFiles())
    logging.debug(f"Generating dataframe, selected entity types: {selected_entity_types}, passed timestamp range: {start_date} - {end_date}")
    context_field = {
        'Kompakte Zusammenfassung ohne Kontext': None,
        'Kontext - gleiche Zeile': ContextTable.context_small,
        'Kontext - mittelgroß': ContextTable.context_medium,
        'Kontext - umfangreich': ContextTable.context_large
    }.get(context_selection)

    # Convert start_date and end_date to inclusive day bounds if both are provided
    if start_date and end_date:
        start_datetime = dt.combine(start_date, dt.min.time())
        end_datetime = dt.combine(end_date, dt.max.time())

    # Subquery counting distinct file IDs per distinct entity; with
    # only_crossmatches, keep only entities that occur in more than one file
    file_count_subquery = db_session.query(
        EntitiesTable.distinct_entities_id,
        func.count(distinct(EntitiesTable.file_id)).label('file_count')
    ).group_by(EntitiesTable.distinct_entities_id)
    if only_crossmatches:
        file_count_subquery = file_count_subquery.having(func.count(distinct(EntitiesTable.file_id)) > 1)
    file_count_subquery = file_count_subquery.subquery()
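
    # For each selected entity type, build one of two query shapes: a compact
    # per-entity summary, or a per-occurrence detail view with context.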
    for entity_type in selected_entity_types:
        if context_selection == 'Kompakte Zusammenfassung ohne Kontext':
            # Compact summary: one row per distinct entity, with an occurrence
            # count plus group_concat'ed source and timestamp lists
            # (group_concat is available in SQLite and MySQL)
            query = db_session.query(
                EntityTypesTable.entity_type,
                DistinctEntitiesTable.distinct_entity,
                func.count(EntitiesTable.entities_id).label('occurrences'),
                func.group_concat(
                    FileMetadata.file_name + ':line' + cast(EntitiesTable.line_number, String)
                ).label('sources'),
                func.group_concat(
                    cast(EntitiesTable.entry_timestamp, String)
                ).label('timestamps')
            ).join(EntityTypesTable, DistinctEntitiesTable.entity_types_id == EntityTypesTable.entity_type_id
            ).join(EntitiesTable, DistinctEntitiesTable.distinct_entities_id == EntitiesTable.distinct_entities_id
            ).join(FileMetadata, EntitiesTable.file_id == FileMetadata.file_id
            ).join(file_count_subquery, DistinctEntitiesTable.distinct_entities_id == file_count_subquery.c.distinct_entities_id
            ).filter(EntityTypesTable.entity_type == entity_type
            ).group_by(DistinctEntitiesTable.distinct_entity)

            # Apply the timestamp filter if start_date and end_date are provided
            if start_date and end_date:
                query = query.filter(EntitiesTable.entry_timestamp.between(start_datetime, end_datetime))
            if checked_files:
                query = query.filter(FileMetadata.file_name.in_(checked_files))
            # Flag filters only take effect when flagged entries are included
            if include_flagged:
                if only_flagged:
                    query = query.filter(EntitiesTable.flag == True)
                elif only_unflagged:
                    query = query.filter(EntitiesTable.flag == False)

            for row in query.all():
                # group_concat joins values with commas; swap in a readable separator
                sources = row[3].replace(',', ' // ') if row[3] is not None else ''
                timestamps = row[4].replace(',', ' // ') if row[4] is not None else ''
                all_data.append([row[0], row[1], row[2], timestamps, sources, ''])
        else:
            # Detailed view: one row per occurrence; the window function counts
            # occurrences per distinct entity without collapsing rows
            query = db_session.query(
                EntityTypesTable.entity_type,
                DistinctEntitiesTable.distinct_entity,
                func.count(EntitiesTable.entities_id).over(partition_by=DistinctEntitiesTable.distinct_entity).label('occurrences'),
                FileMetadata.file_name,
                EntitiesTable.line_number,
                context_field,
                EntitiesTable.entry_timestamp
            ).select_from(EntitiesTable
            ).join(DistinctEntitiesTable, EntitiesTable.distinct_entities_id == DistinctEntitiesTable.distinct_entities_id
            ).join(EntityTypesTable, DistinctEntitiesTable.entity_types_id == EntityTypesTable.entity_type_id
            ).join(FileMetadata, EntitiesTable.file_id == FileMetadata.file_id
            ).outerjoin(ContextTable, EntitiesTable.entities_id == ContextTable.entities_id
            ).join(file_count_subquery, DistinctEntitiesTable.distinct_entities_id == file_count_subquery.c.distinct_entities_id
            ).filter(EntityTypesTable.entity_type == entity_type)

            # Apply the timestamp filter if start_date and end_date are provided
            if start_date and end_date:
                query = query.filter(EntitiesTable.entry_timestamp.between(start_datetime, end_datetime))
            if checked_files:
                query = query.filter(FileMetadata.file_name.in_(checked_files))
            if include_flagged:
                if only_flagged:
                    query = query.filter(EntitiesTable.flag == True)
                elif only_unflagged:
                    query = query.filter(EntitiesTable.flag == False)

            for row in query.all():
                file_name = row[3]
                line_number = row[4]
                entry_timestamp = row[6].strftime('%Y-%m-%d %H:%M:%S') if row[6] is not None else ''
                context_info = row[5] if row[5] is not None else ''
                all_data.append([row[0], row[1], row[2], entry_timestamp, file_name, line_number, context_info])

    # Define the DataFrame columns based on context_selection: the compact
    # summary has no per-occurrence file/line columns
    if context_selection == 'Kompakte Zusammenfassung ohne Kontext':
        columns = ["Entity Type", "Entity", "Occurrences", "Timestamp", "Sources", "Context"]
    else:
        columns = ["Entity Type", "Entity", "Occurrences", "Timestamp", "Source File", "Line Number", "Context"]

    # Construct and return the DataFrame from all accumulated data
    return pd.DataFrame(all_data, columns=columns)
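

if __name__ == '__main__':
    # Minimal usage sketch, illustrative only. The SQLite path and the
    # _NoFileFilter stub below are assumptions, not part of the real
    # logline_leviathan GUI, which supplies its own session and widgets.
    from sqlalchemy import create_engine
    from sqlalchemy.orm import sessionmaker

    engine = create_engine('sqlite:///entities.db')  # assumed database location
    session = sessionmaker(bind=engine)()

    class _NoFileFilter:
        # Stand-in for the file-list widget: no files checked, so no file filter
        def getCheckedFiles(self):
            return []

    # With no checked tree items, no entity types are selected, so this
    # returns an empty DataFrame with the compact-summary columns.
    df = generate_dataframe(session, [], _NoFileFilter(),
                            'Kompakte Zusammenfassung ohne Kontext')
    print(df)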