# Export query / DataFrame generation for entity analysis results.
import logging
from datetime import datetime as dt

import pandas as pd
from PyQt5.QtCore import Qt
from sqlalchemy import func, cast, String, distinct

from logline_leviathan.database.database_manager import ContextTable, EntityTypesTable, DistinctEntitiesTable, EntitiesTable, FileMetadata
def _apply_shared_filters(query, start_datetime, end_datetime, checked_files,
                          include_flagged, only_flagged, only_unflagged):
    """Apply the timestamp, file-selection and flag filters shared by both query variants.

    Returns the (possibly) narrowed query; the input query is not mutated.
    """
    # Timestamp range filter — only active when both bounds were supplied.
    if start_datetime is not None and end_datetime is not None:
        query = query.filter(EntitiesTable.entry_timestamp.between(start_datetime, end_datetime))
    # Restrict to the files the user checked in the file tree (empty list = all files).
    if checked_files:
        query = query.filter(FileMetadata.file_name.in_(checked_files))
    # Flag filters are only honoured when flagged entries are included at all.
    # NOTE(review): only_flagged/only_unflagged being ignored when
    # include_flagged is False mirrors the original behavior — confirm intent.
    if include_flagged:
        if only_flagged:
            query = query.filter(EntitiesTable.flag == True)  # noqa: E712 — SQLAlchemy expression
        elif only_unflagged:
            query = query.filter(EntitiesTable.flag == False)  # noqa: E712
    return query


def generate_dataframe(db_session, tree_items, file_items, context_selection, only_crossmatches=False, start_date=None, end_date=None, include_flagged=False, only_flagged=False, only_unflagged=False):
    """Build a pandas DataFrame of entity occurrences for the selected entity types.

    Two output shapes exist, chosen by *context_selection*:

    * ``'Kompakte Zusammenfassung ohne Kontext'`` — one row per distinct entity
      with comma-concatenated sources/timestamps (columns:
      Entity Type, Entity, Occurrences, Timestamp, Sources, Context).
    * any other selection — one row per individual occurrence, including the
      source file, line number and the chosen context column (columns:
      Entity Type, Entity, Occurrences, Timestamp, Source File, Line Number, Context).

    :param db_session: open SQLAlchemy session (required).
    :param tree_items: tree widget items; those with checkState(0) == Qt.Checked
        select the entity types to export.
    :param file_items: widget providing ``getCheckedFiles()`` — file names to include.
    :param context_selection: one of the (German) UI context options.
    :param only_crossmatches: keep only entities that occur in more than one file.
    :param start_date: optional range start (date); only applied together with end_date.
    :param end_date: optional range end (date).
    :param include_flagged: master switch for the flag filters below.
    :param only_flagged: restrict to flagged entries (when include_flagged).
    :param only_unflagged: restrict to unflagged entries (when include_flagged).
    :returns: pd.DataFrame with the columns described above.
    :raises ValueError: if *db_session* is None.
    """
    if not db_session:
        raise ValueError("Database session is None")

    all_data = []  # rows accumulated across all selected entity types

    # Entity types whose tree items are checked, and the user-checked file names.
    selected_entity_types = [item.entity_type for item in tree_items if item.checkState(0) == Qt.Checked]
    checked_files = list(file_items.getCheckedFiles())
    logging.debug("Generating dataframe, selected entity types: %s, passed timestamp range: %s - %s",
                  selected_entity_types, start_date, end_date)

    # Map the UI selection to the matching context column; None selects the
    # compact summary branch below.
    context_field = {
        'Kompakte Zusammenfassung ohne Kontext': None,
        'Kontext - gleiche Zeile': ContextTable.context_small,
        'Kontext - mittelgroß': ContextTable.context_medium,
        'Kontext - umfangreich': ContextTable.context_large
    }.get(context_selection)

    # Expand the date range to whole days; None disables the timestamp filter.
    if start_date and end_date:
        start_datetime = dt.combine(start_date, dt.min.time())
        end_datetime = dt.combine(end_date, dt.max.time())
    else:
        start_datetime = end_datetime = None

    # Subquery counting distinct source files per entity; a "crossmatch" is an
    # entity seen in more than one file.
    file_count_subquery = db_session.query(
        EntitiesTable.distinct_entities_id,
        func.count(distinct(EntitiesTable.file_id)).label('file_count')
    ).group_by(EntitiesTable.distinct_entities_id)

    if only_crossmatches:
        file_count_subquery = file_count_subquery.having(func.count(distinct(EntitiesTable.file_id)) > 1)

    file_count_subquery = file_count_subquery.subquery()

    for entity_type in selected_entity_types:
        if context_selection == 'Kompakte Zusammenfassung ohne Kontext':
            # Summary form: one row per distinct entity; sources and timestamps
            # are comma-concatenated in SQL via group_concat.
            query = db_session.query(
                EntityTypesTable.entity_type,
                DistinctEntitiesTable.distinct_entity,
                func.count(EntitiesTable.entities_id).label('occurrences'),
                func.group_concat(
                    FileMetadata.file_name + ':line' + cast(EntitiesTable.line_number, String)
                ).label('sources'),
                func.group_concat(
                    cast(EntitiesTable.entry_timestamp, String)
                ).label('timestamps')
            ).join(EntityTypesTable, DistinctEntitiesTable.entity_types_id == EntityTypesTable.entity_type_id
            ).join(EntitiesTable, DistinctEntitiesTable.distinct_entities_id == EntitiesTable.distinct_entities_id
            ).join(FileMetadata, EntitiesTable.file_id == FileMetadata.file_id
            ).join(file_count_subquery, DistinctEntitiesTable.distinct_entities_id == file_count_subquery.c.distinct_entities_id
            ).filter(EntityTypesTable.entity_type == entity_type
            ).group_by(DistinctEntitiesTable.distinct_entity)

            query = _apply_shared_filters(query, start_datetime, end_datetime, checked_files,
                                          include_flagged, only_flagged, only_unflagged)

            for row in query.all():
                # group_concat joins with ','; present the lists as ' // ' instead.
                sources = row[3].replace(',', ' // ') if row[3] is not None else ''
                timestamps = row[4].replace(',', ' // ') if row[4] is not None else ''
                all_data.append([row[0], row[1], row[2], timestamps, sources, ''])

        else:
            # Detail form: one row per occurrence with its context. The window
            # function keeps the per-entity occurrence total on every row.
            query = db_session.query(
                EntityTypesTable.entity_type,
                DistinctEntitiesTable.distinct_entity,
                func.count(EntitiesTable.entities_id).over(partition_by=DistinctEntitiesTable.distinct_entity).label('occurrences'),
                FileMetadata.file_name,
                EntitiesTable.line_number,
                context_field,
                EntitiesTable.entry_timestamp
            ).select_from(EntitiesTable
            ).join(DistinctEntitiesTable, EntitiesTable.distinct_entities_id == DistinctEntitiesTable.distinct_entities_id
            ).join(EntityTypesTable, DistinctEntitiesTable.entity_types_id == EntityTypesTable.entity_type_id
            ).join(FileMetadata, EntitiesTable.file_id == FileMetadata.file_id
            ).outerjoin(ContextTable, EntitiesTable.entities_id == ContextTable.entities_id
            ).join(file_count_subquery, DistinctEntitiesTable.distinct_entities_id == file_count_subquery.c.distinct_entities_id
            ).filter(EntityTypesTable.entity_type == entity_type)

            query = _apply_shared_filters(query, start_datetime, end_datetime, checked_files,
                                          include_flagged, only_flagged, only_unflagged)

            for row in query.all():
                file_name = row[3]
                line_number = row[4]
                entry_timestamp = row[6].strftime('%Y-%m-%d %H:%M:%S') if row[6] is not None else ''
                context_info = row[5] if row[5] is not None else ''
                all_data.append([row[0], row[1], row[2], entry_timestamp, file_name, line_number, context_info])

    # Column set depends on which output shape was produced above.
    columns = ["Entity Type", "Entity", "Occurrences", "Timestamp", "Sources", "Context"] if context_selection == 'Kompakte Zusammenfassung ohne Kontext' else ["Entity Type", "Entity", "Occurrences", "Timestamp", "Source File", "Line Number", "Context"]

    return pd.DataFrame(all_data, columns=columns)
|