initial commit
logline_leviathan/exporter/__init__.py (new file, 0 lines)
logline_leviathan/exporter/export_constructor.py (new file, 115 lines)
@@ -0,0 +1,115 @@
import logging
import pandas as pd
from datetime import datetime as dt
from PyQt5.QtCore import Qt
from sqlalchemy import func, cast, String, distinct
from logline_leviathan.database.database_manager import ContextTable, EntityTypesTable, DistinctEntitiesTable, EntitiesTable, FileMetadata


def generate_dataframe(db_session, tree_items, file_items, context_selection, only_crossmatches=False, start_date=None, end_date=None, include_flagged=False, only_flagged=False, only_unflagged=False):
    if not db_session:
        raise ValueError("Database session is None")

    all_data = []  # List to accumulate data from all entity types

    # Extract entity_type from selected tree items
    selected_entity_types = [item.entity_type for item in tree_items if item.checkState(0) == Qt.Checked]
    checked_files = [item for item in file_items.getCheckedFiles()]
    logging.debug(f"Generating dataframe, selected entity types: {selected_entity_types}, passed timestamp range: {start_date} - {end_date}")

    context_field = {
        'Kompakte Zusammenfassung ohne Kontext': None,           # compact summary without context
        'Kontext - gleiche Zeile': ContextTable.context_small,   # context: same line
        'Kontext - mittelgroß': ContextTable.context_medium,     # context: medium
        'Kontext - umfangreich': ContextTable.context_large      # context: extensive
    }.get(context_selection)

    # Convert start_date and end_date to datetime objects if they are not None
    if start_date and end_date:
        start_datetime = dt.combine(start_date, dt.min.time())
        end_datetime = dt.combine(end_date, dt.max.time())

    # Creating a subquery to count distinct file IDs
    file_count_subquery = db_session.query(
        EntitiesTable.distinct_entities_id,
        func.count(distinct(EntitiesTable.file_id)).label('file_count')
    ).group_by(EntitiesTable.distinct_entities_id)

    if only_crossmatches:
        file_count_subquery = file_count_subquery.having(func.count(distinct(EntitiesTable.file_id)) > 1)

    file_count_subquery = file_count_subquery.subquery()

    for entity_type in selected_entity_types:
        if context_selection == 'Kompakte Zusammenfassung ohne Kontext':
            query = db_session.query(
                EntityTypesTable.entity_type,
                DistinctEntitiesTable.distinct_entity,
                func.count(EntitiesTable.entities_id).label('occurrences'),
                func.group_concat(
                    FileMetadata.file_name + ':line' + cast(EntitiesTable.line_number, String)
                ).label('sources'),
                func.group_concat(
                    cast(EntitiesTable.entry_timestamp, String)
                ).label('timestamps')
            ).join(EntityTypesTable, DistinctEntitiesTable.entity_types_id == EntityTypesTable.entity_type_id
            ).join(EntitiesTable, DistinctEntitiesTable.distinct_entities_id == EntitiesTable.distinct_entities_id
            ).join(FileMetadata, EntitiesTable.file_id == FileMetadata.file_id
            ).join(file_count_subquery, DistinctEntitiesTable.distinct_entities_id == file_count_subquery.c.distinct_entities_id
            ).filter(EntityTypesTable.entity_type == entity_type
            ).group_by(DistinctEntitiesTable.distinct_entity)
            # Apply timestamp filter if start_date and end_date are provided
            if start_date and end_date:
                query = query.filter(EntitiesTable.entry_timestamp.between(start_datetime, end_datetime))
            if checked_files:
                query = query.filter(FileMetadata.file_name.in_(checked_files))
            if include_flagged:
                if only_flagged:
                    query = query.filter(EntitiesTable.flag == True)
                elif only_unflagged:
                    query = query.filter(EntitiesTable.flag == False)

            for row in query.all():
                sources = row[3].replace(',', ' // ') if row[3] is not None else ''
                timestamps = row[4].replace(',', ' // ') if row[4] is not None else ''
                all_data.append([row[0], row[1], row[2], timestamps, sources, ''])

        else:
            query = db_session.query(
                EntityTypesTable.entity_type,
                DistinctEntitiesTable.distinct_entity,
                func.count(EntitiesTable.entities_id).over(partition_by=DistinctEntitiesTable.distinct_entity).label('occurrences'),
                FileMetadata.file_name,
                EntitiesTable.line_number,
                context_field,
                EntitiesTable.entry_timestamp
            ).select_from(EntitiesTable
            ).join(DistinctEntitiesTable, EntitiesTable.distinct_entities_id == DistinctEntitiesTable.distinct_entities_id
            ).join(EntityTypesTable, DistinctEntitiesTable.entity_types_id == EntityTypesTable.entity_type_id
            ).join(FileMetadata, EntitiesTable.file_id == FileMetadata.file_id
            ).outerjoin(ContextTable, EntitiesTable.entities_id == ContextTable.entities_id
            ).join(file_count_subquery, DistinctEntitiesTable.distinct_entities_id == file_count_subquery.c.distinct_entities_id
            ).filter(EntityTypesTable.entity_type == entity_type)
            # Apply timestamp filter if start_date and end_date are provided
            if start_date and end_date:
                query = query.filter(EntitiesTable.entry_timestamp.between(start_datetime, end_datetime))
            if checked_files:
                query = query.filter(FileMetadata.file_name.in_(checked_files))
            if include_flagged:
                if only_flagged:
                    query = query.filter(EntitiesTable.flag == True)
                elif only_unflagged:
                    query = query.filter(EntitiesTable.flag == False)

            for row in query.all():
                file_name = row[3]
                line_number = row[4]
                entry_timestamp = row[6].strftime('%Y-%m-%d %H:%M:%S') if row[6] is not None else ''
                context_info = row[5] if row[5] is not None else ''
                all_data.append([row[0], row[1], row[2], entry_timestamp, file_name, line_number, context_info])

    # Define the columns for the DataFrame based on context_selection
    columns = ["Entity Type", "Entity", "Occurrences", "Timestamp", "Sources", "Context"] if context_selection == 'Kompakte Zusammenfassung ohne Kontext' else ["Entity Type", "Entity", "Occurrences", "Timestamp", "Source File", "Line Number", "Context"]

    # Construct and return the DataFrame from all accumulated data
    return pd.DataFrame(all_data, columns=columns)
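In the compact-summary branch above, row[3] and row[4] come from group_concat, which (assuming an SQLite backend, where group_concat joins values with plain commas by default) yields comma-separated strings; the code then rewrites those commas as ' // ' separators. A minimal stand-alone sketch with made-up values:

# Illustrative only: the two strings below are fabricated stand-ins for
# row[3] ('sources') and row[4] ('timestamps') of the compact-summary query.
sources = "auth.log:line12,mail.log:line87,auth.log:line301"
timestamps = "2024-01-02 10:15:00,2024-01-02 11:40:12,2024-01-03 08:05:59"

print(sources.replace(',', ' // '))
# auth.log:line12 // mail.log:line87 // auth.log:line301
print(timestamps.replace(',', ' // '))
# 2024-01-02 10:15:00 // 2024-01-02 11:40:12 // 2024-01-03 08:05:59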
logline_leviathan/exporter/html_export.py (new file, 71 lines)
@@ -0,0 +1,71 @@
from logline_leviathan.exporter.export_constructor import generate_dataframe
import re
import pandas as pd


def create_regex_pattern_from_entity(entity):
    words = entity.split()
    regex_pattern = "|".join(re.escape(word) for word in words)
    return re.compile(regex_pattern, re.IGNORECASE)


def highlight_entities_in_context(context, entity_regex):
    def replace_match(match):
        return f"<mark>{match.group()}</mark>"
    return re.sub(entity_regex, replace_match, context)


def generate_html_file(output_file_path, db_session, checkboxes, files, context_selection, only_crossmatches, start_date=None, end_date=None, include_flagged=False, only_flagged=False, only_unflagged=False):
    # Fetch data using the new DataFrame constructor
    df = generate_dataframe(db_session, checkboxes, files, context_selection, only_crossmatches, start_date, end_date, include_flagged, only_flagged, only_unflagged)

    # Add line breaks for HTML formatting where needed
    if context_selection == 'Kompakte Zusammenfassung ohne Kontext':
        df['Sources'] = df['Sources'].apply(lambda x: x.replace(' // ', ' // <br>'))
        df['Timestamp'] = df['Timestamp'].apply(lambda x: x.replace(' // ', ' // <br>'))

    # Iterate over the DataFrame to apply regex-based highlighting
    for index, row in df.iterrows():
        entity_regex = create_regex_pattern_from_entity(row['Entity'])
        df.at[index, 'Context'] = highlight_entities_in_context(row['Context'], entity_regex)

    # Replace newline characters with HTML line breaks in the 'Context' column
    df['Context'] = df['Context'].apply(lambda x: x.replace('\n', '<br>') if x else x)

    # Convert DataFrame to HTML table
    html_table = df.to_html(classes="table table-bordered", escape=False, index=False)

    html_template = f"""
    <!DOCTYPE html>
    <html>
    <head>
        <title>Logline Leviathan Report</title>
        <style>
            .table {{
                width: 100%;
                max-width: 100%;
                margin-bottom: 1rem;
                background-color: transparent;
            }}
            .table th, .table td {{
                padding: 0.75rem;
                vertical-align: top;
                border-top: 1px solid #dee2e6;
                max-width: 300px; /* Max width */
                word-wrap: break-word; /* Enable word wrapping */
            }}
            .table-bordered {{
                border: 1px solid #dee2e6;
            }}
            .table-bordered th, .table-bordered td {{
                border: 1px solid #dee2e6;
            }}
        </style>
    </head>
    <body>
        {html_table}
    </body>
    </html>"""

    # Write the HTML template to the file
    with open(output_file_path, 'w', encoding='utf-8') as file:
        file.write(html_template)
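A small usage sketch for the two highlighting helpers in html_export.py. The input string is made up, and the import assumes the logline_leviathan package is on PYTHONPATH; otherwise the two helper definitions can be pasted in directly:

# Hypothetical usage of the helpers defined in this commit.
from logline_leviathan.exporter.html_export import (
    create_regex_pattern_from_entity,
    highlight_entities_in_context,
)

pattern = create_regex_pattern_from_entity("john doe")   # compiles to john|doe, case-insensitive
context = "Login failure for John.Doe from 10.0.0.5"     # fabricated log line
print(highlight_entities_in_context(context, pattern))
# Login failure for <mark>John</mark>.<mark>Doe</mark> from 10.0.0.5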
logline_leviathan/exporter/nice_export.py (new file, 94 lines)
@@ -0,0 +1,94 @@
import re
from logline_leviathan.exporter.export_constructor import generate_dataframe


def create_regex_pattern_from_entity(entity):
    words = entity.split()
    regex_pattern = "|".join(re.escape(word) for word in words)
    return re.compile(regex_pattern, re.IGNORECASE)


def highlight_entities_in_context(context, entity_regex):
    def replace_match(match):
        return f"<mark>{match.group()}</mark>"
    return re.sub(entity_regex, replace_match, context)


def generate_niceoutput_file(output_file_path, db_session, checkboxes, files, context_selection, only_crossmatches, start_date=None, end_date=None, include_flagged=False, only_flagged=False, only_unflagged=False):
    # Fetch data using the new DataFrame constructor
    df = generate_dataframe(db_session, checkboxes, files, context_selection, only_crossmatches, start_date, end_date, include_flagged, only_flagged, only_unflagged)

    # Add line breaks for HTML formatting where needed
    if context_selection == 'Kompakte Zusammenfassung ohne Kontext':
        df['Sources'] = df['Sources'].apply(lambda x: x.replace(' // ', ' // <br>'))
        df['Timestamp'] = df['Timestamp'].apply(lambda x: x.replace(' // ', ' // <br>'))

    # Iterate over the DataFrame to apply regex-based highlighting
    for index, row in df.iterrows():
        entity_regex = create_regex_pattern_from_entity(row['Entity'])
        df.at[index, 'Context'] = highlight_entities_in_context(row['Context'], entity_regex)

    # Replace newline characters with HTML line breaks in the 'Context' column
    df['Context'] = df['Context'].apply(lambda x: x.replace('\n', '<br>') if x else x)

    # Convert DataFrame to HTML table
    html_table = df.to_html(classes="display responsive nowrap", table_id="example", escape=False, index=False)

    # HTML template with doubled curly braces in JavaScript part and additional configurations
    html_template = """
    <!DOCTYPE html>
    <html>
    <head>
        <title>Logline Leviathan Report</title>
        <link rel="stylesheet" type="text/css" href="https://cdn.datatables.net/1.11.5/css/jquery.dataTables.min.css"/>
        <link rel="stylesheet" type="text/css" href="https://cdn.datatables.net/buttons/2.2.2/css/buttons.dataTables.min.css"/>
        <script type="text/javascript" src="https://code.jquery.com/jquery-3.5.1.js"></script>
        <script type="text/javascript" src="https://cdn.datatables.net/1.11.5/js/jquery.dataTables.min.js"></script>
        <script type="text/javascript" src="https://cdn.datatables.net/buttons/2.2.2/js/dataTables.buttons.min.js"></script>
        <script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/jszip/3.1.3/jszip.min.js"></script>
        <script type="text/javascript" src="https://cdn.datatables.net/buttons/2.2.2/js/buttons.html5.min.js"></script>
        <script type="text/javascript" src="https://cdn.datatables.net/buttons/2.2.2/js/buttons.print.min.js"></script>
    </head>
    <body>
        {0}
        <script type="text/javascript">
            $(document).ready(function () {{
                // DataTables initialization
                var table = $('#example').DataTable({{
                    "dom": 'Blfrtip',
                    "buttons": ['copy', 'csv', 'excel', 'pdf', 'print'],
                    "searching": true,
                    "fixedHeader": true,
                    "autoWidth": false,
                    "lengthChange": true,
                    "pageLength": 10,
                    "orderCellsTop": true,
                }});

                // Create dropdown filtering menus
                $('#example thead tr').clone(true).appendTo('#example thead');
                $('#example thead tr:eq(1) th').each(function (i) {{
                    var title = $(this).text();
                    if (title === 'Entity Type' || title === 'Entity' || title === 'Occurrences' || title === 'Timestamp' || title === 'Sources' || title === 'Source File' || title === 'Line Number') {{
                        var select = $('<select><option value=""></option></select>')
                            .appendTo($(this).empty())
                            .on('change', function () {{
                                var val = $(this).val();
                                table.column(i)
                                    .search(val ? '^' + $(this).val() + '$' : val, true, false)
                                    .draw();
                            }});

                        table.column(i).data().unique().sort().each(function (d, j) {{
                            select.append('<option value="'+d+'">'+d+'</option>')
                        }});
                    }} else {{
                        $(this).html('');
                    }}
                }});
            }});
        </script>
    </body>
    </html>""".format(html_table)

    # Write the HTML template to the file
    with open(output_file_path, 'w', encoding='utf-8') as file:
        file.write(html_template)
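The doubled braces in the template above are not a typo: because the final string is produced with str.format(), every literal { and } in the embedded CSS/JavaScript must be written as {{ and }}, while {0} stays a real placeholder for the rendered table. A tiny stand-alone demonstration (the table snippet is made up):

# Minimal sketch of str.format brace escaping, mirroring the template above.
template = """
<body>
{0}
<script>
$(document).ready(function () {{
    var table = $('#example').DataTable({{ "pageLength": 10 }});
}});
</script>
</body>
"""
print(template.format('<table id="example"></table>'))
# The {{ / }} pairs come out as single braces; {0} is replaced by the table HTML.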
logline_leviathan/exporter/wordlist_export.py (new file, 46 lines)
@@ -0,0 +1,46 @@
from logline_leviathan.database.database_manager import ContextTable, EntityTypesTable, DistinctEntitiesTable, EntitiesTable, FileMetadata
from sqlalchemy import func, distinct
from PyQt5.QtCore import Qt


def generate_wordlist(output_file_path, db_session, checkboxes, only_crossmatches, start_date=None, end_date=None, include_flagged=False, only_flagged=False, only_unflagged=False):
    # Check if there are any checkboxes selected
    if not checkboxes:
        raise ValueError("No entities selected")

    # Get selected entity types from checkboxes
    selected_entity_types = [item.entity_type for item in checkboxes if item.checkState(0) == Qt.Checked]

    # Prepare the initial query with proper joins
    query = db_session.query(
        DistinctEntitiesTable.distinct_entity
    ).join(
        EntitiesTable, DistinctEntitiesTable.distinct_entities_id == EntitiesTable.distinct_entities_id
    ).join(
        EntityTypesTable, EntitiesTable.entity_types_id == EntityTypesTable.entity_type_id
    ).filter(
        EntityTypesTable.entity_type.in_(selected_entity_types)
    )

    # Add timestamp filtering if necessary
    if start_date and end_date:
        query = query.filter(EntitiesTable.entry_timestamp.between(start_date, end_date))

    # Handle crossmatches, flagged, and unflagged conditions
    if only_crossmatches:
        query = query.group_by(DistinctEntitiesTable.distinct_entity).having(func.count(distinct(EntitiesTable.file_id)) > 1)
    if include_flagged:
        if only_flagged:
            query = query.filter(EntitiesTable.flag == True)
        elif only_unflagged:
            query = query.filter(EntitiesTable.flag == False)

    # Execute the query and fetch all results
    results = query.all()

    # Write the results to the file
    with open(output_file_path, 'w', encoding='utf-8') as file:
        for result in results:
            file.write(result.distinct_entity + '\n')
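The only_crossmatches branch relies on GROUP BY plus HAVING count(DISTINCT file_id) > 1 to keep only entities seen in more than one file. Below is a self-contained sketch of that pattern against an in-memory SQLite database; the Hit model is a made-up stand-in, not the project's database_manager schema, and it assumes SQLAlchemy 1.4 or newer:

# Hypothetical demonstration of the crossmatch filter.
from sqlalchemy import Column, Integer, String, create_engine, distinct, func
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()

class Hit(Base):
    __tablename__ = "hits"
    id = Column(Integer, primary_key=True)
    entity = Column(String)
    file_id = Column(Integer)

engine = create_engine("sqlite:///:memory:")
Base.metadata.create_all(engine)

with Session(engine) as session:
    session.add_all([
        Hit(entity="10.0.0.1", file_id=1),
        Hit(entity="10.0.0.1", file_id=2),   # appears in two files -> crossmatch
        Hit(entity="10.0.0.2", file_id=1),   # single file -> filtered out
    ])
    session.commit()

    crossmatches = (
        session.query(Hit.entity)
        .group_by(Hit.entity)
        .having(func.count(distinct(Hit.file_id)) > 1)
        .all()
    )
    print(crossmatches)  # [('10.0.0.1',)]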
logline_leviathan/exporter/xlsx_export.py (new file, 47 lines)
@@ -0,0 +1,47 @@
import pandas as pd
from logline_leviathan.exporter.export_constructor import generate_dataframe


def ensure_utf8(s):
    if isinstance(s, str):
        return s.encode('utf-8', errors='replace').decode('utf-8')
    return s


def generate_xlsx_file(output_file_path, db_session, checkboxes, files, context_selection, only_crossmatches, start_date=None, end_date=None, include_flagged=False, only_flagged=False, only_unflagged=False):
    # Fetch data using the new DataFrame constructor
    df = generate_dataframe(db_session, checkboxes, files, context_selection, only_crossmatches, start_date, end_date, include_flagged, only_flagged, only_unflagged)
    # Process context field
    if 'Context' in df.columns:
        df['Context'] = df['Context'].str.strip()  # Trim whitespace
        df['Context'] = df['Context'].str.replace(r'[^\x00-\x7F]+', '', regex=True)  # Remove non-ASCII characters
        df['Context'] = df['Context'].apply(lambda x: x[:32767] if isinstance(x, str) else x)  # Truncate to 32767 characters (Excel cell limit)

    # Reorder columns based on whether 'Sources' or 'Source File' and 'Line Number' columns are in the DataFrame
    if 'Sources' in df.columns:
        df = df[["Entity Type", "Entity", "Occurrences", "Timestamp", "Sources", "Context"]]
    elif 'Source File' in df.columns and 'Line Number' in df.columns:
        df = df[["Entity Type", "Entity", "Occurrences", "Timestamp", "Source File", "Line Number", "Context"]]

    # Apply ensure_utf8 to all string columns in df
    for col in df.select_dtypes(include=[object]):
        df[col] = df[col].apply(ensure_utf8)

    # Using pandas.ExcelWriter with the openpyxl engine
    with pd.ExcelWriter(output_file_path, engine='openpyxl') as writer:
        for entity_type in df['Entity Type'].unique():
            df_filtered = df[df['Entity Type'] == entity_type]
            df_filtered.to_excel(writer, sheet_name=entity_type, index=False)

            # Get the openpyxl worksheet object for this sheet
            worksheet = writer.sheets[entity_type]

            # Set column width
            for idx, col in enumerate(df_filtered.columns):
                # Adjust the column width if necessary
                worksheet.column_dimensions[chr(65 + idx)].width = 20  # 65 is ASCII for 'A'

            # Set alignment if needed
            # for row in worksheet.iter_rows(min_row=2, max_col=len(df_filtered.columns), max_row=len(df_filtered) + 1):
            #     for cell in row:
            #         cell.alignment = Alignment(wrap_text=True)

    # The file is saved automatically when the with block exits
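One caveat in the worksheet loop above: chr(65 + idx) only produces valid column letters for the first 26 columns (A-Z). The current exports never exceed seven columns, but if wider sheets ever appear, openpyxl ships a helper for this. A hypothetical sketch (the demo file name is made up):

# Sketch only: column sizing via openpyxl's get_column_letter instead of chr().
from openpyxl import Workbook
from openpyxl.utils import get_column_letter

wb = Workbook()
ws = wb.active
for idx in range(30):  # works past column Z (AA, AB, ...)
    ws.column_dimensions[get_column_letter(idx + 1)].width = 20
wb.save("widths_demo.xlsx")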