initial commit
0
logline_leviathan/file_processor/__init__.py
Normal file
88
logline_leviathan/file_processor/docx_processor.py
Normal file
@@ -0,0 +1,88 @@
import logging
from docx import Document
from datetime import datetime
from logline_leviathan.file_processor.parser_thread import parse_content
from logline_leviathan.file_processor.file_database_ops import handle_file_metadata, handle_individual_entity, handle_distinct_entity, handle_context_snippet
import re

def read_docx_content(file_path):
    try:
        doc = Document(file_path)
        full_content = '\n'.join([paragraph.text for paragraph in doc.paragraphs])
        return full_content.splitlines(True)  # Keep end-of-line characters
    except Exception as e:
        logging.error(f"Error reading DOCX file {file_path}: {e}")
        return None

def get_line_numbers_from_pos(content, start_pos, end_pos):
    # This function is similar to the one for text and xlsx files
    # Adjustments might be needed for the nuances of docx content structure
    start_line = end_line = 0
    current_pos = 0
    for i, line in enumerate(content):
        current_pos += len(line)
        if start_pos < current_pos:
            start_line = i
            break
    # Continue scanning for the end line without counting the start line twice
    if content:
        current_pos -= len(content[start_line])
    for i, line in enumerate(content[start_line:], start=start_line):
        current_pos += len(line)
        if end_pos <= current_pos:
            end_line = i
            break
    return start_line, end_line

def process_docx_file(file_path, file_mimetype, thread_instance, db_session, abort_flag):
    try:
        file_metadata = handle_file_metadata(db_session, file_path, file_mimetype)
        content = read_docx_content(file_path)
        if content is None:
            return 0
        full_content = ''.join(content)  # Join all lines into a single string
        thread_instance.update_status.emit(f" Verarbeite DOCX-Datei: {file_path}")

        # Call the parser and get matches along with entity types
        parsed_entities = parse_content(full_content, abort_flag, db_session)

        entity_count = 0
        for entity_type_id, match_text, start_pos, end_pos in parsed_entities:
            if not match_text.strip():
                continue

            timestamp = find_timestamp_before_match(full_content, start_pos)
            match_start_line, match_end_line = get_line_numbers_from_pos(content, start_pos, end_pos)

            entity = handle_distinct_entity(db_session, match_text, entity_type_id)
            individual_entity = handle_individual_entity(db_session, entity, file_metadata, match_start_line, timestamp, entity_type_id, abort_flag, thread_instance)

            if individual_entity:
                entity_count += 1
                handle_context_snippet(db_session, individual_entity, content, match_start_line, match_end_line)

        return entity_count
    except Exception as e:
        db_session.rollback()
        logging.error(f"Error processing DOCX file {file_path}: {e}")
        return 0

def find_timestamp_before_match(content, match_start_pos):
    search_content = content[:match_start_pos]
    timestamp_patterns = [
        (r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', '%Y-%m-%d %H:%M:%S'),   # ISO 8601 Extended
        (r'\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2}', '%Y/%m/%d %H:%M:%S'),   # ISO 8601 with slashes
        (r'\d{2}/\d{2}/\d{4} \d{2}:\d{2}:\d{2}', '%d/%m/%Y %H:%M:%S'),   # European date format
        (r'\d{2}-\d{2}-\d{4} \d{2}:\d{2}:\d{2}', '%m-%d-%Y %H:%M:%S'),   # US date format
        (r'\d{8}_\d{6}', '%Y%m%d_%H%M%S'),                               # Compact format
        (r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}', '%Y-%m-%dT%H:%M:%S'),   # ISO 8601 Basic
        (r'\d{2}\.\d{2}\.\d{4} \d{2}:\d{2}:\d{2}', '%d.%m.%Y %H:%M:%S'), # German date format
        (r'\d{4}\d{2}\d{2} \d{2}:\d{2}:\d{2}', '%Y%m%d %H:%M:%S'),       # Basic format without separators
        (r'\d{1,2}-[A-Za-z]{3}-\d{4} \d{2}:\d{2}:\d{2}', '%d-%b-%Y %H:%M:%S'),  # English date format with month name
        (r'(?:19|20)\d{10}', '%Y%m%d%H%M'),                              # Compact numeric format
        # Add more patterns as needed
    ]
    for pattern, date_format in timestamp_patterns:
        for timestamp_match in reversed(list(re.finditer(pattern, search_content))):
            try:
                # Convert the matched timestamp to the standardized format
                matched_timestamp = datetime.strptime(timestamp_match.group(), date_format)
                return matched_timestamp.strftime('%Y-%m-%d %H:%M:%S')
            except ValueError:
                continue  # If conversion fails, try the next candidate
    return None
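As an aside, a minimal sketch (illustrative only, not part of the commit) of what find_timestamp_before_match does with the formats listed above: it scans the text before a match and returns the last parseable timestamp, normalized to '%Y-%m-%d %H:%M:%S'. The sample string and offset below are invented.

# Illustrative only: exercise the DOCX variant of find_timestamp_before_match.
sample = "Log start 31.12.2023 23:59:58 ... ERROR user@example.com rejected"
offset = sample.index("user@example.com")   # hypothetical match position
print(find_timestamp_before_match(sample, offset))
# Expected: '2023-12-31 23:59:58' (German format normalized to ISO)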
136
logline_leviathan/file_processor/file_database_ops.py
Normal file
@@ -0,0 +1,136 @@
import logging
import os
from logline_leviathan.database.database_manager import FileMetadata, DistinctEntitiesTable, EntitiesTable, ContextTable, session_scope
from datetime import datetime


def handle_file_metadata(db_session, file_path, file_mimetype, sheet_name=None):
    #with session_scope() as db_session:
    try:
        # Construct the file name with or without the sheet name
        base_file_name = os.path.basename(file_path)
        modified_file_name = f"{base_file_name}_{sheet_name}" if sheet_name else base_file_name

        # Search for existing metadata using the modified file name
        file_metadata = db_session.query(FileMetadata).filter_by(file_path=file_path, file_name=modified_file_name).first()

        if not file_metadata:
            logging.debug(f"File metadata for {modified_file_name} does not exist yet.")
            file_metadata = FileMetadata(file_name=modified_file_name, file_path=file_path, file_mimetype=file_mimetype)
            db_session.add(file_metadata)
        else:
            logging.debug(f"File metadata {file_metadata} already exists.")
            # Update the MIME type if the record already exists
            file_metadata.file_mimetype = file_mimetype
            logging.debug(f"Updated file mimetype: {file_metadata.file_mimetype}")
        logging.debug(f"Committing file metadata {file_metadata}")
        db_session.commit()
        return file_metadata
    except Exception as e:
        logging.error(f"Error handling file metadata for {file_path}: {e}")
        return None


def handle_distinct_entity(db_session, match_text, entity_type_id):
    #with session_scope() as db_session:
    try:
        distinct_entity = db_session.query(DistinctEntitiesTable).filter_by(distinct_entity=match_text, entity_types_id=entity_type_id).first()
        if not distinct_entity:
            logging.debug(f"Distinct entity {match_text} does not exist yet.")
            distinct_entity = DistinctEntitiesTable(distinct_entity=match_text, entity_types_id=entity_type_id)
            db_session.add(distinct_entity)
            logging.debug(f"Committing distinct entity {distinct_entity}")
            db_session.commit()
        else:
            logging.debug(f"Distinct entity {distinct_entity} already exists.")

        return distinct_entity
    except Exception as e:
        logging.error(f"Error handling distinct entity {match_text}: {e}")
        return None


def handle_individual_entity(db_session, entity, file_metadata, line_number, timestamp, entity_types_id, abort_flag, thread_instance):
    #with session_scope() as db_session:
    try:
        if abort_flag():
            return None
        if timestamp and isinstance(timestamp, str):
            try:
                timestamp = datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S')
            except ValueError:
                logging.warning(f"Invalid timestamp format: {timestamp}")
                timestamp = None

        individual_entity = db_session.query(EntitiesTable).filter_by(
            distinct_entities_id=entity.distinct_entities_id,
            file_id=file_metadata.file_id,
            line_number=line_number
        ).first()

        if not individual_entity:
            logging.debug("Individual entity does not exist yet; creating it.")
            individual_entity = EntitiesTable(
                distinct_entities_id=entity.distinct_entities_id,
                file_id=file_metadata.file_id,
                line_number=line_number,
                entry_timestamp=timestamp,
                entity_types_id=entity_types_id
            )
            db_session.add(individual_entity)
            logging.debug(f"Committing individual entity {individual_entity}")
            db_session.commit()

            thread_instance.total_entities_count_lock.lock()  # Lock the mutex
            try:
                thread_instance.total_entities_count += 1
            finally:
                thread_instance.total_entities_count_lock.unlock()  # Unlock the mutex

            thread_instance.calculate_and_emit_rate()
        else:
            logging.debug(f"Individual entity {individual_entity} already exists.")

        return individual_entity
    except Exception as e:
        logging.error(f"Error handling individual entity in {file_metadata.file_path}, line {line_number}: {e}")
        return None


#def count_newlines(content, start, end):
#    return content[start:end].count('\n')

def handle_context_snippet(db_session, individual_entity, content, start_line, end_line):
    #with session_scope() as db_session:
    try:
        context_sizes = {
            'Kontext - gleiche Zeile': 0,
            'Kontext - mittelgroß': 8,
            'Kontext - umfangreich': 15
            #'Index Context': 30
        }

        context_snippets = {}
        for size, lines in context_sizes.items():
            context_start = max(0, start_line - lines)
            context_end = min(len(content), end_line + lines + 1)
            context_snippets[size] = "\n".join(content[context_start:context_end])

        # Check if a similar context already exists
        existing_context = db_session.query(ContextTable).filter_by(entities_id=individual_entity.entities_id).first()
        if not existing_context:
            context = ContextTable(entities_id=individual_entity.entities_id,
                                   context_small=context_snippets['Kontext - gleiche Zeile'],
                                   context_medium=context_snippets['Kontext - mittelgroß'],
                                   context_large=context_snippets['Kontext - umfangreich']
                                   )
            db_session.add(context)
            logging.debug(f"Committing context {context}")
            db_session.commit()
        else:
            logging.debug(f"Context {existing_context} already exists.")
    except Exception as e:
        logging.error(f"Error handling context snippet: {e}")
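For orientation, the processors in this commit all drive these helpers in the same order for each parsed match. The sketch below is illustrative only; the wrapper function and its unpacking are invented, while the four handler calls and their signatures come from the module above.

# Illustrative flow: how one parsed match is persisted via the handlers above.
def store_match(db_session, thread, file_path, mimetype, match, content, abort_flag):
    entity_type_id, match_text, start_line, end_line, timestamp = match  # hypothetical shape
    file_metadata = handle_file_metadata(db_session, file_path, mimetype)      # 1. file record
    entity = handle_distinct_entity(db_session, match_text, entity_type_id)    # 2. distinct value
    individual = handle_individual_entity(db_session, entity, file_metadata,   # 3. occurrence
                                          start_line, timestamp, entity_type_id,
                                          abort_flag, thread)
    if individual:                                                             # 4. context rows
        handle_context_snippet(db_session, individual, content, start_line, end_line)
    return individual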
218
logline_leviathan/file_processor/file_processor_thread copy.py
Normal file
@@ -0,0 +1,218 @@
from multiprocessing.spawn import import_main_path
import sys
import time
import os
from PyQt5.QtCore import QThread, pyqtSignal, QMutex
from logline_leviathan.database.database_manager import session_scope
from logline_leviathan.gui.checkbox_panel import CheckboxPanel
from .text_processor import process_text_file
from .xlsx_processor import process_xlsx_file
from .pdf_processor import process_pdf_file
from .docx_processor import process_docx_file
import magic
import logging
import pathlib


class FileProcessorThread(QThread):
    update_progress = pyqtSignal(int)
    update_status = pyqtSignal(str)
    update_tree_signal = pyqtSignal()
    update_checkboxes_signal = pyqtSignal()
    update_rate = pyqtSignal(float, int, float, int, float, float)
    last_update_time = 0

    def __init__(self, file_paths):
        super().__init__()
        self.start_time = time.time()
        self.total_entities_count = 0
        self.total_entities_count_lock = QMutex()
        self.abort_mutex = QMutex()

        self.abort_flag = False
        self.file_paths = file_paths
        self.unsupported_files_count = 0
        self.processed_files_count = 0
        self.total_data_processed_kb = 0
        self.total_files_size_kb = sum(os.path.getsize(f) / 1024 for f in file_paths)

        self.unsupported_files_list = []
        self.all_unsupported_files = []

        self.checkbox_panel = CheckboxPanel()

    @property
    def abort_flag(self):
        # Getter: read the flag under the mutex
        self.abort_mutex.lock()
        flag = self._abort_flag
        self.abort_mutex.unlock()
        return flag

    @abort_flag.setter
    def abort_flag(self, value):
        # Setter: write the flag under the mutex
        self.abort_mutex.lock()
        self._abort_flag = value
        self.abort_mutex.unlock()

    def classify_file_type(self, file_path):
        # Mapping of file extensions to MIME types
        mime_types = {
            '.txt': 'text/plain',
            '.pdf': 'application/pdf',
            '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
            '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
            '.csv': 'text/csv',
            '.html': 'text/html',
            '.htm': 'text/html',
            '.xml': 'text/xml',
            '.json': 'application/json',
            '.yaml': 'text/yaml',
            '.yml': 'text/yaml',
            '.md': 'text/markdown',
            '.rtf': 'application/rtf',
            '.odt': 'application/vnd.oasis.opendocument.text',
            '.ods': 'application/vnd.oasis.opendocument.spreadsheet',
            '.odp': 'application/vnd.oasis.opendocument.presentation',
            '.log': 'text/plain',
            '.ini': 'text/plain',
            '.conf': 'text/plain',
            '.cfg': 'text/plain',
            '.js': 'application/javascript',
            '.css': 'text/css',
            '.php': 'text/php',
            '.py': 'text/x-python',
            '.rb': 'text/x-ruby',
            '.java': 'text/x-java-source',
            '.c': 'text/x-c',
            '.cpp': 'text/x-c++',
            '.h': 'text/x-c-header',
            '.hpp': 'text/x-c++-header',
            '.sh': 'application/x-sh',
            '.bat': 'application/x-bat',
            '.ps1': 'application/x-powershell',
            '.sql': 'text/x-sql',
            # Add more mappings as needed
        }
        try:
            mime = magic.Magic(mime=True)
            file_type = mime.from_file(file_path)
            return file_type
        except FileNotFoundError as e:
            logging.error(f"File not found: {file_path}. Encoding: {sys.getfilesystemencoding()}", exc_info=True)
        except Exception as e:
            try:
                clean_file_path = pathlib.Path(file_path)
                mime = magic.Magic(mime=True)
                file_type = mime.from_file(clean_file_path)
                return file_type
            except Exception as e:
                logging.error(f"The magic library failed to classify the file type: {e} // falling back to the file extension")
                _, file_extension = os.path.splitext(file_path)
                return mime_types.get(file_extension.lower(), 'application/octet-stream')  # Default to binary type if unknown

    def run(self):
        logging.debug("Thread run method started.")
        try:
            for index, file_path in enumerate(self.file_paths):
                #if not self.debugFileProcessor(file_path):
                #    continue
                file_size_kb = os.path.getsize(file_path) / 1024  # Get the file size in KiB
                self.total_data_processed_kb += file_size_kb
                if self.abort_flag:
                    self.update_status.emit("Analyse abgebrochen")
                    return
                logging.debug(f"Attempting to process file: {file_path}")
                file_type = self.classify_file_type(file_path)
                logging.info(f"ANALYZING {file_path} TYPE {file_type}")

                with session_scope() as session:
                    if 'text/' in file_type:
                        process_text_file(file_path, file_type, self, session, lambda: self.abort_flag)
                    elif 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' in file_type:
                        process_xlsx_file(file_path, file_type, self, session, lambda: self.abort_flag)
                    elif 'application/pdf' in file_type or file_type == ".pdf":
                        process_pdf_file(file_path, file_type, self, session, lambda: self.abort_flag)
                    elif 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' in file_type:
                        # Handle DOCX files
                        process_docx_file(file_path, file_type, self, session, lambda: self.abort_flag)
                    else:
                        logging.info(f"Skipping unsupported file type: {file_type}")
                        self.all_unsupported_files.append(file_path)
                        self.unsupported_files_count += 1
                        if len(self.unsupported_files_list) < 20:
                            self.unsupported_files_list.append(f"{file_path} (Type: {file_type})")
                        continue
                self.update_tree_signal.emit()
                self.update_checkboxes_signal.emit()
                self.processed_files_count = index + 1
                self.update_progress.emit(index + 1)
            self.update_status.emit(f" Verarbeitung abgeschlossen. {index + 1 - self.unsupported_files_count} von {len(self.file_paths)} Dateien verarbeitet.")
        except Exception as e:
            logging.error(f"Error processing files: {e}", exc_info=True)
            self.update_status.emit(f"Fehler beim Verarbeiten von Dateien: {e}")

    def calculate_and_emit_rate(self):
        current_time = time.time()
        if current_time - self.last_update_time >= 1:  # Check whether at least one second has passed
            entity_rate = self.calculate_rate()
            file_rate = self.calculate_file_rate()
            data_rate_kibs = self.calculate_data_rate()
            estimated_time = self.calculate_estimated_time_to_completion(data_rate_kibs)
            self.update_rate.emit(entity_rate, self.total_entities_count, file_rate, self.processed_files_count, estimated_time, data_rate_kibs)
            self.last_update_time = current_time

    def calculate_data_rate(self):
        elapsed_time = time.time() - self.start_time
        return self.total_data_processed_kb / elapsed_time if elapsed_time > 0 else 0

    def calculate_estimated_time_to_completion(self, data_rate_kibs):
        remaining_data_kb = self.total_files_size_kb - self.total_data_processed_kb
        if data_rate_kibs > 0:
            estimated_time = remaining_data_kb / data_rate_kibs
        else:
            estimated_time = float('inf')  # Indefinite time if the rate is zero
        return estimated_time

    def calculate_file_rate(self):
        elapsed_time = time.time() - self.start_time
        return self.processed_files_count / elapsed_time if elapsed_time > 0 else 0

    def calculate_rate(self):
        elapsed_time = time.time() - self.start_time
        rate = self.total_entities_count / elapsed_time if elapsed_time > 0 else 0
        return rate

    def abort(self):
        self.abort_flag = True

    def getUnsupportedFilesCount(self):
        return self.unsupported_files_count

    def getUnsupportedFilesList(self):
        return self.unsupported_files_list

    def debugFileProcessor(self, file_path):
        logging.debug(f"Attempting to process file: {file_path}")

        if not os.path.exists(file_path):
            logging.warning(f"File does not exist: {file_path}")
            return False
        elif not os.access(file_path, os.R_OK):
            logging.warning(f"File is not accessible: {file_path}")
            return False

        try:
            detected_encoding = magic.from_file(file_path, mime=True)
            logging.debug(f"Detected encoding for {file_path}: {detected_encoding}")
        except Exception as e:
            logging.error(f"Failed to detect encoding for {file_path}: {e}", exc_info=True)

        file_type = self.classify_file_type(file_path)
        logging.debug(f"Classified file type for {file_path}: {file_type}")

        return True
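The abort_flag property above wraps a plain attribute in a QMutex so the GUI thread can set the flag while run() polls it. A stripped-down sketch of that same pattern, independent of this class and assuming only PyQt5:

# Illustrative only: a mutex-guarded boolean property, as used for abort_flag.
from PyQt5.QtCore import QMutex

class Flag:
    def __init__(self):
        self._mutex = QMutex()
        self._value = False

    @property
    def value(self):
        self._mutex.lock()
        try:
            return self._value
        finally:
            self._mutex.unlock()

    @value.setter
    def value(self, new_value):
        self._mutex.lock()
        try:
            self._value = new_value
        finally:
            self._mutex.unlock()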
240
logline_leviathan/file_processor/file_processor_thread.py
Normal file
@@ -0,0 +1,240 @@
from multiprocessing.spawn import import_main_path
import sys
import time
import os
from PyQt5.QtCore import QThread, pyqtSignal, QMutex
from logline_leviathan.database.database_manager import session_scope
from logline_leviathan.gui.checkbox_panel import CheckboxPanel
from .text_processor import process_text_file
from .xlsx_processor import process_xlsx_file
from .pdf_processor import process_pdf_file
from .docx_processor import process_docx_file
import magic
import logging
import pathlib
from sqlalchemy import text
from sqlalchemy.exc import OperationalError

class FileProcessorThread(QThread):
    update_progress = pyqtSignal(int)
    update_status = pyqtSignal(str)
    update_tree_signal = pyqtSignal()
    update_checkboxes_signal = pyqtSignal()
    update_rate = pyqtSignal(float, int, float, int, float, float)
    last_update_time = 0

    def __init__(self, file_paths):
        super().__init__()
        self.start_time = time.time()
        self.total_entities_count = 0
        self.total_entities_count_lock = QMutex()
        self.abort_mutex = QMutex()

        self.abort_flag = False
        self.file_paths = file_paths
        self.unsupported_files_count = 0
        self.processed_files_count = 0
        self.total_data_processed_kb = 0
        self.total_files_size_kb = sum(os.path.getsize(f) / 1024 for f in file_paths)

        self.unsupported_files_list = []
        self.all_unsupported_files = []

        self.checkbox_panel = CheckboxPanel()

    @property
    def abort_flag(self):
        # Getter: read the flag under the mutex
        self.abort_mutex.lock()
        flag = self._abort_flag
        self.abort_mutex.unlock()
        return flag

    @abort_flag.setter
    def abort_flag(self, value):
        # Setter: write the flag under the mutex
        self.abort_mutex.lock()
        self._abort_flag = value
        self.abort_mutex.unlock()

    def classify_file_type(self, file_path):
        # Mapping of file extensions to MIME types
        mime_types = {
            '.txt': 'text/plain',
            '.pdf': 'application/pdf',
            '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
            '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
            '.csv': 'text/csv',
            '.html': 'text/html',
            '.htm': 'text/html',
            '.xml': 'text/xml',
            '.json': 'application/json',
            '.yaml': 'text/yaml',
            '.yml': 'text/yaml',
            '.md': 'text/markdown',
            '.rtf': 'application/rtf',
            '.odt': 'application/vnd.oasis.opendocument.text',
            '.ods': 'application/vnd.oasis.opendocument.spreadsheet',
            '.odp': 'application/vnd.oasis.opendocument.presentation',
            '.log': 'text/plain',
            '.ini': 'text/plain',
            '.conf': 'text/plain',
            '.cfg': 'text/plain',
            '.js': 'application/javascript',
            '.css': 'text/css',
            '.php': 'text/php',
            '.py': 'text/x-python',
            '.rb': 'text/x-ruby',
            '.java': 'text/x-java-source',
            '.c': 'text/x-c',
            '.cpp': 'text/x-c++',
            '.h': 'text/x-c-header',
            '.hpp': 'text/x-c++-header',
            '.sh': 'application/x-sh',
            '.bat': 'application/x-bat',
            '.ps1': 'application/x-powershell',
            '.sql': 'text/x-sql',
            # Add more mappings as needed
        }
        try:
            mime = magic.Magic(mime=True)
            file_type = mime.from_file(file_path)
            return file_type
        except FileNotFoundError as e:
            logging.error(f"File not found: {file_path}. Encoding: {sys.getfilesystemencoding()}", exc_info=True)
        except Exception as e:
            try:
                clean_file_path = pathlib.Path(file_path)
                mime = magic.Magic(mime=True)
                file_type = mime.from_file(clean_file_path)
                return file_type
            except Exception as e:
                logging.error(f"The magic library failed to classify the file type: {e} // falling back to the file extension")
                _, file_extension = os.path.splitext(file_path)
                return mime_types.get(file_extension.lower(), 'application/octet-stream')  # Default to binary type if unknown

    def run(self):
        logging.debug("Thread run method started.")
        try:
            for index, file_path in enumerate(self.file_paths):
                #if not self.debugFileProcessor(file_path):
                #    continue
                file_size_kb = os.path.getsize(file_path) / 1024  # Get the file size in KiB
                self.total_data_processed_kb += file_size_kb
                if self.abort_flag:
                    self.update_status.emit("Analyse abgebrochen")
                    return
                logging.debug(f"Attempting to process file: {file_path}")
                file_type = self.classify_file_type(file_path)
                logging.info(f"ANALYZING {file_path} TYPE {file_type}")
                # Check and potentially re-establish the database connection
                if not self.check_and_restore_db_connection():
                    logging.error(f"Database connection could not be established for {file_path}. Skipping file.")
                    continue
                with session_scope() as session:
                    if 'text/' in file_type:
                        process_text_file(file_path, file_type, self, session, lambda: self.abort_flag)
                    elif 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' in file_type:
                        process_xlsx_file(file_path, file_type, self, session, lambda: self.abort_flag)
                    elif 'application/pdf' in file_type or file_type == ".pdf":
                        process_pdf_file(file_path, file_type, self, session, lambda: self.abort_flag)
                    elif 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' in file_type:
                        # Handle DOCX files
                        process_docx_file(file_path, file_type, self, session, lambda: self.abort_flag)
                    else:
                        logging.info(f"Skipping unsupported file type: {file_type}")
                        self.all_unsupported_files.append(file_path)
                        self.unsupported_files_count += 1
                        if len(self.unsupported_files_list) < 20:
                            self.unsupported_files_list.append(f"{file_path} (Type: {file_type})")
                        continue
                self.update_tree_signal.emit()
                self.update_checkboxes_signal.emit()
                self.processed_files_count = index + 1
                self.update_progress.emit(index + 1)
            self.update_status.emit(f" Verarbeitung abgeschlossen. {index + 1 - self.unsupported_files_count} von {len(self.file_paths)} Dateien verarbeitet.")
        except Exception as e:
            logging.error(f"Error processing files: {e}", exc_info=True)
            self.update_status.emit(f"Fehler beim Verarbeiten von Dateien: {e}")


    def check_and_restore_db_connection(self):
        attempts = 0
        max_attempts = 5
        while attempts < max_attempts:
            try:
                with session_scope() as session:
                    session.execute(text('SELECT 1'))
                return True
            except OperationalError:
                attempts += 1
                time.sleep(2 ** attempts)  # Exponential backoff
                continue
        logging.error("Failed to re-establish database connection after several attempts.")
        return False


    def calculate_and_emit_rate(self):
        current_time = time.time()
        if current_time - self.last_update_time >= 1:  # Check whether at least one second has passed
            entity_rate = self.calculate_rate()
            file_rate = self.calculate_file_rate()
            data_rate_kibs = self.calculate_data_rate()
            estimated_time = self.calculate_estimated_time_to_completion(data_rate_kibs)
            self.update_rate.emit(entity_rate, self.total_entities_count, file_rate, self.processed_files_count, estimated_time, data_rate_kibs)
            self.last_update_time = current_time

    def calculate_data_rate(self):
        elapsed_time = time.time() - self.start_time
        return self.total_data_processed_kb / elapsed_time if elapsed_time > 0 else 0

    def calculate_estimated_time_to_completion(self, data_rate_kibs):
        remaining_data_kb = self.total_files_size_kb - self.total_data_processed_kb
        if data_rate_kibs > 0:
            estimated_time = remaining_data_kb / data_rate_kibs
        else:
            estimated_time = float('inf')  # Indefinite time if the rate is zero
        return estimated_time

    def calculate_file_rate(self):
        elapsed_time = time.time() - self.start_time
        return self.processed_files_count / elapsed_time if elapsed_time > 0 else 0

    def calculate_rate(self):
        elapsed_time = time.time() - self.start_time
        rate = self.total_entities_count / elapsed_time if elapsed_time > 0 else 0
        return rate

    def abort(self):
        self.abort_flag = True

    def getUnsupportedFilesCount(self):
        return self.unsupported_files_count

    def getUnsupportedFilesList(self):
        return self.unsupported_files_list

    def debugFileProcessor(self, file_path):
        logging.debug(f"Attempting to process file: {file_path}")

        if not os.path.exists(file_path):
            logging.warning(f"File does not exist: {file_path}")
            return False
        elif not os.access(file_path, os.R_OK):
            logging.warning(f"File is not accessible: {file_path}")
            return False

        try:
            detected_encoding = magic.from_file(file_path, mime=True)
            logging.debug(f"Detected encoding for {file_path}: {detected_encoding}")
        except Exception as e:
            logging.error(f"Failed to detect encoding for {file_path}: {e}", exc_info=True)

        file_type = self.classify_file_type(file_path)
        logging.debug(f"Classified file type for {file_path}: {file_type}")

        return True
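A hedged sketch of how a caller might wire this thread up. Only the FileProcessorThread signals, constructor, and abort() shown above are real; the file path and the print-based slots are placeholders for whatever GUI widgets consume them.

# Hypothetical wiring -- illustrative only.
thread = FileProcessorThread(["/tmp/example.log"])             # placeholder path
thread.update_status.connect(lambda msg: print(msg))           # status line
thread.update_progress.connect(lambda n: print(f"{n} files"))  # progress bar
thread.update_rate.connect(
    lambda ent_rate, total, file_rate, files, eta, kib_s:
        print(f"{total} entities, {kib_s:.1f} KiB/s, ETA {eta:.0f}s"))
thread.start()                                                 # executes run() in a Qt thread
# ...later, e.g. from a cancel button:
# thread.abort()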
101
logline_leviathan/file_processor/parser_thread.py
Normal file
@@ -0,0 +1,101 @@
# parse_content receives the full_content string from process_text_file, process_xlsx_file, process_pdf_file or similar, along with the abort_flag

import os
import sys
import re
import logging
import importlib.util
import multiprocessing
from logline_leviathan.database.database_manager import EntityTypesTable

#logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
#multiprocessing.set_start_method('spawn')

def parse_with_script(parser_module_name, full_content):
    parser_module_name = parser_module_name.replace('.py', '')  # Remove the .py extension

    if getattr(sys, 'frozen', False):
        # The base path is the directory of the executable
        base_dir = os.path.dirname(sys.executable)
        # Construct the path to the 'data/parser' directory
        base_path = os.path.join(base_dir, 'data', 'parser')
    else:
        # Running in a normal Python environment
        base_path = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), 'data', 'parser')

    # Construct the path to the parser module
    parser_module_path = os.path.join(base_path, parser_module_name + '.py')

    if not os.path.exists(parser_module_path):
        logging.error(f"Parser module not found: {parser_module_path}")
        return []

    # Dynamically import the module using its file path
    spec = importlib.util.spec_from_file_location(parser_module_name, parser_module_path)
    parser_module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(parser_module)

    try:
        script_results = parser_module.parse(full_content)
        return script_results
    except Exception as e:
        logging.error(f"Error using parser module {parser_module_name}: {e}")
        return []


def parse_with_regex(regex_pattern, full_content):
    try:
        #logging.debug(f"Using regex pattern: {regex_pattern}")
        regex_results = [(match.group(), match.start(), match.end()) for match in re.finditer(regex_pattern, full_content)]
        #logging.debug(f"Regex parser results: {regex_results}")
        return regex_results
    except re.error as e:
        logging.error(f"Invalid regex pattern: {regex_pattern}. Error: {e}")
        return []


def parse_entity_type(entity_type, full_content):
    try:
        if entity_type.script_parser and os.path.exists(os.path.join('data', 'parser', entity_type.script_parser)):
            # Use the script_parser name directly
            parser_module_name = entity_type.script_parser.replace('.py', '')
            return [(entity_type.entity_type_id, *match) for match in parse_with_script(parser_module_name, full_content)]
        elif entity_type.regex_pattern:
            return [(entity_type.entity_type_id, *match) for match in parse_with_regex(entity_type.regex_pattern, full_content)]
        else:
            return []
    except Exception as e:
        logging.error(f"Error in parse_entity_type for {entity_type}: {e}")
        return []


def parse_content(full_content, abort_flag, db_session):
    #logging.debug("Starting parsing content")
    entity_types = db_session.query(EntityTypesTable).filter(EntityTypesTable.parser_enabled == True).all()
    matches = []

    with multiprocessing.Pool() as pool:
        results = [pool.apply_async(parse_entity_type, (et, full_content)) for et in entity_types]

        for result in results:
            if abort_flag():
                logging.debug("Aborting parsing due to flag")
                break
            try:
                match_result = result.get()
                #logging.debug(f"Match result: {match_result}")
                matches.extend(match_result)
            except Exception as e:
                logging.error(f"Error parsing entity type: {e}")
    for match in matches:
        if len(match) != 4:
            logging.error(f"Unexpected format for parsed entity: {match}")
    #logging.debug(f"Finished parsing content. Total matches: {len(matches)}")
    return matches
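parse_with_script expects each script in data/parser/ to expose a parse(full_content) function returning (match_text, start, end) tuples, the same shape parse_with_regex produces; parse_entity_type then prepends the entity_type_id. A minimal custom parser could look like the sketch below; the file name and the IPv4 pattern are examples only, not part of the commit.

# data/parser/ipv4_parser.py -- illustrative example of the expected interface
import re

_IPV4 = re.compile(r'\b(?:\d{1,3}\.){3}\d{1,3}\b')

def parse(full_content):
    # Must return (matched_text, start_offset, end_offset) tuples,
    # mirroring what parse_with_regex yields.
    return [(m.group(), m.start(), m.end()) for m in _IPV4.finditer(full_content)]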
153
logline_leviathan/file_processor/pdf_processor.py
Normal file
@@ -0,0 +1,153 @@
import logging
import re
import os
from datetime import datetime
from logline_leviathan.file_processor.parser_thread import parse_content
from logline_leviathan.file_processor.file_database_ops import handle_file_metadata, handle_individual_entity, handle_distinct_entity, handle_context_snippet
import fitz
logging.getLogger('pdfminer').setLevel(logging.INFO)


def read_pdf_content(file_path):
    try:
        with fitz.open(file_path) as pdf:
            pages = [page.get_text("text") for page in pdf]
            return pages
    except Exception as e:
        logging.error(f"Error reading PDF file {file_path}: {e}")
        return None

def process_pdf_file(file_path, file_mimetype, thread_instance, db_session, abort_flag):
    try:
        logging.info(f"Starting processing of PDF file: {file_path}")
        with fitz.open(file_path) as pdf:  # Open the PDF with fitz
            pages = [page.get_text("text") for page in pdf]

            if pages is None:
                return 0

            entity_count = 0
            file_metadata = handle_file_metadata(db_session, file_path, file_mimetype)

            for page_number, content in enumerate(pages):
                if content is None:
                    continue  # Skip empty pages

                if abort_flag():
                    logging.info("Processing aborted.")
                    return entity_count
                thread_instance.update_status.emit(f" Verarbeite PDF-Datei: {file_path}, Seite {page_number + 1}")

                parsed_entities = parse_content(content, abort_flag, db_session)

                for entity_type_id, match_text, start_pos, end_pos in parsed_entities:
                    if not match_text.strip():
                        continue

                    timestamp = find_timestamp_before_match(content, start_pos, file_path)
                    match_start_line, match_end_line = get_line_numbers_from_pos(pdf, page_number, start_pos, end_pos)

                    entity = handle_distinct_entity(db_session, match_text, entity_type_id)
                    individual_entity = handle_individual_entity(db_session, entity, file_metadata, match_start_line, timestamp, entity_type_id, abort_flag, thread_instance)

                    if individual_entity:
                        handle_context_snippet(db_session, individual_entity, [content], match_start_line, match_end_line)
                        entity_count += 1

        logging.info(f"Finished processing PDF file: {file_path}")
        return entity_count
    except Exception as e:
        db_session.rollback()
        logging.error(f"Error processing PDF file {file_path}: {e}")
        return 0

def alternative_get_line_numbers_from_pos(pdf, page_number, start_pos, end_pos):
    cumulative_line_number = 0
    start_line_number = end_line_number = None

    for current_page in range(page_number + 1):
        page = pdf[current_page]
        text_blocks = page.get_text("dict")["blocks"]

        for block in text_blocks:
            if 'lines' in block:
                for line_number, line in enumerate(block['lines']):
                    if current_page == page_number:
                        line_text = "".join([span['text'] for span in line['spans']])
                        current_pos = len(line_text)

                        if start_pos < current_pos and start_line_number is None:
                            start_line_number = cumulative_line_number + line_number
                        if end_pos <= current_pos:
                            end_line_number = cumulative_line_number + line_number
                            return start_line_number, end_line_number

                    cumulative_line_number += 1

    return start_line_number, end_line_number


def get_line_numbers_from_pos(pdf, page_number, start_pos, end_pos):
    page = pdf[page_number]
    text_blocks = page.get_text("dict")["blocks"]
    start_line_number = end_line_number = 0
    current_pos = 0

    for block in text_blocks:
        if 'lines' in block:
            for line_number, line in enumerate(block['lines']):
                line_text = "".join([span['text'] for span in line['spans']])
                current_pos += len(line_text)
                if start_pos < current_pos and start_line_number == 0:
                    start_line_number = line_number
                if end_pos <= current_pos:
                    end_line_number = line_number
                    #logging.debug(f"start_line_number: {start_line_number}, end_line_number: {end_line_number}, line_number: {line_number}, page_number: {page_number}")
                    return start_line_number, end_line_number
    return start_line_number, end_line_number


def find_timestamp_before_match(content, match_start_pos, file_path):
    # Helper to search for timestamps, tolerating line breaks inside a match
    def search_timestamps(search_content):
        # Patterns adjusted to account for potential line breaks
        timestamp_patterns = [
            (r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', '%Y-%m-%d %H:%M:%S'),   # ISO 8601 Extended
            (r'\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2}', '%Y/%m/%d %H:%M:%S'),   # ISO 8601 with slashes
            (r'\d{2}/\d{2}/\d{4} \d{2}:\d{2}:\d{2}', '%d/%m/%Y %H:%M:%S'),   # European date format
            (r'\d{2}-\d{2}-\d{4} \d{2}:\d{2}:\d{2}', '%m-%d-%Y %H:%M:%S'),   # US date format
            (r'\d{8}_\d{6}', '%Y%m%d_%H%M%S'),                               # Compact format
            (r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}', '%Y-%m-%dT%H:%M:%S'),   # ISO 8601 Basic
            (r'\d{2}\.\d{2}\.\d{4} \d{2}:\d{2}:\d{2}', '%d.%m.%Y %H:%M:%S'), # German date format
            (r'\d{4}\d{2}\d{2} \d{2}:\d{2}:\d{2}', '%Y%m%d %H:%M:%S'),       # Basic format without separators
            (r'\d{1,2}-[A-Za-z]{3}-\d{4} \d{2}:\d{2}:\d{2}', '%d-%b-%Y %H:%M:%S'),  # English date format with month name
            (r'(?:19|20)\d{10}', '%Y%m%d%H%M'),                              # Compact numeric format
            # Add more patterns as needed
        ]

        for pattern, date_format in timestamp_patterns:
            for timestamp_match in reversed(list(re.finditer(pattern, search_content, re.DOTALL))):
                try:
                    # Convert the matched timestamp to the standardized format
                    matched_timestamp = datetime.strptime(timestamp_match.group().replace('\n', ''), date_format)
                    return matched_timestamp.strftime('%Y-%m-%d %H:%M:%S')
                except ValueError:
                    continue
        return None

    # First, try to find a timestamp in the content before the match
    timestamp = search_timestamps(content[:match_start_pos])
    if timestamp:
        return timestamp

    # If none was found in the content, try to find a timestamp in the file name
    basename = os.path.basename(file_path)
    return search_timestamps(basename)
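Since parse_content is called once per page above, the offsets handed to get_line_numbers_from_pos are relative to a single page's plain-text extraction. A small standalone sketch of that relationship; the file name is a placeholder and the regex is just an example.

# Illustrative only: offsets from regex matches refer to one page's text.
import re
import fitz  # PyMuPDF

with fitz.open("example.pdf") as pdf:            # placeholder path
    page_text = pdf[0].get_text("text")          # same extraction as read_pdf_content
    for m in re.finditer(r'\S+@\S+\.\S+', page_text):
        # m.start() / m.end() are what process_pdf_file passes on as start_pos / end_pos
        print(m.group(), m.start(), m.end())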
104
logline_leviathan/file_processor/text_processor.py
Normal file
@@ -0,0 +1,104 @@
import re
import os
import logging
from logline_leviathan.file_processor.parser_thread import parse_content
from datetime import datetime
from logline_leviathan.file_processor.file_database_ops import handle_file_metadata, handle_individual_entity, handle_context_snippet, handle_distinct_entity
from logline_leviathan.database.database_manager import session_scope

def read_file_content(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.readlines()
    except Exception as e:
        logging.error(f"Error reading file {file_path}: {e}")
        return None


def process_text_file(file_path, file_mimetype, thread_instance, db_session, abort_flag):
    #with session_scope() as db_session:
    try:
        #logging.info(f"Starting processing of text file: {file_path}")
        file_metadata = handle_file_metadata(db_session, file_path, file_mimetype)
        content = read_file_content(file_path)
        if content is None:
            return 0
        full_content = ''.join(content)  # Join all lines into a single string
        thread_instance.update_status.emit(f" Verarbeite textbasierte Datei: {file_path}")

        # Call the new parser and get matches along with entity types
        parsed_entities = parse_content(full_content, abort_flag, db_session)

        entity_count = 0
        for entity_type_id, match_text, start_pos, end_pos in parsed_entities:
            if not match_text.strip():
                continue

            timestamp = find_timestamp_before_match(full_content, start_pos, file_path)
            match_start_line, match_end_line = get_line_numbers_from_pos(content, start_pos, end_pos)

            entity = handle_distinct_entity(db_session, match_text, entity_type_id)
            individual_entity = handle_individual_entity(db_session, entity, file_metadata, match_start_line, timestamp, entity_type_id, abort_flag, thread_instance)

            if individual_entity:
                entity_count += 1
                handle_context_snippet(db_session, individual_entity, content, match_start_line, match_end_line)

        return entity_count
    except Exception as e:
        db_session.rollback()
        logging.error(f"Error processing text file {file_path}: {e}")
        return 0


def get_line_numbers_from_pos(content, start_pos, end_pos):
    start_line = end_line = 0
    current_pos = 0
    for i, line in enumerate(content):
        current_pos += len(line)
        if start_pos < current_pos:
            start_line = i
            break
    # Continue scanning for the end line without counting the start line twice
    if content:
        current_pos -= len(content[start_line])
    for i, line in enumerate(content[start_line:], start=start_line):
        current_pos += len(line)
        if end_pos <= current_pos:
            end_line = i
            break
    return start_line, end_line


def find_timestamp_before_match(content, match_start_pos, file_path):
    # Helper to search for timestamps, tolerating line breaks inside a match
    def search_timestamps(search_content):
        # Patterns adjusted to account for potential line breaks
        timestamp_patterns = [
            (r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', '%Y-%m-%d %H:%M:%S'),   # ISO 8601 Extended
            (r'\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2}', '%Y/%m/%d %H:%M:%S'),   # ISO 8601 with slashes
            (r'\d{2}/\d{2}/\d{4} \d{2}:\d{2}:\d{2}', '%d/%m/%Y %H:%M:%S'),   # European date format
            (r'\d{2}-\d{2}-\d{4} \d{2}:\d{2}:\d{2}', '%m-%d-%Y %H:%M:%S'),   # US date format
            (r'\d{8}_\d{6}', '%Y%m%d_%H%M%S'),                               # Compact format
            (r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}', '%Y-%m-%dT%H:%M:%S'),   # ISO 8601 Basic
            (r'\d{2}\.\d{2}\.\d{4} \d{2}:\d{2}:\d{2}', '%d.%m.%Y %H:%M:%S'), # German date format
            (r'\d{4}\d{2}\d{2} \d{2}:\d{2}:\d{2}', '%Y%m%d %H:%M:%S'),       # Basic format without separators
            (r'\d{1,2}-[A-Za-z]{3}-\d{4} \d{2}:\d{2}:\d{2}', '%d-%b-%Y %H:%M:%S'),  # English date format with month name
            (r'(?:19|20)\d{10}', '%Y%m%d%H%M'),                              # Compact numeric format
            # Add more patterns as needed
        ]

        for pattern, date_format in timestamp_patterns:
            for timestamp_match in reversed(list(re.finditer(pattern, search_content, re.DOTALL))):
                try:
                    # Convert the matched timestamp to the standardized format
                    matched_timestamp = datetime.strptime(timestamp_match.group().replace('\n', ''), date_format)
                    return matched_timestamp.strftime('%Y-%m-%d %H:%M:%S')
                except ValueError:
                    continue
        return None

    # First, try to find a timestamp in the content before the match
    timestamp = search_timestamps(content[:match_start_pos])
    if timestamp:
        return timestamp

    # If none was found in the content, try to find a timestamp in the file name
    basename = os.path.basename(file_path)
    return search_timestamps(basename)
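For clarity, a small sketch of what get_line_numbers_from_pos computes here: content comes from readlines(), so every element keeps its trailing newline, and character offsets into ''.join(content) map back to line indices. The sample lines are invented.

# Illustrative only: map character offsets back to line indices.
content = ["first line\n", "second line with user@example.com\n", "third line\n"]
full_content = ''.join(content)
start = full_content.index("user@example.com")
end = start + len("user@example.com")
print(get_line_numbers_from_pos(content, start, end))   # expected: (1, 1)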
120
logline_leviathan/file_processor/xlsx_processor.py
Normal file
@@ -0,0 +1,120 @@
import logging
from datetime import datetime
import re
import os
from openpyxl import load_workbook
from logline_leviathan.file_processor.parser_thread import parse_content
from logline_leviathan.file_processor.file_database_ops import handle_file_metadata, handle_individual_entity, handle_context_snippet, handle_distinct_entity

def read_xlsx_content(file_path):
    try:
        workbook = load_workbook(filename=file_path)
        return workbook
    except Exception as e:
        logging.error(f"Error reading XLSX file {file_path}: {e}")
        return None

def get_line_numbers_from_pos(content, start_pos, end_pos):
    # For XLSX, the line number is the row number in the current sheet.
    # The rows in content do not carry the '\n' used to join them into full_content,
    # so each row counts one extra character.
    start_line = end_line = 0
    current_pos = 0
    for i, line in enumerate(content):
        current_pos += len(line) + 1
        if start_pos < current_pos:
            start_line = i
            break
    # Continue scanning for the end line without counting the start line twice
    if content:
        current_pos -= len(content[start_line]) + 1
    for i, line in enumerate(content[start_line:], start=start_line):
        current_pos += len(line) + 1
        if end_pos <= current_pos:
            end_line = i
            break
    return start_line, end_line


def find_timestamp_before_match(content, match_start_pos, file_path):
    # Helper to search for timestamps, tolerating line breaks inside a match
    def search_timestamps(search_content):
        # Patterns adjusted to account for potential line breaks
        timestamp_patterns = [
            (r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', '%Y-%m-%d %H:%M:%S'),   # ISO 8601 Extended
            (r'\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2}', '%Y/%m/%d %H:%M:%S'),   # ISO 8601 with slashes
            (r'\d{2}/\d{2}/\d{4} \d{2}:\d{2}:\d{2}', '%d/%m/%Y %H:%M:%S'),   # European date format
            (r'\d{2}-\d{2}-\d{4} \d{2}:\d{2}:\d{2}', '%m-%d-%Y %H:%M:%S'),   # US date format
            (r'\d{8}_\d{6}', '%Y%m%d_%H%M%S'),                               # Compact format
            (r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}', '%Y-%m-%dT%H:%M:%S'),   # ISO 8601 Basic
            (r'\d{2}\.\d{2}\.\d{4} \d{2}:\d{2}:\d{2}', '%d.%m.%Y %H:%M:%S'), # German date format
            (r'\d{4}\d{2}\d{2} \d{2}:\d{2}:\d{2}', '%Y%m%d %H:%M:%S'),       # Basic format without separators
            (r'\d{1,2}-[A-Za-z]{3}-\d{4} \d{2}:\d{2}:\d{2}', '%d-%b-%Y %H:%M:%S'),  # English date format with month name
            (r'(?:19|20)\d{10}', '%Y%m%d%H%M'),                              # Compact numeric format
            # Add more patterns as needed
        ]

        for pattern, date_format in timestamp_patterns:
            for timestamp_match in reversed(list(re.finditer(pattern, search_content, re.DOTALL))):
                try:
                    # Convert the matched timestamp to the standardized format
                    matched_timestamp = datetime.strptime(timestamp_match.group().replace('\n', ''), date_format)
                    return matched_timestamp.strftime('%Y-%m-%d %H:%M:%S')
                except ValueError:
                    continue
        return None

    # First, try to find a timestamp in the content before the match
    timestamp = search_timestamps(content[:match_start_pos])
    if timestamp:
        return timestamp

    # If none was found in the content, try to find a timestamp in the file name
    basename = os.path.basename(file_path)
    return search_timestamps(basename)


def process_xlsx_file(file_path, file_mimetype, thread_instance, db_session, abort_flag):
    try:
        logging.info(f"Starting processing of XLSX file: {file_path}")
        workbook = read_xlsx_content(file_path)

        if workbook is None:
            return 0

        entity_count = 0

        for sheet in workbook:
            sheet_name = sheet.title
            file_metadata = handle_file_metadata(db_session, file_path, file_mimetype, sheet_name=sheet_name)

            if abort_flag():
                logging.info("Processing aborted.")
                return entity_count

            # Combine all cells of each row into one line and parse the sheet as a single string
            content = [' '.join([str(cell.value) if cell.value is not None else '' for cell in row]) for row in sheet.iter_rows()]
            full_content = '\n'.join(content)
            thread_instance.update_status.emit(f"Processing Excel file: {file_path} Sheet {sheet_name}")

            parsed_entities = parse_content(full_content, abort_flag, db_session)

            for entity_type_id, match_text, start_pos, end_pos in parsed_entities:
                if not match_text.strip():
                    continue

                match_start_line, match_end_line = get_line_numbers_from_pos(content, start_pos, end_pos)

                # Find a timestamp preceding the match
                timestamp = find_timestamp_before_match(full_content, start_pos, file_path)

                entity = handle_distinct_entity(db_session, match_text, entity_type_id)
                individual_entity = handle_individual_entity(db_session, entity, file_metadata, match_start_line, timestamp, entity_type_id, abort_flag, thread_instance)
                if individual_entity:
                    handle_context_snippet(db_session, individual_entity, content, match_start_line, match_end_line)
                    entity_count += 1

        logging.info(f"Finished processing XLSX file: {file_path}")
        return entity_count
    except Exception as e:
        db_session.rollback()
        logging.error(f"Error processing XLSX file {file_path}: {e}")
        return 0
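Worth noting for the offset handling above: each sheet is flattened to one string per row (cell values joined by spaces) and the rows are joined by '\n', so parser offsets count one extra character per row boundary, which is what get_line_numbers_from_pos accounts for. A tiny sketch of the flattening, with made-up cell values standing in for openpyxl cells:

# Illustrative only: how a sheet's rows are flattened before parsing.
rows = [["host", "10.0.0.5", None], ["user", "alice@example.com", 42]]
content = [' '.join(str(c) if c is not None else '' for c in row) for row in rows]
full_content = '\n'.join(content)
print(content)            # ['host 10.0.0.5 ', 'user alice@example.com 42']
print(repr(full_content))
# Offsets from re.finditer over full_content are then mapped back to row numbers.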