# File-processing worker thread (PyQt5 QThread) for Logline Leviathan.
from multiprocessing.spawn import import_main_path
|
|
import sys
|
|
import time
|
|
import os
|
|
from PyQt5.QtCore import QThread, pyqtSignal, QMutex
|
|
from logline_leviathan.database.database_manager import session_scope
|
|
from logline_leviathan.gui.checkbox_panel import CheckboxPanel
|
|
from .text_processor import process_text_file
|
|
from .xlsx_processor import process_xlsx_file
|
|
from .pdf_processor import process_pdf_file
|
|
from .docx_processor import process_docx_file
|
|
import magic
|
|
import logging
|
|
import pathlib
|
|
from sqlalchemy import text
|
|
from sqlalchemy.exc import OperationalError
|
|
|
|
class FileProcessorThread(QThread):
    """Worker thread that classifies a batch of files by MIME type and
    dispatches each one to a type-specific processor (text/xlsx/pdf/docx),
    reporting progress and throughput back to the GUI via Qt signals.
    """
    # Qt signals consumed by the GUI thread:
    update_progress = pyqtSignal(int)        # emitted with the 1-based index of the file just finished
    update_status = pyqtSignal(str)          # human-readable status text (German UI strings)
    update_tree_signal = pyqtSignal()        # emitted after each successfully processed file
    update_checkboxes_signal = pyqtSignal()  # emitted after each successfully processed file
    # Payload: (entity rate, total entities, file rate, processed files,
    #           estimated seconds to completion, data rate in KiB/s)
    update_rate = pyqtSignal(float, int, float, int, float, float)
    # Class-level default; shadowed by an instance attribute the first time
    # calculate_and_emit_rate() records an update timestamp.
    last_update_time = 0
def __init__(self, file_paths):
|
|
super().__init__()
|
|
self.start_time = time.time()
|
|
self.total_entities_count = 0
|
|
self.total_entities_count_lock = QMutex()
|
|
self.abort_mutex = QMutex()
|
|
|
|
self.abort_flag = False
|
|
self.file_paths = file_paths
|
|
self.unsupported_files_count = 0
|
|
self.processed_files_count = 0
|
|
self.total_data_processed_kb = 0
|
|
self.total_files_size_kb = sum(os.path.getsize(f) / 1024 for f in file_paths)
|
|
|
|
self.unsupported_files_list = []
|
|
self.all_unsupported_files = []
|
|
|
|
self.checkbox_panel = CheckboxPanel()
|
|
|
|
@property
|
|
def abort_flag(self):
|
|
# This is the getter method for the property
|
|
self.abort_mutex.lock()
|
|
flag = self._abort_flag
|
|
self.abort_mutex.unlock()
|
|
return flag
|
|
|
|
@abort_flag.setter
|
|
def abort_flag(self, value):
|
|
# This is the setter method for the property
|
|
self.abort_mutex.lock()
|
|
self._abort_flag = value
|
|
self.abort_mutex.unlock()
|
|
|
|
def classify_file_type(self, file_path):
|
|
# Mapping of file extensions to MIME types
|
|
mime_types = {
|
|
'.txt': 'text/plain',
|
|
'.pdf': 'application/pdf',
|
|
'.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
|
|
'.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
|
'.csv': 'text/csv',
|
|
'.html': 'text/html',
|
|
'.htm': 'text/html',
|
|
'.xml': 'text/xml',
|
|
'.json': 'application/json',
|
|
'.yaml': 'text/yaml',
|
|
'.yml': 'text/yaml',
|
|
'.md': 'text/markdown',
|
|
'.rtf': 'application/rtf',
|
|
'.odt': 'application/vnd.oasis.opendocument.text',
|
|
'.ods': 'application/vnd.oasis.opendocument.spreadsheet',
|
|
'.odp': 'application/vnd.oasis.opendocument.presentation',
|
|
'.log': 'text/plain',
|
|
'.ini': 'text/plain',
|
|
'.conf': 'text/plain',
|
|
'.cfg': 'text/plain',
|
|
'.js': 'application/javascript',
|
|
'.css': 'text/css',
|
|
'.php': 'text/php',
|
|
'.py': 'text/x-python',
|
|
'.rb': 'text/x-ruby',
|
|
'.java': 'text/x-java-source',
|
|
'.c': 'text/x-c',
|
|
'.cpp': 'text/x-c++',
|
|
'.h': 'text/x-c-header',
|
|
'.hpp': 'text/x-c++-header',
|
|
'.sh': 'application/x-sh',
|
|
'.bat': 'application/x-bat',
|
|
'.ps1': 'application/x-powershell',
|
|
'.sql': 'text/x-sql',
|
|
# Add more mappings as needed
|
|
}
|
|
try:
|
|
mime = magic.Magic(mime=True)
|
|
file_type = mime.from_file(file_path)
|
|
return file_type
|
|
except FileNotFoundError as e:
|
|
logging.error(f"File not found: {file_path}. Encoding: {sys.getfilesystemencoding()}", exc_info=True)
|
|
except Exception as e:
|
|
try:
|
|
clean_file_path = pathlib.Path(file_path)
|
|
mime = magic.Magic(mime=True)
|
|
file_type = mime.from_file(clean_file_path)
|
|
return file_type
|
|
except Exception as e:
|
|
logging.error(f"The magic library failed classifying the file type: {e} // falling back to file extension")
|
|
_, file_extension = os.path.splitext(file_path)
|
|
return mime_types.get(file_extension.lower(), 'application/octet-stream') # Default to binary type if unknown
|
|
|
|
|
|
def run(self):
|
|
logging.debug("Thread run method started.")
|
|
try:
|
|
for index, file_path in enumerate(self.file_paths):
|
|
#if not self.debugFileProcessor(file_path):
|
|
# continue
|
|
file_size_kb = os.path.getsize(file_path) / 1024 # Get file size in KiB
|
|
self.total_data_processed_kb += file_size_kb
|
|
if self.abort_flag:
|
|
self.update_status.emit("Analyse abgebrochen")
|
|
return
|
|
logging.debug(f"Attempting to process file: {file_path}")
|
|
file_type = self.classify_file_type(file_path)
|
|
logging.info(f"ANALYZING {file_path} TYPE {file_type}")
|
|
# Check and potentially re-establish the database connection
|
|
if not self.check_and_restore_db_connection():
|
|
logging.error(f"Database connection could not be established for {file_path}. Skipping file.")
|
|
continue
|
|
with session_scope() as session:
|
|
if 'text/' in file_type:
|
|
process_text_file(file_path, file_type, self, session, lambda: self.abort_flag)
|
|
elif 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' in file_type:
|
|
process_xlsx_file(file_path, file_type, self, session, lambda: self.abort_flag)
|
|
elif 'application/pdf' in file_type or file_type == ".pdf":
|
|
process_pdf_file(file_path, file_type, self, session, lambda: self.abort_flag)
|
|
elif 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' in file_type:
|
|
# Handling DOCX file
|
|
process_docx_file(file_path, file_type, self, session, lambda: self.abort_flag)
|
|
else:
|
|
logging.info(f"Skipping unsupported file type: {file_type}")
|
|
self.all_unsupported_files.append(file_path)
|
|
self.unsupported_files_count += 1
|
|
if len(self.unsupported_files_list) < 20:
|
|
self.unsupported_files_list.append(f"{file_path} (Type: {file_type})")
|
|
continue
|
|
self.update_tree_signal.emit()
|
|
self.update_checkboxes_signal.emit()
|
|
self.processed_files_count = index + 1
|
|
self.update_progress.emit(index + 1)
|
|
self.update_status.emit(f" Verarbeitung abgeschlossen. {index + 1 - self.unsupported_files_count} von {len(self.file_paths)} Dateien verarbeitet.")
|
|
except Exception as e:
|
|
logging.error(f"Error processing files: {e}")
|
|
self.update_status.emit(f"Fehler beim Verarbeiten von Dateien {e}", exc_info=True)
|
|
|
|
|
|
def check_and_restore_db_connection(self):
|
|
attempts = 0
|
|
max_attempts = 5
|
|
while attempts < max_attempts:
|
|
try:
|
|
with session_scope() as session:
|
|
session.execute(text('SELECT 1'))
|
|
return True
|
|
except OperationalError:
|
|
attempts += 1
|
|
time.sleep(2 ** attempts) # Exponential backoff
|
|
continue
|
|
logging.error("Failed to re-establish database connection after several attempts.")
|
|
return False
|
|
|
|
|
|
def calculate_and_emit_rate(self):
|
|
current_time = time.time()
|
|
if current_time - self.last_update_time >= 1: # Check if 1 second has passed
|
|
entity_rate = self.calculate_rate()
|
|
file_rate = self.calculate_file_rate()
|
|
data_rate_kibs = self.calculate_data_rate()
|
|
estimated_time = self.calculate_estimated_time_to_completion(data_rate_kibs)
|
|
self.update_rate.emit(entity_rate, self.total_entities_count, file_rate, self.processed_files_count, estimated_time, data_rate_kibs)
|
|
self.last_update_time = current_time
|
|
|
|
def calculate_data_rate(self):
|
|
elapsed_time = time.time() - self.start_time
|
|
return self.total_data_processed_kb / elapsed_time if elapsed_time > 0 else 0
|
|
|
|
def calculate_estimated_time_to_completion(self, data_rate_kibs):
|
|
remaining_data_kb = self.total_files_size_kb - self.total_data_processed_kb
|
|
if data_rate_kibs > 0:
|
|
estimated_time = remaining_data_kb / data_rate_kibs
|
|
else:
|
|
estimated_time = float('inf') # Indefinite time if rate is zero
|
|
return estimated_time
|
|
|
|
def calculate_file_rate(self):
|
|
elapsed_time = time.time() - self.start_time
|
|
return self.processed_files_count / elapsed_time if elapsed_time > 0 else 0
|
|
|
|
|
|
def calculate_rate(self):
|
|
elapsed_time = time.time() - self.start_time
|
|
rate = self.total_entities_count / elapsed_time if elapsed_time > 0 else 0
|
|
return rate
|
|
|
|
    def abort(self):
        """Request cooperative cancellation; run() checks abort_flag per file."""
        self.abort_flag = True
    def getUnsupportedFilesCount(self):
        """Return how many files were skipped as unsupported."""
        return self.unsupported_files_count
    def getUnsupportedFilesList(self):
        """Return the detailed unsupported-file report (capped at 20 entries)."""
        return self.unsupported_files_list
def debugFileProcessor(self, file_path):
|
|
logging.debug(f"Attempting to process file: {file_path}")
|
|
|
|
if not os.path.exists(file_path):
|
|
logging.warning(f"File does not exist: {file_path}")
|
|
return False
|
|
elif not os.access(file_path, os.R_OK):
|
|
logging.warning(f"File is not accessible: {file_path}")
|
|
return False
|
|
|
|
try:
|
|
detected_encoding = magic.from_file(file_path, mime=True)
|
|
logging.debug(f"Detected encoding for {file_path}: {detected_encoding}")
|
|
except Exception as e:
|
|
logging.error(f"Failed to detect encoding for {file_path}: {e}", exc_info=True)
|
|
|
|
file_type = self.classify_file_type(file_path)
|
|
logging.debug(f"Classified file type for {file_path}: {file_type}")
|
|
|
|
return True
|
|
|