# File-processing worker thread (PyQt5 QThread) for Logline Leviathan.
from multiprocessing.spawn import import_main_path
|
|
import sys
|
|
import time
|
|
import os
|
|
from PyQt5.QtCore import QThread, pyqtSignal, QMutex
|
|
from logline_leviathan.database.database_manager import session_scope
|
|
from logline_leviathan.gui.checkbox_panel import CheckboxPanel
|
|
from .text_processor import process_text_file
|
|
from .xlsx_processor import process_xlsx_file
|
|
from .pdf_processor import process_pdf_file
|
|
from .docx_processor import process_docx_file
|
|
import magic
|
|
import logging
|
|
import pathlib
|
|
from sqlalchemy import text
|
|
from sqlalchemy.exc import OperationalError
|
|
|
|
class FileProcessorThread(QThread):
    """Worker thread that classifies a batch of files by MIME type and
    dispatches each one to a type-specific processor (text/xlsx/pdf/docx),
    reporting progress and throughput back to the GUI via Qt signals.
    """
    # Qt signals consumed by the GUI thread:
    update_progress = pyqtSignal(int)        # emitted with the 1-based index of the file just finished
    update_status = pyqtSignal(str)          # human-readable status text (German UI strings)
    update_tree_signal = pyqtSignal()        # emitted after each successfully processed file
    update_checkboxes_signal = pyqtSignal()  # emitted after each successfully processed file
    # Payload: (entity rate, total entities, file rate, processed files,
    #           estimated seconds to completion, data rate in KiB/s)
    update_rate = pyqtSignal(float, int, float, int, float, float)
    # Class-level default; shadowed by an instance attribute the first time
    # calculate_and_emit_rate() records an update timestamp.
    last_update_time = 0
def __init__(self, file_paths):
|
|
super().__init__()
|
|
self.start_time = time.time()
|
|
self.total_entities_count = 0
|
|
self.total_entities_count_lock = QMutex()
|
|
self.abort_mutex = QMutex()
|
|
|
|
self.abort_flag = False
|
|
self.file_paths = file_paths
|
|
self.unsupported_files_count = 0
|
|
self.processed_files_count = 0
|
|
self.total_data_processed_kb = 0
|
|
self.total_files_size_kb = sum(os.path.getsize(f) / 1024 for f in file_paths)
|
|
|
|
self.unsupported_files_list = []
|
|
self.all_unsupported_files = []
|
|
|
|
self.checkbox_panel = CheckboxPanel()
|
|
|
|
@property
|
|
def abort_flag(self):
|
|
# This is the getter method for the property
|
|
self.abort_mutex.lock()
|
|
flag = self._abort_flag
|
|
self.abort_mutex.unlock()
|
|
return flag
|
|
|
|
@abort_flag.setter
|
|
def abort_flag(self, value):
|
|
# This is the setter method for the property
|
|
self.abort_mutex.lock()
|
|
self._abort_flag = value
|
|
self.abort_mutex.unlock()
|
|
|
|
def classify_file_type(self, file_path):
|
|
# Mapping of file extensions to MIME types
|
|
mime_types = {
|
|
'.txt': 'text/plain',
|
|
'.pdf': 'application/pdf',
|
|
'.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
|
|
'.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
|
'.csv': 'text/csv',
|
|
'.html': 'text/html',
|
|
'.htm': 'text/html',
|
|
'.xml': 'text/xml',
|
|
'.json': 'application/json',
|
|
'.yaml': 'text/yaml',
|
|
'.yml': 'text/yaml',
|
|
'.md': 'text/markdown',
|
|
'.rtf': 'application/rtf',
|
|
'.odt': 'application/vnd.oasis.opendocument.text',
|
|
'.ods': 'application/vnd.oasis.opendocument.spreadsheet',
|
|
'.odp': 'application/vnd.oasis.opendocument.presentation',
|
|
'.log': 'text/plain',
|
|
'.ini': 'text/plain',
|
|
'.conf': 'text/plain',
|
|
'.cfg': 'text/plain',
|
|
'.js': 'application/javascript',
|
|
'.css': 'text/css',
|
|
'.php': 'text/php',
|
|
'.py': 'text/x-python',
|
|
'.rb': 'text/x-ruby',
|
|
'.java': 'text/x-java-source',
|
|
'.c': 'text/x-c',
|
|
'.cpp': 'text/x-c++',
|
|
'.h': 'text/x-c-header',
|
|
'.hpp': 'text/x-c++-header',
|
|
'.sh': 'application/x-sh',
|
|
'.bat': 'application/x-bat',
|
|
'.ps1': 'application/x-powershell',
|
|
'.sql': 'text/x-sql',
|
|
# Add more mappings as needed
|
|
}
|
|
try:
|
|
mime = magic.Magic(mime=True)
|
|
file_type = mime.from_file(file_path)
|
|
return file_type
|
|
except FileNotFoundError as e:
|
|
logging.error(f"File not found: {file_path}. Encoding: {sys.getfilesystemencoding()}", exc_info=True)
|
|
except Exception as e:
|
|
try:
|
|
clean_file_path = pathlib.Path(file_path)
|
|
mime = magic.Magic(mime=True)
|
|
file_type = mime.from_file(clean_file_path)
|
|
return file_type
|
|
except Exception as e:
|
|
logging.error(f"The magic library failed classifying the file type: {e} // falling back to file extension")
|
|
_, file_extension = os.path.splitext(file_path)
|
|
return mime_types.get(file_extension.lower(), 'application/octet-stream') # Default to binary type if unknown
|
|
|
|
|
|
def run(self):
|
|
logging.debug("Thread run method started.")
|
|
try:
|
|
for index, file_path in enumerate(self.file_paths):
|
|
#if not self.debugFileProcessor(file_path):
|
|
# continue
|
|
file_size_kb = os.path.getsize(file_path) / 1024 # Get file size in KiB
|
|
self.total_data_processed_kb += file_size_kb
|
|
if self.abort_flag:
|
|
self.update_status.emit("Analyse abgebrochen")
|
|
return
|
|
logging.debug(f"Attempting to process file: {file_path}")
|
|
file_type = self.classify_file_type(file_path)
|
|
logging.info(f"ANALYZING {file_path} TYPE {file_type}")
|
|
# Check and potentially re-establish the database connection
|
|
if not self.check_and_restore_db_connection():
|
|
logging.error(f"Database connection could not be established for {file_path}. Skipping file.")
|
|
continue
|
|
with session_scope() as session:
|
|
if 'text/' in file_type:
|
|
process_text_file(file_path, file_type, self, session, lambda: self.abort_flag)
|
|
elif 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' in file_type:
|
|
process_xlsx_file(file_path, file_type, self, session, lambda: self.abort_flag)
|
|
elif 'application/pdf' in file_type or file_type == ".pdf":
|
|
process_pdf_file(file_path, file_type, self, session, lambda: self.abort_flag)
|
|
elif 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' in file_type:
|
|
# Handling DOCX file
|
|
process_docx_file(file_path, file_type, self, session, lambda: self.abort_flag)
|
|
else:
|
|
logging.info(f"Skipping unsupported file type: {file_type}")
|
|
self.all_unsupported_files.append(file_path)
|
|
self.unsupported_files_count += 1
|
|
if len(self.unsupported_files_list) < 20:
|
|
self.unsupported_files_list.append(f"{file_path} (Type: {file_type})")
|
|
continue
|
|
self.update_tree_signal.emit()
|
|
self.update_checkboxes_signal.emit()
|
|
self.processed_files_count = index + 1
|
|
self.update_progress.emit(index + 1)
|
|
self.update_status.emit(f" Verarbeitung abgeschlossen. {index + 1 - self.unsupported_files_count} von {len(self.file_paths)} Dateien verarbeitet.")
|
|
except Exception as e:
|
|
logging.error(f"Error processing files: {e}")
|
|
self.update_status.emit(f"Fehler beim Verarbeiten von Dateien {e}", exc_info=True)
|
|
|
|
|
|
def check_and_restore_db_connection(self):
|
|
attempts = 0
|
|
max_attempts = 5
|
|
while attempts < max_attempts:
|
|
try:
|
|
with session_scope() as session:
|
|
session.execute(text('SELECT 1'))
|
|
return True
|
|
except OperationalError:
|
|
attempts += 1
|
|
time.sleep(2 ** attempts) # Exponential backoff
|
|
continue
|
|
logging.error("Failed to re-establish database connection after several attempts.")
|
|
return False
|
|
|
|
|
|
def calculate_and_emit_rate(self):
|
|
current_time = time.time()
|
|
if current_time - self.last_update_time >= 1: # Check if 1 second has passed
|
|
entity_rate = self.calculate_rate()
|
|
file_rate = self.calculate_file_rate()
|
|
data_rate_kibs = self.calculate_data_rate()
|
|
estimated_time = self.calculate_estimated_time_to_completion(data_rate_kibs)
|
|
self.update_rate.emit(entity_rate, self.total_entities_count, file_rate, self.processed_files_count, estimated_time, data_rate_kibs)
|
|
self.last_update_time = current_time
|
|
|
|
def calculate_data_rate(self):
|
|
elapsed_time = time.time() - self.start_time
|
|
return self.total_data_processed_kb / elapsed_time if elapsed_time > 0 else 0
|
|
|
|
def calculate_estimated_time_to_completion(self, data_rate_kibs):
|
|
remaining_data_kb = self.total_files_size_kb - self.total_data_processed_kb
|
|
if data_rate_kibs > 0:
|
|
estimated_time = remaining_data_kb / data_rate_kibs
|
|
else:
|
|
estimated_time = float('inf') # Indefinite time if rate is zero
|
|
return estimated_time
|
|
|
|
def calculate_file_rate(self):
|
|
elapsed_time = time.time() - self.start_time
|
|
return self.processed_files_count / elapsed_time if elapsed_time > 0 else 0
|
|
|
|
|
|
def calculate_rate(self):
|
|
elapsed_time = time.time() - self.start_time
|
|
rate = self.total_entities_count / elapsed_time if elapsed_time > 0 else 0
|
|
return rate
|
|
|
|
    def abort(self):
        """Request cooperative cancellation; run() checks abort_flag per file."""
        self.abort_flag = True
    def getUnsupportedFilesCount(self):
        """Return how many files were skipped as unsupported."""
        return self.unsupported_files_count
    def getUnsupportedFilesList(self):
        """Return the detailed unsupported-file report (capped at 20 entries)."""
        return self.unsupported_files_list
def debugFileProcessor(self, file_path):
|
|
logging.debug(f"Attempting to process file: {file_path}")
|
|
|
|
if not os.path.exists(file_path):
|
|
logging.warning(f"File does not exist: {file_path}")
|
|
return False
|
|
elif not os.access(file_path, os.R_OK):
|
|
logging.warning(f"File is not accessible: {file_path}")
|
|
return False
|
|
|
|
try:
|
|
detected_encoding = magic.from_file(file_path, mime=True)
|
|
logging.debug(f"Detected encoding for {file_path}: {detected_encoding}")
|
|
except Exception as e:
|
|
logging.error(f"Failed to detect encoding for {file_path}: {e}", exc_info=True)
|
|
|
|
file_type = self.classify_file_type(file_path)
|
|
logging.debug(f"Classified file type for {file_path}: {file_type}")
|
|
|
|
return True
|
|
|