LoglineLeviathan/logline_leviathan/file_processor/parser_thread.py

# parse_content receives the full_content string from process_text_file, process_xlsx_file, process_pdf_file or similar methods, along with the abort_flag.

import os
import sys
import re
import logging
import importlib.util
import multiprocessing
from logline_leviathan.database.database_manager import EntityTypesTable

#logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
#multiprocessing.set_start_method('spawn')

def parse_with_script(parser_module_name, full_content):
    """Run the ``parse`` function of a parser script stored in ``data/parser``.

    The script is located relative to the executable when running frozen
    (``sys.frozen`` is set), otherwise relative to the directory three levels
    above this file.

    Args:
        parser_module_name: Name of the parser script, with or without a
            trailing ``.py`` extension.
        full_content: The content handed unchanged to the script's
            ``parse`` function.

    Returns:
        Whatever ``parse(full_content)`` returns, or an empty list when the
        script file is missing, cannot be imported, or raises while parsing.
    """
    # Strip only a trailing extension; str.replace would also remove '.py'
    # occurring in the middle of the name and point at the wrong file.
    if parser_module_name.endswith('.py'):
        parser_module_name = parser_module_name[:-len('.py')]

    if getattr(sys, 'frozen', False):
        # Frozen build: 'data/parser' lives next to the executable.
        base_dir = os.path.dirname(sys.executable)
    else:
        # Normal Python environment: go three directory levels up from this file.
        base_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

    parser_module_path = os.path.join(base_dir, 'data', 'parser', parser_module_name + '.py')

    if not os.path.exists(parser_module_path):
        logging.error(f"Parser module not found: {parser_module_path}")
        return []

    try:
        # Importing executes the script's top-level code, which may fail just
        # like parse() itself, so it is covered by the same error handling.
        spec = importlib.util.spec_from_file_location(parser_module_name, parser_module_path)
        parser_module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(parser_module)
        return parser_module.parse(full_content)
    except Exception as e:
        logging.error(f"Error using parser module {parser_module_name}: {e}")
        return []


def parse_with_regex(regex_pattern, full_content):
    """Collect every non-overlapping match of ``regex_pattern`` in ``full_content``.

    Args:
        regex_pattern: Regular expression passed to ``re.finditer``.
        full_content: The string to search.

    Returns:
        A list of ``(matched_text, start, end)`` tuples in order of
        occurrence, or an empty list when the pattern is not a valid
        regular expression (the error is logged).
    """
    found = []
    try:
        for hit in re.finditer(regex_pattern, full_content):
            begin, finish = hit.span()
            found.append((hit.group(0), begin, finish))
    except re.error as e:
        logging.error(f"Invalid regex pattern: {regex_pattern}. Error: {e}")
        return []
    return found


def _resolve_script_parser_path(script_parser):
    """Return the path of ``script_parser`` inside the application's ``data/parser`` directory."""
    if getattr(sys, 'frozen', False):
        # Frozen build: 'data/parser' lives next to the executable.
        root_dir = os.path.dirname(sys.executable)
    else:
        # Normal Python environment: go three directory levels up from this file.
        root_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    return os.path.join(root_dir, 'data', 'parser', script_parser)


def parse_entity_type(entity_type, full_content):
    """Parse ``full_content`` for a single entity type.

    A script parser is used when ``entity_type.script_parser`` is set and the
    script file exists in the application's ``data/parser`` directory;
    otherwise ``entity_type.regex_pattern`` is used when set.

    Args:
        entity_type: Object exposing ``entity_type_id``, ``script_parser``
            and ``regex_pattern`` attributes.
        full_content: The content to parse.

    Returns:
        A list of tuples, each made of ``entity_type.entity_type_id`` followed
        by the elements of one parser match. Returns an empty list when no
        parser is configured or when any error occurs (the error is logged).
    """
    try:
        script_parser = entity_type.script_parser
        # Check the script where parse_with_script will load it from, not
        # relative to the current working directory, so the script parser is
        # not skipped when the process is started from another directory.
        if script_parser and os.path.exists(_resolve_script_parser_path(script_parser)):
            # Use the script_parser name directly
            parser_module_name = script_parser.replace('.py', '')
            raw_matches = parse_with_script(parser_module_name, full_content)
        elif entity_type.regex_pattern:
            raw_matches = parse_with_regex(entity_type.regex_pattern, full_content)
        else:
            return []
        return [(entity_type.entity_type_id, *match) for match in raw_matches]
    except Exception as e:
        logging.error(f"Error in parse_entity_type for {entity_type}: {e}")
        return []


def parse_content(full_content, abort_flag, db_session):
    """Parse ``full_content`` with every enabled entity type in parallel.

    Each enabled entity type is handed to ``parse_entity_type`` in a
    ``multiprocessing.Pool`` worker and the results are concatenated in the
    order the entity types were queried.

    Args:
        full_content: The content to parse.
        abort_flag: Zero-argument callable; a truthy return value stops
            result collection.
        db_session: Session object used to query ``EntityTypesTable``
            (presumably a SQLAlchemy session -- confirm against callers).

    Returns:
        A list with all matches collected before completion or abort.
        Matches that are not 4-element sequences are logged as errors but
        still returned.
    """
    entity_types = db_session.query(EntityTypesTable).filter(EntityTypesTable.parser_enabled == True).all()
    matches = []

    if not entity_types:
        # Nothing to parse with; avoid spawning worker processes for no work.
        return matches

    poll_interval = 0.1  # seconds between abort checks while a task is running

    with multiprocessing.Pool() as pool:
        results = [pool.apply_async(parse_entity_type, (et, full_content)) for et in entity_types]

        for result in results:
            # Poll instead of blocking in get(), so an abort requested while
            # a long-running parse is in progress is honoured promptly.
            aborted = abort_flag()
            while not aborted and not result.ready():
                result.wait(poll_interval)
                aborted = abort_flag()
            if aborted:
                logging.debug("Aborting parsing due to flag")
                # Leaving the 'with' block terminates the remaining workers.
                break
            try:
                matches.extend(result.get())
            except Exception as e:
                logging.error(f"Error parsing entity type: {e}")

    for match in matches:
        if len(match) != 4:
            logging.error(f"Unexpected format for parsd entity: {match}")
    return matches