"""Entity-extraction dispatcher.

``parse_content`` receives the ``full_content`` string produced by the file
readers (``process_text_file``, ``process_xlsx_file``, ``process_pdf_file`` or
similar) together with an ``abort_flag`` callable, and fans the text out to
one parser per enabled entity type using a multiprocessing pool.  Each match
is a 4-tuple ``(entity_type_id, text, start, end)``.
"""

import os
import sys
import re
import logging
import importlib.util
import multiprocessing

from logline_leviathan.database.database_manager import EntityTypesTable

#logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
#multiprocessing.set_start_method('spawn')


def _parser_base_path():
    """Return the directory that holds the pluggable parser scripts.

    When running frozen (e.g. PyInstaller sets ``sys.frozen``), parsers live
    in ``data/parser`` next to the executable; in a normal checkout they live
    in ``data/parser`` three directory levels above this file.
    """
    if getattr(sys, 'frozen', False):
        # The base path is the directory of the executable.
        return os.path.join(os.path.dirname(sys.executable), 'data', 'parser')
    # Running in a normal Python environment.
    project_root = os.path.dirname(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    return os.path.join(project_root, 'data', 'parser')


def parse_with_script(parser_module_name, full_content):
    """Run a pluggable parser script against *full_content*.

    The script is loaded from ``<parser dir>/<name>.py`` and must expose a
    ``parse(text)`` callable.  Returns the script's result list (expected
    items: ``(text, start, end)``), or ``[]`` if the module is missing or
    its ``parse`` raises.
    """
    parser_module_name = parser_module_name.replace('.py', '')  # tolerate '.py' suffix
    parser_module_path = os.path.join(_parser_base_path(), parser_module_name + '.py')

    if not os.path.exists(parser_module_path):
        logging.error(f"Parser module not found: {parser_module_path}")
        return []

    # Dynamically import the module by file path so plugin scripts do not
    # need to be on sys.path.
    spec = importlib.util.spec_from_file_location(parser_module_name, parser_module_path)
    parser_module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(parser_module)

    try:
        return parser_module.parse(full_content)
    except Exception as e:
        logging.error(f"Error using parser module {parser_module_name}: {e}")
        return []


def parse_with_regex(regex_pattern, full_content):
    """Return ``(text, start, end)`` for every match of *regex_pattern*.

    Invalid patterns are logged and yield ``[]`` rather than raising.
    """
    try:
        return [(match.group(), match.start(), match.end())
                for match in re.finditer(regex_pattern, full_content)]
    except re.error as e:
        logging.error(f"Invalid regex pattern: {regex_pattern}. Error: {e}")
        return []


def parse_entity_type(entity_type, full_content):
    """Parse *full_content* for one entity type.

    Prefers the type's script parser when its file exists, otherwise falls
    back to the type's regex pattern.  Returns a list of 4-tuples
    ``(entity_type_id, text, start, end)``; any failure is logged and
    yields ``[]``.
    """
    try:
        # BUGFIX: the existence check previously used a CWD-relative
        # 'data/parser' path, which disagreed with the frozen-aware
        # resolution used by parse_with_script and silently disabled
        # script parsers in frozen builds or from other working dirs.
        script_name = entity_type.script_parser
        if script_name and os.path.exists(os.path.join(_parser_base_path(), script_name)):
            parser_module_name = script_name.replace('.py', '')
            raw_matches = parse_with_script(parser_module_name, full_content)
        elif entity_type.regex_pattern:
            raw_matches = parse_with_regex(entity_type.regex_pattern, full_content)
        else:
            return []
        return [(entity_type.entity_type_id, *match) for match in raw_matches]
    except Exception as e:
        logging.error(f"Error in parse_entity_type for {entity_type}: {e}")
        return []


def parse_content(full_content, abort_flag, db_session):
    """Fan *full_content* out to every enabled entity-type parser.

    abort_flag: zero-argument callable; when it returns True, collection of
        outstanding results stops early.
    db_session: SQLAlchemy session used to load the enabled EntityTypesTable
        rows.
    Returns a list of ``(entity_type_id, text, start, end)`` tuples.
    """
    entity_types = db_session.query(EntityTypesTable).filter(
        EntityTypesTable.parser_enabled == True).all()  # noqa: E712 — SQLAlchemy expression

    matches = []
    with multiprocessing.Pool() as pool:
        # Submit all jobs up front; collect inside the `with` so the pool
        # is not terminated while results are still pending.
        results = [pool.apply_async(parse_entity_type, (et, full_content))
                   for et in entity_types]
        for result in results:
            if abort_flag():
                logging.debug("Aborting parsing due to flag")
                break
            try:
                matches.extend(result.get())
            except Exception as e:
                logging.error(f"Error parsing entity type: {e}")

    # Sanity check: every match must be the 4-tuple produced above.
    for match in matches:
        if len(match) != 4:
            logging.error(f"Unexpected format for parsed entity: {match}")
    return matches