2025-09-03 13:20:23 +02:00

102 lines
3.9 KiB
Python

# parse_content receives the full_content string from process_text_file, process_xlsx_file, process_pdf_file or similar methods, along with the abort_flag.
import os
import sys
import re
import logging
import importlib.util
import multiprocessing
from logline_leviathan.database.database_manager import EntityTypesTable
#logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
#multiprocessing.set_start_method('spawn')
def parse_with_script(parser_module_name, full_content):
    """Run a file-based parser module against ``full_content``.

    The file ``<parser_module_name>.py`` is looked up in a ``data/parser``
    directory: next to the executable when ``sys.frozen`` is set, otherwise
    three directory levels above this source file. The module is loaded from
    that path and its ``parse(full_content)`` function is called.

    Args:
        parser_module_name: Parser module name, with or without a trailing
            ``.py`` extension.
        full_content: Text handed to the module's ``parse`` function.

    Returns:
        Whatever the module's ``parse`` function returns, or ``[]`` when the
        module file is missing, cannot be loaded, or raises.
    """
    # Strip only a trailing extension; a blanket replace would also mangle
    # names that merely contain ".py" somewhere in the middle.
    if parser_module_name.endswith('.py'):
        parser_module_name = parser_module_name[:-len('.py')]

    if getattr(sys, 'frozen', False):
        # Frozen build: data lives next to the executable.
        base_dir = os.path.dirname(sys.executable)
    else:
        # Normal Python environment: walk up three levels from this file.
        base_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    parser_module_path = os.path.join(base_dir, 'data', 'parser', parser_module_name + '.py')

    if not os.path.exists(parser_module_path):
        logging.error(f"Parser module not found: {parser_module_path}")
        return []

    # Loading executes the script's top-level code, which may fail just like
    # parse() itself, so the import is guarded by the same handler.
    try:
        spec = importlib.util.spec_from_file_location(parser_module_name, parser_module_path)
        if spec is None or spec.loader is None:
            logging.error(f"Error using parser module {parser_module_name}: no import spec for {parser_module_path}")
            return []
        parser_module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(parser_module)
        return parser_module.parse(full_content)
    except Exception as e:
        logging.error(f"Error using parser module {parser_module_name}: {e}")
        return []
def parse_with_regex(regex_pattern, full_content):
    """Collect every match of ``regex_pattern`` found in ``full_content``.

    Returns:
        A list of ``(matched_text, start, end)`` tuples in the order
        ``re.finditer`` yields them. An invalid pattern is logged and
        produces ``[]``.
    """
    found = []
    try:
        for hit in re.finditer(regex_pattern, full_content):
            found.append((hit.group(), hit.start(), hit.end()))
    except re.error as e:
        logging.error(f"Invalid regex pattern: {regex_pattern}. Error: {e}")
        return []
    return found
def _script_parser_exists(script_file_name):
    """Return True when ``script_file_name`` exists in a ``data/parser`` directory.

    Checks the directory anchored to the executable (frozen build) or to the
    source tree, and also the legacy location relative to the current working
    directory.
    """
    if getattr(sys, 'frozen', False):
        base_dir = os.path.dirname(sys.executable)
    else:
        base_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    candidates = (
        os.path.join(base_dir, 'data', 'parser', script_file_name),
        # Legacy lookup: relative to the current working directory.
        os.path.join('data', 'parser', script_file_name),
    )
    return any(os.path.exists(candidate) for candidate in candidates)


def parse_entity_type(entity_type, full_content):
    """Parse ``full_content`` for one entity type.

    A script parser is used when ``entity_type.script_parser`` is set and the
    script file can be found; otherwise ``entity_type.regex_pattern`` is used
    when set. Each raw match is prefixed with ``entity_type.entity_type_id``.

    Args:
        entity_type: Object exposing ``entity_type_id``, ``script_parser`` and
            ``regex_pattern`` attributes.
        full_content: Text to search.

    Returns:
        A list of ``(entity_type_id, *match)`` tuples, or ``[]`` when neither
        parser applies or any error occurs (errors are logged).
    """
    try:
        script_parser = entity_type.script_parser
        # A check relative only to the working directory would silently skip
        # an installed script parser whenever the process runs from elsewhere,
        # because parse_with_script does not load from the working directory.
        if script_parser and _script_parser_exists(script_parser):
            parser_module_name = script_parser.replace('.py', '')
            hits = parse_with_script(parser_module_name, full_content)
        elif entity_type.regex_pattern:
            hits = parse_with_regex(entity_type.regex_pattern, full_content)
        else:
            return []
        return [(entity_type.entity_type_id, *hit) for hit in hits]
    except Exception as e:
        logging.error(f"Error in parse_entity_type for {entity_type}: {e}")
        return []
def parse_content(full_content, abort_flag, db_session):
    """Parse ``full_content`` with every enabled entity type in parallel.

    One ``parse_entity_type`` task per enabled entity type is submitted to a
    ``multiprocessing.Pool`` and the results are gathered in submission order.

    Args:
        full_content: Text to parse.
        abort_flag: Zero-argument callable; a truthy return value stops the
            collection of further results.
        db_session: Session object used to query ``EntityTypesTable`` for rows
            with ``parser_enabled`` set.

    Returns:
        The list of matches gathered so far. Entries whose length is not 4 are
        logged as errors but are still returned.
    """
    # NOTE(review): presumably a SQLAlchemy column expression, which is why
    # the comparison is "== True" rather than a plain truthiness test - confirm.
    entity_types = db_session.query(EntityTypesTable).filter(EntityTypesTable.parser_enabled == True).all()
    matches = []
    if not entity_types:
        # Nothing to do; avoid starting worker processes for an empty task list.
        return matches

    poll_interval = 0.1  # seconds between abort checks while a task is running

    with multiprocessing.Pool() as pool:
        pending = [pool.apply_async(parse_entity_type, (et, full_content)) for et in entity_types]
        aborted = False
        for result in pending:
            # Wait in short slices so an abort request is honoured even while
            # a slow parser is still running, instead of blocking in get().
            while not aborted and not result.ready():
                if abort_flag():
                    aborted = True
                else:
                    result.wait(poll_interval)
            if aborted or abort_flag():
                logging.debug("Aborting parsing due to flag")
                break
            try:
                matches.extend(result.get())
            except Exception as e:
                logging.error(f"Error parsing entity type: {e}")

    for match in matches:
        if len(match) != 4:
            logging.error(f"Unexpected format for parsd entity: {match}")
    return matches