# LoglineLeviathan/logline_leviathan/file_processor/docx_processor.py

import logging
from docx import Document
from datetime import datetime
from logline_leviathan.file_processor.parser_thread import parse_content
from logline_leviathan.file_processor.file_database_ops import handle_file_metadata, handle_individual_entity, handle_distinct_entity, handle_context_snippet
import re

def read_docx_content(file_path):
    """Load the paragraph text of a DOCX file as a list of lines.

    The text of every paragraph in the document is joined with newline
    characters and then split back into lines with the line terminators kept.

    Args:
        file_path: Location of the DOCX file; handed to ``Document``.

    Returns:
        A list of line strings (terminators preserved), or ``None`` when
        opening or reading the document raises; the error is logged.
    """
    try:
        document = Document(file_path)
        paragraph_texts = (para.text for para in document.paragraphs)
        # splitlines(True) keeps the end-of-line characters on each entry.
        lines = '\n'.join(paragraph_texts).splitlines(True)
    except Exception as e:
        logging.error(f"Error reading DOCX file {file_path}: {e}")
        return None
    return lines

def get_line_numbers_from_pos(content, start_pos, end_pos):
    """Map a character span of the joined text to zero-based line indices.

    Args:
        content: Sequence of line strings; ``start_pos`` and ``end_pos`` are
            offsets into the concatenation of these lines.
        start_pos: Offset of the first character of the span.
        end_pos: End offset of the span, treated as exclusive (the span's
            last character is at ``end_pos - 1``).

    Returns:
        A ``(start_line, end_line)`` tuple of indices into ``content``.
        ``end_line`` is never smaller than ``start_line``. If ``end_pos``
        lies beyond the text, ``end_line`` is the last line. If ``content``
        is empty or ``start_pos`` lies beyond the text, ``(0, 0)`` is
        returned.
    """
    start_line = end_line = 0
    start_found = False
    # Offset just past the end of the line currently being visited. It is
    # accumulated once over a single pass so that the start and end lookups
    # share the same running total (the start line must not be counted twice).
    current_pos = 0
    for i, line in enumerate(content):
        current_pos += len(line)
        if not start_found:
            if start_pos >= current_pos:
                continue  # span starts on a later line
            start_line = i
            start_found = True
        if end_pos <= current_pos:
            return start_line, i
        # Remember the latest line seen so an out-of-range end_pos clamps to
        # the last line instead of falling back to 0.
        end_line = i
    return start_line, end_line

def process_docx_file(file_path, file_mimetype, thread_instance, db_session, abort_flag):
    """Extract entities from a DOCX file and record them through the DB helpers.

    The file's metadata is registered, its paragraph text is read and parsed,
    and for every non-blank match a distinct entity, an individual entity
    and (when the individual entity was created) a context snippet are
    written via the ``handle_*`` helpers.

    Args:
        file_path: Path of the DOCX file to process.
        file_mimetype: MIME type forwarded to ``handle_file_metadata``.
        thread_instance: Object exposing ``update_status.emit(str)``; also
            forwarded to ``handle_individual_entity``.
        db_session: Database session passed to the helpers and rolled back
            on failure.
        abort_flag: Forwarded to ``parse_content`` and
            ``handle_individual_entity``.

    Returns:
        The number of matches for which ``handle_individual_entity`` returned
        a truthy value, or ``0`` if any step failed (the error is logged and
        the session is rolled back).
    """
    try:
        file_metadata = handle_file_metadata(db_session, file_path, file_mimetype)
        content = read_docx_content(file_path)
        if content is None:
            # read_docx_content signals an unreadable file with None. Fail
            # through the common error path with a clear message instead of
            # letting ''.join(None) surface as an unrelated TypeError.
            raise ValueError("DOCX content could not be read")
        full_content = ''.join(content)  # Join all lines into a single string
        thread_instance.update_status.emit(f"   Verarbeite DOCX-Datei: {file_path}")

        # Call the parser and get matches along with entity types
        parsed_entities = parse_content(full_content, abort_flag, db_session)

        entity_count = 0
        for entity_type_id, match_text, start_pos, end_pos in parsed_entities:
            # Whitespace-only matches carry no entity information.
            if not match_text.strip():
                continue

            timestamp = find_timestamp_before_match(full_content, start_pos)
            match_start_line, match_end_line = get_line_numbers_from_pos(content, start_pos, end_pos)

            entity = handle_distinct_entity(db_session, match_text, entity_type_id)
            individual_entity = handle_individual_entity(db_session, entity, file_metadata, match_start_line, timestamp, entity_type_id, abort_flag, thread_instance)

            # Only count and attach a context snippet when an individual
            # entity object was actually returned.
            if individual_entity:
                entity_count += 1
                handle_context_snippet(db_session, individual_entity, content, match_start_line, match_end_line)

        return entity_count
    except Exception as e:
        db_session.rollback()
        logging.error(f"Error processing DOCX file {file_path}: {e}")
        return 0

def find_timestamp_before_match(content, match_start_pos):
    """Return a normalized timestamp found in the text preceding a match.

    Only ``content[:match_start_pos]`` is searched. The known formats are
    tried in their listed order; for each format the occurrences are examined
    from the last one backwards, and the first occurrence that parses
    successfully is returned. Format order therefore takes precedence over
    proximity to the match.

    Args:
        content: The full text that was parsed.
        match_start_pos: Offset in ``content`` where the match begins.

    Returns:
        The timestamp formatted as ``'%Y-%m-%d %H:%M:%S'``, or ``None`` when
        no occurrence of any known format could be parsed.
    """
    known_formats = (
        # year-month-day with dashes, space before the time
        (r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', '%Y-%m-%d %H:%M:%S'),
        # year/month/day with slashes
        (r'\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2}', '%Y/%m/%d %H:%M:%S'),
        # day/month/year with slashes
        (r'\d{2}/\d{2}/\d{4} \d{2}:\d{2}:\d{2}', '%d/%m/%Y %H:%M:%S'),
        # month-day-year with dashes
        (r'\d{2}-\d{2}-\d{4} \d{2}:\d{2}:\d{2}', '%m-%d-%Y %H:%M:%S'),
        # compact YYYYMMDD_HHMMSS
        (r'\d{8}_\d{6}', '%Y%m%d_%H%M%S'),
        # year-month-day with a 'T' before the time
        (r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}', '%Y-%m-%dT%H:%M:%S'),
        # day.month.year with dots
        (r'\d{2}\.\d{2}\.\d{4} \d{2}:\d{2}:\d{2}', '%d.%m.%Y %H:%M:%S'),
        # undelimited YYYYMMDD followed by a delimited time
        (r'\d{4}\d{2}\d{2} \d{2}:\d{2}:\d{2}', '%Y%m%d %H:%M:%S'),
        # day-MonthAbbreviation-year
        (r'\d{1,2}-[A-Za-z]{3}-\d{4} \d{2}:\d{2}:\d{2}', '%d-%b-%Y %H:%M:%S'),
        # twelve digits starting with 19 or 20, read as YYYYMMDDHHMM
        (r'(?:19|20)\d{10}', '%Y%m%d%H%M'),
    )
    preceding_text = content[:match_start_pos]
    for regex, strptime_format in known_formats:
        # None of the patterns has a capturing group, so findall yields the
        # full matched strings in document order.
        candidates = re.findall(regex, preceding_text)
        for raw_timestamp in reversed(candidates):
            try:
                parsed = datetime.strptime(raw_timestamp, strptime_format)
            except ValueError:
                # Looks like a timestamp but is not a valid date/time; try
                # the previous occurrence of the same format.
                continue
            return parsed.strftime('%Y-%m-%d %H:%M:%S')
    return None