from sqlalchemy import or_, false, String
from logline_leviathan.database.database_manager import (
    get_db_session,
    EntitiesTable,
    DistinctEntitiesTable,
    EntityTypesTable,
    ContextTable,
    FileMetadata,
    session_scope,
)
from PyQt5.QtCore import pyqtSignal, QThread
from fuzzywuzzy import fuzz
import re


class QueryThread(QThread):
    queryCompleted = pyqtSignal(dict)  # Signal emitted on completion with a dict of scored results

    def __init__(self, db_query_instance, query_text):
        super(QueryThread, self).__init__()
        self.db_query_instance = db_query_instance
        self.query_text = query_text

    def run(self):
        base_query, search_terms = self.db_query_instance.prepare_query(self.query_text)
        query_lambda = self.db_query_instance.parse_query(self.query_text)
        results = base_query.filter(query_lambda()).all()
        # Score each result and build a dictionary keyed by entities_id
        scored_results = {
            result.entities_id: self.db_query_instance.calculate_match_score(result, self.query_text)
            for result in results
        }
        self.queryCompleted.emit(scored_results)


class DatabaseGUIQuery:
    def __init__(self):
        self.db_session = get_db_session()
        self.entity_types = EntityTypesTable
        self.entities = EntitiesTable
        self.distinct_entities = DistinctEntitiesTable
        self.context = ContextTable
        self.file_metadata = FileMetadata
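
    # Query syntax (illustrative example): a query such as
    #   error time* "Exact Phrase"
    # produces case-insensitive ilike filters for the unquoted tokens
    # ('*' is translated to the SQL '%' wildcard) and a case-sensitive
    # like filter for the quoted phrase; '+' and '-' prefixes are scoring
    # operands evaluated later in calculate_match_score.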
    def parse_query(self, query):
        if not query.strip():
            return lambda: false()

        # Extract quoted and unquoted parts
        quoted_parts = re.findall(r'"([^"]+)"', query)
        unquoted_parts = re.split(r'"[^"]+"', query)

        # Tokenize the unquoted parts
        unquoted_tokens = []
        for part in unquoted_parts:
            unquoted_tokens.extend(re.findall(r'\S+', part))

        filters = []

        # Unquoted tokens use 'ilike' for case-insensitive search; '*' acts as a wildcard
        for token in unquoted_tokens:
            search_condition = f'%{token.replace("*", "%")}%'
            filters.append(
                or_(
                    self.distinct_entities.distinct_entity.ilike(search_condition),
                    self.entity_types.entity_type.ilike(search_condition),
                    self.entity_types.gui_name.ilike(search_condition),
                    self.entity_types.gui_tooltip.ilike(search_condition),
                    self.file_metadata.file_name.ilike(search_condition),
                    self.file_metadata.file_path.ilike(search_condition),
                    self.file_metadata.file_mimetype.ilike(search_condition),
                    self.entities.line_number.cast(String).ilike(search_condition),
                    self.context.context_large.ilike(search_condition)
                    # ... [add other fields for ilike search]
                )
            )

        # Quoted parts use 'like' for a case-sensitive match
        for token in quoted_parts:
            exact_condition = f'%{token}%'
            filters.append(
                or_(
                    self.distinct_entities.distinct_entity.like(exact_condition),
                    self.entity_types.entity_type.like(exact_condition),
                    self.entity_types.gui_name.like(exact_condition),
                    self.entity_types.gui_tooltip.like(exact_condition),
                    self.file_metadata.file_name.like(exact_condition),
                    self.file_metadata.file_path.like(exact_condition),
                    self.file_metadata.file_mimetype.like(exact_condition),
                    self.entities.line_number.cast(String).like(exact_condition),
                    self.context.context_large.like(exact_condition)
                    # ... [add other fields for exact match search]
                )
            )

        return lambda: or_(*filters)

    def parse_search_terms(self, query):
        tokens = query.split()
        search_terms = [token.lstrip('+-') for token in tokens
                        if not token.startswith('-') and not token.startswith('+')]
        return search_terms

    def prepare_query(self, query):
        search_terms = self.parse_search_terms(query)

        # Construct the base query with the necessary joins
        base_query = self.db_session.query(
            self.distinct_entities.distinct_entity,
            self.entity_types.gui_name,
            self.file_metadata.file_name,
            self.entities.line_number,
            self.entities.entry_timestamp,
            self.context.context_large,
            self.entities.flag,
            self.entities.entities_id
        ).join(
            self.entities, self.distinct_entities.distinct_entities_id == self.entities.distinct_entities_id
        ).join(
            self.file_metadata, self.entities.file_id == self.file_metadata.file_id
        ).join(
            self.context, self.entities.entities_id == self.context.entities_id
        ).join(
            self.entity_types, self.entities.entity_types_id == self.entity_types.entity_type_id
        ).distinct()

        # The caller applies the filters produced by parse_query to this base query
        return base_query, search_terms
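
    # Relevance scoring: each result row is scored by summing weighted hits of the
    # query terms across the entity, file name, timestamp, line number and context,
    # plus bonuses for in-order phrase matches, term proximity to the entity and
    # fuzzy matches; '+' terms boost, '-' terms penalise, and the total is
    # normalised to a 0-100 scale.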
    def calculate_match_score(self, result, query):
        # Adjusted weights and thresholds
        distinct_entity_weight = 4
        file_name_weight = 4
        timestamp_weight = 1
        line_number_weight = 1
        context_weight = 5
        multiple_term_weight = 1
        order_weight = 8               # Increased weight for exact order of terms
        fuzzy_match_weight = 0.3       # More discerning fuzzy match
        threshold_for_fuzzy = 90       # Higher threshold for fuzzy matches
        proximity_weight = 2           # Increased weight for proximity
        positive_operand_weight = 10   # Weight for terms with '+'
        negative_operand_penalty = -5  # Penalty for terms with '-'
        exact_match_weight = 10        # Increased weight for exact sequence match

        score = 0

        # Extract operands and terms
        tokens = re.findall(r'"[^"]+"|\S+', query)
        processed_terms = [
            (token.startswith('+'), token.startswith('-'), token.strip('+-"').lower())
            for token in tokens
        ]

        # Normalize result fields
        lower_distinct_entity = result.distinct_entity.lower()
        lower_file_name = result.file_name.lower()
        timestamp_str = str(result.entry_timestamp).lower()
        line_number_str = str(result.line_number).lower()
        words_in_context = result.context_large.lower().split()

        # Check matches in the individual fields, honouring the '+' / '-' operands
        for is_positive, is_negative, term in processed_terms:
            if term in lower_distinct_entity:
                score += positive_operand_weight if is_positive else (negative_operand_penalty if is_negative else distinct_entity_weight)
            if term in lower_file_name:
                score += positive_operand_weight if is_positive else (negative_operand_penalty if is_negative else file_name_weight)
            if term in timestamp_str:
                score += positive_operand_weight if is_positive else (negative_operand_penalty if is_negative else timestamp_weight)
            if term in line_number_str:
                score += positive_operand_weight if is_positive else (negative_operand_penalty if is_negative else line_number_weight)
            if term in words_in_context:
                score += positive_operand_weight if is_positive else (negative_operand_penalty if is_negative else context_weight)

        # Build a cleaned substring of the search terms in the exact order they appear in the query
        exact_terms_substring = ' '.join(token.strip('+-"').lower() for token in tokens)

        # Reward an exact in-order occurrence of all terms in the context
        if exact_terms_substring and exact_terms_substring in ' '.join(words_in_context):
            score += exact_match_weight

        # Reward quoted queries whose terms appear in order in the context
        if '"' in query:
            exact_query = ' '.join(term for _, _, term in processed_terms)
            if exact_query in ' '.join(words_in_context):
                score += order_weight

        # Additional weight for multiple different terms
        unique_terms = set(term for _, _, term in processed_terms)
        score += len(unique_terms) * multiple_term_weight

        # Proximity score: reward terms that appear close to the entity in the context
        for _, _, term in processed_terms:
            if term in words_in_context:
                # Find the positions of the term and the entity in the context
                term_pos = words_in_context.index(term)
                entity_pos = words_in_context.index(lower_distinct_entity) if lower_distinct_entity in words_in_context else 0
                # Calculate the distance and adjust the score
                distance = abs(term_pos - entity_pos)
                proximity_score = max(0, proximity_weight - distance * 0.01)  # Reduce score based on distance
                score += proximity_score

        # Fuzzy matching across all result fields
        all_text = f"{result.distinct_entity} {result.file_name} {result.entry_timestamp} {result.line_number} {result.context_large}".lower()
        for _, _, term in processed_terms:
            fuzzy_score = max(fuzz.partial_ratio(term, word) for word in all_text.split())
            if fuzzy_score > threshold_for_fuzzy:
                score += (fuzzy_score / 100) * fuzzy_match_weight

        # Normalize the score
        max_possible_positive_score = (
            distinct_entity_weight +
            file_name_weight +
            timestamp_weight +
            line_number_weight +
            context_weight * len(processed_terms) +         # Assuming each term can match in the context
            order_weight +
            exact_match_weight +
            len(processed_terms) * multiple_term_weight +   # Each term contributes to multiple_term_weight
            len(processed_terms) * positive_operand_weight  # Each term could have a positive operand
        )

        # Considering the negative operand penalty
        max_possible_negative_score = len(processed_terms) * negative_operand_penalty

        # The maximum score is the possible positive score plus the absolute value of the possible negative score
        max_possible_score = max_possible_positive_score + abs(max_possible_negative_score)

        # Normalize the score to a scale of 100
        score = (score / max_possible_score) * 100

        return score

    def get_entity_types(self):
        with session_scope() as session:
            # Only entity types that define either a regex_pattern or a script_parser
            return [
                entity_type.gui_name
                for entity_type in session.query(EntityTypesTable)
                .filter(or_(EntityTypesTable.regex_pattern.isnot(None), EntityTypesTable.script_parser.isnot(None)))
                .all()
            ]
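

# The block below is a minimal usage sketch, not part of the module's API: it assumes
# an already initialised and populated logline_leviathan database, and the query string
# and callback name are arbitrary examples. Cross-thread signal delivery needs a running
# Qt event loop, hence the QCoreApplication.
if __name__ == "__main__":
    from PyQt5.QtCore import QCoreApplication

    app = QCoreApplication([])
    db_query = DatabaseGUIQuery()

    def print_results(scored_results):
        # scored_results maps entities_id -> normalized match score (0-100)
        for entities_id, score in sorted(scored_results.items(), key=lambda kv: -kv[1]):
            print(entities_id, round(score, 1))
        app.quit()

    thread = QueryThread(db_query, 'error +timeout "Exact Phrase"')
    thread.queryCompleted.connect(print_results)
    thread.start()
    app.exec_()
    thread.wait()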