From 0957027eedab1c9c55b5cd2b5873c72fdd57e067 Mon Sep 17 00:00:00 2001 From: mstoeck3 Date: Sun, 30 Nov 2025 14:49:18 +0100 Subject: [PATCH] search object data model --- src/semeion/models/semeionArtifact.py | 6 +++- src/semeion/models/semeionSearchObject.py | 42 ++++++++++++++++++++--- 2 files changed, 43 insertions(+), 5 deletions(-) diff --git a/src/semeion/models/semeionArtifact.py b/src/semeion/models/semeionArtifact.py index 01d2538..ae4f565 100644 --- a/src/semeion/models/semeionArtifact.py +++ b/src/semeion/models/semeionArtifact.py @@ -167,6 +167,8 @@ class SemeionArtifact(BaseModel): "title": "string | null" } | null, "source_specific": {} | null, + "marked_interesting": "bool", + "marked_not_interesting": "bool", "ingestion": { "ingested_at": "string (ISO8601)", "source_file": "string", @@ -176,7 +178,7 @@ class SemeionArtifact(BaseModel): } """ - _schema_version: str = "1.1.0" + _schema_version: str = "1.1.1" id: UUID = Field(description="deterministic UUID v5 based on case_id, source_file, and unique key") case_id: str = Field(description="case identifier this artifact belongs to") searchable: bool = Field(description="searchable or only for timeline context") @@ -192,6 +194,8 @@ class SemeionArtifact(BaseModel): context_group: ContextGroup | None = Field(default=None, description="a parameter which provides information for semantically linked artifacts, e.g., email threads, browser sessions, etc.") location: Location | None = Field(default=None, description="information about where the artifact was located, e.g., file path, URL, host, etc.") source_specific: source_specific_models | None = Field(default=None, description="source-specific metadata for the artifact") + marked_interesting: bool = Field(description="user feedback flag marking this artifact as interesting") + marked_not_interesting: bool = Field(description="user feedback flag marking this artifact as not interesting") ingestion: Ingestion = Field(description="metadata about the ingestion process") class Config: diff --git a/src/semeion/models/semeionSearchObject.py b/src/semeion/models/semeionSearchObject.py index e7ab9a7..f306a3e 100644 --- a/src/semeion/models/semeionSearchObject.py +++ b/src/semeion/models/semeionSearchObject.py @@ -14,6 +14,35 @@ from uuid import UUID from pydantic import BaseModel, Field +class Filters(BaseModel): + """ + filters which can be applied to the search query to limit the search space. + all filters are optional and can be combined. + """ + case_ids: list[str] | None = Field(default=None, description="list of case IDs to limit the search to") + artifact_classes: list[str] | None = Field(default=None, description="list of artifact classes to limit the search to") + source_platforms: list[str] | None = Field(default=None, description="list of source platforms to limit the search to") + actor_identifiers: list[str] | None = Field(default=None, description="list of actor identifiers to limit the search to") + indexed_entities_any: list[str] | None = Field(default=None, description="list of indexed entities, any of which must be present in the artifact") + time_after: datetime | None = Field(default=None, description="only return artifacts after this timestamp (inclusive)") + time_before: datetime | None = Field(default=None, description="only return artifacts before this timestamp (inclusive)") + context_group_ids: list[str] | None = Field(default=None, description="list of context group IDs to limit the search to") + hosts: list[str] | None = Field(default=None, description="list of hostnames or IPs to limit the search to") + +class Options(BaseModel): + """ + options to modify the search behavior. + """ + limit: int = Field(default=100, description="maximum number of search results to return") + min_score: float | None = Field(default=None, description="minimum similarity score threshold for returned results") + use_hybrid: bool = Field(default=False, description="whether to use hybrid search (semantic dense vectors + keyword (sparse vectors)") + +class Interpretation(BaseModel): + """ + this data structure holds information about how the semantic query was interpreted by the generative LLM + """ + original_query: str = Field(description="the original user query before any modifications or interpretations") + notes: list[str] = Field(default=[], description="any notes or clarifications made by the LLM regarding the query interpretation") class SemeionSearchObject(BaseModel): """ @@ -48,11 +77,16 @@ class SemeionSearchObject(BaseModel): "interpretation": { "original_query": "string", - "notes": ["string"], - "confidence": "float" - } + "notes": ["string"] + } } """ - pass + _schema_version: str = "1.1.0" + query_id: UUID = Field(description="UUID v4 identifying this search query") + created_at: datetime = Field(description="timestamp when the search query was created for audit") + semantic_query: str = Field(description="the search string which gets embedded and used for semantic search") + filters: Filters = Field(description="filters to limit the search space") + options: Options = Field(description="options to modify the search behavior") + interpretation: Interpretation = Field(description="information about how the semantic query was interpreted by the generative LLM")