From c0e03259589bac477d23ad6f08cbc28001875789 Mon Sep 17 00:00:00 2001 From: mstoeck3 Date: Sat, 29 Nov 2025 23:41:51 +0100 Subject: [PATCH] fixes to semeionSearchArtifact --- src/semeion/models/__init__.py | 32 ++++++++++++++-- src/semeion/models/semeionArtifact.py | 46 +++++++++++------------ src/semeion/models/semeionSearchObject.py | 2 - 3 files changed, 51 insertions(+), 29 deletions(-) diff --git a/src/semeion/models/__init__.py b/src/semeion/models/__init__.py index ce9dc02..33c4224 100644 --- a/src/semeion/models/__init__.py +++ b/src/semeion/models/__init__.py @@ -6,7 +6,33 @@ # LICENSE file in the root directory of this source tree. # -# from .llm import LLMClient -# from .qdrant import QdrantClient +from .semeionArtifact import SemeionArtifact, Actors, ActorsRole, Content, ChunkInfo, ContextGroup, ContextGroupType, Location, Ingestion, ArtifactClass +from .semeionSearchObject import SemeionSearchObject +from .source_specific_models import AuthenticationEventMetadata, BrowserEventMetadata, DocumentMetadata, EmailMetadata, FileEventMetadata, MessageMetadata, NetworkEventMetadata, ProcessEventMetadata, RegistryEventMetadata, ScheduledTaskMetadata, SystemEventMetadata -__all__ = [] \ No newline at end of file +__all__ = ["SemeionArtifact", + "Actors", + "ActorsRole", + "Content", + "ChunkInfo", + "ContextGroup", + "ContextGroupType", + "Location", + "Ingestion", + "ArtifactClass", + "ContextGroupType", + "Location", + "Ingestion", + "ArtifactClass", + "SemeionSearchObject", + "AuthenticationEventMetadata", + "BrowserEventMetadata", + "DocumentMetadata", + "EmailMetadata", + "FileEventMetadata", + "MessageMetadata", + "NetworkEventMetadata", + "ProcessEventMetadata", + "RegistryEventMetadata", + "ScheduledTaskMetadata", + "SystemEventMetadata"] \ No newline at end of file diff --git a/src/semeion/models/semeionArtifact.py b/src/semeion/models/semeionArtifact.py index 2b59400..01d2538 100644 --- a/src/semeion/models/semeionArtifact.py +++ b/src/semeion/models/semeionArtifact.py @@ -8,14 +8,13 @@ from datetime import datetime -import enum +from enum import Enum from typing import Annotated, Union from uuid import UUID -from interfaces import QdrantInterface from pydantic import BaseModel, Field -class ArtifactClass(str, enum): +class ArtifactClass(str, Enum): """ the general class of the artifact which determines wether it can be searched for or not. this also affects how the artifact is processed during ingestion and search. @@ -37,16 +36,8 @@ class ArtifactClass(str, enum): NETWORK_EVENT = "network_event" AUTHENTICATION_EVENT = "authentication_event" SCHEDULED_TASK = "scheduled_task" - -class Actors(BaseModel): - """ - the actor is any entity associated with some behavioral pattern. - """ - identifier: str # some unique identifier which is consistent across artifacts - display_name: str # human readeable, needs to be parsed properly by ingestion module - role: ActorsRole # see class - -class ActorsRole(str, enum): + +class ActorsRole(str, Enum): """ this will determine the role of the actor, which is mainly important for message filtering and later behavioral analysis. """ @@ -57,6 +48,14 @@ class ActorsRole(str, enum): OWNER = "owner" # filesystem objects INITIATOR = "initiator" # browser events, filesystem, network events TARGET = "target" # network events, authentication events + +class Actors(BaseModel): + """ + the actor is any entity associated with some behavioral pattern. + """ + identifier: str # some unique identifier which is consistent across artifacts + display_name: str # human readeable, needs to be parsed properly by ingestion module + role: ActorsRole # see class class Content(BaseModel): """ @@ -73,14 +72,7 @@ class ChunkInfo(BaseModel): index: int # zero-based index of the chunk total: int # total number of chunks -class ContextGroup(BaseModel): - """ - some artifacts can be aggregated into context groups, if they are semantically liked - """ - type: ContextGroupType - id: str - -class ContextGroupType(str, enum): +class ContextGroupType(str, Enum): """ some artifacts can be aggregated into context groups, if they are semantically liked """ @@ -90,6 +82,13 @@ class ContextGroupType(str, enum): # not file system directories: they are inherently linked by path and parent_id # no chunks: they are inherently linked by chunk_info +class ContextGroup(BaseModel): + """ + some artifacts can be aggregated into context groups, if they are semantically liked + """ + type: ContextGroupType + id: str + class Location(BaseModel): """ any information about where the artifact was located. @@ -209,14 +208,13 @@ class SemeionArtifact(BaseModel): elif self.artifact_class in [ArtifactClass.MESSAGE, ArtifactClass.BROWSER_EVENT, ArtifactClass.EMAIL, ArtifactClass.DOCUMENT] and not self.searchable: exception = f"Artifact {self.id} of class {self.artifact_class} is marked as non-searchable, but this class should always be searchable." raise ValueError(exception) - return False else: return False - def submit_to_vector_db(self) -> bool: + def get_vector_payload(self) -> dict: """ submit to qdrant """ payload = self.model_dump(mode="json", by_alias=True) - QdrantInterface.submit(payload) + return payload diff --git a/src/semeion/models/semeionSearchObject.py b/src/semeion/models/semeionSearchObject.py index ec7e692..e7ab9a7 100644 --- a/src/semeion/models/semeionSearchObject.py +++ b/src/semeion/models/semeionSearchObject.py @@ -10,8 +10,6 @@ # from .qdrant import QdrantClient from datetime import datetime -from enum import Enum -from typing import Annotated, Union from uuid import UUID from pydantic import BaseModel, Field