fixes to semeionSearchArtifact

This commit is contained in:
2025-11-29 23:41:51 +01:00
parent f5219bf2e4
commit c0e0325958
3 changed files with 51 additions and 29 deletions

View File

@@ -6,7 +6,33 @@
# LICENSE file in the root directory of this source tree. # LICENSE file in the root directory of this source tree.
# #
# from .llm import LLMClient from .semeionArtifact import SemeionArtifact, Actors, ActorsRole, Content, ChunkInfo, ContextGroup, ContextGroupType, Location, Ingestion, ArtifactClass
# from .qdrant import QdrantClient from .semeionSearchObject import SemeionSearchObject
from .source_specific_models import AuthenticationEventMetadata, BrowserEventMetadata, DocumentMetadata, EmailMetadata, FileEventMetadata, MessageMetadata, NetworkEventMetadata, ProcessEventMetadata, RegistryEventMetadata, ScheduledTaskMetadata, SystemEventMetadata
__all__ = [] __all__ = ["SemeionArtifact",
"Actors",
"ActorsRole",
"Content",
"ChunkInfo",
"ContextGroup",
"ContextGroupType",
"Location",
"Ingestion",
"ArtifactClass",
"ContextGroupType",
"Location",
"Ingestion",
"ArtifactClass",
"SemeionSearchObject",
"AuthenticationEventMetadata",
"BrowserEventMetadata",
"DocumentMetadata",
"EmailMetadata",
"FileEventMetadata",
"MessageMetadata",
"NetworkEventMetadata",
"ProcessEventMetadata",
"RegistryEventMetadata",
"ScheduledTaskMetadata",
"SystemEventMetadata"]

View File

@@ -8,14 +8,13 @@
from datetime import datetime from datetime import datetime
import enum from enum import Enum
from typing import Annotated, Union from typing import Annotated, Union
from uuid import UUID from uuid import UUID
from interfaces import QdrantInterface
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
class ArtifactClass(str, enum): class ArtifactClass(str, Enum):
""" """
the general class of the artifact which determines whether it can be searched for or not. the general class of the artifact which determines whether it can be searched for or not.
this also affects how the artifact is processed during ingestion and search. this also affects how the artifact is processed during ingestion and search.
@@ -37,16 +36,8 @@ class ArtifactClass(str, enum):
NETWORK_EVENT = "network_event" NETWORK_EVENT = "network_event"
AUTHENTICATION_EVENT = "authentication_event" AUTHENTICATION_EVENT = "authentication_event"
SCHEDULED_TASK = "scheduled_task" SCHEDULED_TASK = "scheduled_task"
class Actors(BaseModel): class ActorsRole(str, Enum):
"""
the actor is any entity associated with some behavioral pattern.
"""
identifier: str # some unique identifier which is consistent across artifacts
display_name: str # human readable, needs to be parsed properly by ingestion module
role: ActorsRole # see class
class ActorsRole(str, enum):
""" """
this will determine the role of the actor, which is mainly important for message filtering and later behavioral analysis. this will determine the role of the actor, which is mainly important for message filtering and later behavioral analysis.
""" """
@@ -57,6 +48,14 @@ class ActorsRole(str, enum):
OWNER = "owner" # filesystem objects OWNER = "owner" # filesystem objects
INITIATOR = "initiator" # browser events, filesystem, network events INITIATOR = "initiator" # browser events, filesystem, network events
TARGET = "target" # network events, authentication events TARGET = "target" # network events, authentication events
class Actors(BaseModel):
"""
the actor is any entity associated with some behavioral pattern.
"""
identifier: str # some unique identifier which is consistent across artifacts
display_name: str # human readable, needs to be parsed properly by ingestion module
role: ActorsRole # see class
class Content(BaseModel): class Content(BaseModel):
""" """
@@ -73,14 +72,7 @@ class ChunkInfo(BaseModel):
index: int # zero-based index of the chunk index: int # zero-based index of the chunk
total: int # total number of chunks total: int # total number of chunks
class ContextGroup(BaseModel): class ContextGroupType(str, Enum):
"""
some artifacts can be aggregated into context groups, if they are semantically linked
"""
type: ContextGroupType
id: str
class ContextGroupType(str, enum):
""" """
some artifacts can be aggregated into context groups, if they are semantically linked some artifacts can be aggregated into context groups, if they are semantically linked
""" """
@@ -90,6 +82,13 @@ class ContextGroupType(str, enum):
# not file system directories: they are inherently linked by path and parent_id # not file system directories: they are inherently linked by path and parent_id
# no chunks: they are inherently linked by chunk_info # no chunks: they are inherently linked by chunk_info
class ContextGroup(BaseModel):
"""
some artifacts can be aggregated into context groups, if they are semantically linked
"""
type: ContextGroupType
id: str
class Location(BaseModel): class Location(BaseModel):
""" """
any information about where the artifact was located. any information about where the artifact was located.
@@ -209,14 +208,13 @@ class SemeionArtifact(BaseModel):
elif self.artifact_class in [ArtifactClass.MESSAGE, ArtifactClass.BROWSER_EVENT, ArtifactClass.EMAIL, ArtifactClass.DOCUMENT] and not self.searchable: elif self.artifact_class in [ArtifactClass.MESSAGE, ArtifactClass.BROWSER_EVENT, ArtifactClass.EMAIL, ArtifactClass.DOCUMENT] and not self.searchable:
exception = f"Artifact {self.id} of class {self.artifact_class} is marked as non-searchable, but this class should always be searchable." exception = f"Artifact {self.id} of class {self.artifact_class} is marked as non-searchable, but this class should always be searchable."
raise ValueError(exception) raise ValueError(exception)
return False
else: else:
return False return False
def submit_to_vector_db(self) -> bool: def get_vector_payload(self) -> dict:
""" """
submit to qdrant submit to qdrant
""" """
payload = self.model_dump(mode="json", by_alias=True) payload = self.model_dump(mode="json", by_alias=True)
QdrantInterface.submit(payload) return payload

View File

@@ -10,8 +10,6 @@
# from .qdrant import QdrantClient # from .qdrant import QdrantClient
from datetime import datetime from datetime import datetime
from enum import Enum
from typing import Annotated, Union
from uuid import UUID from uuid import UUID
from pydantic import BaseModel, Field from pydantic import BaseModel, Field