fixes to semeionSearchArtifact

This commit is contained in:
2025-11-29 23:41:51 +01:00
parent f5219bf2e4
commit c0e0325958
3 changed files with 51 additions and 29 deletions

View File

@@ -6,7 +6,33 @@
# LICENSE file in the root directory of this source tree.
#
# from .llm import LLMClient
# from .qdrant import QdrantClient
from .semeionArtifact import SemeionArtifact, Actors, ActorsRole, Content, ChunkInfo, ContextGroup, ContextGroupType, Location, Ingestion, ArtifactClass
from .semeionSearchObject import SemeionSearchObject
from .source_specific_models import AuthenticationEventMetadata, BrowserEventMetadata, DocumentMetadata, EmailMetadata, FileEventMetadata, MessageMetadata, NetworkEventMetadata, ProcessEventMetadata, RegistryEventMetadata, ScheduledTaskMetadata, SystemEventMetadata
__all__ = []
# Public API of the package: the names re-exported by the imports above.
# Each name is listed exactly once (the list previously repeated
# "ContextGroupType", "Location", "Ingestion" and "ArtifactClass").
__all__ = [
    # semeionArtifact
    "SemeionArtifact",
    "Actors",
    "ActorsRole",
    "Content",
    "ChunkInfo",
    "ContextGroup",
    "ContextGroupType",
    "Location",
    "Ingestion",
    "ArtifactClass",
    # semeionSearchObject
    "SemeionSearchObject",
    # source_specific_models
    "AuthenticationEventMetadata",
    "BrowserEventMetadata",
    "DocumentMetadata",
    "EmailMetadata",
    "FileEventMetadata",
    "MessageMetadata",
    "NetworkEventMetadata",
    "ProcessEventMetadata",
    "RegistryEventMetadata",
    "ScheduledTaskMetadata",
    "SystemEventMetadata",
]

View File

@@ -8,14 +8,13 @@
from datetime import datetime
import enum
from enum import Enum
from typing import Annotated, Union
from uuid import UUID
from interfaces import QdrantInterface
from pydantic import BaseModel, Field
class ArtifactClass(str, enum):
class ArtifactClass(str, Enum):
"""
the general class of the artifact, which determines whether it can be searched for or not.
this also affects how the artifact is processed during ingestion and search.
@@ -38,15 +37,7 @@ class ArtifactClass(str, enum):
AUTHENTICATION_EVENT = "authentication_event"
SCHEDULED_TASK = "scheduled_task"
class Actors(BaseModel):
"""
An actor is any entity associated with some behavioral pattern.

Holds a stable identifier, a display name, and the actor's role.
"""
identifier: str # unique identifier that stays consistent across artifacts
display_name: str # human-readable name; needs to be parsed properly by the ingestion module
role: ActorsRole # role of this actor; see the ActorsRole enum
class ActorsRole(str, enum):
class ActorsRole(str, Enum):
"""
this will determine the role of the actor, which is mainly important for message filtering and later behavioral analysis.
"""
@@ -58,6 +49,14 @@ class ActorsRole(str, enum):
INITIATOR = "initiator" # browser events, filesystem, network events
TARGET = "target" # network events, authentication events
class Actors(BaseModel):
"""
An actor is any entity associated with some behavioral pattern.

Holds a stable identifier, a display name, and the actor's role.
"""
identifier: str # unique identifier that stays consistent across artifacts
display_name: str # human-readable name; needs to be parsed properly by the ingestion module
role: ActorsRole # role of this actor; see the ActorsRole enum
class Content(BaseModel):
"""
the most important field, this is what gets embedded.
@@ -73,14 +72,7 @@ class ChunkInfo(BaseModel):
index: int # zero-based index of the chunk
total: int # total number of chunks
class ContextGroup(BaseModel):
"""
Some artifacts can be aggregated into context groups if they are semantically linked.
"""
type: ContextGroupType # kind of context group; see the ContextGroupType enum
id: str # identifier of the group -- presumably shared by all artifacts in the same group; TODO confirm
class ContextGroupType(str, enum):
class ContextGroupType(str, Enum):
"""
some artifacts can be aggregated into context groups, if they are semantically linked
"""
@@ -90,6 +82,13 @@ class ContextGroupType(str, enum):
# not file system directories: they are inherently linked by path and parent_id
# no chunks: they are inherently linked by chunk_info
class ContextGroup(BaseModel):
"""
Some artifacts can be aggregated into context groups if they are semantically linked.
"""
type: ContextGroupType # kind of context group; see the ContextGroupType enum
id: str # identifier of the group -- presumably shared by all artifacts in the same group; TODO confirm
class Location(BaseModel):
"""
any information about where the artifact was located.
@@ -209,14 +208,13 @@ class SemeionArtifact(BaseModel):
elif self.artifact_class in [ArtifactClass.MESSAGE, ArtifactClass.BROWSER_EVENT, ArtifactClass.EMAIL, ArtifactClass.DOCUMENT] and not self.searchable:
exception = f"Artifact {self.id} of class {self.artifact_class} is marked as non-searchable, but this class should always be searchable."
raise ValueError(exception)
return False
else:
return False
def submit_to_vector_db(self) -> bool:
def get_vector_payload(self) -> dict:
"""
serialize this artifact into the JSON payload that is submitted to qdrant
"""
payload = self.model_dump(mode="json", by_alias=True)
QdrantInterface.submit(payload)
return payload

View File

@@ -10,8 +10,6 @@
# from .qdrant import QdrantClient
from datetime import datetime
from enum import Enum
from typing import Annotated, Union
from uuid import UUID
from pydantic import BaseModel, Field