large entity recreation

This commit is contained in:
overcuriousity
2025-09-19 00:38:26 +02:00
parent 332805709d
commit 0a6d12de9a
3 changed files with 147 additions and 72 deletions

View File

@@ -821,6 +821,88 @@ class Scanner:
self._update_provider_state(target, provider_name, 'failed', 0, str(e), start_time)
return None
def _create_large_entity_from_result(self, source_node: str, provider_name: str,
                                     provider_result: ProviderResult, depth: int) -> Set[str]:
    """
    Collapse a provider result into one large entity node and tag its members.

    Every relationship target in ``provider_result`` that passes
    ``_is_valid_domain`` or ``_is_valid_ip`` is treated as a member. A node of
    type ``NodeType.LARGE_ENTITY`` with the ID ``le_<provider_name>_<source_node>``
    is added to the graph, a single edge links ``source_node`` to it, and
    ``add_node`` is called once per member with ``large_entity_id`` metadata.

    Args:
        source_node: ID of the node the provider result belongs to.
        provider_name: Name of the provider that produced the result.
        provider_result: Result whose ``relationships`` are inspected.
        depth: Value stored in the entity's ``discovery_depth`` attribute.

    Returns:
        The set of member node IDs, or an empty set (with no graph changes)
        when no relationship target qualifies.
    """
    # Gather the distinct domain/IP targets referenced by the result.
    member_ids = set()
    for relationship in provider_result.relationships:
        if _is_valid_domain(relationship.target_node) or _is_valid_ip(relationship.target_node):
            member_ids.add(relationship.target_node)

    if not member_ids:
        return set()

    member_count = len(member_ids)
    entity_id = f"le_{provider_name}_{source_node}"

    # Register the container node that stands in for all members.
    entity_attributes = [
        {"name": "count", "value": member_count, "type": "statistic"},
        {"name": "source_provider", "value": provider_name, "type": "metadata"},
        {"name": "discovery_depth", "value": depth, "type": "metadata"},
        {"name": "nodes", "value": list(member_ids), "type": "metadata"},
    ]
    entity_description = (
        f"A collection of {member_count} nodes discovered from {source_node} via {provider_name}."
    )
    self.graph.add_node(
        node_id=entity_id,
        node_type=NodeType.LARGE_ENTITY,
        attributes=entity_attributes,
        description=entity_description,
    )

    # One edge from the source replaces the individual source -> member edges.
    self.graph.add_edge(
        source_node, entity_id,
        relationship_type=f"{provider_name}_collection",
        confidence_score=0.95,
        source_provider=provider_name,
        raw_data={'description': 'Represents a large collection of nodes.'},
    )

    # Mark every member so it can later be recognised as part of this entity.
    # NOTE(review): presumably add_node merges metadata into an already
    # existing node rather than replacing it - confirm in the graph manager.
    for member in member_ids:
        if _is_valid_ip(member):
            member_type = NodeType.IP
        else:
            member_type = NodeType.DOMAIN
        self.graph.add_node(
            node_id=member,
            node_type=member_type,
            metadata={'large_entity_id': entity_id},
        )

    return member_ids
def extract_node_from_large_entity(self, large_entity_id: str, node_id: str) -> bool:
    """
    Remove a node from a large entity and re-enqueue it for normal processing.

    The node's ``large_entity_id`` metadata tag is dropped, the large entity's
    ``nodes`` / ``count`` attributes are brought in line with the removal, and
    one task per eligible provider is put on the task queue.

    Args:
        large_entity_id: ID of the large entity the node should leave.
        node_id: ID of the node to extract.

    Returns:
        True when the node was tagged with ``large_entity_id`` and has been
        extracted; False when the node does not exist or carries a different
        (or no) large entity tag.
    """
    nx_graph = self.graph.graph
    if not nx_graph.has_node(node_id):
        return False

    node_data = nx_graph.nodes[node_id]
    metadata = node_data.get('metadata', {})
    # Explicit membership test: a missing tag must never compare equal to a
    # None argument and then fail on the deletion below.
    if 'large_entity_id' not in metadata or metadata['large_entity_id'] != large_entity_id:
        return False

    # Remove the large entity tag before asking for eligible providers, so the
    # node is no longer treated as a large entity member.
    del metadata['large_entity_id']
    self.graph.add_node(node_id, NodeType(node_data['type']), metadata=metadata)

    # Read the entity's depth once (it is identical for every provider) and
    # keep its member list and count consistent with the extraction.
    depth = 0
    if nx_graph.has_node(large_entity_id):
        le_attrs = nx_graph.nodes[large_entity_id].get('attributes', [])

        def _first_attr(attr_name):
            return next((a for a in le_attrs if a.get('name') == attr_name), None)

        depth_attr = _first_attr('discovery_depth')
        if depth_attr is not None:
            depth = depth_attr.get('value', 0)

        # NOTE(review): the attribute dicts are updated in place on the stored
        # node data; confirm the graph manager needs no extra notification.
        nodes_attr = _first_attr('nodes')
        if nodes_attr is not None and isinstance(nodes_attr.get('value'), list):
            remaining = [member for member in nodes_attr['value'] if member != node_id]
            nodes_attr['value'] = remaining
            count_attr = _first_attr('count')
            if count_attr is not None:
                count_attr['value'] = len(remaining)

    # Re-enqueue the node for full processing by every eligible provider.
    is_ip = _is_valid_ip(node_id)
    for provider in self._get_eligible_providers(node_id, is_ip, False):
        provider_name = provider.get_name()
        priority = self._get_priority(provider_name)
        self.task_queue.put((time.time(), priority, (provider_name, node_id, depth)))
        self.total_tasks_ever_enqueued += 1

    return True
def _process_provider_result_unified(self, target: str, provider: BaseProvider,
provider_result: ProviderResult, current_depth: int) -> Tuple[Set[str], bool]:
"""
@@ -830,7 +912,7 @@ class Scanner:
"""
provider_name = provider.get_name()
discovered_targets = set()
#large_entity_members = set()
large_entity_members = set()
if self._is_stop_requested():
return discovered_targets, False
@@ -842,11 +924,11 @@ class Scanner:
is_large_entity = eligible_relationship_count > self.config.large_entity_threshold
#if is_large_entity:
if is_large_entity:
# Create the large entity node and get the set of its members
#large_entity_members = self._create_large_entity_from_result(
# target, provider_name, provider_result, current_depth
#)
large_entity_members = self._create_large_entity_from_result(
target, provider_name, provider_result, current_depth
)
# Process ALL relationships to build the complete underlying data model
for i, relationship in enumerate(provider_result.relationships):
@@ -885,7 +967,8 @@ class Scanner:
# Add all discovered domains/IPs to be considered for further processing
if (_is_valid_domain(target_node) or _is_valid_ip(target_node)) and not max_depth_reached:
discovered_targets.add(target_node)
if target_node not in large_entity_members:
discovered_targets.add(target_node)
# Process all attributes and add them to the corresponding nodes
attributes_by_node = defaultdict(list)
@@ -1004,8 +1087,19 @@ class Scanner:
eligible = []
target_key = 'ips' if is_ip else 'domains'
# Check if the target is part of a large entity
is_in_large_entity = False
if self.graph.graph.has_node(target):
metadata = self.graph.graph.nodes[target].get('metadata', {})
if 'large_entity_id' in metadata:
is_in_large_entity = True
for provider in self.providers:
try:
# If in large entity, only allow dns and correlation providers
if is_in_large_entity and provider.get_name() not in ['dns', 'correlation']:
continue
# Check if provider supports this target type
if not provider.get_eligibility().get(target_key, False):
continue