attempt fix large entity
This commit is contained in:
252
core/scanner.py
252
core/scanner.py
@@ -825,81 +825,33 @@ class Scanner:
|
||||
return None
|
||||
|
||||
def _process_provider_result_unified(self, target: str, provider: BaseProvider,
|
||||
provider_result: ProviderResult, current_depth: int) -> Tuple[Set[str], bool]:
|
||||
provider_result: ProviderResult, current_depth: int) -> Tuple[Set[str], bool]:
|
||||
"""
|
||||
Process a unified ProviderResult object to update the graph.
|
||||
FIXED: Ensure CA and ISP relationships are created even when large entities are formed.
|
||||
Handles large entity creation while ensuring all underlying nodes and edges are
|
||||
added to the graph data model for a complete dataset.
|
||||
"""
|
||||
provider_name = provider.get_name()
|
||||
discovered_targets = set()
|
||||
large_entity_members = set()
|
||||
|
||||
if self._is_stop_requested():
|
||||
return discovered_targets, False
|
||||
|
||||
# Check if this should be a large entity (only counting domain/IP relationships)
|
||||
eligible_relationship_count = 0
|
||||
for rel in provider_result.relationships:
|
||||
# Only count relationships that would go into large entities
|
||||
if provider_name == 'crtsh' and rel.relationship_type == 'crtsh_cert_issuer':
|
||||
continue # Don't count CA relationships
|
||||
if provider_name == 'shodan' and rel.relationship_type == 'shodan_isp':
|
||||
continue # Don't count ISP relationships
|
||||
if rel.relationship_type.startswith('corr_'):
|
||||
continue # Don't count correlation relationships
|
||||
# Check if a large entity should be created based on the count of domain/IP relationships
|
||||
eligible_relationship_count = sum(
|
||||
1 for rel in provider_result.relationships if _is_valid_domain(rel.target_node) or _is_valid_ip(rel.target_node)
|
||||
)
|
||||
|
||||
# Only count domain/IP targets
|
||||
if _is_valid_domain(rel.target_node) or _is_valid_ip(rel.target_node):
|
||||
eligible_relationship_count += 1
|
||||
is_large_entity = eligible_relationship_count > self.config.large_entity_threshold
|
||||
|
||||
if eligible_relationship_count > self.config.large_entity_threshold:
|
||||
# Create large entity but ALSO process special relationships
|
||||
members = self._create_large_entity_from_provider_result(target, provider_name, provider_result, current_depth)
|
||||
if is_large_entity:
|
||||
# Create the large entity node and get the set of its members
|
||||
large_entity_members = self._create_large_entity_from_provider_result(
|
||||
target, provider_name, provider_result, current_depth
|
||||
)
|
||||
|
||||
# FIXED: Still process CA, ISP, and correlation relationships directly on the graph
|
||||
for relationship in provider_result.relationships:
|
||||
if self._is_stop_requested():
|
||||
break
|
||||
|
||||
source_node = relationship.source_node
|
||||
target_node = relationship.target_node
|
||||
|
||||
# Process special relationship types that should appear directly on graph
|
||||
should_create_direct_relationship = False
|
||||
target_type = None
|
||||
|
||||
if provider_name == 'crtsh' and relationship.relationship_type == 'crtsh_cert_issuer':
|
||||
target_type = NodeType.CA
|
||||
should_create_direct_relationship = True
|
||||
elif provider_name == 'shodan' and relationship.relationship_type == 'shodan_isp':
|
||||
target_type = NodeType.ISP
|
||||
should_create_direct_relationship = True
|
||||
elif relationship.relationship_type.startswith('corr_'):
|
||||
target_type = NodeType.CORRELATION_OBJECT
|
||||
should_create_direct_relationship = True
|
||||
|
||||
if should_create_direct_relationship:
|
||||
# Create source and target nodes
|
||||
source_type = NodeType.IP if _is_valid_ip(source_node) else NodeType.DOMAIN
|
||||
self.graph.add_node(source_node, source_type)
|
||||
self.graph.add_node(target_node, target_type)
|
||||
|
||||
# Add the relationship edge
|
||||
self.graph.add_edge(
|
||||
source_node, target_node,
|
||||
relationship.relationship_type,
|
||||
relationship.confidence,
|
||||
provider_name,
|
||||
relationship.raw_data
|
||||
)
|
||||
|
||||
# Add to discovered targets if it's a valid target for further processing
|
||||
max_depth_reached = current_depth >= self.max_depth
|
||||
if not max_depth_reached and (_is_valid_domain(target_node) or _is_valid_ip(target_node)):
|
||||
discovered_targets.add(target_node)
|
||||
|
||||
return members, True
|
||||
|
||||
# Normal processing (existing logic) when not creating large entity
|
||||
# Process ALL relationships to build the complete underlying data model
|
||||
for i, relationship in enumerate(provider_result.relationships):
|
||||
if i % 5 == 0 and self._is_stop_requested():
|
||||
break
|
||||
@@ -907,10 +859,8 @@ class Scanner:
|
||||
source_node = relationship.source_node
|
||||
target_node = relationship.target_node
|
||||
|
||||
# Determine source node type
|
||||
# Determine node types
|
||||
source_type = NodeType.IP if _is_valid_ip(source_node) else NodeType.DOMAIN
|
||||
|
||||
# Determine target node type based on provider and relationship
|
||||
if provider_name == 'shodan' and relationship.relationship_type == 'shodan_isp':
|
||||
target_type = NodeType.ISP
|
||||
elif provider_name == 'crtsh' and relationship.relationship_type == 'crtsh_cert_issuer':
|
||||
@@ -922,131 +872,101 @@ class Scanner:
|
||||
else:
|
||||
target_type = NodeType.DOMAIN
|
||||
|
||||
# Add max_depth_reached flag
|
||||
max_depth_reached = current_depth >= self.max_depth
|
||||
|
||||
# Create or update nodes with proper types
|
||||
# Add all nodes and edges to the graph's data model.
|
||||
# The frontend will handle the visual re-routing for large entity members.
|
||||
self.graph.add_node(source_node, source_type)
|
||||
self.graph.add_node(target_node, target_type, metadata={'max_depth_reached': max_depth_reached})
|
||||
|
||||
# Add the relationship edge
|
||||
if self.graph.add_edge(
|
||||
self.graph.add_edge(
|
||||
source_node, target_node,
|
||||
relationship.relationship_type,
|
||||
relationship.confidence,
|
||||
provider_name,
|
||||
relationship.raw_data
|
||||
):
|
||||
pass # Edge was successfully added
|
||||
)
|
||||
|
||||
# Add target to discovered nodes for further processing
|
||||
# Add all discovered domains/IPs to be considered for further processing
|
||||
if (_is_valid_domain(target_node) or _is_valid_ip(target_node)) and not max_depth_reached:
|
||||
discovered_targets.add(target_node)
|
||||
|
||||
# Process all attributes (existing logic unchanged)
|
||||
# Process all attributes and add them to the corresponding nodes
|
||||
attributes_by_node = defaultdict(list)
|
||||
for attribute in provider_result.attributes:
|
||||
attr_dict = {
|
||||
"name": attribute.name,
|
||||
"value": attribute.value,
|
||||
"type": attribute.type,
|
||||
"provider": attribute.provider,
|
||||
"confidence": attribute.confidence,
|
||||
"metadata": attribute.metadata
|
||||
"name": attribute.name, "value": attribute.value, "type": attribute.type,
|
||||
"provider": attribute.provider, "confidence": attribute.confidence, "metadata": attribute.metadata
|
||||
}
|
||||
attributes_by_node[attribute.target_node].append(attr_dict)
|
||||
|
||||
# Add attributes to existing nodes OR create new nodes if they don't exist
|
||||
for node_id, node_attributes_list in attributes_by_node.items():
|
||||
if not self.graph.graph.has_node(node_id):
|
||||
node_type = NodeType.IP if _is_valid_ip(node_id) else NodeType.DOMAIN
|
||||
self.graph.add_node(node_id, node_type, attributes=node_attributes_list)
|
||||
else:
|
||||
node_type_val = self.graph.graph.nodes[node_id].get('type', 'domain')
|
||||
self.graph.add_node(node_id, NodeType(node_type_val), attributes=node_attributes_list)
|
||||
existing_attrs = self.graph.graph.nodes[node_id].get('attributes', [])
|
||||
self.graph.graph.nodes[node_id]['attributes'] = existing_attrs + node_attributes_list
|
||||
|
||||
return discovered_targets, False
|
||||
return discovered_targets, is_large_entity
|
||||
|
||||
def _create_large_entity_from_provider_result(self, source: str, provider_name: str,
|
||||
provider_result: ProviderResult, current_depth: int) -> Set[str]:
|
||||
"""
|
||||
Create a large entity node from a ProviderResult.
|
||||
FIXED: Only include domain/IP nodes in large entities, exclude CA and other special node types.
|
||||
Create a large entity node and connect it to the source and any shared
|
||||
non-member nodes like CAs or ISPs.
|
||||
"""
|
||||
entity_id = f"large_entity_{provider_name}_{hash(source) & 0x7FFFFFFF}"
|
||||
|
||||
# FIXED: Filter out CA, ISP, and correlation nodes from large entity inclusion
|
||||
eligible_targets = []
|
||||
for rel in provider_result.relationships:
|
||||
target_node = rel.target_node
|
||||
members = {
|
||||
rel.target_node for rel in provider_result.relationships
|
||||
if _is_valid_domain(rel.target_node) or _is_valid_ip(rel.target_node)
|
||||
}
|
||||
|
||||
# Skip CA nodes (certificate issuers) - they should appear directly on graph
|
||||
if provider_name == 'crtsh' and rel.relationship_type == 'crtsh_cert_issuer':
|
||||
continue
|
||||
|
||||
# Skip ISP nodes - they should appear directly on graph
|
||||
if provider_name == 'shodan' and rel.relationship_type == 'shodan_isp':
|
||||
continue
|
||||
|
||||
# Skip correlation objects - they should appear directly on graph
|
||||
if rel.relationship_type.startswith('corr_'):
|
||||
continue
|
||||
|
||||
# Only include valid domains and IPs in large entities
|
||||
if _is_valid_domain(target_node) or _is_valid_ip(target_node):
|
||||
eligible_targets.append(target_node)
|
||||
|
||||
# If no eligible targets after filtering, don't create large entity
|
||||
if not eligible_targets:
|
||||
if not members:
|
||||
return set()
|
||||
|
||||
node_type = 'unknown'
|
||||
if eligible_targets:
|
||||
if _is_valid_domain(eligible_targets[0]):
|
||||
node_type = 'domain'
|
||||
elif _is_valid_ip(eligible_targets[0]):
|
||||
node_type = 'ip'
|
||||
|
||||
# Create individual nodes for eligible targets
|
||||
for target in eligible_targets:
|
||||
target_node_type = NodeType.DOMAIN if node_type == 'domain' else NodeType.IP
|
||||
self.graph.add_node(target, target_node_type)
|
||||
first_member = next(iter(members))
|
||||
node_type = 'ip' if _is_valid_ip(first_member) else 'domain'
|
||||
|
||||
attributes_dict = {
|
||||
'count': len(eligible_targets),
|
||||
'nodes': eligible_targets, # Only eligible domain/IP nodes
|
||||
'count': len(members),
|
||||
'nodes': list(members),
|
||||
'node_type': node_type,
|
||||
'source_provider': provider_name,
|
||||
'discovery_depth': current_depth,
|
||||
'threshold_exceeded': self.config.large_entity_threshold,
|
||||
}
|
||||
attributes_list = [
|
||||
{
|
||||
"name": key, "value": value, "type": "large_entity_info",
|
||||
"provider": provider_name, "confidence": 0.9, "metadata": {}
|
||||
} for key, value in attributes_dict.items()
|
||||
]
|
||||
|
||||
attributes_list = []
|
||||
for key, value in attributes_dict.items():
|
||||
attributes_list.append({
|
||||
"name": key,
|
||||
"value": value,
|
||||
"type": "large_entity_info",
|
||||
"provider": provider_name,
|
||||
"confidence": 0.9,
|
||||
"metadata": {}
|
||||
})
|
||||
|
||||
description = f'Large entity created due to {len(eligible_targets)} relationships from {provider_name}'
|
||||
description = f'Large entity created due to {len(members)} relationships from {provider_name}'
|
||||
|
||||
self.graph.add_node(entity_id, NodeType.LARGE_ENTITY, attributes=attributes_list, description=description)
|
||||
|
||||
# Add a representative edge from the source to the large entity
|
||||
if provider_result.relationships:
|
||||
# Use the first eligible relationship for the large entity connection
|
||||
eligible_rels = [rel for rel in provider_result.relationships if rel.target_node in eligible_targets]
|
||||
if eligible_rels:
|
||||
rel_type = eligible_rels[0].relationship_type
|
||||
self.graph.add_edge(source, entity_id, rel_type, 0.9, provider_name,
|
||||
{'large_entity_info': f'Contains {len(eligible_targets)} {node_type}s'})
|
||||
rep_rel = provider_result.relationships[0]
|
||||
self.graph.add_edge(source, entity_id, rep_rel.relationship_type, 0.9, provider_name,
|
||||
{'large_entity_info': f'Contains {len(members)} {node_type}s'})
|
||||
|
||||
self.logger.logger.warning(f"Large entity created: {entity_id} contains {len(eligible_targets)} targets from {provider_name}")
|
||||
# Create edges from the large entity to shared non-member nodes (e.g., CAs, ISPs)
|
||||
processed_targets = set()
|
||||
for rel in provider_result.relationships:
|
||||
if rel.source_node in members and rel.target_node not in members:
|
||||
if rel.target_node not in processed_targets:
|
||||
self.graph.add_edge(
|
||||
entity_id, rel.target_node, rel.relationship_type, rel.confidence,
|
||||
rel.provider, rel.raw_data
|
||||
)
|
||||
processed_targets.add(rel.target_node)
|
||||
|
||||
return set(eligible_targets)
|
||||
self.logger.logger.warning(f"Large entity created: {entity_id} contains {len(members)} targets from {provider_name}")
|
||||
|
||||
return members
|
||||
|
||||
def stop_scan(self) -> bool:
|
||||
"""Request immediate scan termination with proper cleanup."""
|
||||
@@ -1077,41 +997,32 @@ class Scanner:
|
||||
|
||||
def extract_node_from_large_entity(self, large_entity_id: str, node_id_to_extract: str) -> bool:
|
||||
"""
|
||||
Extracts a node from a large entity and re-queues it for scanning.
|
||||
FIXED: Properly handle different node types during extraction.
|
||||
Extracts a node from a large entity, restores ALL of its original connections,
|
||||
and re-queues it for scanning.
|
||||
"""
|
||||
if not self.graph.graph.has_node(large_entity_id):
|
||||
return False
|
||||
|
||||
predecessors = list(self.graph.graph.predecessors(large_entity_id))
|
||||
if not predecessors:
|
||||
return False
|
||||
source_node_id = predecessors[0]
|
||||
|
||||
original_edge_data = self.graph.graph.get_edge_data(source_node_id, large_entity_id)
|
||||
if not original_edge_data:
|
||||
return False
|
||||
|
||||
# Extract the node from the large entity's internal list
|
||||
success = self.graph.extract_node_from_large_entity(large_entity_id, node_id_to_extract)
|
||||
if not success:
|
||||
return False
|
||||
|
||||
# Create relationship from source to extracted node
|
||||
self.graph.add_edge(
|
||||
source_id=source_node_id,
|
||||
target_id=node_id_to_extract,
|
||||
relationship_type=original_edge_data.get('relationship_type', 'extracted_from_large_entity'),
|
||||
confidence_score=original_edge_data.get('confidence_score', 0.85),
|
||||
source_provider=original_edge_data.get('source_provider', 'unknown'),
|
||||
raw_data={'context': f'Extracted from large entity {large_entity_id}'}
|
||||
)
|
||||
# Restore all incoming and outgoing edges for the extracted node
|
||||
# These edges already exist in the graph data model; this ensures they are "activated"
|
||||
# for the frontend.
|
||||
for u, v, data in self.graph.graph.in_edges(node_id_to_extract, data=True):
|
||||
self.graph.add_edge(u, v, data.get('relationship_type'), data.get('confidence_score'),
|
||||
data.get('source_provider'), data.get('raw_data'))
|
||||
|
||||
# FIXED: Only queue for further scanning if it's a domain/IP that can be scanned
|
||||
for u, v, data in self.graph.graph.out_edges(node_id_to_extract, data=True):
|
||||
self.graph.add_edge(u, v, data.get('relationship_type'), data.get('confidence_score'),
|
||||
data.get('source_provider'), data.get('raw_data'))
|
||||
|
||||
# Re-queue the extracted node for further scanning if it is a domain or IP
|
||||
is_ip = _is_valid_ip(node_id_to_extract)
|
||||
is_domain = _is_valid_domain(node_id_to_extract)
|
||||
|
||||
# Only queue valid domains and IPs for further processing
|
||||
# Don't queue CA nodes, ISP nodes, etc. as they can't be scanned
|
||||
if is_domain or is_ip:
|
||||
large_entity_attributes = self.graph.graph.nodes[large_entity_id].get('attributes', [])
|
||||
discovery_depth_attr = next((attr for attr in large_entity_attributes if attr.get('name') == 'discovery_depth'), None)
|
||||
@@ -1119,10 +1030,12 @@ class Scanner:
|
||||
|
||||
eligible_providers = self._get_eligible_providers(node_id_to_extract, is_ip, False)
|
||||
for provider in eligible_providers:
|
||||
provider_name = provider.get_name()
|
||||
priority = self._get_priority(provider_name)
|
||||
self.task_queue.put((time.time(), priority, (provider_name, node_id_to_extract, current_depth)))
|
||||
self.total_tasks_ever_enqueued += 1
|
||||
# Exclude DNS and correlation providers from re-processing
|
||||
if provider.get_name() not in ['dns', 'correlation']:
|
||||
provider_name = provider.get_name()
|
||||
priority = self._get_priority(provider_name)
|
||||
self.task_queue.put((time.time(), priority, (provider_name, node_id_to_extract, current_depth)))
|
||||
self.total_tasks_ever_enqueued += 1
|
||||
|
||||
if self.status != ScanStatus.RUNNING:
|
||||
self.status = ScanStatus.RUNNING
|
||||
@@ -1136,7 +1049,6 @@ class Scanner:
|
||||
)
|
||||
self.scan_thread.start()
|
||||
else:
|
||||
# For non-scannable nodes (CA, ISP, etc.), just log that they were extracted
|
||||
self.logger.logger.info(f"Extracted non-scannable node {node_id_to_extract} of type {self.graph.graph.nodes[node_id_to_extract].get('type', 'unknown')}")
|
||||
|
||||
return True
|
||||
|
||||
Reference in New Issue
Block a user