attempt fix large entity
This commit is contained in:
parent
95cebbf935
commit
1558731c1c
252
core/scanner.py
252
core/scanner.py
@ -825,81 +825,33 @@ class Scanner:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
def _process_provider_result_unified(self, target: str, provider: BaseProvider,
|
def _process_provider_result_unified(self, target: str, provider: BaseProvider,
|
||||||
provider_result: ProviderResult, current_depth: int) -> Tuple[Set[str], bool]:
|
provider_result: ProviderResult, current_depth: int) -> Tuple[Set[str], bool]:
|
||||||
"""
|
"""
|
||||||
Process a unified ProviderResult object to update the graph.
|
Process a unified ProviderResult object to update the graph.
|
||||||
FIXED: Ensure CA and ISP relationships are created even when large entities are formed.
|
Handles large entity creation while ensuring all underlying nodes and edges are
|
||||||
|
added to the graph data model for a complete dataset.
|
||||||
"""
|
"""
|
||||||
provider_name = provider.get_name()
|
provider_name = provider.get_name()
|
||||||
discovered_targets = set()
|
discovered_targets = set()
|
||||||
|
large_entity_members = set()
|
||||||
|
|
||||||
if self._is_stop_requested():
|
if self._is_stop_requested():
|
||||||
return discovered_targets, False
|
return discovered_targets, False
|
||||||
|
|
||||||
# Check if this should be a large entity (only counting domain/IP relationships)
|
# Check if a large entity should be created based on the count of domain/IP relationships
|
||||||
eligible_relationship_count = 0
|
eligible_relationship_count = sum(
|
||||||
for rel in provider_result.relationships:
|
1 for rel in provider_result.relationships if _is_valid_domain(rel.target_node) or _is_valid_ip(rel.target_node)
|
||||||
# Only count relationships that would go into large entities
|
)
|
||||||
if provider_name == 'crtsh' and rel.relationship_type == 'crtsh_cert_issuer':
|
|
||||||
continue # Don't count CA relationships
|
|
||||||
if provider_name == 'shodan' and rel.relationship_type == 'shodan_isp':
|
|
||||||
continue # Don't count ISP relationships
|
|
||||||
if rel.relationship_type.startswith('corr_'):
|
|
||||||
continue # Don't count correlation relationships
|
|
||||||
|
|
||||||
# Only count domain/IP targets
|
is_large_entity = eligible_relationship_count > self.config.large_entity_threshold
|
||||||
if _is_valid_domain(rel.target_node) or _is_valid_ip(rel.target_node):
|
|
||||||
eligible_relationship_count += 1
|
|
||||||
|
|
||||||
if eligible_relationship_count > self.config.large_entity_threshold:
|
if is_large_entity:
|
||||||
# Create large entity but ALSO process special relationships
|
# Create the large entity node and get the set of its members
|
||||||
members = self._create_large_entity_from_provider_result(target, provider_name, provider_result, current_depth)
|
large_entity_members = self._create_large_entity_from_provider_result(
|
||||||
|
target, provider_name, provider_result, current_depth
|
||||||
|
)
|
||||||
|
|
||||||
# FIXED: Still process CA, ISP, and correlation relationships directly on the graph
|
# Process ALL relationships to build the complete underlying data model
|
||||||
for relationship in provider_result.relationships:
|
|
||||||
if self._is_stop_requested():
|
|
||||||
break
|
|
||||||
|
|
||||||
source_node = relationship.source_node
|
|
||||||
target_node = relationship.target_node
|
|
||||||
|
|
||||||
# Process special relationship types that should appear directly on graph
|
|
||||||
should_create_direct_relationship = False
|
|
||||||
target_type = None
|
|
||||||
|
|
||||||
if provider_name == 'crtsh' and relationship.relationship_type == 'crtsh_cert_issuer':
|
|
||||||
target_type = NodeType.CA
|
|
||||||
should_create_direct_relationship = True
|
|
||||||
elif provider_name == 'shodan' and relationship.relationship_type == 'shodan_isp':
|
|
||||||
target_type = NodeType.ISP
|
|
||||||
should_create_direct_relationship = True
|
|
||||||
elif relationship.relationship_type.startswith('corr_'):
|
|
||||||
target_type = NodeType.CORRELATION_OBJECT
|
|
||||||
should_create_direct_relationship = True
|
|
||||||
|
|
||||||
if should_create_direct_relationship:
|
|
||||||
# Create source and target nodes
|
|
||||||
source_type = NodeType.IP if _is_valid_ip(source_node) else NodeType.DOMAIN
|
|
||||||
self.graph.add_node(source_node, source_type)
|
|
||||||
self.graph.add_node(target_node, target_type)
|
|
||||||
|
|
||||||
# Add the relationship edge
|
|
||||||
self.graph.add_edge(
|
|
||||||
source_node, target_node,
|
|
||||||
relationship.relationship_type,
|
|
||||||
relationship.confidence,
|
|
||||||
provider_name,
|
|
||||||
relationship.raw_data
|
|
||||||
)
|
|
||||||
|
|
||||||
# Add to discovered targets if it's a valid target for further processing
|
|
||||||
max_depth_reached = current_depth >= self.max_depth
|
|
||||||
if not max_depth_reached and (_is_valid_domain(target_node) or _is_valid_ip(target_node)):
|
|
||||||
discovered_targets.add(target_node)
|
|
||||||
|
|
||||||
return members, True
|
|
||||||
|
|
||||||
# Normal processing (existing logic) when not creating large entity
|
|
||||||
for i, relationship in enumerate(provider_result.relationships):
|
for i, relationship in enumerate(provider_result.relationships):
|
||||||
if i % 5 == 0 and self._is_stop_requested():
|
if i % 5 == 0 and self._is_stop_requested():
|
||||||
break
|
break
|
||||||
@ -907,10 +859,8 @@ class Scanner:
|
|||||||
source_node = relationship.source_node
|
source_node = relationship.source_node
|
||||||
target_node = relationship.target_node
|
target_node = relationship.target_node
|
||||||
|
|
||||||
# Determine source node type
|
# Determine node types
|
||||||
source_type = NodeType.IP if _is_valid_ip(source_node) else NodeType.DOMAIN
|
source_type = NodeType.IP if _is_valid_ip(source_node) else NodeType.DOMAIN
|
||||||
|
|
||||||
# Determine target node type based on provider and relationship
|
|
||||||
if provider_name == 'shodan' and relationship.relationship_type == 'shodan_isp':
|
if provider_name == 'shodan' and relationship.relationship_type == 'shodan_isp':
|
||||||
target_type = NodeType.ISP
|
target_type = NodeType.ISP
|
||||||
elif provider_name == 'crtsh' and relationship.relationship_type == 'crtsh_cert_issuer':
|
elif provider_name == 'crtsh' and relationship.relationship_type == 'crtsh_cert_issuer':
|
||||||
@ -922,131 +872,101 @@ class Scanner:
|
|||||||
else:
|
else:
|
||||||
target_type = NodeType.DOMAIN
|
target_type = NodeType.DOMAIN
|
||||||
|
|
||||||
# Add max_depth_reached flag
|
|
||||||
max_depth_reached = current_depth >= self.max_depth
|
max_depth_reached = current_depth >= self.max_depth
|
||||||
|
|
||||||
# Create or update nodes with proper types
|
# Add all nodes and edges to the graph's data model.
|
||||||
|
# The frontend will handle the visual re-routing for large entity members.
|
||||||
self.graph.add_node(source_node, source_type)
|
self.graph.add_node(source_node, source_type)
|
||||||
self.graph.add_node(target_node, target_type, metadata={'max_depth_reached': max_depth_reached})
|
self.graph.add_node(target_node, target_type, metadata={'max_depth_reached': max_depth_reached})
|
||||||
|
self.graph.add_edge(
|
||||||
# Add the relationship edge
|
|
||||||
if self.graph.add_edge(
|
|
||||||
source_node, target_node,
|
source_node, target_node,
|
||||||
relationship.relationship_type,
|
relationship.relationship_type,
|
||||||
relationship.confidence,
|
relationship.confidence,
|
||||||
provider_name,
|
provider_name,
|
||||||
relationship.raw_data
|
relationship.raw_data
|
||||||
):
|
)
|
||||||
pass # Edge was successfully added
|
|
||||||
|
|
||||||
# Add target to discovered nodes for further processing
|
# Add all discovered domains/IPs to be considered for further processing
|
||||||
if (_is_valid_domain(target_node) or _is_valid_ip(target_node)) and not max_depth_reached:
|
if (_is_valid_domain(target_node) or _is_valid_ip(target_node)) and not max_depth_reached:
|
||||||
discovered_targets.add(target_node)
|
discovered_targets.add(target_node)
|
||||||
|
|
||||||
# Process all attributes (existing logic unchanged)
|
# Process all attributes and add them to the corresponding nodes
|
||||||
attributes_by_node = defaultdict(list)
|
attributes_by_node = defaultdict(list)
|
||||||
for attribute in provider_result.attributes:
|
for attribute in provider_result.attributes:
|
||||||
attr_dict = {
|
attr_dict = {
|
||||||
"name": attribute.name,
|
"name": attribute.name, "value": attribute.value, "type": attribute.type,
|
||||||
"value": attribute.value,
|
"provider": attribute.provider, "confidence": attribute.confidence, "metadata": attribute.metadata
|
||||||
"type": attribute.type,
|
|
||||||
"provider": attribute.provider,
|
|
||||||
"confidence": attribute.confidence,
|
|
||||||
"metadata": attribute.metadata
|
|
||||||
}
|
}
|
||||||
attributes_by_node[attribute.target_node].append(attr_dict)
|
attributes_by_node[attribute.target_node].append(attr_dict)
|
||||||
|
|
||||||
# Add attributes to existing nodes OR create new nodes if they don't exist
|
|
||||||
for node_id, node_attributes_list in attributes_by_node.items():
|
for node_id, node_attributes_list in attributes_by_node.items():
|
||||||
if not self.graph.graph.has_node(node_id):
|
if not self.graph.graph.has_node(node_id):
|
||||||
node_type = NodeType.IP if _is_valid_ip(node_id) else NodeType.DOMAIN
|
node_type = NodeType.IP if _is_valid_ip(node_id) else NodeType.DOMAIN
|
||||||
self.graph.add_node(node_id, node_type, attributes=node_attributes_list)
|
self.graph.add_node(node_id, node_type, attributes=node_attributes_list)
|
||||||
else:
|
else:
|
||||||
node_type_val = self.graph.graph.nodes[node_id].get('type', 'domain')
|
existing_attrs = self.graph.graph.nodes[node_id].get('attributes', [])
|
||||||
self.graph.add_node(node_id, NodeType(node_type_val), attributes=node_attributes_list)
|
self.graph.graph.nodes[node_id]['attributes'] = existing_attrs + node_attributes_list
|
||||||
|
|
||||||
return discovered_targets, False
|
return discovered_targets, is_large_entity
|
||||||
|
|
||||||
def _create_large_entity_from_provider_result(self, source: str, provider_name: str,
|
def _create_large_entity_from_provider_result(self, source: str, provider_name: str,
|
||||||
provider_result: ProviderResult, current_depth: int) -> Set[str]:
|
provider_result: ProviderResult, current_depth: int) -> Set[str]:
|
||||||
"""
|
"""
|
||||||
Create a large entity node from a ProviderResult.
|
Create a large entity node and connect it to the source and any shared
|
||||||
FIXED: Only include domain/IP nodes in large entities, exclude CA and other special node types.
|
non-member nodes like CAs or ISPs.
|
||||||
"""
|
"""
|
||||||
entity_id = f"large_entity_{provider_name}_{hash(source) & 0x7FFFFFFF}"
|
entity_id = f"large_entity_{provider_name}_{hash(source) & 0x7FFFFFFF}"
|
||||||
|
|
||||||
# FIXED: Filter out CA, ISP, and correlation nodes from large entity inclusion
|
members = {
|
||||||
eligible_targets = []
|
rel.target_node for rel in provider_result.relationships
|
||||||
for rel in provider_result.relationships:
|
if _is_valid_domain(rel.target_node) or _is_valid_ip(rel.target_node)
|
||||||
target_node = rel.target_node
|
}
|
||||||
|
|
||||||
# Skip CA nodes (certificate issuers) - they should appear directly on graph
|
if not members:
|
||||||
if provider_name == 'crtsh' and rel.relationship_type == 'crtsh_cert_issuer':
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Skip ISP nodes - they should appear directly on graph
|
|
||||||
if provider_name == 'shodan' and rel.relationship_type == 'shodan_isp':
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Skip correlation objects - they should appear directly on graph
|
|
||||||
if rel.relationship_type.startswith('corr_'):
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Only include valid domains and IPs in large entities
|
|
||||||
if _is_valid_domain(target_node) or _is_valid_ip(target_node):
|
|
||||||
eligible_targets.append(target_node)
|
|
||||||
|
|
||||||
# If no eligible targets after filtering, don't create large entity
|
|
||||||
if not eligible_targets:
|
|
||||||
return set()
|
return set()
|
||||||
|
|
||||||
node_type = 'unknown'
|
first_member = next(iter(members))
|
||||||
if eligible_targets:
|
node_type = 'ip' if _is_valid_ip(first_member) else 'domain'
|
||||||
if _is_valid_domain(eligible_targets[0]):
|
|
||||||
node_type = 'domain'
|
|
||||||
elif _is_valid_ip(eligible_targets[0]):
|
|
||||||
node_type = 'ip'
|
|
||||||
|
|
||||||
# Create individual nodes for eligible targets
|
|
||||||
for target in eligible_targets:
|
|
||||||
target_node_type = NodeType.DOMAIN if node_type == 'domain' else NodeType.IP
|
|
||||||
self.graph.add_node(target, target_node_type)
|
|
||||||
|
|
||||||
attributes_dict = {
|
attributes_dict = {
|
||||||
'count': len(eligible_targets),
|
'count': len(members),
|
||||||
'nodes': eligible_targets, # Only eligible domain/IP nodes
|
'nodes': list(members),
|
||||||
'node_type': node_type,
|
'node_type': node_type,
|
||||||
'source_provider': provider_name,
|
'source_provider': provider_name,
|
||||||
'discovery_depth': current_depth,
|
'discovery_depth': current_depth,
|
||||||
'threshold_exceeded': self.config.large_entity_threshold,
|
'threshold_exceeded': self.config.large_entity_threshold,
|
||||||
}
|
}
|
||||||
|
attributes_list = [
|
||||||
|
{
|
||||||
|
"name": key, "value": value, "type": "large_entity_info",
|
||||||
|
"provider": provider_name, "confidence": 0.9, "metadata": {}
|
||||||
|
} for key, value in attributes_dict.items()
|
||||||
|
]
|
||||||
|
|
||||||
attributes_list = []
|
description = f'Large entity created due to {len(members)} relationships from {provider_name}'
|
||||||
for key, value in attributes_dict.items():
|
|
||||||
attributes_list.append({
|
|
||||||
"name": key,
|
|
||||||
"value": value,
|
|
||||||
"type": "large_entity_info",
|
|
||||||
"provider": provider_name,
|
|
||||||
"confidence": 0.9,
|
|
||||||
"metadata": {}
|
|
||||||
})
|
|
||||||
|
|
||||||
description = f'Large entity created due to {len(eligible_targets)} relationships from {provider_name}'
|
|
||||||
|
|
||||||
self.graph.add_node(entity_id, NodeType.LARGE_ENTITY, attributes=attributes_list, description=description)
|
self.graph.add_node(entity_id, NodeType.LARGE_ENTITY, attributes=attributes_list, description=description)
|
||||||
|
|
||||||
|
# Add a representative edge from the source to the large entity
|
||||||
if provider_result.relationships:
|
if provider_result.relationships:
|
||||||
# Use the first eligible relationship for the large entity connection
|
rep_rel = provider_result.relationships[0]
|
||||||
eligible_rels = [rel for rel in provider_result.relationships if rel.target_node in eligible_targets]
|
self.graph.add_edge(source, entity_id, rep_rel.relationship_type, 0.9, provider_name,
|
||||||
if eligible_rels:
|
{'large_entity_info': f'Contains {len(members)} {node_type}s'})
|
||||||
rel_type = eligible_rels[0].relationship_type
|
|
||||||
self.graph.add_edge(source, entity_id, rel_type, 0.9, provider_name,
|
|
||||||
{'large_entity_info': f'Contains {len(eligible_targets)} {node_type}s'})
|
|
||||||
|
|
||||||
self.logger.logger.warning(f"Large entity created: {entity_id} contains {len(eligible_targets)} targets from {provider_name}")
|
# Create edges from the large entity to shared non-member nodes (e.g., CAs, ISPs)
|
||||||
|
processed_targets = set()
|
||||||
|
for rel in provider_result.relationships:
|
||||||
|
if rel.source_node in members and rel.target_node not in members:
|
||||||
|
if rel.target_node not in processed_targets:
|
||||||
|
self.graph.add_edge(
|
||||||
|
entity_id, rel.target_node, rel.relationship_type, rel.confidence,
|
||||||
|
rel.provider, rel.raw_data
|
||||||
|
)
|
||||||
|
processed_targets.add(rel.target_node)
|
||||||
|
|
||||||
return set(eligible_targets)
|
self.logger.logger.warning(f"Large entity created: {entity_id} contains {len(members)} targets from {provider_name}")
|
||||||
|
|
||||||
|
return members
|
||||||
|
|
||||||
def stop_scan(self) -> bool:
|
def stop_scan(self) -> bool:
|
||||||
"""Request immediate scan termination with proper cleanup."""
|
"""Request immediate scan termination with proper cleanup."""
|
||||||
@ -1077,41 +997,32 @@ class Scanner:
|
|||||||
|
|
||||||
def extract_node_from_large_entity(self, large_entity_id: str, node_id_to_extract: str) -> bool:
|
def extract_node_from_large_entity(self, large_entity_id: str, node_id_to_extract: str) -> bool:
|
||||||
"""
|
"""
|
||||||
Extracts a node from a large entity and re-queues it for scanning.
|
Extracts a node from a large entity, restores ALL of its original connections,
|
||||||
FIXED: Properly handle different node types during extraction.
|
and re-queues it for scanning.
|
||||||
"""
|
"""
|
||||||
if not self.graph.graph.has_node(large_entity_id):
|
if not self.graph.graph.has_node(large_entity_id):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
predecessors = list(self.graph.graph.predecessors(large_entity_id))
|
# Extract the node from the large entity's internal list
|
||||||
if not predecessors:
|
|
||||||
return False
|
|
||||||
source_node_id = predecessors[0]
|
|
||||||
|
|
||||||
original_edge_data = self.graph.graph.get_edge_data(source_node_id, large_entity_id)
|
|
||||||
if not original_edge_data:
|
|
||||||
return False
|
|
||||||
|
|
||||||
success = self.graph.extract_node_from_large_entity(large_entity_id, node_id_to_extract)
|
success = self.graph.extract_node_from_large_entity(large_entity_id, node_id_to_extract)
|
||||||
if not success:
|
if not success:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
# Create relationship from source to extracted node
|
# Restore all incoming and outgoing edges for the extracted node
|
||||||
self.graph.add_edge(
|
# These edges already exist in the graph data model; this ensures they are "activated"
|
||||||
source_id=source_node_id,
|
# for the frontend.
|
||||||
target_id=node_id_to_extract,
|
for u, v, data in self.graph.graph.in_edges(node_id_to_extract, data=True):
|
||||||
relationship_type=original_edge_data.get('relationship_type', 'extracted_from_large_entity'),
|
self.graph.add_edge(u, v, data.get('relationship_type'), data.get('confidence_score'),
|
||||||
confidence_score=original_edge_data.get('confidence_score', 0.85),
|
data.get('source_provider'), data.get('raw_data'))
|
||||||
source_provider=original_edge_data.get('source_provider', 'unknown'),
|
|
||||||
raw_data={'context': f'Extracted from large entity {large_entity_id}'}
|
|
||||||
)
|
|
||||||
|
|
||||||
# FIXED: Only queue for further scanning if it's a domain/IP that can be scanned
|
for u, v, data in self.graph.graph.out_edges(node_id_to_extract, data=True):
|
||||||
|
self.graph.add_edge(u, v, data.get('relationship_type'), data.get('confidence_score'),
|
||||||
|
data.get('source_provider'), data.get('raw_data'))
|
||||||
|
|
||||||
|
# Re-queue the extracted node for further scanning if it is a domain or IP
|
||||||
is_ip = _is_valid_ip(node_id_to_extract)
|
is_ip = _is_valid_ip(node_id_to_extract)
|
||||||
is_domain = _is_valid_domain(node_id_to_extract)
|
is_domain = _is_valid_domain(node_id_to_extract)
|
||||||
|
|
||||||
# Only queue valid domains and IPs for further processing
|
|
||||||
# Don't queue CA nodes, ISP nodes, etc. as they can't be scanned
|
|
||||||
if is_domain or is_ip:
|
if is_domain or is_ip:
|
||||||
large_entity_attributes = self.graph.graph.nodes[large_entity_id].get('attributes', [])
|
large_entity_attributes = self.graph.graph.nodes[large_entity_id].get('attributes', [])
|
||||||
discovery_depth_attr = next((attr for attr in large_entity_attributes if attr.get('name') == 'discovery_depth'), None)
|
discovery_depth_attr = next((attr for attr in large_entity_attributes if attr.get('name') == 'discovery_depth'), None)
|
||||||
@ -1119,10 +1030,12 @@ class Scanner:
|
|||||||
|
|
||||||
eligible_providers = self._get_eligible_providers(node_id_to_extract, is_ip, False)
|
eligible_providers = self._get_eligible_providers(node_id_to_extract, is_ip, False)
|
||||||
for provider in eligible_providers:
|
for provider in eligible_providers:
|
||||||
provider_name = provider.get_name()
|
# Exclude DNS and correlation providers from re-processing
|
||||||
priority = self._get_priority(provider_name)
|
if provider.get_name() not in ['dns', 'correlation']:
|
||||||
self.task_queue.put((time.time(), priority, (provider_name, node_id_to_extract, current_depth)))
|
provider_name = provider.get_name()
|
||||||
self.total_tasks_ever_enqueued += 1
|
priority = self._get_priority(provider_name)
|
||||||
|
self.task_queue.put((time.time(), priority, (provider_name, node_id_to_extract, current_depth)))
|
||||||
|
self.total_tasks_ever_enqueued += 1
|
||||||
|
|
||||||
if self.status != ScanStatus.RUNNING:
|
if self.status != ScanStatus.RUNNING:
|
||||||
self.status = ScanStatus.RUNNING
|
self.status = ScanStatus.RUNNING
|
||||||
@ -1136,7 +1049,6 @@ class Scanner:
|
|||||||
)
|
)
|
||||||
self.scan_thread.start()
|
self.scan_thread.start()
|
||||||
else:
|
else:
|
||||||
# For non-scannable nodes (CA, ISP, etc.), just log that they were extracted
|
|
||||||
self.logger.logger.info(f"Extracted non-scannable node {node_id_to_extract} of type {self.graph.graph.nodes[node_id_to_extract].get('type', 'unknown')}")
|
self.logger.logger.info(f"Extracted non-scannable node {node_id_to_extract} of type {self.graph.graph.nodes[node_id_to_extract].get('type', 'unknown')}")
|
||||||
|
|
||||||
return True
|
return True
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user