data-model #2

Merged
mstoeck3 merged 20 commits from data-model into main 2025-09-17 21:56:18 +00:00
4 changed files with 45 additions and 15 deletions
Showing only changes of commit 98e1b2280b - Show all commits

View File

@ -43,12 +43,13 @@ class GraphManager:
# Compile regex for date filtering for efficiency # Compile regex for date filtering for efficiency
self.date_pattern = re.compile(r'^\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}:\d{2}') self.date_pattern = re.compile(r'^\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}:\d{2}')
# These are the actual attribute names created in providers, WITHOUT provider prefix # FIXED: Exclude cert_issuer_name since we already create proper CA relationships
self.EXCLUDED_KEYS = [ self.EXCLUDED_KEYS = [
# Certificate metadata that creates noise # Certificate metadata that creates noise or has dedicated node types
'cert_source', # Always 'crtsh' for crtsh provider 'cert_source', # Always 'crtsh' for crtsh provider
'cert_common_name', 'cert_common_name',
'cert_validity_period_days', # Numerical, not useful for correlation 'cert_validity_period_days', # Numerical, not useful for correlation
'cert_issuer_name', # FIXED: Has dedicated CA nodes, don't correlate
#'cert_certificate_id', # Unique per certificate #'cert_certificate_id', # Unique per certificate
#'cert_serial_number', # Unique per certificate #'cert_serial_number', # Unique per certificate
'cert_entry_timestamp', # Timestamp, filtered by date regex anyway 'cert_entry_timestamp', # Timestamp, filtered by date regex anyway
@ -211,7 +212,7 @@ class GraphManager:
def _has_direct_edge_bidirectional(self, node_a: str, node_b: str) -> bool: def _has_direct_edge_bidirectional(self, node_a: str, node_b: str) -> bool:
""" """
Check if there's a direct edge between two nodes in either direction. Check if there's a direct edge between two nodes in either direction.
Returns True if node_a→node_b OR node_b→node_a exists. Returns True if node_a→node_b OR node_b→node_a exists.
""" """
return (self.graph.has_edge(node_a, node_b) or return (self.graph.has_edge(node_a, node_b) or
self.graph.has_edge(node_b, node_a)) self.graph.has_edge(node_b, node_a))

View File

@ -547,9 +547,10 @@ class Scanner:
return None return None
def _process_provider_result_unified(self, target: str, provider: BaseProvider, def _process_provider_result_unified(self, target: str, provider: BaseProvider,
provider_result: ProviderResult, current_depth: int) -> Tuple[Set[str], bool]: provider_result: ProviderResult, current_depth: int) -> Tuple[Set[str], bool]:
""" """
Process a unified ProviderResult object to update the graph. Process a unified ProviderResult object to update the graph.
VERIFIED: Proper ISP and CA node type assignment.
""" """
provider_name = provider.get_name() provider_name = provider.get_name()
discovered_targets = set() discovered_targets = set()
@ -557,6 +558,7 @@ class Scanner:
if self._is_stop_requested(): if self._is_stop_requested():
return discovered_targets, False return discovered_targets, False
# Process all attributes first, grouping by target node
attributes_by_node = defaultdict(list) attributes_by_node = defaultdict(list)
for attribute in provider_result.attributes: for attribute in provider_result.attributes:
attr_dict = { attr_dict = {
@ -569,8 +571,10 @@ class Scanner:
} }
attributes_by_node[attribute.target_node].append(attr_dict) attributes_by_node[attribute.target_node].append(attr_dict)
# Add attributes to existing nodes (important for ISP nodes to get ASN attributes)
for node_id, node_attributes_list in attributes_by_node.items(): for node_id, node_attributes_list in attributes_by_node.items():
if self.graph.graph.has_node(node_id): if self.graph.graph.has_node(node_id):
# Node already exists, just add attributes
if _is_valid_ip(node_id): if _is_valid_ip(node_id):
node_type = NodeType.IP node_type = NodeType.IP
else: else:
@ -578,10 +582,12 @@ class Scanner:
self.graph.add_node(node_id, node_type, attributes=node_attributes_list) self.graph.add_node(node_id, node_type, attributes=node_attributes_list)
# Check if this should be a large entity
if provider_result.get_relationship_count() > self.config.large_entity_threshold: if provider_result.get_relationship_count() > self.config.large_entity_threshold:
members = self._create_large_entity_from_provider_result(target, provider_name, provider_result, current_depth) members = self._create_large_entity_from_provider_result(target, provider_name, provider_result, current_depth)
return members, True return members, True
# Process relationships and create nodes with proper types
for i, relationship in enumerate(provider_result.relationships): for i, relationship in enumerate(provider_result.relationships):
if i % 5 == 0 and self._is_stop_requested(): if i % 5 == 0 and self._is_stop_requested():
break break
@ -589,20 +595,24 @@ class Scanner:
source_node = relationship.source_node source_node = relationship.source_node
target_node = relationship.target_node target_node = relationship.target_node
# VERIFIED: Determine source node type
source_type = NodeType.IP if _is_valid_ip(source_node) else NodeType.DOMAIN source_type = NodeType.IP if _is_valid_ip(source_node) else NodeType.DOMAIN
if provider_name == 'shodan' and relationship.relationship_type == 'ip_to_isp': # VERIFIED: Determine target node type based on provider and relationship
target_type = NodeType.ISP if provider_name == 'shodan' and relationship.relationship_type == 'shodan_isp':
elif provider_name == 'crtsh' and relationship.relationship_type == 'issued_by': target_type = NodeType.ISP # ISP node for Shodan organization data
target_type = NodeType.CA elif provider_name == 'crtsh' and relationship.relationship_type == 'crtsh_cert_issuer':
target_type = NodeType.CA # CA node for certificate issuers
elif _is_valid_ip(target_node): elif _is_valid_ip(target_node):
target_type = NodeType.IP target_type = NodeType.IP
else: else:
target_type = NodeType.DOMAIN target_type = NodeType.DOMAIN
# Create or update nodes with proper types
self.graph.add_node(source_node, source_type) self.graph.add_node(source_node, source_type)
self.graph.add_node(target_node, target_type) self.graph.add_node(target_node, target_type)
# Add the relationship edge
if self.graph.add_edge( if self.graph.add_edge(
source_node, target_node, source_node, target_node,
relationship.relationship_type, relationship.relationship_type,
@ -610,8 +620,9 @@ class Scanner:
provider_name, provider_name,
relationship.raw_data relationship.raw_data
): ):
pass pass # Edge was successfully added
# Add target to discovered nodes for further processing
if _is_valid_domain(target_node) or _is_valid_ip(target_node): if _is_valid_domain(target_node) or _is_valid_ip(target_node):
discovered_targets.add(target_node) discovered_targets.add(target_node)

View File

@ -298,7 +298,7 @@ class CrtShProvider(BaseProvider):
result.add_relationship( result.add_relationship(
source_node=domain, source_node=domain,
target_node=issuer_name, target_node=issuer_name,
relationship_type='issued_by', relationship_type='crtsh_cert_issuer',
provider=self.name, provider=self.name,
confidence=0.95 confidence=0.95
) )

View File

@ -211,31 +211,48 @@ class ShodanProvider(BaseProvider):
def _process_shodan_data(self, ip: str, data: Dict[str, Any]) -> ProviderResult: def _process_shodan_data(self, ip: str, data: Dict[str, Any]) -> ProviderResult:
""" """
UPDATED: Process Shodan data with raw attribute names and values. VERIFIED: Process Shodan data creating ISP nodes with ASN attributes and proper relationships.
""" """
result = ProviderResult() result = ProviderResult()
# VERIFIED: Extract ISP information and create proper ISP node with ASN
isp_name = data.get('org') isp_name = data.get('org')
asn_value = data.get('asn') asn_value = data.get('asn')
if isp_name and asn_value: if isp_name and asn_value:
# Create relationship from IP to ISP
result.add_relationship( result.add_relationship(
source_node=ip, source_node=ip,
target_node=isp_name, target_node=isp_name,
relationship_type='ip_to_isp', relationship_type='shodan_isp',
provider=self.name, provider=self.name,
confidence=0.9, confidence=0.9,
raw_data={'asn': asn_value} raw_data={'asn': asn_value, 'shodan_org': isp_name}
) )
# Add ASN as attribute to the ISP node
result.add_attribute( result.add_attribute(
target_node=isp_name, target_node=isp_name,
name='asn', name='asn',
value=asn_value, value=asn_value,
attr_type='isp_info', attr_type='isp_info',
provider=self.name, provider=self.name,
confidence=0.9 confidence=0.9,
metadata={'description': 'Autonomous System Number from Shodan'}
)
# Also add organization name as attribute to ISP node for completeness
result.add_attribute(
target_node=isp_name,
name='organization_name',
value=isp_name,
attr_type='isp_info',
provider=self.name,
confidence=0.9,
metadata={'description': 'Organization name from Shodan'}
) )
# Process hostnames (reverse DNS)
for key, value in data.items(): for key, value in data.items():
if key == 'hostnames': if key == 'hostnames':
for hostname in value: for hostname in value:
@ -257,6 +274,7 @@ class ShodanProvider(BaseProvider):
discovery_method="shodan_host_lookup" discovery_method="shodan_host_lookup"
) )
elif key == 'ports': elif key == 'ports':
# Add open ports as attributes to the IP
for port in value: for port in value:
result.add_attribute( result.add_attribute(
target_node=ip, target_node=ip,
@ -267,7 +285,7 @@ class ShodanProvider(BaseProvider):
confidence=0.9 confidence=0.9
) )
elif isinstance(value, (str, int, float, bool)) and value is not None: elif isinstance(value, (str, int, float, bool)) and value is not None:
# UPDATED: Keep raw Shodan field names (no "shodan_" prefix) # Add other Shodan fields as IP attributes (keep raw field names)
result.add_attribute( result.add_attribute(
target_node=ip, target_node=ip,
name=key, # Raw field name from Shodan API name=key, # Raw field name from Shodan API