diff --git a/core/graph_manager.py b/core/graph_manager.py index 954f744..d6910ba 100644 --- a/core/graph_manager.py +++ b/core/graph_manager.py @@ -43,12 +43,13 @@ class GraphManager: # Compile regex for date filtering for efficiency self.date_pattern = re.compile(r'^\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}:\d{2}') - # These are the actual attribute names created in providers, WITHOUT provider prefix + # FIXED: Exclude cert_issuer_name since we already create proper CA relationships self.EXCLUDED_KEYS = [ - # Certificate metadata that creates noise + # Certificate metadata that creates noise or has dedicated node types 'cert_source', # Always 'crtsh' for crtsh provider 'cert_common_name', 'cert_validity_period_days', # Numerical, not useful for correlation + 'cert_issuer_name', # FIXED: Has dedicated CA nodes, don't correlate #'cert_certificate_id', # Unique per certificate #'cert_serial_number', # Unique per certificate 'cert_entry_timestamp', # Timestamp, filtered by date regex anyway @@ -211,7 +212,7 @@ class GraphManager: def _has_direct_edge_bidirectional(self, node_a: str, node_b: str) -> bool: """ Check if there's a direct edge between two nodes in either direction. - Returns True if node_aâ†'node_b OR node_bâ†'node_a exists. + Returns True if node_a→node_b OR node_b→node_a exists. """ return (self.graph.has_edge(node_a, node_b) or self.graph.has_edge(node_b, node_a)) diff --git a/core/scanner.py b/core/scanner.py index 794f98b..67de6d0 100644 --- a/core/scanner.py +++ b/core/scanner.py @@ -547,9 +547,10 @@ class Scanner: return None def _process_provider_result_unified(self, target: str, provider: BaseProvider, - provider_result: ProviderResult, current_depth: int) -> Tuple[Set[str], bool]: + provider_result: ProviderResult, current_depth: int) -> Tuple[Set[str], bool]: """ Process a unified ProviderResult object to update the graph. + VERIFIED: Proper ISP and CA node type assignment. 
""" provider_name = provider.get_name() discovered_targets = set() @@ -557,6 +558,7 @@ class Scanner: if self._is_stop_requested(): return discovered_targets, False + # Process all attributes first, grouping by target node attributes_by_node = defaultdict(list) for attribute in provider_result.attributes: attr_dict = { @@ -569,8 +571,10 @@ class Scanner: } attributes_by_node[attribute.target_node].append(attr_dict) + # Add attributes to existing nodes (important for ISP nodes to get ASN attributes) for node_id, node_attributes_list in attributes_by_node.items(): if self.graph.graph.has_node(node_id): + # Node already exists, just add attributes if _is_valid_ip(node_id): node_type = NodeType.IP else: @@ -578,10 +582,12 @@ class Scanner: self.graph.add_node(node_id, node_type, attributes=node_attributes_list) + # Check if this should be a large entity if provider_result.get_relationship_count() > self.config.large_entity_threshold: members = self._create_large_entity_from_provider_result(target, provider_name, provider_result, current_depth) return members, True + # Process relationships and create nodes with proper types for i, relationship in enumerate(provider_result.relationships): if i % 5 == 0 and self._is_stop_requested(): break @@ -589,20 +595,24 @@ class Scanner: source_node = relationship.source_node target_node = relationship.target_node + # VERIFIED: Determine source node type source_type = NodeType.IP if _is_valid_ip(source_node) else NodeType.DOMAIN - if provider_name == 'shodan' and relationship.relationship_type == 'ip_to_isp': - target_type = NodeType.ISP - elif provider_name == 'crtsh' and relationship.relationship_type == 'issued_by': - target_type = NodeType.CA + # VERIFIED: Determine target node type based on provider and relationship + if provider_name == 'shodan' and relationship.relationship_type == 'shodan_isp': + target_type = NodeType.ISP # ISP node for Shodan organization data + elif provider_name == 'crtsh' and 
relationship.relationship_type == 'crtsh_cert_issuer': + target_type = NodeType.CA # CA node for certificate issuers elif _is_valid_ip(target_node): target_type = NodeType.IP else: target_type = NodeType.DOMAIN + # Create or update nodes with proper types self.graph.add_node(source_node, source_type) self.graph.add_node(target_node, target_type) + # Add the relationship edge if self.graph.add_edge( source_node, target_node, relationship.relationship_type, @@ -610,8 +620,9 @@ class Scanner: provider_name, relationship.raw_data ): - pass + pass # Edge was successfully added + # Add target to discovered nodes for further processing if _is_valid_domain(target_node) or _is_valid_ip(target_node): discovered_targets.add(target_node) diff --git a/providers/crtsh_provider.py b/providers/crtsh_provider.py index 8af7099..ab41c1d 100644 --- a/providers/crtsh_provider.py +++ b/providers/crtsh_provider.py @@ -298,7 +298,7 @@ class CrtShProvider(BaseProvider): result.add_relationship( source_node=domain, target_node=issuer_name, - relationship_type='issued_by', + relationship_type='crtsh_cert_issuer', provider=self.name, confidence=0.95 ) diff --git a/providers/shodan_provider.py b/providers/shodan_provider.py index 78f2301..930eb0e 100644 --- a/providers/shodan_provider.py +++ b/providers/shodan_provider.py @@ -211,31 +211,48 @@ class ShodanProvider(BaseProvider): def _process_shodan_data(self, ip: str, data: Dict[str, Any]) -> ProviderResult: """ - UPDATED: Process Shodan data with raw attribute names and values. + VERIFIED: Process Shodan data creating ISP nodes with ASN attributes and proper relationships. 
""" result = ProviderResult() + # VERIFIED: Extract ISP information and create proper ISP node with ASN isp_name = data.get('org') asn_value = data.get('asn') if isp_name and asn_value: + # Create relationship from IP to ISP result.add_relationship( source_node=ip, target_node=isp_name, - relationship_type='ip_to_isp', + relationship_type='shodan_isp', provider=self.name, confidence=0.9, - raw_data={'asn': asn_value} + raw_data={'asn': asn_value, 'shodan_org': isp_name} ) + + # Add ASN as attribute to the ISP node result.add_attribute( target_node=isp_name, name='asn', value=asn_value, attr_type='isp_info', provider=self.name, - confidence=0.9 + confidence=0.9, + metadata={'description': 'Autonomous System Number from Shodan'} + ) + + # Also add organization name as attribute to ISP node for completeness + result.add_attribute( + target_node=isp_name, + name='organization_name', + value=isp_name, + attr_type='isp_info', + provider=self.name, + confidence=0.9, + metadata={'description': 'Organization name from Shodan'} ) + # Process hostnames (reverse DNS) for key, value in data.items(): if key == 'hostnames': for hostname in value: @@ -257,6 +274,7 @@ class ShodanProvider(BaseProvider): discovery_method="shodan_host_lookup" ) elif key == 'ports': + # Add open ports as attributes to the IP for port in value: result.add_attribute( target_node=ip, @@ -267,7 +285,7 @@ class ShodanProvider(BaseProvider): confidence=0.9 ) elif isinstance(value, (str, int, float, bool)) and value is not None: - # UPDATED: Keep raw Shodan field names (no "shodan_" prefix) + # Add other Shodan fields as IP attributes (keep raw field names) result.add_attribute( target_node=ip, name=key, # Raw field name from Shodan API