This commit is contained in:
overcuriousity
2025-09-24 09:30:42 +02:00
parent 571912218e
commit 897bb80183
15 changed files with 541 additions and 335 deletions

View File

@@ -229,7 +229,6 @@ class BaseProvider(ABC):
def log_relationship_discovery(self, source_node: str, target_node: str,
relationship_type: str,
confidence_score: float,
raw_data: Dict[str, Any],
discovery_method: str) -> None:
"""
@@ -239,7 +238,6 @@ class BaseProvider(ABC):
source_node: Source node identifier
target_node: Target node identifier
relationship_type: Type of relationship
confidence_score: Confidence score
raw_data: Raw data from provider
discovery_method: Method used for discovery
"""
@@ -249,7 +247,6 @@ class BaseProvider(ABC):
source_node=source_node,
target_node=target_node,
relationship_type=relationship_type,
confidence_score=confidence_score,
provider=self.name,
raw_data=raw_data,
discovery_method=discovery_method

View File

@@ -2,6 +2,7 @@
import re
from typing import Dict, Any, List
from datetime import datetime, timezone
from .base_provider import BaseProvider
from core.provider_result import ProviderResult
@@ -10,6 +11,7 @@ from core.graph_manager import NodeType, GraphManager
class CorrelationProvider(BaseProvider):
"""
A provider that finds correlations between nodes in the graph.
UPDATED: Enhanced with discovery timestamps for time-based edge coloring.
"""
def __init__(self, name: str = "correlation", session_config=None):
@@ -61,12 +63,14 @@ class CorrelationProvider(BaseProvider):
def query_domain(self, domain: str) -> ProviderResult:
"""
Query the provider for information about a domain.
UPDATED: Enhanced with discovery timestamps for time-based edge coloring.
"""
return self._find_correlations(domain)
def query_ip(self, ip: str) -> ProviderResult:
"""
Query the provider for information about an IP address.
UPDATED: Enhanced with discovery timestamps for time-based edge coloring.
"""
return self._find_correlations(ip)
@@ -79,8 +83,10 @@ class CorrelationProvider(BaseProvider):
def _find_correlations(self, node_id: str) -> ProviderResult:
"""
Find correlations for a given node with enhanced filtering and error handling.
UPDATED: Enhanced with discovery timestamps for time-based edge coloring.
"""
result = ProviderResult()
discovery_time = datetime.now(timezone.utc)
# Enhanced safety checks
if not self.graph or not self.graph.graph.has_node(node_id):
@@ -133,7 +139,7 @@ class CorrelationProvider(BaseProvider):
# Create correlation if we have multiple nodes with this value
if len(self.correlation_index[attr_value]['nodes']) > 1:
self._create_correlation_relationships(attr_value, self.correlation_index[attr_value], result)
self._create_correlation_relationships(attr_value, self.correlation_index[attr_value], result, discovery_time)
correlations_found += 1
# Log correlation results
@@ -187,9 +193,11 @@ class CorrelationProvider(BaseProvider):
return False
def _create_correlation_relationships(self, value: Any, correlation_data: Dict[str, Any], result: ProviderResult):
def _create_correlation_relationships(self, value: Any, correlation_data: Dict[str, Any],
result: ProviderResult, discovery_time: datetime):
"""
Create correlation relationships with enhanced deduplication and validation.
UPDATED: Enhanced with discovery timestamps for time-based edge coloring.
"""
correlation_node_id = f"corr_{hash(str(value)) & 0x7FFFFFFF}"
nodes = correlation_data['nodes']
@@ -216,7 +224,6 @@ class CorrelationProvider(BaseProvider):
value=value,
attr_type=str(type(value).__name__),
provider=self.name,
confidence=0.9,
metadata={
'correlated_nodes': list(nodes),
'sources': sources,
@@ -225,7 +232,7 @@ class CorrelationProvider(BaseProvider):
}
)
# Create relationships with source validation
# Create relationships with source validation and enhanced timestamps
created_relationships = set()
for source in sources:
@@ -240,19 +247,23 @@ class CorrelationProvider(BaseProvider):
relationship_label = f"corr_{provider}_{attribute}"
# Enhanced raw_data with discovery timestamp for time-based edge coloring
raw_data = {
'correlation_value': value,
'original_attribute': attribute,
'correlation_type': 'attribute_matching',
'correlation_size': len(nodes),
'discovery_timestamp': discovery_time.isoformat(),
'relevance_timestamp': discovery_time.isoformat() # Correlation data is "fresh" when discovered
}
# Add the relationship to the result
result.add_relationship(
source_node=node_id,
target_node=correlation_node_id,
relationship_type=relationship_label,
provider=self.name,
confidence=0.9,
raw_data={
'correlation_value': value,
'original_attribute': attribute,
'correlation_type': 'attribute_matching',
'correlation_size': len(nodes)
}
raw_data=raw_data
)
created_relationships.add(relationship_key)

View File

@@ -18,6 +18,7 @@ class CrtShProvider(BaseProvider):
Provider for querying crt.sh certificate transparency database.
FIXED: Improved caching logic and error handling to prevent infinite retry loops.
Returns standardized ProviderResult objects with caching support.
UPDATED: Enhanced with certificate timestamps for time-based edge coloring.
"""
def __init__(self, name=None, session_config=None):
@@ -131,6 +132,7 @@ class CrtShProvider(BaseProvider):
def query_domain(self, domain: str) -> ProviderResult:
"""
FIXED: Simplified and more robust domain querying with better error handling.
UPDATED: Enhanced with certificate timestamps for time-based edge coloring.
"""
if not _is_valid_domain(domain):
return ProviderResult()
@@ -245,7 +247,6 @@ class CrtShProvider(BaseProvider):
target_node=rel_data.get("target_node", ""),
relationship_type=rel_data.get("relationship_type", ""),
provider=rel_data.get("provider", self.name),
confidence=float(rel_data.get("confidence", 0.8)),
raw_data=rel_data.get("raw_data", {})
)
except (ValueError, TypeError) as e:
@@ -265,7 +266,6 @@ class CrtShProvider(BaseProvider):
value=attr_data.get("value"),
attr_type=attr_data.get("type", "unknown"),
provider=attr_data.get("provider", self.name),
confidence=float(attr_data.get("confidence", 0.9)),
metadata=attr_data.get("metadata", {})
)
except (ValueError, TypeError) as e:
@@ -293,7 +293,6 @@ class CrtShProvider(BaseProvider):
"source_node": rel.source_node,
"target_node": rel.target_node,
"relationship_type": rel.relationship_type,
"confidence": rel.confidence,
"provider": rel.provider,
"raw_data": rel.raw_data
} for rel in result.relationships
@@ -305,7 +304,6 @@ class CrtShProvider(BaseProvider):
"value": attr.value,
"type": attr.type,
"provider": attr.provider,
"confidence": attr.confidence,
"metadata": attr.metadata
} for attr in result.attributes
]
@@ -372,6 +370,7 @@ class CrtShProvider(BaseProvider):
"""
Process certificates to create proper domain and CA nodes.
FIXED: Better error handling and progress tracking.
UPDATED: Enhanced with certificate timestamps for time-based edge coloring.
"""
result = ProviderResult()
@@ -391,8 +390,7 @@ class CrtShProvider(BaseProvider):
name="crtsh_data_warning",
value=incompleteness_warning,
attr_type='metadata',
provider=self.name,
confidence=1.0
provider=self.name
)
all_discovered_domains = set()
@@ -415,16 +413,28 @@ class CrtShProvider(BaseProvider):
if cert_domains:
all_discovered_domains.update(cert_domains)
# Create CA nodes for certificate issuers
# Create CA nodes for certificate issuers with timestamp
issuer_name = self._parse_issuer_organization(cert_data.get('issuer_name', ''))
if issuer_name and issuer_name not in processed_issuers:
# Enhanced raw_data with certificate timestamp for time-based edge coloring
issuer_raw_data = {'issuer_dn': cert_data.get('issuer_name', '')}
# Add certificate issue date (not_before) as relevance timestamp
not_before = cert_data.get('not_before')
if not_before:
try:
not_before_date = self._parse_certificate_date(not_before)
issuer_raw_data['cert_not_before'] = not_before_date.isoformat()
issuer_raw_data['relevance_timestamp'] = not_before_date.isoformat() # Standardized field
except Exception as e:
self.logger.logger.debug(f"Failed to parse not_before date for issuer: {e}")
result.add_relationship(
source_node=query_domain,
target_node=issuer_name,
relationship_type='crtsh_cert_issuer',
provider=self.name,
confidence=0.95,
raw_data={'issuer_dn': cert_data.get('issuer_name', '')}
raw_data=issuer_raw_data
)
processed_issuers.add(issuer_name)
@@ -442,7 +452,6 @@ class CrtShProvider(BaseProvider):
value=value,
attr_type='certificate_data',
provider=self.name,
confidence=0.9,
metadata={'certificate_id': cert_data.get('id')}
)
@@ -457,7 +466,7 @@ class CrtShProvider(BaseProvider):
self.logger.logger.info(f"CrtSh query cancelled before relationship creation for domain: {query_domain}")
return result
# Create selective relationships to avoid large entities
# Create selective relationships to avoid large entities with enhanced timestamps
relationships_created = 0
for discovered_domain in all_discovered_domains:
if discovered_domain == query_domain:
@@ -467,25 +476,36 @@ class CrtShProvider(BaseProvider):
continue
if self._should_create_relationship(query_domain, discovered_domain):
confidence = self._calculate_domain_relationship_confidence(
query_domain, discovered_domain, [], all_discovered_domains
# Enhanced raw_data with certificate timestamp for domain relationships
domain_raw_data = {'relationship_type': 'certificate_discovery'}
# Find the most recent certificate for this domain pair to use as timestamp
most_recent_cert = self._find_most_recent_cert_for_domains(
certificates, query_domain, discovered_domain
)
if most_recent_cert:
not_before = most_recent_cert.get('not_before')
if not_before:
try:
not_before_date = self._parse_certificate_date(not_before)
domain_raw_data['cert_not_before'] = not_before_date.isoformat()
domain_raw_data['relevance_timestamp'] = not_before_date.isoformat()
except Exception as e:
self.logger.logger.debug(f"Failed to parse not_before date for domain relationship: {e}")
result.add_relationship(
source_node=query_domain,
target_node=discovered_domain,
relationship_type='crtsh_san_certificate',
provider=self.name,
confidence=confidence,
raw_data={'relationship_type': 'certificate_discovery'}
raw_data=domain_raw_data
)
self.log_relationship_discovery(
source_node=query_domain,
target_node=discovered_domain,
relationship_type='crtsh_san_certificate',
confidence_score=confidence,
raw_data={'relationship_type': 'certificate_discovery'},
raw_data=domain_raw_data,
discovery_method="certificate_transparency_analysis"
)
relationships_created += 1
@@ -493,6 +513,31 @@ class CrtShProvider(BaseProvider):
self.logger.logger.info(f"CrtSh processing completed for {query_domain}: processed {processed_certs}/{len(certificates)} certificates, {len(all_discovered_domains)} domains, {relationships_created} relationships")
return result
def _find_most_recent_cert_for_domains(self, certificates: List[Dict[str, Any]],
domain1: str, domain2: str) -> Optional[Dict[str, Any]]:
"""
Find the most recent certificate that contains both domains.
Used for determining the relevance timestamp for domain relationships.
"""
most_recent_cert = None
most_recent_date = None
for cert in certificates:
# Check if this certificate contains both domains
cert_domains = self._extract_domains_from_certificate(cert)
if domain1 in cert_domains and domain2 in cert_domains:
not_before = cert.get('not_before')
if not_before:
try:
cert_date = self._parse_certificate_date(not_before)
if most_recent_date is None or cert_date > most_recent_date:
most_recent_date = cert_date
most_recent_cert = cert
except Exception:
continue
return most_recent_cert
# [Rest of the methods remain the same as in the original file]
def _should_create_relationship(self, source_domain: str, target_domain: str) -> bool:
"""
@@ -664,25 +709,6 @@ class CrtShProvider(BaseProvider):
return [d for d in final_domains if _is_valid_domain(d)]
def _calculate_domain_relationship_confidence(self, domain1: str, domain2: str,
shared_certificates: List[Dict[str, Any]],
all_discovered_domains: Set[str]) -> float:
"""Calculate confidence score for domain relationship based on various factors."""
base_confidence = 0.9
relationship_context = self._determine_relationship_context(domain2, domain1)
if relationship_context == 'exact_match':
context_bonus = 0.0
elif relationship_context == 'subdomain':
context_bonus = 0.1
elif relationship_context == 'parent_domain':
context_bonus = 0.05
else:
context_bonus = 0.0
final_confidence = base_confidence + context_bonus
return max(0.1, min(1.0, final_confidence))
def _determine_relationship_context(self, cert_domain: str, query_domain: str) -> str:
"""Determine the context of the relationship between certificate domain and query domain."""

View File

@@ -2,6 +2,7 @@
from dns import resolver, reversename
from typing import Dict
from datetime import datetime, timezone
from .base_provider import BaseProvider
from core.provider_result import ProviderResult
from utils.helpers import _is_valid_ip, _is_valid_domain, get_ip_version
@@ -11,6 +12,7 @@ class DNSProvider(BaseProvider):
"""
Provider for standard DNS resolution and reverse DNS lookups.
Now returns standardized ProviderResult objects with IPv4 and IPv6 support.
UPDATED: Enhanced with discovery timestamps for time-based edge coloring.
"""
def __init__(self, name=None, session_config=None):
@@ -51,6 +53,7 @@ class DNSProvider(BaseProvider):
"""
Query DNS records for the domain to discover relationships and attributes.
FIXED: Now creates separate attributes for each DNS record type.
UPDATED: Enhanced with discovery timestamps for time-based edge coloring.
Args:
domain: Domain to investigate
@@ -62,11 +65,12 @@ class DNSProvider(BaseProvider):
return ProviderResult()
result = ProviderResult()
discovery_time = datetime.now(timezone.utc)
# Query all record types - each gets its own attribute
for record_type in ['A', 'AAAA', 'CNAME', 'MX', 'NS', 'SOA', 'TXT', 'SRV', 'CAA']:
try:
self._query_record(domain, record_type, result)
self._query_record(domain, record_type, result, discovery_time)
#except resolver.NoAnswer:
# This is not an error, just a confirmation that the record doesn't exist.
#self.logger.logger.debug(f"No {record_type} record found for {domain}")
@@ -79,6 +83,7 @@ class DNSProvider(BaseProvider):
def query_ip(self, ip: str) -> ProviderResult:
"""
Query reverse DNS for the IP address (supports both IPv4 and IPv6).
UPDATED: Enhanced with discovery timestamps for time-based edge coloring.
Args:
ip: IP address to investigate (IPv4 or IPv6)
@@ -91,6 +96,7 @@ class DNSProvider(BaseProvider):
result = ProviderResult()
ip_version = get_ip_version(ip)
discovery_time = datetime.now(timezone.utc)
try:
# Perform reverse DNS lookup (works for both IPv4 and IPv6)
@@ -112,20 +118,24 @@ class DNSProvider(BaseProvider):
relationship_type = 'dns_a_record'
record_prefix = 'A'
# Enhanced raw_data with discovery timestamp for time-based edge coloring
raw_data = {
'query_type': 'PTR',
'ip_address': ip,
'ip_version': ip_version,
'hostname': hostname,
'ttl': response.ttl,
'discovery_timestamp': discovery_time.isoformat(),
'relevance_timestamp': discovery_time.isoformat() # DNS data is "fresh" when discovered
}
# Add the relationship
result.add_relationship(
source_node=ip,
target_node=hostname,
relationship_type='dns_ptr_record',
provider=self.name,
confidence=0.8,
raw_data={
'query_type': 'PTR',
'ip_address': ip,
'ip_version': ip_version,
'hostname': hostname,
'ttl': response.ttl
}
raw_data=raw_data
)
# Add to PTR records list
@@ -136,14 +146,7 @@ class DNSProvider(BaseProvider):
source_node=ip,
target_node=hostname,
relationship_type='dns_ptr_record',
confidence_score=0.8,
raw_data={
'query_type': 'PTR',
'ip_address': ip,
'ip_version': ip_version,
'hostname': hostname,
'ttl': response.ttl
},
raw_data=raw_data,
discovery_method=f"reverse_dns_lookup_ipv{ip_version}"
)
@@ -155,7 +158,6 @@ class DNSProvider(BaseProvider):
value=ptr_records,
attr_type='dns_record',
provider=self.name,
confidence=0.8,
metadata={'ttl': response.ttl, 'ip_version': ip_version}
)
@@ -170,10 +172,11 @@ class DNSProvider(BaseProvider):
return result
def _query_record(self, domain: str, record_type: str, result: ProviderResult) -> None:
def _query_record(self, domain: str, record_type: str, result: ProviderResult, discovery_time: datetime) -> None:
"""
FIXED: Query DNS records with unique attribute names for each record type.
Enhanced to better handle IPv6 AAAA records.
UPDATED: Enhanced with discovery timestamps for time-based edge coloring.
"""
try:
self.total_requests += 1
@@ -217,18 +220,20 @@ class DNSProvider(BaseProvider):
if record_type in ['A', 'AAAA'] and _is_valid_ip(target):
ip_version = get_ip_version(target)
# Enhanced raw_data with discovery timestamp for time-based edge coloring
raw_data = {
'query_type': record_type,
'domain': domain,
'value': target,
'ttl': response.ttl
'ttl': response.ttl,
'discovery_timestamp': discovery_time.isoformat(),
'relevance_timestamp': discovery_time.isoformat() # DNS data is "fresh" when discovered
}
if ip_version:
raw_data['ip_version'] = ip_version
relationship_type = f"dns_{record_type.lower()}_record"
confidence = 0.8
# Add relationship
result.add_relationship(
@@ -236,7 +241,6 @@ class DNSProvider(BaseProvider):
target_node=target,
relationship_type=relationship_type,
provider=self.name,
confidence=confidence,
raw_data=raw_data
)
@@ -252,7 +256,6 @@ class DNSProvider(BaseProvider):
source_node=domain,
target_node=target,
relationship_type=relationship_type,
confidence_score=confidence,
raw_data=raw_data,
discovery_method=discovery_method
)
@@ -276,7 +279,6 @@ class DNSProvider(BaseProvider):
value=dns_records,
attr_type='dns_record_list',
provider=self.name,
confidence=0.8,
metadata=metadata
)

View File

@@ -15,6 +15,7 @@ class ShodanProvider(BaseProvider):
"""
Provider for querying Shodan API for IP address information.
Now returns standardized ProviderResult objects with caching support for IPv4 and IPv6.
UPDATED: Enhanced with last_seen timestamp for time-based edge coloring.
"""
def __init__(self, name=None, session_config=None):
@@ -145,6 +146,7 @@ class ShodanProvider(BaseProvider):
"""
Query Shodan for information about an IP address (IPv4 or IPv6), with caching of processed data.
FIXED: Proper 404 handling to prevent unnecessary retries.
UPDATED: Enhanced with last_seen timestamp extraction for time-based edge coloring.
Args:
ip: IP address to investigate (IPv4 or IPv6)
@@ -304,7 +306,6 @@ class ShodanProvider(BaseProvider):
target_node=rel_data["target_node"],
relationship_type=rel_data["relationship_type"],
provider=rel_data["provider"],
confidence=rel_data["confidence"],
raw_data=rel_data.get("raw_data", {})
)
@@ -316,7 +317,6 @@ class ShodanProvider(BaseProvider):
value=attr_data["value"],
attr_type=attr_data["type"],
provider=attr_data["provider"],
confidence=attr_data["confidence"],
metadata=attr_data.get("metadata", {})
)
@@ -336,7 +336,6 @@ class ShodanProvider(BaseProvider):
"source_node": rel.source_node,
"target_node": rel.target_node,
"relationship_type": rel.relationship_type,
"confidence": rel.confidence,
"provider": rel.provider,
"raw_data": rel.raw_data
} for rel in result.relationships
@@ -348,7 +347,6 @@ class ShodanProvider(BaseProvider):
"value": attr.value,
"type": attr.type,
"provider": attr.provider,
"confidence": attr.confidence,
"metadata": attr.metadata
} for attr in result.attributes
]
@@ -362,25 +360,40 @@ class ShodanProvider(BaseProvider):
"""
VERIFIED: Process Shodan data creating ISP nodes with ASN attributes and proper relationships.
Enhanced to include IP version information for IPv6 addresses.
UPDATED: Enhanced with last_seen timestamp for time-based edge coloring.
"""
result = ProviderResult()
# Determine IP version for metadata
ip_version = get_ip_version(ip)
# Extract last_seen timestamp for time-based edge coloring
last_seen = data.get('last_seen')
# VERIFIED: Extract ISP information and create proper ISP node with ASN
isp_name = data.get('org')
asn_value = data.get('asn')
if isp_name and asn_value:
# Enhanced raw_data with last_seen timestamp
raw_data = {
'asn': asn_value,
'shodan_org': isp_name,
'ip_version': ip_version
}
# Add last_seen timestamp if available
if last_seen:
raw_data['last_seen'] = last_seen
raw_data['relevance_timestamp'] = last_seen # Standardized field for time-based coloring
# Create relationship from IP to ISP
result.add_relationship(
source_node=ip,
target_node=isp_name,
relationship_type='shodan_isp',
provider=self.name,
confidence=0.9,
raw_data={'asn': asn_value, 'shodan_org': isp_name, 'ip_version': ip_version}
raw_data=raw_data
)
# Add ASN as attribute to the ISP node
@@ -390,7 +403,6 @@ class ShodanProvider(BaseProvider):
value=asn_value,
attr_type='isp_info',
provider=self.name,
confidence=0.9,
metadata={'description': 'Autonomous System Number from Shodan', 'ip_version': ip_version}
)
@@ -401,7 +413,6 @@ class ShodanProvider(BaseProvider):
value=isp_name,
attr_type='isp_info',
provider=self.name,
confidence=0.9,
metadata={'description': 'Organization name from Shodan', 'ip_version': ip_version}
)
@@ -416,20 +427,24 @@ class ShodanProvider(BaseProvider):
else:
relationship_type = 'shodan_a_record'
# Enhanced raw_data with last_seen timestamp
hostname_raw_data = {**data, 'ip_version': ip_version}
if last_seen:
hostname_raw_data['last_seen'] = last_seen
hostname_raw_data['relevance_timestamp'] = last_seen
result.add_relationship(
source_node=ip,
target_node=hostname,
relationship_type=relationship_type,
provider=self.name,
confidence=0.8,
raw_data={**data, 'ip_version': ip_version}
raw_data=hostname_raw_data
)
self.log_relationship_discovery(
source_node=ip,
target_node=hostname,
relationship_type=relationship_type,
confidence_score=0.8,
raw_data={**data, 'ip_version': ip_version},
raw_data=hostname_raw_data,
discovery_method=f"shodan_host_lookup_ipv{ip_version}"
)
elif key == 'ports':
@@ -441,7 +456,6 @@ class ShodanProvider(BaseProvider):
value=port,
attr_type='shodan_network_info',
provider=self.name,
confidence=0.9,
metadata={'ip_version': ip_version}
)
elif isinstance(value, (str, int, float, bool)) and value is not None:
@@ -452,7 +466,6 @@ class ShodanProvider(BaseProvider):
value=value,
attr_type='shodan_info',
provider=self.name,
confidence=0.9,
metadata={'ip_version': ip_version}
)