# dnsrecon-reduced/core/graph_manager.py

"""
Graph data model for DNSRecon using NetworkX.
Manages in-memory graph storage with confidence scoring and forensic metadata.
Now fully compatible with the unified ProviderResult data model.
UPDATED: Fixed correlation exclusion keys to match actual attribute names.
UPDATED: Removed export_json() method - now handled by ExportManager.
"""
import re
from datetime import datetime, timezone
from enum import Enum
from typing import Dict, List, Any, Optional, Tuple

import networkx as nx


class NodeType(Enum):
    """Enumeration of supported node types."""
    DOMAIN = "domain"
    IP = "ip"
    ISP = "isp"
    CA = "ca"
    LARGE_ENTITY = "large_entity"
    CORRELATION_OBJECT = "correlation_object"

    def __repr__(self):
        return self.value


class GraphManager:
    """
    Thread-safe graph manager for DNSRecon infrastructure mapping.
    Uses NetworkX for in-memory graph storage with confidence scoring.
    Compatible with unified ProviderResult data model.
    """

    def __init__(self):
        """Initialize empty directed graph."""
        self.graph = nx.DiGraph()
        self.creation_time = datetime.now(timezone.utc).isoformat()
        self.last_modified = self.creation_time
        self.correlation_index = {}
        # Compile regex for date filtering for efficiency
        self.date_pattern = re.compile(r'^\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}:\d{2}')
        
        # FIXED: Exclude cert_issuer_name since we already create proper CA relationships
        self.EXCLUDED_KEYS = [
            # Certificate metadata that creates noise or has dedicated node types
            'cert_source',                    # Always 'crtsh' for crtsh provider
            'cert_common_name',
            'cert_validity_period_days',      # Numerical, not useful for correlation
            'cert_issuer_name',              # FIXED: Has dedicated CA nodes, don't correlate
            #'cert_certificate_id',            # Unique per certificate
            #'cert_serial_number',            # Unique per certificate
            'cert_entry_timestamp',          # Timestamp, filtered by date regex anyway
            'cert_not_before',               # Date, filtered by date regex anyway
            'cert_not_after',                # Date, filtered by date regex anyway
            # DNS metadata that creates noise
            'dns_ttl',                       # TTL values are not meaningful for correlation
            # Shodan metadata that might create noise
            'timestamp',                     # Generic timestamp fields
            'last_update',                   # Generic timestamp fields
            #'org',                          # Too generic, causes false correlations
            #'isp',                          # Too generic, causes false correlations
            # Generic noisy attributes
            'updated_timestamp',             # Any timestamp field
            'discovery_timestamp',           # Any timestamp field
            'query_timestamp',               # Any timestamp field
        ]

    def __getstate__(self):
        """Prepare GraphManager for pickling, excluding compiled regex."""
        state = self.__dict__.copy()
        # Compiled regex patterns are not always picklable
        if 'date_pattern' in state:
            del state['date_pattern']
        return state

    def __setstate__(self, state):
        """Restore GraphManager state and recompile regex."""
        self.__dict__.update(state)
        self.date_pattern = re.compile(r'^\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}:\d{2}')

    def process_correlations_for_node(self, node_id: str):
        """
        UPDATED: Process correlations for a given node with enhanced tracking.
        Now properly tracks which attribute/provider created each correlation.
        """
        if not self.graph.has_node(node_id):
            return

        node_attributes = self.graph.nodes[node_id].get('attributes', [])
        
        # Process each attribute for potential correlations
        for attr in node_attributes:
            attr_name = attr.get('name')
            attr_value = attr.get('value')
            attr_provider = attr.get('provider', 'unknown')

            # IMPROVED: More comprehensive exclusion logic
            should_exclude = (
                # Check against excluded keys (exact match or substring)
                any(excluded_key in attr_name or attr_name == excluded_key for excluded_key in self.EXCLUDED_KEYS) or
                # Invalid value types
                not isinstance(attr_value, (str, int, float, bool)) or 
                attr_value is None or
                # Boolean values are not useful for correlation
                isinstance(attr_value, bool) or
                # String values that are too short or are dates
                (isinstance(attr_value, str) and (
                    len(attr_value) < 4 or 
                    self.date_pattern.match(attr_value) or
                    # Exclude common generic values that create noise
                    attr_value.lower() in ['unknown', 'none', 'null', 'n/a', 'true', 'false', '0', '1']
                )) or
                # Numerical values that are likely to be unique identifiers
                (isinstance(attr_value, (int, float)) and (
                    attr_value == 0 or  # Zero values are not meaningful
                    attr_value == 1 or  # One values are too common
                    abs(attr_value) > 1000000  # Very large numbers are likely IDs
                ))
            )

            if should_exclude:
                continue

            # Initialize correlation tracking for this value
            if attr_value not in self.correlation_index:
                self.correlation_index[attr_value] = {
                    'nodes': set(),
                    'sources': []  # Track which provider/attribute combinations contributed
                }

            # Add this node and source information
            self.correlation_index[attr_value]['nodes'].add(node_id)
            
            # Track the source of this correlation value
            source_info = {
                'node_id': node_id,
                'provider': attr_provider,
                'attribute': attr_name,
                'path': f"{attr_provider}_{attr_name}"
            }
            
            # Add source if not already present (avoid duplicates)
            existing_sources = [s for s in self.correlation_index[attr_value]['sources'] 
                              if s['node_id'] == node_id and s['path'] == source_info['path']]
            if not existing_sources:
                self.correlation_index[attr_value]['sources'].append(source_info)

            # Create correlation node if we have multiple nodes with this value
            if len(self.correlation_index[attr_value]['nodes']) > 1:
                self._create_enhanced_correlation_node_and_edges(attr_value, self.correlation_index[attr_value])

    def _create_enhanced_correlation_node_and_edges(self, value, correlation_data):
        """
        UPDATED: Create correlation node and edges with raw provider data (no formatting).
        """
        correlation_node_id = f"corr_{hash(str(value)) & 0x7FFFFFFF}"
        nodes = correlation_data['nodes']
        sources = correlation_data['sources']
        
        # Create or update correlation node
        if not self.graph.has_node(correlation_node_id):
            # Use raw provider/attribute data - no formatting
            provider_counts = {}
            for source in sources:
                # Keep original provider and attribute names
                key = f"{source['provider']}_{source['attribute']}"
                provider_counts[key] = provider_counts.get(key, 0) + 1
            
            # Use the most common provider/attribute as the primary label (raw)
            primary_source = max(provider_counts.items(), key=lambda x: x[1])[0] if provider_counts else "unknown_correlation"
            
            metadata = {
                'value': value,
                'correlated_nodes': list(nodes),
                'sources': sources,
                'primary_source': primary_source,
                'correlation_count': len(nodes)
            }
            
            self.add_node(correlation_node_id, NodeType.CORRELATION_OBJECT, metadata=metadata)
            #print(f"Created correlation node {correlation_node_id} for value '{value}' with {len(nodes)} nodes")

        # Create edges from each node to the correlation node
        for source in sources:
            node_id = source['node_id']
            provider = source['provider']
            attribute = source['attribute']
            
            if self.graph.has_node(node_id) and not self.graph.has_edge(node_id, correlation_node_id):
                # Format relationship label as "corr_provider_attribute"
                relationship_label = f"corr_{provider}_{attribute}"
                
                self.add_edge(
                    source_id=node_id,
                    target_id=correlation_node_id,
                    relationship_type=relationship_label,
                    confidence_score=0.9,
                    source_provider=provider,
                    raw_data={
                        'correlation_value': value,
                        'original_attribute': attribute,
                        'correlation_type': 'attribute_matching'
                    }
                )
                
                #print(f"Added correlation edge: {node_id} -> {correlation_node_id} ({relationship_label})")


    def _has_direct_edge_bidirectional(self, node_a: str, node_b: str) -> bool:
        """
        Check if there's a direct edge between two nodes in either direction.
        Returns True if node_aÃƒÂ¢Ã¢â‚¬ 'node_b OR node_bÃƒÂ¢Ã¢â‚¬ 'node_a exists.
        """
        return (self.graph.has_edge(node_a, node_b) or 
                self.graph.has_edge(node_b, node_a))

    def _correlation_value_matches_existing_node(self, correlation_value: str) -> bool:
        """
        Check if correlation value contains any existing node ID as substring.
        Returns True if match found (correlation node should NOT be created).
        """
        correlation_str = str(correlation_value).lower()
        
        # Check against all existing nodes
        for existing_node_id in self.graph.nodes():
            if existing_node_id.lower() in correlation_str:
                return True
        
        return False

    def _find_correlation_nodes_with_same_pattern(self, node_set: set) -> List[str]:
        """
        Find existing correlation nodes that have the exact same pattern of connected nodes.
        Returns list of correlation node IDs with matching patterns.
        """
        correlation_nodes = self.get_nodes_by_type(NodeType.CORRELATION_OBJECT)
        matching_nodes = []
        
        for corr_node_id in correlation_nodes:
            # Get all nodes connected to this correlation node
            connected_nodes = set()
            
            # Add all predecessors (nodes pointing TO the correlation node)
            connected_nodes.update(self.graph.predecessors(corr_node_id))
            
            # Add all successors (nodes pointed TO by the correlation node)  
            connected_nodes.update(self.graph.successors(corr_node_id))
            
            # Check if the pattern matches exactly
            if connected_nodes == node_set:
                matching_nodes.append(corr_node_id)
        
        return matching_nodes

    def _merge_correlation_values(self, target_node_id: str, new_value: Any, corr_data: Dict) -> None:
        """
        Merge a new correlation value into an existing correlation node.
        Uses same logic as large entity merging.
        """
        if not self.graph.has_node(target_node_id):
            return
            
        target_metadata = self.graph.nodes[target_node_id]['metadata']
        
        # Get existing values (ensure it's a list)
        existing_values = target_metadata.get('values', [])
        if not isinstance(existing_values, list):
            existing_values = [existing_values]
        
        # Add new value if not already present
        if new_value not in existing_values:
            existing_values.append(new_value)
            
        # Merge sources
        existing_sources = target_metadata.get('sources', [])
        new_sources = corr_data.get('sources', [])
        
        # Create set of unique sources based on (node_id, path) tuples
        source_set = set()
        for source in existing_sources + new_sources:
            source_tuple = (source['node_id'], source.get('path', ''))
            source_set.add(source_tuple)
        
        # Convert back to list of dictionaries
        merged_sources = [{'node_id': nid, 'path': path} for nid, path in source_set]
        
        # Update metadata
        target_metadata.update({
            'values': existing_values,
            'sources': merged_sources,
            'correlated_nodes': list(set(target_metadata.get('correlated_nodes', []) + corr_data.get('nodes', []))),
            'merge_count': len(existing_values),
            'last_merge_timestamp': datetime.now(timezone.utc).isoformat()
        })
        
        # Update description to reflect merged nature
        value_count = len(existing_values)
        node_count = len(target_metadata['correlated_nodes'])
        self.graph.nodes[target_node_id]['description'] = (
            f"Correlation container with {value_count} merged values "
            f"across {node_count} nodes"
        )
        
    def add_node(self, node_id: str, node_type: NodeType, attributes: Optional[List[Dict[str, Any]]] = None,
                description: str = "", metadata: Optional[Dict[str, Any]] = None) -> bool:
        """
        Add a node to the graph, update attributes, and process correlations.
        Now compatible with unified data model - attributes are dictionaries from converted StandardAttribute objects.
        """
        is_new_node = not self.graph.has_node(node_id)
        if is_new_node:
            self.graph.add_node(node_id, type=node_type.value,
                                added_timestamp=datetime.now(timezone.utc).isoformat(),
                                attributes=attributes or [],  # Store as a list from the start
                                description=description,
                                metadata=metadata or {})
        else:
            # Safely merge new attributes into the existing list of attributes
            if attributes:
                existing_attributes = self.graph.nodes[node_id].get('attributes', [])
                
                # Handle cases where old data might still be in dictionary format
                if not isinstance(existing_attributes, list):
                    existing_attributes = []

                # Create a set of existing attribute names for efficient duplicate checking
                existing_attr_names = {attr['name'] for attr in existing_attributes}

                for new_attr in attributes:
                    if new_attr['name'] not in existing_attr_names:
                        existing_attributes.append(new_attr)
                        existing_attr_names.add(new_attr['name'])
                
                self.graph.nodes[node_id]['attributes'] = existing_attributes
            if description:
                self.graph.nodes[node_id]['description'] = description
            if metadata:
                existing_metadata = self.graph.nodes[node_id].get('metadata', {})
                existing_metadata.update(metadata)
                self.graph.nodes[node_id]['metadata'] = existing_metadata

        self.last_modified = datetime.now(timezone.utc).isoformat()
        return is_new_node

    def add_edge(self, source_id: str, target_id: str, relationship_type: str,
                confidence_score: float = 0.5, source_provider: str = "unknown",
                raw_data: Optional[Dict[str, Any]] = None) -> bool:
        """
        UPDATED: Add or update an edge between two nodes with raw relationship labels.
        """
        if not self.graph.has_node(source_id) or not self.graph.has_node(target_id):
            return False

        new_confidence = confidence_score
        
        # UPDATED: Use raw relationship type - no formatting
        edge_label = relationship_type
        
        if self.graph.has_edge(source_id, target_id):
            # If edge exists, update confidence if the new score is higher.
            if new_confidence > self.graph.edges[source_id, target_id].get('confidence_score', 0):
                self.graph.edges[source_id, target_id]['confidence_score'] = new_confidence
                self.graph.edges[source_id, target_id]['updated_timestamp'] = datetime.now(timezone.utc).isoformat()
                self.graph.edges[source_id, target_id]['updated_by'] = source_provider
            return False

        # Add a new edge with raw attributes
        self.graph.add_edge(source_id, target_id,
                            relationship_type=edge_label,
                            confidence_score=new_confidence,
                            source_provider=source_provider,
                            discovery_timestamp=datetime.now(timezone.utc).isoformat(),
                            raw_data=raw_data or {})
        self.last_modified = datetime.now(timezone.utc).isoformat()
        return True

    def extract_node_from_large_entity(self, large_entity_id: str, node_id_to_extract: str) -> bool:
        """
        Removes a node from a large entity's internal lists and updates its count.
        This prepares the large entity for the node's promotion to a regular node.
        """
        if not self.graph.has_node(large_entity_id):
            return False
            
        node_data = self.graph.nodes[large_entity_id]
        attributes = node_data.get('attributes', [])
        
        # Find the 'nodes' attribute dictionary in the list
        nodes_attr = next((attr for attr in attributes if attr.get('name') == 'nodes'), None)
        
        # Remove from the list of member nodes
        if nodes_attr and 'value' in nodes_attr and isinstance(nodes_attr['value'], list) and node_id_to_extract in nodes_attr['value']:
            nodes_attr['value'].remove(node_id_to_extract)
            
            # Find the 'count' attribute and update it
            count_attr = next((attr for attr in attributes if attr.get('name') == 'count'), None)
            if count_attr:
                count_attr['value'] = len(nodes_attr['value'])
        else:
            # This can happen if the node was already extracted, which is not an error.
            print(f"Warning: Node {node_id_to_extract} not found in the 'nodes' list of {large_entity_id}.")
            return True # Proceed as if successful
            
        self.last_modified = datetime.now(timezone.utc).isoformat()
        return True

    def remove_node(self, node_id: str) -> bool:
        """Remove a node and its connected edges from the graph."""
        if not self.graph.has_node(node_id):
            return False

        # Remove node from the graph (NetworkX handles removing connected edges)
        self.graph.remove_node(node_id)

        # Clean up the correlation index
        keys_to_delete = []
        for value, data in self.correlation_index.items():
            if isinstance(data, dict) and 'nodes' in data:
                # Updated correlation structure
                if node_id in data['nodes']:
                    data['nodes'].discard(node_id)
                    # Remove sources for this node
                    data['sources'] = [s for s in data['sources'] if s['node_id'] != node_id]
                if not data['nodes']:  # If no other nodes are associated, remove it
                    keys_to_delete.append(value)
            else:
                # Legacy correlation structure (fallback)
                if isinstance(data, set) and node_id in data:
                    data.discard(node_id)
                if not data:
                    keys_to_delete.append(value)
        
        for key in keys_to_delete:
            if key in self.correlation_index:
                del self.correlation_index[key]

        self.last_modified = datetime.now(timezone.utc).isoformat()
        return True

    def get_node_count(self) -> int:
        """Get total number of nodes in the graph."""
        return self.graph.number_of_nodes()

    def get_edge_count(self) -> int:
        """Get total number of edges in the graph."""
        return self.graph.number_of_edges()

    def get_nodes_by_type(self, node_type: NodeType) -> List[str]:
        """Get all nodes of a specific type."""
        return [n for n, d in self.graph.nodes(data=True) if d.get('type') == node_type.value]

    def get_high_confidence_edges(self, min_confidence: float = 0.8) -> List[Tuple[str, str, Dict]]:
        """Get edges with confidence score above a given threshold."""
        return [(u, v, d) for u, v, d in self.graph.edges(data=True)
                if d.get('confidence_score', 0) >= min_confidence]

    def get_graph_data(self) -> Dict[str, Any]:
        """
        Export graph data formatted for frontend visualization.
        SIMPLIFIED: No certificate styling - frontend handles all visual styling.
        """
        nodes = []
        for node_id, attrs in self.graph.nodes(data=True):
            node_data = {
                'id': node_id, 
                'label': node_id, 
                'type': attrs.get('type', 'unknown'),
                'attributes': attrs.get('attributes', []), # Raw attributes list
                'description': attrs.get('description', ''),
                'metadata': attrs.get('metadata', {}),
                'added_timestamp': attrs.get('added_timestamp'),
                'max_depth_reached': attrs.get('metadata', {}).get('max_depth_reached', False)
            }
            
            # Add incoming and outgoing edges to node data
            if self.graph.has_node(node_id):
                node_data['incoming_edges'] = [
                    {'from': u, 'data': d} for u, _, d in self.graph.in_edges(node_id, data=True)
                ]
                node_data['outgoing_edges'] = [
                    {'to': v, 'data': d} for _, v, d in self.graph.out_edges(node_id, data=True)
                ]
            
            nodes.append(node_data)

        edges = []
        for source, target, attrs in self.graph.edges(data=True):
            edges.append({
                'from': source, 
                'to': target,
                'label': attrs.get('relationship_type', ''),
                'confidence_score': attrs.get('confidence_score', 0),
                'source_provider': attrs.get('source_provider', ''),
                'discovery_timestamp': attrs.get('discovery_timestamp')
            })
        
        return {
            'nodes': nodes, 
            'edges': edges,
            'statistics': self.get_statistics()['basic_metrics']
        }

    def _get_confidence_distribution(self) -> Dict[str, int]:
        """Get distribution of edge confidence scores with empty graph handling."""
        distribution = {'high': 0, 'medium': 0, 'low': 0}
        
        # FIXED: Handle empty graph case
        if self.get_edge_count() == 0:
            return distribution
            
        for _, _, data in self.graph.edges(data=True):
            confidence = data.get('confidence_score', 0)
            if confidence >= 0.8:
                distribution['high'] += 1
            elif confidence >= 0.6:
                distribution['medium'] += 1
            else:
                distribution['low'] += 1
        return distribution

    def get_statistics(self) -> Dict[str, Any]:
        """Get comprehensive statistics about the graph with proper empty graph handling."""
        
        # FIXED: Handle empty graph case properly
        node_count = self.get_node_count()
        edge_count = self.get_edge_count()
        
        stats = {
            'basic_metrics': {
                'total_nodes': node_count,
                'total_edges': edge_count,
                'creation_time': self.creation_time,
                'last_modified': self.last_modified
            },
            'node_type_distribution': {}, 
            'relationship_type_distribution': {},
            'confidence_distribution': self._get_confidence_distribution(),
            'provider_distribution': {}
        }
        
        # FIXED: Only calculate distributions if we have data
        if node_count > 0:
            # Calculate node type distributions
            for node_type in NodeType:
                count = len(self.get_nodes_by_type(node_type))
                if count > 0:  # Only include types that exist
                    stats['node_type_distribution'][node_type.value] = count
        
        if edge_count > 0:
            # Calculate edge distributions
            for _, _, data in self.graph.edges(data=True):
                rel_type = data.get('relationship_type', 'unknown')
                stats['relationship_type_distribution'][rel_type] = stats['relationship_type_distribution'].get(rel_type, 0) + 1
                
                provider = data.get('source_provider', 'unknown')
                stats['provider_distribution'][provider] = stats['provider_distribution'].get(provider, 0) + 1
        
        return stats

    def clear(self) -> None:
        """Clear all nodes, edges, and indices from the graph."""
        self.graph.clear()
        self.correlation_index.clear()
        self.creation_time = datetime.now(timezone.utc).isoformat()
        self.last_modified = self.creation_time