#!/usr/bin/env python3
"""
AI Model Evaluation Results Analyzer

Compares results across different models and quantizations.
Includes an interactive web interface for visualization and analysis.
"""

import argparse
import csv
import json
from collections import defaultdict
from pathlib import Path
from typing import Dict, List

import numpy as np
from flask import Flask, jsonify, render_template
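

# Note: scores of 0 are legitimate on the 0-5 scale, but the
# ``test.get('score') or test.get('overall_score')`` idiom treats 0 as
# missing. This helper falls back to 'overall_score' only when 'score'
# is actually absent (None), so 0 scores are kept.
def _extract_score(test: Dict):
    """Return a test's score, preferring 'score' over 'overall_score'."""
    score = test.get('score')
    return score if score is not None else test.get('overall_score')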


class ResultsAnalyzer:
    def __init__(self, results_dir: str = "results"):
        self.results_dir = Path(results_dir)

    def load_result_file(self, filepath: Path) -> Dict:
        """Load a single result file."""
        with open(filepath, 'r', encoding='utf-8') as f:
            return json.load(f)

    def find_result_files(self, pattern: str = "*_latest.json") -> List[Path]:
        """Find all result files matching the given glob pattern."""
        return sorted(self.results_dir.glob(pattern))

    def extract_scores_by_category(self, results: Dict) -> Dict[str, List[float]]:
        """Extract scores organized by category."""
        scores_by_category = defaultdict(list)

        for test in results.get('test_results', []):
            category = test.get('category', 'Unknown')
            score = _extract_score(test)

            if score is not None:
                scores_by_category[category].append(score)

        return dict(scores_by_category)

    def calculate_statistics(self, scores: List[float]) -> Dict:
        """Calculate summary statistics for a list of scores."""
        if not scores:
            return {
                'count': 0,
                'average': 0.0,
                'min': 0.0,
                'max': 0.0,
                'pass_rate': 0.0,
                'exceptional_rate': 0.0
            }

        return {
            'count': len(scores),
            'average': sum(scores) / len(scores),
            'min': min(scores),
            'max': max(scores),
            # On the 0-5 scale, >= 2 counts as a pass and >= 4 as exceptional.
            'pass_rate': len([s for s in scores if s >= 2]) / len(scores) * 100,
            'exceptional_rate': len([s for s in scores if s >= 4]) / len(scores) * 100
        }
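
    # Illustrative example (made-up values, not real results):
    #   calculate_statistics([1, 2, 4, 5]) ->
    #     {'count': 4, 'average': 3.0, 'min': 1, 'max': 5,
    #      'pass_rate': 75.0, 'exceptional_rate': 50.0}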

    def compare_models(self, result_files: List[Path]):
        """Print a comparison report for multiple models."""
        print("\n" + "="*100)
        print("📊 AI MODEL COMPARISON REPORT")
        print("="*100)

        # Load all results
        all_results = {}
        for filepath in result_files:
            try:
                results = self.load_result_file(filepath)
                model_name = results['metadata']['model_name']
                all_results[model_name] = results
            except Exception as e:
                print(f"⚠️ Error loading {filepath}: {e}")

        if not all_results:
            print("❌ No valid result files found")
            return

        # Overall comparison
        print("\n📈 OVERALL PERFORMANCE")
        print("-"*100)
        print(f"{'Model':<30} {'Total Tests':<12} {'Avg Score':<12} {'Pass Rate':<12} {'Exceptional':<12}")
        print("-"*100)

        model_stats = {}
        for model_name, results in sorted(all_results.items()):
            all_scores = [_extract_score(test) for test in results.get('test_results', [])]
            all_scores = [s for s in all_scores if s is not None]

            stats = self.calculate_statistics(all_scores)
            model_stats[model_name] = stats

            print(f"{model_name:<30} {stats['count']:<12} {stats['average']:<12.2f} "
                  f"{stats['pass_rate']:<12.1f}% {stats['exceptional_rate']:<12.1f}%")

        # Category-wise comparison
        print("\n\n📂 CATEGORY-WISE PERFORMANCE")
        print("="*100)

        # Collect all unique categories
        all_categories = set()
        for results in all_results.values():
            for test in results.get('test_results', []):
                all_categories.add(test.get('category', 'Unknown'))

        for category in sorted(all_categories):
            print(f"\n🔖 {category}")
            print("-"*100)
            print(f"{'Model':<30} {'Tests':<8} {'Avg Score':<12} {'Pass Rate':<12} {'Exceptional':<12}")
            print("-"*100)

            for model_name, results in sorted(all_results.items()):
                cat_scores = [
                    _extract_score(test)
                    for test in results.get('test_results', [])
                    if test.get('category') == category and _extract_score(test) is not None
                ]

                if cat_scores:
                    stats = self.calculate_statistics(cat_scores)
                    print(f"{model_name:<30} {stats['count']:<8} {stats['average']:<12.2f} "
                          f"{stats['pass_rate']:<12.1f}% {stats['exceptional_rate']:<12.1f}%")
                else:
                    print(f"{model_name:<30} {'N/A':<8} {'N/A':<12} {'N/A':<12} {'N/A':<12}")

        # Difficulty-based comparison
        print("\n\n⚡ DIFFICULTY-BASED PERFORMANCE")
        print("="*100)

        difficulties = ['medium', 'hard', 'very_hard']
        for difficulty in difficulties:
            print(f"\n🎯 Difficulty: {difficulty.replace('_', ' ').title()}")
            print("-"*100)
            print(f"{'Model':<30} {'Tests':<8} {'Avg Score':<12} {'Pass Rate':<12}")
            print("-"*100)

            for model_name, results in sorted(all_results.items()):
                diff_scores = [
                    _extract_score(test)
                    for test in results.get('test_results', [])
                    if test.get('difficulty') == difficulty and _extract_score(test) is not None
                ]

                if diff_scores:
                    stats = self.calculate_statistics(diff_scores)
                    print(f"{model_name:<30} {stats['count']:<8} {stats['average']:<12.2f} "
                          f"{stats['pass_rate']:<12.1f}%")
                else:
                    print(f"{model_name:<30} {'N/A':<8} {'N/A':<12} {'N/A':<12}")

        # Winner analysis
        print("\n\n🏆 WINNERS BY CATEGORY")
        print("="*100)

        for category in sorted(all_categories):
            best_model = None
            best_score = -1

            for model_name, results in all_results.items():
                cat_scores = [
                    _extract_score(test)
                    for test in results.get('test_results', [])
                    if test.get('category') == category and _extract_score(test) is not None
                ]

                if cat_scores:
                    avg = sum(cat_scores) / len(cat_scores)
                    if avg > best_score:
                        best_score = avg
                        best_model = model_name

            if best_model:
                print(f"{category:<50} → {best_model} ({best_score:.2f})")

        print("\n\n🎖️ OVERALL WINNER")
        print("="*100)
        best_overall = max(model_stats.items(), key=lambda x: x[1]['average'])
        print(f"Model: {best_overall[0]}")
        print(f"Average Score: {best_overall[1]['average']:.2f}/5.00")
        print(f"Pass Rate: {best_overall[1]['pass_rate']:.1f}%")
        print(f"Exceptional Rate: {best_overall[1]['exceptional_rate']:.1f}%")
        print("="*100)

    def generate_detailed_report(self, model_name: str):
        """Print a detailed report for a specific model."""
        # Find the result file for this model
        pattern = f"{model_name.replace(':', '_')}_latest.json"
        filepath = self.results_dir / pattern

        if not filepath.exists():
            print(f"❌ No results found for model: {model_name}")
            return

        results = self.load_result_file(filepath)

        print("\n" + "="*100)
        print(f"📋 DETAILED REPORT: {model_name}")
        print("="*100)

        # Metadata
        metadata = results.get('metadata', {})
        print(f"\n⏱️ Test Duration: {metadata.get('test_start')} → {metadata.get('test_end')}")
        print(f"📊 Tests: {metadata.get('completed_tests')}/{metadata.get('total_tests')}")

        # Overall stats
        all_scores = [_extract_score(test) for test in results.get('test_results', [])]
        all_scores = [s for s in all_scores if s is not None]
        stats = self.calculate_statistics(all_scores)

        print("\n📈 Overall Performance:")
        print(f"   Average Score: {stats['average']:.2f}/5.00")
        print(f"   Pass Rate: {stats['pass_rate']:.1f}%")
        print(f"   Exceptional Rate: {stats['exceptional_rate']:.1f}%")
        print(f"   Score Range: {stats['min']:.1f} - {stats['max']:.1f}")

        # Test-by-test results
        print("\n\n📝 TEST-BY-TEST RESULTS")
        print("="*100)

        for test in results.get('test_results', []):
            score = _extract_score(test)
            status_icon = "✅" if score and score >= 4 else "⚠️" if score and score >= 2 else "❌"

            print(f"\n{status_icon} [{test.get('test_id')}] {test.get('test_name')}")
            print(f"   Category: {test.get('category')}")
            print(f"   Type: {test.get('type')}")
            print(f"   Difficulty: {test.get('difficulty', 'unknown')}")
            print(f"   Score: {score if score is not None else 'N/A'}/5.00")

            if test.get('notes'):
                print(f"   Notes: {test['notes']}")

            # Show criteria pass/fail if available
            if test.get('evaluation_criteria'):
                print(f"   Criteria ({len(test['evaluation_criteria'])} items):")
                for criterion in test['evaluation_criteria']:
                    print(f"      • {criterion}")

        print("\n" + "="*100)

    def export_csv(self, output_file: str = "comparison.csv"):
        """Export comparison data for all models to a CSV file."""
        result_files = self.find_result_files()
        if not result_files:
            print("❌ No result files found")
            return

        # Prepare CSV data
        csv_data = []
        headers = ['Model', 'Test ID', 'Test Name', 'Category', 'Type', 'Difficulty', 'Score', 'Notes']

        for filepath in result_files:
            results = self.load_result_file(filepath)
            model_name = results['metadata']['model_name']

            for test in results.get('test_results', []):
                score = _extract_score(test)
                csv_data.append([
                    model_name,
                    test.get('test_id', ''),
                    test.get('test_name', ''),
                    test.get('category', ''),
                    test.get('type', ''),
                    test.get('difficulty', ''),
                    '' if score is None else score,
                    test.get('notes', '')
                ])

        # Write CSV
        output_path = self.results_dir / output_file
        with open(output_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(headers)
            writer.writerows(csv_data)

        print(f"✅ CSV exported to: {output_path}")
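

# Example (illustrative, assuming result files exist under ./results):
#
#   analyzer = ResultsAnalyzer("results")
#   analyzer.compare_models(analyzer.find_result_files())
#   analyzer.export_csv("comparison.csv")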


class WebInterface:
    """Interactive web interface for results analysis"""

    def __init__(self, results_dir: str = "results"):
        self.results_dir = Path(results_dir)
        self.analyzer = ResultsAnalyzer(results_dir)
        self.app = Flask(__name__)
        self.setup_routes()

    def setup_routes(self):
        """Register the Flask routes."""

        @self.app.route('/')
        def index():
            """Main dashboard"""
            return render_template('dashboard.html')

        @self.app.route('/api/models')
        def get_models():
            """Get the list of all available models."""
            result_files = self.analyzer.find_result_files()
            models = []

            for filepath in result_files:
                try:
                    results = self.analyzer.load_result_file(filepath)
                    metadata = results.get('metadata', {})
                    models.append({
                        'name': metadata.get('model_name', 'Unknown'),
                        'file': filepath.name,
                        'total_tests': metadata.get('total_tests', 0),
                        'completed_tests': metadata.get('completed_tests', 0),
                        'test_start': metadata.get('test_start'),
                        'test_end': metadata.get('test_end')
                    })
                except Exception as e:
                    print(f"Error loading {filepath}: {e}")

            return jsonify(models)

        @self.app.route('/api/results/<model_name>')
        def get_model_results(model_name):
            """Get detailed results for a specific model."""
            pattern = f"{model_name.replace(':', '_')}_latest.json"
            filepath = self.results_dir / pattern

            if not filepath.exists():
                return jsonify({'error': 'Model not found'}), 404

            results = self.analyzer.load_result_file(filepath)
            return jsonify(results)

        @self.app.route('/api/comparison')
        def get_comparison():
            """Get comparison data for all models."""
            result_files = self.analyzer.find_result_files()
            comparison_data = {
                'models': {},
                'categories': set(),
                'difficulty_levels': set()
            }

            for filepath in result_files:
                try:
                    results = self.analyzer.load_result_file(filepath)
                    model_name = results['metadata']['model_name']

                    # Extract all scores and metadata
                    test_results = []
                    for test in results.get('test_results', []):
                        score = _extract_score(test)

                        # Handle notes differently for multi-turn vs single-turn tests
                        if test.get('type') == 'multi_turn' and 'turns' in test:
                            # Combine notes from all turns for multi-turn tests
                            turn_notes = []
                            for turn in test.get('turns', []):
                                turn_num = turn.get('turn', '?')
                                turn_note = turn.get('notes', '')
                                if turn_note:
                                    turn_notes.append(f"T{turn_num}: {turn_note}")
                            notes = ' | '.join(turn_notes) if turn_notes else ''

                            # Aggregate generation time and metrics across turns
                            total_gen_time = sum(t.get('generation_time', 0) for t in test.get('turns', []))
                            api_metrics = test.get('aggregate_metrics', {})
                        else:
                            notes = test.get('notes', '')
                            total_gen_time = test.get('generation_time')
                            api_metrics = test.get('api_metrics')

                        test_data = {
                            'test_id': test.get('test_id'),
                            'test_name': test.get('test_name'),
                            'category': test.get('category', 'Unknown'),
                            'type': test.get('type'),
                            'difficulty': test.get('difficulty', 'medium'),
                            'score': score,
                            'status': test.get('status'),
                            'notes': notes,
                            'generation_time': total_gen_time,
                            'api_metrics': api_metrics
                        }
                        test_results.append(test_data)

                        if test_data['category']:
                            comparison_data['categories'].add(test_data['category'])
                        if test_data['difficulty']:
                            comparison_data['difficulty_levels'].add(test_data['difficulty'])

                    # Calculate overall statistics
                    all_scores = [t['score'] for t in test_results if t['score'] is not None]
                    stats = self.analyzer.calculate_statistics(all_scores)

                    # Calculate category statistics
                    category_stats = {}
                    for category in comparison_data['categories']:
                        cat_scores = [t['score'] for t in test_results
                                      if t['category'] == category and t['score'] is not None]
                        if cat_scores:
                            category_stats[category] = self.analyzer.calculate_statistics(cat_scores)

                    # Calculate difficulty statistics
                    difficulty_stats = {}
                    for difficulty in comparison_data['difficulty_levels']:
                        diff_scores = [t['score'] for t in test_results
                                       if t['difficulty'] == difficulty and t['score'] is not None]
                        if diff_scores:
                            difficulty_stats[difficulty] = self.analyzer.calculate_statistics(diff_scores)

                    comparison_data['models'][model_name] = {
                        'test_results': test_results,
                        'overall_stats': stats,
                        'category_stats': category_stats,
                        'difficulty_stats': difficulty_stats,
                        'metadata': results.get('metadata', {})
                    }

                except Exception as e:
                    print(f"Error processing {filepath}: {e}")

            # Convert sets to sorted lists for JSON serialization
            comparison_data['categories'] = sorted(comparison_data['categories'])
            comparison_data['difficulty_levels'] = sorted(comparison_data['difficulty_levels'])

            return jsonify(comparison_data)
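
        # Shape of the /api/comparison response (sketch; actual values
        # depend on the result files present):
        #
        #   {
        #     "models": {"<model>": {"test_results": [...],
        #                            "overall_stats": {...},
        #                            "category_stats": {...},
        #                            "difficulty_stats": {...},
        #                            "metadata": {...}}},
        #     "categories": [...],
        #     "difficulty_levels": [...]
        #   }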

        @self.app.route('/api/statistics')
        def get_statistics():
            """Get advanced statistical analysis."""
            result_files = self.analyzer.find_result_files()

            all_data = []
            model_names = []

            for filepath in result_files:
                try:
                    results = self.analyzer.load_result_file(filepath)
                    model_name = results['metadata']['model_name']
                    model_names.append(model_name)

                    scores = [_extract_score(test)
                              for test in results.get('test_results', [])]
                    scores = [s for s in scores if s is not None]
                    all_data.append(scores)
                except Exception as e:
                    print(f"Error in statistics: {e}")

            # Calculate advanced statistics
            statistics = {
                'models': model_names,
                'variance': [],
                'std_dev': [],
                'consistency_score': [],
                'robustness_score': []
            }

            for scores in all_data:
                if scores:
                    variance = np.var(scores)
                    std_dev = np.std(scores)

                    # Consistency: lower spread is better; map std_dev onto a
                    # 0-100 scale (a std_dev of 5 or more scores 0).
                    consistency = max(0, 100 - (std_dev * 20))

                    # Robustness: average (rescaled to 0-100) damped by consistency
                    avg_score = np.mean(scores)
                    robustness = (avg_score * 20) * (consistency / 100)

                    statistics['variance'].append(float(variance))
                    statistics['std_dev'].append(float(std_dev))
                    statistics['consistency_score'].append(float(consistency))
                    statistics['robustness_score'].append(float(robustness))
                else:
                    statistics['variance'].append(0)
                    statistics['std_dev'].append(0)
                    statistics['consistency_score'].append(0)
                    statistics['robustness_score'].append(0)

            return jsonify(statistics)
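
        # Worked example for the formulas above (illustrative numbers):
        # scores with std_dev 0.5 give consistency 100 - 0.5*20 = 90;
        # with an average of 4.0, robustness is (4.0*20) * (90/100) = 72.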

        @self.app.route('/api/intelligence_metrics')
        def get_intelligence_metrics():
            """Calculate intelligence evaluation metrics."""
            result_files = self.analyzer.find_result_files()

            metrics = {}

            for filepath in result_files:
                try:
                    results = self.analyzer.load_result_file(filepath)
                    model_name = results['metadata']['model_name']
                    test_results = results.get('test_results', [])

                    # Map each intelligence dimension to its test categories
                    dimensions = {
                        'logical_reasoning': ['Logic & Reasoning'],
                        'mathematical_ability': ['Mathematics & Calculation'],
                        'instruction_following': ['Instruction Following', 'Multi-turn: Instruction Following'],
                        'creativity': ['Creative Writing'],
                        'technical_knowledge': [
                            'Code Generation',
                            'IT Forensics - File Systems',
                            'IT Forensics - Registry & Artifacts',
                            'IT Forensics - Memory & Network',
                            'IT Forensics - Timeline & Log Analysis'
                        ],
                        'linguistic_nuance': ['Language Nuance', 'Multilingual Competence'],
                        'problem_solving': ['Problem Solving & Logistics'],
                        'conversational_depth': ['Multi-turn: Context Retention']
                    }

                    model_metrics = {}

                    for dimension, categories in dimensions.items():
                        dim_scores = [
                            _extract_score(test)
                            for test in test_results
                            if test.get('category') in categories and
                            _extract_score(test) is not None
                        ]

                        if dim_scores:
                            avg = sum(dim_scores) / len(dim_scores)
                            model_metrics[dimension] = {
                                'score': avg,
                                'normalized': (avg / 5.0) * 100,
                                'count': len(dim_scores)
                            }
                        else:
                            model_metrics[dimension] = {
                                'score': 0,
                                'normalized': 0,
                                'count': 0
                            }

                    # Calculate an overall intelligence quotient (IQ score)
                    weighted_scores = []
                    weights = {
                        'logical_reasoning': 1.5,  # higher weight for core reasoning
                        'mathematical_ability': 1.3,
                        'instruction_following': 1.2,
                        'creativity': 1.0,
                        'technical_knowledge': 1.4,
                        'linguistic_nuance': 1.1,
                        'problem_solving': 1.4,
                        'conversational_depth': 1.0
                    }

                    total_weight = 0
                    for dim, data in model_metrics.items():
                        if data['count'] > 0:
                            weighted_scores.append(data['score'] * weights[dim])
                            total_weight += weights[dim]

                    # Weighted average on the 0-5 scale, rescaled to 0-100
                    iq_score = (sum(weighted_scores) / total_weight * 20) if total_weight > 0 else 0

                    # Adaptability: share of categories with an average of at least 2.5
                    category_scores = {}
                    for test in test_results:
                        cat = test.get('category')
                        score = _extract_score(test)
                        if cat and score is not None:
                            category_scores.setdefault(cat, []).append(score)

                    adaptability = len([cat for cat, scores in category_scores.items()
                                        if sum(scores) / len(scores) >= 2.5]) / max(len(category_scores), 1) * 100

                    # Problem-solving depth: performance on hard problems, rescaled to 0-100
                    hard_scores = [
                        _extract_score(test)
                        for test in test_results
                        if test.get('difficulty') in ['hard', 'very_hard'] and
                        _extract_score(test) is not None
                    ]

                    problem_solving_depth = (sum(hard_scores) / len(hard_scores) * 20) if hard_scores else 0

                    metrics[model_name] = {
                        'dimensions': model_metrics,
                        'iq_score': iq_score,
                        'adaptability': adaptability,
                        'problem_solving_depth': problem_solving_depth,
                        # Blend: 50% IQ, 30% adaptability, 20% problem-solving depth
                        'overall_intelligence': (iq_score * 0.5 + adaptability * 0.3 + problem_solving_depth * 0.2)
                    }

                except Exception as e:
                    print(f"Error calculating intelligence metrics: {e}")

            return jsonify(metrics)

    def run(self, host='127.0.0.1', port=5000, debug=False):
        """Start the web server."""
        # Create the templates directory if it doesn't exist
        templates_dir = Path(__file__).parent / 'templates'
        templates_dir.mkdir(exist_ok=True)

        # Generate the HTML template
        self.create_dashboard_template(templates_dir)

        print(f"\n🌐 Starting web interface at http://{host}:{port}")
        print("📊 Open the above URL in your browser to view the dashboard")
        print("🛑 Press Ctrl+C to stop the server\n")

        self.app.run(host=host, port=port, debug=debug)

    def create_dashboard_template(self, templates_dir: Path):
        """Write the HTML dashboard template into the templates directory."""
        html_content = '''<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>LLM Evaluation Dashboard</title>
    <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
    <script src="https://cdn.jsdelivr.net/npm/axios/dist/axios.min.js"></script>
    <style>
        * {
            margin: 0;
            padding: 0;
            box-sizing: border-box;
        }

        :root {
            --bg-gradient-start: #667eea;
            --bg-gradient-end: #764ba2;
            --card-bg: #ffffff;
            --text-primary: #333333;
            --text-secondary: #666666;
            --border-color: #e0e0e0;
            --stat-card-bg: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
            --shadow: rgba(0,0,0,0.1);
            --shadow-hover: rgba(0,0,0,0.15);
        }

        body.dark-mode {
            --bg-gradient-start: #1a1a2e;
            --bg-gradient-end: #16213e;
            --card-bg: #0f1419;
            --text-primary: #e0e0e0;
            --text-secondary: #a0a0a0;
            --border-color: #2a2a3e;
            --stat-card-bg: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);
            --shadow: rgba(0,0,0,0.3);
            --shadow-hover: rgba(0,0,0,0.5);
        }

        body {
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
            background: linear-gradient(135deg, var(--bg-gradient-start) 0%, var(--bg-gradient-end) 100%);
            color: var(--text-primary);
            min-height: 100vh;
            padding: 20px;
            transition: all 0.3s ease;
        }

        .container {
            max-width: 1400px;
            margin: 0 auto;
        }

        header {
            background: var(--card-bg);
            padding: 30px;
            border-radius: 15px;
            box-shadow: 0 10px 40px var(--shadow);
            margin-bottom: 30px;
            position: relative;
        }

        .theme-toggle {
            position: absolute;
            top: 30px;
            right: 30px;
            background: var(--border-color);
            border: none;
            padding: 10px 20px;
            border-radius: 20px;
            cursor: pointer;
            font-size: 1em;
            transition: all 0.3s;
        }

        .theme-toggle:hover {
            transform: scale(1.05);
            box-shadow: 0 4px 15px var(--shadow-hover);
        }

        .scale-toggle {
            position: absolute;
            top: 30px;
            right: 140px;
            background: var(--border-color);
            border: none;
            padding: 10px 20px;
            border-radius: 20px;
            cursor: pointer;
            font-size: 1em;
            transition: all 0.3s;
        }

        .scale-toggle:hover {
            transform: scale(1.05);
            box-shadow: 0 4px 15px var(--shadow-hover);
        }

        .scale-toggle.zoomed {
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
        }

        h1 {
            font-size: 2.5em;
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            -webkit-background-clip: text;
            -webkit-text-fill-color: transparent;
            margin-bottom: 10px;
        }

        .subtitle {
            color: var(--text-secondary);
            font-size: 1.1em;
        }

        .tabs {
            display: flex;
            gap: 10px;
            margin-bottom: 20px;
            flex-wrap: wrap;
        }

        .tab {
            background: var(--card-bg);
            border: none;
            padding: 12px 24px;
            border-radius: 8px;
            cursor: pointer;
            font-size: 1em;
            transition: all 0.3s;
            box-shadow: 0 2px 10px var(--shadow);
            color: var(--text-primary);
        }

        .tab:hover {
            transform: translateY(-2px);
            box-shadow: 0 4px 15px var(--shadow-hover);
        }

        .tab.active {
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
        }

        .content-panel {
            display: none;
            background: var(--card-bg);
            padding: 30px;
            border-radius: 15px;
            box-shadow: 0 10px 40px var(--shadow);
            animation: fadeIn 0.3s;
        }

        .content-panel.active {
            display: block;
        }

        @keyframes fadeIn {
            from { opacity: 0; transform: translateY(10px); }
            to { opacity: 1; transform: translateY(0); }
        }

        .stats-grid {
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
            gap: 20px;
            margin-bottom: 30px;
        }

        .stat-card {
            background: var(--stat-card-bg);
            padding: 20px;
            border-radius: 10px;
            text-align: center;
        }

        .stat-card h3 {
            font-size: 0.9em;
            color: var(--text-secondary);
            margin-bottom: 10px;
            text-transform: uppercase;
        }

        .stat-card .value {
            font-size: 2.5em;
            font-weight: bold;
            color: #667eea;
        }

        .chart-container {
            position: relative;
            height: 400px;
            margin-bottom: 30px;
        }

        .controls {
            display: flex;
            gap: 15px;
            margin-bottom: 20px;
            flex-wrap: wrap;
        }

        select, input {
            padding: 10px 15px;
            border: 2px solid var(--border-color);
            border-radius: 8px;
            font-size: 1em;
            background: var(--card-bg);
            color: var(--text-primary);
            cursor: pointer;
            transition: border-color 0.3s;
        }

        select:hover, input:hover {
            border-color: #667eea;
        }

        select:focus, input:focus {
            outline: none;
            border-color: #764ba2;
        }

        table {
            width: 100%;
            border-collapse: collapse;
            margin-top: 20px;
        }

        th, td {
            padding: 12px;
            text-align: left;
            border-bottom: 1px solid var(--border-color);
        }

        th {
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            font-weight: 600;
            cursor: pointer;
            user-select: none;
        }

        th:hover {
            opacity: 0.9;
        }

        tr:hover {
            background: var(--border-color);
        }

        .score-badge {
            display: inline-block;
            padding: 5px 12px;
            border-radius: 20px;
            font-weight: bold;
            font-size: 0.9em;
        }

        .score-exceptional {
            background: #10b981;
            color: white;
        }

        .score-pass {
            background: #f59e0b;
            color: white;
        }

        .score-fail {
            background: #ef4444;
            color: white;
        }

        .loading {
            text-align: center;
            padding: 40px;
            color: var(--text-secondary);
        }

        .spinner {
            border: 3px solid var(--border-color);
            border-top: 3px solid #667eea;
            border-radius: 50%;
            width: 40px;
            height: 40px;
            animation: spin 1s linear infinite;
            margin: 20px auto;
        }

        @keyframes spin {
            0% { transform: rotate(0deg); }
            100% { transform: rotate(360deg); }
        }

        .model-selector {
            display: flex;
            gap: 10px;
            flex-wrap: wrap;
            margin-bottom: 20px;
        }

        .model-chip {
            padding: 8px 16px;
            border-radius: 20px;
            border: 2px solid #667eea;
            background: var(--card-bg);
            color: var(--text-primary);
            cursor: pointer;
            transition: all 0.3s;
        }

        .model-chip:hover {
            background: #667eea;
            color: white;
        }

        .model-chip.selected {
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
        }

        .metric-card {
            background: var(--card-bg);
            border: 2px solid var(--border-color);
            border-radius: 10px;
            padding: 20px;
            margin-bottom: 20px;
        }

        .metric-card h3 {
            color: #667eea;
            margin-bottom: 15px;
        }

        .progress-bar {
            background: var(--border-color);
            height: 30px;
            border-radius: 15px;
            overflow: hidden;
            margin: 10px 0;
            position: relative;
            cursor: help;
        }

        .progress-fill {
            height: 100%;
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            transition: width 0.5s;
            display: flex;
            align-items: center;
            justify-content: flex-end;
            padding-right: 10px;
            color: white;
            font-weight: bold;
        }

        /* Tooltip styles */
        .tooltip {
            position: relative;
            display: inline-block;
        }

        .tooltip .tooltiptext {
            visibility: hidden;
            width: 300px;
            background-color: rgba(0, 0, 0, 0.9);
            color: #fff;
            text-align: left;
            border-radius: 8px;
            padding: 12px;
            position: absolute;
            z-index: 1000;
            bottom: 125%;
            left: 50%;
            margin-left: -150px;
            opacity: 0;
            transition: opacity 0.3s;
            font-size: 0.85em;
            line-height: 1.4;
            box-shadow: 0 4px 20px rgba(0,0,0,0.3);
        }

        .tooltip .tooltiptext::after {
            content: "";
            position: absolute;
            top: 100%;
            left: 50%;
            margin-left: -5px;
            border-width: 5px;
            border-style: solid;
            border-color: rgba(0, 0, 0, 0.9) transparent transparent transparent;
        }

        .tooltip:hover .tooltiptext {
            visibility: visible;
            opacity: 1;
        }

        .tooltiptext code {
            background: rgba(255, 255, 255, 0.1);
            padding: 2px 6px;
            border-radius: 3px;
            font-family: monospace;
            font-size: 0.9em;
        }

        .tooltiptext strong {
            color: #667eea;
        }
    </style>
</head>
<body>
    <div class="container">
        <header>
            <button class="scale-toggle" id="scaleToggle" onclick="toggleScale()" title="Toggle between the full scale (0-5) and a zoomed view for better distinction">🔍 Full Scale</button>
            <button class="theme-toggle" onclick="toggleTheme()">🌓 Toggle Dark Mode</button>
            <h1>🧠 LLM Evaluation Dashboard</h1>
            <p class="subtitle">Comprehensive Intelligence & Performance Analysis</p>
        </header>

        <div class="tabs">
            <button class="tab active" onclick="switchTab('overview', this)">📊 Overview</button>
            <button class="tab" onclick="switchTab('comparison', this)">⚔️ Model Comparison</button>
            <button class="tab" onclick="switchTab('intelligence', this)">🎯 Intelligence Metrics</button>
            <button class="tab" onclick="switchTab('categories', this)">📂 Category Analysis</button>
            <button class="tab" onclick="switchTab('details', this)">🔍 Detailed Results</button>
        </div>

        <div id="overview" class="content-panel active">
            <h2>System Overview</h2>
            <div class="stats-grid" id="overviewStats">
                <div class="loading">
                    <div class="spinner"></div>
                    Loading data...
                </div>
            </div>
            <div class="chart-container">
                <canvas id="overviewChart"></canvas>
            </div>
        </div>

        <div id="comparison" class="content-panel">
            <h2>Model Performance Comparison</h2>
            <div class="controls">
                <select id="metricSelect" onchange="updateComparisonChart()">
                    <option value="average">Average Score</option>
                    <option value="pass_rate">Pass Rate</option>
                    <option value="exceptional_rate">Exceptional Rate</option>
                    <option value="consistency">Consistency</option>
                    <option value="robustness">Robustness</option>
                </select>
            </div>
            <div class="chart-container">
                <canvas id="comparisonChart"></canvas>
            </div>
        </div>

        <div id="intelligence" class="content-panel">
            <h2>Intelligence Metrics Analysis</h2>
            <p style="margin-bottom: 20px; color: #666;">
                Advanced metrics evaluating different dimensions of AI intelligence and reasoning capabilities.
            </p>
            <div id="intelligenceMetrics">
                <div class="loading">
                    <div class="spinner"></div>
                    Calculating intelligence metrics...
                </div>
            </div>
        </div>

        <div id="categories" class="content-panel">
            <h2>Performance by Category</h2>
            <div class="controls">
                <select id="categorySelect" onchange="updateCategoryChart()">
                    <option value="">Loading categories...</option>
                </select>
            </div>
            <div class="chart-container">
                <canvas id="categoryChart"></canvas>
            </div>
        </div>

        <div id="details" class="content-panel">
            <h2>Detailed Test Results</h2>
            <div class="controls">
                <select id="modelSelect" onchange="loadModelDetails()">
                    <option value="">Select a model...</option>
                </select>
                <input type="text" id="searchInput" placeholder="Search tests..." onkeyup="filterTable()">
                <select id="filterCategory" onchange="filterTable()">
                    <option value="">All Categories</option>
                </select>
                <select id="filterScore" onchange="filterTable()">
                    <option value="">All Scores</option>
                    <option value="exceptional">Exceptional (4-5)</option>
                    <option value="pass">Pass (2-3)</option>
                    <option value="fail">Fail (0-1)</option>
                </select>
            </div>
            <div id="detailsTable">
                <p class="loading">Select a model to view detailed results</p>
            </div>
        </div>
    </div>

    <script>
        let comparisonData = null;
        let statisticsData = null;
        let intelligenceData = null;
        let currentModelDetails = null;
        let zoomedScale = false;
        let overviewChartInstance = null;

        // Theme toggle functionality
        function toggleTheme() {
            document.body.classList.toggle('dark-mode');
            const isDark = document.body.classList.contains('dark-mode');
            localStorage.setItem('darkMode', isDark ? 'enabled' : 'disabled');
        }

        // Load theme preference
        function loadThemePreference() {
            const darkMode = localStorage.getItem('darkMode');
            if (darkMode === 'enabled') {
                document.body.classList.add('dark-mode');
            }
        }

        // Scale toggle functionality
        function toggleScale() {
            zoomedScale = !zoomedScale;
            const btn = document.getElementById('scaleToggle');
            if (zoomedScale) {
                btn.textContent = '🔎 Zoomed';
                btn.classList.add('zoomed');
            } else {
                btn.textContent = '🔍 Full Scale';
                btn.classList.remove('zoomed');
            }
            localStorage.setItem('zoomedScale', zoomedScale ? 'enabled' : 'disabled');
            // Refresh all charts with the new scale
            refreshAllCharts();
        }

        // Load scale preference
        function loadScalePreference() {
            const savedScale = localStorage.getItem('zoomedScale');
            if (savedScale === 'enabled') {
                zoomedScale = true;
                const btn = document.getElementById('scaleToggle');
                btn.textContent = '🔎 Zoomed';
                btn.classList.add('zoomed');
            }
        }

        // Calculate axis options; axisMax is 5 for score charts and can be
        // overridden (e.g. 100) for percentage-based metrics.
        function getScaleOptions(data, isRadar = false, axisMax = 5) {
            if (!zoomedScale) {
                // Full scale: 0 to axisMax
                if (isRadar) {
                    return { r: { beginAtZero: true, max: axisMax } };
                }
                return { y: { beginAtZero: true, max: axisMax } };
            }

            // Zoomed scale: calculate min/max with padding
            const validData = data.filter(d => d !== null && d !== undefined && !isNaN(d));
            if (validData.length === 0) {
                if (isRadar) {
                    return { r: { beginAtZero: true, max: axisMax } };
                }
                return { y: { beginAtZero: true, max: axisMax } };
            }

            const minVal = Math.min(...validData);
            const maxVal = Math.max(...validData);
            const range = maxVal - minVal;
            const padding = Math.max(range * 0.2, 0.2); // at least 0.2 padding

            let min = Math.max(0, Math.floor((minVal - padding) * 10) / 10);
            let max = Math.min(axisMax, Math.ceil((maxVal + padding) * 10) / 10);

            // Ensure the axis spans at least some range
            if (max - min < 0.5) {
                min = Math.max(0, minVal - 0.3);
                max = Math.min(axisMax, maxVal + 0.3);
            }

            if (isRadar) {
                return { r: { min: min, max: max, beginAtZero: false } };
            }
            return { y: { min: min, max: max, beginAtZero: false } };
        }
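
        // Worked example (illustrative): with the zoomed scale on and data
        // [3.2, 3.8], range = 0.6 and padding = max(0.6*0.2, 0.2) = 0.2,
        // so the y-axis runs from 3.0 to 4.0 instead of 0 to 5.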

        // Refresh all charts when the scale changes
        function refreshAllCharts() {
            if (comparisonData) {
                refreshOverviewChart();
                updateComparisonChart();
                updateCategoryChart();
            }
        }

        // Tab switching (the clicked button is passed in from the inline handler)
        function switchTab(tabName, clickedTab) {
            document.querySelectorAll('.tab').forEach(t => t.classList.remove('active'));
            document.querySelectorAll('.content-panel').forEach(p => p.classList.remove('active'));

            clickedTab.classList.add('active');
            document.getElementById(tabName).classList.add('active');
        }

        // Initialize dashboard
        async function initDashboard() {
            loadThemePreference();
            loadScalePreference();
            await loadOverview();
            await loadComparison();
            await loadStatistics();
            await loadIntelligenceMetrics();
            populateModelSelector();
        }

        async function loadOverview() {
            try {
                const response = await axios.get('/api/comparison');
                comparisonData = response.data;

                const models = Object.keys(comparisonData.models);
                const totalTests = models.reduce((sum, model) =>
                    sum + (comparisonData.models[model].metadata.total_tests || 0), 0);
                const avgScore = models.length === 0 ? 0 : models.reduce((sum, model) =>
                    sum + (comparisonData.models[model].overall_stats.average || 0), 0) / models.length;

                const statsHtml = `
                    <div class="stat-card">
                        <h3>Models Evaluated</h3>
                        <div class="value">${models.length}</div>
                    </div>
                    <div class="stat-card">
                        <h3>Total Tests</h3>
                        <div class="value">${totalTests}</div>
                    </div>
                    <div class="stat-card">
                        <h3>Average Score</h3>
                        <div class="value">${avgScore.toFixed(2)}</div>
                    </div>
                    <div class="stat-card">
                        <h3>Categories</h3>
                        <div class="value">${comparisonData.categories.length}</div>
                    </div>
                `;

                document.getElementById('overviewStats').innerHTML = statsHtml;

                // Create the overview chart
                refreshOverviewChart();

            } catch (error) {
                console.error('Error loading overview:', error);
            }
        }

        function refreshOverviewChart() {
            if (!comparisonData) return;

            const models = Object.keys(comparisonData.models);
            const data = models.map(m => comparisonData.models[m].overall_stats.average || 0);

            if (overviewChartInstance) {
                overviewChartInstance.destroy();
            }

            const ctx = document.getElementById('overviewChart').getContext('2d');
            overviewChartInstance = new Chart(ctx, {
                type: 'bar',
                data: {
                    labels: models,
                    datasets: [{
                        label: 'Average Score',
                        data: data,
                        backgroundColor: 'rgba(102, 126, 234, 0.6)',
                        borderColor: 'rgba(102, 126, 234, 1)',
                        borderWidth: 2
                    }]
                },
                options: {
                    responsive: true,
                    maintainAspectRatio: false,
                    scales: getScaleOptions(data)
                }
            });
        }

        async function loadComparison() {
            updateComparisonChart();
        }

        async function updateComparisonChart() {
            if (!comparisonData) return;

            const metric = document.getElementById('metricSelect').value;
            const models = Object.keys(comparisonData.models);

            let data, label;

            if (metric === 'consistency' || metric === 'robustness') {
                if (!statisticsData) {
                    await loadStatistics();
                }
                // Look each model up by name: the /api/statistics arrays are
                // ordered by statisticsData.models, not by our local list.
                data = models.map(m => {
                    const idx = statisticsData.models.indexOf(m);
                    return idx >= 0 ? statisticsData[metric + '_score'][idx] : 0;
                });
                label = metric.charAt(0).toUpperCase() + metric.slice(1) + ' Score';
            } else {
                data = models.map(m => comparisonData.models[m].overall_stats[metric] || 0);
                label = metric.split('_').map(w => w.charAt(0).toUpperCase() + w.slice(1)).join(' ');
            }

            const ctx = document.getElementById('comparisonChart');
            if (window.comparisonChartInstance) {
                window.comparisonChartInstance.destroy();
            }

            window.comparisonChartInstance = new Chart(ctx, {
                type: 'radar',
                data: {
                    labels: models,
                    datasets: [{
                        label: label,
                        data: data,
                        backgroundColor: 'rgba(118, 75, 162, 0.2)',
                        borderColor: 'rgba(118, 75, 162, 1)',
                        pointBackgroundColor: 'rgba(118, 75, 162, 1)',
                        pointBorderColor: '#fff',
                        pointHoverBackgroundColor: '#fff',
                        pointHoverBorderColor: 'rgba(118, 75, 162, 1)'
                    }]
                },
                options: {
                    responsive: true,
                    maintainAspectRatio: false,
                    // Only 'average' is on the 0-5 scale; the other metrics are percentages
                    scales: getScaleOptions(data, true, metric === 'average' ? 5 : 100)
                }
            });
        }

        async function loadStatistics() {
            try {
                const response = await axios.get('/api/statistics');
                statisticsData = response.data;
            } catch (error) {
                console.error('Error loading statistics:', error);
            }
        }

        async function loadIntelligenceMetrics() {
            try {
                const response = await axios.get('/api/intelligence_metrics');
                intelligenceData = response.data;

                let html = '';

                // Must match the weights used server-side in get_intelligence_metrics()
                const dimensionWeights = {
                    'logical_reasoning': 1.5,
                    'mathematical_ability': 1.3,
                    'technical_knowledge': 1.4,
                    'problem_solving': 1.4,
                    'instruction_following': 1.2,
                    'linguistic_nuance': 1.1,
                    'creativity': 1.0,
                    'conversational_depth': 1.0
                };

                for (const [model, metrics] of Object.entries(intelligenceData)) {
                    html += `
                        <div class="metric-card">
                            <h3>${model}</h3>

                            <div style="margin-bottom: 20px;" class="tooltip">
                                <strong>Overall Intelligence Score:</strong>
                                <span class="tooltiptext">
                                    <strong>Calculation:</strong><br>
                                    Overall = (IQ × 0.5) + (Adaptability × 0.3) + (Problem-Solving × 0.2)<br><br>
                                    <strong>Values:</strong><br>
                                    • IQ: ${metrics.iq_score.toFixed(1)}<br>
                                    • Adaptability: ${metrics.adaptability.toFixed(1)}%<br>
                                    • Problem-Solving: ${metrics.problem_solving_depth.toFixed(1)}<br><br>
                                    Result: ${metrics.overall_intelligence.toFixed(1)}
                                </span>
                                <div class="progress-bar">
                                    <div class="progress-fill" style="width: ${metrics.overall_intelligence}%">
                                        ${metrics.overall_intelligence.toFixed(1)}
                                    </div>
                                </div>
                            </div>

                            <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); gap: 15px;">
                                <div class="tooltip">
                                    <strong>IQ Score:</strong>
                                    <span class="tooltiptext">
                                        <strong>Weighted Average of Dimensions:</strong><br><br>
                                        ${Object.entries(metrics.dimensions).map(([dim, data]) =>
                                            `• ${dim.replace(/_/g, ' ')}: ${data.score.toFixed(1)} × ${dimensionWeights[dim] || 1.0}`
                                        ).join('<br>')}<br><br>
                                        Normalized to a 0-100 scale
                                    </span>
                                    <div class="progress-bar">
                                        <div class="progress-fill" style="width: ${metrics.iq_score}%">
                                            ${metrics.iq_score.toFixed(1)}
                                        </div>
                                    </div>
                                </div>

                                <div class="tooltip">
                                    <strong>Adaptability:</strong>
                                    <span class="tooltiptext">
                                        <strong>Cross-Category Performance:</strong><br><br>
                                        Measures versatility across different task types.<br><br>
                                        Formula: (Categories with avg ≥ 2.5) / (Total categories) × 100<br><br>
                                        Higher score = more versatile model
                                    </span>
                                    <div class="progress-bar">
                                        <div class="progress-fill" style="width: ${metrics.adaptability}%">
                                            ${metrics.adaptability.toFixed(1)}%
                                        </div>
                                    </div>
                                </div>

                                <div class="tooltip">
                                    <strong>Problem-Solving Depth:</strong>
                                    <span class="tooltiptext">
                                        <strong>Performance on Challenging Tasks:</strong><br><br>
                                        Average score on "hard" and "very_hard" difficulty tests.<br><br>
                                        Formula: (Avg score on hard tests) × 20<br><br>
                                        Tests critical thinking and complex reasoning
                                    </span>
                                    <div class="progress-bar">
                                        <div class="progress-fill" style="width: ${metrics.problem_solving_depth}%">
                                            ${metrics.problem_solving_depth.toFixed(1)}
                                        </div>
                                    </div>
                                </div>
                            </div>

                            <h4 style="margin-top: 20px; color: #764ba2;">Cognitive Dimensions:</h4>
                            <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); gap: 10px; margin-top: 10px;">
                    `;

                    for (const [dim, data] of Object.entries(metrics.dimensions)) {
                        const weight = dimensionWeights[dim] || 1.0;
                        html += `
                            <div class="tooltip">
                                <small>${dim.replace(/_/g, ' ').toUpperCase()}</small>
                                <span class="tooltiptext">
                                    <strong>${dim.replace(/_/g, ' ').toUpperCase()}</strong><br><br>
                                    Score: <code>${data.score.toFixed(2)}/5.00</code><br>
                                    Weight in IQ: <code>${weight}</code><br>
                                    Tests evaluated: <code>${data.count}</code><br><br>
                                    Normalized: ${data.normalized.toFixed(1)}%
                                </span>
                                <div class="progress-bar" style="height: 20px;">
                                    <div class="progress-fill" style="width: ${data.normalized}%; font-size: 0.8em;">
                                        ${data.score.toFixed(1)}
                                    </div>
                                </div>
                            </div>
                        `;
                    }

                    html += `
                            </div>
                        </div>
                    `;
                }

                document.getElementById('intelligenceMetrics').innerHTML = html;

            } catch (error) {
                console.error('Error loading intelligence metrics:', error);
                document.getElementById('intelligenceMetrics').innerHTML =
                    '<p class="loading">Error loading intelligence metrics</p>';
            }
        }

        function populateModelSelector() {
            if (!comparisonData) return;

            const models = Object.keys(comparisonData.models);
            const select = document.getElementById('modelSelect');

            select.innerHTML = '<option value="">Select a model...</option>';
            models.forEach(model => {
                const option = document.createElement('option');
                option.value = model;
                option.textContent = model;
                select.appendChild(option);
            });

            // Populate the category filter
            const categoryFilter = document.getElementById('filterCategory');
            categoryFilter.innerHTML = '<option value="">All Categories</option>';
            comparisonData.categories.forEach(cat => {
                const option = document.createElement('option');
                option.value = cat;
                option.textContent = cat;
                categoryFilter.appendChild(option);
            });

            // Populate the category chart selector
            const categorySelect = document.getElementById('categorySelect');
            categorySelect.innerHTML = '';
            comparisonData.categories.forEach(cat => {
                const option = document.createElement('option');
                option.value = cat;
                option.textContent = cat;
                categorySelect.appendChild(option);
            });

            if (comparisonData.categories.length > 0) {
                updateCategoryChart();
            }
        }

        function updateCategoryChart() {
            if (!comparisonData) return;

            const category = document.getElementById('categorySelect').value;
            const models = Object.keys(comparisonData.models);

            const data = models.map(model => {
                const stats = comparisonData.models[model].category_stats[category];
                return stats ? stats.average : 0;
            });

            const ctx = document.getElementById('categoryChart');
            if (window.categoryChartInstance) {
                window.categoryChartInstance.destroy();
            }

            window.categoryChartInstance = new Chart(ctx, {
                type: 'bar',
                data: {
                    labels: models,
                    datasets: [{
                        label: `${category} - Average Score`,
                        data: data,
                        backgroundColor: 'rgba(102, 126, 234, 0.6)',
                        borderColor: 'rgba(102, 126, 234, 1)',
                        borderWidth: 2
                    }]
                },
                options: {
                    responsive: true,
                    maintainAspectRatio: false,
                    scales: getScaleOptions(data)
                }
            });
        }

        async function loadModelDetails() {
            const modelName = document.getElementById('modelSelect').value;
            if (!modelName || !comparisonData) return;

            currentModelDetails = comparisonData.models[modelName].test_results;
            displayDetailsTable(currentModelDetails);
        }

        function displayDetailsTable(results) {
            let html = `
                <table>
                    <thead>
                        <tr>
                            <th onclick="sortTable('test_name')">Test Name</th>
                            <th onclick="sortTable('category')">Category</th>
                            <th onclick="sortTable('difficulty')">Difficulty</th>
                            <th onclick="sortTable('score')">Score</th>
                            <th onclick="sortTable('generation_time')">Time (s)</th>
                            <th onclick="sortTable('tokens')">Tokens</th>
                            <th onclick="sortTable('status')">Status</th>
                            <th>Notes</th>
                        </tr>
                    </thead>
                    <tbody>
            `;

            results.forEach(test => {
                const scoreClass = test.score >= 4 ? 'exceptional' : test.score >= 2 ? 'pass' : 'fail';
                const scoreDisplay = test.score !== null && test.score !== undefined ? test.score.toFixed(1) : 'N/A';

                // Extract timing and token info
                const genTime = test.generation_time ? test.generation_time.toFixed(2) : 'N/A';
                let tokenInfo = 'N/A';
                let tokensPerSec = '';

                if (test.api_metrics && test.api_metrics.usage) {
                    const usage = test.api_metrics.usage;
                    const totalTokens = usage.total_tokens || usage.eval_count || 'N/A';
                    const completionTokens = usage.completion_tokens || usage.eval_count;

                    if (totalTokens !== 'N/A') {
                        tokenInfo = totalTokens.toString();

                        // Calculate tokens/sec if we have both values
                        if (test.generation_time && completionTokens) {
                            const tps = completionTokens / test.generation_time;
                            tokensPerSec = `<br><small>(${tps.toFixed(1)} t/s)</small>`;
                        }
                    }
                }

                html += `
                    <tr>
                        <td><strong>${test.test_name}</strong></td>
                        <td>${test.category}</td>
                        <td>${test.difficulty}</td>
                        <td><span class="score-badge score-${scoreClass}">${scoreDisplay}</span></td>
                        <td>${genTime}</td>
                        <td>${tokenInfo}${tokensPerSec}</td>
                        <td>${test.status}</td>
                        <td><small>${test.notes || ''}</small></td>
                    </tr>
                `;
            });

            html += '</tbody></table>';
            document.getElementById('detailsTable').innerHTML = html;
        }

        function filterTable() {
            if (!currentModelDetails) return;

            const searchTerm = document.getElementById('searchInput').value.toLowerCase();
            const categoryFilter = document.getElementById('filterCategory').value;
            const scoreFilter = document.getElementById('filterScore').value;

            const filtered = currentModelDetails.filter(test => {
                const matchesSearch = test.test_name.toLowerCase().includes(searchTerm) ||
                                      test.category.toLowerCase().includes(searchTerm);
                const matchesCategory = !categoryFilter || test.category === categoryFilter;

                let matchesScore = true;
                if (scoreFilter === 'exceptional') matchesScore = test.score >= 4;
                else if (scoreFilter === 'pass') matchesScore = test.score >= 2 && test.score < 4;
                else if (scoreFilter === 'fail') matchesScore = test.score < 2;

                return matchesSearch && matchesCategory && matchesScore;
            });

            displayDetailsTable(filtered);
        }

        function sortTable(column) {
            if (!currentModelDetails) return;

            currentModelDetails.sort((a, b) => {
                if (column === 'score') {
                    return (b[column] || 0) - (a[column] || 0);
                }
                return (a[column] || '').toString().localeCompare((b[column] || '').toString());
            });

            filterTable();
        }

        // Initialize on load
        initDashboard();
    </script>
</body>
</html>'''

        template_path = templates_dir / 'dashboard.html'
        with open(template_path, 'w', encoding='utf-8') as f:
            f.write(html_content)


def main():
    parser = argparse.ArgumentParser(
        description="Analyze and compare AI model evaluation results",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Compare all models
  python analyze_results.py --compare

  # Detailed report for a specific model
  python analyze_results.py --detail "qwen3:4b-q4_K_M"

  # Export to CSV
  python analyze_results.py --export comparison.csv

  # Custom results directory
  python analyze_results.py --results-dir ./my_results --compare
"""
    )

    parser.add_argument(
        '--results-dir',
        default='results',
        help='Directory containing result JSON files (default: results)'
    )

    parser.add_argument(
        '--compare',
        action='store_true',
        help='Generate a comparison report for all models'
    )

    parser.add_argument(
        '--detail',
        type=str,
        help='Generate a detailed report for a specific model'
    )

    parser.add_argument(
        '--export',
        type=str,
        help='Export results to a CSV file'
    )

    parser.add_argument(
        '--web',
        action='store_true',
        help='Launch the interactive web dashboard'
    )

    parser.add_argument(
        '--host',
        default='127.0.0.1',
        help='Web server host (default: 127.0.0.1)'
    )

    parser.add_argument(
        '--port',
        type=int,
        default=5000,
        help='Web server port (default: 5000)'
    )

    args = parser.parse_args()

    if args.web:
        web = WebInterface(results_dir=args.results_dir)
        web.run(host=args.host, port=args.port)
        return

    analyzer = ResultsAnalyzer(results_dir=args.results_dir)

    if args.compare:
        result_files = analyzer.find_result_files()
        if result_files:
            analyzer.compare_models(result_files)
        else:
            print(f"❌ No result files found in {args.results_dir}")

    if args.detail:
        analyzer.generate_detailed_report(args.detail)

    if args.export:
        analyzer.export_csv(args.export)

    if not (args.compare or args.detail or args.export):
        parser.print_help()


if __name__ == "__main__":
    main()