#!/usr/bin/env python3
"""
AI Model Evaluation Results Analyzer
Compares results across different models and quantizations.
"""

import argparse
import csv
import json
from collections import defaultdict
from pathlib import Path
from typing import Dict, List
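
# Illustrative shape of one "<model>_latest.json" result file, inferred from the
# fields this script reads; real files may carry extra keys, and the values shown
# here are hypothetical:
#
# {
#   "metadata": {
#     "model_name": "qwen3:4b-q4_K_M",
#     "test_start": "...", "test_end": "...",
#     "completed_tests": 25, "total_tests": 25
#   },
#   "test_results": [
#     {
#       "test_id": "T01", "test_name": "...", "category": "...", "type": "...",
#       "difficulty": "medium",
#       "score": 4,                     # some entries use "overall_score" instead
#       "notes": "...", "evaluation_criteria": ["..."]
#     }
#   ]
# }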


class ResultsAnalyzer:
    def __init__(self, results_dir: str = "results"):
        self.results_dir = Path(results_dir)

    @staticmethod
    def _test_score(test: Dict):
        """Return 'score' if present, else fall back to 'overall_score'.

        Checks against None explicitly so a legitimate score of 0 is kept.
        """
        score = test.get('score')
        return score if score is not None else test.get('overall_score')

    def load_result_file(self, filepath: Path) -> Dict:
        """Load a single result file"""
        with open(filepath, 'r', encoding='utf-8') as f:
            return json.load(f)

    def find_result_files(self, pattern: str = "*_latest.json") -> List[Path]:
        """Find all result files matching pattern"""
        return sorted(self.results_dir.glob(pattern))
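
    # With the default pattern this returns paths like
    # results/qwen3_4b-q4_K_M_latest.json (name illustrative), i.e. one
    # "<model>_latest.json" file per evaluated model, sorted by file name.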

    def extract_scores_by_category(self, results: Dict) -> Dict[str, List[float]]:
        """Extract scores organized by category"""
        scores_by_category = defaultdict(list)

        for test in results.get('test_results', []):
            category = test.get('category', 'Unknown')
            score = self._test_score(test)

            if score is not None:
                scores_by_category[category].append(score)

        return dict(scores_by_category)
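
    # Illustrative programmatic use (hypothetical file name and categories; not
    # executed here):
    #   analyzer = ResultsAnalyzer("results")
    #   data = analyzer.load_result_file(Path("results/qwen3_4b-q4_K_M_latest.json"))
    #   analyzer.extract_scores_by_category(data)
    #   # -> {"Coding": [4, 5, 2], "Reasoning": [3, 1]}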

    def calculate_statistics(self, scores: List[float]) -> Dict:
        """Calculate statistics for a list of scores"""
        if not scores:
            return {
                'count': 0,
                'average': 0.0,
                'min': 0.0,
                'max': 0.0,
                'pass_rate': 0.0,
                'exceptional_rate': 0.0
            }

        return {
            'count': len(scores),
            'average': sum(scores) / len(scores),
            'min': min(scores),
            'max': max(scores),
            'pass_rate': len([s for s in scores if s >= 2]) / len(scores) * 100,
            'exceptional_rate': len([s for s in scores if s >= 4]) / len(scores) * 100
        }
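
    # Worked example (hypothetical scores on the 0-5 scale assumed by the reports):
    #   calculate_statistics([1, 2, 4, 5])
    #   -> {'count': 4, 'average': 3.0, 'min': 1, 'max': 5,
    #       'pass_rate': 75.0, 'exceptional_rate': 50.0}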

    def compare_models(self, result_files: List[Path]):
        """Generate comparison report for multiple models"""
        print("\n" + "="*100)
        print("📊 AI MODEL COMPARISON REPORT")
        print("="*100)

        # Load all results
        all_results = {}
        for filepath in result_files:
            try:
                results = self.load_result_file(filepath)
                model_name = results['metadata']['model_name']
                all_results[model_name] = results
            except Exception as e:
                print(f"⚠️ Error loading {filepath}: {e}")

        if not all_results:
            print("❌ No valid result files found")
            return

        # Overall comparison
        print("\n📈 OVERALL PERFORMANCE")
        print("-"*100)
        print(f"{'Model':<30} {'Total Tests':<12} {'Avg Score':<12} {'Pass Rate':<12} {'Exceptional':<12}")
        print("-"*100)

        model_stats = {}
        for model_name, results in sorted(all_results.items()):
            all_scores = [
                self._test_score(test)
                for test in results.get('test_results', [])
            ]
            all_scores = [s for s in all_scores if s is not None]

            stats = self.calculate_statistics(all_scores)
            model_stats[model_name] = stats

            print(f"{model_name:<30} {stats['count']:<12} {stats['average']:<12.2f} "
                  f"{stats['pass_rate']:<12.1f}% {stats['exceptional_rate']:<12.1f}%")

        # Category-wise comparison
        print("\n\n📂 CATEGORY-WISE PERFORMANCE")
        print("="*100)

        # Get all unique categories
        all_categories = set()
        for results in all_results.values():
            for test in results.get('test_results', []):
                all_categories.add(test.get('category', 'Unknown'))

        for category in sorted(all_categories):
            print(f"\n🔖 {category}")
            print("-"*100)
            print(f"{'Model':<30} {'Tests':<8} {'Avg Score':<12} {'Pass Rate':<12} {'Exceptional':<12}")
            print("-"*100)

            for model_name, results in sorted(all_results.items()):
                cat_scores = [
                    self._test_score(test)
                    for test in results.get('test_results', [])
                    if test.get('category') == category and self._test_score(test) is not None
                ]

                if cat_scores:
                    stats = self.calculate_statistics(cat_scores)
                    print(f"{model_name:<30} {stats['count']:<8} {stats['average']:<12.2f} "
                          f"{stats['pass_rate']:<12.1f}% {stats['exceptional_rate']:<12.1f}%")
                else:
                    print(f"{model_name:<30} {'N/A':<8} {'N/A':<12} {'N/A':<12} {'N/A':<12}")

        # Difficulty-based comparison
        print("\n\n⚡ DIFFICULTY-BASED PERFORMANCE")
        print("="*100)

        difficulties = ['medium', 'hard', 'very_hard']
        for difficulty in difficulties:
            print(f"\n🎯 Difficulty: {difficulty.replace('_', ' ').title()}")
            print("-"*100)
            print(f"{'Model':<30} {'Tests':<8} {'Avg Score':<12} {'Pass Rate':<12}")
            print("-"*100)

            for model_name, results in sorted(all_results.items()):
                diff_scores = [
                    self._test_score(test)
                    for test in results.get('test_results', [])
                    if test.get('difficulty') == difficulty and self._test_score(test) is not None
                ]

                if diff_scores:
                    stats = self.calculate_statistics(diff_scores)
                    print(f"{model_name:<30} {stats['count']:<8} {stats['average']:<12.2f} "
                          f"{stats['pass_rate']:<12.1f}%")
                else:
                    print(f"{model_name:<30} {'N/A':<8} {'N/A':<12} {'N/A':<12}")

        # Winner analysis
        print("\n\n🏆 WINNERS BY CATEGORY")
        print("="*100)

        for category in sorted(all_categories):
            best_model = None
            best_score = -1

            for model_name, results in all_results.items():
                cat_scores = [
                    self._test_score(test)
                    for test in results.get('test_results', [])
                    if test.get('category') == category and self._test_score(test) is not None
                ]

                if cat_scores:
                    avg = sum(cat_scores) / len(cat_scores)
                    if avg > best_score:
                        best_score = avg
                        best_model = model_name

            if best_model:
                print(f"{category:<50} → {best_model} ({best_score:.2f})")

        print("\n\n🎖️ OVERALL WINNER")
        print("="*100)
        best_overall = max(model_stats.items(), key=lambda x: x[1]['average'])
        print(f"Model: {best_overall[0]}")
        print(f"Average Score: {best_overall[1]['average']:.2f}/5.00")
        print(f"Pass Rate: {best_overall[1]['pass_rate']:.1f}%")
        print(f"Exceptional Rate: {best_overall[1]['exceptional_rate']:.1f}%")
        print("="*100)

    def generate_detailed_report(self, model_name: str):
        """Generate detailed report for a specific model"""
        # Find the result file for this model,
        # e.g. "qwen3:4b-q4_K_M" -> "qwen3_4b-q4_K_M_latest.json"
        pattern = f"{model_name.replace(':', '_')}_latest.json"
        filepath = self.results_dir / pattern

        if not filepath.exists():
            print(f"❌ No results found for model: {model_name}")
            return

        results = self.load_result_file(filepath)

        print("\n" + "="*100)
        print(f"📋 DETAILED REPORT: {model_name}")
        print("="*100)

        # Metadata
        metadata = results.get('metadata', {})
        print(f"\n⏱️ Test Duration: {metadata.get('test_start')} → {metadata.get('test_end')}")
        print(f"📊 Tests: {metadata.get('completed_tests')}/{metadata.get('total_tests')}")

        # Overall stats
        all_scores = [
            self._test_score(test)
            for test in results.get('test_results', [])
        ]
        all_scores = [s for s in all_scores if s is not None]
        stats = self.calculate_statistics(all_scores)

        print("\n📈 Overall Performance:")
        print(f" Average Score: {stats['average']:.2f}/5.00")
        print(f" Pass Rate: {stats['pass_rate']:.1f}%")
        print(f" Exceptional Rate: {stats['exceptional_rate']:.1f}%")
        print(f" Score Range: {stats['min']:.1f} - {stats['max']:.1f}")

        # Test-by-test results
        print("\n\n📝 TEST-BY-TEST RESULTS")
        print("="*100)

        for test in results.get('test_results', []):
            score = self._test_score(test)
            status_icon = "✅" if score and score >= 4 else "⚠️" if score and score >= 2 else "❌"

            print(f"\n{status_icon} [{test.get('test_id')}] {test.get('test_name')}")
            print(f" Category: {test.get('category')}")
            print(f" Type: {test.get('type')}")
            print(f" Difficulty: {test.get('difficulty', 'unknown')}")
            print(f" Score: {score if score is not None else 'N/A'}/5.00")

            if test.get('notes'):
                print(f" Notes: {test['notes']}")

            # Show criteria pass/fail if available
            if test.get('evaluation_criteria'):
                print(f" Criteria ({len(test['evaluation_criteria'])} items):")
                for criterion in test['evaluation_criteria']:
                    print(f" • {criterion}")

        print("\n" + "="*100)

    def export_csv(self, output_file: str = "comparison.csv"):
        """Export comparison data to CSV"""
        result_files = self.find_result_files()
        if not result_files:
            print("❌ No result files found")
            return

        # Prepare CSV data
        csv_data = []
        headers = ['Model', 'Test ID', 'Test Name', 'Category', 'Type', 'Difficulty', 'Score', 'Notes']

        for filepath in result_files:
            results = self.load_result_file(filepath)
            model_name = results['metadata']['model_name']

            for test in results.get('test_results', []):
                score = self._test_score(test)
                csv_data.append([
                    model_name,
                    test.get('test_id', ''),
                    test.get('test_name', ''),
                    test.get('category', ''),
                    test.get('type', ''),
                    test.get('difficulty', ''),
                    score if score is not None else '',
                    test.get('notes', '')
                ])

        # Write CSV
        output_path = self.results_dir / output_file
        with open(output_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(headers)
            writer.writerows(csv_data)

        print(f"✅ CSV exported to: {output_path}")


def main():
    parser = argparse.ArgumentParser(
        description="Analyze and compare AI model evaluation results",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Compare all models
  python analyze_results.py --compare

  # Detailed report for a specific model
  python analyze_results.py --detail "qwen3:4b-q4_K_M"

  # Export to CSV
  python analyze_results.py --export comparison.csv

  # Custom results directory
  python analyze_results.py --results-dir ./my_results --compare
"""
    )

    parser.add_argument(
        '--results-dir',
        default='results',
        help='Directory containing result JSON files (default: results)'
    )

    parser.add_argument(
        '--compare',
        action='store_true',
        help='Generate comparison report for all models'
    )

    parser.add_argument(
        '--detail',
        type=str,
        help='Generate detailed report for a specific model'
    )

    parser.add_argument(
        '--export',
        type=str,
        help='Export results to a CSV file'
    )

    args = parser.parse_args()

    analyzer = ResultsAnalyzer(results_dir=args.results_dir)

    if args.compare:
        result_files = analyzer.find_result_files()
        if result_files:
            analyzer.compare_models(result_files)
        else:
            print(f"❌ No result files found in {args.results_dir}")

    if args.detail:
        analyzer.generate_detailed_report(args.detail)

    if args.export:
        analyzer.export_csv(args.export)

    if not (args.compare or args.detail or args.export):
        parser.print_help()


if __name__ == "__main__":
    main()