initial commit

2026-01-16 09:18:07 +01:00
parent 1ef6758b3d
commit 514bd9b571
7 changed files with 2134 additions and 1 deletion

analyze_results.py (Normal file, 355 lines)

@@ -0,0 +1,355 @@
#!/usr/bin/env python3
"""
AI Model Evaluation Results Analyzer
Compares results across different models and quantizations
"""
import json
import sys
from pathlib import Path
from typing import List, Dict
import argparse
from collections import defaultdict
class ResultsAnalyzer:
    def __init__(self, results_dir: str = "results"):
        self.results_dir = Path(results_dir)

    def load_result_file(self, filepath: Path) -> Dict:
        """Load a single result file"""
        with open(filepath, 'r', encoding='utf-8') as f:
            return json.load(f)

    def find_result_files(self, pattern: str = "*_latest.json") -> List[Path]:
        """Find all result files matching pattern"""
        return sorted(self.results_dir.glob(pattern))

    def extract_scores_by_category(self, results: Dict) -> Dict[str, List[float]]:
        """Extract scores organized by category"""
        scores_by_category = defaultdict(list)
        for test in results.get('test_results', []):
            category = test.get('category', 'Unknown')
            score = test.get('score') or test.get('overall_score')
            if score is not None:
                scores_by_category[category].append(score)
        return dict(scores_by_category)

    def calculate_statistics(self, scores: List[float]) -> Dict:
        """Calculate statistics for a list of scores"""
        if not scores:
            return {
                'count': 0,
                'average': 0.0,
                'min': 0.0,
                'max': 0.0,
                'pass_rate': 0.0,
                'exceptional_rate': 0.0
            }
        return {
            'count': len(scores),
            'average': sum(scores) / len(scores),
            'min': min(scores),
            'max': max(scores),
            'pass_rate': len([s for s in scores if s >= 2]) / len(scores) * 100,
            'exceptional_rate': len([s for s in scores if s >= 4]) / len(scores) * 100
        }
    def compare_models(self, result_files: List[Path]):
        """Generate comparison report for multiple models"""
        print("\n" + "="*100)
        print("📊 AI MODEL COMPARISON REPORT")
        print("="*100)

        # Load all results
        all_results = {}
        for filepath in result_files:
            try:
                results = self.load_result_file(filepath)
                model_name = results['metadata']['model_name']
                all_results[model_name] = results
            except Exception as e:
                print(f"⚠️ Error loading {filepath}: {e}")

        if not all_results:
            print("❌ No valid result files found")
            return

        # Overall comparison
        print("\n📈 OVERALL PERFORMANCE")
        print("-"*100)
        print(f"{'Model':<30} {'Total Tests':<12} {'Avg Score':<12} {'Pass Rate':<12} {'Exceptional':<12}")
        print("-"*100)

        model_stats = {}
        for model_name, results in sorted(all_results.items()):
            all_scores = [
                test.get('score') or test.get('overall_score')
                for test in results.get('test_results', [])
            ]
            all_scores = [s for s in all_scores if s is not None]
            stats = self.calculate_statistics(all_scores)
            model_stats[model_name] = stats
            print(f"{model_name:<30} {stats['count']:<12} {stats['average']:<12.2f} "
                  f"{stats['pass_rate']:<12.1f}% {stats['exceptional_rate']:<12.1f}%")

        # Category-wise comparison
        print("\n\n📂 CATEGORY-WISE PERFORMANCE")
        print("="*100)

        # Get all unique categories
        all_categories = set()
        for results in all_results.values():
            for test in results.get('test_results', []):
                all_categories.add(test.get('category', 'Unknown'))

        for category in sorted(all_categories):
            print(f"\n🔖 {category}")
            print("-"*100)
            print(f"{'Model':<30} {'Tests':<8} {'Avg Score':<12} {'Pass Rate':<12} {'Exceptional':<12}")
            print("-"*100)
            for model_name, results in sorted(all_results.items()):
                cat_scores = [
                    test.get('score') or test.get('overall_score')
                    for test in results.get('test_results', [])
                    if test.get('category') == category and (test.get('score') or test.get('overall_score')) is not None
                ]
                if cat_scores:
                    stats = self.calculate_statistics(cat_scores)
                    print(f"{model_name:<30} {stats['count']:<8} {stats['average']:<12.2f} "
                          f"{stats['pass_rate']:<12.1f}% {stats['exceptional_rate']:<12.1f}%")
                else:
                    print(f"{model_name:<30} {'N/A':<8} {'N/A':<12} {'N/A':<12} {'N/A':<12}")

        # Difficulty-based comparison
        print("\n\n⚡ DIFFICULTY-BASED PERFORMANCE")
        print("="*100)
        difficulties = ['medium', 'hard', 'very_hard']
        for difficulty in difficulties:
            print(f"\n🎯 Difficulty: {difficulty.replace('_', ' ').title()}")
            print("-"*100)
            print(f"{'Model':<30} {'Tests':<8} {'Avg Score':<12} {'Pass Rate':<12}")
            print("-"*100)
            for model_name, results in sorted(all_results.items()):
                diff_scores = [
                    test.get('score') or test.get('overall_score')
                    for test in results.get('test_results', [])
                    if test.get('difficulty') == difficulty and (test.get('score') or test.get('overall_score')) is not None
                ]
                if diff_scores:
                    stats = self.calculate_statistics(diff_scores)
                    print(f"{model_name:<30} {stats['count']:<8} {stats['average']:<12.2f} "
                          f"{stats['pass_rate']:<12.1f}%")
                else:
                    print(f"{model_name:<30} {'N/A':<8} {'N/A':<12} {'N/A':<12}")

        # Winner analysis
        print("\n\n🏆 WINNERS BY CATEGORY")
        print("="*100)
        for category in sorted(all_categories):
            best_model = None
            best_score = -1
            for model_name, results in all_results.items():
                cat_scores = [
                    test.get('score') or test.get('overall_score')
                    for test in results.get('test_results', [])
                    if test.get('category') == category and (test.get('score') or test.get('overall_score')) is not None
                ]
                if cat_scores:
                    avg = sum(cat_scores) / len(cat_scores)
                    if avg > best_score:
                        best_score = avg
                        best_model = model_name
            if best_model:
                print(f"{category:<50}{best_model} ({best_score:.2f})")

        print("\n\n🎖️ OVERALL WINNER")
        print("="*100)
        best_overall = max(model_stats.items(), key=lambda x: x[1]['average'])
        print(f"Model: {best_overall[0]}")
        print(f"Average Score: {best_overall[1]['average']:.2f}/5.00")
        print(f"Pass Rate: {best_overall[1]['pass_rate']:.1f}%")
        print(f"Exceptional Rate: {best_overall[1]['exceptional_rate']:.1f}%")
        print("="*100)
    def generate_detailed_report(self, model_name: str):
        """Generate detailed report for a specific model"""
        # Find result file for this model
        pattern = f"{model_name.replace(':', '_')}_latest.json"
        filepath = self.results_dir / pattern
        if not filepath.exists():
            print(f"❌ No results found for model: {model_name}")
            return

        results = self.load_result_file(filepath)

        print("\n" + "="*100)
        print(f"📋 DETAILED REPORT: {model_name}")
        print("="*100)

        # Metadata
        metadata = results.get('metadata', {})
        print(f"\n⏱️ Test Duration: {metadata.get('test_start')} → {metadata.get('test_end')}")
        print(f"📊 Tests: {metadata.get('completed_tests')}/{metadata.get('total_tests')}")

        # Overall stats
        all_scores = [
            test.get('score') or test.get('overall_score')
            for test in results.get('test_results', [])
        ]
        all_scores = [s for s in all_scores if s is not None]
        stats = self.calculate_statistics(all_scores)

        print(f"\n📈 Overall Performance:")
        print(f" Average Score: {stats['average']:.2f}/5.00")
        print(f" Pass Rate: {stats['pass_rate']:.1f}%")
        print(f" Exceptional Rate: {stats['exceptional_rate']:.1f}%")
        print(f" Score Range: {stats['min']:.1f} - {stats['max']:.1f}")

        # Test-by-test results
        print(f"\n\n📝 TEST-BY-TEST RESULTS")
        print("="*100)
        for test in results.get('test_results', []):
            score = test.get('score') or test.get('overall_score')
            status_icon = "✅" if score and score >= 4 else "⚠️" if score and score >= 2 else "❌"
            print(f"\n{status_icon} [{test.get('test_id')}] {test.get('test_name')}")
            print(f" Category: {test.get('category')}")
            print(f" Type: {test.get('type')}")
            print(f" Difficulty: {test.get('difficulty', 'unknown')}")
            print(f" Score: {score if score is not None else 'N/A'}/5.00")
            if test.get('notes'):
                print(f" Notes: {test['notes']}")
            # Show criteria pass/fail if available
            if test.get('evaluation_criteria'):
                print(f" Criteria ({len(test['evaluation_criteria'])} items):")
                for criterion in test['evaluation_criteria']:
                    print(f"   - {criterion}")

        print("\n" + "="*100)
    def export_csv(self, output_file: str = "comparison.csv"):
        """Export comparison data to CSV"""
        import csv

        result_files = self.find_result_files()
        if not result_files:
            print("❌ No result files found")
            return

        # Prepare CSV data
        csv_data = []
        headers = ['Model', 'Test ID', 'Test Name', 'Category', 'Type', 'Difficulty', 'Score', 'Notes']
        for filepath in result_files:
            results = self.load_result_file(filepath)
            model_name = results['metadata']['model_name']
            for test in results.get('test_results', []):
                csv_data.append([
                    model_name,
                    test.get('test_id', ''),
                    test.get('test_name', ''),
                    test.get('category', ''),
                    test.get('type', ''),
                    test.get('difficulty', ''),
                    test.get('score') or test.get('overall_score', ''),
                    test.get('notes', '')
                ])

        # Write CSV
        output_path = self.results_dir / output_file
        with open(output_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(headers)
            writer.writerows(csv_data)
        print(f"✅ CSV exported to: {output_path}")

def main():
    parser = argparse.ArgumentParser(
        description="Analyze and compare AI model evaluation results",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Compare all models
  python analyze_results.py --compare

  # Detailed report for specific model
  python analyze_results.py --detail "qwen3:4b-q4_K_M"

  # Export to CSV
  python analyze_results.py --export comparison.csv

  # Custom results directory
  python analyze_results.py --results-dir ./my_results --compare
"""
    )
    parser.add_argument(
        '--results-dir',
        default='results',
        help='Directory containing result JSON files (default: results)'
    )
    parser.add_argument(
        '--compare',
        action='store_true',
        help='Generate comparison report for all models'
    )
    parser.add_argument(
        '--detail',
        type=str,
        help='Generate detailed report for specific model'
    )
    parser.add_argument(
        '--export',
        type=str,
        help='Export results to CSV file'
    )

    args = parser.parse_args()
    analyzer = ResultsAnalyzer(results_dir=args.results_dir)

    if args.compare:
        result_files = analyzer.find_result_files()
        if result_files:
            analyzer.compare_models(result_files)
        else:
            print(f"❌ No result files found in {args.results_dir}")

    if args.detail:
        analyzer.generate_detailed_report(args.detail)

    if args.export:
        analyzer.export_csv(args.export)

    if not (args.compare or args.detail or args.export):
        parser.print_help()
if __name__ == "__main__":
    main()
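
Note on the expected input: the analyzer only relies on a handful of keys in each *_latest.json file (those accessed in the code above). A minimal sketch of the dict that load_result_file() is expected to return, assuming exactly those key names; every value below is an illustrative placeholder, not real data:

example_result = {
    "metadata": {
        "model_name": "example-model:4b",   # placeholder; used as the report key
        "test_start": "2026-01-16T09:00:00",
        "test_end": "2026-01-16T09:45:00",
        "completed_tests": 1,
        "total_tests": 1,
    },
    "test_results": [
        {
            "test_id": "T001",
            "test_name": "Example test",    # hypothetical test entry
            "category": "Reasoning",
            "type": "qa",
            "difficulty": "hard",
            "score": 4.5,                   # or "overall_score"; reported out of 5
            "notes": "placeholder note",
            "evaluation_criteria": ["criterion A", "criterion B"],
        },
    ],
}

Result files are looked up as <model>_latest.json in the results directory, with ':' in the model name replaced by '_', which is the pattern both find_result_files() and generate_detailed_report() use.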