#!/usr/bin/env python3
"""
AI Model Evaluation Results Analyzer
Compares results across different models and quantizations.
"""

import argparse
import csv
import json
from collections import defaultdict
from pathlib import Path
from typing import Dict, List
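
# Illustrative shape of one "<model>_latest.json" result file, inferred from the
# fields this script reads; real files may carry extra keys, and the values shown
# here are hypothetical:
#
# {
#   "metadata": {
#     "model_name": "qwen3:4b-q4_K_M",
#     "test_start": "...", "test_end": "...",
#     "completed_tests": 25, "total_tests": 25
#   },
#   "test_results": [
#     {
#       "test_id": "T01", "test_name": "...", "category": "...", "type": "...",
#       "difficulty": "medium",
#       "score": 4,                     # some entries use "overall_score" instead
#       "notes": "...", "evaluation_criteria": ["..."]
#     }
#   ]
# }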


class ResultsAnalyzer:
    def __init__(self, results_dir: str = "results"):
        self.results_dir = Path(results_dir)

    @staticmethod
    def _test_score(test: Dict):
        """Return 'score' if present, else fall back to 'overall_score'.

        Checks against None explicitly so a legitimate score of 0 is kept.
        """
        score = test.get('score')
        return score if score is not None else test.get('overall_score')

    def load_result_file(self, filepath: Path) -> Dict:
        """Load a single result file"""
        with open(filepath, 'r', encoding='utf-8') as f:
            return json.load(f)

    def find_result_files(self, pattern: str = "*_latest.json") -> List[Path]:
        """Find all result files matching pattern"""
        return sorted(self.results_dir.glob(pattern))
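
    # With the default pattern this returns paths like
    # results/qwen3_4b-q4_K_M_latest.json (name illustrative), i.e. one
    # "<model>_latest.json" file per evaluated model, sorted by file name.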

    def extract_scores_by_category(self, results: Dict) -> Dict[str, List[float]]:
        """Extract scores organized by category"""
        scores_by_category = defaultdict(list)

        for test in results.get('test_results', []):
            category = test.get('category', 'Unknown')
            score = self._test_score(test)

            if score is not None:
                scores_by_category[category].append(score)

        return dict(scores_by_category)
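
    # Illustrative programmatic use (hypothetical file name and categories; not
    # executed here):
    #   analyzer = ResultsAnalyzer("results")
    #   data = analyzer.load_result_file(Path("results/qwen3_4b-q4_K_M_latest.json"))
    #   analyzer.extract_scores_by_category(data)
    #   # -> {"Coding": [4, 5, 2], "Reasoning": [3, 1]}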

    def calculate_statistics(self, scores: List[float]) -> Dict:
        """Calculate statistics for a list of scores"""
        if not scores:
            return {
                'count': 0,
                'average': 0.0,
                'min': 0.0,
                'max': 0.0,
                'pass_rate': 0.0,
                'exceptional_rate': 0.0
            }

        return {
            'count': len(scores),
            'average': sum(scores) / len(scores),
            'min': min(scores),
            'max': max(scores),
            'pass_rate': len([s for s in scores if s >= 2]) / len(scores) * 100,
            'exceptional_rate': len([s for s in scores if s >= 4]) / len(scores) * 100
        }
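
    # Worked example (hypothetical scores on the 0-5 scale assumed by the reports):
    #   calculate_statistics([1, 2, 4, 5])
    #   -> {'count': 4, 'average': 3.0, 'min': 1, 'max': 5,
    #       'pass_rate': 75.0, 'exceptional_rate': 50.0}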

    def compare_models(self, result_files: List[Path]):
        """Generate comparison report for multiple models"""
        print("\n" + "="*100)
        print("📊 AI MODEL COMPARISON REPORT")
        print("="*100)

        # Load all results
        all_results = {}
        for filepath in result_files:
            try:
                results = self.load_result_file(filepath)
                model_name = results['metadata']['model_name']
                all_results[model_name] = results
            except Exception as e:
                print(f"⚠️ Error loading {filepath}: {e}")

        if not all_results:
            print("❌ No valid result files found")
            return

        # Overall comparison
        print("\n📈 OVERALL PERFORMANCE")
        print("-"*100)
        print(f"{'Model':<30} {'Total Tests':<12} {'Avg Score':<12} {'Pass Rate':<12} {'Exceptional':<12}")
        print("-"*100)

        model_stats = {}
        for model_name, results in sorted(all_results.items()):
            all_scores = [
                self._test_score(test)
                for test in results.get('test_results', [])
            ]
            all_scores = [s for s in all_scores if s is not None]

            stats = self.calculate_statistics(all_scores)
            model_stats[model_name] = stats

            print(f"{model_name:<30} {stats['count']:<12} {stats['average']:<12.2f} "
                  f"{stats['pass_rate']:<12.1f}% {stats['exceptional_rate']:<12.1f}%")

        # Category-wise comparison
        print("\n\n📂 CATEGORY-WISE PERFORMANCE")
        print("="*100)

        # Get all unique categories
        all_categories = set()
        for results in all_results.values():
            for test in results.get('test_results', []):
                all_categories.add(test.get('category', 'Unknown'))

        for category in sorted(all_categories):
            print(f"\n🔖 {category}")
            print("-"*100)
            print(f"{'Model':<30} {'Tests':<8} {'Avg Score':<12} {'Pass Rate':<12} {'Exceptional':<12}")
            print("-"*100)

            for model_name, results in sorted(all_results.items()):
                cat_scores = [
                    self._test_score(test)
                    for test in results.get('test_results', [])
                    if test.get('category') == category and self._test_score(test) is not None
                ]

                if cat_scores:
                    stats = self.calculate_statistics(cat_scores)
                    print(f"{model_name:<30} {stats['count']:<8} {stats['average']:<12.2f} "
                          f"{stats['pass_rate']:<12.1f}% {stats['exceptional_rate']:<12.1f}%")
                else:
                    print(f"{model_name:<30} {'N/A':<8} {'N/A':<12} {'N/A':<12} {'N/A':<12}")

        # Difficulty-based comparison
        print("\n\n⚡ DIFFICULTY-BASED PERFORMANCE")
        print("="*100)

        difficulties = ['medium', 'hard', 'very_hard']
        for difficulty in difficulties:
            print(f"\n🎯 Difficulty: {difficulty.replace('_', ' ').title()}")
            print("-"*100)
            print(f"{'Model':<30} {'Tests':<8} {'Avg Score':<12} {'Pass Rate':<12}")
            print("-"*100)

            for model_name, results in sorted(all_results.items()):
                diff_scores = [
                    self._test_score(test)
                    for test in results.get('test_results', [])
                    if test.get('difficulty') == difficulty and self._test_score(test) is not None
                ]

                if diff_scores:
                    stats = self.calculate_statistics(diff_scores)
                    print(f"{model_name:<30} {stats['count']:<8} {stats['average']:<12.2f} "
                          f"{stats['pass_rate']:<12.1f}%")
                else:
                    print(f"{model_name:<30} {'N/A':<8} {'N/A':<12} {'N/A':<12}")

        # Winner analysis
        print("\n\n🏆 WINNERS BY CATEGORY")
        print("="*100)

        for category in sorted(all_categories):
            best_model = None
            best_score = -1

            for model_name, results in all_results.items():
                cat_scores = [
                    self._test_score(test)
                    for test in results.get('test_results', [])
                    if test.get('category') == category and self._test_score(test) is not None
                ]

                if cat_scores:
                    avg = sum(cat_scores) / len(cat_scores)
                    if avg > best_score:
                        best_score = avg
                        best_model = model_name

            if best_model:
                print(f"{category:<50} → {best_model} ({best_score:.2f})")

        print("\n\n🎖️ OVERALL WINNER")
        print("="*100)
        best_overall = max(model_stats.items(), key=lambda x: x[1]['average'])
        print(f"Model: {best_overall[0]}")
        print(f"Average Score: {best_overall[1]['average']:.2f}/5.00")
        print(f"Pass Rate: {best_overall[1]['pass_rate']:.1f}%")
        print(f"Exceptional Rate: {best_overall[1]['exceptional_rate']:.1f}%")
        print("="*100)

    def generate_detailed_report(self, model_name: str):
        """Generate detailed report for a specific model"""
        # Find the result file for this model,
        # e.g. "qwen3:4b-q4_K_M" -> "qwen3_4b-q4_K_M_latest.json"
        pattern = f"{model_name.replace(':', '_')}_latest.json"
        filepath = self.results_dir / pattern

        if not filepath.exists():
            print(f"❌ No results found for model: {model_name}")
            return

        results = self.load_result_file(filepath)

        print("\n" + "="*100)
        print(f"📋 DETAILED REPORT: {model_name}")
        print("="*100)

        # Metadata
        metadata = results.get('metadata', {})
        print(f"\n⏱️ Test Duration: {metadata.get('test_start')} → {metadata.get('test_end')}")
        print(f"📊 Tests: {metadata.get('completed_tests')}/{metadata.get('total_tests')}")

        # Overall stats
        all_scores = [
            self._test_score(test)
            for test in results.get('test_results', [])
        ]
        all_scores = [s for s in all_scores if s is not None]
        stats = self.calculate_statistics(all_scores)

        print("\n📈 Overall Performance:")
        print(f" Average Score: {stats['average']:.2f}/5.00")
        print(f" Pass Rate: {stats['pass_rate']:.1f}%")
        print(f" Exceptional Rate: {stats['exceptional_rate']:.1f}%")
        print(f" Score Range: {stats['min']:.1f} - {stats['max']:.1f}")

        # Test-by-test results
        print("\n\n📝 TEST-BY-TEST RESULTS")
        print("="*100)

        for test in results.get('test_results', []):
            score = self._test_score(test)
            status_icon = "✅" if score and score >= 4 else "⚠️" if score and score >= 2 else "❌"

            print(f"\n{status_icon} [{test.get('test_id')}] {test.get('test_name')}")
            print(f" Category: {test.get('category')}")
            print(f" Type: {test.get('type')}")
            print(f" Difficulty: {test.get('difficulty', 'unknown')}")
            print(f" Score: {score if score is not None else 'N/A'}/5.00")

            if test.get('notes'):
                print(f" Notes: {test['notes']}")

            # Show criteria pass/fail if available
            if test.get('evaluation_criteria'):
                print(f" Criteria ({len(test['evaluation_criteria'])} items):")
                for criterion in test['evaluation_criteria']:
                    print(f" • {criterion}")

        print("\n" + "="*100)

    def export_csv(self, output_file: str = "comparison.csv"):
        """Export comparison data to CSV"""
        result_files = self.find_result_files()
        if not result_files:
            print("❌ No result files found")
            return

        # Prepare CSV data
        csv_data = []
        headers = ['Model', 'Test ID', 'Test Name', 'Category', 'Type', 'Difficulty', 'Score', 'Notes']

        for filepath in result_files:
            results = self.load_result_file(filepath)
            model_name = results['metadata']['model_name']

            for test in results.get('test_results', []):
                score = self._test_score(test)
                csv_data.append([
                    model_name,
                    test.get('test_id', ''),
                    test.get('test_name', ''),
                    test.get('category', ''),
                    test.get('type', ''),
                    test.get('difficulty', ''),
                    score if score is not None else '',
                    test.get('notes', '')
                ])

        # Write CSV
        output_path = self.results_dir / output_file
        with open(output_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(headers)
            writer.writerows(csv_data)

        print(f"✅ CSV exported to: {output_path}")


def main():
    parser = argparse.ArgumentParser(
        description="Analyze and compare AI model evaluation results",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Compare all models
  python analyze_results.py --compare

  # Detailed report for a specific model
  python analyze_results.py --detail "qwen3:4b-q4_K_M"

  # Export to CSV
  python analyze_results.py --export comparison.csv

  # Custom results directory
  python analyze_results.py --results-dir ./my_results --compare
"""
    )

    parser.add_argument(
        '--results-dir',
        default='results',
        help='Directory containing result JSON files (default: results)'
    )

    parser.add_argument(
        '--compare',
        action='store_true',
        help='Generate comparison report for all models'
    )

    parser.add_argument(
        '--detail',
        type=str,
        help='Generate detailed report for a specific model'
    )

    parser.add_argument(
        '--export',
        type=str,
        help='Export results to a CSV file'
    )

    args = parser.parse_args()

    analyzer = ResultsAnalyzer(results_dir=args.results_dir)

    if args.compare:
        result_files = analyzer.find_result_files()
        if result_files:
            analyzer.compare_models(result_files)
        else:
            print(f"❌ No result files found in {args.results_dir}")

    if args.detail:
        analyzer.generate_detailed_report(args.detail)

    if args.export:
        analyzer.export_csv(args.export)

    if not (args.compare or args.detail or args.export):
        parser.print_help()


if __name__ == "__main__":
    main()