#!/usr/bin/env python3
"""
AI Model Evaluation Results Analyzer

Compares results across different models and quantizations.
Includes an interactive web interface for visualization and analysis.
"""

import argparse
import csv
import json
from collections import defaultdict
from pathlib import Path
from typing import Dict, List

import numpy as np
from flask import Flask, jsonify, render_template
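

# Note: scores of 0 are legitimate on the 0-5 scale, but the
# ``test.get('score') or test.get('overall_score')`` idiom treats 0 as
# missing. This helper falls back to 'overall_score' only when 'score'
# is actually absent (None), so 0 scores are kept.
def _extract_score(test: Dict):
    """Return a test's score, preferring 'score' over 'overall_score'."""
    score = test.get('score')
    return score if score is not None else test.get('overall_score')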


class ResultsAnalyzer:
    def __init__(self, results_dir: str = "results"):
        self.results_dir = Path(results_dir)

    def load_result_file(self, filepath: Path) -> Dict:
        """Load a single result file."""
        with open(filepath, 'r', encoding='utf-8') as f:
            return json.load(f)

    def find_result_files(self, pattern: str = "*_latest.json") -> List[Path]:
        """Find all result files matching the given glob pattern."""
        return sorted(self.results_dir.glob(pattern))

    def extract_scores_by_category(self, results: Dict) -> Dict[str, List[float]]:
        """Extract scores organized by category."""
        scores_by_category = defaultdict(list)

        for test in results.get('test_results', []):
            category = test.get('category', 'Unknown')
            score = _extract_score(test)

            if score is not None:
                scores_by_category[category].append(score)

        return dict(scores_by_category)

    def calculate_statistics(self, scores: List[float]) -> Dict:
        """Calculate summary statistics for a list of scores."""
        if not scores:
            return {
                'count': 0,
                'average': 0.0,
                'min': 0.0,
                'max': 0.0,
                'pass_rate': 0.0,
                'exceptional_rate': 0.0
            }

        return {
            'count': len(scores),
            'average': sum(scores) / len(scores),
            'min': min(scores),
            'max': max(scores),
            # On the 0-5 scale, >= 2 counts as a pass and >= 4 as exceptional.
            'pass_rate': len([s for s in scores if s >= 2]) / len(scores) * 100,
            'exceptional_rate': len([s for s in scores if s >= 4]) / len(scores) * 100
        }
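
    # Illustrative example (made-up values, not real results):
    #   calculate_statistics([1, 2, 4, 5]) ->
    #     {'count': 4, 'average': 3.0, 'min': 1, 'max': 5,
    #      'pass_rate': 75.0, 'exceptional_rate': 50.0}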

    def compare_models(self, result_files: List[Path]):
        """Print a comparison report for multiple models."""
        print("\n" + "="*100)
        print("📊 AI MODEL COMPARISON REPORT")
        print("="*100)

        # Load all results
        all_results = {}
        for filepath in result_files:
            try:
                results = self.load_result_file(filepath)
                model_name = results['metadata']['model_name']
                all_results[model_name] = results
            except Exception as e:
                print(f"⚠️ Error loading {filepath}: {e}")

        if not all_results:
            print("❌ No valid result files found")
            return

        # Overall comparison
        print("\n📈 OVERALL PERFORMANCE")
        print("-"*100)
        print(f"{'Model':<30} {'Total Tests':<12} {'Avg Score':<12} {'Pass Rate':<12} {'Exceptional':<12}")
        print("-"*100)

        model_stats = {}
        for model_name, results in sorted(all_results.items()):
            all_scores = [_extract_score(test) for test in results.get('test_results', [])]
            all_scores = [s for s in all_scores if s is not None]

            stats = self.calculate_statistics(all_scores)
            model_stats[model_name] = stats

            print(f"{model_name:<30} {stats['count']:<12} {stats['average']:<12.2f} "
                  f"{stats['pass_rate']:<12.1f}% {stats['exceptional_rate']:<12.1f}%")

        # Category-wise comparison
        print("\n\n📂 CATEGORY-WISE PERFORMANCE")
        print("="*100)

        # Collect all unique categories
        all_categories = set()
        for results in all_results.values():
            for test in results.get('test_results', []):
                all_categories.add(test.get('category', 'Unknown'))

        for category in sorted(all_categories):
            print(f"\n🔖 {category}")
            print("-"*100)
            print(f"{'Model':<30} {'Tests':<8} {'Avg Score':<12} {'Pass Rate':<12} {'Exceptional':<12}")
            print("-"*100)

            for model_name, results in sorted(all_results.items()):
                cat_scores = [
                    _extract_score(test)
                    for test in results.get('test_results', [])
                    if test.get('category') == category and _extract_score(test) is not None
                ]

                if cat_scores:
                    stats = self.calculate_statistics(cat_scores)
                    print(f"{model_name:<30} {stats['count']:<8} {stats['average']:<12.2f} "
                          f"{stats['pass_rate']:<12.1f}% {stats['exceptional_rate']:<12.1f}%")
                else:
                    print(f"{model_name:<30} {'N/A':<8} {'N/A':<12} {'N/A':<12} {'N/A':<12}")

        # Difficulty-based comparison
        print("\n\n⚡ DIFFICULTY-BASED PERFORMANCE")
        print("="*100)

        difficulties = ['medium', 'hard', 'very_hard']
        for difficulty in difficulties:
            print(f"\n🎯 Difficulty: {difficulty.replace('_', ' ').title()}")
            print("-"*100)
            print(f"{'Model':<30} {'Tests':<8} {'Avg Score':<12} {'Pass Rate':<12}")
            print("-"*100)

            for model_name, results in sorted(all_results.items()):
                diff_scores = [
                    _extract_score(test)
                    for test in results.get('test_results', [])
                    if test.get('difficulty') == difficulty and _extract_score(test) is not None
                ]

                if diff_scores:
                    stats = self.calculate_statistics(diff_scores)
                    print(f"{model_name:<30} {stats['count']:<8} {stats['average']:<12.2f} "
                          f"{stats['pass_rate']:<12.1f}%")
                else:
                    print(f"{model_name:<30} {'N/A':<8} {'N/A':<12} {'N/A':<12}")

        # Winner analysis
        print("\n\n🏆 WINNERS BY CATEGORY")
        print("="*100)

        for category in sorted(all_categories):
            best_model = None
            best_score = -1

            for model_name, results in all_results.items():
                cat_scores = [
                    _extract_score(test)
                    for test in results.get('test_results', [])
                    if test.get('category') == category and _extract_score(test) is not None
                ]

                if cat_scores:
                    avg = sum(cat_scores) / len(cat_scores)
                    if avg > best_score:
                        best_score = avg
                        best_model = model_name

            if best_model:
                print(f"{category:<50} → {best_model} ({best_score:.2f})")

        print("\n\n🎖️ OVERALL WINNER")
        print("="*100)
        best_overall = max(model_stats.items(), key=lambda x: x[1]['average'])
        print(f"Model: {best_overall[0]}")
        print(f"Average Score: {best_overall[1]['average']:.2f}/5.00")
        print(f"Pass Rate: {best_overall[1]['pass_rate']:.1f}%")
        print(f"Exceptional Rate: {best_overall[1]['exceptional_rate']:.1f}%")
        print("="*100)

    def generate_detailed_report(self, model_name: str):
        """Print a detailed report for a specific model."""
        # Find the result file for this model
        pattern = f"{model_name.replace(':', '_')}_latest.json"
        filepath = self.results_dir / pattern

        if not filepath.exists():
            print(f"❌ No results found for model: {model_name}")
            return

        results = self.load_result_file(filepath)

        print("\n" + "="*100)
        print(f"📋 DETAILED REPORT: {model_name}")
        print("="*100)

        # Metadata
        metadata = results.get('metadata', {})
        print(f"\n⏱️ Test Duration: {metadata.get('test_start')} → {metadata.get('test_end')}")
        print(f"📊 Tests: {metadata.get('completed_tests')}/{metadata.get('total_tests')}")

        # Overall stats
        all_scores = [_extract_score(test) for test in results.get('test_results', [])]
        all_scores = [s for s in all_scores if s is not None]
        stats = self.calculate_statistics(all_scores)

        print("\n📈 Overall Performance:")
        print(f"   Average Score: {stats['average']:.2f}/5.00")
        print(f"   Pass Rate: {stats['pass_rate']:.1f}%")
        print(f"   Exceptional Rate: {stats['exceptional_rate']:.1f}%")
        print(f"   Score Range: {stats['min']:.1f} - {stats['max']:.1f}")

        # Test-by-test results
        print("\n\n📝 TEST-BY-TEST RESULTS")
        print("="*100)

        for test in results.get('test_results', []):
            score = _extract_score(test)
            status_icon = "✅" if score and score >= 4 else "⚠️" if score and score >= 2 else "❌"

            print(f"\n{status_icon} [{test.get('test_id')}] {test.get('test_name')}")
            print(f"   Category: {test.get('category')}")
            print(f"   Type: {test.get('type')}")
            print(f"   Difficulty: {test.get('difficulty', 'unknown')}")
            print(f"   Score: {score if score is not None else 'N/A'}/5.00")

            if test.get('notes'):
                print(f"   Notes: {test['notes']}")

            # Show criteria pass/fail if available
            if test.get('evaluation_criteria'):
                print(f"   Criteria ({len(test['evaluation_criteria'])} items):")
                for criterion in test['evaluation_criteria']:
                    print(f"      • {criterion}")

        print("\n" + "="*100)

    def export_csv(self, output_file: str = "comparison.csv"):
        """Export comparison data for all models to a CSV file."""
        result_files = self.find_result_files()
        if not result_files:
            print("❌ No result files found")
            return

        # Prepare CSV data
        csv_data = []
        headers = ['Model', 'Test ID', 'Test Name', 'Category', 'Type', 'Difficulty', 'Score', 'Notes']

        for filepath in result_files:
            results = self.load_result_file(filepath)
            model_name = results['metadata']['model_name']

            for test in results.get('test_results', []):
                score = _extract_score(test)
                csv_data.append([
                    model_name,
                    test.get('test_id', ''),
                    test.get('test_name', ''),
                    test.get('category', ''),
                    test.get('type', ''),
                    test.get('difficulty', ''),
                    '' if score is None else score,
                    test.get('notes', '')
                ])

        # Write CSV
        output_path = self.results_dir / output_file
        with open(output_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(headers)
            writer.writerows(csv_data)

        print(f"✅ CSV exported to: {output_path}")
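

# Example (illustrative, assuming result files exist under ./results):
#
#   analyzer = ResultsAnalyzer("results")
#   analyzer.compare_models(analyzer.find_result_files())
#   analyzer.export_csv("comparison.csv")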


class WebInterface:
    """Interactive web interface for results analysis"""

    def __init__(self, results_dir: str = "results"):
        self.results_dir = Path(results_dir)
        self.analyzer = ResultsAnalyzer(results_dir)
        self.app = Flask(__name__)
        self.setup_routes()

    def setup_routes(self):
        """Register the Flask routes."""

        @self.app.route('/')
        def index():
            """Main dashboard"""
            return render_template('dashboard.html')

        @self.app.route('/api/models')
        def get_models():
            """Get the list of all available models."""
            result_files = self.analyzer.find_result_files()
            models = []

            for filepath in result_files:
                try:
                    results = self.analyzer.load_result_file(filepath)
                    metadata = results.get('metadata', {})
                    models.append({
                        'name': metadata.get('model_name', 'Unknown'),
                        'file': filepath.name,
                        'total_tests': metadata.get('total_tests', 0),
                        'completed_tests': metadata.get('completed_tests', 0),
                        'test_start': metadata.get('test_start'),
                        'test_end': metadata.get('test_end')
                    })
                except Exception as e:
                    print(f"Error loading {filepath}: {e}")

            return jsonify(models)

        @self.app.route('/api/results/<model_name>')
        def get_model_results(model_name):
            """Get detailed results for a specific model."""
            pattern = f"{model_name.replace(':', '_')}_latest.json"
            filepath = self.results_dir / pattern

            if not filepath.exists():
                return jsonify({'error': 'Model not found'}), 404

            results = self.analyzer.load_result_file(filepath)
            return jsonify(results)

        @self.app.route('/api/comparison')
        def get_comparison():
            """Get comparison data for all models."""
            result_files = self.analyzer.find_result_files()
            comparison_data = {
                'models': {},
                'categories': set(),
                'difficulty_levels': set()
            }

            for filepath in result_files:
                try:
                    results = self.analyzer.load_result_file(filepath)
                    model_name = results['metadata']['model_name']

                    # Extract all scores and metadata
                    test_results = []
                    for test in results.get('test_results', []):
                        score = _extract_score(test)

                        # Handle notes differently for multi-turn vs single-turn tests
                        if test.get('type') == 'multi_turn' and 'turns' in test:
                            # Combine notes from all turns for multi-turn tests
                            turn_notes = []
                            for turn in test.get('turns', []):
                                turn_num = turn.get('turn', '?')
                                turn_note = turn.get('notes', '')
                                if turn_note:
                                    turn_notes.append(f"T{turn_num}: {turn_note}")
                            notes = ' | '.join(turn_notes) if turn_notes else ''

                            # Aggregate generation time and metrics across turns
                            total_gen_time = sum(t.get('generation_time', 0) for t in test.get('turns', []))
                            api_metrics = test.get('aggregate_metrics', {})
                        else:
                            notes = test.get('notes', '')
                            total_gen_time = test.get('generation_time')
                            api_metrics = test.get('api_metrics')

                        test_data = {
                            'test_id': test.get('test_id'),
                            'test_name': test.get('test_name'),
                            'category': test.get('category', 'Unknown'),
                            'type': test.get('type'),
                            'difficulty': test.get('difficulty', 'medium'),
                            'score': score,
                            'status': test.get('status'),
                            'notes': notes,
                            'generation_time': total_gen_time,
                            'api_metrics': api_metrics
                        }
                        test_results.append(test_data)

                        if test_data['category']:
                            comparison_data['categories'].add(test_data['category'])
                        if test_data['difficulty']:
                            comparison_data['difficulty_levels'].add(test_data['difficulty'])

                    # Calculate overall statistics
                    all_scores = [t['score'] for t in test_results if t['score'] is not None]
                    stats = self.analyzer.calculate_statistics(all_scores)

                    # Calculate category statistics
                    category_stats = {}
                    for category in comparison_data['categories']:
                        cat_scores = [t['score'] for t in test_results
                                      if t['category'] == category and t['score'] is not None]
                        if cat_scores:
                            category_stats[category] = self.analyzer.calculate_statistics(cat_scores)

                    # Calculate difficulty statistics
                    difficulty_stats = {}
                    for difficulty in comparison_data['difficulty_levels']:
                        diff_scores = [t['score'] for t in test_results
                                       if t['difficulty'] == difficulty and t['score'] is not None]
                        if diff_scores:
                            difficulty_stats[difficulty] = self.analyzer.calculate_statistics(diff_scores)

                    comparison_data['models'][model_name] = {
                        'test_results': test_results,
                        'overall_stats': stats,
                        'category_stats': category_stats,
                        'difficulty_stats': difficulty_stats,
                        'metadata': results.get('metadata', {})
                    }

                except Exception as e:
                    print(f"Error processing {filepath}: {e}")

            # Convert sets to sorted lists for JSON serialization
            comparison_data['categories'] = sorted(comparison_data['categories'])
            comparison_data['difficulty_levels'] = sorted(comparison_data['difficulty_levels'])

            return jsonify(comparison_data)
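
        # Shape of the /api/comparison response (sketch; actual values
        # depend on the result files present):
        #
        #   {
        #     "models": {"<model>": {"test_results": [...],
        #                            "overall_stats": {...},
        #                            "category_stats": {...},
        #                            "difficulty_stats": {...},
        #                            "metadata": {...}}},
        #     "categories": [...],
        #     "difficulty_levels": [...]
        #   }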

        @self.app.route('/api/statistics')
        def get_statistics():
            """Get advanced statistical analysis."""
            result_files = self.analyzer.find_result_files()

            all_data = []
            model_names = []

            for filepath in result_files:
                try:
                    results = self.analyzer.load_result_file(filepath)
                    model_name = results['metadata']['model_name']
                    model_names.append(model_name)

                    scores = [_extract_score(test)
                              for test in results.get('test_results', [])]
                    scores = [s for s in scores if s is not None]
                    all_data.append(scores)
                except Exception as e:
                    print(f"Error in statistics: {e}")

            # Calculate advanced statistics
            statistics = {
                'models': model_names,
                'variance': [],
                'std_dev': [],
                'consistency_score': [],
                'robustness_score': []
            }

            for scores in all_data:
                if scores:
                    variance = np.var(scores)
                    std_dev = np.std(scores)

                    # Consistency: lower spread is better; map std_dev onto a
                    # 0-100 scale (a std_dev of 5 or more scores 0).
                    consistency = max(0, 100 - (std_dev * 20))

                    # Robustness: average (rescaled to 0-100) damped by consistency
                    avg_score = np.mean(scores)
                    robustness = (avg_score * 20) * (consistency / 100)

                    statistics['variance'].append(float(variance))
                    statistics['std_dev'].append(float(std_dev))
                    statistics['consistency_score'].append(float(consistency))
                    statistics['robustness_score'].append(float(robustness))
                else:
                    statistics['variance'].append(0)
                    statistics['std_dev'].append(0)
                    statistics['consistency_score'].append(0)
                    statistics['robustness_score'].append(0)

            return jsonify(statistics)
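
        # Worked example for the formulas above (illustrative numbers):
        # scores with std_dev 0.5 give consistency 100 - 0.5*20 = 90;
        # with an average of 4.0, robustness is (4.0*20) * (90/100) = 72.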

        @self.app.route('/api/intelligence_metrics')
        def get_intelligence_metrics():
            """Calculate intelligence evaluation metrics."""
            result_files = self.analyzer.find_result_files()

            metrics = {}

            for filepath in result_files:
                try:
                    results = self.analyzer.load_result_file(filepath)
                    model_name = results['metadata']['model_name']
                    test_results = results.get('test_results', [])

                    # Map each intelligence dimension to its test categories
                    dimensions = {
                        'logical_reasoning': ['Logic & Reasoning'],
                        'mathematical_ability': ['Mathematics & Calculation'],
                        'instruction_following': ['Instruction Following', 'Multi-turn: Instruction Following'],
                        'creativity': ['Creative Writing'],
                        'technical_knowledge': [
                            'Code Generation',
                            'IT Forensics - File Systems',
                            'IT Forensics - Registry & Artifacts',
                            'IT Forensics - Memory & Network',
                            'IT Forensics - Timeline & Log Analysis'
                        ],
                        'linguistic_nuance': ['Language Nuance', 'Multilingual Competence'],
                        'problem_solving': ['Problem Solving & Logistics'],
                        'conversational_depth': ['Multi-turn: Context Retention']
                    }

                    model_metrics = {}

                    for dimension, categories in dimensions.items():
                        dim_scores = [
                            _extract_score(test)
                            for test in test_results
                            if test.get('category') in categories and
                            _extract_score(test) is not None
                        ]

                        if dim_scores:
                            avg = sum(dim_scores) / len(dim_scores)
                            model_metrics[dimension] = {
                                'score': avg,
                                'normalized': (avg / 5.0) * 100,
                                'count': len(dim_scores)
                            }
                        else:
                            model_metrics[dimension] = {
                                'score': 0,
                                'normalized': 0,
                                'count': 0
                            }

                    # Calculate an overall intelligence quotient (IQ score)
                    weighted_scores = []
                    weights = {
                        'logical_reasoning': 1.5,  # higher weight for core reasoning
                        'mathematical_ability': 1.3,
                        'instruction_following': 1.2,
                        'creativity': 1.0,
                        'technical_knowledge': 1.4,
                        'linguistic_nuance': 1.1,
                        'problem_solving': 1.4,
                        'conversational_depth': 1.0
                    }

                    total_weight = 0
                    for dim, data in model_metrics.items():
                        if data['count'] > 0:
                            weighted_scores.append(data['score'] * weights[dim])
                            total_weight += weights[dim]

                    # Weighted average on the 0-5 scale, rescaled to 0-100
                    iq_score = (sum(weighted_scores) / total_weight * 20) if total_weight > 0 else 0

                    # Adaptability: share of categories with an average of at least 2.5
                    category_scores = {}
                    for test in test_results:
                        cat = test.get('category')
                        score = _extract_score(test)
                        if cat and score is not None:
                            category_scores.setdefault(cat, []).append(score)

                    adaptability = len([cat for cat, scores in category_scores.items()
                                        if sum(scores) / len(scores) >= 2.5]) / max(len(category_scores), 1) * 100

                    # Problem-solving depth: performance on hard problems, rescaled to 0-100
                    hard_scores = [
                        _extract_score(test)
                        for test in test_results
                        if test.get('difficulty') in ['hard', 'very_hard'] and
                        _extract_score(test) is not None
                    ]

                    problem_solving_depth = (sum(hard_scores) / len(hard_scores) * 20) if hard_scores else 0

                    metrics[model_name] = {
                        'dimensions': model_metrics,
                        'iq_score': iq_score,
                        'adaptability': adaptability,
                        'problem_solving_depth': problem_solving_depth,
                        # Blend: 50% IQ, 30% adaptability, 20% problem-solving depth
                        'overall_intelligence': (iq_score * 0.5 + adaptability * 0.3 + problem_solving_depth * 0.2)
                    }

                except Exception as e:
                    print(f"Error calculating intelligence metrics: {e}")

            return jsonify(metrics)

    def run(self, host='127.0.0.1', port=5000, debug=False):
        """Start the web server."""
        # Create the templates directory if it doesn't exist
        templates_dir = Path(__file__).parent / 'templates'
        templates_dir.mkdir(exist_ok=True)

        # Generate the HTML template
        self.create_dashboard_template(templates_dir)

        print(f"\n🌐 Starting web interface at http://{host}:{port}")
        print("📊 Open the above URL in your browser to view the dashboard")
        print("🛑 Press Ctrl+C to stop the server\n")

        self.app.run(host=host, port=port, debug=debug)

    def create_dashboard_template(self, templates_dir: Path):
        """Write the HTML dashboard template into the templates directory."""
        html_content = '''<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>LLM Evaluation Dashboard</title>
    <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
    <script src="https://cdn.jsdelivr.net/npm/axios/dist/axios.min.js"></script>
    <style>
        * {
            margin: 0;
            padding: 0;
            box-sizing: border-box;
        }

        :root {
            --bg-gradient-start: #667eea;
            --bg-gradient-end: #764ba2;
            --card-bg: #ffffff;
            --text-primary: #333333;
            --text-secondary: #666666;
            --border-color: #e0e0e0;
            --stat-card-bg: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
            --shadow: rgba(0,0,0,0.1);
            --shadow-hover: rgba(0,0,0,0.15);
        }

        body.dark-mode {
            --bg-gradient-start: #1a1a2e;
            --bg-gradient-end: #16213e;
            --card-bg: #0f1419;
            --text-primary: #e0e0e0;
            --text-secondary: #a0a0a0;
            --border-color: #2a2a3e;
            --stat-card-bg: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);
            --shadow: rgba(0,0,0,0.3);
            --shadow-hover: rgba(0,0,0,0.5);
        }

        body {
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
            background: linear-gradient(135deg, var(--bg-gradient-start) 0%, var(--bg-gradient-end) 100%);
            color: var(--text-primary);
            min-height: 100vh;
            padding: 20px;
            transition: all 0.3s ease;
        }

        .container {
            max-width: 1400px;
            margin: 0 auto;
        }

        header {
            background: var(--card-bg);
            padding: 30px;
            border-radius: 15px;
            box-shadow: 0 10px 40px var(--shadow);
            margin-bottom: 30px;
            position: relative;
        }

        .theme-toggle {
            position: absolute;
            top: 30px;
            right: 30px;
            background: var(--border-color);
            border: none;
            padding: 10px 20px;
            border-radius: 20px;
            cursor: pointer;
            font-size: 1em;
            transition: all 0.3s;
        }

        .theme-toggle:hover {
            transform: scale(1.05);
            box-shadow: 0 4px 15px var(--shadow-hover);
        }

        .scale-toggle {
            position: absolute;
            top: 30px;
            right: 140px;
            background: var(--border-color);
            border: none;
            padding: 10px 20px;
            border-radius: 20px;
            cursor: pointer;
            font-size: 1em;
            transition: all 0.3s;
        }

        .scale-toggle:hover {
            transform: scale(1.05);
            box-shadow: 0 4px 15px var(--shadow-hover);
        }

        .scale-toggle.zoomed {
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
        }

        h1 {
            font-size: 2.5em;
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            -webkit-background-clip: text;
            -webkit-text-fill-color: transparent;
            margin-bottom: 10px;
        }

        .subtitle {
            color: var(--text-secondary);
            font-size: 1.1em;
        }

        .tabs {
            display: flex;
            gap: 10px;
            margin-bottom: 20px;
            flex-wrap: wrap;
        }

        .tab {
            background: var(--card-bg);
            border: none;
            padding: 12px 24px;
            border-radius: 8px;
            cursor: pointer;
            font-size: 1em;
            transition: all 0.3s;
            box-shadow: 0 2px 10px var(--shadow);
            color: var(--text-primary);
        }

        .tab:hover {
            transform: translateY(-2px);
            box-shadow: 0 4px 15px var(--shadow-hover);
        }

        .tab.active {
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
        }

        .content-panel {
            display: none;
            background: var(--card-bg);
            padding: 30px;
            border-radius: 15px;
            box-shadow: 0 10px 40px var(--shadow);
            animation: fadeIn 0.3s;
        }

        .content-panel.active {
            display: block;
        }

        @keyframes fadeIn {
            from { opacity: 0; transform: translateY(10px); }
            to { opacity: 1; transform: translateY(0); }
        }

        .stats-grid {
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
            gap: 20px;
            margin-bottom: 30px;
        }

        .stat-card {
            background: var(--stat-card-bg);
            padding: 20px;
            border-radius: 10px;
            text-align: center;
        }

        .stat-card h3 {
            font-size: 0.9em;
            color: var(--text-secondary);
            margin-bottom: 10px;
            text-transform: uppercase;
        }

        .stat-card .value {
            font-size: 2.5em;
            font-weight: bold;
            color: #667eea;
        }

        .chart-container {
            position: relative;
            height: 400px;
            margin-bottom: 30px;
        }

        .controls {
            display: flex;
            gap: 15px;
            margin-bottom: 20px;
            flex-wrap: wrap;
        }

        select, input {
            padding: 10px 15px;
            border: 2px solid var(--border-color);
            border-radius: 8px;
            font-size: 1em;
            background: var(--card-bg);
            color: var(--text-primary);
            cursor: pointer;
            transition: border-color 0.3s;
        }

        select:hover, input:hover {
            border-color: #667eea;
        }

        select:focus, input:focus {
            outline: none;
            border-color: #764ba2;
        }

        table {
            width: 100%;
            border-collapse: collapse;
            margin-top: 20px;
        }

        th, td {
            padding: 12px;
            text-align: left;
            border-bottom: 1px solid var(--border-color);
        }

        th {
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            font-weight: 600;
            cursor: pointer;
            user-select: none;
        }

        th:hover {
            opacity: 0.9;
        }

        tr:hover {
            background: var(--border-color);
        }

        .score-badge {
            display: inline-block;
            padding: 5px 12px;
            border-radius: 20px;
            font-weight: bold;
            font-size: 0.9em;
        }

        .score-exceptional {
            background: #10b981;
            color: white;
        }

        .score-pass {
            background: #f59e0b;
            color: white;
        }

        .score-fail {
            background: #ef4444;
            color: white;
        }

        .loading {
            text-align: center;
            padding: 40px;
            color: var(--text-secondary);
        }

        .spinner {
            border: 3px solid var(--border-color);
            border-top: 3px solid #667eea;
            border-radius: 50%;
            width: 40px;
            height: 40px;
            animation: spin 1s linear infinite;
            margin: 20px auto;
        }

        @keyframes spin {
            0% { transform: rotate(0deg); }
            100% { transform: rotate(360deg); }
        }

        .model-selector {
            display: flex;
            gap: 10px;
            flex-wrap: wrap;
            margin-bottom: 20px;
        }

        .model-chip {
            padding: 8px 16px;
            border-radius: 20px;
            border: 2px solid #667eea;
            background: var(--card-bg);
            color: var(--text-primary);
            cursor: pointer;
            transition: all 0.3s;
        }

        .model-chip:hover {
            background: #667eea;
            color: white;
        }

        .model-chip.selected {
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
        }

        .metric-card {
            background: var(--card-bg);
            border: 2px solid var(--border-color);
            border-radius: 10px;
            padding: 20px;
            margin-bottom: 20px;
        }

        .metric-card h3 {
            color: #667eea;
            margin-bottom: 15px;
        }

        .progress-bar {
            background: var(--border-color);
            height: 30px;
            border-radius: 15px;
            overflow: hidden;
            margin: 10px 0;
            position: relative;
            cursor: help;
        }

        .progress-fill {
            height: 100%;
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            transition: width 0.5s;
            display: flex;
            align-items: center;
            justify-content: flex-end;
            padding-right: 10px;
            color: white;
            font-weight: bold;
        }

        /* Tooltip styles */
        .tooltip {
            position: relative;
            display: inline-block;
        }

        .tooltip .tooltiptext {
            visibility: hidden;
            width: 300px;
            background-color: rgba(0, 0, 0, 0.9);
            color: #fff;
            text-align: left;
            border-radius: 8px;
            padding: 12px;
            position: absolute;
            z-index: 1000;
            bottom: 125%;
            left: 50%;
            margin-left: -150px;
            opacity: 0;
            transition: opacity 0.3s;
            font-size: 0.85em;
            line-height: 1.4;
            box-shadow: 0 4px 20px rgba(0,0,0,0.3);
        }

        .tooltip .tooltiptext::after {
            content: "";
            position: absolute;
            top: 100%;
            left: 50%;
            margin-left: -5px;
            border-width: 5px;
            border-style: solid;
            border-color: rgba(0, 0, 0, 0.9) transparent transparent transparent;
        }

        .tooltip:hover .tooltiptext {
            visibility: visible;
            opacity: 1;
        }

        .tooltiptext code {
            background: rgba(255, 255, 255, 0.1);
            padding: 2px 6px;
            border-radius: 3px;
            font-family: monospace;
            font-size: 0.9em;
        }

        .tooltiptext strong {
            color: #667eea;
        }
    </style>
</head>
<body>
    <div class="container">
        <header>
            <button class="scale-toggle" id="scaleToggle" onclick="toggleScale()" title="Toggle between the full scale (0-5) and a zoomed view for better distinction">🔍 Full Scale</button>
            <button class="theme-toggle" onclick="toggleTheme()">🌓 Toggle Dark Mode</button>
            <h1>🧠 LLM Evaluation Dashboard</h1>
            <p class="subtitle">Comprehensive Intelligence & Performance Analysis</p>
        </header>

        <div class="tabs">
            <button class="tab active" onclick="switchTab('overview', this)">📊 Overview</button>
            <button class="tab" onclick="switchTab('comparison', this)">⚔️ Model Comparison</button>
            <button class="tab" onclick="switchTab('intelligence', this)">🎯 Intelligence Metrics</button>
            <button class="tab" onclick="switchTab('categories', this)">📂 Category Analysis</button>
            <button class="tab" onclick="switchTab('details', this)">🔍 Detailed Results</button>
        </div>

        <div id="overview" class="content-panel active">
            <h2>System Overview</h2>
            <div class="stats-grid" id="overviewStats">
                <div class="loading">
                    <div class="spinner"></div>
                    Loading data...
                </div>
            </div>
            <div class="chart-container">
                <canvas id="overviewChart"></canvas>
            </div>
        </div>

        <div id="comparison" class="content-panel">
            <h2>Model Performance Comparison</h2>
            <div class="controls">
                <select id="metricSelect" onchange="updateComparisonChart()">
                    <option value="average">Average Score</option>
                    <option value="pass_rate">Pass Rate</option>
                    <option value="exceptional_rate">Exceptional Rate</option>
                    <option value="consistency">Consistency</option>
                    <option value="robustness">Robustness</option>
                </select>
            </div>
            <div class="chart-container">
                <canvas id="comparisonChart"></canvas>
            </div>
        </div>

        <div id="intelligence" class="content-panel">
            <h2>Intelligence Metrics Analysis</h2>
            <p style="margin-bottom: 20px; color: #666;">
                Advanced metrics evaluating different dimensions of AI intelligence and reasoning capabilities.
            </p>
            <div id="intelligenceMetrics">
                <div class="loading">
                    <div class="spinner"></div>
                    Calculating intelligence metrics...
                </div>
            </div>
        </div>

        <div id="categories" class="content-panel">
            <h2>Performance by Category</h2>
            <div class="controls">
                <select id="categorySelect" onchange="updateCategoryChart()">
                    <option value="">Loading categories...</option>
                </select>
            </div>
            <div class="chart-container">
                <canvas id="categoryChart"></canvas>
            </div>
        </div>

        <div id="details" class="content-panel">
            <h2>Detailed Test Results</h2>
            <div class="controls">
                <select id="modelSelect" onchange="loadModelDetails()">
                    <option value="">Select a model...</option>
                </select>
                <input type="text" id="searchInput" placeholder="Search tests..." onkeyup="filterTable()">
                <select id="filterCategory" onchange="filterTable()">
                    <option value="">All Categories</option>
                </select>
                <select id="filterScore" onchange="filterTable()">
                    <option value="">All Scores</option>
                    <option value="exceptional">Exceptional (4-5)</option>
                    <option value="pass">Pass (2-3)</option>
                    <option value="fail">Fail (0-1)</option>
                </select>
            </div>
            <div id="detailsTable">
                <p class="loading">Select a model to view detailed results</p>
            </div>
        </div>
    </div>

    <script>
        let comparisonData = null;
        let statisticsData = null;
        let intelligenceData = null;
        let currentModelDetails = null;
        let zoomedScale = false;
        let overviewChartInstance = null;

        // Theme toggle functionality
        function toggleTheme() {
            document.body.classList.toggle('dark-mode');
            const isDark = document.body.classList.contains('dark-mode');
            localStorage.setItem('darkMode', isDark ? 'enabled' : 'disabled');
        }

        // Load theme preference
        function loadThemePreference() {
            const darkMode = localStorage.getItem('darkMode');
            if (darkMode === 'enabled') {
                document.body.classList.add('dark-mode');
            }
        }

        // Scale toggle functionality
        function toggleScale() {
            zoomedScale = !zoomedScale;
            const btn = document.getElementById('scaleToggle');
            if (zoomedScale) {
                btn.textContent = '🔎 Zoomed';
                btn.classList.add('zoomed');
            } else {
                btn.textContent = '🔍 Full Scale';
                btn.classList.remove('zoomed');
            }
            localStorage.setItem('zoomedScale', zoomedScale ? 'enabled' : 'disabled');
            // Refresh all charts with the new scale
            refreshAllCharts();
        }

        // Load scale preference
        function loadScalePreference() {
            const savedScale = localStorage.getItem('zoomedScale');
            if (savedScale === 'enabled') {
                zoomedScale = true;
                const btn = document.getElementById('scaleToggle');
                btn.textContent = '🔎 Zoomed';
                btn.classList.add('zoomed');
            }
        }

        // Calculate axis options; axisMax is 5 for score charts and can be
        // overridden (e.g. 100) for percentage-based metrics.
        function getScaleOptions(data, isRadar = false, axisMax = 5) {
            if (!zoomedScale) {
                // Full scale: 0 to axisMax
                if (isRadar) {
                    return { r: { beginAtZero: true, max: axisMax } };
                }
                return { y: { beginAtZero: true, max: axisMax } };
            }

            // Zoomed scale: calculate min/max with padding
            const validData = data.filter(d => d !== null && d !== undefined && !isNaN(d));
            if (validData.length === 0) {
                if (isRadar) {
                    return { r: { beginAtZero: true, max: axisMax } };
                }
                return { y: { beginAtZero: true, max: axisMax } };
            }

            const minVal = Math.min(...validData);
            const maxVal = Math.max(...validData);
            const range = maxVal - minVal;
            const padding = Math.max(range * 0.2, 0.2); // at least 0.2 padding

            let min = Math.max(0, Math.floor((minVal - padding) * 10) / 10);
            let max = Math.min(axisMax, Math.ceil((maxVal + padding) * 10) / 10);

            // Ensure the axis spans at least some range
            if (max - min < 0.5) {
                min = Math.max(0, minVal - 0.3);
                max = Math.min(axisMax, maxVal + 0.3);
            }

            if (isRadar) {
                return { r: { min: min, max: max, beginAtZero: false } };
            }
            return { y: { min: min, max: max, beginAtZero: false } };
        }
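
        // Worked example (illustrative): with the zoomed scale on and data
        // [3.2, 3.8], range = 0.6 and padding = max(0.6*0.2, 0.2) = 0.2,
        // so the y-axis runs from 3.0 to 4.0 instead of 0 to 5.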

        // Refresh all charts when the scale changes
        function refreshAllCharts() {
            if (comparisonData) {
                refreshOverviewChart();
                updateComparisonChart();
                updateCategoryChart();
            }
        }

        // Tab switching (the clicked button is passed in from the inline handler)
        function switchTab(tabName, clickedTab) {
            document.querySelectorAll('.tab').forEach(t => t.classList.remove('active'));
            document.querySelectorAll('.content-panel').forEach(p => p.classList.remove('active'));

            clickedTab.classList.add('active');
            document.getElementById(tabName).classList.add('active');
        }

        // Initialize dashboard
        async function initDashboard() {
            loadThemePreference();
            loadScalePreference();
            await loadOverview();
            await loadComparison();
            await loadStatistics();
            await loadIntelligenceMetrics();
            populateModelSelector();
        }

        async function loadOverview() {
            try {
                const response = await axios.get('/api/comparison');
                comparisonData = response.data;

                const models = Object.keys(comparisonData.models);
                const totalTests = models.reduce((sum, model) =>
                    sum + (comparisonData.models[model].metadata.total_tests || 0), 0);
                const avgScore = models.length === 0 ? 0 : models.reduce((sum, model) =>
                    sum + (comparisonData.models[model].overall_stats.average || 0), 0) / models.length;

                const statsHtml = `
                    <div class="stat-card">
                        <h3>Models Evaluated</h3>
                        <div class="value">${models.length}</div>
                    </div>
                    <div class="stat-card">
                        <h3>Total Tests</h3>
                        <div class="value">${totalTests}</div>
                    </div>
                    <div class="stat-card">
                        <h3>Average Score</h3>
                        <div class="value">${avgScore.toFixed(2)}</div>
                    </div>
                    <div class="stat-card">
                        <h3>Categories</h3>
                        <div class="value">${comparisonData.categories.length}</div>
                    </div>
                `;

                document.getElementById('overviewStats').innerHTML = statsHtml;

                // Create the overview chart
                refreshOverviewChart();

            } catch (error) {
                console.error('Error loading overview:', error);
            }
        }

        function refreshOverviewChart() {
            if (!comparisonData) return;

            const models = Object.keys(comparisonData.models);
            const data = models.map(m => comparisonData.models[m].overall_stats.average || 0);

            if (overviewChartInstance) {
                overviewChartInstance.destroy();
            }

            const ctx = document.getElementById('overviewChart').getContext('2d');
            overviewChartInstance = new Chart(ctx, {
                type: 'bar',
                data: {
                    labels: models,
                    datasets: [{
                        label: 'Average Score',
                        data: data,
                        backgroundColor: 'rgba(102, 126, 234, 0.6)',
                        borderColor: 'rgba(102, 126, 234, 1)',
                        borderWidth: 2
                    }]
                },
                options: {
                    responsive: true,
                    maintainAspectRatio: false,
                    scales: getScaleOptions(data)
                }
            });
        }

        async function loadComparison() {
            updateComparisonChart();
        }

        async function updateComparisonChart() {
            if (!comparisonData) return;

            const metric = document.getElementById('metricSelect').value;
            const models = Object.keys(comparisonData.models);

            let data, label;

            if (metric === 'consistency' || metric === 'robustness') {
                if (!statisticsData) {
                    await loadStatistics();
                }
                // Look each model up by name: the /api/statistics arrays are
                // ordered by statisticsData.models, not by our local list.
                data = models.map(m => {
                    const idx = statisticsData.models.indexOf(m);
                    return idx >= 0 ? statisticsData[metric + '_score'][idx] : 0;
                });
                label = metric.charAt(0).toUpperCase() + metric.slice(1) + ' Score';
            } else {
                data = models.map(m => comparisonData.models[m].overall_stats[metric] || 0);
                label = metric.split('_').map(w => w.charAt(0).toUpperCase() + w.slice(1)).join(' ');
            }

            const ctx = document.getElementById('comparisonChart');
            if (window.comparisonChartInstance) {
                window.comparisonChartInstance.destroy();
            }

            window.comparisonChartInstance = new Chart(ctx, {
                type: 'radar',
                data: {
                    labels: models,
                    datasets: [{
                        label: label,
                        data: data,
                        backgroundColor: 'rgba(118, 75, 162, 0.2)',
                        borderColor: 'rgba(118, 75, 162, 1)',
                        pointBackgroundColor: 'rgba(118, 75, 162, 1)',
                        pointBorderColor: '#fff',
                        pointHoverBackgroundColor: '#fff',
                        pointHoverBorderColor: 'rgba(118, 75, 162, 1)'
                    }]
                },
                options: {
                    responsive: true,
                    maintainAspectRatio: false,
                    // Only 'average' is on the 0-5 scale; the other metrics are percentages
                    scales: getScaleOptions(data, true, metric === 'average' ? 5 : 100)
                }
            });
        }

        async function loadStatistics() {
            try {
                const response = await axios.get('/api/statistics');
                statisticsData = response.data;
            } catch (error) {
                console.error('Error loading statistics:', error);
            }
        }

        async function loadIntelligenceMetrics() {
            try {
                const response = await axios.get('/api/intelligence_metrics');
                intelligenceData = response.data;

                let html = '';

                // Must match the weights used server-side in get_intelligence_metrics()
                const dimensionWeights = {
                    'logical_reasoning': 1.5,
                    'mathematical_ability': 1.3,
                    'technical_knowledge': 1.4,
                    'problem_solving': 1.4,
                    'instruction_following': 1.2,
                    'linguistic_nuance': 1.1,
                    'creativity': 1.0,
                    'conversational_depth': 1.0
                };

                for (const [model, metrics] of Object.entries(intelligenceData)) {
                    html += `
                        <div class="metric-card">
                            <h3>${model}</h3>

                            <div style="margin-bottom: 20px;" class="tooltip">
                                <strong>Overall Intelligence Score:</strong>
                                <span class="tooltiptext">
                                    <strong>Calculation:</strong><br>
                                    Overall = (IQ × 0.5) + (Adaptability × 0.3) + (Problem-Solving × 0.2)<br><br>
                                    <strong>Values:</strong><br>
                                    • IQ: ${metrics.iq_score.toFixed(1)}<br>
                                    • Adaptability: ${metrics.adaptability.toFixed(1)}%<br>
                                    • Problem-Solving: ${metrics.problem_solving_depth.toFixed(1)}<br><br>
                                    Result: ${metrics.overall_intelligence.toFixed(1)}
                                </span>
                                <div class="progress-bar">
                                    <div class="progress-fill" style="width: ${metrics.overall_intelligence}%">
                                        ${metrics.overall_intelligence.toFixed(1)}
                                    </div>
                                </div>
                            </div>

                            <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); gap: 15px;">
                                <div class="tooltip">
                                    <strong>IQ Score:</strong>
                                    <span class="tooltiptext">
                                        <strong>Weighted Average of Dimensions:</strong><br><br>
                                        ${Object.entries(metrics.dimensions).map(([dim, data]) =>
                                            `• ${dim.replace(/_/g, ' ')}: ${data.score.toFixed(1)} × ${dimensionWeights[dim] || 1.0}`
                                        ).join('<br>')}<br><br>
                                        Normalized to a 0-100 scale
                                    </span>
                                    <div class="progress-bar">
                                        <div class="progress-fill" style="width: ${metrics.iq_score}%">
                                            ${metrics.iq_score.toFixed(1)}
                                        </div>
                                    </div>
                                </div>

                                <div class="tooltip">
                                    <strong>Adaptability:</strong>
                                    <span class="tooltiptext">
                                        <strong>Cross-Category Performance:</strong><br><br>
                                        Measures versatility across different task types.<br><br>
                                        Formula: (Categories with avg ≥ 2.5) / (Total categories) × 100<br><br>
                                        Higher score = more versatile model
                                    </span>
                                    <div class="progress-bar">
                                        <div class="progress-fill" style="width: ${metrics.adaptability}%">
                                            ${metrics.adaptability.toFixed(1)}%
                                        </div>
                                    </div>
                                </div>

                                <div class="tooltip">
                                    <strong>Problem-Solving Depth:</strong>
                                    <span class="tooltiptext">
                                        <strong>Performance on Challenging Tasks:</strong><br><br>
                                        Average score on "hard" and "very_hard" difficulty tests.<br><br>
                                        Formula: (Avg score on hard tests) × 20<br><br>
                                        Tests critical thinking and complex reasoning
                                    </span>
                                    <div class="progress-bar">
                                        <div class="progress-fill" style="width: ${metrics.problem_solving_depth}%">
                                            ${metrics.problem_solving_depth.toFixed(1)}
                                        </div>
                                    </div>
                                </div>
                            </div>

                            <h4 style="margin-top: 20px; color: #764ba2;">Cognitive Dimensions:</h4>
                            <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); gap: 10px; margin-top: 10px;">
                    `;

                    for (const [dim, data] of Object.entries(metrics.dimensions)) {
                        const weight = dimensionWeights[dim] || 1.0;
                        html += `
                            <div class="tooltip">
                                <small>${dim.replace(/_/g, ' ').toUpperCase()}</small>
                                <span class="tooltiptext">
                                    <strong>${dim.replace(/_/g, ' ').toUpperCase()}</strong><br><br>
                                    Score: <code>${data.score.toFixed(2)}/5.00</code><br>
                                    Weight in IQ: <code>${weight}</code><br>
                                    Tests evaluated: <code>${data.count}</code><br><br>
                                    Normalized: ${data.normalized.toFixed(1)}%
                                </span>
                                <div class="progress-bar" style="height: 20px;">
                                    <div class="progress-fill" style="width: ${data.normalized}%; font-size: 0.8em;">
                                        ${data.score.toFixed(1)}
                                    </div>
                                </div>
                            </div>
                        `;
                    }

                    html += `
                            </div>
                        </div>
                    `;
                }

                document.getElementById('intelligenceMetrics').innerHTML = html;

            } catch (error) {
                console.error('Error loading intelligence metrics:', error);
                document.getElementById('intelligenceMetrics').innerHTML =
                    '<p class="loading">Error loading intelligence metrics</p>';
            }
        }

        function populateModelSelector() {
            if (!comparisonData) return;

            const models = Object.keys(comparisonData.models);
            const select = document.getElementById('modelSelect');

            select.innerHTML = '<option value="">Select a model...</option>';
            models.forEach(model => {
                const option = document.createElement('option');
                option.value = model;
                option.textContent = model;
                select.appendChild(option);
            });

            // Populate the category filter
            const categoryFilter = document.getElementById('filterCategory');
            categoryFilter.innerHTML = '<option value="">All Categories</option>';
            comparisonData.categories.forEach(cat => {
                const option = document.createElement('option');
                option.value = cat;
                option.textContent = cat;
                categoryFilter.appendChild(option);
            });

            // Populate the category chart selector
            const categorySelect = document.getElementById('categorySelect');
            categorySelect.innerHTML = '';
            comparisonData.categories.forEach(cat => {
                const option = document.createElement('option');
                option.value = cat;
                option.textContent = cat;
                categorySelect.appendChild(option);
            });

            if (comparisonData.categories.length > 0) {
                updateCategoryChart();
            }
        }

        function updateCategoryChart() {
            if (!comparisonData) return;

            const category = document.getElementById('categorySelect').value;
            const models = Object.keys(comparisonData.models);

            const data = models.map(model => {
                const stats = comparisonData.models[model].category_stats[category];
                return stats ? stats.average : 0;
            });

            const ctx = document.getElementById('categoryChart');
            if (window.categoryChartInstance) {
                window.categoryChartInstance.destroy();
            }

            window.categoryChartInstance = new Chart(ctx, {
                type: 'bar',
                data: {
                    labels: models,
                    datasets: [{
                        label: `${category} - Average Score`,
                        data: data,
                        backgroundColor: 'rgba(102, 126, 234, 0.6)',
                        borderColor: 'rgba(102, 126, 234, 1)',
                        borderWidth: 2
                    }]
                },
                options: {
                    responsive: true,
                    maintainAspectRatio: false,
                    scales: getScaleOptions(data)
                }
            });
        }

        async function loadModelDetails() {
            const modelName = document.getElementById('modelSelect').value;
            if (!modelName || !comparisonData) return;

            currentModelDetails = comparisonData.models[modelName].test_results;
            displayDetailsTable(currentModelDetails);
        }

        function displayDetailsTable(results) {
            let html = `
                <table>
                    <thead>
                        <tr>
                            <th onclick="sortTable('test_name')">Test Name</th>
                            <th onclick="sortTable('category')">Category</th>
                            <th onclick="sortTable('difficulty')">Difficulty</th>
                            <th onclick="sortTable('score')">Score</th>
                            <th onclick="sortTable('generation_time')">Time (s)</th>
                            <th onclick="sortTable('tokens')">Tokens</th>
                            <th onclick="sortTable('status')">Status</th>
                            <th>Notes</th>
                        </tr>
                    </thead>
                    <tbody>
            `;

            results.forEach(test => {
                const scoreClass = test.score >= 4 ? 'exceptional' : test.score >= 2 ? 'pass' : 'fail';
                const scoreDisplay = test.score !== null && test.score !== undefined ? test.score.toFixed(1) : 'N/A';

                // Extract timing and token info
                const genTime = test.generation_time ? test.generation_time.toFixed(2) : 'N/A';
                let tokenInfo = 'N/A';
                let tokensPerSec = '';

                if (test.api_metrics && test.api_metrics.usage) {
                    const usage = test.api_metrics.usage;
                    const totalTokens = usage.total_tokens || usage.eval_count || 'N/A';
                    const completionTokens = usage.completion_tokens || usage.eval_count;

                    if (totalTokens !== 'N/A') {
                        tokenInfo = totalTokens.toString();

                        // Calculate tokens/sec if we have both values
                        if (test.generation_time && completionTokens) {
                            const tps = completionTokens / test.generation_time;
                            tokensPerSec = `<br><small>(${tps.toFixed(1)} t/s)</small>`;
                        }
                    }
                }

                html += `
                    <tr>
                        <td><strong>${test.test_name}</strong></td>
                        <td>${test.category}</td>
                        <td>${test.difficulty}</td>
                        <td><span class="score-badge score-${scoreClass}">${scoreDisplay}</span></td>
                        <td>${genTime}</td>
                        <td>${tokenInfo}${tokensPerSec}</td>
                        <td>${test.status}</td>
                        <td><small>${test.notes || ''}</small></td>
                    </tr>
                `;
            });

            html += '</tbody></table>';
            document.getElementById('detailsTable').innerHTML = html;
        }

        function filterTable() {
            if (!currentModelDetails) return;

            const searchTerm = document.getElementById('searchInput').value.toLowerCase();
            const categoryFilter = document.getElementById('filterCategory').value;
            const scoreFilter = document.getElementById('filterScore').value;

            const filtered = currentModelDetails.filter(test => {
                const matchesSearch = test.test_name.toLowerCase().includes(searchTerm) ||
                                      test.category.toLowerCase().includes(searchTerm);
                const matchesCategory = !categoryFilter || test.category === categoryFilter;

                let matchesScore = true;
                if (scoreFilter === 'exceptional') matchesScore = test.score >= 4;
                else if (scoreFilter === 'pass') matchesScore = test.score >= 2 && test.score < 4;
                else if (scoreFilter === 'fail') matchesScore = test.score < 2;

                return matchesSearch && matchesCategory && matchesScore;
            });

            displayDetailsTable(filtered);
        }

        function sortTable(column) {
            if (!currentModelDetails) return;

            currentModelDetails.sort((a, b) => {
                if (column === 'score') {
                    return (b[column] || 0) - (a[column] || 0);
                }
                return (a[column] || '').toString().localeCompare((b[column] || '').toString());
            });

            filterTable();
        }

        // Initialize on load
        initDashboard();
    </script>
</body>
</html>'''

        template_path = templates_dir / 'dashboard.html'
        with open(template_path, 'w', encoding='utf-8') as f:
            f.write(html_content)


def main():
    parser = argparse.ArgumentParser(
        description="Analyze and compare AI model evaluation results",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Compare all models
  python analyze_results.py --compare

  # Detailed report for a specific model
  python analyze_results.py --detail "qwen3:4b-q4_K_M"

  # Export to CSV
  python analyze_results.py --export comparison.csv

  # Custom results directory
  python analyze_results.py --results-dir ./my_results --compare
"""
    )

    parser.add_argument(
        '--results-dir',
        default='results',
        help='Directory containing result JSON files (default: results)'
    )

    parser.add_argument(
        '--compare',
        action='store_true',
        help='Generate a comparison report for all models'
    )

    parser.add_argument(
        '--detail',
        type=str,
        help='Generate a detailed report for a specific model'
    )

    parser.add_argument(
        '--export',
        type=str,
        help='Export results to a CSV file'
    )

    parser.add_argument(
        '--web',
        action='store_true',
        help='Launch the interactive web dashboard'
    )

    parser.add_argument(
        '--host',
        default='127.0.0.1',
        help='Web server host (default: 127.0.0.1)'
    )

    parser.add_argument(
        '--port',
        type=int,
        default=5000,
        help='Web server port (default: 5000)'
    )

    args = parser.parse_args()

    if args.web:
        web = WebInterface(results_dir=args.results_dir)
        web.run(host=args.host, port=args.port)
        return

    analyzer = ResultsAnalyzer(results_dir=args.results_dir)

    if args.compare:
        result_files = analyzer.find_result_files()
        if result_files:
            analyzer.compare_models(result_files)
        else:
            print(f"❌ No result files found in {args.results_dir}")

    if args.detail:
        analyzer.generate_detailed_report(args.detail)

    if args.export:
        analyzer.export_csv(args.export)

    if not (args.compare or args.detail or args.export):
        parser.print_help()


if __name__ == "__main__":
    main()