#!/usr/bin/env python3
"""
AI Model Evaluation Results Analyzer
Compares results across different models and quantizations
Includes interactive web interface for visualization and analysis
"""

import json
import sys
from pathlib import Path
from typing import List, Dict
import argparse
from collections import defaultdict
from flask import Flask, render_template, jsonify, request
import webbrowser
from threading import Timer
import numpy as np


class ResultsAnalyzer:
    def __init__(self, results_dir: str = "results"):
        self.results_dir = Path(results_dir)

    def load_result_file(self, filepath: Path) -> Dict:
        """Load a single result file"""
        with open(filepath, 'r', encoding='utf-8') as f:
            return json.load(f)

    def find_result_files(self, pattern: str = "*_latest.json") -> List[Path]:
        """Find all result files matching pattern"""
        return sorted(self.results_dir.glob(pattern))

    def extract_scores_by_category(self, results: Dict) -> Dict[str, List[float]]:
        """Extract scores organized by category"""
        scores_by_category = defaultdict(list)
        for test in results.get('test_results', []):
            category = test.get('category', 'Unknown')
            score = test.get('score') or test.get('overall_score')
            if score is not None:
                scores_by_category[category].append(score)
        return dict(scores_by_category)

    def calculate_statistics(self, scores: List[float]) -> Dict:
        """Calculate statistics for a list of scores"""
        if not scores:
            return {
                'count': 0,
                'average': 0.0,
                'min': 0.0,
                'max': 0.0,
                'pass_rate': 0.0,
                'exceptional_rate': 0.0
            }
        return {
            'count': len(scores),
            'average': sum(scores) / len(scores),
            'min': min(scores),
            'max': max(scores),
            'pass_rate': len([s for s in scores if s >= 2]) / len(scores) * 100,
            'exceptional_rate': len([s for s in scores if s >= 4]) / len(scores) * 100
        }
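    # Expected result-file layout (illustrative sketch, inferred from the fields read
    # by the methods in this class; real files may contain additional keys):
    #
    #   {
    #     "metadata": {"model_name": "...", "total_tests": 0, "completed_tests": 0,
    #                  "test_start": "...", "test_end": "..."},
    #     "test_results": [
    #       {"test_id": "...", "test_name": "...", "category": "...", "type": "...",
    #        "difficulty": "medium|hard|very_hard", "score": 0.0, "notes": "..."}
    #     ]
    #   }
    #
    # Scores are treated as a 0-5 scale: >= 2 counts toward pass_rate and >= 4
    # toward exceptional_rate (see calculate_statistics above).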
    def compare_models(self, result_files: List[Path]):
        """Generate comparison report for multiple models"""
        print("\n" + "="*100)
        print("šŸ“Š AI MODEL COMPARISON REPORT")
        print("="*100)

        # Load all results
        all_results = {}
        for filepath in result_files:
            try:
                results = self.load_result_file(filepath)
                model_name = results['metadata']['model_name']
                all_results[model_name] = results
            except Exception as e:
                print(f"āš ļø Error loading {filepath}: {e}")

        if not all_results:
            print("āŒ No valid result files found")
            return

        # Overall comparison
        print("\nšŸ“ˆ OVERALL PERFORMANCE")
        print("-"*100)
        print(f"{'Model':<30} {'Total Tests':<12} {'Avg Score':<12} {'Pass Rate':<12} {'Exceptional':<12}")
        print("-"*100)

        model_stats = {}
        for model_name, results in sorted(all_results.items()):
            all_scores = [
                test.get('score') or test.get('overall_score')
                for test in results.get('test_results', [])
            ]
            all_scores = [s for s in all_scores if s is not None]
            stats = self.calculate_statistics(all_scores)
            model_stats[model_name] = stats
            print(f"{model_name:<30} {stats['count']:<12} {stats['average']:<12.2f} "
                  f"{stats['pass_rate']:<12.1f}% {stats['exceptional_rate']:<12.1f}%")

        # Category-wise comparison
        print("\n\nšŸ“‚ CATEGORY-WISE PERFORMANCE")
        print("="*100)

        # Get all unique categories
        all_categories = set()
        for results in all_results.values():
            for test in results.get('test_results', []):
                all_categories.add(test.get('category', 'Unknown'))

        for category in sorted(all_categories):
            print(f"\nšŸ”– {category}")
            print("-"*100)
            print(f"{'Model':<30} {'Tests':<8} {'Avg Score':<12} {'Pass Rate':<12} {'Exceptional':<12}")
            print("-"*100)

            for model_name, results in sorted(all_results.items()):
                cat_scores = [
                    test.get('score') or test.get('overall_score')
                    for test in results.get('test_results', [])
                    if test.get('category') == category
                    and (test.get('score') or test.get('overall_score')) is not None
                ]
                if cat_scores:
                    stats = self.calculate_statistics(cat_scores)
                    print(f"{model_name:<30} {stats['count']:<8} {stats['average']:<12.2f} "
                          f"{stats['pass_rate']:<12.1f}% {stats['exceptional_rate']:<12.1f}%")
                else:
                    print(f"{model_name:<30} {'N/A':<8} {'N/A':<12} {'N/A':<12} {'N/A':<12}")

        # Difficulty-based comparison
        print("\n\n⚔ DIFFICULTY-BASED PERFORMANCE")
        print("="*100)

        difficulties = ['medium', 'hard', 'very_hard']
        for difficulty in difficulties:
            print(f"\nšŸŽÆ Difficulty: {difficulty.replace('_', ' ').title()}")
            print("-"*100)
            print(f"{'Model':<30} {'Tests':<8} {'Avg Score':<12} {'Pass Rate':<12}")
            print("-"*100)

            for model_name, results in sorted(all_results.items()):
                diff_scores = [
                    test.get('score') or test.get('overall_score')
                    for test in results.get('test_results', [])
                    if test.get('difficulty') == difficulty
                    and (test.get('score') or test.get('overall_score')) is not None
                ]
                if diff_scores:
                    stats = self.calculate_statistics(diff_scores)
                    print(f"{model_name:<30} {stats['count']:<8} {stats['average']:<12.2f} "
                          f"{stats['pass_rate']:<12.1f}%")
                else:
                    print(f"{model_name:<30} {'N/A':<8} {'N/A':<12} {'N/A':<12}")

        # Winner analysis
        print("\n\nšŸ† WINNERS BY CATEGORY")
        print("="*100)

        for category in sorted(all_categories):
            best_model = None
            best_score = -1
            for model_name, results in all_results.items():
                cat_scores = [
                    test.get('score') or test.get('overall_score')
                    for test in results.get('test_results', [])
                    if test.get('category') == category
                    and (test.get('score') or test.get('overall_score')) is not None
                ]
                if cat_scores:
                    avg = sum(cat_scores) / len(cat_scores)
                    if avg > best_score:
                        best_score = avg
                        best_model = model_name
            if best_model:
                print(f"{category:<50} → {best_model} ({best_score:.2f})")

        print("\n\nšŸŽ–ļø OVERALL WINNER")
        print("="*100)
        best_overall = max(model_stats.items(), key=lambda x: x[1]['average'])
        print(f"Model: {best_overall[0]}")
        print(f"Average Score: {best_overall[1]['average']:.2f}/5.00")
        print(f"Pass Rate: {best_overall[1]['pass_rate']:.1f}%")
        print(f"Exceptional Rate: {best_overall[1]['exceptional_rate']:.1f}%")
        print("="*100)
    def generate_detailed_report(self, model_name: str):
        """Generate detailed report for a specific model"""
        # Find result file for this model
        pattern = f"{model_name.replace(':', '_')}_latest.json"
        filepath = self.results_dir / pattern

        if not filepath.exists():
            print(f"āŒ No results found for model: {model_name}")
            return

        results = self.load_result_file(filepath)

        print("\n" + "="*100)
        print(f"šŸ“‹ DETAILED REPORT: {model_name}")
        print("="*100)

        # Metadata
        metadata = results.get('metadata', {})
        print(f"\nā±ļø Test Duration: {metadata.get('test_start')} → {metadata.get('test_end')}")
        print(f"šŸ“Š Tests: {metadata.get('completed_tests')}/{metadata.get('total_tests')}")

        # Overall stats
        all_scores = [
            test.get('score') or test.get('overall_score')
            for test in results.get('test_results', [])
        ]
        all_scores = [s for s in all_scores if s is not None]
        stats = self.calculate_statistics(all_scores)

        print(f"\nšŸ“ˆ Overall Performance:")
        print(f"  Average Score: {stats['average']:.2f}/5.00")
        print(f"  Pass Rate: {stats['pass_rate']:.1f}%")
        print(f"  Exceptional Rate: {stats['exceptional_rate']:.1f}%")
        print(f"  Score Range: {stats['min']:.1f} - {stats['max']:.1f}")

        # Test-by-test results
        print(f"\n\nšŸ“ TEST-BY-TEST RESULTS")
        print("="*100)

        for test in results.get('test_results', []):
            score = test.get('score') or test.get('overall_score')
            status_icon = "āœ…" if score and score >= 4 else "āš ļø" if score and score >= 2 else "āŒ"

            print(f"\n{status_icon} [{test.get('test_id')}] {test.get('test_name')}")
            print(f"  Category: {test.get('category')}")
            print(f"  Type: {test.get('type')}")
            print(f"  Difficulty: {test.get('difficulty', 'unknown')}")
            print(f"  Score: {score if score is not None else 'N/A'}/5.00")

            if test.get('notes'):
                print(f"  Notes: {test['notes']}")

            # Show criteria pass/fail if available
            if test.get('evaluation_criteria'):
                print(f"  Criteria ({len(test['evaluation_criteria'])} items):")
                for criterion in test['evaluation_criteria']:
                    print(f"    • {criterion}")

        print("\n" + "="*100)

    def export_csv(self, output_file: str = "comparison.csv"):
        """Export comparison data to CSV"""
        import csv

        result_files = self.find_result_files()
        if not result_files:
            print("āŒ No result files found")
            return

        # Prepare CSV data
        csv_data = []
        headers = ['Model', 'Test ID', 'Test Name', 'Category', 'Type', 'Difficulty', 'Score', 'Notes']

        for filepath in result_files:
            results = self.load_result_file(filepath)
            model_name = results['metadata']['model_name']

            for test in results.get('test_results', []):
                csv_data.append([
                    model_name,
                    test.get('test_id', ''),
                    test.get('test_name', ''),
                    test.get('category', ''),
                    test.get('type', ''),
                    test.get('difficulty', ''),
                    test.get('score') or test.get('overall_score', ''),
                    test.get('notes', '')
                ])

        # Write CSV
        output_path = self.results_dir / output_file
        with open(output_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(headers)
            writer.writerows(csv_data)

        print(f"āœ… CSV exported to: {output_path}")


class WebInterface:
    """Interactive web interface for results analysis"""

    def __init__(self, results_dir: str = "results"):
        self.results_dir = Path(results_dir)
        self.analyzer = ResultsAnalyzer(results_dir)
        self.app = Flask(__name__)
        self.setup_routes()

    def setup_routes(self):
        """Setup Flask routes"""

        @self.app.route('/')
        def index():
            """Main dashboard"""
            return render_template('dashboard.html')

        @self.app.route('/api/models')
        def get_models():
            """Get list of all available models"""
            result_files = self.analyzer.find_result_files()
            models = []

            for filepath in result_files:
                try:
                    results = self.analyzer.load_result_file(filepath)
                    metadata = results.get('metadata', {})
                    models.append({
                        'name': metadata.get('model_name', 'Unknown'),
                        'file': filepath.name,
                        'total_tests': metadata.get('total_tests', 0),
                        'completed_tests': metadata.get('completed_tests', 0),
                        'test_start': metadata.get('test_start'),
                        'test_end': metadata.get('test_end')
                    })
                except Exception as e:
                    print(f"Error loading {filepath}: {e}")

            return jsonify(models)

        @self.app.route('/api/results/<model_name>')
        def get_model_results(model_name):
            """Get detailed results for a specific model"""
            pattern = f"{model_name.replace(':', '_')}_latest.json"
            filepath = self.results_dir / pattern

            if not filepath.exists():
                return jsonify({'error': 'Model not found'}), 404

            results = self.analyzer.load_result_file(filepath)
            return jsonify(results)
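        # Example requests (illustrative, assuming the default host/port used by run()):
        #   curl http://127.0.0.1:5000/api/models
        #   curl http://127.0.0.1:5000/api/results/qwen3_4b-q4_K_M
        # The results path uses the model name with ':' replaced by '_', matching the
        # *_latest.json file naming handled above.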
        @self.app.route('/api/comparison')
        def get_comparison():
            """Get comparison data for all models"""
            result_files = self.analyzer.find_result_files()

            comparison_data = {
                'models': {},
                'categories': set(),
                'difficulty_levels': set()
            }

            for filepath in result_files:
                try:
                    results = self.analyzer.load_result_file(filepath)
                    model_name = results['metadata']['model_name']

                    # Extract all scores and metadata
                    test_results = []
                    for test in results.get('test_results', []):
                        score = test.get('score') or test.get('overall_score')
                        test_data = {
                            'test_id': test.get('test_id'),
                            'test_name': test.get('test_name'),
                            'category': test.get('category', 'Unknown'),
                            'type': test.get('type'),
                            'difficulty': test.get('difficulty', 'medium'),
                            'score': score,
                            'status': test.get('status'),
                            'notes': test.get('notes', '')
                        }
                        test_results.append(test_data)

                        if test_data['category']:
                            comparison_data['categories'].add(test_data['category'])
                        if test_data['difficulty']:
                            comparison_data['difficulty_levels'].add(test_data['difficulty'])

                    # Calculate overall statistics
                    all_scores = [t['score'] for t in test_results if t['score'] is not None]
                    stats = self.analyzer.calculate_statistics(all_scores) if all_scores else {}

                    # Calculate category statistics
                    category_stats = {}
                    for category in comparison_data['categories']:
                        cat_scores = [t['score'] for t in test_results
                                      if t['category'] == category and t['score'] is not None]
                        if cat_scores:
                            category_stats[category] = self.analyzer.calculate_statistics(cat_scores)

                    # Calculate difficulty statistics
                    difficulty_stats = {}
                    for difficulty in comparison_data['difficulty_levels']:
                        diff_scores = [t['score'] for t in test_results
                                       if t['difficulty'] == difficulty and t['score'] is not None]
                        if diff_scores:
                            difficulty_stats[difficulty] = self.analyzer.calculate_statistics(diff_scores)

                    comparison_data['models'][model_name] = {
                        'test_results': test_results,
                        'overall_stats': stats,
                        'category_stats': category_stats,
                        'difficulty_stats': difficulty_stats,
                        'metadata': results.get('metadata', {})
                    }
                except Exception as e:
                    print(f"Error processing {filepath}: {e}")

            # Convert sets to lists for JSON serialization
            comparison_data['categories'] = sorted(list(comparison_data['categories']))
            comparison_data['difficulty_levels'] = sorted(list(comparison_data['difficulty_levels']))

            return jsonify(comparison_data)

        @self.app.route('/api/statistics')
        def get_statistics():
            """Get advanced statistical analysis"""
            result_files = self.analyzer.find_result_files()

            all_data = []
            model_names = []

            for filepath in result_files:
                try:
                    results = self.analyzer.load_result_file(filepath)
                    model_name = results['metadata']['model_name']
                    model_names.append(model_name)

                    scores = [test.get('score') or test.get('overall_score')
                              for test in results.get('test_results', [])]
                    scores = [s for s in scores if s is not None]
                    all_data.append(scores)
                except Exception as e:
                    print(f"Error in statistics: {e}")

            # Calculate advanced statistics
            statistics = {
                'models': model_names,
                'variance': [],
                'std_dev': [],
                'consistency_score': [],
                'robustness_score': []
            }

            for i, scores in enumerate(all_data):
                if scores:
                    variance = np.var(scores)
                    std_dev = np.std(scores)

                    # Consistency: lower variance is better (inverse normalized)
                    consistency = max(0, 100 - (std_dev * 20))

                    # Robustness: combination of average and consistency
                    avg_score = np.mean(scores)
                    robustness = (avg_score * 20) * (consistency / 100)

                    statistics['variance'].append(float(variance))
                    statistics['std_dev'].append(float(std_dev))
                    statistics['consistency_score'].append(float(consistency))
                    statistics['robustness_score'].append(float(robustness))
                else:
                    statistics['variance'].append(0)
                    statistics['std_dev'].append(0)
                    statistics['consistency_score'].append(0)
                    statistics['robustness_score'].append(0)

            return jsonify(statistics)
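        # Worked example for the formulas above (illustrative numbers): a model with
        # mean score 3.0 and standard deviation 1.0 gets consistency 100 - 1.0*20 = 80
        # and robustness (3.0 * 20) * (80 / 100) = 48.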
        @self.app.route('/api/intelligence_metrics')
        def get_intelligence_metrics():
            """Calculate intelligence evaluation metrics"""
            result_files = self.analyzer.find_result_files()
            metrics = {}

            for filepath in result_files:
                try:
                    results = self.analyzer.load_result_file(filepath)
                    model_name = results['metadata']['model_name']
                    test_results = results.get('test_results', [])

                    # Define intelligence dimensions
                    dimensions = {
                        'logical_reasoning': ['Logic & Reasoning'],
                        'mathematical_ability': ['Mathematics & Calculation'],
                        'instruction_following': ['Instruction Following'],
                        'creativity': ['Creative Writing'],
                        'technical_knowledge': ['Code Generation', 'IT Forensics'],
                        'linguistic_nuance': ['Language Nuance'],
                        'conversational_depth': ['Multi-turn Conversations']
                    }

                    model_metrics = {}
                    for dimension, categories in dimensions.items():
                        dim_scores = [
                            test.get('score') or test.get('overall_score')
                            for test in test_results
                            if test.get('category') in categories
                            and (test.get('score') or test.get('overall_score')) is not None
                        ]
                        if dim_scores:
                            avg = sum(dim_scores) / len(dim_scores)
                            model_metrics[dimension] = {
                                'score': avg,
                                'normalized': (avg / 5.0) * 100,
                                'count': len(dim_scores)
                            }
                        else:
                            model_metrics[dimension] = {
                                'score': 0,
                                'normalized': 0,
                                'count': 0
                            }

                    # Calculate overall intelligence quotient (IQ score)
                    weighted_scores = []
                    weights = {
                        'logical_reasoning': 1.5,  # Higher weight for core reasoning
                        'mathematical_ability': 1.3,
                        'instruction_following': 1.2,
                        'creativity': 1.0,
                        'technical_knowledge': 1.4,
                        'linguistic_nuance': 1.1,
                        'conversational_depth': 1.0
                    }

                    total_weight = 0
                    for dim, data in model_metrics.items():
                        if data['count'] > 0:
                            weighted_scores.append(data['score'] * weights[dim])
                            total_weight += weights[dim]

                    iq_score = (sum(weighted_scores) / total_weight * 20) if total_weight > 0 else 0

                    # Adaptability: performance across diverse categories
                    category_scores = {}
                    for test in test_results:
                        cat = test.get('category')
                        score = test.get('score') or test.get('overall_score')
                        if cat and score is not None:
                            if cat not in category_scores:
                                category_scores[cat] = []
                            category_scores[cat].append(score)

                    adaptability = len([cat for cat, scores in category_scores.items()
                                        if sum(scores) / len(scores) >= 2.5]) / max(len(category_scores), 1) * 100

                    # Problem-solving depth: performance on hard problems
                    hard_scores = [
                        test.get('score') or test.get('overall_score')
                        for test in test_results
                        if test.get('difficulty') in ['hard', 'very_hard']
                        and (test.get('score') or test.get('overall_score')) is not None
                    ]
                    problem_solving_depth = (sum(hard_scores) / len(hard_scores) * 20) if hard_scores else 0

                    metrics[model_name] = {
                        'dimensions': model_metrics,
                        'iq_score': iq_score,
                        'adaptability': adaptability,
                        'problem_solving_depth': problem_solving_depth,
                        'overall_intelligence': (iq_score * 0.5 + adaptability * 0.3 + problem_solving_depth * 0.2)
                    }
                except Exception as e:
                    print(f"Error calculating intelligence metrics: {e}")

            return jsonify(metrics)

    def run(self, host='127.0.0.1', port=5000, debug=False):
        """Start the web server"""
        # Create templates directory if it doesn't exist
        templates_dir = Path(__file__).parent / 'templates'
        templates_dir.mkdir(exist_ok=True)

        # Generate HTML template
        self.create_dashboard_template(templates_dir)

        # Open browser automatically
        def open_browser():
            webbrowser.open(f'http://{host}:{port}')

        if not debug:
            Timer(1.5, open_browser).start()

        print(f"\n🌐 Starting web interface at http://{host}:{port}")
        print(f"šŸ“Š Dashboard will open automatically in your browser")
        print(f"šŸ›‘ Press Ctrl+C to stop the server\n")

        self.app.run(host=host, port=port, debug=debug)

    def create_dashboard_template(self, templates_dir: Path):
        """Create the HTML dashboard template"""
        html_content = '''<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <title>LLM Evaluation Dashboard</title>
</head>
<body>

    <h1>🧠 LLM Evaluation Dashboard</h1>
    <p>Comprehensive Intelligence &amp; Performance Analysis</p>

    <section>
        <h2>System Overview</h2>
        <div id="overview">Loading data...</div>
    </section>

    <section>
        <h2>Model Performance Comparison</h2>
        <table id="comparison-table"></table>
    </section>

    <section>
        <h2>Intelligence Metrics Analysis</h2>
        <p>Advanced metrics evaluating different dimensions of AI intelligence and reasoning capabilities.</p>
        <div id="intelligence-metrics">Calculating intelligence metrics...</div>
    </section>

    <section>
        <h2>Performance by Category</h2>
        <div id="category-performance"></div>
    </section>

    <section>
        <h2>Detailed Test Results</h2>
        <div id="detailed-results">Select a model to view detailed results</div>
    </section>
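    <!--
      Minimal loader sketch: not a full charting UI, just enough to pull the JSON
      endpoints exposed by WebInterface (/api/models, /api/comparison) and render
      plain text and tables into the placeholders above. Element ids refer to the
      minimal markup in this template.
    -->
    <script>
    async function loadOverview() {
        const res = await fetch('/api/models');
        const models = await res.json();
        const target = document.getElementById('overview');
        if (!models.length) {
            target.textContent = 'No result files found.';
            return;
        }
        target.innerHTML = models.map(function (m) {
            return '<p><strong>' + m.name + '</strong>: ' +
                   m.completed_tests + '/' + m.total_tests + ' tests completed</p>';
        }).join('');
    }

    async function loadComparison() {
        const res = await fetch('/api/comparison');
        const data = await res.json();
        let rows = '<tr><th>Model</th><th>Avg Score</th><th>Pass Rate</th><th>Exceptional</th></tr>';
        Object.keys(data.models).sort().forEach(function (name) {
            const s = data.models[name].overall_stats || {};
            rows += '<tr><td>' + name + '</td>' +
                    '<td>' + (s.average || 0).toFixed(2) + '</td>' +
                    '<td>' + (s.pass_rate || 0).toFixed(1) + '%</td>' +
                    '<td>' + (s.exceptional_rate || 0).toFixed(1) + '%</td></tr>';
        });
        document.getElementById('comparison-table').innerHTML = rows;
    }

    loadOverview();
    loadComparison();
    </script>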

</body>
</html>
'''
        template_path = templates_dir / 'dashboard.html'
        with open(template_path, 'w', encoding='utf-8') as f:
            f.write(html_content)


def main():
    parser = argparse.ArgumentParser(
        description="Analyze and compare AI model evaluation results",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Compare all models
  python analyze_results.py --compare

  # Detailed report for specific model
  python analyze_results.py --detail "qwen3:4b-q4_K_M"

  # Export to CSV
  python analyze_results.py --export comparison.csv

  # Custom results directory
  python analyze_results.py --results-dir ./my_results --compare
        """
    )

    parser.add_argument(
        '--results-dir',
        default='results',
        help='Directory containing result JSON files (default: results)'
    )
    parser.add_argument(
        '--compare',
        action='store_true',
        help='Generate comparison report for all models'
    )
    parser.add_argument(
        '--detail',
        type=str,
        help='Generate detailed report for specific model'
    )
    parser.add_argument(
        '--export',
        type=str,
        help='Export results to CSV file'
    )
    parser.add_argument(
        '--web',
        action='store_true',
        help='Launch interactive web dashboard'
    )
    parser.add_argument(
        '--host',
        default='127.0.0.1',
        help='Web server host (default: 127.0.0.1)'
    )
    parser.add_argument(
        '--port',
        type=int,
        default=5000,
        help='Web server port (default: 5000)'
    )

    args = parser.parse_args()

    if args.web:
        web = WebInterface(results_dir=args.results_dir)
        web.run(host=args.host, port=args.port)
        return

    analyzer = ResultsAnalyzer(results_dir=args.results_dir)

    if args.compare:
        result_files = analyzer.find_result_files()
        if result_files:
            analyzer.compare_models(result_files)
        else:
            print(f"āŒ No result files found in {args.results_dir}")

    if args.detail:
        analyzer.generate_detailed_report(args.detail)

    if args.export:
        analyzer.export_csv(args.export)

    if not (args.compare or args.detail or args.export):
        parser.print_help()


if __name__ == "__main__":
    main()
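# Programmatic use (illustrative; assumes result files already exist in ./results):
#
#   from analyze_results import ResultsAnalyzer
#   analyzer = ResultsAnalyzer("results")
#   analyzer.compare_models(analyzer.find_result_files())
#   analyzer.export_csv("comparison.csv")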