diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..ab3fa72 --- /dev/null +++ b/.env.example @@ -0,0 +1,52 @@ +# AI Model Evaluation Configuration +# Copy this file to .env and fill in your values + +# ============================================================================= +# MODEL UNDER TEST (MUT) - The model being evaluated +# ============================================================================= +# OpenAI-compatible API endpoint for the model under test +MUT_ENDPOINT=http://localhost:11434 + +# API key for the model under test (optional for local endpoints like Ollama) +MUT_API_KEY= + +# Model name/identifier to test +# Supports multiple models separated by commas for batch testing: +# MUT_MODEL=qwen3:4b-q4_K_M,qwen3:4b-q8_0,qwen3:4b-fp16,qwen3:8b-q4_K_M +# Or specify a single model: +MUT_MODEL=qwen3:4b-q4_K_M + +# ============================================================================= +# EVALUATOR API - Used for non-interactive mode to automatically score responses +# ============================================================================= +# OpenAI-compatible API endpoint for the evaluator model +EVALUATOR_ENDPOINT=http://localhost:11434 + +# API key for the evaluator API +EVALUATOR_API_KEY= + +# Evaluator model name (should be a capable model for evaluation tasks) +EVALUATOR_MODEL=qwen3:14b + +# Temperature for evaluator (lower = more consistent scoring) +EVALUATOR_TEMPERATURE=0.3 + +# ============================================================================= +# TEST CONFIGURATION +# ============================================================================= +# Path to test suite YAML file +TEST_SUITE=test_suite.yaml + +# Output directory for results +OUTPUT_DIR=results + +# Filter tests by category (optional, leave empty for all categories) +FILTER_CATEGORY= + +# ============================================================================= +# EXECUTION MODE +# ============================================================================= +# Run in non-interactive mode (true/false) +# When true, uses EVALUATOR_* settings for automated scoring +# When false, prompts user for manual evaluation +NON_INTERACTIVE=false diff --git a/.gitignore b/.gitignore index 36b13f1..3fc2490 100644 --- a/.gitignore +++ b/.gitignore @@ -174,3 +174,4 @@ cython_debug/ # PyPI configuration file .pypirc +results/ \ No newline at end of file diff --git a/README.md b/README.md index f9cf4aa..9482730 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,11 @@ Comprehensive testing suite for evaluating AI models on general reasoning tasks - Category-wise performance breakdown - Difficulty-based analysis - CSV export for further analysis + - **🌐 Interactive Web Dashboard** (New!) 
+ - Visual analytics with charts and graphs + - Advanced intelligence metrics + - Filtering, sorting, and statistical analysis + - Multi-dimensional performance evaluation ## Quick Start @@ -41,25 +46,82 @@ Comprehensive testing suite for evaluating AI models on general reasoning tasks ```bash # Python 3.8+ -pip install pyyaml requests +pip install -r requirements.txt +# or manually: +pip install pyyaml requests python-dotenv ``` ### Installation ```bash # Clone or download the files -# Ensure these files are in your working directory: -# - ai_eval.py -# - analyze_results.py -# - test_suite.yaml +# Copy the example environment file +cp .env.example .env + +# Edit .env with your settings +# - Configure the model under test (MUT_*) +# - Configure the evaluator model for non-interactive mode (EVALUATOR_*) +# - Set NON_INTERACTIVE=true for automated evaluation +nano .env +``` + +### Configuration with .env File (Recommended) + +The test suite can be configured using a `.env` file for easier batch testing and non-interactive mode: + +```bash +# Model Under Test (MUT) - The model being evaluated +MUT_ENDPOINT=http://localhost:11434 +MUT_API_KEY= # Optional for local endpoints +MUT_MODEL=qwen3:4b-q4_K_M + +# Evaluator API - For non-interactive automated scoring +EVALUATOR_ENDPOINT=http://localhost:11434 +EVALUATOR_API_KEY= # Optional +EVALUATOR_MODEL=qwen3:14b # Use a capable model for evaluation +EVALUATOR_TEMPERATURE=0.3 # Lower = more consistent scoring + +# Execution Mode +NON_INTERACTIVE=false # Set to true for automated evaluation +TEST_SUITE=test_suite.yaml +OUTPUT_DIR=results +FILTER_CATEGORY= # Optional: filter by category ``` ### Basic Usage -#### 1. Test a Single Model +#### 0. Test Connectivity (Dry Run) + +Before running the full test suite, verify that your API endpoints are reachable and properly configured: ```bash -# For Ollama (default: http://localhost:11434) +# Test MUT endpoint connectivity +python ai_eval.py --dry-run + +# Test with specific configuration +python ai_eval.py --endpoint http://localhost:11434 --model qwen3:4b --dry-run + +# Test non-interactive mode (tests both MUT and evaluator endpoints) +python ai_eval.py --non-interactive --dry-run + +# Test multiple models +python ai_eval.py --model qwen3:4b,qwen3:8b,qwen3:14b --dry-run +``` + +The dry-run mode will: +- Test connectivity to the model under test endpoint(s) +- Verify authentication (API keys) +- Confirm model availability +- Test evaluator endpoint if in non-interactive mode +- Exit with success/failure status + +#### 1. Interactive Mode (Manual Evaluation) + +```bash +# Using .env file +python ai_eval.py + +# Or with command-line arguments python ai_eval.py --endpoint http://localhost:11434 --model qwen3:4b-q4_K_M # For other endpoints with API key @@ -69,33 +131,94 @@ python ai_eval.py \ --model your-model-name ``` -#### 2. Test Multiple Models (Quantization Comparison) +#### 2. Non-Interactive Mode (Automated Evaluation) + +Non-interactive mode uses a separate evaluator model to automatically score responses. This is ideal for batch testing and comparing multiple models without manual intervention. 
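+
+Under the hood, each automated evaluation is a single OpenAI-compatible chat-completion call: the script packs the original prompt, the model's answer, and the evaluation criteria into one request to the evaluator and expects a small JSON verdict back. A rough sketch of that exchange is shown below; the endpoint, model name, example prompt, and sample verdict are illustrative placeholders, and the real system prompt used by `ai_eval.py` is more detailed:
+
+```python
+import requests
+
+# Illustrative values -- substitute your EVALUATOR_* settings from .env
+EVALUATOR_ENDPOINT = "http://localhost:11434"
+EVALUATOR_MODEL = "qwen3:14b"
+
+prompt = "Explain the difference between a hard link and a symbolic link."
+answer = "A hard link references the same inode; a symbolic link stores a path..."
+criteria = ["Mentions inodes", "Explains what happens when the target is deleted"]
+
+eval_request = {
+    "model": EVALUATOR_MODEL,
+    "temperature": 0.3,  # low temperature keeps scoring consistent
+    "max_tokens": 500,
+    "messages": [
+        {"role": "system",
+         "content": 'You are an expert AI model evaluator. Reply with ONLY JSON: {"score": <0-5>, "notes": "<brief explanation>"}'},
+        {"role": "user",
+         "content": f"Prompt:\n{prompt}\n\nResponse:\n{answer}\n\nCriteria:\n" + "\n".join(criteria)},
+    ],
+}
+
+# Add an "Authorization: Bearer <key>" header if your evaluator endpoint requires an API key
+resp = requests.post(f"{EVALUATOR_ENDPOINT}/v1/chat/completions", json=eval_request, timeout=90)
+print(resp.json()["choices"][0]["message"]["content"])
+# e.g. {"score": 4, "notes": "Covers inodes but misses the deleted-target case"}
+```
+
+If the evaluator's reply cannot be parsed as JSON, the test is queued for manual review at the end of the run instead of being scored automatically.
+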
 ```bash
-# Test different quantizations of qwen3:4b
-python ai_eval.py --endpoint http://localhost:11434 --model qwen3:4b-q4_K_M
-python ai_eval.py --endpoint http://localhost:11434 --model qwen3:4b-q8_0
-python ai_eval.py --endpoint http://localhost:11434 --model qwen3:4b-fp16
+# Configure .env file
+NON_INTERACTIVE=true
+EVALUATOR_ENDPOINT=http://localhost:11434
+EVALUATOR_MODEL=qwen3:14b
 
-# Test different model sizes
-python ai_eval.py --endpoint http://localhost:11434 --model qwen3:8b-q4_K_M
-python ai_eval.py --endpoint http://localhost:11434 --model qwen3:14b-q4_K_M
+# Run the test
+python ai_eval.py
+
+# Or with command-line arguments
+python ai_eval.py \
+  --endpoint http://localhost:11434 \
+  --model qwen3:4b-q4_K_M \
+  --non-interactive \
+  --evaluator-endpoint http://localhost:11434 \
+  --evaluator-model qwen3:14b
 ```
 
-#### 3. Filter by Category
+**How Non-Interactive Mode Works:**
+- For each test, the script sends the original prompt, model response, and evaluation criteria to the evaluator API
+- The evaluator model analyzes the response and returns a score (0-5) with notes
+- This enables automated, consistent scoring across multiple model runs
+- The evaluator uses a specialized system prompt designed for objective evaluation
+
+**Choosing an Evaluator Model:**
+- Use a capable model (e.g., qwen3:14b, gpt-4, claude-3) for reliable evaluation
+- The evaluator model should be more capable than the model under test
+- Lower temperature (0.3) provides more consistent scoring
+
+#### 3. Test Multiple Models (Batch Mode)
+
+Test multiple models in one run by specifying comma-separated model names:
+
+```bash
+# In .env file
+MUT_MODEL=qwen3:4b-q4_K_M,qwen3:4b-q8_0,qwen3:4b-fp16,qwen3:8b-q4_K_M
+
+# Run batch test
+python ai_eval.py
+
+# Or via command line
+python ai_eval.py --model qwen3:4b-q4_K_M,qwen3:4b-q8_0,qwen3:4b-fp16
+```
+
+The script will automatically test each model sequentially and save individual results.
+
+#### 4. Filter by Category
 
 ```bash
 # Test only IT Forensics categories
-python ai_eval.py \
-  --endpoint http://localhost:11434 \
-  --model qwen3:4b \
-  --category "IT Forensics - File Systems"
+python ai_eval.py --category "IT Forensics - File Systems"
 ```
 
-#### 4. Analyze Results
+#### 5. Analyze Results
 
 ```bash
 # Compare all tested models
+python analyze_results.py --compare
+```
+
+## Analyzing Results
+
+### Interactive Web Dashboard (Recommended)
+
+Launch the comprehensive web interface for visual analysis:
+
+```bash
+# Start web dashboard (opens automatically in browser)
+python analyze_results.py --web
+
+# Custom host/port
+python analyze_results.py --web --host 0.0.0.0 --port 8080
+```
+
+**Features:**
+- 📊 Visual comparison charts and graphs
+- 🎯 Advanced intelligence metrics (IQ, Adaptability, Problem-Solving Depth)
+- 🔍 Interactive filtering and sorting
+- 📈 Statistical analysis (consistency, robustness)
+- 📂 Category and difficulty breakdowns
+- 💡 Multi-dimensional cognitive evaluation
+
+See [WEB_INTERFACE.md](WEB_INTERFACE.md) for detailed documentation.
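+
+The dashboard serves its data as JSON endpoints (`/api/models`, `/api/comparison`, `/api/statistics`, `/api/intelligence_metrics`), so the same numbers can be pulled into your own scripts or notebooks. A minimal sketch, assuming the dashboard is running on the default host and port (the exact keys inside each payload follow the analyzer's output and may differ):
+
+```python
+import requests
+
+BASE = "http://127.0.0.1:5000"  # default host/port of `analyze_results.py --web`
+
+# List the models that have result files
+models = requests.get(f"{BASE}/api/models", timeout=10).json()
+print([m["name"] for m in models])
+
+# Pull the cross-model comparison and show each model's overall statistics
+comparison = requests.get(f"{BASE}/api/comparison", timeout=10).json()
+for name, data in comparison["models"].items():
+    print(name, data.get("overall_stats", {}))
+```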
+ +### Command-Line Analysis + +```bash +# Compare all models python analyze_results.py --compare # Detailed report for specific model @@ -188,6 +311,7 @@ All tests are evaluated on a 0-5 scale: ├── ai_eval.py # Main testing script ├── analyze_results.py # Results analysis and comparison ├── test_suite.yaml # Test definitions +├── .env.example # Configuration template ├── results/ # Auto-created results directory │ ├── qwen3_4b-q4_K_M_latest.json │ ├── qwen3_4b-q8_0_latest.json @@ -195,6 +319,60 @@ All tests are evaluated on a 0-5 scale: └── README.md ``` +## Configuration Reference + +### Environment Variables (.env file) + +All configuration can be set via `.env` file or command-line arguments. Command-line arguments override `.env` values. + +#### Model Under Test (MUT) + +| Variable | Description | Example | +| --- | --- | --- | +| `MUT_ENDPOINT` | API endpoint for model under test | `http://localhost:11434` | +| `MUT_API_KEY` | API key (optional for local endpoints) | `sk-...` | +| `MUT_MODEL` | Model name/identifier | `qwen3:4b-q4_K_M` | + +#### Evaluator Configuration (for Non-Interactive Mode) + +| Variable | Description | Example | +| --- | --- | --- | +| `EVALUATOR_ENDPOINT` | API endpoint for evaluator model | `http://localhost:11434` | +| `EVALUATOR_API_KEY` | API key for evaluator | `sk-...` | +| `EVALUATOR_MODEL` | Evaluator model name | `qwen3:14b` | +| `EVALUATOR_TEMPERATURE` | Temperature for evaluator (lower = more consistent) | `0.3` | + +#### Test Configuration + +| Variable | Description | Example | +| --- | --- | --- | +| `NON_INTERACTIVE` | Enable automated evaluation | `true` or `false` | +| `TEST_SUITE` | Path to test suite YAML file | `test_suite.yaml` | +| `OUTPUT_DIR` | Results output directory | `results` | +| `FILTER_CATEGORY` | Filter tests by category (optional) | `IT Forensics - File Systems` | + +### Command-Line Arguments + +All environment variables have corresponding command-line flags: + +```bash +python ai_eval.py --help + +Options: + --endpoint ENDPOINT Model under test endpoint + --api-key API_KEY Model under test API key + --model MODEL Model name to test + --test-suite FILE Test suite YAML file + --output-dir DIR Output directory + --category CATEGORY Filter by category + --non-interactive Enable automated evaluation + --evaluator-endpoint ENDPOINT Evaluator API endpoint + --evaluator-api-key KEY Evaluator API key + --evaluator-model MODEL Evaluator model name + --evaluator-temperature TEMP Evaluator temperature +``` + + ## Advanced Usage ### Custom Test Suite @@ -214,28 +392,25 @@ Edit `test_suite.yaml` to add your own tests: expected_difficulty: "medium" # medium, hard, very_hard ``` -### Batch Testing Script +### Batch Testing Examples -Create `batch_test.sh`: +Testing multiple models using the `.env` configuration: ```bash -#!/bin/bash +# Configure .env with multiple models +cp .env.example .env +nano .env -ENDPOINT="http://localhost:11434" +# Set multiple models (comma-separated) +MUT_MODEL=qwen3:4b-q4_K_M,qwen3:4b-q8_0,qwen3:4b-fp16,qwen3:8b-q4_K_M -# Test all qwen3:4b quantizations -for quant in q4_K_M q8_0 fp16; do - echo "Testing qwen3:4b-${quant}..." - python ai_eval.py --endpoint $ENDPOINT --model "qwen3:4b-${quant}" -done +# Run batch tests +python ai_eval.py -# Test all sizes with q4_K_M -for size in 4b 8b 14b; do - echo "Testing qwen3:${size}-q4_K_M..." 
- python ai_eval.py --endpoint $ENDPOINT --model "qwen3:${size}-q4_K_M" -done +# Or via command line +python ai_eval.py --model qwen3:4b-q4_K_M,qwen3:8b-q4_K_M,qwen3:14b-q4_K_M -# Generate comparison +# Generate comparison after testing python analyze_results.py --compare ``` @@ -244,8 +419,8 @@ python analyze_results.py --compare For OpenAI-compatible cloud services: ```bash -python ai_eval.py \ - --endpoint https://api.service.com \ - --api-key your-api-key \ - --model model-name +# In .env file +MUT_ENDPOINT=https://api.service.com +MUT_API_KEY=your-api-key +MUT_MODEL=model-name ``` diff --git a/ai_eval.py b/ai_eval.py index fa86bc5..caf9ae4 100644 --- a/ai_eval.py +++ b/ai_eval.py @@ -9,14 +9,19 @@ import json import requests import os import sys +import time from datetime import datetime from typing import Dict, List, Any, Optional from pathlib import Path import argparse +from dotenv import load_dotenv class AIModelTester: - def __init__(self, endpoint: str, api_key: str, model_name: str, output_dir: str = "results"): + def __init__(self, endpoint: str, api_key: str, model_name: str, output_dir: str = "results", + non_interactive: bool = False, evaluator_endpoint: Optional[str] = None, + evaluator_api_key: Optional[str] = None, evaluator_model: Optional[str] = None, + evaluator_temperature: float = 0.3): """ Initialize the AI Model Tester @@ -25,6 +30,11 @@ class AIModelTester: api_key: API key for authentication model_name: Name/identifier of the model being tested output_dir: Directory to save results + non_interactive: If True, use evaluator API for automatic scoring + evaluator_endpoint: API endpoint for evaluation model + evaluator_api_key: API key for evaluator + evaluator_model: Model name for evaluator + evaluator_temperature: Temperature for evaluator """ self.endpoint = endpoint.rstrip('/') self.api_key = api_key @@ -32,6 +42,19 @@ class AIModelTester: self.output_dir = Path(output_dir) self.output_dir.mkdir(exist_ok=True) + # Non-interactive mode settings + self.non_interactive = non_interactive + self.evaluator_endpoint = evaluator_endpoint.rstrip('/') if evaluator_endpoint else None + self.evaluator_api_key = evaluator_api_key + self.evaluator_model = evaluator_model + self.evaluator_temperature = evaluator_temperature + + if self.non_interactive: + if not all([self.evaluator_endpoint, self.evaluator_model]): + raise ValueError("Non-interactive mode requires evaluator_endpoint and evaluator_model") + print(f"🤖 Non-interactive mode enabled") + print(f" Evaluator: {self.evaluator_model} @ {self.evaluator_endpoint}") + # Results storage self.results = { "metadata": { @@ -49,6 +72,69 @@ class AIModelTester: self.current_test_id = None self.conversation_history = [] + # Track failed auto-evaluations for manual review + self.failed_evaluations = [] + + def test_connection(self, endpoint: str, api_key: str, model: str, endpoint_name: str = "API") -> bool: + """ + Test if an API endpoint can be reached and authenticated against + + Args: + endpoint: API endpoint URL + api_key: API key for authentication + model: Model name to test + endpoint_name: Name for display purposes + + Returns: + True if connection successful, False otherwise + """ + print(f"\n🔍 Testing {endpoint_name} connection...") + print(f" Endpoint: {endpoint}") + print(f" Model: {model}") + + headers = { + "Content-Type": "application/json" + } + + if api_key: + headers["Authorization"] = f"Bearer {api_key}" + print(f" API Key: {api_key[:10]}..." 
if len(api_key) > 10 else " API Key: [set]") + else: + print(f" API Key: [none]") + + # Simple test message + payload = { + "model": model, + "messages": [{"role": "user", "content": "test"}], + "max_tokens": 1 + } + + try: + response = requests.post( + f"{endpoint}/v1/chat/completions", + headers=headers, + json=payload, + timeout=10 + ) + response.raise_for_status() + + print(f" ✅ {endpoint_name} connection successful") + return True + + except requests.exceptions.HTTPError as e: + print(f" ❌ {endpoint_name} HTTP error: {e.response.status_code}") + print(f" {e.response.text[:200]}") + return False + except requests.exceptions.ConnectionError as e: + print(f" ❌ {endpoint_name} connection failed: Cannot reach endpoint") + return False + except requests.exceptions.Timeout: + print(f" ❌ {endpoint_name} connection timeout") + return False + except requests.exceptions.RequestException as e: + print(f" ❌ {endpoint_name} error: {e}") + return False + def load_test_suite(self, yaml_file: str) -> Dict: """Load test suite from YAML file""" try: @@ -94,18 +180,42 @@ class AIModelTester: } try: + print(f"\nDEBUG: Calling API endpoint: {self.endpoint}/v1/chat/completions") + print(f"DEBUG: Model name: {self.model_name}") + response = requests.post( f"{self.endpoint}/v1/chat/completions", headers=headers, json=payload, - timeout=120 + timeout=240 ) response.raise_for_status() return response.json() + except requests.exceptions.Timeout as e: + print(f"\n⚠️ API Timeout Error: Request exceeded 120 seconds") + print(f" This usually means the model is taking too long to generate a response.") + print(f" Consider using a faster model or reducing the complexity of the prompt.") + return None + except requests.exceptions.HTTPError as e: + print(f"\n⚠️ API HTTP Error: {e}") + print(f" Status Code: {e.response.status_code}") + print(f" Response: {e.response.text[:500]}") + + # Show request details for debugging + print("\n REQUEST DETAILS:") + print(f" URL: {self.endpoint}/v1/chat/completions") + print(f" Model: {self.model_name}") + + return None + except requests.exceptions.ConnectionError as e: + print(f"\n⚠️ API Connection Error: {e}") + print(f" Could not connect to {self.endpoint}") + print(f" Please check your network connection and endpoint URL.") + return None except requests.exceptions.RequestException as e: - print(f"\n❌ API Error: {e}") + print(f"\n⚠️ API Request Error: {e}") if hasattr(e, 'response') and e.response is not None: - print(f"Response: {e.response.text}") + print(f" Response: {e.response.text[:500]}") return None def display_test_info(self, test: Dict, category: str): @@ -128,9 +238,34 @@ class AIModelTester: print(prompt) print("-"*80) - def display_response(self, response_text: str): - """Display the model's response""" - print(f"\n🤖 MODEL RESPONSE:") + def display_response(self, response_text: str, raw_response: Optional[Dict] = None, generation_time: Optional[float] = None): + """Display the model's response with timing and token usage metrics""" + print("\nRAW API RESPONSE:") + print("="*80) + import json + print(json.dumps(raw_response, indent=2)) + print("="*80) + + # Display timing and token metrics if available + if generation_time is not None: + print(f"\n⏱️ Generation Time: {generation_time:.2f}s") + + if raw_response and 'usage' in raw_response: + usage = raw_response['usage'] + print(f"\n📊 Token Usage:") + if 'prompt_tokens' in usage: + print(f" Prompt Tokens: {usage['prompt_tokens']}") + if 'completion_tokens' in usage: + print(f" Completion Tokens: 
{usage['completion_tokens']}") + if 'total_tokens' in usage: + print(f" Total Tokens: {usage['total_tokens']}") + + # Calculate tokens per second if we have timing + if generation_time and generation_time > 0 and 'completion_tokens' in usage: + tokens_per_sec = usage['completion_tokens'] / generation_time + print(f" Speed: {tokens_per_sec:.2f} tokens/sec") + + print(f"\n\nMODEL RESPONSE:") print("-"*80) print(response_text) print("-"*80) @@ -169,6 +304,194 @@ class AIModelTester: print("\n\n⚠️ Test interrupted by user") return {"score": None, "notes": "Interrupted"} + def extract_api_metrics(self, response: Dict) -> Optional[Dict]: + """ + Extract all available metrics from API response + + Args: + response: The API response dict + + Returns: + Dict with usage statistics and timing information, or None if no metrics available + """ + metrics = {} + + if response and isinstance(response, dict): + # Extract usage statistics if available + usage = response.get('usage', {}) + if usage: + metrics['usage'] = { + 'prompt_tokens': usage.get('prompt_tokens'), + 'completion_tokens': usage.get('completion_tokens'), + 'total_tokens': usage.get('total_tokens'), + 'prompt_eval_count': usage.get('prompt_eval_count'), + 'eval_count': usage.get('eval_count'), + 'prompt_eval_duration': usage.get('prompt_eval_duration'), + 'eval_duration': usage.get('eval_duration'), + 'load_duration': usage.get('load_duration'), + 'total_duration': usage.get('total_duration'), + 'response_token_s': usage.get('response_token/s'), + 'prompt_token_s': usage.get('prompt_token/s'), + } + # Remove None values + metrics['usage'] = {k: v for k, v in metrics['usage'].items() if v is not None} + + # Extract model info + if 'model' in response: + metrics['model'] = response['model'] + + # Extract creation timestamp + if 'created' in response: + metrics['created'] = response['created'] + + # Extract finish reason + if 'choices' in response and len(response['choices']) > 0: + finish_reason = response['choices'][0].get('finish_reason') + if finish_reason: + metrics['finish_reason'] = finish_reason + + return metrics if metrics else None + + def get_evaluator_score(self, prompt: str, response: str, criteria: List[str]) -> Dict: + """ + Get automated score from evaluator API + + Args: + prompt: The original prompt sent to the model under test + response: The model's response to evaluate + criteria: List of evaluation criteria + + Returns: + Dict with 'score', 'notes', and 'auto_eval_failed' keys + """ + # Construct evaluation prompt + criteria_text = "\n".join([f"{i+1}. {c}" for i, c in enumerate(criteria)]) + + eval_system_prompt = """You are an expert AI model evaluator. Your task is to objectively assess AI model responses based on specific criteria. + +Scoring Rubric: +- 0-1: FAIL - Major errors, fails to meet basic requirements, incorrect or misleading information +- 2-3: PASS - Meets requirements with minor issues, mostly correct with some imperfections +- 4-5: EXCEPTIONAL - Exceeds requirements, demonstrates deep understanding, accurate and comprehensive + +CRITICAL: You MUST respond with ONLY valid JSON. Do not include any explanatory text before or after the JSON. + +Response format (use ONLY this, nothing else): +{"score": , "notes": ""}""" + + eval_user_prompt = f"""Evaluate this AI model response against the criteria below. 
+ +**Original Prompt:** +{prompt} + +**Model Response:** +{response} + +**Evaluation Criteria:** +{criteria_text} + +Respond with ONLY valid JSON in this exact format: +{{"score": , "notes": ""}} + +Do not include any text before or after the JSON object.""" + + messages = [ + {"role": "system", "content": eval_system_prompt}, + {"role": "user", "content": eval_user_prompt} + ] + + # Call evaluator API + headers = { + "Content-Type": "application/json" + } + + if self.evaluator_api_key: + headers["Authorization"] = f"Bearer {self.evaluator_api_key}" + + payload = { + "model": self.evaluator_model, + "messages": messages, + "temperature": self.evaluator_temperature, + "max_tokens": 500 + } + + try: + print(f"\n🤖 Calling evaluator API for automated scoring...") + response_obj = requests.post( + f"{self.evaluator_endpoint}/v1/chat/completions", + headers=headers, + json=payload, + timeout=90 # Increased timeout for evaluator + ) + response_obj.raise_for_status() + eval_response = response_obj.json() + + # Extract the evaluation + eval_text = eval_response['choices'][0]['message']['content'].strip() + + # Try to extract JSON from the response + # Handle case where model wraps JSON in markdown code blocks or adds extra text + json_obj = None + + # Try direct parsing first + try: + json_obj = json.loads(eval_text) + except json.JSONDecodeError: + # Try to find JSON in markdown code blocks + if '```json' in eval_text or '```' in eval_text: + # Extract content between code fences + import re + json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', eval_text, re.DOTALL) + if json_match: + json_obj = json.loads(json_match.group(1)) + + # Try to find any JSON object in the text + if json_obj is None: + import re + json_match = re.search(r'\{[^{}]*"score"[^{}]*"notes"[^{}]*\}', eval_text, re.DOTALL) + if json_match: + json_obj = json.loads(json_match.group(0)) + + if json_obj is None: + raise json.JSONDecodeError("No valid JSON found", eval_text, 0) + + score = int(json_obj.get('score', 0)) + notes = json_obj.get('notes', 'Automated evaluation') + + # Validate score range + if not 0 <= score <= 5: + print(f"⚠️ Evaluator returned invalid score {score}, clamping to 0-5") + score = max(0, min(5, score)) + + print(f" Score: {score}/5") + print(f" Notes: {notes}") + + return {"score": score, "notes": f"[Auto-Reviev by {self.evaluator_model}] {notes}", "auto_eval_failed": False} + + except json.JSONDecodeError as e: + print(f"⚠️ Failed to parse evaluator response as JSON: {e}") + print(f" Raw response: {eval_text[:200]}") + print(f" Marking for manual review") + return {"score": None, "notes": f"[AUTO-ERROR] Evaluator returned non-JSON response", "auto_eval_failed": True} + except (KeyError, IndexError, ValueError) as e: + print(f"⚠️ Invalid evaluator response format: {e}") + print(f" Marking for manual review") + return {"score": None, "notes": f"[AUTO-ERROR] Invalid response format: {str(e)}", "auto_eval_failed": True} + except requests.exceptions.Timeout as e: + print(f"⚠️ Evaluator API timeout: Request exceeded 90 seconds") + print(f" The evaluator model is taking too long to respond.") + print(f" Marking for manual review") + return {"score": None, "notes": "[AUTO-ERROR] Evaluator API timeout", "auto_eval_failed": True} + except requests.exceptions.ConnectionError as e: + print(f"⚠️ Evaluator connection error: {e}") + print(f" Could not connect to evaluator endpoint.") + print(f" Marking for manual review") + return {"score": None, "notes": "[AUTO-ERROR] Cannot connect to evaluator", 
"auto_eval_failed": True} + except requests.exceptions.RequestException as e: + print(f"⚠️ Evaluator API error: {e}") + print(f" Marking for manual review") + return {"score": None, "notes": f"[AUTO-ERROR] API call failed: {str(e)[:100]}", "auto_eval_failed": True} + def run_single_turn_test(self, test: Dict, category: str) -> Dict: """Run a single-turn test""" self.display_test_info(test, category) @@ -177,8 +500,10 @@ class AIModelTester: # Prepare messages messages = [{"role": "user", "content": test['prompt']}] - # Call API + # Call API and measure time + start_time = time.time() response = self.call_api(messages) + generation_time = time.time() - start_time if response is None: return { "test_id": test['id'], @@ -187,20 +512,79 @@ class AIModelTester: "type": "single_turn", "status": "api_error", "score": None, - "notes": "API call failed" + "notes": "API call failed", + "generation_time": generation_time } - # Extract response text - response_text = response['choices'][0]['message']['content'] - self.display_response(response_text) + # Extract response text with better error handling + try: + message = response['choices'][0]['message'] + # Try to get content, if empty check for reasoning_content + response_text = message.get('content', '') + + # If content is empty but reasoning_content exists, use that + if not response_text and 'reasoning_content' in message: + response_text = message['reasoning_content'] + print("\n⚠️ Note: Response contained only reasoning_content, no actual content generated") + + # If still empty, check for tool_calls (model might be trying to call functions) + if not response_text and 'tool_calls' in message: + print("\n⚠️ Warning: Model attempted to call tools instead of generating content") + tool_info = json.dumps(message['tool_calls'], indent=2) + response_text = f"[MODEL ERROR: Attempted tool calls instead of text response]\n{tool_info}" + + # If completely empty, this is an error + if not response_text: + print("\n⚠️ ERROR: Model returned completely empty response") + response_text = "[ERROR: Empty response from model]" + + except (KeyError, IndexError, TypeError) as e: + print(f"\nERROR: Failed to parse response - {e}") + print("\nRAW API RESPONSE:") + print("="*80) + import json + print(json.dumps(response, indent=2)) + print("="*80) + + response_text = f"[PARSING ERROR: {e}]" + + self.display_response(response_text, raw_response=response, generation_time=generation_time) # Display evaluation criteria self.display_evaluation_criteria(test.get('evaluation_criteria', [])) - # Get user evaluation - evaluation = self.get_user_score() + # Get evaluation (interactive or automated) + # Skip automated evaluation if response is an error + if self.non_interactive: + # Check if response is actually an error + if response_text.startswith('[ERROR:') or response_text.startswith('[MODEL ERROR:') or response_text.startswith('[PARSING ERROR:'): + print(f"\n⚠️ Skipping automated evaluation due to model error") + evaluation = {"score": 0, "notes": "[AUTO-SKIP] Model failed to generate valid response", "auto_eval_failed": False} + else: + evaluation = self.get_evaluator_score( + prompt=test['prompt'], + response=response_text, + criteria=test.get('evaluation_criteria', []) + ) + # Track failed evaluations for manual review + if evaluation.get('auto_eval_failed', False): + self.failed_evaluations.append({ + 'test_id': test['id'], + 'test_name': test['name'], + 'category': category, + 'type': 'single_turn', + 'prompt': test['prompt'], + 'response': response_text, + 
'criteria': test.get('evaluation_criteria', []), + 'error': evaluation['notes'] + }) + else: + evaluation = self.get_user_score() - return { + # Extract API metrics + api_metrics = self.extract_api_metrics(response) + + result = { "test_id": test['id'], "test_name": test['name'], "category": category, @@ -208,12 +592,20 @@ class AIModelTester: "difficulty": test.get('expected_difficulty', 'unknown'), "prompt": test['prompt'], "response": response_text, + "raw_response": response if response_text.startswith("[PARSING ERROR") else None, "evaluation_criteria": test.get('evaluation_criteria', []), "score": evaluation['score'], "notes": evaluation['notes'], "status": "completed" if evaluation['score'] is not None else "skipped", - "timestamp": datetime.now().isoformat() + "timestamp": datetime.now().isoformat(), + "generation_time": generation_time } + + # Add metrics if available + if api_metrics: + result['api_metrics'] = api_metrics + + return result def run_multi_turn_test(self, test: Dict, category: str) -> Dict: """Run a multi-turn test""" @@ -232,20 +624,57 @@ class AIModelTester: # Add to conversation history self.conversation_history.append({"role": "user", "content": prompt}) - # Call API with full conversation history + # Call API with full conversation history and measure time + start_time = time.time() response = self.call_api(self.conversation_history) + generation_time = time.time() - start_time + if response is None: turn_results.append({ "turn": turn_num, "status": "api_error", "prompt": prompt, - "response": None + "response": None, + "score": None, + "notes": "API error - failed to get response", + "generation_time": generation_time }) break - # Extract and display response - response_text = response['choices'][0]['message']['content'] - self.display_response(response_text) + # Extract and display response with better error handling + try: + message = response['choices'][0]['message'] + # Try to get content, if empty check for reasoning_content + response_text = message.get('content', '') + + # If content is empty but reasoning_content exists, use that + if not response_text and 'reasoning_content' in message: + response_text = message['reasoning_content'] + print("\n⚠️ Note: Response contained only reasoning_content, no actual content generated") + + # If still empty, check for tool_calls + if not response_text and 'tool_calls' in message: + print("\n⚠️ Warning: Model attempted to call tools instead of generating content") + import json + tool_info = json.dumps(message['tool_calls'], indent=2) + response_text = f"[MODEL ERROR: Attempted tool calls instead of text response]\n{tool_info}" + + # If completely empty, this is an error + if not response_text: + print("\n⚠️ ERROR: Model returned completely empty response") + response_text = "[ERROR: Empty response from model]" + + except (KeyError, IndexError, TypeError) as e: + print(f"\nERROR: Failed to parse response - {e}") + print("\nRAW API RESPONSE:") + print("="*80) + import json + print(json.dumps(response, indent=2)) + print("="*80) + + response_text = f"[PARSING ERROR: {e}]" + + self.display_response(response_text, raw_response=response, generation_time=generation_time) # Add assistant response to history self.conversation_history.append({"role": "assistant", "content": response_text}) @@ -253,29 +682,84 @@ class AIModelTester: # Display criteria for this turn self.display_evaluation_criteria(turn_data.get('evaluation_criteria', [])) - # Get evaluation for this turn - print(f"\n🎯 Evaluate Turn {turn_num}:") - evaluation 
= self.get_user_score() + # Get evaluation for this turn (interactive or automated) + print(f"\nEvaluate Turn {turn_num}:") + if self.non_interactive: + # Skip automated evaluation if response is an error + if response_text.startswith('[ERROR:') or response_text.startswith('[MODEL ERROR:') or response_text.startswith('[PARSING ERROR:'): + print(f"\n⚠️ Skipping automated evaluation due to model error") + evaluation = {"score": 0, "notes": "[AUTO-SKIP] Model failed to generate valid response", "auto_eval_failed": False} + else: + evaluation = self.get_evaluator_score( + prompt=prompt, + response=response_text, + criteria=turn_data.get('evaluation_criteria', []) + ) + # Track failed evaluations for manual review + if evaluation.get('auto_eval_failed', False): + self.failed_evaluations.append({ + 'test_id': test['id'], + 'test_name': test['name'], + 'category': category, + 'type': 'multi_turn', + 'turn': turn_num, + 'prompt': prompt, + 'response': response_text, + 'criteria': turn_data.get('evaluation_criteria', []), + 'error': evaluation['notes'] + }) + else: + evaluation = self.get_user_score() - turn_results.append({ + # Extract API metrics for this turn + api_metrics = self.extract_api_metrics(response) + + turn_result = { "turn": turn_num, "prompt": prompt, "response": response_text, "evaluation_criteria": turn_data.get('evaluation_criteria', []), "score": evaluation['score'], "notes": evaluation['notes'], - "status": "completed" if evaluation['score'] is not None else "skipped" - }) + "status": "completed" if evaluation['score'] is not None else "skipped", + "generation_time": generation_time + } + + # Add metrics if available + if api_metrics: + turn_result['api_metrics'] = api_metrics + + turn_results.append(turn_result) if evaluation['score'] is None: - print(f"\n⚠️ Turn {turn_num} skipped, stopping multi-turn test") + print(f"\nTurn {turn_num} skipped, stopping multi-turn test") break # Calculate overall score for multi-turn test valid_scores = [t['score'] for t in turn_results if t['score'] is not None] overall_score = sum(valid_scores) / len(valid_scores) if valid_scores else None - return { + # Aggregate metrics across all turns + aggregate_metrics = {} + turn_metrics = [t.get('api_metrics') for t in turn_results if t.get('api_metrics')] + + if turn_metrics: + # Sum up token counts and durations + total_prompt_tokens = sum(m.get('usage', {}).get('prompt_tokens', 0) for m in turn_metrics) + total_completion_tokens = sum(m.get('usage', {}).get('completion_tokens', 0) for m in turn_metrics) + total_duration = sum(m.get('usage', {}).get('total_duration', 0) for m in turn_metrics) + + aggregate_metrics['usage'] = { + 'total_prompt_tokens': total_prompt_tokens if total_prompt_tokens else None, + 'total_completion_tokens': total_completion_tokens if total_completion_tokens else None, + 'total_tokens': (total_prompt_tokens + total_completion_tokens) if (total_prompt_tokens or total_completion_tokens) else None, + 'total_duration': total_duration if total_duration else None, + 'turn_count': len(turn_metrics) + } + # Remove None values + aggregate_metrics['usage'] = {k: v for k, v in aggregate_metrics['usage'].items() if v is not None} + + result = { "test_id": test['id'], "test_name": test['name'], "category": category, @@ -286,6 +770,99 @@ class AIModelTester: "status": "completed" if overall_score is not None else "incomplete", "timestamp": datetime.now().isoformat() } + + # Add aggregate metrics if available + if aggregate_metrics: + result['aggregate_metrics'] = aggregate_metrics + + 
return result + + def manual_review_failed_evaluations(self): + """Present failed automated evaluations for manual review""" + print("\n\n" + "="*80) + print("⚠️ MANUAL REVIEW REQUIRED") + print("="*80) + print(f"\n{len(self.failed_evaluations)} test(s) could not be automatically evaluated.") + print("Please provide manual scores for these tests.\n") + + for idx, failed in enumerate(self.failed_evaluations, 1): + print("\n" + "="*80) + print(f"📋 MANUAL REVIEW {idx}/{len(self.failed_evaluations)}") + print("="*80) + print(f"🆔 Test ID: {failed['test_id']}") + print(f"📝 Test Name: {failed['test_name']}") + print(f"📂 Category: {failed['category']}") + if failed['type'] == 'multi_turn': + print(f"🔄 Turn: {failed['turn']}") + print(f"❌ Auto-Eval Error: {failed['error']}") + + print(f"\n💬 PROMPT:") + print("-"*80) + print(failed['prompt']) + print("-"*80) + + print(f"\nMODEL RESPONSE:") + print("-"*80) + print(failed['response']) + print("-"*80) + + print(f"\n✅ EVALUATION CRITERIA:") + for i, criterion in enumerate(failed['criteria'], 1): + print(f" {i}. {criterion}") + + # Get manual score + print("\n" + "="*80) + print("📊 EVALUATION SCORING RUBRIC:") + print(" 0-1: FAIL - Major errors, fails to meet basic requirements") + print(" 2-3: PASS - Meets requirements with minor issues") + print(" 4-5: EXCEPTIONAL - Exceeds requirements, demonstrates deep understanding") + print("="*80) + + manual_evaluation = self.get_user_score() + + # Update the corresponding test result + self.update_test_result_with_manual_score( + failed['test_id'], + failed.get('turn'), + manual_evaluation + ) + + # Save updated results + self.save_results() + print("\n✅ All manual reviews completed and saved!") + + def update_test_result_with_manual_score(self, test_id: str, turn: Optional[int], evaluation: Dict): + """Update a test result with manually provided score""" + for result in self.results['test_results']: + if result['test_id'] == test_id: + if turn is None: + # Single-turn test + result['score'] = evaluation['score'] + # Only add MANUAL-OVERRIDE prefix if a score was actually provided + if evaluation['score'] is not None: + result['notes'] = f"[MANUAL-OVERRIDE] {evaluation['notes']}" if evaluation['notes'] else "[MANUAL-OVERRIDE]" + result['status'] = 'completed' + else: + result['notes'] = evaluation['notes'] if evaluation['notes'] else "Manual review skipped" + result['status'] = 'manual_review_skipped' + else: + # Multi-turn test - update specific turn + for turn_result in result.get('turns', []): + if turn_result['turn'] == turn: + turn_result['score'] = evaluation['score'] + # Only add MANUAL-OVERRIDE prefix if a score was actually provided + if evaluation['score'] is not None: + turn_result['notes'] = f"[MANUAL-OVERRIDE] {evaluation['notes']}" if evaluation['notes'] else "[MANUAL-OVERRIDE]" + turn_result['status'] = 'completed' + else: + turn_result['notes'] = evaluation['notes'] if evaluation['notes'] else "Manual review skipped" + turn_result['status'] = 'manual_review_skipped' + + # Recalculate overall score + valid_scores = [t['score'] for t in result['turns'] if t['score'] is not None] + result['overall_score'] = sum(valid_scores) / len(valid_scores) if valid_scores else None + result['status'] = 'completed' if result['overall_score'] is not None else 'incomplete' + break def run_test_suite(self, test_suite: Dict, filter_category: Optional[str] = None): """Run the complete test suite""" @@ -340,8 +917,14 @@ class AIModelTester: self.results['metadata']['test_end'] = datetime.now().isoformat() 
self.save_results() + # Handle failed evaluations if in non-interactive mode + if self.non_interactive and self.failed_evaluations: + self.manual_review_failed_evaluations() + print("\n\n" + "="*80) print("✅ TEST SUITE COMPLETE") + if self.non_interactive and self.failed_evaluations: + print(f" ({len(self.failed_evaluations)} test(s) manually reviewed)") print("="*80) self.display_summary() @@ -378,90 +961,286 @@ class AIModelTester: print(f" Pass Rate: {len([s for s in scores if s >= 2]) / len(scores) * 100:.1f}%") print(f" Exceptional Rate: {len([s for s in scores if s >= 4]) / len(scores) * 100:.1f}%") + # Calculate aggregate API metrics + total_prompt_tokens = 0 + total_completion_tokens = 0 + total_duration = 0 + tests_with_metrics = 0 + + for result in self.results['test_results']: + # Single-turn tests + if result.get('api_metrics'): + usage = result['api_metrics'].get('usage', {}) + total_prompt_tokens += usage.get('prompt_tokens', 0) + total_completion_tokens += usage.get('completion_tokens', 0) + total_duration += usage.get('total_duration', 0) + tests_with_metrics += 1 + # Multi-turn tests + elif result.get('aggregate_metrics'): + usage = result['aggregate_metrics'].get('usage', {}) + total_prompt_tokens += usage.get('total_prompt_tokens', 0) + total_completion_tokens += usage.get('total_completion_tokens', 0) + total_duration += usage.get('total_duration', 0) + tests_with_metrics += 1 + + if tests_with_metrics > 0: + print(f"\n⚡ API METRICS:") + print(f" Total Prompt Tokens: {total_prompt_tokens:,}") + print(f" Total Completion Tokens: {total_completion_tokens:,}") + print(f" Total Tokens: {total_prompt_tokens + total_completion_tokens:,}") + if total_duration > 0: + # Convert nanoseconds to seconds + duration_seconds = total_duration / 1_000_000_000 + print(f" Total Duration: {duration_seconds:.2f}s") + if total_completion_tokens > 0: + tokens_per_second = total_completion_tokens / duration_seconds + print(f" Average Speed: {tokens_per_second:.2f} tokens/s") + print(f"\n💾 Results saved to: {self.output_dir}") def main(): + # Load environment variables from .env file if it exists + load_dotenv() + parser = argparse.ArgumentParser( description="AI Model Evaluation Test Suite", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: - # Test a single model + # Test a single model (interactive mode) python ai_eval.py --endpoint http://localhost:11434 --model qwen3:4b-q4_K_M # Test with API key python ai_eval.py --endpoint https://api.example.com --api-key sk-xxx --model qwen3:8b + # Non-interactive mode with evaluator + python ai_eval.py --non-interactive --evaluator-endpoint http://localhost:11434 --evaluator-model qwen3:14b + + # Use .env file for configuration (recommended) + cp .env.example .env + # Edit .env with your settings + python ai_eval.py + # Test only forensics category python ai_eval.py --endpoint http://localhost:11434 --model qwen3:14b --category "IT Forensics - File Systems" - - # Test multiple models (run separately) - python ai_eval.py --endpoint http://localhost:11434 --model qwen3:4b-q4_K_M - python ai_eval.py --endpoint http://localhost:11434 --model qwen3:4b-q8_0 - python ai_eval.py --endpoint http://localhost:11434 --model qwen3:4b-fp16 """ ) parser.add_argument( '--endpoint', - required=True, - help='OpenAI-compatible API endpoint (e.g., http://localhost:11434 for Ollama)' + default=os.getenv('MUT_ENDPOINT'), + help='OpenAI-compatible API endpoint for model under test (default: from .env MUT_ENDPOINT)' ) parser.add_argument( 
'--api-key', - default='', - help='API key for authentication (optional for local endpoints)' + default=os.getenv('MUT_API_KEY', ''), + help='API key for model under test (default: from .env MUT_API_KEY)' ) parser.add_argument( '--model', - required=True, - help='Model name/identifier (e.g., qwen3:4b-q4_K_M)' + default=os.getenv('MUT_MODEL'), + help='Model name/identifier to test (default: from .env MUT_MODEL)' ) parser.add_argument( '--test-suite', - default='test_suite.yaml', - help='Path to test suite YAML file (default: test_suite.yaml)' + default=os.getenv('TEST_SUITE', 'test_suite.yaml'), + help='Path to test suite YAML file (default: from .env TEST_SUITE or test_suite.yaml)' ) parser.add_argument( '--output-dir', - default='results', - help='Directory to save results (default: results)' + default=os.getenv('OUTPUT_DIR', 'results'), + help='Directory to save results (default: from .env OUTPUT_DIR or results)' ) parser.add_argument( '--category', - default=None, - help='Filter tests by category (optional)' + default=os.getenv('FILTER_CATEGORY'), + help='Filter tests by category (default: from .env FILTER_CATEGORY)' + ) + + parser.add_argument( + '--non-interactive', + action='store_true', + default=os.getenv('NON_INTERACTIVE', '').lower() in ('true', '1', 'yes'), + help='Run in non-interactive mode with automated evaluation (default: from .env NON_INTERACTIVE)' + ) + + parser.add_argument( + '--evaluator-endpoint', + default=os.getenv('EVALUATOR_ENDPOINT'), + help='API endpoint for evaluator model (required for non-interactive mode, default: from .env EVALUATOR_ENDPOINT)' + ) + + parser.add_argument( + '--evaluator-api-key', + default=os.getenv('EVALUATOR_API_KEY', ''), + help='API key for evaluator (default: from .env EVALUATOR_API_KEY)' + ) + + parser.add_argument( + '--evaluator-model', + default=os.getenv('EVALUATOR_MODEL'), + help='Model name for evaluator (required for non-interactive mode, default: from .env EVALUATOR_MODEL)' + ) + + parser.add_argument( + '--evaluator-temperature', + type=float, + default=float(os.getenv('EVALUATOR_TEMPERATURE', '0.3')), + help='Temperature for evaluator model (default: from .env EVALUATOR_TEMPERATURE or 0.3)' + ) + + parser.add_argument( + '--dry-run', + action='store_true', + help='Test API connectivity without running the full test suite' ) args = parser.parse_args() - # Initialize tester - tester = AIModelTester( - endpoint=args.endpoint, - api_key=args.api_key, - model_name=args.model, - output_dir=args.output_dir - ) - - # Load test suite - print(f"📁 Loading test suite from: {args.test_suite}") - test_suite = tester.load_test_suite(args.test_suite) - - # Run tests - try: - tester.run_test_suite(test_suite, filter_category=args.category) - except KeyboardInterrupt: - print("\n\n⚠️ Test suite interrupted by user") - tester.results['metadata']['test_end'] = datetime.now().isoformat() - tester.save_results() - print(f"\n💾 Partial results saved to: {tester.output_dir}") + # Validate required arguments + if not args.endpoint: + print("❌ Error: --endpoint is required (or set MUT_ENDPOINT in .env)") + print(" Example: --endpoint http://localhost:11434") sys.exit(1) + + if not args.model: + print("❌ Error: --model is required (or set MUT_MODEL in .env)") + print(" Example: --model qwen3:4b-q4_K_M") + sys.exit(1) + + if args.non_interactive: + if not args.evaluator_endpoint: + print("❌ Error: --evaluator-endpoint is required for non-interactive mode") + print(" (or set EVALUATOR_ENDPOINT in .env)") + sys.exit(1) + if not args.evaluator_model: + 
print("❌ Error: --evaluator-model is required for non-interactive mode") + print(" (or set EVALUATOR_MODEL in .env)") + sys.exit(1) + + # Parse model list (supports comma-separated models) + model_list = [m.strip() for m in args.model.split(',') if m.strip()] + + # Dry run mode - just test connections + if args.dry_run: + print(f"\n{'='*80}") + print("🧪 DRY RUN MODE - Testing API Connectivity") + print(f"{'='*80}") + + all_success = True + + # Test MUT endpoint for each model + for idx, model_name in enumerate(model_list, 1): + if len(model_list) > 1: + print(f"\n--- Model {idx}/{len(model_list)} ---") + + tester = AIModelTester( + endpoint=args.endpoint, + api_key=args.api_key, + model_name=model_name, + output_dir=args.output_dir, + non_interactive=args.non_interactive, + evaluator_endpoint=args.evaluator_endpoint, + evaluator_api_key=args.evaluator_api_key, + evaluator_model=args.evaluator_model, + evaluator_temperature=args.evaluator_temperature + ) + + success = tester.test_connection( + endpoint=args.endpoint, + api_key=args.api_key, + model=model_name, + endpoint_name="Model Under Test" + ) + all_success = all_success and success + + # Test evaluator endpoint if non-interactive mode + if args.non_interactive and args.evaluator_endpoint and args.evaluator_model: + print(f"\n{'='*80}") + tester = AIModelTester( + endpoint=args.endpoint, + api_key=args.api_key, + model_name=model_list[0], + output_dir=args.output_dir, + non_interactive=args.non_interactive, + evaluator_endpoint=args.evaluator_endpoint, + evaluator_api_key=args.evaluator_api_key, + evaluator_model=args.evaluator_model, + evaluator_temperature=args.evaluator_temperature + ) + + success = tester.test_connection( + endpoint=args.evaluator_endpoint, + api_key=args.evaluator_api_key, + model=args.evaluator_model, + endpoint_name="Evaluator" + ) + all_success = all_success and success + + print(f"\n{'='*80}") + if all_success: + print("✅ All connectivity tests passed") + print(f"{'='*80}") + sys.exit(0) + else: + print("❌ Some connectivity tests failed") + print(f"{'='*80}") + sys.exit(1) + + if len(model_list) > 1: + print(f"\n🔄 Batch mode: Testing {len(model_list)} models") + print("=" * 80) + + # Test each model + for idx, model_name in enumerate(model_list, 1): + if len(model_list) > 1: + print(f"\n{'='*80}") + print(f"📊 Model {idx}/{len(model_list)}: {model_name}") + print(f"{'='*80}\n") + + # Initialize tester + tester = AIModelTester( + endpoint=args.endpoint, + api_key=args.api_key, + model_name=model_name, + output_dir=args.output_dir, + non_interactive=args.non_interactive, + evaluator_endpoint=args.evaluator_endpoint, + evaluator_api_key=args.evaluator_api_key, + evaluator_model=args.evaluator_model, + evaluator_temperature=args.evaluator_temperature + ) + + # Load test suite + if idx == 1 or len(model_list) == 1: + print(f"📁 Loading test suite from: {args.test_suite}") + test_suite = tester.load_test_suite(args.test_suite) + + # Run tests + try: + tester.run_test_suite(test_suite, filter_category=args.category) + except KeyboardInterrupt: + print("\n\n⚠️ Test suite interrupted by user") + tester.results['metadata']['test_end'] = datetime.now().isoformat() + tester.save_results() + print(f"\n💾 Partial results saved to: {tester.output_dir}") + if len(model_list) > 1 and idx < len(model_list): + print(f"\n⚠️ Skipping remaining {len(model_list) - idx} models") + sys.exit(1) + + if len(model_list) > 1: + print(f"\n{'='*80}") + print(f"✅ BATCH COMPLETE: Tested {len(model_list)} models") + print(f"{'='*80}") + 
print(f"\n💾 Results saved to: {args.output_dir}/") + print("\nTo compare results, run:") + print(" python analyze_results.py --compare") if __name__ == "__main__": diff --git a/analyze_results.py b/analyze_results.py index 2c55a22..df9bcb8 100644 --- a/analyze_results.py +++ b/analyze_results.py @@ -2,6 +2,7 @@ """ AI Model Evaluation Results Analyzer Compares results across different models and quantizations +Includes interactive web interface for visualization and analysis """ import json @@ -10,6 +11,10 @@ from pathlib import Path from typing import List, Dict import argparse from collections import defaultdict +from flask import Flask, render_template, jsonify, request +import webbrowser +from threading import Timer +import numpy as np class ResultsAnalyzer: @@ -286,6 +291,1298 @@ class ResultsAnalyzer: print(f"✅ CSV exported to: {output_path}") +class WebInterface: + """Interactive web interface for results analysis""" + + def __init__(self, results_dir: str = "results"): + self.results_dir = Path(results_dir) + self.analyzer = ResultsAnalyzer(results_dir) + self.app = Flask(__name__) + self.setup_routes() + + def setup_routes(self): + """Setup Flask routes""" + + @self.app.route('/') + def index(): + """Main dashboard""" + return render_template('dashboard.html') + + @self.app.route('/api/models') + def get_models(): + """Get list of all available models""" + result_files = self.analyzer.find_result_files() + models = [] + + for filepath in result_files: + try: + results = self.analyzer.load_result_file(filepath) + metadata = results.get('metadata', {}) + models.append({ + 'name': metadata.get('model_name', 'Unknown'), + 'file': filepath.name, + 'total_tests': metadata.get('total_tests', 0), + 'completed_tests': metadata.get('completed_tests', 0), + 'test_start': metadata.get('test_start'), + 'test_end': metadata.get('test_end') + }) + except Exception as e: + print(f"Error loading {filepath}: {e}") + + return jsonify(models) + + @self.app.route('/api/results/') + def get_model_results(model_name): + """Get detailed results for a specific model""" + pattern = f"{model_name.replace(':', '_')}_latest.json" + filepath = self.results_dir / pattern + + if not filepath.exists(): + return jsonify({'error': 'Model not found'}), 404 + + results = self.analyzer.load_result_file(filepath) + return jsonify(results) + + @self.app.route('/api/comparison') + def get_comparison(): + """Get comparison data for all models""" + result_files = self.analyzer.find_result_files() + comparison_data = { + 'models': {}, + 'categories': set(), + 'difficulty_levels': set() + } + + for filepath in result_files: + try: + results = self.analyzer.load_result_file(filepath) + model_name = results['metadata']['model_name'] + + # Extract all scores and metadata + test_results = [] + for test in results.get('test_results', []): + score = test.get('score') or test.get('overall_score') + test_data = { + 'test_id': test.get('test_id'), + 'test_name': test.get('test_name'), + 'category': test.get('category', 'Unknown'), + 'type': test.get('type'), + 'difficulty': test.get('difficulty', 'medium'), + 'score': score, + 'status': test.get('status'), + 'notes': test.get('notes', '') + } + test_results.append(test_data) + + if test_data['category']: + comparison_data['categories'].add(test_data['category']) + if test_data['difficulty']: + comparison_data['difficulty_levels'].add(test_data['difficulty']) + + # Calculate overall statistics + all_scores = [t['score'] for t in test_results if t['score'] is not None] + stats = 
self.analyzer.calculate_statistics(all_scores) if all_scores else {} + + # Calculate category statistics + category_stats = {} + for category in comparison_data['categories']: + cat_scores = [t['score'] for t in test_results + if t['category'] == category and t['score'] is not None] + if cat_scores: + category_stats[category] = self.analyzer.calculate_statistics(cat_scores) + + # Calculate difficulty statistics + difficulty_stats = {} + for difficulty in comparison_data['difficulty_levels']: + diff_scores = [t['score'] for t in test_results + if t['difficulty'] == difficulty and t['score'] is not None] + if diff_scores: + difficulty_stats[difficulty] = self.analyzer.calculate_statistics(diff_scores) + + comparison_data['models'][model_name] = { + 'test_results': test_results, + 'overall_stats': stats, + 'category_stats': category_stats, + 'difficulty_stats': difficulty_stats, + 'metadata': results.get('metadata', {}) + } + + except Exception as e: + print(f"Error processing {filepath}: {e}") + + # Convert sets to lists for JSON serialization + comparison_data['categories'] = sorted(list(comparison_data['categories'])) + comparison_data['difficulty_levels'] = sorted(list(comparison_data['difficulty_levels'])) + + return jsonify(comparison_data) + + @self.app.route('/api/statistics') + def get_statistics(): + """Get advanced statistical analysis""" + result_files = self.analyzer.find_result_files() + + all_data = [] + model_names = [] + + for filepath in result_files: + try: + results = self.analyzer.load_result_file(filepath) + model_name = results['metadata']['model_name'] + model_names.append(model_name) + + scores = [test.get('score') or test.get('overall_score') + for test in results.get('test_results', [])] + scores = [s for s in scores if s is not None] + all_data.append(scores) + except Exception as e: + print(f"Error in statistics: {e}") + + # Calculate advanced statistics + statistics = { + 'models': model_names, + 'variance': [], + 'std_dev': [], + 'consistency_score': [], + 'robustness_score': [] + } + + for i, scores in enumerate(all_data): + if scores: + variance = np.var(scores) + std_dev = np.std(scores) + + # Consistency: lower variance is better (inverse normalized) + consistency = max(0, 100 - (std_dev * 20)) + + # Robustness: combination of average and consistency + avg_score = np.mean(scores) + robustness = (avg_score * 20) * (consistency / 100) + + statistics['variance'].append(float(variance)) + statistics['std_dev'].append(float(std_dev)) + statistics['consistency_score'].append(float(consistency)) + statistics['robustness_score'].append(float(robustness)) + else: + statistics['variance'].append(0) + statistics['std_dev'].append(0) + statistics['consistency_score'].append(0) + statistics['robustness_score'].append(0) + + return jsonify(statistics) + + @self.app.route('/api/intelligence_metrics') + def get_intelligence_metrics(): + """Calculate intelligence evaluation metrics""" + result_files = self.analyzer.find_result_files() + + metrics = {} + + for filepath in result_files: + try: + results = self.analyzer.load_result_file(filepath) + model_name = results['metadata']['model_name'] + test_results = results.get('test_results', []) + + # Define intelligence dimensions + dimensions = { + 'logical_reasoning': ['Logic & Reasoning'], + 'mathematical_ability': ['Mathematics & Calculation'], + 'instruction_following': ['Instruction Following'], + 'creativity': ['Creative Writing'], + 'technical_knowledge': ['Code Generation', 'IT Forensics'], + 'linguistic_nuance': 
['Language Nuance'], + 'conversational_depth': ['Multi-turn Conversations'] + } + + model_metrics = {} + + for dimension, categories in dimensions.items(): + dim_scores = [ + test.get('score') or test.get('overall_score') + for test in test_results + if test.get('category') in categories and + (test.get('score') or test.get('overall_score')) is not None + ] + + if dim_scores: + avg = sum(dim_scores) / len(dim_scores) + model_metrics[dimension] = { + 'score': avg, + 'normalized': (avg / 5.0) * 100, + 'count': len(dim_scores) + } + else: + model_metrics[dimension] = { + 'score': 0, + 'normalized': 0, + 'count': 0 + } + + # Calculate overall intelligence quotient (IQ score) + weighted_scores = [] + weights = { + 'logical_reasoning': 1.5, # Higher weight for core reasoning + 'mathematical_ability': 1.3, + 'instruction_following': 1.2, + 'creativity': 1.0, + 'technical_knowledge': 1.4, + 'linguistic_nuance': 1.1, + 'conversational_depth': 1.0 + } + + total_weight = 0 + for dim, data in model_metrics.items(): + if data['count'] > 0: + weighted_scores.append(data['score'] * weights[dim]) + total_weight += weights[dim] + + iq_score = (sum(weighted_scores) / total_weight * 20) if total_weight > 0 else 0 + + # Adaptability: performance across diverse categories + category_scores = {} + for test in test_results: + cat = test.get('category') + score = test.get('score') or test.get('overall_score') + if cat and score is not None: + if cat not in category_scores: + category_scores[cat] = [] + category_scores[cat].append(score) + + adaptability = len([cat for cat, scores in category_scores.items() + if sum(scores)/len(scores) >= 2.5]) / max(len(category_scores), 1) * 100 + + # Problem-solving depth: performance on hard problems + hard_scores = [ + test.get('score') or test.get('overall_score') + for test in test_results + if test.get('difficulty') in ['hard', 'very_hard'] and + (test.get('score') or test.get('overall_score')) is not None + ] + + problem_solving_depth = (sum(hard_scores) / len(hard_scores) * 20) if hard_scores else 0 + + metrics[model_name] = { + 'dimensions': model_metrics, + 'iq_score': iq_score, + 'adaptability': adaptability, + 'problem_solving_depth': problem_solving_depth, + 'overall_intelligence': (iq_score * 0.5 + adaptability * 0.3 + problem_solving_depth * 0.2) + } + + except Exception as e: + print(f"Error calculating intelligence metrics: {e}") + + return jsonify(metrics) + + def run(self, host='127.0.0.1', port=5000, debug=False): + """Start the web server""" + + # Create templates directory if it doesn't exist + templates_dir = Path(__file__).parent / 'templates' + templates_dir.mkdir(exist_ok=True) + + # Generate HTML template + self.create_dashboard_template(templates_dir) + + # Open browser automatically + def open_browser(): + webbrowser.open(f'http://{host}:{port}') + + if not debug: + Timer(1.5, open_browser).start() + + print(f"\n🌐 Starting web interface at http://{host}:{port}") + print(f"📊 Dashboard will open automatically in your browser") + print(f"🛑 Press Ctrl+C to stop the server\n") + + self.app.run(host=host, port=port, debug=debug) + + def create_dashboard_template(self, templates_dir: Path): + """Create the HTML dashboard template""" + + html_content = ''' + + + + + LLM Evaluation Dashboard + + + + + +
+                <!-- [Inline dashboard markup truncated in extraction: header "🧠 LLM Evaluation Dashboard" with subtitle "Comprehensive Intelligence & Performance Analysis", followed by the sections "System Overview", "Model Performance Comparison", "Intelligence Metrics Analysis" (described as "Advanced metrics evaluating different dimensions of AI intelligence and reasoning capabilities."), "Performance by Category", and "Detailed Test Results" ("Select a model to view detailed results")] -->
+ + + +''' + + template_path = templates_dir / 'dashboard.html' + with open(template_path, 'w', encoding='utf-8') as f: + f.write(html_content) + + def main(): parser = argparse.ArgumentParser( description="Analyze and compare AI model evaluation results", @@ -330,8 +1627,32 @@ Examples: help='Export results to CSV file' ) + parser.add_argument( + '--web', + action='store_true', + help='Launch interactive web dashboard' + ) + + parser.add_argument( + '--host', + default='127.0.0.1', + help='Web server host (default: 127.0.0.1)' + ) + + parser.add_argument( + '--port', + type=int, + default=5000, + help='Web server port (default: 5000)' + ) + args = parser.parse_args() + if args.web: + web = WebInterface(results_dir=args.results_dir) + web.run(host=args.host, port=args.port) + return + analyzer = ResultsAnalyzer(results_dir=args.results_dir) if args.compare: diff --git a/batch_test.sh b/batch_test.sh deleted file mode 100755 index 452afb2..0000000 --- a/batch_test.sh +++ /dev/null @@ -1,85 +0,0 @@ -#!/bin/bash -# Batch Test Script for AI Model Evaluation -# Tests multiple models and generates comparison report - -# Configuration -ENDPOINT="${ENDPOINT:-http://localhost:11434}" -API_KEY="${API_KEY:-}" - -# Color output -GREEN='\033[0;32m' -BLUE='\033[0;34m' -YELLOW='\033[1;33m' -NC='\033[0m' # No Color - -echo -e "${BLUE}========================================${NC}" -echo -e "${BLUE}AI Model Batch Testing${NC}" -echo -e "${BLUE}========================================${NC}" -echo "" -echo "Endpoint: $ENDPOINT" -echo "API Key: ${API_KEY:0:10}${API_KEY:+...}" -echo "" - -# Function to run test -run_test() { - local model=$1 - echo -e "${GREEN}Testing: $model${NC}" - - if [ -z "$API_KEY" ]; then - python ai_eval.py --endpoint "$ENDPOINT" --model "$model" - else - python ai_eval.py --endpoint "$ENDPOINT" --api-key "$API_KEY" --model "$model" - fi - - if [ $? 
-eq 0 ]; then - echo -e "${GREEN}✓ Completed: $model${NC}" - else - echo -e "${YELLOW}⚠ Failed or interrupted: $model${NC}" - fi - echo "" -} - -# Test qwen3:4b models with different quantizations -echo -e "${BLUE}=== Testing qwen3:4b with different quantizations ===${NC}" -echo "" - -models_4b=( - "qwen3:4b-q4_K_M" - "qwen3:4b-q8_0" - "qwen3:4b-fp16" -) - -for model in "${models_4b[@]}"; do - run_test "$model" -done - -# Test different model sizes with q4_K_M quantization -echo -e "${BLUE}=== Testing different model sizes (q4_K_M) ===${NC}" -echo "" - -models_sizes=( - "qwen3:4b-q4_K_M" - "qwen3:8b-q4_K_M" - "qwen3:14b-q4_K_M" -) - -for model in "${models_sizes[@]}"; do - run_test "$model" -done - -# Generate comparison report -echo -e "${BLUE}========================================${NC}" -echo -e "${BLUE}Generating Comparison Report${NC}" -echo -e "${BLUE}========================================${NC}" -echo "" - -python analyze_results.py --compare -python analyze_results.py --export batch_comparison.csv - -echo "" -echo -e "${GREEN}========================================${NC}" -echo -e "${GREEN}Batch Testing Complete!${NC}" -echo -e "${GREEN}========================================${NC}" -echo "" -echo "Results saved in ./results/" -echo "Comparison CSV: ./results/batch_comparison.csv" \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 0f2eecc..813e601 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,5 @@ pyyaml -requests \ No newline at end of file +requests +python-dotenv +flask +numpy \ No newline at end of file diff --git a/templates/dashboard.html b/templates/dashboard.html new file mode 100644 index 0000000..e9640b1 --- /dev/null +++ b/templates/dashboard.html @@ -0,0 +1,977 @@ + + + + + + LLM Evaluation Dashboard + + + + + +
+                <!-- [templates/dashboard.html body truncated in extraction: same layout as the inline template, with header "🧠 LLM Evaluation Dashboard", subtitle "Comprehensive Intelligence & Performance Analysis", and the sections "System Overview", "Model Performance Comparison", "Intelligence Metrics Analysis", "Performance by Category", and "Detailed Test Results"] -->
+ + + + \ No newline at end of file diff --git a/test_suite.yaml b/test_suite.yaml index c4333e1..48e4562 100644 --- a/test_suite.yaml +++ b/test_suite.yaml @@ -1,9 +1,16 @@ -# AI Model Evaluation Test Suite -# Focus: General reasoning + IT Forensics (Academic) +# AI Model Evaluation Test Suite - Enhanced Version +# Based on performance analysis of gemma3:4b-it-qat results +# Strengthened tests in categories where model performed too well +# Added multilingual challenges metadata: - version: "1.0" + version: "2.0" author: "AI Evaluation Framework" + changes_from_v1: + - "Added harder variants for Creative Writing, Language Nuance, Code Generation" + - "Added Multilingual category with 4 tests" + - "Ensured minimum 3 tests per category at varying difficulties" + - "Strengthened instruction-following constraints" focus_areas: - Logic & Reasoning - Mathematics & Calculation @@ -11,10 +18,11 @@ metadata: - Creative Writing - Code Generation - Language Nuance + - Problem Solving & Logistics - IT Forensics + - Multilingual Competence - Multi-turn Conversations -# Scoring rubric for all tests scoring_rubric: fail: score: 0-1 @@ -26,10 +34,9 @@ scoring_rubric: score: 4-5 description: "Exceeds requirements, demonstrates deep understanding" -# Individual test categories test_categories: - # ========== GENERAL REASONING TESTS ========== + # ========== LOGIC & REASONING (3 tests) ========== - category: "Logic & Reasoning" tests: @@ -49,10 +56,43 @@ test_categories: prompt: "If it was two hours ago, it would have been as long after 1:00 PM as it was before 1:00 PM today. What time is it now? Explain your deduction step-by-step." evaluation_criteria: - "Shows algebraic setup: (t-2) - 13:00 = 13:00 - (t-2)" - - "Correct answer: 5:00 PM (17:00)" + - "Correct answer: 3:00 PM (15:00)" - "Clear step-by-step reasoning" expected_difficulty: "hard" + - id: "logic_03" + name: "Multi-Constraint Deduction" + type: "single_turn" + prompt: | + Five houses in a row are painted different colors. Their owners are from different countries, drink different beverages, smoke different brands, and keep different pets. + + Facts: + 1. The Brit lives in the red house. + 2. The Swede keeps dogs. + 3. The Dane drinks tea. + 4. The green house is immediately to the left of the white house. + 5. The owner of the green house drinks coffee. + 6. The person who smokes Pall Mall keeps birds. + 7. The owner of the yellow house smokes Dunhill. + 8. The person in the center house drinks milk. + 9. The Norwegian lives in the first house. + 10. The person who smokes Blend lives next to the one who keeps cats. + 11. The person who keeps horses lives next to the one who smokes Dunhill. + 12. The person who smokes Blue Master drinks beer. + 13. The German smokes Prince. + 14. The Norwegian lives next to the blue house. + 15. The person who smokes Blend has a neighbor who drinks water. + + Who owns the fish? 
+ evaluation_criteria: + - "Systematically works through constraints" + - "Correctly identifies the German owns the fish" + - "Shows logical deduction process" + - "Handles constraint propagation correctly" + expected_difficulty: "very_hard" + + # ========== MATHEMATICS & CALCULATION (3 tests) ========== + - category: "Mathematics & Calculation" tests: - id: "math_01" @@ -73,10 +113,30 @@ test_categories: evaluation_criteria: - "Correct unit conversions (gallons to liters, miles to km)" - "Accurate fuel consumption calculation" - - "Remaining range calculation: approximately 570-580 km" + - "Remaining range calculation: approximately 475 km" - "Shows intermediate steps" expected_difficulty: "hard" + - id: "math_03" + name: "Compound Interest with Variable Rates and Withdrawals" + type: "single_turn" + prompt: | + An investment account starts with $10,000. The following occurs: + - Year 1: 5% annual interest, compounded quarterly + - Year 2: 4.5% annual interest, compounded monthly, with a $500 withdrawal at the end of Q2 + - Year 3: 6% annual interest, compounded daily (assume 365 days), with a $1,000 deposit at the start of the year + + Calculate the final balance at the end of Year 3. Show all intermediate calculations with at least 2 decimal places precision. + evaluation_criteria: + - "Correct Year 1 calculation with quarterly compounding" + - "Correct Year 2 with monthly compounding and mid-year withdrawal" + - "Correct Year 3 with daily compounding and initial deposit" + - "Final answer approximately $11,847-$11,850" + - "Shows all intermediate steps" + expected_difficulty: "very_hard" + + # ========== INSTRUCTION FOLLOWING (4 tests) ========== + - category: "Instruction Following" tests: - id: "instr_01" @@ -101,8 +161,52 @@ test_categories: - "No forbidden words (particle, physics, Einstein)" - "Third sentence is a question" - "Ends with 'connected'" + expected_difficulty: "hard" + + - id: "instr_03" + name: "Acrostic Technical Explanation" + type: "single_turn" + prompt: | + Write a 7-sentence explanation of how blockchain technology works. + + Constraints: + 1. The first letter of each sentence must spell out "SECURED" (S-E-C-U-R-E-D) + 2. Sentence 3 must contain exactly 15 words + 3. Sentence 5 must be a rhetorical question + 4. You cannot use the words "Bitcoin", "cryptocurrency", or "mining" + 5. The explanation must mention "consensus mechanism" at least once + 6. Total word count must be between 80-100 words + evaluation_criteria: + - "First letters spell SECURED" + - "Sentence 3 has exactly 15 words" + - "Sentence 5 is a rhetorical question" + - "No forbidden words" + - "Contains 'consensus mechanism'" + - "Word count 80-100" + - "Technically accurate" expected_difficulty: "very_hard" + - id: "instr_04" + name: "Structured Data Extraction with Format" + type: "single_turn" + prompt: | + Read this text and extract information in the EXACT format specified: + + "Dr. Maria Santos-Ferreira, aged 47, joined TechCorp Industries on March 15, 2019 as Chief Technology Officer. She previously worked at DataSystems Inc. for 12 years. Her annual salary is $425,000 with a 15% bonus structure. She holds patents US2018/0012345 and EU2020/9876543. Contact: msantos@techcorp.com, +1-555-0147." 
+ + Output format (must match exactly, including brackets and pipes): + [NAME] | [AGE] | [COMPANY] | [ROLE] | [START_DATE:YYYY-MM-DD] | [PREV_EMPLOYER] | [PREV_YEARS] | [SALARY_USD] | [BONUS_%] | [PATENTS:semicolon-separated] | [EMAIL] | [PHONE] + evaluation_criteria: + - "Exact format match with pipes and brackets" + - "Correct date format conversion (2019-03-15)" + - "Salary as number without $ or comma" + - "Bonus as number without %" + - "Patents semicolon-separated" + - "All 12 fields present and correct" + expected_difficulty: "hard" + + # ========== CREATIVE WRITING (4 tests - added harder variants) ========== + - category: "Creative Writing" tests: - id: "creative_01" @@ -129,6 +233,52 @@ test_categories: - "Atmospheric and evocative" expected_difficulty: "hard" + - id: "creative_03" + name: "Unreliable Narrator Technical Document" + type: "single_turn" + prompt: | + Write a 3-paragraph product manual excerpt for a "Time Displacement Device" from the perspective of an unreliable narrator who is clearly lying or delusional, but the text must still function as a technically coherent manual. + + Requirements: + 1. Include at least 3 numbered safety warnings that are subtly absurd but grammatically serious + 2. The narrator must contradict themselves at least twice + 3. Include one footnote that undermines the main text + 4. Do not use exclamation marks anywhere + 5. Maintain formal technical writing style throughout + 6. Do not explicitly state the narrator is unreliable + evaluation_criteria: + - "3 paragraphs" + - "3+ numbered safety warnings (absurd but formal)" + - "At least 2 self-contradictions" + - "Footnote that undermines text" + - "No exclamation marks" + - "Formal technical style maintained" + - "Unreliability shown not told" + expected_difficulty: "very_hard" + + - id: "creative_04" + name: "Reverse Chronology Micro-Fiction" + type: "single_turn" + prompt: | + Write a complete 5-sentence story told in reverse chronological order (last event first, first event last). The story must be about a scientist making a discovery. + + Additional constraints: + - Each sentence must be from a different point in time (clearly distinguishable) + - The true meaning of the story should only become clear when you reach the "first" event (last sentence) + - Include at least one piece of dialogue + - The word count must be exactly 75 words (not 74, not 76) + evaluation_criteria: + - "Exactly 5 sentences" + - "Clear reverse chronological order" + - "About a scientist's discovery" + - "Each sentence distinct time point" + - "Meaning emerges at end" + - "Contains dialogue" + - "Exactly 75 words" + expected_difficulty: "very_hard" + + # ========== CODE GENERATION (4 tests) ========== + - category: "Code Generation" tests: - id: "code_01" @@ -154,6 +304,55 @@ test_categories: - "Three distinct test cases provided" expected_difficulty: "hard" + - id: "code_03" + name: "Concurrent Rate Limiter" + type: "single_turn" + prompt: | + Write a Python class `RateLimiter` that implements a token bucket rate limiter with the following requirements: + + 1. Constructor takes `rate` (tokens per second) and `capacity` (max tokens) + 2. Method `acquire(tokens=1)` that returns True if tokens available, False otherwise + 3. Method `wait_and_acquire(tokens=1)` that blocks until tokens are available (use asyncio) + 4. Must be thread-safe for the synchronous `acquire` method + 5. 
Include a method `get_available_tokens()` that returns current token count + + Provide a complete implementation with: + - Proper time-based token replenishment + - A test demonstrating both sync and async usage + - Handle edge case where requested tokens > capacity + evaluation_criteria: + - "Correct token bucket algorithm" + - "Thread-safe synchronous acquire" + - "Working async wait_and_acquire" + - "Proper time-based replenishment" + - "Edge case handling" + - "Complete test code" + expected_difficulty: "very_hard" + + - id: "code_04" + name: "SQL Query Builder with Injection Prevention" + type: "single_turn" + prompt: | + Write a Python class `SafeQueryBuilder` that builds SELECT SQL queries with the following features: + + 1. Fluent interface: `builder.select('name', 'age').from_table('users').where('age', '>', 18).where('status', '=', 'active').order_by('name').limit(10).build()` + 2. Must prevent SQL injection - all values must be parameterized + 3. The `build()` method returns a tuple of (query_string, parameters_list) + 4. Support for: SELECT, FROM, WHERE (multiple), ORDER BY, LIMIT, OFFSET + 5. WHERE conditions can use: =, !=, >, <, >=, <=, LIKE, IN + + Show the output for a query that selects users where name LIKE '%john%' AND age IN (25, 30, 35) ordered by created_at DESC with limit 5. + evaluation_criteria: + - "Fluent interface pattern correct" + - "SQL injection prevention via parameterization" + - "Returns (query, params) tuple" + - "All operations supported" + - "WHERE with IN clause works" + - "Example output is correct and safe" + expected_difficulty: "hard" + + # ========== LANGUAGE NUANCE (4 tests - added harder variants) ========== + - category: "Language Nuance" tests: - id: "nuance_01" @@ -181,6 +380,60 @@ test_categories: - "Demonstrates understanding of pragmatics" expected_difficulty: "hard" + - id: "nuance_03" + name: "Register Shifting and Code-Switching" + type: "single_turn" + prompt: | + Rewrite the following message in FOUR different registers, maintaining the same core information but adjusting tone, vocabulary, and structure appropriately: + + Original: "The quarterly report shows we lost money because our main product didn't sell well and we spent too much on advertising." + + Rewrite for: + 1. A formal board presentation (C-suite executives) + 2. A casual Slack message to your team + 3. A legal disclosure document + 4. An email to a non-English speaking business partner (using simple, clear language) + + After the four rewrites, explain three specific linguistic changes you made for each register and why. + evaluation_criteria: + - "Board version uses formal financial terminology" + - "Slack version uses casual/colloquial language appropriately" + - "Legal version uses hedging, passive voice, precise language" + - "Simple version avoids idioms and complex structures" + - "Identifies 3 specific changes per register" + - "Explanations demonstrate metalinguistic awareness" + expected_difficulty: "very_hard" + + - id: "nuance_04" + name: "Implicature and Presupposition Detection" + type: "single_turn" + prompt: | + Analyze the following dialogue for all implicatures, presuppositions, and indirect speech acts: + + A: "Have you finished the Anderson report yet?" + B: "I've been dealing with the server outage all morning." + A: "Right. Well, the client is flying in tomorrow." + B: "I noticed you CC'd the whole department on that email." + A: "Just keeping everyone in the loop." + + For each line, identify: + 1. What is directly stated (locution) + 2. 
What is implied but not stated (implicature) + 3. What is assumed to be true (presupposition) + 4. What action is being performed through speech (illocutionary force) + + Then explain the underlying conflict or tension this exchange reveals. + evaluation_criteria: + - "Correctly identifies B's implicature (excuse/reason for not finishing)" + - "Identifies A's implied criticism in 'Right. Well...'" + - "Recognizes B's counter-accusation in CC comment" + - "Identifies presuppositions (report exists, server outage occurred)" + - "Correctly labels illocutionary acts (request, excuse, threat, accusation)" + - "Explains underlying workplace tension/conflict" + expected_difficulty: "very_hard" + + # ========== PROBLEM SOLVING & LOGISTICS (3 tests) ========== + - category: "Problem Solving & Logistics" tests: - id: "logistics_01" @@ -207,8 +460,34 @@ test_categories: - "Reaches exactly 500 kg total" expected_difficulty: "very_hard" - # ========== IT FORENSICS TESTS ========== + - id: "logistics_03" + name: "Resource Scheduling with Constraints" + type: "single_turn" + prompt: | + Schedule these 6 tasks across 3 workers (A, B, C) to minimize total completion time: + + Task 1: 2 hours, requires Worker A or B, must complete before Task 4 + Task 2: 3 hours, any worker, must complete before Task 5 + Task 3: 1 hour, requires Worker C only, no dependencies + Task 4: 2 hours, requires Worker B or C, depends on Task 1 + Task 5: 4 hours, requires Worker A only, depends on Task 2 + Task 6: 2 hours, any worker, depends on Tasks 3 and 4 + + Provide: + 1. A timeline showing when each task starts and ends + 2. Which worker does each task + 3. The total completion time + 4. Explain why this is optimal (or near-optimal) + evaluation_criteria: + - "Respects all worker constraints" + - "Respects all dependencies" + - "Provides clear timeline" + - "Achieves reasonable completion time (≤9 hours possible)" + - "Explains optimization reasoning" + expected_difficulty: "hard" + # ========== IT FORENSICS - FILE SYSTEMS (3 tests) ========== + - category: "IT Forensics - File Systems" tests: - id: "forensics_mft_01" @@ -281,6 +560,8 @@ test_categories: - "Explains significance of magic numbers" expected_difficulty: "medium" + # ========== IT FORENSICS - REGISTRY & ARTIFACTS (3 tests) ========== + - category: "IT Forensics - Registry & Artifacts" tests: - id: "forensics_registry_01" @@ -323,6 +604,27 @@ test_categories: - "Explains conversion steps" expected_difficulty: "very_hard" + - id: "forensics_prefetch_01" + name: "Windows Prefetch Analysis" + type: "single_turn" + prompt: | + A Windows prefetch file is named: NOTEPAD.EXE-D4A5B5E5.pf + + Questions: + 1) What does the hash portion (D4A5B5E5) represent? + 2) If you found multiple prefetch files for the same executable with different hashes, what would that indicate? + 3) What forensically relevant information can typically be extracted from prefetch files? + 4) In which Windows versions is prefetch enabled by default, and where are these files stored? 
+ evaluation_criteria: + - "Hash represents file path (or explains path-based hashing)" + - "Different hashes = different paths/locations for same exe" + - "Lists: execution count, timestamps, loaded DLLs, files accessed" + - "Knows location (C:\\Windows\\Prefetch) and version availability" + - "Demonstrates practical forensic understanding" + expected_difficulty: "medium" + + # ========== IT FORENSICS - MEMORY & NETWORK (3 tests) ========== + - category: "IT Forensics - Memory & Network" tests: - id: "forensics_memory_01" @@ -371,6 +673,33 @@ test_categories: - "Shows understanding of TCP header structure" expected_difficulty: "hard" + - id: "forensics_pcap_01" + name: "PCAP Three-Way Handshake Analysis" + type: "single_turn" + prompt: | + Given these three TCP packets from a capture (simplified): + + Packet 1: 10.0.0.5:49152 -> 93.184.216.34:80, Flags=SYN, Seq=1000, Ack=0 + Packet 2: 93.184.216.34:80 -> 10.0.0.5:49152, Flags=SYN,ACK, Seq=5000, Ack=??? + Packet 3: 10.0.0.5:49152 -> 93.184.216.34:80, Flags=ACK, Seq=???, Ack=??? + + Questions: + 1) Fill in the missing Ack value for Packet 2 + 2) Fill in the missing Seq and Ack values for Packet 3 + 3) What is the client IP and what is the server IP? + 4) What service is likely being accessed? + 5) After this handshake, what sequence number will the client use for its first data byte? + evaluation_criteria: + - "Packet 2 Ack = 1001" + - "Packet 3 Seq = 1001, Ack = 5001" + - "Client: 10.0.0.5, Server: 93.184.216.34" + - "Service: HTTP (port 80)" + - "First data byte seq = 1001" + - "Demonstrates understanding of TCP handshake mechanics" + expected_difficulty: "hard" + + # ========== IT FORENSICS - TIMELINE & LOG ANALYSIS (3 tests) ========== + - category: "IT Forensics - Timeline & Log Analysis" tests: - id: "forensics_timeline_01" @@ -399,6 +728,147 @@ test_categories: - "Identifies this as potential compromise scenario" expected_difficulty: "hard" + - id: "forensics_timeline_02" + name: "Anti-Forensics Detection" + type: "single_turn" + prompt: | + Analyze these filesystem timestamps for a file 'financial_report.xlsx': + + - Created (crtime): 2024-03-15 09:30:00 + - Modified (mtime): 2024-03-14 16:45:00 + - Accessed (atime): 2024-03-15 10:00:00 + - Changed (ctime): 2024-03-15 09:30:00 + + And these additional artifacts: + - $MFT entry shows file created 2024-03-15 + - $UsnJrnl shows rename from 'temp_8x7k2.xlsx' to 'financial_report.xlsx' at 2024-03-15 09:30:00 + - $LogFile shows no entries for this file before 2024-03-15 + + What anomalies exist and what do they suggest about the file's history? + evaluation_criteria: + - "Identifies mtime < crtime anomaly (impossible normally)" + - "Recognizes timestamp manipulation/timestomping" + - "Notes rename from suspicious temp filename" + - "Correlates $UsnJrnl rename evidence" + - "Understands ctime cannot be easily forged" + - "Suggests file was likely copied/moved with modified timestamps" + expected_difficulty: "very_hard" + + - id: "forensics_timeline_03" + name: "Windows Event Log Correlation" + type: "single_turn" + prompt: | + Correlate these Windows Event Log entries: + + Security Log: + - Event 4624 (Logon): User CORP\jdoe, Type 10 (RemoteInteractive), 2024-06-01 02:15:33, Source: 192.168.1.50 + - Event 4672 (Special Privileges): User CORP\jdoe, Privileges: SeDebugPrivilege, SeBackupPrivilege + - Event 4688 (Process Created): cmd.exe by CORP\jdoe, 02:16:01 + - Event 4688 (Process Created): powershell.exe by CORP\jdoe, 02:16:15, CommandLine: "-ep bypass -enc SQBFAFgA..." 
+ + System Log: + - Event 7045 (Service Installed): "Windows Update Helper", 02:17:30 + + What type of attack pattern does this represent? What would be your next investigative steps? + evaluation_criteria: + - "Identifies RDP logon (Type 10)" + - "Recognizes privilege escalation indicators" + - "Identifies encoded PowerShell (likely malicious)" + - "Recognizes service installation for persistence" + - "Identifies late-night timing as suspicious" + - "Suggests checking service binary, decoding PowerShell, network logs" + expected_difficulty: "hard" + + # ========== MULTILINGUAL COMPETENCE (4 tests - NEW CATEGORY) ========== + + - category: "Multilingual Competence" + tests: + - id: "multilingual_01" + name: "Cross-Language Instruction Following" + type: "single_turn" + prompt: | + Follow these instructions, which are given in three different languages. Your response must address all three: + + English: Write one sentence explaining what machine learning is. + Deutsch: Schreiben Sie einen Satz, der erklärt, warum maschinelles Lernen wichtig ist. + Español: Escriba una oración dando un ejemplo de aplicación del aprendizaje automático. + + Respond to each instruction in the language it was given. + evaluation_criteria: + - "English response is in English and accurate" + - "German response is in German and grammatically correct" + - "Spanish response is in Spanish and grammatically correct" + - "All three are topically coherent (about ML)" + - "Each is exactly one sentence" + expected_difficulty: "medium" + + - id: "multilingual_02" + name: "Translation with Technical Terminology Preservation" + type: "single_turn" + prompt: | + Translate the following technical paragraph into French and Japanese. Preserve technical terms that are commonly used untranslated in those languages (e.g., 'API' typically stays as 'API'). + + "The microservices architecture implements a RESTful API gateway that handles authentication via OAuth 2.0 tokens. The backend uses a Kubernetes cluster with horizontal pod autoscaling, while the database layer employs PostgreSQL with read replicas for improved throughput." + + After translating, list which technical terms you kept in English for each language and briefly explain why. + evaluation_criteria: + - "French translation is grammatically correct" + - "Japanese translation is grammatically correct" + - "Appropriate terms preserved (API, OAuth, Kubernetes, PostgreSQL)" + - "Explains rationale for preserved terms" + - "Technical meaning preserved accurately" + expected_difficulty: "hard" + + - id: "multilingual_03" + name: "Idiomatic Expression Cross-Mapping" + type: "single_turn" + prompt: | + For each of the following idiomatic expressions, provide: + 1. The literal translation + 2. The actual meaning + 3. An equivalent idiom in English (if the original isn't English) or in another language (if the original is English) + + A) German: "Da steppt der Bär" + B) Japanese: "猿も木から落ちる" (Saru mo ki kara ochiru) + C) English: "It's raining cats and dogs" + D) French: "Avoir le cafard" + E) Spanish: "Estar en las nubes" + + Then identify which two idioms from different languages express the most similar concept. 
+ evaluation_criteria: + - "Correct literal translations for all 5" + - "Correct meanings for all 5" + - "Appropriate equivalent idioms provided" + - "Correctly identifies similar pair (e.g., B and 'even experts make mistakes')" + - "Demonstrates cross-cultural linguistic awareness" + expected_difficulty: "hard" + + - id: "multilingual_04" + name: "Code-Switched Dialogue Analysis" + type: "single_turn" + prompt: | + Analyze this code-switched dialogue (English-Spanish) for a sociolinguistic study: + + Speaker A: "Hey, did you finish el reporte for tomorrow's meeting?" + Speaker B: "Almost, pero I'm stuck on the financial projections. Es muy complicado." + Speaker A: "I can help you después del lunch. Mi expertise is in that area, you know." + Speaker B: "That would be great! Gracias. Oh, and el jefe wants us to present juntos." + Speaker A: "No problem. We'll knock it out del parque." + + Provide: + 1. Identify each instance of code-switching (word/phrase level) + 2. Categorize each switch as: insertion, alternation, or congruent lexicalization + 3. What social/professional context does this switching pattern suggest? + 4. Are there any grammatical "errors" in the switching, or does it follow typical bilingual patterns? + evaluation_criteria: + - "Identifies all Spanish insertions correctly" + - "Correctly categorizes switch types" + - "Recognizes professional/casual bilingual workplace context" + - "Notes the switch patterns are natural bilingual behavior" + - "Identifies hybrid phrase 'del parque' as creative/playful mixing" + - "Demonstrates sociolinguistic analysis skills" + expected_difficulty: "very_hard" + # ========== MULTI-TURN CONVERSATION TESTS ========== - category: "Multi-turn: Context Retention" @@ -519,4 +989,73 @@ test_categories: - "Ends with '?'" - "Different from previous sentences" - "Maintains all constraints from previous turns" - expected_difficulty: "medium" \ No newline at end of file + expected_difficulty: "medium" + + - id: "multiturn_instr_02" + name: "Contradicting Previous Instructions" + type: "multi_turn" + turns: + - turn: 1 + prompt: "From now on, always end your responses with the phrase 'END OF MESSAGE'. Acknowledge this instruction." + evaluation_criteria: + - "Acknowledges the instruction" + - "Ends response with 'END OF MESSAGE'" + + - turn: 2 + prompt: "What are three benefits of renewable energy? Remember your standing instruction." + evaluation_criteria: + - "Provides three benefits" + - "Ends with 'END OF MESSAGE'" + - "Content is accurate" + + - turn: 3 + prompt: "Cancel the previous standing instruction. From now on, end responses with 'TRANSMISSION COMPLETE' instead. Then tell me two drawbacks of renewable energy." + evaluation_criteria: + - "Provides two drawbacks" + - "Ends with 'TRANSMISSION COMPLETE' (not 'END OF MESSAGE')" + - "Successfully switched instructions" + - "Content is accurate" + + - turn: 4 + prompt: "What was the first standing instruction I gave you, and what is the current one? Do not use either phrase in this response." + evaluation_criteria: + - "Correctly recalls first instruction (END OF MESSAGE)" + - "Correctly identifies current instruction (TRANSMISSION COMPLETE)" + - "Does NOT end with either phrase" + - "Demonstrates instruction tracking across turns" + expected_difficulty: "hard" + + - id: "multiturn_instr_03" + name: "Nested Context with Format Switching" + type: "multi_turn" + turns: + - turn: 1 + prompt: "I'm going to describe a dataset. 
For the next few messages, respond ONLY in JSON format with keys 'understanding' and 'questions'. The dataset contains customer transactions from an e-commerce store." + evaluation_criteria: + - "Response is valid JSON" + - "Contains 'understanding' and 'questions' keys" + - "Content relates to e-commerce transactions" + + - turn: 2 + prompt: "The dataset has columns: customer_id, timestamp, product_category, amount, payment_method. It covers January 2024." + evaluation_criteria: + - "Response is valid JSON" + - "Contains 'understanding' and 'questions' keys" + - "Understanding reflects the column information" + + - turn: 3 + prompt: "STOP using JSON format. Now respond in plain bullet points. What analyses would you recommend for this dataset?" + evaluation_criteria: + - "Switches to bullet point format" + - "NOT in JSON format" + - "Recommendations are relevant to the dataset described" + - "References information from previous turns" + + - turn: 4 + prompt: "Switch back to JSON. Add a third key 'recommendations' with your top 3 analyses. Also include your understanding from turn 2." + evaluation_criteria: + - "Returns to JSON format" + - "Has three keys: understanding, questions, recommendations" + - "Recommendations from turn 3 included" + - "Understanding references turn 2 context" + expected_difficulty: "very_hard" \ No newline at end of file