#!/usr/bin/env python3
"""
AI Model Evaluation Automation Script
Runs comprehensive test suite against OpenAI-compatible API endpoints
"""
import yaml
import json
import re
import requests
import os
import sys
import time
from datetime import datetime
from typing import Dict, List, Any, Optional
from pathlib import Path
import argparse
from dotenv import load_dotenv
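# ---------------------------------------------------------------------------
# Illustrative shape of the test-suite YAML consumed below. The field names
# (test_categories, category, tests, id, name, type, expected_difficulty,
# prompt, turns, evaluation_criteria) are the ones this script actually reads;
# the concrete values are made-up examples, not part of the shipped suite.
#
#   test_categories:
#     - category: "IT Forensics - File Systems"
#       tests:
#         - id: "fs-001"
#           name: "Example single-turn test"
#           type: single_turn
#           expected_difficulty: medium
#           prompt: "Example question for the model under test"
#           evaluation_criteria:
#             - "First thing a good answer must cover"
#             - "Second thing a good answer must cover"
#         - id: "fs-002"
#           name: "Example multi-turn test"
#           type: multi_turn
#           expected_difficulty: hard
#           turns:
#             - turn: 1
#               prompt: "Opening question"
#               evaluation_criteria: ["Criterion for turn 1"]
#             - turn: 2
#               prompt: "Follow-up question"
#               evaluation_criteria: ["Criterion for turn 2"]
# ---------------------------------------------------------------------------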
class AIModelTester:
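# Typical programmatic use (illustrative; the endpoint and model tag are
# example values mirroring the CLI examples in main() below):
#   tester = AIModelTester("http://localhost:11434", "", "qwen3:4b-q4_K_M")
#   suite = tester.load_test_suite("test_suite.yaml")
#   tester.run_test_suite(suite)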
def __init__(self, endpoint: str, api_key: str, model_name: str, output_dir: str = "results",
non_interactive: bool = False, evaluator_endpoint: Optional[str] = None,
evaluator_api_key: Optional[str] = None, evaluator_model: Optional[str] = None,
evaluator_temperature: float = 0.3):
"""
Initialize the AI Model Tester
Args:
endpoint: OpenAI-compatible API endpoint URL
api_key: API key for authentication
model_name: Name/identifier of the model being tested
output_dir: Directory to save results
non_interactive: If True, use evaluator API for automatic scoring
evaluator_endpoint: API endpoint for evaluation model
evaluator_api_key: API key for evaluator
evaluator_model: Model name for evaluator
evaluator_temperature: Temperature for evaluator
"""
self.endpoint = endpoint.rstrip('/')
self.api_key = api_key
self.model_name = model_name
self.output_dir = Path(output_dir)
self.output_dir.mkdir(exist_ok=True)
# Non-interactive mode settings
self.non_interactive = non_interactive
self.evaluator_endpoint = evaluator_endpoint.rstrip('/') if evaluator_endpoint else None
self.evaluator_api_key = evaluator_api_key
self.evaluator_model = evaluator_model
self.evaluator_temperature = evaluator_temperature
if self.non_interactive:
if not all([self.evaluator_endpoint, self.evaluator_model]):
raise ValueError("Non-interactive mode requires evaluator_endpoint and evaluator_model")
print(f"🤖 Non-interactive mode enabled")
print(f" Evaluator: {self.evaluator_model} @ {self.evaluator_endpoint}")
# Results storage
self.results = {
"metadata": {
"model_name": model_name,
"endpoint": endpoint,
"test_start": datetime.now().isoformat(),
"test_end": None,
"total_tests": 0,
"completed_tests": 0
},
"test_results": []
}
# Current test session info
self.current_test_id = None
self.conversation_history = []
# Track failed auto-evaluations for manual review
self.failed_evaluations = []
def test_connection(self, endpoint: str, api_key: str, model: str, endpoint_name: str = "API") -> bool:
"""
Test whether an API endpoint is reachable and accepts the given credentials
Args:
endpoint: API endpoint URL
api_key: API key for authentication
model: Model name to test
endpoint_name: Name for display purposes
Returns:
True if connection successful, False otherwise
"""
print(f"\n🔍 Testing {endpoint_name} connection...")
print(f" Endpoint: {endpoint}")
print(f" Model: {model}")
headers = {
"Content-Type": "application/json"
}
if api_key:
headers["Authorization"] = f"Bearer {api_key}"
print(f" API Key: {api_key[:10]}..." if len(api_key) > 10 else " API Key: [set]")
else:
print(f" API Key: [none]")
# Simple test message
payload = {
"model": model,
"messages": [{"role": "user", "content": "test"}],
"max_tokens": 1
}
try:
response = requests.post(
f"{endpoint}/v1/chat/completions",
headers=headers,
json=payload,
timeout=10
)
response.raise_for_status()
print(f"{endpoint_name} connection successful")
return True
except requests.exceptions.HTTPError as e:
print(f"{endpoint_name} HTTP error: {e.response.status_code}")
print(f" {e.response.text[:200]}")
return False
except requests.exceptions.ConnectionError as e:
print(f"{endpoint_name} connection failed: Cannot reach endpoint")
return False
except requests.exceptions.Timeout:
print(f"{endpoint_name} connection timeout")
return False
except requests.exceptions.RequestException as e:
print(f"{endpoint_name} error: {e}")
return False
def load_test_suite(self, yaml_file: str) -> Dict:
"""Load test suite from YAML file"""
try:
with open(yaml_file, 'r', encoding='utf-8') as f:
return yaml.safe_load(f)
except FileNotFoundError:
print(f"Error: Test suite file not found: {yaml_file}")
print(f"Please ensure {yaml_file} is in the current directory.")
sys.exit(1)
except yaml.YAMLError as e:
print(f"Error: Invalid YAML format in {yaml_file}")
print(f"Details: {e}")
sys.exit(1)
except Exception as e:
print(f"Error loading test suite: {e}")
sys.exit(1)
def call_api(self, messages: List[Dict], temperature: float = 0.7, max_tokens: int = 2000) -> Optional[Dict]:
"""
Call the OpenAI-compatible API
Args:
messages: List of message dicts with 'role' and 'content'
temperature: Sampling temperature
max_tokens: Maximum tokens in response
Returns:
API response dict or None if error
"""
headers = {
"Content-Type": "application/json"
}
# Only add Authorization header if API key is provided
if self.api_key:
headers["Authorization"] = f"Bearer {self.api_key}"
payload = {
"model": self.model_name,
"messages": messages,
"temperature": temperature,
"max_tokens": max_tokens
}
try:
print(f"\nDEBUG: Calling API endpoint: {self.endpoint}/v1/chat/completions")
print(f"DEBUG: Model name: {self.model_name}")
response = requests.post(
f"{self.endpoint}/v1/chat/completions",
headers=headers,
json=payload,
timeout=240
)
response.raise_for_status()
return response.json()
except requests.exceptions.Timeout as e:
print(f"\n⚠️ API Timeout Error: Request exceeded 120 seconds")
print(f" This usually means the model is taking too long to generate a response.")
print(f" Consider using a faster model or reducing the complexity of the prompt.")
return None
except requests.exceptions.HTTPError as e:
print(f"\n⚠️ API HTTP Error: {e}")
print(f" Status Code: {e.response.status_code}")
print(f" Response: {e.response.text[:500]}")
# Show request details for debugging
print("\n REQUEST DETAILS:")
print(f" URL: {self.endpoint}/v1/chat/completions")
print(f" Model: {self.model_name}")
return None
except requests.exceptions.ConnectionError as e:
print(f"\n⚠️ API Connection Error: {e}")
print(f" Could not connect to {self.endpoint}")
print(f" Please check your network connection and endpoint URL.")
return None
except requests.exceptions.RequestException as e:
print(f"\n⚠️ API Request Error: {e}")
if hasattr(e, 'response') and e.response is not None:
print(f" Response: {e.response.text[:500]}")
return None
def display_test_info(self, test: Dict, category: str):
"""Display test information to user"""
print("\n" + "="*80)
print(f"📋 CATEGORY: {category}")
print(f"🆔 Test ID: {test['id']}")
print(f"📝 Test Name: {test['name']}")
print(f"🎯 Type: {test['type']}")
print(f"⚡ Difficulty: {test.get('expected_difficulty', 'N/A')}")
print("="*80)
def display_prompt(self, prompt: str, turn: Optional[int] = None):
"""Display the prompt being sent"""
if turn is not None:
print(f"\n🔄 TURN {turn}:")
else:
print(f"\n💬 PROMPT:")
print("-"*80)
print(prompt)
print("-"*80)
def display_response(self, response_text: str, raw_response: Optional[Dict] = None, generation_time: Optional[float] = None):
"""Display the model's response with timing and token usage metrics"""
print("\nRAW API RESPONSE:")
print("="*80)
print(json.dumps(raw_response, indent=2))
print("="*80)
# Display timing and token metrics if available
if generation_time is not None:
print(f"\n⏱️ Generation Time: {generation_time:.2f}s")
if raw_response and 'usage' in raw_response:
usage = raw_response['usage']
print(f"\n📊 Token Usage:")
if 'prompt_tokens' in usage:
print(f" Prompt Tokens: {usage['prompt_tokens']}")
if 'completion_tokens' in usage:
print(f" Completion Tokens: {usage['completion_tokens']}")
if 'total_tokens' in usage:
print(f" Total Tokens: {usage['total_tokens']}")
# Calculate tokens per second if we have timing
if generation_time and generation_time > 0 and 'completion_tokens' in usage:
tokens_per_sec = usage['completion_tokens'] / generation_time
print(f" Speed: {tokens_per_sec:.2f} tokens/sec")
print(f"\n\nMODEL RESPONSE:")
print("-"*80)
print(response_text)
print("-"*80)
def display_evaluation_criteria(self, criteria: List[str]):
"""Display evaluation criteria for the test"""
print(f"\n✅ EVALUATION CRITERIA:")
for i, criterion in enumerate(criteria, 1):
print(f" {i}. {criterion}")
def get_user_score(self) -> Dict:
"""Prompt user for evaluation score"""
print("\n" + "="*80)
print("📊 EVALUATION SCORING RUBRIC:")
print(" 0-1: FAIL - Major errors, fails to meet basic requirements")
print(" 2-3: PASS - Meets requirements with minor issues")
print(" 4-5: EXCEPTIONAL - Exceeds requirements, demonstrates deep understanding")
print("="*80)
while True:
try:
score_input = input("\n👉 Enter score (0-5) or 'skip' to skip this test: ").strip().lower()
if score_input == 'skip':
return {"score": None, "notes": "Skipped by user"}
score = int(score_input)
if 0 <= score <= 5:
notes = input("📝 Notes (optional, press Enter to skip): ").strip()
return {"score": score, "notes": notes if notes else ""}
else:
print("❌ Score must be between 0 and 5")
except ValueError:
print("❌ Invalid input. Please enter a number between 0 and 5, or 'skip'")
except KeyboardInterrupt:
print("\n\n⚠️ Test interrupted by user")
return {"score": None, "notes": "Interrupted"}
def extract_api_metrics(self, response: Dict) -> Optional[Dict]:
"""
Extract all available metrics from API response
Args:
response: The API response dict
Returns:
Dict with usage statistics and timing information, or None if no metrics available
"""
metrics = {}
if response and isinstance(response, dict):
# Extract usage statistics if available
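# Note: alongside the standard OpenAI-style counters (prompt_tokens,
# completion_tokens, total_tokens), this also picks up non-standard fields
# that some local backends (Ollama-style stacks) report, such as eval_count
# and the *_duration values; the durations are assumed to be nanoseconds,
# which is why display_summary() divides total_duration by 1e9.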
usage = response.get('usage', {})
if usage:
metrics['usage'] = {
'prompt_tokens': usage.get('prompt_tokens'),
'completion_tokens': usage.get('completion_tokens'),
'total_tokens': usage.get('total_tokens'),
'prompt_eval_count': usage.get('prompt_eval_count'),
'eval_count': usage.get('eval_count'),
'prompt_eval_duration': usage.get('prompt_eval_duration'),
'eval_duration': usage.get('eval_duration'),
'load_duration': usage.get('load_duration'),
'total_duration': usage.get('total_duration'),
'response_token_s': usage.get('response_token/s'),
'prompt_token_s': usage.get('prompt_token/s'),
}
# Remove None values
metrics['usage'] = {k: v for k, v in metrics['usage'].items() if v is not None}
# Extract model info
if 'model' in response:
metrics['model'] = response['model']
# Extract creation timestamp
if 'created' in response:
metrics['created'] = response['created']
# Extract finish reason
if 'choices' in response and len(response['choices']) > 0:
finish_reason = response['choices'][0].get('finish_reason')
if finish_reason:
metrics['finish_reason'] = finish_reason
return metrics if metrics else None
def get_evaluator_score(self, prompt: str, response: str, criteria: List[str]) -> Dict:
"""
Get automated score from evaluator API
Args:
prompt: The original prompt sent to the model under test
response: The model's response to evaluate
criteria: List of evaluation criteria
Returns:
Dict with 'score', 'notes', and 'auto_eval_failed' keys
"""
# Construct evaluation prompt
criteria_text = "\n".join([f"{i+1}. {c}" for i, c in enumerate(criteria)])
eval_system_prompt = """You are an expert AI model evaluator. Your task is to objectively assess AI model responses based on specific criteria.
Scoring Rubric:
- 0-1: FAIL - Major errors, fails to meet basic requirements, incorrect or misleading information
- 2-3: PASS - Meets requirements with minor issues, mostly correct with some imperfections
- 4-5: EXCEPTIONAL - Exceeds requirements, demonstrates deep understanding, accurate and comprehensive
CRITICAL: You MUST respond with ONLY valid JSON. Do not include any explanatory text before or after the JSON.
Response format (use ONLY this, nothing else):
{"score": <integer 0-5>, "notes": "<brief explanation>"}"""
eval_user_prompt = f"""Evaluate this AI model response against the criteria below.
**Original Prompt:**
{prompt}
**Model Response:**
{response}
**Evaluation Criteria:**
{criteria_text}
Respond with ONLY valid JSON in this exact format:
{{"score": <integer 0-5>, "notes": "<brief explanation of the score>"}}
Do not include any text before or after the JSON object."""
messages = [
{"role": "system", "content": eval_system_prompt},
{"role": "user", "content": eval_user_prompt}
]
# Call evaluator API
headers = {
"Content-Type": "application/json"
}
if self.evaluator_api_key:
headers["Authorization"] = f"Bearer {self.evaluator_api_key}"
payload = {
"model": self.evaluator_model,
"messages": messages,
"temperature": self.evaluator_temperature,
"max_tokens": 500
}
try:
print(f"\n🤖 Calling evaluator API for automated scoring...")
response_obj = requests.post(
f"{self.evaluator_endpoint}/v1/chat/completions",
headers=headers,
json=payload,
timeout=90 # Increased timeout for evaluator
)
response_obj.raise_for_status()
eval_response = response_obj.json()
# Extract the evaluation
eval_text = eval_response['choices'][0]['message']['content'].strip()
# Try to extract JSON from the response
# Handle case where model wraps JSON in markdown code blocks or adds extra text
json_obj = None
# Try direct parsing first
try:
json_obj = json.loads(eval_text)
except json.JSONDecodeError:
# Try to find JSON in markdown code blocks
if '```json' in eval_text or '```' in eval_text:
# Extract content between code fences
json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', eval_text, re.DOTALL)
if json_match:
json_obj = json.loads(json_match.group(1))
# Try to find any JSON object in the text
if json_obj is None:
json_match = re.search(r'\{[^{}]*"score"[^{}]*"notes"[^{}]*\}', eval_text, re.DOTALL)
if json_match:
json_obj = json.loads(json_match.group(0))
if json_obj is None:
raise json.JSONDecodeError("No valid JSON found", eval_text, 0)
score = int(json_obj.get('score', 0))
notes = json_obj.get('notes', 'Automated evaluation')
# Validate score range
if not 0 <= score <= 5:
print(f"⚠️ Evaluator returned invalid score {score}, clamping to 0-5")
score = max(0, min(5, score))
print(f" Score: {score}/5")
print(f" Notes: {notes}")
return {"score": score, "notes": f"[Auto-Reviev by {self.evaluator_model}] {notes}", "auto_eval_failed": False}
except json.JSONDecodeError as e:
print(f"⚠️ Failed to parse evaluator response as JSON: {e}")
print(f" Raw response: {eval_text[:200]}")
print(f" Marking for manual review")
return {"score": None, "notes": f"[AUTO-ERROR] Evaluator returned non-JSON response", "auto_eval_failed": True}
except (KeyError, IndexError, ValueError) as e:
print(f"⚠️ Invalid evaluator response format: {e}")
print(f" Marking for manual review")
return {"score": None, "notes": f"[AUTO-ERROR] Invalid response format: {str(e)}", "auto_eval_failed": True}
except requests.exceptions.Timeout as e:
print(f"⚠️ Evaluator API timeout: Request exceeded 90 seconds")
print(f" The evaluator model is taking too long to respond.")
print(f" Marking for manual review")
return {"score": None, "notes": "[AUTO-ERROR] Evaluator API timeout", "auto_eval_failed": True}
except requests.exceptions.ConnectionError as e:
print(f"⚠️ Evaluator connection error: {e}")
print(f" Could not connect to evaluator endpoint.")
print(f" Marking for manual review")
return {"score": None, "notes": "[AUTO-ERROR] Cannot connect to evaluator", "auto_eval_failed": True}
except requests.exceptions.RequestException as e:
print(f"⚠️ Evaluator API error: {e}")
print(f" Marking for manual review")
return {"score": None, "notes": f"[AUTO-ERROR] API call failed: {str(e)[:100]}", "auto_eval_failed": True}
def run_single_turn_test(self, test: Dict, category: str) -> Dict:
"""Run a single-turn test"""
self.display_test_info(test, category)
self.display_prompt(test['prompt'])
# Prepare messages
messages = [{"role": "user", "content": test['prompt']}]
# Call API and measure time
start_time = time.time()
response = self.call_api(messages)
generation_time = time.time() - start_time
if response is None:
return {
"test_id": test['id'],
"test_name": test['name'],
"category": category,
"type": "single_turn",
"status": "api_error",
"score": None,
"notes": "API call failed",
"generation_time": generation_time
}
# Extract response text with better error handling
try:
message = response['choices'][0]['message']
# Try to get content, if empty check for reasoning_content
response_text = message.get('content', '')
# If content is empty but reasoning_content exists, use that
if not response_text and 'reasoning_content' in message:
response_text = message['reasoning_content']
print("\n⚠️ Note: Response contained only reasoning_content, no actual content generated")
# If still empty, check for tool_calls (model might be trying to call functions)
if not response_text and 'tool_calls' in message:
print("\n⚠️ Warning: Model attempted to call tools instead of generating content")
tool_info = json.dumps(message['tool_calls'], indent=2)
response_text = f"[MODEL ERROR: Attempted tool calls instead of text response]\n{tool_info}"
# If completely empty, this is an error
if not response_text:
print("\n⚠️ ERROR: Model returned completely empty response")
response_text = "[ERROR: Empty response from model]"
except (KeyError, IndexError, TypeError) as e:
print(f"\nERROR: Failed to parse response - {e}")
print("\nRAW API RESPONSE:")
print("="*80)
print(json.dumps(response, indent=2))
print("="*80)
response_text = f"[PARSING ERROR: {e}]"
self.display_response(response_text, raw_response=response, generation_time=generation_time)
# Display evaluation criteria
self.display_evaluation_criteria(test.get('evaluation_criteria', []))
# Get evaluation (interactive or automated)
# Skip automated evaluation if response is an error
if self.non_interactive:
# Check if response is actually an error
if response_text.startswith('[ERROR:') or response_text.startswith('[MODEL ERROR:') or response_text.startswith('[PARSING ERROR:'):
print(f"\n⚠️ Skipping automated evaluation due to model error")
evaluation = {"score": 0, "notes": "[AUTO-SKIP] Model failed to generate valid response", "auto_eval_failed": False}
else:
evaluation = self.get_evaluator_score(
prompt=test['prompt'],
response=response_text,
criteria=test.get('evaluation_criteria', [])
)
# Track failed evaluations for manual review
if evaluation.get('auto_eval_failed', False):
self.failed_evaluations.append({
'test_id': test['id'],
'test_name': test['name'],
'category': category,
'type': 'single_turn',
'prompt': test['prompt'],
'response': response_text,
'criteria': test.get('evaluation_criteria', []),
'error': evaluation['notes']
})
else:
evaluation = self.get_user_score()
# Extract API metrics
api_metrics = self.extract_api_metrics(response)
result = {
"test_id": test['id'],
"test_name": test['name'],
"category": category,
"type": "single_turn",
"difficulty": test.get('expected_difficulty', 'unknown'),
"prompt": test['prompt'],
"response": response_text,
"raw_response": response if response_text.startswith("[PARSING ERROR") else None,
"evaluation_criteria": test.get('evaluation_criteria', []),
"score": evaluation['score'],
"notes": evaluation['notes'],
"status": "completed" if evaluation['score'] is not None else "skipped",
"timestamp": datetime.now().isoformat(),
"generation_time": generation_time
}
# Add metrics if available
if api_metrics:
result['api_metrics'] = api_metrics
return result
def run_multi_turn_test(self, test: Dict, category: str) -> Dict:
"""Run a multi-turn test"""
self.display_test_info(test, category)
# Initialize conversation history
self.conversation_history = []
turn_results = []
for i, turn_data in enumerate(test['turns'], 1):
turn_num = turn_data['turn']
prompt = turn_data['prompt']
self.display_prompt(prompt, turn_num)
# Add to conversation history
self.conversation_history.append({"role": "user", "content": prompt})
# Call API with full conversation history and measure time
start_time = time.time()
response = self.call_api(self.conversation_history)
generation_time = time.time() - start_time
if response is None:
turn_results.append({
"turn": turn_num,
"status": "api_error",
"prompt": prompt,
"response": None,
"score": None,
"notes": "API error - failed to get response",
"generation_time": generation_time
})
break
# Extract and display response with better error handling
try:
message = response['choices'][0]['message']
# Try to get content, if empty check for reasoning_content
response_text = message.get('content', '')
# If content is empty but reasoning_content exists, use that
if not response_text and 'reasoning_content' in message:
response_text = message['reasoning_content']
print("\n⚠️ Note: Response contained only reasoning_content, no actual content generated")
# If still empty, check for tool_calls
if not response_text and 'tool_calls' in message:
print("\n⚠️ Warning: Model attempted to call tools instead of generating content")
tool_info = json.dumps(message['tool_calls'], indent=2)
response_text = f"[MODEL ERROR: Attempted tool calls instead of text response]\n{tool_info}"
# If completely empty, this is an error
if not response_text:
print("\n⚠️ ERROR: Model returned completely empty response")
response_text = "[ERROR: Empty response from model]"
except (KeyError, IndexError, TypeError) as e:
print(f"\nERROR: Failed to parse response - {e}")
print("\nRAW API RESPONSE:")
print("="*80)
print(json.dumps(response, indent=2))
print("="*80)
response_text = f"[PARSING ERROR: {e}]"
self.display_response(response_text, raw_response=response, generation_time=generation_time)
# Add assistant response to history
self.conversation_history.append({"role": "assistant", "content": response_text})
# Display criteria for this turn
self.display_evaluation_criteria(turn_data.get('evaluation_criteria', []))
# Get evaluation for this turn (interactive or automated)
print(f"\nEvaluate Turn {turn_num}:")
if self.non_interactive:
# Skip automated evaluation if response is an error
if response_text.startswith('[ERROR:') or response_text.startswith('[MODEL ERROR:') or response_text.startswith('[PARSING ERROR:'):
print(f"\n⚠️ Skipping automated evaluation due to model error")
evaluation = {"score": 0, "notes": "[AUTO-SKIP] Model failed to generate valid response", "auto_eval_failed": False}
else:
evaluation = self.get_evaluator_score(
prompt=prompt,
response=response_text,
criteria=turn_data.get('evaluation_criteria', [])
)
# Track failed evaluations for manual review
if evaluation.get('auto_eval_failed', False):
self.failed_evaluations.append({
'test_id': test['id'],
'test_name': test['name'],
'category': category,
'type': 'multi_turn',
'turn': turn_num,
'prompt': prompt,
'response': response_text,
'criteria': turn_data.get('evaluation_criteria', []),
'error': evaluation['notes']
})
else:
evaluation = self.get_user_score()
# Extract API metrics for this turn
api_metrics = self.extract_api_metrics(response)
turn_result = {
"turn": turn_num,
"prompt": prompt,
"response": response_text,
"evaluation_criteria": turn_data.get('evaluation_criteria', []),
"score": evaluation['score'],
"notes": evaluation['notes'],
"status": "completed" if evaluation['score'] is not None else "skipped",
"generation_time": generation_time
}
# Add metrics if available
if api_metrics:
turn_result['api_metrics'] = api_metrics
turn_results.append(turn_result)
if evaluation['score'] is None:
print(f"\nTurn {turn_num} skipped, stopping multi-turn test")
break
# Calculate overall score for multi-turn test
valid_scores = [t['score'] for t in turn_results if t['score'] is not None]
overall_score = sum(valid_scores) / len(valid_scores) if valid_scores else None
# Aggregate metrics across all turns
aggregate_metrics = {}
turn_metrics = [t.get('api_metrics') for t in turn_results if t.get('api_metrics')]
if turn_metrics:
# Sum up token counts and durations
total_prompt_tokens = sum(m.get('usage', {}).get('prompt_tokens', 0) for m in turn_metrics)
total_completion_tokens = sum(m.get('usage', {}).get('completion_tokens', 0) for m in turn_metrics)
total_duration = sum(m.get('usage', {}).get('total_duration', 0) for m in turn_metrics)
aggregate_metrics['usage'] = {
'total_prompt_tokens': total_prompt_tokens if total_prompt_tokens else None,
'total_completion_tokens': total_completion_tokens if total_completion_tokens else None,
'total_tokens': (total_prompt_tokens + total_completion_tokens) if (total_prompt_tokens or total_completion_tokens) else None,
'total_duration': total_duration if total_duration else None,
'turn_count': len(turn_metrics)
}
# Remove None values
aggregate_metrics['usage'] = {k: v for k, v in aggregate_metrics['usage'].items() if v is not None}
result = {
"test_id": test['id'],
"test_name": test['name'],
"category": category,
"type": "multi_turn",
"difficulty": test.get('expected_difficulty', 'unknown'),
"turns": turn_results,
"overall_score": overall_score,
"status": "completed" if overall_score is not None else "incomplete",
"timestamp": datetime.now().isoformat()
}
# Add aggregate metrics if available
if aggregate_metrics:
result['aggregate_metrics'] = aggregate_metrics
return result
def manual_review_failed_evaluations(self):
"""Present failed automated evaluations for manual review"""
print("\n\n" + "="*80)
print("⚠️ MANUAL REVIEW REQUIRED")
print("="*80)
print(f"\n{len(self.failed_evaluations)} test(s) could not be automatically evaluated.")
print("Please provide manual scores for these tests.\n")
for idx, failed in enumerate(self.failed_evaluations, 1):
print("\n" + "="*80)
print(f"📋 MANUAL REVIEW {idx}/{len(self.failed_evaluations)}")
print("="*80)
print(f"🆔 Test ID: {failed['test_id']}")
print(f"📝 Test Name: {failed['test_name']}")
print(f"📂 Category: {failed['category']}")
if failed['type'] == 'multi_turn':
print(f"🔄 Turn: {failed['turn']}")
print(f"❌ Auto-Eval Error: {failed['error']}")
print(f"\n💬 PROMPT:")
print("-"*80)
print(failed['prompt'])
print("-"*80)
print(f"\nMODEL RESPONSE:")
print("-"*80)
print(failed['response'])
print("-"*80)
print(f"\n✅ EVALUATION CRITERIA:")
for i, criterion in enumerate(failed['criteria'], 1):
print(f" {i}. {criterion}")
# Get manual score
print("\n" + "="*80)
print("📊 EVALUATION SCORING RUBRIC:")
print(" 0-1: FAIL - Major errors, fails to meet basic requirements")
print(" 2-3: PASS - Meets requirements with minor issues")
print(" 4-5: EXCEPTIONAL - Exceeds requirements, demonstrates deep understanding")
print("="*80)
manual_evaluation = self.get_user_score()
# Update the corresponding test result
self.update_test_result_with_manual_score(
failed['test_id'],
failed.get('turn'),
manual_evaluation
)
# Save updated results
self.save_results()
print("\n✅ All manual reviews completed and saved!")
def update_test_result_with_manual_score(self, test_id: str, turn: Optional[int], evaluation: Dict):
"""Update a test result with manually provided score"""
for result in self.results['test_results']:
if result['test_id'] == test_id:
if turn is None:
# Single-turn test
result['score'] = evaluation['score']
# Only add MANUAL-OVERRIDE prefix if a score was actually provided
if evaluation['score'] is not None:
result['notes'] = f"[MANUAL-OVERRIDE] {evaluation['notes']}" if evaluation['notes'] else "[MANUAL-OVERRIDE]"
result['status'] = 'completed'
else:
result['notes'] = evaluation['notes'] if evaluation['notes'] else "Manual review skipped"
result['status'] = 'manual_review_skipped'
else:
# Multi-turn test - update specific turn
for turn_result in result.get('turns', []):
if turn_result['turn'] == turn:
turn_result['score'] = evaluation['score']
# Only add MANUAL-OVERRIDE prefix if a score was actually provided
if evaluation['score'] is not None:
turn_result['notes'] = f"[MANUAL-OVERRIDE] {evaluation['notes']}" if evaluation['notes'] else "[MANUAL-OVERRIDE]"
turn_result['status'] = 'completed'
else:
turn_result['notes'] = evaluation['notes'] if evaluation['notes'] else "Manual review skipped"
turn_result['status'] = 'manual_review_skipped'
# Recalculate overall score
valid_scores = [t['score'] for t in result['turns'] if t['score'] is not None]
result['overall_score'] = sum(valid_scores) / len(valid_scores) if valid_scores else None
result['status'] = 'completed' if result['overall_score'] is not None else 'incomplete'
break
def run_test_suite(self, test_suite: Dict, filter_category: Optional[str] = None):
"""Run the complete test suite"""
print("\n" + "="*80)
print(f"🚀 STARTING TEST SUITE")
print(f"📦 Model: {self.model_name}")
print(f"🔗 Endpoint: {self.endpoint}")
print("="*80)
# Count total tests
total_tests = 0
for cat_data in test_suite.get('test_categories', []):
if filter_category and cat_data['category'] != filter_category:
continue
total_tests += len(cat_data.get('tests', []))
self.results['metadata']['total_tests'] = total_tests
# Run tests by category
test_count = 0
for cat_data in test_suite.get('test_categories', []):
category = cat_data['category']
# Apply category filter if specified
if filter_category and category != filter_category:
continue
print(f"\n\n{'='*80}")
print(f"📂 CATEGORY: {category}")
print(f"{'='*80}")
for test in cat_data.get('tests', []):
test_count += 1
print(f"\n📊 Progress: {test_count}/{total_tests}")
# Run appropriate test type
if test.get('type') == 'single_turn':
result = self.run_single_turn_test(test, category)
elif test.get('type') == 'multi_turn':
result = self.run_multi_turn_test(test, category)
else:
print(f"⚠️ Unknown test type: {test.get('type')}")
continue
self.results['test_results'].append(result)
self.results['metadata']['completed_tests'] += 1
# Save after each test (in case of interruption)
self.save_results()
# Mark test suite as complete
self.results['metadata']['test_end'] = datetime.now().isoformat()
self.save_results()
# Handle failed evaluations if in non-interactive mode
if self.non_interactive and self.failed_evaluations:
self.manual_review_failed_evaluations()
print("\n\n" + "="*80)
print("✅ TEST SUITE COMPLETE")
if self.non_interactive and self.failed_evaluations:
print(f" ({len(self.failed_evaluations)} test(s) manually reviewed)")
print("="*80)
self.display_summary()
def save_results(self):
"""Save results to JSON file"""
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"{self.model_name.replace(':', '_')}_{timestamp}.json"
filepath = self.output_dir / filename
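# Illustrative result paths (':' in the model tag is replaced with '_'):
#   results/qwen3_4b-q4_K_M_20260116_124856.json   <- timestamped snapshot
#   results/qwen3_4b-q4_K_M_latest.json            <- rolling "latest" copy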
with open(filepath, 'w', encoding='utf-8') as f:
json.dump(self.results, f, indent=2, ensure_ascii=False)
# Also save as "latest" for this model
latest_file = self.output_dir / f"{self.model_name.replace(':', '_')}_latest.json"
with open(latest_file, 'w', encoding='utf-8') as f:
json.dump(self.results, f, indent=2, ensure_ascii=False)
def display_summary(self):
"""Display test summary"""
total = self.results['metadata']['total_tests']
completed = self.results['metadata']['completed_tests']
# Calculate statistics
scores = [r.get('score') if r.get('score') is not None else r.get('overall_score')
for r in self.results['test_results']]
scores = [s for s in scores if s is not None]
if scores:
avg_score = sum(scores) / len(scores)
print(f"\n📊 SUMMARY:")
print(f" Total Tests: {total}")
print(f" Completed: {completed}")
print(f" Average Score: {avg_score:.2f}/5.00")
print(f" Pass Rate: {len([s for s in scores if s >= 2]) / len(scores) * 100:.1f}%")
print(f" Exceptional Rate: {len([s for s in scores if s >= 4]) / len(scores) * 100:.1f}%")
# Calculate aggregate API metrics
total_prompt_tokens = 0
total_completion_tokens = 0
total_duration = 0
tests_with_metrics = 0
for result in self.results['test_results']:
# Single-turn tests
if result.get('api_metrics'):
usage = result['api_metrics'].get('usage', {})
total_prompt_tokens += usage.get('prompt_tokens', 0)
total_completion_tokens += usage.get('completion_tokens', 0)
total_duration += usage.get('total_duration', 0)
tests_with_metrics += 1
# Multi-turn tests
elif result.get('aggregate_metrics'):
usage = result['aggregate_metrics'].get('usage', {})
total_prompt_tokens += usage.get('total_prompt_tokens', 0)
total_completion_tokens += usage.get('total_completion_tokens', 0)
total_duration += usage.get('total_duration', 0)
tests_with_metrics += 1
if tests_with_metrics > 0:
print(f"\n⚡ API METRICS:")
print(f" Total Prompt Tokens: {total_prompt_tokens:,}")
print(f" Total Completion Tokens: {total_completion_tokens:,}")
print(f" Total Tokens: {total_prompt_tokens + total_completion_tokens:,}")
if total_duration > 0:
# Convert nanoseconds to seconds
duration_seconds = total_duration / 1_000_000_000
print(f" Total Duration: {duration_seconds:.2f}s")
if total_completion_tokens > 0:
tokens_per_second = total_completion_tokens / duration_seconds
print(f" Average Speed: {tokens_per_second:.2f} tokens/s")
print(f"\n💾 Results saved to: {self.output_dir}")
def main():
# Load environment variables from .env file if it exists
load_dotenv()
parser = argparse.ArgumentParser(
description="AI Model Evaluation Test Suite",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
# Test a single model (interactive mode)
python ai_eval.py --endpoint http://localhost:11434 --model qwen3:4b-q4_K_M
# Test with API key
python ai_eval.py --endpoint https://api.example.com --api-key sk-xxx --model qwen3:8b
# Non-interactive mode with evaluator
python ai_eval.py --non-interactive --evaluator-endpoint http://localhost:11434 --evaluator-model qwen3:14b
# Use .env file for configuration (recommended)
cp .env.example .env
# Edit .env with your settings
python ai_eval.py
# Test only forensics category
python ai_eval.py --endpoint http://localhost:11434 --model qwen3:14b --category "IT Forensics - File Systems"
"""
)
parser.add_argument(
'--endpoint',
default=os.getenv('MUT_ENDPOINT'),
help='OpenAI-compatible API endpoint for model under test (default: from .env MUT_ENDPOINT)'
)
parser.add_argument(
'--api-key',
default=os.getenv('MUT_API_KEY', ''),
help='API key for model under test (default: from .env MUT_API_KEY)'
)
parser.add_argument(
'--model',
default=os.getenv('MUT_MODEL'),
help='Model name/identifier to test; accepts a comma-separated list for batch mode (default: from .env MUT_MODEL)'
)
parser.add_argument(
'--test-suite',
default=os.getenv('TEST_SUITE', 'test_suite.yaml'),
help='Path to test suite YAML file (default: from .env TEST_SUITE or test_suite.yaml)'
)
parser.add_argument(
'--output-dir',
default=os.getenv('OUTPUT_DIR', 'results'),
help='Directory to save results (default: from .env OUTPUT_DIR or results)'
)
parser.add_argument(
'--category',
default=os.getenv('FILTER_CATEGORY'),
help='Filter tests by category (default: from .env FILTER_CATEGORY)'
)
parser.add_argument(
'--non-interactive',
action='store_true',
default=os.getenv('NON_INTERACTIVE', '').lower() in ('true', '1', 'yes'),
help='Run in non-interactive mode with automated evaluation (default: from .env NON_INTERACTIVE)'
)
parser.add_argument(
'--evaluator-endpoint',
default=os.getenv('EVALUATOR_ENDPOINT'),
help='API endpoint for evaluator model (required for non-interactive mode, default: from .env EVALUATOR_ENDPOINT)'
)
parser.add_argument(
'--evaluator-api-key',
default=os.getenv('EVALUATOR_API_KEY', ''),
help='API key for evaluator (default: from .env EVALUATOR_API_KEY)'
)
parser.add_argument(
'--evaluator-model',
default=os.getenv('EVALUATOR_MODEL'),
help='Model name for evaluator (required for non-interactive mode, default: from .env EVALUATOR_MODEL)'
)
parser.add_argument(
'--evaluator-temperature',
type=float,
default=float(os.getenv('EVALUATOR_TEMPERATURE', '0.3')),
help='Temperature for evaluator model (default: from .env EVALUATOR_TEMPERATURE or 0.3)'
)
parser.add_argument(
'--dry-run',
action='store_true',
help='Test API connectivity without running the full test suite'
)
args = parser.parse_args()
# Validate required arguments
if not args.endpoint:
print("❌ Error: --endpoint is required (or set MUT_ENDPOINT in .env)")
print(" Example: --endpoint http://localhost:11434")
sys.exit(1)
if not args.model:
print("❌ Error: --model is required (or set MUT_MODEL in .env)")
print(" Example: --model qwen3:4b-q4_K_M")
sys.exit(1)
if args.non_interactive:
if not args.evaluator_endpoint:
print("❌ Error: --evaluator-endpoint is required for non-interactive mode")
print(" (or set EVALUATOR_ENDPOINT in .env)")
sys.exit(1)
if not args.evaluator_model:
print("❌ Error: --evaluator-model is required for non-interactive mode")
print(" (or set EVALUATOR_MODEL in .env)")
sys.exit(1)
# Parse model list (supports comma-separated models)
model_list = [m.strip() for m in args.model.split(',') if m.strip()]
# Dry run mode - just test connections
if args.dry_run:
print(f"\n{'='*80}")
print("🧪 DRY RUN MODE - Testing API Connectivity")
print(f"{'='*80}")
all_success = True
# Test MUT endpoint for each model
for idx, model_name in enumerate(model_list, 1):
if len(model_list) > 1:
print(f"\n--- Model {idx}/{len(model_list)} ---")
tester = AIModelTester(
endpoint=args.endpoint,
api_key=args.api_key,
model_name=model_name,
output_dir=args.output_dir,
non_interactive=args.non_interactive,
evaluator_endpoint=args.evaluator_endpoint,
evaluator_api_key=args.evaluator_api_key,
evaluator_model=args.evaluator_model,
evaluator_temperature=args.evaluator_temperature
)
success = tester.test_connection(
endpoint=args.endpoint,
api_key=args.api_key,
model=model_name,
endpoint_name="Model Under Test"
)
all_success = all_success and success
# Test evaluator endpoint if non-interactive mode
if args.non_interactive and args.evaluator_endpoint and args.evaluator_model:
print(f"\n{'='*80}")
tester = AIModelTester(
endpoint=args.endpoint,
api_key=args.api_key,
model_name=model_list[0],
output_dir=args.output_dir,
non_interactive=args.non_interactive,
evaluator_endpoint=args.evaluator_endpoint,
evaluator_api_key=args.evaluator_api_key,
evaluator_model=args.evaluator_model,
evaluator_temperature=args.evaluator_temperature
)
success = tester.test_connection(
endpoint=args.evaluator_endpoint,
api_key=args.evaluator_api_key,
model=args.evaluator_model,
endpoint_name="Evaluator"
)
all_success = all_success and success
print(f"\n{'='*80}")
if all_success:
print("✅ All connectivity tests passed")
print(f"{'='*80}")
sys.exit(0)
else:
print("❌ Some connectivity tests failed")
print(f"{'='*80}")
sys.exit(1)
if len(model_list) > 1:
print(f"\n🔄 Batch mode: Testing {len(model_list)} models")
print("=" * 80)
# Test each model
for idx, model_name in enumerate(model_list, 1):
if len(model_list) > 1:
print(f"\n{'='*80}")
print(f"📊 Model {idx}/{len(model_list)}: {model_name}")
print(f"{'='*80}\n")
# Initialize tester
tester = AIModelTester(
endpoint=args.endpoint,
api_key=args.api_key,
model_name=model_name,
output_dir=args.output_dir,
non_interactive=args.non_interactive,
evaluator_endpoint=args.evaluator_endpoint,
evaluator_api_key=args.evaluator_api_key,
evaluator_model=args.evaluator_model,
evaluator_temperature=args.evaluator_temperature
)
# Load test suite
if idx == 1 or len(model_list) == 1:
print(f"📁 Loading test suite from: {args.test_suite}")
test_suite = tester.load_test_suite(args.test_suite)
# Run tests
try:
tester.run_test_suite(test_suite, filter_category=args.category)
except KeyboardInterrupt:
print("\n\n⚠️ Test suite interrupted by user")
tester.results['metadata']['test_end'] = datetime.now().isoformat()
tester.save_results()
print(f"\n💾 Partial results saved to: {tester.output_dir}")
if len(model_list) > 1 and idx < len(model_list):
print(f"\n⚠️ Skipping remaining {len(model_list) - idx} models")
sys.exit(1)
if len(model_list) > 1:
print(f"\n{'='*80}")
print(f"✅ BATCH COMPLETE: Tested {len(model_list)} models")
print(f"{'='*80}")
print(f"\n💾 Results saved to: {args.output_dir}/")
print("\nTo compare results, run:")
print(" python analyze_results.py --compare")
if __name__ == "__main__":
main()
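# ---------------------------------------------------------------------------
# Illustrative .env for this script. The variable names are exactly the ones
# read via os.getenv() above; the values are placeholder examples only.
#
#   MUT_ENDPOINT=http://localhost:11434
#   MUT_API_KEY=
#   MUT_MODEL=qwen3:4b-q4_K_M
#   TEST_SUITE=test_suite.yaml
#   OUTPUT_DIR=results
#   FILTER_CATEGORY=
#   NON_INTERACTIVE=true
#   EVALUATOR_ENDPOINT=http://localhost:11434
#   EVALUATOR_API_KEY=
#   EVALUATOR_MODEL=qwen3:14b
#   EVALUATOR_TEMPERATURE=0.3
# ---------------------------------------------------------------------------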