#!/usr/bin/env python3
"""
AI Model Evaluation Automation Script
Runs comprehensive test suite against OpenAI-compatible API endpoints
"""

import yaml
import json
import requests
import os
import sys
import time
from datetime import datetime
from typing import Dict, List, Any, Optional
from pathlib import Path
import argparse
from dotenv import load_dotenv
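
# Example .env configuration (illustrative values only; the variable names below are
# exactly the ones read via os.getenv() in main()):
#
#   MUT_ENDPOINT=http://localhost:11434
#   MUT_API_KEY=
#   MUT_MODEL=qwen3:4b-q4_K_M
#   TEST_SUITE=test_suite.yaml
#   OUTPUT_DIR=results
#   FILTER_CATEGORY=
#   NON_INTERACTIVE=true
#   EVALUATOR_ENDPOINT=http://localhost:11434
#   EVALUATOR_API_KEY=
#   EVALUATOR_MODEL=qwen3:14b
#   EVALUATOR_TEMPERATURE=0.3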


class AIModelTester:
    def __init__(self, endpoint: str, api_key: str, model_name: str, output_dir: str = "results",
                 non_interactive: bool = False, evaluator_endpoint: Optional[str] = None,
                 evaluator_api_key: Optional[str] = None, evaluator_model: Optional[str] = None,
                 evaluator_temperature: float = 0.3):
        """
        Initialize the AI Model Tester

        Args:
            endpoint: OpenAI-compatible API endpoint URL
            api_key: API key for authentication
            model_name: Name/identifier of the model being tested
            output_dir: Directory to save results
            non_interactive: If True, use evaluator API for automatic scoring
            evaluator_endpoint: API endpoint for evaluation model
            evaluator_api_key: API key for evaluator
            evaluator_model: Model name for evaluator
            evaluator_temperature: Temperature for evaluator
        """
        self.endpoint = endpoint.rstrip('/')
        self.api_key = api_key
        self.model_name = model_name
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(exist_ok=True)

        # Non-interactive mode settings
        self.non_interactive = non_interactive
        self.evaluator_endpoint = evaluator_endpoint.rstrip('/') if evaluator_endpoint else None
        self.evaluator_api_key = evaluator_api_key
        self.evaluator_model = evaluator_model
        self.evaluator_temperature = evaluator_temperature

        if self.non_interactive:
            if not all([self.evaluator_endpoint, self.evaluator_model]):
                raise ValueError("Non-interactive mode requires evaluator_endpoint and evaluator_model")
            print(f"🤖 Non-interactive mode enabled")
            print(f" Evaluator: {self.evaluator_model} @ {self.evaluator_endpoint}")

        # Results storage
        self.results = {
            "metadata": {
                "model_name": model_name,
                "endpoint": endpoint,
                "test_start": datetime.now().isoformat(),
                "test_end": None,
                "total_tests": 0,
                "completed_tests": 0
            },
            "test_results": []
        }

        # Current test session info
        self.current_test_id = None
        self.conversation_history = []

        # Track failed auto-evaluations for manual review
        self.failed_evaluations = []
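
    # Example (illustrative only): constructing a tester directly, mirroring what main()
    # does with CLI/.env values:
    #
    #   tester = AIModelTester(
    #       endpoint="http://localhost:11434",
    #       api_key="",
    #       model_name="qwen3:4b-q4_K_M",
    #       non_interactive=True,
    #       evaluator_endpoint="http://localhost:11434",
    #       evaluator_model="qwen3:14b",
    #   )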

    def test_connection(self, endpoint: str, api_key: str, model: str, endpoint_name: str = "API") -> bool:
        """
        Test if an API endpoint can be reached and authenticated against

        Args:
            endpoint: API endpoint URL
            api_key: API key for authentication
            model: Model name to test
            endpoint_name: Name for display purposes

        Returns:
            True if connection successful, False otherwise
        """
        print(f"\n🔍 Testing {endpoint_name} connection...")
        print(f" Endpoint: {endpoint}")
        print(f" Model: {model}")

        headers = {
            "Content-Type": "application/json"
        }

        if api_key:
            headers["Authorization"] = f"Bearer {api_key}"
            print(f" API Key: {api_key[:10]}..." if len(api_key) > 10 else " API Key: [set]")
        else:
            print(f" API Key: [none]")

        # Simple test message
        payload = {
            "model": model,
            "messages": [{"role": "user", "content": "test"}],
            "max_tokens": 1
        }

        try:
            response = requests.post(
                f"{endpoint}/v1/chat/completions",
                headers=headers,
                json=payload,
                timeout=60
            )
            response.raise_for_status()

            print(f" ✅ {endpoint_name} connection successful")
            return True

        except requests.exceptions.HTTPError as e:
            print(f" ❌ {endpoint_name} HTTP error: {e.response.status_code}")
            print(f" {e.response.text[:200]}")
            return False
        except requests.exceptions.ConnectionError as e:
            print(f" ❌ {endpoint_name} connection failed: Cannot reach endpoint")
            return False
        except requests.exceptions.Timeout:
            print(f" ❌ {endpoint_name} connection timeout")
            return False
        except requests.exceptions.RequestException as e:
            print(f" ❌ {endpoint_name} error: {e}")
            return False

    def load_test_suite(self, yaml_file: str) -> Dict:
        """Load test suite from YAML file"""
        try:
            with open(yaml_file, 'r', encoding='utf-8') as f:
                return yaml.safe_load(f)
        except FileNotFoundError:
            print(f"Error: Test suite file not found: {yaml_file}")
            print(f"Please ensure {yaml_file} is in the current directory.")
            sys.exit(1)
        except yaml.YAMLError as e:
            print(f"Error: Invalid YAML format in {yaml_file}")
            print(f"Details: {e}")
            sys.exit(1)
        except Exception as e:
            print(f"Error loading test suite: {e}")
            sys.exit(1)
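
    # Sketch of the test-suite YAML layout this script expects, inferred from how
    # run_test_suite() and the run_*_test() methods read it (field values are illustrative):
    #
    #   test_categories:
    #     - category: "IT Forensics - File Systems"
    #       tests:
    #         - id: "fs-001"
    #           name: "Example single-turn test"
    #           type: "single_turn"
    #           expected_difficulty: "medium"
    #           prompt: "..."
    #           evaluation_criteria:
    #             - "Criterion 1"
    #         - id: "fs-002"
    #           name: "Example multi-turn test"
    #           type: "multi_turn"
    #           turns:
    #             - turn: 1
    #               prompt: "..."
    #               evaluation_criteria:
    #                 - "Criterion 1"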

    def call_api(self, messages: List[Dict], temperature: float = 0.7, max_tokens: int = 2000) -> Optional[Dict]:
        """
        Call the OpenAI-compatible API

        Args:
            messages: List of message dicts with 'role' and 'content'
            temperature: Sampling temperature
            max_tokens: Maximum tokens in response

        Returns:
            API response dict or None if error
        """
        headers = {
            "Content-Type": "application/json"
        }

        # Only add Authorization header if API key is provided
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"

        payload = {
            "model": self.model_name,
            "messages": messages,
            "temperature": temperature,
            "max_tokens": max_tokens
        }

        try:
            print(f"\nDEBUG: Calling API endpoint: {self.endpoint}/v1/chat/completions")
            print(f"DEBUG: Model name: {self.model_name}")

            response = requests.post(
                f"{self.endpoint}/v1/chat/completions",
                headers=headers,
                json=payload,
                timeout=240
            )
            response.raise_for_status()
            return response.json()
        except requests.exceptions.Timeout as e:
            print(f"\n⚠️ API Timeout Error: Request exceeded 240 seconds")
            print(f" This usually means the model is taking too long to generate a response.")
            print(f" Consider using a faster model or reducing the complexity of the prompt.")
            return None
        except requests.exceptions.HTTPError as e:
            print(f"\n⚠️ API HTTP Error: {e}")
            print(f" Status Code: {e.response.status_code}")
            print(f" Response: {e.response.text[:500]}")

            # Show request details for debugging
            print("\n REQUEST DETAILS:")
            print(f" URL: {self.endpoint}/v1/chat/completions")
            print(f" Model: {self.model_name}")

            return None
        except requests.exceptions.ConnectionError as e:
            print(f"\n⚠️ API Connection Error: {e}")
            print(f" Could not connect to {self.endpoint}")
            print(f" Please check your network connection and endpoint URL.")
            return None
        except requests.exceptions.RequestException as e:
            print(f"\n⚠️ API Request Error: {e}")
            if hasattr(e, 'response') and e.response is not None:
                print(f" Response: {e.response.text[:500]}")
            return None
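
    # Minimal shape of the chat-completions response that the rest of this script relies on
    # (values illustrative). Extra fields inside the message, such as 'reasoning_content' or
    # 'tool_calls', are handled where present:
    #
    #   {
    #       "model": "...",
    #       "created": 1700000000,
    #       "choices": [{"message": {"content": "..."}, "finish_reason": "stop"}],
    #       "usage": {"prompt_tokens": 12, "completion_tokens": 34, "total_tokens": 46}
    #   }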

    def display_test_info(self, test: Dict, category: str):
        """Display test information to user"""
        print("\n" + "="*80)
        print(f"📋 CATEGORY: {category}")
        print(f"🆔 Test ID: {test['id']}")
        print(f"📝 Test Name: {test['name']}")
        print(f"🎯 Type: {test['type']}")
        print(f"⚡ Difficulty: {test.get('expected_difficulty', 'N/A')}")
        print("="*80)

    def display_prompt(self, prompt: str, turn: Optional[int] = None):
        """Display the prompt being sent"""
        if turn is not None:
            print(f"\n🔄 TURN {turn}:")
        else:
            print(f"\n💬 PROMPT:")
        print("-"*80)
        print(prompt)
        print("-"*80)

    def display_response(self, response_text: str, raw_response: Optional[Dict] = None, generation_time: Optional[float] = None):
        """Display the model's response with timing and token usage metrics"""
        print("\nRAW API RESPONSE:")
        print("="*80)
        print(json.dumps(raw_response, indent=2))
        print("="*80)

        # Display timing and token metrics if available
        if generation_time is not None:
            print(f"\n⏱️ Generation Time: {generation_time:.2f}s")

        if raw_response and 'usage' in raw_response:
            usage = raw_response['usage']
            print(f"\n📊 Token Usage:")
            if 'prompt_tokens' in usage:
                print(f" Prompt Tokens: {usage['prompt_tokens']}")
            if 'completion_tokens' in usage:
                print(f" Completion Tokens: {usage['completion_tokens']}")
            if 'total_tokens' in usage:
                print(f" Total Tokens: {usage['total_tokens']}")

            # Calculate tokens per second if we have timing
            if generation_time and generation_time > 0 and 'completion_tokens' in usage:
                tokens_per_sec = usage['completion_tokens'] / generation_time
                print(f" Speed: {tokens_per_sec:.2f} tokens/sec")

        print(f"\n\nMODEL RESPONSE:")
        print("-"*80)
        print(response_text)
        print("-"*80)

    def display_evaluation_criteria(self, criteria: List[str]):
        """Display evaluation criteria for the test"""
        print(f"\n✅ EVALUATION CRITERIA:")
        for i, criterion in enumerate(criteria, 1):
            print(f" {i}. {criterion}")

    def get_user_score(self) -> Dict:
        """Prompt user for evaluation score"""
        print("\n" + "="*80)
        print("📊 EVALUATION SCORING RUBRIC:")
        print(" 0-1: FAIL - Major errors, fails to meet basic requirements")
        print(" 2-3: PASS - Meets requirements with minor issues")
        print(" 4-5: EXCEPTIONAL - Exceeds requirements, demonstrates deep understanding")
        print("="*80)

        while True:
            try:
                score_input = input("\n👉 Enter score (0-5) or 'skip' to skip this test: ").strip().lower()

                if score_input == 'skip':
                    return {"score": None, "notes": "Skipped by user"}

                score = int(score_input)
                if 0 <= score <= 5:
                    notes = input("📝 Notes (optional, press Enter to skip): ").strip()
                    return {"score": score, "notes": notes if notes else ""}
                else:
                    print("❌ Score must be between 0 and 5")
            except ValueError:
                print("❌ Invalid input. Please enter a number between 0 and 5, or 'skip'")
            except KeyboardInterrupt:
                print("\n\n⚠️ Test interrupted by user")
                return {"score": None, "notes": "Interrupted"}

    def extract_api_metrics(self, response: Dict) -> Optional[Dict]:
        """
        Extract all available metrics from API response

        Args:
            response: The API response dict

        Returns:
            Dict with usage statistics and timing information, or None if no metrics available
        """
        metrics = {}

        if response and isinstance(response, dict):
            # Extract usage statistics if available
            usage = response.get('usage', {})
            if usage:
                metrics['usage'] = {
                    'prompt_tokens': usage.get('prompt_tokens'),
                    'completion_tokens': usage.get('completion_tokens'),
                    'total_tokens': usage.get('total_tokens'),
                    'prompt_eval_count': usage.get('prompt_eval_count'),
                    'eval_count': usage.get('eval_count'),
                    'prompt_eval_duration': usage.get('prompt_eval_duration'),
                    'eval_duration': usage.get('eval_duration'),
                    'load_duration': usage.get('load_duration'),
                    'total_duration': usage.get('total_duration'),
                    'response_token_s': usage.get('response_token/s'),
                    'prompt_token_s': usage.get('prompt_token/s'),
                }
                # Remove None values
                metrics['usage'] = {k: v for k, v in metrics['usage'].items() if v is not None}

            # Extract model info
            if 'model' in response:
                metrics['model'] = response['model']

            # Extract creation timestamp
            if 'created' in response:
                metrics['created'] = response['created']

            # Extract finish reason
            if 'choices' in response and len(response['choices']) > 0:
                finish_reason = response['choices'][0].get('finish_reason')
                if finish_reason:
                    metrics['finish_reason'] = finish_reason

        return metrics if metrics else None
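
    # Example of the structure returned above (keys depend on what the backend reports;
    # the extra *_count and *_duration fields, which display_summary() treats as
    # nanoseconds, only appear when the server includes them in 'usage'):
    #
    #   {
    #       "usage": {"prompt_tokens": 12, "completion_tokens": 34, "total_tokens": 46,
    #                 "eval_count": 34, "total_duration": 1234567890},
    #       "model": "qwen3:4b-q4_K_M",
    #       "finish_reason": "stop"
    #   }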

    def get_evaluator_score(self, prompt: str, response: str, criteria: List[str]) -> Dict:
        """
        Get automated score from evaluator API

        Args:
            prompt: The original prompt sent to the model under test
            response: The model's response to evaluate
            criteria: List of evaluation criteria

        Returns:
            Dict with 'score', 'notes', and 'auto_eval_failed' keys
        """
        # Construct evaluation prompt
        criteria_text = "\n".join([f"{i+1}. {c}" for i, c in enumerate(criteria)])

        eval_system_prompt = """You are an expert AI model evaluator. Your task is to objectively assess AI model responses based on specific criteria.

Scoring Rubric:
- 0-1: FAIL - Major errors, fails to meet basic requirements, incorrect or misleading information
- 2-3: PASS - Meets requirements with minor issues, mostly correct with some imperfections
- 4-5: EXCEPTIONAL - Exceeds requirements, demonstrates deep understanding, accurate and comprehensive

CRITICAL: You MUST respond with ONLY valid JSON. Do not include any explanatory text before or after the JSON.

Response format (use ONLY this, nothing else):
{"score": <integer 0-5>, "notes": "<brief explanation>"}"""

        eval_user_prompt = f"""Evaluate this AI model response against the criteria below.

**Original Prompt:**
{prompt}

**Model Response:**
{response}

**Evaluation Criteria:**
{criteria_text}

Respond with ONLY valid JSON in this exact format:
{{"score": <integer 0-5>, "notes": "<brief explanation of the score>"}}

Do not include any text before or after the JSON object."""

        messages = [
            {"role": "system", "content": eval_system_prompt},
            {"role": "user", "content": eval_user_prompt}
        ]

        # Call evaluator API
        headers = {
            "Content-Type": "application/json"
        }

        if self.evaluator_api_key:
            headers["Authorization"] = f"Bearer {self.evaluator_api_key}"

        payload = {
            "model": self.evaluator_model,
            "messages": messages,
            "temperature": self.evaluator_temperature,
            "max_tokens": 500
        }

        eval_text = ""  # Defined up front so the JSON error handler can reference it safely

        try:
            print(f"\n🤖 Calling evaluator API for automated scoring...")
            response_obj = requests.post(
                f"{self.evaluator_endpoint}/v1/chat/completions",
                headers=headers,
                json=payload,
                timeout=90  # Increased timeout for evaluator
            )
            response_obj.raise_for_status()
            eval_response = response_obj.json()

            # Extract the evaluation
            eval_text = eval_response['choices'][0]['message']['content'].strip()

            # Try to extract JSON from the response
            # Handle case where model wraps JSON in markdown code blocks or adds extra text
            json_obj = None

            # Try direct parsing first
            try:
                json_obj = json.loads(eval_text)
            except json.JSONDecodeError:
                # Try to find JSON in markdown code blocks
                if '```json' in eval_text or '```' in eval_text:
                    # Extract content between code fences
                    import re
                    json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', eval_text, re.DOTALL)
                    if json_match:
                        json_obj = json.loads(json_match.group(1))

                # Try to find any JSON object in the text
                if json_obj is None:
                    import re
                    json_match = re.search(r'\{[^{}]*"score"[^{}]*"notes"[^{}]*\}', eval_text, re.DOTALL)
                    if json_match:
                        json_obj = json.loads(json_match.group(0))

                if json_obj is None:
                    raise json.JSONDecodeError("No valid JSON found", eval_text, 0)

            score = int(json_obj.get('score', 0))
            notes = json_obj.get('notes', 'Automated evaluation')

            # Validate score range
            if not 0 <= score <= 5:
                print(f"⚠️ Evaluator returned invalid score {score}, clamping to 0-5")
                score = max(0, min(5, score))

            print(f" Score: {score}/5")
            print(f" Notes: {notes}")

            return {"score": score, "notes": f"[Auto-Review by {self.evaluator_model}] {notes}", "auto_eval_failed": False}

        except json.JSONDecodeError as e:
            print(f"⚠️ Failed to parse evaluator response as JSON: {e}")
            print(f" Raw response: {eval_text[:200]}")
            print(f" Marking for manual review")
            return {"score": None, "notes": "[AUTO-ERROR] Evaluator returned non-JSON response", "auto_eval_failed": True}
        except (KeyError, IndexError, ValueError) as e:
            print(f"⚠️ Invalid evaluator response format: {e}")
            print(f" Marking for manual review")
            return {"score": None, "notes": f"[AUTO-ERROR] Invalid response format: {str(e)}", "auto_eval_failed": True}
        except requests.exceptions.Timeout as e:
            print(f"⚠️ Evaluator API timeout: Request exceeded 90 seconds")
            print(f" The evaluator model is taking too long to respond.")
            print(f" Marking for manual review")
            return {"score": None, "notes": "[AUTO-ERROR] Evaluator API timeout", "auto_eval_failed": True}
        except requests.exceptions.ConnectionError as e:
            print(f"⚠️ Evaluator connection error: {e}")
            print(f" Could not connect to evaluator endpoint.")
            print(f" Marking for manual review")
            return {"score": None, "notes": "[AUTO-ERROR] Cannot connect to evaluator", "auto_eval_failed": True}
        except requests.exceptions.RequestException as e:
            print(f"⚠️ Evaluator API error: {e}")
            print(f" Marking for manual review")
            return {"score": None, "notes": f"[AUTO-ERROR] API call failed: {str(e)[:100]}", "auto_eval_failed": True}

    def run_single_turn_test(self, test: Dict, category: str) -> Dict:
        """Run a single-turn test"""
        self.display_test_info(test, category)
        self.display_prompt(test['prompt'])

        # Prepare messages
        messages = [{"role": "user", "content": test['prompt']}]

        # Call API and measure time
        start_time = time.time()
        response = self.call_api(messages)
        generation_time = time.time() - start_time

        if response is None:
            return {
                "test_id": test['id'],
                "test_name": test['name'],
                "category": category,
                "type": "single_turn",
                "status": "api_error",
                "score": None,
                "notes": "API call failed",
                "generation_time": generation_time
            }

        # Extract response text with better error handling
        try:
            message = response['choices'][0]['message']
            # Try to get content, if empty check for reasoning_content
            response_text = message.get('content', '')

            # If content is empty but reasoning_content exists, use that
            if not response_text and 'reasoning_content' in message:
                response_text = message['reasoning_content']
                print("\n⚠️ Note: Response contained only reasoning_content, no actual content generated")

            # If still empty, check for tool_calls (model might be trying to call functions)
            if not response_text and 'tool_calls' in message:
                print("\n⚠️ Warning: Model attempted to call tools instead of generating content")
                tool_info = json.dumps(message['tool_calls'], indent=2)
                response_text = f"[MODEL ERROR: Attempted tool calls instead of text response]\n{tool_info}"

            # If completely empty, this is an error
            if not response_text:
                print("\n⚠️ ERROR: Model returned completely empty response")
                response_text = "[ERROR: Empty response from model]"

        except (KeyError, IndexError, TypeError) as e:
            print(f"\nERROR: Failed to parse response - {e}")
            print("\nRAW API RESPONSE:")
            print("="*80)
            print(json.dumps(response, indent=2))
            print("="*80)

            response_text = f"[PARSING ERROR: {e}]"

        self.display_response(response_text, raw_response=response, generation_time=generation_time)

        # Display evaluation criteria
        self.display_evaluation_criteria(test.get('evaluation_criteria', []))

        # Get evaluation (interactive or automated)
        # Skip automated evaluation if response is an error
        if self.non_interactive:
            # Check if response is actually an error
            if response_text.startswith('[ERROR:') or response_text.startswith('[MODEL ERROR:') or response_text.startswith('[PARSING ERROR:'):
                print(f"\n⚠️ Skipping automated evaluation due to model error")
                evaluation = {"score": 0, "notes": "[AUTO-SKIP] Model failed to generate valid response", "auto_eval_failed": False}
            else:
                evaluation = self.get_evaluator_score(
                    prompt=test['prompt'],
                    response=response_text,
                    criteria=test.get('evaluation_criteria', [])
                )
                # Track failed evaluations for manual review
                if evaluation.get('auto_eval_failed', False):
                    self.failed_evaluations.append({
                        'test_id': test['id'],
                        'test_name': test['name'],
                        'category': category,
                        'type': 'single_turn',
                        'prompt': test['prompt'],
                        'response': response_text,
                        'criteria': test.get('evaluation_criteria', []),
                        'error': evaluation['notes']
                    })
        else:
            evaluation = self.get_user_score()

        # Extract API metrics
        api_metrics = self.extract_api_metrics(response)

        result = {
            "test_id": test['id'],
            "test_name": test['name'],
            "category": category,
            "type": "single_turn",
            "difficulty": test.get('expected_difficulty', 'unknown'),
            "prompt": test['prompt'],
            "response": response_text,
            "raw_response": response if response_text.startswith("[PARSING ERROR") else None,
            "evaluation_criteria": test.get('evaluation_criteria', []),
            "score": evaluation['score'],
            "notes": evaluation['notes'],
            "status": "completed" if evaluation['score'] is not None else "skipped",
            "timestamp": datetime.now().isoformat(),
            "generation_time": generation_time
        }

        # Add metrics if available
        if api_metrics:
            result['api_metrics'] = api_metrics

        return result
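
    # Illustrative single-turn result record as appended to results["test_results"]
    # (trimmed; values are examples only):
    #
    #   {
    #       "test_id": "fs-001", "test_name": "...", "category": "...",
    #       "type": "single_turn", "score": 4, "notes": "[Auto-Review by qwen3:14b] ...",
    #       "status": "completed", "generation_time": 12.3,
    #       "api_metrics": {"usage": {"prompt_tokens": 12, "completion_tokens": 34}}
    #   }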

    def run_multi_turn_test(self, test: Dict, category: str) -> Dict:
        """Run a multi-turn test"""
        self.display_test_info(test, category)

        # Initialize conversation history
        self.conversation_history = []
        turn_results = []

        for i, turn_data in enumerate(test['turns'], 1):
            turn_num = turn_data['turn']
            prompt = turn_data['prompt']

            self.display_prompt(prompt, turn_num)

            # Add to conversation history
            self.conversation_history.append({"role": "user", "content": prompt})

            # Call API with full conversation history and measure time
            start_time = time.time()
            response = self.call_api(self.conversation_history)
            generation_time = time.time() - start_time

            if response is None:
                turn_results.append({
                    "turn": turn_num,
                    "status": "api_error",
                    "prompt": prompt,
                    "response": None,
                    "score": None,
                    "notes": "API error - failed to get response",
                    "generation_time": generation_time
                })
                break

            # Extract and display response with better error handling
            try:
                message = response['choices'][0]['message']
                # Try to get content, if empty check for reasoning_content
                response_text = message.get('content', '')

                # If content is empty but reasoning_content exists, use that
                if not response_text and 'reasoning_content' in message:
                    response_text = message['reasoning_content']
                    print("\n⚠️ Note: Response contained only reasoning_content, no actual content generated")

                # If still empty, check for tool_calls
                if not response_text and 'tool_calls' in message:
                    print("\n⚠️ Warning: Model attempted to call tools instead of generating content")
                    tool_info = json.dumps(message['tool_calls'], indent=2)
                    response_text = f"[MODEL ERROR: Attempted tool calls instead of text response]\n{tool_info}"

                # If completely empty, this is an error
                if not response_text:
                    print("\n⚠️ ERROR: Model returned completely empty response")
                    response_text = "[ERROR: Empty response from model]"

            except (KeyError, IndexError, TypeError) as e:
                print(f"\nERROR: Failed to parse response - {e}")
                print("\nRAW API RESPONSE:")
                print("="*80)
                print(json.dumps(response, indent=2))
                print("="*80)

                response_text = f"[PARSING ERROR: {e}]"

            self.display_response(response_text, raw_response=response, generation_time=generation_time)

            # Add assistant response to history
            self.conversation_history.append({"role": "assistant", "content": response_text})

            # Display criteria for this turn
            self.display_evaluation_criteria(turn_data.get('evaluation_criteria', []))

            # Get evaluation for this turn (interactive or automated)
            print(f"\nEvaluate Turn {turn_num}:")
            if self.non_interactive:
                # Skip automated evaluation if response is an error
                if response_text.startswith('[ERROR:') or response_text.startswith('[MODEL ERROR:') or response_text.startswith('[PARSING ERROR:'):
                    print(f"\n⚠️ Skipping automated evaluation due to model error")
                    evaluation = {"score": 0, "notes": "[AUTO-SKIP] Model failed to generate valid response", "auto_eval_failed": False}
                else:
                    evaluation = self.get_evaluator_score(
                        prompt=prompt,
                        response=response_text,
                        criteria=turn_data.get('evaluation_criteria', [])
                    )
                    # Track failed evaluations for manual review
                    if evaluation.get('auto_eval_failed', False):
                        self.failed_evaluations.append({
                            'test_id': test['id'],
                            'test_name': test['name'],
                            'category': category,
                            'type': 'multi_turn',
                            'turn': turn_num,
                            'prompt': prompt,
                            'response': response_text,
                            'criteria': turn_data.get('evaluation_criteria', []),
                            'error': evaluation['notes']
                        })
            else:
                evaluation = self.get_user_score()

            # Extract API metrics for this turn
            api_metrics = self.extract_api_metrics(response)

            turn_result = {
                "turn": turn_num,
                "prompt": prompt,
                "response": response_text,
                "evaluation_criteria": turn_data.get('evaluation_criteria', []),
                "score": evaluation['score'],
                "notes": evaluation['notes'],
                "status": "completed" if evaluation['score'] is not None else "skipped",
                "generation_time": generation_time
            }

            # Add metrics if available
            if api_metrics:
                turn_result['api_metrics'] = api_metrics

            turn_results.append(turn_result)

            if evaluation['score'] is None:
                print(f"\nTurn {turn_num} skipped, stopping multi-turn test")
                break

        # Calculate overall score for multi-turn test
        valid_scores = [t['score'] for t in turn_results if t['score'] is not None]
        overall_score = sum(valid_scores) / len(valid_scores) if valid_scores else None

        # Aggregate metrics across all turns
        aggregate_metrics = {}
        turn_metrics = [t.get('api_metrics') for t in turn_results if t.get('api_metrics')]

        if turn_metrics:
            # Sum up token counts and durations
            total_prompt_tokens = sum(m.get('usage', {}).get('prompt_tokens', 0) for m in turn_metrics)
            total_completion_tokens = sum(m.get('usage', {}).get('completion_tokens', 0) for m in turn_metrics)
            total_duration = sum(m.get('usage', {}).get('total_duration', 0) for m in turn_metrics)

            aggregate_metrics['usage'] = {
                'total_prompt_tokens': total_prompt_tokens if total_prompt_tokens else None,
                'total_completion_tokens': total_completion_tokens if total_completion_tokens else None,
                'total_tokens': (total_prompt_tokens + total_completion_tokens) if (total_prompt_tokens or total_completion_tokens) else None,
                'total_duration': total_duration if total_duration else None,
                'turn_count': len(turn_metrics)
            }
            # Remove None values
            aggregate_metrics['usage'] = {k: v for k, v in aggregate_metrics['usage'].items() if v is not None}

        result = {
            "test_id": test['id'],
            "test_name": test['name'],
            "category": category,
            "type": "multi_turn",
            "difficulty": test.get('expected_difficulty', 'unknown'),
            "turns": turn_results,
            "overall_score": overall_score,
            "status": "completed" if overall_score is not None else "incomplete",
            "timestamp": datetime.now().isoformat()
        }

        # Add aggregate metrics if available
        if aggregate_metrics:
            result['aggregate_metrics'] = aggregate_metrics

        return result

    def manual_review_failed_evaluations(self):
        """Present failed automated evaluations for manual review"""
        print("\n\n" + "="*80)
        print("⚠️ MANUAL REVIEW REQUIRED")
        print("="*80)
        print(f"\n{len(self.failed_evaluations)} test(s) could not be automatically evaluated.")
        print("Please provide manual scores for these tests.\n")

        for idx, failed in enumerate(self.failed_evaluations, 1):
            print("\n" + "="*80)
            print(f"📋 MANUAL REVIEW {idx}/{len(self.failed_evaluations)}")
            print("="*80)
            print(f"🆔 Test ID: {failed['test_id']}")
            print(f"📝 Test Name: {failed['test_name']}")
            print(f"📂 Category: {failed['category']}")
            if failed['type'] == 'multi_turn':
                print(f"🔄 Turn: {failed['turn']}")
            print(f"❌ Auto-Eval Error: {failed['error']}")

            print(f"\n💬 PROMPT:")
            print("-"*80)
            print(failed['prompt'])
            print("-"*80)

            print(f"\nMODEL RESPONSE:")
            print("-"*80)
            print(failed['response'])
            print("-"*80)

            print(f"\n✅ EVALUATION CRITERIA:")
            for i, criterion in enumerate(failed['criteria'], 1):
                print(f" {i}. {criterion}")

            # Get manual score
            print("\n" + "="*80)
            print("📊 EVALUATION SCORING RUBRIC:")
            print(" 0-1: FAIL - Major errors, fails to meet basic requirements")
            print(" 2-3: PASS - Meets requirements with minor issues")
            print(" 4-5: EXCEPTIONAL - Exceeds requirements, demonstrates deep understanding")
            print("="*80)

            manual_evaluation = self.get_user_score()

            # Update the corresponding test result
            self.update_test_result_with_manual_score(
                failed['test_id'],
                failed.get('turn'),
                manual_evaluation
            )

        # Save updated results
        self.save_results()
        print("\n✅ All manual reviews completed and saved!")

    def update_test_result_with_manual_score(self, test_id: str, turn: Optional[int], evaluation: Dict):
        """Update a test result with manually provided score"""
        for result in self.results['test_results']:
            if result['test_id'] == test_id:
                if turn is None:
                    # Single-turn test
                    result['score'] = evaluation['score']
                    # Only add MANUAL-OVERRIDE prefix if a score was actually provided
                    if evaluation['score'] is not None:
                        result['notes'] = f"[MANUAL-OVERRIDE] {evaluation['notes']}" if evaluation['notes'] else "[MANUAL-OVERRIDE]"
                        result['status'] = 'completed'
                    else:
                        result['notes'] = evaluation['notes'] if evaluation['notes'] else "Manual review skipped"
                        result['status'] = 'manual_review_skipped'
                else:
                    # Multi-turn test - update specific turn
                    for turn_result in result.get('turns', []):
                        if turn_result['turn'] == turn:
                            turn_result['score'] = evaluation['score']
                            # Only add MANUAL-OVERRIDE prefix if a score was actually provided
                            if evaluation['score'] is not None:
                                turn_result['notes'] = f"[MANUAL-OVERRIDE] {evaluation['notes']}" if evaluation['notes'] else "[MANUAL-OVERRIDE]"
                                turn_result['status'] = 'completed'
                            else:
                                turn_result['notes'] = evaluation['notes'] if evaluation['notes'] else "Manual review skipped"
                                turn_result['status'] = 'manual_review_skipped'

                    # Recalculate overall score
                    valid_scores = [t['score'] for t in result['turns'] if t['score'] is not None]
                    result['overall_score'] = sum(valid_scores) / len(valid_scores) if valid_scores else None
                    result['status'] = 'completed' if result['overall_score'] is not None else 'incomplete'
                break

    def run_test_suite(self, test_suite: Dict, filter_category: Optional[str] = None):
        """Run the complete test suite"""
        print("\n" + "="*80)
        print(f"🚀 STARTING TEST SUITE")
        print(f"📦 Model: {self.model_name}")
        print(f"🔗 Endpoint: {self.endpoint}")
        print("="*80)

        # Count total tests
        total_tests = 0
        for cat_data in test_suite.get('test_categories', []):
            if filter_category and cat_data['category'] != filter_category:
                continue
            total_tests += len(cat_data.get('tests', []))

        self.results['metadata']['total_tests'] = total_tests

        # Run tests by category
        test_count = 0
        for cat_data in test_suite.get('test_categories', []):
            category = cat_data['category']

            # Apply category filter if specified
            if filter_category and category != filter_category:
                continue

            print(f"\n\n{'='*80}")
            print(f"📂 CATEGORY: {category}")
            print(f"{'='*80}")

            for test in cat_data.get('tests', []):
                test_count += 1
                print(f"\n📊 Progress: {test_count}/{total_tests}")

                # Run appropriate test type
                if test.get('type') == 'single_turn':
                    result = self.run_single_turn_test(test, category)
                elif test.get('type') == 'multi_turn':
                    result = self.run_multi_turn_test(test, category)
                else:
                    print(f"⚠️ Unknown test type: {test.get('type')}")
                    continue

                self.results['test_results'].append(result)
                self.results['metadata']['completed_tests'] += 1

                # Save after each test (in case of interruption)
                self.save_results()

        # Mark test suite as complete
        self.results['metadata']['test_end'] = datetime.now().isoformat()
        self.save_results()

        # Handle failed evaluations if in non-interactive mode
        if self.non_interactive and self.failed_evaluations:
            self.manual_review_failed_evaluations()

        print("\n\n" + "="*80)
        print("✅ TEST SUITE COMPLETE")
        if self.non_interactive and self.failed_evaluations:
            print(f" ({len(self.failed_evaluations)} test(s) manually reviewed)")
        print("="*80)
        self.display_summary()

    def save_results(self):
        """Save results to JSON file"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"{self.model_name.replace(':', '_')}_{timestamp}.json"
        filepath = self.output_dir / filename

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(self.results, f, indent=2, ensure_ascii=False)

        # Also save as "latest" for this model
        latest_file = self.output_dir / f"{self.model_name.replace(':', '_')}_latest.json"
        with open(latest_file, 'w', encoding='utf-8') as f:
            json.dump(self.results, f, indent=2, ensure_ascii=False)
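
    # Each call writes two files under output_dir, e.g. for model name "qwen3:4b-q4_K_M":
    #   qwen3_4b-q4_K_M_20240101_120000.json   (timestamped snapshot; timestamp illustrative)
    #   qwen3_4b-q4_K_M_latest.json            (always the most recent state)
    # Because save_results() runs after every test, a full suite leaves a trail of
    # timestamped snapshots alongside the rolling "latest" file.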

    def display_summary(self):
        """Display test summary"""
        total = self.results['metadata']['total_tests']
        completed = self.results['metadata']['completed_tests']

        # Calculate statistics (0 is a valid score, so test explicitly for None rather than
        # relying on truthiness)
        scores = [r.get('score') if r.get('score') is not None else r.get('overall_score')
                  for r in self.results['test_results']]
        scores = [s for s in scores if s is not None]

        if scores:
            avg_score = sum(scores) / len(scores)
            print(f"\n📊 SUMMARY:")
            print(f" Total Tests: {total}")
            print(f" Completed: {completed}")
            print(f" Average Score: {avg_score:.2f}/5.00")
            print(f" Pass Rate: {len([s for s in scores if s >= 2]) / len(scores) * 100:.1f}%")
            print(f" Exceptional Rate: {len([s for s in scores if s >= 4]) / len(scores) * 100:.1f}%")

        # Calculate aggregate API metrics
        total_prompt_tokens = 0
        total_completion_tokens = 0
        total_duration = 0
        tests_with_metrics = 0

        for result in self.results['test_results']:
            # Single-turn tests
            if result.get('api_metrics'):
                usage = result['api_metrics'].get('usage', {})
                total_prompt_tokens += usage.get('prompt_tokens', 0)
                total_completion_tokens += usage.get('completion_tokens', 0)
                total_duration += usage.get('total_duration', 0)
                tests_with_metrics += 1
            # Multi-turn tests
            elif result.get('aggregate_metrics'):
                usage = result['aggregate_metrics'].get('usage', {})
                total_prompt_tokens += usage.get('total_prompt_tokens', 0)
                total_completion_tokens += usage.get('total_completion_tokens', 0)
                total_duration += usage.get('total_duration', 0)
                tests_with_metrics += 1

        if tests_with_metrics > 0:
            print(f"\n⚡ API METRICS:")
            print(f" Total Prompt Tokens: {total_prompt_tokens:,}")
            print(f" Total Completion Tokens: {total_completion_tokens:,}")
            print(f" Total Tokens: {total_prompt_tokens + total_completion_tokens:,}")
            if total_duration > 0:
                # Convert nanoseconds to seconds
                duration_seconds = total_duration / 1_000_000_000
                print(f" Total Duration: {duration_seconds:.2f}s")
                if total_completion_tokens > 0:
                    tokens_per_second = total_completion_tokens / duration_seconds
                    print(f" Average Speed: {tokens_per_second:.2f} tokens/s")

        print(f"\n💾 Results saved to: {self.output_dir}")


def main():
    # Load environment variables from .env file if it exists
    load_dotenv()

    parser = argparse.ArgumentParser(
        description="AI Model Evaluation Test Suite",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Test a single model (interactive mode)
  python ai_eval.py --endpoint http://localhost:11434 --model qwen3:4b-q4_K_M

  # Test with API key
  python ai_eval.py --endpoint https://api.example.com --api-key sk-xxx --model qwen3:8b

  # Non-interactive mode with evaluator
  python ai_eval.py --non-interactive --evaluator-endpoint http://localhost:11434 --evaluator-model qwen3:14b

  # Use .env file for configuration (recommended)
  cp .env.example .env
  # Edit .env with your settings
  python ai_eval.py

  # Test only forensics category
  python ai_eval.py --endpoint http://localhost:11434 --model qwen3:14b --category "IT Forensics - File Systems"
"""
    )

    parser.add_argument(
        '--endpoint',
        default=os.getenv('MUT_ENDPOINT'),
        help='OpenAI-compatible API endpoint for model under test (default: from .env MUT_ENDPOINT)'
    )

    parser.add_argument(
        '--api-key',
        default=os.getenv('MUT_API_KEY', ''),
        help='API key for model under test (default: from .env MUT_API_KEY)'
    )

    parser.add_argument(
        '--model',
        default=os.getenv('MUT_MODEL'),
        help='Model name/identifier to test (default: from .env MUT_MODEL)'
    )

    parser.add_argument(
        '--test-suite',
        default=os.getenv('TEST_SUITE', 'test_suite.yaml'),
        help='Path to test suite YAML file (default: from .env TEST_SUITE or test_suite.yaml)'
    )

    parser.add_argument(
        '--output-dir',
        default=os.getenv('OUTPUT_DIR', 'results'),
        help='Directory to save results (default: from .env OUTPUT_DIR or results)'
    )

    parser.add_argument(
        '--category',
        default=os.getenv('FILTER_CATEGORY'),
        help='Filter tests by category (default: from .env FILTER_CATEGORY)'
    )

    parser.add_argument(
        '--non-interactive',
        action='store_true',
        default=os.getenv('NON_INTERACTIVE', '').lower() in ('true', '1', 'yes'),
        help='Run in non-interactive mode with automated evaluation (default: from .env NON_INTERACTIVE)'
    )

    parser.add_argument(
        '--evaluator-endpoint',
        default=os.getenv('EVALUATOR_ENDPOINT'),
        help='API endpoint for evaluator model (required for non-interactive mode, default: from .env EVALUATOR_ENDPOINT)'
    )

    parser.add_argument(
        '--evaluator-api-key',
        default=os.getenv('EVALUATOR_API_KEY', ''),
        help='API key for evaluator (default: from .env EVALUATOR_API_KEY)'
    )

    parser.add_argument(
        '--evaluator-model',
        default=os.getenv('EVALUATOR_MODEL'),
        help='Model name for evaluator (required for non-interactive mode, default: from .env EVALUATOR_MODEL)'
    )

    parser.add_argument(
        '--evaluator-temperature',
        type=float,
        default=float(os.getenv('EVALUATOR_TEMPERATURE', '0.3')),
        help='Temperature for evaluator model (default: from .env EVALUATOR_TEMPERATURE or 0.3)'
    )

    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Test API connectivity without running the full test suite'
    )

    args = parser.parse_args()

    # Validate required arguments
    if not args.endpoint:
        print("❌ Error: --endpoint is required (or set MUT_ENDPOINT in .env)")
        print(" Example: --endpoint http://localhost:11434")
        sys.exit(1)

    if not args.model:
        print("❌ Error: --model is required (or set MUT_MODEL in .env)")
        print(" Example: --model qwen3:4b-q4_K_M")
        sys.exit(1)

    if args.non_interactive:
        if not args.evaluator_endpoint:
            print("❌ Error: --evaluator-endpoint is required for non-interactive mode")
            print(" (or set EVALUATOR_ENDPOINT in .env)")
            sys.exit(1)
        if not args.evaluator_model:
            print("❌ Error: --evaluator-model is required for non-interactive mode")
            print(" (or set EVALUATOR_MODEL in .env)")
            sys.exit(1)

    # Parse model list (supports comma-separated models)
    model_list = [m.strip() for m in args.model.split(',') if m.strip()]

    # Dry run mode - just test connections
    if args.dry_run:
        print(f"\n{'='*80}")
        print("🧪 DRY RUN MODE - Testing API Connectivity")
        print(f"{'='*80}")

        all_success = True

        # Test MUT endpoint for each model
        for idx, model_name in enumerate(model_list, 1):
            if len(model_list) > 1:
                print(f"\n--- Model {idx}/{len(model_list)} ---")

            tester = AIModelTester(
                endpoint=args.endpoint,
                api_key=args.api_key,
                model_name=model_name,
                output_dir=args.output_dir,
                non_interactive=args.non_interactive,
                evaluator_endpoint=args.evaluator_endpoint,
                evaluator_api_key=args.evaluator_api_key,
                evaluator_model=args.evaluator_model,
                evaluator_temperature=args.evaluator_temperature
            )

            success = tester.test_connection(
                endpoint=args.endpoint,
                api_key=args.api_key,
                model=model_name,
                endpoint_name="Model Under Test"
            )
            all_success = all_success and success

        # Test evaluator endpoint if non-interactive mode
        if args.non_interactive and args.evaluator_endpoint and args.evaluator_model:
            print(f"\n{'='*80}")
            tester = AIModelTester(
                endpoint=args.endpoint,
                api_key=args.api_key,
                model_name=model_list[0],
                output_dir=args.output_dir,
                non_interactive=args.non_interactive,
                evaluator_endpoint=args.evaluator_endpoint,
                evaluator_api_key=args.evaluator_api_key,
                evaluator_model=args.evaluator_model,
                evaluator_temperature=args.evaluator_temperature
            )

            success = tester.test_connection(
                endpoint=args.evaluator_endpoint,
                api_key=args.evaluator_api_key,
                model=args.evaluator_model,
                endpoint_name="Evaluator"
            )
            all_success = all_success and success

        print(f"\n{'='*80}")
        if all_success:
            print("✅ All connectivity tests passed")
            print(f"{'='*80}")
            sys.exit(0)
        else:
            print("❌ Some connectivity tests failed")
            print(f"{'='*80}")
            sys.exit(1)

    if len(model_list) > 1:
        print(f"\n🔄 Batch mode: Testing {len(model_list)} models")
        print("=" * 80)

    # Test each model
    for idx, model_name in enumerate(model_list, 1):
        if len(model_list) > 1:
            print(f"\n{'='*80}")
            print(f"📊 Model {idx}/{len(model_list)}: {model_name}")
            print(f"{'='*80}\n")

        # Initialize tester
        tester = AIModelTester(
            endpoint=args.endpoint,
            api_key=args.api_key,
            model_name=model_name,
            output_dir=args.output_dir,
            non_interactive=args.non_interactive,
            evaluator_endpoint=args.evaluator_endpoint,
            evaluator_api_key=args.evaluator_api_key,
            evaluator_model=args.evaluator_model,
            evaluator_temperature=args.evaluator_temperature
        )

        # Load test suite
        if idx == 1 or len(model_list) == 1:
            print(f"📁 Loading test suite from: {args.test_suite}")
            test_suite = tester.load_test_suite(args.test_suite)

        # Run tests
        try:
            tester.run_test_suite(test_suite, filter_category=args.category)
        except KeyboardInterrupt:
            print("\n\n⚠️ Test suite interrupted by user")
            tester.results['metadata']['test_end'] = datetime.now().isoformat()
            tester.save_results()
            print(f"\n💾 Partial results saved to: {tester.output_dir}")
            if len(model_list) > 1 and idx < len(model_list):
                print(f"\n⚠️ Skipping remaining {len(model_list) - idx} models")
            sys.exit(1)

    if len(model_list) > 1:
        print(f"\n{'='*80}")
        print(f"✅ BATCH COMPLETE: Tested {len(model_list)} models")
        print(f"{'='*80}")
        print(f"\n💾 Results saved to: {args.output_dir}/")
        print("\nTo compare results, run:")
        print("  python analyze_results.py --compare")


if __name__ == "__main__":
    main()