#!/usr/bin/env python3 """ AI Model Evaluation Automation Script Runs comprehensive test suite against OpenAI-compatible API endpoints """ import yaml import json import requests import os import sys import time from datetime import datetime from typing import Dict, List, Any, Optional from pathlib import Path import argparse from dotenv import load_dotenv class AIModelTester: def __init__(self, endpoint: str, api_key: str, model_name: str, output_dir: str = "results", non_interactive: bool = False, evaluator_endpoint: Optional[str] = None, evaluator_api_key: Optional[str] = None, evaluator_model: Optional[str] = None, evaluator_temperature: float = 0.3): """ Initialize the AI Model Tester Args: endpoint: OpenAI-compatible API endpoint URL api_key: API key for authentication model_name: Name/identifier of the model being tested output_dir: Directory to save results non_interactive: If True, use evaluator API for automatic scoring evaluator_endpoint: API endpoint for evaluation model evaluator_api_key: API key for evaluator evaluator_model: Model name for evaluator evaluator_temperature: Temperature for evaluator """ self.endpoint = endpoint.rstrip('/') self.api_key = api_key self.model_name = model_name self.output_dir = Path(output_dir) self.output_dir.mkdir(exist_ok=True) # Non-interactive mode settings self.non_interactive = non_interactive self.evaluator_endpoint = evaluator_endpoint.rstrip('/') if evaluator_endpoint else None self.evaluator_api_key = evaluator_api_key self.evaluator_model = evaluator_model self.evaluator_temperature = evaluator_temperature if self.non_interactive: if not all([self.evaluator_endpoint, self.evaluator_model]): raise ValueError("Non-interactive mode requires evaluator_endpoint and evaluator_model") print(f"๐Ÿค– Non-interactive mode enabled") print(f" Evaluator: {self.evaluator_model} @ {self.evaluator_endpoint}") # Results storage self.results = { "metadata": { "model_name": model_name, "endpoint": endpoint, "test_start": datetime.now().isoformat(), "test_end": None, "total_tests": 0, "completed_tests": 0 }, "test_results": [] } # Current test session info self.current_test_id = None self.conversation_history = [] # Track failed auto-evaluations for manual review self.failed_evaluations = [] def test_connection(self, endpoint: str, api_key: str, model: str, endpoint_name: str = "API") -> bool: """ Test if an API endpoint can be reached and authenticated against Args: endpoint: API endpoint URL api_key: API key for authentication model: Model name to test endpoint_name: Name for display purposes Returns: True if connection successful, False otherwise """ print(f"\n๐Ÿ” Testing {endpoint_name} connection...") print(f" Endpoint: {endpoint}") print(f" Model: {model}") headers = { "Content-Type": "application/json" } if api_key: headers["Authorization"] = f"Bearer {api_key}" print(f" API Key: {api_key[:10]}..." if len(api_key) > 10 else " API Key: [set]") else: print(f" API Key: [none]") # Simple test message payload = { "model": model, "messages": [{"role": "user", "content": "test"}], "max_tokens": 1 } try: response = requests.post( f"{endpoint}/v1/chat/completions", headers=headers, json=payload, timeout=60 ) response.raise_for_status() print(f" โœ… {endpoint_name} connection successful") return True except requests.exceptions.HTTPError as e: print(f" โŒ {endpoint_name} HTTP error: {e.response.status_code}") print(f" {e.response.text[:200]}") return False except requests.exceptions.ConnectionError as e: print(f" โŒ {endpoint_name} connection failed: Cannot reach endpoint") return False except requests.exceptions.Timeout: print(f" โŒ {endpoint_name} connection timeout") return False except requests.exceptions.RequestException as e: print(f" โŒ {endpoint_name} error: {e}") return False def load_test_suite(self, yaml_file: str) -> Dict: """Load test suite from YAML file""" try: with open(yaml_file, 'r', encoding='utf-8') as f: return yaml.safe_load(f) except FileNotFoundError: print(f"Error: Test suite file not found: {yaml_file}") print(f"Please ensure {yaml_file} is in the current directory.") sys.exit(1) except yaml.YAMLError as e: print(f"Error: Invalid YAML format in {yaml_file}") print(f"Details: {e}") sys.exit(1) except Exception as e: print(f"Error loading test suite: {e}") sys.exit(1) def call_api(self, messages: List[Dict], temperature: float = 0.7, max_tokens: int = 2000) -> Optional[Dict]: """ Call the OpenAI-compatible API Args: messages: List of message dicts with 'role' and 'content' temperature: Sampling temperature max_tokens: Maximum tokens in response Returns: API response dict or None if error """ headers = { "Content-Type": "application/json" } # Only add Authorization header if API key is provided if self.api_key: headers["Authorization"] = f"Bearer {self.api_key}" payload = { "model": self.model_name, "messages": messages, "temperature": temperature, "max_tokens": max_tokens } try: print(f"\nDEBUG: Calling API endpoint: {self.endpoint}/v1/chat/completions") print(f"DEBUG: Model name: {self.model_name}") response = requests.post( f"{self.endpoint}/v1/chat/completions", headers=headers, json=payload, timeout=240 ) response.raise_for_status() return response.json() except requests.exceptions.Timeout as e: print(f"\nโš ๏ธ API Timeout Error: Request exceeded 120 seconds") print(f" This usually means the model is taking too long to generate a response.") print(f" Consider using a faster model or reducing the complexity of the prompt.") return None except requests.exceptions.HTTPError as e: print(f"\nโš ๏ธ API HTTP Error: {e}") print(f" Status Code: {e.response.status_code}") print(f" Response: {e.response.text[:500]}") # Show request details for debugging print("\n REQUEST DETAILS:") print(f" URL: {self.endpoint}/v1/chat/completions") print(f" Model: {self.model_name}") return None except requests.exceptions.ConnectionError as e: print(f"\nโš ๏ธ API Connection Error: {e}") print(f" Could not connect to {self.endpoint}") print(f" Please check your network connection and endpoint URL.") return None except requests.exceptions.RequestException as e: print(f"\nโš ๏ธ API Request Error: {e}") if hasattr(e, 'response') and e.response is not None: print(f" Response: {e.response.text[:500]}") return None def display_test_info(self, test: Dict, category: str): """Display test information to user""" print("\n" + "="*80) print(f"๐Ÿ“‹ CATEGORY: {category}") print(f"๐Ÿ†” Test ID: {test['id']}") print(f"๐Ÿ“ Test Name: {test['name']}") print(f"๐ŸŽฏ Type: {test['type']}") print(f"โšก Difficulty: {test.get('expected_difficulty', 'N/A')}") print("="*80) def display_prompt(self, prompt: str, turn: Optional[int] = None): """Display the prompt being sent""" if turn is not None: print(f"\n๐Ÿ”„ TURN {turn}:") else: print(f"\n๐Ÿ’ฌ PROMPT:") print("-"*80) print(prompt) print("-"*80) def display_response(self, response_text: str, raw_response: Optional[Dict] = None, generation_time: Optional[float] = None): """Display the model's response with timing and token usage metrics""" print("\nRAW API RESPONSE:") print("="*80) import json print(json.dumps(raw_response, indent=2)) print("="*80) # Display timing and token metrics if available if generation_time is not None: print(f"\nโฑ๏ธ Generation Time: {generation_time:.2f}s") if raw_response and 'usage' in raw_response: usage = raw_response['usage'] print(f"\n๐Ÿ“Š Token Usage:") if 'prompt_tokens' in usage: print(f" Prompt Tokens: {usage['prompt_tokens']}") if 'completion_tokens' in usage: print(f" Completion Tokens: {usage['completion_tokens']}") if 'total_tokens' in usage: print(f" Total Tokens: {usage['total_tokens']}") # Calculate tokens per second if we have timing if generation_time and generation_time > 0 and 'completion_tokens' in usage: tokens_per_sec = usage['completion_tokens'] / generation_time print(f" Speed: {tokens_per_sec:.2f} tokens/sec") print(f"\n\nMODEL RESPONSE:") print("-"*80) print(response_text) print("-"*80) def display_evaluation_criteria(self, criteria: List[str]): """Display evaluation criteria for the test""" print(f"\nโœ… EVALUATION CRITERIA:") for i, criterion in enumerate(criteria, 1): print(f" {i}. {criterion}") def get_user_score(self) -> Dict: """Prompt user for evaluation score""" print("\n" + "="*80) print("๐Ÿ“Š EVALUATION SCORING RUBRIC:") print(" 0-1: FAIL - Major errors, fails to meet basic requirements") print(" 2-3: PASS - Meets requirements with minor issues") print(" 4-5: EXCEPTIONAL - Exceeds requirements, demonstrates deep understanding") print("="*80) while True: try: score_input = input("\n๐Ÿ‘‰ Enter score (0-5) or 'skip' to skip this test: ").strip().lower() if score_input == 'skip': return {"score": None, "notes": "Skipped by user"} score = int(score_input) if 0 <= score <= 5: notes = input("๐Ÿ“ Notes (optional, press Enter to skip): ").strip() return {"score": score, "notes": notes if notes else ""} else: print("โŒ Score must be between 0 and 5") except ValueError: print("โŒ Invalid input. Please enter a number between 0 and 5, or 'skip'") except KeyboardInterrupt: print("\n\nโš ๏ธ Test interrupted by user") return {"score": None, "notes": "Interrupted"} def extract_api_metrics(self, response: Dict) -> Optional[Dict]: """ Extract all available metrics from API response Args: response: The API response dict Returns: Dict with usage statistics and timing information, or None if no metrics available """ metrics = {} if response and isinstance(response, dict): # Extract usage statistics if available usage = response.get('usage', {}) if usage: metrics['usage'] = { 'prompt_tokens': usage.get('prompt_tokens'), 'completion_tokens': usage.get('completion_tokens'), 'total_tokens': usage.get('total_tokens'), 'prompt_eval_count': usage.get('prompt_eval_count'), 'eval_count': usage.get('eval_count'), 'prompt_eval_duration': usage.get('prompt_eval_duration'), 'eval_duration': usage.get('eval_duration'), 'load_duration': usage.get('load_duration'), 'total_duration': usage.get('total_duration'), 'response_token_s': usage.get('response_token/s'), 'prompt_token_s': usage.get('prompt_token/s'), } # Remove None values metrics['usage'] = {k: v for k, v in metrics['usage'].items() if v is not None} # Extract model info if 'model' in response: metrics['model'] = response['model'] # Extract creation timestamp if 'created' in response: metrics['created'] = response['created'] # Extract finish reason if 'choices' in response and len(response['choices']) > 0: finish_reason = response['choices'][0].get('finish_reason') if finish_reason: metrics['finish_reason'] = finish_reason return metrics if metrics else None def get_evaluator_score(self, prompt: str, response: str, criteria: List[str]) -> Dict: """ Get automated score from evaluator API Args: prompt: The original prompt sent to the model under test response: The model's response to evaluate criteria: List of evaluation criteria Returns: Dict with 'score', 'notes', and 'auto_eval_failed' keys """ # Construct evaluation prompt criteria_text = "\n".join([f"{i+1}. {c}" for i, c in enumerate(criteria)]) eval_system_prompt = """You are an expert AI model evaluator. Your task is to objectively assess AI model responses based on specific criteria. Scoring Rubric: - 0-1: FAIL - Major errors, fails to meet basic requirements, incorrect or misleading information - 2-3: PASS - Meets requirements with minor issues, mostly correct with some imperfections - 4-5: EXCEPTIONAL - Exceeds requirements, demonstrates deep understanding, accurate and comprehensive CRITICAL: You MUST respond with ONLY valid JSON. Do not include any explanatory text before or after the JSON. Response format (use ONLY this, nothing else): {"score": , "notes": ""}""" eval_user_prompt = f"""Evaluate this AI model response against the criteria below. **Original Prompt:** {prompt} **Model Response:** {response} **Evaluation Criteria:** {criteria_text} Respond with ONLY valid JSON in this exact format: {{"score": , "notes": ""}} Do not include any text before or after the JSON object.""" messages = [ {"role": "system", "content": eval_system_prompt}, {"role": "user", "content": eval_user_prompt} ] # Call evaluator API headers = { "Content-Type": "application/json" } if self.evaluator_api_key: headers["Authorization"] = f"Bearer {self.evaluator_api_key}" payload = { "model": self.evaluator_model, "messages": messages, "temperature": self.evaluator_temperature, "max_tokens": 500 } try: print(f"\n๐Ÿค– Calling evaluator API for automated scoring...") response_obj = requests.post( f"{self.evaluator_endpoint}/v1/chat/completions", headers=headers, json=payload, timeout=90 # Increased timeout for evaluator ) response_obj.raise_for_status() eval_response = response_obj.json() # Extract the evaluation eval_text = eval_response['choices'][0]['message']['content'].strip() # Try to extract JSON from the response # Handle case where model wraps JSON in markdown code blocks or adds extra text json_obj = None # Try direct parsing first try: json_obj = json.loads(eval_text) except json.JSONDecodeError: # Try to find JSON in markdown code blocks if '```json' in eval_text or '```' in eval_text: # Extract content between code fences import re json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', eval_text, re.DOTALL) if json_match: json_obj = json.loads(json_match.group(1)) # Try to find any JSON object in the text if json_obj is None: import re json_match = re.search(r'\{[^{}]*"score"[^{}]*"notes"[^{}]*\}', eval_text, re.DOTALL) if json_match: json_obj = json.loads(json_match.group(0)) if json_obj is None: raise json.JSONDecodeError("No valid JSON found", eval_text, 0) score = int(json_obj.get('score', 0)) notes = json_obj.get('notes', 'Automated evaluation') # Validate score range if not 0 <= score <= 5: print(f"โš ๏ธ Evaluator returned invalid score {score}, clamping to 0-5") score = max(0, min(5, score)) print(f" Score: {score}/5") print(f" Notes: {notes}") return {"score": score, "notes": f"[Auto-Reviev by {self.evaluator_model}] {notes}", "auto_eval_failed": False} except json.JSONDecodeError as e: print(f"โš ๏ธ Failed to parse evaluator response as JSON: {e}") print(f" Raw response: {eval_text[:200]}") print(f" Marking for manual review") return {"score": None, "notes": f"[AUTO-ERROR] Evaluator returned non-JSON response", "auto_eval_failed": True} except (KeyError, IndexError, ValueError) as e: print(f"โš ๏ธ Invalid evaluator response format: {e}") print(f" Marking for manual review") return {"score": None, "notes": f"[AUTO-ERROR] Invalid response format: {str(e)}", "auto_eval_failed": True} except requests.exceptions.Timeout as e: print(f"โš ๏ธ Evaluator API timeout: Request exceeded 90 seconds") print(f" The evaluator model is taking too long to respond.") print(f" Marking for manual review") return {"score": None, "notes": "[AUTO-ERROR] Evaluator API timeout", "auto_eval_failed": True} except requests.exceptions.ConnectionError as e: print(f"โš ๏ธ Evaluator connection error: {e}") print(f" Could not connect to evaluator endpoint.") print(f" Marking for manual review") return {"score": None, "notes": "[AUTO-ERROR] Cannot connect to evaluator", "auto_eval_failed": True} except requests.exceptions.RequestException as e: print(f"โš ๏ธ Evaluator API error: {e}") print(f" Marking for manual review") return {"score": None, "notes": f"[AUTO-ERROR] API call failed: {str(e)[:100]}", "auto_eval_failed": True} def run_single_turn_test(self, test: Dict, category: str) -> Dict: """Run a single-turn test""" self.display_test_info(test, category) self.display_prompt(test['prompt']) # Prepare messages messages = [{"role": "user", "content": test['prompt']}] # Call API and measure time start_time = time.time() response = self.call_api(messages) generation_time = time.time() - start_time if response is None: return { "test_id": test['id'], "test_name": test['name'], "category": category, "type": "single_turn", "status": "api_error", "score": None, "notes": "API call failed", "generation_time": generation_time } # Extract response text with better error handling try: message = response['choices'][0]['message'] # Try to get content, if empty check for reasoning_content response_text = message.get('content', '') # If content is empty but reasoning_content exists, use that if not response_text and 'reasoning_content' in message: response_text = message['reasoning_content'] print("\nโš ๏ธ Note: Response contained only reasoning_content, no actual content generated") # If still empty, check for tool_calls (model might be trying to call functions) if not response_text and 'tool_calls' in message: print("\nโš ๏ธ Warning: Model attempted to call tools instead of generating content") tool_info = json.dumps(message['tool_calls'], indent=2) response_text = f"[MODEL ERROR: Attempted tool calls instead of text response]\n{tool_info}" # If completely empty, this is an error if not response_text: print("\nโš ๏ธ ERROR: Model returned completely empty response") response_text = "[ERROR: Empty response from model]" except (KeyError, IndexError, TypeError) as e: print(f"\nERROR: Failed to parse response - {e}") print("\nRAW API RESPONSE:") print("="*80) print(json.dumps(response, indent=2)) print("="*80) response_text = f"[PARSING ERROR: {e}]" self.display_response(response_text, raw_response=response, generation_time=generation_time) # Display evaluation criteria self.display_evaluation_criteria(test.get('evaluation_criteria', [])) # Get evaluation (interactive or automated) # Skip automated evaluation if response is an error if self.non_interactive: # Check if response is actually an error if response_text.startswith('[ERROR:') or response_text.startswith('[MODEL ERROR:') or response_text.startswith('[PARSING ERROR:'): print(f"\nโš ๏ธ Skipping automated evaluation due to model error") evaluation = {"score": 0, "notes": "[AUTO-SKIP] Model failed to generate valid response", "auto_eval_failed": False} else: evaluation = self.get_evaluator_score( prompt=test['prompt'], response=response_text, criteria=test.get('evaluation_criteria', []) ) # Track failed evaluations for manual review if evaluation.get('auto_eval_failed', False): self.failed_evaluations.append({ 'test_id': test['id'], 'test_name': test['name'], 'category': category, 'type': 'single_turn', 'prompt': test['prompt'], 'response': response_text, 'criteria': test.get('evaluation_criteria', []), 'error': evaluation['notes'] }) else: evaluation = self.get_user_score() # Extract API metrics api_metrics = self.extract_api_metrics(response) result = { "test_id": test['id'], "test_name": test['name'], "category": category, "type": "single_turn", "difficulty": test.get('expected_difficulty', 'unknown'), "prompt": test['prompt'], "response": response_text, "raw_response": response if response_text.startswith("[PARSING ERROR") else None, "evaluation_criteria": test.get('evaluation_criteria', []), "score": evaluation['score'], "notes": evaluation['notes'], "status": "completed" if evaluation['score'] is not None else "skipped", "timestamp": datetime.now().isoformat(), "generation_time": generation_time } # Add metrics if available if api_metrics: result['api_metrics'] = api_metrics return result def run_multi_turn_test(self, test: Dict, category: str) -> Dict: """Run a multi-turn test""" self.display_test_info(test, category) # Initialize conversation history self.conversation_history = [] turn_results = [] for i, turn_data in enumerate(test['turns'], 1): turn_num = turn_data['turn'] prompt = turn_data['prompt'] self.display_prompt(prompt, turn_num) # Add to conversation history self.conversation_history.append({"role": "user", "content": prompt}) # Call API with full conversation history and measure time start_time = time.time() response = self.call_api(self.conversation_history) generation_time = time.time() - start_time if response is None: turn_results.append({ "turn": turn_num, "status": "api_error", "prompt": prompt, "response": None, "score": None, "notes": "API error - failed to get response", "generation_time": generation_time }) break # Extract and display response with better error handling try: message = response['choices'][0]['message'] # Try to get content, if empty check for reasoning_content response_text = message.get('content', '') # If content is empty but reasoning_content exists, use that if not response_text and 'reasoning_content' in message: response_text = message['reasoning_content'] print("\nโš ๏ธ Note: Response contained only reasoning_content, no actual content generated") # If still empty, check for tool_calls if not response_text and 'tool_calls' in message: print("\nโš ๏ธ Warning: Model attempted to call tools instead of generating content") import json tool_info = json.dumps(message['tool_calls'], indent=2) response_text = f"[MODEL ERROR: Attempted tool calls instead of text response]\n{tool_info}" # If completely empty, this is an error if not response_text: print("\nโš ๏ธ ERROR: Model returned completely empty response") response_text = "[ERROR: Empty response from model]" except (KeyError, IndexError, TypeError) as e: print(f"\nERROR: Failed to parse response - {e}") print("\nRAW API RESPONSE:") print("="*80) import json print(json.dumps(response, indent=2)) print("="*80) response_text = f"[PARSING ERROR: {e}]" self.display_response(response_text, raw_response=response, generation_time=generation_time) # Add assistant response to history self.conversation_history.append({"role": "assistant", "content": response_text}) # Display criteria for this turn self.display_evaluation_criteria(turn_data.get('evaluation_criteria', [])) # Get evaluation for this turn (interactive or automated) print(f"\nEvaluate Turn {turn_num}:") if self.non_interactive: # Skip automated evaluation if response is an error if response_text.startswith('[ERROR:') or response_text.startswith('[MODEL ERROR:') or response_text.startswith('[PARSING ERROR:'): print(f"\nโš ๏ธ Skipping automated evaluation due to model error") evaluation = {"score": 0, "notes": "[AUTO-SKIP] Model failed to generate valid response", "auto_eval_failed": False} else: evaluation = self.get_evaluator_score( prompt=prompt, response=response_text, criteria=turn_data.get('evaluation_criteria', []) ) # Track failed evaluations for manual review if evaluation.get('auto_eval_failed', False): self.failed_evaluations.append({ 'test_id': test['id'], 'test_name': test['name'], 'category': category, 'type': 'multi_turn', 'turn': turn_num, 'prompt': prompt, 'response': response_text, 'criteria': turn_data.get('evaluation_criteria', []), 'error': evaluation['notes'] }) else: evaluation = self.get_user_score() # Extract API metrics for this turn api_metrics = self.extract_api_metrics(response) turn_result = { "turn": turn_num, "prompt": prompt, "response": response_text, "evaluation_criteria": turn_data.get('evaluation_criteria', []), "score": evaluation['score'], "notes": evaluation['notes'], "status": "completed" if evaluation['score'] is not None else "skipped", "generation_time": generation_time } # Add metrics if available if api_metrics: turn_result['api_metrics'] = api_metrics turn_results.append(turn_result) if evaluation['score'] is None: print(f"\nTurn {turn_num} skipped, stopping multi-turn test") break # Calculate overall score for multi-turn test valid_scores = [t['score'] for t in turn_results if t['score'] is not None] overall_score = sum(valid_scores) / len(valid_scores) if valid_scores else None # Aggregate metrics across all turns aggregate_metrics = {} turn_metrics = [t.get('api_metrics') for t in turn_results if t.get('api_metrics')] if turn_metrics: # Sum up token counts and durations total_prompt_tokens = sum(m.get('usage', {}).get('prompt_tokens', 0) for m in turn_metrics) total_completion_tokens = sum(m.get('usage', {}).get('completion_tokens', 0) for m in turn_metrics) total_duration = sum(m.get('usage', {}).get('total_duration', 0) for m in turn_metrics) aggregate_metrics['usage'] = { 'total_prompt_tokens': total_prompt_tokens if total_prompt_tokens else None, 'total_completion_tokens': total_completion_tokens if total_completion_tokens else None, 'total_tokens': (total_prompt_tokens + total_completion_tokens) if (total_prompt_tokens or total_completion_tokens) else None, 'total_duration': total_duration if total_duration else None, 'turn_count': len(turn_metrics) } # Remove None values aggregate_metrics['usage'] = {k: v for k, v in aggregate_metrics['usage'].items() if v is not None} result = { "test_id": test['id'], "test_name": test['name'], "category": category, "type": "multi_turn", "difficulty": test.get('expected_difficulty', 'unknown'), "turns": turn_results, "overall_score": overall_score, "status": "completed" if overall_score is not None else "incomplete", "timestamp": datetime.now().isoformat() } # Add aggregate metrics if available if aggregate_metrics: result['aggregate_metrics'] = aggregate_metrics return result def manual_review_failed_evaluations(self): """Present failed automated evaluations for manual review""" print("\n\n" + "="*80) print("โš ๏ธ MANUAL REVIEW REQUIRED") print("="*80) print(f"\n{len(self.failed_evaluations)} test(s) could not be automatically evaluated.") print("Please provide manual scores for these tests.\n") for idx, failed in enumerate(self.failed_evaluations, 1): print("\n" + "="*80) print(f"๐Ÿ“‹ MANUAL REVIEW {idx}/{len(self.failed_evaluations)}") print("="*80) print(f"๐Ÿ†” Test ID: {failed['test_id']}") print(f"๐Ÿ“ Test Name: {failed['test_name']}") print(f"๐Ÿ“‚ Category: {failed['category']}") if failed['type'] == 'multi_turn': print(f"๐Ÿ”„ Turn: {failed['turn']}") print(f"โŒ Auto-Eval Error: {failed['error']}") print(f"\n๐Ÿ’ฌ PROMPT:") print("-"*80) print(failed['prompt']) print("-"*80) print(f"\nMODEL RESPONSE:") print("-"*80) print(failed['response']) print("-"*80) print(f"\nโœ… EVALUATION CRITERIA:") for i, criterion in enumerate(failed['criteria'], 1): print(f" {i}. {criterion}") # Get manual score print("\n" + "="*80) print("๐Ÿ“Š EVALUATION SCORING RUBRIC:") print(" 0-1: FAIL - Major errors, fails to meet basic requirements") print(" 2-3: PASS - Meets requirements with minor issues") print(" 4-5: EXCEPTIONAL - Exceeds requirements, demonstrates deep understanding") print("="*80) manual_evaluation = self.get_user_score() # Update the corresponding test result self.update_test_result_with_manual_score( failed['test_id'], failed.get('turn'), manual_evaluation ) # Save updated results self.save_results() print("\nโœ… All manual reviews completed and saved!") def update_test_result_with_manual_score(self, test_id: str, turn: Optional[int], evaluation: Dict): """Update a test result with manually provided score""" for result in self.results['test_results']: if result['test_id'] == test_id: if turn is None: # Single-turn test result['score'] = evaluation['score'] # Only add MANUAL-OVERRIDE prefix if a score was actually provided if evaluation['score'] is not None: result['notes'] = f"[MANUAL-OVERRIDE] {evaluation['notes']}" if evaluation['notes'] else "[MANUAL-OVERRIDE]" result['status'] = 'completed' else: result['notes'] = evaluation['notes'] if evaluation['notes'] else "Manual review skipped" result['status'] = 'manual_review_skipped' else: # Multi-turn test - update specific turn for turn_result in result.get('turns', []): if turn_result['turn'] == turn: turn_result['score'] = evaluation['score'] # Only add MANUAL-OVERRIDE prefix if a score was actually provided if evaluation['score'] is not None: turn_result['notes'] = f"[MANUAL-OVERRIDE] {evaluation['notes']}" if evaluation['notes'] else "[MANUAL-OVERRIDE]" turn_result['status'] = 'completed' else: turn_result['notes'] = evaluation['notes'] if evaluation['notes'] else "Manual review skipped" turn_result['status'] = 'manual_review_skipped' # Recalculate overall score valid_scores = [t['score'] for t in result['turns'] if t['score'] is not None] result['overall_score'] = sum(valid_scores) / len(valid_scores) if valid_scores else None result['status'] = 'completed' if result['overall_score'] is not None else 'incomplete' break def run_test_suite(self, test_suite: Dict, filter_category: Optional[str] = None): """Run the complete test suite""" print("\n" + "="*80) print(f"๐Ÿš€ STARTING TEST SUITE") print(f"๐Ÿ“ฆ Model: {self.model_name}") print(f"๐Ÿ”— Endpoint: {self.endpoint}") print("="*80) # Count total tests total_tests = 0 for cat_data in test_suite.get('test_categories', []): if filter_category and cat_data['category'] != filter_category: continue total_tests += len(cat_data.get('tests', [])) self.results['metadata']['total_tests'] = total_tests # Run tests by category test_count = 0 for cat_data in test_suite.get('test_categories', []): category = cat_data['category'] # Apply category filter if specified if filter_category and category != filter_category: continue print(f"\n\n{'='*80}") print(f"๐Ÿ“‚ CATEGORY: {category}") print(f"{'='*80}") for test in cat_data.get('tests', []): test_count += 1 print(f"\n๐Ÿ“Š Progress: {test_count}/{total_tests}") # Run appropriate test type if test.get('type') == 'single_turn': result = self.run_single_turn_test(test, category) elif test.get('type') == 'multi_turn': result = self.run_multi_turn_test(test, category) else: print(f"โš ๏ธ Unknown test type: {test.get('type')}") continue self.results['test_results'].append(result) self.results['metadata']['completed_tests'] += 1 # Save after each test (in case of interruption) self.save_results() # Mark test suite as complete self.results['metadata']['test_end'] = datetime.now().isoformat() self.save_results() # Handle failed evaluations if in non-interactive mode if self.non_interactive and self.failed_evaluations: self.manual_review_failed_evaluations() print("\n\n" + "="*80) print("โœ… TEST SUITE COMPLETE") if self.non_interactive and self.failed_evaluations: print(f" ({len(self.failed_evaluations)} test(s) manually reviewed)") print("="*80) self.display_summary() def save_results(self): """Save results to JSON file""" # Ensure output directory exists self.output_dir.mkdir(parents=True, exist_ok=True) # Sanitize model name for use in filename (replace problematic characters) safe_model_name = self.model_name.replace('/', '_').replace(':', '_') timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") filename = f"{safe_model_name}_{timestamp}.json" filepath = self.output_dir / filename with open(filepath, 'w', encoding='utf-8') as f: json.dump(self.results, f, indent=2, ensure_ascii=False) # Also save as "latest" for this model latest_file = self.output_dir / f"{safe_model_name}_latest.json" with open(latest_file, 'w', encoding='utf-8') as f: json.dump(self.results, f, indent=2, ensure_ascii=False) def display_summary(self): """Display test summary""" total = self.results['metadata']['total_tests'] completed = self.results['metadata']['completed_tests'] # Calculate statistics scores = [r.get('score') or r.get('overall_score') for r in self.results['test_results']] scores = [s for s in scores if s is not None] if scores: avg_score = sum(scores) / len(scores) print(f"\n๐Ÿ“Š SUMMARY:") print(f" Total Tests: {total}") print(f" Completed: {completed}") print(f" Average Score: {avg_score:.2f}/5.00") print(f" Pass Rate: {len([s for s in scores if s >= 2]) / len(scores) * 100:.1f}%") print(f" Exceptional Rate: {len([s for s in scores if s >= 4]) / len(scores) * 100:.1f}%") # Calculate aggregate API metrics total_prompt_tokens = 0 total_completion_tokens = 0 total_duration = 0 tests_with_metrics = 0 for result in self.results['test_results']: # Single-turn tests if result.get('api_metrics'): usage = result['api_metrics'].get('usage', {}) total_prompt_tokens += usage.get('prompt_tokens', 0) total_completion_tokens += usage.get('completion_tokens', 0) total_duration += usage.get('total_duration', 0) tests_with_metrics += 1 # Multi-turn tests elif result.get('aggregate_metrics'): usage = result['aggregate_metrics'].get('usage', {}) total_prompt_tokens += usage.get('total_prompt_tokens', 0) total_completion_tokens += usage.get('total_completion_tokens', 0) total_duration += usage.get('total_duration', 0) tests_with_metrics += 1 if tests_with_metrics > 0: print(f"\nโšก API METRICS:") print(f" Total Prompt Tokens: {total_prompt_tokens:,}") print(f" Total Completion Tokens: {total_completion_tokens:,}") print(f" Total Tokens: {total_prompt_tokens + total_completion_tokens:,}") if total_duration > 0: # Convert nanoseconds to seconds duration_seconds = total_duration / 1_000_000_000 print(f" Total Duration: {duration_seconds:.2f}s") if total_completion_tokens > 0: tokens_per_second = total_completion_tokens / duration_seconds print(f" Average Speed: {tokens_per_second:.2f} tokens/s") print(f"\n๐Ÿ’พ Results saved to: {self.output_dir}") def main(): # Load environment variables from .env file if it exists load_dotenv() parser = argparse.ArgumentParser( description="AI Model Evaluation Test Suite", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: # Test a single model (interactive mode) python ai_eval.py --endpoint http://localhost:11434 --model qwen3:4b-q4_K_M # Test with API key python ai_eval.py --endpoint https://api.example.com --api-key sk-xxx --model qwen3:8b # Non-interactive mode with evaluator python ai_eval.py --non-interactive --evaluator-endpoint http://localhost:11434 --evaluator-model qwen3:14b # Use .env file for configuration (recommended) cp .env.example .env # Edit .env with your settings python ai_eval.py # Test only forensics category python ai_eval.py --endpoint http://localhost:11434 --model qwen3:14b --category "IT Forensics - File Systems" """ ) parser.add_argument( '--endpoint', default=os.getenv('MUT_ENDPOINT'), help='OpenAI-compatible API endpoint for model under test (default: from .env MUT_ENDPOINT)' ) parser.add_argument( '--api-key', default=os.getenv('MUT_API_KEY', ''), help='API key for model under test (default: from .env MUT_API_KEY)' ) parser.add_argument( '--model', default=os.getenv('MUT_MODEL'), help='Model name/identifier to test (default: from .env MUT_MODEL)' ) parser.add_argument( '--test-suite', default=os.getenv('TEST_SUITE', 'test_suite.yaml'), help='Path to test suite YAML file (default: from .env TEST_SUITE or test_suite.yaml)' ) parser.add_argument( '--output-dir', default=os.getenv('OUTPUT_DIR', 'results'), help='Directory to save results (default: from .env OUTPUT_DIR or results)' ) parser.add_argument( '--category', default=os.getenv('FILTER_CATEGORY'), help='Filter tests by category (default: from .env FILTER_CATEGORY)' ) parser.add_argument( '--non-interactive', action='store_true', default=os.getenv('NON_INTERACTIVE', '').lower() in ('true', '1', 'yes'), help='Run in non-interactive mode with automated evaluation (default: from .env NON_INTERACTIVE)' ) parser.add_argument( '--evaluator-endpoint', default=os.getenv('EVALUATOR_ENDPOINT'), help='API endpoint for evaluator model (required for non-interactive mode, default: from .env EVALUATOR_ENDPOINT)' ) parser.add_argument( '--evaluator-api-key', default=os.getenv('EVALUATOR_API_KEY', ''), help='API key for evaluator (default: from .env EVALUATOR_API_KEY)' ) parser.add_argument( '--evaluator-model', default=os.getenv('EVALUATOR_MODEL'), help='Model name for evaluator (required for non-interactive mode, default: from .env EVALUATOR_MODEL)' ) parser.add_argument( '--evaluator-temperature', type=float, default=float(os.getenv('EVALUATOR_TEMPERATURE', '0.3')), help='Temperature for evaluator model (default: from .env EVALUATOR_TEMPERATURE or 0.3)' ) parser.add_argument( '--dry-run', action='store_true', help='Test API connectivity without running the full test suite' ) args = parser.parse_args() # Validate required arguments if not args.endpoint: print("โŒ Error: --endpoint is required (or set MUT_ENDPOINT in .env)") print(" Example: --endpoint http://localhost:11434") sys.exit(1) if not args.model: print("โŒ Error: --model is required (or set MUT_MODEL in .env)") print(" Example: --model qwen3:4b-q4_K_M") sys.exit(1) if args.non_interactive: if not args.evaluator_endpoint: print("โŒ Error: --evaluator-endpoint is required for non-interactive mode") print(" (or set EVALUATOR_ENDPOINT in .env)") sys.exit(1) if not args.evaluator_model: print("โŒ Error: --evaluator-model is required for non-interactive mode") print(" (or set EVALUATOR_MODEL in .env)") sys.exit(1) # Parse model list (supports comma-separated models) model_list = [m.strip() for m in args.model.split(',') if m.strip()] # Dry run mode - just test connections if args.dry_run: print(f"\n{'='*80}") print("๐Ÿงช DRY RUN MODE - Testing API Connectivity") print(f"{'='*80}") all_success = True # Test MUT endpoint for each model for idx, model_name in enumerate(model_list, 1): if len(model_list) > 1: print(f"\n--- Model {idx}/{len(model_list)} ---") tester = AIModelTester( endpoint=args.endpoint, api_key=args.api_key, model_name=model_name, output_dir=args.output_dir, non_interactive=args.non_interactive, evaluator_endpoint=args.evaluator_endpoint, evaluator_api_key=args.evaluator_api_key, evaluator_model=args.evaluator_model, evaluator_temperature=args.evaluator_temperature ) success = tester.test_connection( endpoint=args.endpoint, api_key=args.api_key, model=model_name, endpoint_name="Model Under Test" ) all_success = all_success and success # Test evaluator endpoint if non-interactive mode if args.non_interactive and args.evaluator_endpoint and args.evaluator_model: print(f"\n{'='*80}") tester = AIModelTester( endpoint=args.endpoint, api_key=args.api_key, model_name=model_list[0], output_dir=args.output_dir, non_interactive=args.non_interactive, evaluator_endpoint=args.evaluator_endpoint, evaluator_api_key=args.evaluator_api_key, evaluator_model=args.evaluator_model, evaluator_temperature=args.evaluator_temperature ) success = tester.test_connection( endpoint=args.evaluator_endpoint, api_key=args.evaluator_api_key, model=args.evaluator_model, endpoint_name="Evaluator" ) all_success = all_success and success print(f"\n{'='*80}") if all_success: print("โœ… All connectivity tests passed") print(f"{'='*80}") sys.exit(0) else: print("โŒ Some connectivity tests failed") print(f"{'='*80}") sys.exit(1) if len(model_list) > 1: print(f"\n๐Ÿ”„ Batch mode: Testing {len(model_list)} models") print("=" * 80) # Test each model for idx, model_name in enumerate(model_list, 1): if len(model_list) > 1: print(f"\n{'='*80}") print(f"๐Ÿ“Š Model {idx}/{len(model_list)}: {model_name}") print(f"{'='*80}\n") # Initialize tester tester = AIModelTester( endpoint=args.endpoint, api_key=args.api_key, model_name=model_name, output_dir=args.output_dir, non_interactive=args.non_interactive, evaluator_endpoint=args.evaluator_endpoint, evaluator_api_key=args.evaluator_api_key, evaluator_model=args.evaluator_model, evaluator_temperature=args.evaluator_temperature ) # Load test suite if idx == 1 or len(model_list) == 1: print(f"๐Ÿ“ Loading test suite from: {args.test_suite}") test_suite = tester.load_test_suite(args.test_suite) # Run tests try: tester.run_test_suite(test_suite, filter_category=args.category) except KeyboardInterrupt: print("\n\nโš ๏ธ Test suite interrupted by user") tester.results['metadata']['test_end'] = datetime.now().isoformat() tester.save_results() print(f"\n๐Ÿ’พ Partial results saved to: {tester.output_dir}") if len(model_list) > 1 and idx < len(model_list): print(f"\nโš ๏ธ Skipping remaining {len(model_list) - idx} models") sys.exit(1) if len(model_list) > 1: print(f"\n{'='*80}") print(f"โœ… BATCH COMPLETE: Tested {len(model_list)} models") print(f"{'='*80}") print(f"\n๐Ÿ’พ Results saved to: {args.output_dir}/") print("\nTo compare results, run:") print(" python analyze_results.py --compare") if __name__ == "__main__": main()