#!/usr/bin/env python3
"""
AI Model Evaluation Automation Script
Runs comprehensive test suite against OpenAI-compatible API endpoints
"""

import yaml
import json
import requests
import sys
from datetime import datetime
from typing import Dict, List, Optional
from pathlib import Path
import argparse


class AIModelTester:
    def __init__(self, endpoint: str, api_key: str, model_name: str, output_dir: str = "results"):
        """
        Initialize the AI Model Tester

        Args:
            endpoint: OpenAI-compatible API endpoint URL
            api_key: API key for authentication
            model_name: Name/identifier of the model being tested
            output_dir: Directory to save results
        """
        self.endpoint = endpoint.rstrip('/')
        self.api_key = api_key
        self.model_name = model_name
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Timestamp for this run, so incremental saves update a single file
        # instead of creating a new timestamped file after every test
        self.run_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Results storage
        self.results = {
            "metadata": {
                "model_name": model_name,
                "endpoint": endpoint,
                "test_start": datetime.now().isoformat(),
                "test_end": None,
                "total_tests": 0,
                "completed_tests": 0
            },
            "test_results": []
        }

        # Current test session info
        self.current_test_id = None
        self.conversation_history = []

    def load_test_suite(self, yaml_file: str) -> Dict:
        """Load test suite from YAML file"""
        try:
            with open(yaml_file, 'r', encoding='utf-8') as f:
                return yaml.safe_load(f)
        except FileNotFoundError:
            print(f"Error: Test suite file not found: {yaml_file}")
            print(f"Please ensure {yaml_file} is in the current directory.")
            sys.exit(1)
        except yaml.YAMLError as e:
            print(f"Error: Invalid YAML format in {yaml_file}")
            print(f"Details: {e}")
            sys.exit(1)
        except Exception as e:
            print(f"Error loading test suite: {e}")
            sys.exit(1)

    def call_api(self, messages: List[Dict], temperature: float = 0.7, max_tokens: int = 2000) -> Optional[Dict]:
        """
        Call the OpenAI-compatible API

        Args:
            messages: List of message dicts with 'role' and 'content'
            temperature: Sampling temperature
            max_tokens: Maximum tokens in response

        Returns:
            API response dict or None if error
        """
        headers = {
            "Content-Type": "application/json"
        }

        # Only add Authorization header if API key is provided
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"

        payload = {
            "model": self.model_name,
            "messages": messages,
            "temperature": temperature,
            "max_tokens": max_tokens
        }

        try:
            response = requests.post(
                f"{self.endpoint}/v1/chat/completions",
                headers=headers,
                json=payload,
                timeout=120
            )
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            print(f"\nāŒ API Error: {e}")
            if hasattr(e, 'response') and e.response is not None:
                print(f"Response: {e.response.text}")
            return None

    def display_test_info(self, test: Dict, category: str):
        """Display test information to user"""
        print("\n" + "="*80)
        print(f"šŸ“‹ CATEGORY: {category}")
        print(f"šŸ†” Test ID: {test['id']}")
        print(f"šŸ“ Test Name: {test['name']}")
        print(f"šŸŽÆ Type: {test['type']}")
        print(f"⚔ Difficulty: {test.get('expected_difficulty', 'N/A')}")
        print("="*80)

    def display_prompt(self, prompt: str, turn: Optional[int] = None):
        """Display the prompt being sent"""
        if turn is not None:
            print(f"\nšŸ”„ TURN {turn}:")
        else:
            print(f"\nšŸ’¬ PROMPT:")
        print("-"*80)
        print(prompt)
        print("-"*80)

    def display_response(self, response_text: str):
        """Display the model's response"""
        print(f"\nšŸ¤– MODEL RESPONSE:")
        print("-"*80)
        print(response_text)
        print("-"*80)

    def display_evaluation_criteria(self, criteria: List[str]):
        """Display evaluation criteria for the test"""
        print(f"\nāœ… EVALUATION CRITERIA:")
        for i, criterion in enumerate(criteria, 1):
            print(f"  {i}. {criterion}")
    def get_user_score(self) -> Dict:
        """Prompt user for evaluation score"""
        print("\n" + "="*80)
        print("šŸ“Š EVALUATION SCORING RUBRIC:")
        print("  0-1: FAIL - Major errors, fails to meet basic requirements")
        print("  2-3: PASS - Meets requirements with minor issues")
        print("  4-5: EXCEPTIONAL - Exceeds requirements, demonstrates deep understanding")
        print("="*80)

        while True:
            try:
                score_input = input("\nšŸ‘‰ Enter score (0-5) or 'skip' to skip this test: ").strip().lower()

                if score_input == 'skip':
                    return {"score": None, "notes": "Skipped by user"}

                score = int(score_input)
                if 0 <= score <= 5:
                    notes = input("šŸ“ Notes (optional, press Enter to skip): ").strip()
                    return {"score": score, "notes": notes if notes else ""}
                else:
                    print("āŒ Score must be between 0 and 5")
            except ValueError:
                print("āŒ Invalid input. Please enter a number between 0 and 5, or 'skip'")
            except KeyboardInterrupt:
                print("\n\nāš ļø Test interrupted by user")
                return {"score": None, "notes": "Interrupted"}

    def run_single_turn_test(self, test: Dict, category: str) -> Dict:
        """Run a single-turn test"""
        self.display_test_info(test, category)
        self.display_prompt(test['prompt'])

        # Prepare messages
        messages = [{"role": "user", "content": test['prompt']}]

        # Call API
        response = self.call_api(messages)

        if response is None:
            return {
                "test_id": test['id'],
                "test_name": test['name'],
                "category": category,
                "type": "single_turn",
                "status": "api_error",
                "score": None,
                "notes": "API call failed"
            }

        # Extract response text
        response_text = response['choices'][0]['message']['content']
        self.display_response(response_text)

        # Display evaluation criteria
        self.display_evaluation_criteria(test.get('evaluation_criteria', []))

        # Get user evaluation
        evaluation = self.get_user_score()

        return {
            "test_id": test['id'],
            "test_name": test['name'],
            "category": category,
            "type": "single_turn",
            "difficulty": test.get('expected_difficulty', 'unknown'),
            "prompt": test['prompt'],
            "response": response_text,
            "evaluation_criteria": test.get('evaluation_criteria', []),
            "score": evaluation['score'],
            "notes": evaluation['notes'],
            "status": "completed" if evaluation['score'] is not None else "skipped",
            "timestamp": datetime.now().isoformat()
        }

    def run_multi_turn_test(self, test: Dict, category: str) -> Dict:
        """Run a multi-turn test"""
        self.display_test_info(test, category)

        # Initialize conversation history
        self.conversation_history = []
        turn_results = []

        for turn_data in test['turns']:
            turn_num = turn_data['turn']
            prompt = turn_data['prompt']

            self.display_prompt(prompt, turn_num)

            # Add to conversation history
            self.conversation_history.append({"role": "user", "content": prompt})

            # Call API with full conversation history
            response = self.call_api(self.conversation_history)

            if response is None:
                turn_results.append({
                    "turn": turn_num,
                    "status": "api_error",
                    "prompt": prompt,
                    "response": None
                })
                break

            # Extract and display response
            response_text = response['choices'][0]['message']['content']
            self.display_response(response_text)

            # Add assistant response to history
            self.conversation_history.append({"role": "assistant", "content": response_text})

            # Display criteria for this turn
            self.display_evaluation_criteria(turn_data.get('evaluation_criteria', []))

            # Get evaluation for this turn
            print(f"\nšŸŽÆ Evaluate Turn {turn_num}:")
            evaluation = self.get_user_score()

            turn_results.append({
                "turn": turn_num,
                "prompt": prompt,
                "response": response_text,
                "evaluation_criteria": turn_data.get('evaluation_criteria', []),
                "score": evaluation['score'],
                "notes": evaluation['notes'],
                "status": "completed" if evaluation['score'] is not None else "skipped"
            })
            if evaluation['score'] is None:
                print(f"\nāš ļø Turn {turn_num} skipped, stopping multi-turn test")
                break

        # Calculate overall score for multi-turn test
        valid_scores = [t['score'] for t in turn_results if t['score'] is not None]
        overall_score = sum(valid_scores) / len(valid_scores) if valid_scores else None

        return {
            "test_id": test['id'],
            "test_name": test['name'],
            "category": category,
            "type": "multi_turn",
            "difficulty": test.get('expected_difficulty', 'unknown'),
            "turns": turn_results,
            "overall_score": overall_score,
            "status": "completed" if overall_score is not None else "incomplete",
            "timestamp": datetime.now().isoformat()
        }

    def run_test_suite(self, test_suite: Dict, filter_category: Optional[str] = None):
        """Run the complete test suite"""
        print("\n" + "="*80)
        print(f"šŸš€ STARTING TEST SUITE")
        print(f"šŸ“¦ Model: {self.model_name}")
        print(f"šŸ”— Endpoint: {self.endpoint}")
        print("="*80)

        # Count total tests
        total_tests = 0
        for cat_data in test_suite.get('test_categories', []):
            if filter_category and cat_data['category'] != filter_category:
                continue
            total_tests += len(cat_data.get('tests', []))

        self.results['metadata']['total_tests'] = total_tests

        # Run tests by category
        test_count = 0
        for cat_data in test_suite.get('test_categories', []):
            category = cat_data['category']

            # Apply category filter if specified
            if filter_category and category != filter_category:
                continue

            print(f"\n\n{'='*80}")
            print(f"šŸ“‚ CATEGORY: {category}")
            print(f"{'='*80}")

            for test in cat_data.get('tests', []):
                test_count += 1
                print(f"\nšŸ“Š Progress: {test_count}/{total_tests}")

                # Run appropriate test type
                if test.get('type') == 'single_turn':
                    result = self.run_single_turn_test(test, category)
                elif test.get('type') == 'multi_turn':
                    result = self.run_multi_turn_test(test, category)
                else:
                    print(f"āš ļø Unknown test type: {test.get('type')}")
                    continue

                self.results['test_results'].append(result)
                self.results['metadata']['completed_tests'] += 1

                # Save after each test (in case of interruption)
                self.save_results()

        # Mark test suite as complete
        self.results['metadata']['test_end'] = datetime.now().isoformat()
        self.save_results()

        print("\n\n" + "="*80)
        print("āœ… TEST SUITE COMPLETE")
        print("="*80)

        self.display_summary()

    def save_results(self):
        """Save results to JSON file"""
        # Reuse the run timestamp so repeated saves overwrite one file per run
        filename = f"{self.model_name.replace(':', '_')}_{self.run_timestamp}.json"
        filepath = self.output_dir / filename

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(self.results, f, indent=2, ensure_ascii=False)

        # Also save as "latest" for this model
        latest_file = self.output_dir / f"{self.model_name.replace(':', '_')}_latest.json"
        with open(latest_file, 'w', encoding='utf-8') as f:
            json.dump(self.results, f, indent=2, ensure_ascii=False)

    def display_summary(self):
        """Display test summary"""
        total = self.results['metadata']['total_tests']
        completed = self.results['metadata']['completed_tests']

        # Calculate statistics: use the single-turn score when present
        # (including a legitimate 0), otherwise the multi-turn overall score
        scores = [
            r.get('score') if r.get('score') is not None else r.get('overall_score')
            for r in self.results['test_results']
        ]
        scores = [s for s in scores if s is not None]

        if scores:
            avg_score = sum(scores) / len(scores)
            print(f"\nšŸ“Š SUMMARY:")
            print(f"  Total Tests: {total}")
            print(f"  Completed: {completed}")
            print(f"  Average Score: {avg_score:.2f}/5.00")
            print(f"  Pass Rate: {len([s for s in scores if s >= 2]) / len(scores) * 100:.1f}%")
            print(f"  Exceptional Rate: {len([s for s in scores if s >= 4]) / len(scores) * 100:.1f}%")
        print(f"\nšŸ’¾ Results saved to: {self.output_dir}")


def main():
    parser = argparse.ArgumentParser(
        description="AI Model Evaluation Test Suite",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Test a single model
  python ai_eval.py --endpoint http://localhost:11434 --model qwen3:4b-q4_K_M

  # Test with API key
  python ai_eval.py --endpoint https://api.example.com --api-key sk-xxx --model qwen3:8b

  # Test only forensics category
  python ai_eval.py --endpoint http://localhost:11434 --model qwen3:14b --category "IT Forensics - File Systems"

  # Test multiple models (run separately)
  python ai_eval.py --endpoint http://localhost:11434 --model qwen3:4b-q4_K_M
  python ai_eval.py --endpoint http://localhost:11434 --model qwen3:4b-q8_0
  python ai_eval.py --endpoint http://localhost:11434 --model qwen3:4b-fp16
        """
    )

    parser.add_argument(
        '--endpoint',
        required=True,
        help='OpenAI-compatible API endpoint (e.g., http://localhost:11434 for Ollama)'
    )
    parser.add_argument(
        '--api-key',
        default='',
        help='API key for authentication (optional for local endpoints)'
    )
    parser.add_argument(
        '--model',
        required=True,
        help='Model name/identifier (e.g., qwen3:4b-q4_K_M)'
    )
    parser.add_argument(
        '--test-suite',
        default='test_suite.yaml',
        help='Path to test suite YAML file (default: test_suite.yaml)'
    )
    parser.add_argument(
        '--output-dir',
        default='results',
        help='Directory to save results (default: results)'
    )
    parser.add_argument(
        '--category',
        default=None,
        help='Filter tests by category (optional)'
    )

    args = parser.parse_args()

    # Initialize tester
    tester = AIModelTester(
        endpoint=args.endpoint,
        api_key=args.api_key,
        model_name=args.model,
        output_dir=args.output_dir
    )

    # Load test suite
    print(f"šŸ“ Loading test suite from: {args.test_suite}")
    test_suite = tester.load_test_suite(args.test_suite)

    # Run tests
    try:
        tester.run_test_suite(test_suite, filter_category=args.category)
    except KeyboardInterrupt:
        print("\n\nāš ļø Test suite interrupted by user")
        tester.results['metadata']['test_end'] = datetime.now().isoformat()
        tester.save_results()
        print(f"\nšŸ’¾ Partial results saved to: {tester.output_dir}")
        sys.exit(1)


if __name__ == "__main__":
    main()
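
# ---------------------------------------------------------------------------
# Example test_suite.yaml (illustrative sketch only)
#
# The YAML schema is not defined in this script; the structure below is
# inferred from the keys read by run_test_suite(), run_single_turn_test()
# and run_multi_turn_test(). All IDs, names, prompts and criteria here are
# placeholders -- replace them with your own tests.
#
# test_categories:
#   - category: "IT Forensics - File Systems"
#     tests:
#       - id: "FS-001"                      # placeholder ID
#         name: "Example single-turn test"
#         type: "single_turn"
#         expected_difficulty: "medium"
#         prompt: "Explain how NTFS stores file timestamps."
#         evaluation_criteria:
#           - "Answer is technically accurate"
#           - "Covers the key concepts asked for"
#       - id: "FS-002"
#         name: "Example multi-turn test"
#         type: "multi_turn"
#         expected_difficulty: "hard"
#         turns:
#           - turn: 1
#             prompt: "First question in the conversation."
#             evaluation_criteria:
#               - "Addresses the question directly"
#           - turn: 2
#             prompt: "Follow-up question building on turn 1."
#             evaluation_criteria:
#               - "Stays consistent with the earlier answer"
# ---------------------------------------------------------------------------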