llm-eval-forensics/ai_eval.py
#!/usr/bin/env python3
"""
AI Model Evaluation Automation Script
Runs comprehensive test suite against OpenAI-compatible API endpoints
"""

import argparse
import json
import sys
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional

import requests
import yaml


class AIModelTester:
    def __init__(self, endpoint: str, api_key: str, model_name: str, output_dir: str = "results"):
        """
        Initialize the AI Model Tester

        Args:
            endpoint: OpenAI-compatible API endpoint URL
            api_key: API key for authentication
            model_name: Name/identifier of the model being tested
            output_dir: Directory to save results
        """
        self.endpoint = endpoint.rstrip('/')
        self.api_key = api_key
        self.model_name = model_name
        self.output_dir = Path(output_dir)
        # parents=True so nested output directories (e.g. results/runs) also work
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # One timestamp per run: save_results() reuses it, so incremental saves
        # overwrite a single per-run file instead of creating a new file per test
        self.run_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Results storage
        self.results = {
            "metadata": {
                "model_name": model_name,
                "endpoint": endpoint,
                "test_start": datetime.now().isoformat(),
                "test_end": None,
                "total_tests": 0,
                "completed_tests": 0
            },
            "test_results": []
        }

        # Current test session info
        self.current_test_id = None
        self.conversation_history = []

    def load_test_suite(self, yaml_file: str) -> Dict:
        """Load test suite from YAML file"""
        try:
            with open(yaml_file, 'r', encoding='utf-8') as f:
                return yaml.safe_load(f)
        except FileNotFoundError:
            print(f"Error: Test suite file not found: {yaml_file}")
            print(f"Please ensure {yaml_file} is in the current directory.")
            sys.exit(1)
        except yaml.YAMLError as e:
            print(f"Error: Invalid YAML format in {yaml_file}")
            print(f"Details: {e}")
            sys.exit(1)
        except Exception as e:
            print(f"Error loading test suite: {e}")
            sys.exit(1)
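
    # A minimal sketch of the YAML layout this loader expects, inferred from
    # the fields read in run_test_suite(), run_single_turn_test() and
    # run_multi_turn_test(); the ids, names, prompts and difficulty values
    # below are illustrative placeholders only:
    #
    #   test_categories:
    #     - category: "IT Forensics - File Systems"
    #       tests:
    #         - id: example-001
    #           name: "Example single-turn test"
    #           type: single_turn
    #           expected_difficulty: medium
    #           prompt: "..."
    #           evaluation_criteria:
    #             - "First criterion the answer should satisfy"
    #         - id: example-002
    #           name: "Example multi-turn test"
    #           type: multi_turn
    #           expected_difficulty: hard
    #           turns:
    #             - turn: 1
    #               prompt: "..."
    #               evaluation_criteria: ["..."]
    #             - turn: 2
    #               prompt: "..."
    #               evaluation_criteria: ["..."]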

    def call_api(self, messages: List[Dict], temperature: float = 0.7, max_tokens: int = 2000) -> Optional[Dict]:
        """
        Call the OpenAI-compatible API

        Args:
            messages: List of message dicts with 'role' and 'content'
            temperature: Sampling temperature
            max_tokens: Maximum tokens in response

        Returns:
            API response dict or None if error
        """
        headers = {
            "Content-Type": "application/json"
        }
        # Only add Authorization header if API key is provided
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"

        payload = {
            "model": self.model_name,
            "messages": messages,
            "temperature": temperature,
            "max_tokens": max_tokens
        }

        try:
            response = requests.post(
                f"{self.endpoint}/v1/chat/completions",
                headers=headers,
                json=payload,
                timeout=120
            )
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            print(f"\n❌ API Error: {e}")
            if hasattr(e, 'response') and e.response is not None:
                print(f"Response: {e.response.text}")
            return None
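
    # Only this slice of the OpenAI-style chat-completions response is used by
    # the test runners below (anything else the server returns is ignored);
    # the content value is a placeholder:
    #
    #   {
    #     "choices": [
    #       {"message": {"role": "assistant", "content": "<model reply>"}}
    #     ]
    #   }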

    def display_test_info(self, test: Dict, category: str):
        """Display test information to user"""
        print("\n" + "="*80)
        print(f"📋 CATEGORY: {category}")
        print(f"🆔 Test ID: {test['id']}")
        print(f"📝 Test Name: {test['name']}")
        print(f"🎯 Type: {test['type']}")
        print(f"⚡ Difficulty: {test.get('expected_difficulty', 'N/A')}")
        print("="*80)

    def display_prompt(self, prompt: str, turn: Optional[int] = None):
        """Display the prompt being sent"""
        if turn is not None:
            print(f"\n🔄 TURN {turn}:")
        else:
            print("\n💬 PROMPT:")
        print("-"*80)
        print(prompt)
        print("-"*80)

    def display_response(self, response_text: str):
        """Display the model's response"""
        print("\n🤖 MODEL RESPONSE:")
        print("-"*80)
        print(response_text)
        print("-"*80)

    def display_evaluation_criteria(self, criteria: List[str]):
        """Display evaluation criteria for the test"""
        print("\n✅ EVALUATION CRITERIA:")
        for i, criterion in enumerate(criteria, 1):
            print(f" {i}. {criterion}")

    def get_user_score(self) -> Dict:
        """Prompt user for evaluation score"""
        print("\n" + "="*80)
        print("📊 EVALUATION SCORING RUBRIC:")
        print(" 0-1: FAIL - Major errors, fails to meet basic requirements")
        print(" 2-3: PASS - Meets requirements with minor issues")
        print(" 4-5: EXCEPTIONAL - Exceeds requirements, demonstrates deep understanding")
        print("="*80)

        while True:
            try:
                score_input = input("\n👉 Enter score (0-5) or 'skip' to skip this test: ").strip().lower()
                if score_input == 'skip':
                    return {"score": None, "notes": "Skipped by user"}

                score = int(score_input)
                if 0 <= score <= 5:
                    notes = input("📝 Notes (optional, press Enter to skip): ").strip()
                    return {"score": score, "notes": notes if notes else ""}
                else:
                    print("❌ Score must be between 0 and 5")
            except ValueError:
                print("❌ Invalid input. Please enter a number between 0 and 5, or 'skip'")
            except KeyboardInterrupt:
                print("\n\n⚠️ Test interrupted by user")
                return {"score": None, "notes": "Interrupted"}

    def run_single_turn_test(self, test: Dict, category: str) -> Dict:
        """Run a single-turn test"""
        self.display_test_info(test, category)
        self.display_prompt(test['prompt'])

        # Prepare messages
        messages = [{"role": "user", "content": test['prompt']}]

        # Call API
        response = self.call_api(messages)
        if response is None:
            return {
                "test_id": test['id'],
                "test_name": test['name'],
                "category": category,
                "type": "single_turn",
                "status": "api_error",
                "score": None,
                "notes": "API call failed"
            }

        # Extract response text
        response_text = response['choices'][0]['message']['content']
        self.display_response(response_text)

        # Display evaluation criteria
        self.display_evaluation_criteria(test.get('evaluation_criteria', []))

        # Get user evaluation
        evaluation = self.get_user_score()

        return {
            "test_id": test['id'],
            "test_name": test['name'],
            "category": category,
            "type": "single_turn",
            "difficulty": test.get('expected_difficulty', 'unknown'),
            "prompt": test['prompt'],
            "response": response_text,
            "evaluation_criteria": test.get('evaluation_criteria', []),
            "score": evaluation['score'],
            "notes": evaluation['notes'],
            "status": "completed" if evaluation['score'] is not None else "skipped",
            "timestamp": datetime.now().isoformat()
        }

    def run_multi_turn_test(self, test: Dict, category: str) -> Dict:
        """Run a multi-turn test"""
        self.display_test_info(test, category)

        # Initialize conversation history
        self.conversation_history = []
        turn_results = []

        for turn_data in test['turns']:
            turn_num = turn_data['turn']
            prompt = turn_data['prompt']
            self.display_prompt(prompt, turn_num)

            # Add to conversation history
            self.conversation_history.append({"role": "user", "content": prompt})

            # Call API with full conversation history
            response = self.call_api(self.conversation_history)
            if response is None:
                turn_results.append({
                    "turn": turn_num,
                    "status": "api_error",
                    "prompt": prompt,
                    "response": None
                })
                break

            # Extract and display response
            response_text = response['choices'][0]['message']['content']
            self.display_response(response_text)

            # Add assistant response to history
            self.conversation_history.append({"role": "assistant", "content": response_text})

            # Display criteria for this turn
            self.display_evaluation_criteria(turn_data.get('evaluation_criteria', []))

            # Get evaluation for this turn
            print(f"\n🎯 Evaluate Turn {turn_num}:")
            evaluation = self.get_user_score()

            turn_results.append({
                "turn": turn_num,
                "prompt": prompt,
                "response": response_text,
                "evaluation_criteria": turn_data.get('evaluation_criteria', []),
                "score": evaluation['score'],
                "notes": evaluation['notes'],
                "status": "completed" if evaluation['score'] is not None else "skipped"
            })

            if evaluation['score'] is None:
                print(f"\n⚠️ Turn {turn_num} skipped, stopping multi-turn test")
                break

        # Calculate overall score for the multi-turn test (mean of scored turns).
        # Use .get() because a turn that hit an API error has no 'score' key.
        valid_scores = [t['score'] for t in turn_results if t.get('score') is not None]
        overall_score = sum(valid_scores) / len(valid_scores) if valid_scores else None

        return {
            "test_id": test['id'],
            "test_name": test['name'],
            "category": category,
            "type": "multi_turn",
            "difficulty": test.get('expected_difficulty', 'unknown'),
            "turns": turn_results,
            "overall_score": overall_score,
            "status": "completed" if overall_score is not None else "incomplete",
            "timestamp": datetime.now().isoformat()
        }

    def run_test_suite(self, test_suite: Dict, filter_category: Optional[str] = None):
        """Run the complete test suite"""
        print("\n" + "="*80)
        print("🚀 STARTING TEST SUITE")
        print(f"📦 Model: {self.model_name}")
        print(f"🔗 Endpoint: {self.endpoint}")
        print("="*80)

        # Count total tests
        total_tests = 0
        for cat_data in test_suite.get('test_categories', []):
            if filter_category and cat_data['category'] != filter_category:
                continue
            total_tests += len(cat_data.get('tests', []))
        self.results['metadata']['total_tests'] = total_tests

        # Run tests by category
        test_count = 0
        for cat_data in test_suite.get('test_categories', []):
            category = cat_data['category']

            # Apply category filter if specified
            if filter_category and category != filter_category:
                continue

            print(f"\n\n{'='*80}")
            print(f"📂 CATEGORY: {category}")
            print(f"{'='*80}")

            for test in cat_data.get('tests', []):
                test_count += 1
                print(f"\n📊 Progress: {test_count}/{total_tests}")

                # Run appropriate test type
                if test.get('type') == 'single_turn':
                    result = self.run_single_turn_test(test, category)
                elif test.get('type') == 'multi_turn':
                    result = self.run_multi_turn_test(test, category)
                else:
                    print(f"⚠️ Unknown test type: {test.get('type')}")
                    continue

                self.results['test_results'].append(result)
                self.results['metadata']['completed_tests'] += 1

                # Save after each test (in case of interruption)
                self.save_results()

        # Mark test suite as complete
        self.results['metadata']['test_end'] = datetime.now().isoformat()
        self.save_results()

        print("\n\n" + "="*80)
        print("✅ TEST SUITE COMPLETE")
        print("="*80)
        self.display_summary()

    def save_results(self):
        """Save results to JSON file"""
        # Sanitise the model name for use in filenames (e.g. "qwen3:4b" or "org/model")
        safe_name = self.model_name.replace(':', '_').replace('/', '_')

        # One timestamped file per run; repeated saves overwrite it
        filename = f"{safe_name}_{self.run_timestamp}.json"
        filepath = self.output_dir / filename
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(self.results, f, indent=2, ensure_ascii=False)

        # Also save as "latest" for this model
        latest_file = self.output_dir / f"{safe_name}_latest.json"
        with open(latest_file, 'w', encoding='utf-8') as f:
            json.dump(self.results, f, indent=2, ensure_ascii=False)
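
    # Each run therefore leaves two files in output_dir (filenames assume
    # --model qwen3:4b-q4_K_M; the timestamp is taken at the start of the run):
    #   qwen3_4b-q4_K_M_<YYYYMMDD_HHMMSS>.json  - snapshot of this run
    #   qwen3_4b-q4_K_M_latest.json             - most recent run for the model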

    def display_summary(self):
        """Display test summary"""
        total = self.results['metadata']['total_tests']
        completed = self.results['metadata']['completed_tests']

        # Collect per-test scores: the single-turn score if present, otherwise
        # the multi-turn overall score. An explicit None check is used so that
        # a legitimate score of 0 is not dropped from the statistics.
        scores = []
        for r in self.results['test_results']:
            score = r.get('score')
            if score is None:
                score = r.get('overall_score')
            if score is not None:
                scores.append(score)

        print("\n📊 SUMMARY:")
        print(f" Total Tests: {total}")
        print(f" Completed: {completed}")
        if scores:
            avg_score = sum(scores) / len(scores)
            print(f" Average Score: {avg_score:.2f}/5.00")
            print(f" Pass Rate: {len([s for s in scores if s >= 2]) / len(scores) * 100:.1f}%")
            print(f" Exceptional Rate: {len([s for s in scores if s >= 4]) / len(scores) * 100:.1f}%")
        print(f"\n💾 Results saved to: {self.output_dir}")


def main():
    parser = argparse.ArgumentParser(
        description="AI Model Evaluation Test Suite",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Test a single model
  python ai_eval.py --endpoint http://localhost:11434 --model qwen3:4b-q4_K_M

  # Test with API key
  python ai_eval.py --endpoint https://api.example.com --api-key sk-xxx --model qwen3:8b

  # Test only forensics category
  python ai_eval.py --endpoint http://localhost:11434 --model qwen3:14b --category "IT Forensics - File Systems"

  # Test multiple models (run separately)
  python ai_eval.py --endpoint http://localhost:11434 --model qwen3:4b-q4_K_M
  python ai_eval.py --endpoint http://localhost:11434 --model qwen3:4b-q8_0
  python ai_eval.py --endpoint http://localhost:11434 --model qwen3:4b-fp16
"""
    )
    parser.add_argument(
        '--endpoint',
        required=True,
        help='OpenAI-compatible API endpoint (e.g., http://localhost:11434 for Ollama)'
    )
    parser.add_argument(
        '--api-key',
        default='',
        help='API key for authentication (optional for local endpoints)'
    )
    parser.add_argument(
        '--model',
        required=True,
        help='Model name/identifier (e.g., qwen3:4b-q4_K_M)'
    )
    parser.add_argument(
        '--test-suite',
        default='test_suite.yaml',
        help='Path to test suite YAML file (default: test_suite.yaml)'
    )
    parser.add_argument(
        '--output-dir',
        default='results',
        help='Directory to save results (default: results)'
    )
    parser.add_argument(
        '--category',
        default=None,
        help='Filter tests by category (optional)'
    )
    args = parser.parse_args()

    # Initialize tester
    tester = AIModelTester(
        endpoint=args.endpoint,
        api_key=args.api_key,
        model_name=args.model,
        output_dir=args.output_dir
    )

    # Load test suite
    print(f"📁 Loading test suite from: {args.test_suite}")
    test_suite = tester.load_test_suite(args.test_suite)

    # Run tests
    try:
        tester.run_test_suite(test_suite, filter_category=args.category)
    except KeyboardInterrupt:
        print("\n\n⚠️ Test suite interrupted by user")
        tester.results['metadata']['test_end'] = datetime.now().isoformat()
        tester.save_results()
        print(f"\n💾 Partial results saved to: {tester.output_dir}")
        sys.exit(1)


if __name__ == "__main__":
    main()