#!/usr/bin/env python3
"""
AI Model Evaluation Automation Script
Runs comprehensive test suite against OpenAI-compatible API endpoints
"""

import yaml
import json
import requests
import os
import sys
import time
from datetime import datetime
from typing import Dict, List, Any, Optional
from pathlib import Path
import argparse
from dotenv import load_dotenv
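
# Example .env configuration (illustrative values only; the variable names below are
# exactly the ones read via os.getenv() in main()):
#
#   MUT_ENDPOINT=http://localhost:11434
#   MUT_API_KEY=
#   MUT_MODEL=qwen3:4b-q4_K_M
#   TEST_SUITE=test_suite.yaml
#   OUTPUT_DIR=results
#   FILTER_CATEGORY=
#   NON_INTERACTIVE=true
#   EVALUATOR_ENDPOINT=http://localhost:11434
#   EVALUATOR_API_KEY=
#   EVALUATOR_MODEL=qwen3:14b
#   EVALUATOR_TEMPERATURE=0.3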


class AIModelTester:
    def __init__(self, endpoint: str, api_key: str, model_name: str, output_dir: str = "results",
                 non_interactive: bool = False, evaluator_endpoint: Optional[str] = None,
                 evaluator_api_key: Optional[str] = None, evaluator_model: Optional[str] = None,
                 evaluator_temperature: float = 0.3):
        """
        Initialize the AI Model Tester

        Args:
            endpoint: OpenAI-compatible API endpoint URL
            api_key: API key for authentication
            model_name: Name/identifier of the model being tested
            output_dir: Directory to save results
            non_interactive: If True, use evaluator API for automatic scoring
            evaluator_endpoint: API endpoint for evaluation model
            evaluator_api_key: API key for evaluator
            evaluator_model: Model name for evaluator
            evaluator_temperature: Temperature for evaluator
        """
        self.endpoint = endpoint.rstrip('/')
        self.api_key = api_key
        self.model_name = model_name
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(exist_ok=True)

        # Non-interactive mode settings
        self.non_interactive = non_interactive
        self.evaluator_endpoint = evaluator_endpoint.rstrip('/') if evaluator_endpoint else None
        self.evaluator_api_key = evaluator_api_key
        self.evaluator_model = evaluator_model
        self.evaluator_temperature = evaluator_temperature

        if self.non_interactive:
            if not all([self.evaluator_endpoint, self.evaluator_model]):
                raise ValueError("Non-interactive mode requires evaluator_endpoint and evaluator_model")
            print(f"🤖 Non-interactive mode enabled")
            print(f" Evaluator: {self.evaluator_model} @ {self.evaluator_endpoint}")

        # Results storage
        self.results = {
            "metadata": {
                "model_name": model_name,
                "endpoint": endpoint,
                "test_start": datetime.now().isoformat(),
                "test_end": None,
                "total_tests": 0,
                "completed_tests": 0
            },
            "test_results": []
        }

        # Current test session info
        self.current_test_id = None
        self.conversation_history = []

        # Track failed auto-evaluations for manual review
        self.failed_evaluations = []
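
    # Example (illustrative only): constructing a tester directly, mirroring what main()
    # does with CLI/.env values:
    #
    #   tester = AIModelTester(
    #       endpoint="http://localhost:11434",
    #       api_key="",
    #       model_name="qwen3:4b-q4_K_M",
    #       non_interactive=True,
    #       evaluator_endpoint="http://localhost:11434",
    #       evaluator_model="qwen3:14b",
    #   )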

    def test_connection(self, endpoint: str, api_key: str, model: str, endpoint_name: str = "API") -> bool:
        """
        Test if an API endpoint can be reached and authenticated against

        Args:
            endpoint: API endpoint URL
            api_key: API key for authentication
            model: Model name to test
            endpoint_name: Name for display purposes

        Returns:
            True if connection successful, False otherwise
        """
        print(f"\n🔍 Testing {endpoint_name} connection...")
        print(f" Endpoint: {endpoint}")
        print(f" Model: {model}")

        headers = {
            "Content-Type": "application/json"
        }

        if api_key:
            headers["Authorization"] = f"Bearer {api_key}"
            print(f" API Key: {api_key[:10]}..." if len(api_key) > 10 else " API Key: [set]")
        else:
            print(f" API Key: [none]")

        # Simple test message
        payload = {
            "model": model,
            "messages": [{"role": "user", "content": "test"}],
            "max_tokens": 1
        }

        try:
            response = requests.post(
                f"{endpoint}/v1/chat/completions",
                headers=headers,
                json=payload,
                timeout=60
            )
            response.raise_for_status()

            print(f" ✅ {endpoint_name} connection successful")
            return True

        except requests.exceptions.HTTPError as e:
            print(f" ❌ {endpoint_name} HTTP error: {e.response.status_code}")
            print(f" {e.response.text[:200]}")
            return False
        except requests.exceptions.ConnectionError as e:
            print(f" ❌ {endpoint_name} connection failed: Cannot reach endpoint")
            return False
        except requests.exceptions.Timeout:
            print(f" ❌ {endpoint_name} connection timeout")
            return False
        except requests.exceptions.RequestException as e:
            print(f" ❌ {endpoint_name} error: {e}")
            return False

    def load_test_suite(self, yaml_file: str) -> Dict:
        """Load test suite from YAML file"""
        try:
            with open(yaml_file, 'r', encoding='utf-8') as f:
                return yaml.safe_load(f)
        except FileNotFoundError:
            print(f"Error: Test suite file not found: {yaml_file}")
            print(f"Please ensure {yaml_file} is in the current directory.")
            sys.exit(1)
        except yaml.YAMLError as e:
            print(f"Error: Invalid YAML format in {yaml_file}")
            print(f"Details: {e}")
            sys.exit(1)
        except Exception as e:
            print(f"Error loading test suite: {e}")
            sys.exit(1)
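
    # Sketch of the test-suite YAML layout this script expects, inferred from how
    # run_test_suite() and the run_*_test() methods read it (field values are illustrative):
    #
    #   test_categories:
    #     - category: "IT Forensics - File Systems"
    #       tests:
    #         - id: "fs-001"
    #           name: "Example single-turn test"
    #           type: "single_turn"
    #           expected_difficulty: "medium"
    #           prompt: "..."
    #           evaluation_criteria:
    #             - "Criterion 1"
    #         - id: "fs-002"
    #           name: "Example multi-turn test"
    #           type: "multi_turn"
    #           turns:
    #             - turn: 1
    #               prompt: "..."
    #               evaluation_criteria:
    #                 - "Criterion 1"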

    def call_api(self, messages: List[Dict], temperature: float = 0.7, max_tokens: int = 2000) -> Optional[Dict]:
        """
        Call the OpenAI-compatible API

        Args:
            messages: List of message dicts with 'role' and 'content'
            temperature: Sampling temperature
            max_tokens: Maximum tokens in response

        Returns:
            API response dict or None if error
        """
        headers = {
            "Content-Type": "application/json"
        }

        # Only add Authorization header if API key is provided
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"

        payload = {
            "model": self.model_name,
            "messages": messages,
            "temperature": temperature,
            "max_tokens": max_tokens
        }

        try:
            print(f"\nDEBUG: Calling API endpoint: {self.endpoint}/v1/chat/completions")
            print(f"DEBUG: Model name: {self.model_name}")

            response = requests.post(
                f"{self.endpoint}/v1/chat/completions",
                headers=headers,
                json=payload,
                timeout=240
            )
            response.raise_for_status()
            return response.json()
        except requests.exceptions.Timeout as e:
            print(f"\n⚠️ API Timeout Error: Request exceeded 240 seconds")
            print(f" This usually means the model is taking too long to generate a response.")
            print(f" Consider using a faster model or reducing the complexity of the prompt.")
            return None
        except requests.exceptions.HTTPError as e:
            print(f"\n⚠️ API HTTP Error: {e}")
            print(f" Status Code: {e.response.status_code}")
            print(f" Response: {e.response.text[:500]}")

            # Show request details for debugging
            print("\n REQUEST DETAILS:")
            print(f" URL: {self.endpoint}/v1/chat/completions")
            print(f" Model: {self.model_name}")

            return None
        except requests.exceptions.ConnectionError as e:
            print(f"\n⚠️ API Connection Error: {e}")
            print(f" Could not connect to {self.endpoint}")
            print(f" Please check your network connection and endpoint URL.")
            return None
        except requests.exceptions.RequestException as e:
            print(f"\n⚠️ API Request Error: {e}")
            if hasattr(e, 'response') and e.response is not None:
                print(f" Response: {e.response.text[:500]}")
            return None
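
    # Minimal shape of the chat-completions response that the rest of this script relies on
    # (values illustrative). Extra fields inside the message, such as 'reasoning_content' or
    # 'tool_calls', are handled where present:
    #
    #   {
    #       "model": "...",
    #       "created": 1700000000,
    #       "choices": [{"message": {"content": "..."}, "finish_reason": "stop"}],
    #       "usage": {"prompt_tokens": 12, "completion_tokens": 34, "total_tokens": 46}
    #   }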

    def display_test_info(self, test: Dict, category: str):
        """Display test information to user"""
        print("\n" + "="*80)
        print(f"📋 CATEGORY: {category}")
        print(f"🆔 Test ID: {test['id']}")
        print(f"📝 Test Name: {test['name']}")
        print(f"🎯 Type: {test['type']}")
        print(f"⚡ Difficulty: {test.get('expected_difficulty', 'N/A')}")
        print("="*80)

    def display_prompt(self, prompt: str, turn: Optional[int] = None):
        """Display the prompt being sent"""
        if turn is not None:
            print(f"\n🔄 TURN {turn}:")
        else:
            print(f"\n💬 PROMPT:")
        print("-"*80)
        print(prompt)
        print("-"*80)

    def display_response(self, response_text: str, raw_response: Optional[Dict] = None, generation_time: Optional[float] = None):
        """Display the model's response with timing and token usage metrics"""
        print("\nRAW API RESPONSE:")
        print("="*80)
        print(json.dumps(raw_response, indent=2))
        print("="*80)

        # Display timing and token metrics if available
        if generation_time is not None:
            print(f"\n⏱️ Generation Time: {generation_time:.2f}s")

        if raw_response and 'usage' in raw_response:
            usage = raw_response['usage']
            print(f"\n📊 Token Usage:")
            if 'prompt_tokens' in usage:
                print(f" Prompt Tokens: {usage['prompt_tokens']}")
            if 'completion_tokens' in usage:
                print(f" Completion Tokens: {usage['completion_tokens']}")
            if 'total_tokens' in usage:
                print(f" Total Tokens: {usage['total_tokens']}")

            # Calculate tokens per second if we have timing
            if generation_time and generation_time > 0 and 'completion_tokens' in usage:
                tokens_per_sec = usage['completion_tokens'] / generation_time
                print(f" Speed: {tokens_per_sec:.2f} tokens/sec")

        print(f"\n\nMODEL RESPONSE:")
        print("-"*80)
        print(response_text)
        print("-"*80)

    def display_evaluation_criteria(self, criteria: List[str]):
        """Display evaluation criteria for the test"""
        print(f"\n✅ EVALUATION CRITERIA:")
        for i, criterion in enumerate(criteria, 1):
            print(f" {i}. {criterion}")

    def get_user_score(self) -> Dict:
        """Prompt user for evaluation score"""
        print("\n" + "="*80)
        print("📊 EVALUATION SCORING RUBRIC:")
        print(" 0-1: FAIL - Major errors, fails to meet basic requirements")
        print(" 2-3: PASS - Meets requirements with minor issues")
        print(" 4-5: EXCEPTIONAL - Exceeds requirements, demonstrates deep understanding")
        print("="*80)

        while True:
            try:
                score_input = input("\n👉 Enter score (0-5) or 'skip' to skip this test: ").strip().lower()

                if score_input == 'skip':
                    return {"score": None, "notes": "Skipped by user"}

                score = int(score_input)
                if 0 <= score <= 5:
                    notes = input("📝 Notes (optional, press Enter to skip): ").strip()
                    return {"score": score, "notes": notes if notes else ""}
                else:
                    print("❌ Score must be between 0 and 5")
            except ValueError:
                print("❌ Invalid input. Please enter a number between 0 and 5, or 'skip'")
            except KeyboardInterrupt:
                print("\n\n⚠️ Test interrupted by user")
                return {"score": None, "notes": "Interrupted"}

    def extract_api_metrics(self, response: Dict) -> Optional[Dict]:
        """
        Extract all available metrics from API response

        Args:
            response: The API response dict

        Returns:
            Dict with usage statistics and timing information, or None if no metrics available
        """
        metrics = {}

        if response and isinstance(response, dict):
            # Extract usage statistics if available
            usage = response.get('usage', {})
            if usage:
                metrics['usage'] = {
                    'prompt_tokens': usage.get('prompt_tokens'),
                    'completion_tokens': usage.get('completion_tokens'),
                    'total_tokens': usage.get('total_tokens'),
                    'prompt_eval_count': usage.get('prompt_eval_count'),
                    'eval_count': usage.get('eval_count'),
                    'prompt_eval_duration': usage.get('prompt_eval_duration'),
                    'eval_duration': usage.get('eval_duration'),
                    'load_duration': usage.get('load_duration'),
                    'total_duration': usage.get('total_duration'),
                    'response_token_s': usage.get('response_token/s'),
                    'prompt_token_s': usage.get('prompt_token/s'),
                }
                # Remove None values
                metrics['usage'] = {k: v for k, v in metrics['usage'].items() if v is not None}

            # Extract model info
            if 'model' in response:
                metrics['model'] = response['model']

            # Extract creation timestamp
            if 'created' in response:
                metrics['created'] = response['created']

            # Extract finish reason
            if 'choices' in response and len(response['choices']) > 0:
                finish_reason = response['choices'][0].get('finish_reason')
                if finish_reason:
                    metrics['finish_reason'] = finish_reason

        return metrics if metrics else None
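
    # Example of the structure returned above (keys depend on what the backend reports;
    # the extra *_count and *_duration fields, which display_summary() treats as
    # nanoseconds, only appear when the server includes them in 'usage'):
    #
    #   {
    #       "usage": {"prompt_tokens": 12, "completion_tokens": 34, "total_tokens": 46,
    #                 "eval_count": 34, "total_duration": 1234567890},
    #       "model": "qwen3:4b-q4_K_M",
    #       "finish_reason": "stop"
    #   }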

    def get_evaluator_score(self, prompt: str, response: str, criteria: List[str]) -> Dict:
        """
        Get automated score from evaluator API

        Args:
            prompt: The original prompt sent to the model under test
            response: The model's response to evaluate
            criteria: List of evaluation criteria

        Returns:
            Dict with 'score', 'notes', and 'auto_eval_failed' keys
        """
        # Construct evaluation prompt
        criteria_text = "\n".join([f"{i+1}. {c}" for i, c in enumerate(criteria)])

        eval_system_prompt = """You are an expert AI model evaluator. Your task is to objectively assess AI model responses based on specific criteria.

Scoring Rubric:
- 0-1: FAIL - Major errors, fails to meet basic requirements, incorrect or misleading information
- 2-3: PASS - Meets requirements with minor issues, mostly correct with some imperfections
- 4-5: EXCEPTIONAL - Exceeds requirements, demonstrates deep understanding, accurate and comprehensive

CRITICAL: You MUST respond with ONLY valid JSON. Do not include any explanatory text before or after the JSON.

Response format (use ONLY this, nothing else):
{"score": <integer 0-5>, "notes": "<brief explanation>"}"""

        eval_user_prompt = f"""Evaluate this AI model response against the criteria below.

**Original Prompt:**
{prompt}

**Model Response:**
{response}

**Evaluation Criteria:**
{criteria_text}

Respond with ONLY valid JSON in this exact format:
{{"score": <integer 0-5>, "notes": "<brief explanation of the score>"}}

Do not include any text before or after the JSON object."""

        messages = [
            {"role": "system", "content": eval_system_prompt},
            {"role": "user", "content": eval_user_prompt}
        ]

        # Call evaluator API
        headers = {
            "Content-Type": "application/json"
        }

        if self.evaluator_api_key:
            headers["Authorization"] = f"Bearer {self.evaluator_api_key}"

        payload = {
            "model": self.evaluator_model,
            "messages": messages,
            "temperature": self.evaluator_temperature,
            "max_tokens": 500
        }

        eval_text = ""  # Defined up front so the JSON error handler can reference it safely

        try:
            print(f"\n🤖 Calling evaluator API for automated scoring...")
            response_obj = requests.post(
                f"{self.evaluator_endpoint}/v1/chat/completions",
                headers=headers,
                json=payload,
                timeout=90  # Increased timeout for evaluator
            )
            response_obj.raise_for_status()
            eval_response = response_obj.json()

            # Extract the evaluation
            eval_text = eval_response['choices'][0]['message']['content'].strip()

            # Try to extract JSON from the response
            # Handle case where model wraps JSON in markdown code blocks or adds extra text
            json_obj = None

            # Try direct parsing first
            try:
                json_obj = json.loads(eval_text)
            except json.JSONDecodeError:
                # Try to find JSON in markdown code blocks
                if '```json' in eval_text or '```' in eval_text:
                    # Extract content between code fences
                    import re
                    json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', eval_text, re.DOTALL)
                    if json_match:
                        json_obj = json.loads(json_match.group(1))

                # Try to find any JSON object in the text
                if json_obj is None:
                    import re
                    json_match = re.search(r'\{[^{}]*"score"[^{}]*"notes"[^{}]*\}', eval_text, re.DOTALL)
                    if json_match:
                        json_obj = json.loads(json_match.group(0))

                if json_obj is None:
                    raise json.JSONDecodeError("No valid JSON found", eval_text, 0)

            score = int(json_obj.get('score', 0))
            notes = json_obj.get('notes', 'Automated evaluation')

            # Validate score range
            if not 0 <= score <= 5:
                print(f"⚠️ Evaluator returned invalid score {score}, clamping to 0-5")
                score = max(0, min(5, score))

            print(f" Score: {score}/5")
            print(f" Notes: {notes}")

            return {"score": score, "notes": f"[Auto-Review by {self.evaluator_model}] {notes}", "auto_eval_failed": False}

        except json.JSONDecodeError as e:
            print(f"⚠️ Failed to parse evaluator response as JSON: {e}")
            print(f" Raw response: {eval_text[:200]}")
            print(f" Marking for manual review")
            return {"score": None, "notes": "[AUTO-ERROR] Evaluator returned non-JSON response", "auto_eval_failed": True}
        except (KeyError, IndexError, ValueError) as e:
            print(f"⚠️ Invalid evaluator response format: {e}")
            print(f" Marking for manual review")
            return {"score": None, "notes": f"[AUTO-ERROR] Invalid response format: {str(e)}", "auto_eval_failed": True}
        except requests.exceptions.Timeout as e:
            print(f"⚠️ Evaluator API timeout: Request exceeded 90 seconds")
            print(f" The evaluator model is taking too long to respond.")
            print(f" Marking for manual review")
            return {"score": None, "notes": "[AUTO-ERROR] Evaluator API timeout", "auto_eval_failed": True}
        except requests.exceptions.ConnectionError as e:
            print(f"⚠️ Evaluator connection error: {e}")
            print(f" Could not connect to evaluator endpoint.")
            print(f" Marking for manual review")
            return {"score": None, "notes": "[AUTO-ERROR] Cannot connect to evaluator", "auto_eval_failed": True}
        except requests.exceptions.RequestException as e:
            print(f"⚠️ Evaluator API error: {e}")
            print(f" Marking for manual review")
            return {"score": None, "notes": f"[AUTO-ERROR] API call failed: {str(e)[:100]}", "auto_eval_failed": True}

    def run_single_turn_test(self, test: Dict, category: str) -> Dict:
        """Run a single-turn test"""
        self.display_test_info(test, category)
        self.display_prompt(test['prompt'])

        # Prepare messages
        messages = [{"role": "user", "content": test['prompt']}]

        # Call API and measure time
        start_time = time.time()
        response = self.call_api(messages)
        generation_time = time.time() - start_time

        if response is None:
            return {
                "test_id": test['id'],
                "test_name": test['name'],
                "category": category,
                "type": "single_turn",
                "status": "api_error",
                "score": None,
                "notes": "API call failed",
                "generation_time": generation_time
            }

        # Extract response text with better error handling
        try:
            message = response['choices'][0]['message']
            # Try to get content, if empty check for reasoning_content
            response_text = message.get('content', '')

            # If content is empty but reasoning_content exists, use that
            if not response_text and 'reasoning_content' in message:
                response_text = message['reasoning_content']
                print("\n⚠️ Note: Response contained only reasoning_content, no actual content generated")

            # If still empty, check for tool_calls (model might be trying to call functions)
            if not response_text and 'tool_calls' in message:
                print("\n⚠️ Warning: Model attempted to call tools instead of generating content")
                tool_info = json.dumps(message['tool_calls'], indent=2)
                response_text = f"[MODEL ERROR: Attempted tool calls instead of text response]\n{tool_info}"

            # If completely empty, this is an error
            if not response_text:
                print("\n⚠️ ERROR: Model returned completely empty response")
                response_text = "[ERROR: Empty response from model]"

        except (KeyError, IndexError, TypeError) as e:
            print(f"\nERROR: Failed to parse response - {e}")
            print("\nRAW API RESPONSE:")
            print("="*80)
            print(json.dumps(response, indent=2))
            print("="*80)

            response_text = f"[PARSING ERROR: {e}]"

        self.display_response(response_text, raw_response=response, generation_time=generation_time)

        # Display evaluation criteria
        self.display_evaluation_criteria(test.get('evaluation_criteria', []))

        # Get evaluation (interactive or automated)
        # Skip automated evaluation if response is an error
        if self.non_interactive:
            # Check if response is actually an error
            if response_text.startswith('[ERROR:') or response_text.startswith('[MODEL ERROR:') or response_text.startswith('[PARSING ERROR:'):
                print(f"\n⚠️ Skipping automated evaluation due to model error")
                evaluation = {"score": 0, "notes": "[AUTO-SKIP] Model failed to generate valid response", "auto_eval_failed": False}
            else:
                evaluation = self.get_evaluator_score(
                    prompt=test['prompt'],
                    response=response_text,
                    criteria=test.get('evaluation_criteria', [])
                )
                # Track failed evaluations for manual review
                if evaluation.get('auto_eval_failed', False):
                    self.failed_evaluations.append({
                        'test_id': test['id'],
                        'test_name': test['name'],
                        'category': category,
                        'type': 'single_turn',
                        'prompt': test['prompt'],
                        'response': response_text,
                        'criteria': test.get('evaluation_criteria', []),
                        'error': evaluation['notes']
                    })
        else:
            evaluation = self.get_user_score()

        # Extract API metrics
        api_metrics = self.extract_api_metrics(response)

        result = {
            "test_id": test['id'],
            "test_name": test['name'],
            "category": category,
            "type": "single_turn",
            "difficulty": test.get('expected_difficulty', 'unknown'),
            "prompt": test['prompt'],
            "response": response_text,
            "raw_response": response if response_text.startswith("[PARSING ERROR") else None,
            "evaluation_criteria": test.get('evaluation_criteria', []),
            "score": evaluation['score'],
            "notes": evaluation['notes'],
            "status": "completed" if evaluation['score'] is not None else "skipped",
            "timestamp": datetime.now().isoformat(),
            "generation_time": generation_time
        }

        # Add metrics if available
        if api_metrics:
            result['api_metrics'] = api_metrics

        return result
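
    # Illustrative single-turn result record as appended to results["test_results"]
    # (trimmed; values are examples only):
    #
    #   {
    #       "test_id": "fs-001", "test_name": "...", "category": "...",
    #       "type": "single_turn", "score": 4, "notes": "[Auto-Review by qwen3:14b] ...",
    #       "status": "completed", "generation_time": 12.3,
    #       "api_metrics": {"usage": {"prompt_tokens": 12, "completion_tokens": 34}}
    #   }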

    def run_multi_turn_test(self, test: Dict, category: str) -> Dict:
        """Run a multi-turn test"""
        self.display_test_info(test, category)

        # Initialize conversation history
        self.conversation_history = []
        turn_results = []

        for i, turn_data in enumerate(test['turns'], 1):
            turn_num = turn_data['turn']
            prompt = turn_data['prompt']

            self.display_prompt(prompt, turn_num)

            # Add to conversation history
            self.conversation_history.append({"role": "user", "content": prompt})

            # Call API with full conversation history and measure time
            start_time = time.time()
            response = self.call_api(self.conversation_history)
            generation_time = time.time() - start_time

            if response is None:
                turn_results.append({
                    "turn": turn_num,
                    "status": "api_error",
                    "prompt": prompt,
                    "response": None,
                    "score": None,
                    "notes": "API error - failed to get response",
                    "generation_time": generation_time
                })
                break

            # Extract and display response with better error handling
            try:
                message = response['choices'][0]['message']
                # Try to get content, if empty check for reasoning_content
                response_text = message.get('content', '')

                # If content is empty but reasoning_content exists, use that
                if not response_text and 'reasoning_content' in message:
                    response_text = message['reasoning_content']
                    print("\n⚠️ Note: Response contained only reasoning_content, no actual content generated")

                # If still empty, check for tool_calls
                if not response_text and 'tool_calls' in message:
                    print("\n⚠️ Warning: Model attempted to call tools instead of generating content")
                    tool_info = json.dumps(message['tool_calls'], indent=2)
                    response_text = f"[MODEL ERROR: Attempted tool calls instead of text response]\n{tool_info}"

                # If completely empty, this is an error
                if not response_text:
                    print("\n⚠️ ERROR: Model returned completely empty response")
                    response_text = "[ERROR: Empty response from model]"

            except (KeyError, IndexError, TypeError) as e:
                print(f"\nERROR: Failed to parse response - {e}")
                print("\nRAW API RESPONSE:")
                print("="*80)
                print(json.dumps(response, indent=2))
                print("="*80)

                response_text = f"[PARSING ERROR: {e}]"

            self.display_response(response_text, raw_response=response, generation_time=generation_time)

            # Add assistant response to history
            self.conversation_history.append({"role": "assistant", "content": response_text})

            # Display criteria for this turn
            self.display_evaluation_criteria(turn_data.get('evaluation_criteria', []))

            # Get evaluation for this turn (interactive or automated)
            print(f"\nEvaluate Turn {turn_num}:")
            if self.non_interactive:
                # Skip automated evaluation if response is an error
                if response_text.startswith('[ERROR:') or response_text.startswith('[MODEL ERROR:') or response_text.startswith('[PARSING ERROR:'):
                    print(f"\n⚠️ Skipping automated evaluation due to model error")
                    evaluation = {"score": 0, "notes": "[AUTO-SKIP] Model failed to generate valid response", "auto_eval_failed": False}
                else:
                    evaluation = self.get_evaluator_score(
                        prompt=prompt,
                        response=response_text,
                        criteria=turn_data.get('evaluation_criteria', [])
                    )
                    # Track failed evaluations for manual review
                    if evaluation.get('auto_eval_failed', False):
                        self.failed_evaluations.append({
                            'test_id': test['id'],
                            'test_name': test['name'],
                            'category': category,
                            'type': 'multi_turn',
                            'turn': turn_num,
                            'prompt': prompt,
                            'response': response_text,
                            'criteria': turn_data.get('evaluation_criteria', []),
                            'error': evaluation['notes']
                        })
            else:
                evaluation = self.get_user_score()

            # Extract API metrics for this turn
            api_metrics = self.extract_api_metrics(response)

            turn_result = {
                "turn": turn_num,
                "prompt": prompt,
                "response": response_text,
                "evaluation_criteria": turn_data.get('evaluation_criteria', []),
                "score": evaluation['score'],
                "notes": evaluation['notes'],
                "status": "completed" if evaluation['score'] is not None else "skipped",
                "generation_time": generation_time
            }

            # Add metrics if available
            if api_metrics:
                turn_result['api_metrics'] = api_metrics

            turn_results.append(turn_result)

            if evaluation['score'] is None:
                print(f"\nTurn {turn_num} skipped, stopping multi-turn test")
                break

        # Calculate overall score for multi-turn test
        valid_scores = [t['score'] for t in turn_results if t['score'] is not None]
        overall_score = sum(valid_scores) / len(valid_scores) if valid_scores else None

        # Aggregate metrics across all turns
        aggregate_metrics = {}
        turn_metrics = [t.get('api_metrics') for t in turn_results if t.get('api_metrics')]

        if turn_metrics:
            # Sum up token counts and durations
            total_prompt_tokens = sum(m.get('usage', {}).get('prompt_tokens', 0) for m in turn_metrics)
            total_completion_tokens = sum(m.get('usage', {}).get('completion_tokens', 0) for m in turn_metrics)
            total_duration = sum(m.get('usage', {}).get('total_duration', 0) for m in turn_metrics)

            aggregate_metrics['usage'] = {
                'total_prompt_tokens': total_prompt_tokens if total_prompt_tokens else None,
                'total_completion_tokens': total_completion_tokens if total_completion_tokens else None,
                'total_tokens': (total_prompt_tokens + total_completion_tokens) if (total_prompt_tokens or total_completion_tokens) else None,
                'total_duration': total_duration if total_duration else None,
                'turn_count': len(turn_metrics)
            }
            # Remove None values
            aggregate_metrics['usage'] = {k: v for k, v in aggregate_metrics['usage'].items() if v is not None}

        result = {
            "test_id": test['id'],
            "test_name": test['name'],
            "category": category,
            "type": "multi_turn",
            "difficulty": test.get('expected_difficulty', 'unknown'),
            "turns": turn_results,
            "overall_score": overall_score,
            "status": "completed" if overall_score is not None else "incomplete",
            "timestamp": datetime.now().isoformat()
        }

        # Add aggregate metrics if available
        if aggregate_metrics:
            result['aggregate_metrics'] = aggregate_metrics

        return result

    def manual_review_failed_evaluations(self):
        """Present failed automated evaluations for manual review"""
        print("\n\n" + "="*80)
        print("⚠️ MANUAL REVIEW REQUIRED")
        print("="*80)
        print(f"\n{len(self.failed_evaluations)} test(s) could not be automatically evaluated.")
        print("Please provide manual scores for these tests.\n")

        for idx, failed in enumerate(self.failed_evaluations, 1):
            print("\n" + "="*80)
            print(f"📋 MANUAL REVIEW {idx}/{len(self.failed_evaluations)}")
            print("="*80)
            print(f"🆔 Test ID: {failed['test_id']}")
            print(f"📝 Test Name: {failed['test_name']}")
            print(f"📂 Category: {failed['category']}")
            if failed['type'] == 'multi_turn':
                print(f"🔄 Turn: {failed['turn']}")
            print(f"❌ Auto-Eval Error: {failed['error']}")

            print(f"\n💬 PROMPT:")
            print("-"*80)
            print(failed['prompt'])
            print("-"*80)

            print(f"\nMODEL RESPONSE:")
            print("-"*80)
            print(failed['response'])
            print("-"*80)

            print(f"\n✅ EVALUATION CRITERIA:")
            for i, criterion in enumerate(failed['criteria'], 1):
                print(f" {i}. {criterion}")

            # Get manual score
            print("\n" + "="*80)
            print("📊 EVALUATION SCORING RUBRIC:")
            print(" 0-1: FAIL - Major errors, fails to meet basic requirements")
            print(" 2-3: PASS - Meets requirements with minor issues")
            print(" 4-5: EXCEPTIONAL - Exceeds requirements, demonstrates deep understanding")
            print("="*80)

            manual_evaluation = self.get_user_score()

            # Update the corresponding test result
            self.update_test_result_with_manual_score(
                failed['test_id'],
                failed.get('turn'),
                manual_evaluation
            )

        # Save updated results
        self.save_results()
        print("\n✅ All manual reviews completed and saved!")

    def update_test_result_with_manual_score(self, test_id: str, turn: Optional[int], evaluation: Dict):
        """Update a test result with manually provided score"""
        for result in self.results['test_results']:
            if result['test_id'] == test_id:
                if turn is None:
                    # Single-turn test
                    result['score'] = evaluation['score']
                    # Only add MANUAL-OVERRIDE prefix if a score was actually provided
                    if evaluation['score'] is not None:
                        result['notes'] = f"[MANUAL-OVERRIDE] {evaluation['notes']}" if evaluation['notes'] else "[MANUAL-OVERRIDE]"
                        result['status'] = 'completed'
                    else:
                        result['notes'] = evaluation['notes'] if evaluation['notes'] else "Manual review skipped"
                        result['status'] = 'manual_review_skipped'
                else:
                    # Multi-turn test - update specific turn
                    for turn_result in result.get('turns', []):
                        if turn_result['turn'] == turn:
                            turn_result['score'] = evaluation['score']
                            # Only add MANUAL-OVERRIDE prefix if a score was actually provided
                            if evaluation['score'] is not None:
                                turn_result['notes'] = f"[MANUAL-OVERRIDE] {evaluation['notes']}" if evaluation['notes'] else "[MANUAL-OVERRIDE]"
                                turn_result['status'] = 'completed'
                            else:
                                turn_result['notes'] = evaluation['notes'] if evaluation['notes'] else "Manual review skipped"
                                turn_result['status'] = 'manual_review_skipped'

                    # Recalculate overall score
                    valid_scores = [t['score'] for t in result['turns'] if t['score'] is not None]
                    result['overall_score'] = sum(valid_scores) / len(valid_scores) if valid_scores else None
                    result['status'] = 'completed' if result['overall_score'] is not None else 'incomplete'
                break

    def run_test_suite(self, test_suite: Dict, filter_category: Optional[str] = None):
        """Run the complete test suite"""
        print("\n" + "="*80)
        print(f"🚀 STARTING TEST SUITE")
        print(f"📦 Model: {self.model_name}")
        print(f"🔗 Endpoint: {self.endpoint}")
        print("="*80)

        # Count total tests
        total_tests = 0
        for cat_data in test_suite.get('test_categories', []):
            if filter_category and cat_data['category'] != filter_category:
                continue
            total_tests += len(cat_data.get('tests', []))

        self.results['metadata']['total_tests'] = total_tests

        # Run tests by category
        test_count = 0
        for cat_data in test_suite.get('test_categories', []):
            category = cat_data['category']

            # Apply category filter if specified
            if filter_category and category != filter_category:
                continue

            print(f"\n\n{'='*80}")
            print(f"📂 CATEGORY: {category}")
            print(f"{'='*80}")

            for test in cat_data.get('tests', []):
                test_count += 1
                print(f"\n📊 Progress: {test_count}/{total_tests}")

                # Run appropriate test type
                if test.get('type') == 'single_turn':
                    result = self.run_single_turn_test(test, category)
                elif test.get('type') == 'multi_turn':
                    result = self.run_multi_turn_test(test, category)
                else:
                    print(f"⚠️ Unknown test type: {test.get('type')}")
                    continue

                self.results['test_results'].append(result)
                self.results['metadata']['completed_tests'] += 1

                # Save after each test (in case of interruption)
                self.save_results()

        # Mark test suite as complete
        self.results['metadata']['test_end'] = datetime.now().isoformat()
        self.save_results()

        # Handle failed evaluations if in non-interactive mode
        if self.non_interactive and self.failed_evaluations:
            self.manual_review_failed_evaluations()

        print("\n\n" + "="*80)
        print("✅ TEST SUITE COMPLETE")
        if self.non_interactive and self.failed_evaluations:
            print(f" ({len(self.failed_evaluations)} test(s) manually reviewed)")
        print("="*80)
        self.display_summary()

    def save_results(self):
        """Save results to JSON file"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"{self.model_name.replace(':', '_')}_{timestamp}.json"
        filepath = self.output_dir / filename

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(self.results, f, indent=2, ensure_ascii=False)

        # Also save as "latest" for this model
        latest_file = self.output_dir / f"{self.model_name.replace(':', '_')}_latest.json"
        with open(latest_file, 'w', encoding='utf-8') as f:
            json.dump(self.results, f, indent=2, ensure_ascii=False)
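
    # Each call writes two files under output_dir, e.g. for model name "qwen3:4b-q4_K_M":
    #   qwen3_4b-q4_K_M_20240101_120000.json   (timestamped snapshot; timestamp illustrative)
    #   qwen3_4b-q4_K_M_latest.json            (always the most recent state)
    # Because save_results() runs after every test, a full suite leaves a trail of
    # timestamped snapshots alongside the rolling "latest" file.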

    def display_summary(self):
        """Display test summary"""
        total = self.results['metadata']['total_tests']
        completed = self.results['metadata']['completed_tests']

        # Calculate statistics (0 is a valid score, so test explicitly for None rather than
        # relying on truthiness)
        scores = [r.get('score') if r.get('score') is not None else r.get('overall_score')
                  for r in self.results['test_results']]
        scores = [s for s in scores if s is not None]

        if scores:
            avg_score = sum(scores) / len(scores)
            print(f"\n📊 SUMMARY:")
            print(f" Total Tests: {total}")
            print(f" Completed: {completed}")
            print(f" Average Score: {avg_score:.2f}/5.00")
            print(f" Pass Rate: {len([s for s in scores if s >= 2]) / len(scores) * 100:.1f}%")
            print(f" Exceptional Rate: {len([s for s in scores if s >= 4]) / len(scores) * 100:.1f}%")

        # Calculate aggregate API metrics
        total_prompt_tokens = 0
        total_completion_tokens = 0
        total_duration = 0
        tests_with_metrics = 0

        for result in self.results['test_results']:
            # Single-turn tests
            if result.get('api_metrics'):
                usage = result['api_metrics'].get('usage', {})
                total_prompt_tokens += usage.get('prompt_tokens', 0)
                total_completion_tokens += usage.get('completion_tokens', 0)
                total_duration += usage.get('total_duration', 0)
                tests_with_metrics += 1
            # Multi-turn tests
            elif result.get('aggregate_metrics'):
                usage = result['aggregate_metrics'].get('usage', {})
                total_prompt_tokens += usage.get('total_prompt_tokens', 0)
                total_completion_tokens += usage.get('total_completion_tokens', 0)
                total_duration += usage.get('total_duration', 0)
                tests_with_metrics += 1

        if tests_with_metrics > 0:
            print(f"\n⚡ API METRICS:")
            print(f" Total Prompt Tokens: {total_prompt_tokens:,}")
            print(f" Total Completion Tokens: {total_completion_tokens:,}")
            print(f" Total Tokens: {total_prompt_tokens + total_completion_tokens:,}")
            if total_duration > 0:
                # Convert nanoseconds to seconds
                duration_seconds = total_duration / 1_000_000_000
                print(f" Total Duration: {duration_seconds:.2f}s")
                if total_completion_tokens > 0:
                    tokens_per_second = total_completion_tokens / duration_seconds
                    print(f" Average Speed: {tokens_per_second:.2f} tokens/s")

        print(f"\n💾 Results saved to: {self.output_dir}")


def main():
    # Load environment variables from .env file if it exists
    load_dotenv()

    parser = argparse.ArgumentParser(
        description="AI Model Evaluation Test Suite",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Test a single model (interactive mode)
  python ai_eval.py --endpoint http://localhost:11434 --model qwen3:4b-q4_K_M

  # Test with API key
  python ai_eval.py --endpoint https://api.example.com --api-key sk-xxx --model qwen3:8b

  # Non-interactive mode with evaluator
  python ai_eval.py --non-interactive --evaluator-endpoint http://localhost:11434 --evaluator-model qwen3:14b

  # Use .env file for configuration (recommended)
  cp .env.example .env
  # Edit .env with your settings
  python ai_eval.py

  # Test only forensics category
  python ai_eval.py --endpoint http://localhost:11434 --model qwen3:14b --category "IT Forensics - File Systems"
"""
    )

    parser.add_argument(
        '--endpoint',
        default=os.getenv('MUT_ENDPOINT'),
        help='OpenAI-compatible API endpoint for model under test (default: from .env MUT_ENDPOINT)'
    )

    parser.add_argument(
        '--api-key',
        default=os.getenv('MUT_API_KEY', ''),
        help='API key for model under test (default: from .env MUT_API_KEY)'
    )

    parser.add_argument(
        '--model',
        default=os.getenv('MUT_MODEL'),
        help='Model name/identifier to test (default: from .env MUT_MODEL)'
    )

    parser.add_argument(
        '--test-suite',
        default=os.getenv('TEST_SUITE', 'test_suite.yaml'),
        help='Path to test suite YAML file (default: from .env TEST_SUITE or test_suite.yaml)'
    )

    parser.add_argument(
        '--output-dir',
        default=os.getenv('OUTPUT_DIR', 'results'),
        help='Directory to save results (default: from .env OUTPUT_DIR or results)'
    )

    parser.add_argument(
        '--category',
        default=os.getenv('FILTER_CATEGORY'),
        help='Filter tests by category (default: from .env FILTER_CATEGORY)'
    )

    parser.add_argument(
        '--non-interactive',
        action='store_true',
        default=os.getenv('NON_INTERACTIVE', '').lower() in ('true', '1', 'yes'),
        help='Run in non-interactive mode with automated evaluation (default: from .env NON_INTERACTIVE)'
    )

    parser.add_argument(
        '--evaluator-endpoint',
        default=os.getenv('EVALUATOR_ENDPOINT'),
        help='API endpoint for evaluator model (required for non-interactive mode, default: from .env EVALUATOR_ENDPOINT)'
    )

    parser.add_argument(
        '--evaluator-api-key',
        default=os.getenv('EVALUATOR_API_KEY', ''),
        help='API key for evaluator (default: from .env EVALUATOR_API_KEY)'
    )

    parser.add_argument(
        '--evaluator-model',
        default=os.getenv('EVALUATOR_MODEL'),
        help='Model name for evaluator (required for non-interactive mode, default: from .env EVALUATOR_MODEL)'
    )

    parser.add_argument(
        '--evaluator-temperature',
        type=float,
        default=float(os.getenv('EVALUATOR_TEMPERATURE', '0.3')),
        help='Temperature for evaluator model (default: from .env EVALUATOR_TEMPERATURE or 0.3)'
    )

    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Test API connectivity without running the full test suite'
    )

    args = parser.parse_args()

    # Validate required arguments
    if not args.endpoint:
        print("❌ Error: --endpoint is required (or set MUT_ENDPOINT in .env)")
        print(" Example: --endpoint http://localhost:11434")
        sys.exit(1)

    if not args.model:
        print("❌ Error: --model is required (or set MUT_MODEL in .env)")
        print(" Example: --model qwen3:4b-q4_K_M")
        sys.exit(1)

    if args.non_interactive:
        if not args.evaluator_endpoint:
            print("❌ Error: --evaluator-endpoint is required for non-interactive mode")
            print(" (or set EVALUATOR_ENDPOINT in .env)")
            sys.exit(1)
        if not args.evaluator_model:
            print("❌ Error: --evaluator-model is required for non-interactive mode")
            print(" (or set EVALUATOR_MODEL in .env)")
            sys.exit(1)

    # Parse model list (supports comma-separated models)
    model_list = [m.strip() for m in args.model.split(',') if m.strip()]

    # Dry run mode - just test connections
    if args.dry_run:
        print(f"\n{'='*80}")
        print("🧪 DRY RUN MODE - Testing API Connectivity")
        print(f"{'='*80}")

        all_success = True

        # Test MUT endpoint for each model
        for idx, model_name in enumerate(model_list, 1):
            if len(model_list) > 1:
                print(f"\n--- Model {idx}/{len(model_list)} ---")

            tester = AIModelTester(
                endpoint=args.endpoint,
                api_key=args.api_key,
                model_name=model_name,
                output_dir=args.output_dir,
                non_interactive=args.non_interactive,
                evaluator_endpoint=args.evaluator_endpoint,
                evaluator_api_key=args.evaluator_api_key,
                evaluator_model=args.evaluator_model,
                evaluator_temperature=args.evaluator_temperature
            )

            success = tester.test_connection(
                endpoint=args.endpoint,
                api_key=args.api_key,
                model=model_name,
                endpoint_name="Model Under Test"
            )
            all_success = all_success and success

        # Test evaluator endpoint if non-interactive mode
        if args.non_interactive and args.evaluator_endpoint and args.evaluator_model:
            print(f"\n{'='*80}")
            tester = AIModelTester(
                endpoint=args.endpoint,
                api_key=args.api_key,
                model_name=model_list[0],
                output_dir=args.output_dir,
                non_interactive=args.non_interactive,
                evaluator_endpoint=args.evaluator_endpoint,
                evaluator_api_key=args.evaluator_api_key,
                evaluator_model=args.evaluator_model,
                evaluator_temperature=args.evaluator_temperature
            )

            success = tester.test_connection(
                endpoint=args.evaluator_endpoint,
                api_key=args.evaluator_api_key,
                model=args.evaluator_model,
                endpoint_name="Evaluator"
            )
            all_success = all_success and success

        print(f"\n{'='*80}")
        if all_success:
            print("✅ All connectivity tests passed")
            print(f"{'='*80}")
            sys.exit(0)
        else:
            print("❌ Some connectivity tests failed")
            print(f"{'='*80}")
            sys.exit(1)

    if len(model_list) > 1:
        print(f"\n🔄 Batch mode: Testing {len(model_list)} models")
        print("=" * 80)

    # Test each model
    for idx, model_name in enumerate(model_list, 1):
        if len(model_list) > 1:
            print(f"\n{'='*80}")
            print(f"📊 Model {idx}/{len(model_list)}: {model_name}")
            print(f"{'='*80}\n")

        # Initialize tester
        tester = AIModelTester(
            endpoint=args.endpoint,
            api_key=args.api_key,
            model_name=model_name,
            output_dir=args.output_dir,
            non_interactive=args.non_interactive,
            evaluator_endpoint=args.evaluator_endpoint,
            evaluator_api_key=args.evaluator_api_key,
            evaluator_model=args.evaluator_model,
            evaluator_temperature=args.evaluator_temperature
        )

        # Load test suite
        if idx == 1 or len(model_list) == 1:
            print(f"📁 Loading test suite from: {args.test_suite}")
            test_suite = tester.load_test_suite(args.test_suite)

        # Run tests
        try:
            tester.run_test_suite(test_suite, filter_category=args.category)
        except KeyboardInterrupt:
            print("\n\n⚠️ Test suite interrupted by user")
            tester.results['metadata']['test_end'] = datetime.now().isoformat()
            tester.save_results()
            print(f"\n💾 Partial results saved to: {tester.output_dir}")
            if len(model_list) > 1 and idx < len(model_list):
                print(f"\n⚠️ Skipping remaining {len(model_list) - idx} models")
            sys.exit(1)

    if len(model_list) > 1:
        print(f"\n{'='*80}")
        print(f"✅ BATCH COMPLETE: Tested {len(model_list)} models")
        print(f"{'='*80}")
        print(f"\n💾 Results saved to: {args.output_dir}/")
        print("\nTo compare results, run:")
        print("  python analyze_results.py --compare")


if __name__ == "__main__":
    main()