improvements

2026-01-16 12:48:56 +01:00
parent 514bd9b571
commit 345aa419c7
9 changed files with 3966 additions and 204 deletions
--- a/.env.example
+++ b/.env.example
@@ -0,0 +1,52 @@
+# AI Model Evaluation Configuration
+# Copy this file to .env and fill in your values
+
+# =============================================================================
+# MODEL UNDER TEST (MUT) - The model being evaluated
+# =============================================================================
+# OpenAI-compatible API endpoint for the model under test
+MUT_ENDPOINT=http://localhost:11434
+
+# API key for the model under test (optional for local endpoints like Ollama)
+MUT_API_KEY=
+
+# Model name/identifier to test
+# Supports multiple models separated by commas for batch testing:
+# MUT_MODEL=qwen3:4b-q4_K_M,qwen3:4b-q8_0,qwen3:4b-fp16,qwen3:8b-q4_K_M
+# Or specify a single model:
+MUT_MODEL=qwen3:4b-q4_K_M
+
+# =============================================================================
+# EVALUATOR API - Used when NON_INTERACTIVE=true to score responses automatically
+# =============================================================================
+# OpenAI-compatible API endpoint for the evaluator model
+EVALUATOR_ENDPOINT=http://localhost:11434
+
+# API key for the evaluator API (optional for local endpoints like Ollama)
+EVALUATOR_API_KEY=
+
+# Evaluator model name (should be a capable model for evaluation tasks)
+EVALUATOR_MODEL=qwen3:14b
+
+# Temperature for evaluator (lower = more consistent scoring)
+EVALUATOR_TEMPERATURE=0.3
+
+# =============================================================================
+# TEST CONFIGURATION
+# =============================================================================
+# Path to test suite YAML file
+TEST_SUITE=test_suite.yaml
+
+# Output directory for results
+OUTPUT_DIR=results
+
+# Filter tests by category (optional, leave empty for all categories)
+FILTER_CATEGORY=
+
+# =============================================================================
+# EXECUTION MODE
+# =============================================================================
+# Run in non-interactive mode (true/false)
+# When true, uses EVALUATOR_* settings for automated scoring
+# When false, prompts user for manual evaluation
+NON_INTERACTIVE=false