# llm-eval-forensics/.env.example
# (source listing metadata: 2026-01-16 12:48:56 +01:00, 53 lines, 2.0 KiB, Plaintext)
# AI Model Evaluation Configuration
# Copy this file to .env and fill in your values
# =============================================================================
# MODEL UNDER TEST (MUT) - The model being evaluated
# =============================================================================
# OpenAI-compatible API endpoint for the model under test
MUT_ENDPOINT=http://localhost:11434
# API key for the model under test (optional for local endpoints like Ollama)
MUT_API_KEY=
# Model name/identifier to test
# Supports multiple models separated by commas for batch testing:
# MUT_MODEL=qwen3:4b-q4_K_M,qwen3:4b-q8_0,qwen3:4b-fp16,qwen3:8b-q4_K_M
# Or specify a single model:
MUT_MODEL=qwen3:4b-q4_K_M
# =============================================================================
# EVALUATOR API - Used for non-interactive mode to automatically score responses
# =============================================================================
# OpenAI-compatible API endpoint for the evaluator model
EVALUATOR_ENDPOINT=http://localhost:11434
# API key for the evaluator API
EVALUATOR_API_KEY=
# Evaluator model name (should be a capable model for evaluation tasks)
EVALUATOR_MODEL=qwen3:14b
# Temperature for evaluator (lower = more consistent scoring)
EVALUATOR_TEMPERATURE=0.3
# =============================================================================
# TEST CONFIGURATION
# =============================================================================
# Path to test suite YAML file
TEST_SUITE=test_suite.yaml
# Output directory for results
OUTPUT_DIR=results
# Filter tests by category (optional, leave empty for all categories)
FILTER_CATEGORY=
# =============================================================================
# EXECUTION MODE
# =============================================================================
# Run in non-interactive mode (true/false)
# When true, uses EVALUATOR_* settings for automated scoring
# When false, prompts user for manual evaluation
NON_INTERACTIVE=false