# AI Model Evaluation Configuration
# Copy this file to .env and fill in your values

# =============================================================================
# MODEL UNDER TEST (MUT) - The model being evaluated
# =============================================================================

# OpenAI-compatible API endpoint for the model under test
MUT_ENDPOINT=http://localhost:11434

# API key for the model under test (optional for local endpoints like Ollama)
MUT_API_KEY=

# Model name/identifier to test
# Supports multiple models separated by commas for batch testing:
# MUT_MODEL=qwen3:4b-q4_K_M,qwen3:4b-q8_0,qwen3:4b-fp16,qwen3:8b-q4_K_M
# Or specify a single model:
MUT_MODEL=qwen3:4b-q4_K_M

# =============================================================================
# EVALUATOR API - Used for non-interactive mode to automatically score responses
# =============================================================================

# OpenAI-compatible API endpoint for the evaluator model
EVALUATOR_ENDPOINT=http://localhost:11434

# API key for the evaluator API
EVALUATOR_API_KEY=

# Evaluator model name (should be a capable model for evaluation tasks)
EVALUATOR_MODEL=qwen3:14b

# Temperature for evaluator (lower = more consistent scoring)
EVALUATOR_TEMPERATURE=0.3

# =============================================================================
# TEST CONFIGURATION
# =============================================================================

# Path to test suite YAML file
TEST_SUITE=test_suite.yaml

# Output directory for results
OUTPUT_DIR=results

# Filter tests by category (optional, leave empty for all categories)
FILTER_CATEGORY=

# =============================================================================
# EXECUTION MODE
# =============================================================================

# Run in non-interactive mode (true/false)
# When true, uses EVALUATOR_* settings for automated scoring
# When false, prompts user for manual evaluation
NON_INTERACTIVE=false