improvements

This commit is contained in:
2026-01-16 15:24:34 +01:00
parent 345aa419c7
commit 5d8deb1e3c
2 changed files with 84 additions and 46 deletions

View File

@@ -487,11 +487,18 @@ class WebInterface:
dimensions = { dimensions = {
'logical_reasoning': ['Logic & Reasoning'], 'logical_reasoning': ['Logic & Reasoning'],
'mathematical_ability': ['Mathematics & Calculation'], 'mathematical_ability': ['Mathematics & Calculation'],
'instruction_following': ['Instruction Following'], 'instruction_following': ['Instruction Following', 'Multi-turn: Instruction Following'],
'creativity': ['Creative Writing'], 'creativity': ['Creative Writing'],
'technical_knowledge': ['Code Generation', 'IT Forensics'], 'technical_knowledge': [
'linguistic_nuance': ['Language Nuance'], 'Code Generation',
'conversational_depth': ['Multi-turn Conversations'] 'IT Forensics - File Systems',
'IT Forensics - Registry & Artifacts',
'IT Forensics - Memory & Network',
'IT Forensics - Timeline & Log Analysis'
],
'linguistic_nuance': ['Language Nuance', 'Multilingual Competence'],
'problem_solving': ['Problem Solving & Logistics'],
'conversational_depth': ['Multi-turn: Context Retention']
} }
model_metrics = {} model_metrics = {}
@@ -527,6 +534,7 @@ class WebInterface:
'creativity': 1.0, 'creativity': 1.0,
'technical_knowledge': 1.4, 'technical_knowledge': 1.4,
'linguistic_nuance': 1.1, 'linguistic_nuance': 1.1,
'problem_solving': 1.4,
'conversational_depth': 1.0 'conversational_depth': 1.0
} }

View File

@@ -135,75 +135,105 @@ test_categories:
- "Shows all intermediate steps" - "Shows all intermediate steps"
expected_difficulty: "very_hard" expected_difficulty: "very_hard"
# ========== INSTRUCTION FOLLOWING (4 tests) ========== # ========== INSTRUCTION FOLLOWING (5 tests) ==========
- category: "Instruction Following" - category: "Instruction Following"
tests: tests:
- id: "instr_01" - id: "instr_01"
name: "Photosynthesis Constraints" name: "Multi-Constraint Word Counting"
type: "single_turn" type: "single_turn"
prompt: "Write exactly 3 sentences about photosynthesis. The first sentence must be exactly 8 words long. The second must contain the word 'chlorophyll'. The third must end with a question mark." prompt: "Write exactly 4 sentences about photosynthesis. The first sentence must be exactly 7 words. The second sentence must be exactly 11 words. The third sentence must contain the word 'chloroplast' but NOT the word 'plant'. The fourth sentence must be exactly 9 words and end with an exclamation mark."
evaluation_criteria:
- "Exactly 3 sentences"
- "First sentence exactly 8 words"
- "Second contains 'chlorophyll'"
- "Third ends with '?'"
- "Content is accurate about photosynthesis"
expected_difficulty: "medium"
- id: "instr_02"
name: "Quantum Entanglement Negative Constraints"
type: "single_turn"
prompt: "Summarize the concept of 'Quantum Entanglement' in exactly 4 sentences. 1) The first sentence must be exactly 12 words long. 2) You CANNOT use the words 'particle', 'physics', or 'Einstein' in any part of the response. 3) The third sentence must be a question. 4) The final word of the summary must be 'connected'."
evaluation_criteria: evaluation_criteria:
- "Exactly 4 sentences" - "Exactly 4 sentences"
- "First sentence exactly 12 words" - "Sentence 1: exactly 7 words"
- "No forbidden words (particle, physics, Einstein)" - "Sentence 2: exactly 11 words"
- "Third sentence is a question" - "Sentence 3: contains 'chloroplast', no 'plant'"
- "Ends with 'connected'" - "Sentence 4: exactly 9 words, ends with '!'"
- "Content is accurate about photosynthesis"
expected_difficulty: "hard" expected_difficulty: "hard"
- id: "instr_02"
name: "Negative Constraints with Counting"
type: "single_turn"
prompt: "Explain neural networks in exactly 5 sentences. Requirements: 1) Total word count must be between 65-75 words. 2) You CANNOT use these words anywhere: 'brain', 'artificial', 'intelligence', 'AI', 'learning'. 3) Sentence 2 must be exactly 13 words. 4) Sentence 4 must be a question. 5) Every sentence must contain at least one word with 10+ letters."
evaluation_criteria:
- "Exactly 5 sentences"
- "Total 65-75 words"
- "No forbidden words (brain, artificial, intelligence, AI, learning)"
- "Sentence 2: exactly 13 words"
- "Sentence 4: is a question"
- "Each sentence has 10+ letter word"
- "Technically accurate"
expected_difficulty: "very_hard"
- id: "instr_03" - id: "instr_03"
name: "Acrostic Technical Explanation" name: "Acrostic with Multiple Constraints"
type: "single_turn" type: "single_turn"
prompt: | prompt: |
Write a 7-sentence explanation of how blockchain technology works. Write a 6-sentence explanation of quantum computing.
Constraints: Constraints:
1. The first letter of each sentence must spell out "SECURED" (S-E-C-U-R-E-D) 1. The first letter of each sentence must spell "QUANTA" (Q-U-A-N-T-A)
2. Sentence 3 must contain exactly 15 words 2. Do NOT use any markdown formatting (no **, __, *, etc.)
3. Sentence 5 must be a rhetorical question 3. Sentence 3 must contain exactly 14 words
4. You cannot use the words "Bitcoin", "cryptocurrency", or "mining" 4. Sentence 5 must be a rhetorical question (ends with ?)
5. The explanation must mention "consensus mechanism" at least once 5. You cannot use the words "supercomputer", "bit", or "IBM"
6. Total word count must be between 80-100 words 6. Total word count must be between 70-85 words
7. The word "superposition" must appear exactly once
evaluation_criteria: evaluation_criteria:
- "First letters spell SECURED" - "First letters spell QUANTA (no markdown tricks)"
- "Sentence 3 has exactly 15 words" - "No markdown formatting anywhere"
- "Sentence 3 has exactly 14 words"
- "Sentence 5 is a rhetorical question" - "Sentence 5 is a rhetorical question"
- "No forbidden words" - "No forbidden words"
- "Contains 'consensus mechanism'" - "Word count 70-85"
- "Word count 80-100" - "'superposition' appears exactly once"
- "Technically accurate" - "Technically accurate"
expected_difficulty: "very_hard" expected_difficulty: "very_hard"
- id: "instr_04" - id: "instr_04"
name: "Structured Data Extraction with Format" name: "Structured Template Completion"
type: "single_turn" type: "single_turn"
prompt: | prompt: |
Read this text and extract information in the EXACT format specified: Extract information from this text and fill in the template below with the extracted values.
DO NOT modify the template structure - only replace the placeholder VALUES inside brackets.
"Dr. Maria Santos-Ferreira, aged 47, joined TechCorp Industries on March 15, 2019 as Chief Technology Officer. She previously worked at DataSystems Inc. for 12 years. Her annual salary is $425,000 with a 15% bonus structure. She holds patents US2018/0012345 and EU2020/9876543. Contact: msantos@techcorp.com, +1-555-0147." Text: "Dr. Maria Santos-Ferreira, aged 47, joined TechCorp Industries on March 15, 2019 as Chief Technology Officer. She previously worked at DataSystems Inc. for 12 years. Her annual salary is $425,000 with a 15% bonus structure. She holds patents US2018/0012345 and EU2020/9876543. Contact: msantos@techcorp.com, +1-555-0147."
Output format (must match exactly, including brackets and pipes): Template (fill in values, keep all labels and pipes exactly as shown):
[NAME] | [AGE] | [COMPANY] | [ROLE] | [START_DATE:YYYY-MM-DD] | [PREV_EMPLOYER] | [PREV_YEARS] | [SALARY_USD] | [BONUS_%] | [PATENTS:semicolon-separated] | [EMAIL] | [PHONE] NAME=[value] | AGE=[value] | COMPANY=[value] | ROLE=[value] | START=[YYYY-MM-DD] | PREV_EMPLOYER=[value] | PREV_YEARS=[value] | SALARY_USD=[number] | BONUS_PCT=[number] | PATENTS=[semicolon-separated] | EMAIL=[value] | PHONE=[value]
evaluation_criteria: evaluation_criteria:
- "Exact format match with pipes and brackets" - "Template labels preserved exactly (NAME=, AGE=, etc.)"
- "Correct date format conversion (2019-03-15)" - "All pipes | in correct positions"
- "Salary as number without $ or comma" - "Date format: 2019-03-15"
- "Bonus as number without %" - "Salary as number: 425000 (no $ or comma)"
- "Bonus as number: 15 (no %)"
- "Patents semicolon-separated" - "Patents semicolon-separated"
- "All 12 fields present and correct" - "All 12 fields filled correctly"
expected_difficulty: "hard" expected_difficulty: "medium"
- id: "instr_05"
name: "Paragraph Structure with Alternating Constraints"
type: "single_turn"
prompt: |
Write a 3-paragraph explanation of how GPS works.
Constraints:
1. Paragraph 1: Must be exactly 3 sentences, first sentence exactly 10 words
2. Paragraph 2: Must be exactly 4 sentences, must contain the word 'triangulation' but NOT 'satellite'
3. Paragraph 3: Must be exactly 2 sentences, both sentences must be questions
4. Total word count for entire response: 95-110 words
5. You cannot use the words 'Google', 'phone', or 'navigation'
6. Each paragraph must contain at least one number or numerical word (e.g., 'three', 'twenty-four')
evaluation_criteria:
- "Paragraph 1: 3 sentences, first is 10 words"
- "Paragraph 2: 4 sentences, has 'triangulation', no 'satellite'"
- "Paragraph 3: 2 sentences, both questions"
- "Total 95-110 words"
- "No forbidden words"
- "Each paragraph has a number"
- "Technically accurate"
expected_difficulty: "very_hard"
# ========== CREATIVE WRITING (4 tests - added harder variants) ========== # ========== CREATIVE WRITING (4 tests - added harder variants) ==========