From 5d8deb1e3cda47287b34ddbef1c9ee498a46833b Mon Sep 17 00:00:00 2001 From: mstoeck3 Date: Fri, 16 Jan 2026 15:24:34 +0100 Subject: [PATCH] improvements --- analyze_results.py | 16 +++++-- test_suite.yaml | 114 ++++++++++++++++++++++++++++----------------- 2 files changed, 84 insertions(+), 46 deletions(-) diff --git a/analyze_results.py b/analyze_results.py index df9bcb8..4904db3 100644 --- a/analyze_results.py +++ b/analyze_results.py @@ -487,11 +487,18 @@ class WebInterface: dimensions = { 'logical_reasoning': ['Logic & Reasoning'], 'mathematical_ability': ['Mathematics & Calculation'], - 'instruction_following': ['Instruction Following'], + 'instruction_following': ['Instruction Following', 'Multi-turn: Instruction Following'], 'creativity': ['Creative Writing'], - 'technical_knowledge': ['Code Generation', 'IT Forensics'], - 'linguistic_nuance': ['Language Nuance'], - 'conversational_depth': ['Multi-turn Conversations'] + 'technical_knowledge': [ + 'Code Generation', + 'IT Forensics - File Systems', + 'IT Forensics - Registry & Artifacts', + 'IT Forensics - Memory & Network', + 'IT Forensics - Timeline & Log Analysis' + ], + 'linguistic_nuance': ['Language Nuance', 'Multilingual Competence'], + 'problem_solving': ['Problem Solving & Logistics'], + 'conversational_depth': ['Multi-turn: Context Retention'] } model_metrics = {} @@ -527,6 +534,7 @@ class WebInterface: 'creativity': 1.0, 'technical_knowledge': 1.4, 'linguistic_nuance': 1.1, + 'problem_solving': 1.4, 'conversational_depth': 1.0 } diff --git a/test_suite.yaml b/test_suite.yaml index 48e4562..bcda023 100644 --- a/test_suite.yaml +++ b/test_suite.yaml @@ -135,75 +135,105 @@ test_categories: - "Shows all intermediate steps" expected_difficulty: "very_hard" - # ========== INSTRUCTION FOLLOWING (4 tests) ========== + # ========== INSTRUCTION FOLLOWING (5 tests) ========== - category: "Instruction Following" tests: - id: "instr_01" - name: "Photosynthesis Constraints" + name: "Multi-Constraint Word 
Counting" type: "single_turn" - prompt: "Write exactly 3 sentences about photosynthesis. The first sentence must be exactly 8 words long. The second must contain the word 'chlorophyll'. The third must end with a question mark." - evaluation_criteria: - - "Exactly 3 sentences" - - "First sentence exactly 8 words" - - "Second contains 'chlorophyll'" - - "Third ends with '?'" - - "Content is accurate about photosynthesis" - expected_difficulty: "medium" - - - id: "instr_02" - name: "Quantum Entanglement Negative Constraints" - type: "single_turn" - prompt: "Summarize the concept of 'Quantum Entanglement' in exactly 4 sentences. 1) The first sentence must be exactly 12 words long. 2) You CANNOT use the words 'particle', 'physics', or 'Einstein' in any part of the response. 3) The third sentence must be a question. 4) The final word of the summary must be 'connected'." + prompt: "Write exactly 4 sentences about photosynthesis. The first sentence must be exactly 7 words. The second sentence must be exactly 11 words. The third sentence must contain the word 'chloroplast' but NOT the word 'plant'. The fourth sentence must be exactly 9 words and end with an exclamation mark." evaluation_criteria: - "Exactly 4 sentences" - - "First sentence exactly 12 words" - - "No forbidden words (particle, physics, Einstein)" - - "Third sentence is a question" - - "Ends with 'connected'" + - "Sentence 1: exactly 7 words" + - "Sentence 2: exactly 11 words" + - "Sentence 3: contains 'chloroplast', no 'plant'" + - "Sentence 4: exactly 9 words, ends with '!'" + - "Content is accurate about photosynthesis" expected_difficulty: "hard" + - id: "instr_02" + name: "Negative Constraints with Counting" + type: "single_turn" + prompt: "Explain neural networks in exactly 5 sentences. Requirements: 1) Total word count must be between 65-75 words. 2) You CANNOT use these words anywhere: 'brain', 'artificial', 'intelligence', 'AI', 'learning'. 3) Sentence 2 must be exactly 13 words. 
4) Sentence 4 must be a question. 5) Every sentence must contain at least one word with 10+ letters." + evaluation_criteria: + - "Exactly 5 sentences" + - "Total 65-75 words" + - "No forbidden words (brain, artificial, intelligence, AI, learning)" + - "Sentence 2: exactly 13 words" + - "Sentence 4: is a question" + - "Each sentence has 10+ letter word" + - "Technically accurate" + expected_difficulty: "very_hard" + - id: "instr_03" - name: "Acrostic Technical Explanation" + name: "Acrostic with Multiple Constraints" type: "single_turn" prompt: | - Write a 7-sentence explanation of how blockchain technology works. + Write a 6-sentence explanation of quantum computing. Constraints: - 1. The first letter of each sentence must spell out "SECURED" (S-E-C-U-R-E-D) - 2. Sentence 3 must contain exactly 15 words - 3. Sentence 5 must be a rhetorical question - 4. You cannot use the words "Bitcoin", "cryptocurrency", or "mining" - 5. The explanation must mention "consensus mechanism" at least once - 6. Total word count must be between 80-100 words + 1. The first letter of each sentence must spell "QUANTA" (Q-U-A-N-T-A) + 2. Do NOT use any markdown formatting (no **, __, *, etc.) + 3. Sentence 3 must contain exactly 14 words + 4. Sentence 5 must be a rhetorical question (ends with ?) + 5. You cannot use the words "supercomputer", "bit", or "IBM" + 6. Total word count must be between 70-85 words + 7. 
The word "superposition" must appear exactly once evaluation_criteria: - - "First letters spell SECURED" - - "Sentence 3 has exactly 15 words" + - "First letters spell QUANTA (no markdown tricks)" + - "No markdown formatting anywhere" + - "Sentence 3 has exactly 14 words" - "Sentence 5 is a rhetorical question" - "No forbidden words" - - "Contains 'consensus mechanism'" - - "Word count 80-100" + - "Word count 70-85" + - "'superposition' appears exactly once" - "Technically accurate" expected_difficulty: "very_hard" - id: "instr_04" - name: "Structured Data Extraction with Format" + name: "Structured Template Completion" type: "single_turn" prompt: | - Read this text and extract information in the EXACT format specified: + Extract information from this text and fill in the template below with the extracted values. + DO NOT modify the template structure - only replace the placeholder VALUES inside brackets. - "Dr. Maria Santos-Ferreira, aged 47, joined TechCorp Industries on March 15, 2019 as Chief Technology Officer. She previously worked at DataSystems Inc. for 12 years. Her annual salary is $425,000 with a 15% bonus structure. She holds patents US2018/0012345 and EU2020/9876543. Contact: msantos@techcorp.com, +1-555-0147." + Text: "Dr. Maria Santos-Ferreira, aged 47, joined TechCorp Industries on March 15, 2019 as Chief Technology Officer. She previously worked at DataSystems Inc. for 12 years. Her annual salary is $425,000 with a 15% bonus structure. She holds patents US2018/0012345 and EU2020/9876543. Contact: msantos@techcorp.com, +1-555-0147." 
- Output format (must match exactly, including brackets and pipes): - [NAME] | [AGE] | [COMPANY] | [ROLE] | [START_DATE:YYYY-MM-DD] | [PREV_EMPLOYER] | [PREV_YEARS] | [SALARY_USD] | [BONUS_%] | [PATENTS:semicolon-separated] | [EMAIL] | [PHONE] + Template (fill in values, keep all labels and pipes exactly as shown): + NAME=[value] | AGE=[value] | COMPANY=[value] | ROLE=[value] | START=[YYYY-MM-DD] | PREV_EMPLOYER=[value] | PREV_YEARS=[value] | SALARY_USD=[number] | BONUS_PCT=[number] | PATENTS=[semicolon-separated] | EMAIL=[value] | PHONE=[value] evaluation_criteria: - - "Exact format match with pipes and brackets" - - "Correct date format conversion (2019-03-15)" - - "Salary as number without $ or comma" - - "Bonus as number without %" + - "Template labels preserved exactly (NAME=, AGE=, etc.)" + - "All pipes | in correct positions" + - "Date format: 2019-03-15" + - "Salary as number: 425000 (no $ or comma)" + - "Bonus as number: 15 (no %)" - "Patents semicolon-separated" - - "All 12 fields present and correct" - expected_difficulty: "hard" + - "All 12 fields filled correctly" + expected_difficulty: "medium" + + - id: "instr_05" + name: "Paragraph Structure with Alternating Constraints" + type: "single_turn" + prompt: | + Write a 3-paragraph explanation of how GPS works. + + Constraints: + 1. Paragraph 1: Must be exactly 3 sentences, first sentence exactly 10 words + 2. Paragraph 2: Must be exactly 4 sentences, must contain the word 'triangulation' but NOT 'satellite' + 3. Paragraph 3: Must be exactly 2 sentences, both sentences must be questions + 4. Total word count for entire response: 95-110 words + 5. You cannot use the words 'Google', 'phone', or 'navigation' + 6. 
Each paragraph must contain at least one number or numerical word (e.g., '3', 'three') + evaluation_criteria: + - "Paragraph 1: 3 sentences, first is 10 words" + - "Paragraph 2: 4 sentences, has 'triangulation', no 'satellite'" + - "Paragraph 3: 2 sentences, both questions" + - "Total 95-110 words" + - "No forbidden words" + - "Each paragraph has a number or numerical word" + - "Technically accurate" + expected_difficulty: "very_hard" # ========== CREATIVE WRITING (4 tests - added harder variants) ==========