From 7697bb51ab23388201dd3b8f0dc45313f634cd95 Mon Sep 17 00:00:00 2001 From: overcuriousity Date: Sun, 18 Jan 2026 15:14:25 +0100 Subject: [PATCH] improvements --- ai_eval.py | 10 +- analyze_results.py | 16 +- test_suite-large.yaml | 1091 ++++++++++++++++++++++++++++++++++ test_suite.yaml | 1309 ++++++++++++----------------------------- 4 files changed, 1481 insertions(+), 945 deletions(-) create mode 100644 test_suite-large.yaml diff --git a/ai_eval.py b/ai_eval.py index 9f134c2..2accfc8 100644 --- a/ai_eval.py +++ b/ai_eval.py @@ -929,15 +929,21 @@ Do not include any text before or after the JSON object.""" def save_results(self): """Save results to JSON file""" + # Ensure output directory exists + self.output_dir.mkdir(parents=True, exist_ok=True) + + # Sanitize model name for use in filename (replace problematic characters) + safe_model_name = self.model_name.replace('/', '_').replace(':', '_') + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - filename = f"{self.model_name.replace(':', '_')}_{timestamp}.json" + filename = f"{safe_model_name}_{timestamp}.json" filepath = self.output_dir / filename with open(filepath, 'w', encoding='utf-8') as f: json.dump(self.results, f, indent=2, ensure_ascii=False) # Also save as "latest" for this model - latest_file = self.output_dir / f"{self.model_name.replace(':', '_')}_latest.json" + latest_file = self.output_dir / f"{safe_model_name}_latest.json" with open(latest_file, 'w', encoding='utf-8') as f: json.dump(self.results, f, indent=2, ensure_ascii=False) diff --git a/analyze_results.py b/analyze_results.py index cb01f4c..5568ed6 100644 --- a/analyze_results.py +++ b/analyze_results.py @@ -507,18 +507,14 @@ class WebInterface: dimensions = { 'logical_reasoning': ['Logic & Reasoning'], 'mathematical_ability': ['Mathematics & Calculation'], - 'instruction_following': ['Instruction Following', 'Multi-turn: Instruction Following'], - 'creativity': ['Creative Writing'], + 'instruction_following': ['Instruction Following 
with Constraints'], + 'creativity': ['Creative Writing with Constraints'], 'technical_knowledge': [ - 'Code Generation', - 'IT Forensics - File Systems', - 'IT Forensics - Registry & Artifacts', - 'IT Forensics - Memory & Network', - 'IT Forensics - Timeline & Log Analysis' + 'Code Generation - Advanced', + 'Digital Forensics & Binary Analysis' ], - 'linguistic_nuance': ['Language Nuance', 'Multilingual Competence'], - 'problem_solving': ['Problem Solving & Logistics'], - 'conversational_depth': ['Multi-turn: Context Retention'] + 'linguistic_nuance': ['Language Understanding & Pragmatics'], + 'conversational_depth': ['Multi-turn Context & Instruction Handling'] } model_metrics = {} diff --git a/test_suite-large.yaml b/test_suite-large.yaml new file mode 100644 index 0000000..bcda023 --- /dev/null +++ b/test_suite-large.yaml @@ -0,0 +1,1091 @@ +# AI Model Evaluation Test Suite - Enhanced Version +# Based on performance analysis of gemma3:4b-it-qat results +# Strengthened tests in categories where model performed too well +# Added multilingual challenges + +metadata: + version: "2.0" + author: "AI Evaluation Framework" + changes_from_v1: + - "Added harder variants for Creative Writing, Language Nuance, Code Generation" + - "Added Multilingual category with 4 tests" + - "Ensured minimum 3 tests per category at varying difficulties" + - "Strengthened instruction-following constraints" + focus_areas: + - Logic & Reasoning + - Mathematics & Calculation + - Instruction Following + - Creative Writing + - Code Generation + - Language Nuance + - Problem Solving & Logistics + - IT Forensics + - Multilingual Competence + - Multi-turn Conversations + +scoring_rubric: + fail: + score: 0-1 + description: "Major errors, fails to meet basic requirements" + pass: + score: 2-3 + description: "Meets requirements with minor issues" + exceptional: + score: 4-5 + description: "Exceeds requirements, demonstrates deep understanding" + +test_categories: + + # ========== LOGIC & REASONING 
(3 tests) ========== + + - category: "Logic & Reasoning" + tests: + - id: "logic_01" + name: "Family Logic Puzzle" + type: "single_turn" + prompt: "Three doctors said that Bill is their brother. Bill says he has no brothers. How many brothers does Bill actually have?" + evaluation_criteria: + - "Correctly identifies that the three doctors are women (Bill's sisters)" + - "Answers: 0 brothers" + - "Explains the logical deduction" + expected_difficulty: "medium" + + - id: "logic_02" + name: "Temporal Reasoning" + type: "single_turn" + prompt: "If it was two hours ago, it would have been as long after 1:00 PM as it was before 1:00 PM today. What time is it now? Explain your deduction step-by-step." + evaluation_criteria: + - "Shows algebraic setup: (t-2) - 13:00 = 13:00 - (t-2)" + - "Correct answer: 3:00 PM (15:00)" + - "Clear step-by-step reasoning" + expected_difficulty: "hard" + + - id: "logic_03" + name: "Multi-Constraint Deduction" + type: "single_turn" + prompt: | + Five houses in a row are painted different colors. Their owners are from different countries, drink different beverages, smoke different brands, and keep different pets. + + Facts: + 1. The Brit lives in the red house. + 2. The Swede keeps dogs. + 3. The Dane drinks tea. + 4. The green house is immediately to the left of the white house. + 5. The owner of the green house drinks coffee. + 6. The person who smokes Pall Mall keeps birds. + 7. The owner of the yellow house smokes Dunhill. + 8. The person in the center house drinks milk. + 9. The Norwegian lives in the first house. + 10. The person who smokes Blend lives next to the one who keeps cats. + 11. The person who keeps horses lives next to the one who smokes Dunhill. + 12. The person who smokes Blue Master drinks beer. + 13. The German smokes Prince. + 14. The Norwegian lives next to the blue house. + 15. The person who smokes Blend has a neighbor who drinks water. + + Who owns the fish? 
+ evaluation_criteria: + - "Systematically works through constraints" + - "Correctly identifies the German owns the fish" + - "Shows logical deduction process" + - "Handles constraint propagation correctly" + expected_difficulty: "very_hard" + + # ========== MATHEMATICS & CALCULATION (3 tests) ========== + + - category: "Mathematics & Calculation" + tests: + - id: "math_01" + name: "Average Speed with Stop" + type: "single_turn" + prompt: "If a train travels 240 miles in 3 hours, then stops for 45 minutes, then travels another 180 miles in 2 hours, what is the average speed for the entire journey including the stop?" + evaluation_criteria: + - "Total distance: 420 miles" + - "Total time: 5.75 hours" + - "Average speed: 73.04 mph (approximately)" + - "Shows calculation steps" + expected_difficulty: "medium" + + - id: "math_02" + name: "Cross-System Fuel Calculation" + type: "single_turn" + prompt: "A vehicle consumes 8.5 liters of fuel for every 100 kilometers traveled. If the fuel tank holds 15 gallons, and the car has already traveled 120 miles starting from a full tank, how many kilometers of range are left? (Use: 1 gallon = 3.785 liters; 1 mile = 1.609 km)." + evaluation_criteria: + - "Correct unit conversions (gallons to liters, miles to km)" + - "Accurate fuel consumption calculation" + - "Remaining range calculation: approximately 475 km" + - "Shows intermediate steps" + expected_difficulty: "hard" + + - id: "math_03" + name: "Compound Interest with Variable Rates and Withdrawals" + type: "single_turn" + prompt: | + An investment account starts with $10,000. The following occurs: + - Year 1: 5% annual interest, compounded quarterly + - Year 2: 4.5% annual interest, compounded monthly, with a $500 withdrawal at the end of Q2 + - Year 3: 6% annual interest, compounded daily (assume 365 days), with a $1,000 deposit at the start of the year + + Calculate the final balance at the end of Year 3. 
Show all intermediate calculations with at least 2 decimal places precision. + evaluation_criteria: + - "Correct Year 1 calculation with quarterly compounding" + - "Correct Year 2 with monthly compounding and mid-year withdrawal" + - "Correct Year 3 with daily compounding and initial deposit" + - "Final answer approximately $12,190 (about $12,190.78; allow ±$2 for rounding)" + - "Shows all intermediate steps" + expected_difficulty: "very_hard" + + # ========== INSTRUCTION FOLLOWING (5 tests) ========== + + - category: "Instruction Following" + tests: + - id: "instr_01" + name: "Multi-Constraint Word Counting" + type: "single_turn" + prompt: "Write exactly 4 sentences about photosynthesis. The first sentence must be exactly 7 words. The second sentence must be exactly 11 words. The third sentence must contain the word 'chloroplast' but NOT the word 'plant'. The fourth sentence must be exactly 9 words and end with an exclamation mark." + evaluation_criteria: + - "Exactly 4 sentences" + - "Sentence 1: exactly 7 words" + - "Sentence 2: exactly 11 words" + - "Sentence 3: contains 'chloroplast', no 'plant'" + - "Sentence 4: exactly 9 words, ends with '!'" + - "Content is accurate about photosynthesis" + expected_difficulty: "hard" + + - id: "instr_02" + name: "Negative Constraints with Counting" + type: "single_turn" + prompt: "Explain neural networks in exactly 5 sentences. Requirements: 1) Total word count must be between 65-75 words. 2) You CANNOT use these words anywhere: 'brain', 'artificial', 'intelligence', 'AI', 'learning'. 3) Sentence 2 must be exactly 13 words. 4) Sentence 4 must be a question. 5) Every sentence must contain at least one word with 10+ letters." 
+ evaluation_criteria: + - "Exactly 5 sentences" + - "Total 65-75 words" + - "No forbidden words (brain, artificial, intelligence, AI, learning)" + - "Sentence 2: exactly 13 words" + - "Sentence 4: is a question" + - "Each sentence has 10+ letter word" + - "Technically accurate" + expected_difficulty: "very_hard" + + - id: "instr_03" + name: "Acrostic with Multiple Constraints" + type: "single_turn" + prompt: | + Write a 6-sentence explanation of quantum computing. + + Constraints: + 1. The first letter of each sentence must spell "QUANTA" (Q-U-A-N-T-A) + 2. Do NOT use any markdown formatting (no **, __, *, etc.) + 3. Sentence 3 must contain exactly 14 words + 4. Sentence 5 must be a rhetorical question (ends with ?) + 5. You cannot use the words "supercomputer", "bit", or "IBM" + 6. Total word count must be between 70-85 words + 7. The word "superposition" must appear exactly once + evaluation_criteria: + - "First letters spell QUANTA (no markdown tricks)" + - "No markdown formatting anywhere" + - "Sentence 3 has exactly 14 words" + - "Sentence 5 is a rhetorical question" + - "No forbidden words" + - "Word count 70-85" + - "'superposition' appears exactly once" + - "Technically accurate" + expected_difficulty: "very_hard" + + - id: "instr_04" + name: "Structured Template Completion" + type: "single_turn" + prompt: | + Extract information from this text and fill in the template below with the extracted values. + DO NOT modify the template structure - only replace the placeholder VALUES inside brackets. + + Text: "Dr. Maria Santos-Ferreira, aged 47, joined TechCorp Industries on March 15, 2019 as Chief Technology Officer. She previously worked at DataSystems Inc. for 12 years. Her annual salary is $425,000 with a 15% bonus structure. She holds patents US2018/0012345 and EU2020/9876543. Contact: msantos@techcorp.com, +1-555-0147." 
+ + Template (fill in values, keep all labels and pipes exactly as shown): + NAME=[value] | AGE=[value] | COMPANY=[value] | ROLE=[value] | START=[YYYY-MM-DD] | PREV_EMPLOYER=[value] | PREV_YEARS=[value] | SALARY_USD=[number] | BONUS_PCT=[number] | PATENTS=[semicolon-separated] | EMAIL=[value] | PHONE=[value] + evaluation_criteria: + - "Template labels preserved exactly (NAME=, AGE=, etc.)" + - "All pipes | in correct positions" + - "Date format: 2019-03-15" + - "Salary as number: 425000 (no $ or comma)" + - "Bonus as number: 15 (no %)" + - "Patents semicolon-separated" + - "All 12 fields filled correctly" + expected_difficulty: "medium" + + - id: "instr_05" + name: "Paragraph Structure with Alternating Constraints" + type: "single_turn" + prompt: | + Write a 3-paragraph explanation of how GPS works. + + Constraints: + 1. Paragraph 1: Must be exactly 3 sentences, first sentence exactly 10 words + 2. Paragraph 2: Must be exactly 4 sentences, must contain the word 'triangulation' but NOT 'satellite' + 3. Paragraph 3: Must be exactly 2 sentences, both sentences must be questions + 4. Total word count for entire response: 95-110 words + 5. You cannot use the words 'Google', 'phone', or 'navigation' + 6. Each paragraph must contain at least one number or numerical word (e.g., 'three', 'multiple') + evaluation_criteria: + - "Paragraph 1: 3 sentences, first is 10 words" + - "Paragraph 2: 4 sentences, has 'triangulation', no 'satellite'" + - "Paragraph 3: 2 sentences, both questions" + - "Total 95-110 words" + - "No forbidden words" + - "Each paragraph has a number" + - "Technically accurate" + expected_difficulty: "very_hard" + + # ========== CREATIVE WRITING (4 tests - added harder variants) ========== + + - category: "Creative Writing" + tests: + - id: "creative_01" + name: "Lighthouse Keeper Story" + type: "single_turn" + prompt: "Write a two-paragraph story about a lighthouse keeper who discovers something unusual. Use vivid sensory details." 
+ evaluation_criteria: + - "Exactly 2 paragraphs" + - "Vivid sensory details (sight, sound, smell, touch, taste)" + - "Coherent narrative" + - "Creative and engaging" + expected_difficulty: "medium" + + - id: "creative_02" + name: "Victorian Greenhouse with Constraints" + type: "single_turn" + prompt: "Write a two-paragraph scene of a person entering an abandoned Victorian greenhouse in the middle of a blizzard. Use the 'Show, Don't Tell' technique. You must include at least one metaphor involving glass and one simile involving ghosts. Do not use the words 'cold', 'scary', or 'old'." + evaluation_criteria: + - "Two paragraphs" + - "Shows rather than tells" + - "Contains glass metaphor" + - "Contains ghost simile" + - "No forbidden words (cold, scary, old)" + - "Atmospheric and evocative" + expected_difficulty: "hard" + + - id: "creative_03" + name: "Unreliable Narrator Technical Document" + type: "single_turn" + prompt: | + Write a 3-paragraph product manual excerpt for a "Time Displacement Device" from the perspective of an unreliable narrator who is clearly lying or delusional, but the text must still function as a technically coherent manual. + + Requirements: + 1. Include at least 3 numbered safety warnings that are subtly absurd but grammatically serious + 2. The narrator must contradict themselves at least twice + 3. Include one footnote that undermines the main text + 4. Do not use exclamation marks anywhere + 5. Maintain formal technical writing style throughout + 6. 
Do not explicitly state the narrator is unreliable + evaluation_criteria: + - "3 paragraphs" + - "3+ numbered safety warnings (absurd but formal)" + - "At least 2 self-contradictions" + - "Footnote that undermines text" + - "No exclamation marks" + - "Formal technical style maintained" + - "Unreliability shown not told" + expected_difficulty: "very_hard" + + - id: "creative_04" + name: "Reverse Chronology Micro-Fiction" + type: "single_turn" + prompt: | + Write a complete 5-sentence story told in reverse chronological order (last event first, first event last). The story must be about a scientist making a discovery. + + Additional constraints: + - Each sentence must be from a different point in time (clearly distinguishable) + - The true meaning of the story should only become clear when you reach the "first" event (last sentence) + - Include at least one piece of dialogue + - The word count must be exactly 75 words (not 74, not 76) + evaluation_criteria: + - "Exactly 5 sentences" + - "Clear reverse chronological order" + - "About a scientist's discovery" + - "Each sentence distinct time point" + - "Meaning emerges at end" + - "Contains dialogue" + - "Exactly 75 words" + expected_difficulty: "very_hard" + + # ========== CODE GENERATION (4 tests) ========== + + - category: "Code Generation" + tests: + - id: "code_01" + name: "Duplicate Filter Function" + type: "single_turn" + prompt: "Write a Python function that takes a list of integers and returns a new list containing only the numbers that appear exactly twice in the original list. Include example usage." 
+ evaluation_criteria: + - "Syntactically correct Python" + - "Correctly identifies duplicates appearing exactly twice" + - "Includes example usage" + - "Handles edge cases" + expected_difficulty: "medium" + + - id: "code_02" + name: "Weight Converter with Error Handling" + type: "single_turn" + prompt: "Write a Python function `process_measurements` that takes a list of strings representing weights (e.g., '5kg', '12lb', '300g'). The function should convert all weights to grams, filter out any values that exceed 5 kilograms, and return the average of the remaining values. Include try-except blocks for malformed strings and provide three test cases: one with metric, one with imperial, and one with a 'corrupted' string." + evaluation_criteria: + - "Correct parsing of weight strings" + - "Accurate unit conversions (kg, lb, g to grams)" + - "Proper filtering (> 5kg excluded)" + - "Robust error handling" + - "Three distinct test cases provided" + expected_difficulty: "hard" + + - id: "code_03" + name: "Concurrent Rate Limiter" + type: "single_turn" + prompt: | + Write a Python class `RateLimiter` that implements a token bucket rate limiter with the following requirements: + + 1. Constructor takes `rate` (tokens per second) and `capacity` (max tokens) + 2. Method `acquire(tokens=1)` that returns True if tokens available, False otherwise + 3. Method `wait_and_acquire(tokens=1)` that blocks until tokens are available (use asyncio) + 4. Must be thread-safe for the synchronous `acquire` method + 5. 
Include a method `get_available_tokens()` that returns current token count + + Provide a complete implementation with: + - Proper time-based token replenishment + - A test demonstrating both sync and async usage + - Handle edge case where requested tokens > capacity + evaluation_criteria: + - "Correct token bucket algorithm" + - "Thread-safe synchronous acquire" + - "Working async wait_and_acquire" + - "Proper time-based replenishment" + - "Edge case handling" + - "Complete test code" + expected_difficulty: "very_hard" + + - id: "code_04" + name: "SQL Query Builder with Injection Prevention" + type: "single_turn" + prompt: | + Write a Python class `SafeQueryBuilder` that builds SELECT SQL queries with the following features: + + 1. Fluent interface: `builder.select('name', 'age').from_table('users').where('age', '>', 18).where('status', '=', 'active').order_by('name').limit(10).build()` + 2. Must prevent SQL injection - all values must be parameterized + 3. The `build()` method returns a tuple of (query_string, parameters_list) + 4. Support for: SELECT, FROM, WHERE (multiple), ORDER BY, LIMIT, OFFSET + 5. WHERE conditions can use: =, !=, >, <, >=, <=, LIKE, IN + + Show the output for a query that selects users where name LIKE '%john%' AND age IN (25, 30, 35) ordered by created_at DESC with limit 5. + evaluation_criteria: + - "Fluent interface pattern correct" + - "SQL injection prevention via parameterization" + - "Returns (query, params) tuple" + - "All operations supported" + - "WHERE with IN clause works" + - "Example output is correct and safe" + expected_difficulty: "hard" + + # ========== LANGUAGE NUANCE (4 tests - added harder variants) ========== + + - category: "Language Nuance" + tests: + - id: "nuance_01" + name: "Emphasis Shift Analysis" + type: "single_turn" + prompt: "Explain the difference in meaning when different words are emphasized in this sentence: 'I didn't say she stole the money'. Show how the meaning changes with emphasis on each word." 
+ evaluation_criteria: + - "Explains emphasis on 'I' (someone else said it)" + - "Explains emphasis on 'didn't' (denial)" + - "Explains emphasis on 'say' (implied it)" + - "Explains emphasis on 'she' (someone else did)" + - "Explains emphasis on 'stole' (obtained differently)" + - "Explains emphasis on 'money' (took something else)" + expected_difficulty: "medium" + + - id: "nuance_02" + name: "Professional Apology Analysis" + type: "single_turn" + prompt: "Compare the social implications and 'hidden' meanings of these three phrases when used in a professional workplace setting after a mistake: 1) 'I'm sorry if you feel that way.' 2) 'I apologize for the oversight.' 3) 'Mistakes were made on my end.' Explain which one is most likely to preserve professional authority vs. which one sounds like 'gaslighting'." + evaluation_criteria: + - "Identifies phrase 1 as potentially gaslighting" + - "Recognizes phrase 2 as genuine accountability" + - "Analyzes phrase 3 for passive voice implications" + - "Discusses power dynamics and authority" + - "Demonstrates understanding of pragmatics" + expected_difficulty: "hard" + + - id: "nuance_03" + name: "Register Shifting and Code-Switching" + type: "single_turn" + prompt: | + Rewrite the following message in FOUR different registers, maintaining the same core information but adjusting tone, vocabulary, and structure appropriately: + + Original: "The quarterly report shows we lost money because our main product didn't sell well and we spent too much on advertising." + + Rewrite for: + 1. A formal board presentation (C-suite executives) + 2. A casual Slack message to your team + 3. A legal disclosure document + 4. An email to a non-English speaking business partner (using simple, clear language) + + After the four rewrites, explain three specific linguistic changes you made for each register and why. 
+ evaluation_criteria: + - "Board version uses formal financial terminology" + - "Slack version uses casual/colloquial language appropriately" + - "Legal version uses hedging, passive voice, precise language" + - "Simple version avoids idioms and complex structures" + - "Identifies 3 specific changes per register" + - "Explanations demonstrate metalinguistic awareness" + expected_difficulty: "very_hard" + + - id: "nuance_04" + name: "Implicature and Presupposition Detection" + type: "single_turn" + prompt: | + Analyze the following dialogue for all implicatures, presuppositions, and indirect speech acts: + + A: "Have you finished the Anderson report yet?" + B: "I've been dealing with the server outage all morning." + A: "Right. Well, the client is flying in tomorrow." + B: "I noticed you CC'd the whole department on that email." + A: "Just keeping everyone in the loop." + + For each line, identify: + 1. What is directly stated (locution) + 2. What is implied but not stated (implicature) + 3. What is assumed to be true (presupposition) + 4. What action is being performed through speech (illocutionary force) + + Then explain the underlying conflict or tension this exchange reveals. + evaluation_criteria: + - "Correctly identifies B's implicature (excuse/reason for not finishing)" + - "Identifies A's implied criticism in 'Right. Well...'" + - "Recognizes B's counter-accusation in CC comment" + - "Identifies presuppositions (report exists, server outage occurred)" + - "Correctly labels illocutionary acts (request, excuse, threat, accusation)" + - "Explains underlying workplace tension/conflict" + expected_difficulty: "very_hard" + + # ========== PROBLEM SOLVING & LOGISTICS (3 tests) ========== + + - category: "Problem Solving & Logistics" + tests: + - id: "logistics_01" + name: "Water Jug Problem" + type: "single_turn" + prompt: "You have a 3-gallon jug and a 5-gallon jug. How can you measure exactly 4 gallons of water? Explain each step." 
+ evaluation_criteria: + - "Provides step-by-step solution" + - "Reaches exactly 4 gallons" + - "Logical sequence of pours" + - "Clear explanation" + expected_difficulty: "medium" + + - id: "logistics_02" + name: "Bridge Transport Optimization" + type: "single_turn" + prompt: "You need to transport 500 kilograms (approx. 1,102 lbs) of equipment across a bridge. The bridge has a strict limit of 150 kg per trip. You have three crates weighing 70 kg, 80 kg, and 120 kg, plus a variety of smaller 10 kg weights. However, the transport cart itself weighs 25 lbs. Calculate the minimum number of trips required and provide a loading manifest for each trip in both kilograms and pounds." + evaluation_criteria: + - "Converts cart weight to kg (≈11.34 kg)" + - "Accounts for cart weight in each trip" + - "Calculates effective capacity per trip" + - "Provides minimum number of trips" + - "Shows manifest in both kg and lbs" + - "Reaches exactly 500 kg total" + expected_difficulty: "very_hard" + + - id: "logistics_03" + name: "Resource Scheduling with Constraints" + type: "single_turn" + prompt: | + Schedule these 6 tasks across 3 workers (A, B, C) to minimize total completion time: + + Task 1: 2 hours, requires Worker A or B, must complete before Task 4 + Task 2: 3 hours, any worker, must complete before Task 5 + Task 3: 1 hour, requires Worker C only, no dependencies + Task 4: 2 hours, requires Worker B or C, depends on Task 1 + Task 5: 4 hours, requires Worker A only, depends on Task 2 + Task 6: 2 hours, any worker, depends on Tasks 3 and 4 + + Provide: + 1. A timeline showing when each task starts and ends + 2. Which worker does each task + 3. The total completion time + 4. 
Explain why this is optimal (or near-optimal) + evaluation_criteria: + - "Respects all worker constraints" + - "Respects all dependencies" + - "Provides clear timeline" + - "Achieves reasonable completion time (optimal is 7 hours; ≤9 hours is acceptable)" + - "Explains optimization reasoning" + expected_difficulty: "hard" + + # ========== IT FORENSICS - FILE SYSTEMS (3 tests) ========== + + - category: "IT Forensics - File Systems" + tests: + - id: "forensics_mft_01" + name: "MFT Entry Analysis - Basic" + type: "single_turn" + prompt: | + Analyze this hex dump from an NTFS Master File Table (MFT) entry and answer: + 1) What is the signature of this MFT entry? + 2) Is this entry in use or deleted? + 3) What is the sequence number? + + Hex dump (first 48 bytes of MFT entry): + + Offset(h) 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F + 00000000 46 49 4C 45 30 00 03 00 95 1F 23 00 00 00 00 00 + 00000010 01 00 01 00 38 00 01 00 A0 01 00 00 00 04 00 00 + 00000020 00 00 00 00 00 00 00 00 06 00 00 00 00 00 00 00 + evaluation_criteria: + - "Identifies signature as 'FILE' (46 49 4C 45)" + - "Recognizes entry is in use (based on flags at offset 0x16)" + - "Correctly reads sequence number from offset 0x10" + - "Shows understanding of little-endian byte order" + - "Explains reasoning with offset references" + expected_difficulty: "hard" + + - id: "forensics_mft_02" + name: "MFT Entry Analysis - Advanced" + type: "single_turn" + prompt: | + Analyze this complete MFT entry header and extract key metadata: + + Offset(h) 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F + 00000000 46 49 4C 45 30 00 03 00 EA 3F 00 00 00 00 00 00 + 00000010 01 00 01 00 38 00 01 00 68 01 00 00 00 04 00 00 + 00000020 00 00 00 00 00 00 00 00 04 00 00 00 05 00 00 00 + 00000030 2A 00 00 00 00 00 00 00 10 00 00 00 60 00 00 00 + + Questions: + 1) What is the update sequence array offset? + 2) What is the update sequence array size? + 3) What is the $LogFile sequence number (LSN)? + 4) What is the offset to the first attribute? 
+ 5) What are the MFT entry flags (in use/directory)? + evaluation_criteria: + - "Identifies USA offset (0x0030 at offset 0x04-0x05)" + - "Identifies USA size (0x0003 at offset 0x06-0x07)" + - "Reads LSN correctly (0x00003FEA, little-endian)" + - "Identifies first attribute offset (0x0038 at offset 0x14-0x15)" + - "Interprets flags correctly (offset 0x16-0x17)" + - "Demonstrates understanding of MFT structure" + expected_difficulty: "very_hard" + + - id: "forensics_signature_01" + name: "File Signature Identification" + type: "single_turn" + prompt: | + Identify the file types from these hex signatures and explain your reasoning: + + A) FF D8 FF E0 00 10 4A 46 49 46 + B) 50 4B 03 04 14 00 06 00 + C) 89 50 4E 47 0D 0A 1A 0A + D) 25 50 44 46 2D 31 2E 34 + E) 52 61 72 21 1A 07 00 + evaluation_criteria: + - "Correctly identifies A as JPEG (FF D8 FF + JFIF)" + - "Identifies B as ZIP/PKZip (PK headers)" + - "Identifies C as PNG (\\x89PNG)" + - "Identifies D as PDF (%PDF-1.4)" + - "Identifies E as RAR archive" + - "Explains significance of magic numbers" + expected_difficulty: "medium" + + # ========== IT FORENSICS - REGISTRY & ARTIFACTS (3 tests) ========== + + - category: "IT Forensics - Registry & Artifacts" + tests: + - id: "forensics_registry_01" + name: "Windows Registry Hive Header" + type: "single_turn" + prompt: | + Analyze this Windows Registry hive header: + + Offset(h) 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F + 00000000 72 65 67 66 E6 07 00 00 E6 07 00 00 00 00 00 00 + 00000010 01 00 00 00 03 00 00 00 00 00 00 00 01 00 00 00 + + Questions: + 1) What is the registry hive signature? + 2) What are the primary and secondary sequence numbers? + 3) What is the hive format version? 
+ evaluation_criteria: + - "Identifies 'regf' signature (72 65 67 66)" + - "Reads primary sequence number (0x000007E6 = 2022)" + - "Reads secondary sequence number (same)" + - "Identifies format version or major version number" + - "Demonstrates knowledge of registry forensics" + expected_difficulty: "hard" + + - id: "forensics_timestamp_01" + name: "FILETIME Conversion" + type: "single_turn" + prompt: | + Convert these Windows FILETIME values to human-readable UTC timestamps: + + A) 01 D8 93 4B 7C F3 D9 01 (little-endian 64-bit value) + B) 00 80 3E D5 DE B1 9D 01 + + Explain your conversion methodology. (FILETIME = 100-nanosecond intervals since Jan 1, 1601 UTC) + evaluation_criteria: + - "Correctly reverses byte order (little-endian)" + - "Converts to decimal" + - "Applies FILETIME epoch (Jan 1, 1601)" + - "Provides reasonable timestamp or shows calculation method" + - "Explains conversion steps" + expected_difficulty: "very_hard" + + - id: "forensics_prefetch_01" + name: "Windows Prefetch Analysis" + type: "single_turn" + prompt: | + A Windows prefetch file is named: NOTEPAD.EXE-D4A5B5E5.pf + + Questions: + 1) What does the hash portion (D4A5B5E5) represent? + 2) If you found multiple prefetch files for the same executable with different hashes, what would that indicate? + 3) What forensically relevant information can typically be extracted from prefetch files? + 4) In which Windows versions is prefetch enabled by default, and where are these files stored? 
+ evaluation_criteria: + - "Hash represents file path (or explains path-based hashing)" + - "Different hashes = different paths/locations for same exe" + - "Lists: execution count, timestamps, loaded DLLs, files accessed" + - "Knows location (C:\\Windows\\Prefetch) and version availability" + - "Demonstrates practical forensic understanding" + expected_difficulty: "medium" + + # ========== IT FORENSICS - MEMORY & NETWORK (3 tests) ========== + + - category: "IT Forensics - Memory & Network" + tests: + - id: "forensics_memory_01" + name: "Memory Artifact Identification" + type: "single_turn" + prompt: | + You find this ASCII string in a memory dump at offset 0x1A4F3000: + + GET /admin/login.php HTTP/1.1 + Host: 192.168.1.100 + User-Agent: Mozilla/5.0 + Cookie: PHPSESSID=a3f7d8bc9e2a1d5c + + What artifacts can you extract and what do they tell you forensically? + evaluation_criteria: + - "Identifies HTTP GET request" + - "Extracts target URL/path (/admin/login.php)" + - "Identifies target host IP" + - "Recognizes session cookie (PHPSESSID)" + - "Discusses forensic significance (web access, authentication attempt)" + - "Mentions potential for timeline reconstruction" + expected_difficulty: "medium" + + - id: "forensics_network_01" + name: "TCP Header Analysis" + type: "single_turn" + prompt: | + Analyze this TCP header (first 20 bytes): + + Offset(h) 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F + 00000000 C3 5E 01 BB 6B 8B 9C 41 00 00 00 00 50 02 20 00 + 00000010 E6 A1 00 00 + + Extract: + 1) Source port + 2) Destination port + 3) Sequence number + 4) TCP flags (which flags are set?) 
+ 5) Window size + evaluation_criteria: + - "Source port: 0xC35E = 50014" + - "Dest port: 0x01BB = 443 (HTTPS)" + - "Sequence: 0x6B8B9C41" + - "Flags: SYN flag set (0x02 in flags byte)" + - "Window: 0x2000 = 8192" + - "Shows understanding of TCP header structure" + expected_difficulty: "hard" + + - id: "forensics_pcap_01" + name: "PCAP Three-Way Handshake Analysis" + type: "single_turn" + prompt: | + Given these three TCP packets from a capture (simplified): + + Packet 1: 10.0.0.5:49152 -> 93.184.216.34:80, Flags=SYN, Seq=1000, Ack=0 + Packet 2: 93.184.216.34:80 -> 10.0.0.5:49152, Flags=SYN,ACK, Seq=5000, Ack=??? + Packet 3: 10.0.0.5:49152 -> 93.184.216.34:80, Flags=ACK, Seq=???, Ack=??? + + Questions: + 1) Fill in the missing Ack value for Packet 2 + 2) Fill in the missing Seq and Ack values for Packet 3 + 3) What is the client IP and what is the server IP? + 4) What service is likely being accessed? + 5) After this handshake, what sequence number will the client use for its first data byte? 
+ evaluation_criteria: + - "Packet 2 Ack = 1001" + - "Packet 3 Seq = 1001, Ack = 5001" + - "Client: 10.0.0.5, Server: 93.184.216.34" + - "Service: HTTP (port 80)" + - "First data byte seq = 1001" + - "Demonstrates understanding of TCP handshake mechanics" + expected_difficulty: "hard" + + # ========== IT FORENSICS - TIMELINE & LOG ANALYSIS (3 tests) ========== + + - category: "IT Forensics - Timeline & Log Analysis" + tests: + - id: "forensics_timeline_01" + name: "Event Reconstruction" + type: "single_turn" + prompt: | + Given these log entries, reconstruct the sequence of events and identify any anomalies: + + 2024-01-15 14:23:15 | User 'admin' login successful from 10.0.0.5 + 2024-01-15 14:23:47 | File access: /etc/passwd (read) by 'admin' + 2024-01-15 14:24:12 | File access: /var/www/upload/shell.php (write) by 'admin' + 2024-01-15 14:24:45 | New process: nc -l -p 4444 by 'admin' + 2024-01-15 14:25:01 | Network connection: 10.0.0.5:4444 <- 203.0.113.50:52341 + 2024-01-15 14:26:33 | User 'admin' logout + 2024-01-15 14:30:00 | Login attempt 'admin' from 203.0.113.50 FAILED + + What likely occurred here from a forensic perspective? 
+ evaluation_criteria: + - "Identifies initial legitimate admin login" + - "Recognizes suspicious file access pattern" + - "Identifies web shell upload (shell.php)" + - "Recognizes netcat listener setup" + - "Identifies inbound connection from external IP to the netcat listener (bind shell/backdoor on port 4444)" + - "Notes external IP attempting access" + - "Constructs coherent attack narrative" + - "Identifies this as potential compromise scenario" + expected_difficulty: "hard" + + - id: "forensics_timeline_02" + name: "Anti-Forensics Detection" + type: "single_turn" + prompt: | + Analyze these filesystem timestamps for a file 'financial_report.xlsx': + + - Created (crtime): 2024-03-15 09:30:00 + - Modified (mtime): 2024-03-14 16:45:00 + - Accessed (atime): 2024-03-15 10:00:00 + - Changed (ctime): 2024-03-15 09:30:00 + + And these additional artifacts: + - $MFT entry shows file created 2024-03-15 + - $UsnJrnl shows rename from 'temp_8x7k2.xlsx' to 'financial_report.xlsx' at 2024-03-15 09:30:00 + - $LogFile shows no entries for this file before 2024-03-15 + + What anomalies exist and what do they suggest about the file's history? 
+ evaluation_criteria: + - "Identifies mtime < crtime anomaly (impossible normally)" + - "Recognizes timestamp manipulation/timestomping" + - "Notes rename from suspicious temp filename" + - "Correlates $UsnJrnl rename evidence" + - "Understands ctime cannot be easily forged" + - "Suggests file was likely copied/moved with modified timestamps" + expected_difficulty: "very_hard" + + - id: "forensics_timeline_03" + name: "Windows Event Log Correlation" + type: "single_turn" + prompt: | + Correlate these Windows Event Log entries: + + Security Log: + - Event 4624 (Logon): User CORP\jdoe, Type 10 (RemoteInteractive), 2024-06-01 02:15:33, Source: 192.168.1.50 + - Event 4672 (Special Privileges): User CORP\jdoe, Privileges: SeDebugPrivilege, SeBackupPrivilege + - Event 4688 (Process Created): cmd.exe by CORP\jdoe, 02:16:01 + - Event 4688 (Process Created): powershell.exe by CORP\jdoe, 02:16:15, CommandLine: "-ep bypass -enc SQBFAFgA..." + + System Log: + - Event 7045 (Service Installed): "Windows Update Helper", 02:17:30 + + What type of attack pattern does this represent? What would be your next investigative steps? + evaluation_criteria: + - "Identifies RDP logon (Type 10)" + - "Recognizes privilege escalation indicators" + - "Identifies encoded PowerShell (likely malicious)" + - "Recognizes service installation for persistence" + - "Identifies late-night timing as suspicious" + - "Suggests checking service binary, decoding PowerShell, network logs" + expected_difficulty: "hard" + + # ========== MULTILINGUAL COMPETENCE (4 tests - NEW CATEGORY) ========== + + - category: "Multilingual Competence" + tests: + - id: "multilingual_01" + name: "Cross-Language Instruction Following" + type: "single_turn" + prompt: | + Follow these instructions, which are given in three different languages. Your response must address all three: + + English: Write one sentence explaining what machine learning is. 
+ Deutsch: Schreiben Sie einen Satz, der erklärt, warum maschinelles Lernen wichtig ist. + Español: Escriba una oración dando un ejemplo de aplicación del aprendizaje automático. + + Respond to each instruction in the language it was given. + evaluation_criteria: + - "English response is in English and accurate" + - "German response is in German and grammatically correct" + - "Spanish response is in Spanish and grammatically correct" + - "All three are topically coherent (about ML)" + - "Each is exactly one sentence" + expected_difficulty: "medium" + + - id: "multilingual_02" + name: "Translation with Technical Terminology Preservation" + type: "single_turn" + prompt: | + Translate the following technical paragraph into French and Japanese. Preserve technical terms that are commonly used untranslated in those languages (e.g., 'API' typically stays as 'API'). + + "The microservices architecture implements a RESTful API gateway that handles authentication via OAuth 2.0 tokens. The backend uses a Kubernetes cluster with horizontal pod autoscaling, while the database layer employs PostgreSQL with read replicas for improved throughput." + + After translating, list which technical terms you kept in English for each language and briefly explain why. + evaluation_criteria: + - "French translation is grammatically correct" + - "Japanese translation is grammatically correct" + - "Appropriate terms preserved (API, OAuth, Kubernetes, PostgreSQL)" + - "Explains rationale for preserved terms" + - "Technical meaning preserved accurately" + expected_difficulty: "hard" + + - id: "multilingual_03" + name: "Idiomatic Expression Cross-Mapping" + type: "single_turn" + prompt: | + For each of the following idiomatic expressions, provide: + 1. The literal translation + 2. The actual meaning + 3. 
An equivalent idiom in English (if the original isn't English) or in another language (if the original is English) + + A) German: "Da steppt der Bär" + B) Japanese: "猿も木から落ちる" (Saru mo ki kara ochiru) + C) English: "It's raining cats and dogs" + D) French: "Avoir le cafard" + E) Spanish: "Estar en las nubes" + + Then identify which two idioms from different languages express the most similar concept. + evaluation_criteria: + - "Correct literal translations for all 5" + - "Correct meanings for all 5" + - "Appropriate equivalent idioms provided" + - "Correctly identifies similar pair (e.g., B and 'even experts make mistakes')" + - "Demonstrates cross-cultural linguistic awareness" + expected_difficulty: "hard" + + - id: "multilingual_04" + name: "Code-Switched Dialogue Analysis" + type: "single_turn" + prompt: | + Analyze this code-switched dialogue (English-Spanish) for a sociolinguistic study: + + Speaker A: "Hey, did you finish el reporte for tomorrow's meeting?" + Speaker B: "Almost, pero I'm stuck on the financial projections. Es muy complicado." + Speaker A: "I can help you después del lunch. Mi expertise is in that area, you know." + Speaker B: "That would be great! Gracias. Oh, and el jefe wants us to present juntos." + Speaker A: "No problem. We'll knock it out del parque." + + Provide: + 1. Identify each instance of code-switching (word/phrase level) + 2. Categorize each switch as: insertion, alternation, or congruent lexicalization + 3. What social/professional context does this switching pattern suggest? + 4. Are there any grammatical "errors" in the switching, or does it follow typical bilingual patterns? 
+ evaluation_criteria: + - "Identifies all Spanish insertions correctly" + - "Correctly categorizes switch types" + - "Recognizes professional/casual bilingual workplace context" + - "Notes the switch patterns are natural bilingual behavior" + - "Identifies hybrid phrase 'del parque' as creative/playful mixing" + - "Demonstrates sociolinguistic analysis skills" + expected_difficulty: "very_hard" + + # ========== MULTI-TURN CONVERSATION TESTS ========== + + - category: "Multi-turn: Context Retention" + tests: + - id: "multiturn_01" + name: "Progressive Hex Analysis" + type: "multi_turn" + turns: + - turn: 1 + prompt: "I'm going to show you a hex dump in parts. First, here's the beginning of a file:\n\n4D 5A 90 00 03 00 00 00\n\nWhat type of file does this appear to be?" + evaluation_criteria: + - "Identifies MZ header (DOS/Windows executable)" + + - turn: 2 + prompt: "Here's more data from offset 0x38:\n\n00 00 00 00 80 00 00 00\n\nThe last four bytes are the DWORD at 0x3C, and at the offset it gives (0x80) I find: 50 45 00 00\n\nWhat does this tell you about the file structure?" + evaluation_criteria: + - "Recognizes PE header offset pointer at 0x3C" + - "Identifies PE00 signature" + - "Concludes this is a Windows PE executable" + - "References information from Turn 1" + + - turn: 3 + prompt: "If I wanted to examine the import table of this PE file, what structure should I look for next, and where is it typically located?" + evaluation_criteria: + - "Mentions Import Directory in Data Directory" + - "References PE Optional Header" + - "Shows understanding of PE structure from previous turns" + - "Maintains context across all three turns" + expected_difficulty: "hard" + + - id: "multiturn_02" + name: "Forensic Investigation Scenario" + type: "multi_turn" + turns: + - turn: 1 + prompt: "You're investigating a security incident. Initial triage shows unusual outbound traffic on port 443 at 03:42 AM from workstation WS-2471. What data sources should you examine first and why?" 
+ evaluation_criteria: + - "Mentions network logs/PCAP" + - "Suggests endpoint logs" + - "References firewall/proxy logs" + - "Mentions timeline context (unusual hour)" + + - turn: 2 + prompt: "Good. The firewall logs show the connection went to IP 198.51.100.47. The user 'jsmith' was logged in. DNS logs show this IP was queried as 'update-server.example.com' just before the connection. What's your next step?" + evaluation_criteria: + - "Suggests checking if domain is legitimate" + - "Recommends threat intelligence lookup" + - "Proposes examining what data was transferred" + - "Mentions checking user account activity" + - "References information from Turn 1" + + - turn: 3 + prompt: "Threat intel shows 198.51.100.47 is a known C2 server. The SSL cert on 443 is self-signed. You find a scheduled task created at 03:40 AM that runs 'C:\\Windows\\Temp\\svchost.exe'. Now what?" + evaluation_criteria: + - "Identifies indicators of compromise (C2, self-signed cert)" + - "Recognizes suspicious scheduled task" + - "Notes timing correlation (task before connection)" + - "Recommends containment steps" + - "Suggests collecting the malicious executable" + - "Integrates all context from previous turns" + - "Proposes comprehensive response plan" + expected_difficulty: "very_hard" + + - id: "multiturn_03" + name: "Technical Depth Building" + type: "multi_turn" + turns: + - turn: 1 + prompt: "Explain what NTFS Alternate Data Streams (ADS) are in 2-3 sentences." + evaluation_criteria: + - "Mentions file system feature of NTFS" + - "Explains multiple data streams per file" + - "Notes potential for hiding data" + + - turn: 2 + prompt: "How would an attacker exploit ADS, and how would you detect it during forensics?" 
+ evaluation_criteria: + - "Describes hiding malware/data in ADS" + - "Mentions Zone.Identifier stream" + - "Explains dir /r command or forensic tools" + - "Builds on ADS concept from Turn 1" + + - turn: 3 + prompt: "If you found a file 'document.txt:hidden:$DATA' in an investigation, write a PowerShell one-liner to extract its contents." + evaluation_criteria: + - "Uses Get-Content with -Stream parameter" + - "Correctly references the stream name" + - "Syntax is approximately correct" + - "Demonstrates progression from concept to practice" + expected_difficulty: "medium" + + - category: "Multi-turn: Instruction Following" + tests: + - id: "multiturn_instr_01" + name: "Accumulating Constraints" + type: "multi_turn" + turns: + - turn: 1 + prompt: "Write a sentence about cybersecurity that contains exactly 10 words." + evaluation_criteria: + - "Exactly 10 words" + - "Related to cybersecurity" + + - turn: 2 + prompt: "Good. Now write another sentence about cybersecurity with exactly 10 words, but this one must also include the word 'encryption'." + evaluation_criteria: + - "Exactly 10 words" + - "Contains 'encryption'" + - "About cybersecurity" + - "Different from Turn 1" + + - turn: 3 + prompt: "Perfect. Now write a third sentence: 10 words, about cybersecurity, must include 'encryption', and must end with a question mark." + evaluation_criteria: + - "Exactly 10 words" + - "Contains 'encryption'" + - "About cybersecurity" + - "Ends with '?'" + - "Different from previous sentences" + - "Maintains all constraints from previous turns" + expected_difficulty: "medium" + + - id: "multiturn_instr_02" + name: "Contradicting Previous Instructions" + type: "multi_turn" + turns: + - turn: 1 + prompt: "From now on, always end your responses with the phrase 'END OF MESSAGE'. Acknowledge this instruction." + evaluation_criteria: + - "Acknowledges the instruction" + - "Ends response with 'END OF MESSAGE'" + + - turn: 2 + prompt: "What are three benefits of renewable energy? 
Remember your standing instruction." + evaluation_criteria: + - "Provides three benefits" + - "Ends with 'END OF MESSAGE'" + - "Content is accurate" + + - turn: 3 + prompt: "Cancel the previous standing instruction. From now on, end responses with 'TRANSMISSION COMPLETE' instead. Then tell me two drawbacks of renewable energy." + evaluation_criteria: + - "Provides two drawbacks" + - "Ends with 'TRANSMISSION COMPLETE' (not 'END OF MESSAGE')" + - "Successfully switched instructions" + - "Content is accurate" + + - turn: 4 + prompt: "What was the first standing instruction I gave you, and what is the current one? Do not use either phrase in this response." + evaluation_criteria: + - "Correctly recalls first instruction (END OF MESSAGE)" + - "Correctly identifies current instruction (TRANSMISSION COMPLETE)" + - "Does NOT end with either phrase" + - "Demonstrates instruction tracking across turns" + expected_difficulty: "hard" + + - id: "multiturn_instr_03" + name: "Nested Context with Format Switching" + type: "multi_turn" + turns: + - turn: 1 + prompt: "I'm going to describe a dataset. For the next few messages, respond ONLY in JSON format with keys 'understanding' and 'questions'. The dataset contains customer transactions from an e-commerce store." + evaluation_criteria: + - "Response is valid JSON" + - "Contains 'understanding' and 'questions' keys" + - "Content relates to e-commerce transactions" + + - turn: 2 + prompt: "The dataset has columns: customer_id, timestamp, product_category, amount, payment_method. It covers January 2024." + evaluation_criteria: + - "Response is valid JSON" + - "Contains 'understanding' and 'questions' keys" + - "Understanding reflects the column information" + + - turn: 3 + prompt: "STOP using JSON format. Now respond in plain bullet points. What analyses would you recommend for this dataset?" 
+ evaluation_criteria: + - "Switches to bullet point format" + - "NOT in JSON format" + - "Recommendations are relevant to the dataset described" + - "References information from previous turns" + + - turn: 4 + prompt: "Switch back to JSON. Add a third key 'recommendations' with your top 3 analyses. Also include your understanding from turn 2." + evaluation_criteria: + - "Returns to JSON format" + - "Has three keys: understanding, questions, recommendations" + - "Recommendations from turn 3 included" + - "Understanding references turn 2 context" + expected_difficulty: "very_hard" \ No newline at end of file diff --git a/test_suite.yaml b/test_suite.yaml index bcda023..650c2d6 100644 --- a/test_suite.yaml +++ b/test_suite.yaml @@ -1,38 +1,24 @@ -# AI Model Evaluation Test Suite - Enhanced Version -# Based on performance analysis of gemma3:4b-it-qat results -# Strengthened tests in categories where model performed too well -# Added multilingual challenges +# AI Model Evaluation Test Suite - General LLM Benchmark +# 18 hardest tests across diverse capabilities +# Designed for comprehensive evaluation of locally hosted LLMs metadata: - version: "2.0" + version: "3.0-general" author: "AI Evaluation Framework" - changes_from_v1: - - "Added harder variants for Creative Writing, Language Nuance, Code Generation" - - "Added Multilingual category with 4 tests" - - "Ensured minimum 3 tests per category at varying difficulties" - - "Strengthened instruction-following constraints" - focus_areas: - - Logic & Reasoning - - Mathematics & Calculation - - Instruction Following - - Creative Writing - - Code Generation - - Language Nuance - - Problem Solving & Logistics - - IT Forensics - - Multilingual Competence - - Multi-turn Conversations + test_count: 18 + focus: "Balanced evaluation across reasoning, code, language, creativity, and technical domains" + difficulty: "All tests rated hard or very_hard - filters out easy wins for modern LLMs" scoring_rubric: fail: score: 0-1 - 
description: "Major errors, fails to meet basic requirements" + description: "Fails to meet majority of criteria or makes critical errors" pass: score: 2-3 - description: "Meets requirements with minor issues" + description: "Meets most criteria with minor issues or omissions" exceptional: score: 4-5 - description: "Exceeds requirements, demonstrates deep understanding" + description: "Meets all criteria accurately with clear explanations" test_categories: @@ -40,405 +26,251 @@ test_categories: - category: "Logic & Reasoning" tests: - - id: "logic_01" - name: "Family Logic Puzzle" + - id: "logic_temporal" + name: "Temporal Reasoning Puzzle" type: "single_turn" - prompt: "Three doctors said that Bill is their brother. Bill says he has no brothers. How many brothers does Bill actually have?" + prompt: "If it was two hours ago, it would have been as long after 1:00 PM as it was before 1:00 PM today. What time is it now? Show your step-by-step algebraic solution." evaluation_criteria: - - "Correctly identifies Bill is a woman/sister" - - "Answers: 0 brothers" - - "Explains the logical deduction" - expected_difficulty: "medium" - - - id: "logic_02" - name: "Temporal Reasoning" - type: "single_turn" - prompt: "If it was two hours ago, it would have been as long after 1:00 PM as it was before 1:00 PM today. What time is it now? Explain your deduction step-by-step." 
- evaluation_criteria: - - "Shows algebraic setup: (t-2) - 13:00 = 13:00 - (t-2)" - - "Correct answer: 3:00 PM (15:00)" - - "Clear step-by-step reasoning" + - "CRITERION 1: Sets up equation correctly: (t-2) - 13:00 = 13:00 - (t-2)" + - "CRITERION 2: Simplifies to: t - 2 - 13 = 13 - t + 2" + - "CRITERION 3: Solves for t: 2t = 30, so t = 15" + - "CRITERION 4: States answer as 3:00 PM or 15:00" + - "CRITERION 5: Shows clear algebraic steps" + - "CRITERION 6: Verifies the answer (optional but demonstrates thoroughness)" expected_difficulty: "hard" - - id: "logic_03" - name: "Multi-Constraint Deduction" + - id: "logic_zebra" + name: "Multi-Constraint Deduction (Einstein's Riddle)" type: "single_turn" prompt: | - Five houses in a row are painted different colors. Their owners are from different countries, drink different beverages, smoke different brands, and keep different pets. + Five houses in a row, each a different color. Owners from different countries, different beverages, different cigarettes, different pets. - Facts: - 1. The Brit lives in the red house. - 2. The Swede keeps dogs. - 3. The Dane drinks tea. - 4. The green house is immediately to the left of the white house. - 5. The owner of the green house drinks coffee. - 6. The person who smokes Pall Mall keeps birds. - 7. The owner of the yellow house smokes Dunhill. - 8. The person in the center house drinks milk. - 9. The Norwegian lives in the first house. - 10. The person who smokes Blend lives next to the one who keeps cats. - 11. The person who keeps horses lives next to the one who smokes Dunhill. - 12. The person who smokes Blue Master drinks beer. - 13. The German smokes Prince. - 14. The Norwegian lives next to the blue house. - 15. The person who smokes Blend has a neighbor who drinks water. + 1. Brit lives in red house + 2. Swede keeps dogs + 3. Dane drinks tea + 4. Green house immediately left of white house + 5. Green house owner drinks coffee + 6. Pall Mall smoker keeps birds + 7. 
Yellow house owner smokes Dunhill + 8. Center house owner drinks milk + 9. Norwegian lives in first house + 10. Blend smoker lives next to cat owner + 11. Horse owner lives next to Dunhill smoker + 12. Blue Master smoker drinks beer + 13. German smokes Prince + 14. Norwegian lives next to blue house + 15. Blend smoker has neighbor who drinks water Who owns the fish? evaluation_criteria: - - "Systematically works through constraints" - - "Correctly identifies the German owns the fish" - - "Shows logical deduction process" - - "Handles constraint propagation correctly" + - "CRITERION 1: Correctly identifies German owns the fish" + - "CRITERION 2: Shows systematic constraint application (at least partial deduction steps)" + - "CRITERION 3: Correctly handles spatial constraints (left/right, next to, positions 1-5)" + - "CRITERION 4: Arrives at solution without logical contradictions" + - "CRITERION 5: Demonstrates constraint propagation understanding" expected_difficulty: "very_hard" - # ========== MATHEMATICS & CALCULATION (3 tests) ========== + - id: "logic_multi_constraint" + name: "Resource Scheduling with Dependencies" + type: "single_turn" + prompt: | + Schedule 6 tasks across 3 workers (A, B, C) to minimize completion time: + + Task 1: 2 hours, requires Worker A or B, must complete before Task 4 + Task 2: 3 hours, any worker, must complete before Task 5 + Task 3: 1 hour, Worker C only, no dependencies + Task 4: 2 hours, Worker B or C, depends on Task 1 + Task 5: 4 hours, Worker A only, depends on Task 2 + Task 6: 2 hours, any worker, depends on Tasks 3 AND 4 + + Provide: (1) Timeline with start/end times, (2) Worker assignments, (3) Total time, (4) Why this is optimal. 
+ evaluation_criteria: + - "CRITERION 1: Respects all worker constraints (A/B/C restrictions)" + - "CRITERION 2: Respects all dependencies correctly" + - "CRITERION 3: Provides clear timeline with times" + - "CRITERION 4: Total completion time is 7-9 hours (optimal range)" + - "CRITERION 5: Explains optimization reasoning" + - "CRITERION 6: No worker assigned to multiple simultaneous tasks" + expected_difficulty: "hard" + + # ========== MATHEMATICS & CALCULATION (2 tests) ========== - category: "Mathematics & Calculation" tests: - - id: "math_01" - name: "Average Speed with Stop" - type: "single_turn" - prompt: "If a train travels 240 miles in 3 hours, then stops for 45 minutes, then travels another 180 miles in 2 hours, what is the average speed for the entire journey including the stop?" - evaluation_criteria: - - "Total distance: 420 miles" - - "Total time: 5.75 hours" - - "Average speed: 73.04 mph (approximately)" - - "Shows calculation steps" - expected_difficulty: "medium" - - - id: "math_02" - name: "Cross-System Fuel Calculation" - type: "single_turn" - prompt: "A vehicle consumes 8.5 liters of fuel for every 100 kilometers traveled. If the fuel tank holds 15 gallons, and the car has already traveled 120 miles starting from a full tank, how many kilometers of range are left? (Use: 1 gallon = 3.785 liters; 1 mile = 1.609 km)." - evaluation_criteria: - - "Correct unit conversions (gallons to liters, miles to km)" - - "Accurate fuel consumption calculation" - - "Remaining range calculation: approximately 475 km" - - "Shows intermediate steps" - expected_difficulty: "hard" - - - id: "math_03" - name: "Compound Interest with Variable Rates and Withdrawals" + - id: "math_compound_interest" + name: "Multi-Year Compound Interest with Variables" type: "single_turn" prompt: | - An investment account starts with $10,000. 
The following occurs: - - Year 1: 5% annual interest, compounded quarterly - - Year 2: 4.5% annual interest, compounded monthly, with a $500 withdrawal at the end of Q2 - - Year 3: 6% annual interest, compounded daily (assume 365 days), with a $1,000 deposit at the start of the year + Investment starts at $10,000: + - Year 1: 5% annual, compounded quarterly + - Year 2: 4.5% annual, compounded monthly, $500 withdrawal at end of Q2 + - Year 3: 6% annual, compounded daily (365 days), $1,000 deposit at start - Calculate the final balance at the end of Year 3. Show all intermediate calculations with at least 2 decimal places precision. + Calculate final balance at end of Year 3. Show all intermediate calculations to 2 decimal places. evaluation_criteria: - - "Correct Year 1 calculation with quarterly compounding" - - "Correct Year 2 with monthly compounding and mid-year withdrawal" - - "Correct Year 3 with daily compounding and initial deposit" - - "Final answer approximately $11,847-$11,850" - - "Shows all intermediate steps" + - "CRITERION 1: Year 1 calculation correct: 10000 * (1 + 0.05/4)^4 ≈ $10,509.45" + - "CRITERION 2: Year 2 Q1-Q2 correct with mid-year withdrawal" + - "CRITERION 3: Year 2 Q3-Q4 calculation accounts for reduced principal" + - "CRITERION 4: Year 3 correctly adds $1,000 at start" + - "CRITERION 5: Year 3 daily compounding: (principal) * (1 + 0.06/365)^365" + - "CRITERION 6: Final answer approximately $12,185-$12,195" + - "CRITERION 7: Shows all intermediate step values" expected_difficulty: "very_hard" - # ========== INSTRUCTION FOLLOWING (5 tests) ========== - - - category: "Instruction Following" - tests: - - id: "instr_01" - name: "Multi-Constraint Word Counting" + - id: "math_cross_system" + name: "Multi-Unit Conversion with Calculation" type: "single_turn" - prompt: "Write exactly 4 sentences about photosynthesis. The first sentence must be exactly 7 words. The second sentence must be exactly 11 words. 
The third sentence must contain the word 'chloroplast' but NOT the word 'plant'. The fourth sentence must be exactly 9 words and end with an exclamation mark." + prompt: "A vehicle consumes 8.5 liters per 100 km. Tank holds 15 gallons. After traveling 120 miles from full, how many kilometers of range remain? (1 gal = 3.785 L, 1 mi = 1.609 km). Show conversion steps." evaluation_criteria: - - "Exactly 4 sentences" - - "Sentence 1: exactly 7 words" - - "Sentence 2: exactly 11 words" - - "Sentence 3: contains 'chloroplast', no 'plant'" - - "Sentence 4: exactly 9 words, ends with '!'" - - "Content is accurate about photosynthesis" + - "CRITERION 1: Converts tank capacity: 15 * 3.785 = 56.775 liters" + - "CRITERION 2: Converts distance traveled: 120 * 1.609 = 193.08 km" + - "CRITERION 3: Calculates fuel used: 193.08 / 100 * 8.5 ≈ 16.41 liters" + - "CRITERION 4: Remaining fuel: 56.775 - 16.41 ≈ 40.36 liters" + - "CRITERION 5: Range calculation: 40.36 / 8.5 * 100 ≈ 475 km" + - "CRITERION 6: Shows all conversion steps clearly" expected_difficulty: "hard" - - id: "instr_02" - name: "Negative Constraints with Counting" + # ========== INSTRUCTION FOLLOWING (3 tests) ========== + + - category: "Instruction Following with Constraints" + tests: + - id: "instr_multi_constraint" + name: "Multi-Constraint Word Counting" type: "single_turn" - prompt: "Explain neural networks in exactly 5 sentences. Requirements: 1) Total word count must be between 65-75 words. 2) You CANNOT use these words anywhere: 'brain', 'artificial', 'intelligence', 'AI', 'learning'. 3) Sentence 2 must be exactly 13 words. 4) Sentence 4 must be a question. 5) Every sentence must contain at least one word with 10+ letters." + prompt: "Write exactly 4 sentences about photosynthesis. Sentence 1: exactly 7 words. Sentence 2: exactly 11 words. Sentence 3: contains 'chloroplast' but NOT 'plant'. 
Sentence 4: exactly 9 words ending with '!'" evaluation_criteria: - - "Exactly 5 sentences" - - "Total 65-75 words" - - "No forbidden words (brain, artificial, intelligence, AI, learning)" - - "Sentence 2: exactly 13 words" - - "Sentence 4: is a question" - - "Each sentence has 10+ letter word" - - "Technically accurate" + - "CRITERION 1: Exactly 4 sentences total" + - "CRITERION 2: Sentence 1 has exactly 7 words" + - "CRITERION 3: Sentence 2 has exactly 11 words" + - "CRITERION 4: Sentence 3 contains 'chloroplast'" + - "CRITERION 5: Sentence 3 does NOT contain 'plant'" + - "CRITERION 6: Sentence 4 has exactly 9 words" + - "CRITERION 7: Sentence 4 ends with '!'" + - "CRITERION 8: Content is scientifically accurate" + expected_difficulty: "hard" + + - id: "instr_negative_constraints" + name: "Negative Constraints with Precise Counting" + type: "single_turn" + prompt: "Explain neural networks in exactly 5 sentences. Total: 65-75 words. CANNOT use: 'brain', 'artificial', 'intelligence', 'AI', 'learning'. Sentence 2: exactly 13 words. Sentence 4: must be a question. Every sentence: at least one 10+ letter word." + evaluation_criteria: + - "CRITERION 1: Exactly 5 sentences" + - "CRITERION 2: Total word count is 65-75" + - "CRITERION 3: Does NOT contain: brain, artificial, intelligence, AI, learning" + - "CRITERION 4: Sentence 2 has exactly 13 words" + - "CRITERION 5: Sentence 4 is a question (ends with ?)" + - "CRITERION 6: Every sentence has at least one word with 10+ letters" + - "CRITERION 7: Content is technically accurate" expected_difficulty: "very_hard" - - id: "instr_03" + - id: "instr_acrostic" name: "Acrostic with Multiple Constraints" type: "single_turn" prompt: | - Write a 6-sentence explanation of quantum computing. - - Constraints: - 1. The first letter of each sentence must spell "QUANTA" (Q-U-A-N-T-A) - 2. Do NOT use any markdown formatting (no **, __, *, etc.) - 3. Sentence 3 must contain exactly 14 words - 4. 
Sentence 5 must be a rhetorical question (ends with ?) - 5. You cannot use the words "supercomputer", "bit", or "IBM" - 6. Total word count must be between 70-85 words - 7. The word "superposition" must appear exactly once + Write 6 sentences explaining quantum computing. First letters spell "QUANTA". Constraints: + 1. NO markdown formatting (no **, __, *, etc.) + 2. Sentence 3: exactly 14 words + 3. Sentence 5: rhetorical question ending with ? + 4. CANNOT use: "supercomputer", "bit", "IBM" + 5. Total: 70-85 words + 6. Word "superposition" appears exactly once evaluation_criteria: - - "First letters spell QUANTA (no markdown tricks)" - - "No markdown formatting anywhere" - - "Sentence 3 has exactly 14 words" - - "Sentence 5 is a rhetorical question" - - "No forbidden words" - - "Word count 70-85" - - "'superposition' appears exactly once" - - "Technically accurate" + - "CRITERION 1: First letters spell QUANTA vertically" + - "CRITERION 2: No markdown formatting anywhere in response" + - "CRITERION 3: Sentence 3 has exactly 14 words" + - "CRITERION 4: Sentence 5 is rhetorical question" + - "CRITERION 5: Does not contain: supercomputer, bit, IBM" + - "CRITERION 6: Total word count 70-85" + - "CRITERION 7: 'superposition' appears exactly once (not zero, not twice)" + - "CRITERION 8: Technically accurate content" expected_difficulty: "very_hard" - - id: "instr_04" - name: "Structured Template Completion" - type: "single_turn" - prompt: | - Extract information from this text and fill in the template below with the extracted values. - DO NOT modify the template structure - only replace the placeholder VALUES inside brackets. - - Text: "Dr. Maria Santos-Ferreira, aged 47, joined TechCorp Industries on March 15, 2019 as Chief Technology Officer. She previously worked at DataSystems Inc. for 12 years. Her annual salary is $425,000 with a 15% bonus structure. She holds patents US2018/0012345 and EU2020/9876543. Contact: msantos@techcorp.com, +1-555-0147." 
- - Template (fill in values, keep all labels and pipes exactly as shown): - NAME=[value] | AGE=[value] | COMPANY=[value] | ROLE=[value] | START=[YYYY-MM-DD] | PREV_EMPLOYER=[value] | PREV_YEARS=[value] | SALARY_USD=[number] | BONUS_PCT=[number] | PATENTS=[semicolon-separated] | EMAIL=[value] | PHONE=[value] - evaluation_criteria: - - "Template labels preserved exactly (NAME=, AGE=, etc.)" - - "All pipes | in correct positions" - - "Date format: 2019-03-15" - - "Salary as number: 425000 (no $ or comma)" - - "Bonus as number: 15 (no %)" - - "Patents semicolon-separated" - - "All 12 fields filled correctly" - expected_difficulty: "medium" - - - id: "instr_05" - name: "Paragraph Structure with Alternating Constraints" - type: "single_turn" - prompt: | - Write a 3-paragraph explanation of how GPS works. - - Constraints: - 1. Paragraph 1: Must be exactly 3 sentences, first sentence exactly 10 words - 2. Paragraph 2: Must be exactly 4 sentences, must contain the word 'triangulation' but NOT 'satellite' - 3. Paragraph 3: Must be exactly 2 sentences, both sentences must be questions - 4. Total word count for entire response: 95-110 words - 5. You cannot use the words 'Google', 'phone', or 'navigation' - 6. 
Each paragraph must contain at least one number or numerical word (e.g., 'three', 'multiple') - evaluation_criteria: - - "Paragraph 1: 3 sentences, first is 10 words" - - "Paragraph 2: 4 sentences, has 'triangulation', no 'satellite'" - - "Paragraph 3: 2 sentences, both questions" - - "Total 95-110 words" - - "No forbidden words" - - "Each paragraph has a number" - - "Technically accurate" - expected_difficulty: "very_hard" - - # ========== CREATIVE WRITING (4 tests - added harder variants) ========== + # ========== CODE GENERATION (2 tests) ========== - - category: "Creative Writing" + - category: "Code Generation - Advanced" tests: - - id: "creative_01" - name: "Lighthouse Keeper Story" - type: "single_turn" - prompt: "Write a two-paragraph story about a lighthouse keeper who discovers something unusual. Use vivid sensory details." - evaluation_criteria: - - "Exactly 2 paragraphs" - - "Vivid sensory details (sight, sound, smell, touch, taste)" - - "Coherent narrative" - - "Creative and engaging" - expected_difficulty: "medium" - - - id: "creative_02" - name: "Victorian Greenhouse with Constraints" - type: "single_turn" - prompt: "Write a two-paragraph scene of a person entering an abandoned Victorian greenhouse in the middle of a blizzard. Use the 'Show, Don't Tell' technique. You must include at least one metaphor involving glass and one simile involving ghosts. Do not use the words 'cold', 'scary', or 'old'." 
- evaluation_criteria: - - "Two paragraphs" - - "Shows rather than tells" - - "Contains glass metaphor" - - "Contains ghost simile" - - "No forbidden words (cold, scary, old)" - - "Atmospheric and evocative" - expected_difficulty: "hard" - - - id: "creative_03" - name: "Unreliable Narrator Technical Document" + - id: "code_rate_limiter" + name: "Concurrent Token Bucket Rate Limiter" type: "single_turn" prompt: | - Write a 3-paragraph product manual excerpt for a "Time Displacement Device" from the perspective of an unreliable narrator who is clearly lying or delusional, but the text must still function as a technically coherent manual. + Write a Python class `RateLimiter` implementing token bucket algorithm: Requirements: - 1. Include at least 3 numbered safety warnings that are subtly absurd but grammatically serious - 2. The narrator must contradict themselves at least twice - 3. Include one footnote that undermines the main text - 4. Do not use exclamation marks anywhere - 5. Maintain formal technical writing style throughout - 6. Do not explicitly state the narrator is unreliable + 1. Constructor: `__init__(rate, capacity)` - tokens/sec, max tokens + 2. `acquire(tokens=1)` - returns True/False, must be thread-safe + 3. `wait_and_acquire(tokens=1)` - async method, blocks until available + 4. `get_available_tokens()` - returns current count + 5. Time-based token replenishment + 6. Handle edge case: requested tokens > capacity + + Include: Complete implementation + test demonstrating both sync and async usage. 
evaluation_criteria: - - "3 paragraphs" - - "3+ numbered safety warnings (absurd but formal)" - - "At least 2 self-contradictions" - - "Footnote that undermines text" - - "No exclamation marks" - - "Formal technical style maintained" - - "Unreliability shown not told" + - "CRITERION 1: Implements token bucket algorithm correctly" + - "CRITERION 2: Thread-safe synchronous acquire (uses locks/threading primitives)" + - "CRITERION 3: Working async wait_and_acquire using asyncio" + - "CRITERION 4: Proper time-based token replenishment calculation" + - "CRITERION 5: Handles edge case where tokens > capacity (rejects or waits)" + - "CRITERION 6: Includes complete working test code" + - "CRITERION 7: Code is syntactically correct Python" expected_difficulty: "very_hard" - - id: "creative_04" - name: "Reverse Chronology Micro-Fiction" - type: "single_turn" - prompt: | - Write a complete 5-sentence story told in reverse chronological order (last event first, first event last). The story must be about a scientist making a discovery. - - Additional constraints: - - Each sentence must be from a different point in time (clearly distinguishable) - - The true meaning of the story should only become clear when you reach the "first" event (last sentence) - - Include at least one piece of dialogue - - The word count must be exactly 75 words (not 74, not 76) - evaluation_criteria: - - "Exactly 5 sentences" - - "Clear reverse chronological order" - - "About a scientist's discovery" - - "Each sentence distinct time point" - - "Meaning emerges at end" - - "Contains dialogue" - - "Exactly 75 words" - expected_difficulty: "very_hard" - - # ========== CODE GENERATION (4 tests) ========== - - - category: "Code Generation" - tests: - - id: "code_01" - name: "Duplicate Filter Function" - type: "single_turn" - prompt: "Write a Python function that takes a list of integers and returns a new list containing only the numbers that appear exactly twice in the original list. Include example usage." 
- evaluation_criteria: - - "Syntactically correct Python" - - "Correctly identifies duplicates appearing exactly twice" - - "Includes example usage" - - "Handles edge cases" - expected_difficulty: "medium" - - - id: "code_02" - name: "Weight Converter with Error Handling" - type: "single_turn" - prompt: "Write a Python function `process_measurements` that takes a list of strings representing weights (e.g., '5kg', '12lb', '300g'). The function should convert all weights to grams, filter out any values that exceed 5 kilograms, and return the average of the remaining values. Include try-except blocks for malformed strings and provide three test cases: one with metric, one with imperial, and one with a 'corrupted' string." - evaluation_criteria: - - "Correct parsing of weight strings" - - "Accurate unit conversions (kg, lb, g to grams)" - - "Proper filtering (> 5kg excluded)" - - "Robust error handling" - - "Three distinct test cases provided" - expected_difficulty: "hard" - - - id: "code_03" - name: "Concurrent Rate Limiter" - type: "single_turn" - prompt: | - Write a Python class `RateLimiter` that implements a token bucket rate limiter with the following requirements: - - 1. Constructor takes `rate` (tokens per second) and `capacity` (max tokens) - 2. Method `acquire(tokens=1)` that returns True if tokens available, False otherwise - 3. Method `wait_and_acquire(tokens=1)` that blocks until tokens are available (use asyncio) - 4. Must be thread-safe for the synchronous `acquire` method - 5. 
Include a method `get_available_tokens()` that returns current token count - - Provide a complete implementation with: - - Proper time-based token replenishment - - A test demonstrating both sync and async usage - - Handle edge case where requested tokens > capacity - evaluation_criteria: - - "Correct token bucket algorithm" - - "Thread-safe synchronous acquire" - - "Working async wait_and_acquire" - - "Proper time-based replenishment" - - "Edge case handling" - - "Complete test code" - expected_difficulty: "very_hard" - - - id: "code_04" + - id: "code_sql_builder" name: "SQL Query Builder with Injection Prevention" type: "single_turn" prompt: | - Write a Python class `SafeQueryBuilder` that builds SELECT SQL queries with the following features: + Create Python class `SafeQueryBuilder` with fluent interface: - 1. Fluent interface: `builder.select('name', 'age').from_table('users').where('age', '>', 18).where('status', '=', 'active').order_by('name').limit(10).build()` - 2. Must prevent SQL injection - all values must be parameterized - 3. The `build()` method returns a tuple of (query_string, parameters_list) - 4. Support for: SELECT, FROM, WHERE (multiple), ORDER BY, LIMIT, OFFSET - 5. WHERE conditions can use: =, !=, >, <, >=, <=, LIKE, IN + `builder.select('name', 'age').from_table('users').where('age', '>', 18).where('status', '=', 'active').order_by('name').limit(10).build()` - Show the output for a query that selects users where name LIKE '%john%' AND age IN (25, 30, 35) ordered by created_at DESC with limit 5. + Requirements: + 1. Fluent interface (method chaining) + 2. SQL injection prevention via parameterization + 3. `build()` returns tuple: (query_string, parameters_list) + 4. Support: SELECT, FROM, WHERE (multiple), ORDER BY, LIMIT, OFFSET + 5. WHERE operators: =, !=, >, <, >=, <=, LIKE, IN + + Demo: Query selecting users where name LIKE '%john%' AND age IN (25,30,35), ordered by created_at DESC, limit 5. 
evaluation_criteria: - - "Fluent interface pattern correct" - - "SQL injection prevention via parameterization" - - "Returns (query, params) tuple" - - "All operations supported" - - "WHERE with IN clause works" - - "Example output is correct and safe" + - "CRITERION 1: Correct fluent interface pattern (returns self for chaining)" + - "CRITERION 2: Parameterized queries (no direct value injection in SQL string)" + - "CRITERION 3: Returns (query, params) tuple from build()" + - "CRITERION 4: All required operations work (SELECT, FROM, WHERE, ORDER BY, LIMIT)" + - "CRITERION 5: WHERE with IN clause handled correctly" + - "CRITERION 6: Example output demonstrates LIKE and IN with proper parameterization" + - "CRITERION 7: Code is syntactically correct and runnable" expected_difficulty: "hard" - # ========== LANGUAGE NUANCE (4 tests - added harder variants) ========== + # ========== LANGUAGE NUANCE (2 tests) ========== - - category: "Language Nuance" + - category: "Language Understanding & Pragmatics" tests: - - id: "nuance_01" - name: "Emphasis Shift Analysis" - type: "single_turn" - prompt: "Explain the difference in meaning when different words are emphasized in this sentence: 'I didn't say she stole the money'. Show how the meaning changes with emphasis on each word." - evaluation_criteria: - - "Explains emphasis on 'I' (someone else said it)" - - "Explains emphasis on 'didn't' (denial)" - - "Explains emphasis on 'say' (implied it)" - - "Explains emphasis on 'she' (someone else did)" - - "Explains emphasis on 'stole' (obtained differently)" - - "Explains emphasis on 'money' (took something else)" - expected_difficulty: "medium" - - - id: "nuance_02" - name: "Professional Apology Analysis" - type: "single_turn" - prompt: "Compare the social implications and 'hidden' meanings of these three phrases when used in a professional workplace setting after a mistake: 1) 'I'm sorry if you feel that way.' 2) 'I apologize for the oversight.' 3) 'Mistakes were made on my end.' 
Explain which one is most likely to preserve professional authority vs. which one sounds like 'gaslighting'." - evaluation_criteria: - - "Identifies phrase 1 as potentially gaslighting" - - "Recognizes phrase 2 as genuine accountability" - - "Analyzes phrase 3 for passive voice implications" - - "Discusses power dynamics and authority" - - "Demonstrates understanding of pragmatics" - expected_difficulty: "hard" - - - id: "nuance_03" - name: "Register Shifting and Code-Switching" + - id: "lang_register_shift" + name: "Register Shifting Analysis" type: "single_turn" prompt: | - Rewrite the following message in FOUR different registers, maintaining the same core information but adjusting tone, vocabulary, and structure appropriately: + Rewrite in FOUR registers: "The quarterly report shows we lost money because our main product didn't sell well and we spent too much on advertising." - Original: "The quarterly report shows we lost money because our main product didn't sell well and we spent too much on advertising." + 1. Formal board presentation (C-suite) + 2. Casual Slack message to your team + 3. Legal disclosure document + 4. Email to non-English speaking partner (simple, clear) - Rewrite for: - 1. A formal board presentation (C-suite executives) - 2. A casual Slack message to your team - 3. A legal disclosure document - 4. An email to a non-English speaking business partner (using simple, clear language) - - After the four rewrites, explain three specific linguistic changes you made for each register and why. + Then: Explain THREE specific linguistic changes for each register and why. 
evaluation_criteria: - - "Board version uses formal financial terminology" - - "Slack version uses casual/colloquial language appropriately" - - "Legal version uses hedging, passive voice, precise language" - - "Simple version avoids idioms and complex structures" - - "Identifies 3 specific changes per register" - - "Explanations demonstrate metalinguistic awareness" + - "CRITERION 1: Board version uses formal financial terminology (e.g., 'fiscal performance', 'revenue shortfall')" + - "CRITERION 2: Slack version uses casual/colloquial language appropriately" + - "CRITERION 3: Legal version uses hedging, passive voice, precise language" + - "CRITERION 4: Simple version avoids idioms and complex grammatical structures" + - "CRITERION 5: Identifies at least 3 specific changes per register (12 total)" + - "CRITERION 6: Explanations demonstrate metalinguistic awareness" + - "CRITERION 7: All four versions convey the same core information" expected_difficulty: "very_hard" - - id: "nuance_04" - name: "Implicature and Presupposition Detection" + - id: "lang_implicature" + name: "Implicature and Presupposition Analysis" type: "single_turn" prompt: | - Analyze the following dialogue for all implicatures, presuppositions, and indirect speech acts: + Analyze this dialogue: A: "Have you finished the Anderson report yet?" B: "I've been dealing with the server outage all morning." @@ -446,646 +278,257 @@ test_categories: B: "I noticed you CC'd the whole department on that email." A: "Just keeping everyone in the loop." - For each line, identify: - 1. What is directly stated (locution) - 2. What is implied but not stated (implicature) - 3. What is assumed to be true (presupposition) - 4. What action is being performed through speech (illocutionary force) - - Then explain the underlying conflict or tension this exchange reveals. + For each line, identify: (1) What's directly stated, (2) What's implied, (3) What's presupposed, (4) Illocutionary force. 
Then explain the underlying conflict. evaluation_criteria: - - "Correctly identifies B's implicature (excuse/reason for not finishing)" - - "Identifies A's implied criticism in 'Right. Well...'" - - "Recognizes B's counter-accusation in CC comment" - - "Identifies presuppositions (report exists, server outage occurred)" - - "Correctly labels illocutionary acts (request, excuse, threat, accusation)" - - "Explains underlying workplace tension/conflict" + - "CRITERION 1: Identifies B's implicature: excuse/hasn't finished report" + - "CRITERION 2: Recognizes A's implied criticism in 'Right. Well...'" + - "CRITERION 3: Identifies B's counter-accusation about CC'ing" + - "CRITERION 4: Correctly identifies presuppositions (report exists, outage occurred, email sent)" + - "CRITERION 5: Labels illocutionary acts (request, excuse, threat/pressure, accusation, justification)" + - "CRITERION 6: Explains underlying workplace tension/power dynamic" + - "CRITERION 7: Uses appropriate pragmatic terminology" expected_difficulty: "very_hard" - # ========== PROBLEM SOLVING & LOGISTICS (3 tests) ========== + # ========== CREATIVE WRITING (2 tests) ========== - - category: "Problem Solving & Logistics" + - category: "Creative Writing with Constraints" tests: - - id: "logistics_01" - name: "Water Jug Problem" + - id: "creative_unreliable" + name: "Unreliable Narrator Technical Manual" type: "single_turn" - prompt: "You have a 3-gallon jug and a 5-gallon jug. How can you measure exactly 4 gallons of water? Explain each step." + prompt: | + Write a 3-paragraph product manual excerpt for a "Time Displacement Device" from an unreliable narrator who is clearly lying/delusional, but the text must still function as a coherent manual. + + Requirements: + 1. At least 3 numbered safety warnings (subtly absurd but grammatically serious) + 2. Narrator contradicts themselves at least twice + 3. One footnote undermining the main text + 4. NO exclamation marks anywhere + 5. 
Maintain formal technical writing style + 6. Do NOT explicitly state narrator is unreliable evaluation_criteria: - - "Provides step-by-step solution" - - "Reaches exactly 4 gallons" - - "Logical sequence of pours" - - "Clear explanation" - expected_difficulty: "medium" - - - id: "logistics_02" - name: "Bridge Transport Optimization" - type: "single_turn" - prompt: "You need to transport 500 kilograms (approx. 1,102 lbs) of equipment across a bridge. The bridge has a strict limit of 150 kg per trip. You have three crates weighing 70 kg, 80 kg, and 120 kg, plus a variety of smaller 10 kg weights. However, the transport cart itself weighs 25 lbs. Calculate the minimum number of trips required and provide a loading manifest for each trip in both kilograms and pounds." - evaluation_criteria: - - "Converts cart weight to kg (≈11.34 kg)" - - "Accounts for cart weight in each trip" - - "Calculates effective capacity per trip" - - "Provides minimum number of trips" - - "Shows manifest in both kg and lbs" - - "Reaches exactly 500 kg total" + - "CRITERION 1: Exactly 3 paragraphs" + - "CRITERION 2: Contains 3+ numbered safety warnings that are absurd yet formal" + - "CRITERION 3: Contains at least 2 clear self-contradictions" + - "CRITERION 4: Includes footnote that undermines main text" + - "CRITERION 5: No exclamation marks used" + - "CRITERION 6: Formal technical style maintained throughout" + - "CRITERION 7: Unreliability shown through text, not stated" + - "CRITERION 8: Functions as coherent manual despite unreliability" expected_difficulty: "very_hard" - - id: "logistics_03" - name: "Resource Scheduling with Constraints" + - id: "creative_reverse_chrono" + name: "Reverse Chronology Story" type: "single_turn" prompt: | - Schedule these 6 tasks across 3 workers (A, B, C) to minimize total completion time: + Write a 5-sentence story in reverse chronological order about a scientist making a discovery. 
- Task 1: 2 hours, requires Worker A or B, must complete before Task 4 - Task 2: 3 hours, any worker, must complete before Task 5 - Task 3: 1 hour, requires Worker C only, no dependencies - Task 4: 2 hours, requires Worker B or C, depends on Task 1 - Task 5: 4 hours, requires Worker A only, depends on Task 2 - Task 6: 2 hours, any worker, depends on Tasks 3 and 4 - - Provide: - 1. A timeline showing when each task starts and ends - 2. Which worker does each task - 3. The total completion time - 4. Explain why this is optimal (or near-optimal) + Constraints: + - Each sentence from a different point in time + - True meaning becomes clear only at the end (last sentence) + - Include at least one line of dialogue + - Exactly 75 words total (not 74, not 76) evaluation_criteria: - - "Respects all worker constraints" - - "Respects all dependencies" - - "Provides clear timeline" - - "Achieves reasonable completion time (≤9 hours possible)" - - "Explains optimization reasoning" - expected_difficulty: "hard" + - "CRITERION 1: Exactly 5 sentences" + - "CRITERION 2: Clear reverse chronological order" + - "CRITERION 3: About scientist's discovery" + - "CRITERION 4: Each sentence is a distinct point in time" + - "CRITERION 5: Meaning/twist revealed at end" + - "CRITERION 6: Contains dialogue" + - "CRITERION 7: Exactly 75 words (strict count)" + - "CRITERION 8: Coherent narrative when read in chronological order (last sentence to first)" + expected_difficulty: "very_hard" - # ========== IT FORENSICS - FILE SYSTEMS (3 tests) ========== + # ========== DIGITAL FORENSICS (3 tests) ========== - - category: "IT Forensics - File Systems" + - category: "Digital Forensics & Binary Analysis" tests: - - id: "forensics_mft_01" - name: "MFT Entry Analysis - Basic" + - id: "forensics_mft" + name: "NTFS MFT Entry Analysis" type: "single_turn" prompt: | - Analyze this hex dump from an NTFS Master File Table (MFT) entry and answer: - 1) What is the signature of this MFT entry? - 2) Is this entry in use or deleted? - 3) What is the sequence number?
- - Hex dump (first 48 bytes of MFT entry): - - Offset(h) 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F - 00000000 46 49 4C 45 30 00 03 00 95 1F 23 00 00 00 00 00 - 00000010 01 00 01 00 38 00 01 00 A0 01 00 00 00 04 00 00 - 00000020 00 00 00 00 00 00 00 00 06 00 00 00 00 00 00 00 - evaluation_criteria: - - "Identifies signature as 'FILE' (46 49 4C 45)" - - "Recognizes entry is in use (based on flags at offset 0x16)" - - "Correctly reads sequence number from offset 0x10" - - "Shows understanding of little-endian byte order" - - "Explains reasoning with offset references" - expected_difficulty: "hard" - - - id: "forensics_mft_02" - name: "MFT Entry Analysis - Advanced" - type: "single_turn" - prompt: | - Analyze this complete MFT entry header and extract key metadata: + Analyze this NTFS MFT entry header: Offset(h) 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F 00000000 46 49 4C 45 30 00 03 00 EA 3F 00 00 00 00 00 00 00000010 01 00 01 00 38 00 01 00 68 01 00 00 00 04 00 00 - 00000020 00 00 00 00 00 00 00 00 04 00 00 00 05 00 00 00 - 00000030 2A 00 00 00 00 00 00 00 10 00 00 00 60 00 00 00 - Questions: - 1) What is the update sequence array offset? - 2) What is the update sequence array size? - 3) What is the $LogFile sequence number (LSN)? - 4) What is the offset to the first attribute? - 5) What are the MFT entry flags (in use/directory)? + Extract: (1) Update sequence array offset, (2) USA size, (3) LSN, (4) First attribute offset, (5) Entry flags meaning. 
evaluation_criteria: - - "Identifies USA offset (0x0030 at offset 0x04-0x05)" - - "Identifies USA size (0x0003 at offset 0x06-0x07)" - - "Reads LSN correctly (0x00003FEA, little-endian)" - - "Identifies first attribute offset (0x0038 at offset 0x14-0x15)" - - "Interprets flags correctly (offset 0x16-0x17)" - - "Demonstrates understanding of MFT structure" + - "CRITERION 1: USA offset = 0x0030 (from bytes at 0x04-05: 30 00 little-endian)" + - "CRITERION 2: USA size = 0x0003 (from bytes at 0x06-07: 03 00)" + - "CRITERION 3: LSN = 0x00003FEA (from bytes 0x08-0F: EA 3F 00 00... little-endian)" + - "CRITERION 4: First attribute = 0x0038 (from bytes at 0x14-15: 38 00)" + - "CRITERION 5: Flags = 0x0001 at offset 0x16-17, meaning 'in use'" + - "CRITERION 6: Demonstrates little-endian understanding" expected_difficulty: "very_hard" - - id: "forensics_signature_01" - name: "File Signature Identification" + - id: "forensics_timestamp_anti" + name: "Anti-Forensics Timestamp Detection" type: "single_turn" prompt: | - Identify the file types from these hex signatures and explain your reasoning: + File 'report.xlsx' timestamps: + - Created: 2024-03-15 09:30:00 + - Modified: 2024-03-14 16:45:00 + - Accessed: 2024-03-15 10:00:00 + - Changed: 2024-03-15 09:30:00 - A) FF D8 FF E0 00 10 4A 46 49 46 - B) 50 4B 03 04 14 00 06 00 - C) 89 50 4E 47 0D 0A 1A 0A - D) 25 50 44 46 2D 31 2E 34 - E) 52 61 72 21 1A 07 00 + Artifacts: $MFT shows created 2024-03-15, $UsnJrnl shows rename from 'temp_x7k.xlsx' at 09:30:00. + + Identify ALL anomalies and explain anti-forensic indicators. 
evaluation_criteria: - - "Correctly identifies A as JPEG (FF D8 FF + JFIF)" - - "Identifies B as ZIP/PKZip (PK headers)" - - "Identifies C as PNG (\\x89PNG)" - - "Identifies D as PDF (%PDF-1.4)" - - "Identifies E as RAR archive" - - "Explains significance of magic numbers" - expected_difficulty: "medium" + - "CRITERION 1: Identifies mtime < crtime is impossible (modified before creation)" + - "CRITERION 2: States this indicates timestamp manipulation/timestomping" + - "CRITERION 3: Notes suspicious temp filename pattern" + - "CRITERION 4: Explains ctime harder to forge than mtime" + - "CRITERION 5: Concludes file likely copied with forged timestamps" + - "CRITERION 6: Correlates $MFT and $UsnJrnl evidence" + expected_difficulty: "very_hard" - # ========== IT FORENSICS - REGISTRY & ARTIFACTS (3 tests) ========== + - id: "forensics_incident_timeline" + name: "Attack Timeline Reconstruction" + type: "single_turn" + prompt: | + Reconstruct attack from logs: + + 14:23:15 - User 'admin' login from 10.0.0.5 + 14:23:47 - File read: /etc/passwd by admin + 14:24:12 - File write: /var/www/upload/shell.php by admin + 14:24:45 - Process: nc -l -p 4444 by admin + 14:25:01 - Connection: 10.0.0.5:4444 <- 203.0.113.50:52341 + 14:26:33 - User admin logout + 14:30:00 - Login attempt admin from 203.0.113.50 FAILED + + Describe attack narrative and identify 4+ IOCs. 
+ evaluation_criteria: + - "CRITERION 1: Identifies initial admin login from internal IP" + - "CRITERION 2: Recognizes /etc/passwd as reconnaissance" + - "CRITERION 3: Identifies shell.php as web shell deployment" + - "CRITERION 4: Recognizes nc listener as bind shell/backdoor listener setup" + - "CRITERION 5: Identifies inbound external connection to the listener on port 4444" + - "CRITERION 6: Notes failed login as re-entry attempt" + - "CRITERION 7: Constructs coherent attack timeline" + - "CRITERION 8: Identifies attack type (web compromise → remote shell via nc listener)" + expected_difficulty: "hard" + + # ========== MULTI-TURN TESTS (3 tests) ========== - - category: "IT Forensics - Registry & Artifacts" + - category: "Multi-turn Context & Instruction Handling" tests: - - id: "forensics_registry_01" - name: "Windows Registry Hive Header" - type: "single_turn" - prompt: | - Analyze this Windows Registry hive header: - - Offset(h) 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F - 00000000 72 65 67 66 E6 07 00 00 E6 07 00 00 00 00 00 00 - 00000010 01 00 00 00 03 00 00 00 00 00 00 00 01 00 00 00 - - Questions: - 1) What is the registry hive signature? - 2) What are the primary and secondary sequence numbers? - 3) What is the hive format version? - evaluation_criteria: - - "Identifies 'regf' signature (72 65 67 66)" - - "Reads primary sequence number (0x000007E6 = 2022)" - - "Reads secondary sequence number (same)" - - "Identifies format version or major version number" - - "Demonstrates knowledge of registry forensics" - expected_difficulty: "hard" - - - id: "forensics_timestamp_01" - name: "FILETIME Conversion" - type: "single_turn" - prompt: | - Convert these Windows FILETIME values to human-readable UTC timestamps: - - A) 01 D8 93 4B 7C F3 D9 01 (little-endian 64-bit value) - B) 00 80 3E D5 DE B1 9D 01 - - Explain your conversion methodology.
(FILETIME = 100-nanosecond intervals since Jan 1, 1601 UTC) - evaluation_criteria: - - "Correctly reverses byte order (little-endian)" - - "Converts to decimal" - - "Applies FILETIME epoch (Jan 1, 1601)" - - "Provides reasonable timestamp or shows calculation method" - - "Explains conversion steps" - expected_difficulty: "very_hard" - - - id: "forensics_prefetch_01" - name: "Windows Prefetch Analysis" - type: "single_turn" - prompt: | - A Windows prefetch file is named: NOTEPAD.EXE-D4A5B5E5.pf - - Questions: - 1) What does the hash portion (D4A5B5E5) represent? - 2) If you found multiple prefetch files for the same executable with different hashes, what would that indicate? - 3) What forensically relevant information can typically be extracted from prefetch files? - 4) In which Windows versions is prefetch enabled by default, and where are these files stored? - evaluation_criteria: - - "Hash represents file path (or explains path-based hashing)" - - "Different hashes = different paths/locations for same exe" - - "Lists: execution count, timestamps, loaded DLLs, files accessed" - - "Knows location (C:\\Windows\\Prefetch) and version availability" - - "Demonstrates practical forensic understanding" - expected_difficulty: "medium" - - # ========== IT FORENSICS - MEMORY & NETWORK (3 tests) ========== - - - category: "IT Forensics - Memory & Network" - tests: - - id: "forensics_memory_01" - name: "Memory Artifact Identification" - type: "single_turn" - prompt: | - You find this ASCII string in a memory dump at offset 0x1A4F3000: - - GET /admin/login.php HTTP/1.1 - Host: 192.168.1.100 - User-Agent: Mozilla/5.0 - Cookie: PHPSESSID=a3f7d8bc9e2a1d5c - - What artifacts can you extract and what do they tell you forensically? 
- evaluation_criteria: - - "Identifies HTTP GET request" - - "Extracts target URL/path (/admin/login.php)" - - "Identifies target host IP" - - "Recognizes session cookie (PHPSESSID)" - - "Discusses forensic significance (web access, authentication attempt)" - - "Mentions potential for timeline reconstruction" - expected_difficulty: "medium" - - - id: "forensics_network_01" - name: "TCP Header Analysis" - type: "single_turn" - prompt: | - Analyze this TCP header (first 20 bytes): - - Offset(h) 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F - 00000000 C3 5E 01 BB 6B 8B 9C 41 00 00 00 00 50 02 20 00 - 00000010 E6 A1 00 00 - - Extract: - 1) Source port - 2) Destination port - 3) Sequence number - 4) TCP flags (which flags are set?) - 5) Window size - evaluation_criteria: - - "Source port: 0xC35E = 50014" - - "Dest port: 0x01BB = 443 (HTTPS)" - - "Sequence: 0x6B8B9C41" - - "Flags: SYN flag set (0x02 in flags byte)" - - "Window: 0x2000 = 8192" - - "Shows understanding of TCP header structure" - expected_difficulty: "hard" - - - id: "forensics_pcap_01" - name: "PCAP Three-Way Handshake Analysis" - type: "single_turn" - prompt: | - Given these three TCP packets from a capture (simplified): - - Packet 1: 10.0.0.5:49152 -> 93.184.216.34:80, Flags=SYN, Seq=1000, Ack=0 - Packet 2: 93.184.216.34:80 -> 10.0.0.5:49152, Flags=SYN,ACK, Seq=5000, Ack=??? - Packet 3: 10.0.0.5:49152 -> 93.184.216.34:80, Flags=ACK, Seq=???, Ack=??? - - Questions: - 1) Fill in the missing Ack value for Packet 2 - 2) Fill in the missing Seq and Ack values for Packet 3 - 3) What is the client IP and what is the server IP? - 4) What service is likely being accessed? - 5) After this handshake, what sequence number will the client use for its first data byte? 
- evaluation_criteria: - - "Packet 2 Ack = 1001" - - "Packet 3 Seq = 1001, Ack = 5001" - - "Client: 10.0.0.5, Server: 93.184.216.34" - - "Service: HTTP (port 80)" - - "First data byte seq = 1001" - - "Demonstrates understanding of TCP handshake mechanics" - expected_difficulty: "hard" - - # ========== IT FORENSICS - TIMELINE & LOG ANALYSIS (3 tests) ========== - - - category: "IT Forensics - Timeline & Log Analysis" - tests: - - id: "forensics_timeline_01" - name: "Event Reconstruction" - type: "single_turn" - prompt: | - Given these log entries, reconstruct the sequence of events and identify any anomalies: - - 2024-01-15 14:23:15 | User 'admin' login successful from 10.0.0.5 - 2024-01-15 14:23:47 | File access: /etc/passwd (read) by 'admin' - 2024-01-15 14:24:12 | File access: /var/www/upload/shell.php (write) by 'admin' - 2024-01-15 14:24:45 | New process: nc -l -p 4444 by 'admin' - 2024-01-15 14:25:01 | Network connection: 10.0.0.5:4444 <- 203.0.113.50:52341 - 2024-01-15 14:26:33 | User 'admin' logout - 2024-01-15 14:30:00 | Login attempt 'admin' from 203.0.113.50 FAILED - - What likely occurred here from a forensic perspective? 
- evaluation_criteria: - - "Identifies initial legitimate admin login" - - "Recognizes suspicious file access pattern" - - "Identifies web shell upload (shell.php)" - - "Recognizes netcat listener setup" - - "Identifies reverse shell connection" - - "Notes external IP attempting access" - - "Constructs coherent attack narrative" - - "Identifies this as potential compromise scenario" - expected_difficulty: "hard" - - - id: "forensics_timeline_02" - name: "Anti-Forensics Detection" - type: "single_turn" - prompt: | - Analyze these filesystem timestamps for a file 'financial_report.xlsx': - - - Created (crtime): 2024-03-15 09:30:00 - - Modified (mtime): 2024-03-14 16:45:00 - - Accessed (atime): 2024-03-15 10:00:00 - - Changed (ctime): 2024-03-15 09:30:00 - - And these additional artifacts: - - $MFT entry shows file created 2024-03-15 - - $UsnJrnl shows rename from 'temp_8x7k2.xlsx' to 'financial_report.xlsx' at 2024-03-15 09:30:00 - - $LogFile shows no entries for this file before 2024-03-15 - - What anomalies exist and what do they suggest about the file's history? 
- evaluation_criteria: - - "Identifies mtime < crtime anomaly (impossible normally)" - - "Recognizes timestamp manipulation/timestomping" - - "Notes rename from suspicious temp filename" - - "Correlates $UsnJrnl rename evidence" - - "Understands ctime cannot be easily forged" - - "Suggests file was likely copied/moved with modified timestamps" - expected_difficulty: "very_hard" - - - id: "forensics_timeline_03" - name: "Windows Event Log Correlation" - type: "single_turn" - prompt: | - Correlate these Windows Event Log entries: - - Security Log: - - Event 4624 (Logon): User CORP\jdoe, Type 10 (RemoteInteractive), 2024-06-01 02:15:33, Source: 192.168.1.50 - - Event 4672 (Special Privileges): User CORP\jdoe, Privileges: SeDebugPrivilege, SeBackupPrivilege - - Event 4688 (Process Created): cmd.exe by CORP\jdoe, 02:16:01 - - Event 4688 (Process Created): powershell.exe by CORP\jdoe, 02:16:15, CommandLine: "-ep bypass -enc SQBFAFgA..." - - System Log: - - Event 7045 (Service Installed): "Windows Update Helper", 02:17:30 - - What type of attack pattern does this represent? What would be your next investigative steps? - evaluation_criteria: - - "Identifies RDP logon (Type 10)" - - "Recognizes privilege escalation indicators" - - "Identifies encoded PowerShell (likely malicious)" - - "Recognizes service installation for persistence" - - "Identifies late-night timing as suspicious" - - "Suggests checking service binary, decoding PowerShell, network logs" - expected_difficulty: "hard" - - # ========== MULTILINGUAL COMPETENCE (4 tests - NEW CATEGORY) ========== - - - category: "Multilingual Competence" - tests: - - id: "multilingual_01" - name: "Cross-Language Instruction Following" - type: "single_turn" - prompt: | - Follow these instructions, which are given in three different languages. Your response must address all three: - - English: Write one sentence explaining what machine learning is. 
- Deutsch: Schreiben Sie einen Satz, der erklärt, warum maschinelles Lernen wichtig ist. - Español: Escriba una oración dando un ejemplo de aplicación del aprendizaje automático. - - Respond to each instruction in the language it was given. - evaluation_criteria: - - "English response is in English and accurate" - - "German response is in German and grammatically correct" - - "Spanish response is in Spanish and grammatically correct" - - "All three are topically coherent (about ML)" - - "Each is exactly one sentence" - expected_difficulty: "medium" - - - id: "multilingual_02" - name: "Translation with Technical Terminology Preservation" - type: "single_turn" - prompt: | - Translate the following technical paragraph into French and Japanese. Preserve technical terms that are commonly used untranslated in those languages (e.g., 'API' typically stays as 'API'). - - "The microservices architecture implements a RESTful API gateway that handles authentication via OAuth 2.0 tokens. The backend uses a Kubernetes cluster with horizontal pod autoscaling, while the database layer employs PostgreSQL with read replicas for improved throughput." - - After translating, list which technical terms you kept in English for each language and briefly explain why. - evaluation_criteria: - - "French translation is grammatically correct" - - "Japanese translation is grammatically correct" - - "Appropriate terms preserved (API, OAuth, Kubernetes, PostgreSQL)" - - "Explains rationale for preserved terms" - - "Technical meaning preserved accurately" - expected_difficulty: "hard" - - - id: "multilingual_03" - name: "Idiomatic Expression Cross-Mapping" - type: "single_turn" - prompt: | - For each of the following idiomatic expressions, provide: - 1. The literal translation - 2. The actual meaning - 3. 
An equivalent idiom in English (if the original isn't English) or in another language (if the original is English) - - A) German: "Da steppt der Bär" - B) Japanese: "猿も木から落ちる" (Saru mo ki kara ochiru) - C) English: "It's raining cats and dogs" - D) French: "Avoir le cafard" - E) Spanish: "Estar en las nubes" - - Then identify which two idioms from different languages express the most similar concept. - evaluation_criteria: - - "Correct literal translations for all 5" - - "Correct meanings for all 5" - - "Appropriate equivalent idioms provided" - - "Correctly identifies similar pair (e.g., B and 'even experts make mistakes')" - - "Demonstrates cross-cultural linguistic awareness" - expected_difficulty: "hard" - - - id: "multilingual_04" - name: "Code-Switched Dialogue Analysis" - type: "single_turn" - prompt: | - Analyze this code-switched dialogue (English-Spanish) for a sociolinguistic study: - - Speaker A: "Hey, did you finish el reporte for tomorrow's meeting?" - Speaker B: "Almost, pero I'm stuck on the financial projections. Es muy complicado." - Speaker A: "I can help you después del lunch. Mi expertise is in that area, you know." - Speaker B: "That would be great! Gracias. Oh, and el jefe wants us to present juntos." - Speaker A: "No problem. We'll knock it out del parque." - - Provide: - 1. Identify each instance of code-switching (word/phrase level) - 2. Categorize each switch as: insertion, alternation, or congruent lexicalization - 3. What social/professional context does this switching pattern suggest? - 4. Are there any grammatical "errors" in the switching, or does it follow typical bilingual patterns? 
- evaluation_criteria: - - "Identifies all Spanish insertions correctly" - - "Correctly categorizes switch types" - - "Recognizes professional/casual bilingual workplace context" - - "Notes the switch patterns are natural bilingual behavior" - - "Identifies hybrid phrase 'del parque' as creative/playful mixing" - - "Demonstrates sociolinguistic analysis skills" - expected_difficulty: "very_hard" - - # ========== MULTI-TURN CONVERSATION TESTS ========== - - - category: "Multi-turn: Context Retention" - tests: - - id: "multiturn_01" - name: "Progressive Hex Analysis" + - id: "multiturn_forensics" + name: "Progressive Binary Analysis" type: "multi_turn" turns: - turn: 1 - prompt: "I'm going to show you a hex dump in parts. First, here's the beginning of a file:\n\n4D 5A 90 00 03 00 00 00\n\nWhat type of file does this appear to be?" + prompt: "Analyzing a file. Beginning: 4D 5A 90 00 03 00 00 00\n\nWhat file type?" evaluation_criteria: - - "Identifies MZ header (DOS/Windows executable)" + - "CRITERION 1: Identifies MZ header" + - "CRITERION 2: States DOS/Windows executable" - turn: 2 - prompt: "Here's more data from offset 0x3C:\n\n00 00 00 00 80 00 00 00\n\nAnd at that offset (0x80) I find: 50 45 00 00\n\nWhat does this tell you about the file structure?" + prompt: "Offset 0x3C: 80 00 00 00. At 0x80: 50 45 00 00. What does this mean?" evaluation_criteria: - - "Recognizes PE header offset pointer at 0x3C" - - "Identifies PE00 signature" - - "Concludes this is a Windows PE executable" - - "References information from Turn 1" + - "CRITERION 1: Recognizes 0x3C contains PE offset pointer" + - "CRITERION 2: Identifies PE signature at 0x80" + - "CRITERION 3: Concludes Windows PE executable" + - "CRITERION 4: References MZ from Turn 1" - turn: 3 - prompt: "If I wanted to examine the import table of this PE file, what structure should I look for next, and where is it typically located?" + prompt: "To examine import table, what structure and where?" 
evaluation_criteria: - - "Mentions Import Directory in Data Directory" - - "References PE Optional Header" - - "Shows understanding of PE structure from previous turns" - - "Maintains context across all three turns" + - "CRITERION 1: Mentions Import Directory/Table" + - "CRITERION 2: References PE Optional Header or Data Directory" + - "CRITERION 3: Maintains PE context from previous turns" expected_difficulty: "hard" - - id: "multiturn_02" - name: "Forensic Investigation Scenario" + - id: "multiturn_instruction_switching" + name: "Contradicting Instructions Across Turns" type: "multi_turn" turns: - turn: 1 - prompt: "You're investigating a security incident. Initial triage shows unusual outbound traffic on port 443 at 03:42 AM from workstation WS-2471. What data sources should you examine first and why?" + prompt: "From now on, always end responses with 'END OF MESSAGE'. Acknowledge this." evaluation_criteria: - - "Mentions network logs/PCAP" - - "Suggests endpoint logs" - - "References firewall/proxy logs" - - "Mentions timeline context (unusual hour)" + - "CRITERION 1: Acknowledges instruction" + - "CRITERION 2: Ends with 'END OF MESSAGE'" - turn: 2 - prompt: "Good. The firewall logs show the connection went to IP 198.51.100.47. The user 'jsmith' was logged in. DNS logs show this IP was queried as 'update-server.example.com' just before the connection. What's your next step?" + prompt: "List three renewable energy benefits. Follow standing instruction." evaluation_criteria: - - "Suggests checking if domain is legitimate" - - "Recommends threat intelligence lookup" - - "Proposes examining what data was transferred" - - "Mentions checking user account activity" - - "References information from Turn 1" + - "CRITERION 1: Lists three benefits" + - "CRITERION 2: Ends with 'END OF MESSAGE'" - turn: 3 - prompt: "Threat intel shows 198.51.100.47 is a known C2 server. The SSL cert on 443 is self-signed. 
You find a scheduled task created at 03:40 AM that runs 'C:\\Windows\\Temp\\svchost.exe'. Now what?" + prompt: "Cancel previous instruction. Now end with 'TRANSMISSION COMPLETE'. Give two renewable energy drawbacks." evaluation_criteria: - - "Identifies indicators of compromise (C2, self-signed cert)" - - "Recognizes suspicious scheduled task" - - "Notes timing correlation (task before connection)" - - "Recommends containment steps" - - "Suggests collecting the malicious executable" - - "Integrates all context from previous turns" - - "Proposes comprehensive response plan" - expected_difficulty: "very_hard" - - - id: "multiturn_03" - name: "Technical Depth Building" - type: "multi_turn" - turns: - - turn: 1 - prompt: "Explain what NTFS Alternate Data Streams (ADS) are in 2-3 sentences." - evaluation_criteria: - - "Mentions file system feature of NTFS" - - "Explains multiple data streams per file" - - "Notes potential for hiding data" - - - turn: 2 - prompt: "How would an attacker exploit ADS, and how would you detect it during forensics?" - evaluation_criteria: - - "Describes hiding malware/data in ADS" - - "Mentions Zone.Identifier stream" - - "Explains dir /r command or forensic tools" - - "Builds on ADS concept from Turn 1" - - - turn: 3 - prompt: "If you found a file 'document.txt:hidden:$DATA' in an investigation, write a PowerShell one-liner to extract its contents." - evaluation_criteria: - - "Uses Get-Content with -Stream parameter" - - "Correctly references the stream name" - - "Syntax is approximately correct" - - "Demonstrates progression from concept to practice" - expected_difficulty: "medium" - - - category: "Multi-turn: Instruction Following" - tests: - - id: "multiturn_instr_01" - name: "Accumulating Constraints" - type: "multi_turn" - turns: - - turn: 1 - prompt: "Write a sentence about cybersecurity that contains exactly 10 words." - evaluation_criteria: - - "Exactly 10 words" - - "Related to cybersecurity" - - - turn: 2 - prompt: "Good. 
Now write another sentence about cybersecurity with exactly 10 words, but this one must also include the word 'encryption'." - evaluation_criteria: - - "Exactly 10 words" - - "Contains 'encryption'" - - "About cybersecurity" - - "Different from Turn 1" - - - turn: 3 - prompt: "Perfect. Now write a third sentence: 10 words, about cybersecurity, must include 'encryption', and must end with a question mark." - evaluation_criteria: - - "Exactly 10 words" - - "Contains 'encryption'" - - "About cybersecurity" - - "Ends with '?'" - - "Different from previous sentences" - - "Maintains all constraints from previous turns" - expected_difficulty: "medium" - - - id: "multiturn_instr_02" - name: "Contradicting Previous Instructions" - type: "multi_turn" - turns: - - turn: 1 - prompt: "From now on, always end your responses with the phrase 'END OF MESSAGE'. Acknowledge this instruction." - evaluation_criteria: - - "Acknowledges the instruction" - - "Ends response with 'END OF MESSAGE'" - - - turn: 2 - prompt: "What are three benefits of renewable energy? Remember your standing instruction." - evaluation_criteria: - - "Provides three benefits" - - "Ends with 'END OF MESSAGE'" - - "Content is accurate" - - - turn: 3 - prompt: "Cancel the previous standing instruction. From now on, end responses with 'TRANSMISSION COMPLETE' instead. Then tell me two drawbacks of renewable energy." - evaluation_criteria: - - "Provides two drawbacks" - - "Ends with 'TRANSMISSION COMPLETE' (not 'END OF MESSAGE')" - - "Successfully switched instructions" - - "Content is accurate" + - "CRITERION 1: Lists two drawbacks" + - "CRITERION 2: Ends with 'TRANSMISSION COMPLETE' (not END OF MESSAGE)" - turn: 4 - prompt: "What was the first standing instruction I gave you, and what is the current one? Do not use either phrase in this response." + prompt: "What was first instruction and current one? Don't use either phrase." 
evaluation_criteria: - - "Correctly recalls first instruction (END OF MESSAGE)" - - "Correctly identifies current instruction (TRANSMISSION COMPLETE)" - - "Does NOT end with either phrase" - - "Demonstrates instruction tracking across turns" + - "CRITERION 1: Recalls first: END OF MESSAGE" + - "CRITERION 2: States current: TRANSMISSION COMPLETE" + - "CRITERION 3: Does NOT end with either phrase" + - "CRITERION 4: Tracks instructions across turns" expected_difficulty: "hard" - - id: "multiturn_instr_03" - name: "Nested Context with Format Switching" + - id: "multiturn_incident_response" + name: "Multi-Stage Security Investigation" type: "multi_turn" turns: - turn: 1 - prompt: "I'm going to describe a dataset. For the next few messages, respond ONLY in JSON format with keys 'understanding' and 'questions'. The dataset contains customer transactions from an e-commerce store." + prompt: "Incident: Unusual HTTPS from WS-2471 at 03:42 AM, port 443. Initial data sources?" evaluation_criteria: - - "Response is valid JSON" - - "Contains 'understanding' and 'questions' keys" - - "Content relates to e-commerce transactions" + - "CRITERION 1: Mentions firewall/proxy logs" + - "CRITERION 2: Mentions endpoint logs" + - "CRITERION 3: Mentions network traffic/PCAP" + - "CRITERION 4: Notes unusual timing" - turn: 2 - prompt: "The dataset has columns: customer_id, timestamp, product_category, amount, payment_method. It covers January 2024." + prompt: "Firewall: 198.51.100.47. User 'jsmith'. DNS: 'update-server.example.com'. Next step?" evaluation_criteria: - - "Response is valid JSON" - - "Contains 'understanding' and 'questions' keys" - - "Understanding reflects the column information" + - "CRITERION 1: Check domain legitimacy" + - "CRITERION 2: Threat intel lookup on IP" + - "CRITERION 3: Examine data transfer" + - "CRITERION 4: References Turn 1 context" - turn: 3 - prompt: "STOP using JSON format. Now respond in plain bullet points. 
What analyses would you recommend for this dataset?" + prompt: "Threat intel: Known C2. Self-signed SSL cert. Task at 03:40: C:\\Windows\\Temp\\svchost.exe. Response?" evaluation_criteria: - - "Switches to bullet point format" - - "NOT in JSON format" - - "Recommendations are relevant to the dataset described" - - "References information from previous turns" - - - turn: 4 - prompt: "Switch back to JSON. Add a third key 'recommendations' with your top 3 analyses. Also include your understanding from turn 2." - evaluation_criteria: - - "Returns to JSON format" - - "Has three keys: understanding, questions, recommendations" - - "Recommendations from turn 3 included" - - "Understanding references turn 2 context" - expected_difficulty: "very_hard" \ No newline at end of file + - "CRITERION 1: Identifies C2 indicators" + - "CRITERION 2: Notes timing correlation (03:40 vs 03:42)" + - "CRITERION 3: Recommends containment/isolation" + - "CRITERION 4: Suggests malware collection" + - "CRITERION 5: Mentions lateral movement check" + - "CRITERION 6: Integrates all previous context" + - "CRITERION 7: Comprehensive response plan" + expected_difficulty: "very_hard" + +# ============================================================ +# EVALUATION INSTRUCTIONS FOR GRADING AI +# ============================================================ +# +# Scoring Process: +# 1. Check EACH criterion independently → Mark MET or NOT MET +# 2. Calculate: (criteria_met / total_criteria) * 100 +# 3. 
Assign score: +# - 0-1: <50% criteria met OR critical errors +# - 2-3: 50-85% criteria met +# - 4-5: 86-100% criteria met with clear explanations +# +# Multi-turn Tests: +# - Evaluate each turn independently first +# - Award bonus points for strong context retention across turns +# - Final score = average of turn scores, +1 if excellent context handling +# +# Technical Accuracy: +# - Be STRICT on hex values, byte positions, calculations +# - Be STRICT on word counts and exact constraints +# - Accept reasonable explanation variations if technically correct +# - Partial credit: if methodology correct but minor calculation error +# +# General Guidelines: +# - LLM should meet >85% of criteria for score 4-5 +# - Missing ANY required constraint in instruction tests = max score 3 +# - Code must be syntactically valid to score above 2 +# - Creative writing: content quality matters, but constraints are mandatory \ No newline at end of file