# llm-eval-forensics/test_suite.yaml

# AI Model Evaluation Test Suite - General LLM Benchmark
# 20 hardest tests across diverse capabilities
# Designed for comprehensive evaluation of locally hosted LLMs

# Suite-level descriptive metadata.
metadata:
  version: "3.0-general"
  author: "AI Evaluation Framework"
  # Must equal the number of test entries defined under test_categories:
  # 3 + 2 + 3 + 2 + 2 + 2 + 3 + 3 = 20.
  test_count: 20
  focus: "Balanced evaluation across reasoning, code, language, creativity, and technical domains"
  difficulty: "All tests rated hard or very_hard - filters out easy wins for modern LLMs"

# Score bands on the 0-5 scale, each with a short description.
# The ranges are quoted so they are explicitly strings.
scoring_rubric:
  fail:
    score: '0-1'
    description: 'Fails to meet majority of criteria or makes critical errors'
  pass:
    score: '2-3'
    description: 'Meets most criteria with minor issues or omissions'
  exceptional:
    score: '4-5'
    description: 'Meets all criteria accurately with clear explanations'

test_categories:

  # ========== LOGIC & REASONING (3 tests) ==========

  - category: "Logic & Reasoning"
    tests:
      # Single-turn clock puzzle solved with one linear equation.
      # The criteria below expect t = 15, i.e. 3:00 PM.
      - id: 'logic_temporal'
        name: 'Temporal Reasoning Puzzle'
        type: 'single_turn'
        prompt: >-
          If it was two hours ago, it would have been as long after 1:00 PM
          as it was before 1:00 PM today. What time is it now? Show your
          step-by-step algebraic solution.
        evaluation_criteria:
          - 'CRITERION 1: Sets up equation correctly: (t-2) - 13:00 = 13:00 - (t-2)'
          - 'CRITERION 2: Simplifies to: t - 2 - 13 = 13 - t + 2'
          - 'CRITERION 3: Solves for t: 2t = 30, so t = 15'
          - 'CRITERION 4: States answer as 3:00 PM or 15:00'
          - 'CRITERION 5: Shows clear algebraic steps'
          - 'CRITERION 6: Verifies the answer (optional but demonstrates thoroughness)'
        expected_difficulty: 'hard'

      # Five-house deduction puzzle with 15 clues.
      # Expected answer per criterion 1: the German owns the fish.
      - id: 'logic_zebra'
        name: "Multi-Constraint Deduction (Einstein's Riddle)"
        type: 'single_turn'
        prompt: |
          Five houses in a row, each a different color. Owners from different countries, different beverages, different cigarettes, different pets.

          1. Brit lives in red house
          2. Swede keeps dogs
          3. Dane drinks tea
          4. Green house immediately left of white house
          5. Green house owner drinks coffee
          6. Pall Mall smoker keeps birds
          7. Yellow house owner smokes Dunhill
          8. Center house owner drinks milk
          9. Norwegian lives in first house
          10. Blend smoker lives next to cat owner
          11. Horse owner lives next to Dunhill smoker
          12. Blue Master smoker drinks beer
          13. German smokes Prince
          14. Norwegian lives next to blue house
          15. Blend smoker has neighbor who drinks water

          Who owns the fish?
        evaluation_criteria:
          - 'CRITERION 1: Correctly identifies German owns the fish'
          - 'CRITERION 2: Shows systematic constraint application (at least partial deduction steps)'
          - 'CRITERION 3: Correctly handles spatial constraints (left/right, next to, positions 1-5)'
          - 'CRITERION 4: Arrives at solution without logical contradictions'
          - 'CRITERION 5: Demonstrates constraint propagation understanding'
        expected_difficulty: 'very_hard'

      # Scheduling puzzle: 6 tasks, 3 workers, precedence and worker limits.
      # The dependency chain Task 2 (3h) -> Task 5 (4h) forces a makespan of
      # at least 7 hours, and 7 hours is achievable:
      #   A: Task 2 (0-3), Task 5 (3-7)
      #   B: Task 1 (0-2), Task 4 (2-4), Task 6 (4-6)
      #   C: Task 3 (0-1)
      # So the optimum is exactly 7 hours; 8 or 9 hours is not optimal.
      - id: "logic_multi_constraint"
        name: "Resource Scheduling with Dependencies"
        type: "single_turn"
        prompt: |
          Schedule 6 tasks across 3 workers (A, B, C) to minimize completion time:

          Task 1: 2 hours, requires Worker A or B, must complete before Task 4
          Task 2: 3 hours, any worker, must complete before Task 5
          Task 3: 1 hour, Worker C only, no dependencies
          Task 4: 2 hours, Worker B or C, depends on Task 1
          Task 5: 4 hours, Worker A only, depends on Task 2
          Task 6: 2 hours, any worker, depends on Tasks 3 AND 4

          Provide: (1) Timeline with start/end times, (2) Worker assignments, (3) Total time, (4) Why this is optimal.
        evaluation_criteria:
          - "CRITERION 1: Respects all worker constraints (A/B/C restrictions)"
          - "CRITERION 2: Respects all dependencies correctly"
          - "CRITERION 3: Provides clear timeline with times"
          - "CRITERION 4: Total completion time is 7 hours (optimal; bounded by the Task 2 → Task 5 chain, 3h + 4h)"
          - "CRITERION 5: Explains optimization reasoning"
          - "CRITERION 6: No worker assigned to multiple simultaneous tasks"
        expected_difficulty: "hard"

  # ========== MATHEMATICS & CALCULATION (2 tests) ==========

  - category: "Mathematics & Calculation"
    tests:
      # Three-year compound-interest problem with a mid-year withdrawal and a
      # deposit. Reference values, recomputed from the prompt:
      #   End of Year 1:        10000 * (1 + 0.05/4)^4        = 10,509.45
      #   Year 2 after 6 months: 10509.45 * (1 + 0.045/12)^6  = 10,748.14
      #   After $500 withdrawal:                                10,248.14
      #   End of Year 2:         10248.14 * (1 + 0.045/12)^6  = 10,480.90
      #   Start of Year 3 (+$1,000):                            11,480.90
      #   End of Year 3:         11480.90 * (1 + 0.06/365)^365 = 12,190.78
      # The accepted range in criterion 6 allows for intermediate rounding.
      - id: "math_compound_interest"
        name: "Multi-Year Compound Interest with Variables"
        type: "single_turn"
        prompt: |
          Investment starts at $10,000:
          - Year 1: 5% annual, compounded quarterly
          - Year 2: 4.5% annual, compounded monthly, $500 withdrawal at end of Q2
          - Year 3: 6% annual, compounded daily (365 days), $1,000 deposit at start

          Calculate final balance at end of Year 3. Show all intermediate calculations to 2 decimal places.
        evaluation_criteria:
          - "CRITERION 1: Year 1 calculation correct: 10000 * (1 + 0.05/4)^4 ≈ $10,509.45"
          - "CRITERION 2: Year 2 Q1-Q2 correct with mid-year withdrawal"
          - "CRITERION 3: Year 2 Q3-Q4 calculation accounts for reduced principal"
          - "CRITERION 4: Year 3 correctly adds $1,000 at start"
          - "CRITERION 5: Year 3 daily compounding: (principal) * (1 + 0.06/365)^365"
          - "CRITERION 6: Final answer approximately $12,188-$12,193"
          - "CRITERION 7: Shows all intermediate step values"
        expected_difficulty: "very_hard"

      # Unit-conversion chain: gallons -> liters, miles -> km, then the
      # remaining range at 8.5 L/100 km. The criteria expect about 475 km.
      - id: 'math_cross_system'
        name: 'Multi-Unit Conversion with Calculation'
        type: 'single_turn'
        prompt: >-
          A vehicle consumes 8.5 liters per 100 km. Tank holds 15 gallons.
          After traveling 120 miles from full, how many kilometers of range
          remain? (1 gal = 3.785 L, 1 mi = 1.609 km). Show conversion steps.
        evaluation_criteria:
          - 'CRITERION 1: Converts tank capacity: 15 * 3.785 = 56.775 liters'
          - 'CRITERION 2: Converts distance traveled: 120 * 1.609 = 193.08 km'
          - 'CRITERION 3: Calculates fuel used: 193.08 / 100 * 8.5 ≈ 16.41 liters'
          - 'CRITERION 4: Remaining fuel: 56.775 - 16.41 ≈ 40.36 liters'
          - 'CRITERION 5: Range calculation: 40.36 / 8.5 * 100 ≈ 475 km'
          - 'CRITERION 6: Shows all conversion steps clearly'
        expected_difficulty: 'hard'

  # ========== INSTRUCTION FOLLOWING (3 tests) ==========

  - category: "Instruction Following with Constraints"
    tests:
      # Four sentences with per-sentence word counts, one required word,
      # one forbidden word, and a required final punctuation mark.
      - id: 'instr_multi_constraint'
        name: 'Multi-Constraint Word Counting'
        type: 'single_turn'
        prompt: >-
          Write exactly 4 sentences about photosynthesis. Sentence 1: exactly
          7 words. Sentence 2: exactly 11 words. Sentence 3: contains
          'chloroplast' but NOT 'plant'. Sentence 4: exactly 9 words ending
          with '!'
        evaluation_criteria:
          - 'CRITERION 1: Exactly 4 sentences total'
          - 'CRITERION 2: Sentence 1 has exactly 7 words'
          - 'CRITERION 3: Sentence 2 has exactly 11 words'
          - "CRITERION 4: Sentence 3 contains 'chloroplast'"
          - "CRITERION 5: Sentence 3 does NOT contain 'plant'"
          - 'CRITERION 6: Sentence 4 has exactly 9 words'
          - "CRITERION 7: Sentence 4 ends with '!'"
          - 'CRITERION 8: Content is scientifically accurate'
        expected_difficulty: 'hard'

      # Five-sentence explanation with banned words, a total word budget,
      # one fixed-length sentence, one question, and a long-word rule.
      - id: 'instr_negative_constraints'
        name: 'Negative Constraints with Precise Counting'
        type: 'single_turn'
        prompt: >-
          Explain neural networks in exactly 5 sentences. Total: 65-75 words.
          CANNOT use: 'brain', 'artificial', 'intelligence', 'AI', 'learning'.
          Sentence 2: exactly 13 words. Sentence 4: must be a question.
          Every sentence: at least one 10+ letter word.
        evaluation_criteria:
          - 'CRITERION 1: Exactly 5 sentences'
          - 'CRITERION 2: Total word count is 65-75'
          - 'CRITERION 3: Does NOT contain: brain, artificial, intelligence, AI, learning'
          - 'CRITERION 4: Sentence 2 has exactly 13 words'
          - 'CRITERION 5: Sentence 4 is a question (ends with ?)'
          - 'CRITERION 6: Every sentence has at least one word with 10+ letters'
          - 'CRITERION 7: Content is technically accurate'
        expected_difficulty: 'very_hard'

      # Six-sentence acrostic (QUANTA) with formatting, length,
      # banned-word, and exact-occurrence constraints.
      - id: 'instr_acrostic'
        name: 'Acrostic with Multiple Constraints'
        type: 'single_turn'
        prompt: |
          Write 6 sentences explaining quantum computing. First letters spell "QUANTA". Constraints:
          1. NO markdown formatting (no **, __, *, etc.)
          2. Sentence 3: exactly 14 words
          3. Sentence 5: rhetorical question ending with ?
          4. CANNOT use: "supercomputer", "bit", "IBM"
          5. Total: 70-85 words
          6. Word "superposition" appears exactly once
        evaluation_criteria:
          - 'CRITERION 1: First letters spell QUANTA vertically'
          - 'CRITERION 2: No markdown formatting anywhere in response'
          - 'CRITERION 3: Sentence 3 has exactly 14 words'
          - 'CRITERION 4: Sentence 5 is rhetorical question'
          - 'CRITERION 5: Does not contain: supercomputer, bit, IBM'
          - 'CRITERION 6: Total word count 70-85'
          - "CRITERION 7: 'superposition' appears exactly once (not zero, not twice)"
          - 'CRITERION 8: Technically accurate content'
        expected_difficulty: 'very_hard'

  # ========== CODE GENERATION (2 tests) ==========

  - category: "Code Generation - Advanced"
    tests:
      # Code-generation task: a Python token-bucket rate limiter with a
      # thread-safe sync path and an async waiting path, plus a demo test.
      - id: 'code_rate_limiter'
        name: 'Concurrent Token Bucket Rate Limiter'
        type: 'single_turn'
        prompt: |
          Write a Python class `RateLimiter` implementing token bucket algorithm:

          Requirements:
          1. Constructor: `__init__(rate, capacity)` - tokens/sec, max tokens
          2. `acquire(tokens=1)` - returns True/False, must be thread-safe
          3. `wait_and_acquire(tokens=1)` - async method, blocks until available
          4. `get_available_tokens()` - returns current count
          5. Time-based token replenishment
          6. Handle edge case: requested tokens > capacity

          Include: Complete implementation + test demonstrating both sync and async usage.
        evaluation_criteria:
          - 'CRITERION 1: Implements token bucket algorithm correctly'
          - 'CRITERION 2: Thread-safe synchronous acquire (uses locks/threading primitives)'
          - 'CRITERION 3: Working async wait_and_acquire using asyncio'
          - 'CRITERION 4: Proper time-based token replenishment calculation'
          - 'CRITERION 5: Handles edge case where tokens > capacity (rejects or waits)'
          - 'CRITERION 6: Includes complete working test code'
          - 'CRITERION 7: Code is syntactically correct Python'
        expected_difficulty: 'very_hard'

      # Code-generation task: a fluent, parameterized SQL query builder.
      # Criterion 4 mirrors prompt requirement 4, which includes OFFSET.
      - id: "code_sql_builder"
        name: "SQL Query Builder with Injection Prevention"
        type: "single_turn"
        prompt: |
          Create Python class `SafeQueryBuilder` with fluent interface:

          `builder.select('name', 'age').from_table('users').where('age', '>', 18).where('status', '=', 'active').order_by('name').limit(10).build()`

          Requirements:
          1. Fluent interface (method chaining)
          2. SQL injection prevention via parameterization
          3. `build()` returns tuple: (query_string, parameters_list)
          4. Support: SELECT, FROM, WHERE (multiple), ORDER BY, LIMIT, OFFSET
          5. WHERE operators: =, !=, >, <, >=, <=, LIKE, IN

          Demo: Query selecting users where name LIKE '%john%' AND age IN (25,30,35), ordered by created_at DESC, limit 5.
        evaluation_criteria:
          - "CRITERION 1: Correct fluent interface pattern (returns self for chaining)"
          - "CRITERION 2: Parameterized queries (no direct value injection in SQL string)"
          - "CRITERION 3: Returns (query, params) tuple from build()"
          - "CRITERION 4: All required operations work (SELECT, FROM, WHERE, ORDER BY, LIMIT, OFFSET)"
          - "CRITERION 5: WHERE with IN clause handled correctly"
          - "CRITERION 6: Example output demonstrates LIKE and IN with proper parameterization"
          - "CRITERION 7: Code is syntactically correct and runnable"
        expected_difficulty: "hard"

  # ========== LANGUAGE NUANCE (2 tests) ==========

  - category: "Language Understanding & Pragmatics"
    tests:
      # Rewrite one sentence in four registers, then explain three
      # linguistic changes per register (12 in total, per criterion 5).
      - id: 'lang_register_shift'
        name: 'Register Shifting Analysis'
        type: 'single_turn'
        prompt: |
          Rewrite in FOUR registers: "The quarterly report shows we lost money because our main product didn't sell well and we spent too much on advertising."

          1. Formal board presentation (C-suite)
          2. Casual Slack message to your team
          3. Legal disclosure document
          4. Email to non-English speaking partner (simple, clear)

          Then: Explain THREE specific linguistic changes for each register and why.
        evaluation_criteria:
          - "CRITERION 1: Board version uses formal financial terminology (e.g., 'fiscal performance', 'revenue shortfall')"
          - 'CRITERION 2: Slack version uses casual/colloquial language appropriately'
          - 'CRITERION 3: Legal version uses hedging, passive voice, precise language'
          - 'CRITERION 4: Simple version avoids idioms and complex grammatical structures'
          - 'CRITERION 5: Identifies at least 3 specific changes per register (12 total)'
          - 'CRITERION 6: Explanations demonstrate metalinguistic awareness'
          - 'CRITERION 7: All four versions convey the same core information'
        expected_difficulty: 'very_hard'

      # Pragmatics analysis of a five-line workplace dialogue: stated
      # content, implicature, presupposition, and illocutionary force.
      - id: 'lang_implicature'
        name: 'Implicature and Presupposition Analysis'
        type: 'single_turn'
        prompt: |
          Analyze this dialogue:

          A: "Have you finished the Anderson report yet?"
          B: "I've been dealing with the server outage all morning."
          A: "Right. Well, the client is flying in tomorrow."
          B: "I noticed you CC'd the whole department on that email."
          A: "Just keeping everyone in the loop."

          For each line, identify: (1) What's directly stated, (2) What's implied, (3) What's presupposed, (4) Illocutionary force. Then explain the underlying conflict.
        evaluation_criteria:
          - "CRITERION 1: Identifies B's implicature: excuse/hasn't finished report"
          - "CRITERION 2: Recognizes A's implied criticism in 'Right. Well...'"
          - "CRITERION 3: Identifies B's counter-accusation about CC'ing"
          - 'CRITERION 4: Correctly identifies presuppositions (report exists, outage occurred, email sent)'
          - 'CRITERION 5: Labels illocutionary acts (request, excuse, threat/pressure, accusation, justification)'
          - 'CRITERION 6: Explains underlying workplace tension/power dynamic'
          - 'CRITERION 7: Uses appropriate pragmatic terminology'
        expected_difficulty: 'very_hard'

  # ========== CREATIVE WRITING (2 tests) ==========

  - category: "Creative Writing with Constraints"
    tests:
      # Constrained creative task: a three-paragraph manual excerpt whose
      # narrator is unreliable without the text ever saying so.
      - id: 'creative_unreliable'
        name: 'Unreliable Narrator Technical Manual'
        type: 'single_turn'
        prompt: |
          Write a 3-paragraph product manual excerpt for a "Time Displacement Device" from an unreliable narrator who is clearly lying/delusional, but the text must still function as a coherent manual.

          Requirements:
          1. At least 3 numbered safety warnings (subtly absurd but grammatically serious)
          2. Narrator contradicts themselves at least twice
          3. One footnote undermining the main text
          4. NO exclamation marks anywhere
          5. Maintain formal technical writing style
          6. Do NOT explicitly state narrator is unreliable
        evaluation_criteria:
          - 'CRITERION 1: Exactly 3 paragraphs'
          - 'CRITERION 2: Contains 3+ numbered safety warnings that are absurd yet formal'
          - 'CRITERION 3: Contains at least 2 clear self-contradictions'
          - 'CRITERION 4: Includes footnote that undermines main text'
          - 'CRITERION 5: No exclamation marks used'
          - 'CRITERION 6: Formal technical style maintained throughout'
          - 'CRITERION 7: Unreliability shown through text, not stated'
          - 'CRITERION 8: Functions as coherent manual despite unreliability'
        expected_difficulty: 'very_hard'

      # Constrained creative task: a five-sentence story told in reverse
      # chronological order with a strict 75-word total.
      - id: 'creative_reverse_chrono'
        name: 'Reverse Chronology Story'
        type: 'single_turn'
        prompt: |
          Write a 5-sentence story in reverse chronological order about a scientist making a discovery.

          Constraints:
          - Each sentence from different time point
          - True meaning only clear at end (last sentence)
          - Include at least one dialogue
          - Exactly 75 words total (not 74, not 76)
        evaluation_criteria:
          - 'CRITERION 1: Exactly 5 sentences'
          - 'CRITERION 2: Clear reverse chronological order'
          - "CRITERION 3: About scientist's discovery"
          - 'CRITERION 4: Each sentence is distinct time point'
          - 'CRITERION 5: Meaning/twist revealed at end'
          - 'CRITERION 6: Contains dialogue'
          - 'CRITERION 7: Exactly 75 words (strict count)'
          - 'CRITERION 8: Coherent narrative when read backwards'
        expected_difficulty: 'very_hard'

  # ========== DIGITAL FORENSICS (3 tests) ==========

  - category: "Digital Forensics & Binary Analysis"
    tests:
      # Hex-dump parsing task: read little-endian fields from the first
      # 32 bytes of an NTFS MFT entry (signature bytes 46 49 4C 45).
      - id: 'forensics_mft'
        name: 'NTFS MFT Entry Analysis'
        type: 'single_turn'
        prompt: |
          Analyze this NTFS MFT entry header:

          Offset(h) 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F
          00000000  46 49 4C 45 30 00 03 00 EA 3F 00 00 00 00 00 00
          00000010  01 00 01 00 38 00 01 00 68 01 00 00 00 04 00 00

          Extract: (1) Update sequence array offset, (2) USA size, (3) LSN, (4) First attribute offset, (5) Entry flags meaning.
        evaluation_criteria:
          - 'CRITERION 1: USA offset = 0x0030 (from bytes at 0x04-05: 30 00 little-endian)'
          - 'CRITERION 2: USA size = 0x0003 (from bytes at 0x06-07: 03 00)'
          - 'CRITERION 3: LSN = 0x00003FEA (from bytes 0x08-0F: EA 3F 00 00... little-endian)'
          - 'CRITERION 4: First attribute = 0x0038 (from bytes at 0x14-15: 38 00)'
          - "CRITERION 5: Flags = 0x0001 at offset 0x16-17, meaning 'in use'"
          - 'CRITERION 6: Demonstrates little-endian understanding'
        expected_difficulty: 'very_hard'

      # Timestamp-anomaly task for 'report.xlsx'.
      # NOTE: on NTFS a plain file copy keeps the source Modified time and
      # assigns a fresh Created time, so mtime < crtime is an anomaly worth
      # flagging but is not impossible on its own. Criterion 1 is worded
      # accordingly; the remaining criteria keep the original expectations.
      - id: "forensics_timestamp_anti"
        name: "Anti-Forensics Timestamp Detection"
        type: "single_turn"
        prompt: |
          File 'report.xlsx' timestamps:
          - Created: 2024-03-15 09:30:00
          - Modified: 2024-03-14 16:45:00
          - Accessed: 2024-03-15 10:00:00
          - Changed: 2024-03-15 09:30:00

          Artifacts: $MFT shows created 2024-03-15, $UsnJrnl shows rename from 'temp_x7k.xlsx' at 09:30:00.

          Identify ALL anomalies and explain anti-forensic indicators.
        evaluation_criteria:
          - "CRITERION 1: Identifies the mtime < crtime anomaly (modified before creation; not possible for a file authored in place)"
          - "CRITERION 2: States this indicates timestamp manipulation/timestomping"
          - "CRITERION 3: Notes suspicious temp filename pattern"
          - "CRITERION 4: Explains ctime harder to forge than mtime"
          - "CRITERION 5: Concludes file likely copied with forged timestamps"
          - "CRITERION 6: Correlates $MFT and $UsnJrnl evidence"
        expected_difficulty: "very_hard"

      # Log-based attack reconstruction.
      # The log shows `nc -l -p 4444` (listen mode) followed by an INBOUND
      # connection from 203.0.113.50 to port 4444. That is a bind-style
      # listener, not a reverse shell, so criteria 4, 5 and 8 accept either
      # term as long as the mechanism is described correctly.
      # Criterion 9 covers the "identify 4+ IOCs" part of the prompt, which
      # no criterion checked before.
      - id: "forensics_incident_timeline"
        name: "Attack Timeline Reconstruction"
        type: "single_turn"
        prompt: |
          Reconstruct attack from logs:

          14:23:15 - User 'admin' login from 10.0.0.5
          14:23:47 - File read: /etc/passwd by admin
          14:24:12 - File write: /var/www/upload/shell.php by admin
          14:24:45 - Process: nc -l -p 4444 by admin
          14:25:01 - Connection: 10.0.0.5:4444 <- 203.0.113.50:52341
          14:26:33 - User admin logout
          14:30:00 - Login attempt admin from 203.0.113.50 FAILED

          Describe attack narrative and identify 4+ IOCs.
        evaluation_criteria:
          - "CRITERION 1: Identifies initial admin login from internal IP"
          - "CRITERION 2: Recognizes /etc/passwd as reconnaissance"
          - "CRITERION 3: Identifies shell.php as web shell deployment"
          - "CRITERION 4: Recognizes nc listener on port 4444 as remote shell/backdoor setup (bind-style listener; 'reverse shell' wording also accepted)"
          - "CRITERION 5: Identifies external connection from 203.0.113.50 to the port 4444 listener as attacker remote access/callback"
          - "CRITERION 6: Notes failed login as re-entry attempt"
          - "CRITERION 7: Constructs coherent attack timeline"
          - "CRITERION 8: Identifies attack type (web compromise → remote shell via netcat listener; bind or reverse shell wording accepted)"
          - "CRITERION 9: Identifies at least 4 IOCs (e.g., 203.0.113.50, shell.php, nc listener on port 4444, failed external login)"
        expected_difficulty: "hard"

  # ========== MULTI-TURN TESTS (3 tests) ==========

  - category: "Multi-turn Context & Instruction Handling"
    tests:
      # Three-turn binary identification: MZ header, then the PE signature
      # reached through the pointer at 0x3C, then locating the import table.
      - id: 'multiturn_forensics'
        name: 'Progressive Binary Analysis'
        type: 'multi_turn'
        turns:
          - turn: 1
            prompt: |-
              Analyzing a file. Beginning: 4D 5A 90 00 03 00 00 00

              What file type?
            evaluation_criteria:
              - 'CRITERION 1: Identifies MZ header'
              - 'CRITERION 2: States DOS/Windows executable'

          - turn: 2
            prompt: 'Offset 0x3C: 80 00 00 00. At 0x80: 50 45 00 00. What does this mean?'
            evaluation_criteria:
              - 'CRITERION 1: Recognizes 0x3C contains PE offset pointer'
              - 'CRITERION 2: Identifies PE signature at 0x80'
              - 'CRITERION 3: Concludes Windows PE executable'
              - 'CRITERION 4: References MZ from Turn 1'

          - turn: 3
            prompt: 'To examine import table, what structure and where?'
            evaluation_criteria:
              - 'CRITERION 1: Mentions Import Directory/Table'
              - 'CRITERION 2: References PE Optional Header or Data Directory'
              - 'CRITERION 3: Maintains PE context from previous turns'
        expected_difficulty: 'hard'

      # Four-turn test of a standing sign-off instruction that is set,
      # applied, replaced, and finally recalled without being applied.
      - id: 'multiturn_instruction_switching'
        name: 'Contradicting Instructions Across Turns'
        type: 'multi_turn'
        turns:
          - turn: 1
            prompt: >-
              From now on, always end responses with 'END OF MESSAGE'.
              Acknowledge this.
            evaluation_criteria:
              - 'CRITERION 1: Acknowledges instruction'
              - "CRITERION 2: Ends with 'END OF MESSAGE'"

          - turn: 2
            prompt: 'List three renewable energy benefits. Follow standing instruction.'
            evaluation_criteria:
              - 'CRITERION 1: Lists three benefits'
              - "CRITERION 2: Ends with 'END OF MESSAGE'"

          - turn: 3
            prompt: >-
              Cancel previous instruction. Now end with 'TRANSMISSION COMPLETE'.
              Give two renewable energy drawbacks.
            evaluation_criteria:
              - 'CRITERION 1: Lists two drawbacks'
              - "CRITERION 2: Ends with 'TRANSMISSION COMPLETE' (not END OF MESSAGE)"

          - turn: 4
            prompt: >-
              What was first instruction and current one? Don't use either
              phrase.
            evaluation_criteria:
              - 'CRITERION 1: Recalls first: END OF MESSAGE'
              - 'CRITERION 2: States current: TRANSMISSION COMPLETE'
              - 'CRITERION 3: Does NOT end with either phrase'
              - 'CRITERION 4: Tracks instructions across turns'
        expected_difficulty: 'hard'

      # Three-turn incident investigation: data sources, follow-up on the
      # destination IP/domain, then the response once C2 is confirmed.
      - id: 'multiturn_incident_response'
        name: 'Multi-Stage Security Investigation'
        type: 'multi_turn'
        turns:
          - turn: 1
            prompt: 'Incident: Unusual HTTPS from WS-2471 at 03:42 AM, port 443. Initial data sources?'
            evaluation_criteria:
              - 'CRITERION 1: Mentions firewall/proxy logs'
              - 'CRITERION 2: Mentions endpoint logs'
              - 'CRITERION 3: Mentions network traffic/PCAP'
              - 'CRITERION 4: Notes unusual timing'

          - turn: 2
            prompt: >-
              Firewall: 198.51.100.47. User 'jsmith'. DNS:
              'update-server.example.com'. Next step?
            evaluation_criteria:
              - 'CRITERION 1: Check domain legitimacy'
              - 'CRITERION 2: Threat intel lookup on IP'
              - 'CRITERION 3: Examine data transfer'
              - 'CRITERION 4: References Turn 1 context'

          - turn: 3
            # Single-quoted, so the backslashes in the Windows path are literal.
            prompt: 'Threat intel: Known C2. Self-signed SSL cert. Task at 03:40: C:\Windows\Temp\svchost.exe. Response?'
            evaluation_criteria:
              - 'CRITERION 1: Identifies C2 indicators'
              - 'CRITERION 2: Notes timing correlation (03:40 vs 03:42)'
              - 'CRITERION 3: Recommends containment/isolation'
              - 'CRITERION 4: Suggests malware collection'
              - 'CRITERION 5: Mentions lateral movement check'
              - 'CRITERION 6: Integrates all previous context'
              - 'CRITERION 7: Comprehensive response plan'
        expected_difficulty: 'very_hard'

# ============================================================
# EVALUATION INSTRUCTIONS FOR GRADING AI
# ============================================================
#
# Scoring Process:
# 1. Check EACH criterion independently → Mark MET or NOT MET
# 2. Calculate: (criteria_met / total_criteria) * 100
# 3. Assign score:
#    - 0-1: <50% criteria met OR critical errors
#    - 2-3: 50% to <85% criteria met
#    - 4-5: ≥85% criteria met with clear explanations
#
# Multi-turn Tests:
# - Evaluate each turn independently first
# - Award bonus points for strong context retention across turns
# - Final score = average of turn scores, +1 if excellent context handling (capped at 5)
#
# Technical Accuracy:
# - Be STRICT on hex values, byte positions, calculations
# - Be STRICT on word counts and exact constraints
# - Accept reasonable explanation variations if technically correct
# - Partial credit: if methodology correct but minor calculation error
#
# General Guidelines:
# - LLM should meet ≥85% of criteria for score 4-5
# - Missing ANY required constraint in instruction tests = max score 3
# - Code must be syntactically valid to score above 2
# - Creative writing: content quality matters, but constraints are mandatory