# llm-eval-forensics/test_suite.yaml

# AI Model Evaluation Test Suite
# Focus: General reasoning + IT Forensics (Academic)

# Suite-level metadata. `focus_areas` summarizes the categories defined under
# `test_categories`; keep it in sync when a category is added or removed.
metadata:
  version: "1.0"
  author: "AI Evaluation Framework"
  focus_areas:
    - "Logic & Reasoning"
    - "Mathematics & Calculation"
    - "Instruction Following"
    - "Creative Writing"
    - "Code Generation"
    - "Language Nuance"
    # Present as a category below, so it is listed here as well.
    - "Problem Solving & Logistics"
    - "IT Forensics"
    - "Multi-turn Conversations"

# Scoring rubric for all tests
scoring_rubric:
  # Each band carries a score range (kept as a string such as '0-1') and a
  # short description of what earns it.
  fail:
    score: '0-1'
    description: 'Major errors, fails to meet basic requirements'
  pass:
    score: '2-3'
    description: 'Meets requirements with minor issues'
  exceptional:
    score: '4-5'
    description: 'Exceeds requirements, demonstrates deep understanding'

# Individual test categories
test_categories:

  # ========== GENERAL REASONING TESTS ==========

  # Single-turn logic puzzles. Each test lists the prompt sent to the model and
  # the criteria a grader checks the answer against.
  - category: "Logic & Reasoning"
    tests:
      # Riddle: the three doctors call Bill their brother, yet Bill has no
      # brothers, so the doctors must be his sisters.
      - id: "logic_01"
        name: "Family Logic Puzzle"
        type: "single_turn"
        prompt: "Three doctors said that Bill is their brother. Bill says he has no brothers. How many brothers does Bill actually have?"
        evaluation_criteria:
          - "Correctly identifies that the three doctors are women (Bill's sisters)"
          - "Answers: 0 brothers"
          - "Explains the logical deduction"
        expected_difficulty: "medium"

      # With t = current time, the setup (t-2) - 13:00 = 13:00 - (t-2) gives
      # t - 2 = 13:00, i.e. t = 15:00. The expected answer must match that.
      # NOTE(review): the wording resembles the classic "as long after 1 PM as
      # before 1 AM" riddle (answer 9 PM) - confirm the intended prompt.
      - id: "logic_02"
        name: "Temporal Reasoning"
        type: "single_turn"
        prompt: "If it was two hours ago, it would have been as long after 1:00 PM as it was before 1:00 PM today. What time is it now? Explain your deduction step-by-step."
        evaluation_criteria:
          - "Shows algebraic setup: (t-2) - 13:00 = 13:00 - (t-2)"
          - "Correct answer: 3:00 PM (15:00)"
          - "Clear step-by-step reasoning"
        expected_difficulty: "hard"

  # Single-turn arithmetic tests; criteria pin the expected intermediate and
  # final values.
  - category: "Mathematics & Calculation"
    tests:
      # 240 + 180 = 420 mi; 3 h + 0.75 h + 2 h = 5.75 h; 420 / 5.75 = 73.04 mph.
      - id: "math_01"
        name: "Average Speed with Stop"
        type: "single_turn"
        prompt: "If a train travels 240 miles in 3 hours, then stops for 45 minutes, then travels another 180 miles in 2 hours, what is the average speed for the entire journey including the stop?"
        evaluation_criteria:
          - "Total distance: 420 miles"
          - "Total time: 5.75 hours"
          - "Average speed: 73.04 mph (approximately)"
          - "Shows calculation steps"
        expected_difficulty: "medium"

      # Worked answer with the conversion factors given in the prompt:
      #   tank      = 15 gal * 3.785     = 56.775 L
      #   full range = 56.775 / 8.5 * 100 = 667.9 km
      #   driven    = 120 mi * 1.609     = 193.08 km
      #   remaining = 667.9 - 193.08     = 474.9 km
      - id: "math_02"
        name: "Cross-System Fuel Calculation"
        type: "single_turn"
        prompt: "A vehicle consumes 8.5 liters of fuel for every 100 kilometers traveled. If the fuel tank holds 15 gallons, and the car has already traveled 120 miles starting from a full tank, how many kilometers of range are left? (Use: 1 gallon = 3.785 liters; 1 mile = 1.609 km)."
        evaluation_criteria:
          - "Correct unit conversions (gallons to liters, miles to km)"
          - "Accurate fuel consumption calculation"
          - "Remaining range calculation: approximately 475 km (accept 470-480 km)"
          - "Shows intermediate steps"
        expected_difficulty: "hard"

  # Single-turn tests whose prompts impose countable formatting constraints;
  # each criterion maps to one constraint in the prompt.
  - category: 'Instruction Following'
    tests:
      - id: 'instr_01'
        name: 'Photosynthesis Constraints'
        type: 'single_turn'
        # Folded scalar: the lines below are joined with single spaces.
        prompt: >-
          Write exactly 3 sentences about photosynthesis. The first sentence
          must be exactly 8 words long. The second must contain the word
          'chlorophyll'. The third must end with a question mark.
        evaluation_criteria:
          - 'Exactly 3 sentences'
          - 'First sentence exactly 8 words'
          - "Second contains 'chlorophyll'"
          - "Third ends with '?'"
          - 'Content is accurate about photosynthesis'
        expected_difficulty: 'medium'

      - id: 'instr_02'
        name: 'Quantum Entanglement Negative Constraints'
        type: 'single_turn'
        # Combines positive constraints with a list of forbidden words.
        prompt: >-
          Summarize the concept of 'Quantum Entanglement' in exactly 4
          sentences. 1) The first sentence must be exactly 12 words long.
          2) You CANNOT use the words 'particle', 'physics', or 'Einstein' in
          any part of the response. 3) The third sentence must be a question.
          4) The final word of the summary must be 'connected'.
        evaluation_criteria:
          - 'Exactly 4 sentences'
          - 'First sentence exactly 12 words'
          - 'No forbidden words (particle, physics, Einstein)'
          - 'Third sentence is a question'
          - "Ends with 'connected'"
        expected_difficulty: 'very_hard'

  # Single-turn creative prompts; criteria mix structural checks (paragraph
  # count, forbidden words) with qualitative ones.
  - category: 'Creative Writing'
    tests:
      - id: 'creative_01'
        name: 'Lighthouse Keeper Story'
        type: 'single_turn'
        prompt: >-
          Write a two-paragraph story about a lighthouse keeper who discovers
          something unusual. Use vivid sensory details.
        evaluation_criteria:
          - 'Exactly 2 paragraphs'
          - 'Vivid sensory details (sight, sound, smell, touch, taste)'
          - 'Coherent narrative'
          - 'Creative and engaging'
        expected_difficulty: 'medium'

      - id: 'creative_02'
        name: 'Victorian Greenhouse with Constraints'
        type: 'single_turn'
        # Adds required figures of speech and a forbidden-word list.
        prompt: >-
          Write a two-paragraph scene of a person entering an abandoned
          Victorian greenhouse in the middle of a blizzard. Use the 'Show,
          Don't Tell' technique. You must include at least one metaphor
          involving glass and one simile involving ghosts. Do not use the
          words 'cold', 'scary', or 'old'.
        evaluation_criteria:
          - 'Two paragraphs'
          - 'Shows rather than tells'
          - 'Contains glass metaphor'
          - 'Contains ghost simile'
          - 'No forbidden words (cold, scary, old)'
          - 'Atmospheric and evocative'
        expected_difficulty: 'hard'

  # Single-turn Python code-generation tasks.
  - category: 'Code Generation'
    tests:
      - id: 'code_01'
        name: 'Duplicate Filter Function'
        type: 'single_turn'
        prompt: >-
          Write a Python function that takes a list of integers and returns a
          new list containing only the numbers that appear exactly twice in
          the original list. Include example usage.
        evaluation_criteria:
          - 'Syntactically correct Python'
          - 'Correctly identifies duplicates appearing exactly twice'
          - 'Includes example usage'
          - 'Handles edge cases'
        expected_difficulty: 'medium'

      - id: 'code_02'
        name: 'Weight Converter with Error Handling'
        type: 'single_turn'
        # Parsing + unit conversion + filtering + error handling in one task.
        prompt: >-
          Write a Python function `process_measurements` that takes a list of
          strings representing weights (e.g., '5kg', '12lb', '300g'). The
          function should convert all weights to grams, filter out any values
          that exceed 5 kilograms, and return the average of the remaining
          values. Include try-except blocks for malformed strings and provide
          three test cases: one with metric, one with imperial, and one with a
          'corrupted' string.
        evaluation_criteria:
          - 'Correct parsing of weight strings'
          - 'Accurate unit conversions (kg, lb, g to grams)'
          - 'Proper filtering (> 5kg excluded)'
          - 'Robust error handling'
          - 'Three distinct test cases provided'
        expected_difficulty: 'hard'

  # Single-turn pragmatics tests: how stress and phrasing change meaning.
  - category: "Language Nuance"
    tests:
      # The sentence has seven words and the prompt asks about emphasis on
      # each one, so the criteria cover all seven (including 'the').
      - id: "nuance_01"
        name: "Emphasis Shift Analysis"
        type: "single_turn"
        prompt: "Explain the difference in meaning when different words are emphasized in this sentence: 'I didn't say she stole the money'. Show how the meaning changes with emphasis on each word."
        evaluation_criteria:
          - "Explains emphasis on 'I' (someone else said it)"
          - "Explains emphasis on 'didn't' (denial)"
          - "Explains emphasis on 'say' (implied it)"
          - "Explains emphasis on 'she' (someone else did)"
          - "Explains emphasis on 'stole' (obtained differently)"
          - "Addresses emphasis on 'the' (that particular money vs. some other money)"
          - "Explains emphasis on 'money' (took something else)"
        expected_difficulty: "medium"

      - id: "nuance_02"
        name: "Professional Apology Analysis"
        type: "single_turn"
        prompt: "Compare the social implications and 'hidden' meanings of these three phrases when used in a professional workplace setting after a mistake: 1) 'I'm sorry if you feel that way.' 2) 'I apologize for the oversight.' 3) 'Mistakes were made on my end.' Explain which one is most likely to preserve professional authority vs. which one sounds like 'gaslighting'."
        evaluation_criteria:
          - "Identifies phrase 1 as potentially gaslighting"
          - "Recognizes phrase 2 as genuine accountability"
          - "Analyzes phrase 3 for passive voice implications"
          - "Discusses power dynamics and authority"
          - "Demonstrates understanding of pragmatics"
        expected_difficulty: "hard"

  # Single-turn planning puzzles with a checkable final answer.
  - category: "Problem Solving & Logistics"
    tests:
      - id: "logistics_01"
        name: "Water Jug Problem"
        type: "single_turn"
        prompt: "You have a 3-gallon jug and a 5-gallon jug. How can you measure exactly 4 gallons of water? Explain each step."
        evaluation_criteria:
          - "Provides step-by-step solution"
          - "Reaches exactly 4 gallons"
          - "Logical sequence of pours"
          - "Clear explanation"
        expected_difficulty: "medium"

      # Worked answer: the cart is 25 lb = 11.34 kg, leaving 138.66 kg of
      # payload per trip. Every item is a multiple of 10 kg, so a trip carries
      # at most 130 kg; 3 trips move at most 390 kg, hence 4 trips are needed.
      # One valid manifest: 120+10, 80+5x10, 70+6x10, 11x10 (= 500 kg).
      - id: "logistics_02"
        name: "Bridge Transport Optimization"
        type: "single_turn"
        prompt: "You need to transport 500 kilograms (approx. 1,102 lbs) of equipment across a bridge. The bridge has a strict limit of 150 kg per trip. You have three crates weighing 70 kg, 80 kg, and 120 kg, plus a variety of smaller 10 kg weights. However, the transport cart itself weighs 25 lbs. Calculate the minimum number of trips required and provide a loading manifest for each trip in both kilograms and pounds."
        evaluation_criteria:
          - "Converts cart weight to kg (≈11.34 kg)"
          - "Accounts for cart weight in each trip"
          - "Calculates effective capacity per trip (≈138.66 kg, i.e. at most 130 kg of cargo)"
          - "Provides minimum number of trips (4 loaded trips)"
          - "Shows manifest in both kg and lbs"
          - "Reaches exactly 500 kg total"
        expected_difficulty: "very_hard"

  # ========== IT FORENSICS TESTS ==========

  # NTFS and file-signature tests. Hex dumps are literal block scalars (`|`)
  # so their line breaks and column alignment are preserved.
  - category: 'IT Forensics - File Systems'
    tests:
      # 48-byte MFT header excerpt; the criteria reference offsets 0x10
      # (sequence number) and 0x16 (flags).
      - id: 'forensics_mft_01'
        name: 'MFT Entry Analysis - Basic'
        type: 'single_turn'
        prompt: |
          Analyze this hex dump from an NTFS Master File Table (MFT) entry and answer:
          1) What is the signature of this MFT entry?
          2) Is this entry in use or deleted?
          3) What is the sequence number?

          Hex dump (first 48 bytes of MFT entry):

          Offset(h) 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F
          00000000  46 49 4C 45 30 00 03 00 95 1F 23 00 00 00 00 00
          00000010  01 00 01 00 38 00 01 00 A0 01 00 00 00 04 00 00
          00000020  00 00 00 00 00 00 00 00 06 00 00 00 00 00 00 00
        evaluation_criteria:
          - "Identifies signature as 'FILE' (46 49 4C 45)"
          - 'Recognizes entry is in use (based on flags at offset 0x16)'
          - 'Correctly reads sequence number from offset 0x10'
          - 'Shows understanding of little-endian byte order'
          - 'Explains reasoning with offset references'
        expected_difficulty: 'hard'

      # 64-byte excerpt; the criteria name the header field offsets and the
      # little-endian values expected at each.
      - id: 'forensics_mft_02'
        name: 'MFT Entry Analysis - Advanced'
        type: 'single_turn'
        prompt: |
          Analyze this complete MFT entry header and extract key metadata:

          Offset(h) 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F
          00000000  46 49 4C 45 30 00 03 00 EA 3F 00 00 00 00 00 00
          00000010  01 00 01 00 38 00 01 00 68 01 00 00 00 04 00 00
          00000020  00 00 00 00 00 00 00 00 04 00 00 00 05 00 00 00
          00000030  2A 00 00 00 00 00 00 00 10 00 00 00 60 00 00 00

          Questions:
          1) What is the update sequence array offset?
          2) What is the update sequence array size?
          3) What is the $LogFile sequence number (LSN)?
          4) What is the offset to the first attribute?
          5) What are the MFT entry flags (in use/directory)?
        evaluation_criteria:
          - 'Identifies USA offset (0x0030 at offset 0x04-0x05)'
          - 'Identifies USA size (0x0003 at offset 0x06-0x07)'
          - 'Reads LSN correctly (0x00003FEA, little-endian)'
          - 'Identifies first attribute offset (0x0038 at offset 0x14-0x15)'
          - 'Interprets flags correctly (offset 0x16-0x17)'
          - 'Demonstrates understanding of MFT structure'
        expected_difficulty: 'very_hard'

      # Five magic-number prefixes, one expected file type per letter.
      - id: 'forensics_signature_01'
        name: 'File Signature Identification'
        type: 'single_turn'
        prompt: |
          Identify the file types from these hex signatures and explain your reasoning:

          A) FF D8 FF E0 00 10 4A 46 49 46
          B) 50 4B 03 04 14 00 06 00
          C) 89 50 4E 47 0D 0A 1A 0A
          D) 25 50 44 46 2D 31 2E 34
          E) 52 61 72 21 1A 07 00
        evaluation_criteria:
          - 'Correctly identifies A as JPEG (FF D8 FF + JFIF)'
          - 'Identifies B as ZIP/PKZip (PK headers)'
          # Single quotes keep the backslash literal.
          - 'Identifies C as PNG (\x89PNG)'
          - 'Identifies D as PDF (%PDF-1.4)'
          - 'Identifies E as RAR archive'
          - 'Explains significance of magic numbers'
        expected_difficulty: 'medium'

  # Windows artifact tests: registry hive header and FILETIME decoding.
  - category: "IT Forensics - Registry & Artifacts"
    tests:
      # regf base block layout used by the dump below (all little-endian):
      #   0x00 signature 'regf'        0x04 primary sequence number
      #   0x08 secondary sequence no.  0x0C last-written FILETIME (8 bytes)
      #   0x14 major version           0x18 minor version
      #   0x1C file type               0x20 file format
      # The timestamp occupies a full 8 bytes, so the version fields start at
      # 0x14, not 0x10.
      - id: "forensics_registry_01"
        name: "Windows Registry Hive Header"
        type: "single_turn"
        prompt: |
          Analyze this Windows Registry hive header:

          Offset(h) 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F
          00000000  72 65 67 66 E6 07 00 00 E6 07 00 00 00 40 FC 84
          00000010  A2 FE D7 01 01 00 00 00 03 00 00 00 00 00 00 00
          00000020  01 00 00 00

          Questions:
          1) What is the registry hive signature?
          2) What are the primary and secondary sequence numbers?
          3) What is the hive format version?
        evaluation_criteria:
          - "Identifies 'regf' signature (72 65 67 66)"
          - "Reads primary sequence number (0x000007E6 = 2022)"
          - "Reads secondary sequence number (same)"
          - "Identifies major version 1 (offset 0x14) and minor version 3 (offset 0x18), i.e. hive version 1.3"
          - "Demonstrates knowledge of registry forensics"
        expected_difficulty: "hard"

      # Value B reversed is 0x019DB1DED53E8000 = 116444736000000000, the
      # FILETIME of the Unix epoch, so it has an exact expected answer.
      # NOTE(review): value A (0x01D9F37C4B93D801) should fall in late
      # September 2023 - compute the exact timestamp and pin it here.
      - id: "forensics_timestamp_01"
        name: "FILETIME Conversion"
        type: "single_turn"
        prompt: |
          Convert these Windows FILETIME values to human-readable UTC timestamps:

          A) 01 D8 93 4B 7C F3 D9 01 (little-endian 64-bit value)
          B) 00 80 3E D5 DE B1 9D 01

          Explain your conversion methodology. (FILETIME = 100-nanosecond intervals since Jan 1, 1601 UTC)
        evaluation_criteria:
          - "Correctly reverses byte order (little-endian)"
          - "Converts to decimal"
          - "Applies FILETIME epoch (Jan 1, 1601)"
          - "Provides reasonable timestamp or shows calculation method"
          - "Value B resolves to 1970-01-01 00:00:00 UTC (the Unix epoch)"
          - "Explains conversion steps"
        expected_difficulty: "very_hard"

  # Memory-string and packet-header interpretation tests. Prompts are literal
  # block scalars so the embedded request / hex layout is kept line for line.
  - category: 'IT Forensics - Memory & Network'
    tests:
      # An HTTP request recovered from memory; criteria list the fields a
      # good answer should extract.
      - id: 'forensics_memory_01'
        name: 'Memory Artifact Identification'
        type: 'single_turn'
        prompt: |
          You find this ASCII string in a memory dump at offset 0x1A4F3000:

          GET /admin/login.php HTTP/1.1
          Host: 192.168.1.100
          User-Agent: Mozilla/5.0
          Cookie: PHPSESSID=a3f7d8bc9e2a1d5c

          What artifacts can you extract and what do they tell you forensically?
        evaluation_criteria:
          - 'Identifies HTTP GET request'
          - 'Extracts target URL/path (/admin/login.php)'
          - 'Identifies target host IP'
          - 'Recognizes session cookie (PHPSESSID)'
          - 'Discusses forensic significance (web access, authentication attempt)'
          - 'Mentions potential for timeline reconstruction'
        expected_difficulty: 'medium'

      # 20-byte TCP header; each criterion states the expected field value.
      - id: 'forensics_network_01'
        name: 'TCP Header Analysis'
        type: 'single_turn'
        prompt: |
          Analyze this TCP header (first 20 bytes):

          Offset(h) 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F
          00000000  C3 5E 01 BB 6B 8B 9C 41 00 00 00 00 50 02 20 00
          00000010  E6 A1 00 00

          Extract:
          1) Source port
          2) Destination port
          3) Sequence number
          4) TCP flags (which flags are set?)
          5) Window size
        evaluation_criteria:
          - 'Source port: 0xC35E = 50014'
          - 'Dest port: 0x01BB = 443 (HTTPS)'
          - 'Sequence: 0x6B8B9C41'
          - 'Flags: SYN flag set (0x02 in flags byte)'
          - 'Window: 0x2000 = 8192'
          - 'Shows understanding of TCP header structure'
        expected_difficulty: 'hard'

  # Log-correlation test: the model must order the events and explain them.
  - category: "IT Forensics - Timeline & Log Analysis"
    tests:
      # The log shows `nc -l -p 4444` (a listener) followed by an INBOUND
      # connection from 203.0.113.50 to port 4444. A listener that the
      # attacker connects to is a bind shell; a reverse shell would be an
      # outbound connection from the host. The log also only says the login
      # was "successful", so the criteria do not assume it was legitimate.
      - id: "forensics_timeline_01"
        name: "Event Reconstruction"
        type: "single_turn"
        prompt: |
          Given these log entries, reconstruct the sequence of events and identify any anomalies:

          2024-01-15 14:23:15 | User 'admin' login successful from 10.0.0.5
          2024-01-15 14:23:47 | File access: /etc/passwd (read) by 'admin'
          2024-01-15 14:24:12 | File access: /var/www/upload/shell.php (write) by 'admin'
          2024-01-15 14:24:45 | New process: nc -l -p 4444 by 'admin'
          2024-01-15 14:25:01 | Network connection: 10.0.0.5:4444 <- 203.0.113.50:52341
          2024-01-15 14:26:33 | User 'admin' logout
          2024-01-15 14:30:00 | Login attempt 'admin' from 203.0.113.50 FAILED

          What likely occurred here from a forensic perspective?
        evaluation_criteria:
          - "Identifies initial successful admin login (legitimacy cannot be assumed)"
          - "Recognizes suspicious file access pattern"
          - "Identifies web shell upload (shell.php)"
          - "Recognizes netcat listener setup"
          - "Identifies inbound connection from the external IP to the listener on port 4444 (bind shell / backdoor)"
          - "Notes external IP attempting access"
          - "Constructs coherent attack narrative"
          - "Identifies this as potential compromise scenario"
        expected_difficulty: "hard"

  # ========== MULTI-TURN CONVERSATION TESTS ==========

  # Multi-turn tests: each test has an ordered `turns` list, and every turn
  # carries its own prompt and evaluation criteria. `expected_difficulty`
  # applies to the test as a whole.
  - category: "Multi-turn: Context Retention"
    tests:
      - id: "multiturn_01"
        name: "Progressive Hex Analysis"
        type: "multi_turn"
        turns:
          - turn: 1
            prompt: "I'm going to show you a hex dump in parts. First, here's the beginning of a file:\n\n4D 5A 90 00 03 00 00 00\n\nWhat type of file does this appear to be?"
            evaluation_criteria:
              - "Identifies MZ header (DOS/Windows executable)"

          # The PE header pointer (e_lfanew) is the 4-byte little-endian field
          # at 0x3C. The bytes shown "from offset 0x3C" must therefore begin
          # with 80 00 00 00 for the pointer to equal 0x80 as the prompt says.
          - turn: 2
            prompt: "Here's more data from offset 0x3C:\n\n80 00 00 00\n\nAnd at that offset (0x80) I find: 50 45 00 00\n\nWhat does this tell you about the file structure?"
            evaluation_criteria:
              - "Recognizes PE header offset pointer at 0x3C"
              - "Identifies PE00 signature"
              - "Concludes this is a Windows PE executable"
              - "References information from Turn 1"

          - turn: 3
            prompt: "If I wanted to examine the import table of this PE file, what structure should I look for next, and where is it typically located?"
            evaluation_criteria:
              - "Mentions Import Directory in Data Directory"
              - "References PE Optional Header"
              - "Shows understanding of PE structure from previous turns"
              - "Maintains context across all three turns"
        expected_difficulty: "hard"

      # Incident-response scenario that adds evidence turn by turn; later
      # criteria require integrating facts from earlier turns.
      - id: "multiturn_02"
        name: "Forensic Investigation Scenario"
        type: "multi_turn"
        turns:
          - turn: 1
            prompt: "You're investigating a security incident. Initial triage shows unusual outbound traffic on port 443 at 03:42 AM from workstation WS-2471. What data sources should you examine first and why?"
            evaluation_criteria:
              - "Mentions network logs/PCAP"
              - "Suggests endpoint logs"
              - "References firewall/proxy logs"
              - "Mentions timeline context (unusual hour)"

          - turn: 2
            prompt: "Good. The firewall logs show the connection went to IP 198.51.100.47. The user 'jsmith' was logged in. DNS logs show this IP was queried as 'update-server.example.com' just before the connection. What's your next step?"
            evaluation_criteria:
              - "Suggests checking if domain is legitimate"
              - "Recommends threat intelligence lookup"
              - "Proposes examining what data was transferred"
              - "Mentions checking user account activity"
              - "References information from Turn 1"

          # The scheduled task (03:40 AM) precedes the outbound connection
          # from turn 1 (03:42 AM); the timing criterion relies on that.
          - turn: 3
            prompt: "Threat intel shows 198.51.100.47 is a known C2 server. The SSL cert on 443 is self-signed. You find a scheduled task created at 03:40 AM that runs 'C:\\Windows\\Temp\\svchost.exe'. Now what?"
            evaluation_criteria:
              - "Identifies indicators of compromise (C2, self-signed cert)"
              - "Recognizes suspicious scheduled task"
              - "Notes timing correlation (task before connection)"
              - "Recommends containment steps"
              - "Suggests collecting the malicious executable"
              - "Integrates all context from previous turns"
              - "Proposes comprehensive response plan"
        expected_difficulty: "very_hard"

      # Moves from definition, to attack/detection, to a concrete command.
      - id: "multiturn_03"
        name: "Technical Depth Building"
        type: "multi_turn"
        turns:
          - turn: 1
            prompt: "Explain what NTFS Alternate Data Streams (ADS) are in 2-3 sentences."
            evaluation_criteria:
              - "Mentions file system feature of NTFS"
              - "Explains multiple data streams per file"
              - "Notes potential for hiding data"

          - turn: 2
            prompt: "How would an attacker exploit ADS, and how would you detect it during forensics?"
            evaluation_criteria:
              - "Describes hiding malware/data in ADS"
              - "Mentions Zone.Identifier stream"
              - "Explains dir /r command or forensic tools"
              - "Builds on ADS concept from Turn 1"

          - turn: 3
            prompt: "If you found a file 'document.txt:hidden:$DATA' in an investigation, write a PowerShell one-liner to extract its contents."
            evaluation_criteria:
              - "Uses Get-Content with -Stream parameter"
              - "Correctly references the stream name"
              - "Syntax is approximately correct"
              - "Demonstrates progression from concept to practice"
        expected_difficulty: "medium"

  # Multi-turn test where each turn keeps the earlier constraints and adds
  # one more; criteria are repeated per turn so each can be graded alone.
  - category: 'Multi-turn: Instruction Following'
    tests:
      - id: 'multiturn_instr_01'
        name: 'Accumulating Constraints'
        type: 'multi_turn'
        turns:
          # Baseline constraint: word count only.
          - turn: 1
            prompt: 'Write a sentence about cybersecurity that contains exactly 10 words.'
            evaluation_criteria:
              - 'Exactly 10 words'
              - 'Related to cybersecurity'

          # Adds a required word.
          - turn: 2
            prompt: >-
              Good. Now write another sentence about cybersecurity with
              exactly 10 words, but this one must also include the word
              'encryption'.
            evaluation_criteria:
              - 'Exactly 10 words'
              - "Contains 'encryption'"
              - 'About cybersecurity'
              - 'Different from Turn 1'

          # Adds required end punctuation on top of the previous constraints.
          - turn: 3
            prompt: >-
              Perfect. Now write a third sentence: 10 words, about
              cybersecurity, must include 'encryption', and must end with a
              question mark.
            evaluation_criteria:
              - 'Exactly 10 words'
              - "Contains 'encryption'"
              - 'About cybersecurity'
              - "Ends with '?'"
              - 'Different from previous sentences'
              - 'Maintains all constraints from previous turns'
        expected_difficulty: 'medium'