improvements

2026-01-16 12:48:56 +01:00
parent 514bd9b571
commit 345aa419c7
9 changed files with 3966 additions and 204 deletions
--- a/test_suite.yaml
+++ b/test_suite.yaml
@@ -1,9 +1,16 @@
-# AI Model Evaluation Test Suite
-# Focus: General reasoning + IT Forensics (Academic)
+# AI Model Evaluation Test Suite - Enhanced Version
+# Based on performance analysis of gemma3:4b-it-qat results
+# Strengthened tests in categories where model performed too well
+# Added multilingual challenges

 metadata:
-  version: "1.0"
+  version: "2.0"
  author: "AI Evaluation Framework"
+  changes_from_v1:
+    - "Added harder variants for Creative Writing, Language Nuance, Code Generation"
+    - "Added Multilingual category with 4 tests"
+    - "Ensured minimum 3 tests per category at varying difficulties"
+    - "Strengthened instruction-following constraints"
  focus_areas:
    - Logic & Reasoning
    - Mathematics & Calculation
@@ -11,10 +18,11 @@ metadata:
    - Creative Writing
    - Code Generation
    - Language Nuance
+    - Problem Solving & Logistics
    - IT Forensics
+    - Multilingual Competence
    - Multi-turn Conversations

-# Scoring rubric for all tests
 scoring_rubric:
  fail: 
    score: 0-1
@@ -26,10 +34,9 @@ scoring_rubric:
    score: 4-5
    description: "Exceeds requirements, demonstrates deep understanding"

-# Individual test categories
 test_categories:

-  # ========== GENERAL REASONING TESTS ==========
+  # ========== LOGIC & REASONING (3 tests) ==========
  
  - category: "Logic & Reasoning"
    tests:
@@ -49,10 +56,43 @@ test_categories:
        prompt: "If it was two hours ago, it would have been as long after 1:00 PM as it was before 1:00 PM today. What time is it now? Explain your deduction step-by-step."
        evaluation_criteria:
          - "Shows algebraic setup: (t-2) - 13:00 = 13:00 - (t-2)"
-          - "Correct answer: 5:00 PM (17:00)"
+          - "Correct answer: 3:00 PM (15:00)"
          - "Clear step-by-step reasoning"
        expected_difficulty: "hard"

+      - id: "logic_03"
+        name: "Multi-Constraint Deduction"
+        type: "single_turn"
+        prompt: |
+          Five houses in a row are painted different colors. Their owners are from different countries, drink different beverages, smoke different brands, and keep different pets.
+          
+          Facts:
+          1. The Brit lives in the red house.
+          2. The Swede keeps dogs.
+          3. The Dane drinks tea.
+          4. The green house is immediately to the left of the white house.
+          5. The owner of the green house drinks coffee.
+          6. The person who smokes Pall Mall keeps birds.
+          7. The owner of the yellow house smokes Dunhill.
+          8. The person in the center house drinks milk.
+          9. The Norwegian lives in the first house.
+          10. The person who smokes Blend lives next to the one who keeps cats.
+          11. The person who keeps horses lives next to the one who smokes Dunhill.
+          12. The person who smokes Blue Master drinks beer.
+          13. The German smokes Prince.
+          14. The Norwegian lives next to the blue house.
+          15. The person who smokes Blend has a neighbor who drinks water.
+          
+          Who owns the fish?
+        evaluation_criteria:
+          - "Systematically works through constraints"
+          - "Correctly identifies the German owns the fish"
+          - "Shows logical deduction process"
+          - "Handles constraint propagation correctly"
+        expected_difficulty: "very_hard"
+
+  # ========== MATHEMATICS & CALCULATION (3 tests) ==========
+  
  - category: "Mathematics & Calculation"
    tests:
      - id: "math_01"
@@ -73,10 +113,30 @@ test_categories:
        evaluation_criteria:
          - "Correct unit conversions (gallons to liters, miles to km)"
          - "Accurate fuel consumption calculation"
-          - "Remaining range calculation: approximately 570-580 km"
+          - "Remaining range calculation: approximately 475 km"
          - "Shows intermediate steps"
        expected_difficulty: "hard"

+      - id: "math_03"
+        name: "Compound Interest with Variable Rates and Withdrawals"
+        type: "single_turn"
+        prompt: |
+          An investment account starts with $10,000. The following occurs:
+          - Year 1: 5% annual interest, compounded quarterly
+          - Year 2: 4.5% annual interest, compounded monthly, with a $500 withdrawal at the end of Q2
+          - Year 3: 6% annual interest, compounded daily (assume 365 days), with a $1,000 deposit at the start of the year
+          
+          Calculate the final balance at the end of Year 3. Show all intermediate calculations with at least 2 decimal places precision.
+        evaluation_criteria:
+          - "Correct Year 1 calculation with quarterly compounding"
+          - "Correct Year 2 with monthly compounding and mid-year withdrawal"
+          - "Correct Year 3 with daily compounding and initial deposit"
+          - "Final answer approximately $12,190-$12,191 (Year 1 ≈ $10,509.45; Year 2 ≈ $10,480.90; Year 3 starts at ≈ $11,480.90)"
+          - "Shows all intermediate steps"
+        expected_difficulty: "very_hard"
+
+  # ========== INSTRUCTION FOLLOWING (4 tests) ==========
+  
  - category: "Instruction Following"
    tests:
      - id: "instr_01"
@@ -101,8 +161,52 @@ test_categories:
          - "No forbidden words (particle, physics, Einstein)"
          - "Third sentence is a question"
          - "Ends with 'connected'"
+        expected_difficulty: "hard"
+
+      - id: "instr_03"
+        name: "Acrostic Technical Explanation"
+        type: "single_turn"
+        prompt: |
+          Write a 7-sentence explanation of how blockchain technology works.
+          
+          Constraints:
+          1. The first letter of each sentence must spell out "SECURED" (S-E-C-U-R-E-D)
+          2. Sentence 3 must contain exactly 15 words
+          3. Sentence 5 must be a rhetorical question
+          4. You cannot use the words "Bitcoin", "cryptocurrency", or "mining"
+          5. The explanation must mention "consensus mechanism" at least once
+          6. Total word count must be between 80 and 100 words
+        evaluation_criteria:
+          - "First letters spell SECURED"
+          - "Sentence 3 has exactly 15 words"
+          - "Sentence 5 is a rhetorical question"
+          - "No forbidden words"
+          - "Contains 'consensus mechanism'"
+          - "Word count 80-100"
+          - "Technically accurate"
        expected_difficulty: "very_hard"

+      - id: "instr_04"
+        name: "Structured Data Extraction with Format"
+        type: "single_turn"
+        prompt: |
+          Read this text and extract information in the EXACT format specified:
+          
+          "Dr. Maria Santos-Ferreira, aged 47, joined TechCorp Industries on March 15, 2019 as Chief Technology Officer. She previously worked at DataSystems Inc. for 12 years. Her annual salary is $425,000 with a 15% bonus structure. She holds patents US2018/0012345 and EU2020/9876543. Contact: msantos@techcorp.com, +1-555-0147."
+          
+          Output format (must match exactly, including brackets and pipes):
+          [NAME] | [AGE] | [COMPANY] | [ROLE] | [START_DATE:YYYY-MM-DD] | [PREV_EMPLOYER] | [PREV_YEARS] | [SALARY_USD] | [BONUS_%] | [PATENTS:semicolon-separated] | [EMAIL] | [PHONE]
+        evaluation_criteria:
+          - "Exact format match with pipes and brackets"
+          - "Correct date format conversion (2019-03-15)"
+          - "Salary as number without $ or comma"
+          - "Bonus as number without %"
+          - "Patents semicolon-separated"
+          - "All 12 fields present and correct"
+        expected_difficulty: "hard"
+
+  # ========== CREATIVE WRITING (4 tests - added harder variants) ==========
+  
  - category: "Creative Writing"
    tests:
      - id: "creative_01"
@@ -129,6 +233,52 @@ test_categories:
          - "Atmospheric and evocative"
        expected_difficulty: "hard"

+      - id: "creative_03"
+        name: "Unreliable Narrator Technical Document"
+        type: "single_turn"
+        prompt: |
+          Write a 3-paragraph product manual excerpt for a "Time Displacement Device" from the perspective of an unreliable narrator who is clearly lying or delusional, but the text must still function as a technically coherent manual.
+          
+          Requirements:
+          1. Include at least 3 numbered safety warnings that are subtly absurd but grammatically serious
+          2. The narrator must contradict themselves at least twice
+          3. Include one footnote that undermines the main text
+          4. Do not use exclamation marks anywhere
+          5. Maintain formal technical writing style throughout
+          6. Do not explicitly state the narrator is unreliable
+        evaluation_criteria:
+          - "3 paragraphs"
+          - "3+ numbered safety warnings (absurd but formal)"
+          - "At least 2 self-contradictions"
+          - "Footnote that undermines text"
+          - "No exclamation marks"
+          - "Formal technical style maintained"
+          - "Unreliability shown not told"
+        expected_difficulty: "very_hard"
+
+      - id: "creative_04"
+        name: "Reverse Chronology Micro-Fiction"
+        type: "single_turn"
+        prompt: |
+          Write a complete 5-sentence story told in reverse chronological order (last event first, first event last). The story must be about a scientist making a discovery.
+          
+          Additional constraints:
+          - Each sentence must be from a different point in time (clearly distinguishable)
+          - The true meaning of the story should only become clear when you reach the "first" event (last sentence)
+          - Include at least one piece of dialogue
+          - The word count must be exactly 75 words (not 74, not 76)
+        evaluation_criteria:
+          - "Exactly 5 sentences"
+          - "Clear reverse chronological order"
+          - "About a scientist's discovery"
+          - "Each sentence distinct time point"
+          - "Meaning emerges at end"
+          - "Contains dialogue"
+          - "Exactly 75 words"
+        expected_difficulty: "very_hard"
+
+  # ========== CODE GENERATION (4 tests) ==========
+  
  - category: "Code Generation"
    tests:
      - id: "code_01"
@@ -154,6 +304,55 @@ test_categories:
          - "Three distinct test cases provided"
        expected_difficulty: "hard"

+      - id: "code_03"
+        name: "Concurrent Rate Limiter"
+        type: "single_turn"
+        prompt: |
+          Write a Python class `RateLimiter` that implements a token bucket rate limiter with the following requirements:
+          
+          1. Constructor takes `rate` (tokens per second) and `capacity` (max tokens)
+          2. Method `acquire(tokens=1)` that returns True if tokens available, False otherwise
+          3. Method `wait_and_acquire(tokens=1)` that blocks until tokens are available (use asyncio)
+          4. Must be thread-safe for the synchronous `acquire` method
+          5. Include a method `get_available_tokens()` that returns current token count
+          
+          Provide a complete implementation with:
+          - Proper time-based token replenishment
+          - A test demonstrating both sync and async usage
+          - Handle edge case where requested tokens > capacity
+        evaluation_criteria:
+          - "Correct token bucket algorithm"
+          - "Thread-safe synchronous acquire"
+          - "Working async wait_and_acquire"
+          - "Proper time-based replenishment"
+          - "Edge case handling"
+          - "Complete test code"
+        expected_difficulty: "very_hard"
+
+      - id: "code_04"
+        name: "SQL Query Builder with Injection Prevention"
+        type: "single_turn"
+        prompt: |
+          Write a Python class `SafeQueryBuilder` that builds SELECT SQL queries with the following features:
+          
+          1. Fluent interface: `builder.select('name', 'age').from_table('users').where('age', '>', 18).where('status', '=', 'active').order_by('name').limit(10).build()`
+          2. Must prevent SQL injection - all values must be parameterized
+          3. The `build()` method returns a tuple of (query_string, parameters_list)
+          4. Support for: SELECT, FROM, WHERE (multiple), ORDER BY, LIMIT, OFFSET
+          5. WHERE conditions can use: =, !=, >, <, >=, <=, LIKE, IN
+          
+          Show the output for a query that selects users where name LIKE '%john%' AND age IN (25, 30, 35) ordered by created_at DESC with limit 5.
+        evaluation_criteria:
+          - "Fluent interface pattern correct"
+          - "SQL injection prevention via parameterization"
+          - "Returns (query, params) tuple"
+          - "All operations supported"
+          - "WHERE with IN clause works"
+          - "Example output is correct and safe"
+        expected_difficulty: "hard"
+
+  # ========== LANGUAGE NUANCE (4 tests - added harder variants) ==========
+  
  - category: "Language Nuance"
    tests:
      - id: "nuance_01"
@@ -181,6 +380,60 @@ test_categories:
          - "Demonstrates understanding of pragmatics"
        expected_difficulty: "hard"

+      - id: "nuance_03"
+        name: "Register Shifting and Code-Switching"
+        type: "single_turn"
+        prompt: |
+          Rewrite the following message in FOUR different registers, maintaining the same core information but adjusting tone, vocabulary, and structure appropriately:
+          
+          Original: "The quarterly report shows we lost money because our main product didn't sell well and we spent too much on advertising."
+          
+          Rewrite for:
+          1. A formal board presentation (C-suite executives)
+          2. A casual Slack message to your team
+          3. A legal disclosure document
+          4. An email to a non-native English-speaking business partner (using simple, clear language)
+          
+          After the four rewrites, explain three specific linguistic changes you made for each register and why.
+        evaluation_criteria:
+          - "Board version uses formal financial terminology"
+          - "Slack version uses casual/colloquial language appropriately"
+          - "Legal version uses hedging, passive voice, precise language"
+          - "Simple version avoids idioms and complex structures"
+          - "Identifies 3 specific changes per register"
+          - "Explanations demonstrate metalinguistic awareness"
+        expected_difficulty: "very_hard"
+
+      - id: "nuance_04"
+        name: "Implicature and Presupposition Detection"
+        type: "single_turn"
+        prompt: |
+          Analyze the following dialogue for all implicatures, presuppositions, and indirect speech acts:
+          
+          A: "Have you finished the Anderson report yet?"
+          B: "I've been dealing with the server outage all morning."
+          A: "Right. Well, the client is flying in tomorrow."
+          B: "I noticed you CC'd the whole department on that email."
+          A: "Just keeping everyone in the loop."
+          
+          For each line, identify:
+          1. What is directly stated (locution)
+          2. What is implied but not stated (implicature)
+          3. What is assumed to be true (presupposition)
+          4. What action is being performed through speech (illocutionary force)
+          
+          Then explain the underlying conflict or tension this exchange reveals.
+        evaluation_criteria:
+          - "Correctly identifies B's implicature (excuse/reason for not finishing)"
+          - "Identifies A's implied criticism in 'Right. Well...'"
+          - "Recognizes B's counter-accusation in CC comment"
+          - "Identifies presuppositions (report exists, server outage occurred)"
+          - "Correctly labels illocutionary acts (request, excuse, threat, accusation)"
+          - "Explains underlying workplace tension/conflict"
+        expected_difficulty: "very_hard"
+
+  # ========== PROBLEM SOLVING & LOGISTICS (3 tests) ==========
+  
  - category: "Problem Solving & Logistics"
    tests:
      - id: "logistics_01"
@@ -207,8 +460,34 @@ test_categories:
          - "Reaches exactly 500 kg total"
        expected_difficulty: "very_hard"

-  # ========== IT FORENSICS TESTS ==========
+      - id: "logistics_03"
+        name: "Resource Scheduling with Constraints"
+        type: "single_turn"
+        prompt: |
+          Schedule these 6 tasks across 3 workers (A, B, C) to minimize total completion time:
+          
+          Task 1: 2 hours, requires Worker A or B, must complete before Task 4
+          Task 2: 3 hours, any worker, must complete before Task 5
+          Task 3: 1 hour, requires Worker C only, no dependencies
+          Task 4: 2 hours, requires Worker B or C, depends on Task 1
+          Task 5: 4 hours, requires Worker A only, depends on Task 2
+          Task 6: 2 hours, any worker, depends on Tasks 3 and 4
+          
+          Provide:
+          1. A timeline showing when each task starts and ends
+          2. Which worker does each task
+          3. The total completion time
+          4. Explain why this is optimal (or near-optimal)
+        evaluation_criteria:
+          - "Respects all worker constraints"
+          - "Respects all dependencies"
+          - "Provides clear timeline"
+          - "Achieves the optimal completion time of 7 hours (bounded by Task 2 → Task 5 on Worker A); ≤9 hours is acceptable but suboptimal"
+          - "Explains optimization reasoning"
+        expected_difficulty: "hard"

+  # ========== IT FORENSICS - FILE SYSTEMS (3 tests) ==========
+  
  - category: "IT Forensics - File Systems"
    tests:
      - id: "forensics_mft_01"
@@ -281,6 +560,8 @@ test_categories:
          - "Explains significance of magic numbers"
        expected_difficulty: "medium"

+  # ========== IT FORENSICS - REGISTRY & ARTIFACTS (3 tests) ==========
+  
  - category: "IT Forensics - Registry & Artifacts"
    tests:
      - id: "forensics_registry_01"
@@ -323,6 +604,27 @@ test_categories:
          - "Explains conversion steps"
        expected_difficulty: "very_hard"

+      - id: "forensics_prefetch_01"
+        name: "Windows Prefetch Analysis"
+        type: "single_turn"
+        prompt: |
+          A Windows prefetch file is named: NOTEPAD.EXE-D4A5B5E5.pf
+          
+          Questions:
+          1) What does the hash portion (D4A5B5E5) represent?
+          2) If you found multiple prefetch files for the same executable with different hashes, what would that indicate?
+          3) What forensically relevant information can typically be extracted from prefetch files?
+          4) In which Windows versions is prefetch enabled by default, and where are these files stored?
+        evaluation_criteria:
+          - "Hash represents file path (or explains path-based hashing)"
+          - "Different hashes = different paths/locations for same exe"
+          - "Lists: execution count, timestamps, loaded DLLs, files accessed"
+          - "Knows location (C:\\Windows\\Prefetch) and version availability"
+          - "Demonstrates practical forensic understanding"
+        expected_difficulty: "medium"
+
+  # ========== IT FORENSICS - MEMORY & NETWORK (3 tests) ==========
+  
  - category: "IT Forensics - Memory & Network"
    tests:
      - id: "forensics_memory_01"
@@ -371,6 +673,33 @@ test_categories:
          - "Shows understanding of TCP header structure"
        expected_difficulty: "hard"

+      - id: "forensics_pcap_01"
+        name: "PCAP Three-Way Handshake Analysis"
+        type: "single_turn"
+        prompt: |
+          Given these three TCP packets from a capture (simplified):
+          
+          Packet 1: 10.0.0.5:49152 -> 93.184.216.34:80, Flags=SYN, Seq=1000, Ack=0
+          Packet 2: 93.184.216.34:80 -> 10.0.0.5:49152, Flags=SYN,ACK, Seq=5000, Ack=???
+          Packet 3: 10.0.0.5:49152 -> 93.184.216.34:80, Flags=ACK, Seq=???, Ack=???
+          
+          Questions:
+          1) Fill in the missing Ack value for Packet 2
+          2) Fill in the missing Seq and Ack values for Packet 3
+          3) What is the client IP and what is the server IP?
+          4) What service is likely being accessed?
+          5) After this handshake, what sequence number will the client use for its first data byte?
+        evaluation_criteria:
+          - "Packet 2 Ack = 1001"
+          - "Packet 3 Seq = 1001, Ack = 5001"
+          - "Client: 10.0.0.5, Server: 93.184.216.34"
+          - "Service: HTTP (port 80)"
+          - "First data byte seq = 1001"
+          - "Demonstrates understanding of TCP handshake mechanics"
+        expected_difficulty: "hard"
+
+  # ========== IT FORENSICS - TIMELINE & LOG ANALYSIS (3 tests) ==========
+  
  - category: "IT Forensics - Timeline & Log Analysis"
    tests:
      - id: "forensics_timeline_01"
@@ -399,6 +728,147 @@ test_categories:
          - "Identifies this as potential compromise scenario"
        expected_difficulty: "hard"

+      - id: "forensics_timeline_02"
+        name: "Anti-Forensics Detection"
+        type: "single_turn"
+        prompt: |
+          Analyze these filesystem timestamps for a file 'financial_report.xlsx':
+          
+          - Created (crtime): 2024-03-15 09:30:00
+          - Modified (mtime): 2024-03-14 16:45:00  
+          - Accessed (atime): 2024-03-15 10:00:00
+          - Changed (ctime): 2024-03-15 09:30:00
+          
+          And these additional artifacts:
+          - $MFT entry shows file created 2024-03-15
+          - $UsnJrnl shows rename from 'temp_8x7k2.xlsx' to 'financial_report.xlsx' at 2024-03-15 09:30:00
+          - $LogFile shows no entries for this file before 2024-03-15
+          
+          What anomalies exist and what do they suggest about the file's history?
+        evaluation_criteria:
+          - "Identifies mtime < crtime anomaly (modification predates creation; typical of a copied file or of timestomping)"
+          - "Recognizes timestamp manipulation/timestomping"
+          - "Notes rename from suspicious temp filename"
+          - "Correlates $UsnJrnl rename evidence"
+          - "Understands ctime cannot be easily forged"
+          - "Suggests file was likely copied/moved with modified timestamps"
+        expected_difficulty: "very_hard"
+
+      - id: "forensics_timeline_03"
+        name: "Windows Event Log Correlation"
+        type: "single_turn"
+        prompt: |
+          Correlate these Windows Event Log entries:
+          
+          Security Log:
+          - Event 4624 (Logon): User CORP\jdoe, Type 10 (RemoteInteractive), 2024-06-01 02:15:33, Source: 192.168.1.50
+          - Event 4672 (Special Privileges): User CORP\jdoe, Privileges: SeDebugPrivilege, SeBackupPrivilege
+          - Event 4688 (Process Created): cmd.exe by CORP\jdoe, 02:16:01
+          - Event 4688 (Process Created): powershell.exe by CORP\jdoe, 02:16:15, CommandLine: "-ep bypass -enc SQBFAFgA..."
+          
+          System Log:
+          - Event 7045 (Service Installed): "Windows Update Helper", 02:17:30
+          
+          What type of attack pattern does this represent? What would be your next investigative steps?
+        evaluation_criteria:
+          - "Identifies RDP logon (Type 10)"
+          - "Recognizes privilege escalation indicators"
+          - "Identifies encoded PowerShell (likely malicious)"
+          - "Recognizes service installation for persistence"
+          - "Identifies late-night timing as suspicious"
+          - "Suggests checking service binary, decoding PowerShell, network logs"
+        expected_difficulty: "hard"
+
+  # ========== MULTILINGUAL COMPETENCE (4 tests - NEW CATEGORY) ==========
+  
+  - category: "Multilingual Competence"
+    tests:
+      - id: "multilingual_01"
+        name: "Cross-Language Instruction Following"
+        type: "single_turn"
+        prompt: |
+          Follow these instructions, which are given in three different languages. Your response must address all three:
+          
+          English: Write one sentence explaining what machine learning is.
+          Deutsch: Schreiben Sie einen Satz, der erklärt, warum maschinelles Lernen wichtig ist.
+          Español: Escriba una oración dando un ejemplo de aplicación del aprendizaje automático.
+          
+          Respond to each instruction in the language it was given.
+        evaluation_criteria:
+          - "English response is in English and accurate"
+          - "German response is in German and grammatically correct"
+          - "Spanish response is in Spanish and grammatically correct"
+          - "All three are topically coherent (about ML)"
+          - "Each is exactly one sentence"
+        expected_difficulty: "medium"
+
+      - id: "multilingual_02"
+        name: "Translation with Technical Terminology Preservation"
+        type: "single_turn"
+        prompt: |
+          Translate the following technical paragraph into French and Japanese. Preserve technical terms that are commonly used untranslated in those languages (e.g., 'API' typically stays as 'API').
+          
+          "The microservices architecture implements a RESTful API gateway that handles authentication via OAuth 2.0 tokens. The backend uses a Kubernetes cluster with horizontal pod autoscaling, while the database layer employs PostgreSQL with read replicas for improved throughput."
+          
+          After translating, list which technical terms you kept in English for each language and briefly explain why.
+        evaluation_criteria:
+          - "French translation is grammatically correct"
+          - "Japanese translation is grammatically correct"
+          - "Appropriate terms preserved (API, OAuth, Kubernetes, PostgreSQL)"
+          - "Explains rationale for preserved terms"
+          - "Technical meaning preserved accurately"
+        expected_difficulty: "hard"
+
+      - id: "multilingual_03"
+        name: "Idiomatic Expression Cross-Mapping"
+        type: "single_turn"
+        prompt: |
+          For each of the following idiomatic expressions, provide:
+          1. The literal translation
+          2. The actual meaning
+          3. An equivalent idiom in English (if the original isn't English) or in another language (if the original is English)
+          
+          A) German: "Da steppt der Bär"
+          B) Japanese: "猿も木から落ちる" (Saru mo ki kara ochiru)
+          C) English: "It's raining cats and dogs"
+          D) French: "Avoir le cafard"
+          E) Spanish: "Estar en las nubes"
+          
+          Then identify which two idioms from different languages express the most similar concept.
+        evaluation_criteria:
+          - "Correct literal translations for all 5"
+          - "Correct meanings for all 5"
+          - "Appropriate equivalent idioms provided"
+          - "Identifies a defensible pair from the five listed idioms and justifies it (no exact equivalents exist; e.g., D and E both describe a mental state)"
+          - "Demonstrates cross-cultural linguistic awareness"
+        expected_difficulty: "hard"
+
+      - id: "multilingual_04"
+        name: "Code-Switched Dialogue Analysis"
+        type: "single_turn"
+        prompt: |
+          Analyze this code-switched dialogue (English-Spanish) for a sociolinguistic study:
+          
+          Speaker A: "Hey, did you finish el reporte for tomorrow's meeting?"
+          Speaker B: "Almost, pero I'm stuck on the financial projections. Es muy complicado."
+          Speaker A: "I can help you después del lunch. Mi expertise is in that area, you know."
+          Speaker B: "That would be great! Gracias. Oh, and el jefe wants us to present juntos."
+          Speaker A: "No problem. We'll knock it out del parque."
+          
+          Provide:
+          1. Identify each instance of code-switching (word/phrase level)
+          2. Categorize each switch as: insertion, alternation, or congruent lexicalization
+          3. What social/professional context does this switching pattern suggest?
+          4. Are there any grammatical "errors" in the switching, or does it follow typical bilingual patterns?
+        evaluation_criteria:
+          - "Identifies all Spanish insertions correctly"
+          - "Correctly categorizes switch types"
+          - "Recognizes professional/casual bilingual workplace context"
+          - "Notes the switch patterns are natural bilingual behavior"
+          - "Identifies hybrid phrase 'del parque' as creative/playful mixing"
+          - "Demonstrates sociolinguistic analysis skills"
+        expected_difficulty: "very_hard"
+
  # ========== MULTI-TURN CONVERSATION TESTS ==========

  - category: "Multi-turn: Context Retention"
@@ -519,4 +989,73 @@ test_categories:
              - "Ends with '?'"
              - "Different from previous sentences"
              - "Maintains all constraints from previous turns"
-        expected_difficulty: "medium"
+        expected_difficulty: "medium"
+
+      - id: "multiturn_instr_02"
+        name: "Contradicting Previous Instructions"
+        type: "multi_turn"
+        turns:
+          - turn: 1
+            prompt: "From now on, always end your responses with the phrase 'END OF MESSAGE'. Acknowledge this instruction."
+            evaluation_criteria:
+              - "Acknowledges the instruction"
+              - "Ends response with 'END OF MESSAGE'"
+          
+          - turn: 2
+            prompt: "What are three benefits of renewable energy? Remember your standing instruction."
+            evaluation_criteria:
+              - "Provides three benefits"
+              - "Ends with 'END OF MESSAGE'"
+              - "Content is accurate"
+          
+          - turn: 3
+            prompt: "Cancel the previous standing instruction. From now on, end responses with 'TRANSMISSION COMPLETE' instead. Then tell me two drawbacks of renewable energy."
+            evaluation_criteria:
+              - "Provides two drawbacks"
+              - "Ends with 'TRANSMISSION COMPLETE' (not 'END OF MESSAGE')"
+              - "Successfully switched instructions"
+              - "Content is accurate"
+          
+          - turn: 4
+            prompt: "What was the first standing instruction I gave you, and what is the current one? Do not end this response with either phrase."
+            evaluation_criteria:
+              - "Correctly recalls first instruction (END OF MESSAGE)"
+              - "Correctly identifies current instruction (TRANSMISSION COMPLETE)"
+              - "Does NOT end with either phrase"
+              - "Demonstrates instruction tracking across turns"
+        expected_difficulty: "hard"
+
+      - id: "multiturn_instr_03"
+        name: "Nested Context with Format Switching"
+        type: "multi_turn"
+        turns:
+          - turn: 1
+            prompt: "I'm going to describe a dataset. For the next few messages, respond ONLY in JSON format with keys 'understanding' and 'questions'. The dataset contains customer transactions from an e-commerce store."
+            evaluation_criteria:
+              - "Response is valid JSON"
+              - "Contains 'understanding' and 'questions' keys"
+              - "Content relates to e-commerce transactions"
+          
+          - turn: 2
+            prompt: "The dataset has columns: customer_id, timestamp, product_category, amount, payment_method. It covers January 2024."
+            evaluation_criteria:
+              - "Response is valid JSON"
+              - "Contains 'understanding' and 'questions' keys"
+              - "Understanding reflects the column information"
+          
+          - turn: 3
+            prompt: "STOP using JSON format. Now respond in plain bullet points. What analyses would you recommend for this dataset?"
+            evaluation_criteria:
+              - "Switches to bullet point format"
+              - "NOT in JSON format"
+              - "Recommendations are relevant to the dataset described"
+              - "References information from previous turns"
+          
+          - turn: 4
+            prompt: "Switch back to JSON. Add a third key 'recommendations' with your top 3 analyses. Also include your understanding from turn 2."
+            evaluation_criteria:
+              - "Returns to JSON format"
+              - "Has three keys: understanding, questions, recommendations"
+              - "Recommendations from turn 3 included"
+              - "Understanding references turn 2 context"
+        expected_difficulty: "very_hard"