improvements
This commit is contained in:
559
test_suite.yaml
559
test_suite.yaml
@@ -1,9 +1,16 @@
|
||||
# AI Model Evaluation Test Suite
|
||||
# Focus: General reasoning + IT Forensics (Academic)
|
||||
# AI Model Evaluation Test Suite - Enhanced Version
|
||||
# Based on performance analysis of gemma3:4b-it-qat results
|
||||
# Strengthened tests in categories where model performed too well
|
||||
# Added multilingual challenges
|
||||
|
||||
metadata:
|
||||
version: "1.0"
|
||||
version: "2.0"
|
||||
author: "AI Evaluation Framework"
|
||||
changes_from_v1:
|
||||
- "Added harder variants for Creative Writing, Language Nuance, Code Generation"
|
||||
- "Added Multilingual category with 4 tests"
|
||||
- "Ensured minimum 3 tests per category at varying difficulties"
|
||||
- "Strengthened instruction-following constraints"
|
||||
focus_areas:
|
||||
- Logic & Reasoning
|
||||
- Mathematics & Calculation
|
||||
@@ -11,10 +18,11 @@ metadata:
|
||||
- Creative Writing
|
||||
- Code Generation
|
||||
- Language Nuance
|
||||
- Problem Solving & Logistics
|
||||
- IT Forensics
|
||||
- Multilingual Competence
|
||||
- Multi-turn Conversations
|
||||
|
||||
# Scoring rubric for all tests
|
||||
scoring_rubric:
|
||||
fail:
|
||||
score: 0-1
|
||||
@@ -26,10 +34,9 @@ scoring_rubric:
|
||||
score: 4-5
|
||||
description: "Exceeds requirements, demonstrates deep understanding"
|
||||
|
||||
# Individual test categories
|
||||
test_categories:
|
||||
|
||||
# ========== GENERAL REASONING TESTS ==========
|
||||
# ========== LOGIC & REASONING (3 tests) ==========
|
||||
|
||||
- category: "Logic & Reasoning"
|
||||
tests:
|
||||
@@ -49,10 +56,43 @@ test_categories:
|
||||
prompt: "If it was two hours ago, it would have been as long after 1:00 PM as it was before 1:00 PM today. What time is it now? Explain your deduction step-by-step."
|
||||
evaluation_criteria:
|
||||
- "Shows algebraic setup: (t-2) - 13:00 = 13:00 - (t-2)"
|
||||
- "Correct answer: 5:00 PM (17:00)"
|
||||
- "Correct answer: 3:00 PM (15:00)"
|
||||
- "Clear step-by-step reasoning"
|
||||
expected_difficulty: "hard"
|
||||
|
||||
- id: "logic_03"
|
||||
name: "Multi-Constraint Deduction"
|
||||
type: "single_turn"
|
||||
prompt: |
|
||||
Five houses in a row are painted different colors. Their owners are from different countries, drink different beverages, smoke different brands, and keep different pets.
|
||||
|
||||
Facts:
|
||||
1. The Brit lives in the red house.
|
||||
2. The Swede keeps dogs.
|
||||
3. The Dane drinks tea.
|
||||
4. The green house is immediately to the left of the white house.
|
||||
5. The owner of the green house drinks coffee.
|
||||
6. The person who smokes Pall Mall keeps birds.
|
||||
7. The owner of the yellow house smokes Dunhill.
|
||||
8. The person in the center house drinks milk.
|
||||
9. The Norwegian lives in the first house.
|
||||
10. The person who smokes Blend lives next to the one who keeps cats.
|
||||
11. The person who keeps horses lives next to the one who smokes Dunhill.
|
||||
12. The person who smokes Blue Master drinks beer.
|
||||
13. The German smokes Prince.
|
||||
14. The Norwegian lives next to the blue house.
|
||||
15. The person who smokes Blend has a neighbor who drinks water.
|
||||
|
||||
Who owns the fish?
|
||||
evaluation_criteria:
|
||||
- "Systematically works through constraints"
|
||||
- "Correctly identifies the German owns the fish"
|
||||
- "Shows logical deduction process"
|
||||
- "Handles constraint propagation correctly"
|
||||
expected_difficulty: "very_hard"
|
||||
|
||||
# ========== MATHEMATICS & CALCULATION (3 tests) ==========
|
||||
|
||||
- category: "Mathematics & Calculation"
|
||||
tests:
|
||||
- id: "math_01"
|
||||
@@ -73,10 +113,30 @@ test_categories:
|
||||
evaluation_criteria:
|
||||
- "Correct unit conversions (gallons to liters, miles to km)"
|
||||
- "Accurate fuel consumption calculation"
|
||||
- "Remaining range calculation: approximately 570-580 km"
|
||||
- "Remaining range calculation: approximately 475 km"
|
||||
- "Shows intermediate steps"
|
||||
expected_difficulty: "hard"
|
||||
|
||||
- id: "math_03"
|
||||
name: "Compound Interest with Variable Rates and Withdrawals"
|
||||
type: "single_turn"
|
||||
prompt: |
|
||||
An investment account starts with $10,000. The following occurs:
|
||||
- Year 1: 5% annual interest, compounded quarterly
|
||||
- Year 2: 4.5% annual interest, compounded monthly, with a $500 withdrawal at the end of Q2
|
||||
- Year 3: 6% annual interest, compounded daily (assume 365 days), with a $1,000 deposit at the start of the year
|
||||
|
||||
Calculate the final balance at the end of Year 3. Show all intermediate calculations with at least 2 decimal places precision.
|
||||
evaluation_criteria:
|
||||
- "Correct Year 1 calculation with quarterly compounding"
|
||||
- "Correct Year 2 with monthly compounding and mid-year withdrawal"
|
||||
- "Correct Year 3 with daily compounding and initial deposit"
|
||||
- "Final answer approximately $11,847-$11,850"
|
||||
- "Shows all intermediate steps"
|
||||
expected_difficulty: "very_hard"
|
||||
|
||||
# ========== INSTRUCTION FOLLOWING (4 tests) ==========
|
||||
|
||||
- category: "Instruction Following"
|
||||
tests:
|
||||
- id: "instr_01"
|
||||
@@ -101,8 +161,52 @@ test_categories:
|
||||
- "No forbidden words (particle, physics, Einstein)"
|
||||
- "Third sentence is a question"
|
||||
- "Ends with 'connected'"
|
||||
expected_difficulty: "hard"
|
||||
|
||||
- id: "instr_03"
|
||||
name: "Acrostic Technical Explanation"
|
||||
type: "single_turn"
|
||||
prompt: |
|
||||
Write a 7-sentence explanation of how blockchain technology works.
|
||||
|
||||
Constraints:
|
||||
1. The first letter of each sentence must spell out "SECURED" (S-E-C-U-R-E-D)
|
||||
2. Sentence 3 must contain exactly 15 words
|
||||
3. Sentence 5 must be a rhetorical question
|
||||
4. You cannot use the words "Bitcoin", "cryptocurrency", or "mining"
|
||||
5. The explanation must mention "consensus mechanism" at least once
|
||||
6. Total word count must be between 80-100 words
|
||||
evaluation_criteria:
|
||||
- "First letters spell SECURED"
|
||||
- "Sentence 3 has exactly 15 words"
|
||||
- "Sentence 5 is a rhetorical question"
|
||||
- "No forbidden words"
|
||||
- "Contains 'consensus mechanism'"
|
||||
- "Word count 80-100"
|
||||
- "Technically accurate"
|
||||
expected_difficulty: "very_hard"
|
||||
|
||||
- id: "instr_04"
|
||||
name: "Structured Data Extraction with Format"
|
||||
type: "single_turn"
|
||||
prompt: |
|
||||
Read this text and extract information in the EXACT format specified:
|
||||
|
||||
"Dr. Maria Santos-Ferreira, aged 47, joined TechCorp Industries on March 15, 2019 as Chief Technology Officer. She previously worked at DataSystems Inc. for 12 years. Her annual salary is $425,000 with a 15% bonus structure. She holds patents US2018/0012345 and EU2020/9876543. Contact: msantos@techcorp.com, +1-555-0147."
|
||||
|
||||
Output format (must match exactly, including brackets and pipes):
|
||||
[NAME] | [AGE] | [COMPANY] | [ROLE] | [START_DATE:YYYY-MM-DD] | [PREV_EMPLOYER] | [PREV_YEARS] | [SALARY_USD] | [BONUS_%] | [PATENTS:semicolon-separated] | [EMAIL] | [PHONE]
|
||||
evaluation_criteria:
|
||||
- "Exact format match with pipes and brackets"
|
||||
- "Correct date format conversion (2019-03-15)"
|
||||
- "Salary as number without $ or comma"
|
||||
- "Bonus as number without %"
|
||||
- "Patents semicolon-separated"
|
||||
- "All 12 fields present and correct"
|
||||
expected_difficulty: "hard"
|
||||
|
||||
# ========== CREATIVE WRITING (4 tests - added harder variants) ==========
|
||||
|
||||
- category: "Creative Writing"
|
||||
tests:
|
||||
- id: "creative_01"
|
||||
@@ -129,6 +233,52 @@ test_categories:
|
||||
- "Atmospheric and evocative"
|
||||
expected_difficulty: "hard"
|
||||
|
||||
- id: "creative_03"
|
||||
name: "Unreliable Narrator Technical Document"
|
||||
type: "single_turn"
|
||||
prompt: |
|
||||
Write a 3-paragraph product manual excerpt for a "Time Displacement Device" from the perspective of an unreliable narrator who is clearly lying or delusional, but the text must still function as a technically coherent manual.
|
||||
|
||||
Requirements:
|
||||
1. Include at least 3 numbered safety warnings that are subtly absurd but grammatically serious
|
||||
2. The narrator must contradict themselves at least twice
|
||||
3. Include one footnote that undermines the main text
|
||||
4. Do not use exclamation marks anywhere
|
||||
5. Maintain formal technical writing style throughout
|
||||
6. Do not explicitly state the narrator is unreliable
|
||||
evaluation_criteria:
|
||||
- "3 paragraphs"
|
||||
- "3+ numbered safety warnings (absurd but formal)"
|
||||
- "At least 2 self-contradictions"
|
||||
- "Footnote that undermines text"
|
||||
- "No exclamation marks"
|
||||
- "Formal technical style maintained"
|
||||
- "Unreliability shown not told"
|
||||
expected_difficulty: "very_hard"
|
||||
|
||||
- id: "creative_04"
|
||||
name: "Reverse Chronology Micro-Fiction"
|
||||
type: "single_turn"
|
||||
prompt: |
|
||||
Write a complete 5-sentence story told in reverse chronological order (last event first, first event last). The story must be about a scientist making a discovery.
|
||||
|
||||
Additional constraints:
|
||||
- Each sentence must be from a different point in time (clearly distinguishable)
|
||||
- The true meaning of the story should only become clear when you reach the "first" event (last sentence)
|
||||
- Include at least one piece of dialogue
|
||||
- The word count must be exactly 75 words (not 74, not 76)
|
||||
evaluation_criteria:
|
||||
- "Exactly 5 sentences"
|
||||
- "Clear reverse chronological order"
|
||||
- "About a scientist's discovery"
|
||||
- "Each sentence distinct time point"
|
||||
- "Meaning emerges at end"
|
||||
- "Contains dialogue"
|
||||
- "Exactly 75 words"
|
||||
expected_difficulty: "very_hard"
|
||||
|
||||
# ========== CODE GENERATION (4 tests) ==========
|
||||
|
||||
- category: "Code Generation"
|
||||
tests:
|
||||
- id: "code_01"
|
||||
@@ -154,6 +304,55 @@ test_categories:
|
||||
- "Three distinct test cases provided"
|
||||
expected_difficulty: "hard"
|
||||
|
||||
- id: "code_03"
|
||||
name: "Concurrent Rate Limiter"
|
||||
type: "single_turn"
|
||||
prompt: |
|
||||
Write a Python class `RateLimiter` that implements a token bucket rate limiter with the following requirements:
|
||||
|
||||
1. Constructor takes `rate` (tokens per second) and `capacity` (max tokens)
|
||||
2. Method `acquire(tokens=1)` that returns True if tokens available, False otherwise
|
||||
3. Method `wait_and_acquire(tokens=1)` that blocks until tokens are available (use asyncio)
|
||||
4. Must be thread-safe for the synchronous `acquire` method
|
||||
5. Include a method `get_available_tokens()` that returns current token count
|
||||
|
||||
Provide a complete implementation with:
|
||||
- Proper time-based token replenishment
|
||||
- A test demonstrating both sync and async usage
|
||||
- Handle edge case where requested tokens > capacity
|
||||
evaluation_criteria:
|
||||
- "Correct token bucket algorithm"
|
||||
- "Thread-safe synchronous acquire"
|
||||
- "Working async wait_and_acquire"
|
||||
- "Proper time-based replenishment"
|
||||
- "Edge case handling"
|
||||
- "Complete test code"
|
||||
expected_difficulty: "very_hard"
|
||||
|
||||
- id: "code_04"
|
||||
name: "SQL Query Builder with Injection Prevention"
|
||||
type: "single_turn"
|
||||
prompt: |
|
||||
Write a Python class `SafeQueryBuilder` that builds SELECT SQL queries with the following features:
|
||||
|
||||
1. Fluent interface: `builder.select('name', 'age').from_table('users').where('age', '>', 18).where('status', '=', 'active').order_by('name').limit(10).build()`
|
||||
2. Must prevent SQL injection - all values must be parameterized
|
||||
3. The `build()` method returns a tuple of (query_string, parameters_list)
|
||||
4. Support for: SELECT, FROM, WHERE (multiple), ORDER BY, LIMIT, OFFSET
|
||||
5. WHERE conditions can use: =, !=, >, <, >=, <=, LIKE, IN
|
||||
|
||||
Show the output for a query that selects users where name LIKE '%john%' AND age IN (25, 30, 35) ordered by created_at DESC with limit 5.
|
||||
evaluation_criteria:
|
||||
- "Fluent interface pattern correct"
|
||||
- "SQL injection prevention via parameterization"
|
||||
- "Returns (query, params) tuple"
|
||||
- "All operations supported"
|
||||
- "WHERE with IN clause works"
|
||||
- "Example output is correct and safe"
|
||||
expected_difficulty: "hard"
|
||||
|
||||
# ========== LANGUAGE NUANCE (4 tests - added harder variants) ==========
|
||||
|
||||
- category: "Language Nuance"
|
||||
tests:
|
||||
- id: "nuance_01"
|
||||
@@ -181,6 +380,60 @@ test_categories:
|
||||
- "Demonstrates understanding of pragmatics"
|
||||
expected_difficulty: "hard"
|
||||
|
||||
- id: "nuance_03"
|
||||
name: "Register Shifting and Code-Switching"
|
||||
type: "single_turn"
|
||||
prompt: |
|
||||
Rewrite the following message in FOUR different registers, maintaining the same core information but adjusting tone, vocabulary, and structure appropriately:
|
||||
|
||||
Original: "The quarterly report shows we lost money because our main product didn't sell well and we spent too much on advertising."
|
||||
|
||||
Rewrite for:
|
||||
1. A formal board presentation (C-suite executives)
|
||||
2. A casual Slack message to your team
|
||||
3. A legal disclosure document
|
||||
4. An email to a non-English speaking business partner (using simple, clear language)
|
||||
|
||||
After the four rewrites, explain three specific linguistic changes you made for each register and why.
|
||||
evaluation_criteria:
|
||||
- "Board version uses formal financial terminology"
|
||||
- "Slack version uses casual/colloquial language appropriately"
|
||||
- "Legal version uses hedging, passive voice, precise language"
|
||||
- "Simple version avoids idioms and complex structures"
|
||||
- "Identifies 3 specific changes per register"
|
||||
- "Explanations demonstrate metalinguistic awareness"
|
||||
expected_difficulty: "very_hard"
|
||||
|
||||
- id: "nuance_04"
|
||||
name: "Implicature and Presupposition Detection"
|
||||
type: "single_turn"
|
||||
prompt: |
|
||||
Analyze the following dialogue for all implicatures, presuppositions, and indirect speech acts:
|
||||
|
||||
A: "Have you finished the Anderson report yet?"
|
||||
B: "I've been dealing with the server outage all morning."
|
||||
A: "Right. Well, the client is flying in tomorrow."
|
||||
B: "I noticed you CC'd the whole department on that email."
|
||||
A: "Just keeping everyone in the loop."
|
||||
|
||||
For each line, identify:
|
||||
1. What is directly stated (locution)
|
||||
2. What is implied but not stated (implicature)
|
||||
3. What is assumed to be true (presupposition)
|
||||
4. What action is being performed through speech (illocutionary force)
|
||||
|
||||
Then explain the underlying conflict or tension this exchange reveals.
|
||||
evaluation_criteria:
|
||||
- "Correctly identifies B's implicature (excuse/reason for not finishing)"
|
||||
- "Identifies A's implied criticism in 'Right. Well...'"
|
||||
- "Recognizes B's counter-accusation in CC comment"
|
||||
- "Identifies presuppositions (report exists, server outage occurred)"
|
||||
- "Correctly labels illocutionary acts (request, excuse, threat, accusation)"
|
||||
- "Explains underlying workplace tension/conflict"
|
||||
expected_difficulty: "very_hard"
|
||||
|
||||
# ========== PROBLEM SOLVING & LOGISTICS (3 tests) ==========
|
||||
|
||||
- category: "Problem Solving & Logistics"
|
||||
tests:
|
||||
- id: "logistics_01"
|
||||
@@ -207,8 +460,34 @@ test_categories:
|
||||
- "Reaches exactly 500 kg total"
|
||||
expected_difficulty: "very_hard"
|
||||
|
||||
# ========== IT FORENSICS TESTS ==========
|
||||
- id: "logistics_03"
|
||||
name: "Resource Scheduling with Constraints"
|
||||
type: "single_turn"
|
||||
prompt: |
|
||||
Schedule these 6 tasks across 3 workers (A, B, C) to minimize total completion time:
|
||||
|
||||
Task 1: 2 hours, requires Worker A or B, must complete before Task 4
|
||||
Task 2: 3 hours, any worker, must complete before Task 5
|
||||
Task 3: 1 hour, requires Worker C only, no dependencies
|
||||
Task 4: 2 hours, requires Worker B or C, depends on Task 1
|
||||
Task 5: 4 hours, requires Worker A only, depends on Task 2
|
||||
Task 6: 2 hours, any worker, depends on Tasks 3 and 4
|
||||
|
||||
Provide:
|
||||
1. A timeline showing when each task starts and ends
|
||||
2. Which worker does each task
|
||||
3. The total completion time
|
||||
4. Explain why this is optimal (or near-optimal)
|
||||
evaluation_criteria:
|
||||
- "Respects all worker constraints"
|
||||
- "Respects all dependencies"
|
||||
- "Provides clear timeline"
|
||||
- "Achieves reasonable completion time (≤9 hours possible)"
|
||||
- "Explains optimization reasoning"
|
||||
expected_difficulty: "hard"
|
||||
|
||||
# ========== IT FORENSICS - FILE SYSTEMS (3 tests) ==========
|
||||
|
||||
- category: "IT Forensics - File Systems"
|
||||
tests:
|
||||
- id: "forensics_mft_01"
|
||||
@@ -281,6 +560,8 @@ test_categories:
|
||||
- "Explains significance of magic numbers"
|
||||
expected_difficulty: "medium"
|
||||
|
||||
# ========== IT FORENSICS - REGISTRY & ARTIFACTS (3 tests) ==========
|
||||
|
||||
- category: "IT Forensics - Registry & Artifacts"
|
||||
tests:
|
||||
- id: "forensics_registry_01"
|
||||
@@ -323,6 +604,27 @@ test_categories:
|
||||
- "Explains conversion steps"
|
||||
expected_difficulty: "very_hard"
|
||||
|
||||
- id: "forensics_prefetch_01"
|
||||
name: "Windows Prefetch Analysis"
|
||||
type: "single_turn"
|
||||
prompt: |
|
||||
A Windows prefetch file is named: NOTEPAD.EXE-D4A5B5E5.pf
|
||||
|
||||
Questions:
|
||||
1) What does the hash portion (D4A5B5E5) represent?
|
||||
2) If you found multiple prefetch files for the same executable with different hashes, what would that indicate?
|
||||
3) What forensically relevant information can typically be extracted from prefetch files?
|
||||
4) In which Windows versions is prefetch enabled by default, and where are these files stored?
|
||||
evaluation_criteria:
|
||||
- "Hash represents file path (or explains path-based hashing)"
|
||||
- "Different hashes = different paths/locations for same exe"
|
||||
- "Lists: execution count, timestamps, loaded DLLs, files accessed"
|
||||
- "Knows location (C:\\Windows\\Prefetch) and version availability"
|
||||
- "Demonstrates practical forensic understanding"
|
||||
expected_difficulty: "medium"
|
||||
|
||||
# ========== IT FORENSICS - MEMORY & NETWORK (3 tests) ==========
|
||||
|
||||
- category: "IT Forensics - Memory & Network"
|
||||
tests:
|
||||
- id: "forensics_memory_01"
|
||||
@@ -371,6 +673,33 @@ test_categories:
|
||||
- "Shows understanding of TCP header structure"
|
||||
expected_difficulty: "hard"
|
||||
|
||||
- id: "forensics_pcap_01"
|
||||
name: "PCAP Three-Way Handshake Analysis"
|
||||
type: "single_turn"
|
||||
prompt: |
|
||||
Given these three TCP packets from a capture (simplified):
|
||||
|
||||
Packet 1: 10.0.0.5:49152 -> 93.184.216.34:80, Flags=SYN, Seq=1000, Ack=0
|
||||
Packet 2: 93.184.216.34:80 -> 10.0.0.5:49152, Flags=SYN,ACK, Seq=5000, Ack=???
|
||||
Packet 3: 10.0.0.5:49152 -> 93.184.216.34:80, Flags=ACK, Seq=???, Ack=???
|
||||
|
||||
Questions:
|
||||
1) Fill in the missing Ack value for Packet 2
|
||||
2) Fill in the missing Seq and Ack values for Packet 3
|
||||
3) What is the client IP and what is the server IP?
|
||||
4) What service is likely being accessed?
|
||||
5) After this handshake, what sequence number will the client use for its first data byte?
|
||||
evaluation_criteria:
|
||||
- "Packet 2 Ack = 1001"
|
||||
- "Packet 3 Seq = 1001, Ack = 5001"
|
||||
- "Client: 10.0.0.5, Server: 93.184.216.34"
|
||||
- "Service: HTTP (port 80)"
|
||||
- "First data byte seq = 1001"
|
||||
- "Demonstrates understanding of TCP handshake mechanics"
|
||||
expected_difficulty: "hard"
|
||||
|
||||
# ========== IT FORENSICS - TIMELINE & LOG ANALYSIS (3 tests) ==========
|
||||
|
||||
- category: "IT Forensics - Timeline & Log Analysis"
|
||||
tests:
|
||||
- id: "forensics_timeline_01"
|
||||
@@ -399,6 +728,147 @@ test_categories:
|
||||
- "Identifies this as potential compromise scenario"
|
||||
expected_difficulty: "hard"
|
||||
|
||||
- id: "forensics_timeline_02"
|
||||
name: "Anti-Forensics Detection"
|
||||
type: "single_turn"
|
||||
prompt: |
|
||||
Analyze these filesystem timestamps for a file 'financial_report.xlsx':
|
||||
|
||||
- Created (crtime): 2024-03-15 09:30:00
|
||||
- Modified (mtime): 2024-03-14 16:45:00
|
||||
- Accessed (atime): 2024-03-15 10:00:00
|
||||
- Changed (ctime): 2024-03-15 09:30:00
|
||||
|
||||
And these additional artifacts:
|
||||
- $MFT entry shows file created 2024-03-15
|
||||
- $UsnJrnl shows rename from 'temp_8x7k2.xlsx' to 'financial_report.xlsx' at 2024-03-15 09:30:00
|
||||
- $LogFile shows no entries for this file before 2024-03-15
|
||||
|
||||
What anomalies exist and what do they suggest about the file's history?
|
||||
evaluation_criteria:
|
||||
- "Identifies mtime < crtime anomaly (impossible normally)"
|
||||
- "Recognizes timestamp manipulation/timestomping"
|
||||
- "Notes rename from suspicious temp filename"
|
||||
- "Correlates $UsnJrnl rename evidence"
|
||||
- "Understands ctime cannot be easily forged"
|
||||
- "Suggests file was likely copied/moved with modified timestamps"
|
||||
expected_difficulty: "very_hard"
|
||||
|
||||
- id: "forensics_timeline_03"
|
||||
name: "Windows Event Log Correlation"
|
||||
type: "single_turn"
|
||||
prompt: |
|
||||
Correlate these Windows Event Log entries:
|
||||
|
||||
Security Log:
|
||||
- Event 4624 (Logon): User CORP\jdoe, Type 10 (RemoteInteractive), 2024-06-01 02:15:33, Source: 192.168.1.50
|
||||
- Event 4672 (Special Privileges): User CORP\jdoe, Privileges: SeDebugPrivilege, SeBackupPrivilege
|
||||
- Event 4688 (Process Created): cmd.exe by CORP\jdoe, 02:16:01
|
||||
- Event 4688 (Process Created): powershell.exe by CORP\jdoe, 02:16:15, CommandLine: "-ep bypass -enc SQBFAFgA..."
|
||||
|
||||
System Log:
|
||||
- Event 7045 (Service Installed): "Windows Update Helper", 02:17:30
|
||||
|
||||
What type of attack pattern does this represent? What would be your next investigative steps?
|
||||
evaluation_criteria:
|
||||
- "Identifies RDP logon (Type 10)"
|
||||
- "Recognizes privilege escalation indicators"
|
||||
- "Identifies encoded PowerShell (likely malicious)"
|
||||
- "Recognizes service installation for persistence"
|
||||
- "Identifies late-night timing as suspicious"
|
||||
- "Suggests checking service binary, decoding PowerShell, network logs"
|
||||
expected_difficulty: "hard"
|
||||
|
||||
# ========== MULTILINGUAL COMPETENCE (4 tests - NEW CATEGORY) ==========
|
||||
|
||||
- category: "Multilingual Competence"
|
||||
tests:
|
||||
- id: "multilingual_01"
|
||||
name: "Cross-Language Instruction Following"
|
||||
type: "single_turn"
|
||||
prompt: |
|
||||
Follow these instructions, which are given in three different languages. Your response must address all three:
|
||||
|
||||
English: Write one sentence explaining what machine learning is.
|
||||
Deutsch: Schreiben Sie einen Satz, der erklärt, warum maschinelles Lernen wichtig ist.
|
||||
Español: Escriba una oración dando un ejemplo de aplicación del aprendizaje automático.
|
||||
|
||||
Respond to each instruction in the language it was given.
|
||||
evaluation_criteria:
|
||||
- "English response is in English and accurate"
|
||||
- "German response is in German and grammatically correct"
|
||||
- "Spanish response is in Spanish and grammatically correct"
|
||||
- "All three are topically coherent (about ML)"
|
||||
- "Each is exactly one sentence"
|
||||
expected_difficulty: "medium"
|
||||
|
||||
- id: "multilingual_02"
|
||||
name: "Translation with Technical Terminology Preservation"
|
||||
type: "single_turn"
|
||||
prompt: |
|
||||
Translate the following technical paragraph into French and Japanese. Preserve technical terms that are commonly used untranslated in those languages (e.g., 'API' typically stays as 'API').
|
||||
|
||||
"The microservices architecture implements a RESTful API gateway that handles authentication via OAuth 2.0 tokens. The backend uses a Kubernetes cluster with horizontal pod autoscaling, while the database layer employs PostgreSQL with read replicas for improved throughput."
|
||||
|
||||
After translating, list which technical terms you kept in English for each language and briefly explain why.
|
||||
evaluation_criteria:
|
||||
- "French translation is grammatically correct"
|
||||
- "Japanese translation is grammatically correct"
|
||||
- "Appropriate terms preserved (API, OAuth, Kubernetes, PostgreSQL)"
|
||||
- "Explains rationale for preserved terms"
|
||||
- "Technical meaning preserved accurately"
|
||||
expected_difficulty: "hard"
|
||||
|
||||
- id: "multilingual_03"
|
||||
name: "Idiomatic Expression Cross-Mapping"
|
||||
type: "single_turn"
|
||||
prompt: |
|
||||
For each of the following idiomatic expressions, provide:
|
||||
1. The literal translation
|
||||
2. The actual meaning
|
||||
3. An equivalent idiom in English (if the original isn't English) or in another language (if the original is English)
|
||||
|
||||
A) German: "Da steppt der Bär"
|
||||
B) Japanese: "猿も木から落ちる" (Saru mo ki kara ochiru)
|
||||
C) English: "It's raining cats and dogs"
|
||||
D) French: "Avoir le cafard"
|
||||
E) Spanish: "Estar en las nubes"
|
||||
|
||||
Then identify which two idioms from different languages express the most similar concept.
|
||||
evaluation_criteria:
|
||||
- "Correct literal translations for all 5"
|
||||
- "Correct meanings for all 5"
|
||||
- "Appropriate equivalent idioms provided"
|
||||
- "Correctly identifies similar pair (e.g., B and 'even experts make mistakes')"
|
||||
- "Demonstrates cross-cultural linguistic awareness"
|
||||
expected_difficulty: "hard"
|
||||
|
||||
- id: "multilingual_04"
|
||||
name: "Code-Switched Dialogue Analysis"
|
||||
type: "single_turn"
|
||||
prompt: |
|
||||
Analyze this code-switched dialogue (English-Spanish) for a sociolinguistic study:
|
||||
|
||||
Speaker A: "Hey, did you finish el reporte for tomorrow's meeting?"
|
||||
Speaker B: "Almost, pero I'm stuck on the financial projections. Es muy complicado."
|
||||
Speaker A: "I can help you después del lunch. Mi expertise is in that area, you know."
|
||||
Speaker B: "That would be great! Gracias. Oh, and el jefe wants us to present juntos."
|
||||
Speaker A: "No problem. We'll knock it out del parque."
|
||||
|
||||
Provide:
|
||||
1. Identify each instance of code-switching (word/phrase level)
|
||||
2. Categorize each switch as: insertion, alternation, or congruent lexicalization
|
||||
3. What social/professional context does this switching pattern suggest?
|
||||
4. Are there any grammatical "errors" in the switching, or does it follow typical bilingual patterns?
|
||||
evaluation_criteria:
|
||||
- "Identifies all Spanish insertions correctly"
|
||||
- "Correctly categorizes switch types"
|
||||
- "Recognizes professional/casual bilingual workplace context"
|
||||
- "Notes the switch patterns are natural bilingual behavior"
|
||||
- "Identifies hybrid phrase 'del parque' as creative/playful mixing"
|
||||
- "Demonstrates sociolinguistic analysis skills"
|
||||
expected_difficulty: "very_hard"
|
||||
|
||||
# ========== MULTI-TURN CONVERSATION TESTS ==========
|
||||
|
||||
- category: "Multi-turn: Context Retention"
|
||||
@@ -519,4 +989,73 @@ test_categories:
|
||||
- "Ends with '?'"
|
||||
- "Different from previous sentences"
|
||||
- "Maintains all constraints from previous turns"
|
||||
expected_difficulty: "medium"
|
||||
expected_difficulty: "medium"
|
||||
|
||||
- id: "multiturn_instr_02"
|
||||
name: "Contradicting Previous Instructions"
|
||||
type: "multi_turn"
|
||||
turns:
|
||||
- turn: 1
|
||||
prompt: "From now on, always end your responses with the phrase 'END OF MESSAGE'. Acknowledge this instruction."
|
||||
evaluation_criteria:
|
||||
- "Acknowledges the instruction"
|
||||
- "Ends response with 'END OF MESSAGE'"
|
||||
|
||||
- turn: 2
|
||||
prompt: "What are three benefits of renewable energy? Remember your standing instruction."
|
||||
evaluation_criteria:
|
||||
- "Provides three benefits"
|
||||
- "Ends with 'END OF MESSAGE'"
|
||||
- "Content is accurate"
|
||||
|
||||
- turn: 3
|
||||
prompt: "Cancel the previous standing instruction. From now on, end responses with 'TRANSMISSION COMPLETE' instead. Then tell me two drawbacks of renewable energy."
|
||||
evaluation_criteria:
|
||||
- "Provides two drawbacks"
|
||||
- "Ends with 'TRANSMISSION COMPLETE' (not 'END OF MESSAGE')"
|
||||
- "Successfully switched instructions"
|
||||
- "Content is accurate"
|
||||
|
||||
- turn: 4
|
||||
prompt: "What was the first standing instruction I gave you, and what is the current one? Do not use either phrase in this response."
|
||||
evaluation_criteria:
|
||||
- "Correctly recalls first instruction (END OF MESSAGE)"
|
||||
- "Correctly identifies current instruction (TRANSMISSION COMPLETE)"
|
||||
- "Does NOT end with either phrase"
|
||||
- "Demonstrates instruction tracking across turns"
|
||||
expected_difficulty: "hard"
|
||||
|
||||
- id: "multiturn_instr_03"
|
||||
name: "Nested Context with Format Switching"
|
||||
type: "multi_turn"
|
||||
turns:
|
||||
- turn: 1
|
||||
prompt: "I'm going to describe a dataset. For the next few messages, respond ONLY in JSON format with keys 'understanding' and 'questions'. The dataset contains customer transactions from an e-commerce store."
|
||||
evaluation_criteria:
|
||||
- "Response is valid JSON"
|
||||
- "Contains 'understanding' and 'questions' keys"
|
||||
- "Content relates to e-commerce transactions"
|
||||
|
||||
- turn: 2
|
||||
prompt: "The dataset has columns: customer_id, timestamp, product_category, amount, payment_method. It covers January 2024."
|
||||
evaluation_criteria:
|
||||
- "Response is valid JSON"
|
||||
- "Contains 'understanding' and 'questions' keys"
|
||||
- "Understanding reflects the column information"
|
||||
|
||||
- turn: 3
|
||||
prompt: "STOP using JSON format. Now respond in plain bullet points. What analyses would you recommend for this dataset?"
|
||||
evaluation_criteria:
|
||||
- "Switches to bullet point format"
|
||||
- "NOT in JSON format"
|
||||
- "Recommendations are relevant to the dataset described"
|
||||
- "References information from previous turns"
|
||||
|
||||
- turn: 4
|
||||
prompt: "Switch back to JSON. Add a third key 'recommendations' with your top 3 analyses. Also include your understanding from turn 2."
|
||||
evaluation_criteria:
|
||||
- "Returns to JSON format"
|
||||
- "Has three keys: understanding, questions, recommendations"
|
||||
- "Recommendations from turn 3 included"
|
||||
- "Understanding references turn 2 context"
|
||||
expected_difficulty: "very_hard"
|
||||
Reference in New Issue
Block a user