improvements

This commit is contained in:
overcuriousity
2026-01-18 15:14:25 +01:00
parent 9853faf09b
commit 7697bb51ab
4 changed files with 1481 additions and 945 deletions

View File

@@ -929,15 +929,21 @@ Do not include any text before or after the JSON object."""
     def save_results(self):
         """Save results to JSON file"""
+        # Ensure output directory exists
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+        # Sanitize model name for use in filename (replace problematic characters)
+        safe_model_name = self.model_name.replace('/', '_').replace(':', '_')
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        filename = f"{self.model_name.replace(':', '_')}_{timestamp}.json"
+        filename = f"{safe_model_name}_{timestamp}.json"
         filepath = self.output_dir / filename
         with open(filepath, 'w', encoding='utf-8') as f:
             json.dump(self.results, f, indent=2, ensure_ascii=False)
         # Also save as "latest" for this model
-        latest_file = self.output_dir / f"{self.model_name.replace(':', '_')}_latest.json"
+        latest_file = self.output_dir / f"{safe_model_name}_latest.json"
         with open(latest_file, 'w', encoding='utf-8') as f:
             json.dump(self.results, f, indent=2, ensure_ascii=False)

View File

@@ -507,18 +507,14 @@ class WebInterface:
         dimensions = {
             'logical_reasoning': ['Logic & Reasoning'],
             'mathematical_ability': ['Mathematics & Calculation'],
-            'instruction_following': ['Instruction Following', 'Multi-turn: Instruction Following'],
-            'creativity': ['Creative Writing'],
+            'instruction_following': ['Instruction Following with Constraints'],
+            'creativity': ['Creative Writing with Constraints'],
             'technical_knowledge': [
-                'Code Generation',
-                'IT Forensics - File Systems',
-                'IT Forensics - Registry & Artifacts',
-                'IT Forensics - Memory & Network',
-                'IT Forensics - Timeline & Log Analysis'
+                'Code Generation - Advanced',
+                'Digital Forensics & Binary Analysis'
             ],
-            'linguistic_nuance': ['Language Nuance', 'Multilingual Competence'],
-            'problem_solving': ['Problem Solving & Logistics'],
-            'conversational_depth': ['Multi-turn: Context Retention']
+            'linguistic_nuance': ['Language Understanding & Pragmatics'],
+            'conversational_depth': ['Multi-turn Context & Instruction Handling']
         }
         model_metrics = {}

1091
test_suite-large.yaml Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff