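Improvements: create the output directory and sanitize model names ('/' and ':' replaced with '_') in result filenames; update the dimension-to-category mapping; update the test suite YAML files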
This commit is contained in:
10
ai_eval.py
10
ai_eval.py
@@ -929,15 +929,21 @@ Do not include any text before or after the JSON object."""
|
||||
|
||||
def save_results(self):
|
||||
"""Save results to JSON file"""
|
||||
# Ensure output directory exists
|
||||
self.output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Sanitize model name for use in filename (replace problematic characters)
|
||||
safe_model_name = self.model_name.replace('/', '_').replace(':', '_')
|
||||
|
||||
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
filename = f"{self.model_name.replace(':', '_')}_{timestamp}.json"
|
||||
filename = f"{safe_model_name}_{timestamp}.json"
|
||||
filepath = self.output_dir / filename
|
||||
|
||||
with open(filepath, 'w', encoding='utf-8') as f:
|
||||
json.dump(self.results, f, indent=2, ensure_ascii=False)
|
||||
|
||||
# Also save as "latest" for this model
|
||||
latest_file = self.output_dir / f"{self.model_name.replace(':', '_')}_latest.json"
|
||||
latest_file = self.output_dir / f"{safe_model_name}_latest.json"
|
||||
with open(latest_file, 'w', encoding='utf-8') as f:
|
||||
json.dump(self.results, f, indent=2, ensure_ascii=False)
|
||||
|
||||
|
||||
@@ -507,18 +507,14 @@ class WebInterface:
|
||||
dimensions = {
|
||||
'logical_reasoning': ['Logic & Reasoning'],
|
||||
'mathematical_ability': ['Mathematics & Calculation'],
|
||||
'instruction_following': ['Instruction Following', 'Multi-turn: Instruction Following'],
|
||||
'creativity': ['Creative Writing'],
|
||||
'instruction_following': ['Instruction Following with Constraints'],
|
||||
'creativity': ['Creative Writing with Constraints'],
|
||||
'technical_knowledge': [
|
||||
'Code Generation',
|
||||
'IT Forensics - File Systems',
|
||||
'IT Forensics - Registry & Artifacts',
|
||||
'IT Forensics - Memory & Network',
|
||||
'IT Forensics - Timeline & Log Analysis'
|
||||
'Code Generation - Advanced',
|
||||
'Digital Forensics & Binary Analysis'
|
||||
],
|
||||
'linguistic_nuance': ['Language Nuance', 'Multilingual Competence'],
|
||||
'problem_solving': ['Problem Solving & Logistics'],
|
||||
'conversational_depth': ['Multi-turn: Context Retention']
|
||||
'linguistic_nuance': ['Language Understanding & Pragmatics'],
|
||||
'conversational_depth': ['Multi-turn Context & Instruction Handling']
|
||||
}
|
||||
|
||||
model_metrics = {}
|
||||
|
||||
1091
test_suite-large.yaml
Normal file
1091
test_suite-large.yaml
Normal file
File diff suppressed because it is too large
Load Diff
1293
test_suite.yaml
1293
test_suite.yaml
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user