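Improvements: sanitize model names in result filenames, create the output directory before saving, and update analysis dimension categories and test suites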

2026-01-18 15:14:25 +01:00
parent 9853faf09b
commit 7697bb51ab
4 changed files with 1481 additions and 945 deletions
--- a/ai_eval.py
+++ b/ai_eval.py
@@ -929,15 +929,21 @@ Do not include any text before or after the JSON object."""
    
    def save_results(self):
        """Save results to JSON file"""
+        # Ensure output directory exists
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+        
+        # Sanitize model name for use in filename (replace problematic characters)
+        safe_model_name = self.model_name.replace('/', '_').replace(':', '_')
+        
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        filename = f"{self.model_name.replace(':', '_')}_{timestamp}.json"
+        filename = f"{safe_model_name}_{timestamp}.json"
        filepath = self.output_dir / filename
        
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(self.results, f, indent=2, ensure_ascii=False)
        
        # Also save as "latest" for this model
-        latest_file = self.output_dir / f"{self.model_name.replace(':', '_')}_latest.json"
+        latest_file = self.output_dir / f"{safe_model_name}_latest.json"
        with open(latest_file, 'w', encoding='utf-8') as f:
            json.dump(self.results, f, indent=2, ensure_ascii=False)
    
--- a/analyze_results.py
+++ b/analyze_results.py
@@ -507,18 +507,14 @@ class WebInterface:
                    dimensions = {
                        'logical_reasoning': ['Logic & Reasoning'],
                        'mathematical_ability': ['Mathematics & Calculation'],
-                        'instruction_following': ['Instruction Following', 'Multi-turn: Instruction Following'],
-                        'creativity': ['Creative Writing'],
+                        'instruction_following': ['Instruction Following with Constraints'],
+                        'creativity': ['Creative Writing with Constraints'],
                        'technical_knowledge': [
-                            'Code Generation', 
-                            'IT Forensics - File Systems',
-                            'IT Forensics - Registry & Artifacts',
-                            'IT Forensics - Memory & Network',
-                            'IT Forensics - Timeline & Log Analysis'
+                            'Code Generation - Advanced',
+                            'Digital Forensics & Binary Analysis'
                        ],
-                        'linguistic_nuance': ['Language Nuance', 'Multilingual Competence'],
-                        'problem_solving': ['Problem Solving & Logistics'],
-                        'conversational_depth': ['Multi-turn: Context Retention']
+                        'linguistic_nuance': ['Language Understanding & Pragmatics'],
+                        'conversational_depth': ['Multi-turn Context & Instruction Handling']
                    }
                    
                    model_metrics = {}
--- a/test_suite-large.yaml
+++ b/test_suite-large.yaml
--- a/test_suite.yaml
+++ b/test_suite.yaml