From fe1be323bb705d23bd5cbc1273de7a5df0d87fc3 Mon Sep 17 00:00:00 2001 From: overcuriousity Date: Tue, 5 Aug 2025 21:35:38 +0200 Subject: [PATCH] confidence updates, content adjustment --- .env.example | 14 +- src/components/AIQueryInterface.astro | 29 +-- src/config/prompts.ts | 87 ++++---- src/data/tools.yaml | 12 +- src/utils/aiPipeline.ts | 288 +++++++++----------------- 5 files changed, 160 insertions(+), 270 deletions(-) diff --git a/.env.example b/.env.example index 2cddc9b..1814d9f 100644 --- a/.env.example +++ b/.env.example @@ -117,7 +117,7 @@ AI_MAX_CONTEXT_TOKENS=4000 # Maximum tokens per individual AI prompt # Larger = more context per call | Smaller = faster responses -AI_MAX_PROMPT_TOKENS=1500 +AI_MAX_PROMPT_TOKENS=2500 # ============================================================================ # 6. AUTHENTICATION & AUTHORIZATION (OPTIONAL) # ============================================================================ @@ -190,18 +190,16 @@ FORENSIC_AUDIT_RETENTION_HOURS=24 FORENSIC_AUDIT_MAX_ENTRIES=50 # ============================================================================ -# 10. ENHANCED CONFIDENCE SCORING SYSTEM +# 10. SIMPLIFIED CONFIDENCE SCORING SYSTEM # ============================================================================ # Confidence component weights (must sum to 1.0) -CONFIDENCE_SEMANTIC_WEIGHT=0.25 # Weight for vector similarity quality -CONFIDENCE_SUITABILITY_WEIGHT=0.4 # Weight for AI-determined task fitness -CONFIDENCE_CONSISTENCY_WEIGHT=0.2 # Weight for cross-validation agreement -CONFIDENCE_RELIABILITY_WEIGHT=0.15 # Weight for tool quality indicators +CONFIDENCE_SEMANTIC_WEIGHT=0.5 # Weight for vector similarity quality +CONFIDENCE_SUITABILITY_WEIGHT=0.5 # Weight for AI-determined task fitness # Confidence thresholds (0-100) -CONFIDENCE_MINIMUM_THRESHOLD=40 # Below this = weak recommendation -CONFIDENCE_MEDIUM_THRESHOLD=60 # 40-59 = weak, 60-79 = moderate +CONFIDENCE_MINIMUM_THRESHOLD=50 # Below this = weak recommendation +CONFIDENCE_MEDIUM_THRESHOLD=70 # 50-69 = weak, 70-79 = moderate CONFIDENCE_HIGH_THRESHOLD=80 # 80+ = strong recommendation # ============================================================================ diff --git a/src/components/AIQueryInterface.astro b/src/components/AIQueryInterface.astro index 3c855fb..99f3c6b 100644 --- a/src/components/AIQueryInterface.astro +++ b/src/components/AIQueryInterface.astro @@ -756,17 +756,14 @@ class AIQueryInterface { renderConfidenceTooltip(confidence) { if (!confidence || typeof confidence.overall !== 'number') { - console.log('[AI DEBUG] No confidence data or invalid format:', confidence); return ''; } const confidenceColor = confidence.overall >= 80 ? 'var(--color-accent)' : - confidence.overall >= 60 ? 'var(--color-warning)' : 'var(--color-error)'; + confidence.overall >= 60 ? 'var(--color-warning)' : 'var(--color-error)'; const tooltipId = `tooltip-${Math.random().toString(36).substr(2, 9)}`; - console.log(`[AI DEBUG] Generating confidence tooltip: ${confidence.overall}% with ID ${tooltipId}`); - return ` ${confidence.semanticRelevance}%
              </div>
- Wie gut die Tool-Beschreibung semantisch zu Ihrer Anfrage passt (basierend auf Vektor-Ähnlichkeit) + Wie gut die Tool-Beschreibung semantisch zu Ihrer Anfrage passt (Vektor-Ähnlichkeit)
@@ -802,26 +799,6 @@ class AIQueryInterface { KI-bewertete Eignung des Tools für Ihre spezifische forensische Aufgabenstellung - -
-
- 🤝 Methodische Konsistenz - ${confidence.methodologicalConsistency}% -
-
- Wie einheitlich verschiedene Analyseschritte dieses Tool bewerten (Kreuzvalidierung) -
-
- -
-
- 🔧 Tool-Zuverlässigkeit - ${confidence.toolReliability}% -
-
- Qualitätsindikatoren: Dokumentation, Wartung, Verfügbarkeit und Benutzerfreundlichkeit -
-
${confidence.strengthIndicators && confidence.strengthIndicators.length > 0 ? ` @@ -847,7 +824,7 @@ class AIQueryInterface { ` : ''}
- Mehrstufige KI-Analyse mit Kreuzvalidierung + Forensisch fundierte KI-Analyse
diff --git a/src/config/prompts.ts b/src/config/prompts.ts index 6b31a49..e0ac22d 100644 --- a/src/config/prompts.ts +++ b/src/config/prompts.ts @@ -120,61 +120,60 @@ ${aspects} WICHTIG: Antworten Sie NUR in fließendem deutschen Text ohne Listen oder Markdown. Maximum 120 Wörter.`; }, - // Phase tool selection prompt phaseToolSelection: (userQuery: string, phase: any, phaseTools: any[]) => { - return `Wählen Sie 2-3 Methoden/Tools für die Phase "${phase.name}" basierend auf objektiven, fallbezogenen Kriterien. + return `Wählen Sie 2-3 Methoden/Tools für die Phase "${phase.name}" und bewerten Sie deren Aufgaben-Eignung VERGLEICHEND. SZENARIO: "${userQuery}" +SPEZIFISCHE PHASE: ${phase.name} - ${phase.description || 'Forensische Untersuchungsphase'} VERFÜGBARE TOOLS FÜR ${phase.name.toUpperCase()}: -${phaseTools.map((tool: any) => `- ${tool.name}: ${tool.description.slice(0, 100)}...`).join('\n')} +${phaseTools.map((tool: any, index: number) => `${index + 1}. ${tool.name}: ${tool.description.slice(0, 150)}... + - Plattformen: ${tool.platforms?.join(', ') || 'N/A'} + - Skill Level: ${tool.skillLevel} + - Tags: ${tool.tags?.join(', ') || 'N/A'}`).join('\n\n')} -Wählen Sie Methoden/Tools nach forensischen Kriterien aus: -- Court admissibility und Chain of Custody Kompatibilität -- Integration in forensische Standard-Workflows -- Reproduzierbarkeit und Dokumentationsqualität -- Objektivität +Bewerten Sie ALLE Tools vergleichend für diese spezifische Aufgabe UND Phase. Wählen Sie die 2-3 besten aus. -Antworten Sie AUSSCHLIESSLICH mit diesem JSON-Format (kein zusätzlicher Text): +BEWERTUNGSKRITERIEN: +- Wie gut löst das Tool das forensische Problem im SZENARIO-Kontext? +- Wie gut passt es zur spezifischen PHASE "${phase.name}"? +- Wie vergleicht es sich mit den anderen verfügbaren Tools für diese Phase? + +Antworten Sie AUSSCHLIESSLICH mit diesem JSON-Format: [ { - "toolName": "Exakter Methoden/Tool-Name", - "priority": "high|medium|low", - "justification": "Objektive Begründung warum diese Methode/Tool für das spezifische Szenario besser geeignet ist" + "toolName": "Exakter Tool-Name", + "taskRelevance": 85, + "justification": "Vergleichende Begründung warum dieses Tool für diese Phase und Aufgabe besser/schlechter als die anderen geeignet ist", + "limitations": ["Spezifische Einschränkung 1", "Einschränkung 2"] } -]`; - }, - - // Tool evaluation prompt - toolEvaluation: (userQuery: string, tool: any, rank: number) => { - return `Sie sind ein DFIR-Experte und bewerten ein forensisches Tool für eine spezifische Aufgabe. 
- -PROBLEM: "${userQuery}" - -TOOL: ${tool.name} -BESCHREIBUNG: ${tool.description} -PLATTFORMEN: ${tool.platforms?.join(', ') || 'N/A'} -SKILL LEVEL: ${tool.skillLevel} -DOMAINS: ${tool.domains?.join(', ') || 'N/A'} -TAGS: ${tool.tags?.join(', ') || 'N/A'} - -Bewerten Sie nach forensischen Standards und antworten Sie AUSSCHLIESSLICH mit diesem JSON-Format: -{ - "suitability_score": "high|medium|low", - "task_relevance": 85, - "detailed_explanation": "Detaillierte forensische Begründung warum diese Methode/Tool das Problem löst", - "implementation_approach": "Konkrete methodische Schritte zur korrekten Anwendung für dieses spezifische Problem", - "pros": ["Forensischer Vorteil 1", "Validierter Vorteil 2"], - "cons": ["Methodische Limitation 1", "Potenzielle Schwäche 2"], - "limitations": ["Spezifische Einschränkung 1", "Mögliche Problematik 2"], - "alternatives": "Alternative Ansätze falls diese Methode/Tool nicht optimal ist" -} +] WICHTIG: -- task_relevance: Numerischer Wert 0-100 wie gut das Tool für DIESE SPEZIFISCHE Aufgabe geeignet ist -- limitations: Konkrete Einschränkungen oder Situationen wo das Tool NICHT optimal wäre -- Berücksichtigen Sie den Skill Level vs. Anfrage-Komplexität -- Bewerten Sie objektiv, nicht beschönigend`; +- taskRelevance: 0-100 Score basierend auf Szenario-Eignung UND Phasen-Passung im VERGLEICH zu anderen Tools +- Nur die 2-3 BESTEN Tools auswählen und bewerten +- justification soll VERGLEICHEND sein ("besser als X weil...", "für diese Phase ideal weil...")`; + }, + + toolEvaluation: (userQuery: string, tool: any, rank: number, taskRelevance: number) => { + return `Sie sind ein DFIR-Experte. Erklären Sie DETAILLIERT die Anwendung dieses bereits bewerteten Tools. + +PROBLEM: "${userQuery}" +TOOL: ${tool.name} (bereits bewertet mit ${taskRelevance}% Aufgaben-Eignung) +BESCHREIBUNG: ${tool.description} + +Das Tool wurde bereits als Rang ${rank} für diese Aufgabe bewertet. 
Erklären Sie nun: + +Antworten Sie AUSSCHLIESSLICH mit diesem JSON-Format: +{ + "detailed_explanation": "Detaillierte Erklärung warum und wie dieses Tool für diese spezifische Aufgabe eingesetzt wird", + "implementation_approach": "Konkrete Schritt-für-Schritt Anleitung zur korrekten Anwendung", + "pros": ["Spezifischer Vorteil 1", "Spezifischer Vorteil 2"], + "cons": ["Bekannte Limitation 1", "Bekannte Limitation 2"], + "alternatives": "Alternative Ansätze oder Tools falls dieses nicht verfügbar ist" +} + +WICHTIG: Keine erneute Bewertung - nur detaillierte Erklärung der bereits bewerteten Eignung.`; }, // Background knowledge selection prompt @@ -229,7 +228,7 @@ export function getPrompt(key: 'scenarioAnalysis', isWorkflow: boolean, userQuer export function getPrompt(key: 'investigationApproach', isWorkflow: boolean, userQuery: string): string; export function getPrompt(key: 'criticalConsiderations', isWorkflow: boolean, userQuery: string): string; export function getPrompt(key: 'phaseToolSelection', userQuery: string, phase: any, phaseTools: any[]): string; -export function getPrompt(key: 'toolEvaluation', userQuery: string, tool: any, rank: number): string; +export function getPrompt(key: 'toolEvaluation', userQuery: string, tool: any, rank: number, taskRelevance: number): string; export function getPrompt(key: 'backgroundKnowledgeSelection', userQuery: string, mode: string, selectedToolNames: string[], availableConcepts: any[]): string; export function getPrompt(key: 'finalRecommendations', isWorkflow: boolean, userQuery: string, selectedToolNames: string[]): string; export function getPrompt(promptKey: keyof typeof AI_PROMPTS, ...args: any[]): string { diff --git a/src/data/tools.yaml b/src/data/tools.yaml index 0c3a185..808b804 100644 --- a/src/data/tools.yaml +++ b/src/data/tools.yaml @@ -3975,7 +3975,7 @@ tools: - name: KAPE type: software description: >- - Kroll Artifact Parser and Extractor revolutioniert Windows-Forensik durch + Kroll Artifact Parser and Extractor versucht sich an Windows-Forensik durch intelligente Ziel-basierte Sammlung. Statt Full-Disk-Images extrahiert KAPE gezielt kritische Artefakte: Registry-Hives, Event-Logs, Prefetch, Browser- Daten, Scheduled-Tasks in Minuten statt Stunden. Die Target-Files @@ -3983,12 +3983,10 @@ tools: Besonders clever: Compound-Targets gruppieren zusammengehörige Artefakte (z.B. "Browser" sammelt Chrome+Firefox+Edge), die gKAPE-GUI macht es auch für Nicht-Techniker zugänglich. Batch-Mode verarbeitet mehrere Images - parallel. Output direkt kompatibel zu Timeline-Tools wie Plaso. Die - ständigen Community-Updates halten mit Windows-Entwicklungen Schritt. + parallel. Output direkt kompatibel zu Timeline-Tools wie Plaso. VSS-Processing analysiert Shadow- Copies automatisch. Der - Remote-Collection-Mode sammelt über Netzwerk. Kostenlos aber - Enterprise-Support verfügbar. Der neue Standard für effiziente - Windows-Forensik-Triage. + Remote-Collection-Mode sammelt über Netzwerk. Kostenlos (mit Registrierung) aber + Enterprise-Support verfügbar. 
skillLevel: intermediate url: https://www.kroll.com/kape icon: 🧰 @@ -4003,7 +4001,7 @@ tools: platforms: - Windows accessType: download - license: Freeware + license: Proprietary knowledgebase: false - name: Kibana type: software diff --git a/src/utils/aiPipeline.ts b/src/utils/aiPipeline.ts index 396fcea..f291f3b 100644 --- a/src/utils/aiPipeline.ts +++ b/src/utils/aiPipeline.ts @@ -72,7 +72,6 @@ interface ConfidenceMetrics { semanticRelevance: number; // How well tool description matches query (from embeddings) taskSuitability: number; // AI-determined fitness for this specific task methodologicalConsistency: number; // How well different analysis steps agree - toolReliability: number; // Indicators of tool quality and maintenance uncertaintyFactors: string[]; // Specific reasons why this might not work strengthIndicators: string[]; // Specific reasons why this is a good choice } @@ -146,17 +145,17 @@ class ImprovedMicroTaskAIPipeline { // Updated confidence weights - more focused on AI evaluation this.confidenceConfig = { - semanticWeight: parseFloat(process.env.CONFIDENCE_SEMANTIC_WEIGHT || '0.25'), // Embeddings similarity - suitabilityWeight: parseFloat(process.env.CONFIDENCE_SUITABILITY_WEIGHT || '0.4'), // AI task fit evaluation - consistencyWeight: parseFloat(process.env.CONFIDENCE_CONSISTENCY_WEIGHT || '0.2'), // Cross-validation agreement - reliabilityWeight: parseFloat(process.env.CONFIDENCE_RELIABILITY_WEIGHT || '0.15'), // Tool quality indicators + semanticWeight: parseFloat(process.env.CONFIDENCE_SEMANTIC_WEIGHT || '0.3'), // Embeddings similarity + suitabilityWeight: parseFloat(process.env.CONFIDENCE_SUITABILITY_WEIGHT || '0.7'), // AI task fit evaluation + consistencyWeight: 0, + reliabilityWeight: 0, minimumThreshold: parseInt(process.env.CONFIDENCE_MINIMUM_THRESHOLD || '40', 10), mediumThreshold: parseInt(process.env.CONFIDENCE_MEDIUM_THRESHOLD || '60', 10), highThreshold: parseInt(process.env.CONFIDENCE_HIGH_THRESHOLD || '80', 10) }; - - console.log('[AI PIPELINE] Enhanced confidence scoring enabled:', { - weights: `Semantic:${this.confidenceConfig.semanticWeight} Suitability:${this.confidenceConfig.suitabilityWeight} Consistency:${this.confidenceConfig.consistencyWeight} Reliability:${this.confidenceConfig.reliabilityWeight}`, + + console.log('[AI PIPELINE] Simplified confidence scoring enabled:', { + weights: `Semantic:${this.confidenceConfig.semanticWeight} Suitability:${this.confidenceConfig.suitabilityWeight}`, thresholds: `${this.confidenceConfig.minimumThreshold}/${this.confidenceConfig.mediumThreshold}/${this.confidenceConfig.highThreshold}` }); } @@ -709,197 +708,107 @@ ${JSON.stringify(conceptsToSend, null, 2)}`; ): ConfidenceMetrics { // 1. Semantic Relevance: Real embeddings similarity score - const semanticRelevance = context.embeddingsSimilarities.has(tool.name) ? - Math.round(context.embeddingsSimilarities.get(tool.name)! * 100) : 50; + const rawSemanticRelevance = context.embeddingsSimilarities.has(tool.name) ? + context.embeddingsSimilarities.get(tool.name)! * 100 : 50; - // 2. Task Suitability: AI-determined fitness for specific task - const taskSuitability = Math.round(taskRelevance); + // 2. Task Suitability: Enhanced with phase-awareness for workflow mode + let enhancedTaskSuitability = taskRelevance; - // 3. 
Methodological Consistency: Cross-validation between micro-tasks - const methodologicalConsistency = this.calculateCrossValidationScore(tool.name, context); + if (context.mode === 'workflow') { + // In workflow mode, boost score if tool is well-matched to its assigned phase + const toolSelection = context.selectedTools?.find(st => st.tool.name === tool.name); + if (toolSelection && tool.phases && tool.phases.includes(toolSelection.phase)) { + // Boost for phase alignment (but cap at 100) + const phaseBonus = Math.min(15, 100 - taskRelevance); + enhancedTaskSuitability = Math.min(100, taskRelevance + phaseBonus); + + console.log(`[CONFIDENCE] Phase bonus for ${tool.name}: ${taskRelevance} -> ${enhancedTaskSuitability} (phase: ${toolSelection.phase})`); + } + } - // 4. Tool Reliability: Quality indicators - const toolReliability = this.calculateToolReliability(tool); - - // Debug logging - console.log(`[CONFIDENCE DEBUG] ${tool.name}:`, { - semantic: semanticRelevance, - taskSuitability: taskSuitability, - consistency: methodologicalConsistency, - reliability: toolReliability, - hasEmbeddingsSimilarity: context.embeddingsSimilarities.has(tool.name), - rawTaskRelevance: taskRelevance - }); - - // Calculate weighted overall score + // Simple weighted combination - no artificial scaling const overall = ( - semanticRelevance * this.confidenceConfig.semanticWeight + - taskSuitability * this.confidenceConfig.suitabilityWeight + - methodologicalConsistency * this.confidenceConfig.consistencyWeight + - toolReliability * this.confidenceConfig.reliabilityWeight + rawSemanticRelevance * this.confidenceConfig.semanticWeight + + enhancedTaskSuitability * this.confidenceConfig.suitabilityWeight ); const uncertaintyFactors = this.identifySpecificUncertaintyFactors(tool, context, limitations, overall); const strengthIndicators = this.identifySpecificStrengthIndicators(tool, context, overall); + console.log(`[CONFIDENCE DEBUG] ${tool.name}:`, { + rawSemantic: Math.round(rawSemanticRelevance), + rawTaskSuitability: taskRelevance, + enhancedTaskSuitability: Math.round(enhancedTaskSuitability), + overall: Math.round(overall), + mode: context.mode + }); + return { overall: Math.round(overall), - semanticRelevance: Math.round(semanticRelevance), - taskSuitability: Math.round(taskSuitability), - methodologicalConsistency: Math.round(methodologicalConsistency), - toolReliability: Math.round(toolReliability), + semanticRelevance: Math.round(rawSemanticRelevance), + taskSuitability: Math.round(enhancedTaskSuitability), + methodologicalConsistency: 0, uncertaintyFactors, strengthIndicators }; } - private calculateCrossValidationScore(toolName: string, context: AnalysisContext): number { - // Look for entries where this tool was mentioned across different phases - const relevantEntries = context.auditTrail.filter(entry => - entry.phase === 'micro-task' || entry.phase === 'selection' - ); - - let toolMentions = 0; - let positiveEvaluations = 0; - let confidenceSum = 0; - - relevantEntries.forEach(entry => { - let toolFound = false; - - // Check various ways the tool might be referenced in output - if (entry.output && typeof entry.output === 'object') { - // Check selectedTools arrays - if (Array.isArray(entry.output.selectedTools) && - entry.output.selectedTools.includes(toolName)) { - toolFound = true; - } - - // Check finalToolNames arrays - if (Array.isArray(entry.output.finalToolNames) && - entry.output.finalToolNames.includes(toolName)) { - toolFound = true; - } - - // Check toolName in individual evaluation - if 
(entry.output.toolName === toolName) { - toolFound = true; - } - } - - if (toolFound) { - toolMentions++; - confidenceSum += entry.confidence; - - // Consider it positive if confidence >= 60 - if (entry.confidence >= 60) { - positiveEvaluations++; - } - } - }); - - console.log(`[AI PIPELINE] Cross-validation for ${toolName}: ${toolMentions} mentions, ${positiveEvaluations} positive, avg confidence: ${toolMentions > 0 ? Math.round(confidenceSum / toolMentions) : 0}`); - - if (toolMentions === 0) { - return 60; // Default when no cross-validation data available - } - - if (toolMentions === 1) { - // Single mention - use confidence directly but cap it - return Math.min(85, Math.max(40, confidenceSum)); - } - - // Multiple mentions - calculate agreement ratio - const agreementRatio = positiveEvaluations / toolMentions; - const avgConfidence = confidenceSum / toolMentions; - - // Combine agreement ratio with average confidence - const crossValidationScore = (agreementRatio * 0.7 + (avgConfidence / 100) * 0.3) * 100; - - return Math.round(Math.min(95, Math.max(30, crossValidationScore))); - } - - // NEW: Calculate tool reliability based on objective indicators - private calculateToolReliability(tool: any): number { - let reliability = 50; // Base score - - // Documentation availability - if (tool.knowledgebase === true) reliability += 25; - - // Active maintenance (hosted tools are typically maintained) - if (isToolHosted(tool)) reliability += 20; - - // Community support (open source often has community) - if (tool.license && tool.license !== 'Proprietary') reliability += 10; - - // Skill level appropriateness (not too complex, not too simple) - if (tool.skillLevel === 'intermediate' || tool.skillLevel === 'advanced') reliability += 10; - else if (tool.skillLevel === 'expert') reliability -= 5; // May be overcomplicated - - // Multi-platform support (more versatile) - if (tool.platforms && tool.platforms.length > 1) reliability += 5; - - return Math.min(100, reliability); - } - - // NEW: Identify specific uncertainty factors based on analysis private identifySpecificUncertaintyFactors(tool: any, context: AnalysisContext, limitations: string[], confidence: number): string[] { const factors: string[] = []; - // Add AI-identified limitations + // Add AI-identified limitations first (most specific) if (limitations && limitations.length > 0) { - factors.push(...limitations.slice(0, 3)); // Limit to top 3 + factors.push(...limitations.slice(0, 2)); // Limit to top 2 to leave room for others } // Low semantic similarity const similarity = context.embeddingsSimilarities.get(tool.name) || 0.5; - if (similarity < 0.4) { + if (similarity < 0.7) { factors.push('Geringe semantische Ähnlichkeit zur Anfrage - Tool-Beschreibung passt möglicherweise nicht optimal'); } - // Skill level mismatch - if (tool.skillLevel === 'expert' && /schnell|rapid|triage|urgent/i.test(context.userQuery)) { - factors.push('Experten-Tool für Eilszenario - möglicherweise zu komplex für schnelle Antworten'); + // Skill level vs scenario complexity mismatch + if (tool.skillLevel === 'expert' && /schnell|rapid|triage|urgent|sofort/i.test(context.userQuery)) { + factors.push('Experten-Tool für zeitkritisches Szenario - Setup und Einarbeitung könnten zu lange dauern'); } - if (tool.skillLevel === 'novice' && /komplex|erweitert|tiefgehend|advanced/i.test(context.userQuery)) { - factors.push('Einsteiger-Tool für komplexes Szenario - könnte funktionale Einschränkungen haben'); + if (tool.skillLevel === 'novice' && 
/komplex|erweitert|tiefgehend|advanced|forensisch/i.test(context.userQuery)) { + factors.push('Einsteiger-Tool für komplexe Analyse - könnte funktionale Limitierungen haben'); } - // Access limitations + // Platform availability concerns + if (tool.platforms && tool.platforms.length === 1 && tool.platforms[0] === 'Windows' && /linux|unix|server/i.test(context.userQuery)) { + factors.push('Nur Windows-Tool bei möglicher Linux/Server-Umgebung - Plattform-Inkompatibilität'); + } + + // Access and deployment concerns if (tool.type === 'software' && !isToolHosted(tool) && tool.accessType === 'download') { - factors.push('Installation erforderlich - nicht sofort verfügbar ohne Setup'); + factors.push('Installation und Setup erforderlich'); } - // Cross-validation disagreement - const crossValidation = this.calculateCrossValidationScore(tool.name, context); - if (crossValidation < 50) { - factors.push('Uneinheitliche Bewertung in verschiedenen Analyseschritten - Empfehlung nicht eindeutig'); + // License restrictions + if (tool.license === 'Proprietary') { + factors.push('Kommerzielle Software - Lizenzkosten und rechtliche Beschränkungen zu beachten'); } - return factors.slice(0, 4); // Limit to 4 most important factors + // Low overall confidence warning + if (confidence < 60) { + factors.push('Moderate Gesamtbewertung - alternative Ansätze sollten ebenfalls betrachtet werden'); + } + + return factors.slice(0, 4); // Limit to 4 most relevant factors } // NEW: Identify specific strength indicators private identifySpecificStrengthIndicators(tool: any, context: AnalysisContext, confidence: number): string[] { const indicators: string[] = []; - // High confidence overall - if (confidence >= this.confidenceConfig.highThreshold) { - indicators.push('Hohe Gesamtbewertung durch mehrfache Validierung'); - } - // High semantic similarity const similarity = context.embeddingsSimilarities.get(tool.name) || 0.5; if (similarity >= 0.7) { indicators.push('Sehr gute semantische Übereinstimmung mit Ihrer Anfrage'); } - // Strong cross-validation - const crossValidation = this.calculateCrossValidationScore(tool.name, context); - if (crossValidation >= 80) { - indicators.push('Konsistente Empfehlung über verschiedene Analyseschritte hinweg'); - } - // Quality indicators if (tool.knowledgebase === true) { indicators.push('Umfassende Dokumentation und Wissensbasis verfügbar'); @@ -985,7 +894,7 @@ ${JSON.stringify(conceptsToSend, null, 2)}`; const prompt = getPrompt('phaseToolSelection', context.userQuery, phase, phaseTools); - const result = await this.callMicroTaskAI(prompt, context, 800); + const result = await this.callMicroTaskAI(prompt, context, 1000); if (result.success) { const selections = this.safeParseJSON(result.content, []); @@ -998,16 +907,30 @@ ${JSON.stringify(conceptsToSend, null, 2)}`; validSelections.forEach((sel: any) => { const tool = phaseTools.find((t: any) => t.name === sel.toolName); if (tool) { - this.addToolToSelection(context, tool, phase.id, sel.priority, sel.justification); + // Ensure taskRelevance is a number + const taskRelevance = typeof sel.taskRelevance === 'number' ? 
+ sel.taskRelevance : parseInt(String(sel.taskRelevance)) || 70; + + // Derive priority automatically from score + const priority = this.derivePriorityFromScore(taskRelevance); + + this.addToolToSelection(context, tool, phase.id, priority, sel.justification, taskRelevance, sel.limitations); } }); this.addAuditEntry(context, 'micro-task', 'phase-tool-selection', { phase: phase.id, availableTools: phaseTools.length }, - { validSelections: validSelections.length, selectedTools: validSelections.map(s => s.toolName) }, + { + validSelections: validSelections.length, + selectedTools: validSelections.map(s => ({ + name: s.toolName, + taskRelevance: s.taskRelevance, + derivedPriority: this.derivePriorityFromScore(s.taskRelevance) + })) + }, validSelections.length > 0 ? 75 : 30, Date.now() - result.processingTimeMs, - { phaseName: phase.name } + { phaseName: phase.name, comparativeEvaluation: true, priorityDerived: true } ); } } @@ -1016,56 +939,46 @@ ${JSON.stringify(conceptsToSend, null, 2)}`; } private async evaluateSpecificTool(context: AnalysisContext, tool: any, rank: number): Promise { - const prompt = getPrompt('toolEvaluation', context.userQuery, tool, rank); + // Get existing task relevance from previous phase selection + const existingSelection = context.selectedTools?.find(st => st.tool.name === tool.name); + const taskRelevance = existingSelection?.taskRelevance || 70; + const priority = this.derivePriorityFromScore(taskRelevance); + + const prompt = getPrompt('toolEvaluation', context.userQuery, tool, rank, taskRelevance); - const result = await this.callMicroTaskAI(prompt, context, 1200); + const result = await this.callMicroTaskAI(prompt, context, 1000); if (result.success) { const evaluation = this.safeParseJSON(result.content, { - suitability_score: 'medium', - task_relevance: '', detailed_explanation: 'Evaluation failed', implementation_approach: '', pros: [], cons: [], - limitations: [], alternatives: '' }); - // Debug logging to see what we're getting - console.log(`[AI PIPELINE] Tool ${tool.name} evaluation:`, { - taskRelevance: evaluation.task_relevance, - suitabilityScore: evaluation.suitability_score, - limitationsCount: evaluation.limitations?.length || 0 - }); - - // Ensure task_relevance is a number - const taskRelevance = typeof evaluation.task_relevance === 'number' ? - evaluation.task_relevance : - parseInt(String(evaluation.task_relevance)) || 70; - - // Store enhanced evaluation data + // Store evaluation without re-scoring this.addToolToSelection(context, { ...tool, evaluation: { ...evaluation, - task_relevance: taskRelevance, // Ensure it's stored as number - rank + rank, + task_relevance: taskRelevance } - }, 'evaluation', evaluation.suitability_score, evaluation.detailed_explanation, - taskRelevance, evaluation.limitations); + }, 'evaluation', priority, evaluation.detailed_explanation, + taskRelevance, existingSelection?.limitations); this.addAuditEntry(context, 'micro-task', 'tool-evaluation', - { toolName: tool.name, rank }, + { toolName: tool.name, rank, existingTaskRelevance: taskRelevance, derivedPriority: priority }, { - suitabilityScore: evaluation.suitability_score, - taskRelevance: taskRelevance, // Use the cleaned number hasExplanation: !!evaluation.detailed_explanation, - limitationsIdentified: evaluation.limitations?.length || 0 + hasImplementationApproach: !!evaluation.implementation_approach, + prosCount: evaluation.pros?.length || 0, + consCount: evaluation.cons?.length || 0 }, - evaluation.suitability_score === 'high' ? 
85 : evaluation.suitability_score === 'medium' ? 70 : 50, + 70, Date.now() - result.processingTimeMs, - { toolType: tool.type, taskRelevanceScore: taskRelevance } + { toolType: tool.type, explanationOnly: true, priorityDerived: true } ); } @@ -1173,6 +1086,12 @@ ${JSON.stringify(conceptsToSend, null, 2)}`; } } + private derivePriorityFromScore(taskRelevance: number): string { + if (taskRelevance >= 80) return 'high'; + if (taskRelevance >= 60) return 'medium'; + return 'low'; + } + async processQuery(userQuery: string, mode: string): Promise { const startTime = Date.now(); let completeTasks = 0; @@ -1323,8 +1242,7 @@ ${JSON.stringify(conceptsToSend, null, 2)}`; components: { semantic: confidence.semanticRelevance, suitability: confidence.taskSuitability, - consistency: confidence.methodologicalConsistency, - reliability: confidence.toolReliability + consistency: confidence.methodologicalConsistency } }, confidence.overall,
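
Note on the simplified scoring path: the pieces changed above (the 0.5/0.5 env weights, the phase bonus in calculateEnhancedConfidence, and the new derivePriorityFromScore helper) combine into a two-component score. The following is a minimal, self-contained TypeScript sketch of that combination for reviewers of this patch; scoreTool and SimplifiedConfidence are illustrative names that do not exist in the codebase, and the defaults assume the .env.example values above rather than the in-code fallbacks.

```typescript
// Minimal sketch of the simplified two-component confidence score.
// Illustrative only; the real logic lives in
// ImprovedMicroTaskAIPipeline.calculateEnhancedConfidence / derivePriorityFromScore.

interface SimplifiedConfidence {
  overall: number;            // 0-100 weighted combination
  semanticRelevance: number;  // embeddings similarity, scaled to 0-100
  taskSuitability: number;    // AI taskRelevance, optionally phase-boosted
  priority: 'high' | 'medium' | 'low';
}

// Assumed defaults taken from .env.example in this patch.
const SEMANTIC_WEIGHT = parseFloat(process.env.CONFIDENCE_SEMANTIC_WEIGHT || '0.5');
const SUITABILITY_WEIGHT = parseFloat(process.env.CONFIDENCE_SUITABILITY_WEIGHT || '0.5');

// Same thresholds as derivePriorityFromScore: applied to the raw AI score.
function derivePriority(taskRelevance: number): 'high' | 'medium' | 'low' {
  if (taskRelevance >= 80) return 'high';
  if (taskRelevance >= 60) return 'medium';
  return 'low';
}

function scoreTool(
  embeddingSimilarity: number | undefined, // cosine similarity in [0, 1], if known
  taskRelevance: number,                   // 0-100 from the phase-selection prompt
  phaseMatches: boolean                    // workflow mode: tool's phases include the assigned phase
): SimplifiedConfidence {
  // Neutral fallback of 50 when no embeddings similarity is available.
  const semanticRelevance = embeddingSimilarity !== undefined ? embeddingSimilarity * 100 : 50;

  // Phase bonus mirrors the patch: at most +15, capped at 100.
  const taskSuitability = phaseMatches
    ? Math.min(100, taskRelevance + Math.min(15, 100 - taskRelevance))
    : taskRelevance;

  const overall = Math.round(
    semanticRelevance * SEMANTIC_WEIGHT + taskSuitability * SUITABILITY_WEIGHT
  );

  return {
    overall,
    semanticRelevance: Math.round(semanticRelevance),
    taskSuitability: Math.round(taskSuitability),
    priority: derivePriority(taskRelevance),
  };
}

// Example: similarity 0.82, AI relevance 75, phase match -> suitability 90, overall 86, priority 'medium'.
console.log(scoreTool(0.82, 75, true));
```

With equal 0.5 weights, a perfect score on one component still needs at least 60 on the other to reach the 80-point high-confidence band, so neither embeddings similarity nor the AI's taskRelevance can carry a recommendation on its own.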