confidence updates, content adjustment

2025-08-05 21:35:38 +02:00
parent 27e64f05ca
commit fe1be323bb
5 changed files with 160 additions and 270 deletions
--- a/.env.example
+++ b/.env.example
@@ -117,7 +117,7 @@ AI_MAX_CONTEXT_TOKENS=4000
 # Maximum tokens per individual AI prompt
 # Larger = more context per call | Smaller = faster responses
-AI_MAX_PROMPT_TOKENS=1500
+AI_MAX_PROMPT_TOKENS=2500
 # ============================================================================
 # 6. AUTHENTICATION & AUTHORIZATION (OPTIONAL)
@@ -190,18 +190,16 @@ FORENSIC_AUDIT_RETENTION_HOURS=24
 FORENSIC_AUDIT_MAX_ENTRIES=50
 # ============================================================================
-# 10. ENHANCED CONFIDENCE SCORING SYSTEM
+# 10. SIMPLIFIED CONFIDENCE SCORING SYSTEM  
 # ============================================================================
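 # Confidence component weights (semantic + suitability must sum to 1.0; consistency and reliability weights are fixed to 0 in aiPipeline.ts and ignored)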
-CONFIDENCE_SEMANTIC_WEIGHT=0.25        # Weight for vector similarity quality  
+CONFIDENCE_SEMANTIC_WEIGHT=0.5         # Weight for vector similarity quality  
-CONFIDENCE_SUITABILITY_WEIGHT=0.4      # Weight for AI-determined task fitness
+CONFIDENCE_SUITABILITY_WEIGHT=0.5      # Weight for AI-determined task fitness
 CONFIDENCE_CONSISTENCY_WEIGHT=0.2      # Weight for cross-validation agreement  
 CONFIDENCE_RELIABILITY_WEIGHT=0.15     # Weight for tool quality indicators
 # Confidence thresholds (0-100)
-CONFIDENCE_MINIMUM_THRESHOLD=40        # Below this = weak recommendation
+CONFIDENCE_MINIMUM_THRESHOLD=50        # Below this = weak recommendation
-CONFIDENCE_MEDIUM_THRESHOLD=60         # 40-59 = weak, 60-79 = moderate  
+CONFIDENCE_MEDIUM_THRESHOLD=70         # 50-69 = weak, 70-79 = moderate
 CONFIDENCE_HIGH_THRESHOLD=80           # 80+ = strong recommendation
 # ============================================================================
--- a/src/components/AIQueryInterface.astro
+++ b/src/components/AIQueryInterface.astro
@@ -756,17 +756,14 @@ class AIQueryInterface {
  renderConfidenceTooltip(confidence) {
    if (!confidence || typeof confidence.overall !== 'number') {
      console.log('[AI DEBUG] No confidence data or invalid format:', confidence);
      return '';
    }
    const confidenceColor = confidence.overall >= 80 ? 'var(--color-accent)' : 
-                           confidence.overall >= 60 ? 'var(--color-warning)' : 'var(--color-error)';
+                          confidence.overall >= 60 ? 'var(--color-warning)' : 'var(--color-error)';
    const tooltipId = `tooltip-${Math.random().toString(36).substr(2, 9)}`;
    console.log(`[AI DEBUG] Generating confidence tooltip: ${confidence.overall}% with ID ${tooltipId}`);
    return `
      <span class="confidence-tooltip-trigger" 
            style="display: inline-flex; align-items: center; gap: 0.125rem; cursor: help; margin-left: 0.25rem;"
@@ -789,7 +786,7 @@ class AIQueryInterface {
                <strong style="color: var(--color-accent);">${confidence.semanticRelevance}%</strong>
              </div>
              <div style="font-size: 0.625rem; color: var(--color-text-secondary); line-height: 1.3;">
-                Wie gut die Tool-Beschreibung semantisch zu Ihrer Anfrage passt (basierend auf Vektor-Ähnlichkeit)
+                Wie gut die Tool-Beschreibung semantisch zu Ihrer Anfrage passt (Vektor-Ähnlichkeit)
              </div>
            </div>
@@ -802,26 +799,6 @@ class AIQueryInterface {
                KI-bewertete Eignung des Tools für Ihre spezifische forensische Aufgabenstellung
              </div>
            </div>
            <div style="background: var(--color-bg-secondary); padding: 0.5rem; border-radius: 0.375rem; border-left: 3px solid var(--color-warning);">
              <div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 0.25rem;">
                <span style="font-weight: 600; font-size: 0.6875rem;">🤝 Methodische Konsistenz</span>
                <strong style="color: var(--color-warning);">${confidence.methodologicalConsistency}%</strong>
              </div>
              <div style="font-size: 0.625rem; color: var(--color-text-secondary); line-height: 1.3;">
                Wie einheitlich verschiedene Analyseschritte dieses Tool bewerten (Kreuzvalidierung)
              </div>
            </div>
            <div style="background: var(--color-bg-secondary); padding: 0.5rem; border-radius: 0.375rem; border-left: 3px solid var(--color-text-secondary);">
              <div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 0.25rem;">
                <span style="font-weight: 600; font-size: 0.6875rem;">🔧 Tool-Zuverlässigkeit</span>
                <strong style="color: var(--color-text);">${confidence.toolReliability}%</strong>  
              </div>
              <div style="font-size: 0.625rem; color: var(--color-text-secondary); line-height: 1.3;">
                Qualitätsindikatoren: Dokumentation, Wartung, Verfügbarkeit und Benutzerfreundlichkeit
              </div>
            </div>
          </div>
          ${confidence.strengthIndicators && confidence.strengthIndicators.length > 0 ? `
@@ -847,7 +824,7 @@ class AIQueryInterface {
          ` : ''}
          <div style="margin-top: 0.75rem; padding-top: 0.75rem; border-top: 1px solid var(--color-border); font-size: 0.625rem; color: var(--color-text-secondary); text-align: center;">
-            Mehrstufige KI-Analyse mit Kreuzvalidierung
+            Forensisch fundierte KI-Analyse
          </div>
        </div>
      </span>
--- a/src/config/prompts.ts
+++ b/src/config/prompts.ts
@@ -120,61 +120,60 @@ ${aspects}
 WICHTIG: Antworten Sie NUR in fließendem deutschen Text ohne Listen oder Markdown. Maximum 120 Wörter.`;
  },
  // Phase tool selection prompt
  phaseToolSelection: (userQuery: string, phase: any, phaseTools: any[]) => {
-    return `Wählen Sie 2-3 Methoden/Tools für die Phase "${phase.name}" basierend auf objektiven, fallbezogenen Kriterien.
+    return `Wählen Sie 2-3 Methoden/Tools für die Phase "${phase.name}" und bewerten Sie deren Aufgaben-Eignung VERGLEICHEND.
 SZENARIO: "${userQuery}"
 SPEZIFISCHE PHASE: ${phase.name} - ${phase.description || 'Forensische Untersuchungsphase'}
 VERFÜGBARE TOOLS FÜR ${phase.name.toUpperCase()}:
-${phaseTools.map((tool: any) => `- ${tool.name}: ${tool.description.slice(0, 100)}...`).join('\n')}
+${phaseTools.map((tool: any, index: number) => `${index + 1}. ${tool.name}: ${tool.description.slice(0, 150)}...
  - Plattformen: ${tool.platforms?.join(', ') || 'N/A'}
  - Skill Level: ${tool.skillLevel}
  - Tags: ${tool.tags?.join(', ') || 'N/A'}`).join('\n\n')}
-Wählen Sie Methoden/Tools nach forensischen Kriterien aus:
+Bewerten Sie ALLE Tools vergleichend für diese spezifische Aufgabe UND Phase. Wählen Sie die 2-3 besten aus.
 - Court admissibility und Chain of Custody Kompatibilität  
 - Integration in forensische Standard-Workflows
 - Reproduzierbarkeit und Dokumentationsqualität
 - Objektivität
-Antworten Sie AUSSCHLIESSLICH mit diesem JSON-Format (kein zusätzlicher Text):
+BEWERTUNGSKRITERIEN:
 - Wie gut löst das Tool das forensische Problem im SZENARIO-Kontext?
 - Wie gut passt es zur spezifischen PHASE "${phase.name}"?
 - Wie vergleicht es sich mit den anderen verfügbaren Tools für diese Phase?
 Antworten Sie AUSSCHLIESSLICH mit diesem JSON-Format:
 [
  {
-    "toolName": "Exakter Methoden/Tool-Name",
+    "toolName": "Exakter Tool-Name",
-    "priority": "high|medium|low", 
+    "taskRelevance": 85,
-    "justification": "Objektive Begründung warum diese Methode/Tool für das spezifische Szenario besser geeignet ist"
+    "justification": "Vergleichende Begründung warum dieses Tool für diese Phase und Aufgabe besser/schlechter als die anderen geeignet ist",
    "limitations": ["Spezifische Einschränkung 1", "Einschränkung 2"]
  }
-]`;
+]
  },
  // Tool evaluation prompt
  toolEvaluation: (userQuery: string, tool: any, rank: number) => {
  return `Sie sind ein DFIR-Experte und bewerten ein forensisches Tool für eine spezifische Aufgabe.
 PROBLEM: "${userQuery}"
 TOOL: ${tool.name}
 BESCHREIBUNG: ${tool.description}
 PLATTFORMEN: ${tool.platforms?.join(', ') || 'N/A'}
 SKILL LEVEL: ${tool.skillLevel}
 DOMAINS: ${tool.domains?.join(', ') || 'N/A'}
 TAGS: ${tool.tags?.join(', ') || 'N/A'}
 Bewerten Sie nach forensischen Standards und antworten Sie AUSSCHLIESSLICH mit diesem JSON-Format:
 {
  "suitability_score": "high|medium|low",
  "task_relevance": 85,
  "detailed_explanation": "Detaillierte forensische Begründung warum diese Methode/Tool das Problem löst",
  "implementation_approach": "Konkrete methodische Schritte zur korrekten Anwendung für dieses spezifische Problem",
  "pros": ["Forensischer Vorteil 1", "Validierter Vorteil 2"],
  "cons": ["Methodische Limitation 1", "Potenzielle Schwäche 2"],
  "limitations": ["Spezifische Einschränkung 1", "Mögliche Problematik 2"],
  "alternatives": "Alternative Ansätze falls diese Methode/Tool nicht optimal ist"
 }
 WICHTIG:
- task_relevance: Numerischer Wert 0-100 wie gut das Tool für DIESE SPEZIFISCHE Aufgabe geeignet ist
+- taskRelevance: 0-100 Score basierend auf Szenario-Eignung UND Phasen-Passung im VERGLEICH zu anderen Tools
- limitations: Konkrete Einschränkungen oder Situationen wo das Tool NICHT optimal wäre
+- Nur die 2-3 BESTEN Tools auswählen und bewerten
- Berücksichtigen Sie den Skill Level vs. Anfrage-Komplexität
+- justification soll VERGLEICHEND sein ("besser als X weil...", "für diese Phase ideal weil...")`;
- Bewerten Sie objektiv, nicht beschönigend`;
+  },
  toolEvaluation: (userQuery: string, tool: any, rank: number, taskRelevance: number) => {
    return `Sie sind ein DFIR-Experte. Erklären Sie DETAILLIERT die Anwendung dieses bereits bewerteten Tools.
 PROBLEM: "${userQuery}"
 TOOL: ${tool.name} (bereits bewertet mit ${taskRelevance}% Aufgaben-Eignung)
 BESCHREIBUNG: ${tool.description}
 Das Tool wurde bereits als Rang ${rank} für diese Aufgabe bewertet. Erklären Sie nun:
 Antworten Sie AUSSCHLIESSLICH mit diesem JSON-Format:
 {
  "detailed_explanation": "Detaillierte Erklärung warum und wie dieses Tool für diese spezifische Aufgabe eingesetzt wird",
  "implementation_approach": "Konkrete Schritt-für-Schritt Anleitung zur korrekten Anwendung",
  "pros": ["Spezifischer Vorteil 1", "Spezifischer Vorteil 2"],
  "cons": ["Bekannte Limitation 1", "Bekannte Limitation 2"],
  "alternatives": "Alternative Ansätze oder Tools falls dieses nicht verfügbar ist"
 }
 WICHTIG: Keine erneute Bewertung - nur detaillierte Erklärung der bereits bewerteten Eignung.`;
  },
  // Background knowledge selection prompt
@@ -229,7 +228,7 @@ export function getPrompt(key: 'scenarioAnalysis', isWorkflow: boolean, userQuer
 export function getPrompt(key: 'investigationApproach', isWorkflow: boolean, userQuery: string): string;
 export function getPrompt(key: 'criticalConsiderations', isWorkflow: boolean, userQuery: string): string;
 export function getPrompt(key: 'phaseToolSelection', userQuery: string, phase: any, phaseTools: any[]): string;
-export function getPrompt(key: 'toolEvaluation', userQuery: string, tool: any, rank: number): string;
+export function getPrompt(key: 'toolEvaluation', userQuery: string, tool: any, rank: number, taskRelevance: number): string;
 export function getPrompt(key: 'backgroundKnowledgeSelection', userQuery: string, mode: string, selectedToolNames: string[], availableConcepts: any[]): string;
 export function getPrompt(key: 'finalRecommendations', isWorkflow: boolean, userQuery: string, selectedToolNames: string[]): string;
 export function getPrompt(promptKey: keyof typeof AI_PROMPTS, ...args: any[]): string {
--- a/src/data/tools.yaml
+++ b/src/data/tools.yaml
@@ -3975,7 +3975,7 @@ tools:
  - name: KAPE
    type: software
    description: >-
-      Kroll Artifact Parser and Extractor revolutioniert Windows-Forensik durch 
+      Kroll Artifact Parser and Extractor versucht sich an Windows-Forensik durch 
      intelligente Ziel-basierte Sammlung. Statt Full-Disk-Images extrahiert
      KAPE  gezielt kritische Artefakte: Registry-Hives, Event-Logs, Prefetch,
      Browser- Daten, Scheduled-Tasks in Minuten statt Stunden. Die Target-Files
@@ -3983,12 +3983,10 @@ tools:
      Besonders clever:  Compound-Targets gruppieren zusammengehörige Artefakte
      (z.B. "Browser" sammelt  Chrome+Firefox+Edge), die gKAPE-GUI macht es auch
      für Nicht-Techniker  zugänglich. Batch-Mode verarbeitet mehrere Images
-      parallel. Output direkt  kompatibel zu Timeline-Tools wie Plaso. Die
+      parallel. Output direkt  kompatibel zu Timeline-Tools wie Plaso. 
      ständigen Community-Updates  halten mit Windows-Entwicklungen Schritt.
      VSS-Processing analysiert Shadow- Copies automatisch. Der
-      Remote-Collection-Mode sammelt über Netzwerk.  Kostenlos aber
+      Remote-Collection-Mode sammelt über Netzwerk.  Kostenlos (mit Registrierung) aber
-      Enterprise-Support verfügbar. Der neue Standard für effiziente 
+      Enterprise-Support verfügbar.
      Windows-Forensik-Triage.
    skillLevel: intermediate
    url: https://www.kroll.com/kape
    icon: 🧰
@@ -4003,7 +4001,7 @@ tools:
    platforms:
      - Windows
    accessType: download
-    license: Freeware
+    license: Proprietary
    knowledgebase: false
  - name: Kibana
    type: software
--- a/src/utils/aiPipeline.ts
+++ b/src/utils/aiPipeline.ts
@@ -72,7 +72,6 @@ interface ConfidenceMetrics {
  semanticRelevance: number;          // How well tool description matches query (from embeddings)
  taskSuitability: number;           // AI-determined fitness for this specific task  
  methodologicalConsistency: number; // How well different analysis steps agree
  toolReliability: number;           // Indicators of tool quality and maintenance
  uncertaintyFactors: string[];      // Specific reasons why this might not work
  strengthIndicators: string[];      // Specific reasons why this is a good choice
 }
@@ -146,17 +145,17 @@ class ImprovedMicroTaskAIPipeline {
    // Updated confidence weights - more focused on AI evaluation
    this.confidenceConfig = {
-      semanticWeight: parseFloat(process.env.CONFIDENCE_SEMANTIC_WEIGHT || '0.25'),     // Embeddings similarity
+      semanticWeight: parseFloat(process.env.CONFIDENCE_SEMANTIC_WEIGHT || '0.3'),     // Embeddings similarity
-      suitabilityWeight: parseFloat(process.env.CONFIDENCE_SUITABILITY_WEIGHT || '0.4'), // AI task fit evaluation  
+      suitabilityWeight: parseFloat(process.env.CONFIDENCE_SUITABILITY_WEIGHT || '0.7'), // AI task fit evaluation  
-      consistencyWeight: parseFloat(process.env.CONFIDENCE_CONSISTENCY_WEIGHT || '0.2'), // Cross-validation agreement
+      consistencyWeight: 0,    
-      reliabilityWeight: parseFloat(process.env.CONFIDENCE_RELIABILITY_WEIGHT || '0.15'), // Tool quality indicators
+      reliabilityWeight: 0,
      minimumThreshold: parseInt(process.env.CONFIDENCE_MINIMUM_THRESHOLD || '40', 10),
      mediumThreshold: parseInt(process.env.CONFIDENCE_MEDIUM_THRESHOLD || '60', 10),
      highThreshold: parseInt(process.env.CONFIDENCE_HIGH_THRESHOLD || '80', 10)
    };
-    
+
-    console.log('[AI PIPELINE] Enhanced confidence scoring enabled:', {
+    console.log('[AI PIPELINE] Simplified confidence scoring enabled:', {
-      weights: `Semantic:${this.confidenceConfig.semanticWeight} Suitability:${this.confidenceConfig.suitabilityWeight} Consistency:${this.confidenceConfig.consistencyWeight} Reliability:${this.confidenceConfig.reliabilityWeight}`,
+      weights: `Semantic:${this.confidenceConfig.semanticWeight} Suitability:${this.confidenceConfig.suitabilityWeight}`,
      thresholds: `${this.confidenceConfig.minimumThreshold}/${this.confidenceConfig.mediumThreshold}/${this.confidenceConfig.highThreshold}`
    });
  }
@@ -709,197 +708,107 @@ ${JSON.stringify(conceptsToSend, null, 2)}`;
  ): ConfidenceMetrics {
    // 1. Semantic Relevance: Real embeddings similarity score
-    const semanticRelevance = context.embeddingsSimilarities.has(tool.name) ? 
+    const rawSemanticRelevance = context.embeddingsSimilarities.has(tool.name) ? 
-      Math.round(context.embeddingsSimilarities.get(tool.name)! * 100) : 50;
+      context.embeddingsSimilarities.get(tool.name)! * 100 : 50;
-    // 2. Task Suitability: AI-determined fitness for specific task
+    // 2. Task Suitability: Enhanced with phase-awareness for workflow mode
-    const taskSuitability = Math.round(taskRelevance);
+    let enhancedTaskSuitability = taskRelevance;
-    // 3. Methodological Consistency: Cross-validation between micro-tasks
+    if (context.mode === 'workflow') {
-    const methodologicalConsistency = this.calculateCrossValidationScore(tool.name, context);
+      // In workflow mode, boost score if tool is well-matched to its assigned phase
      const toolSelection = context.selectedTools?.find(st => st.tool.name === tool.name);
      if (toolSelection && tool.phases && tool.phases.includes(toolSelection.phase)) {
        // Boost for phase alignment (but cap at 100)
        const phaseBonus = Math.min(15, 100 - taskRelevance);
        enhancedTaskSuitability = Math.min(100, taskRelevance + phaseBonus);
        console.log(`[CONFIDENCE] Phase bonus for ${tool.name}: ${taskRelevance} -> ${enhancedTaskSuitability} (phase: ${toolSelection.phase})`);
      }
    }
-    // 4. Tool Reliability: Quality indicators
+    // Simple weighted combination - no artificial scaling
    const toolReliability = this.calculateToolReliability(tool);
    // Debug logging
    console.log(`[CONFIDENCE DEBUG] ${tool.name}:`, {
      semantic: semanticRelevance,
      taskSuitability: taskSuitability,
      consistency: methodologicalConsistency,
      reliability: toolReliability,
      hasEmbeddingsSimilarity: context.embeddingsSimilarities.has(tool.name),
      rawTaskRelevance: taskRelevance
    });
    // Calculate weighted overall score
    const overall = (
-      semanticRelevance * this.confidenceConfig.semanticWeight +
+      rawSemanticRelevance * this.confidenceConfig.semanticWeight +
-      taskSuitability * this.confidenceConfig.suitabilityWeight +
+      enhancedTaskSuitability * this.confidenceConfig.suitabilityWeight
      methodologicalConsistency * this.confidenceConfig.consistencyWeight +
      toolReliability * this.confidenceConfig.reliabilityWeight
    );
    const uncertaintyFactors = this.identifySpecificUncertaintyFactors(tool, context, limitations, overall);
    const strengthIndicators = this.identifySpecificStrengthIndicators(tool, context, overall);
    console.log(`[CONFIDENCE DEBUG] ${tool.name}:`, {
      rawSemantic: Math.round(rawSemanticRelevance),
      rawTaskSuitability: taskRelevance,
      enhancedTaskSuitability: Math.round(enhancedTaskSuitability),
      overall: Math.round(overall),
      mode: context.mode
    });
    return {
      overall: Math.round(overall),
-      semanticRelevance: Math.round(semanticRelevance),
+      semanticRelevance: Math.round(rawSemanticRelevance),
-      taskSuitability: Math.round(taskSuitability), 
+      taskSuitability: Math.round(enhancedTaskSuitability), 
-      methodologicalConsistency: Math.round(methodologicalConsistency),
+      methodologicalConsistency: 0,
      toolReliability: Math.round(toolReliability),
      uncertaintyFactors,
      strengthIndicators
    };
  }
  /**
   * Scores how consistently a tool was judged across audit-trail entries.
   *
   * Only entries whose phase is 'micro-task' or 'selection' are inspected. An
   * entry counts as a mention when its object output lists the tool name in
   * `selectedTools` or `finalToolNames`, or carries it as `toolName`.
   *
   * @param toolName - Name of the tool to look up in the audit trail.
   * @param context - Analysis context providing `auditTrail`.
   * @returns 60 when the tool is never mentioned; for a single mention that
   *   entry's confidence clamped to 40..85; otherwise a rounded blend of the
   *   share of mentions with confidence >= 60 (70%) and the mean confidence
   *   (30%), clamped to 30..95.
   */
  private calculateCrossValidationScore(toolName: string, context: AnalysisContext): number {
    const candidates = context.auditTrail.filter(
      entry => entry.phase === 'micro-task' || entry.phase === 'selection'
    );

    let mentionCount = 0;
    let positiveCount = 0;
    let confidenceTotal = 0;

    for (const entry of candidates) {
      const output = entry.output;
      // Non-object outputs cannot reference a tool in any recognised shape.
      if (!output || typeof output !== 'object') continue;

      const referenced =
        (Array.isArray(output.selectedTools) && output.selectedTools.includes(toolName)) ||
        (Array.isArray(output.finalToolNames) && output.finalToolNames.includes(toolName)) ||
        output.toolName === toolName;
      if (!referenced) continue;

      mentionCount++;
      confidenceTotal += entry.confidence;
      // A mention is considered positive from a confidence of 60 upwards.
      if (entry.confidence >= 60) {
        positiveCount++;
      }
    }

    console.log(`[AI PIPELINE] Cross-validation for ${toolName}: ${mentionCount} mentions, ${positiveCount} positive, avg confidence: ${mentionCount > 0 ? Math.round(confidenceTotal / mentionCount) : 0}`);

    // No data at all: fall back to a neutral default.
    if (mentionCount === 0) return 60;

    // One mention: take its confidence as-is, but keep it within 40..85.
    if (mentionCount === 1) return Math.min(85, Math.max(40, confidenceTotal));

    // Several mentions: weight agreement more heavily than raw confidence.
    const agreementRatio = positiveCount / mentionCount;
    const avgConfidence = confidenceTotal / mentionCount;
    const blended = (agreementRatio * 0.7 + (avgConfidence / 100) * 0.3) * 100;

    return Math.round(Math.min(95, Math.max(30, blended)));
  }
  /**
   * Derives a reliability score for a tool from its catalogue metadata.
   *
   * Starts from a base of 50 and applies fixed adjustments:
   * +25 when `knowledgebase` is true, +20 when `isToolHosted(tool)` holds,
   * +10 when a license is set and is not 'Proprietary', +10 for skill level
   * 'intermediate' or 'advanced' (-5 for 'expert'), and +5 when more than
   * one platform is listed.
   *
   * @param tool - Tool record to inspect.
   * @returns The adjusted score, capped at 100.
   */
  private calculateToolReliability(tool: any): number {
    const isMidSkill = tool.skillLevel === 'intermediate' || tool.skillLevel === 'advanced';
    const skillAdjustment = isMidSkill ? 10 : tool.skillLevel === 'expert' ? -5 : 0;

    const adjustments = [
      tool.knowledgebase === true ? 25 : 0,
      isToolHosted(tool) ? 20 : 0,
      tool.license && tool.license !== 'Proprietary' ? 10 : 0,
      skillAdjustment,
      tool.platforms && tool.platforms.length > 1 ? 5 : 0
    ];

    // 50 is the base score every tool starts with.
    const score = adjustments.reduce((sum, delta) => sum + delta, 50);

    return Math.min(100, score);
  }
  // NEW: Identify specific uncertainty factors based on analysis
  private identifySpecificUncertaintyFactors(tool: any, context: AnalysisContext, limitations: string[], confidence: number): string[] {
    const factors: string[] = [];
-    // Add AI-identified limitations
+    // Add AI-identified limitations first (most specific)
    if (limitations && limitations.length > 0) {
-      factors.push(...limitations.slice(0, 3)); // Limit to top 3
+      factors.push(...limitations.slice(0, 2)); // Limit to top 2 to leave room for others
    }
    // Low semantic similarity
    const similarity = context.embeddingsSimilarities.get(tool.name) || 0.5;
-    if (similarity < 0.4) {
+    if (similarity < 0.7) {
      factors.push('Geringe semantische Ähnlichkeit zur Anfrage - Tool-Beschreibung passt möglicherweise nicht optimal');
    }
-    // Skill level mismatch
+    // Skill level vs scenario complexity mismatch
-    if (tool.skillLevel === 'expert' && /schnell|rapid|triage|urgent/i.test(context.userQuery)) {
+    if (tool.skillLevel === 'expert' && /schnell|rapid|triage|urgent|sofort/i.test(context.userQuery)) {
-      factors.push('Experten-Tool für Eilszenario - möglicherweise zu komplex für schnelle Antworten');
+      factors.push('Experten-Tool für zeitkritisches Szenario - Setup und Einarbeitung könnten zu lange dauern');
    }
-    if (tool.skillLevel === 'novice' && /komplex|erweitert|tiefgehend|advanced/i.test(context.userQuery)) {
+    if (tool.skillLevel === 'novice' && /komplex|erweitert|tiefgehend|advanced|forensisch/i.test(context.userQuery)) {
-      factors.push('Einsteiger-Tool für komplexes Szenario - könnte funktionale Einschränkungen haben');
+      factors.push('Einsteiger-Tool für komplexe Analyse - könnte funktionale Limitierungen haben');
    }
-    // Access limitations
+    // Platform availability concerns
    if (tool.platforms && tool.platforms.length === 1 && tool.platforms[0] === 'Windows' && /linux|unix|server/i.test(context.userQuery)) {
      factors.push('Nur Windows-Tool bei möglicher Linux/Server-Umgebung - Plattform-Inkompatibilität');
    }
    // Access and deployment concerns
    if (tool.type === 'software' && !isToolHosted(tool) && tool.accessType === 'download') {
-      factors.push('Installation erforderlich - nicht sofort verfügbar ohne Setup');
+      factors.push('Installation und Setup erforderlich');
    }
-    // Cross-validation disagreement
+    // License restrictions
-    const crossValidation = this.calculateCrossValidationScore(tool.name, context);
+    if (tool.license === 'Proprietary') {
-    if (crossValidation < 50) {
+      factors.push('Kommerzielle Software - Lizenzkosten und rechtliche Beschränkungen zu beachten');
      factors.push('Uneinheitliche Bewertung in verschiedenen Analyseschritten - Empfehlung nicht eindeutig');
    }
-    return factors.slice(0, 4); // Limit to 4 most important factors
+    // Low overall confidence warning
    if (confidence < 60) {
      factors.push('Moderate Gesamtbewertung - alternative Ansätze sollten ebenfalls betrachtet werden');
    }
    return factors.slice(0, 4); // Limit to 4 most relevant factors
  }
  // NEW: Identify specific strength indicators
  private identifySpecificStrengthIndicators(tool: any, context: AnalysisContext, confidence: number): string[] {
    const indicators: string[] = [];
    // High confidence overall
    if (confidence >= this.confidenceConfig.highThreshold) {
      indicators.push('Hohe Gesamtbewertung durch mehrfache Validierung');
    }
    // High semantic similarity
    const similarity = context.embeddingsSimilarities.get(tool.name) || 0.5;
    if (similarity >= 0.7) {
      indicators.push('Sehr gute semantische Übereinstimmung mit Ihrer Anfrage');
    }
    // Strong cross-validation
    const crossValidation = this.calculateCrossValidationScore(tool.name, context);
    if (crossValidation >= 80) {
      indicators.push('Konsistente Empfehlung über verschiedene Analyseschritte hinweg');
    }
    // Quality indicators
    if (tool.knowledgebase === true) {
      indicators.push('Umfassende Dokumentation und Wissensbasis verfügbar');
@@ -985,7 +894,7 @@ ${JSON.stringify(conceptsToSend, null, 2)}`;
    const prompt = getPrompt('phaseToolSelection', context.userQuery, phase, phaseTools);
-    const result = await this.callMicroTaskAI(prompt, context, 800);
+    const result = await this.callMicroTaskAI(prompt, context, 1000);
    if (result.success) {
      const selections = this.safeParseJSON(result.content, []);
@@ -998,16 +907,30 @@ ${JSON.stringify(conceptsToSend, null, 2)}`;
        validSelections.forEach((sel: any) => {
          const tool = phaseTools.find((t: any) => t.name === sel.toolName);
          if (tool) {
-            this.addToolToSelection(context, tool, phase.id, sel.priority, sel.justification);
+            // Ensure taskRelevance is a number
            const taskRelevance = typeof sel.taskRelevance === 'number' ? 
              sel.taskRelevance : parseInt(String(sel.taskRelevance)) || 70;
            // Derive priority automatically from score
            const priority = this.derivePriorityFromScore(taskRelevance);
            this.addToolToSelection(context, tool, phase.id, priority, sel.justification, taskRelevance, sel.limitations);
          }
        });
        this.addAuditEntry(context, 'micro-task', 'phase-tool-selection',
          { phase: phase.id, availableTools: phaseTools.length },
-          { validSelections: validSelections.length, selectedTools: validSelections.map(s => s.toolName) },
+          { 
            validSelections: validSelections.length, 
            selectedTools: validSelections.map(s => ({ 
              name: s.toolName, 
              taskRelevance: s.taskRelevance, 
              derivedPriority: this.derivePriorityFromScore(s.taskRelevance) 
            }))
          },
          validSelections.length > 0 ? 75 : 30,
          Date.now() - result.processingTimeMs,
-          { phaseName: phase.name }
+          { phaseName: phase.name, comparativeEvaluation: true, priorityDerived: true }
        );
      }
    }
@@ -1016,56 +939,46 @@ ${JSON.stringify(conceptsToSend, null, 2)}`;
  }
  private async evaluateSpecificTool(context: AnalysisContext, tool: any, rank: number): Promise<MicroTaskResult> {
-    const prompt = getPrompt('toolEvaluation', context.userQuery, tool, rank);
+    // Get existing task relevance from previous phase selection
    const existingSelection = context.selectedTools?.find(st => st.tool.name === tool.name);
    const taskRelevance = existingSelection?.taskRelevance || 70;
    const priority = this.derivePriorityFromScore(taskRelevance);
    const prompt = getPrompt('toolEvaluation', context.userQuery, tool, rank, taskRelevance);
-    const result = await this.callMicroTaskAI(prompt, context, 1200);
+    const result = await this.callMicroTaskAI(prompt, context, 1000);
    if (result.success) {
      const evaluation = this.safeParseJSON(result.content, {
        suitability_score: 'medium',
        task_relevance: '',
        detailed_explanation: 'Evaluation failed',
        implementation_approach: '',
        pros: [],
        cons: [],
        limitations: [],
        alternatives: ''
      });
-      // Debug logging to see what we're getting
+      // Store evaluation without re-scoring
      console.log(`[AI PIPELINE] Tool ${tool.name} evaluation:`, {
        taskRelevance: evaluation.task_relevance,
        suitabilityScore: evaluation.suitability_score,
        limitationsCount: evaluation.limitations?.length || 0
      });
      // Ensure task_relevance is a number
      const taskRelevance = typeof evaluation.task_relevance === 'number' ? 
        evaluation.task_relevance : 
        parseInt(String(evaluation.task_relevance)) || 70;
      // Store enhanced evaluation data
      this.addToolToSelection(context, {
        ...tool,
        evaluation: {
          ...evaluation,
-          task_relevance: taskRelevance, // Ensure it's stored as number
+          rank,
-          rank
+          task_relevance: taskRelevance
        }
-      }, 'evaluation', evaluation.suitability_score, evaluation.detailed_explanation, 
+      }, 'evaluation', priority, evaluation.detailed_explanation, 
-      taskRelevance, evaluation.limitations);
+      taskRelevance, existingSelection?.limitations);
      this.addAuditEntry(context, 'micro-task', 'tool-evaluation',
-        { toolName: tool.name, rank },
+        { toolName: tool.name, rank, existingTaskRelevance: taskRelevance, derivedPriority: priority },
        { 
          suitabilityScore: evaluation.suitability_score, 
          taskRelevance: taskRelevance, // Use the cleaned number
          hasExplanation: !!evaluation.detailed_explanation,
-          limitationsIdentified: evaluation.limitations?.length || 0
+          hasImplementationApproach: !!evaluation.implementation_approach,
          prosCount: evaluation.pros?.length || 0,
          consCount: evaluation.cons?.length || 0
        },
-        evaluation.suitability_score === 'high' ? 85 : evaluation.suitability_score === 'medium' ? 70 : 50,
+        70,
        Date.now() - result.processingTimeMs,
-        { toolType: tool.type, taskRelevanceScore: taskRelevance }
+        { toolType: tool.type, explanationOnly: true, priorityDerived: true }
      );
    }
@@ -1173,6 +1086,12 @@ ${JSON.stringify(conceptsToSend, null, 2)}`;
    }
  }
  /**
   * Maps a numeric task-relevance score to a priority label.
   *
   * @param taskRelevance - Relevance score to classify.
   * @returns 'high' from 80 upwards, 'medium' from 60 upwards, otherwise 'low'.
   */
  private derivePriorityFromScore(taskRelevance: number): string {
    return taskRelevance >= 80 ? 'high' : taskRelevance >= 60 ? 'medium' : 'low';
  }
  async processQuery(userQuery: string, mode: string): Promise<AnalysisResult> {
    const startTime = Date.now();
    let completeTasks = 0;
@@ -1323,8 +1242,7 @@ ${JSON.stringify(conceptsToSend, null, 2)}`;
            components: {
              semantic: confidence.semanticRelevance,
              suitability: confidence.taskSuitability,
-              consistency: confidence.methodologicalConsistency,
+              consistency: confidence.methodologicalConsistency
              reliability: confidence.toolReliability
            }
          },
          confidence.overall,