confidence updates, content adjustment

2025-08-05 21:35:38 +02:00
parent 27e64f05ca
commit fe1be323bb
5 changed files with 160 additions and 270 deletions
--- a/src/utils/aiPipeline.ts
+++ b/src/utils/aiPipeline.ts
@@ -72,7 +72,6 @@ interface ConfidenceMetrics {
  semanticRelevance: number;          // How well tool description matches query (from embeddings)
  taskSuitability: number;           // AI-determined fitness for this specific task  
  methodologicalConsistency: number; // How well different analysis steps agree
-  toolReliability: number;           // Indicators of tool quality and maintenance
  uncertaintyFactors: string[];      // Specific reasons why this might not work
  strengthIndicators: string[];      // Specific reasons why this is a good choice
 }
@@ -146,17 +145,17 @@ class ImprovedMicroTaskAIPipeline {
    
    // Updated confidence weights - more focused on AI evaluation
    this.confidenceConfig = {
-      semanticWeight: parseFloat(process.env.CONFIDENCE_SEMANTIC_WEIGHT || '0.25'),     // Embeddings similarity
-      suitabilityWeight: parseFloat(process.env.CONFIDENCE_SUITABILITY_WEIGHT || '0.4'), // AI task fit evaluation  
-      consistencyWeight: parseFloat(process.env.CONFIDENCE_CONSISTENCY_WEIGHT || '0.2'), // Cross-validation agreement
-      reliabilityWeight: parseFloat(process.env.CONFIDENCE_RELIABILITY_WEIGHT || '0.15'), // Tool quality indicators
+      semanticWeight: parseFloat(process.env.CONFIDENCE_SEMANTIC_WEIGHT || '0.3'),     // Embeddings similarity
+      suitabilityWeight: parseFloat(process.env.CONFIDENCE_SUITABILITY_WEIGHT || '0.7'), // AI task fit evaluation  
+      consistencyWeight: 0,    
+      reliabilityWeight: 0,
      minimumThreshold: parseInt(process.env.CONFIDENCE_MINIMUM_THRESHOLD || '40', 10),
      mediumThreshold: parseInt(process.env.CONFIDENCE_MEDIUM_THRESHOLD || '60', 10),
      highThreshold: parseInt(process.env.CONFIDENCE_HIGH_THRESHOLD || '80', 10)
    };
-    
-    console.log('[AI PIPELINE] Enhanced confidence scoring enabled:', {
-      weights: `Semantic:${this.confidenceConfig.semanticWeight} Suitability:${this.confidenceConfig.suitabilityWeight} Consistency:${this.confidenceConfig.consistencyWeight} Reliability:${this.confidenceConfig.reliabilityWeight}`,
+
+    console.log('[AI PIPELINE] Simplified confidence scoring enabled:', {
+      weights: `Semantic:${this.confidenceConfig.semanticWeight} Suitability:${this.confidenceConfig.suitabilityWeight}`,
      thresholds: `${this.confidenceConfig.minimumThreshold}/${this.confidenceConfig.mediumThreshold}/${this.confidenceConfig.highThreshold}`
    });
  }
@@ -709,197 +708,107 @@ ${JSON.stringify(conceptsToSend, null, 2)}`;
  ): ConfidenceMetrics {
    
    // 1. Semantic Relevance: Real embeddings similarity score
-    const semanticRelevance = context.embeddingsSimilarities.has(tool.name) ? 
-      Math.round(context.embeddingsSimilarities.get(tool.name)! * 100) : 50;
+    const rawSemanticRelevance = context.embeddingsSimilarities.has(tool.name) ? 
+      context.embeddingsSimilarities.get(tool.name)! * 100 : 50;
    
-    // 2. Task Suitability: AI-determined fitness for specific task
-    const taskSuitability = Math.round(taskRelevance);
+    // 2. Task Suitability: Enhanced with phase-awareness for workflow mode
+    let enhancedTaskSuitability = taskRelevance;
    
-    // 3. Methodological Consistency: Cross-validation between micro-tasks
-    const methodologicalConsistency = this.calculateCrossValidationScore(tool.name, context);
+    if (context.mode === 'workflow') {
+      // In workflow mode, boost score if tool is well-matched to its assigned phase
+      const toolSelection = context.selectedTools?.find(st => st.tool.name === tool.name);
+      if (toolSelection && tool.phases && tool.phases.includes(toolSelection.phase)) {
+        // Boost for phase alignment (but cap at 100)
+        const phaseBonus = Math.min(15, 100 - taskRelevance);
+        enhancedTaskSuitability = Math.min(100, taskRelevance + phaseBonus);
+        
+        console.log(`[CONFIDENCE] Phase bonus for ${tool.name}: ${taskRelevance} -> ${enhancedTaskSuitability} (phase: ${toolSelection.phase})`);
+      }
+    }
    
-    // 4. Tool Reliability: Quality indicators
-    const toolReliability = this.calculateToolReliability(tool);
-    
-    // Debug logging
-    console.log(`[CONFIDENCE DEBUG] ${tool.name}:`, {
-      semantic: semanticRelevance,
-      taskSuitability: taskSuitability,
-      consistency: methodologicalConsistency,
-      reliability: toolReliability,
-      hasEmbeddingsSimilarity: context.embeddingsSimilarities.has(tool.name),
-      rawTaskRelevance: taskRelevance
-    });
-    
-    // Calculate weighted overall score
+    // Simple weighted combination - no artificial scaling
    const overall = (
-      semanticRelevance * this.confidenceConfig.semanticWeight +
-      taskSuitability * this.confidenceConfig.suitabilityWeight +
-      methodologicalConsistency * this.confidenceConfig.consistencyWeight +
-      toolReliability * this.confidenceConfig.reliabilityWeight
+      rawSemanticRelevance * this.confidenceConfig.semanticWeight +
+      enhancedTaskSuitability * this.confidenceConfig.suitabilityWeight
    );

    const uncertaintyFactors = this.identifySpecificUncertaintyFactors(tool, context, limitations, overall);
    const strengthIndicators = this.identifySpecificStrengthIndicators(tool, context, overall);

+    console.log(`[CONFIDENCE DEBUG] ${tool.name}:`, {
+      rawSemantic: Math.round(rawSemanticRelevance),
+      rawTaskSuitability: taskRelevance,
+      enhancedTaskSuitability: Math.round(enhancedTaskSuitability),
+      overall: Math.round(overall),
+      mode: context.mode
+    });
+
    return {
      overall: Math.round(overall),
-      semanticRelevance: Math.round(semanticRelevance),
-      taskSuitability: Math.round(taskSuitability), 
-      methodologicalConsistency: Math.round(methodologicalConsistency),
-      toolReliability: Math.round(toolReliability),
+      semanticRelevance: Math.round(rawSemanticRelevance),
+      taskSuitability: Math.round(enhancedTaskSuitability), 
+      methodologicalConsistency: 0,
      uncertaintyFactors,
      strengthIndicators
    };
  }

-  private calculateCrossValidationScore(toolName: string, context: AnalysisContext): number {
-    // Look for entries where this tool was mentioned across different phases
-    const relevantEntries = context.auditTrail.filter(entry => 
-      entry.phase === 'micro-task' || entry.phase === 'selection'
-    );
-    
-    let toolMentions = 0;
-    let positiveEvaluations = 0;
-    let confidenceSum = 0;
-    
-    relevantEntries.forEach(entry => {
-      let toolFound = false;
-      
-      // Check various ways the tool might be referenced in output
-      if (entry.output && typeof entry.output === 'object') {
-        // Check selectedTools arrays
-        if (Array.isArray(entry.output.selectedTools) && 
-            entry.output.selectedTools.includes(toolName)) {
-          toolFound = true;
-        }
-        
-        // Check finalToolNames arrays  
-        if (Array.isArray(entry.output.finalToolNames) && 
-            entry.output.finalToolNames.includes(toolName)) {
-          toolFound = true;
-        }
-        
-        // Check toolName in individual evaluation
-        if (entry.output.toolName === toolName) {
-          toolFound = true;
-        }
-      }
-      
-      if (toolFound) {
-        toolMentions++;
-        confidenceSum += entry.confidence;
-        
-        // Consider it positive if confidence >= 60
-        if (entry.confidence >= 60) {
-          positiveEvaluations++;
-        }
-      }
-    });
-    
-    console.log(`[AI PIPELINE] Cross-validation for ${toolName}: ${toolMentions} mentions, ${positiveEvaluations} positive, avg confidence: ${toolMentions > 0 ? Math.round(confidenceSum / toolMentions) : 0}`);
-    
-    if (toolMentions === 0) {
-      return 60; // Default when no cross-validation data available
-    }
-    
-    if (toolMentions === 1) {
-      // Single mention - use confidence directly but cap it
-      return Math.min(85, Math.max(40, confidenceSum));
-    }
-    
-    // Multiple mentions - calculate agreement ratio
-    const agreementRatio = positiveEvaluations / toolMentions;
-    const avgConfidence = confidenceSum / toolMentions;
-    
-    // Combine agreement ratio with average confidence
-    const crossValidationScore = (agreementRatio * 0.7 + (avgConfidence / 100) * 0.3) * 100;
-    
-    return Math.round(Math.min(95, Math.max(30, crossValidationScore)));
-  }
-
-  // NEW: Calculate tool reliability based on objective indicators
-  private calculateToolReliability(tool: any): number {
-    let reliability = 50; // Base score
-    
-    // Documentation availability
-    if (tool.knowledgebase === true) reliability += 25;
-    
-    // Active maintenance (hosted tools are typically maintained)
-    if (isToolHosted(tool)) reliability += 20;
-    
-    // Community support (open source often has community)
-    if (tool.license && tool.license !== 'Proprietary') reliability += 10;
-    
-    // Skill level appropriateness (not too complex, not too simple)
-    if (tool.skillLevel === 'intermediate' || tool.skillLevel === 'advanced') reliability += 10;
-    else if (tool.skillLevel === 'expert') reliability -= 5; // May be overcomplicated
-    
-    // Multi-platform support (more versatile)
-    if (tool.platforms && tool.platforms.length > 1) reliability += 5;
-    
-    return Math.min(100, reliability);
-  }
-
-  // NEW: Identify specific uncertainty factors based on analysis
  private identifySpecificUncertaintyFactors(tool: any, context: AnalysisContext, limitations: string[], confidence: number): string[] {
    const factors: string[] = [];
    
-    // Add AI-identified limitations
+    // Add AI-identified limitations first (most specific)
    if (limitations && limitations.length > 0) {
-      factors.push(...limitations.slice(0, 3)); // Limit to top 3
+      factors.push(...limitations.slice(0, 2)); // Limit to top 2 to leave room for others
    }
    
    // Low semantic similarity
    const similarity = context.embeddingsSimilarities.get(tool.name) || 0.5;
-    if (similarity < 0.4) {
+    if (similarity < 0.7) { // NOTE(review): the 0.5 fallback above is below 0.7, so a tool with no (or a zero) embeddings score always receives this factor - confirm intended
      factors.push('Geringe semantische Ähnlichkeit zur Anfrage - Tool-Beschreibung passt möglicherweise nicht optimal');
    }
    
-    // Skill level mismatch
-    if (tool.skillLevel === 'expert' && /schnell|rapid|triage|urgent/i.test(context.userQuery)) {
-      factors.push('Experten-Tool für Eilszenario - möglicherweise zu komplex für schnelle Antworten');
+    // Skill level vs scenario complexity mismatch
+    if (tool.skillLevel === 'expert' && /schnell|rapid|triage|urgent|sofort/i.test(context.userQuery)) { // expert-level tool combined with an urgency keyword in the query (case-insensitive)
+      factors.push('Experten-Tool für zeitkritisches Szenario - Setup und Einarbeitung könnten zu lange dauern');
    }
    
-    if (tool.skillLevel === 'novice' && /komplex|erweitert|tiefgehend|advanced/i.test(context.userQuery)) {
-      factors.push('Einsteiger-Tool für komplexes Szenario - könnte funktionale Einschränkungen haben');
+    if (tool.skillLevel === 'novice' && /komplex|erweitert|tiefgehend|advanced|forensisch/i.test(context.userQuery)) { // novice-level tool combined with a complexity keyword in the query (case-insensitive)
+      factors.push('Einsteiger-Tool für komplexe Analyse - könnte funktionale Limitierungen haben');
    }
    
-    // Access limitations
+    // Platform availability concerns
+    if (tool.platforms && tool.platforms.length === 1 && tool.platforms[0] === 'Windows' && /linux|unix|server/i.test(context.userQuery)) { // fires only when Windows is the sole listed platform and the query mentions linux, unix or server
+      factors.push('Nur Windows-Tool bei möglicher Linux/Server-Umgebung - Plattform-Inkompatibilität');
+    }
+    
+    // Access and deployment concerns
    if (tool.type === 'software' && !isToolHosted(tool) && tool.accessType === 'download') {
-      factors.push('Installation erforderlich - nicht sofort verfügbar ohne Setup');
+      factors.push('Installation und Setup erforderlich');
    }
    
-    // Cross-validation disagreement
-    const crossValidation = this.calculateCrossValidationScore(tool.name, context);
-    if (crossValidation < 50) {
-      factors.push('Uneinheitliche Bewertung in verschiedenen Analyseschritten - Empfehlung nicht eindeutig');
+    // License restrictions
+    if (tool.license === 'Proprietary') { // exact, case-sensitive match on the license string
+      factors.push('Kommerzielle Software - Lizenzkosten und rechtliche Beschränkungen zu beachten');
    }
    
-    return factors.slice(0, 4); // Limit to 4 most important factors
+    // Low overall confidence warning
+    if (confidence < 60) { // NOTE(review): literal 60 rather than confidenceConfig.mediumThreshold - confirm this is intended
+      factors.push('Moderate Gesamtbewertung - alternative Ansätze sollten ebenfalls betrachtet werden');
+    }
+    
+    return factors.slice(0, 4); // Limit to 4 most relevant factors; entries keep insertion order, so factors pushed by the later checks are the first to be cut
  }

  // NEW: Identify specific strength indicators
  private identifySpecificStrengthIndicators(tool: any, context: AnalysisContext, confidence: number): string[] {
    const indicators: string[] = [];
    
-    // High confidence overall
-    if (confidence >= this.confidenceConfig.highThreshold) {
-      indicators.push('Hohe Gesamtbewertung durch mehrfache Validierung');
-    }
-    
    // High semantic similarity
    const similarity = context.embeddingsSimilarities.get(tool.name) || 0.5;
    if (similarity >= 0.7) {
      indicators.push('Sehr gute semantische Übereinstimmung mit Ihrer Anfrage');
    }
    
-    // Strong cross-validation
-    const crossValidation = this.calculateCrossValidationScore(tool.name, context);
-    if (crossValidation >= 80) {
-      indicators.push('Konsistente Empfehlung über verschiedene Analyseschritte hinweg');
-    }
-    
    // Quality indicators
    if (tool.knowledgebase === true) {
      indicators.push('Umfassende Dokumentation und Wissensbasis verfügbar');
@@ -985,7 +894,7 @@ ${JSON.stringify(conceptsToSend, null, 2)}`;

    const prompt = getPrompt('phaseToolSelection', context.userQuery, phase, phaseTools);

-    const result = await this.callMicroTaskAI(prompt, context, 800);
+    const result = await this.callMicroTaskAI(prompt, context, 1000);
    
    if (result.success) {
      const selections = this.safeParseJSON(result.content, []);
@@ -998,16 +907,30 @@ ${JSON.stringify(conceptsToSend, null, 2)}`;
        validSelections.forEach((sel: any) => {
          const tool = phaseTools.find((t: any) => t.name === sel.toolName);
          if (tool) {
-            this.addToolToSelection(context, tool, phase.id, sel.priority, sel.justification);
+            // Ensure taskRelevance is a number
+            const taskRelevance = typeof sel.taskRelevance === 'number' ? 
+              sel.taskRelevance : parseInt(String(sel.taskRelevance)) || 70;
+            
+            // Derive priority automatically from score
+            const priority = this.derivePriorityFromScore(taskRelevance);
+            
+            this.addToolToSelection(context, tool, phase.id, priority, sel.justification, taskRelevance, sel.limitations);
          }
        });
        
        this.addAuditEntry(context, 'micro-task', 'phase-tool-selection',
          { phase: phase.id, availableTools: phaseTools.length },
-          { validSelections: validSelections.length, selectedTools: validSelections.map(s => s.toolName) },
+          { 
+            validSelections: validSelections.length, 
+            selectedTools: validSelections.map(s => ({ 
+              name: s.toolName, 
+              taskRelevance: s.taskRelevance, 
+              derivedPriority: this.derivePriorityFromScore(s.taskRelevance) 
+            }))
+          },
          validSelections.length > 0 ? 75 : 30,
          Date.now() - result.processingTimeMs,
-          { phaseName: phase.name }
+          { phaseName: phase.name, comparativeEvaluation: true, priorityDerived: true }
        );
      }
    }
@@ -1016,56 +939,46 @@ ${JSON.stringify(conceptsToSend, null, 2)}`;
  }

  private async evaluateSpecificTool(context: AnalysisContext, tool: any, rank: number): Promise<MicroTaskResult> {
-    const prompt = getPrompt('toolEvaluation', context.userQuery, tool, rank);
+    // Get existing task relevance from previous phase selection
+    const existingSelection = context.selectedTools?.find(st => st.tool.name === tool.name);
+    const taskRelevance = existingSelection?.taskRelevance || 70;
+    const priority = this.derivePriorityFromScore(taskRelevance);
+    
+    const prompt = getPrompt('toolEvaluation', context.userQuery, tool, rank, taskRelevance);

-    const result = await this.callMicroTaskAI(prompt, context, 1200);
+    const result = await this.callMicroTaskAI(prompt, context, 1000);
    
    if (result.success) {
      const evaluation = this.safeParseJSON(result.content, {
-        suitability_score: 'medium',
-        task_relevance: '',
        detailed_explanation: 'Evaluation failed',
        implementation_approach: '',
        pros: [],
        cons: [],
-        limitations: [],
        alternatives: ''
      });
      
-      // Debug logging to see what we're getting
-      console.log(`[AI PIPELINE] Tool ${tool.name} evaluation:`, {
-        taskRelevance: evaluation.task_relevance,
-        suitabilityScore: evaluation.suitability_score,
-        limitationsCount: evaluation.limitations?.length || 0
-      });
-      
-      // Ensure task_relevance is a number
-      const taskRelevance = typeof evaluation.task_relevance === 'number' ? 
-        evaluation.task_relevance : 
-        parseInt(String(evaluation.task_relevance)) || 70;
-      
-      // Store enhanced evaluation data
+      // Store evaluation without re-scoring
      this.addToolToSelection(context, {
        ...tool,
        evaluation: {
          ...evaluation,
-          task_relevance: taskRelevance, // Ensure it's stored as number
-          rank
+          rank,
+          task_relevance: taskRelevance
        }
-      }, 'evaluation', evaluation.suitability_score, evaluation.detailed_explanation, 
-      taskRelevance, evaluation.limitations);
+      }, 'evaluation', priority, evaluation.detailed_explanation, 
+      taskRelevance, existingSelection?.limitations);
      
      this.addAuditEntry(context, 'micro-task', 'tool-evaluation',
-        { toolName: tool.name, rank },
+        { toolName: tool.name, rank, existingTaskRelevance: taskRelevance, derivedPriority: priority },
        { 
-          suitabilityScore: evaluation.suitability_score, 
-          taskRelevance: taskRelevance, // Use the cleaned number
          hasExplanation: !!evaluation.detailed_explanation,
-          limitationsIdentified: evaluation.limitations?.length || 0
+          hasImplementationApproach: !!evaluation.implementation_approach,
+          prosCount: evaluation.pros?.length || 0,
+          consCount: evaluation.cons?.length || 0
        },
-        evaluation.suitability_score === 'high' ? 85 : evaluation.suitability_score === 'medium' ? 70 : 50,
+        70,
        Date.now() - result.processingTimeMs,
-        { toolType: tool.type, taskRelevanceScore: taskRelevance }
+        { toolType: tool.type, explanationOnly: true, priorityDerived: true }
      );
    }
    
@@ -1173,6 +1086,12 @@ ${JSON.stringify(conceptsToSend, null, 2)}`;
    }
  }

+  private derivePriorityFromScore(taskRelevance: number): string { // Maps a numeric task-relevance score to a priority label: high, medium or low
+    if (taskRelevance >= 80) return 'high'; // 80 and above
+    if (taskRelevance >= 60) return 'medium'; // 60 up to, but not including, 80
+    return 'low'; // everything else; NaN or undefined at runtime also lands here because both comparisons evaluate to false
+  }
+
  async processQuery(userQuery: string, mode: string): Promise<AnalysisResult> {
    const startTime = Date.now();
    let completeTasks = 0;
@@ -1323,8 +1242,7 @@ ${JSON.stringify(conceptsToSend, null, 2)}`;
            components: {
              semantic: confidence.semanticRelevance,
              suitability: confidence.taskSuitability,
-              consistency: confidence.methodologicalConsistency,
-              reliability: confidence.toolReliability
+              consistency: confidence.methodologicalConsistency
            }
          },
          confidence.overall,