From 3c6fb568d65da39f08b9422bbcbec95a678e3e4e Mon Sep 17 00:00:00 2001
From: overcuriousity <overcuriousity@posteo.org>
Date: Fri, 8 Aug 2025 22:54:47 +0200
Subject: [PATCH] fix pipeline: add completion of underrepresented phases

---
 src/config/prompts.ts   |  42 ++++++++++
 src/utils/aiPipeline.ts | 166 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 208 insertions(+)
diff --git a/src/config/prompts.ts b/src/config/prompts.ts
index bc51f4e..a75df21 100644
--- a/src/config/prompts.ts
+++ b/src/config/prompts.ts
@@ -189,6 +189,47 @@ ANTWORT AUSSCHLIESSLICH IM JSON-FORMAT OHNE JEGLICHEN TEXT AUSSERHALB DER JSON-S
 ]`;
   },
 
+  /**
+   * Builds the German prompt asking the model to pick 1-2 of the candidate tools (and optionally
+   * concepts) for the underrepresented `phase`; the reply is requested as JSON only.
+   * Reads `name`/`description` of the phase and of each candidate; a missing description renders as empty.
+   */
+  generatePhaseCompletionPrompt(originalQuery: string, phase: any, candidateTools: any[], candidateConcepts: any[]): string {
+    return `Du bist ein DFIR-Experte. Die Phase "${phase.name}" ist in der aktuellen Analyse unterrepräsentiert.
+
+ORIGINAL ANFRAGE: "${originalQuery}"
+PHASE ZU VERVOLLSTÄNDIGEN: ${phase.name} - ${phase.description || ''}
+
+Wähle 1-2 BESTE Tools aus den gefundenen Kandidaten, die diese Phase optimal ergänzen:
+
+VERFÜGBARE TOOLS (${candidateTools.length}):
+${candidateTools.map((tool: any) => `
+- ${tool.name} (${tool.type})
+  Beschreibung: ${String(tool.description ?? '').slice(0, 120)}...
+  Skill Level: ${tool.skillLevel}
+`).join('')}
+
+${candidateConcepts.length > 0 ? `
+VERFÜGBARE KONZEPTE (${candidateConcepts.length}):
+${candidateConcepts.map((concept: any) => `
+- ${concept.name}
+  Beschreibung: ${String(concept.description ?? '').slice(0, 120)}...
+`).join('')}
+` : ''}
+
+AUSWAHLREGELN:
+1. Wähle Tools, die die ${phase.name}-Phase der ursprünglichen Anfrage optimal ergänzen
+2. Priorisiere Tools, die zur Gesamtlösung beitragen
+3. Maximal 2 Tools für diese Phase
+
+ANTWORT AUSSCHLIESSLICH IM JSON-FORMAT:
+{
+  "selectedTools": ["ToolName1", "ToolName2"],
+  "selectedConcepts": ["ConceptName1"],
+  "reasoning": "Kurze Begründung der Auswahl für ${phase.name}"
+}`;
+  },
+
   finalRecommendations: (isWorkflow: boolean, userQuery: string, selectedToolNames: string[]) => {
     const focus = isWorkflow ? 
       'Workflow-Schritte, Best Practices, Objektivität' :
@@ -213,6 +254,7 @@ export function getPrompt(key: 'phaseToolSelection', userQuery: string, phase: a
 export function getPrompt(key: 'toolEvaluation', userQuery: string, tool: any, rank: number, taskRelevance: number): string;
 export function getPrompt(key: 'backgroundKnowledgeSelection', userQuery: string, mode: string, selectedToolNames: string[], availableConcepts: any[]): string;
 export function getPrompt(key: 'finalRecommendations', isWorkflow: boolean, userQuery: string, selectedToolNames: string[]): string;
+export function getPrompt(key: 'generatePhaseCompletionPrompt', originalQuery: string, phase: any, candidateTools: any[], candidateConcepts: any[]): string;
 export function getPrompt(promptKey: keyof typeof AI_PROMPTS, ...args: any[]): string {
   try {
     const promptFunction = AI_PROMPTS[promptKey];
diff --git a/src/utils/aiPipeline.ts b/src/utils/aiPipeline.ts
index cceec47..2ae0fd4 100644
--- a/src/utils/aiPipeline.ts
+++ b/src/utils/aiPipeline.ts
@@ -1096,6 +1096,168 @@ class ImprovedMicroTaskAIPipeline {
     return result;
   }
 
+  /**
+   * Tops up phases that currently have at most one selected tool.
+   * Counts `context.selectedTools` per phase, logs the coverage, then runs the semantic-search
+   * completion for every phase with 0 or 1 tools, waiting `microTaskDelay` after each one.
+   */
+  private async completeUnderrepresentedPhases(
+    context: AnalysisContext,
+    toolsData: any,
+    originalQuery: string
+  ): Promise<void> {
+    const allPhases = toolsData.phases || [];
+
+    // Tally how many selected tools each phase already has.
+    const toolsPerPhase = new Map<string, number>();
+    for (const selected of context.selectedTools ?? []) {
+      toolsPerPhase.set(selected.phase, (toolsPerPhase.get(selected.phase) || 0) + 1);
+    }
+    const countFor = (phaseId: string): number => toolsPerPhase.get(phaseId) || 0;
+
+    console.log(`[AI PIPELINE] Phase coverage analysis:`);
+    for (const phase of allPhases) {
+      console.log(`[AI PIPELINE]   ${phase.id}: ${countFor(phase.id)} tools`);
+    }
+
+    // Search queries keyed by phase id; handed to the per-phase completion step below.
+    const phaseQueryTemplates = {
+      'data-collection': 'forensic data acquisition imaging memory disk capture evidence collection',
+      'examination': 'forensic analysis parsing extraction artifact examination file system',
+      'analysis': 'forensic correlation timeline analysis pattern detection investigation',
+      'reporting': 'forensic report documentation case management collaboration presentation findings'
+    };
+
+    // A phase with zero (missing) or one (thin) selected tool counts as underrepresented.
+    const underrepresentedPhases = allPhases.filter(phase => countFor(phase.id) <= 1);
+    if (underrepresentedPhases.length === 0) {
+      console.log(`[AI PIPELINE] All phases adequately represented, no completion needed`);
+      return;
+    }
+
+    console.log(`[AI PIPELINE] Underrepresented phases: ${underrepresentedPhases.map(p => p.id).join(', ')}`);
+
+    // Phases are handled one at a time, with the micro-task delay after each.
+    for (const phase of underrepresentedPhases) {
+      await this.completePhaseWithSemanticSearch(context, phase, phaseQueryTemplates, toolsData, originalQuery);
+      await this.delay(this.microTaskDelay);
+    }
+  }
+
+  /**
+   * Tries to add up to two tools to a single underrepresented phase: runs a phase-specific
+   * embeddings search, keeps tools that list this phase and are not in `context.seenToolNames`,
+   * lets the model choose among them, adds the valid picks via `addToolToSelection`, and writes
+   * an audit entry. Errors thrown inside the `try` block are logged and audited, not rethrown.
+   */
+  private async completePhaseWithSemanticSearch(
+    context: AnalysisContext,
+    phase: any,
+    phaseQueryTemplates: Record<string, string>,
+    toolsData: any,
+    originalQuery: string
+  ): Promise<void> {
+    const phaseStart = Date.now();
+
+    // Use the template for this phase id; otherwise build a generic query (name may be absent, so fall back to id).
+    const phaseQuery = phaseQueryTemplates[phase.id] || `forensic ${String(phase.name ?? phase.id).toLowerCase()} tools methods`;
+
+    console.log(`[AI PIPELINE] Completing phase ${phase.id} with query: "${phaseQuery}"`);
+
+    try {
+      // Smaller result set (20) and a lower similarity threshold (0.2) than a full search, to get more candidates.
+      const phaseResults = await embeddingsService.findSimilar(phaseQuery, 20, 0.2);
+
+      if (phaseResults.length === 0) {
+        console.log(`[AI PIPELINE] No semantic results for phase ${phase.id}`);
+        return;
+      }
+
+      // Index the full tool/concept records by name so search hits can be resolved.
+      const toolsMap = new Map<string, any>(toolsData.tools.map((tool: any) => [tool.name, tool]));
+      const conceptsMap = new Map<string, any>(toolsData.concepts.map((concept: any) => [concept.name, concept]));
+
+      // Candidate tools must list this phase and must not have been seen already; keep the top 5.
+      const phaseTools = phaseResults
+        .filter(result => result.type === 'tool')
+        .map(result => toolsMap.get(result.name))
+        .filter((tool): tool is any =>
+          tool !== undefined &&
+          tool.phases &&
+          tool.phases.includes(phase.id) &&
+          !context.seenToolNames.has(tool.name)
+        )
+        .slice(0, 5);
+
+      // Top 2 concepts are shown to the model; its `selectedConcepts` reply is not consumed in this method.
+      const phaseConcepts = phaseResults
+        .filter(result => result.type === 'concept')
+        .map(result => conceptsMap.get(result.name))
+        .filter((concept): concept is any => concept !== undefined)
+        .slice(0, 2);
+
+      console.log(`[AI PIPELINE] Phase ${phase.id} semantic search found: ${phaseTools.length} tools, ${phaseConcepts.length} concepts`);
+
+      if (phaseTools.length === 0) {
+        console.log(`[AI PIPELINE] No suitable tools found for phase ${phase.id} after filtering`);
+        return;
+      }
+
+      const prompt = AI_PROMPTS.generatePhaseCompletionPrompt(originalQuery, phase, phaseTools, phaseConcepts);
+      const response = await this.callAI(prompt, 800);
+      const selection = this.safeParseJSON(response, { selectedTools: [], selectedConcepts: [] });
+
+      // The model may omit the array, return a non-array, or repeat a name; normalise before matching candidates.
+      const requestedNames: unknown[] = Array.isArray(selection?.selectedTools) ? selection.selectedTools : [];
+      const validTools = Array.from(new Set(requestedNames))
+        .map(name => phaseTools.find(t => t.name === name))
+        .filter((tool): tool is any => tool !== undefined)
+        .slice(0, 2);
+
+      validTools.forEach(tool => {
+        console.log(`[AI PIPELINE] Adding phase completion tool: ${tool.name} for ${phase.id}`);
+        this.addToolToSelection(
+          context,
+          tool,
+          phase.id,
+          'medium', // Phase completion tools get medium priority
+          `Hinzugefügt zur Vervollständigung der ${phase.name}-Phase`,
+          75, // Good relevance for phase-specific search
+          ['Via phasenspezifische semantische Suche hinzugefügt']
+        );
+      });
+
+      // Audit the phase completion (score 80 when at least one tool was added, otherwise 40).
+      this.addAuditEntry(context, 'validation', 'phase-completion',
+        {
+          phase: phase.id,
+          phaseQuery,
+          candidatesFound: phaseTools.length,
+          originalQuery: originalQuery.slice(0, 100) + '...'
+        },
+        {
+          toolsAdded: validTools.length,
+          addedTools: validTools.map(t => t.name),
+          semanticResults: phaseResults.length
+        },
+        validTools.length > 0 ? 80 : 40,
+        phaseStart,
+        { phaseCompletion: true, semanticSearch: true, originalQueryBias: true }
+      );
+    } catch (error) {
+      console.error(`[AI PIPELINE] Phase completion failed for ${phase.id}:`, error);
+      // `error` can be any thrown value, so only read `.message` from real Error instances.
+      const errorMessage = error instanceof Error ? error.message : String(error);
+      this.addAuditEntry(context, 'validation', 'phase-completion-failed',
+        { phase: phase.id, phaseQuery },
+        { error: errorMessage },
+        10,
+        phaseStart,
+        { phaseCompletion: true, failed: true }
+      );
+    }
+  }
+
   private async evaluateSpecificTool(context: AnalysisContext, tool: any, rank: number): Promise<MicroTaskResult> {
     const existingSelection = context.selectedTools?.find(st => st.tool.name === tool.name);
     const taskRelevance = existingSelection?.taskRelevance || 70;
@@ -1373,6 +1535,10 @@ class ImprovedMicroTaskAIPipeline {
           if (toolSelectionResult.success) completeTasks++; else failedTasks++;
           await this.delay(this.microTaskDelay);
         }
+        
+        console.log('[AI PIPELINE] Checking for underrepresented phases...');
+        await this.completeUnderrepresentedPhases(context, toolsData, userQuery);
+        
       } else {
         const topTools = filteredData.tools.slice(0, 3);
         for (let i = 0; i < topTools.length; i++) {