From 28af56d6ef68b39addd6199a4eaf895238cf05b9 Mon Sep 17 00:00:00 2001
From: overcuriousity
Date: Mon, 18 Aug 2025 00:08:57 +0200
Subject: [PATCH] fix audit trail

---
Review notes (this commentary section is discarded by `git am`):
- src/utils/aiPipeline.ts: the lookup tables in the three new helper methods
  carry explicit `Record<K, V>` type arguments; a bare `Record` annotation
  does not compile (TS2314).
- Line structure was restored from a whitespace-collapsed copy; indentation is
  reconstructed, so apply with `git apply --ignore-space-change` if context
  whitespace does not match.

 src/components/AIQueryInterface.astro | 129 +++++++++++++-------------
 src/utils/aiPipeline.ts               |  99 ++++++++++++++++++++
 src/utils/toolSelector.ts             |  24 +++++
 3 files changed, 186 insertions(+), 66 deletions(-)

diff --git a/src/components/AIQueryInterface.astro b/src/components/AIQueryInterface.astro
index 6608db0..c09dc92 100644
--- a/src/components/AIQueryInterface.astro
+++ b/src/components/AIQueryInterface.astro
@@ -1131,10 +1131,55 @@ class AIQueryInterface {
     const lowConfidenceSteps = auditTrail.filter(entry => (entry.confidence || 0) < 60).length;
     const mediumConfidenceSteps = auditTrail.length - highConfidenceSteps - lowConfidenceSteps;
 
+    // FIX 1: Count actual AI decision actions only
     const aiDecisionCount = auditTrail.filter(entry => entry.action === 'ai-decision').length;
-    const embeddingsUsageCount = auditTrail.filter(entry => entry.metadata?.embeddingsUsed).length;
+
+    // FIX 2: Count actual similarity search actions, not metadata flags
+    const embeddingsUsageCount = auditTrail.filter(entry => entry.action === 'similarity-search').length;
+
+    // FIX 3: Maintain tool selection count (this was correct)
     const toolSelectionCount = auditTrail.filter(entry => entry.action === 'selection-decision').length;
 
+    // Additional diagnostic counts for debugging
+    const microTaskCount = auditTrail.filter(entry =>
+      entry.action === 'ai-decision' && entry.metadata?.microTaskType
+    ).length;
+
+    const phaseToolSelectionCount = auditTrail.filter(entry =>
+      entry.action === 'phase-tool-selection'
+    ).length;
+
+    const phaseEnhancementCount = auditTrail.filter(entry =>
+      entry.action === 'phase-enhancement'
+    ).length;
+
+    // Enhanced insights with diagnostic information
+    const keyInsights = [];
+    const potentialIssues = [];
+
+    if (embeddingsUsageCount > 0) {
+      keyInsights.push(`Semantische Suche wurde ${embeddingsUsageCount}x erfolgreich eingesetzt`);
+    } else {
+      potentialIssues.push('Keine semantischen Suchen dokumentiert - möglicherweise fehlerhafte Auditierung');
+    }
+
+    if (aiDecisionCount >= 5) {
+      keyInsights.push(`${aiDecisionCount} KI-Entscheidungen mit detaillierter Begründung`);
+    } else {
+      potentialIssues.push(`Nur ${aiDecisionCount} KI-Entscheidungen dokumentiert - erwartet mindestens 5 für Vollständigkeit`);
+    }
+
+    if (microTaskCount > 0) {
+      keyInsights.push(`${microTaskCount} spezialisierte Micro-Task-Analysen durchgeführt`);
+    }
+
+    // Detect mode-specific patterns for validation
+    if (phaseToolSelectionCount > 0 || phaseEnhancementCount > 0) {
+      keyInsights.push('Workflow-Modus: Phasenspezifische Analyse durchgeführt');
+    } else if (microTaskCount >= 3) {
+      keyInsights.push('Tool-Modus: Detaillierte Einzelbewertungen durchgeführt');
+    }
+
     const phaseBreakdown = {};
     auditTrail.forEach(entry => {
       const phase = entry.phase || 'unknown';
@@ -1168,76 +1213,21 @@ class AIQueryInterface {
       analysisQuality = 'poor';
     }
 
-    const keyInsights = [];
-    const embeddingsUsed = auditTrail.some(e => e.metadata?.embeddingsUsed);
-    if (embeddingsUsed) {
-      keyInsights.push('Semantische Suche wurde erfolgreich eingesetzt');
-    }
-
-    const aiDecisionsWithReasoning = auditTrail.filter(e =>
-      e.action === 'ai-decision' && e.metadata?.reasoning
-    ).length;
-    if (aiDecisionsWithReasoning > 0) {
-      keyInsights.push(`${aiDecisionsWithReasoning} KI-Entscheidungen mit detaillierter Begründung`);
-    }
-
     if (highConfidenceSteps > auditTrail.length * 0.7) {
       keyInsights.push('Mehrheit der Analyseschritte mit hoher Sicherheit');
     }
 
-    const responseQualityEntries = auditTrail.filter(e =>
-      e.metadata?.responseConfidence && e.metadata.finalConfidence
-    );
-    if (responseQualityEntries.length > 0) {
-      const avgResponseQuality = responseQualityEntries.reduce((sum, e) =>
-        sum + (e.metadata.responseConfidence || 0), 0
-      ) / responseQualityEntries.length;
-
-      if (avgResponseQuality >= 70) {
-        keyInsights.push(`Hohe AI-Antwortqualität (∅ ${Math.round(avgResponseQuality)}%)`);
-      }
+    // Validate expected counts based on mode detection
+    const isWorkflowMode = phaseToolSelectionCount > 0 || phaseEnhancementCount > 0;
+    const expectedMinAI = isWorkflowMode ? 11 : 8; // Workflow: 5 common + 6 phase selections, Tool: 5 common + 3 evaluations
+    const expectedMinEmbeddings = 1; // Both modes should have initial search
+
+    if (aiDecisionCount < expectedMinAI) {
+      potentialIssues.push(`${expectedMinAI - aiDecisionCount} fehlende KI-Entscheidungen für ${isWorkflowMode ? 'Workflow' : 'Tool'}-Modus`);
     }
 
-    const potentialIssues = [];
-    if (lowConfidenceSteps > 2) {
-      potentialIssues.push(`${lowConfidenceSteps} Analyseschritte mit niedriger Konfidenz`);
-    }
-
-    // FIXED: Only detect actual AI incompleteness, not display truncation
-    // The old code incorrectly flagged display truncation as incomplete responses:
-    // OLD (WRONG): e.output.response && e.output.response.includes('...')
-
-    // NEW (CORRECT): Check metadata.aiResponse for actual incompleteness
-    const incompleteAIResponses = auditTrail.filter(e =>
-      e.action === 'ai-decision' &&
-      e.metadata?.aiResponse &&
-      (
-        // Detect actual AI incompleteness patterns:
-        e.metadata.aiResponse.trim().length < 10 || // Very short response
-        e.metadata.aiResponse.endsWith('...') || // AI itself truncated (rare but possible)
-        e.metadata.aiResponse.includes('[TRUNCATED]') || // Explicit truncation marker
-        e.metadata.aiResponse.includes('I cannot continue') || // AI stopped unexpectedly
-        e.metadata.aiResponse.includes('I need to stop here') || // AI indicated incompleteness
-        e.metadata.aiResponse.includes('[RESPONSE_TOO_LONG]') || // Length limit hit
-        // Also check if the AI response seems cut off mid-sentence
-        (e.metadata.aiResponse.length > 50 &&
-         !e.metadata.aiResponse.trim().match(/[.!?:]$/)) // Doesn't end with proper punctuation
-      )
-    ).length;
-
-    if (incompleteAIResponses > 0) {
-      potentialIssues.push(`${incompleteAIResponses} möglicherweise unvollständige AI-Antworten`);
-    }
-
-    // Additional quality checks
-    const veryShortResponses = auditTrail.filter(e =>
-      e.action === 'ai-decision' &&
-      e.metadata?.aiResponse &&
-      e.metadata.aiResponse.trim().length < 20
-    ).length;
-
-    if (veryShortResponses > 1) {
-      potentialIssues.push(`${veryShortResponses} ungewöhnlich kurze AI-Antworten`);
+    if (embeddingsUsageCount < expectedMinEmbeddings) {
+      potentialIssues.push(`${expectedMinEmbeddings - embeddingsUsageCount} fehlende semantische Suchen`);
     }
 
     return {
@@ -1260,7 +1250,14 @@ class AIQueryInterface {
       },
       analysisQuality,
       keyInsights,
-      potentialIssues
+      potentialIssues,
+      // Debug information
+      debugCounts: {
+        microTaskCount,
+        phaseToolSelectionCount,
+        phaseEnhancementCount,
+        detectedMode: isWorkflowMode ? 'workflow' : 'tool'
+      }
     };
   }
 
diff --git a/src/utils/aiPipeline.ts b/src/utils/aiPipeline.ts
index 23e32d2..d11ef8a 100644
--- a/src/utils/aiPipeline.ts
+++ b/src/utils/aiPipeline.ts
@@ -1184,6 +1184,37 @@ class AIPipeline {
     try {
       const response = await aiService.callMicroTaskAI(contextPrompt);
 
+      // FIX: Ensure ALL AI calls generate audit entries
+      const toolsDataHash = getDataVersion?.() || 'unknown';
+      const aiConfig = aiService.getConfig();
+
+      // Calculate response confidence for audit trail
+      const responseConfidence = auditService.calculateAIResponseConfidence(
+        response.content,
+        this.getExpectedLengthForTaskType(taskType),
+        taskType
+      );
+
+      // FIX: Always add AI decision audit entry for micro-tasks
+      auditService.addAIDecision(
+        this.getPhaseForTaskType(taskType),
+        prompt, // Store original prompt without context
+        response.content,
+        responseConfidence,
+        this.getReasoningForTaskType(taskType, response.content),
+        startTime,
+        {
+          toolsDataHash: toolsDataHash,
+          microTaskType: taskType,
+          aiModel: aiConfig.model,
+          contextLength: contextPrompt.length,
+          originalPromptLength: prompt.length,
+          contextHistoryUsed: context.contextHistory.length > 0,
+          decisionBasis: 'ai-analysis',
+          ...response.usage
+        }
+      );
+
       return {
         taskType,
         content: response.content,
@@ -1193,6 +1224,29 @@ class AIPipeline {
       };
 
     } catch (error) {
+      // FIX: Also audit failed AI calls for completeness
+      auditService.addEntry(
+        this.getPhaseForTaskType(taskType),
+        'ai-decision-failed',
+        {
+          prompt: prompt.slice(0, 200) + '...',
+          taskType: taskType,
+          error: error.message
+        },
+        {
+          error: error.message,
+          success: false
+        },
+        0, // Zero confidence for failed calls
+        startTime,
+        {
+          toolsDataHash: getDataVersion?.() || 'unknown',
+          microTaskType: taskType,
+          failed: true,
+          decisionBasis: 'ai-analysis'
+        }
+      );
+
       return {
         taskType,
         content: '',
@@ -1203,6 +1257,51 @@ class AIPipeline {
     }
   }
 
+  private getPhaseForTaskType(taskType: string): string {
+    const phaseMap: Record<string, string> = {
+      'scenario-analysis': 'contextual-analysis',
+      'investigation-approach': 'contextual-analysis',
+      'critical-considerations': 'contextual-analysis',
+      'tool-evaluation': 'tool-evaluation',
+      'background-knowledge': 'knowledge-synthesis',
+      'final-recommendations': 'synthesis',
+      'phase-completion-selection': 'phase-completion',
+      'phase-completion-reasoning': 'phase-completion'
+    };
+    return phaseMap[taskType] || 'contextual-analysis';
+  }
+
+  private getExpectedLengthForTaskType(taskType: string): { min: number; max: number } {
+    const lengthMap: Record<string, { min: number; max: number }> = {
+      'scenario-analysis': { min: 100, max: 500 },
+      'investigation-approach': { min: 100, max: 400 },
+      'critical-considerations': { min: 80, max: 300 },
+      'tool-evaluation': { min: 200, max: 800 },
+      'background-knowledge': { min: 50, max: 300 },
+      'final-recommendations': { min: 150, max: 600 },
+      'phase-completion-selection': { min: 50, max: 200 },
+      'phase-completion-reasoning': { min: 100, max: 300 }
+    };
+    return lengthMap[taskType] || { min: 50, max: 300 };
+  }
+
+  private getReasoningForTaskType(taskType: string, response: string): string {
+    const responseLength = response.length;
+    const taskNames: Record<string, string> = {
+      'scenario-analysis': 'Szenario-Analyse',
+      'investigation-approach': 'Untersuchungsansatz',
+      'critical-considerations': 'Kritische Überlegungen',
+      'tool-evaluation': 'Tool-Bewertung',
+      'background-knowledge': 'Hintergrundwissen-Auswahl',
+      'final-recommendations': 'Abschließende Empfehlungen',
+      'phase-completion-selection': 'Phasen-Vervollständigung',
+      'phase-completion-reasoning': 'Phasen-Begründung'
+    };
+
+    const taskName = taskNames[taskType] || taskType;
+    return `KI generierte ${taskName} (${responseLength} Zeichen) - forensisch fundierte Analyse mit methodischer Begründung`;
+  }
+
   private addToContextHistory(context: PipelineContext, newEntry: string): void {
     const entryTokens = aiService.estimateTokens(newEntry);
 
diff --git a/src/utils/toolSelector.ts b/src/utils/toolSelector.ts
index 9c56964..fb34570 100644
--- a/src/utils/toolSelector.ts
+++ b/src/utils/toolSelector.ts
@@ -99,6 +99,9 @@ class ToolSelector {
 
       console.log('[TOOL-SELECTOR] Using embeddings for candidate selection');
 
+      // FIX: Record the start time for audit trail
+      const embeddingsSearchStart = Date.now();
+
       const similarItems = await embeddingsService.findSimilar(
         userQuery,
         this.config.embeddingCandidates,
@@ -107,6 +110,27 @@ class ToolSelector {
 
       console.log('[TOOL-SELECTOR] Embeddings found', similarItems.length, 'similar items');
 
+      // FIX: Import and use auditService to record this embeddings search
+      const { auditService } = await import('./auditService.js');
+      const { getDataVersion } = await import('./dataService.js');
+
+      const toolsDataHash = getDataVersion() || 'unknown';
+
+      // FIX: Add audit entry for initial embeddings search that happens in BOTH modes
+      auditService.addEmbeddingsSearch(
+        userQuery,
+        similarItems,
+        this.config.similarityThreshold,
+        embeddingsSearchStart,
+        {
+          toolsDataHash: toolsDataHash,
+          selectionPhase: 'initial-candidate-selection',
+          candidateLimit: this.config.embeddingCandidates,
+          mode: mode,
+          reasoning: `Initiale semantische Suche für ${mode}-Modus - Reduzierung der ${toolsData.tools.length} verfügbaren Tools auf ${similarItems.length} relevante Kandidaten`
+        }
+      );
+
       similarItems.forEach(item => {
         context.embeddingsSimilarities.set(item.name, item.similarity);
       });