From fe1be323bb705d23bd5cbc1273de7a5df0d87fc3 Mon Sep 17 00:00:00 2001 From: overcuriousity Date: Tue, 5 Aug 2025 21:35:38 +0200 Subject: [PATCH] confidence updates, content adjustment --- .env.example | 14 +- src/components/AIQueryInterface.astro | 29 +-- src/config/prompts.ts | 87 ++++---- src/data/tools.yaml | 12 +- src/utils/aiPipeline.ts | 288 +++++++++----------------- 5 files changed, 160 insertions(+), 270 deletions(-) diff --git a/.env.example b/.env.example index 2cddc9b..1814d9f 100644 --- a/.env.example +++ b/.env.example @@ -117,7 +117,7 @@ AI_MAX_CONTEXT_TOKENS=4000 # Maximum tokens per individual AI prompt # Larger = more context per call | Smaller = faster responses -AI_MAX_PROMPT_TOKENS=1500 +AI_MAX_PROMPT_TOKENS=2500 # ============================================================================ # 6. AUTHENTICATION & AUTHORIZATION (OPTIONAL) # ============================================================================ @@ -190,18 +190,16 @@ FORENSIC_AUDIT_RETENTION_HOURS=24 FORENSIC_AUDIT_MAX_ENTRIES=50 # ============================================================================ -# 10. ENHANCED CONFIDENCE SCORING SYSTEM +# 10. SIMPLIFIED CONFIDENCE SCORING SYSTEM # ============================================================================ # Confidence component weights (must sum to 1.0) -CONFIDENCE_SEMANTIC_WEIGHT=0.25 # Weight for vector similarity quality -CONFIDENCE_SUITABILITY_WEIGHT=0.4 # Weight for AI-determined task fitness -CONFIDENCE_CONSISTENCY_WEIGHT=0.2 # Weight for cross-validation agreement -CONFIDENCE_RELIABILITY_WEIGHT=0.15 # Weight for tool quality indicators +CONFIDENCE_SEMANTIC_WEIGHT=0.5 # Weight for vector similarity quality +CONFIDENCE_SUITABILITY_WEIGHT=0.5 # Weight for AI-determined task fitness # Confidence thresholds (0-100) -CONFIDENCE_MINIMUM_THRESHOLD=40 # Below this = weak recommendation -CONFIDENCE_MEDIUM_THRESHOLD=60 # 40-59 = weak, 60-79 = moderate +CONFIDENCE_MINIMUM_THRESHOLD=50 # Below this = weak recommendation +CONFIDENCE_MEDIUM_THRESHOLD=70 # 50-69 = weak, 70-79 = moderate CONFIDENCE_HIGH_THRESHOLD=80 # 80+ = strong recommendation # ============================================================================ diff --git a/src/components/AIQueryInterface.astro b/src/components/AIQueryInterface.astro index 3c855fb..99f3c6b 100644 --- a/src/components/AIQueryInterface.astro +++ b/src/components/AIQueryInterface.astro @@ -756,17 +756,14 @@ class AIQueryInterface { renderConfidenceTooltip(confidence) { if (!confidence || typeof confidence.overall !== 'number') { - console.log('[AI DEBUG] No confidence data or invalid format:', confidence); return ''; } const confidenceColor = confidence.overall >= 80 ? 'var(--color-accent)' : - confidence.overall >= 60 ? 'var(--color-warning)' : 'var(--color-error)'; + confidence.overall >= 60 ? 'var(--color-warning)' : 'var(--color-error)'; const tooltipId = `tooltip-${Math.random().toString(36).substr(2, 9)}`; - console.log(`[AI DEBUG] Generating confidence tooltip: ${confidence.overall}% with ID ${tooltipId}`); - return ` ${confidence.semanticRelevance}%
              </div>
- Wie gut die Tool-Beschreibung semantisch zu Ihrer Anfrage passt (basierend auf Vektor-Ähnlichkeit) + Wie gut die Tool-Beschreibung semantisch zu Ihrer Anfrage passt (Vektor-Ähnlichkeit)
@@ -802,26 +799,6 @@ class AIQueryInterface { KI-bewertete Eignung des Tools für Ihre spezifische forensische Aufgabenstellung - -
-
- 🤝 Methodische Konsistenz - ${confidence.methodologicalConsistency}% -
-
- Wie einheitlich verschiedene Analyseschritte dieses Tool bewerten (Kreuzvalidierung) -
-
- -
-
- 🔧 Tool-Zuverlässigkeit - ${confidence.toolReliability}% -
-
- Qualitätsindikatoren: Dokumentation, Wartung, Verfügbarkeit und Benutzerfreundlichkeit -
-
${confidence.strengthIndicators && confidence.strengthIndicators.length > 0 ? ` @@ -847,7 +824,7 @@ class AIQueryInterface { ` : ''}
- Mehrstufige KI-Analyse mit Kreuzvalidierung + Forensisch fundierte KI-Analyse
diff --git a/src/config/prompts.ts b/src/config/prompts.ts index 6b31a49..e0ac22d 100644 --- a/src/config/prompts.ts +++ b/src/config/prompts.ts @@ -120,61 +120,60 @@ ${aspects} WICHTIG: Antworten Sie NUR in fließendem deutschen Text ohne Listen oder Markdown. Maximum 120 Wörter.`; }, - // Phase tool selection prompt phaseToolSelection: (userQuery: string, phase: any, phaseTools: any[]) => { - return `Wählen Sie 2-3 Methoden/Tools für die Phase "${phase.name}" basierend auf objektiven, fallbezogenen Kriterien. + return `Wählen Sie 2-3 Methoden/Tools für die Phase "${phase.name}" und bewerten Sie deren Aufgaben-Eignung VERGLEICHEND. SZENARIO: "${userQuery}" +SPEZIFISCHE PHASE: ${phase.name} - ${phase.description || 'Forensische Untersuchungsphase'} VERFÜGBARE TOOLS FÜR ${phase.name.toUpperCase()}: -${phaseTools.map((tool: any) => `- ${tool.name}: ${tool.description.slice(0, 100)}...`).join('\n')} +${phaseTools.map((tool: any, index: number) => `${index + 1}. ${tool.name}: ${tool.description.slice(0, 150)}... + - Plattformen: ${tool.platforms?.join(', ') || 'N/A'} + - Skill Level: ${tool.skillLevel} + - Tags: ${tool.tags?.join(', ') || 'N/A'}`).join('\n\n')} -Wählen Sie Methoden/Tools nach forensischen Kriterien aus: -- Court admissibility und Chain of Custody Kompatibilität -- Integration in forensische Standard-Workflows -- Reproduzierbarkeit und Dokumentationsqualität -- Objektivität +Bewerten Sie ALLE Tools vergleichend für diese spezifische Aufgabe UND Phase. Wählen Sie die 2-3 besten aus. -Antworten Sie AUSSCHLIESSLICH mit diesem JSON-Format (kein zusätzlicher Text): +BEWERTUNGSKRITERIEN: +- Wie gut löst das Tool das forensische Problem im SZENARIO-Kontext? +- Wie gut passt es zur spezifischen PHASE "${phase.name}"? +- Wie vergleicht es sich mit den anderen verfügbaren Tools für diese Phase? + +Antworten Sie AUSSCHLIESSLICH mit diesem JSON-Format: [ { - "toolName": "Exakter Methoden/Tool-Name", - "priority": "high|medium|low", - "justification": "Objektive Begründung warum diese Methode/Tool für das spezifische Szenario besser geeignet ist" + "toolName": "Exakter Tool-Name", + "taskRelevance": 85, + "justification": "Vergleichende Begründung warum dieses Tool für diese Phase und Aufgabe besser/schlechter als die anderen geeignet ist", + "limitations": ["Spezifische Einschränkung 1", "Einschränkung 2"] } -]`; - }, - - // Tool evaluation prompt - toolEvaluation: (userQuery: string, tool: any, rank: number) => { - return `Sie sind ein DFIR-Experte und bewerten ein forensisches Tool für eine spezifische Aufgabe. 
- -PROBLEM: "${userQuery}" - -TOOL: ${tool.name} -BESCHREIBUNG: ${tool.description} -PLATTFORMEN: ${tool.platforms?.join(', ') || 'N/A'} -SKILL LEVEL: ${tool.skillLevel} -DOMAINS: ${tool.domains?.join(', ') || 'N/A'} -TAGS: ${tool.tags?.join(', ') || 'N/A'} - -Bewerten Sie nach forensischen Standards und antworten Sie AUSSCHLIESSLICH mit diesem JSON-Format: -{ - "suitability_score": "high|medium|low", - "task_relevance": 85, - "detailed_explanation": "Detaillierte forensische Begründung warum diese Methode/Tool das Problem löst", - "implementation_approach": "Konkrete methodische Schritte zur korrekten Anwendung für dieses spezifische Problem", - "pros": ["Forensischer Vorteil 1", "Validierter Vorteil 2"], - "cons": ["Methodische Limitation 1", "Potenzielle Schwäche 2"], - "limitations": ["Spezifische Einschränkung 1", "Mögliche Problematik 2"], - "alternatives": "Alternative Ansätze falls diese Methode/Tool nicht optimal ist" -} +] WICHTIG: -- task_relevance: Numerischer Wert 0-100 wie gut das Tool für DIESE SPEZIFISCHE Aufgabe geeignet ist -- limitations: Konkrete Einschränkungen oder Situationen wo das Tool NICHT optimal wäre -- Berücksichtigen Sie den Skill Level vs. Anfrage-Komplexität -- Bewerten Sie objektiv, nicht beschönigend`; +- taskRelevance: 0-100 Score basierend auf Szenario-Eignung UND Phasen-Passung im VERGLEICH zu anderen Tools +- Nur die 2-3 BESTEN Tools auswählen und bewerten +- justification soll VERGLEICHEND sein ("besser als X weil...", "für diese Phase ideal weil...")`; + }, + + toolEvaluation: (userQuery: string, tool: any, rank: number, taskRelevance: number) => { + return `Sie sind ein DFIR-Experte. Erklären Sie DETAILLIERT die Anwendung dieses bereits bewerteten Tools. + +PROBLEM: "${userQuery}" +TOOL: ${tool.name} (bereits bewertet mit ${taskRelevance}% Aufgaben-Eignung) +BESCHREIBUNG: ${tool.description} + +Das Tool wurde bereits als Rang ${rank} für diese Aufgabe bewertet. 
Erklären Sie nun: + +Antworten Sie AUSSCHLIESSLICH mit diesem JSON-Format: +{ + "detailed_explanation": "Detaillierte Erklärung warum und wie dieses Tool für diese spezifische Aufgabe eingesetzt wird", + "implementation_approach": "Konkrete Schritt-für-Schritt Anleitung zur korrekten Anwendung", + "pros": ["Spezifischer Vorteil 1", "Spezifischer Vorteil 2"], + "cons": ["Bekannte Limitation 1", "Bekannte Limitation 2"], + "alternatives": "Alternative Ansätze oder Tools falls dieses nicht verfügbar ist" +} + +WICHTIG: Keine erneute Bewertung - nur detaillierte Erklärung der bereits bewerteten Eignung.`; }, // Background knowledge selection prompt @@ -229,7 +228,7 @@ export function getPrompt(key: 'scenarioAnalysis', isWorkflow: boolean, userQuer export function getPrompt(key: 'investigationApproach', isWorkflow: boolean, userQuery: string): string; export function getPrompt(key: 'criticalConsiderations', isWorkflow: boolean, userQuery: string): string; export function getPrompt(key: 'phaseToolSelection', userQuery: string, phase: any, phaseTools: any[]): string; -export function getPrompt(key: 'toolEvaluation', userQuery: string, tool: any, rank: number): string; +export function getPrompt(key: 'toolEvaluation', userQuery: string, tool: any, rank: number, taskRelevance: number): string; export function getPrompt(key: 'backgroundKnowledgeSelection', userQuery: string, mode: string, selectedToolNames: string[], availableConcepts: any[]): string; export function getPrompt(key: 'finalRecommendations', isWorkflow: boolean, userQuery: string, selectedToolNames: string[]): string; export function getPrompt(promptKey: keyof typeof AI_PROMPTS, ...args: any[]): string { diff --git a/src/data/tools.yaml b/src/data/tools.yaml index 0c3a185..808b804 100644 --- a/src/data/tools.yaml +++ b/src/data/tools.yaml @@ -3975,7 +3975,7 @@ tools: - name: KAPE type: software description: >- - Kroll Artifact Parser and Extractor revolutioniert Windows-Forensik durch + Kroll Artifact Parser and Extractor versucht sich an Windows-Forensik durch intelligente Ziel-basierte Sammlung. Statt Full-Disk-Images extrahiert KAPE gezielt kritische Artefakte: Registry-Hives, Event-Logs, Prefetch, Browser- Daten, Scheduled-Tasks in Minuten statt Stunden. Die Target-Files @@ -3983,12 +3983,10 @@ tools: Besonders clever: Compound-Targets gruppieren zusammengehörige Artefakte (z.B. "Browser" sammelt Chrome+Firefox+Edge), die gKAPE-GUI macht es auch für Nicht-Techniker zugänglich. Batch-Mode verarbeitet mehrere Images - parallel. Output direkt kompatibel zu Timeline-Tools wie Plaso. Die - ständigen Community-Updates halten mit Windows-Entwicklungen Schritt. + parallel. Output direkt kompatibel zu Timeline-Tools wie Plaso. VSS-Processing analysiert Shadow- Copies automatisch. Der - Remote-Collection-Mode sammelt über Netzwerk. Kostenlos aber - Enterprise-Support verfügbar. Der neue Standard für effiziente - Windows-Forensik-Triage. + Remote-Collection-Mode sammelt über Netzwerk. Kostenlos (mit Registrierung) aber + Enterprise-Support verfügbar. 
skillLevel: intermediate url: https://www.kroll.com/kape icon: 🧰 @@ -4003,7 +4001,7 @@ tools: platforms: - Windows accessType: download - license: Freeware + license: Proprietary knowledgebase: false - name: Kibana type: software diff --git a/src/utils/aiPipeline.ts b/src/utils/aiPipeline.ts index 396fcea..f291f3b 100644 --- a/src/utils/aiPipeline.ts +++ b/src/utils/aiPipeline.ts @@ -72,7 +72,6 @@ interface ConfidenceMetrics { semanticRelevance: number; // How well tool description matches query (from embeddings) taskSuitability: number; // AI-determined fitness for this specific task methodologicalConsistency: number; // How well different analysis steps agree - toolReliability: number; // Indicators of tool quality and maintenance uncertaintyFactors: string[]; // Specific reasons why this might not work strengthIndicators: string[]; // Specific reasons why this is a good choice } @@ -146,17 +145,17 @@ class ImprovedMicroTaskAIPipeline { // Updated confidence weights - more focused on AI evaluation this.confidenceConfig = { - semanticWeight: parseFloat(process.env.CONFIDENCE_SEMANTIC_WEIGHT || '0.25'), // Embeddings similarity - suitabilityWeight: parseFloat(process.env.CONFIDENCE_SUITABILITY_WEIGHT || '0.4'), // AI task fit evaluation - consistencyWeight: parseFloat(process.env.CONFIDENCE_CONSISTENCY_WEIGHT || '0.2'), // Cross-validation agreement - reliabilityWeight: parseFloat(process.env.CONFIDENCE_RELIABILITY_WEIGHT || '0.15'), // Tool quality indicators + semanticWeight: parseFloat(process.env.CONFIDENCE_SEMANTIC_WEIGHT || '0.3'), // Embeddings similarity + suitabilityWeight: parseFloat(process.env.CONFIDENCE_SUITABILITY_WEIGHT || '0.7'), // AI task fit evaluation + consistencyWeight: 0, + reliabilityWeight: 0, minimumThreshold: parseInt(process.env.CONFIDENCE_MINIMUM_THRESHOLD || '40', 10), mediumThreshold: parseInt(process.env.CONFIDENCE_MEDIUM_THRESHOLD || '60', 10), highThreshold: parseInt(process.env.CONFIDENCE_HIGH_THRESHOLD || '80', 10) }; - - console.log('[AI PIPELINE] Enhanced confidence scoring enabled:', { - weights: `Semantic:${this.confidenceConfig.semanticWeight} Suitability:${this.confidenceConfig.suitabilityWeight} Consistency:${this.confidenceConfig.consistencyWeight} Reliability:${this.confidenceConfig.reliabilityWeight}`, + + console.log('[AI PIPELINE] Simplified confidence scoring enabled:', { + weights: `Semantic:${this.confidenceConfig.semanticWeight} Suitability:${this.confidenceConfig.suitabilityWeight}`, thresholds: `${this.confidenceConfig.minimumThreshold}/${this.confidenceConfig.mediumThreshold}/${this.confidenceConfig.highThreshold}` }); } @@ -709,197 +708,107 @@ ${JSON.stringify(conceptsToSend, null, 2)}`; ): ConfidenceMetrics { // 1. Semantic Relevance: Real embeddings similarity score - const semanticRelevance = context.embeddingsSimilarities.has(tool.name) ? - Math.round(context.embeddingsSimilarities.get(tool.name)! * 100) : 50; + const rawSemanticRelevance = context.embeddingsSimilarities.has(tool.name) ? + context.embeddingsSimilarities.get(tool.name)! * 100 : 50; - // 2. Task Suitability: AI-determined fitness for specific task - const taskSuitability = Math.round(taskRelevance); + // 2. Task Suitability: Enhanced with phase-awareness for workflow mode + let enhancedTaskSuitability = taskRelevance; - // 3. 
Methodological Consistency: Cross-validation between micro-tasks - const methodologicalConsistency = this.calculateCrossValidationScore(tool.name, context); + if (context.mode === 'workflow') { + // In workflow mode, boost score if tool is well-matched to its assigned phase + const toolSelection = context.selectedTools?.find(st => st.tool.name === tool.name); + if (toolSelection && tool.phases && tool.phases.includes(toolSelection.phase)) { + // Boost for phase alignment (but cap at 100) + const phaseBonus = Math.min(15, 100 - taskRelevance); + enhancedTaskSuitability = Math.min(100, taskRelevance + phaseBonus); + + console.log(`[CONFIDENCE] Phase bonus for ${tool.name}: ${taskRelevance} -> ${enhancedTaskSuitability} (phase: ${toolSelection.phase})`); + } + } - // 4. Tool Reliability: Quality indicators - const toolReliability = this.calculateToolReliability(tool); - - // Debug logging - console.log(`[CONFIDENCE DEBUG] ${tool.name}:`, { - semantic: semanticRelevance, - taskSuitability: taskSuitability, - consistency: methodologicalConsistency, - reliability: toolReliability, - hasEmbeddingsSimilarity: context.embeddingsSimilarities.has(tool.name), - rawTaskRelevance: taskRelevance - }); - - // Calculate weighted overall score + // Simple weighted combination - no artificial scaling const overall = ( - semanticRelevance * this.confidenceConfig.semanticWeight + - taskSuitability * this.confidenceConfig.suitabilityWeight + - methodologicalConsistency * this.confidenceConfig.consistencyWeight + - toolReliability * this.confidenceConfig.reliabilityWeight + rawSemanticRelevance * this.confidenceConfig.semanticWeight + + enhancedTaskSuitability * this.confidenceConfig.suitabilityWeight ); const uncertaintyFactors = this.identifySpecificUncertaintyFactors(tool, context, limitations, overall); const strengthIndicators = this.identifySpecificStrengthIndicators(tool, context, overall); + console.log(`[CONFIDENCE DEBUG] ${tool.name}:`, { + rawSemantic: Math.round(rawSemanticRelevance), + rawTaskSuitability: taskRelevance, + enhancedTaskSuitability: Math.round(enhancedTaskSuitability), + overall: Math.round(overall), + mode: context.mode + }); + return { overall: Math.round(overall), - semanticRelevance: Math.round(semanticRelevance), - taskSuitability: Math.round(taskSuitability), - methodologicalConsistency: Math.round(methodologicalConsistency), - toolReliability: Math.round(toolReliability), + semanticRelevance: Math.round(rawSemanticRelevance), + taskSuitability: Math.round(enhancedTaskSuitability), + methodologicalConsistency: 0, uncertaintyFactors, strengthIndicators }; } - private calculateCrossValidationScore(toolName: string, context: AnalysisContext): number { - // Look for entries where this tool was mentioned across different phases - const relevantEntries = context.auditTrail.filter(entry => - entry.phase === 'micro-task' || entry.phase === 'selection' - ); - - let toolMentions = 0; - let positiveEvaluations = 0; - let confidenceSum = 0; - - relevantEntries.forEach(entry => { - let toolFound = false; - - // Check various ways the tool might be referenced in output - if (entry.output && typeof entry.output === 'object') { - // Check selectedTools arrays - if (Array.isArray(entry.output.selectedTools) && - entry.output.selectedTools.includes(toolName)) { - toolFound = true; - } - - // Check finalToolNames arrays - if (Array.isArray(entry.output.finalToolNames) && - entry.output.finalToolNames.includes(toolName)) { - toolFound = true; - } - - // Check toolName in individual evaluation - if 
(entry.output.toolName === toolName) { - toolFound = true; - } - } - - if (toolFound) { - toolMentions++; - confidenceSum += entry.confidence; - - // Consider it positive if confidence >= 60 - if (entry.confidence >= 60) { - positiveEvaluations++; - } - } - }); - - console.log(`[AI PIPELINE] Cross-validation for ${toolName}: ${toolMentions} mentions, ${positiveEvaluations} positive, avg confidence: ${toolMentions > 0 ? Math.round(confidenceSum / toolMentions) : 0}`); - - if (toolMentions === 0) { - return 60; // Default when no cross-validation data available - } - - if (toolMentions === 1) { - // Single mention - use confidence directly but cap it - return Math.min(85, Math.max(40, confidenceSum)); - } - - // Multiple mentions - calculate agreement ratio - const agreementRatio = positiveEvaluations / toolMentions; - const avgConfidence = confidenceSum / toolMentions; - - // Combine agreement ratio with average confidence - const crossValidationScore = (agreementRatio * 0.7 + (avgConfidence / 100) * 0.3) * 100; - - return Math.round(Math.min(95, Math.max(30, crossValidationScore))); - } - - // NEW: Calculate tool reliability based on objective indicators - private calculateToolReliability(tool: any): number { - let reliability = 50; // Base score - - // Documentation availability - if (tool.knowledgebase === true) reliability += 25; - - // Active maintenance (hosted tools are typically maintained) - if (isToolHosted(tool)) reliability += 20; - - // Community support (open source often has community) - if (tool.license && tool.license !== 'Proprietary') reliability += 10; - - // Skill level appropriateness (not too complex, not too simple) - if (tool.skillLevel === 'intermediate' || tool.skillLevel === 'advanced') reliability += 10; - else if (tool.skillLevel === 'expert') reliability -= 5; // May be overcomplicated - - // Multi-platform support (more versatile) - if (tool.platforms && tool.platforms.length > 1) reliability += 5; - - return Math.min(100, reliability); - } - - // NEW: Identify specific uncertainty factors based on analysis private identifySpecificUncertaintyFactors(tool: any, context: AnalysisContext, limitations: string[], confidence: number): string[] { const factors: string[] = []; - // Add AI-identified limitations + // Add AI-identified limitations first (most specific) if (limitations && limitations.length > 0) { - factors.push(...limitations.slice(0, 3)); // Limit to top 3 + factors.push(...limitations.slice(0, 2)); // Limit to top 2 to leave room for others } // Low semantic similarity const similarity = context.embeddingsSimilarities.get(tool.name) || 0.5; - if (similarity < 0.4) { + if (similarity < 0.7) { factors.push('Geringe semantische Ähnlichkeit zur Anfrage - Tool-Beschreibung passt möglicherweise nicht optimal'); } - // Skill level mismatch - if (tool.skillLevel === 'expert' && /schnell|rapid|triage|urgent/i.test(context.userQuery)) { - factors.push('Experten-Tool für Eilszenario - möglicherweise zu komplex für schnelle Antworten'); + // Skill level vs scenario complexity mismatch + if (tool.skillLevel === 'expert' && /schnell|rapid|triage|urgent|sofort/i.test(context.userQuery)) { + factors.push('Experten-Tool für zeitkritisches Szenario - Setup und Einarbeitung könnten zu lange dauern'); } - if (tool.skillLevel === 'novice' && /komplex|erweitert|tiefgehend|advanced/i.test(context.userQuery)) { - factors.push('Einsteiger-Tool für komplexes Szenario - könnte funktionale Einschränkungen haben'); + if (tool.skillLevel === 'novice' && 
/komplex|erweitert|tiefgehend|advanced|forensisch/i.test(context.userQuery)) { + factors.push('Einsteiger-Tool für komplexe Analyse - könnte funktionale Limitierungen haben'); } - // Access limitations + // Platform availability concerns + if (tool.platforms && tool.platforms.length === 1 && tool.platforms[0] === 'Windows' && /linux|unix|server/i.test(context.userQuery)) { + factors.push('Nur Windows-Tool bei möglicher Linux/Server-Umgebung - Plattform-Inkompatibilität'); + } + + // Access and deployment concerns if (tool.type === 'software' && !isToolHosted(tool) && tool.accessType === 'download') { - factors.push('Installation erforderlich - nicht sofort verfügbar ohne Setup'); + factors.push('Installation und Setup erforderlich'); } - // Cross-validation disagreement - const crossValidation = this.calculateCrossValidationScore(tool.name, context); - if (crossValidation < 50) { - factors.push('Uneinheitliche Bewertung in verschiedenen Analyseschritten - Empfehlung nicht eindeutig'); + // License restrictions + if (tool.license === 'Proprietary') { + factors.push('Kommerzielle Software - Lizenzkosten und rechtliche Beschränkungen zu beachten'); } - return factors.slice(0, 4); // Limit to 4 most important factors + // Low overall confidence warning + if (confidence < 60) { + factors.push('Moderate Gesamtbewertung - alternative Ansätze sollten ebenfalls betrachtet werden'); + } + + return factors.slice(0, 4); // Limit to 4 most relevant factors } // NEW: Identify specific strength indicators private identifySpecificStrengthIndicators(tool: any, context: AnalysisContext, confidence: number): string[] { const indicators: string[] = []; - // High confidence overall - if (confidence >= this.confidenceConfig.highThreshold) { - indicators.push('Hohe Gesamtbewertung durch mehrfache Validierung'); - } - // High semantic similarity const similarity = context.embeddingsSimilarities.get(tool.name) || 0.5; if (similarity >= 0.7) { indicators.push('Sehr gute semantische Übereinstimmung mit Ihrer Anfrage'); } - // Strong cross-validation - const crossValidation = this.calculateCrossValidationScore(tool.name, context); - if (crossValidation >= 80) { - indicators.push('Konsistente Empfehlung über verschiedene Analyseschritte hinweg'); - } - // Quality indicators if (tool.knowledgebase === true) { indicators.push('Umfassende Dokumentation und Wissensbasis verfügbar'); @@ -985,7 +894,7 @@ ${JSON.stringify(conceptsToSend, null, 2)}`; const prompt = getPrompt('phaseToolSelection', context.userQuery, phase, phaseTools); - const result = await this.callMicroTaskAI(prompt, context, 800); + const result = await this.callMicroTaskAI(prompt, context, 1000); if (result.success) { const selections = this.safeParseJSON(result.content, []); @@ -998,16 +907,30 @@ ${JSON.stringify(conceptsToSend, null, 2)}`; validSelections.forEach((sel: any) => { const tool = phaseTools.find((t: any) => t.name === sel.toolName); if (tool) { - this.addToolToSelection(context, tool, phase.id, sel.priority, sel.justification); + // Ensure taskRelevance is a number + const taskRelevance = typeof sel.taskRelevance === 'number' ? 
+ sel.taskRelevance : parseInt(String(sel.taskRelevance)) || 70; + + // Derive priority automatically from score + const priority = this.derivePriorityFromScore(taskRelevance); + + this.addToolToSelection(context, tool, phase.id, priority, sel.justification, taskRelevance, sel.limitations); } }); this.addAuditEntry(context, 'micro-task', 'phase-tool-selection', { phase: phase.id, availableTools: phaseTools.length }, - { validSelections: validSelections.length, selectedTools: validSelections.map(s => s.toolName) }, + { + validSelections: validSelections.length, + selectedTools: validSelections.map(s => ({ + name: s.toolName, + taskRelevance: s.taskRelevance, + derivedPriority: this.derivePriorityFromScore(s.taskRelevance) + })) + }, validSelections.length > 0 ? 75 : 30, Date.now() - result.processingTimeMs, - { phaseName: phase.name } + { phaseName: phase.name, comparativeEvaluation: true, priorityDerived: true } ); } } @@ -1016,56 +939,46 @@ ${JSON.stringify(conceptsToSend, null, 2)}`; } private async evaluateSpecificTool(context: AnalysisContext, tool: any, rank: number): Promise { - const prompt = getPrompt('toolEvaluation', context.userQuery, tool, rank); + // Get existing task relevance from previous phase selection + const existingSelection = context.selectedTools?.find(st => st.tool.name === tool.name); + const taskRelevance = existingSelection?.taskRelevance || 70; + const priority = this.derivePriorityFromScore(taskRelevance); + + const prompt = getPrompt('toolEvaluation', context.userQuery, tool, rank, taskRelevance); - const result = await this.callMicroTaskAI(prompt, context, 1200); + const result = await this.callMicroTaskAI(prompt, context, 1000); if (result.success) { const evaluation = this.safeParseJSON(result.content, { - suitability_score: 'medium', - task_relevance: '', detailed_explanation: 'Evaluation failed', implementation_approach: '', pros: [], cons: [], - limitations: [], alternatives: '' }); - // Debug logging to see what we're getting - console.log(`[AI PIPELINE] Tool ${tool.name} evaluation:`, { - taskRelevance: evaluation.task_relevance, - suitabilityScore: evaluation.suitability_score, - limitationsCount: evaluation.limitations?.length || 0 - }); - - // Ensure task_relevance is a number - const taskRelevance = typeof evaluation.task_relevance === 'number' ? - evaluation.task_relevance : - parseInt(String(evaluation.task_relevance)) || 70; - - // Store enhanced evaluation data + // Store evaluation without re-scoring this.addToolToSelection(context, { ...tool, evaluation: { ...evaluation, - task_relevance: taskRelevance, // Ensure it's stored as number - rank + rank, + task_relevance: taskRelevance } - }, 'evaluation', evaluation.suitability_score, evaluation.detailed_explanation, - taskRelevance, evaluation.limitations); + }, 'evaluation', priority, evaluation.detailed_explanation, + taskRelevance, existingSelection?.limitations); this.addAuditEntry(context, 'micro-task', 'tool-evaluation', - { toolName: tool.name, rank }, + { toolName: tool.name, rank, existingTaskRelevance: taskRelevance, derivedPriority: priority }, { - suitabilityScore: evaluation.suitability_score, - taskRelevance: taskRelevance, // Use the cleaned number hasExplanation: !!evaluation.detailed_explanation, - limitationsIdentified: evaluation.limitations?.length || 0 + hasImplementationApproach: !!evaluation.implementation_approach, + prosCount: evaluation.pros?.length || 0, + consCount: evaluation.cons?.length || 0 }, - evaluation.suitability_score === 'high' ? 
85 : evaluation.suitability_score === 'medium' ? 70 : 50, + 70, Date.now() - result.processingTimeMs, - { toolType: tool.type, taskRelevanceScore: taskRelevance } + { toolType: tool.type, explanationOnly: true, priorityDerived: true } ); } @@ -1173,6 +1086,12 @@ ${JSON.stringify(conceptsToSend, null, 2)}`; } } + private derivePriorityFromScore(taskRelevance: number): string { + if (taskRelevance >= 80) return 'high'; + if (taskRelevance >= 60) return 'medium'; + return 'low'; + } + async processQuery(userQuery: string, mode: string): Promise { const startTime = Date.now(); let completeTasks = 0; @@ -1323,8 +1242,7 @@ ${JSON.stringify(conceptsToSend, null, 2)}`; components: { semantic: confidence.semanticRelevance, suitability: confidence.taskSuitability, - consistency: confidence.methodologicalConsistency, - reliability: confidence.toolReliability + consistency: confidence.methodologicalConsistency } }, confidence.overall,
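
Note on the simplified scoring path: the pieces changed above (the 0.5/0.5 env weights, the phase bonus in calculateEnhancedConfidence, and the new derivePriorityFromScore helper) combine into a two-component score. The following is a minimal, self-contained TypeScript sketch of that combination for reviewers of this patch; scoreTool and SimplifiedConfidence are illustrative names that do not exist in the codebase, and the defaults assume the .env.example values above rather than the in-code fallbacks.

```typescript
// Minimal sketch of the simplified two-component confidence score.
// Illustrative only; the real logic lives in
// ImprovedMicroTaskAIPipeline.calculateEnhancedConfidence / derivePriorityFromScore.

interface SimplifiedConfidence {
  overall: number;            // 0-100 weighted combination
  semanticRelevance: number;  // embeddings similarity, scaled to 0-100
  taskSuitability: number;    // AI taskRelevance, optionally phase-boosted
  priority: 'high' | 'medium' | 'low';
}

// Assumed defaults taken from .env.example in this patch.
const SEMANTIC_WEIGHT = parseFloat(process.env.CONFIDENCE_SEMANTIC_WEIGHT || '0.5');
const SUITABILITY_WEIGHT = parseFloat(process.env.CONFIDENCE_SUITABILITY_WEIGHT || '0.5');

// Same thresholds as derivePriorityFromScore: applied to the raw AI score.
function derivePriority(taskRelevance: number): 'high' | 'medium' | 'low' {
  if (taskRelevance >= 80) return 'high';
  if (taskRelevance >= 60) return 'medium';
  return 'low';
}

function scoreTool(
  embeddingSimilarity: number | undefined, // cosine similarity in [0, 1], if known
  taskRelevance: number,                   // 0-100 from the phase-selection prompt
  phaseMatches: boolean                    // workflow mode: tool's phases include the assigned phase
): SimplifiedConfidence {
  // Neutral fallback of 50 when no embeddings similarity is available.
  const semanticRelevance = embeddingSimilarity !== undefined ? embeddingSimilarity * 100 : 50;

  // Phase bonus mirrors the patch: at most +15, capped at 100.
  const taskSuitability = phaseMatches
    ? Math.min(100, taskRelevance + Math.min(15, 100 - taskRelevance))
    : taskRelevance;

  const overall = Math.round(
    semanticRelevance * SEMANTIC_WEIGHT + taskSuitability * SUITABILITY_WEIGHT
  );

  return {
    overall,
    semanticRelevance: Math.round(semanticRelevance),
    taskSuitability: Math.round(taskSuitability),
    priority: derivePriority(taskRelevance),
  };
}

// Example: similarity 0.82, AI relevance 75, phase match -> suitability 90, overall 86, priority 'medium'.
console.log(scoreTool(0.82, 75, true));
```

With equal 0.5 weights, a perfect score on one component still needs at least 60 on the other to reach the 80-point high-confidence band, so neither embeddings similarity nor the AI's taskRelevance can carry a recommendation on its own.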