diff --git a/.env.example b/.env.example
index 2ad295d..2cddc9b 100644
--- a/.env.example
+++ b/.env.example
@@ -190,23 +190,20 @@ FORENSIC_AUDIT_RETENTION_HOURS=24
FORENSIC_AUDIT_MAX_ENTRIES=50
# ============================================================================
-# 10. CONFIDENCE SCORING SYSTEM (Enhancement 2)
+# 10. ENHANCED CONFIDENCE SCORING SYSTEM
# ============================================================================
# Confidence component weights (must sum to 1.0)
-CONFIDENCE_EMBEDDINGS_WEIGHT=0.3 # Weight for vector similarity quality
-CONFIDENCE_CONSENSUS_WEIGHT=0.25 # Weight for micro-task agreement
-CONFIDENCE_DOMAIN_MATCH_WEIGHT=0.25 # Weight for domain alignment
-CONFIDENCE_FRESHNESS_WEIGHT=0.2 # Weight for tool freshness/maintenance
+CONFIDENCE_SEMANTIC_WEIGHT=0.25 # Weight for vector similarity quality
+CONFIDENCE_SUITABILITY_WEIGHT=0.4 # Weight for AI-determined task fitness
+CONFIDENCE_CONSISTENCY_WEIGHT=0.2 # Weight for cross-validation agreement
+CONFIDENCE_RELIABILITY_WEIGHT=0.15 # Weight for tool quality indicators
# Confidence thresholds (0-100)
CONFIDENCE_MINIMUM_THRESHOLD=40 # Below this = weak recommendation
CONFIDENCE_MEDIUM_THRESHOLD=60 # 40-59 = weak, 60-79 = moderate
CONFIDENCE_HIGH_THRESHOLD=80 # 80+ = strong recommendation
-# Domain keywords for confidence scoring (domain:keyword1,keyword2|domain:keyword3,keyword4)
-CONFIDENCE_DOMAIN_KEYWORDS="incident-response:incident,breach,attack,compromise,response|malware-analysis:malware,virus,trojan,reverse,analysis|network-forensics:network,traffic,packet,pcap,wireshark|mobile-forensics:mobile,android,ios,phone,app|cloud-forensics:cloud,aws,azure,saas,paas"
-
# ============================================================================
# PERFORMANCE TUNING PRESETS
# ============================================================================
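For reference, the four weights above feed the weighted sum computed in `calculateRecommendationConfidence` later in this diff. A minimal sketch of loading and sanity-checking them (the `loadConfidenceWeights` helper and the sum check are illustrative additions, not part of the codebase):

```typescript
// Sketch: read the four confidence weights; defaults mirror .env.example above.
interface ConfidenceWeights {
  semantic: number;     // CONFIDENCE_SEMANTIC_WEIGHT
  suitability: number;  // CONFIDENCE_SUITABILITY_WEIGHT
  consistency: number;  // CONFIDENCE_CONSISTENCY_WEIGHT
  reliability: number;  // CONFIDENCE_RELIABILITY_WEIGHT
}

function loadConfidenceWeights(env: NodeJS.ProcessEnv = process.env): ConfidenceWeights {
  const weights: ConfidenceWeights = {
    semantic: parseFloat(env.CONFIDENCE_SEMANTIC_WEIGHT || '0.25'),
    suitability: parseFloat(env.CONFIDENCE_SUITABILITY_WEIGHT || '0.4'),
    consistency: parseFloat(env.CONFIDENCE_CONSISTENCY_WEIGHT || '0.2'),
    reliability: parseFloat(env.CONFIDENCE_RELIABILITY_WEIGHT || '0.15'),
  };
  const sum = weights.semantic + weights.suitability + weights.consistency + weights.reliability;
  if (Math.abs(sum - 1.0) > 1e-6) {
    console.warn(`[CONFIDENCE] Component weights sum to ${sum}, expected 1.0`);
  }
  return weights;
}

// With the defaults, component scores of 70/85/60/75 combine to
// 0.25*70 + 0.4*85 + 0.2*60 + 0.15*75 = 74.75, reported as 75.
```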
diff --git a/src/components/AIQueryInterface.astro b/src/components/AIQueryInterface.astro
index 38a5547..3c855fb 100644
--- a/src/components/AIQueryInterface.astro
+++ b/src/components/AIQueryInterface.astro
@@ -785,41 +785,41 @@ class AIQueryInterface {
- 🔍 Ähnlichkeit zur Anfrage
- ${confidence.embeddingsQuality}%
+ 🔍 Semantische Relevanz
+ ${confidence.semanticRelevance}%
- Wie gut die Tool-Beschreibung zu Ihrer Suchanfrage passt (basierend auf Vektor-Ähnlichkeit)
+ Wie gut die Tool-Beschreibung semantisch zu Ihrer Anfrage passt (basierend auf Vektor-Ähnlichkeit)
- 🎯 Domain-Passung
- ${confidence.domainAlignment}%
+ 🎯 Aufgaben-Eignung
+ ${confidence.taskSuitability}%
- Wie gut das Tool-Einsatzgebiet zu Ihrem forensischen Szenario passt
+ KI-bewertete Eignung des Tools für Ihre spezifische forensische Aufgabenstellung
- 🤝 KI-Konsens
- ${confidence.consensus}%
+ 🤝 Methodische Konsistenz
+ ${confidence.methodologicalConsistency}%
- Wie einig sich die verschiedenen KI-Analyseschritte über dieses Tool sind
+ Wie einheitlich verschiedene Analyseschritte dieses Tool bewerten (Kreuzvalidierung)
- 🔄 Aktualität
- ${confidence.freshness}%
+ 🔧 Tool-Zuverlässigkeit
+ ${confidence.toolReliability}%
- Wie aktuell und gut gepflegt das Tool ist (basierend auf Hosting-Status, Knowledge Base, Open Source)
+ Qualitätsindikatoren: Dokumentation, Wartung, Verfügbarkeit und Benutzerfreundlichkeit
@@ -827,7 +827,7 @@ class AIQueryInterface {
${confidence.strengthIndicators && confidence.strengthIndicators.length > 0 ? `
- ✓ Was für dieses Tool spricht:
+ ✓ Stärken dieser Empfehlung:
${confidence.strengthIndicators.slice(0, 3).map(s => `- ${this.sanitizeText(s)}
`).join('')}
@@ -838,7 +838,7 @@ class AIQueryInterface {
${confidence.uncertaintyFactors && confidence.uncertaintyFactors.length > 0 ? `
- ⚠ Unsicherheitsfaktoren:
+ ⚠ Mögliche Einschränkungen:
${confidence.uncertaintyFactors.slice(0, 3).map(f => `- ${this.sanitizeText(f)}
`).join('')}
@@ -847,7 +847,7 @@ class AIQueryInterface {
` : ''}
- Vertrauensscore basiert auf KI-Analyse • Forensisch validiert
+ Mehrstufige KI-Analyse mit Kreuzvalidierung
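The renamed fields map onto the `ConfidenceMetrics` interface introduced in aiPipeline.ts below. A representative payload this markup consumes might look like the following (numbers are illustrative; the two strings are taken from the indicator texts the pipeline can emit):

```typescript
// Illustrative confidence object as read by the template fields above.
const confidence = {
  overall: 74,                    // weighted combination, 0-100
  semanticRelevance: 68,          // "Semantische Relevanz"
  taskSuitability: 85,            // "Aufgaben-Eignung"
  methodologicalConsistency: 67,  // "Methodische Konsistenz"
  toolReliability: 75,            // "Tool-Zuverlässigkeit"
  strengthIndicators: ['Umfassende Dokumentation und Wissensbasis verfügbar'],
  uncertaintyFactors: ['Installation erforderlich - nicht sofort verfügbar ohne Setup'],
};
```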
diff --git a/src/config/prompts.ts b/src/config/prompts.ts
index 112a15e..6b31a49 100644
--- a/src/config/prompts.ts
+++ b/src/config/prompts.ts
@@ -147,7 +147,7 @@ Antworten Sie AUSSCHLIESSLICH mit diesem JSON-Format (kein zusätzlicher Text):
// Tool evaluation prompt
toolEvaluation: (userQuery: string, tool: any, rank: number) => {
- return `Bewerten Sie diese Methode/Tool fallbezogen für das spezifische Problem nach forensischen Qualitätskriterien.
+ return `Sie sind ein DFIR-Experte und bewerten ein forensisches Tool für eine spezifische Aufgabe.
PROBLEM: "${userQuery}"
@@ -155,16 +155,26 @@ TOOL: ${tool.name}
BESCHREIBUNG: ${tool.description}
PLATTFORMEN: ${tool.platforms?.join(', ') || 'N/A'}
SKILL LEVEL: ${tool.skillLevel}
+DOMAINS: ${tool.domains?.join(', ') || 'N/A'}
+TAGS: ${tool.tags?.join(', ') || 'N/A'}
Bewerten Sie nach forensischen Standards und antworten Sie AUSSCHLIESSLICH mit diesem JSON-Format:
{
"suitability_score": "high|medium|low",
+ "task_relevance": 85,
"detailed_explanation": "Detaillierte forensische Begründung warum diese Methode/Tool das Problem löst",
"implementation_approach": "Konkrete methodische Schritte zur korrekten Anwendung für dieses spezifische Problem",
"pros": ["Forensischer Vorteil 1", "Validierter Vorteil 2"],
"cons": ["Methodische Limitation 1", "Potenzielle Schwäche 2"],
+ "limitations": ["Spezifische Einschränkung 1", "Mögliche Problematik 2"],
"alternatives": "Alternative Ansätze falls diese Methode/Tool nicht optimal ist"
-}`;
+}
+
+WICHTIG:
+- task_relevance: Numerischer Wert 0-100 wie gut das Tool für DIESE SPEZIFISCHE Aufgabe geeignet ist
+- limitations: Konkrete Einschränkungen oder Situationen wo das Tool NICHT optimal wäre
+- Berücksichtigen Sie den Skill Level vs. Anfrage-Komplexität
+- Bewerten Sie objektiv, nicht beschönigend`;
},
// Background knowledge selection prompt
@@ -191,7 +201,7 @@ Antworten Sie AUSSCHLIESSLICH mit diesem JSON-Format:
// Final recommendations prompt
finalRecommendations: (isWorkflow: boolean, userQuery: string, selectedToolNames: string[]) => {
const prompt = isWorkflow ?
- `Erstellen Sie eine forensisch fundierte Workflow-Empfehlung basierend auf DFIR-Prinzipien.
+ `Erstellen Sie eine Workflow-Empfehlung basierend auf DFIR-Prinzipien.
SZENARIO: "${userQuery}"
AUSGEWÄHLTE TOOLS: ${selectedToolNames.join(', ') || 'Keine Tools ausgewählt'}
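The pipeline code further down consumes the two new fields. Since a model may return `task_relevance` as a string or omit `limitations`, aiPipeline.ts coerces the values roughly as sketched here (the `normalizeEvaluation` helper and the 0-100 clamp are illustrative; the real code inlines the number coercion and the fallback to 70):

```typescript
// Sketch of normalizing the model's JSON reply for the two new fields.
interface ToolEvaluation {
  suitability_score: 'high' | 'medium' | 'low';
  task_relevance: number;   // 0-100, per the WICHTIG instructions above
  limitations: string[];
  [key: string]: unknown;
}

function normalizeEvaluation(raw: any): ToolEvaluation {
  const relevance =
    typeof raw.task_relevance === 'number'
      ? raw.task_relevance
      : parseInt(String(raw.task_relevance), 10) || 70; // neutral fallback
  return {
    ...raw,
    suitability_score: raw.suitability_score || 'medium',
    task_relevance: Math.max(0, Math.min(100, Math.round(relevance))), // extra clamp, not in the pipeline
    limitations: Array.isArray(raw.limitations) ? raw.limitations : [],
  };
}
```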
diff --git a/src/utils/aiPipeline.ts b/src/utils/aiPipeline.ts
index b1adefa..396fcea 100644
--- a/src/utils/aiPipeline.ts
+++ b/src/utils/aiPipeline.ts
@@ -1,7 +1,7 @@
-// src/utils/aiPipeline.ts - Enhanced with Audit Trail System
+// src/utils/aiPipeline.ts - Enhanced with Proper Confidence Scoring
import { getCompressedToolsDataForAI } from './dataService.js';
-import { embeddingsService, type EmbeddingData } from './embeddings.js';
+import { embeddingsService, type EmbeddingData, type SimilarityResult } from './embeddings.js';
import { AI_PROMPTS, getPrompt } from '../config/prompts.js';
import { isToolHosted } from './toolHelpers.js';
@@ -34,11 +34,11 @@ interface AnalysisResult {
interface AuditEntry {
timestamp: number;
- phase: string; // 'retrieval', 'selection', 'micro-task-N'
- action: string; // 'embeddings-search', 'ai-selection', 'tool-evaluation'
- input: any; // What went into this step
- output: any; // What came out of this step
- confidence: number; // 0-100: How confident we are in this step
+ phase: string;
+ action: string;
+ input: any;
+ output: any;
+ confidence: number;
processingTimeMs: number;
metadata: Record<string, any>;
}
@@ -56,29 +56,27 @@ interface AnalysisContext {
problemAnalysis?: string;
investigationApproach?: string;
criticalConsiderations?: string;
- selectedTools?: Array<{tool: any, phase: string, priority: string, justification?: string}>;
+ selectedTools?: Array<{tool: any, phase: string, priority: string, justification?: string, taskRelevance?: number, limitations?: string[]}>;
backgroundKnowledge?: Array<{concept: any, relevance: string}>;
seenToolNames: Set<string>;
auditTrail: AuditEntry[];
-}
-
-interface SimilarityResult extends EmbeddingData {
- similarity: number;
+
+ // Store actual similarity data from embeddings
+ embeddingsSimilarities: Map<string, number>;
}
interface ConfidenceMetrics {
- overall: number; // 0-100: Combined confidence score
- embeddingsQuality: number; // How well embeddings matched
- domainAlignment: number; // How well tools match scenario domain
- consensus: number; // How much micro-tasks agree
- freshness: number; // How recent/up-to-date the selection is
- uncertaintyFactors: string[]; // What could make this wrong
- strengthIndicators: string[]; // What makes this recommendation strong
+ overall: number; // 0-100: Combined confidence score
+ semanticRelevance: number; // How well tool description matches query (from embeddings)
+ taskSuitability: number; // AI-determined fitness for this specific task
+ methodologicalConsistency: number; // How well different analysis steps agree
+ toolReliability: number; // Indicators of tool quality and maintenance
+ uncertaintyFactors: string[]; // Specific reasons why this might not work
+ strengthIndicators: string[]; // Specific reasons why this is a good choice
}
-
class ImprovedMicroTaskAIPipeline {
private config: AIConfig;
private maxSelectedItems: number;
@@ -105,10 +103,10 @@ class ImprovedMicroTaskAIPipeline {
};
private confidenceConfig: {
- embeddingsWeight: number;
- consensusWeight: number;
- domainMatchWeight: number;
- freshnessWeight: number;
+ semanticWeight: number; // Weight for embeddings similarity
+ suitabilityWeight: number; // Weight for AI task fit evaluation
+ consistencyWeight: number; // Weight for cross-validation agreement
+ reliabilityWeight: number; // Weight for tool quality indicators
minimumThreshold: number;
mediumThreshold: number;
highThreshold: number;
@@ -146,25 +144,19 @@ class ImprovedMicroTaskAIPipeline {
retentionHours: parseInt(process.env.FORENSIC_AUDIT_RETENTION_HOURS || '72', 10)
};
- console.log('[AI PIPELINE] Configuration loaded:', {
- embeddingCandidates: this.embeddingCandidates,
- embeddingSelection: `${this.embeddingSelectionLimit} tools, ${this.embeddingConceptsLimit} concepts`,
- noEmbeddingsLimits: `${this.noEmbeddingsToolLimit || 'unlimited'} tools, ${this.noEmbeddingsConceptLimit || 'unlimited'} concepts`,
- auditEnabled: this.auditConfig.enabled
- });
-
+ // Confidence weights rebalanced to emphasize the AI suitability evaluation
this.confidenceConfig = {
- embeddingsWeight: parseFloat(process.env.CONFIDENCE_EMBEDDINGS_WEIGHT || '0.3'),
- consensusWeight: parseFloat(process.env.CONFIDENCE_CONSENSUS_WEIGHT || '0.25'),
- domainMatchWeight: parseFloat(process.env.CONFIDENCE_DOMAIN_MATCH_WEIGHT || '0.25'),
- freshnessWeight: parseFloat(process.env.CONFIDENCE_FRESHNESS_WEIGHT || '0.2'),
+ semanticWeight: parseFloat(process.env.CONFIDENCE_SEMANTIC_WEIGHT || '0.25'), // Embeddings similarity
+ suitabilityWeight: parseFloat(process.env.CONFIDENCE_SUITABILITY_WEIGHT || '0.4'), // AI task fit evaluation
+ consistencyWeight: parseFloat(process.env.CONFIDENCE_CONSISTENCY_WEIGHT || '0.2'), // Cross-validation agreement
+ reliabilityWeight: parseFloat(process.env.CONFIDENCE_RELIABILITY_WEIGHT || '0.15'), // Tool quality indicators
minimumThreshold: parseInt(process.env.CONFIDENCE_MINIMUM_THRESHOLD || '40', 10),
mediumThreshold: parseInt(process.env.CONFIDENCE_MEDIUM_THRESHOLD || '60', 10),
highThreshold: parseInt(process.env.CONFIDENCE_HIGH_THRESHOLD || '80', 10)
};
- console.log('[AI PIPELINE] Confidence scoring enabled:', {
- weights: `E:${this.confidenceConfig.embeddingsWeight} C:${this.confidenceConfig.consensusWeight} D:${this.confidenceConfig.domainMatchWeight} F:${this.confidenceConfig.freshnessWeight}`,
+ console.log('[AI PIPELINE] Enhanced confidence scoring enabled:', {
+ weights: `Semantic:${this.confidenceConfig.semanticWeight} Suitability:${this.confidenceConfig.suitabilityWeight} Consistency:${this.confidenceConfig.consistencyWeight} Reliability:${this.confidenceConfig.reliabilityWeight}`,
thresholds: `${this.confidenceConfig.minimumThreshold}/${this.confidenceConfig.mediumThreshold}/${this.confidenceConfig.highThreshold}`
});
}
@@ -247,8 +239,8 @@ class ImprovedMicroTaskAIPipeline {
let confidence = 60; // Base confidence
if (selectionRatio > 0.05 && selectionRatio < 0.3) confidence += 20;
- else if (selectionRatio <= 0.05) confidence -= 10; // Too few
- else confidence -= 15; // Too many
+ else if (selectionRatio <= 0.05) confidence -= 10;
+ else confidence -= 15;
if (hasReasoning) confidence += 15;
@@ -357,7 +349,7 @@ class ImprovedMicroTaskAIPipeline {
const possibleTools = toolMatches
.map(match => match.replace(/"/g, ''))
.filter(name => name.length > 2 && !['selectedTools', 'selectedConcepts', 'reasoning'].includes(name))
- .slice(0, 15); // Reasonable limit
+ .slice(0, 15);
if (possibleTools.length > 0) {
console.log(`[AI PIPELINE] Recovered ${possibleTools.length} possible tool names from broken JSON`);
@@ -374,7 +366,7 @@ class ImprovedMicroTaskAIPipeline {
}
}
- private addToolToSelection(context: AnalysisContext, tool: any, phase: string, priority: string, justification?: string): boolean {
+ private addToolToSelection(context: AnalysisContext, tool: any, phase: string, priority: string, justification?: string, taskRelevance?: number, limitations?: string[]): boolean {
context.seenToolNames.add(tool.name);
if (!context.selectedTools) context.selectedTools = [];
@@ -382,18 +374,22 @@ class ImprovedMicroTaskAIPipeline {
tool,
phase,
priority,
- justification
+ justification,
+ taskRelevance,
+ limitations
});
return true;
}
- private async getIntelligentCandidates(userQuery: string, toolsData: any, mode: string) {
+ private async getIntelligentCandidates(userQuery: string, toolsData: any, mode: string, context: AnalysisContext) {
let candidateTools: any[] = [];
let candidateConcepts: any[] = [];
let selectionMethod = 'unknown';
- // WAIT for embeddings initialization if embeddings are enabled
+ // Initialize embeddings similarities storage
+ context.embeddingsSimilarities = new Map();
+
if (process.env.AI_EMBEDDINGS_ENABLED === 'true') {
try {
console.log('[AI PIPELINE] Waiting for embeddings initialization...');
@@ -414,6 +410,11 @@ class ImprovedMicroTaskAIPipeline {
console.log(`[AI PIPELINE] Embeddings found ${similarItems.length} similar items`);
+ // Store actual similarity scores for confidence calculation
+ similarItems.forEach(item => {
+ context.embeddingsSimilarities.set(item.name, item.similarity);
+ });
+
const toolsMap = new Map(toolsData.tools.map((tool: any) => [tool.name, tool]));
const conceptsMap = new Map(toolsData.concepts.map((concept: any) => [concept.name, concept]));
@@ -450,7 +451,7 @@ class ImprovedMicroTaskAIPipeline {
}
if (this.auditConfig.enabled) {
- this.addAuditEntry(null, 'retrieval', 'embeddings-search',
+ this.addAuditEntry(context, 'retrieval', 'embeddings-search',
{ query: userQuery, threshold: this.similarityThreshold, candidates: this.embeddingCandidates },
{
candidatesFound: similarItems.length,
@@ -459,7 +460,8 @@ class ImprovedMicroTaskAIPipeline {
reductionRatio: reductionRatio,
usingEmbeddings: selectionMethod === 'embeddings_candidates',
totalAvailable: totalAvailableTools,
- filtered: similarTools.length
+ filtered: similarTools.length,
+ avgSimilarity: similarItems.length > 0 ? similarItems.reduce((sum, item) => sum + item.similarity, 0) / similarItems.length : 0
},
selectionMethod === 'embeddings_candidates' ? 85 : 60,
embeddingsStart,
@@ -479,7 +481,7 @@ class ImprovedMicroTaskAIPipeline {
}
console.log(`[AI PIPELINE] AI will analyze ${candidateTools.length} candidate tools (method: ${selectionMethod})`);
- const finalSelection = await this.aiSelectionWithFullData(userQuery, candidateTools, candidateConcepts, mode, selectionMethod);
+ const finalSelection = await this.aiSelectionWithFullData(userQuery, candidateTools, candidateConcepts, mode, selectionMethod, context);
return {
tools: finalSelection.selectedTools,
@@ -495,7 +497,8 @@ class ImprovedMicroTaskAIPipeline {
candidateTools: any[],
candidateConcepts: any[],
mode: string,
- selectionMethod: string
+ selectionMethod: string,
+ context: AnalysisContext
) {
const selectionStart = Date.now();
@@ -576,7 +579,7 @@ ${JSON.stringify(conceptsToSend, null, 2)}`;
console.error('[AI PIPELINE] AI selection returned invalid structure:', response.slice(0, 200));
if (this.auditConfig.enabled) {
- this.addAuditEntry(null, 'selection', 'ai-tool-selection-failed',
+ this.addAuditEntry(context, 'selection', 'ai-tool-selection-failed',
{ candidateCount: candidateTools.length, mode, prompt: prompt.slice(0, 200) },
{ error: 'Invalid JSON structure', response: response.slice(0, 200) },
10,
@@ -602,7 +605,7 @@ ${JSON.stringify(conceptsToSend, null, 2)}`;
if (this.auditConfig.enabled) {
const confidence = this.calculateSelectionConfidence(result, candidateTools.length);
- this.addAuditEntry(null, 'selection', 'ai-tool-selection',
+ this.addAuditEntry(context, 'selection', 'ai-tool-selection',
{ candidateCount: candidateTools.length, mode, promptLength: prompt.length },
{
selectedToolCount: result.selectedTools.length,
@@ -626,7 +629,7 @@ ${JSON.stringify(conceptsToSend, null, 2)}`;
console.error('[AI PIPELINE] AI selection failed:', error);
if (this.auditConfig.enabled) {
- this.addAuditEntry(null, 'selection', 'ai-tool-selection-error',
+ this.addAuditEntry(context, 'selection', 'ai-tool-selection-error',
{ candidateCount: candidateTools.length, mode },
{ error: error.message },
5,
@@ -700,38 +703,225 @@ ${JSON.stringify(conceptsToSend, null, 2)}`;
private calculateRecommendationConfidence(
tool: any,
- embeddingsSimilarity: number,
- domainMatch: boolean,
- microTaskAgreement: number,
- context: AnalysisContext
+ context: AnalysisContext,
+ taskRelevance: number = 70,
+ limitations: string[] = []
): ConfidenceMetrics {
- const embeddingsQuality = Math.min(100, embeddingsSimilarity * 100 * 2); // Scale 0.5 similarity to 100%
- const domainAlignment = domainMatch ? 90 : (tool.domains?.length > 0 ? 60 : 30);
- const consensus = Math.min(100, microTaskAgreement * 100);
- const freshness = this.calculateToolFreshness(tool);
+ // 1. Semantic Relevance: Real embeddings similarity score
+ const semanticRelevance = context.embeddingsSimilarities.has(tool.name) ?
+ Math.round(context.embeddingsSimilarities.get(tool.name)! * 100) : 50;
+ // 2. Task Suitability: AI-determined fitness for specific task
+ const taskSuitability = Math.round(taskRelevance);
+
+ // 3. Methodological Consistency: Cross-validation between micro-tasks
+ const methodologicalConsistency = this.calculateCrossValidationScore(tool.name, context);
+
+ // 4. Tool Reliability: Quality indicators
+ const toolReliability = this.calculateToolReliability(tool);
+
+ // Debug logging
+ console.log(`[CONFIDENCE DEBUG] ${tool.name}:`, {
+ semantic: semanticRelevance,
+ taskSuitability: taskSuitability,
+ consistency: methodologicalConsistency,
+ reliability: toolReliability,
+ hasEmbeddingsSimilarity: context.embeddingsSimilarities.has(tool.name),
+ rawTaskRelevance: taskRelevance
+ });
+
+ // Calculate weighted overall score
const overall = (
- embeddingsQuality * this.confidenceConfig.embeddingsWeight +
- domainAlignment * this.confidenceConfig.domainMatchWeight +
- consensus * this.confidenceConfig.consensusWeight +
- freshness * this.confidenceConfig.freshnessWeight
+ semanticRelevance * this.confidenceConfig.semanticWeight +
+ taskSuitability * this.confidenceConfig.suitabilityWeight +
+ methodologicalConsistency * this.confidenceConfig.consistencyWeight +
+ toolReliability * this.confidenceConfig.reliabilityWeight
);
- const uncertaintyFactors = this.identifyUncertaintyFactors(tool, context, overall);
- const strengthIndicators = this.identifyStrengthIndicators(tool, context, overall);
+ const uncertaintyFactors = this.identifySpecificUncertaintyFactors(tool, context, limitations, overall);
+ const strengthIndicators = this.identifySpecificStrengthIndicators(tool, context, overall);
return {
overall: Math.round(overall),
- embeddingsQuality: Math.round(embeddingsQuality),
- domainAlignment: Math.round(domainAlignment),
- consensus: Math.round(consensus),
- freshness: Math.round(freshness),
+ semanticRelevance: Math.round(semanticRelevance),
+ taskSuitability: Math.round(taskSuitability),
+ methodologicalConsistency: Math.round(methodologicalConsistency),
+ toolReliability: Math.round(toolReliability),
uncertaintyFactors,
strengthIndicators
};
}
+ private calculateCrossValidationScore(toolName: string, context: AnalysisContext): number {
+ // Look for entries where this tool was mentioned across different phases
+ const relevantEntries = context.auditTrail.filter(entry =>
+ entry.phase === 'micro-task' || entry.phase === 'selection'
+ );
+
+ let toolMentions = 0;
+ let positiveEvaluations = 0;
+ let confidenceSum = 0;
+
+ relevantEntries.forEach(entry => {
+ let toolFound = false;
+
+ // Check various ways the tool might be referenced in output
+ if (entry.output && typeof entry.output === 'object') {
+ // Check selectedTools arrays
+ if (Array.isArray(entry.output.selectedTools) &&
+ entry.output.selectedTools.includes(toolName)) {
+ toolFound = true;
+ }
+
+ // Check finalToolNames arrays
+ if (Array.isArray(entry.output.finalToolNames) &&
+ entry.output.finalToolNames.includes(toolName)) {
+ toolFound = true;
+ }
+
+ // Check toolName in individual evaluation
+ if (entry.output.toolName === toolName) {
+ toolFound = true;
+ }
+ }
+
+ if (toolFound) {
+ toolMentions++;
+ confidenceSum += entry.confidence;
+
+ // Consider it positive if confidence >= 60
+ if (entry.confidence >= 60) {
+ positiveEvaluations++;
+ }
+ }
+ });
+
+ console.log(`[AI PIPELINE] Cross-validation for ${toolName}: ${toolMentions} mentions, ${positiveEvaluations} positive, avg confidence: ${toolMentions > 0 ? Math.round(confidenceSum / toolMentions) : 0}`);
+
+ if (toolMentions === 0) {
+ return 60; // Default when no cross-validation data available
+ }
+
+ if (toolMentions === 1) {
+ // Single mention - use confidence directly but cap it
+ return Math.min(85, Math.max(40, confidenceSum));
+ }
+
+ // Multiple mentions - calculate agreement ratio
+ const agreementRatio = positiveEvaluations / toolMentions;
+ const avgConfidence = confidenceSum / toolMentions;
+
+ // Combine agreement ratio with average confidence
+ const crossValidationScore = (agreementRatio * 0.7 + (avgConfidence / 100) * 0.3) * 100;
+
+ return Math.round(Math.min(95, Math.max(30, crossValidationScore)));
+ }
+
+ // NEW: Calculate tool reliability based on objective indicators
+ private calculateToolReliability(tool: any): number {
+ let reliability = 50; // Base score
+
+ // Documentation availability
+ if (tool.knowledgebase === true) reliability += 25;
+
+ // Active maintenance (hosted tools are typically maintained)
+ if (isToolHosted(tool)) reliability += 20;
+
+ // Community support (open source often has community)
+ if (tool.license && tool.license !== 'Proprietary') reliability += 10;
+
+ // Skill level appropriateness (not too complex, not too simple)
+ if (tool.skillLevel === 'intermediate' || tool.skillLevel === 'advanced') reliability += 10;
+ else if (tool.skillLevel === 'expert') reliability -= 5; // May be overcomplicated
+
+ // Multi-platform support (more versatile)
+ if (tool.platforms && tool.platforms.length > 1) reliability += 5;
+
+ return Math.min(100, reliability);
+ }
+
+ // NEW: Identify specific uncertainty factors based on analysis
+ private identifySpecificUncertaintyFactors(tool: any, context: AnalysisContext, limitations: string[], confidence: number): string[] {
+ const factors: string[] = [];
+
+ // Add AI-identified limitations
+ if (limitations && limitations.length > 0) {
+ factors.push(...limitations.slice(0, 3)); // Limit to top 3
+ }
+
+ // Low semantic similarity
+ const similarity = context.embeddingsSimilarities.get(tool.name) || 0.5;
+ if (similarity < 0.4) {
+ factors.push('Geringe semantische Ähnlichkeit zur Anfrage - Tool-Beschreibung passt möglicherweise nicht optimal');
+ }
+
+ // Skill level mismatch
+ if (tool.skillLevel === 'expert' && /schnell|rapid|triage|urgent/i.test(context.userQuery)) {
+ factors.push('Experten-Tool für Eilszenario - möglicherweise zu komplex für schnelle Antworten');
+ }
+
+ if (tool.skillLevel === 'novice' && /komplex|erweitert|tiefgehend|advanced/i.test(context.userQuery)) {
+ factors.push('Einsteiger-Tool für komplexes Szenario - könnte funktionale Einschränkungen haben');
+ }
+
+ // Access limitations
+ if (tool.type === 'software' && !isToolHosted(tool) && tool.accessType === 'download') {
+ factors.push('Installation erforderlich - nicht sofort verfügbar ohne Setup');
+ }
+
+ // Cross-validation disagreement
+ const crossValidation = this.calculateCrossValidationScore(tool.name, context);
+ if (crossValidation < 50) {
+ factors.push('Uneinheitliche Bewertung in verschiedenen Analyseschritten - Empfehlung nicht eindeutig');
+ }
+
+ return factors.slice(0, 4); // Limit to 4 most important factors
+ }
+
+ // NEW: Identify specific strength indicators
+ private identifySpecificStrengthIndicators(tool: any, context: AnalysisContext, confidence: number): string[] {
+ const indicators: string[] = [];
+
+ // High confidence overall
+ if (confidence >= this.confidenceConfig.highThreshold) {
+ indicators.push('Hohe Gesamtbewertung durch mehrfache Validierung');
+ }
+
+ // High semantic similarity
+ const similarity = context.embeddingsSimilarities.get(tool.name) || 0.5;
+ if (similarity >= 0.7) {
+ indicators.push('Sehr gute semantische Übereinstimmung mit Ihrer Anfrage');
+ }
+
+ // Strong cross-validation
+ const crossValidation = this.calculateCrossValidationScore(tool.name, context);
+ if (crossValidation >= 80) {
+ indicators.push('Konsistente Empfehlung über verschiedene Analyseschritte hinweg');
+ }
+
+ // Quality indicators
+ if (tool.knowledgebase === true) {
+ indicators.push('Umfassende Dokumentation und Wissensbasis verfügbar');
+ }
+
+ if (isToolHosted(tool)) {
+ indicators.push('Sofort verfügbar über gehostete Lösung - kein Setup erforderlich');
+ }
+
+ // Skill level match
+ if (tool.skillLevel === 'intermediate' || tool.skillLevel === 'advanced') {
+ indicators.push('Ausgewogenes Verhältnis zwischen Funktionalität und Benutzerfreundlichkeit');
+ }
+
+ // Method alignment
+ if (tool.type === 'method' && /methodik|vorgehen|prozess|ansatz/i.test(context.userQuery)) {
+ indicators.push('Methodischer Ansatz passt zu Ihrer prozeduralen Anfrage');
+ }
+
+ return indicators.slice(0, 4); // Limit to 4 most important indicators
+ }
+
private async analyzeScenario(context: AnalysisContext): Promise {
const isWorkflow = context.mode === 'workflow';
const prompt = getPrompt('scenarioAnalysis', isWorkflow, context.userQuery);
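To make the multi-mention branch of `calculateCrossValidationScore` concrete, here is a standalone restatement with a worked example (sketch only; the single- and zero-mention branches above behave differently):

```typescript
// Restates the multi-mention cross-validation formula from calculateCrossValidationScore.
function crossValidationScore(confidences: number[]): number {
  const mentions = confidences.length;                        // assumes >= 2 mentions
  const positive = confidences.filter(c => c >= 60).length;   // "positive" threshold used above
  const agreementRatio = positive / mentions;
  const avgConfidence = confidences.reduce((a, b) => a + b, 0) / mentions;
  const score = (agreementRatio * 0.7 + (avgConfidence / 100) * 0.3) * 100;
  return Math.round(Math.min(95, Math.max(30, score)));
}

// A tool mentioned in three audit entries with confidences 85, 70 and 50 has
// 2/3 positive evaluations and an average confidence of ~68.3, giving
// (0.667 * 0.7 + 0.683 * 0.3) * 100 ≈ 67.
console.log(crossValidationScore([85, 70, 50])); // 67
```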
@@ -833,27 +1023,49 @@ ${JSON.stringify(conceptsToSend, null, 2)}`;
if (result.success) {
const evaluation = this.safeParseJSON(result.content, {
suitability_score: 'medium',
+ task_relevance: '',
detailed_explanation: 'Evaluation failed',
implementation_approach: '',
pros: [],
cons: [],
+ limitations: [],
alternatives: ''
});
+ // Debug logging: inspect what the evaluation actually returned
+ console.log(`[AI PIPELINE] Tool ${tool.name} evaluation:`, {
+ taskRelevance: evaluation.task_relevance,
+ suitabilityScore: evaluation.suitability_score,
+ limitationsCount: evaluation.limitations?.length || 0
+ });
+
+ // Ensure task_relevance is a number
+ const taskRelevance = typeof evaluation.task_relevance === 'number' ?
+ evaluation.task_relevance :
+ parseInt(String(evaluation.task_relevance), 10) || 70;
+
+ // Store enhanced evaluation data
this.addToolToSelection(context, {
...tool,
evaluation: {
...evaluation,
+ task_relevance: taskRelevance, // Ensure it's stored as a number
rank
}
- }, 'evaluation', evaluation.suitability_score);
+ }, 'evaluation', evaluation.suitability_score, evaluation.detailed_explanation,
+ taskRelevance, evaluation.limitations);
this.addAuditEntry(context, 'micro-task', 'tool-evaluation',
{ toolName: tool.name, rank },
- { suitabilityScore: evaluation.suitability_score, hasExplanation: !!evaluation.detailed_explanation },
+ {
+ suitabilityScore: evaluation.suitability_score,
+ taskRelevance: taskRelevance, // Use the cleaned number
+ hasExplanation: !!evaluation.detailed_explanation,
+ limitationsIdentified: evaluation.limitations?.length || 0
+ },
evaluation.suitability_score === 'high' ? 85 : evaluation.suitability_score === 'medium' ? 70 : 50,
Date.now() - result.processingTimeMs,
- { toolType: tool.type }
+ { toolType: tool.type, taskRelevanceScore: taskRelevance }
);
}
@@ -963,28 +1175,31 @@ ${JSON.stringify(conceptsToSend, null, 2)}`;
async processQuery(userQuery: string, mode: string): Promise {
const startTime = Date.now();
- let completedTasks = 0;
+ let completeTasks = 0;
let failedTasks = 0;
this.tempAuditEntries = [];
- console.log(`[AI PIPELINE] Starting ${mode} query processing with context continuity and audit trail`);
+ console.log(`[AI PIPELINE] Starting ${mode} query processing with enhanced confidence scoring`);
try {
const toolsData = await getCompressedToolsDataForAI();
- const filteredData = await this.getIntelligentCandidates(userQuery, toolsData, mode);
const context: AnalysisContext = {
userQuery,
mode,
- filteredData,
+ filteredData: {}, // Will be populated by getIntelligentCandidates
contextHistory: [],
maxContextLength: this.maxContextTokens,
currentContextLength: 0,
seenToolNames: new Set(),
- auditTrail: []
+ auditTrail: [],
+ embeddingsSimilarities: new Map()
};
+ const filteredData = await this.getIntelligentCandidates(userQuery, toolsData, mode, context);
+ context.filteredData = filteredData;
+
this.mergeTemporaryAuditEntries(context);
console.log(`[AI PIPELINE] Starting micro-tasks with ${filteredData.tools.length} tools visible`);
@@ -994,58 +1209,54 @@ ${JSON.stringify(conceptsToSend, null, 2)}`;
{ candidateTools: filteredData.tools.length, candidateConcepts: filteredData.concepts.length },
90,
startTime,
- { auditEnabled: this.auditConfig.enabled }
+ { auditEnabled: this.auditConfig.enabled, confidenceScoringEnabled: true }
);
- // MICRO-TASK SEQUENCE
+ // MICRO-TASK SEQUENCE WITH ENHANCED CONFIDENCE TRACKING
- // Task 1: Scenario/Problem Analysis
const analysisResult = await this.analyzeScenario(context);
- if (analysisResult.success) completedTasks++; else failedTasks++;
+ if (analysisResult.success) completeTasks++; else failedTasks++;
await this.delay(this.microTaskDelay);
- // Task 2: Investigation/Solution Approach
const approachResult = await this.generateApproach(context);
- if (approachResult.success) completedTasks++; else failedTasks++;
+ if (approachResult.success) completeTasks++; else failedTasks++;
await this.delay(this.microTaskDelay);
- // Task 3: Critical Considerations
const considerationsResult = await this.generateCriticalConsiderations(context);
- if (considerationsResult.success) completedTasks++; else failedTasks++;
+ if (considerationsResult.success) completeTasks++; else failedTasks++;
await this.delay(this.microTaskDelay);
- // Task 4: Tool Selection/Evaluation (mode-dependent)
if (mode === 'workflow') {
const phases = toolsData.phases || [];
for (const phase of phases) {
const toolSelectionResult = await this.selectToolsForPhase(context, phase);
- if (toolSelectionResult.success) completedTasks++; else failedTasks++;
+ if (toolSelectionResult.success) completeTasks++; else failedTasks++;
await this.delay(this.microTaskDelay);
}
} else {
const topTools = filteredData.tools.slice(0, 3);
for (let i = 0; i < topTools.length; i++) {
const evaluationResult = await this.evaluateSpecificTool(context, topTools[i], i + 1);
- if (evaluationResult.success) completedTasks++; else failedTasks++;
+ if (evaluationResult.success) completeTasks++; else failedTasks++;
await this.delay(this.microTaskDelay);
}
}
const knowledgeResult = await this.selectBackgroundKnowledge(context);
- if (knowledgeResult.success) completedTasks++; else failedTasks++;
+ if (knowledgeResult.success) completeTasks++; else failedTasks++;
await this.delay(this.microTaskDelay);
const finalResult = await this.generateFinalRecommendations(context);
- if (finalResult.success) completedTasks++; else failedTasks++;
+ if (finalResult.success) completeTasks++; else failedTasks++;
const recommendation = this.buildRecommendation(context, mode, finalResult.content);
this.addAuditEntry(context, 'completion', 'pipeline-end',
- { completedTasks, failedTasks },
+ { completedTasks: completeTasks, failedTasks },
{ finalRecommendation: !!recommendation, auditEntriesGenerated: context.auditTrail.length },
- completedTasks > failedTasks ? 85 : 60,
+ completeTasks > failedTasks ? 85 : 60,
startTime,
- { totalProcessingTimeMs: Date.now() - startTime }
+ { totalProcessingTimeMs: Date.now() - startTime, confidenceScoresGenerated: context.selectedTools?.length || 0 }
);
const processingStats = {
@@ -1054,13 +1265,13 @@ ${JSON.stringify(conceptsToSend, null, 2)}`;
finalSelectedItems: (context.selectedTools?.length || 0) +
(context.backgroundKnowledge?.length || 0),
processingTimeMs: Date.now() - startTime,
- microTasksCompleted: completedTasks,
+ microTasksCompleted: completeTasks,
microTasksFailed: failedTasks,
contextContinuityUsed: true
};
- console.log(`[AI PIPELINE] Completed: ${completedTasks} tasks, Failed: ${failedTasks} tasks`);
- console.log(`[AI PIPELINE] Unique tools selected: ${context.seenToolNames.size}`);
+ console.log(`[AI PIPELINE] Completed: ${completeTasks} tasks, Failed: ${failedTasks} tasks`);
+ console.log(`[AI PIPELINE] Enhanced confidence scores generated: ${context.selectedTools?.length || 0}`);
console.log(`[AI PIPELINE] Audit trail entries: ${context.auditTrail.length}`);
return {
@@ -1080,128 +1291,6 @@ ${JSON.stringify(conceptsToSend, null, 2)}`;
}
}
- private calculateToolFreshness(tool: any): number {
- // Base freshness score
- let freshness = 70; // Default for tools without specific freshness data
-
- // Boost for tools with knowledge base (more maintained)
- if (tool.knowledgebase === true) freshness += 20;
-
- // Boost for hosted tools (actively maintained)
- if (isToolHosted(tool)) freshness += 15;
-
- // Slight boost for open source (community maintained)
- if (tool.license && tool.license !== 'Proprietary') freshness += 5;
-
- return Math.min(100, freshness);
- }
-
- private checkDomainMatch(tool: any, userQuery: string): boolean {
- if (!tool.domains || tool.domains.length === 0) return false;
-
- const queryLower = userQuery.toLowerCase();
-
- // Load domain keywords from environment with fallback
- const domainKeywordsEnv = process.env.CONFIDENCE_DOMAIN_KEYWORDS ||
- 'incident-response:incident,breach,attack,compromise,response|malware-analysis:malware,virus,trojan,reverse,analysis|network-forensics:network,traffic,packet,pcap,wireshark|mobile-forensics:mobile,android,ios,phone,app|cloud-forensics:cloud,aws,azure,saas,paas';
-
- const domainKeywords = domainKeywordsEnv.split('|').reduce((acc, pair) => {
- const [domain, keywords] = pair.split(':');
- if (domain && keywords) {
- acc[domain] = keywords.split(',');
- }
- return acc;
- }, {});
-
- return tool.domains.some(domain => {
- const keywords = domainKeywords[domain] || [domain.replace('-', ' ')];
- return keywords.some(keyword => queryLower.includes(keyword));
- });
- }
-
- private getMicroTaskAgreement(toolName: string, context: AnalysisContext): number {
- // Check how many micro-tasks selected this tool
- const microTaskEntries = context.auditTrail.filter(entry =>
- entry.phase === 'micro-task' &&
- entry.action.includes('selection') &&
- entry.output &&
- typeof entry.output === 'object' &&
- Array.isArray(entry.output.selectedTools) &&
- entry.output.selectedTools.includes(toolName)
- );
-
- const totalMicroTasks = context.auditTrail.filter(entry =>
- entry.phase === 'micro-task' && entry.action.includes('selection')
- ).length;
-
- return totalMicroTasks > 0 ? microTaskEntries.length / totalMicroTasks : 0.8; // Default high agreement
- }
-
- private getEmbeddingsSimilarity(toolName: string, context: AnalysisContext): number {
- // Extract similarity from audit trail embeddings entry
- const embeddingsEntry = context.auditTrail.find(entry =>
- entry.phase === 'retrieval' && entry.action === 'embeddings-search'
- );
-
- if (!embeddingsEntry || !embeddingsEntry.output) return 0.5; // Default medium similarity
-
- // Look for similarity data in the output (implementation specific)
- // This would need to be populated during embeddings search
- return 0.7; // Placeholder - would need actual similarity data from embeddings
- }
-
- private identifyUncertaintyFactors(tool: any, context: AnalysisContext, confidence: number): string[] {
- const factors: string[] = [];
-
- if (confidence < this.confidenceConfig.mediumThreshold) {
- factors.push('Low overall confidence - consider manual validation');
- }
-
- if (!this.checkDomainMatch(tool, context.userQuery)) {
- factors.push('Domain mismatch detected - tool may not be specifically designed for this scenario');
- }
-
- if (tool.skillLevel === 'expert' && /rapid|quick|urgent|triage/i.test(context.userQuery)) {
- factors.push('Expert-level tool for rapid scenario - may be overcomplicated');
- }
-
- if (tool.type === 'software' && !isToolHosted(tool) && !tool.url) {
- factors.push('Limited access information - availability uncertain');
- }
-
- if (tool.skillLevel === 'novice' && /complex|advanced|deep/i.test(context.userQuery)) {
- factors.push('Novice-level tool for complex scenario - may lack required capabilities');
- }
-
- return factors;
- }
-
- private identifyStrengthIndicators(tool: any, context: AnalysisContext, confidence: number): string[] {
- const indicators: string[] = [];
-
- if (confidence >= this.confidenceConfig.highThreshold) {
- indicators.push('High confidence recommendation based on multiple factors');
- }
-
- if (this.checkDomainMatch(tool, context.userQuery)) {
- indicators.push('Strong domain alignment with scenario requirements');
- }
-
- if (tool.knowledgebase === true) {
- indicators.push('Documentation and knowledge base available for guidance');
- }
-
- if (isToolHosted(tool)) {
- indicators.push('Hosted solution available for immediate access');
- }
-
- if (tool.type === 'method' && /methodology|approach|process/i.test(context.userQuery)) {
- indicators.push('Methodological approach matches procedural inquiry');
- }
-
- return indicators;
- }
-
private buildRecommendation(context: AnalysisContext, mode: string, finalContent: string): any {
const isWorkflow = mode === 'workflow';
@@ -1218,13 +1307,12 @@ ${JSON.stringify(conceptsToSend, null, 2)}`;
if (isWorkflow) {
const recommendedToolsWithConfidence = context.selectedTools?.map(st => {
- // Calculate confidence for each tool
+ // Calculate enhanced confidence for each tool
const confidence = this.calculateRecommendationConfidence(
st.tool,
- this.getEmbeddingsSimilarity(st.tool.name, context),
- this.checkDomainMatch(st.tool, context.userQuery),
- this.getMicroTaskAgreement(st.tool.name, context),
- context
+ context,
+ st.taskRelevance || 70,
+ st.limitations || []
);
// Add audit entry for confidence calculation
@@ -1233,15 +1321,15 @@ ${JSON.stringify(conceptsToSend, null, 2)}`;
{
overall: confidence.overall,
components: {
- embeddings: confidence.embeddingsQuality,
- domain: confidence.domainAlignment,
- consensus: confidence.consensus,
- freshness: confidence.freshness
+ semantic: confidence.semanticRelevance,
+ suitability: confidence.taskSuitability,
+ consistency: confidence.methodologicalConsistency,
+ reliability: confidence.toolReliability
}
},
confidence.overall,
Date.now(),
- { uncertaintyCount: confidence.uncertaintyFactors.length }
+ { uncertaintyCount: confidence.uncertaintyFactors.length, strengthCount: confidence.strengthIndicators.length }
);
return {
@@ -1264,10 +1352,9 @@ ${JSON.stringify(conceptsToSend, null, 2)}`;
const recommendedToolsWithConfidence = context.selectedTools?.map(st => {
const confidence = this.calculateRecommendationConfidence(
st.tool,
- this.getEmbeddingsSimilarity(st.tool.name, context),
- this.checkDomainMatch(st.tool, context.userQuery),
- this.getMicroTaskAgreement(st.tool.name, context),
- context
+ context,
+ st.taskRelevance || 70,
+ st.limitations || []
);
this.addAuditEntry(context, 'validation', 'confidence-scoring',
@@ -1278,7 +1365,7 @@ ${JSON.stringify(conceptsToSend, null, 2)}`;
},
confidence.overall,
Date.now(),
- { strengthCount: confidence.strengthIndicators.length }
+ { strengthCount: confidence.strengthIndicators.length, limitationsCount: confidence.uncertaintyFactors.length }
);
return {