confidence updates, content adjustment

This commit is contained in:
overcuriousity 2025-08-05 21:35:38 +02:00
parent 27e64f05ca
commit fe1be323bb
5 changed files with 160 additions and 270 deletions

View File

@ -117,7 +117,7 @@ AI_MAX_CONTEXT_TOKENS=4000
# Maximum tokens per individual AI prompt # Maximum tokens per individual AI prompt
# Larger = more context per call | Smaller = faster responses # Larger = more context per call | Smaller = faster responses
AI_MAX_PROMPT_TOKENS=1500 AI_MAX_PROMPT_TOKENS=2500
# ============================================================================ # ============================================================================
# 6. AUTHENTICATION & AUTHORIZATION (OPTIONAL) # 6. AUTHENTICATION & AUTHORIZATION (OPTIONAL)
@ -190,18 +190,16 @@ FORENSIC_AUDIT_RETENTION_HOURS=24
FORENSIC_AUDIT_MAX_ENTRIES=50 FORENSIC_AUDIT_MAX_ENTRIES=50
# ============================================================================ # ============================================================================
# 10. ENHANCED CONFIDENCE SCORING SYSTEM # 10. SIMPLIFIED CONFIDENCE SCORING SYSTEM
# ============================================================================ # ============================================================================
# Confidence component weights (must sum to 1.0) # Confidence component weights (must sum to 1.0)
CONFIDENCE_SEMANTIC_WEIGHT=0.25 # Weight for vector similarity quality CONFIDENCE_SEMANTIC_WEIGHT=0.5 # Weight for vector similarity quality
CONFIDENCE_SUITABILITY_WEIGHT=0.4 # Weight for AI-determined task fitness CONFIDENCE_SUITABILITY_WEIGHT=0.5 # Weight for AI-determined task fitness
CONFIDENCE_CONSISTENCY_WEIGHT=0.2 # Weight for cross-validation agreement
CONFIDENCE_RELIABILITY_WEIGHT=0.15 # Weight for tool quality indicators
# Confidence thresholds (0-100) # Confidence thresholds (0-100)
CONFIDENCE_MINIMUM_THRESHOLD=40 # Below this = weak recommendation CONFIDENCE_MINIMUM_THRESHOLD=50 # Below this = weak recommendation
CONFIDENCE_MEDIUM_THRESHOLD=60 # 40-59 = weak, 60-79 = moderate CONFIDENCE_MEDIUM_THRESHOLD=70 # 50-69 = weak, 70-79 = moderate
CONFIDENCE_HIGH_THRESHOLD=80 # 80+ = strong recommendation CONFIDENCE_HIGH_THRESHOLD=80 # 80+ = strong recommendation
# ============================================================================ # ============================================================================

View File

@ -756,17 +756,14 @@ class AIQueryInterface {
renderConfidenceTooltip(confidence) { renderConfidenceTooltip(confidence) {
if (!confidence || typeof confidence.overall !== 'number') { if (!confidence || typeof confidence.overall !== 'number') {
console.log('[AI DEBUG] No confidence data or invalid format:', confidence);
return ''; return '';
} }
const confidenceColor = confidence.overall >= 80 ? 'var(--color-accent)' : const confidenceColor = confidence.overall >= 80 ? 'var(--color-accent)' :
confidence.overall >= 60 ? 'var(--color-warning)' : 'var(--color-error)'; confidence.overall >= 60 ? 'var(--color-warning)' : 'var(--color-error)';
const tooltipId = `tooltip-${Math.random().toString(36).substr(2, 9)}`; const tooltipId = `tooltip-${Math.random().toString(36).substr(2, 9)}`;
console.log(`[AI DEBUG] Generating confidence tooltip: ${confidence.overall}% with ID ${tooltipId}`);
return ` return `
<span class="confidence-tooltip-trigger" <span class="confidence-tooltip-trigger"
style="display: inline-flex; align-items: center; gap: 0.125rem; cursor: help; margin-left: 0.25rem;" style="display: inline-flex; align-items: center; gap: 0.125rem; cursor: help; margin-left: 0.25rem;"
@ -789,7 +786,7 @@ class AIQueryInterface {
<strong style="color: var(--color-accent);">${confidence.semanticRelevance}%</strong> <strong style="color: var(--color-accent);">${confidence.semanticRelevance}%</strong>
</div> </div>
<div style="font-size: 0.625rem; color: var(--color-text-secondary); line-height: 1.3;"> <div style="font-size: 0.625rem; color: var(--color-text-secondary); line-height: 1.3;">
Wie gut die Tool-Beschreibung semantisch zu Ihrer Anfrage passt (basierend auf Vektor-Ähnlichkeit) Wie gut die Tool-Beschreibung semantisch zu Ihrer Anfrage passt (Vektor-Ähnlichkeit)
</div> </div>
</div> </div>
@ -802,26 +799,6 @@ class AIQueryInterface {
KI-bewertete Eignung des Tools für Ihre spezifische forensische Aufgabenstellung KI-bewertete Eignung des Tools für Ihre spezifische forensische Aufgabenstellung
</div> </div>
</div> </div>
<div style="background: var(--color-bg-secondary); padding: 0.5rem; border-radius: 0.375rem; border-left: 3px solid var(--color-warning);">
<div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 0.25rem;">
<span style="font-weight: 600; font-size: 0.6875rem;">🤝 Methodische Konsistenz</span>
<strong style="color: var(--color-warning);">${confidence.methodologicalConsistency}%</strong>
</div>
<div style="font-size: 0.625rem; color: var(--color-text-secondary); line-height: 1.3;">
Wie einheitlich verschiedene Analyseschritte dieses Tool bewerten (Kreuzvalidierung)
</div>
</div>
<div style="background: var(--color-bg-secondary); padding: 0.5rem; border-radius: 0.375rem; border-left: 3px solid var(--color-text-secondary);">
<div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 0.25rem;">
<span style="font-weight: 600; font-size: 0.6875rem;">🔧 Tool-Zuverlässigkeit</span>
<strong style="color: var(--color-text);">${confidence.toolReliability}%</strong>
</div>
<div style="font-size: 0.625rem; color: var(--color-text-secondary); line-height: 1.3;">
Qualitätsindikatoren: Dokumentation, Wartung, Verfügbarkeit und Benutzerfreundlichkeit
</div>
</div>
</div> </div>
${confidence.strengthIndicators && confidence.strengthIndicators.length > 0 ? ` ${confidence.strengthIndicators && confidence.strengthIndicators.length > 0 ? `
@ -847,7 +824,7 @@ class AIQueryInterface {
` : ''} ` : ''}
<div style="margin-top: 0.75rem; padding-top: 0.75rem; border-top: 1px solid var(--color-border); font-size: 0.625rem; color: var(--color-text-secondary); text-align: center;"> <div style="margin-top: 0.75rem; padding-top: 0.75rem; border-top: 1px solid var(--color-border); font-size: 0.625rem; color: var(--color-text-secondary); text-align: center;">
Mehrstufige KI-Analyse mit Kreuzvalidierung Forensisch fundierte KI-Analyse
</div> </div>
</div> </div>
</span> </span>

View File

@ -120,61 +120,60 @@ ${aspects}
WICHTIG: Antworten Sie NUR in fließendem deutschen Text ohne Listen oder Markdown. Maximum 120 Wörter.`; WICHTIG: Antworten Sie NUR in fließendem deutschen Text ohne Listen oder Markdown. Maximum 120 Wörter.`;
}, },
// Phase tool selection prompt
phaseToolSelection: (userQuery: string, phase: any, phaseTools: any[]) => { phaseToolSelection: (userQuery: string, phase: any, phaseTools: any[]) => {
return `Wählen Sie 2-3 Methoden/Tools für die Phase "${phase.name}" basierend auf objektiven, fallbezogenen Kriterien. return `Wählen Sie 2-3 Methoden/Tools für die Phase "${phase.name}" und bewerten Sie deren Aufgaben-Eignung VERGLEICHEND.
SZENARIO: "${userQuery}" SZENARIO: "${userQuery}"
SPEZIFISCHE PHASE: ${phase.name} - ${phase.description || 'Forensische Untersuchungsphase'}
VERFÜGBARE TOOLS FÜR ${phase.name.toUpperCase()}: VERFÜGBARE TOOLS FÜR ${phase.name.toUpperCase()}:
${phaseTools.map((tool: any) => `- ${tool.name}: ${tool.description.slice(0, 100)}...`).join('\n')} ${phaseTools.map((tool: any, index: number) => `${index + 1}. ${tool.name}: ${tool.description.slice(0, 150)}...
- Plattformen: ${tool.platforms?.join(', ') || 'N/A'}
- Skill Level: ${tool.skillLevel}
- Tags: ${tool.tags?.join(', ') || 'N/A'}`).join('\n\n')}
Wählen Sie Methoden/Tools nach forensischen Kriterien aus: Bewerten Sie ALLE Tools vergleichend für diese spezifische Aufgabe UND Phase. Wählen Sie die 2-3 besten aus.
- Court admissibility und Chain of Custody Kompatibilität
- Integration in forensische Standard-Workflows
- Reproduzierbarkeit und Dokumentationsqualität
- Objektivität
Antworten Sie AUSSCHLIESSLICH mit diesem JSON-Format (kein zusätzlicher Text): BEWERTUNGSKRITERIEN:
- Wie gut löst das Tool das forensische Problem im SZENARIO-Kontext?
- Wie gut passt es zur spezifischen PHASE "${phase.name}"?
- Wie vergleicht es sich mit den anderen verfügbaren Tools für diese Phase?
Antworten Sie AUSSCHLIESSLICH mit diesem JSON-Format:
[ [
{ {
"toolName": "Exakter Methoden/Tool-Name", "toolName": "Exakter Tool-Name",
"priority": "high|medium|low", "taskRelevance": 85,
"justification": "Objektive Begründung warum diese Methode/Tool für das spezifische Szenario besser geeignet ist" "justification": "Vergleichende Begründung warum dieses Tool für diese Phase und Aufgabe besser/schlechter als die anderen geeignet ist",
"limitations": ["Spezifische Einschränkung 1", "Einschränkung 2"]
} }
]`; ]
},
// Tool evaluation prompt
toolEvaluation: (userQuery: string, tool: any, rank: number) => {
return `Sie sind ein DFIR-Experte und bewerten ein forensisches Tool für eine spezifische Aufgabe.
PROBLEM: "${userQuery}"
TOOL: ${tool.name}
BESCHREIBUNG: ${tool.description}
PLATTFORMEN: ${tool.platforms?.join(', ') || 'N/A'}
SKILL LEVEL: ${tool.skillLevel}
DOMAINS: ${tool.domains?.join(', ') || 'N/A'}
TAGS: ${tool.tags?.join(', ') || 'N/A'}
Bewerten Sie nach forensischen Standards und antworten Sie AUSSCHLIESSLICH mit diesem JSON-Format:
{
"suitability_score": "high|medium|low",
"task_relevance": 85,
"detailed_explanation": "Detaillierte forensische Begründung warum diese Methode/Tool das Problem löst",
"implementation_approach": "Konkrete methodische Schritte zur korrekten Anwendung für dieses spezifische Problem",
"pros": ["Forensischer Vorteil 1", "Validierter Vorteil 2"],
"cons": ["Methodische Limitation 1", "Potenzielle Schwäche 2"],
"limitations": ["Spezifische Einschränkung 1", "Mögliche Problematik 2"],
"alternatives": "Alternative Ansätze falls diese Methode/Tool nicht optimal ist"
}
WICHTIG: WICHTIG:
- task_relevance: Numerischer Wert 0-100 wie gut das Tool für DIESE SPEZIFISCHE Aufgabe geeignet ist - taskRelevance: 0-100 Score basierend auf Szenario-Eignung UND Phasen-Passung im VERGLEICH zu anderen Tools
- limitations: Konkrete Einschränkungen oder Situationen wo das Tool NICHT optimal wäre - Nur die 2-3 BESTEN Tools auswählen und bewerten
- Berücksichtigen Sie den Skill Level vs. Anfrage-Komplexität - justification soll VERGLEICHEND sein ("besser als X weil...", "für diese Phase ideal weil...")`;
- Bewerten Sie objektiv, nicht beschönigend`; },
toolEvaluation: (userQuery: string, tool: any, rank: number, taskRelevance: number) => {
return `Sie sind ein DFIR-Experte. Erklären Sie DETAILLIERT die Anwendung dieses bereits bewerteten Tools.
PROBLEM: "${userQuery}"
TOOL: ${tool.name} (bereits bewertet mit ${taskRelevance}% Aufgaben-Eignung)
BESCHREIBUNG: ${tool.description}
Das Tool wurde bereits als Rang ${rank} für diese Aufgabe bewertet. Erklären Sie nun:
Antworten Sie AUSSCHLIESSLICH mit diesem JSON-Format:
{
"detailed_explanation": "Detaillierte Erklärung warum und wie dieses Tool für diese spezifische Aufgabe eingesetzt wird",
"implementation_approach": "Konkrete Schritt-für-Schritt Anleitung zur korrekten Anwendung",
"pros": ["Spezifischer Vorteil 1", "Spezifischer Vorteil 2"],
"cons": ["Bekannte Limitation 1", "Bekannte Limitation 2"],
"alternatives": "Alternative Ansätze oder Tools falls dieses nicht verfügbar ist"
}
WICHTIG: Keine erneute Bewertung - nur detaillierte Erklärung der bereits bewerteten Eignung.`;
}, },
// Background knowledge selection prompt // Background knowledge selection prompt
@ -229,7 +228,7 @@ export function getPrompt(key: 'scenarioAnalysis', isWorkflow: boolean, userQuer
export function getPrompt(key: 'investigationApproach', isWorkflow: boolean, userQuery: string): string; export function getPrompt(key: 'investigationApproach', isWorkflow: boolean, userQuery: string): string;
export function getPrompt(key: 'criticalConsiderations', isWorkflow: boolean, userQuery: string): string; export function getPrompt(key: 'criticalConsiderations', isWorkflow: boolean, userQuery: string): string;
export function getPrompt(key: 'phaseToolSelection', userQuery: string, phase: any, phaseTools: any[]): string; export function getPrompt(key: 'phaseToolSelection', userQuery: string, phase: any, phaseTools: any[]): string;
export function getPrompt(key: 'toolEvaluation', userQuery: string, tool: any, rank: number): string; export function getPrompt(key: 'toolEvaluation', userQuery: string, tool: any, rank: number, taskRelevance: number): string;
export function getPrompt(key: 'backgroundKnowledgeSelection', userQuery: string, mode: string, selectedToolNames: string[], availableConcepts: any[]): string; export function getPrompt(key: 'backgroundKnowledgeSelection', userQuery: string, mode: string, selectedToolNames: string[], availableConcepts: any[]): string;
export function getPrompt(key: 'finalRecommendations', isWorkflow: boolean, userQuery: string, selectedToolNames: string[]): string; export function getPrompt(key: 'finalRecommendations', isWorkflow: boolean, userQuery: string, selectedToolNames: string[]): string;
export function getPrompt(promptKey: keyof typeof AI_PROMPTS, ...args: any[]): string { export function getPrompt(promptKey: keyof typeof AI_PROMPTS, ...args: any[]): string {

View File

@ -3975,7 +3975,7 @@ tools:
- name: KAPE - name: KAPE
type: software type: software
description: >- description: >-
Kroll Artifact Parser and Extractor revolutioniert Windows-Forensik durch Kroll Artifact Parser and Extractor versucht sich an Windows-Forensik durch
intelligente Ziel-basierte Sammlung. Statt Full-Disk-Images extrahiert intelligente Ziel-basierte Sammlung. Statt Full-Disk-Images extrahiert
KAPE gezielt kritische Artefakte: Registry-Hives, Event-Logs, Prefetch, KAPE gezielt kritische Artefakte: Registry-Hives, Event-Logs, Prefetch,
Browser-Daten, Scheduled-Tasks in Minuten statt Stunden. Die Target-Files Browser-Daten, Scheduled-Tasks in Minuten statt Stunden. Die Target-Files
@ -3983,12 +3983,10 @@ tools:
Besonders clever: Compound-Targets gruppieren zusammengehörige Artefakte Besonders clever: Compound-Targets gruppieren zusammengehörige Artefakte
(z.B. "Browser" sammelt Chrome+Firefox+Edge), die gKAPE-GUI macht es auch (z.B. "Browser" sammelt Chrome+Firefox+Edge), die gKAPE-GUI macht es auch
für Nicht-Techniker zugänglich. Batch-Mode verarbeitet mehrere Images für Nicht-Techniker zugänglich. Batch-Mode verarbeitet mehrere Images
parallel. Output direkt kompatibel zu Timeline-Tools wie Plaso. Die parallel. Output direkt kompatibel zu Timeline-Tools wie Plaso.
ständigen Community-Updates halten mit Windows-Entwicklungen Schritt.
VSS-Processing analysiert Shadow-Copies automatisch. Der VSS-Processing analysiert Shadow-Copies automatisch. Der
Remote-Collection-Mode sammelt über Netzwerk. Kostenlos aber Remote-Collection-Mode sammelt über Netzwerk. Kostenlos (mit Registrierung) aber
Enterprise-Support verfügbar. Der neue Standard für effiziente Enterprise-Support verfügbar.
Windows-Forensik-Triage.
skillLevel: intermediate skillLevel: intermediate
url: https://www.kroll.com/kape url: https://www.kroll.com/kape
icon: 🧰 icon: 🧰
@ -4003,7 +4001,7 @@ tools:
platforms: platforms:
- Windows - Windows
accessType: download accessType: download
license: Freeware license: Proprietary
knowledgebase: false knowledgebase: false
- name: Kibana - name: Kibana
type: software type: software

View File

@ -72,7 +72,6 @@ interface ConfidenceMetrics {
semanticRelevance: number; // How well tool description matches query (from embeddings) semanticRelevance: number; // How well tool description matches query (from embeddings)
taskSuitability: number; // AI-determined fitness for this specific task taskSuitability: number; // AI-determined fitness for this specific task
methodologicalConsistency: number; // How well different analysis steps agree methodologicalConsistency: number; // How well different analysis steps agree
toolReliability: number; // Indicators of tool quality and maintenance
uncertaintyFactors: string[]; // Specific reasons why this might not work uncertaintyFactors: string[]; // Specific reasons why this might not work
strengthIndicators: string[]; // Specific reasons why this is a good choice strengthIndicators: string[]; // Specific reasons why this is a good choice
} }
@ -146,17 +145,17 @@ class ImprovedMicroTaskAIPipeline {
// Updated confidence weights - more focused on AI evaluation // Updated confidence weights - more focused on AI evaluation
this.confidenceConfig = { this.confidenceConfig = {
semanticWeight: parseFloat(process.env.CONFIDENCE_SEMANTIC_WEIGHT || '0.25'), // Embeddings similarity semanticWeight: parseFloat(process.env.CONFIDENCE_SEMANTIC_WEIGHT || '0.3'), // Embeddings similarity
suitabilityWeight: parseFloat(process.env.CONFIDENCE_SUITABILITY_WEIGHT || '0.4'), // AI task fit evaluation suitabilityWeight: parseFloat(process.env.CONFIDENCE_SUITABILITY_WEIGHT || '0.7'), // AI task fit evaluation
consistencyWeight: parseFloat(process.env.CONFIDENCE_CONSISTENCY_WEIGHT || '0.2'), // Cross-validation agreement consistencyWeight: 0,
reliabilityWeight: parseFloat(process.env.CONFIDENCE_RELIABILITY_WEIGHT || '0.15'), // Tool quality indicators reliabilityWeight: 0,
minimumThreshold: parseInt(process.env.CONFIDENCE_MINIMUM_THRESHOLD || '40', 10), minimumThreshold: parseInt(process.env.CONFIDENCE_MINIMUM_THRESHOLD || '40', 10),
mediumThreshold: parseInt(process.env.CONFIDENCE_MEDIUM_THRESHOLD || '60', 10), mediumThreshold: parseInt(process.env.CONFIDENCE_MEDIUM_THRESHOLD || '60', 10),
highThreshold: parseInt(process.env.CONFIDENCE_HIGH_THRESHOLD || '80', 10) highThreshold: parseInt(process.env.CONFIDENCE_HIGH_THRESHOLD || '80', 10)
}; };
console.log('[AI PIPELINE] Enhanced confidence scoring enabled:', { console.log('[AI PIPELINE] Simplified confidence scoring enabled:', {
weights: `Semantic:${this.confidenceConfig.semanticWeight} Suitability:${this.confidenceConfig.suitabilityWeight} Consistency:${this.confidenceConfig.consistencyWeight} Reliability:${this.confidenceConfig.reliabilityWeight}`, weights: `Semantic:${this.confidenceConfig.semanticWeight} Suitability:${this.confidenceConfig.suitabilityWeight}`,
thresholds: `${this.confidenceConfig.minimumThreshold}/${this.confidenceConfig.mediumThreshold}/${this.confidenceConfig.highThreshold}` thresholds: `${this.confidenceConfig.minimumThreshold}/${this.confidenceConfig.mediumThreshold}/${this.confidenceConfig.highThreshold}`
}); });
} }
@ -709,197 +708,107 @@ ${JSON.stringify(conceptsToSend, null, 2)}`;
): ConfidenceMetrics { ): ConfidenceMetrics {
// 1. Semantic Relevance: Real embeddings similarity score // 1. Semantic Relevance: Real embeddings similarity score
const semanticRelevance = context.embeddingsSimilarities.has(tool.name) ? const rawSemanticRelevance = context.embeddingsSimilarities.has(tool.name) ?
Math.round(context.embeddingsSimilarities.get(tool.name)! * 100) : 50; context.embeddingsSimilarities.get(tool.name)! * 100 : 50;
// 2. Task Suitability: AI-determined fitness for specific task // 2. Task Suitability: Enhanced with phase-awareness for workflow mode
const taskSuitability = Math.round(taskRelevance); let enhancedTaskSuitability = taskRelevance;
// 3. Methodological Consistency: Cross-validation between micro-tasks if (context.mode === 'workflow') {
const methodologicalConsistency = this.calculateCrossValidationScore(tool.name, context); // In workflow mode, boost score if tool is well-matched to its assigned phase
const toolSelection = context.selectedTools?.find(st => st.tool.name === tool.name);
if (toolSelection && tool.phases && tool.phases.includes(toolSelection.phase)) {
// Boost for phase alignment (but cap at 100)
const phaseBonus = Math.min(15, 100 - taskRelevance);
enhancedTaskSuitability = Math.min(100, taskRelevance + phaseBonus);
console.log(`[CONFIDENCE] Phase bonus for ${tool.name}: ${taskRelevance} -> ${enhancedTaskSuitability} (phase: ${toolSelection.phase})`);
}
}
// 4. Tool Reliability: Quality indicators // Simple weighted combination - no artificial scaling
const toolReliability = this.calculateToolReliability(tool);
// Debug logging
console.log(`[CONFIDENCE DEBUG] ${tool.name}:`, {
semantic: semanticRelevance,
taskSuitability: taskSuitability,
consistency: methodologicalConsistency,
reliability: toolReliability,
hasEmbeddingsSimilarity: context.embeddingsSimilarities.has(tool.name),
rawTaskRelevance: taskRelevance
});
// Calculate weighted overall score
const overall = ( const overall = (
semanticRelevance * this.confidenceConfig.semanticWeight + rawSemanticRelevance * this.confidenceConfig.semanticWeight +
taskSuitability * this.confidenceConfig.suitabilityWeight + enhancedTaskSuitability * this.confidenceConfig.suitabilityWeight
methodologicalConsistency * this.confidenceConfig.consistencyWeight +
toolReliability * this.confidenceConfig.reliabilityWeight
); );
const uncertaintyFactors = this.identifySpecificUncertaintyFactors(tool, context, limitations, overall); const uncertaintyFactors = this.identifySpecificUncertaintyFactors(tool, context, limitations, overall);
const strengthIndicators = this.identifySpecificStrengthIndicators(tool, context, overall); const strengthIndicators = this.identifySpecificStrengthIndicators(tool, context, overall);
console.log(`[CONFIDENCE DEBUG] ${tool.name}:`, {
rawSemantic: Math.round(rawSemanticRelevance),
rawTaskSuitability: taskRelevance,
enhancedTaskSuitability: Math.round(enhancedTaskSuitability),
overall: Math.round(overall),
mode: context.mode
});
return { return {
overall: Math.round(overall), overall: Math.round(overall),
semanticRelevance: Math.round(semanticRelevance), semanticRelevance: Math.round(rawSemanticRelevance),
taskSuitability: Math.round(taskSuitability), taskSuitability: Math.round(enhancedTaskSuitability),
methodologicalConsistency: Math.round(methodologicalConsistency), methodologicalConsistency: 0,
toolReliability: Math.round(toolReliability),
uncertaintyFactors, uncertaintyFactors,
strengthIndicators strengthIndicators
}; };
} }
private calculateCrossValidationScore(toolName: string, context: AnalysisContext): number {
// Look for entries where this tool was mentioned across different phases
const relevantEntries = context.auditTrail.filter(entry =>
entry.phase === 'micro-task' || entry.phase === 'selection'
);
let toolMentions = 0;
let positiveEvaluations = 0;
let confidenceSum = 0;
relevantEntries.forEach(entry => {
let toolFound = false;
// Check various ways the tool might be referenced in output
if (entry.output && typeof entry.output === 'object') {
// Check selectedTools arrays
if (Array.isArray(entry.output.selectedTools) &&
entry.output.selectedTools.includes(toolName)) {
toolFound = true;
}
// Check finalToolNames arrays
if (Array.isArray(entry.output.finalToolNames) &&
entry.output.finalToolNames.includes(toolName)) {
toolFound = true;
}
// Check toolName in individual evaluation
if (entry.output.toolName === toolName) {
toolFound = true;
}
}
if (toolFound) {
toolMentions++;
confidenceSum += entry.confidence;
// Consider it positive if confidence >= 60
if (entry.confidence >= 60) {
positiveEvaluations++;
}
}
});
console.log(`[AI PIPELINE] Cross-validation for ${toolName}: ${toolMentions} mentions, ${positiveEvaluations} positive, avg confidence: ${toolMentions > 0 ? Math.round(confidenceSum / toolMentions) : 0}`);
if (toolMentions === 0) {
return 60; // Default when no cross-validation data available
}
if (toolMentions === 1) {
// Single mention - use confidence directly but cap it
return Math.min(85, Math.max(40, confidenceSum));
}
// Multiple mentions - calculate agreement ratio
const agreementRatio = positiveEvaluations / toolMentions;
const avgConfidence = confidenceSum / toolMentions;
// Combine agreement ratio with average confidence
const crossValidationScore = (agreementRatio * 0.7 + (avgConfidence / 100) * 0.3) * 100;
return Math.round(Math.min(95, Math.max(30, crossValidationScore)));
}
// NEW: Calculate tool reliability based on objective indicators
private calculateToolReliability(tool: any): number {
let reliability = 50; // Base score
// Documentation availability
if (tool.knowledgebase === true) reliability += 25;
// Active maintenance (hosted tools are typically maintained)
if (isToolHosted(tool)) reliability += 20;
// Community support (open source often has community)
if (tool.license && tool.license !== 'Proprietary') reliability += 10;
// Skill level appropriateness (not too complex, not too simple)
if (tool.skillLevel === 'intermediate' || tool.skillLevel === 'advanced') reliability += 10;
else if (tool.skillLevel === 'expert') reliability -= 5; // May be overcomplicated
// Multi-platform support (more versatile)
if (tool.platforms && tool.platforms.length > 1) reliability += 5;
return Math.min(100, reliability);
}
// NEW: Identify specific uncertainty factors based on analysis
private identifySpecificUncertaintyFactors(tool: any, context: AnalysisContext, limitations: string[], confidence: number): string[] { private identifySpecificUncertaintyFactors(tool: any, context: AnalysisContext, limitations: string[], confidence: number): string[] {
const factors: string[] = []; const factors: string[] = [];
// Add AI-identified limitations // Add AI-identified limitations first (most specific)
if (limitations && limitations.length > 0) { if (limitations && limitations.length > 0) {
factors.push(...limitations.slice(0, 3)); // Limit to top 3 factors.push(...limitations.slice(0, 2)); // Limit to top 2 to leave room for others
} }
// Low semantic similarity // Low semantic similarity
const similarity = context.embeddingsSimilarities.get(tool.name) || 0.5; const similarity = context.embeddingsSimilarities.get(tool.name) || 0.5;
if (similarity < 0.4) { if (similarity < 0.7) {
factors.push('Geringe semantische Ähnlichkeit zur Anfrage - Tool-Beschreibung passt möglicherweise nicht optimal'); factors.push('Geringe semantische Ähnlichkeit zur Anfrage - Tool-Beschreibung passt möglicherweise nicht optimal');
} }
// Skill level mismatch // Skill level vs scenario complexity mismatch
if (tool.skillLevel === 'expert' && /schnell|rapid|triage|urgent/i.test(context.userQuery)) { if (tool.skillLevel === 'expert' && /schnell|rapid|triage|urgent|sofort/i.test(context.userQuery)) {
factors.push('Experten-Tool für Eilszenario - möglicherweise zu komplex für schnelle Antworten'); factors.push('Experten-Tool für zeitkritisches Szenario - Setup und Einarbeitung könnten zu lange dauern');
} }
if (tool.skillLevel === 'novice' && /komplex|erweitert|tiefgehend|advanced/i.test(context.userQuery)) { if (tool.skillLevel === 'novice' && /komplex|erweitert|tiefgehend|advanced|forensisch/i.test(context.userQuery)) {
factors.push('Einsteiger-Tool für komplexes Szenario - könnte funktionale Einschränkungen haben'); factors.push('Einsteiger-Tool für komplexe Analyse - könnte funktionale Limitierungen haben');
} }
// Access limitations // Platform availability concerns
if (tool.platforms && tool.platforms.length === 1 && tool.platforms[0] === 'Windows' && /linux|unix|server/i.test(context.userQuery)) {
factors.push('Nur Windows-Tool bei möglicher Linux/Server-Umgebung - Plattform-Inkompatibilität');
}
// Access and deployment concerns
if (tool.type === 'software' && !isToolHosted(tool) && tool.accessType === 'download') { if (tool.type === 'software' && !isToolHosted(tool) && tool.accessType === 'download') {
factors.push('Installation erforderlich - nicht sofort verfügbar ohne Setup'); factors.push('Installation und Setup erforderlich');
} }
// Cross-validation disagreement // License restrictions
const crossValidation = this.calculateCrossValidationScore(tool.name, context); if (tool.license === 'Proprietary') {
if (crossValidation < 50) { factors.push('Kommerzielle Software - Lizenzkosten und rechtliche Beschränkungen zu beachten');
factors.push('Uneinheitliche Bewertung in verschiedenen Analyseschritten - Empfehlung nicht eindeutig');
} }
return factors.slice(0, 4); // Limit to 4 most important factors // Low overall confidence warning
if (confidence < 60) {
factors.push('Moderate Gesamtbewertung - alternative Ansätze sollten ebenfalls betrachtet werden');
}
return factors.slice(0, 4); // Limit to 4 most relevant factors
} }
// NEW: Identify specific strength indicators // NEW: Identify specific strength indicators
private identifySpecificStrengthIndicators(tool: any, context: AnalysisContext, confidence: number): string[] { private identifySpecificStrengthIndicators(tool: any, context: AnalysisContext, confidence: number): string[] {
const indicators: string[] = []; const indicators: string[] = [];
// High confidence overall
if (confidence >= this.confidenceConfig.highThreshold) {
indicators.push('Hohe Gesamtbewertung durch mehrfache Validierung');
}
// High semantic similarity // High semantic similarity
const similarity = context.embeddingsSimilarities.get(tool.name) || 0.5; const similarity = context.embeddingsSimilarities.get(tool.name) || 0.5;
if (similarity >= 0.7) { if (similarity >= 0.7) {
indicators.push('Sehr gute semantische Übereinstimmung mit Ihrer Anfrage'); indicators.push('Sehr gute semantische Übereinstimmung mit Ihrer Anfrage');
} }
// Strong cross-validation
const crossValidation = this.calculateCrossValidationScore(tool.name, context);
if (crossValidation >= 80) {
indicators.push('Konsistente Empfehlung über verschiedene Analyseschritte hinweg');
}
// Quality indicators // Quality indicators
if (tool.knowledgebase === true) { if (tool.knowledgebase === true) {
indicators.push('Umfassende Dokumentation und Wissensbasis verfügbar'); indicators.push('Umfassende Dokumentation und Wissensbasis verfügbar');
@ -985,7 +894,7 @@ ${JSON.stringify(conceptsToSend, null, 2)}`;
const prompt = getPrompt('phaseToolSelection', context.userQuery, phase, phaseTools); const prompt = getPrompt('phaseToolSelection', context.userQuery, phase, phaseTools);
const result = await this.callMicroTaskAI(prompt, context, 800); const result = await this.callMicroTaskAI(prompt, context, 1000);
if (result.success) { if (result.success) {
const selections = this.safeParseJSON(result.content, []); const selections = this.safeParseJSON(result.content, []);
@ -998,16 +907,30 @@ ${JSON.stringify(conceptsToSend, null, 2)}`;
validSelections.forEach((sel: any) => { validSelections.forEach((sel: any) => {
const tool = phaseTools.find((t: any) => t.name === sel.toolName); const tool = phaseTools.find((t: any) => t.name === sel.toolName);
if (tool) { if (tool) {
this.addToolToSelection(context, tool, phase.id, sel.priority, sel.justification); // Ensure taskRelevance is a number
const taskRelevance = typeof sel.taskRelevance === 'number' ?
sel.taskRelevance : parseInt(String(sel.taskRelevance)) || 70;
// Derive priority automatically from score
const priority = this.derivePriorityFromScore(taskRelevance);
this.addToolToSelection(context, tool, phase.id, priority, sel.justification, taskRelevance, sel.limitations);
} }
}); });
this.addAuditEntry(context, 'micro-task', 'phase-tool-selection', this.addAuditEntry(context, 'micro-task', 'phase-tool-selection',
{ phase: phase.id, availableTools: phaseTools.length }, { phase: phase.id, availableTools: phaseTools.length },
{ validSelections: validSelections.length, selectedTools: validSelections.map(s => s.toolName) }, {
validSelections: validSelections.length,
selectedTools: validSelections.map(s => ({
name: s.toolName,
taskRelevance: s.taskRelevance,
derivedPriority: this.derivePriorityFromScore(s.taskRelevance)
}))
},
validSelections.length > 0 ? 75 : 30, validSelections.length > 0 ? 75 : 30,
Date.now() - result.processingTimeMs, Date.now() - result.processingTimeMs,
{ phaseName: phase.name } { phaseName: phase.name, comparativeEvaluation: true, priorityDerived: true }
); );
} }
} }
@ -1016,56 +939,46 @@ ${JSON.stringify(conceptsToSend, null, 2)}`;
} }
private async evaluateSpecificTool(context: AnalysisContext, tool: any, rank: number): Promise<MicroTaskResult> { private async evaluateSpecificTool(context: AnalysisContext, tool: any, rank: number): Promise<MicroTaskResult> {
const prompt = getPrompt('toolEvaluation', context.userQuery, tool, rank); // Get existing task relevance from previous phase selection
const existingSelection = context.selectedTools?.find(st => st.tool.name === tool.name);
const taskRelevance = existingSelection?.taskRelevance || 70;
const priority = this.derivePriorityFromScore(taskRelevance);
const prompt = getPrompt('toolEvaluation', context.userQuery, tool, rank, taskRelevance);
const result = await this.callMicroTaskAI(prompt, context, 1200); const result = await this.callMicroTaskAI(prompt, context, 1000);
if (result.success) { if (result.success) {
const evaluation = this.safeParseJSON(result.content, { const evaluation = this.safeParseJSON(result.content, {
suitability_score: 'medium',
task_relevance: '',
detailed_explanation: 'Evaluation failed', detailed_explanation: 'Evaluation failed',
implementation_approach: '', implementation_approach: '',
pros: [], pros: [],
cons: [], cons: [],
limitations: [],
alternatives: '' alternatives: ''
}); });
// Debug logging to see what we're getting // Store evaluation without re-scoring
console.log(`[AI PIPELINE] Tool ${tool.name} evaluation:`, {
taskRelevance: evaluation.task_relevance,
suitabilityScore: evaluation.suitability_score,
limitationsCount: evaluation.limitations?.length || 0
});
// Ensure task_relevance is a number
const taskRelevance = typeof evaluation.task_relevance === 'number' ?
evaluation.task_relevance :
parseInt(String(evaluation.task_relevance)) || 70;
// Store enhanced evaluation data
this.addToolToSelection(context, { this.addToolToSelection(context, {
...tool, ...tool,
evaluation: { evaluation: {
...evaluation, ...evaluation,
task_relevance: taskRelevance, // Ensure it's stored as number rank,
rank task_relevance: taskRelevance
} }
}, 'evaluation', evaluation.suitability_score, evaluation.detailed_explanation, }, 'evaluation', priority, evaluation.detailed_explanation,
taskRelevance, evaluation.limitations); taskRelevance, existingSelection?.limitations);
this.addAuditEntry(context, 'micro-task', 'tool-evaluation', this.addAuditEntry(context, 'micro-task', 'tool-evaluation',
{ toolName: tool.name, rank }, { toolName: tool.name, rank, existingTaskRelevance: taskRelevance, derivedPriority: priority },
{ {
suitabilityScore: evaluation.suitability_score,
taskRelevance: taskRelevance, // Use the cleaned number
hasExplanation: !!evaluation.detailed_explanation, hasExplanation: !!evaluation.detailed_explanation,
limitationsIdentified: evaluation.limitations?.length || 0 hasImplementationApproach: !!evaluation.implementation_approach,
prosCount: evaluation.pros?.length || 0,
consCount: evaluation.cons?.length || 0
}, },
evaluation.suitability_score === 'high' ? 85 : evaluation.suitability_score === 'medium' ? 70 : 50, 70,
Date.now() - result.processingTimeMs, Date.now() - result.processingTimeMs,
{ toolType: tool.type, taskRelevanceScore: taskRelevance } { toolType: tool.type, explanationOnly: true, priorityDerived: true }
); );
} }
@ -1173,6 +1086,12 @@ ${JSON.stringify(conceptsToSend, null, 2)}`;
} }
} }
/**
 * Maps a task-relevance score to a coarse priority label.
 *
 * Thresholds: 80 and above is 'high', 60 up to (but excluding) 80 is
 * 'medium', anything lower is 'low'.
 *
 * The parameter is declared as a number, but callers in this class also hand
 * in values taken straight from parsed, untyped AI output. Anything that is
 * not a number at runtime is therefore parsed as a base-10 integer, with 70
 * as the fallback when parsing yields nothing usable — the same default the
 * tool selection code applies before storing a score. Without this, a missing
 * score would be reported as 'low' while the stored selection uses 'medium'.
 *
 * @param taskRelevance - Relevance score; numeric strings are tolerated at runtime.
 * @returns 'high', 'medium' or 'low'.
 */
private derivePriorityFromScore(taskRelevance: number): string {
  // Widen to unknown so the runtime type check below is not optimised away by the declared type.
  const raw: unknown = taskRelevance;
  const score = typeof raw === 'number' ? raw : parseInt(String(raw), 10) || 70;

  if (score >= 80) return 'high';
  if (score >= 60) return 'medium';
  // Also reached for NaN, since every comparison with NaN is false.
  return 'low';
}
async processQuery(userQuery: string, mode: string): Promise<AnalysisResult> { async processQuery(userQuery: string, mode: string): Promise<AnalysisResult> {
const startTime = Date.now(); const startTime = Date.now();
let completeTasks = 0; let completeTasks = 0;
@ -1323,8 +1242,7 @@ ${JSON.stringify(conceptsToSend, null, 2)}`;
components: { components: {
semantic: confidence.semanticRelevance, semantic: confidence.semanticRelevance,
suitability: confidence.taskSuitability, suitability: confidence.taskSuitability,
consistency: confidence.methodologicalConsistency, consistency: confidence.methodologicalConsistency
reliability: confidence.toolReliability
} }
}, },
confidence.overall, confidence.overall,