forensic-ai #4
14
.env.example
14
.env.example
@ -117,7 +117,7 @@ AI_MAX_CONTEXT_TOKENS=4000
|
||||
|
||||
# Maximum tokens per individual AI prompt
|
||||
# Larger = more context per call | Smaller = faster responses
|
||||
AI_MAX_PROMPT_TOKENS=1500
|
||||
AI_MAX_PROMPT_TOKENS=2500
|
||||
|
||||
# ============================================================================
|
||||
# 6. AUTHENTICATION & AUTHORIZATION (OPTIONAL)
|
||||
@ -190,18 +190,16 @@ FORENSIC_AUDIT_RETENTION_HOURS=24
|
||||
FORENSIC_AUDIT_MAX_ENTRIES=50
|
||||
|
||||
# ============================================================================
|
||||
# 10. ENHANCED CONFIDENCE SCORING SYSTEM
|
||||
# 10. SIMPLIFIED CONFIDENCE SCORING SYSTEM
|
||||
# ============================================================================
|
||||
|
||||
# Confidence component weights (must sum to 1.0)
|
||||
CONFIDENCE_SEMANTIC_WEIGHT=0.25 # Weight for vector similarity quality
|
||||
CONFIDENCE_SUITABILITY_WEIGHT=0.4 # Weight for AI-determined task fitness
|
||||
CONFIDENCE_CONSISTENCY_WEIGHT=0.2 # Weight for cross-validation agreement
|
||||
CONFIDENCE_RELIABILITY_WEIGHT=0.15 # Weight for tool quality indicators
|
||||
CONFIDENCE_SEMANTIC_WEIGHT=0.5 # Weight for vector similarity quality
|
||||
CONFIDENCE_SUITABILITY_WEIGHT=0.5 # Weight for AI-determined task fitness
|
||||
|
||||
# Confidence thresholds (0-100)
|
||||
CONFIDENCE_MINIMUM_THRESHOLD=40 # Below this = weak recommendation
|
||||
CONFIDENCE_MEDIUM_THRESHOLD=60 # 40-59 = weak, 60-79 = moderate
|
||||
CONFIDENCE_MINIMUM_THRESHOLD=50 # Below this = weak recommendation
|
||||
CONFIDENCE_MEDIUM_THRESHOLD=70 # 50-69 = weak, 70-79 = moderate
|
||||
CONFIDENCE_HIGH_THRESHOLD=80 # 80+ = strong recommendation
|
||||
|
||||
# ============================================================================
|
||||
|
@ -756,17 +756,14 @@ class AIQueryInterface {
|
||||
|
||||
renderConfidenceTooltip(confidence) {
|
||||
if (!confidence || typeof confidence.overall !== 'number') {
|
||||
console.log('[AI DEBUG] No confidence data or invalid format:', confidence);
|
||||
return '';
|
||||
}
|
||||
|
||||
const confidenceColor = confidence.overall >= 80 ? 'var(--color-accent)' :
|
||||
confidence.overall >= 60 ? 'var(--color-warning)' : 'var(--color-error)';
|
||||
confidence.overall >= 60 ? 'var(--color-warning)' : 'var(--color-error)';
|
||||
|
||||
const tooltipId = `tooltip-${Math.random().toString(36).substr(2, 9)}`;
|
||||
|
||||
console.log(`[AI DEBUG] Generating confidence tooltip: ${confidence.overall}% with ID ${tooltipId}`);
|
||||
|
||||
return `
|
||||
<span class="confidence-tooltip-trigger"
|
||||
style="display: inline-flex; align-items: center; gap: 0.125rem; cursor: help; margin-left: 0.25rem;"
|
||||
@ -789,7 +786,7 @@ class AIQueryInterface {
|
||||
<strong style="color: var(--color-accent);">${confidence.semanticRelevance}%</strong>
|
||||
</div>
|
||||
<div style="font-size: 0.625rem; color: var(--color-text-secondary); line-height: 1.3;">
|
||||
Wie gut die Tool-Beschreibung semantisch zu Ihrer Anfrage passt (basierend auf Vektor-Ähnlichkeit)
|
||||
Wie gut die Tool-Beschreibung semantisch zu Ihrer Anfrage passt (Vektor-Ähnlichkeit)
|
||||
</div>
|
||||
</div>
|
||||
|
||||
@ -802,26 +799,6 @@ class AIQueryInterface {
|
||||
KI-bewertete Eignung des Tools für Ihre spezifische forensische Aufgabenstellung
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div style="background: var(--color-bg-secondary); padding: 0.5rem; border-radius: 0.375rem; border-left: 3px solid var(--color-warning);">
|
||||
<div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 0.25rem;">
|
||||
<span style="font-weight: 600; font-size: 0.6875rem;">🤝 Methodische Konsistenz</span>
|
||||
<strong style="color: var(--color-warning);">${confidence.methodologicalConsistency}%</strong>
|
||||
</div>
|
||||
<div style="font-size: 0.625rem; color: var(--color-text-secondary); line-height: 1.3;">
|
||||
Wie einheitlich verschiedene Analyseschritte dieses Tool bewerten (Kreuzvalidierung)
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div style="background: var(--color-bg-secondary); padding: 0.5rem; border-radius: 0.375rem; border-left: 3px solid var(--color-text-secondary);">
|
||||
<div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 0.25rem;">
|
||||
<span style="font-weight: 600; font-size: 0.6875rem;">🔧 Tool-Zuverlässigkeit</span>
|
||||
<strong style="color: var(--color-text);">${confidence.toolReliability}%</strong>
|
||||
</div>
|
||||
<div style="font-size: 0.625rem; color: var(--color-text-secondary); line-height: 1.3;">
|
||||
Qualitätsindikatoren: Dokumentation, Wartung, Verfügbarkeit und Benutzerfreundlichkeit
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
${confidence.strengthIndicators && confidence.strengthIndicators.length > 0 ? `
|
||||
@ -847,7 +824,7 @@ class AIQueryInterface {
|
||||
` : ''}
|
||||
|
||||
<div style="margin-top: 0.75rem; padding-top: 0.75rem; border-top: 1px solid var(--color-border); font-size: 0.625rem; color: var(--color-text-secondary); text-align: center;">
|
||||
Mehrstufige KI-Analyse mit Kreuzvalidierung
|
||||
Forensisch fundierte KI-Analyse
|
||||
</div>
|
||||
</div>
|
||||
</span>
|
||||
|
@ -120,61 +120,60 @@ ${aspects}
|
||||
WICHTIG: Antworten Sie NUR in fließendem deutschen Text ohne Listen oder Markdown. Maximum 120 Wörter.`;
|
||||
},
|
||||
|
||||
// Phase tool selection prompt
|
||||
phaseToolSelection: (userQuery: string, phase: any, phaseTools: any[]) => {
|
||||
return `Wählen Sie 2-3 Methoden/Tools für die Phase "${phase.name}" basierend auf objektiven, fallbezogenen Kriterien.
|
||||
return `Wählen Sie 2-3 Methoden/Tools für die Phase "${phase.name}" und bewerten Sie deren Aufgaben-Eignung VERGLEICHEND.
|
||||
|
||||
SZENARIO: "${userQuery}"
|
||||
SPEZIFISCHE PHASE: ${phase.name} - ${phase.description || 'Forensische Untersuchungsphase'}
|
||||
|
||||
VERFÜGBARE TOOLS FÜR ${phase.name.toUpperCase()}:
|
||||
${phaseTools.map((tool: any) => `- ${tool.name}: ${tool.description.slice(0, 100)}...`).join('\n')}
|
||||
${phaseTools.map((tool: any, index: number) => `${index + 1}. ${tool.name}: ${tool.description.slice(0, 150)}...
|
||||
- Plattformen: ${tool.platforms?.join(', ') || 'N/A'}
|
||||
- Skill Level: ${tool.skillLevel}
|
||||
- Tags: ${tool.tags?.join(', ') || 'N/A'}`).join('\n\n')}
|
||||
|
||||
Wählen Sie Methoden/Tools nach forensischen Kriterien aus:
|
||||
- Court admissibility und Chain of Custody Kompatibilität
|
||||
- Integration in forensische Standard-Workflows
|
||||
- Reproduzierbarkeit und Dokumentationsqualität
|
||||
- Objektivität
|
||||
Bewerten Sie ALLE Tools vergleichend für diese spezifische Aufgabe UND Phase. Wählen Sie die 2-3 besten aus.
|
||||
|
||||
Antworten Sie AUSSCHLIESSLICH mit diesem JSON-Format (kein zusätzlicher Text):
|
||||
BEWERTUNGSKRITERIEN:
|
||||
- Wie gut löst das Tool das forensische Problem im SZENARIO-Kontext?
|
||||
- Wie gut passt es zur spezifischen PHASE "${phase.name}"?
|
||||
- Wie vergleicht es sich mit den anderen verfügbaren Tools für diese Phase?
|
||||
|
||||
Antworten Sie AUSSCHLIESSLICH mit diesem JSON-Format:
|
||||
[
|
||||
{
|
||||
"toolName": "Exakter Methoden/Tool-Name",
|
||||
"priority": "high|medium|low",
|
||||
"justification": "Objektive Begründung warum diese Methode/Tool für das spezifische Szenario besser geeignet ist"
|
||||
"toolName": "Exakter Tool-Name",
|
||||
"taskRelevance": 85,
|
||||
"justification": "Vergleichende Begründung warum dieses Tool für diese Phase und Aufgabe besser/schlechter als die anderen geeignet ist",
|
||||
"limitations": ["Spezifische Einschränkung 1", "Einschränkung 2"]
|
||||
}
|
||||
]`;
|
||||
},
|
||||
|
||||
// Tool evaluation prompt
|
||||
toolEvaluation: (userQuery: string, tool: any, rank: number) => {
|
||||
return `Sie sind ein DFIR-Experte und bewerten ein forensisches Tool für eine spezifische Aufgabe.
|
||||
|
||||
PROBLEM: "${userQuery}"
|
||||
|
||||
TOOL: ${tool.name}
|
||||
BESCHREIBUNG: ${tool.description}
|
||||
PLATTFORMEN: ${tool.platforms?.join(', ') || 'N/A'}
|
||||
SKILL LEVEL: ${tool.skillLevel}
|
||||
DOMAINS: ${tool.domains?.join(', ') || 'N/A'}
|
||||
TAGS: ${tool.tags?.join(', ') || 'N/A'}
|
||||
|
||||
Bewerten Sie nach forensischen Standards und antworten Sie AUSSCHLIESSLICH mit diesem JSON-Format:
|
||||
{
|
||||
"suitability_score": "high|medium|low",
|
||||
"task_relevance": 85,
|
||||
"detailed_explanation": "Detaillierte forensische Begründung warum diese Methode/Tool das Problem löst",
|
||||
"implementation_approach": "Konkrete methodische Schritte zur korrekten Anwendung für dieses spezifische Problem",
|
||||
"pros": ["Forensischer Vorteil 1", "Validierter Vorteil 2"],
|
||||
"cons": ["Methodische Limitation 1", "Potenzielle Schwäche 2"],
|
||||
"limitations": ["Spezifische Einschränkung 1", "Mögliche Problematik 2"],
|
||||
"alternatives": "Alternative Ansätze falls diese Methode/Tool nicht optimal ist"
|
||||
}
|
||||
]
|
||||
|
||||
WICHTIG:
|
||||
- task_relevance: Numerischer Wert 0-100 wie gut das Tool für DIESE SPEZIFISCHE Aufgabe geeignet ist
|
||||
- limitations: Konkrete Einschränkungen oder Situationen wo das Tool NICHT optimal wäre
|
||||
- Berücksichtigen Sie den Skill Level vs. Anfrage-Komplexität
|
||||
- Bewerten Sie objektiv, nicht beschönigend`;
|
||||
- taskRelevance: 0-100 Score basierend auf Szenario-Eignung UND Phasen-Passung im VERGLEICH zu anderen Tools
|
||||
- Nur die 2-3 BESTEN Tools auswählen und bewerten
|
||||
- justification soll VERGLEICHEND sein ("besser als X weil...", "für diese Phase ideal weil...")`;
|
||||
},
|
||||
|
||||
toolEvaluation: (userQuery: string, tool: any, rank: number, taskRelevance: number) => {
|
||||
return `Sie sind ein DFIR-Experte. Erklären Sie DETAILLIERT die Anwendung dieses bereits bewerteten Tools.
|
||||
|
||||
PROBLEM: "${userQuery}"
|
||||
TOOL: ${tool.name} (bereits bewertet mit ${taskRelevance}% Aufgaben-Eignung)
|
||||
BESCHREIBUNG: ${tool.description}
|
||||
|
||||
Das Tool wurde bereits als Rang ${rank} für diese Aufgabe bewertet. Erklären Sie nun:
|
||||
|
||||
Antworten Sie AUSSCHLIESSLICH mit diesem JSON-Format:
|
||||
{
|
||||
"detailed_explanation": "Detaillierte Erklärung warum und wie dieses Tool für diese spezifische Aufgabe eingesetzt wird",
|
||||
"implementation_approach": "Konkrete Schritt-für-Schritt Anleitung zur korrekten Anwendung",
|
||||
"pros": ["Spezifischer Vorteil 1", "Spezifischer Vorteil 2"],
|
||||
"cons": ["Bekannte Limitation 1", "Bekannte Limitation 2"],
|
||||
"alternatives": "Alternative Ansätze oder Tools falls dieses nicht verfügbar ist"
|
||||
}
|
||||
|
||||
WICHTIG: Keine erneute Bewertung - nur detaillierte Erklärung der bereits bewerteten Eignung.`;
|
||||
},
|
||||
|
||||
// Background knowledge selection prompt
|
||||
@ -229,7 +228,7 @@ export function getPrompt(key: 'scenarioAnalysis', isWorkflow: boolean, userQuer
|
||||
export function getPrompt(key: 'investigationApproach', isWorkflow: boolean, userQuery: string): string;
|
||||
export function getPrompt(key: 'criticalConsiderations', isWorkflow: boolean, userQuery: string): string;
|
||||
export function getPrompt(key: 'phaseToolSelection', userQuery: string, phase: any, phaseTools: any[]): string;
|
||||
export function getPrompt(key: 'toolEvaluation', userQuery: string, tool: any, rank: number): string;
|
||||
export function getPrompt(key: 'toolEvaluation', userQuery: string, tool: any, rank: number, taskRelevance: number): string;
|
||||
export function getPrompt(key: 'backgroundKnowledgeSelection', userQuery: string, mode: string, selectedToolNames: string[], availableConcepts: any[]): string;
|
||||
export function getPrompt(key: 'finalRecommendations', isWorkflow: boolean, userQuery: string, selectedToolNames: string[]): string;
|
||||
export function getPrompt(promptKey: keyof typeof AI_PROMPTS, ...args: any[]): string {
|
||||
|
@ -3975,7 +3975,7 @@ tools:
|
||||
- name: KAPE
|
||||
type: software
|
||||
description: >-
|
||||
Kroll Artifact Parser and Extractor revolutioniert Windows-Forensik durch
|
||||
Kroll Artifact Parser and Extractor versucht sich an Windows-Forensik durch
|
||||
intelligente Ziel-basierte Sammlung. Statt Full-Disk-Images extrahiert
|
||||
KAPE gezielt kritische Artefakte: Registry-Hives, Event-Logs, Prefetch,
|
||||
Browser-Daten, Scheduled-Tasks in Minuten statt Stunden. Die Target-Files
|
||||
@ -3983,12 +3983,10 @@ tools:
|
||||
Besonders clever: Compound-Targets gruppieren zusammengehörige Artefakte
|
||||
(z.B. "Browser" sammelt Chrome+Firefox+Edge), die gKAPE-GUI macht es auch
|
||||
für Nicht-Techniker zugänglich. Batch-Mode verarbeitet mehrere Images
|
||||
parallel. Output direkt kompatibel zu Timeline-Tools wie Plaso. Die
|
||||
ständigen Community-Updates halten mit Windows-Entwicklungen Schritt.
|
||||
parallel. Output direkt kompatibel zu Timeline-Tools wie Plaso.
|
||||
VSS-Processing analysiert Shadow-Copies automatisch. Der
|
||||
Remote-Collection-Mode sammelt über Netzwerk. Kostenlos aber
|
||||
Enterprise-Support verfügbar. Der neue Standard für effiziente
|
||||
Windows-Forensik-Triage.
|
||||
Remote-Collection-Mode sammelt über Netzwerk. Kostenlos (mit Registrierung) aber
|
||||
Enterprise-Support verfügbar.
|
||||
skillLevel: intermediate
|
||||
url: https://www.kroll.com/kape
|
||||
icon: 🧰
|
||||
@ -4003,7 +4001,7 @@ tools:
|
||||
platforms:
|
||||
- Windows
|
||||
accessType: download
|
||||
license: Freeware
|
||||
license: Proprietary
|
||||
knowledgebase: false
|
||||
- name: Kibana
|
||||
type: software
|
||||
|
@ -72,7 +72,6 @@ interface ConfidenceMetrics {
|
||||
semanticRelevance: number; // How well tool description matches query (from embeddings)
|
||||
taskSuitability: number; // AI-determined fitness for this specific task
|
||||
methodologicalConsistency: number; // How well different analysis steps agree
|
||||
toolReliability: number; // Indicators of tool quality and maintenance
|
||||
uncertaintyFactors: string[]; // Specific reasons why this might not work
|
||||
strengthIndicators: string[]; // Specific reasons why this is a good choice
|
||||
}
|
||||
@ -146,17 +145,17 @@ class ImprovedMicroTaskAIPipeline {
|
||||
|
||||
// Updated confidence weights - more focused on AI evaluation
|
||||
this.confidenceConfig = {
|
||||
semanticWeight: parseFloat(process.env.CONFIDENCE_SEMANTIC_WEIGHT || '0.25'), // Embeddings similarity
|
||||
suitabilityWeight: parseFloat(process.env.CONFIDENCE_SUITABILITY_WEIGHT || '0.4'), // AI task fit evaluation
|
||||
consistencyWeight: parseFloat(process.env.CONFIDENCE_CONSISTENCY_WEIGHT || '0.2'), // Cross-validation agreement
|
||||
reliabilityWeight: parseFloat(process.env.CONFIDENCE_RELIABILITY_WEIGHT || '0.15'), // Tool quality indicators
|
||||
semanticWeight: parseFloat(process.env.CONFIDENCE_SEMANTIC_WEIGHT || '0.3'), // Embeddings similarity
|
||||
suitabilityWeight: parseFloat(process.env.CONFIDENCE_SUITABILITY_WEIGHT || '0.7'), // AI task fit evaluation
|
||||
consistencyWeight: 0,
|
||||
reliabilityWeight: 0,
|
||||
minimumThreshold: parseInt(process.env.CONFIDENCE_MINIMUM_THRESHOLD || '40', 10),
|
||||
mediumThreshold: parseInt(process.env.CONFIDENCE_MEDIUM_THRESHOLD || '60', 10),
|
||||
highThreshold: parseInt(process.env.CONFIDENCE_HIGH_THRESHOLD || '80', 10)
|
||||
};
|
||||
|
||||
console.log('[AI PIPELINE] Enhanced confidence scoring enabled:', {
|
||||
weights: `Semantic:${this.confidenceConfig.semanticWeight} Suitability:${this.confidenceConfig.suitabilityWeight} Consistency:${this.confidenceConfig.consistencyWeight} Reliability:${this.confidenceConfig.reliabilityWeight}`,
|
||||
console.log('[AI PIPELINE] Simplified confidence scoring enabled:', {
|
||||
weights: `Semantic:${this.confidenceConfig.semanticWeight} Suitability:${this.confidenceConfig.suitabilityWeight}`,
|
||||
thresholds: `${this.confidenceConfig.minimumThreshold}/${this.confidenceConfig.mediumThreshold}/${this.confidenceConfig.highThreshold}`
|
||||
});
|
||||
}
|
||||
@ -709,197 +708,107 @@ ${JSON.stringify(conceptsToSend, null, 2)}`;
|
||||
): ConfidenceMetrics {
|
||||
|
||||
// 1. Semantic Relevance: Real embeddings similarity score
|
||||
const semanticRelevance = context.embeddingsSimilarities.has(tool.name) ?
|
||||
Math.round(context.embeddingsSimilarities.get(tool.name)! * 100) : 50;
|
||||
const rawSemanticRelevance = context.embeddingsSimilarities.has(tool.name) ?
|
||||
context.embeddingsSimilarities.get(tool.name)! * 100 : 50;
|
||||
|
||||
// 2. Task Suitability: AI-determined fitness for specific task
|
||||
const taskSuitability = Math.round(taskRelevance);
|
||||
// 2. Task Suitability: Enhanced with phase-awareness for workflow mode
|
||||
let enhancedTaskSuitability = taskRelevance;
|
||||
|
||||
// 3. Methodological Consistency: Cross-validation between micro-tasks
|
||||
const methodologicalConsistency = this.calculateCrossValidationScore(tool.name, context);
|
||||
if (context.mode === 'workflow') {
|
||||
// In workflow mode, boost score if tool is well-matched to its assigned phase
|
||||
const toolSelection = context.selectedTools?.find(st => st.tool.name === tool.name);
|
||||
if (toolSelection && tool.phases && tool.phases.includes(toolSelection.phase)) {
|
||||
// Boost for phase alignment (but cap at 100)
|
||||
const phaseBonus = Math.min(15, 100 - taskRelevance);
|
||||
enhancedTaskSuitability = Math.min(100, taskRelevance + phaseBonus);
|
||||
|
||||
// 4. Tool Reliability: Quality indicators
|
||||
const toolReliability = this.calculateToolReliability(tool);
|
||||
console.log(`[CONFIDENCE] Phase bonus for ${tool.name}: ${taskRelevance} -> ${enhancedTaskSuitability} (phase: ${toolSelection.phase})`);
|
||||
}
|
||||
}
|
||||
|
||||
// Debug logging
|
||||
console.log(`[CONFIDENCE DEBUG] ${tool.name}:`, {
|
||||
semantic: semanticRelevance,
|
||||
taskSuitability: taskSuitability,
|
||||
consistency: methodologicalConsistency,
|
||||
reliability: toolReliability,
|
||||
hasEmbeddingsSimilarity: context.embeddingsSimilarities.has(tool.name),
|
||||
rawTaskRelevance: taskRelevance
|
||||
});
|
||||
|
||||
// Calculate weighted overall score
|
||||
// Simple weighted combination - no artificial scaling
|
||||
const overall = (
|
||||
semanticRelevance * this.confidenceConfig.semanticWeight +
|
||||
taskSuitability * this.confidenceConfig.suitabilityWeight +
|
||||
methodologicalConsistency * this.confidenceConfig.consistencyWeight +
|
||||
toolReliability * this.confidenceConfig.reliabilityWeight
|
||||
rawSemanticRelevance * this.confidenceConfig.semanticWeight +
|
||||
enhancedTaskSuitability * this.confidenceConfig.suitabilityWeight
|
||||
);
|
||||
|
||||
const uncertaintyFactors = this.identifySpecificUncertaintyFactors(tool, context, limitations, overall);
|
||||
const strengthIndicators = this.identifySpecificStrengthIndicators(tool, context, overall);
|
||||
|
||||
console.log(`[CONFIDENCE DEBUG] ${tool.name}:`, {
|
||||
rawSemantic: Math.round(rawSemanticRelevance),
|
||||
rawTaskSuitability: taskRelevance,
|
||||
enhancedTaskSuitability: Math.round(enhancedTaskSuitability),
|
||||
overall: Math.round(overall),
|
||||
mode: context.mode
|
||||
});
|
||||
|
||||
return {
|
||||
overall: Math.round(overall),
|
||||
semanticRelevance: Math.round(semanticRelevance),
|
||||
taskSuitability: Math.round(taskSuitability),
|
||||
methodologicalConsistency: Math.round(methodologicalConsistency),
|
||||
toolReliability: Math.round(toolReliability),
|
||||
semanticRelevance: Math.round(rawSemanticRelevance),
|
||||
taskSuitability: Math.round(enhancedTaskSuitability),
|
||||
methodologicalConsistency: 0,
|
||||
uncertaintyFactors,
|
||||
strengthIndicators
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Scores how consistently a tool was referenced across the audit trail.
 *
 * Only audit entries whose phase is 'micro-task' or 'selection' are inspected.
 * An entry counts as a mention when its output object lists the tool in
 * `selectedTools`, lists it in `finalToolNames`, or carries it as `toolName`.
 *
 * @param toolName - Name of the tool to look up in the audit entries.
 * @param context - Analysis context whose `auditTrail` is scanned.
 * @returns 60 when no entry mentions the tool; for a single mention, that
 *          entry's confidence clamped to the range 40..85; otherwise a
 *          rounded blend (70% share of mentions with confidence >= 60,
 *          30% mean confidence) clamped to the range 30..95.
 */
private calculateCrossValidationScore(toolName: string, context: AnalysisContext): number {
  let mentionCount = 0;
  let positiveCount = 0;
  let confidenceTotal = 0;

  for (const entry of context.auditTrail) {
    // Only micro-task and selection entries take part in cross-validation.
    if (entry.phase !== 'micro-task' && entry.phase !== 'selection') {
      continue;
    }

    const output = entry.output;
    if (!output || typeof output !== 'object') {
      continue;
    }

    // The tool may be referenced in any of three output shapes.
    const inSelectedTools =
      Array.isArray(output.selectedTools) && output.selectedTools.includes(toolName);
    const inFinalToolNames =
      Array.isArray(output.finalToolNames) && output.finalToolNames.includes(toolName);
    const isEvaluatedTool = output.toolName === toolName;

    if (!(inSelectedTools || inFinalToolNames || isEvaluatedTool)) {
      continue;
    }

    mentionCount++;
    confidenceTotal += entry.confidence;

    // A mention is treated as positive from a confidence of 60 upwards.
    if (entry.confidence >= 60) {
      positiveCount++;
    }
  }

  console.log(`[AI PIPELINE] Cross-validation for ${toolName}: ${mentionCount} mentions, ${positiveCount} positive, avg confidence: ${mentionCount > 0 ? Math.round(confidenceTotal / mentionCount) : 0}`);

  switch (mentionCount) {
    case 0:
      // No cross-validation data available: fall back to a fixed default.
      return 60;
    case 1:
      // Single mention: take its confidence directly, but keep it within 40..85.
      return Math.min(85, Math.max(40, confidenceTotal));
    default: {
      // Several mentions: combine the agreement ratio with the mean confidence.
      const positiveShare = positiveCount / mentionCount;
      const meanConfidence = confidenceTotal / mentionCount;
      const blended = (positiveShare * 0.7 + (meanConfidence / 100) * 0.3) * 100;
      return Math.round(Math.min(95, Math.max(30, blended)));
    }
  }
}
|
||||
|
||||
// NEW: Calculate tool reliability based on objective indicators
|
||||
/**
 * Derives a reliability score for a tool from static attributes of its record.
 *
 * Starts from a base of 50 and adds or subtracts fixed amounts per indicator;
 * the result is capped at 100.
 *
 * @param tool - Tool record; reads `knowledgebase`, `license`, `skillLevel`
 *               and `platforms`, and is passed to `isToolHosted` (defined
 *               outside this block).
 * @returns The reliability score, never greater than 100.
 */
private calculateToolReliability(tool: any): number {
  const baseScore = 50;

  // Documentation availability.
  const documentationBonus = tool.knowledgebase === true ? 25 : 0;

  // Hosted tools are treated as actively maintained.
  const hostedBonus = isToolHosted(tool) ? 20 : 0;

  // Any license other than 'Proprietary' is taken as a sign of community support.
  const licenseBonus = tool.license && tool.license !== 'Proprietary' ? 10 : 0;

  // Mid-range skill levels are rewarded; expert-level tools get a small penalty.
  let skillAdjustment = 0;
  switch (tool.skillLevel) {
    case 'intermediate':
    case 'advanced':
      skillAdjustment = 10;
      break;
    case 'expert':
      skillAdjustment = -5;
      break;
  }

  // Support for more than one platform counts as extra versatility.
  const platformBonus = tool.platforms && tool.platforms.length > 1 ? 5 : 0;

  const total =
    baseScore + documentationBonus + hostedBonus + licenseBonus + skillAdjustment + platformBonus;

  return Math.min(100, total);
}
|
||||
|
||||
// NEW: Identify specific uncertainty factors based on analysis
|
||||
private identifySpecificUncertaintyFactors(tool: any, context: AnalysisContext, limitations: string[], confidence: number): string[] {
|
||||
const factors: string[] = [];
|
||||
|
||||
// Add AI-identified limitations
|
||||
// Add AI-identified limitations first (most specific)
|
||||
if (limitations && limitations.length > 0) {
|
||||
factors.push(...limitations.slice(0, 3)); // Limit to top 3
|
||||
factors.push(...limitations.slice(0, 2)); // Limit to top 2 to leave room for others
|
||||
}
|
||||
|
||||
// Low semantic similarity
|
||||
const similarity = context.embeddingsSimilarities.get(tool.name) || 0.5;
|
||||
if (similarity < 0.4) {
|
||||
if (similarity < 0.7) {
|
||||
factors.push('Geringe semantische Ähnlichkeit zur Anfrage - Tool-Beschreibung passt möglicherweise nicht optimal');
|
||||
}
|
||||
|
||||
// Skill level mismatch
|
||||
if (tool.skillLevel === 'expert' && /schnell|rapid|triage|urgent/i.test(context.userQuery)) {
|
||||
factors.push('Experten-Tool für Eilszenario - möglicherweise zu komplex für schnelle Antworten');
|
||||
// Skill level vs scenario complexity mismatch
|
||||
if (tool.skillLevel === 'expert' && /schnell|rapid|triage|urgent|sofort/i.test(context.userQuery)) {
|
||||
factors.push('Experten-Tool für zeitkritisches Szenario - Setup und Einarbeitung könnten zu lange dauern');
|
||||
}
|
||||
|
||||
if (tool.skillLevel === 'novice' && /komplex|erweitert|tiefgehend|advanced/i.test(context.userQuery)) {
|
||||
factors.push('Einsteiger-Tool für komplexes Szenario - könnte funktionale Einschränkungen haben');
|
||||
if (tool.skillLevel === 'novice' && /komplex|erweitert|tiefgehend|advanced|forensisch/i.test(context.userQuery)) {
|
||||
factors.push('Einsteiger-Tool für komplexe Analyse - könnte funktionale Limitierungen haben');
|
||||
}
|
||||
|
||||
// Access limitations
|
||||
// Platform availability concerns
|
||||
if (tool.platforms && tool.platforms.length === 1 && tool.platforms[0] === 'Windows' && /linux|unix|server/i.test(context.userQuery)) {
|
||||
factors.push('Nur Windows-Tool bei möglicher Linux/Server-Umgebung - Plattform-Inkompatibilität');
|
||||
}
|
||||
|
||||
// Access and deployment concerns
|
||||
if (tool.type === 'software' && !isToolHosted(tool) && tool.accessType === 'download') {
|
||||
factors.push('Installation erforderlich - nicht sofort verfügbar ohne Setup');
|
||||
factors.push('Installation und Setup erforderlich');
|
||||
}
|
||||
|
||||
// Cross-validation disagreement
|
||||
const crossValidation = this.calculateCrossValidationScore(tool.name, context);
|
||||
if (crossValidation < 50) {
|
||||
factors.push('Uneinheitliche Bewertung in verschiedenen Analyseschritten - Empfehlung nicht eindeutig');
|
||||
// License restrictions
|
||||
if (tool.license === 'Proprietary') {
|
||||
factors.push('Kommerzielle Software - Lizenzkosten und rechtliche Beschränkungen zu beachten');
|
||||
}
|
||||
|
||||
return factors.slice(0, 4); // Limit to 4 most important factors
|
||||
// Low overall confidence warning
|
||||
if (confidence < 60) {
|
||||
factors.push('Moderate Gesamtbewertung - alternative Ansätze sollten ebenfalls betrachtet werden');
|
||||
}
|
||||
|
||||
return factors.slice(0, 4); // Limit to 4 most relevant factors
|
||||
}
|
||||
|
||||
// NEW: Identify specific strength indicators
|
||||
private identifySpecificStrengthIndicators(tool: any, context: AnalysisContext, confidence: number): string[] {
|
||||
const indicators: string[] = [];
|
||||
|
||||
// High confidence overall
|
||||
if (confidence >= this.confidenceConfig.highThreshold) {
|
||||
indicators.push('Hohe Gesamtbewertung durch mehrfache Validierung');
|
||||
}
|
||||
|
||||
// High semantic similarity
|
||||
const similarity = context.embeddingsSimilarities.get(tool.name) || 0.5;
|
||||
if (similarity >= 0.7) {
|
||||
indicators.push('Sehr gute semantische Übereinstimmung mit Ihrer Anfrage');
|
||||
}
|
||||
|
||||
// Strong cross-validation
|
||||
const crossValidation = this.calculateCrossValidationScore(tool.name, context);
|
||||
if (crossValidation >= 80) {
|
||||
indicators.push('Konsistente Empfehlung über verschiedene Analyseschritte hinweg');
|
||||
}
|
||||
|
||||
// Quality indicators
|
||||
if (tool.knowledgebase === true) {
|
||||
indicators.push('Umfassende Dokumentation und Wissensbasis verfügbar');
|
||||
@ -985,7 +894,7 @@ ${JSON.stringify(conceptsToSend, null, 2)}`;
|
||||
|
||||
const prompt = getPrompt('phaseToolSelection', context.userQuery, phase, phaseTools);
|
||||
|
||||
const result = await this.callMicroTaskAI(prompt, context, 800);
|
||||
const result = await this.callMicroTaskAI(prompt, context, 1000);
|
||||
|
||||
if (result.success) {
|
||||
const selections = this.safeParseJSON(result.content, []);
|
||||
@ -998,16 +907,30 @@ ${JSON.stringify(conceptsToSend, null, 2)}`;
|
||||
validSelections.forEach((sel: any) => {
|
||||
const tool = phaseTools.find((t: any) => t.name === sel.toolName);
|
||||
if (tool) {
|
||||
this.addToolToSelection(context, tool, phase.id, sel.priority, sel.justification);
|
||||
// Ensure taskRelevance is a number
|
||||
const taskRelevance = typeof sel.taskRelevance === 'number' ?
|
||||
sel.taskRelevance : parseInt(String(sel.taskRelevance)) || 70;
|
||||
|
||||
// Derive priority automatically from score
|
||||
const priority = this.derivePriorityFromScore(taskRelevance);
|
||||
|
||||
this.addToolToSelection(context, tool, phase.id, priority, sel.justification, taskRelevance, sel.limitations);
|
||||
}
|
||||
});
|
||||
|
||||
this.addAuditEntry(context, 'micro-task', 'phase-tool-selection',
|
||||
{ phase: phase.id, availableTools: phaseTools.length },
|
||||
{ validSelections: validSelections.length, selectedTools: validSelections.map(s => s.toolName) },
|
||||
{
|
||||
validSelections: validSelections.length,
|
||||
selectedTools: validSelections.map(s => ({
|
||||
name: s.toolName,
|
||||
taskRelevance: s.taskRelevance,
|
||||
derivedPriority: this.derivePriorityFromScore(s.taskRelevance)
|
||||
}))
|
||||
},
|
||||
validSelections.length > 0 ? 75 : 30,
|
||||
Date.now() - result.processingTimeMs,
|
||||
{ phaseName: phase.name }
|
||||
{ phaseName: phase.name, comparativeEvaluation: true, priorityDerived: true }
|
||||
);
|
||||
}
|
||||
}
|
||||
@ -1016,56 +939,46 @@ ${JSON.stringify(conceptsToSend, null, 2)}`;
|
||||
}
|
||||
|
||||
private async evaluateSpecificTool(context: AnalysisContext, tool: any, rank: number): Promise<MicroTaskResult> {
|
||||
const prompt = getPrompt('toolEvaluation', context.userQuery, tool, rank);
|
||||
// Get existing task relevance from previous phase selection
|
||||
const existingSelection = context.selectedTools?.find(st => st.tool.name === tool.name);
|
||||
const taskRelevance = existingSelection?.taskRelevance || 70;
|
||||
const priority = this.derivePriorityFromScore(taskRelevance);
|
||||
|
||||
const result = await this.callMicroTaskAI(prompt, context, 1200);
|
||||
const prompt = getPrompt('toolEvaluation', context.userQuery, tool, rank, taskRelevance);
|
||||
|
||||
const result = await this.callMicroTaskAI(prompt, context, 1000);
|
||||
|
||||
if (result.success) {
|
||||
const evaluation = this.safeParseJSON(result.content, {
|
||||
suitability_score: 'medium',
|
||||
task_relevance: '',
|
||||
detailed_explanation: 'Evaluation failed',
|
||||
implementation_approach: '',
|
||||
pros: [],
|
||||
cons: [],
|
||||
limitations: [],
|
||||
alternatives: ''
|
||||
});
|
||||
|
||||
// Debug logging to see what we're getting
|
||||
console.log(`[AI PIPELINE] Tool ${tool.name} evaluation:`, {
|
||||
taskRelevance: evaluation.task_relevance,
|
||||
suitabilityScore: evaluation.suitability_score,
|
||||
limitationsCount: evaluation.limitations?.length || 0
|
||||
});
|
||||
|
||||
// Ensure task_relevance is a number
|
||||
const taskRelevance = typeof evaluation.task_relevance === 'number' ?
|
||||
evaluation.task_relevance :
|
||||
parseInt(String(evaluation.task_relevance)) || 70;
|
||||
|
||||
// Store enhanced evaluation data
|
||||
// Store evaluation without re-scoring
|
||||
this.addToolToSelection(context, {
|
||||
...tool,
|
||||
evaluation: {
|
||||
...evaluation,
|
||||
task_relevance: taskRelevance, // Ensure it's stored as number
|
||||
rank
|
||||
rank,
|
||||
task_relevance: taskRelevance
|
||||
}
|
||||
}, 'evaluation', evaluation.suitability_score, evaluation.detailed_explanation,
|
||||
taskRelevance, evaluation.limitations);
|
||||
}, 'evaluation', priority, evaluation.detailed_explanation,
|
||||
taskRelevance, existingSelection?.limitations);
|
||||
|
||||
this.addAuditEntry(context, 'micro-task', 'tool-evaluation',
|
||||
{ toolName: tool.name, rank },
|
||||
{ toolName: tool.name, rank, existingTaskRelevance: taskRelevance, derivedPriority: priority },
|
||||
{
|
||||
suitabilityScore: evaluation.suitability_score,
|
||||
taskRelevance: taskRelevance, // Use the cleaned number
|
||||
hasExplanation: !!evaluation.detailed_explanation,
|
||||
limitationsIdentified: evaluation.limitations?.length || 0
|
||||
hasImplementationApproach: !!evaluation.implementation_approach,
|
||||
prosCount: evaluation.pros?.length || 0,
|
||||
consCount: evaluation.cons?.length || 0
|
||||
},
|
||||
evaluation.suitability_score === 'high' ? 85 : evaluation.suitability_score === 'medium' ? 70 : 50,
|
||||
70,
|
||||
Date.now() - result.processingTimeMs,
|
||||
{ toolType: tool.type, taskRelevanceScore: taskRelevance }
|
||||
{ toolType: tool.type, explanationOnly: true, priorityDerived: true }
|
||||
);
|
||||
}
|
||||
|
||||
@ -1173,6 +1086,12 @@ ${JSON.stringify(conceptsToSend, null, 2)}`;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Maps a numeric task-relevance score onto a priority label.
 *
 * @param taskRelevance - Relevance score to classify.
 * @returns 'high' for scores of 80 or more, 'medium' for scores of 60 or
 *          more, and 'low' for everything else.
 */
private derivePriorityFromScore(taskRelevance: number): string {
  return taskRelevance >= 80
    ? 'high'
    : taskRelevance >= 60
      ? 'medium'
      : 'low';
}
|
||||
|
||||
async processQuery(userQuery: string, mode: string): Promise<AnalysisResult> {
|
||||
const startTime = Date.now();
|
||||
let completeTasks = 0;
|
||||
@ -1323,8 +1242,7 @@ ${JSON.stringify(conceptsToSend, null, 2)}`;
|
||||
components: {
|
||||
semantic: confidence.semanticRelevance,
|
||||
suitability: confidence.taskSuitability,
|
||||
consistency: confidence.methodologicalConsistency,
|
||||
reliability: confidence.toolReliability
|
||||
consistency: confidence.methodologicalConsistency
|
||||
}
|
||||
},
|
||||
confidence.overall,
|
||||
|
Loading…
x
Reference in New Issue
Block a user