confidence updates, content adjustment

This commit is contained in:
overcuriousity
2025-08-05 21:35:38 +02:00
parent 27e64f05ca
commit fe1be323bb
5 changed files with 160 additions and 270 deletions

View File

@@ -72,7 +72,6 @@ interface ConfidenceMetrics {
semanticRelevance: number; // How well tool description matches query (from embeddings)
taskSuitability: number; // AI-determined fitness for this specific task
methodologicalConsistency: number; // How well different analysis steps agree
toolReliability: number; // Indicators of tool quality and maintenance
uncertaintyFactors: string[]; // Specific reasons why this might not work
strengthIndicators: string[]; // Specific reasons why this is a good choice
}
@@ -146,17 +145,17 @@ class ImprovedMicroTaskAIPipeline {
// Updated confidence weights - more focused on AI evaluation
this.confidenceConfig = {
semanticWeight: parseFloat(process.env.CONFIDENCE_SEMANTIC_WEIGHT || '0.25'), // Embeddings similarity
suitabilityWeight: parseFloat(process.env.CONFIDENCE_SUITABILITY_WEIGHT || '0.4'), // AI task fit evaluation
consistencyWeight: parseFloat(process.env.CONFIDENCE_CONSISTENCY_WEIGHT || '0.2'), // Cross-validation agreement
reliabilityWeight: parseFloat(process.env.CONFIDENCE_RELIABILITY_WEIGHT || '0.15'), // Tool quality indicators
semanticWeight: parseFloat(process.env.CONFIDENCE_SEMANTIC_WEIGHT || '0.3'), // Embeddings similarity
suitabilityWeight: parseFloat(process.env.CONFIDENCE_SUITABILITY_WEIGHT || '0.7'), // AI task fit evaluation
consistencyWeight: 0,
reliabilityWeight: 0,
minimumThreshold: parseInt(process.env.CONFIDENCE_MINIMUM_THRESHOLD || '40', 10),
mediumThreshold: parseInt(process.env.CONFIDENCE_MEDIUM_THRESHOLD || '60', 10),
highThreshold: parseInt(process.env.CONFIDENCE_HIGH_THRESHOLD || '80', 10)
};
console.log('[AI PIPELINE] Enhanced confidence scoring enabled:', {
weights: `Semantic:${this.confidenceConfig.semanticWeight} Suitability:${this.confidenceConfig.suitabilityWeight} Consistency:${this.confidenceConfig.consistencyWeight} Reliability:${this.confidenceConfig.reliabilityWeight}`,
console.log('[AI PIPELINE] Simplified confidence scoring enabled:', {
weights: `Semantic:${this.confidenceConfig.semanticWeight} Suitability:${this.confidenceConfig.suitabilityWeight}`,
thresholds: `${this.confidenceConfig.minimumThreshold}/${this.confidenceConfig.mediumThreshold}/${this.confidenceConfig.highThreshold}`
});
}
@@ -709,197 +708,107 @@ ${JSON.stringify(conceptsToSend, null, 2)}`;
): ConfidenceMetrics {
// 1. Semantic Relevance: Real embeddings similarity score
const semanticRelevance = context.embeddingsSimilarities.has(tool.name) ?
Math.round(context.embeddingsSimilarities.get(tool.name)! * 100) : 50;
const rawSemanticRelevance = context.embeddingsSimilarities.has(tool.name) ?
context.embeddingsSimilarities.get(tool.name)! * 100 : 50;
// 2. Task Suitability: AI-determined fitness for specific task
const taskSuitability = Math.round(taskRelevance);
// 2. Task Suitability: Enhanced with phase-awareness for workflow mode
let enhancedTaskSuitability = taskRelevance;
// 3. Methodological Consistency: Cross-validation between micro-tasks
const methodologicalConsistency = this.calculateCrossValidationScore(tool.name, context);
if (context.mode === 'workflow') {
// In workflow mode, boost score if tool is well-matched to its assigned phase
const toolSelection = context.selectedTools?.find(st => st.tool.name === tool.name);
if (toolSelection && tool.phases && tool.phases.includes(toolSelection.phase)) {
// Boost for phase alignment (but cap at 100)
const phaseBonus = Math.min(15, 100 - taskRelevance);
enhancedTaskSuitability = Math.min(100, taskRelevance + phaseBonus);
console.log(`[CONFIDENCE] Phase bonus for ${tool.name}: ${taskRelevance} -> ${enhancedTaskSuitability} (phase: ${toolSelection.phase})`);
}
}
// 4. Tool Reliability: Quality indicators
const toolReliability = this.calculateToolReliability(tool);
// Debug logging
console.log(`[CONFIDENCE DEBUG] ${tool.name}:`, {
semantic: semanticRelevance,
taskSuitability: taskSuitability,
consistency: methodologicalConsistency,
reliability: toolReliability,
hasEmbeddingsSimilarity: context.embeddingsSimilarities.has(tool.name),
rawTaskRelevance: taskRelevance
});
// Calculate weighted overall score
// Simple weighted combination - no artificial scaling
const overall = (
semanticRelevance * this.confidenceConfig.semanticWeight +
taskSuitability * this.confidenceConfig.suitabilityWeight +
methodologicalConsistency * this.confidenceConfig.consistencyWeight +
toolReliability * this.confidenceConfig.reliabilityWeight
rawSemanticRelevance * this.confidenceConfig.semanticWeight +
enhancedTaskSuitability * this.confidenceConfig.suitabilityWeight
);
const uncertaintyFactors = this.identifySpecificUncertaintyFactors(tool, context, limitations, overall);
const strengthIndicators = this.identifySpecificStrengthIndicators(tool, context, overall);
console.log(`[CONFIDENCE DEBUG] ${tool.name}:`, {
rawSemantic: Math.round(rawSemanticRelevance),
rawTaskSuitability: taskRelevance,
enhancedTaskSuitability: Math.round(enhancedTaskSuitability),
overall: Math.round(overall),
mode: context.mode
});
return {
overall: Math.round(overall),
semanticRelevance: Math.round(semanticRelevance),
taskSuitability: Math.round(taskSuitability),
methodologicalConsistency: Math.round(methodologicalConsistency),
toolReliability: Math.round(toolReliability),
semanticRelevance: Math.round(rawSemanticRelevance),
taskSuitability: Math.round(enhancedTaskSuitability),
methodologicalConsistency: 0,
uncertaintyFactors,
strengthIndicators
};
}
/**
 * Derives a 0-100 agreement score for a tool from the audit trail.
 *
 * Only audit entries whose phase is 'micro-task' or 'selection' are
 * considered. An entry counts as a mention when its output object lists the
 * tool in `selectedTools` or `finalToolNames`, or carries it as `toolName`.
 * A mention is "positive" when the entry's confidence is at least 60.
 *
 * @param toolName - Name of the tool to look up in the audit outputs.
 * @param context - Analysis context whose `auditTrail` is scanned.
 * @returns 60 when the tool is never mentioned; the single entry's confidence
 *   clamped to 40..85 for exactly one mention; otherwise a rounded blend of
 *   70% agreement ratio and 30% average confidence, clamped to 30..95.
 */
private calculateCrossValidationScore(toolName: string, context: AnalysisContext): number {
  const candidateEntries = context.auditTrail.filter(
    entry => entry.phase === 'micro-task' || entry.phase === 'selection'
  );

  let mentionCount = 0;
  let positiveCount = 0;
  let confidenceTotal = 0;

  for (const entry of candidateEntries) {
    const output = entry.output;

    // Only object-shaped outputs can reference a tool.
    if (!output || typeof output !== 'object') {
      continue;
    }

    // The tool may appear in either name list or as the evaluated tool itself.
    const referenced =
      (Array.isArray(output.selectedTools) && output.selectedTools.includes(toolName)) ||
      (Array.isArray(output.finalToolNames) && output.finalToolNames.includes(toolName)) ||
      output.toolName === toolName;

    if (!referenced) {
      continue;
    }

    mentionCount++;
    confidenceTotal += entry.confidence;
    if (entry.confidence >= 60) {
      positiveCount++;
    }
  }

  console.log(`[AI PIPELINE] Cross-validation for ${toolName}: ${mentionCount} mentions, ${positiveCount} positive, avg confidence: ${mentionCount > 0 ? Math.round(confidenceTotal / mentionCount) : 0}`);

  // No cross-validation data: fall back to a neutral default.
  if (mentionCount === 0) {
    return 60;
  }

  // A single data point: use its confidence directly, kept within 40..85.
  if (mentionCount === 1) {
    return Math.min(85, Math.max(40, confidenceTotal));
  }

  // Several data points: weight agreement more heavily than raw confidence.
  const agreementShare = positiveCount / mentionCount;
  const meanConfidence = confidenceTotal / mentionCount;
  const blendedScore = (agreementShare * 0.7 + (meanConfidence / 100) * 0.3) * 100;

  return Math.round(Math.min(95, Math.max(30, blendedScore)));
}
/**
 * Scores a tool's reliability from static attributes of its record.
 *
 * Starts from a base of 50 and adds or subtracts fixed amounts:
 * +25 when `knowledgebase` is exactly `true`, +20 when `isToolHosted(tool)`
 * is truthy, +10 for a present license other than 'Proprietary', +10 for
 * skill level 'intermediate' or 'advanced' (-5 for 'expert'), and +5 when
 * `platforms` has more than one entry.
 *
 * @param tool - Tool record to inspect.
 * @returns The accumulated score, capped at 100.
 */
private calculateToolReliability(tool: any): number {
  const baseScore = 50;

  const documentationBonus = tool.knowledgebase === true ? 25 : 0;
  const hostedBonus = isToolHosted(tool) ? 20 : 0;
  const licenseBonus = tool.license && tool.license !== 'Proprietary' ? 10 : 0;

  // Mid-range skill levels are rewarded; expert-only tools are slightly penalised.
  let skillAdjustment = 0;
  switch (tool.skillLevel) {
    case 'intermediate':
    case 'advanced':
      skillAdjustment = 10;
      break;
    case 'expert':
      skillAdjustment = -5;
      break;
    default:
      break;
  }

  const platformBonus = tool.platforms && tool.platforms.length > 1 ? 5 : 0;

  const total =
    baseScore + documentationBonus + hostedBonus + licenseBonus + skillAdjustment + platformBonus;

  return Math.min(100, total);
}
// NEW: Identify specific uncertainty factors based on analysis
private identifySpecificUncertaintyFactors(tool: any, context: AnalysisContext, limitations: string[], confidence: number): string[] {
const factors: string[] = [];
// Add AI-identified limitations
// Add AI-identified limitations first (most specific)
if (limitations && limitations.length > 0) {
factors.push(...limitations.slice(0, 3)); // Limit to top 3
factors.push(...limitations.slice(0, 2)); // Limit to top 2 to leave room for others
}
// Low semantic similarity
const similarity = context.embeddingsSimilarities.get(tool.name) || 0.5;
if (similarity < 0.4) {
if (similarity < 0.7) {
factors.push('Geringe semantische Ähnlichkeit zur Anfrage - Tool-Beschreibung passt möglicherweise nicht optimal');
}
// Skill level mismatch
if (tool.skillLevel === 'expert' && /schnell|rapid|triage|urgent/i.test(context.userQuery)) {
factors.push('Experten-Tool für Eilszenario - möglicherweise zu komplex für schnelle Antworten');
// Skill level vs scenario complexity mismatch
if (tool.skillLevel === 'expert' && /schnell|rapid|triage|urgent|sofort/i.test(context.userQuery)) {
factors.push('Experten-Tool für zeitkritisches Szenario - Setup und Einarbeitung könnten zu lange dauern');
}
if (tool.skillLevel === 'novice' && /komplex|erweitert|tiefgehend|advanced/i.test(context.userQuery)) {
factors.push('Einsteiger-Tool für komplexes Szenario - könnte funktionale Einschränkungen haben');
if (tool.skillLevel === 'novice' && /komplex|erweitert|tiefgehend|advanced|forensisch/i.test(context.userQuery)) {
factors.push('Einsteiger-Tool für komplexe Analyse - könnte funktionale Limitierungen haben');
}
// Access limitations
// Platform availability concerns
if (tool.platforms && tool.platforms.length === 1 && tool.platforms[0] === 'Windows' && /linux|unix|server/i.test(context.userQuery)) {
factors.push('Nur Windows-Tool bei möglicher Linux/Server-Umgebung - Plattform-Inkompatibilität');
}
// Access and deployment concerns
if (tool.type === 'software' && !isToolHosted(tool) && tool.accessType === 'download') {
factors.push('Installation erforderlich - nicht sofort verfügbar ohne Setup');
factors.push('Installation und Setup erforderlich');
}
// Cross-validation disagreement
const crossValidation = this.calculateCrossValidationScore(tool.name, context);
if (crossValidation < 50) {
factors.push('Uneinheitliche Bewertung in verschiedenen Analyseschritten - Empfehlung nicht eindeutig');
// License restrictions
if (tool.license === 'Proprietary') {
factors.push('Kommerzielle Software - Lizenzkosten und rechtliche Beschränkungen zu beachten');
}
return factors.slice(0, 4); // Limit to 4 most important factors
// Low overall confidence warning
if (confidence < 60) {
factors.push('Moderate Gesamtbewertung - alternative Ansätze sollten ebenfalls betrachtet werden');
}
return factors.slice(0, 4); // Limit to 4 most relevant factors
}
// NEW: Identify specific strength indicators
private identifySpecificStrengthIndicators(tool: any, context: AnalysisContext, confidence: number): string[] {
const indicators: string[] = [];
// High confidence overall
if (confidence >= this.confidenceConfig.highThreshold) {
indicators.push('Hohe Gesamtbewertung durch mehrfache Validierung');
}
// High semantic similarity
const similarity = context.embeddingsSimilarities.get(tool.name) || 0.5;
if (similarity >= 0.7) {
indicators.push('Sehr gute semantische Übereinstimmung mit Ihrer Anfrage');
}
// Strong cross-validation
const crossValidation = this.calculateCrossValidationScore(tool.name, context);
if (crossValidation >= 80) {
indicators.push('Konsistente Empfehlung über verschiedene Analyseschritte hinweg');
}
// Quality indicators
if (tool.knowledgebase === true) {
indicators.push('Umfassende Dokumentation und Wissensbasis verfügbar');
@@ -985,7 +894,7 @@ ${JSON.stringify(conceptsToSend, null, 2)}`;
const prompt = getPrompt('phaseToolSelection', context.userQuery, phase, phaseTools);
const result = await this.callMicroTaskAI(prompt, context, 800);
const result = await this.callMicroTaskAI(prompt, context, 1000);
if (result.success) {
const selections = this.safeParseJSON(result.content, []);
@@ -998,16 +907,30 @@ ${JSON.stringify(conceptsToSend, null, 2)}`;
validSelections.forEach((sel: any) => {
const tool = phaseTools.find((t: any) => t.name === sel.toolName);
if (tool) {
this.addToolToSelection(context, tool, phase.id, sel.priority, sel.justification);
// Ensure taskRelevance is a number
const taskRelevance = typeof sel.taskRelevance === 'number' ?
sel.taskRelevance : parseInt(String(sel.taskRelevance)) || 70;
// Derive priority automatically from score
const priority = this.derivePriorityFromScore(taskRelevance);
this.addToolToSelection(context, tool, phase.id, priority, sel.justification, taskRelevance, sel.limitations);
}
});
this.addAuditEntry(context, 'micro-task', 'phase-tool-selection',
{ phase: phase.id, availableTools: phaseTools.length },
{ validSelections: validSelections.length, selectedTools: validSelections.map(s => s.toolName) },
{
validSelections: validSelections.length,
selectedTools: validSelections.map(s => ({
name: s.toolName,
taskRelevance: s.taskRelevance,
derivedPriority: this.derivePriorityFromScore(s.taskRelevance)
}))
},
validSelections.length > 0 ? 75 : 30,
Date.now() - result.processingTimeMs,
{ phaseName: phase.name }
{ phaseName: phase.name, comparativeEvaluation: true, priorityDerived: true }
);
}
}
@@ -1016,56 +939,46 @@ ${JSON.stringify(conceptsToSend, null, 2)}`;
}
private async evaluateSpecificTool(context: AnalysisContext, tool: any, rank: number): Promise<MicroTaskResult> {
const prompt = getPrompt('toolEvaluation', context.userQuery, tool, rank);
// Get existing task relevance from previous phase selection
const existingSelection = context.selectedTools?.find(st => st.tool.name === tool.name);
const taskRelevance = existingSelection?.taskRelevance || 70;
const priority = this.derivePriorityFromScore(taskRelevance);
const prompt = getPrompt('toolEvaluation', context.userQuery, tool, rank, taskRelevance);
const result = await this.callMicroTaskAI(prompt, context, 1200);
const result = await this.callMicroTaskAI(prompt, context, 1000);
if (result.success) {
const evaluation = this.safeParseJSON(result.content, {
suitability_score: 'medium',
task_relevance: '',
detailed_explanation: 'Evaluation failed',
implementation_approach: '',
pros: [],
cons: [],
limitations: [],
alternatives: ''
});
// Debug logging to see what we're getting
console.log(`[AI PIPELINE] Tool ${tool.name} evaluation:`, {
taskRelevance: evaluation.task_relevance,
suitabilityScore: evaluation.suitability_score,
limitationsCount: evaluation.limitations?.length || 0
});
// Ensure task_relevance is a number
const taskRelevance = typeof evaluation.task_relevance === 'number' ?
evaluation.task_relevance :
parseInt(String(evaluation.task_relevance)) || 70;
// Store enhanced evaluation data
// Store evaluation without re-scoring
this.addToolToSelection(context, {
...tool,
evaluation: {
...evaluation,
task_relevance: taskRelevance, // Ensure it's stored as number
rank
rank,
task_relevance: taskRelevance
}
}, 'evaluation', evaluation.suitability_score, evaluation.detailed_explanation,
taskRelevance, evaluation.limitations);
}, 'evaluation', priority, evaluation.detailed_explanation,
taskRelevance, existingSelection?.limitations);
this.addAuditEntry(context, 'micro-task', 'tool-evaluation',
{ toolName: tool.name, rank },
{ toolName: tool.name, rank, existingTaskRelevance: taskRelevance, derivedPriority: priority },
{
suitabilityScore: evaluation.suitability_score,
taskRelevance: taskRelevance, // Use the cleaned number
hasExplanation: !!evaluation.detailed_explanation,
limitationsIdentified: evaluation.limitations?.length || 0
hasImplementationApproach: !!evaluation.implementation_approach,
prosCount: evaluation.pros?.length || 0,
consCount: evaluation.cons?.length || 0
},
evaluation.suitability_score === 'high' ? 85 : evaluation.suitability_score === 'medium' ? 70 : 50,
70,
Date.now() - result.processingTimeMs,
{ toolType: tool.type, taskRelevanceScore: taskRelevance }
{ toolType: tool.type, explanationOnly: true, priorityDerived: true }
);
}
@@ -1173,6 +1086,12 @@ ${JSON.stringify(conceptsToSend, null, 2)}`;
}
}
/**
 * Maps a numeric task-relevance score onto a priority label.
 *
 * @param taskRelevance - Relevance score to classify.
 * @returns 'high' for scores of 80 or more, 'medium' for scores of 60 or
 *   more, and 'low' for everything else (including values that fail both
 *   comparisons, such as NaN).
 */
private derivePriorityFromScore(taskRelevance: number): string {
  // Ordered from the highest floor down; the first floor reached wins.
  const tiers: ReadonlyArray<readonly [number, string]> = [
    [80, 'high'],
    [60, 'medium'],
  ];

  for (const [floor, label] of tiers) {
    if (taskRelevance >= floor) {
      return label;
    }
  }

  return 'low';
}
async processQuery(userQuery: string, mode: string): Promise<AnalysisResult> {
const startTime = Date.now();
let completeTasks = 0;
@@ -1323,8 +1242,7 @@ ${JSON.stringify(conceptsToSend, null, 2)}`;
components: {
semantic: confidence.semanticRelevance,
suitability: confidence.taskSuitability,
consistency: confidence.methodologicalConsistency,
reliability: confidence.toolReliability
consistency: confidence.methodologicalConsistency
}
},
confidence.overall,