From b192f257a19a2d4dca15c4b65949cc9db5327ab6 Mon Sep 17 00:00:00 2001 From: overcuriousity Date: Sat, 2 Aug 2025 15:01:05 +0200 Subject: [PATCH] phase 3 --- .env.example | 24 +- src/pages/api/ai/query.ts | 81 +++- src/utils/aiPipeline.ts | 39 +- src/utils/auditTrail.ts | 231 +++++++++-- src/utils/confidenceScoring.ts | 687 +++++++++++++++++++++++++++++++++ 5 files changed, 1011 insertions(+), 51 deletions(-) create mode 100644 src/utils/confidenceScoring.ts diff --git a/.env.example b/.env.example index 769d898..78fdf2a 100644 --- a/.env.example +++ b/.env.example @@ -1,7 +1,3 @@ -# =========================================== -# ForensicPathways Environment Configuration -# =========================================== - # === Authentication Configuration === AUTHENTICATION_NECESSARY=false AUTHENTICATION_NECESSARY_CONTRIBUTIONS=false @@ -49,11 +45,29 @@ FORENSIC_BIAS_DETECTION_ENABLED=true FORENSIC_AUDIT_RETENTION_DAYS=90 FORENSIC_AUDIT_DETAIL_LEVEL=detailed +# === PHASE 3: CONFIDENCE SCORING CONFIGURATION === +# Confidence Assessment Weights (must sum to 1.0) +CONFIDENCE_WEIGHT_RETRIEVAL=0.25 +CONFIDENCE_WEIGHT_SELECTION=0.35 +CONFIDENCE_WEIGHT_DOMAIN=0.25 +CONFIDENCE_WEIGHT_META=0.15 + +# Confidence Quality Thresholds +AI_CONFIDENCE_THRESHOLD=0.7 +CONFIDENCE_RELIABILITY_MINIMUM=0.5 +CONFIDENCE_UNCERTAINTY_ALERT_THRESHOLD=3 +CONFIDENCE_IMPROVEMENT_SUGGESTION_MAX=5 + +# Component-Specific Confidence Thresholds +RETRIEVAL_CONFIDENCE_MINIMUM=0.6 +SELECTION_CONFIDENCE_MINIMUM=0.5 +DOMAIN_CONFIDENCE_MINIMUM=0.5 +META_CONFIDENCE_MINIMUM=0.6 + # === CONFIGURABLE THRESHOLDS (NO MORE HARD-CODED VALUES) === AI_MAX_SELECTED_ITEMS=60 AI_EMBEDDING_CANDIDATES=60 AI_SIMILARITY_THRESHOLD=0.3 -AI_CONFIDENCE_THRESHOLD=0.7 AI_BIAS_ALERT_THRESHOLD=0.8 TOOL_POPULARITY_BIAS_THRESHOLD=0.75 EMBEDDINGS_CONFIDENCE_THRESHOLD=0.6 diff --git a/src/pages/api/ai/query.ts b/src/pages/api/ai/query.ts index ca8cb0e..0b89c82 100644 --- a/src/pages/api/ai/query.ts +++ b/src/pages/api/ai/query.ts @@ -1,4 +1,4 @@ -// src/pages/api/ai/query.ts - Enhanced with Forensic Audit Trail +// src/pages/api/ai/query.ts - Enhanced with Comprehensive Confidence Metrics import type { APIRoute } from 'astro'; import { withAPIAuth } from '../../../utils/auth.js'; @@ -6,6 +6,7 @@ import { apiError, apiServerError, createAuthErrorResponse } from '../../../util import { enqueueApiCall } from '../../../utils/rateLimitedQueue.js'; import { aiPipeline } from '../../../utils/aiPipeline.js'; import { forensicConfig } from '../../../utils/forensicConfig.js'; +import { confidenceScorer } from '../../../utils/confidenceScoring.js'; export const prerender = false; @@ -124,7 +125,8 @@ export const POST: APIRoute = async ({ request }) => { const { query, mode = 'workflow', taskId: clientTaskId } = body; console.log(`[ENHANCED API] Received request - TaskId: ${clientTaskId}, Mode: ${mode}, Query length: ${query?.length || 0}`); - console.log(`[ENHANCED API] User: ${userId}, Audit Trail: ${config.auditTrail.enabled ? 'Enabled' : 'Disabled'}`); + console.log(`[ENHANCED API] User: ${userId}, Confidence Scoring: ${config.features.confidenceScoring ? 'Enabled' : 'Disabled'}`); + console.log(`[ENHANCED API] Audit Trail: ${config.auditTrail.enabled ? 'Enabled' : 'Disabled'}`); console.log(`[ENHANCED API] Micro-task calls remaining: ${rateLimitResult.microTasksRemaining}`); if (!query || typeof query !== 'string') { @@ -147,7 +149,7 @@ export const POST: APIRoute = async ({ request }) => { console.log(`[ENHANCED API] About to enqueue enhanced pipeline ${taskId}`); - // Use enhanced pipeline with audit trail + // Use enhanced pipeline with audit trail and confidence scoring const result = await enqueueApiCall(() => aiPipeline.processQuery(sanitizedQuery, mode, userId) , taskId); @@ -175,15 +177,33 @@ export const POST: APIRoute = async ({ request }) => { if (result.auditTrail) { console.log(` - Audit Trail ID: ${result.auditTrail.auditId}`); - console.log(` - Overall Confidence: ${(result.auditTrail.qualityMetrics.overallConfidence * 100).toFixed(1)}%`); + console.log(` - Overall Confidence: ${(result.auditTrail.qualityMetrics.overallConfidence * 100).toFixed(1)}% (${result.auditTrail.qualityMetrics.qualityLevel})`); + console.log(` - Confidence Reliability: ${(result.auditTrail.qualityMetrics.confidenceReliability * 100).toFixed(1)}%`); console.log(` - Bias Risk Score: ${(result.auditTrail.qualityMetrics.biasRiskScore * 100).toFixed(1)}%`); console.log(` - Transparency Score: ${(result.auditTrail.qualityMetrics.transparencyScore * 100).toFixed(1)}%`); + + if (result.auditTrail.qualityMetrics.uncertaintyFactors.length > 0) { + console.log(` - Uncertainty factors: ${result.auditTrail.qualityMetrics.uncertaintyFactors.join(', ')}`); + } + } + + // NEW: Enhanced confidence metrics + if (result.confidenceMetrics && config.features.confidenceScoring) { + console.log(` - Confidence Breakdown:`); + console.log(` * Retrieval: ${(result.confidenceMetrics.breakdown.retrieval * 100).toFixed(1)}%`); + console.log(` * Selection: ${(result.confidenceMetrics.breakdown.selection * 100).toFixed(1)}%`); + console.log(` * Domain: ${(result.confidenceMetrics.breakdown.domain * 100).toFixed(1)}%`); + console.log(` * Meta: ${(result.confidenceMetrics.breakdown.meta * 100).toFixed(1)}%`); } const currentLimit = rateLimitStore.get(userId); const remainingMicroTasks = currentLimit ? MICRO_TASK_TOTAL_LIMIT - currentLimit.microTaskCount : MICRO_TASK_TOTAL_LIMIT; + // NEW: Check if confidence is acceptable + const confidenceAcceptable = result.auditTrail ? + confidenceScorer.isConfidenceAcceptable(result.auditTrail.qualityMetrics.overallConfidence) : true; + return new Response(JSON.stringify({ success: true, mode, @@ -192,34 +212,73 @@ export const POST: APIRoute = async ({ request }) => { query: sanitizedQuery, processingStats: { ...result.processingStats, - pipelineType: 'enhanced-micro-task', + pipelineType: 'enhanced-micro-task-with-confidence', microTasksSuccessRate: stats.microTasksCompleted / Math.max(stats.microTasksCompleted + stats.microTasksFailed, 1), averageTaskTime: stats.processingTimeMs / Math.max(stats.microTasksCompleted + stats.microTasksFailed, 1), estimatedAICallsMade, auditCompliant: result.auditTrail?.compliance.auditCompliant || false, biasChecked: result.auditTrail?.compliance.biasChecked || false, - confidenceAssessed: result.auditTrail?.compliance.confidenceAssessed || false + confidenceAssessed: result.auditTrail?.compliance.confidenceAssessed || false, + // NEW: Confidence acceptance flag + confidenceAcceptable }, - // NEW: Forensic metadata + // ENHANCED: Comprehensive forensic metadata with confidence details forensicMetadata: result.auditTrail ? { auditTrailId: result.auditTrail.auditId, auditEnabled: config.auditTrail.enabled, + + // Core quality metrics overallConfidence: result.auditTrail.qualityMetrics.overallConfidence, biasRiskScore: result.auditTrail.qualityMetrics.biasRiskScore, transparencyScore: result.auditTrail.qualityMetrics.transparencyScore, reproducibilityScore: result.auditTrail.qualityMetrics.reproducibilityScore, evidenceQuality: result.auditTrail.qualityMetrics.evidenceQuality, methodologicalSoundness: result.auditTrail.qualityMetrics.methodologicalSoundness, + + // NEW: Detailed confidence breakdown + confidenceBreakdown: config.features.confidenceScoring ? { + retrieval: result.auditTrail.qualityMetrics.confidenceBreakdown.retrieval, + selection: result.auditTrail.qualityMetrics.confidenceBreakdown.selection, + domain: result.auditTrail.qualityMetrics.confidenceBreakdown.domain, + meta: result.auditTrail.qualityMetrics.confidenceBreakdown.meta + } : undefined, + + // NEW: Confidence assessment details + confidenceAssessment: config.features.confidenceScoring ? { + qualityLevel: result.auditTrail.qualityMetrics.qualityLevel, + reliability: result.auditTrail.qualityMetrics.confidenceReliability, + uncertaintyFactors: result.auditTrail.qualityMetrics.uncertaintyFactors, + improvementSuggestions: result.auditTrail.qualityMetrics.improvementSuggestions, + isAcceptable: confidenceAcceptable, + threshold: thresholds.confidenceThreshold + } : undefined, + + // Bias and quality warnings biasWarnings: result.auditTrail.biasAnalysis.filter(b => b.detected), + qualityWarnings: !confidenceAcceptable ? ['Confidence below acceptable threshold'] : [], + + // System configuration snapshot systemConfig: { strategicModel: result.auditTrail.systemConfig.strategicModel, tacticalModel: result.auditTrail.systemConfig.tacticalModel, - auditLevel: result.auditTrail.systemConfig.auditLevel + auditLevel: result.auditTrail.systemConfig.auditLevel, + confidenceScoringEnabled: config.features.confidenceScoring, + biasDetectionEnabled: config.features.biasDetection }, + + // Compliance and traceability compliance: result.auditTrail.compliance, qualityLevel: result.auditTrail.qualityMetrics.overallConfidence >= thresholds.confidenceThreshold ? 'high' : - result.auditTrail.qualityMetrics.overallConfidence >= 0.5 ? 'medium' : 'low' + result.auditTrail.qualityMetrics.overallConfidence >= 0.5 ? 'medium' : 'low', + + // NEW: Actionable insights + actionableInsights: { + shouldReviewSelection: result.auditTrail.qualityMetrics.biasRiskScore > thresholds.biasAlertThreshold, + shouldImproveQuery: result.auditTrail.qualityMetrics.uncertaintyFactors.length > 2, + shouldSeekExpertReview: result.auditTrail.qualityMetrics.overallConfidence < 0.6, + confidenceImprovement: result.auditTrail.qualityMetrics.improvementSuggestions.slice(0, 3) + } } : { auditTrailId: null, auditEnabled: false, @@ -240,7 +299,9 @@ export const POST: APIRoute = async ({ request }) => { console.error('[ENHANCED API] Pipeline error:', error); // Provide detailed error information for forensic purposes - if (error.message.includes('embeddings')) { + if (error.message.includes('confidence')) { + return apiServerError.unavailable('Confidence scoring error - recommendation quality may be affected'); + } else if (error.message.includes('embeddings')) { return apiServerError.unavailable('Embeddings service error - using AI fallback with audit trail'); } else if (error.message.includes('micro-task')) { return apiServerError.unavailable('Micro-task pipeline error - some analysis steps failed but audit trail maintained'); diff --git a/src/utils/aiPipeline.ts b/src/utils/aiPipeline.ts index 50e7b09..16f87d7 100644 --- a/src/utils/aiPipeline.ts +++ b/src/utils/aiPipeline.ts @@ -1,9 +1,10 @@ -// src/utils/aiPipeline.ts - Enhanced Forensic AI Pipeline with Audit Trail +// src/utils/aiPipeline.ts - Enhanced with Confidence Scoring Integration import { getCompressedToolsDataForAI } from './dataService.js'; import { embeddingsService, type EmbeddingData, type EmbeddingSearchResult } from './embeddings.js'; import { forensicConfig, type AIModelConfig } from './forensicConfig.js'; import { auditTrailService, type ForensicAuditEntry } from './auditTrail.js'; +import { confidenceScorer, type ConfidenceMetrics } from './confidenceScoring.js'; interface MicroTaskResult { taskType: string; @@ -36,6 +37,8 @@ interface AnalysisResult { biasRiskScore: number; transparencyScore: number; }; + // NEW: Enhanced confidence metrics + confidenceMetrics?: ConfidenceMetrics; } interface AnalysisContext { @@ -80,9 +83,10 @@ class EnhancedMicroTaskAIPipeline { this.maxContextTokens = this.config.aiModels.strategic.maxContextTokens; this.maxPromptTokens = Math.floor(this.maxContextTokens * 0.6); // Leave room for response - console.log('[ENHANCED PIPELINE] Initialized with forensic configuration'); + console.log('[ENHANCED PIPELINE] Initialized with forensic configuration and confidence scoring'); console.log(`[ENHANCED PIPELINE] Strategic Model: ${this.config.aiModels.strategic.model}`); console.log(`[ENHANCED PIPELINE] Tactical Model: ${this.config.aiModels.tactical.model}`); + console.log(`[ENHANCED PIPELINE] Confidence Scoring: ${this.config.features.confidenceScoring ? 'Enabled' : 'Disabled'}`); console.log(`[ENHANCED PIPELINE] Audit Trail: ${this.config.auditTrail.enabled ? 'Enabled' : 'Disabled'}`); } @@ -447,7 +451,11 @@ Respond with ONLY this JSON format: rawResponse: aiResult.content }); + // NEW: Log domain confidence analysis + auditTrailService.logDomainAnalysis(selectedTools, selectedConcepts); + console.log(`[ENHANCED PIPELINE] Final selection: ${selectedTools.length} tools with bias prevention applied`); + console.log(`[ENHANCED PIPELINE] Domain confidence analysis logged`); return { selectedTools, @@ -644,10 +652,8 @@ WICHTIG: Antworten Sie NUR in fließendem deutschen Text ohne Listen, Aufzählun return result; } - // ... (Additional micro-task methods would be implemented similarly with audit trail integration) - // ============================================================================ - // MAIN PROCESSING METHOD WITH FULL AUDIT TRAIL + // MAIN PROCESSING METHOD WITH FULL AUDIT TRAIL AND CONFIDENCE // ============================================================================ async processQuery(userQuery: string, mode: string, userId: string = 'anonymous'): Promise { @@ -718,13 +724,31 @@ WICHTIG: Antworten Sie NUR in fließendem deutschen Text ohne Listen, Aufzählun tokensTotalUsed: auditTrail?.processingSummary.tokensTotalUsed || 0 }; + // NEW: Extract confidence metrics from audit trail + let confidenceMetrics: ConfidenceMetrics | undefined; + if (auditTrail && this.config.features.confidenceScoring) { + confidenceMetrics = { + overall: auditTrail.qualityMetrics.overallConfidence, + breakdown: auditTrail.qualityMetrics.confidenceBreakdown, + uncertaintyFactors: auditTrail.qualityMetrics.uncertaintyFactors, + improvementSuggestions: auditTrail.qualityMetrics.improvementSuggestions, + qualityLevel: auditTrail.qualityMetrics.qualityLevel, + reliability: auditTrail.qualityMetrics.confidenceReliability + }; + } + console.log(`[ENHANCED PIPELINE] Completed: ${completedTasks} tasks, Failed: ${failedTasks} tasks`); console.log(`[ENHANCED PIPELINE] Unique tools selected: ${context.seenToolNames.size}`); if (auditTrail) { console.log(`[ENHANCED PIPELINE] Audit Trail: ${auditTrail.auditId}`); - console.log(`[ENHANCED PIPELINE] Quality Score: ${(auditTrail.qualityMetrics.overallConfidence * 100).toFixed(1)}%`); + console.log(`[ENHANCED PIPELINE] Overall Confidence: ${(auditTrail.qualityMetrics.overallConfidence * 100).toFixed(1)}% (${auditTrail.qualityMetrics.qualityLevel})`); + console.log(`[ENHANCED PIPELINE] Confidence Reliability: ${(auditTrail.qualityMetrics.confidenceReliability * 100).toFixed(1)}%`); console.log(`[ENHANCED PIPELINE] Bias Risk: ${(auditTrail.qualityMetrics.biasRiskScore * 100).toFixed(1)}%`); + + if (auditTrail.qualityMetrics.uncertaintyFactors.length > 0) { + console.log(`[ENHANCED PIPELINE] Uncertainty factors: ${auditTrail.qualityMetrics.uncertaintyFactors.length}`); + } } return { @@ -735,7 +759,8 @@ WICHTIG: Antworten Sie NUR in fließendem deutschen Text ohne Listen, Aufzählun overallConfidence: auditTrail.qualityMetrics.overallConfidence, biasRiskScore: auditTrail.qualityMetrics.biasRiskScore, transparencyScore: auditTrail.qualityMetrics.transparencyScore - } : undefined + } : undefined, + confidenceMetrics // NEW: Return detailed confidence metrics }; } catch (error) { diff --git a/src/utils/auditTrail.ts b/src/utils/auditTrail.ts index 4b94e30..dac9004 100644 --- a/src/utils/auditTrail.ts +++ b/src/utils/auditTrail.ts @@ -1,9 +1,10 @@ -// src/utils/auditTrail.ts - Forensic Audit Trail System +// src/utils/auditTrail.ts - Enhanced Forensic Audit Trail with Confidence Scoring import { forensicConfig } from './forensicConfig.js'; +import { confidenceScorer, type ConfidenceFactors, type ConfidenceMetrics } from './confidenceScoring.js'; // ============================================================================ -// AUDIT TRAIL DATA STRUCTURES +// ENHANCED AUDIT TRAIL DATA STRUCTURES // ============================================================================ interface QueryClassification { @@ -80,13 +81,32 @@ interface MicroTaskAudit { contextContinuityUsed: boolean; } +// ENHANCED: More comprehensive quality metrics with confidence integration interface QualityMetrics { + // Overall metrics overallConfidence: number; reproducibilityScore: number; biasRiskScore: number; transparencyScore: number; evidenceQuality: number; methodologicalSoundness: number; + + // NEW: Detailed confidence breakdown + confidenceBreakdown: { + retrieval: number; + selection: number; + domain: number; + meta: number; + }; + + // NEW: Confidence factors used in calculation + confidenceFactors: ConfidenceFactors; + + // NEW: Uncertainty and improvement tracking + uncertaintyFactors: string[]; + improvementSuggestions: string[]; + qualityLevel: 'low' | 'medium' | 'high' | 'excellent'; + confidenceReliability: number; } interface ForensicAuditEntry { @@ -123,7 +143,7 @@ interface ForensicAuditEntry { // Micro-task Audit microTasks: MicroTaskAudit[]; - // Final Quality Metrics + // ENHANCED: Final Quality Metrics with comprehensive confidence qualityMetrics: QualityMetrics; // Processing Summary @@ -147,7 +167,7 @@ interface ForensicAuditEntry { } // ============================================================================ -// AUDIT TRAIL SERVICE IMPLEMENTATION +// ENHANCED AUDIT TRAIL SERVICE IMPLEMENTATION // ============================================================================ class ForensicAuditTrailService { @@ -155,15 +175,18 @@ class ForensicAuditTrailService { private auditStorage: Map = new Map(); private config = forensicConfig.getConfig(); + // NEW: Track confidence factors as they're calculated + private currentConfidenceFactors: Partial = {}; + constructor() { if (this.config.auditTrail.enabled) { - console.log('[AUDIT TRAIL] Forensic audit trail service initialized'); + console.log('[AUDIT TRAIL] Enhanced forensic audit trail service initialized with confidence scoring'); this.setupCleanupInterval(); } } // ======================================================================== - // AUDIT LIFECYCLE MANAGEMENT + // AUDIT LIFECYCLE MANAGEMENT (Enhanced) // ======================================================================== startAudit(userId: string, query: string, mode: 'workflow' | 'tool'): string { @@ -234,13 +257,38 @@ class ForensicAuditTrailService { biasAnalysis: [], microTasks: [], + // ENHANCED: Initialize with default confidence structure qualityMetrics: { overallConfidence: 0, reproducibilityScore: 0, biasRiskScore: 0, transparencyScore: 0, evidenceQuality: 0, - methodologicalSoundness: 0 + methodologicalSoundness: 0, + confidenceBreakdown: { + retrieval: 0, + selection: 0, + domain: 0, + meta: 0 + }, + confidenceFactors: { + embeddingsQuality: 0.5, + candidateRelevance: 0.5, + retrievalMethod: 0.5, + aiModelCertainty: 0.5, + selectionConsistency: 0.5, + reasoningQuality: 0.5, + scenarioSpecificity: 0.5, + toolSpecialization: 0.5, + expertiseAlignment: 0.5, + biasRiskLevel: 0.5, + historicalAccuracy: 0.5, + processingStability: 0.5 + }, + uncertaintyFactors: [], + improvementSuggestions: [], + qualityLevel: 'medium', + confidenceReliability: 0.5 }, processingSummary: { @@ -261,7 +309,10 @@ class ForensicAuditTrailService { } }; - console.log(`[AUDIT TRAIL] Started audit ${auditId} for user ${userId}, mode: ${mode}`); + // Reset confidence factors tracking + this.currentConfidenceFactors = {}; + + console.log(`[AUDIT TRAIL] Started enhanced audit ${auditId} for user ${userId}, mode: ${mode}`); return auditId; } @@ -281,7 +332,7 @@ class ForensicAuditTrailService { } // ======================================================================== - // RETRIEVAL PROCESS LOGGING + // RETRIEVAL PROCESS LOGGING (Enhanced with Confidence) // ======================================================================== logRetrievalStart(method: 'embeddings' | 'ai_selector' | 'emergency_fallback'): void { @@ -312,10 +363,21 @@ class ForensicAuditTrailService { processingTimeMs: data.processingTimeMs, fallbackReason: data.fallbackReason }; + + // NEW: Calculate and store retrieval confidence factors + const retrievalAssessment = confidenceScorer.assessRetrievalConfidence(this.currentAudit.retrievalProcess); + + // Update confidence factors + this.currentConfidenceFactors = { + ...this.currentConfidenceFactors, + ...retrievalAssessment.factors + }; + + console.log(`[AUDIT TRAIL] Retrieval confidence: ${(retrievalAssessment.confidence * 100).toFixed(1)}%`); } // ======================================================================== - // SELECTION PROCESS LOGGING + // SELECTION PROCESS LOGGING (Enhanced with Confidence) // ======================================================================== logSelectionStart(aiModel: 'strategic' | 'tactical' | 'legacy', initialCandidates: string[]): void { @@ -361,6 +423,17 @@ class ForensicAuditTrailService { this.currentAudit.processingSummary.aiCallsMade++; this.currentAudit.processingSummary.tokensTotalUsed += data.promptTokens + data.responseTokens; + + // NEW: Calculate and store selection confidence factors + const selectionAssessment = confidenceScorer.assessSelectionConfidence(this.currentAudit.selectionProcess); + + // Update confidence factors + this.currentConfidenceFactors = { + ...this.currentConfidenceFactors, + ...selectionAssessment.factors + }; + + console.log(`[AUDIT TRAIL] Selection confidence: ${(selectionAssessment.confidence * 100).toFixed(1)}%`); } // ======================================================================== @@ -373,9 +446,13 @@ class ForensicAuditTrailService { this.currentAudit.biasAnalysis = [...biasResults]; this.currentAudit.compliance.biasChecked = true; - // Calculate overall bias risk score + // Calculate overall bias risk score for confidence factors const biasRiskScore = biasResults.length > 0 ? Math.max(...biasResults.filter(b => b.detected).map(b => b.severity)) : 0; + + // Update confidence factors + this.currentConfidenceFactors.biasRiskLevel = Math.max(0, 1 - biasRiskScore); + this.currentAudit.qualityMetrics.biasRiskScore = biasRiskScore; } @@ -425,7 +502,36 @@ class ForensicAuditTrailService { } // ======================================================================== - // AUDIT FINALIZATION + // NEW: DOMAIN CONFIDENCE LOGGING + // ======================================================================== + + logDomainAnalysis(selectedTools: any[], selectedConcepts: any[]): void { + if (!this.currentAudit || !this.config.auditTrail.enabled) return; + + // Calculate domain confidence factors + const domainAssessment = confidenceScorer.assessDomainConfidence( + { + userQuery: this.currentAudit.userQuery, + queryMode: this.currentAudit.queryMode, + sanitizedQuery: this.currentAudit.sanitizedQuery + }, + { + selectedTools, + selectedConcepts + } + ); + + // Update confidence factors + this.currentConfidenceFactors = { + ...this.currentConfidenceFactors, + ...domainAssessment.factors + }; + + console.log(`[AUDIT TRAIL] Domain confidence: ${(domainAssessment.confidence * 100).toFixed(1)}%`); + } + + // ======================================================================== + // ENHANCED AUDIT FINALIZATION WITH COMPREHENSIVE CONFIDENCE // ======================================================================== calculateQualityMetrics(): void { @@ -433,8 +539,27 @@ class ForensicAuditTrailService { const audit = this.currentAudit; - // Overall confidence (weighted average of retrieval and selection confidence) - const overallConfidence = ( + // NEW: Calculate meta confidence factors + const metaAssessment = confidenceScorer.assessMetaConfidence({ + biasAnalysis: audit.biasAnalysis, + microTasks: audit.microTasks, + processingSummary: audit.processingSummary, + errorCount: audit.processingSummary.errorsEncountered, + fallbacksUsed: audit.processingSummary.fallbacksUsed + }); + + // Finalize confidence factors + const finalConfidenceFactors: ConfidenceFactors = { + ...audit.qualityMetrics.confidenceFactors, // defaults + ...this.currentConfidenceFactors, // calculated values + ...metaAssessment.factors // meta factors + }; + + // NEW: Calculate comprehensive confidence metrics using new framework + const confidenceMetrics = confidenceScorer.calculateOverallConfidence(finalConfidenceFactors); + + // Legacy metrics calculation (for backward compatibility) + const legacyOverallConfidence = ( audit.retrievalProcess.retrievalConfidence * 0.3 + audit.selectionProcess.confidenceScore * 0.5 + (audit.microTasks.reduce((sum, task) => sum + task.confidence, 0) / Math.max(audit.microTasks.length, 1)) * 0.2 @@ -447,9 +572,6 @@ class ForensicAuditTrailService { (audit.microTasks.length >= 4 ? 0.4 : audit.microTasks.length * 0.1) ); - // Bias risk score (inverse of detected bias severity) - const biasRiskScore = audit.qualityMetrics.biasRiskScore; - // Transparency score (based on audit detail level and traceability) const transparencyScore = ( (audit.selectionProcess.rawResponse.length > 0 ? 0.3 : 0) + @@ -472,17 +594,28 @@ class ForensicAuditTrailService { (audit.microTasks.length >= 4 ? 0.2 : 0) ); + // NEW: Use comprehensive confidence metrics audit.qualityMetrics = { - overallConfidence: Math.min(1, Math.max(0, overallConfidence)), + overallConfidence: Math.min(1, Math.max(0, confidenceMetrics.overall)), reproducibilityScore: Math.min(1, Math.max(0, reproducibilityScore)), - biasRiskScore: Math.min(1, Math.max(0, biasRiskScore)), + biasRiskScore: Math.min(1, Math.max(0, audit.qualityMetrics.biasRiskScore)), transparencyScore: Math.min(1, Math.max(0, transparencyScore)), evidenceQuality: Math.min(1, Math.max(0, evidenceQuality)), - methodologicalSoundness: Math.min(1, Math.max(0, methodologicalSoundness)) + methodologicalSoundness: Math.min(1, Math.max(0, methodologicalSoundness)), + + // NEW: Enhanced confidence data + confidenceBreakdown: confidenceMetrics.breakdown, + confidenceFactors: finalConfidenceFactors, + uncertaintyFactors: confidenceMetrics.uncertaintyFactors, + improvementSuggestions: confidenceMetrics.improvementSuggestions, + qualityLevel: confidenceMetrics.qualityLevel, + confidenceReliability: confidenceMetrics.reliability }; audit.compliance.confidenceAssessed = true; audit.compliance.traceabilityScore = transparencyScore; + + console.log(`[AUDIT TRAIL] Enhanced quality metrics calculated - Overall confidence: ${(confidenceMetrics.overall * 100).toFixed(1)}%`); } finalizeAudit(finalRecommendationCount: number): ForensicAuditEntry | null { @@ -496,7 +629,7 @@ class ForensicAuditTrailService { this.currentAudit.processingSummary.finalRecommendationCount = finalRecommendationCount; - // Calculate final quality metrics + // Calculate enhanced quality metrics this.calculateQualityMetrics(); // Store the audit trail @@ -504,17 +637,23 @@ class ForensicAuditTrailService { const finalAudit = { ...this.currentAudit }; this.currentAudit = null; + this.currentConfidenceFactors = {}; // Reset - console.log(`[AUDIT TRAIL] Finalized audit ${finalAudit.auditId}`); - console.log(`[AUDIT TRAIL] Quality Score: ${(finalAudit.qualityMetrics.overallConfidence * 100).toFixed(1)}%`); + console.log(`[AUDIT TRAIL] Finalized enhanced audit ${finalAudit.auditId}`); + console.log(`[AUDIT TRAIL] Overall Confidence: ${(finalAudit.qualityMetrics.overallConfidence * 100).toFixed(1)}% (${finalAudit.qualityMetrics.qualityLevel})`); + console.log(`[AUDIT TRAIL] Confidence Reliability: ${(finalAudit.qualityMetrics.confidenceReliability * 100).toFixed(1)}%`); console.log(`[AUDIT TRAIL] Bias Risk: ${(finalAudit.qualityMetrics.biasRiskScore * 100).toFixed(1)}%`); console.log(`[AUDIT TRAIL] Transparency: ${(finalAudit.qualityMetrics.transparencyScore * 100).toFixed(1)}%`); + if (finalAudit.qualityMetrics.uncertaintyFactors.length > 0) { + console.log(`[AUDIT TRAIL] Uncertainty factors: ${finalAudit.qualityMetrics.uncertaintyFactors.length}`); + } + return finalAudit; } // ======================================================================== - // AUDIT RETRIEVAL AND EXPORT + // AUDIT RETRIEVAL AND EXPORT (Enhanced) // ======================================================================== getAuditTrail(auditId: string): ForensicAuditEntry | null { @@ -528,6 +667,7 @@ class ForensicAuditTrailService { return JSON.stringify(audit, null, 2); } + // NEW: Enhanced audit summary with confidence details getAuditSummary(auditId: string): any { const audit = this.getAuditTrail(auditId); if (!audit) return null; @@ -542,12 +682,37 @@ class ForensicAuditTrailService { compliance: audit.compliance, biasWarnings: audit.biasAnalysis.filter(b => b.detected).length, microTasksCompleted: audit.microTasks.filter(t => t.success).length, - totalMicroTasks: audit.microTasks.length + totalMicroTasks: audit.microTasks.length, + + // NEW: Confidence summary + confidenceSummary: { + overall: audit.qualityMetrics.overallConfidence, + qualityLevel: audit.qualityMetrics.qualityLevel, + reliability: audit.qualityMetrics.confidenceReliability, + uncertaintyCount: audit.qualityMetrics.uncertaintyFactors.length, + improvementSuggestionsCount: audit.qualityMetrics.improvementSuggestions.length, + breakdown: audit.qualityMetrics.confidenceBreakdown + } }; } + // NEW: Get confidence report for specific audit + getConfidenceReport(auditId: string): string | null { + const audit = this.getAuditTrail(auditId); + if (!audit) return null; + + return confidenceScorer.formatConfidenceReport({ + overall: audit.qualityMetrics.overallConfidence, + breakdown: audit.qualityMetrics.confidenceBreakdown, + uncertaintyFactors: audit.qualityMetrics.uncertaintyFactors, + improvementSuggestions: audit.qualityMetrics.improvementSuggestions, + qualityLevel: audit.qualityMetrics.qualityLevel, + reliability: audit.qualityMetrics.confidenceReliability + }); + } + // ======================================================================== - // UTILITY METHODS + // UTILITY METHODS (Enhanced) // ======================================================================== private setupCleanupInterval(): void { @@ -570,19 +735,21 @@ class ForensicAuditTrailService { }, 60 * 60 * 1000); // Run cleanup every hour } - getStorageStats(): { totalAudits: number; oldestAudit: string | null; newestAudit: string | null } { + getStorageStats(): { totalAudits: number; oldestAudit: string | null; newestAudit: string | null; averageConfidence: number } { const audits = Array.from(this.auditStorage.values()); if (audits.length === 0) { - return { totalAudits: 0, oldestAudit: null, newestAudit: null }; + return { totalAudits: 0, oldestAudit: null, newestAudit: null, averageConfidence: 0 }; } const sorted = audits.sort((a, b) => a.timestamp.getTime() - b.timestamp.getTime()); + const averageConfidence = audits.reduce((sum, audit) => sum + audit.qualityMetrics.overallConfidence, 0) / audits.length; return { totalAudits: audits.length, oldestAudit: sorted[0].timestamp.toISOString(), - newestAudit: sorted[sorted.length - 1].timestamp.toISOString() + newestAudit: sorted[sorted.length - 1].timestamp.toISOString(), + averageConfidence }; } @@ -593,6 +760,12 @@ class ForensicAuditTrailService { isAuditInProgress(): boolean { return this.currentAudit !== null; } + + // NEW: Check if confidence is acceptable for current audit + isCurrentConfidenceAcceptable(): boolean { + if (!this.currentAudit) return false; + return confidenceScorer.isConfidenceAcceptable(this.currentAudit.qualityMetrics.overallConfidence); + } } // Export singleton instance diff --git a/src/utils/confidenceScoring.ts b/src/utils/confidenceScoring.ts new file mode 100644 index 0000000..99d4d1c --- /dev/null +++ b/src/utils/confidenceScoring.ts @@ -0,0 +1,687 @@ +// src/utils/confidenceScoring.ts - Forensic Confidence Scoring Framework + +import { forensicConfig } from './forensicConfig.js'; + +// ============================================================================ +// CONFIDENCE FACTOR INTERFACES +// ============================================================================ + +interface ConfidenceFactors { + // Retrieval confidence + embeddingsQuality: number; // How well embeddings matched (0-1) + candidateRelevance: number; // Relevance of retrieved candidates (0-1) + retrievalMethod: number; // Confidence in retrieval method used (0-1) + + // Selection confidence + aiModelCertainty: number; // AI's self-reported confidence (0-1) + selectionConsistency: number; // Internal consistency of selections (0-1) + reasoningQuality: number; // Quality of provided reasoning (0-1) + + // Domain confidence + scenarioSpecificity: number; // How specific the scenario description was (0-1) + toolSpecialization: number; // How specialized selected tools are (0-1) + expertiseAlignment: number; // Alignment with known expert preferences (0-1) + + // Meta confidence + biasRiskLevel: number; // Inverse of bias risk (higher bias = lower confidence) (0-1) + historicalAccuracy: number; // Based on past validation (if available) (0-1) + processingStability: number; // Stability of processing pipeline (0-1) +} + +interface ConfidenceWeights { + retrieval: number; // Weight for retrieval factors + selection: number; // Weight for selection factors + domain: number; // Weight for domain factors + meta: number; // Weight for meta factors +} + +interface ConfidenceMetrics { + overall: number; // Overall confidence score (0-1) + breakdown: { + retrieval: number; + selection: number; + domain: number; + meta: number; + }; + uncertaintyFactors: string[]; // List of uncertainty factors identified + improvementSuggestions: string[]; // Suggestions to improve confidence + qualityLevel: 'low' | 'medium' | 'high' | 'excellent'; + reliability: number; // Reliability of the confidence assessment itself (0-1) +} + +interface UncertaintyFactor { + factor: string; + impact: 'low' | 'medium' | 'high'; + description: string; + mitigation: string; +} + +// ============================================================================ +// CONFIDENCE SCORING IMPLEMENTATION +// ============================================================================ + +class ConfidenceScorer { + private config = forensicConfig.getConfig(); + private thresholds = forensicConfig.getThresholds(); + + // Default confidence weights (configurable via environment) + private weights: ConfidenceWeights = { + retrieval: parseFloat(process.env.CONFIDENCE_WEIGHT_RETRIEVAL || '0.25'), + selection: parseFloat(process.env.CONFIDENCE_WEIGHT_SELECTION || '0.35'), + domain: parseFloat(process.env.CONFIDENCE_WEIGHT_DOMAIN || '0.25'), + meta: parseFloat(process.env.CONFIDENCE_WEIGHT_META || '0.15') + }; + + constructor() { + // Validate weights sum to 1.0 + const weightSum = Object.values(this.weights).reduce((sum, weight) => sum + weight, 0); + if (Math.abs(weightSum - 1.0) > 0.01) { + console.warn(`[CONFIDENCE] Weight sum is ${weightSum}, adjusting to 1.0`); + const factor = 1.0 / weightSum; + this.weights.retrieval *= factor; + this.weights.selection *= factor; + this.weights.domain *= factor; + this.weights.meta *= factor; + } + } + + // ======================================================================== + // MAIN CONFIDENCE CALCULATION + // ======================================================================== + + calculateOverallConfidence(factors: ConfidenceFactors): ConfidenceMetrics { + // Calculate component scores + const retrievalScore = this.calculateRetrievalConfidence(factors); + const selectionScore = this.calculateSelectionConfidence(factors); + const domainScore = this.calculateDomainConfidence(factors); + const metaScore = this.calculateMetaConfidence(factors); + + // Calculate weighted overall score + const overall = ( + retrievalScore * this.weights.retrieval + + selectionScore * this.weights.selection + + domainScore * this.weights.domain + + metaScore * this.weights.meta + ); + + // Identify uncertainty factors + const uncertaintyFactors = this.identifyUncertaintyFactors(factors); + + // Generate improvement suggestions + const improvementSuggestions = this.suggestConfidenceImprovements(factors); + + // Determine quality level + const qualityLevel = this.determineQualityLevel(overall); + + // Calculate reliability of confidence assessment + const reliability = this.calculateConfidenceReliability(factors); + + return { + overall: Math.max(0, Math.min(1, overall)), + breakdown: { + retrieval: retrievalScore, + selection: selectionScore, + domain: domainScore, + meta: metaScore + }, + uncertaintyFactors, + improvementSuggestions, + qualityLevel, + reliability + }; + } + + // ======================================================================== + // COMPONENT CONFIDENCE CALCULATIONS + // ======================================================================== + + private calculateRetrievalConfidence(factors: ConfidenceFactors): number { + const { embeddingsQuality, candidateRelevance, retrievalMethod } = factors; + + // Weighted average with emphasis on relevance + const score = ( + embeddingsQuality * 0.4 + + candidateRelevance * 0.4 + + retrievalMethod * 0.2 + ); + + return Math.max(0, Math.min(1, score)); + } + + private calculateSelectionConfidence(factors: ConfidenceFactors): number { + const { aiModelCertainty, selectionConsistency, reasoningQuality } = factors; + + // Weighted average with emphasis on consistency and reasoning + const score = ( + aiModelCertainty * 0.3 + + selectionConsistency * 0.4 + + reasoningQuality * 0.3 + ); + + return Math.max(0, Math.min(1, score)); + } + + private calculateDomainConfidence(factors: ConfidenceFactors): number { + const { scenarioSpecificity, toolSpecialization, expertiseAlignment } = factors; + + // Weighted average with emphasis on specificity + const score = ( + scenarioSpecificity * 0.5 + + toolSpecialization * 0.3 + + expertiseAlignment * 0.2 + ); + + return Math.max(0, Math.min(1, score)); + } + + private calculateMetaConfidence(factors: ConfidenceFactors): number { + const { biasRiskLevel, historicalAccuracy, processingStability } = factors; + + // Weighted average with emphasis on bias risk + const score = ( + biasRiskLevel * 0.4 + + historicalAccuracy * 0.3 + + processingStability * 0.3 + ); + + return Math.max(0, Math.min(1, score)); + } + + // ======================================================================== + // SPECIFIC CONFIDENCE ASSESSMENTS + // ======================================================================== + + assessRetrievalConfidence(retrievalData: { + method: string; + embeddingsUsed: boolean; + candidatesFound: number; + similarityScores: Array<{ tool: string; score: number; type: string }>; + retrievalConfidence: number; + fallbackReason?: string; + }): { confidence: number; factors: Partial } { + + let embeddingsQuality = 0.5; // Default + let candidateRelevance = 0.5; // Default + let retrievalMethod = 0.5; // Default + + // Assess embeddings quality + if (retrievalData.embeddingsUsed && retrievalData.similarityScores.length > 0) { + const avgSimilarity = retrievalData.similarityScores.reduce((sum, item) => sum + item.score, 0) / retrievalData.similarityScores.length; + embeddingsQuality = Math.min(1, avgSimilarity * 2); // Scale similarity scores + + // Bonus for high-quality matches + const highQualityMatches = retrievalData.similarityScores.filter(item => item.score > 0.7).length; + if (highQualityMatches >= 5) { + embeddingsQuality = Math.min(1, embeddingsQuality + 0.1); + } + } else if (retrievalData.fallbackReason) { + embeddingsQuality = 0.3; // Lower confidence for fallback + } + + // Assess candidate relevance + if (retrievalData.candidatesFound >= 15) { + candidateRelevance = 0.8; // Good number of candidates + } else if (retrievalData.candidatesFound >= 10) { + candidateRelevance = 0.6; // Adequate candidates + } else if (retrievalData.candidatesFound >= 5) { + candidateRelevance = 0.4; // Few candidates + } else { + candidateRelevance = 0.2; // Very few candidates + } + + // Assess retrieval method confidence + switch (retrievalData.method) { + case 'embeddings': + retrievalMethod = 0.9; // High confidence in embeddings + break; + case 'ai_selector': + retrievalMethod = 0.7; // Medium confidence in AI selection + break; + case 'emergency_fallback': + retrievalMethod = 0.3; // Low confidence in emergency fallback + break; + } + + const confidence = this.calculateRetrievalConfidence({ + embeddingsQuality, + candidateRelevance, + retrievalMethod, + // Fill other factors with defaults for this calculation + aiModelCertainty: 0.5, + selectionConsistency: 0.5, + reasoningQuality: 0.5, + scenarioSpecificity: 0.5, + toolSpecialization: 0.5, + expertiseAlignment: 0.5, + biasRiskLevel: 0.5, + historicalAccuracy: 0.5, + processingStability: 0.5 + }); + + return { + confidence, + factors: { + embeddingsQuality, + candidateRelevance, + retrievalMethod + } + }; + } + + assessSelectionConfidence(selectionData: { + aiModel: string; + promptTokens: number; + responseTokens: number; + processingTimeMs: number; + initialCandidates: string[]; + finalSelection: string[]; + rejectedCandidates: Array<{ tool: string; reason: string; score?: number }>; + selectionReasoning: string; + confidenceScore: number; + rawResponse: string; + }): { confidence: number; factors: Partial } { + + let aiModelCertainty = selectionData.confidenceScore || 0.5; + let selectionConsistency = 0.5; + let reasoningQuality = 0.5; + + // Assess AI model certainty (from self-reported confidence or infer from response) + if (selectionData.confidenceScore && selectionData.confidenceScore > 0) { + aiModelCertainty = selectionData.confidenceScore; + } else { + // Infer confidence from response characteristics + const responseLength = selectionData.rawResponse.length; + const reasoningLength = selectionData.selectionReasoning.length; + + if (responseLength > 500 && reasoningLength > 100) { + aiModelCertainty = 0.7; // Detailed response suggests confidence + } else if (responseLength > 200) { + aiModelCertainty = 0.6; // Adequate response + } else { + aiModelCertainty = 0.4; // Short response suggests uncertainty + } + } + + // Assess selection consistency + const selectionRatio = selectionData.finalSelection.length / Math.max(selectionData.initialCandidates.length, 1); + if (selectionRatio > 0.8) { + selectionConsistency = 0.3; // Selected too many - low consistency + } else if (selectionRatio > 0.5) { + selectionConsistency = 0.5; // Moderate selectivity + } else if (selectionRatio > 0.2) { + selectionConsistency = 0.8; // Good selectivity + } else if (selectionRatio > 0.05) { + selectionConsistency = 0.9; // Very selective - high consistency + } else { + selectionConsistency = 0.4; // Too selective - might miss good options + } + + // Assess reasoning quality + const reasoning = selectionData.selectionReasoning.toLowerCase(); + let reasoningScore = 0.3; // Base score + + // Look for quality indicators in reasoning + const qualityIndicators = [ + 'specific', 'analysis', 'forensic', 'evidence', 'methodology', + 'suitable', 'appropriate', 'specialized', 'effective', 'relevant' + ]; + + const foundIndicators = qualityIndicators.filter(indicator => reasoning.includes(indicator)).length; + reasoningScore += (foundIndicators / qualityIndicators.length) * 0.4; + + // Penalty for vague reasoning + const vagueTerms = ['good', 'useful', 'helpful', 'nice', 'popular']; + const foundVagueTerms = vagueTerms.filter(term => reasoning.includes(term)).length; + reasoningScore -= (foundVagueTerms / vagueTerms.length) * 0.2; + + // Bonus for detailed reasoning + if (selectionData.selectionReasoning.length > 200) { + reasoningScore += 0.2; + } + + reasoningQuality = Math.max(0, Math.min(1, reasoningScore)); + + const confidence = this.calculateSelectionConfidence({ + aiModelCertainty, + selectionConsistency, + reasoningQuality, + // Fill other factors with defaults for this calculation + embeddingsQuality: 0.5, + candidateRelevance: 0.5, + retrievalMethod: 0.5, + scenarioSpecificity: 0.5, + toolSpecialization: 0.5, + expertiseAlignment: 0.5, + biasRiskLevel: 0.5, + historicalAccuracy: 0.5, + processingStability: 0.5 + }); + + return { + confidence, + factors: { + aiModelCertainty, + selectionConsistency, + reasoningQuality + } + }; + } + + assessDomainConfidence(queryData: { + userQuery: string; + queryMode: string; + sanitizedQuery: string; + }, toolData: { + selectedTools: any[]; + selectedConcepts: any[]; + }): { confidence: number; factors: Partial } { + + let scenarioSpecificity = 0.5; + let toolSpecialization = 0.5; + let expertiseAlignment = 0.5; + + // Assess scenario specificity + const query = queryData.sanitizedQuery.toLowerCase(); + const queryLength = query.length; + + // Base specificity on query length and detail + if (queryLength > 200) { + scenarioSpecificity = 0.8; // Detailed query + } else if (queryLength > 100) { + scenarioSpecificity = 0.6; // Moderate detail + } else if (queryLength > 50) { + scenarioSpecificity = 0.4; // Basic detail + } else { + scenarioSpecificity = 0.2; // Very brief query + } + + // Look for forensic-specific terms + const forensicTerms = [ + 'malware', 'incident', 'breach', 'attack', 'forensic', 'investigation', + 'evidence', 'analysis', 'memory', 'disk', 'network', 'log', 'artifact', + 'timeline', 'ioc', 'apt', 'ransomware', 'windows', 'linux', 'registry' + ]; + + const foundForensicTerms = forensicTerms.filter(term => query.includes(term)).length; + scenarioSpecificity += (foundForensicTerms / forensicTerms.length) * 0.3; + scenarioSpecificity = Math.min(1, scenarioSpecificity); + + // Assess tool specialization + const allSelectedItems = [...toolData.selectedTools, ...toolData.selectedConcepts]; + + if (allSelectedItems.length === 0) { + toolSpecialization = 0.1; + } else { + // Count specialized tools (methods, concepts, or tools with specific domains) + const specializedCount = allSelectedItems.filter(item => + item.type === 'method' || + item.type === 'concept' || + (item.domains && item.domains.length <= 2) // Specialized to few domains + ).length; + + toolSpecialization = Math.min(1, specializedCount / allSelectedItems.length); + } + + // Assess expertise alignment (simplified - could be enhanced with expert knowledge base) + expertiseAlignment = 0.5; // Default moderate alignment + + // Check for alignment with query urgency and tool selection + if (query.includes('urgent') || query.includes('rapid') || query.includes('quick')) { + const rapidTools = allSelectedItems.filter(item => + item.name?.toLowerCase().includes('rapid') || + item.name?.toLowerCase().includes('quick') || + item.name?.toLowerCase().includes('triage') || + item.type === 'method' + ).length; + + if (rapidTools > 0) { + expertiseAlignment = Math.min(1, expertiseAlignment + 0.3); // Good alignment with urgency + } else { + expertiseAlignment = Math.max(0, expertiseAlignment - 0.2); // Poor alignment with urgency + } + } + + const confidence = this.calculateDomainConfidence({ + scenarioSpecificity, + toolSpecialization, + expertiseAlignment, + // Fill other factors with defaults for this calculation + embeddingsQuality: 0.5, + candidateRelevance: 0.5, + retrievalMethod: 0.5, + aiModelCertainty: 0.5, + selectionConsistency: 0.5, + reasoningQuality: 0.5, + biasRiskLevel: 0.5, + historicalAccuracy: 0.5, + processingStability: 0.5 + }); + + return { + confidence, + factors: { + scenarioSpecificity, + toolSpecialization, + expertiseAlignment + } + }; + } + + assessMetaConfidence(auditData: { + biasAnalysis: any[]; + microTasks: any[]; + processingSummary: any; + errorCount: number; + fallbacksUsed: number; + }): { confidence: number; factors: Partial } { + + let biasRiskLevel = 0.5; + let historicalAccuracy = 0.5; // Default - could be enhanced with historical data + let processingStability = 0.5; + + // Assess bias risk level (inverse of detected bias) + if (auditData.biasAnalysis && auditData.biasAnalysis.length > 0) { + const detectedBiases = auditData.biasAnalysis.filter(bias => bias.detected); + const maxBiasSeverity = detectedBiases.length > 0 ? + Math.max(...detectedBiases.map(bias => bias.severity || 0)) : 0; + + biasRiskLevel = Math.max(0, 1 - maxBiasSeverity); // Higher bias = lower confidence + } else { + biasRiskLevel = 0.7; // No bias detected = good confidence + } + + // Assess processing stability + const totalTasks = auditData.microTasks?.length || 1; + const successfulTasks = auditData.microTasks?.filter(task => task.success).length || 0; + const successRate = successfulTasks / totalTasks; + + processingStability = successRate; + + // Penalty for fallbacks and errors + if (auditData.fallbacksUsed > 0) { + processingStability = Math.max(0, processingStability - (auditData.fallbacksUsed * 0.2)); + } + + if (auditData.errorCount > 0) { + processingStability = Math.max(0, processingStability - (auditData.errorCount * 0.1)); + } + + const confidence = this.calculateMetaConfidence({ + biasRiskLevel, + historicalAccuracy, + processingStability, + // Fill other factors with defaults for this calculation + embeddingsQuality: 0.5, + candidateRelevance: 0.5, + retrievalMethod: 0.5, + aiModelCertainty: 0.5, + selectionConsistency: 0.5, + reasoningQuality: 0.5, + scenarioSpecificity: 0.5, + toolSpecialization: 0.5, + expertiseAlignment: 0.5 + }); + + return { + confidence, + factors: { + biasRiskLevel, + historicalAccuracy, + processingStability + } + }; + } + + // ======================================================================== + // UNCERTAINTY AND IMPROVEMENT ANALYSIS + // ======================================================================== + + identifyUncertaintyFactors(factors: ConfidenceFactors): string[] { + const uncertaintyFactors: string[] = []; + const threshold = 0.6; // Below this threshold, factor contributes to uncertainty + + if (factors.embeddingsQuality < threshold) { + uncertaintyFactors.push('Embeddings similarity scores were lower than expected'); + } + + if (factors.candidateRelevance < threshold) { + uncertaintyFactors.push('Retrieved candidate tools may not be sufficiently relevant'); + } + + if (factors.aiModelCertainty < threshold) { + uncertaintyFactors.push('AI model expressed low confidence in tool selection'); + } + + if (factors.selectionConsistency < threshold) { + uncertaintyFactors.push('Tool selection criteria may not be consistently applied'); + } + + if (factors.reasoningQuality < threshold) { + uncertaintyFactors.push('Selection reasoning lacks depth or specificity'); + } + + if (factors.scenarioSpecificity < threshold) { + uncertaintyFactors.push('Query lacks forensic detail for precise tool matching'); + } + + if (factors.toolSpecialization < threshold) { + uncertaintyFactors.push('Selected tools may be too generic for the specific scenario'); + } + + if (factors.biasRiskLevel < threshold) { + uncertaintyFactors.push('Potential bias detected in tool selection process'); + } + + if (factors.processingStability < threshold) { + uncertaintyFactors.push('Processing pipeline encountered errors or fallbacks'); + } + + return uncertaintyFactors; + } + + suggestConfidenceImprovements(factors: ConfidenceFactors): string[] { + const suggestions: string[] = []; + const threshold = 0.7; // Above this threshold, no improvement needed + + if (factors.embeddingsQuality < threshold) { + suggestions.push('Consider refining query with more forensic-specific terminology'); + } + + if (factors.scenarioSpecificity < threshold) { + suggestions.push('Provide more details about affected systems, incident type, or evidence available'); + } + + if (factors.reasoningQuality < threshold) { + suggestions.push('Request detailed explanation of why specific tools were recommended'); + } + + if (factors.toolSpecialization < threshold) { + suggestions.push('Consider specifying the forensic domain or investigation phase needed'); + } + + if (factors.biasRiskLevel < threshold) { + suggestions.push('Review recommendations for potential bias toward popular tools'); + } + + if (suggestions.length === 0) { + suggestions.push('Confidence is high - no specific improvements needed'); + } + + return suggestions; + } + + private determineQualityLevel(confidence: number): 'low' | 'medium' | 'high' | 'excellent' { + if (confidence >= 0.9) return 'excellent'; + if (confidence >= 0.75) return 'high'; + if (confidence >= 0.6) return 'medium'; + return 'low'; + } + + private calculateConfidenceReliability(factors: ConfidenceFactors): number { + // Assess how reliable our confidence assessment itself is + // Based on availability of data for each factor + + let dataAvailability = 0; + let totalFactors = 0; + + // Count factors that have meaningful data (not just defaults) + Object.entries(factors).forEach(([key, value]) => { + totalFactors++; + // If factor is not exactly 0.5 (default), we have some data + if (Math.abs(value - 0.5) > 0.01) { + dataAvailability++; + } + }); + + const reliabilityBase = dataAvailability / totalFactors; + + // Penalty for extreme values (might indicate missing nuance) + const extremeValues = Object.values(factors).filter(v => v < 0.1 || v > 0.9).length; + const extremePenalty = (extremeValues / totalFactors) * 0.2; + + return Math.max(0.3, Math.min(1, reliabilityBase - extremePenalty)); + } + + // ======================================================================== + // UTILITY METHODS + // ======================================================================== + + getConfidenceThreshold(): number { + return this.thresholds.confidenceThreshold; + } + + isConfidenceAcceptable(confidence: number): boolean { + return confidence >= this.getConfidenceThreshold(); + } + + formatConfidenceReport(metrics: ConfidenceMetrics): string { + const report = ` +Confidence Assessment Report +=========================== +Overall Confidence: ${(metrics.overall * 100).toFixed(1)}% (${metrics.qualityLevel}) +Reliability: ${(metrics.reliability * 100).toFixed(1)}% + +Component Breakdown: +- Retrieval: ${(metrics.breakdown.retrieval * 100).toFixed(1)}% +- Selection: ${(metrics.breakdown.selection * 100).toFixed(1)}% +- Domain: ${(metrics.breakdown.domain * 100).toFixed(1)}% +- Meta: ${(metrics.breakdown.meta * 100).toFixed(1)}% + +${metrics.uncertaintyFactors.length > 0 ? ` +Uncertainty Factors: +${metrics.uncertaintyFactors.map(factor => `- ${factor}`).join('\n')} +` : 'No significant uncertainty factors identified.'} + +${metrics.improvementSuggestions.length > 0 ? ` +Improvement Suggestions: +${metrics.improvementSuggestions.map(suggestion => `- ${suggestion}`).join('\n')} +` : ''} + `.trim(); + + return report; + } +} + +// Export singleton instance +export const confidenceScorer = new ConfidenceScorer(); +export type { ConfidenceFactors, ConfidenceMetrics, ConfidenceWeights, UncertaintyFactor }; \ No newline at end of file