phase 3
This commit is contained in:
parent
fd05f8f291
commit
b192f257a1
24
.env.example
24
.env.example
@ -1,7 +1,3 @@
|
|||||||
# ===========================================
|
|
||||||
# ForensicPathways Environment Configuration
|
|
||||||
# ===========================================
|
|
||||||
|
|
||||||
# === Authentication Configuration ===
|
# === Authentication Configuration ===
|
||||||
AUTHENTICATION_NECESSARY=false
|
AUTHENTICATION_NECESSARY=false
|
||||||
AUTHENTICATION_NECESSARY_CONTRIBUTIONS=false
|
AUTHENTICATION_NECESSARY_CONTRIBUTIONS=false
|
||||||
@ -49,11 +45,29 @@ FORENSIC_BIAS_DETECTION_ENABLED=true
|
|||||||
FORENSIC_AUDIT_RETENTION_DAYS=90
|
FORENSIC_AUDIT_RETENTION_DAYS=90
|
||||||
FORENSIC_AUDIT_DETAIL_LEVEL=detailed
|
FORENSIC_AUDIT_DETAIL_LEVEL=detailed
|
||||||
|
|
||||||
|
# === PHASE 3: CONFIDENCE SCORING CONFIGURATION ===
|
||||||
|
# Confidence Assessment Weights (must sum to 1.0)
|
||||||
|
CONFIDENCE_WEIGHT_RETRIEVAL=0.25
|
||||||
|
CONFIDENCE_WEIGHT_SELECTION=0.35
|
||||||
|
CONFIDENCE_WEIGHT_DOMAIN=0.25
|
||||||
|
CONFIDENCE_WEIGHT_META=0.15
|
||||||
|
|
||||||
|
# Confidence Quality Thresholds
|
||||||
|
AI_CONFIDENCE_THRESHOLD=0.7
|
||||||
|
CONFIDENCE_RELIABILITY_MINIMUM=0.5
|
||||||
|
CONFIDENCE_UNCERTAINTY_ALERT_THRESHOLD=3
|
||||||
|
CONFIDENCE_IMPROVEMENT_SUGGESTION_MAX=5
|
||||||
|
|
||||||
|
# Component-Specific Confidence Thresholds
|
||||||
|
RETRIEVAL_CONFIDENCE_MINIMUM=0.6
|
||||||
|
SELECTION_CONFIDENCE_MINIMUM=0.5
|
||||||
|
DOMAIN_CONFIDENCE_MINIMUM=0.5
|
||||||
|
META_CONFIDENCE_MINIMUM=0.6
|
||||||
|
|
||||||
# === CONFIGURABLE THRESHOLDS (NO MORE HARD-CODED VALUES) ===
|
# === CONFIGURABLE THRESHOLDS (NO MORE HARD-CODED VALUES) ===
|
||||||
AI_MAX_SELECTED_ITEMS=60
|
AI_MAX_SELECTED_ITEMS=60
|
||||||
AI_EMBEDDING_CANDIDATES=60
|
AI_EMBEDDING_CANDIDATES=60
|
||||||
AI_SIMILARITY_THRESHOLD=0.3
|
AI_SIMILARITY_THRESHOLD=0.3
|
||||||
AI_CONFIDENCE_THRESHOLD=0.7
|
|
||||||
AI_BIAS_ALERT_THRESHOLD=0.8
|
AI_BIAS_ALERT_THRESHOLD=0.8
|
||||||
TOOL_POPULARITY_BIAS_THRESHOLD=0.75
|
TOOL_POPULARITY_BIAS_THRESHOLD=0.75
|
||||||
EMBEDDINGS_CONFIDENCE_THRESHOLD=0.6
|
EMBEDDINGS_CONFIDENCE_THRESHOLD=0.6
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
// src/pages/api/ai/query.ts - Enhanced with Forensic Audit Trail
|
// src/pages/api/ai/query.ts - Enhanced with Comprehensive Confidence Metrics
|
||||||
|
|
||||||
import type { APIRoute } from 'astro';
|
import type { APIRoute } from 'astro';
|
||||||
import { withAPIAuth } from '../../../utils/auth.js';
|
import { withAPIAuth } from '../../../utils/auth.js';
|
||||||
@ -6,6 +6,7 @@ import { apiError, apiServerError, createAuthErrorResponse } from '../../../util
|
|||||||
import { enqueueApiCall } from '../../../utils/rateLimitedQueue.js';
|
import { enqueueApiCall } from '../../../utils/rateLimitedQueue.js';
|
||||||
import { aiPipeline } from '../../../utils/aiPipeline.js';
|
import { aiPipeline } from '../../../utils/aiPipeline.js';
|
||||||
import { forensicConfig } from '../../../utils/forensicConfig.js';
|
import { forensicConfig } from '../../../utils/forensicConfig.js';
|
||||||
|
import { confidenceScorer } from '../../../utils/confidenceScoring.js';
|
||||||
|
|
||||||
export const prerender = false;
|
export const prerender = false;
|
||||||
|
|
||||||
@ -124,7 +125,8 @@ export const POST: APIRoute = async ({ request }) => {
|
|||||||
const { query, mode = 'workflow', taskId: clientTaskId } = body;
|
const { query, mode = 'workflow', taskId: clientTaskId } = body;
|
||||||
|
|
||||||
console.log(`[ENHANCED API] Received request - TaskId: ${clientTaskId}, Mode: ${mode}, Query length: ${query?.length || 0}`);
|
console.log(`[ENHANCED API] Received request - TaskId: ${clientTaskId}, Mode: ${mode}, Query length: ${query?.length || 0}`);
|
||||||
console.log(`[ENHANCED API] User: ${userId}, Audit Trail: ${config.auditTrail.enabled ? 'Enabled' : 'Disabled'}`);
|
console.log(`[ENHANCED API] User: ${userId}, Confidence Scoring: ${config.features.confidenceScoring ? 'Enabled' : 'Disabled'}`);
|
||||||
|
console.log(`[ENHANCED API] Audit Trail: ${config.auditTrail.enabled ? 'Enabled' : 'Disabled'}`);
|
||||||
console.log(`[ENHANCED API] Micro-task calls remaining: ${rateLimitResult.microTasksRemaining}`);
|
console.log(`[ENHANCED API] Micro-task calls remaining: ${rateLimitResult.microTasksRemaining}`);
|
||||||
|
|
||||||
if (!query || typeof query !== 'string') {
|
if (!query || typeof query !== 'string') {
|
||||||
@ -147,7 +149,7 @@ export const POST: APIRoute = async ({ request }) => {
|
|||||||
|
|
||||||
console.log(`[ENHANCED API] About to enqueue enhanced pipeline ${taskId}`);
|
console.log(`[ENHANCED API] About to enqueue enhanced pipeline ${taskId}`);
|
||||||
|
|
||||||
// Use enhanced pipeline with audit trail
|
// Use enhanced pipeline with audit trail and confidence scoring
|
||||||
const result = await enqueueApiCall(() =>
|
const result = await enqueueApiCall(() =>
|
||||||
aiPipeline.processQuery(sanitizedQuery, mode, userId)
|
aiPipeline.processQuery(sanitizedQuery, mode, userId)
|
||||||
, taskId);
|
, taskId);
|
||||||
@ -175,15 +177,33 @@ export const POST: APIRoute = async ({ request }) => {
|
|||||||
|
|
||||||
if (result.auditTrail) {
|
if (result.auditTrail) {
|
||||||
console.log(` - Audit Trail ID: ${result.auditTrail.auditId}`);
|
console.log(` - Audit Trail ID: ${result.auditTrail.auditId}`);
|
||||||
console.log(` - Overall Confidence: ${(result.auditTrail.qualityMetrics.overallConfidence * 100).toFixed(1)}%`);
|
console.log(` - Overall Confidence: ${(result.auditTrail.qualityMetrics.overallConfidence * 100).toFixed(1)}% (${result.auditTrail.qualityMetrics.qualityLevel})`);
|
||||||
|
console.log(` - Confidence Reliability: ${(result.auditTrail.qualityMetrics.confidenceReliability * 100).toFixed(1)}%`);
|
||||||
console.log(` - Bias Risk Score: ${(result.auditTrail.qualityMetrics.biasRiskScore * 100).toFixed(1)}%`);
|
console.log(` - Bias Risk Score: ${(result.auditTrail.qualityMetrics.biasRiskScore * 100).toFixed(1)}%`);
|
||||||
console.log(` - Transparency Score: ${(result.auditTrail.qualityMetrics.transparencyScore * 100).toFixed(1)}%`);
|
console.log(` - Transparency Score: ${(result.auditTrail.qualityMetrics.transparencyScore * 100).toFixed(1)}%`);
|
||||||
|
|
||||||
|
if (result.auditTrail.qualityMetrics.uncertaintyFactors.length > 0) {
|
||||||
|
console.log(` - Uncertainty factors: ${result.auditTrail.qualityMetrics.uncertaintyFactors.join(', ')}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// NEW: Enhanced confidence metrics
|
||||||
|
if (result.confidenceMetrics && config.features.confidenceScoring) {
|
||||||
|
console.log(` - Confidence Breakdown:`);
|
||||||
|
console.log(` * Retrieval: ${(result.confidenceMetrics.breakdown.retrieval * 100).toFixed(1)}%`);
|
||||||
|
console.log(` * Selection: ${(result.confidenceMetrics.breakdown.selection * 100).toFixed(1)}%`);
|
||||||
|
console.log(` * Domain: ${(result.confidenceMetrics.breakdown.domain * 100).toFixed(1)}%`);
|
||||||
|
console.log(` * Meta: ${(result.confidenceMetrics.breakdown.meta * 100).toFixed(1)}%`);
|
||||||
}
|
}
|
||||||
|
|
||||||
const currentLimit = rateLimitStore.get(userId);
|
const currentLimit = rateLimitStore.get(userId);
|
||||||
const remainingMicroTasks = currentLimit ?
|
const remainingMicroTasks = currentLimit ?
|
||||||
MICRO_TASK_TOTAL_LIMIT - currentLimit.microTaskCount : MICRO_TASK_TOTAL_LIMIT;
|
MICRO_TASK_TOTAL_LIMIT - currentLimit.microTaskCount : MICRO_TASK_TOTAL_LIMIT;
|
||||||
|
|
||||||
|
// NEW: Check if confidence is acceptable
|
||||||
|
const confidenceAcceptable = result.auditTrail ?
|
||||||
|
confidenceScorer.isConfidenceAcceptable(result.auditTrail.qualityMetrics.overallConfidence) : true;
|
||||||
|
|
||||||
return new Response(JSON.stringify({
|
return new Response(JSON.stringify({
|
||||||
success: true,
|
success: true,
|
||||||
mode,
|
mode,
|
||||||
@ -192,34 +212,73 @@ export const POST: APIRoute = async ({ request }) => {
|
|||||||
query: sanitizedQuery,
|
query: sanitizedQuery,
|
||||||
processingStats: {
|
processingStats: {
|
||||||
...result.processingStats,
|
...result.processingStats,
|
||||||
pipelineType: 'enhanced-micro-task',
|
pipelineType: 'enhanced-micro-task-with-confidence',
|
||||||
microTasksSuccessRate: stats.microTasksCompleted / Math.max(stats.microTasksCompleted + stats.microTasksFailed, 1),
|
microTasksSuccessRate: stats.microTasksCompleted / Math.max(stats.microTasksCompleted + stats.microTasksFailed, 1),
|
||||||
averageTaskTime: stats.processingTimeMs / Math.max(stats.microTasksCompleted + stats.microTasksFailed, 1),
|
averageTaskTime: stats.processingTimeMs / Math.max(stats.microTasksCompleted + stats.microTasksFailed, 1),
|
||||||
estimatedAICallsMade,
|
estimatedAICallsMade,
|
||||||
auditCompliant: result.auditTrail?.compliance.auditCompliant || false,
|
auditCompliant: result.auditTrail?.compliance.auditCompliant || false,
|
||||||
biasChecked: result.auditTrail?.compliance.biasChecked || false,
|
biasChecked: result.auditTrail?.compliance.biasChecked || false,
|
||||||
confidenceAssessed: result.auditTrail?.compliance.confidenceAssessed || false
|
confidenceAssessed: result.auditTrail?.compliance.confidenceAssessed || false,
|
||||||
|
// NEW: Confidence acceptance flag
|
||||||
|
confidenceAcceptable
|
||||||
},
|
},
|
||||||
|
|
||||||
// NEW: Forensic metadata
|
// ENHANCED: Comprehensive forensic metadata with confidence details
|
||||||
forensicMetadata: result.auditTrail ? {
|
forensicMetadata: result.auditTrail ? {
|
||||||
auditTrailId: result.auditTrail.auditId,
|
auditTrailId: result.auditTrail.auditId,
|
||||||
auditEnabled: config.auditTrail.enabled,
|
auditEnabled: config.auditTrail.enabled,
|
||||||
|
|
||||||
|
// Core quality metrics
|
||||||
overallConfidence: result.auditTrail.qualityMetrics.overallConfidence,
|
overallConfidence: result.auditTrail.qualityMetrics.overallConfidence,
|
||||||
biasRiskScore: result.auditTrail.qualityMetrics.biasRiskScore,
|
biasRiskScore: result.auditTrail.qualityMetrics.biasRiskScore,
|
||||||
transparencyScore: result.auditTrail.qualityMetrics.transparencyScore,
|
transparencyScore: result.auditTrail.qualityMetrics.transparencyScore,
|
||||||
reproducibilityScore: result.auditTrail.qualityMetrics.reproducibilityScore,
|
reproducibilityScore: result.auditTrail.qualityMetrics.reproducibilityScore,
|
||||||
evidenceQuality: result.auditTrail.qualityMetrics.evidenceQuality,
|
evidenceQuality: result.auditTrail.qualityMetrics.evidenceQuality,
|
||||||
methodologicalSoundness: result.auditTrail.qualityMetrics.methodologicalSoundness,
|
methodologicalSoundness: result.auditTrail.qualityMetrics.methodologicalSoundness,
|
||||||
|
|
||||||
|
// NEW: Detailed confidence breakdown
|
||||||
|
confidenceBreakdown: config.features.confidenceScoring ? {
|
||||||
|
retrieval: result.auditTrail.qualityMetrics.confidenceBreakdown.retrieval,
|
||||||
|
selection: result.auditTrail.qualityMetrics.confidenceBreakdown.selection,
|
||||||
|
domain: result.auditTrail.qualityMetrics.confidenceBreakdown.domain,
|
||||||
|
meta: result.auditTrail.qualityMetrics.confidenceBreakdown.meta
|
||||||
|
} : undefined,
|
||||||
|
|
||||||
|
// NEW: Confidence assessment details
|
||||||
|
confidenceAssessment: config.features.confidenceScoring ? {
|
||||||
|
qualityLevel: result.auditTrail.qualityMetrics.qualityLevel,
|
||||||
|
reliability: result.auditTrail.qualityMetrics.confidenceReliability,
|
||||||
|
uncertaintyFactors: result.auditTrail.qualityMetrics.uncertaintyFactors,
|
||||||
|
improvementSuggestions: result.auditTrail.qualityMetrics.improvementSuggestions,
|
||||||
|
isAcceptable: confidenceAcceptable,
|
||||||
|
threshold: thresholds.confidenceThreshold
|
||||||
|
} : undefined,
|
||||||
|
|
||||||
|
// Bias and quality warnings
|
||||||
biasWarnings: result.auditTrail.biasAnalysis.filter(b => b.detected),
|
biasWarnings: result.auditTrail.biasAnalysis.filter(b => b.detected),
|
||||||
|
qualityWarnings: !confidenceAcceptable ? ['Confidence below acceptable threshold'] : [],
|
||||||
|
|
||||||
|
// System configuration snapshot
|
||||||
systemConfig: {
|
systemConfig: {
|
||||||
strategicModel: result.auditTrail.systemConfig.strategicModel,
|
strategicModel: result.auditTrail.systemConfig.strategicModel,
|
||||||
tacticalModel: result.auditTrail.systemConfig.tacticalModel,
|
tacticalModel: result.auditTrail.systemConfig.tacticalModel,
|
||||||
auditLevel: result.auditTrail.systemConfig.auditLevel
|
auditLevel: result.auditTrail.systemConfig.auditLevel,
|
||||||
|
confidenceScoringEnabled: config.features.confidenceScoring,
|
||||||
|
biasDetectionEnabled: config.features.biasDetection
|
||||||
},
|
},
|
||||||
|
|
||||||
|
// Compliance and traceability
|
||||||
compliance: result.auditTrail.compliance,
|
compliance: result.auditTrail.compliance,
|
||||||
qualityLevel: result.auditTrail.qualityMetrics.overallConfidence >= thresholds.confidenceThreshold ? 'high' :
|
qualityLevel: result.auditTrail.qualityMetrics.overallConfidence >= thresholds.confidenceThreshold ? 'high' :
|
||||||
result.auditTrail.qualityMetrics.overallConfidence >= 0.5 ? 'medium' : 'low'
|
result.auditTrail.qualityMetrics.overallConfidence >= 0.5 ? 'medium' : 'low',
|
||||||
|
|
||||||
|
// NEW: Actionable insights
|
||||||
|
actionableInsights: {
|
||||||
|
shouldReviewSelection: result.auditTrail.qualityMetrics.biasRiskScore > thresholds.biasAlertThreshold,
|
||||||
|
shouldImproveQuery: result.auditTrail.qualityMetrics.uncertaintyFactors.length > 2,
|
||||||
|
shouldSeekExpertReview: result.auditTrail.qualityMetrics.overallConfidence < 0.6,
|
||||||
|
confidenceImprovement: result.auditTrail.qualityMetrics.improvementSuggestions.slice(0, 3)
|
||||||
|
}
|
||||||
} : {
|
} : {
|
||||||
auditTrailId: null,
|
auditTrailId: null,
|
||||||
auditEnabled: false,
|
auditEnabled: false,
|
||||||
@ -240,7 +299,9 @@ export const POST: APIRoute = async ({ request }) => {
|
|||||||
console.error('[ENHANCED API] Pipeline error:', error);
|
console.error('[ENHANCED API] Pipeline error:', error);
|
||||||
|
|
||||||
// Provide detailed error information for forensic purposes
|
// Provide detailed error information for forensic purposes
|
||||||
if (error.message.includes('embeddings')) {
|
if (error.message.includes('confidence')) {
|
||||||
|
return apiServerError.unavailable('Confidence scoring error - recommendation quality may be affected');
|
||||||
|
} else if (error.message.includes('embeddings')) {
|
||||||
return apiServerError.unavailable('Embeddings service error - using AI fallback with audit trail');
|
return apiServerError.unavailable('Embeddings service error - using AI fallback with audit trail');
|
||||||
} else if (error.message.includes('micro-task')) {
|
} else if (error.message.includes('micro-task')) {
|
||||||
return apiServerError.unavailable('Micro-task pipeline error - some analysis steps failed but audit trail maintained');
|
return apiServerError.unavailable('Micro-task pipeline error - some analysis steps failed but audit trail maintained');
|
||||||
|
@ -1,9 +1,10 @@
|
|||||||
// src/utils/aiPipeline.ts - Enhanced Forensic AI Pipeline with Audit Trail
|
// src/utils/aiPipeline.ts - Enhanced with Confidence Scoring Integration
|
||||||
|
|
||||||
import { getCompressedToolsDataForAI } from './dataService.js';
|
import { getCompressedToolsDataForAI } from './dataService.js';
|
||||||
import { embeddingsService, type EmbeddingData, type EmbeddingSearchResult } from './embeddings.js';
|
import { embeddingsService, type EmbeddingData, type EmbeddingSearchResult } from './embeddings.js';
|
||||||
import { forensicConfig, type AIModelConfig } from './forensicConfig.js';
|
import { forensicConfig, type AIModelConfig } from './forensicConfig.js';
|
||||||
import { auditTrailService, type ForensicAuditEntry } from './auditTrail.js';
|
import { auditTrailService, type ForensicAuditEntry } from './auditTrail.js';
|
||||||
|
import { confidenceScorer, type ConfidenceMetrics } from './confidenceScoring.js';
|
||||||
|
|
||||||
interface MicroTaskResult {
|
interface MicroTaskResult {
|
||||||
taskType: string;
|
taskType: string;
|
||||||
@ -36,6 +37,8 @@ interface AnalysisResult {
|
|||||||
biasRiskScore: number;
|
biasRiskScore: number;
|
||||||
transparencyScore: number;
|
transparencyScore: number;
|
||||||
};
|
};
|
||||||
|
// NEW: Enhanced confidence metrics
|
||||||
|
confidenceMetrics?: ConfidenceMetrics;
|
||||||
}
|
}
|
||||||
|
|
||||||
interface AnalysisContext {
|
interface AnalysisContext {
|
||||||
@ -80,9 +83,10 @@ class EnhancedMicroTaskAIPipeline {
|
|||||||
this.maxContextTokens = this.config.aiModels.strategic.maxContextTokens;
|
this.maxContextTokens = this.config.aiModels.strategic.maxContextTokens;
|
||||||
this.maxPromptTokens = Math.floor(this.maxContextTokens * 0.6); // Leave room for response
|
this.maxPromptTokens = Math.floor(this.maxContextTokens * 0.6); // Leave room for response
|
||||||
|
|
||||||
console.log('[ENHANCED PIPELINE] Initialized with forensic configuration');
|
console.log('[ENHANCED PIPELINE] Initialized with forensic configuration and confidence scoring');
|
||||||
console.log(`[ENHANCED PIPELINE] Strategic Model: ${this.config.aiModels.strategic.model}`);
|
console.log(`[ENHANCED PIPELINE] Strategic Model: ${this.config.aiModels.strategic.model}`);
|
||||||
console.log(`[ENHANCED PIPELINE] Tactical Model: ${this.config.aiModels.tactical.model}`);
|
console.log(`[ENHANCED PIPELINE] Tactical Model: ${this.config.aiModels.tactical.model}`);
|
||||||
|
console.log(`[ENHANCED PIPELINE] Confidence Scoring: ${this.config.features.confidenceScoring ? 'Enabled' : 'Disabled'}`);
|
||||||
console.log(`[ENHANCED PIPELINE] Audit Trail: ${this.config.auditTrail.enabled ? 'Enabled' : 'Disabled'}`);
|
console.log(`[ENHANCED PIPELINE] Audit Trail: ${this.config.auditTrail.enabled ? 'Enabled' : 'Disabled'}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -447,7 +451,11 @@ Respond with ONLY this JSON format:
|
|||||||
rawResponse: aiResult.content
|
rawResponse: aiResult.content
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// NEW: Log domain confidence analysis
|
||||||
|
auditTrailService.logDomainAnalysis(selectedTools, selectedConcepts);
|
||||||
|
|
||||||
console.log(`[ENHANCED PIPELINE] Final selection: ${selectedTools.length} tools with bias prevention applied`);
|
console.log(`[ENHANCED PIPELINE] Final selection: ${selectedTools.length} tools with bias prevention applied`);
|
||||||
|
console.log(`[ENHANCED PIPELINE] Domain confidence analysis logged`);
|
||||||
|
|
||||||
return {
|
return {
|
||||||
selectedTools,
|
selectedTools,
|
||||||
@ -644,10 +652,8 @@ WICHTIG: Antworten Sie NUR in fließendem deutschen Text ohne Listen, Aufzählun
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
// ... (Additional micro-task methods would be implemented similarly with audit trail integration)
|
|
||||||
|
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
// MAIN PROCESSING METHOD WITH FULL AUDIT TRAIL
|
// MAIN PROCESSING METHOD WITH FULL AUDIT TRAIL AND CONFIDENCE
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
|
|
||||||
async processQuery(userQuery: string, mode: string, userId: string = 'anonymous'): Promise<AnalysisResult> {
|
async processQuery(userQuery: string, mode: string, userId: string = 'anonymous'): Promise<AnalysisResult> {
|
||||||
@ -718,13 +724,31 @@ WICHTIG: Antworten Sie NUR in fließendem deutschen Text ohne Listen, Aufzählun
|
|||||||
tokensTotalUsed: auditTrail?.processingSummary.tokensTotalUsed || 0
|
tokensTotalUsed: auditTrail?.processingSummary.tokensTotalUsed || 0
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// NEW: Extract confidence metrics from audit trail
|
||||||
|
let confidenceMetrics: ConfidenceMetrics | undefined;
|
||||||
|
if (auditTrail && this.config.features.confidenceScoring) {
|
||||||
|
confidenceMetrics = {
|
||||||
|
overall: auditTrail.qualityMetrics.overallConfidence,
|
||||||
|
breakdown: auditTrail.qualityMetrics.confidenceBreakdown,
|
||||||
|
uncertaintyFactors: auditTrail.qualityMetrics.uncertaintyFactors,
|
||||||
|
improvementSuggestions: auditTrail.qualityMetrics.improvementSuggestions,
|
||||||
|
qualityLevel: auditTrail.qualityMetrics.qualityLevel,
|
||||||
|
reliability: auditTrail.qualityMetrics.confidenceReliability
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
console.log(`[ENHANCED PIPELINE] Completed: ${completedTasks} tasks, Failed: ${failedTasks} tasks`);
|
console.log(`[ENHANCED PIPELINE] Completed: ${completedTasks} tasks, Failed: ${failedTasks} tasks`);
|
||||||
console.log(`[ENHANCED PIPELINE] Unique tools selected: ${context.seenToolNames.size}`);
|
console.log(`[ENHANCED PIPELINE] Unique tools selected: ${context.seenToolNames.size}`);
|
||||||
|
|
||||||
if (auditTrail) {
|
if (auditTrail) {
|
||||||
console.log(`[ENHANCED PIPELINE] Audit Trail: ${auditTrail.auditId}`);
|
console.log(`[ENHANCED PIPELINE] Audit Trail: ${auditTrail.auditId}`);
|
||||||
console.log(`[ENHANCED PIPELINE] Quality Score: ${(auditTrail.qualityMetrics.overallConfidence * 100).toFixed(1)}%`);
|
console.log(`[ENHANCED PIPELINE] Overall Confidence: ${(auditTrail.qualityMetrics.overallConfidence * 100).toFixed(1)}% (${auditTrail.qualityMetrics.qualityLevel})`);
|
||||||
|
console.log(`[ENHANCED PIPELINE] Confidence Reliability: ${(auditTrail.qualityMetrics.confidenceReliability * 100).toFixed(1)}%`);
|
||||||
console.log(`[ENHANCED PIPELINE] Bias Risk: ${(auditTrail.qualityMetrics.biasRiskScore * 100).toFixed(1)}%`);
|
console.log(`[ENHANCED PIPELINE] Bias Risk: ${(auditTrail.qualityMetrics.biasRiskScore * 100).toFixed(1)}%`);
|
||||||
|
|
||||||
|
if (auditTrail.qualityMetrics.uncertaintyFactors.length > 0) {
|
||||||
|
console.log(`[ENHANCED PIPELINE] Uncertainty factors: ${auditTrail.qualityMetrics.uncertaintyFactors.length}`);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return {
|
return {
|
||||||
@ -735,7 +759,8 @@ WICHTIG: Antworten Sie NUR in fließendem deutschen Text ohne Listen, Aufzählun
|
|||||||
overallConfidence: auditTrail.qualityMetrics.overallConfidence,
|
overallConfidence: auditTrail.qualityMetrics.overallConfidence,
|
||||||
biasRiskScore: auditTrail.qualityMetrics.biasRiskScore,
|
biasRiskScore: auditTrail.qualityMetrics.biasRiskScore,
|
||||||
transparencyScore: auditTrail.qualityMetrics.transparencyScore
|
transparencyScore: auditTrail.qualityMetrics.transparencyScore
|
||||||
} : undefined
|
} : undefined,
|
||||||
|
confidenceMetrics // NEW: Return detailed confidence metrics
|
||||||
};
|
};
|
||||||
|
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
|
@ -1,9 +1,10 @@
|
|||||||
// src/utils/auditTrail.ts - Forensic Audit Trail System
|
// src/utils/auditTrail.ts - Enhanced Forensic Audit Trail with Confidence Scoring
|
||||||
|
|
||||||
import { forensicConfig } from './forensicConfig.js';
|
import { forensicConfig } from './forensicConfig.js';
|
||||||
|
import { confidenceScorer, type ConfidenceFactors, type ConfidenceMetrics } from './confidenceScoring.js';
|
||||||
|
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
// AUDIT TRAIL DATA STRUCTURES
|
// ENHANCED AUDIT TRAIL DATA STRUCTURES
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
|
|
||||||
interface QueryClassification {
|
interface QueryClassification {
|
||||||
@ -80,13 +81,32 @@ interface MicroTaskAudit {
|
|||||||
contextContinuityUsed: boolean;
|
contextContinuityUsed: boolean;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ENHANCED: More comprehensive quality metrics with confidence integration
|
||||||
interface QualityMetrics {
|
interface QualityMetrics {
|
||||||
|
// Overall metrics
|
||||||
overallConfidence: number;
|
overallConfidence: number;
|
||||||
reproducibilityScore: number;
|
reproducibilityScore: number;
|
||||||
biasRiskScore: number;
|
biasRiskScore: number;
|
||||||
transparencyScore: number;
|
transparencyScore: number;
|
||||||
evidenceQuality: number;
|
evidenceQuality: number;
|
||||||
methodologicalSoundness: number;
|
methodologicalSoundness: number;
|
||||||
|
|
||||||
|
// NEW: Detailed confidence breakdown
|
||||||
|
confidenceBreakdown: {
|
||||||
|
retrieval: number;
|
||||||
|
selection: number;
|
||||||
|
domain: number;
|
||||||
|
meta: number;
|
||||||
|
};
|
||||||
|
|
||||||
|
// NEW: Confidence factors used in calculation
|
||||||
|
confidenceFactors: ConfidenceFactors;
|
||||||
|
|
||||||
|
// NEW: Uncertainty and improvement tracking
|
||||||
|
uncertaintyFactors: string[];
|
||||||
|
improvementSuggestions: string[];
|
||||||
|
qualityLevel: 'low' | 'medium' | 'high' | 'excellent';
|
||||||
|
confidenceReliability: number;
|
||||||
}
|
}
|
||||||
|
|
||||||
interface ForensicAuditEntry {
|
interface ForensicAuditEntry {
|
||||||
@ -123,7 +143,7 @@ interface ForensicAuditEntry {
|
|||||||
// Micro-task Audit
|
// Micro-task Audit
|
||||||
microTasks: MicroTaskAudit[];
|
microTasks: MicroTaskAudit[];
|
||||||
|
|
||||||
// Final Quality Metrics
|
// ENHANCED: Final Quality Metrics with comprehensive confidence
|
||||||
qualityMetrics: QualityMetrics;
|
qualityMetrics: QualityMetrics;
|
||||||
|
|
||||||
// Processing Summary
|
// Processing Summary
|
||||||
@ -147,7 +167,7 @@ interface ForensicAuditEntry {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
// AUDIT TRAIL SERVICE IMPLEMENTATION
|
// ENHANCED AUDIT TRAIL SERVICE IMPLEMENTATION
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
|
|
||||||
class ForensicAuditTrailService {
|
class ForensicAuditTrailService {
|
||||||
@ -155,15 +175,18 @@ class ForensicAuditTrailService {
|
|||||||
private auditStorage: Map<string, ForensicAuditEntry> = new Map();
|
private auditStorage: Map<string, ForensicAuditEntry> = new Map();
|
||||||
private config = forensicConfig.getConfig();
|
private config = forensicConfig.getConfig();
|
||||||
|
|
||||||
|
// NEW: Track confidence factors as they're calculated
|
||||||
|
private currentConfidenceFactors: Partial<ConfidenceFactors> = {};
|
||||||
|
|
||||||
constructor() {
|
constructor() {
|
||||||
if (this.config.auditTrail.enabled) {
|
if (this.config.auditTrail.enabled) {
|
||||||
console.log('[AUDIT TRAIL] Forensic audit trail service initialized');
|
console.log('[AUDIT TRAIL] Enhanced forensic audit trail service initialized with confidence scoring');
|
||||||
this.setupCleanupInterval();
|
this.setupCleanupInterval();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ========================================================================
|
// ========================================================================
|
||||||
// AUDIT LIFECYCLE MANAGEMENT
|
// AUDIT LIFECYCLE MANAGEMENT (Enhanced)
|
||||||
// ========================================================================
|
// ========================================================================
|
||||||
|
|
||||||
startAudit(userId: string, query: string, mode: 'workflow' | 'tool'): string {
|
startAudit(userId: string, query: string, mode: 'workflow' | 'tool'): string {
|
||||||
@ -234,13 +257,38 @@ class ForensicAuditTrailService {
|
|||||||
biasAnalysis: [],
|
biasAnalysis: [],
|
||||||
microTasks: [],
|
microTasks: [],
|
||||||
|
|
||||||
|
// ENHANCED: Initialize with default confidence structure
|
||||||
qualityMetrics: {
|
qualityMetrics: {
|
||||||
overallConfidence: 0,
|
overallConfidence: 0,
|
||||||
reproducibilityScore: 0,
|
reproducibilityScore: 0,
|
||||||
biasRiskScore: 0,
|
biasRiskScore: 0,
|
||||||
transparencyScore: 0,
|
transparencyScore: 0,
|
||||||
evidenceQuality: 0,
|
evidenceQuality: 0,
|
||||||
methodologicalSoundness: 0
|
methodologicalSoundness: 0,
|
||||||
|
confidenceBreakdown: {
|
||||||
|
retrieval: 0,
|
||||||
|
selection: 0,
|
||||||
|
domain: 0,
|
||||||
|
meta: 0
|
||||||
|
},
|
||||||
|
confidenceFactors: {
|
||||||
|
embeddingsQuality: 0.5,
|
||||||
|
candidateRelevance: 0.5,
|
||||||
|
retrievalMethod: 0.5,
|
||||||
|
aiModelCertainty: 0.5,
|
||||||
|
selectionConsistency: 0.5,
|
||||||
|
reasoningQuality: 0.5,
|
||||||
|
scenarioSpecificity: 0.5,
|
||||||
|
toolSpecialization: 0.5,
|
||||||
|
expertiseAlignment: 0.5,
|
||||||
|
biasRiskLevel: 0.5,
|
||||||
|
historicalAccuracy: 0.5,
|
||||||
|
processingStability: 0.5
|
||||||
|
},
|
||||||
|
uncertaintyFactors: [],
|
||||||
|
improvementSuggestions: [],
|
||||||
|
qualityLevel: 'medium',
|
||||||
|
confidenceReliability: 0.5
|
||||||
},
|
},
|
||||||
|
|
||||||
processingSummary: {
|
processingSummary: {
|
||||||
@ -261,7 +309,10 @@ class ForensicAuditTrailService {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
console.log(`[AUDIT TRAIL] Started audit ${auditId} for user ${userId}, mode: ${mode}`);
|
// Reset confidence factors tracking
|
||||||
|
this.currentConfidenceFactors = {};
|
||||||
|
|
||||||
|
console.log(`[AUDIT TRAIL] Started enhanced audit ${auditId} for user ${userId}, mode: ${mode}`);
|
||||||
return auditId;
|
return auditId;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -281,7 +332,7 @@ class ForensicAuditTrailService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// ========================================================================
|
// ========================================================================
|
||||||
// RETRIEVAL PROCESS LOGGING
|
// RETRIEVAL PROCESS LOGGING (Enhanced with Confidence)
|
||||||
// ========================================================================
|
// ========================================================================
|
||||||
|
|
||||||
logRetrievalStart(method: 'embeddings' | 'ai_selector' | 'emergency_fallback'): void {
|
logRetrievalStart(method: 'embeddings' | 'ai_selector' | 'emergency_fallback'): void {
|
||||||
@ -312,10 +363,21 @@ class ForensicAuditTrailService {
|
|||||||
processingTimeMs: data.processingTimeMs,
|
processingTimeMs: data.processingTimeMs,
|
||||||
fallbackReason: data.fallbackReason
|
fallbackReason: data.fallbackReason
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// NEW: Calculate and store retrieval confidence factors
|
||||||
|
const retrievalAssessment = confidenceScorer.assessRetrievalConfidence(this.currentAudit.retrievalProcess);
|
||||||
|
|
||||||
|
// Update confidence factors
|
||||||
|
this.currentConfidenceFactors = {
|
||||||
|
...this.currentConfidenceFactors,
|
||||||
|
...retrievalAssessment.factors
|
||||||
|
};
|
||||||
|
|
||||||
|
console.log(`[AUDIT TRAIL] Retrieval confidence: ${(retrievalAssessment.confidence * 100).toFixed(1)}%`);
|
||||||
}
|
}
|
||||||
|
|
||||||
// ========================================================================
|
// ========================================================================
|
||||||
// SELECTION PROCESS LOGGING
|
// SELECTION PROCESS LOGGING (Enhanced with Confidence)
|
||||||
// ========================================================================
|
// ========================================================================
|
||||||
|
|
||||||
logSelectionStart(aiModel: 'strategic' | 'tactical' | 'legacy', initialCandidates: string[]): void {
|
logSelectionStart(aiModel: 'strategic' | 'tactical' | 'legacy', initialCandidates: string[]): void {
|
||||||
@ -361,6 +423,17 @@ class ForensicAuditTrailService {
|
|||||||
|
|
||||||
this.currentAudit.processingSummary.aiCallsMade++;
|
this.currentAudit.processingSummary.aiCallsMade++;
|
||||||
this.currentAudit.processingSummary.tokensTotalUsed += data.promptTokens + data.responseTokens;
|
this.currentAudit.processingSummary.tokensTotalUsed += data.promptTokens + data.responseTokens;
|
||||||
|
|
||||||
|
// NEW: Calculate and store selection confidence factors
|
||||||
|
const selectionAssessment = confidenceScorer.assessSelectionConfidence(this.currentAudit.selectionProcess);
|
||||||
|
|
||||||
|
// Update confidence factors
|
||||||
|
this.currentConfidenceFactors = {
|
||||||
|
...this.currentConfidenceFactors,
|
||||||
|
...selectionAssessment.factors
|
||||||
|
};
|
||||||
|
|
||||||
|
console.log(`[AUDIT TRAIL] Selection confidence: ${(selectionAssessment.confidence * 100).toFixed(1)}%`);
|
||||||
}
|
}
|
||||||
|
|
||||||
// ========================================================================
|
// ========================================================================
|
||||||
@ -373,9 +446,13 @@ class ForensicAuditTrailService {
|
|||||||
this.currentAudit.biasAnalysis = [...biasResults];
|
this.currentAudit.biasAnalysis = [...biasResults];
|
||||||
this.currentAudit.compliance.biasChecked = true;
|
this.currentAudit.compliance.biasChecked = true;
|
||||||
|
|
||||||
// Calculate overall bias risk score
|
// Calculate overall bias risk score for confidence factors
|
||||||
const biasRiskScore = biasResults.length > 0 ?
|
const biasRiskScore = biasResults.length > 0 ?
|
||||||
Math.max(...biasResults.filter(b => b.detected).map(b => b.severity)) : 0;
|
Math.max(...biasResults.filter(b => b.detected).map(b => b.severity)) : 0;
|
||||||
|
|
||||||
|
// Update confidence factors
|
||||||
|
this.currentConfidenceFactors.biasRiskLevel = Math.max(0, 1 - biasRiskScore);
|
||||||
|
|
||||||
this.currentAudit.qualityMetrics.biasRiskScore = biasRiskScore;
|
this.currentAudit.qualityMetrics.biasRiskScore = biasRiskScore;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -425,7 +502,36 @@ class ForensicAuditTrailService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// ========================================================================
|
// ========================================================================
|
||||||
// AUDIT FINALIZATION
|
// NEW: DOMAIN CONFIDENCE LOGGING
|
||||||
|
// ========================================================================
|
||||||
|
|
||||||
|
/**
 * Scores domain-level confidence for the audit currently in progress and
 * merges the resulting factors into the running confidence-factor set.
 *
 * Returns without doing anything when no audit is active or when the audit
 * trail is disabled in the configuration.
 *
 * @param selectedTools - Tools chosen for the recommendation; forwarded to the scorer.
 * @param selectedConcepts - Concepts chosen for the recommendation; forwarded to the scorer.
 */
logDomainAnalysis(selectedTools: any[], selectedConcepts: any[]): void {
  const audit = this.currentAudit;
  if (!audit || !this.config.auditTrail.enabled) return;

  // Query context taken from the audit entry itself.
  const queryContext = {
    userQuery: audit.userQuery,
    queryMode: audit.queryMode,
    sanitizedQuery: audit.sanitizedQuery
  };
  const selection = { selectedTools, selectedConcepts };

  const { confidence, factors } = confidenceScorer.assessDomainConfidence(queryContext, selection);

  // Freshly computed factors overwrite earlier values that share a key.
  this.currentConfidenceFactors = { ...this.currentConfidenceFactors, ...factors };

  console.log(`[AUDIT TRAIL] Domain confidence: ${(confidence * 100).toFixed(1)}%`);
}
|
||||||
|
|
||||||
|
// ========================================================================
|
||||||
|
// ENHANCED AUDIT FINALIZATION WITH COMPREHENSIVE CONFIDENCE
|
||||||
// ========================================================================
|
// ========================================================================
|
||||||
|
|
||||||
calculateQualityMetrics(): void {
|
calculateQualityMetrics(): void {
|
||||||
@ -433,8 +539,27 @@ class ForensicAuditTrailService {
|
|||||||
|
|
||||||
const audit = this.currentAudit;
|
const audit = this.currentAudit;
|
||||||
|
|
||||||
// Overall confidence (weighted average of retrieval and selection confidence)
|
// NEW: Calculate meta confidence factors
|
||||||
const overallConfidence = (
|
const metaAssessment = confidenceScorer.assessMetaConfidence({
|
||||||
|
biasAnalysis: audit.biasAnalysis,
|
||||||
|
microTasks: audit.microTasks,
|
||||||
|
processingSummary: audit.processingSummary,
|
||||||
|
errorCount: audit.processingSummary.errorsEncountered,
|
||||||
|
fallbacksUsed: audit.processingSummary.fallbacksUsed
|
||||||
|
});
|
||||||
|
|
||||||
|
// Finalize confidence factors
|
||||||
|
const finalConfidenceFactors: ConfidenceFactors = {
|
||||||
|
...audit.qualityMetrics.confidenceFactors, // defaults
|
||||||
|
...this.currentConfidenceFactors, // calculated values
|
||||||
|
...metaAssessment.factors // meta factors
|
||||||
|
};
|
||||||
|
|
||||||
|
// NEW: Calculate comprehensive confidence metrics using new framework
|
||||||
|
const confidenceMetrics = confidenceScorer.calculateOverallConfidence(finalConfidenceFactors);
|
||||||
|
|
||||||
|
// Legacy metrics calculation (for backward compatibility)
|
||||||
|
const legacyOverallConfidence = (
|
||||||
audit.retrievalProcess.retrievalConfidence * 0.3 +
|
audit.retrievalProcess.retrievalConfidence * 0.3 +
|
||||||
audit.selectionProcess.confidenceScore * 0.5 +
|
audit.selectionProcess.confidenceScore * 0.5 +
|
||||||
(audit.microTasks.reduce((sum, task) => sum + task.confidence, 0) / Math.max(audit.microTasks.length, 1)) * 0.2
|
(audit.microTasks.reduce((sum, task) => sum + task.confidence, 0) / Math.max(audit.microTasks.length, 1)) * 0.2
|
||||||
@ -447,9 +572,6 @@ class ForensicAuditTrailService {
|
|||||||
(audit.microTasks.length >= 4 ? 0.4 : audit.microTasks.length * 0.1)
|
(audit.microTasks.length >= 4 ? 0.4 : audit.microTasks.length * 0.1)
|
||||||
);
|
);
|
||||||
|
|
||||||
// Bias risk score (inverse of detected bias severity)
|
|
||||||
const biasRiskScore = audit.qualityMetrics.biasRiskScore;
|
|
||||||
|
|
||||||
// Transparency score (based on audit detail level and traceability)
|
// Transparency score (based on audit detail level and traceability)
|
||||||
const transparencyScore = (
|
const transparencyScore = (
|
||||||
(audit.selectionProcess.rawResponse.length > 0 ? 0.3 : 0) +
|
(audit.selectionProcess.rawResponse.length > 0 ? 0.3 : 0) +
|
||||||
@ -472,17 +594,28 @@ class ForensicAuditTrailService {
|
|||||||
(audit.microTasks.length >= 4 ? 0.2 : 0)
|
(audit.microTasks.length >= 4 ? 0.2 : 0)
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// NEW: Use comprehensive confidence metrics
|
||||||
audit.qualityMetrics = {
|
audit.qualityMetrics = {
|
||||||
overallConfidence: Math.min(1, Math.max(0, overallConfidence)),
|
overallConfidence: Math.min(1, Math.max(0, confidenceMetrics.overall)),
|
||||||
reproducibilityScore: Math.min(1, Math.max(0, reproducibilityScore)),
|
reproducibilityScore: Math.min(1, Math.max(0, reproducibilityScore)),
|
||||||
biasRiskScore: Math.min(1, Math.max(0, biasRiskScore)),
|
biasRiskScore: Math.min(1, Math.max(0, audit.qualityMetrics.biasRiskScore)),
|
||||||
transparencyScore: Math.min(1, Math.max(0, transparencyScore)),
|
transparencyScore: Math.min(1, Math.max(0, transparencyScore)),
|
||||||
evidenceQuality: Math.min(1, Math.max(0, evidenceQuality)),
|
evidenceQuality: Math.min(1, Math.max(0, evidenceQuality)),
|
||||||
methodologicalSoundness: Math.min(1, Math.max(0, methodologicalSoundness))
|
methodologicalSoundness: Math.min(1, Math.max(0, methodologicalSoundness)),
|
||||||
|
|
||||||
|
// NEW: Enhanced confidence data
|
||||||
|
confidenceBreakdown: confidenceMetrics.breakdown,
|
||||||
|
confidenceFactors: finalConfidenceFactors,
|
||||||
|
uncertaintyFactors: confidenceMetrics.uncertaintyFactors,
|
||||||
|
improvementSuggestions: confidenceMetrics.improvementSuggestions,
|
||||||
|
qualityLevel: confidenceMetrics.qualityLevel,
|
||||||
|
confidenceReliability: confidenceMetrics.reliability
|
||||||
};
|
};
|
||||||
|
|
||||||
audit.compliance.confidenceAssessed = true;
|
audit.compliance.confidenceAssessed = true;
|
||||||
audit.compliance.traceabilityScore = transparencyScore;
|
audit.compliance.traceabilityScore = transparencyScore;
|
||||||
|
|
||||||
|
console.log(`[AUDIT TRAIL] Enhanced quality metrics calculated - Overall confidence: ${(confidenceMetrics.overall * 100).toFixed(1)}%`);
|
||||||
}
|
}
|
||||||
|
|
||||||
finalizeAudit(finalRecommendationCount: number): ForensicAuditEntry | null {
|
finalizeAudit(finalRecommendationCount: number): ForensicAuditEntry | null {
|
||||||
@ -496,7 +629,7 @@ class ForensicAuditTrailService {
|
|||||||
|
|
||||||
this.currentAudit.processingSummary.finalRecommendationCount = finalRecommendationCount;
|
this.currentAudit.processingSummary.finalRecommendationCount = finalRecommendationCount;
|
||||||
|
|
||||||
// Calculate final quality metrics
|
// Calculate enhanced quality metrics
|
||||||
this.calculateQualityMetrics();
|
this.calculateQualityMetrics();
|
||||||
|
|
||||||
// Store the audit trail
|
// Store the audit trail
|
||||||
@ -504,17 +637,23 @@ class ForensicAuditTrailService {
|
|||||||
|
|
||||||
const finalAudit = { ...this.currentAudit };
|
const finalAudit = { ...this.currentAudit };
|
||||||
this.currentAudit = null;
|
this.currentAudit = null;
|
||||||
|
this.currentConfidenceFactors = {}; // Reset
|
||||||
|
|
||||||
console.log(`[AUDIT TRAIL] Finalized audit ${finalAudit.auditId}`);
|
console.log(`[AUDIT TRAIL] Finalized enhanced audit ${finalAudit.auditId}`);
|
||||||
console.log(`[AUDIT TRAIL] Quality Score: ${(finalAudit.qualityMetrics.overallConfidence * 100).toFixed(1)}%`);
|
console.log(`[AUDIT TRAIL] Overall Confidence: ${(finalAudit.qualityMetrics.overallConfidence * 100).toFixed(1)}% (${finalAudit.qualityMetrics.qualityLevel})`);
|
||||||
|
console.log(`[AUDIT TRAIL] Confidence Reliability: ${(finalAudit.qualityMetrics.confidenceReliability * 100).toFixed(1)}%`);
|
||||||
console.log(`[AUDIT TRAIL] Bias Risk: ${(finalAudit.qualityMetrics.biasRiskScore * 100).toFixed(1)}%`);
|
console.log(`[AUDIT TRAIL] Bias Risk: ${(finalAudit.qualityMetrics.biasRiskScore * 100).toFixed(1)}%`);
|
||||||
console.log(`[AUDIT TRAIL] Transparency: ${(finalAudit.qualityMetrics.transparencyScore * 100).toFixed(1)}%`);
|
console.log(`[AUDIT TRAIL] Transparency: ${(finalAudit.qualityMetrics.transparencyScore * 100).toFixed(1)}%`);
|
||||||
|
|
||||||
|
if (finalAudit.qualityMetrics.uncertaintyFactors.length > 0) {
|
||||||
|
console.log(`[AUDIT TRAIL] Uncertainty factors: ${finalAudit.qualityMetrics.uncertaintyFactors.length}`);
|
||||||
|
}
|
||||||
|
|
||||||
return finalAudit;
|
return finalAudit;
|
||||||
}
|
}
|
||||||
|
|
||||||
// ========================================================================
|
// ========================================================================
|
||||||
// AUDIT RETRIEVAL AND EXPORT
|
// AUDIT RETRIEVAL AND EXPORT (Enhanced)
|
||||||
// ========================================================================
|
// ========================================================================
|
||||||
|
|
||||||
getAuditTrail(auditId: string): ForensicAuditEntry | null {
|
getAuditTrail(auditId: string): ForensicAuditEntry | null {
|
||||||
@ -528,6 +667,7 @@ class ForensicAuditTrailService {
|
|||||||
return JSON.stringify(audit, null, 2);
|
return JSON.stringify(audit, null, 2);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// NEW: Enhanced audit summary with confidence details
|
||||||
getAuditSummary(auditId: string): any {
|
getAuditSummary(auditId: string): any {
|
||||||
const audit = this.getAuditTrail(auditId);
|
const audit = this.getAuditTrail(auditId);
|
||||||
if (!audit) return null;
|
if (!audit) return null;
|
||||||
@ -542,12 +682,37 @@ class ForensicAuditTrailService {
|
|||||||
compliance: audit.compliance,
|
compliance: audit.compliance,
|
||||||
biasWarnings: audit.biasAnalysis.filter(b => b.detected).length,
|
biasWarnings: audit.biasAnalysis.filter(b => b.detected).length,
|
||||||
microTasksCompleted: audit.microTasks.filter(t => t.success).length,
|
microTasksCompleted: audit.microTasks.filter(t => t.success).length,
|
||||||
totalMicroTasks: audit.microTasks.length
|
totalMicroTasks: audit.microTasks.length,
|
||||||
|
|
||||||
|
// NEW: Confidence summary
|
||||||
|
confidenceSummary: {
|
||||||
|
overall: audit.qualityMetrics.overallConfidence,
|
||||||
|
qualityLevel: audit.qualityMetrics.qualityLevel,
|
||||||
|
reliability: audit.qualityMetrics.confidenceReliability,
|
||||||
|
uncertaintyCount: audit.qualityMetrics.uncertaintyFactors.length,
|
||||||
|
improvementSuggestionsCount: audit.qualityMetrics.improvementSuggestions.length,
|
||||||
|
breakdown: audit.qualityMetrics.confidenceBreakdown
|
||||||
|
}
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// NEW: Get confidence report for specific audit
|
||||||
|
/**
 * Builds a formatted confidence report for a stored audit.
 *
 * @param auditId - Identifier of the audit to report on.
 * @returns The string produced by `confidenceScorer.formatConfidenceReport`,
 *          or `null` when `getAuditTrail` finds no audit for `auditId`.
 */
getConfidenceReport(auditId: string): string | null {
  const audit = this.getAuditTrail(auditId);
  if (!audit) return null;

  // Map the stored quality metrics onto the field names the scorer expects.
  const {
    overallConfidence: overall,
    confidenceBreakdown: breakdown,
    uncertaintyFactors,
    improvementSuggestions,
    qualityLevel,
    confidenceReliability: reliability
  } = audit.qualityMetrics;

  return confidenceScorer.formatConfidenceReport({
    overall,
    breakdown,
    uncertaintyFactors,
    improvementSuggestions,
    qualityLevel,
    reliability
  });
}
|
||||||
|
|
||||||
// ========================================================================
|
// ========================================================================
|
||||||
// UTILITY METHODS
|
// UTILITY METHODS (Enhanced)
|
||||||
// ========================================================================
|
// ========================================================================
|
||||||
|
|
||||||
private setupCleanupInterval(): void {
|
private setupCleanupInterval(): void {
|
||||||
@ -570,19 +735,21 @@ class ForensicAuditTrailService {
|
|||||||
}, 60 * 60 * 1000); // Run cleanup every hour
|
}, 60 * 60 * 1000); // Run cleanup every hour
|
||||||
}
|
}
|
||||||
|
|
||||||
getStorageStats(): { totalAudits: number; oldestAudit: string | null; newestAudit: string | null } {
|
getStorageStats(): { totalAudits: number; oldestAudit: string | null; newestAudit: string | null; averageConfidence: number } {
|
||||||
const audits = Array.from(this.auditStorage.values());
|
const audits = Array.from(this.auditStorage.values());
|
||||||
|
|
||||||
if (audits.length === 0) {
|
if (audits.length === 0) {
|
||||||
return { totalAudits: 0, oldestAudit: null, newestAudit: null };
|
return { totalAudits: 0, oldestAudit: null, newestAudit: null, averageConfidence: 0 };
|
||||||
}
|
}
|
||||||
|
|
||||||
const sorted = audits.sort((a, b) => a.timestamp.getTime() - b.timestamp.getTime());
|
const sorted = audits.sort((a, b) => a.timestamp.getTime() - b.timestamp.getTime());
|
||||||
|
const averageConfidence = audits.reduce((sum, audit) => sum + audit.qualityMetrics.overallConfidence, 0) / audits.length;
|
||||||
|
|
||||||
return {
|
return {
|
||||||
totalAudits: audits.length,
|
totalAudits: audits.length,
|
||||||
oldestAudit: sorted[0].timestamp.toISOString(),
|
oldestAudit: sorted[0].timestamp.toISOString(),
|
||||||
newestAudit: sorted[sorted.length - 1].timestamp.toISOString()
|
newestAudit: sorted[sorted.length - 1].timestamp.toISOString(),
|
||||||
|
averageConfidence
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -593,6 +760,12 @@ class ForensicAuditTrailService {
|
|||||||
isAuditInProgress(): boolean {
|
isAuditInProgress(): boolean {
|
||||||
return this.currentAudit !== null;
|
return this.currentAudit !== null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// NEW: Check if confidence is acceptable for current audit
|
||||||
|
/**
 * Reports whether the in-progress audit's overall confidence passes the
 * scorer's acceptance check.
 *
 * @returns `false` when no audit is in progress; otherwise the result of
 *          `confidenceScorer.isConfidenceAcceptable` for the audit's current
 *          `qualityMetrics.overallConfidence`.
 */
isCurrentConfidenceAcceptable(): boolean {
  const audit = this.currentAudit;
  // NOTE(review): overallConfidence appears to be written by calculateQualityMetrics;
  // before that runs this presumably reflects the entry's initial value - confirm.
  return audit
    ? confidenceScorer.isConfidenceAcceptable(audit.qualityMetrics.overallConfidence)
    : false;
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Export singleton instance
|
// Export singleton instance
|
||||||
|
687
src/utils/confidenceScoring.ts
Normal file
687
src/utils/confidenceScoring.ts
Normal file
@ -0,0 +1,687 @@
|
|||||||
|
// src/utils/confidenceScoring.ts - Forensic Confidence Scoring Framework
|
||||||
|
|
||||||
|
import { forensicConfig } from './forensicConfig.js';
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// CONFIDENCE FACTOR INTERFACES
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
/**
 * The twelve individual signals that feed the confidence score, grouped into
 * retrieval, selection, domain and meta factors. Every value is on a 0-1 scale.
 */
interface ConfidenceFactors {
  // --- Retrieval confidence ---
  /** How well embeddings matched (0-1). */
  embeddingsQuality: number;
  /** Relevance of retrieved candidates (0-1). */
  candidateRelevance: number;
  /** Confidence in the retrieval method used (0-1). */
  retrievalMethod: number;

  // --- Selection confidence ---
  /** AI's self-reported confidence (0-1). */
  aiModelCertainty: number;
  /** Internal consistency of selections (0-1). */
  selectionConsistency: number;
  /** Quality of the provided reasoning (0-1). */
  reasoningQuality: number;

  // --- Domain confidence ---
  /** How specific the scenario description was (0-1). */
  scenarioSpecificity: number;
  /** How specialized the selected tools are (0-1). */
  toolSpecialization: number;
  /** Alignment with known expert preferences (0-1). */
  expertiseAlignment: number;

  // --- Meta confidence ---
  /** Inverse of bias risk: higher bias means lower confidence (0-1). */
  biasRiskLevel: number;
  /** Based on past validation, if available (0-1). */
  historicalAccuracy: number;
  /** Stability of the processing pipeline (0-1). */
  processingStability: number;
}
|
||||||
|
|
||||||
|
/**
 * Relative weight of each confidence component in the overall score.
 * The four weights are expected to sum to 1.0.
 */
interface ConfidenceWeights {
  /** Weight for retrieval factors. */
  retrieval: number;
  /** Weight for selection factors. */
  selection: number;
  /** Weight for domain factors. */
  domain: number;
  /** Weight for meta factors. */
  meta: number;
}
|
||||||
|
|
||||||
|
/**
 * Result of a full confidence calculation: the overall score, its
 * per-component breakdown, and accompanying diagnostics.
 */
interface ConfidenceMetrics {
  /** Overall confidence score (0-1). */
  overall: number;
  /** Score of each component that went into `overall`. */
  breakdown: {
    retrieval: number;
    selection: number;
    domain: number;
    meta: number;
  };
  /** List of uncertainty factors identified. */
  uncertaintyFactors: string[];
  /** Suggestions to improve confidence. */
  improvementSuggestions: string[];
  /** Coarse quality bucket for the overall score. */
  qualityLevel: 'low' | 'medium' | 'high' | 'excellent';
  /** Reliability of the confidence assessment itself (0-1). */
  reliability: number;
}
|
||||||
|
|
||||||
|
/** A single identified source of uncertainty, with its impact and a mitigation. */
interface UncertaintyFactor {
  /** Name of the factor. */
  factor: string;
  /** Severity of the factor's impact. */
  impact: 'low' | 'medium' | 'high';
  /** Human-readable description of the factor. */
  description: string;
  /** Suggested way to mitigate the factor. */
  mitigation: string;
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// CONFIDENCE SCORING IMPLEMENTATION
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
class ConfidenceScorer {
|
||||||
|
private config = forensicConfig.getConfig();
private thresholds = forensicConfig.getThresholds();

/**
 * Relative weights of the four confidence components. Each is read from its
 * CONFIDENCE_WEIGHT_* environment variable; an unset or empty variable falls
 * back to the built-in default (0.25 / 0.35 / 0.25 / 0.15).
 */
private weights: ConfidenceWeights = {
  retrieval: parseFloat(process.env.CONFIDENCE_WEIGHT_RETRIEVAL || '0.25'),
  selection: parseFloat(process.env.CONFIDENCE_WEIGHT_SELECTION || '0.35'),
  domain: parseFloat(process.env.CONFIDENCE_WEIGHT_DOMAIN || '0.25'),
  meta: parseFloat(process.env.CONFIDENCE_WEIGHT_META || '0.15')
};

/**
 * Validates the configured weights and rescales them so they sum to 1.0.
 *
 * A weight that is not a finite, non-negative number (parseFloat yields NaN
 * for an unparseable value) is replaced by its built-in default. Without this
 * check NaN would slip past the sum test below, because any comparison with
 * NaN is false, and every overall score would become NaN.
 */
constructor() {
  const defaults: ConfidenceWeights = { retrieval: 0.25, selection: 0.35, domain: 0.25, meta: 0.15 };
  const keys = ['retrieval', 'selection', 'domain', 'meta'] as const;

  for (const key of keys) {
    const value = this.weights[key];
    if (!Number.isFinite(value) || value < 0) {
      console.warn(`[CONFIDENCE] Invalid weight for ${key} (${value}), using default ${defaults[key]}`);
      this.weights[key] = defaults[key];
    }
  }

  let weightSum = keys.reduce((sum, key) => sum + this.weights[key], 0);

  // All weights zero: rescaling would divide by zero, so restore the defaults.
  if (weightSum <= 0) {
    console.warn('[CONFIDENCE] All weights are zero, using default weights');
    for (const key of keys) {
      this.weights[key] = defaults[key];
    }
    weightSum = keys.reduce((sum, key) => sum + this.weights[key], 0);
  }

  // Tolerate small rounding drift; otherwise scale proportionally to sum to 1.0.
  if (Math.abs(weightSum - 1.0) > 0.01) {
    console.warn(`[CONFIDENCE] Weight sum is ${weightSum}, adjusting to 1.0`);
    const factor = 1.0 / weightSum;
    for (const key of keys) {
      this.weights[key] *= factor;
    }
  }
}
|
||||||
|
|
||||||
|
// ========================================================================
|
||||||
|
// MAIN CONFIDENCE CALCULATION
|
||||||
|
// ========================================================================
|
||||||
|
|
||||||
|
/**
 * Combines all confidence factors into a single metrics object.
 *
 * Each component score (retrieval, selection, domain, meta) is computed from
 * `factors`, then blended using the configured weights. The result also
 * carries the identified uncertainty factors, improvement suggestions, a
 * quality level derived from the blended score, and a reliability estimate.
 *
 * @param factors - Complete set of confidence factors.
 * @returns Metrics whose `overall` is clamped to [0, 1].
 */
calculateOverallConfidence(factors: ConfidenceFactors): ConfidenceMetrics {
  const breakdown = {
    retrieval: this.calculateRetrievalConfidence(factors),
    selection: this.calculateSelectionConfidence(factors),
    domain: this.calculateDomainConfidence(factors),
    meta: this.calculateMetaConfidence(factors)
  };

  const w = this.weights;
  const blended =
    breakdown.retrieval * w.retrieval +
    breakdown.selection * w.selection +
    breakdown.domain * w.domain +
    breakdown.meta * w.meta;

  const uncertaintyFactors = this.identifyUncertaintyFactors(factors);
  const improvementSuggestions = this.suggestConfidenceImprovements(factors);
  // The quality level is derived from the blended score before clamping.
  const qualityLevel = this.determineQualityLevel(blended);
  const reliability = this.calculateConfidenceReliability(factors);

  return {
    overall: Math.max(0, Math.min(1, blended)),
    breakdown,
    uncertaintyFactors,
    improvementSuggestions,
    qualityLevel,
    reliability
  };
}
|
||||||
|
|
||||||
|
// ========================================================================
|
||||||
|
// COMPONENT CONFIDENCE CALCULATIONS
|
||||||
|
// ========================================================================
|
||||||
|
|
||||||
|
/**
 * Retrieval component score: 40% embeddings quality, 40% candidate
 * relevance, 20% retrieval method, clamped to [0, 1].
 */
private calculateRetrievalConfidence(factors: ConfidenceFactors): number {
  const weighted =
    factors.embeddingsQuality * 0.4 +
    factors.candidateRelevance * 0.4 +
    factors.retrievalMethod * 0.2;

  return Math.min(1, Math.max(0, weighted));
}
|
||||||
|
|
||||||
|
/**
 * Selection component score: 30% AI model certainty, 40% selection
 * consistency, 30% reasoning quality, clamped to [0, 1].
 */
private calculateSelectionConfidence(factors: ConfidenceFactors): number {
  const weighted =
    factors.aiModelCertainty * 0.3 +
    factors.selectionConsistency * 0.4 +
    factors.reasoningQuality * 0.3;

  return Math.min(1, Math.max(0, weighted));
}
|
||||||
|
|
||||||
|
/**
 * Domain component score: 50% scenario specificity, 30% tool
 * specialization, 20% expertise alignment, clamped to [0, 1].
 */
private calculateDomainConfidence(factors: ConfidenceFactors): number {
  const weighted =
    factors.scenarioSpecificity * 0.5 +
    factors.toolSpecialization * 0.3 +
    factors.expertiseAlignment * 0.2;

  return Math.min(1, Math.max(0, weighted));
}
|
||||||
|
|
||||||
|
/**
 * Meta component score: 40% bias risk level, 30% historical accuracy,
 * 30% processing stability, clamped to [0, 1].
 */
private calculateMetaConfidence(factors: ConfidenceFactors): number {
  const weighted =
    factors.biasRiskLevel * 0.4 +
    factors.historicalAccuracy * 0.3 +
    factors.processingStability * 0.3;

  return Math.min(1, Math.max(0, weighted));
}
|
||||||
|
|
||||||
|
// ========================================================================
|
||||||
|
// SPECIFIC CONFIDENCE ASSESSMENTS
|
||||||
|
// ========================================================================
|
||||||
|
|
||||||
|
/**
 * Derives the three retrieval confidence factors from a retrieval record and
 * combines them into a retrieval confidence score.
 *
 * - `embeddingsQuality`: twice the mean similarity score (plus 0.1 when at
 *   least five scores exceed 0.7), clamped to [0, 1]; 0.3 when embeddings
 *   produced no scores and a fallback reason is present; otherwise 0.5.
 * - `candidateRelevance`: stepped by the number of candidates found
 *   (>=15: 0.8, >=10: 0.6, >=5: 0.4, else 0.2).
 * - `retrievalMethod`: 0.9 for 'embeddings', 0.7 for 'ai_selector', 0.3 for
 *   'emergency_fallback', 0.5 for any other value.
 *
 * @param retrievalData - Retrieval record to assess.
 * @returns The combined confidence and the three factors it was built from.
 */
assessRetrievalConfidence(retrievalData: {
  method: string;
  embeddingsUsed: boolean;
  candidatesFound: number;
  similarityScores: Array<{ tool: string; score: number; type: string }>;
  retrievalConfidence: number;
  fallbackReason?: string;
}): { confidence: number; factors: Partial<ConfidenceFactors> } {
  const clamp01 = (value: number): number => Math.max(0, Math.min(1, value));

  let embeddingsQuality = 0.5; // Default
  let candidateRelevance = 0.5; // Default
  let retrievalMethod = 0.5; // Default

  // Assess embeddings quality
  const scores = retrievalData.similarityScores;
  if (retrievalData.embeddingsUsed && scores.length > 0) {
    const avgSimilarity = scores.reduce((sum, item) => sum + item.score, 0) / scores.length;
    // Clamp on both sides: a negative average similarity must not produce a
    // negative factor, because the factor is returned to the caller as-is.
    embeddingsQuality = clamp01(avgSimilarity * 2);

    // Bonus for high-quality matches
    const highQualityMatches = scores.filter(item => item.score > 0.7).length;
    if (highQualityMatches >= 5) {
      embeddingsQuality = clamp01(embeddingsQuality + 0.1);
    }
  } else if (retrievalData.fallbackReason) {
    embeddingsQuality = 0.3; // Lower confidence for fallback
  }

  // Assess candidate relevance from how many candidates were found
  if (retrievalData.candidatesFound >= 15) {
    candidateRelevance = 0.8; // Good number of candidates
  } else if (retrievalData.candidatesFound >= 10) {
    candidateRelevance = 0.6; // Adequate candidates
  } else if (retrievalData.candidatesFound >= 5) {
    candidateRelevance = 0.4; // Few candidates
  } else {
    candidateRelevance = 0.2; // Very few candidates
  }

  // Assess retrieval method confidence; unrecognised methods keep the 0.5 default
  switch (retrievalData.method) {
    case 'embeddings':
      retrievalMethod = 0.9; // High confidence in embeddings
      break;
    case 'ai_selector':
      retrievalMethod = 0.7; // Medium confidence in AI selection
      break;
    case 'emergency_fallback':
      retrievalMethod = 0.3; // Low confidence in emergency fallback
      break;
  }

  // Only the three retrieval factors are read by calculateRetrievalConfidence;
  // the remaining fields are neutral placeholders that satisfy the type.
  const confidence = this.calculateRetrievalConfidence({
    embeddingsQuality,
    candidateRelevance,
    retrievalMethod,
    aiModelCertainty: 0.5,
    selectionConsistency: 0.5,
    reasoningQuality: 0.5,
    scenarioSpecificity: 0.5,
    toolSpecialization: 0.5,
    expertiseAlignment: 0.5,
    biasRiskLevel: 0.5,
    historicalAccuracy: 0.5,
    processingStability: 0.5
  });

  return {
    confidence,
    factors: {
      embeddingsQuality,
      candidateRelevance,
      retrievalMethod
    }
  };
}
|
||||||
|
|
||||||
|
/**
 * Derives the three selection confidence factors from a selection record and
 * combines them into a selection confidence score.
 *
 * - `aiModelCertainty`: the self-reported `confidenceScore` when positive,
 *   clamped to [0, 1]; otherwise inferred from response and reasoning length
 *   (0.7 / 0.6 / 0.4).
 * - `selectionConsistency`: stepped by the ratio of final selections to
 *   initial candidates.
 * - `reasoningQuality`: base 0.3, raised by quality-indicator words and by
 *   reasoning longer than 200 characters, lowered by vague words, clamped to
 *   [0, 1]. Words are matched as substrings of the lower-cased reasoning.
 *
 * @param selectionData - Selection record to assess.
 * @returns The combined confidence and the three factors it was built from.
 */
assessSelectionConfidence(selectionData: {
  aiModel: string;
  promptTokens: number;
  responseTokens: number;
  processingTimeMs: number;
  initialCandidates: string[];
  finalSelection: string[];
  rejectedCandidates: Array<{ tool: string; reason: string; score?: number }>;
  selectionReasoning: string;
  confidenceScore: number;
  rawResponse: string;
}): { confidence: number; factors: Partial<ConfidenceFactors> } {
  let aiModelCertainty: number;
  let selectionConsistency = 0.5;
  let reasoningQuality = 0.5;

  // Assess AI model certainty (from self-reported confidence or infer from response)
  if (selectionData.confidenceScore && selectionData.confidenceScore > 0) {
    // Clamp: a self-reported value above 1 must not leak out as a factor > 1.
    aiModelCertainty = Math.min(1, selectionData.confidenceScore);
  } else {
    // Infer confidence from response characteristics
    const responseLength = selectionData.rawResponse.length;
    const reasoningLength = selectionData.selectionReasoning.length;

    if (responseLength > 500 && reasoningLength > 100) {
      aiModelCertainty = 0.7; // Detailed response suggests confidence
    } else if (responseLength > 200) {
      aiModelCertainty = 0.6; // Adequate response
    } else {
      aiModelCertainty = 0.4; // Short response suggests uncertainty
    }
  }

  // Assess selection consistency; Math.max guards against division by zero
  const selectionRatio = selectionData.finalSelection.length / Math.max(selectionData.initialCandidates.length, 1);
  if (selectionRatio > 0.8) {
    selectionConsistency = 0.3; // Selected too many - low consistency
  } else if (selectionRatio > 0.5) {
    selectionConsistency = 0.5; // Moderate selectivity
  } else if (selectionRatio > 0.2) {
    selectionConsistency = 0.8; // Good selectivity
  } else if (selectionRatio > 0.05) {
    selectionConsistency = 0.9; // Very selective - high consistency
  } else {
    selectionConsistency = 0.4; // Too selective - might miss good options
  }

  // Assess reasoning quality
  const reasoning = selectionData.selectionReasoning.toLowerCase();
  let reasoningScore = 0.3; // Base score

  // Look for quality indicators in reasoning
  const qualityIndicators = [
    'specific', 'analysis', 'forensic', 'evidence', 'methodology',
    'suitable', 'appropriate', 'specialized', 'effective', 'relevant'
  ];
  const foundIndicators = qualityIndicators.filter(indicator => reasoning.includes(indicator)).length;
  reasoningScore += (foundIndicators / qualityIndicators.length) * 0.4;

  // Penalty for vague reasoning
  const vagueTerms = ['good', 'useful', 'helpful', 'nice', 'popular'];
  const foundVagueTerms = vagueTerms.filter(term => reasoning.includes(term)).length;
  reasoningScore -= (foundVagueTerms / vagueTerms.length) * 0.2;

  // Bonus for detailed reasoning
  if (selectionData.selectionReasoning.length > 200) {
    reasoningScore += 0.2;
  }

  reasoningQuality = Math.max(0, Math.min(1, reasoningScore));

  // Only the three selection factors are read by calculateSelectionConfidence;
  // the remaining fields are neutral placeholders that satisfy the type.
  const confidence = this.calculateSelectionConfidence({
    aiModelCertainty,
    selectionConsistency,
    reasoningQuality,
    embeddingsQuality: 0.5,
    candidateRelevance: 0.5,
    retrievalMethod: 0.5,
    scenarioSpecificity: 0.5,
    toolSpecialization: 0.5,
    expertiseAlignment: 0.5,
    biasRiskLevel: 0.5,
    historicalAccuracy: 0.5,
    processingStability: 0.5
  });

  return {
    confidence,
    factors: {
      aiModelCertainty,
      selectionConsistency,
      reasoningQuality
    }
  };
}
|
||||||
|
|
||||||
|
/**
 * Scores how well the query and the selected items fit the forensic domain.
 *
 * Three factors are derived and returned alongside the combined score:
 * - `scenarioSpecificity`: a base value chosen from the length of the
 *   lower-cased sanitized query (0.2 / 0.4 / 0.6 / 0.8), plus up to 0.3 for
 *   the share of known forensic terms found in it, capped at 1.
 * - `toolSpecialization`: the fraction of selected items that are methods,
 *   concepts, or tools limited to one or two domains (0.1 when nothing was
 *   selected).
 * - `expertiseAlignment`: 0.5 by default; for queries mentioning urgency it is
 *   raised by 0.3 when a rapid/quick/triage item or a method was selected and
 *   lowered by 0.2 otherwise.
 *
 * @param queryData - Query information; only `sanitizedQuery` is read here.
 * @param toolData - Tools and concepts chosen by the selection step.
 * @returns The value produced by `this.calculateDomainConfidence` and the
 *   three factors computed by this method.
 */
assessDomainConfidence(queryData: {
  userQuery: string;
  queryMode: string;
  sanitizedQuery: string;
}, toolData: {
  selectedTools: any[];
  selectedConcepts: any[];
}): { confidence: number; factors: Partial<ConfidenceFactors> } {
  const query = queryData.sanitizedQuery.toLowerCase();
  const queryLength = query.length;

  // Base specificity on how much detail (by length) the query carries.
  let scenarioSpecificity: number;
  if (queryLength > 200) {
    scenarioSpecificity = 0.8; // Detailed query
  } else if (queryLength > 100) {
    scenarioSpecificity = 0.6; // Moderate detail
  } else if (queryLength > 50) {
    scenarioSpecificity = 0.4; // Basic detail
  } else {
    scenarioSpecificity = 0.2; // Very brief query
  }

  // NOTE(review): matching is by substring, so short terms such as 'log',
  // 'ioc' or 'apt' also match inside unrelated words — confirm this is wanted.
  const forensicTerms = [
    'malware', 'incident', 'breach', 'attack', 'forensic', 'investigation',
    'evidence', 'analysis', 'memory', 'disk', 'network', 'log', 'artifact',
    'timeline', 'ioc', 'apt', 'ransomware', 'windows', 'linux', 'registry'
  ];
  const foundForensicTerms = forensicTerms.filter(term => query.includes(term)).length;
  scenarioSpecificity += (foundForensicTerms / forensicTerms.length) * 0.3;
  scenarioSpecificity = Math.min(1, scenarioSpecificity);

  const allSelectedItems = [...toolData.selectedTools, ...toolData.selectedConcepts];

  let toolSpecialization: number;
  if (allSelectedItems.length === 0) {
    toolSpecialization = 0.1;
  } else {
    const specializedCount = allSelectedItems.filter(item => {
      if (item?.type === 'method' || item?.type === 'concept') {
        return true;
      }
      // An item with no domains at all is not "specialized to few domains",
      // so an empty list must not count; only one or two domains do.
      const domainCount = item?.domains?.length;
      return typeof domainCount === 'number' && domainCount >= 1 && domainCount <= 2;
    }).length;

    toolSpecialization = Math.min(1, specializedCount / allSelectedItems.length);
  }

  // Expertise alignment is simplified: moderate unless the query signals urgency.
  let expertiseAlignment = 0.5;
  const urgencyKeywords = ['urgent', 'rapid', 'quick'];

  if (urgencyKeywords.some(keyword => query.includes(keyword))) {
    const rapidNameHints = ['rapid', 'quick', 'triage'];
    const hasRapidItem = allSelectedItems.some(item => {
      if (item?.type === 'method') {
        return true;
      }
      // Only string names can carry a hint; other shapes are ignored rather than thrown on.
      const name = typeof item?.name === 'string' ? item.name.toLowerCase() : '';
      return rapidNameHints.some(hint => name.includes(hint));
    });

    expertiseAlignment = hasRapidItem
      ? Math.min(1, expertiseAlignment + 0.3) // Good alignment with urgency
      : Math.max(0, expertiseAlignment - 0.2); // Poor alignment with urgency
  }

  const confidence = this.calculateDomainConfidence({
    scenarioSpecificity,
    toolSpecialization,
    expertiseAlignment,
    // Remaining factors are filled with the neutral default 0.5 so the
    // argument is a complete factor set for this calculation.
    embeddingsQuality: 0.5,
    candidateRelevance: 0.5,
    retrievalMethod: 0.5,
    aiModelCertainty: 0.5,
    selectionConsistency: 0.5,
    reasoningQuality: 0.5,
    biasRiskLevel: 0.5,
    historicalAccuracy: 0.5,
    processingStability: 0.5
  });

  return {
    confidence,
    factors: {
      scenarioSpecificity,
      toolSpecialization,
      expertiseAlignment
    }
  };
}
|
||||||
|
|
||||||
|
/**
 * Scores confidence in the processing run itself, from audit data.
 *
 * Factors computed here:
 * - `biasRiskLevel`: `1 - max severity` over the bias entries flagged as
 *   detected, clamped to [0, 1]; 0.7 when no bias analysis is available.
 * - `historicalAccuracy`: fixed at the neutral 0.5 (no historical data is read).
 * - `processingStability`: share of micro-tasks marked successful, reduced by
 *   0.2 per fallback and 0.1 per error, never below 0. When no micro-tasks
 *   were recorded the neutral 0.5 is used as the starting point, because the
 *   absence of task records is not evidence that tasks failed.
 *
 * @param auditData - Bias findings, micro-task results and error/fallback
 *   counters; `processingSummary` is not read by this method.
 * @returns The value produced by `this.calculateMetaConfidence` and the three
 *   factors computed by this method.
 */
assessMetaConfidence(auditData: {
  biasAnalysis: any[];
  microTasks: any[];
  processingSummary: any;
  errorCount: number;
  fallbacksUsed: number;
}): { confidence: number; factors: Partial<ConfidenceFactors> } {
  // Bias risk is the inverse of the worst detected bias severity.
  let biasRiskLevel: number;
  if (auditData.biasAnalysis && auditData.biasAnalysis.length > 0) {
    const detectedSeverities = auditData.biasAnalysis
      .filter(bias => bias?.detected)
      // Missing or non-numeric severities count as 0 so they cannot turn the result into NaN.
      .map(bias => (typeof bias.severity === 'number' && Number.isFinite(bias.severity) ? bias.severity : 0));
    const maxBiasSeverity = detectedSeverities.length > 0 ? Math.max(...detectedSeverities) : 0;

    // Clamp on both sides: out-of-range severities must not push the factor outside [0, 1].
    biasRiskLevel = Math.min(1, Math.max(0, 1 - maxBiasSeverity));
  } else {
    biasRiskLevel = 0.7; // No bias analysis available
  }

  const historicalAccuracy = 0.5; // Default - could be enhanced with historical data

  // Processing stability starts from the micro-task success rate.
  const tasks = Array.isArray(auditData.microTasks) ? auditData.microTasks : [];
  let processingStability = tasks.length > 0
    ? tasks.filter(task => task?.success).length / tasks.length
    : 0.5; // No task records: stay neutral instead of reporting total failure

  // Penalties for fallbacks and errors.
  if (auditData.fallbacksUsed > 0) {
    processingStability = Math.max(0, processingStability - (auditData.fallbacksUsed * 0.2));
  }
  if (auditData.errorCount > 0) {
    processingStability = Math.max(0, processingStability - (auditData.errorCount * 0.1));
  }

  const confidence = this.calculateMetaConfidence({
    biasRiskLevel,
    historicalAccuracy,
    processingStability,
    // Remaining factors are filled with the neutral default 0.5 so the
    // argument is a complete factor set for this calculation.
    embeddingsQuality: 0.5,
    candidateRelevance: 0.5,
    retrievalMethod: 0.5,
    aiModelCertainty: 0.5,
    selectionConsistency: 0.5,
    reasoningQuality: 0.5,
    scenarioSpecificity: 0.5,
    toolSpecialization: 0.5,
    expertiseAlignment: 0.5
  });

  return {
    confidence,
    factors: {
      biasRiskLevel,
      historicalAccuracy,
      processingStability
    }
  };
}
|
||||||
|
|
||||||
|
// ========================================================================
|
||||||
|
// UNCERTAINTY AND IMPROVEMENT ANALYSIS
|
||||||
|
// ========================================================================
|
||||||
|
|
||||||
|
/**
 * Lists human-readable reasons for uncertainty.
 *
 * A message is emitted for each checked factor whose score is strictly below
 * 0.6; messages keep the fixed order of the table below.
 *
 * @param factors - Factor scores to inspect.
 * @returns The messages for all low-scoring factors (possibly empty).
 */
identifyUncertaintyFactors(factors: ConfidenceFactors): string[] {
  const cutoff = 0.6; // Scores below this contribute to uncertainty

  const checks: Array<[number, string]> = [
    [factors.embeddingsQuality, 'Embeddings similarity scores were lower than expected'],
    [factors.candidateRelevance, 'Retrieved candidate tools may not be sufficiently relevant'],
    [factors.aiModelCertainty, 'AI model expressed low confidence in tool selection'],
    [factors.selectionConsistency, 'Tool selection criteria may not be consistently applied'],
    [factors.reasoningQuality, 'Selection reasoning lacks depth or specificity'],
    [factors.scenarioSpecificity, 'Query lacks forensic detail for precise tool matching'],
    [factors.toolSpecialization, 'Selected tools may be too generic for the specific scenario'],
    [factors.biasRiskLevel, 'Potential bias detected in tool selection process'],
    [factors.processingStability, 'Processing pipeline encountered errors or fallbacks']
  ];

  return checks
    .filter(([score]) => score < cutoff)
    .map(([, message]) => message);
}
|
||||||
|
|
||||||
|
/**
 * Produces advice for raising confidence.
 *
 * One suggestion is emitted for each checked factor scoring strictly below
 * 0.7, in the fixed order of the table below. When every checked factor meets
 * the target, a single "no improvements needed" entry is returned instead.
 *
 * @param factors - Factor scores to inspect.
 * @returns A non-empty list of suggestions.
 */
suggestConfidenceImprovements(factors: ConfidenceFactors): string[] {
  const target = 0.7; // At or above this score no improvement is suggested

  const advice: Array<[number, string]> = [
    [factors.embeddingsQuality, 'Consider refining query with more forensic-specific terminology'],
    [factors.scenarioSpecificity, 'Provide more details about affected systems, incident type, or evidence available'],
    [factors.reasoningQuality, 'Request detailed explanation of why specific tools were recommended'],
    [factors.toolSpecialization, 'Consider specifying the forensic domain or investigation phase needed'],
    [factors.biasRiskLevel, 'Review recommendations for potential bias toward popular tools']
  ];

  const suggestions = advice
    .filter(([score]) => score < target)
    .map(([, text]) => text);

  return suggestions.length > 0
    ? suggestions
    : ['Confidence is high - no specific improvements needed'];
}
|
||||||
|
|
||||||
|
/**
 * Maps a confidence score onto a quality label.
 *
 * Tiers are checked from highest to lowest: >= 0.9 is 'excellent',
 * >= 0.75 is 'high', >= 0.6 is 'medium'; anything else is 'low'.
 *
 * @param confidence - Score to classify.
 * @returns The label of the first tier whose minimum the score reaches.
 */
private determineQualityLevel(confidence: number): 'low' | 'medium' | 'high' | 'excellent' {
  const tiers: Array<[number, 'medium' | 'high' | 'excellent']> = [
    [0.9, 'excellent'],
    [0.75, 'high'],
    [0.6, 'medium']
  ];

  for (const [minimum, label] of tiers) {
    if (confidence >= minimum) {
      return label;
    }
  }

  return 'low';
}
|
||||||
|
|
||||||
|
/**
 * Estimates how reliable the confidence assessment itself is.
 *
 * The base is the share of factors that differ from the neutral default 0.5
 * by more than 0.01 (treated as "real data was available"). A penalty of up
 * to 0.2 is subtracted in proportion to the share of extreme values
 * (< 0.1 or > 0.9). The result is clamped to [0.3, 1].
 *
 * @param factors - Factor scores to evaluate.
 * @returns A reliability score in [0.3, 1]; 0.3 when there are no factors.
 */
private calculateConfidenceReliability(factors: ConfidenceFactors): number {
  const minimumReliability = 0.3;
  const values = Object.values(factors);
  const totalFactors = values.length;

  // Without any factors both ratios would be 0/0 = NaN, and NaN survives
  // Math.min/Math.max; report the floor value instead.
  if (totalFactors === 0) {
    return minimumReliability;
  }

  // Factors that moved away from the 0.5 default carry actual information.
  const dataAvailability = values.filter(value => Math.abs(value - 0.5) > 0.01).length;
  const reliabilityBase = dataAvailability / totalFactors;

  // Penalty for extreme values (might indicate missing nuance).
  const extremeValues = values.filter(value => value < 0.1 || value > 0.9).length;
  const extremePenalty = (extremeValues / totalFactors) * 0.2;

  return Math.max(minimumReliability, Math.min(1, reliabilityBase - extremePenalty));
}
|
||||||
|
|
||||||
|
// ========================================================================
|
||||||
|
// UTILITY METHODS
|
||||||
|
// ========================================================================
|
||||||
|
|
||||||
|
/**
 * Returns the configured confidence threshold.
 *
 * @returns The `confidenceThreshold` value stored in `this.thresholds`.
 */
getConfidenceThreshold(): number {
  const { confidenceThreshold } = this.thresholds;
  return confidenceThreshold;
}
|
||||||
|
|
||||||
|
/**
 * Tells whether a score reaches the configured confidence threshold.
 *
 * @param confidence - Score to check.
 * @returns `true` when the score is at least `getConfidenceThreshold()`.
 */
isConfidenceAcceptable(confidence: number): boolean {
  const minimum = this.getConfidenceThreshold();
  return minimum <= confidence;
}
|
||||||
|
|
||||||
|
/**
 * Renders confidence metrics as a plain-text report.
 *
 * The report contains the overall score with its quality level, the
 * reliability, the four component scores (all as percentages with one
 * decimal), the uncertainty factors (or a fixed sentence when there are
 * none) and, when present, the improvement suggestions. Surrounding
 * whitespace is trimmed.
 *
 * @param metrics - Metrics to render.
 * @returns The formatted multi-line report.
 */
formatConfidenceReport(metrics: ConfidenceMetrics): string {
  const asPercent = (ratio: number): string => `${(ratio * 100).toFixed(1)}%`;

  const uncertaintySection = metrics.uncertaintyFactors.length > 0
    ? `\nUncertainty Factors:\n${metrics.uncertaintyFactors.map(factor => `- ${factor}`).join('\n')}\n`
    : 'No significant uncertainty factors identified.';

  const improvementSection = metrics.improvementSuggestions.length > 0
    ? `\nImprovement Suggestions:\n${metrics.improvementSuggestions.map(suggestion => `- ${suggestion}`).join('\n')}\n`
    : '';

  // Each entry is one report line; empty strings are the blank separator lines.
  const reportLines = [
    'Confidence Assessment Report',
    '===========================',
    `Overall Confidence: ${asPercent(metrics.overall)} (${metrics.qualityLevel})`,
    `Reliability: ${asPercent(metrics.reliability)}`,
    '',
    'Component Breakdown:',
    `- Retrieval: ${asPercent(metrics.breakdown.retrieval)}`,
    `- Selection: ${asPercent(metrics.breakdown.selection)}`,
    `- Domain: ${asPercent(metrics.breakdown.domain)}`,
    `- Meta: ${asPercent(metrics.breakdown.meta)}`,
    '',
    uncertaintySection,
    '',
    improvementSection
  ];

  return reportLines.join('\n').trim();
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Module-level ConfidenceScorer instance, constructed once with no arguments
// and exported for use by importers of this module.
|
||||||
|
export const confidenceScorer = new ConfidenceScorer();
|
||||||
|
// Type-only re-export of the types listed below.
export type { ConfidenceFactors, ConfidenceMetrics, ConfidenceWeights, UncertaintyFactor };
|
Loading…
x
Reference in New Issue
Block a user