forensic-pathways/src/utils/embeddings.ts
2025-08-06 15:06:53 +02:00

491 lines
16 KiB
TypeScript
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// src/utils/embeddings.ts
import { promises as fs } from 'fs';
import path from 'path';
import { getCompressedToolsDataForAI } from './dataService.js';
interface EmbeddingData {
id: string;
type: 'tool' | 'concept';
name: string;
content: string;
embedding: number[];
metadata: {
domains?: string[];
phases?: string[];
tags?: string[];
skillLevel?: string;
type?: string;
};
}
interface EmbeddingsDatabase {
version: string;
lastUpdated: number;
embeddings: EmbeddingData[];
}
interface SimilarityResult extends EmbeddingData {
similarity: number;
}
class EmbeddingsService {
private embeddings: EmbeddingData[] = [];
private isInitialized = false;
private initializationPromise: Promise<void> | null = null;
private readonly embeddingsPath = path.join(process.cwd(), 'data', 'embeddings.json');
private readonly batchSize: number;
private readonly batchDelay: number;
private enabled: boolean = false; // Make mutable again
constructor() {
this.batchSize = parseInt(process.env.AI_EMBEDDINGS_BATCH_SIZE || '20', 10);
this.batchDelay = parseInt(process.env.AI_EMBEDDINGS_BATCH_DELAY_MS || '1000', 10);
// Don't call async method from constructor - handle in initialize() instead
this.enabled = true; // Start optimistically enabled for development
}
private async checkEnabledStatus(): Promise<void> {
try {
// Add debugging to see what's actually in process.env
console.log('[EMBEDDINGS] Debug env check:', {
AI_EMBEDDINGS_ENABLED: process.env.AI_EMBEDDINGS_ENABLED,
envKeys: Object.keys(process.env).filter(k => k.includes('EMBEDDINGS')).length,
allEnvKeys: Object.keys(process.env).length
});
const envEnabled = process.env.AI_EMBEDDINGS_ENABLED;
if (envEnabled === 'true') {
// Check if we have the required API configuration
const endpoint = process.env.AI_EMBEDDINGS_ENDPOINT;
const model = process.env.AI_EMBEDDINGS_MODEL;
if (!endpoint || !model) {
console.warn('[EMBEDDINGS] Embeddings enabled but API configuration missing - disabling');
this.enabled = false;
return;
}
console.log('[EMBEDDINGS] All requirements met - enabling embeddings');
this.enabled = true;
return;
}
// Check if embeddings file exists
try {
await fs.stat(this.embeddingsPath);
console.log('[EMBEDDINGS] Existing embeddings file found - enabling');
this.enabled = true;
} catch {
console.log('[EMBEDDINGS] Embeddings not explicitly enabled - disabling');
this.enabled = false;
}
} catch (error) {
console.error('[EMBEDDINGS] Error checking enabled status:', error);
this.enabled = false;
}
}
async initialize(): Promise<void> {
if (this.initializationPromise) {
return this.initializationPromise;
}
if (this.isInitialized) {
return Promise.resolve();
}
this.initializationPromise = this.performInitialization();
return this.initializationPromise;
}
private async performInitialization(): Promise<void> {
// 1⃣ Respect the on/off switch that the newer code introduced
await this.checkEnabledStatus();
if (!this.enabled) {
console.log('[EMBEDDINGS] Embeddings disabled, skipping initialization');
return;
}
const initStart = Date.now();
try {
console.log('[EMBEDDINGS] Initializing embeddings system…');
// Make sure the data folder exists
await fs.mkdir(path.dirname(this.embeddingsPath), { recursive: true });
// Load current tools / concepts and generate a hash
const toolsData = await getCompressedToolsDataForAI();
const currentDataHash = this.hashData(toolsData); // <- keep the old helper
// (SHA-256, xxHash etc.)
// Try to read an existing file
const existing = await this.loadEmbeddings();
const cacheIsUsable =
existing &&
existing.version === currentDataHash &&
Array.isArray(existing.embeddings) &&
existing.embeddings.length > 0;
if (cacheIsUsable) {
console.log('[EMBEDDINGS] Using cached embeddings');
this.embeddings = existing.embeddings;
} else {
console.log('[EMBEDDINGS] Generating new embeddings…');
// 2⃣ Build and persist new vectors
await this.generateEmbeddings(toolsData, currentDataHash); // <- old helper
}
this.isInitialized = true;
console.log(`[EMBEDDINGS] Initialized with ${this.embeddings.length} embeddings in ${Date.now() - initStart} ms`);
} catch (err) {
console.error('[EMBEDDINGS] Failed to initialize:', err);
this.isInitialized = false;
throw err; // Let the caller know same behaviour as before
} finally {
// 3⃣ Always clear the promise so subsequent calls don't hang
this.initializationPromise = null;
}
}
private hashData(data: any): string {
return Buffer.from(JSON.stringify(data)).toString('base64').slice(0, 32);
}
private async loadEmbeddings(): Promise<EmbeddingsDatabase | null> {
try {
const data = await fs.readFile(this.embeddingsPath, 'utf8');
return JSON.parse(data);
} catch (error) {
console.log('[EMBEDDINGS] No existing embeddings found');
return null;
}
}
private async saveEmbeddings(version: string): Promise<void> {
const database: EmbeddingsDatabase = {
version,
lastUpdated: Date.now(),
embeddings: this.embeddings
};
await fs.writeFile(this.embeddingsPath, JSON.stringify(database, null, 2));
console.log(`[EMBEDDINGS] Saved ${this.embeddings.length} embeddings to disk`);
}
private createContentString(item: any): string {
const parts = [
item.name,
item.description || '',
...(item.tags || []),
...(item.domains || []),
...(item.phases || [])
];
return parts.filter(Boolean).join(' ').toLowerCase();
}
private async generateEmbeddingsBatch(contents: string[]): Promise<number[][]> {
const endpoint = process.env.AI_EMBEDDINGS_ENDPOINT;
const apiKey = process.env.AI_EMBEDDINGS_API_KEY;
const model = process.env.AI_EMBEDDINGS_MODEL;
if (!endpoint || !model) {
const missing: string[] = [];
if (!endpoint) missing.push('AI_EMBEDDINGS_ENDPOINT');
if (!model) missing.push('AI_EMBEDDINGS_MODEL');
throw new Error(`Missing embeddings API configuration: ${missing.join(', ')}`);
}
const headers: Record<string, string> = {
'Content-Type': 'application/json'
};
if (apiKey) {
headers['Authorization'] = `Bearer ${apiKey}`;
}
const response = await fetch(endpoint, {
method: 'POST',
headers,
body: JSON.stringify({
model,
input: contents
})
});
if (!response.ok) {
const error = await response.text();
throw new Error(`Embeddings API error: ${response.status} - ${error}`);
}
const data = await response.json();
if (Array.isArray(data.embeddings)) {
return data.embeddings;
}
if (Array.isArray(data.data)) {
return data.data.map((item: any) => item.embedding);
}
throw new Error('Unknown embeddings API response format');
}
private async generateEmbeddings(toolsData: any, version: string): Promise<void> {
const allItems = [
...toolsData.tools.map((tool: any) => ({ ...tool, type: 'tool' })),
...toolsData.concepts.map((concept: any) => ({ ...concept, type: 'concept' }))
];
const contents = allItems.map(item => this.createContentString(item));
this.embeddings = [];
for (let i = 0; i < contents.length; i += this.batchSize) {
const batch = contents.slice(i, i + this.batchSize);
const batchItems = allItems.slice(i, i + this.batchSize);
console.log(`[EMBEDDINGS] Processing batch ${Math.ceil((i + 1) / this.batchSize)} of ${Math.ceil(contents.length / this.batchSize)}`);
try {
const embeddings = await this.generateEmbeddingsBatch(batch);
embeddings.forEach((embedding, index) => {
const item = batchItems[index];
this.embeddings.push({
id: `${item.type}_${item.name.replace(/[^a-zA-Z0-9]/g, '_')}`,
type: item.type,
name: item.name,
content: batch[index],
embedding,
metadata: {
domains: item.domains,
phases: item.phases,
tags: item.tags,
skillLevel: item.skillLevel,
type: item.type
}
});
});
if (i + this.batchSize < contents.length) {
await new Promise(resolve => setTimeout(resolve, this.batchDelay));
}
} catch (error) {
console.error(`[EMBEDDINGS] Failed to process batch ${Math.ceil((i + 1) / this.batchSize)}:`, error);
throw error;
}
}
await this.saveEmbeddings(version);
}
public async embedText(text: string): Promise<number[]> {
if (!this.enabled || !this.isInitialized) {
throw new Error('Embeddings service not available');
}
const [embedding] = await this.generateEmbeddingsBatch([text.toLowerCase()]);
return embedding;
}
async waitForInitialization(): Promise<void> {
// Always re-check environment status first in case variables loaded after initial check
await this.checkEnabledStatus();
if (!this.enabled || this.isInitialized) {
return Promise.resolve();
}
if (this.initializationPromise) {
await this.initializationPromise;
return;
}
return this.initialize();
}
// Force re-check of environment status (useful for development)
async forceRecheckEnvironment(): Promise<void> {
this.enabled = false;
this.isInitialized = false;
await this.checkEnabledStatus();
console.log('[EMBEDDINGS] Environment status re-checked, enabled:', this.enabled);
}
private cosineSimilarity(a: number[], b: number[]): number {
let dotProduct = 0;
let normA = 0;
let normB = 0;
for (let i = 0; i < a.length; i++) {
dotProduct += a[i] * b[i];
normA += a[i] * a[i];
normB += b[i] * b[i];
}
return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));
}
async findSimilar(query: string, maxResults: number = 30, threshold: number = 0.3): Promise<SimilarityResult[]> {
if (!this.enabled) {
console.log('[EMBEDDINGS] Service disabled for similarity search');
return [];
}
try {
// If we have embeddings data, use it
if (this.isInitialized && this.embeddings.length > 0) {
console.log(`[EMBEDDINGS] Using embeddings data for similarity search: ${query}`);
const queryEmbeddings = await this.generateEmbeddingsBatch([query.toLowerCase()]);
const queryEmbedding = queryEmbeddings[0];
console.log(`[EMBEDDINGS] Computing similarities for ${this.embeddings.length} items`);
const similarities: SimilarityResult[] = this.embeddings.map(item => ({
...item,
similarity: this.cosineSimilarity(queryEmbedding, item.embedding)
}));
const results = similarities
.filter(item => item.similarity >= threshold)
.sort((a, b) => b.similarity - a.similarity)
.slice(0, maxResults);
const orderingValid = results.every((item, index) => {
if (index === 0) return true;
return item.similarity <= results[index - 1].similarity;
});
if (!orderingValid) {
console.error('[EMBEDDINGS] CRITICAL: Similarity ordering is broken!');
results.forEach((item, idx) => {
console.error(` ${idx}: ${item.name} = ${item.similarity.toFixed(4)}`);
});
}
console.log(`[EMBEDDINGS] Found ${results.length} similar items (threshold: ${threshold})`);
if (results.length > 0) {
console.log('[EMBEDDINGS] Top 10 similarity matches:');
results.slice(0, 10).forEach((item, idx) => {
console.log(` ${idx + 1}. ${item.name} (${item.type}) = ${item.similarity.toFixed(4)}`);
});
const topSimilarity = results[0].similarity;
const hasHigherSimilarity = results.some(item => item.similarity > topSimilarity);
if (hasHigherSimilarity) {
console.error('[EMBEDDINGS] CRITICAL: Top result is not actually the highest similarity!');
}
}
return results;
} else {
// Fallback: generate mock similarity results from actual tools data
console.log(`[EMBEDDINGS] No embeddings data, using fallback text matching: ${query}`);
const { getToolsData } = await import('./dataService.js');
const toolsData = await getToolsData();
const queryLower = query.toLowerCase();
const queryWords = queryLower.split(/\s+/).filter(w => w.length > 2);
const similarities: SimilarityResult[] = toolsData.tools
.map((tool: any) => {
let similarity = 0;
// Name matching
if (tool.name.toLowerCase().includes(queryLower)) {
similarity += 0.8;
}
// Description matching
if (tool.description && tool.description.toLowerCase().includes(queryLower)) {
similarity += 0.6;
}
// Tag matching
if (tool.tags && Array.isArray(tool.tags)) {
const matchingTags = tool.tags.filter((tag: string) =>
tag.toLowerCase().includes(queryLower) || queryLower.includes(tag.toLowerCase())
);
if (tool.tags.length > 0) {
similarity += (matchingTags.length / tool.tags.length) * 0.4;
}
}
// Word-level matching
const toolText = `${tool.name} ${tool.description || ''} ${(tool.tags || []).join(' ')}`.toLowerCase();
const matchingWords = queryWords.filter(word => toolText.includes(word));
if (queryWords.length > 0) {
similarity += (matchingWords.length / queryWords.length) * 0.3;
}
return {
id: `tool_${tool.name.replace(/[^a-zA-Z0-9]/g, '_').toLowerCase()}`,
type: 'tool' as const,
name: tool.name,
content: toolText,
embedding: [], // Empty for fallback
metadata: {
domains: tool.domains || [],
phases: tool.phases || [],
tags: tool.tags || [],
skillLevel: tool.skillLevel,
type: tool.type
},
similarity: Math.min(similarity, 1.0)
};
})
.filter(item => item.similarity >= threshold)
.sort((a, b) => b.similarity - a.similarity)
.slice(0, maxResults);
console.log(`[EMBEDDINGS] Fallback found ${similarities.length} similar items`);
return similarities;
}
} catch (error) {
console.error('[EMBEDDINGS] Failed to find similar items:', error);
return [];
}
}
isEnabled(): boolean {
// If not enabled and not initialized, try re-checking environment
// This handles the case where environment variables loaded after initial check
if (!this.enabled && !this.isInitialized) {
// Don't await this, just trigger it and return current status
this.checkEnabledStatus().catch(console.error);
}
return this.enabled;
}
getStats(): { enabled: boolean; initialized: boolean; count: number } {
return {
enabled: this.enabled, // Always true during development
initialized: this.isInitialized,
count: this.embeddings.length
};
}
}
const embeddingsService = new EmbeddingsService();
export { embeddingsService, type EmbeddingData, type SimilarityResult };
// Export utility functions for debugging
export const debugEmbeddings = {
async recheckEnvironment() {
return embeddingsService.forceRecheckEnvironment();
},
getStatus() {
return embeddingsService.getStats();
}
};
// Remove auto-initialization - let it initialize lazily when first needed