// src/utils/embeddings.ts
import { createHash } from 'crypto';
import { promises as fs } from 'fs';
import path from 'path';
import { getCompressedToolsDataForAI } from './dataService.js';

/** One embedded tool or concept together with the metadata stored next to its vector. */
interface EmbeddingData {
  id: string;
  type: 'tool' | 'concept';
  name: string;
  content: string;
  embedding: number[];
  metadata: {
    domains?: string[];
    phases?: string[];
    tags?: string[];
    skillLevel?: string;
    type?: string;
  };
}

/** On-disk layout of `data/embeddings.json`. */
interface EmbeddingsDatabase {
  /** Hash of the tools data the vectors were generated from. */
  version: string;
  /** `Date.now()` timestamp of the last write. */
  lastUpdated: number;
  embeddings: EmbeddingData[];
}

/** An embedding entry annotated with its similarity score for a query. */
interface SimilarityResult extends EmbeddingData {
  similarity: number;
}

/**
 * Parses an integer setting from an environment variable.
 *
 * @param raw - Raw environment value (may be undefined or empty).
 * @param fallback - Value used when `raw` is missing, not an integer, or below `min`.
 * @param min - Smallest accepted value.
 * @returns The parsed integer, or `fallback`.
 */
function parseIntEnv(raw: string | undefined, fallback: number, min: number): number {
  const parsed = Number.parseInt(raw ?? '', 10);
  return Number.isInteger(parsed) && parsed >= min ? parsed : fallback;
}

/**
 * Lazily initialised embeddings store.
 *
 * Vectors are generated through the HTTP endpoint configured by the
 * `AI_EMBEDDINGS_*` environment variables, cached in `data/embeddings.json`,
 * and queried with cosine similarity. When no vectors are loaded,
 * `findSimilar` falls back to plain text matching.
 */
class EmbeddingsService {
  private embeddings: EmbeddingData[] = [];
  private isInitialized = false;
  private initializationPromise: Promise<void> | null = null;
  private readonly embeddingsPath = path.join(process.cwd(), 'data', 'embeddings.json');
  private readonly batchSize: number;
  private readonly batchDelay: number;
  // Starts optimistically enabled; the real status is resolved asynchronously
  // by checkEnabledStatus() because a constructor cannot await.
  private enabled = true;

  constructor() {
    // A batch size of 0 would make the batch loop spin forever and NaN would
    // silently produce no embeddings, so invalid values fall back to defaults.
    this.batchSize = parseIntEnv(process.env.AI_EMBEDDINGS_BATCH_SIZE, 20, 1);
    this.batchDelay = parseIntEnv(process.env.AI_EMBEDDINGS_BATCH_DELAY_MS, 1000, 0);
  }

  /**
   * Re-evaluates whether the service should be enabled and stores the result
   * in `this.enabled`. Never throws: any error disables the service.
   *
   * Enabled when `AI_EMBEDDINGS_ENABLED === 'true'` and both endpoint and
   * model are configured, or (when not explicitly enabled) when an embeddings
   * file already exists on disk.
   */
  private async checkEnabledStatus(): Promise<void> {
    try {
      // Only counts are logged here, never the values of other variables.
      console.log('[EMBEDDINGS] Debug env check:', {
        AI_EMBEDDINGS_ENABLED: process.env.AI_EMBEDDINGS_ENABLED,
        envKeys: Object.keys(process.env).filter(k => k.includes('EMBEDDINGS')).length,
        allEnvKeys: Object.keys(process.env).length
      });

      if (process.env.AI_EMBEDDINGS_ENABLED === 'true') {
        const endpoint = process.env.AI_EMBEDDINGS_ENDPOINT;
        const model = process.env.AI_EMBEDDINGS_MODEL;
        if (!endpoint || !model) {
          console.warn('[EMBEDDINGS] Embeddings enabled but API configuration missing - disabling');
          this.enabled = false;
          return;
        }
        console.log('[EMBEDDINGS] All requirements met - enabling embeddings');
        this.enabled = true;
        return;
      }

      // Not explicitly enabled: an existing cache file is still usable.
      try {
        await fs.stat(this.embeddingsPath);
        console.log('[EMBEDDINGS] Existing embeddings file found - enabling');
        this.enabled = true;
      } catch {
        console.log('[EMBEDDINGS] Embeddings not explicitly enabled - disabling');
        this.enabled = false;
      }
    } catch (error) {
      console.error('[EMBEDDINGS] Error checking enabled status:', error);
      this.enabled = false;
    }
  }

  /**
   * Initialises the service once. Concurrent callers share the in-flight
   * promise; calls after a successful initialisation resolve immediately.
   *
   * @throws Whatever `performInitialization` throws (I/O or API errors).
   */
  async initialize(): Promise<void> {
    if (this.initializationPromise) {
      return this.initializationPromise;
    }
    if (this.isInitialized) {
      return;
    }
    this.initializationPromise = this.performInitialization();
    return this.initializationPromise;
  }

  /**
   * Loads cached embeddings when their version matches the current tools
   * data, otherwise regenerates and persists them.
   *
   * @throws Re-throws any failure after logging it; `isInitialized` stays false.
   */
  private async performInitialization(): Promise<void> {
    try {
      // The enabled check lives inside the try so that the finally below also
      // clears the in-flight promise on the "disabled" path. Otherwise a
      // stale resolved promise would block every later initialisation attempt.
      await this.checkEnabledStatus();
      if (!this.enabled) {
        console.log('[EMBEDDINGS] Embeddings disabled, skipping initialization');
        return;
      }

      const initStart = Date.now();
      console.log('[EMBEDDINGS] Initializing embeddings system…');

      // Make sure the data folder exists before reading or writing the cache.
      await fs.mkdir(path.dirname(this.embeddingsPath), { recursive: true });

      const toolsData = await getCompressedToolsDataForAI();
      const currentDataHash = this.hashData(toolsData);

      const existing = await this.loadEmbeddings();
      const cacheIsUsable =
        existing !== null &&
        existing.version === currentDataHash &&
        Array.isArray(existing.embeddings) &&
        existing.embeddings.length > 0;

      if (existing !== null && cacheIsUsable) {
        console.log('[EMBEDDINGS] Using cached embeddings');
        this.embeddings = existing.embeddings;
      } else {
        console.log('[EMBEDDINGS] Generating new embeddings…');
        await this.generateEmbeddings(toolsData, currentDataHash);
      }

      this.isInitialized = true;
      console.log(`[EMBEDDINGS] Initialized with ${this.embeddings.length} embeddings in ${Date.now() - initStart} ms`);
    } catch (err) {
      console.error('[EMBEDDINGS] Failed to initialize:', err);
      this.isInitialized = false;
      throw err;
    } finally {
      // Always clear the promise so subsequent calls can retry.
      this.initializationPromise = null;
    }
  }

  /**
   * Returns a version string for the given data: the SHA-256 hex digest of
   * its JSON serialisation. The whole payload contributes to the digest, so
   * any change to the tools data invalidates the cached embeddings.
   */
  private hashData(data: any): string {
    return createHash('sha256').update(JSON.stringify(data)).digest('hex');
  }

  /**
   * Reads the embeddings file.
   *
   * @returns The parsed file, or `null` when it is missing or unreadable.
   *   The shape is not validated here; the caller checks the fields it uses.
   */
  private async loadEmbeddings(): Promise<EmbeddingsDatabase | null> {
    try {
      const raw = await fs.readFile(this.embeddingsPath, 'utf8');
      return JSON.parse(raw) as EmbeddingsDatabase;
    } catch {
      console.log('[EMBEDDINGS] No existing embeddings found');
      return null;
    }
  }

  /** Writes the in-memory embeddings to disk, tagged with `version`. */
  private async saveEmbeddings(version: string): Promise<void> {
    const database: EmbeddingsDatabase = {
      version,
      lastUpdated: Date.now(),
      embeddings: this.embeddings
    };
    await fs.writeFile(this.embeddingsPath, JSON.stringify(database, null, 2));
    console.log(`[EMBEDDINGS] Saved ${this.embeddings.length} embeddings to disk`);
  }

  /** Builds the lower-cased text that is embedded for a tool or concept. */
  private createContentString(item: any): string {
    const parts = [
      item.name,
      item.description || '',
      ...(item.tags || []),
      ...(item.domains || []),
      ...(item.phases || [])
    ];
    return parts.filter(Boolean).join(' ').toLowerCase();
  }

  /**
   * Requests one vector per input string from the embeddings endpoint.
   *
   * Accepts two response shapes: `{ embeddings: number[][] }` and
   * `{ data: [{ embedding: number[] }, …] }`.
   *
   * @param contents - Texts to embed.
   * @returns One vector per input, in the order the API returned them.
   * @throws If the endpoint or model is not configured, the request fails,
   *   the response shape is unknown, or the vector count differs from the
   *   input count.
   */
  private async generateEmbeddingsBatch(contents: string[]): Promise<number[][]> {
    const endpoint = process.env.AI_EMBEDDINGS_ENDPOINT;
    const apiKey = process.env.AI_EMBEDDINGS_API_KEY;
    const model = process.env.AI_EMBEDDINGS_MODEL;

    if (!endpoint || !model) {
      const missing: string[] = [];
      if (!endpoint) missing.push('AI_EMBEDDINGS_ENDPOINT');
      if (!model) missing.push('AI_EMBEDDINGS_MODEL');
      throw new Error(`Missing embeddings API configuration: ${missing.join(', ')}`);
    }

    // Nothing to embed: skip the network round trip.
    if (contents.length === 0) {
      return [];
    }

    const headers: Record<string, string> = { 'Content-Type': 'application/json' };
    if (apiKey) {
      headers['Authorization'] = `Bearer ${apiKey}`;
    }

    const response = await fetch(endpoint, {
      method: 'POST',
      headers,
      body: JSON.stringify({ model, input: contents })
    });

    if (!response.ok) {
      const error = await response.text();
      throw new Error(`Embeddings API error: ${response.status} - ${error}`);
    }

    const data = (await response.json()) as {
      embeddings?: number[][];
      data?: Array<{ embedding: number[] }>;
    };

    let vectors: number[][];
    if (Array.isArray(data.embeddings)) {
      vectors = data.embeddings;
    } else if (Array.isArray(data.data)) {
      vectors = data.data.map(item => item.embedding);
    } else {
      throw new Error('Unknown embeddings API response format');
    }

    // Callers pair vectors with inputs by index, so a count mismatch would
    // either crash later or silently drop items.
    if (vectors.length !== contents.length) {
      throw new Error(`Embeddings API returned ${vectors.length} vectors for ${contents.length} inputs`);
    }
    return vectors;
  }

  /**
   * Embeds every tool and concept in batches and persists the result.
   * The in-memory store is only replaced once all batches have succeeded.
   *
   * @param toolsData - Object with `tools` and `concepts` arrays.
   * @param version - Version string written to the embeddings file.
   * @throws Re-throws the first batch failure after logging it.
   */
  private async generateEmbeddings(toolsData: any, version: string): Promise<void> {
    const allItems = [
      ...(toolsData?.tools ?? []).map((tool: any) => ({ ...tool, type: 'tool' })),
      ...(toolsData?.concepts ?? []).map((concept: any) => ({ ...concept, type: 'concept' }))
    ];
    const contents = allItems.map(item => this.createContentString(item));
    const totalBatches = Math.ceil(contents.length / this.batchSize);
    const generated: EmbeddingData[] = [];

    for (let start = 0; start < contents.length; start += this.batchSize) {
      const batchNumber = Math.floor(start / this.batchSize) + 1;
      const batch = contents.slice(start, start + this.batchSize);
      const batchItems = allItems.slice(start, start + this.batchSize);

      console.log(`[EMBEDDINGS] Processing batch ${batchNumber} of ${totalBatches}`);

      try {
        const vectors = await this.generateEmbeddingsBatch(batch);

        vectors.forEach((embedding, index) => {
          const item = batchItems[index];
          generated.push({
            // NOTE(review): the text-matching fallback in findSimilar lower-cases
            // its ids while this path does not — confirm which form callers expect.
            id: `${item.type}_${item.name.replace(/[^a-zA-Z0-9]/g, '_')}`,
            type: item.type,
            name: item.name,
            content: batch[index],
            embedding,
            metadata: {
              domains: item.domains,
              phases: item.phases,
              tags: item.tags,
              skillLevel: item.skillLevel,
              type: item.type
            }
          });
        });

        // Pause between batches, but not after the last one.
        if (start + this.batchSize < contents.length) {
          await new Promise<void>(resolve => setTimeout(resolve, this.batchDelay));
        }
      } catch (error) {
        console.error(`[EMBEDDINGS] Failed to process batch ${batchNumber}:`, error);
        throw error;
      }
    }

    this.embeddings = generated;
    await this.saveEmbeddings(version);
  }

  /**
   * Embeds a single text (lower-cased before sending).
   *
   * @throws If the service is disabled or not initialised, or the API call fails.
   */
  public async embedText(text: string): Promise<number[]> {
    if (!this.enabled || !this.isInitialized) {
      throw new Error('Embeddings service not available');
    }
    const [embedding] = await this.generateEmbeddingsBatch([text.toLowerCase()]);
    return embedding;
  }

  /**
   * Re-checks the environment, then initialises the service if it is enabled
   * and not yet initialised. Resolves immediately otherwise.
   */
  async waitForInitialization(): Promise<void> {
    // Re-check first in case environment variables were loaded late.
    await this.checkEnabledStatus();
    if (!this.enabled || this.isInitialized) {
      return;
    }
    // initialize() joins an in-flight initialisation if there is one.
    return this.initialize();
  }

  /** Resets the enabled/initialised flags and re-checks the environment (useful for development). */
  async forceRecheckEnvironment(): Promise<void> {
    this.enabled = false;
    this.isInitialized = false;
    await this.checkEnabledStatus();
    console.log('[EMBEDDINGS] Environment status re-checked, enabled:', this.enabled);
  }

  /**
   * Cosine similarity of two vectors.
   *
   * @returns The similarity, or 0 when either vector is missing, empty, of a
   *   different length, or has zero magnitude (instead of `NaN`).
   */
  private cosineSimilarity(a: number[], b: number[]): number {
    if (!Array.isArray(a) || !Array.isArray(b) || a.length === 0 || a.length !== b.length) {
      return 0;
    }

    let dotProduct = 0;
    let normA = 0;
    let normB = 0;
    for (let i = 0; i < a.length; i++) {
      dotProduct += a[i] * b[i];
      normA += a[i] * a[i];
      normB += b[i] * b[i];
    }

    const similarity = dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));
    return Number.isFinite(similarity) ? similarity : 0;
  }

  /**
   * Finds the items most similar to `query`.
   *
   * Uses stored vectors when the service is initialised and has data,
   * otherwise falls back to text matching against the tools list.
   *
   * @param query - Free-text query; a blank query yields no results.
   * @param maxResults - Maximum number of results returned.
   * @param threshold - Minimum similarity an item needs to be included.
   * @returns Matches sorted by descending similarity; `[]` when the service
   *   is disabled, the query is blank, or any error occurs (errors are logged).
   */
  async findSimilar(query: string, maxResults: number = 30, threshold: number = 0.3): Promise<SimilarityResult[]> {
    if (!this.enabled) {
      console.log('[EMBEDDINGS] Service disabled for similarity search');
      return [];
    }

    // An empty query is a substring of everything, so the fallback would
    // score every tool at the maximum; treat it as "no match" instead.
    if (query.trim().length === 0) {
      return [];
    }

    try {
      if (this.isInitialized && this.embeddings.length > 0) {
        return await this.searchByEmbeddings(query, maxResults, threshold);
      }
      return await this.searchByTextFallback(query, maxResults, threshold);
    } catch (error) {
      console.error('[EMBEDDINGS] Failed to find similar items:', error);
      return [];
    }
  }

  /** Ranks the stored embeddings by cosine similarity to the embedded query. */
  private async searchByEmbeddings(query: string, maxResults: number, threshold: number): Promise<SimilarityResult[]> {
    console.log(`[EMBEDDINGS] Using embeddings data for similarity search: ${query}`);

    const [queryEmbedding] = await this.generateEmbeddingsBatch([query.toLowerCase()]);

    console.log(`[EMBEDDINGS] Computing similarities for ${this.embeddings.length} items`);

    const results = this.embeddings
      .map((item): SimilarityResult => ({
        ...item,
        similarity: this.cosineSimilarity(queryEmbedding, item.embedding)
      }))
      .filter(item => item.similarity >= threshold)
      .sort((a, b) => b.similarity - a.similarity)
      .slice(0, maxResults);

    console.log(`[EMBEDDINGS] Found ${results.length} similar items (threshold: ${threshold})`);

    if (results.length > 0) {
      console.log('[EMBEDDINGS] Top 10 similarity matches:');
      results.slice(0, 10).forEach((item, idx) => {
        console.log(` ${idx + 1}. ${item.name} (${item.type}) = ${item.similarity.toFixed(4)}`);
      });
    }

    return results;
  }

  /**
   * Scores tools by plain text overlap with the query when no vectors are
   * available. Weights: name contains query 0.8, description contains query
   * 0.6, share of matching tags up to 0.4, share of matching query words
   * (longer than two characters) up to 0.3; the sum is capped at 1.0.
   */
  private async searchByTextFallback(query: string, maxResults: number, threshold: number): Promise<SimilarityResult[]> {
    console.log(`[EMBEDDINGS] No embeddings data, using fallback text matching: ${query}`);

    const { getToolsData } = await import('./dataService.js');
    const toolsData = await getToolsData();

    const queryLower = query.toLowerCase();
    const queryWords = queryLower.split(/\s+/).filter(w => w.length > 2);

    const scored: SimilarityResult[] = toolsData.tools.map((tool: any): SimilarityResult => {
      let similarity = 0;

      // Name matching
      if (tool.name.toLowerCase().includes(queryLower)) {
        similarity += 0.8;
      }

      // Description matching
      if (tool.description && tool.description.toLowerCase().includes(queryLower)) {
        similarity += 0.6;
      }

      // Tag matching (either direction counts as a match)
      if (tool.tags && Array.isArray(tool.tags)) {
        const matchingTags = tool.tags.filter((tag: string) =>
          tag.toLowerCase().includes(queryLower) || queryLower.includes(tag.toLowerCase())
        );
        if (tool.tags.length > 0) {
          similarity += (matchingTags.length / tool.tags.length) * 0.4;
        }
      }

      // Word-level matching
      const toolText = `${tool.name} ${tool.description || ''} ${(tool.tags || []).join(' ')}`.toLowerCase();
      const matchingWords = queryWords.filter(word => toolText.includes(word));
      if (queryWords.length > 0) {
        similarity += (matchingWords.length / queryWords.length) * 0.3;
      }

      return {
        id: `tool_${tool.name.replace(/[^a-zA-Z0-9]/g, '_').toLowerCase()}`,
        type: 'tool' as const,
        name: tool.name,
        content: toolText,
        embedding: [], // Empty for fallback results
        metadata: {
          domains: tool.domains || [],
          phases: tool.phases || [],
          tags: tool.tags || [],
          skillLevel: tool.skillLevel,
          type: tool.type
        },
        similarity: Math.min(similarity, 1.0)
      };
    });

    const similarities = scored
      .filter(item => item.similarity >= threshold)
      .sort((a, b) => b.similarity - a.similarity)
      .slice(0, maxResults);

    console.log(`[EMBEDDINGS] Fallback found ${similarities.length} similar items`);
    return similarities;
  }

  /**
   * Returns the currently known enabled state. When the service is neither
   * enabled nor initialised, a background re-check is triggered so that
   * late-loaded environment variables are picked up by a later call.
   */
  isEnabled(): boolean {
    if (!this.enabled && !this.isInitialized) {
      // Fire and forget: the caller gets the current status immediately.
      void this.checkEnabledStatus().catch(console.error);
    }
    return this.enabled;
  }

  /** Snapshot of the service state for diagnostics. */
  getStats(): { enabled: boolean; initialized: boolean; count: number } {
    return {
      enabled: this.enabled,
      initialized: this.isInitialized,
      count: this.embeddings.length
    };
  }
}

// No auto-initialisation: the service initialises lazily when first needed.
const embeddingsService = new EmbeddingsService();

export { embeddingsService, type EmbeddingData, type SimilarityResult };

/** Utility functions for debugging the embeddings service. */
export const debugEmbeddings = {
  /** Resets the service flags and re-checks the environment. */
  async recheckEnvironment() {
    return embeddingsService.forceRecheckEnvironment();
  },
  /** Returns the current service statistics. */
  getStatus() {
    return embeddingsService.getStats();
  }
};