// src/utils/embeddings.ts
import { createHash } from 'crypto';
import { promises as fs } from 'fs';
import path from 'path';
import { getCompressedToolsDataForAI } from './dataService.js';

/** One embedded catalogue entry (tool or concept) together with its vector. */
interface EmbeddingData {
  id: string;
  type: 'tool' | 'concept';
  name: string;
  /** Lower-cased text that was sent to the embeddings API for this entry. */
  content: string;
  embedding: number[];
  metadata: {
    domains?: string[];
    phases?: string[];
    tags?: string[];
    skillLevel?: string;
    type?: string;
  };
}

/** On-disk cache format written to `data/embeddings.json`. */
interface EmbeddingsDatabase {
  /** Hash of the tools data the embeddings were generated from. */
  version: string;
  /** `Date.now()` timestamp of the moment the cache was written. */
  lastUpdated: number;
  embeddings: EmbeddingData[];
}

/** An embedded entry annotated with its cosine similarity to a query. */
interface EmbeddingSearchResult extends EmbeddingData {
  similarity: number;
}

/**
 * Generates, caches and searches embeddings for the tools/concepts catalogue.
 *
 * Configuration is read from environment variables:
 * - `AI_EMBEDDINGS_ENABLED` (must be the string `'true'` to enable the service)
 * - `AI_EMBEDDINGS_BATCH_SIZE` (default 20) and `AI_EMBEDDINGS_BATCH_DELAY_MS` (default 1000)
 * - `AI_EMBEDDINGS_ENDPOINT`, `AI_EMBEDDINGS_API_KEY`, `AI_EMBEDDINGS_MODEL`
 */
class EmbeddingsService {
  private embeddings: EmbeddingData[] = [];
  private isInitialized = false;
  private readonly embeddingsPath = path.join(process.cwd(), 'data', 'embeddings.json');
  private readonly batchSize: number;
  private readonly batchDelay: number;
  private readonly enabled: boolean;

  constructor() {
    this.enabled = process.env.AI_EMBEDDINGS_ENABLED === 'true';
    // A batch size below 1 (or NaN) would make the batching loop send empty
    // batches or never advance, so invalid values fall back to the default.
    this.batchSize = EmbeddingsService.readIntEnv(process.env.AI_EMBEDDINGS_BATCH_SIZE, 20, 1);
    this.batchDelay = EmbeddingsService.readIntEnv(process.env.AI_EMBEDDINGS_BATCH_DELAY_MS, 1000, 0);
  }

  /**
   * Parses an integer environment value.
   *
   * @param raw - Raw environment string (may be undefined or empty).
   * @param fallback - Value used when `raw` is missing, not a number, or below `min`.
   * @param min - Smallest accepted value.
   * @returns The parsed integer, or `fallback` when the input is unusable.
   */
  private static readIntEnv(raw: string | undefined, fallback: number, min: number): number {
    const parsed = Number.parseInt(raw ?? '', 10);
    return Number.isNaN(parsed) || parsed < min ? fallback : parsed;
  }

  /**
   * Loads cached embeddings when their version matches the current tools data,
   * otherwise regenerates them through the embeddings API.
   *
   * Never rejects: failures are logged and leave the service uninitialized.
   */
  async initialize(): Promise<void> {
    if (!this.enabled) {
      console.log('[EMBEDDINGS] Embeddings disabled, skipping initialization');
      return;
    }

    try {
      console.log('[EMBEDDINGS] Initializing embeddings system...');

      // Create data directory if it doesn't exist
      await fs.mkdir(path.dirname(this.embeddingsPath), { recursive: true });

      const toolsData = await getCompressedToolsDataForAI();
      const currentDataHash = this.hashData(toolsData);

      // Try to load existing embeddings
      const existingEmbeddings = await this.loadEmbeddings();

      if (existingEmbeddings && existingEmbeddings.version === currentDataHash) {
        console.log('[EMBEDDINGS] Using cached embeddings');
        this.embeddings = existingEmbeddings.embeddings;
      } else {
        console.log('[EMBEDDINGS] Generating new embeddings...');
        await this.generateEmbeddings(toolsData, currentDataHash);
      }

      this.isInitialized = true;
      console.log(`[EMBEDDINGS] Initialized with ${this.embeddings.length} embeddings`);
    } catch (error) {
      console.error('[EMBEDDINGS] Failed to initialize:', error);
      this.isInitialized = false;
    }
  }

  /**
   * Computes the cache version for a data set.
   *
   * Uses SHA-256 over the complete JSON serialization so that a change anywhere
   * in the data produces a different version. (Truncating a base64 encoding of
   * the JSON would only reflect its first few bytes.)
   */
  private hashData(data: any): string {
    return createHash('sha256').update(JSON.stringify(data)).digest('hex');
  }

  /**
   * Reads the embeddings cache from disk.
   *
   * @returns The parsed cache, or `null` when the file is missing, unreadable,
   *          not valid JSON, or lacks a string `version` / array `embeddings`.
   */
  private async loadEmbeddings(): Promise<EmbeddingsDatabase | null> {
    let raw: string;
    try {
      raw = await fs.readFile(this.embeddingsPath, 'utf8');
    } catch {
      console.log('[EMBEDDINGS] No existing embeddings found');
      return null;
    }

    try {
      const parsed: unknown = JSON.parse(raw);
      if (
        typeof parsed === 'object' &&
        parsed !== null &&
        typeof (parsed as { version?: unknown }).version === 'string' &&
        Array.isArray((parsed as { embeddings?: unknown }).embeddings)
      ) {
        return parsed as EmbeddingsDatabase;
      }
      console.warn('[EMBEDDINGS] Cached embeddings file has an unexpected shape, ignoring it');
    } catch (error) {
      console.warn('[EMBEDDINGS] Cached embeddings file is not valid JSON, ignoring it:', error);
    }
    return null;
  }

  /** Writes the in-memory embeddings to disk, tagged with `version`. */
  private async saveEmbeddings(version: string): Promise<void> {
    const database: EmbeddingsDatabase = {
      version,
      lastUpdated: Date.now(),
      embeddings: this.embeddings
    };

    await fs.writeFile(this.embeddingsPath, JSON.stringify(database, null, 2));
    console.log(`[EMBEDDINGS] Saved ${this.embeddings.length} embeddings to disk`);
  }

  /**
   * Builds the text that is embedded for an item: name, description, tags,
   * domains and phases joined by spaces and lower-cased. Empty parts are dropped.
   */
  private createContentString(item: any): string {
    const parts = [
      item.name,
      item.description || '',
      ...(item.tags || []),
      ...(item.domains || []),
      ...(item.phases || [])
    ];

    return parts.filter(Boolean).join(' ').toLowerCase();
  }

  /**
   * Requests embeddings for a batch of texts from the configured endpoint.
   *
   * Sends `{ model, input }` as JSON with a bearer token and reads the vectors
   * from a response body of the form `{ data: [{ embedding: number[] }, ...] }`.
   *
   * @param contents - Texts to embed.
   * @returns One embedding per entry of `contents`, in response order.
   * @throws Error when configuration is missing, the HTTP status is not OK, or
   *         the response does not contain exactly one entry per input.
   */
  private async generateEmbeddingsBatch(contents: string[]): Promise<number[][]> {
    const endpoint = process.env.AI_EMBEDDINGS_ENDPOINT;
    const apiKey = process.env.AI_EMBEDDINGS_API_KEY;
    const model = process.env.AI_EMBEDDINGS_MODEL;

    if (!endpoint || !apiKey || !model) {
      throw new Error('Missing embeddings API configuration');
    }

    const response = await fetch(endpoint, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'Authorization': `Bearer ${apiKey}`
      },
      body: JSON.stringify({
        model,
        input: contents
      })
    });

    if (!response.ok) {
      const error = await response.text();
      throw new Error(`Embeddings API error: ${response.status} - ${error}`);
    }

    const payload: unknown = await response.json();
    const rows = (payload as { data?: unknown } | null)?.data;

    // Callers pair embeddings with inputs by position, so a count mismatch
    // would silently attach vectors to the wrong items.
    if (!Array.isArray(rows) || rows.length !== contents.length) {
      throw new Error(
        `Embeddings API returned an unexpected payload (expected ${contents.length} embeddings)`
      );
    }

    return rows.map(row => row.embedding as number[]);
  }

  /**
   * Embeds every tool and concept in `toolsData` in rate-limited batches and
   * persists the result.
   *
   * The new set replaces `this.embeddings` only after all batches succeeded, so
   * a failure part-way through does not leave a partially filled index behind.
   *
   * @param toolsData - Object with `tools` and `concepts` arrays.
   * @param version - Cache version to store alongside the embeddings.
   * @throws Rethrows any batch failure after logging it.
   */
  private async generateEmbeddings(toolsData: any, version: string): Promise<void> {
    const allItems = [
      ...(toolsData.tools ?? []).map((tool: any) => ({ ...tool, type: 'tool' })),
      ...(toolsData.concepts ?? []).map((concept: any) => ({ ...concept, type: 'concept' }))
    ];

    const contents = allItems.map(item => this.createContentString(item));
    const totalBatches = Math.ceil(contents.length / this.batchSize);
    const generated: EmbeddingData[] = [];

    // Process in batches to respect rate limits
    for (let i = 0; i < contents.length; i += this.batchSize) {
      const batch = contents.slice(i, i + this.batchSize);
      const batchItems = allItems.slice(i, i + this.batchSize);
      // `i` is always a multiple of batchSize, so this is a whole number.
      const batchNumber = i / this.batchSize + 1;

      console.log(`[EMBEDDINGS] Processing batch ${batchNumber} of ${totalBatches}`);

      try {
        const embeddings = await this.generateEmbeddingsBatch(batch);

        embeddings.forEach((embedding, index) => {
          const item = batchItems[index];
          generated.push({
            id: `${item.type}_${item.name.replace(/[^a-zA-Z0-9]/g, '_')}`,
            type: item.type,
            name: item.name,
            content: batch[index],
            embedding,
            metadata: {
              domains: item.domains,
              phases: item.phases,
              tags: item.tags,
              skillLevel: item.skillLevel,
              type: item.type
            }
          });
        });

        // Rate limiting delay between batches
        if (i + this.batchSize < contents.length) {
          await new Promise(resolve => setTimeout(resolve, this.batchDelay));
        }
      } catch (error) {
        console.error(`[EMBEDDINGS] Failed to process batch ${batchNumber}:`, error);
        throw error;
      }
    }

    this.embeddings = generated;
    await this.saveEmbeddings(version);
  }

  /**
   * Embeds a single text (lower-cased first, like the indexed content).
   *
   * @throws Error when the embeddings API is not configured or the request fails.
   */
  public async embedText(text: string): Promise<number[]> {
    // Re-use the private batch helper to avoid auth duplication
    const [embedding] = await this.generateEmbeddingsBatch([text.toLowerCase()]);
    return embedding;
  }

  /**
   * Cosine similarity of two vectors.
   *
   * @returns A value in [-1, 1], or 0 when the vectors differ in length, are
   *          empty, or either has zero magnitude (where the quotient would be NaN).
   */
  private cosineSimilarity(a: number[], b: number[]): number {
    if (a.length === 0 || a.length !== b.length) {
      return 0;
    }

    let dotProduct = 0;
    let normA = 0;
    let normB = 0;

    for (let i = 0; i < a.length; i++) {
      dotProduct += a[i] * b[i];
      normA += a[i] * a[i];
      normB += b[i] * b[i];
    }

    const denominator = Math.sqrt(normA) * Math.sqrt(normB);
    return denominator === 0 ? 0 : dotProduct / denominator;
  }

  /**
   * Finds the indexed entries most similar to `query`.
   *
   * @param query - Free-text query; lower-cased before embedding.
   * @param maxResults - Maximum number of results returned.
   * @param threshold - Minimum cosine similarity for an entry to be included.
   * @returns Matches sorted by descending similarity. Returns an empty array
   *          when the service is disabled, uninitialized, empty, or the query
   *          embedding request fails (the failure is logged).
   */
  async findSimilar(
    query: string,
    maxResults: number = 30,
    threshold: number = 0.3
  ): Promise<EmbeddingSearchResult[]> {
    if (!this.enabled || !this.isInitialized || this.embeddings.length === 0) {
      return [];
    }

    try {
      const queryEmbedding = await this.embedText(query);

      const similarities = this.embeddings.map(item => ({
        ...item,
        similarity: this.cosineSimilarity(queryEmbedding, item.embedding)
      }));

      return similarities
        .filter(item => item.similarity >= threshold)
        .sort((a, b) => b.similarity - a.similarity)
        .slice(0, maxResults);
    } catch (error) {
      console.error('[EMBEDDINGS] Failed to find similar items:', error);
      return [];
    }
  }

  /** True only when the service is enabled and initialization succeeded. */
  isEnabled(): boolean {
    return this.enabled && this.isInitialized;
  }

  /** Snapshot of the service state: enabled flag, initialized flag, entry count. */
  getStats(): { enabled: boolean; initialized: boolean; count: number } {
    return {
      enabled: this.enabled,
      initialized: this.isInitialized,
      count: this.embeddings.length
    };
  }
}

const embeddingsService = new EmbeddingsService();

export { embeddingsService, type EmbeddingData, type EmbeddingSearchResult };

// Kick off initialization on module load, except in a browser context or under tests.
if (typeof window === 'undefined' && process.env.NODE_ENV !== 'test') {
  embeddingsService.initialize().catch(error => {
    console.error('[EMBEDDINGS] Auto-initialization failed:', error);
  });
}