// src/utils/embeddings.ts - Refactored
import { promises as fs } from 'fs';
import path from 'path';
import { getCompressedToolsDataForAI } from './dataService.js';
import 'dotenv/config';
import crypto from 'crypto';

/** A tool or concept together with its embedding vector and descriptive metadata. */
export interface EmbeddingData {
  id: string;
  type: 'tool' | 'concept';
  name: string;
  /** Lower-cased text that was sent to the embeddings API for this item. */
  content: string;
  embedding: number[];
  metadata: {
    domains?: string[];
    phases?: string[];
    tags?: string[];
    skillLevel?: string;
    type?: string;
  };
}

/** An embedded item annotated with its cosine similarity to a query. */
export interface SimilarityResult extends EmbeddingData {
  similarity: number;
}

/** On-disk cache format written to `data/embeddings.json`. */
interface EmbeddingsDatabase {
  /** SHA-256 hex digest of the tools file the embeddings were generated from. */
  version: string;
  lastUpdated: number;
  embeddings: EmbeddingData[];
}

interface EmbeddingsConfig {
  enabled: boolean;
  endpoint?: string;
  apiKey?: string;
  model?: string;
  batchSize: number;
  batchDelay: number;
}

/** The two response layouts accepted from the embeddings endpoint. */
interface EmbeddingsApiResponse {
  embeddings?: unknown;
  data?: unknown;
}

const DEFAULT_BATCH_SIZE = 20;
const DEFAULT_BATCH_DELAY_MS = 1000;

/**
 * Parses an integer setting, falling back when the value is missing,
 * not a number, or below `minimum`.
 */
function parseIntegerSetting(raw: string | undefined, fallback: number, minimum: number): number {
  const parsed = parseInt(raw || '', 10);
  return Number.isNaN(parsed) || parsed < minimum ? fallback : parsed;
}

/**
 * Generates, caches and searches embedding vectors for the tools and concepts
 * returned by `getCompressedToolsDataForAI`.
 */
class EmbeddingsService {
  private embeddings: EmbeddingData[] = [];
  private isInitialized = false;
  private initializationPromise: Promise<void> | null = null;
  private readonly embeddingsPath = path.join(process.cwd(), 'data', 'embeddings.json');
  private readonly config: EmbeddingsConfig;

  constructor() {
    this.config = this.loadConfig();
    console.log('[EMBEDDINGS-SERVICE] Initialized:', {
      enabled: this.config.enabled,
      hasEndpoint: !!this.config.endpoint,
      hasModel: !!this.config.model
    });
  }

  /** Reads the service configuration from `AI_EMBEDDINGS_*` environment variables. */
  private loadConfig(): EmbeddingsConfig {
    return {
      enabled: process.env.AI_EMBEDDINGS_ENABLED === 'true',
      endpoint: process.env.AI_EMBEDDINGS_ENDPOINT,
      apiKey: process.env.AI_EMBEDDINGS_API_KEY,
      model: process.env.AI_EMBEDDINGS_MODEL,
      // A batch size of 0 or NaN would stall or skip the batching loop, so
      // anything below 1 falls back to the default.
      batchSize: parseIntegerSetting(process.env.AI_EMBEDDINGS_BATCH_SIZE, DEFAULT_BATCH_SIZE, 1),
      batchDelay: parseIntegerSetting(process.env.AI_EMBEDDINGS_BATCH_DELAY_MS, DEFAULT_BATCH_DELAY_MS, 0)
    };
  }

  /**
   * Initializes the service. Calls made while an initialization is in flight
   * share the same promise.
   *
   * @throws Whatever the underlying initialization throws (file, network or API errors).
   */
  async initialize(): Promise<void> {
    if (this.initializationPromise) {
      return this.initializationPromise;
    }
    if (this.isInitialized) {
      return;
    }

    const pending = this.performInitialization();
    this.initializationPromise = pending;
    try {
      await pending;
    } finally {
      // Reset here rather than inside performInitialization(): when the service
      // is disabled that method completes before the promise is stored above,
      // so a reset done there would be overwritten by the assignment.
      if (this.initializationPromise === pending) {
        this.initializationPromise = null;
      }
    }
  }

  /** Loads cached embeddings when their version matches the tools file hash, otherwise regenerates them. */
  private async performInitialization(): Promise<void> {
    const initStart = Date.now();
    try {
      console.log('[EMBEDDINGS-SERVICE] Starting initialization');

      if (!this.config.enabled) {
        console.log('[EMBEDDINGS-SERVICE] Service disabled via configuration');
        return;
      }

      await fs.mkdir(path.dirname(this.embeddingsPath), { recursive: true });

      const toolsData = await getCompressedToolsDataForAI();
      const currentDataHash = await this.hashToolsFile();
      const existing = await this.loadEmbeddings();

      const cacheIsUsable =
        existing !== null &&
        existing.version === currentDataHash &&
        Array.isArray(existing.embeddings) &&
        existing.embeddings.length > 0;

      if (existing !== null && cacheIsUsable) {
        console.log('[EMBEDDINGS-SERVICE] Using cached embeddings');
        this.embeddings = existing.embeddings;
      } else {
        console.log('[EMBEDDINGS-SERVICE] Generating new embeddings');
        await this.generateEmbeddings(toolsData, currentDataHash);
      }

      this.isInitialized = true;
      console.log(`[EMBEDDINGS-SERVICE] Initialized successfully with ${this.embeddings.length} embeddings in ${Date.now() - initStart}ms`);
    } catch (error) {
      console.error('[EMBEDDINGS-SERVICE] Initialization failed:', error);
      this.isInitialized = false;
      throw error;
    }
  }

  /** Returns the SHA-256 hex digest of `src/data/tools.yaml`; used as the cache version. */
  private async hashToolsFile(): Promise<string> {
    const file = path.join(process.cwd(), 'src', 'data', 'tools.yaml');
    const raw = await fs.readFile(file, 'utf8');
    return crypto.createHash('sha256').update(raw).digest('hex');
  }

  /**
   * Reads the cache file from disk.
   *
   * @returns The parsed cache, or `null` when the file is missing or unreadable.
   */
  private async loadEmbeddings(): Promise<EmbeddingsDatabase | null> {
    try {
      const data = await fs.readFile(this.embeddingsPath, 'utf8');
      return JSON.parse(data) as EmbeddingsDatabase;
    } catch (error) {
      const isMissingFile =
        error instanceof Error && (error as NodeJS.ErrnoException).code === 'ENOENT';
      if (isMissingFile) {
        console.log('[EMBEDDINGS-SERVICE] No existing embeddings file found');
      } else {
        // A corrupt or unreadable cache is not fatal: the caller regenerates it.
        console.warn('[EMBEDDINGS-SERVICE] Existing embeddings file could not be read, ignoring it:', error);
      }
      return null;
    }
  }

  /** Writes the in-memory embeddings to the cache file, tagged with `version`. */
  private async saveEmbeddings(version: string): Promise<void> {
    const database: EmbeddingsDatabase = {
      version,
      lastUpdated: Date.now(),
      embeddings: this.embeddings
    };
    await fs.writeFile(this.embeddingsPath, JSON.stringify(database, null, 2));
    console.log(`[EMBEDDINGS-SERVICE] Saved ${this.embeddings.length} embeddings to disk`);
  }

  /** Joins an item's name, description, tags, domains and phases into one lower-cased string. */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- item shape is owned by dataService
  private createContentString(item: any): string {
    const parts = [
      item.name,
      item.description || '',
      ...(item.tags || []),
      ...(item.domains || []),
      ...(item.phases || [])
    ];
    return parts.filter(Boolean).join(' ').toLowerCase();
  }

  /**
   * Requests embeddings for `contents` from the configured endpoint.
   *
   * Accepts responses shaped as `{ embeddings: number[][] }` or
   * `{ data: [{ embedding: number[] }, ...] }`.
   *
   * @returns One vector per input, in the order returned by the API.
   * @throws If the endpoint or model is not configured, the HTTP status is not OK,
   *         the response layout is not recognized, or the number of vectors does
   *         not match the number of inputs.
   */
  private async generateEmbeddingsBatch(contents: string[]): Promise<number[][]> {
    if (!this.config.endpoint || !this.config.model) {
      throw new Error('Missing embeddings API configuration');
    }

    const headers: Record<string, string> = { 'Content-Type': 'application/json' };
    if (this.config.apiKey) {
      headers['Authorization'] = `Bearer ${this.config.apiKey}`;
    }

    const response = await fetch(this.config.endpoint, {
      method: 'POST',
      headers,
      body: JSON.stringify({ model: this.config.model, input: contents })
    });

    if (!response.ok) {
      const error = await response.text();
      throw new Error(`Embeddings API error: ${response.status} - ${error}`);
    }

    const payload = (await response.json()) as EmbeddingsApiResponse | null;

    let vectors: unknown[];
    if (payload && Array.isArray(payload.embeddings)) {
      vectors = payload.embeddings;
    } else if (payload && Array.isArray(payload.data)) {
      // eslint-disable-next-line @typescript-eslint/no-explicit-any -- external API payload
      vectors = payload.data.map((item: any) => (item ? item.embedding : undefined));
    } else {
      throw new Error('Unknown embeddings API response format');
    }

    if (!vectors.every(vector => Array.isArray(vector))) {
      throw new Error('Unknown embeddings API response format');
    }
    // Callers pair vectors with inputs by index, so a short or long response
    // must be rejected here instead of surfacing later as an undefined access.
    if (vectors.length !== contents.length) {
      throw new Error(`Embeddings API returned ${vectors.length} embeddings for ${contents.length} inputs`);
    }

    return vectors as number[][];
  }

  /**
   * Embeds every tool and concept in `toolsData` in batches of `config.batchSize`,
   * pausing `config.batchDelay` ms between batches, then saves the result to disk.
   *
   * @throws Rethrows the first batch failure; in that case no partial result is kept.
   */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- toolsData shape is owned by dataService
  private async generateEmbeddings(toolsData: any, version: string): Promise<void> {
    const allItems = [
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      ...(toolsData.tools || []).map((tool: any) => ({ ...tool, type: 'tool' })),
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      ...(toolsData.concepts || []).map((concept: any) => ({ ...concept, type: 'concept' }))
    ];
    const entries = allItems.map(item => ({ item, content: this.createContentString(item) }));

    const { batchSize, batchDelay } = this.config;
    const totalBatches = Math.ceil(entries.length / batchSize);

    // Collected locally and published only after every batch succeeded, so a
    // failure part-way through does not leave a partial list on the instance.
    const generated: EmbeddingData[] = [];
    this.embeddings = [];

    console.log(`[EMBEDDINGS-SERVICE] Generating embeddings for ${entries.length} items`);

    for (let start = 0; start < entries.length; start += batchSize) {
      const batch = entries.slice(start, start + batchSize);
      const batchNumber = Math.floor(start / batchSize) + 1;

      console.log(`[EMBEDDINGS-SERVICE] Processing batch ${batchNumber}/${totalBatches}`);

      try {
        const vectors = await this.generateEmbeddingsBatch(batch.map(entry => entry.content));

        batch.forEach(({ item, content }, index) => {
          generated.push({
            id: `${item.type}_${item.name.replace(/[^a-zA-Z0-9]/g, '_')}`,
            type: item.type,
            name: item.name,
            content,
            embedding: vectors[index],
            metadata: {
              domains: item.domains,
              phases: item.phases,
              tags: item.tags,
              skillLevel: item.skillLevel,
              type: item.type
            }
          });
        });

        // No delay after the final batch.
        if (start + batchSize < entries.length) {
          await new Promise<void>(resolve => setTimeout(resolve, batchDelay));
        }
      } catch (error) {
        console.error(`[EMBEDDINGS-SERVICE] Batch ${batchNumber} failed:`, error);
        throw error;
      }
    }

    this.embeddings = generated;
    await this.saveEmbeddings(version);
  }

  /**
   * Embeds a single piece of text (lower-cased before it is sent).
   *
   * @throws If the service is disabled or not yet initialized, or the API call fails.
   */
  async embedText(text: string): Promise<number[]> {
    if (!this.isEnabled() || !this.isInitialized) {
      throw new Error('Embeddings service not available');
    }
    const [embedding] = await this.generateEmbeddingsBatch([text.toLowerCase()]);
    return embedding;
  }

  /** Resolves once the service is initialized; resolves immediately when it is disabled. */
  async waitForInitialization(): Promise<void> {
    if (!this.config.enabled || this.isInitialized) {
      return;
    }
    // initialize() joins an in-flight initialization if there is one.
    await this.initialize();
  }

  /**
   * Cosine similarity of two vectors.
   *
   * @returns 0 when the vectors differ in length, are empty, or either has zero
   *          norm, so callers never receive `NaN`.
   */
  private cosineSimilarity(a: number[], b: number[]): number {
    if (a.length === 0 || a.length !== b.length) {
      return 0;
    }

    let dotProduct = 0;
    let normA = 0;
    let normB = 0;
    for (let i = 0; i < a.length; i++) {
      dotProduct += a[i] * b[i];
      normA += a[i] * a[i];
      normB += b[i] * b[i];
    }

    const denominator = Math.sqrt(normA) * Math.sqrt(normB);
    if (denominator === 0) {
      return 0;
    }
    const similarity = dotProduct / denominator;
    return Number.isFinite(similarity) ? similarity : 0;
  }

  /**
   * Finds the stored items most similar to `query`.
   *
   * The effective threshold is the larger of `threshold` and 85% of the best
   * score, so only items close to the top match are returned.
   *
   * @param query Free text; lower-cased before embedding.
   * @param maxResults Maximum number of items returned.
   * @param threshold Minimum similarity an item must reach.
   * @returns Matches sorted by descending similarity; an empty array when the
   *          service is disabled, not initialized, or the search fails.
   */
  async findSimilar(query: string, maxResults: number = 30, threshold: number = 0.3): Promise<SimilarityResult[]> {
    if (!this.config.enabled) {
      console.log('[EMBEDDINGS-SERVICE] Service disabled, returning empty results');
      return [];
    }

    if (!this.isInitialized || this.embeddings.length === 0) {
      console.log('[EMBEDDINGS-SERVICE] Not initialized or no embeddings available');
      return [];
    }

    try {
      console.log(`[EMBEDDINGS-SERVICE] Finding similar items for query: "${query}"`);

      const [queryEmbedding] = await this.generateEmbeddingsBatch([query.toLowerCase()]);

      const similarities: SimilarityResult[] = this.embeddings.map(item => ({
        ...item,
        similarity: this.cosineSimilarity(queryEmbedding, item.embedding)
      }));

      // reduce() instead of Math.max(...scores): spreading passes every score as
      // a separate call argument.
      const topScore = similarities.reduce(
        (best, candidate) => Math.max(best, candidate.similarity),
        -Infinity
      );
      const dynamicThreshold = Math.max(threshold, topScore * 0.85);

      const results = similarities
        .filter(item => item.similarity >= dynamicThreshold)
        .sort((a, b) => b.similarity - a.similarity)
        .slice(0, maxResults);

      console.log(`[EMBEDDINGS-SERVICE] Found ${results.length} similar items (threshold: ${dynamicThreshold.toFixed(3)})`);

      if (results.length > 0) {
        console.log('[EMBEDDINGS-SERVICE] Top 5 matches:');
        results.slice(0, 5).forEach((item, idx) => {
          console.log(` ${idx + 1}. ${item.name} (${item.type}) = ${item.similarity.toFixed(4)}`);
        });
      }

      return results;
    } catch (error) {
      // Search is best-effort: failures are logged and reported as "no matches".
      console.error('[EMBEDDINGS-SERVICE] Similarity search failed:', error);
      return [];
    }
  }

  /** Whether embeddings are enabled via `AI_EMBEDDINGS_ENABLED`. */
  isEnabled(): boolean {
    return this.config.enabled;
  }

  /** Snapshot of the enabled flag, initialization state and number of loaded embeddings. */
  getStats(): { enabled: boolean; initialized: boolean; count: number } {
    return {
      enabled: this.config.enabled,
      initialized: this.isInitialized,
      count: this.embeddings.length
    };
  }

  /** Returns a shallow copy of the active configuration. */
  getConfig(): EmbeddingsConfig {
    return { ...this.config };
  }
}

export const embeddingsService = new EmbeddingsService();