// src/utils/embeddings.ts
|
||
import { createHash } from 'crypto';
import { promises as fs } from 'fs';
import path from 'path';

import { getCompressedToolsDataForAI } from './dataService.js';
|
||
|
||
/**
 * One embedded catalogue entry (a tool or a concept) together with the
 * vector produced for it and the descriptive fields copied from the entry.
 */
interface EmbeddingData {
  /** Identifier of the form `<type>_<sanitised name>`. */
  id: string;
  /** Which catalogue list the entry came from. */
  type: 'tool' | 'concept';
  /** Entry name as it appears in the source data. */
  name: string;
  /** Lower-cased text that was sent to the embeddings API for this entry. */
  content: string;
  /** Embedding vector; left empty for results built by the text fallback. */
  embedding: number[];
  /** Optional descriptive fields carried over from the source entry. */
  metadata: {
    domains?: string[];
    phases?: string[];
    tags?: string[];
    skillLevel?: string;
    type?: string;
  };
}
|
||
|
||
/** On-disk layout of the embeddings cache file. */
interface EmbeddingsDatabase {
  /** Hash of the tools data the vectors were generated from. */
  version: string;
  /** `Date.now()` timestamp taken when the file was written. */
  lastUpdated: number;
  /** All stored entries. */
  embeddings: EmbeddingData[];
}
|
||
|
||
/** An embedding entry annotated with its score against a query. */
interface SimilarityResult extends EmbeddingData {
  /**
   * Cosine similarity to the query embedding, or the heuristic text-match
   * score (capped at 1.0) when the fallback search is used.
   */
  similarity: number;
}
|
||
|
||
/**
 * Builds, caches and queries embedding vectors for the tools/concepts
 * catalogue.
 *
 * Configuration is read from the environment:
 * - `AI_EMBEDDINGS_ENABLED`, `AI_EMBEDDINGS_ENDPOINT`, `AI_EMBEDDINGS_MODEL`,
 *   `AI_EMBEDDINGS_API_KEY` control whether and how the embeddings API is used;
 * - `AI_EMBEDDINGS_BATCH_SIZE` (default 20) and `AI_EMBEDDINGS_BATCH_DELAY_MS`
 *   (default 1000) control request batching.
 *
 * Vectors are cached in `<cwd>/data/embeddings.json`, keyed by a hash of the
 * source data so that any change to the data triggers regeneration.
 */
class EmbeddingsService {
  private embeddings: EmbeddingData[] = [];
  private isInitialized = false;
  private initializationPromise: Promise<void> | null = null;
  private readonly embeddingsPath = path.join(process.cwd(), 'data', 'embeddings.json');
  private readonly batchSize: number;
  private readonly batchDelay: number;
  // Starts optimistically enabled; the real status is resolved asynchronously
  // by checkEnabledStatus() during initialization.
  private enabled = true;

  constructor() {
    // A batch size below 1 (or a non-numeric value) would make the batching
    // loop spin forever or send empty batches, so fall back to the default.
    this.batchSize = EmbeddingsService.readIntSetting(process.env.AI_EMBEDDINGS_BATCH_SIZE, 20, 1);
    this.batchDelay = EmbeddingsService.readIntSetting(process.env.AI_EMBEDDINGS_BATCH_DELAY_MS, 1000, 0);
  }

  /**
   * Parses an integer setting.
   *
   * @param raw - Raw environment value, possibly undefined or empty.
   * @param fallback - Value used when `raw` is missing, not a number, or below `min`.
   * @param min - Smallest acceptable value.
   * @returns The parsed integer, or `fallback`.
   */
  private static readIntSetting(raw: string | undefined, fallback: number, min: number): number {
    const parsed = Number.parseInt(raw ?? '', 10);
    return Number.isFinite(parsed) && parsed >= min ? parsed : fallback;
  }

  /**
   * Recomputes `this.enabled` from the environment and the cache file.
   *
   * Enabled when `AI_EMBEDDINGS_ENABLED === 'true'` and both endpoint and model
   * are configured, or, when the flag is anything else, when a cache file
   * already exists on disk. Never rejects: any failure disables the service.
   *
   * NOTE(review): an explicit `AI_EMBEDDINGS_ENABLED=false` is still overridden
   * by an existing cache file - confirm this is intended.
   */
  private async checkEnabledStatus(): Promise<void> {
    try {
      console.log('[EMBEDDINGS] Debug env check:', {
        AI_EMBEDDINGS_ENABLED: process.env.AI_EMBEDDINGS_ENABLED,
        envKeys: Object.keys(process.env).filter(k => k.includes('EMBEDDINGS')).length,
        allEnvKeys: Object.keys(process.env).length
      });

      if (process.env.AI_EMBEDDINGS_ENABLED === 'true') {
        const endpoint = process.env.AI_EMBEDDINGS_ENDPOINT;
        const model = process.env.AI_EMBEDDINGS_MODEL;

        if (!endpoint || !model) {
          console.warn('[EMBEDDINGS] Embeddings enabled but API configuration missing - disabling');
          this.enabled = false;
          return;
        }

        console.log('[EMBEDDINGS] All requirements met - enabling embeddings');
        this.enabled = true;
        return;
      }

      // Not explicitly enabled: fall back to whether a cache file is present.
      try {
        await fs.stat(this.embeddingsPath);
        console.log('[EMBEDDINGS] Existing embeddings file found - enabling');
        this.enabled = true;
      } catch {
        console.log('[EMBEDDINGS] Embeddings not explicitly enabled - disabling');
        this.enabled = false;
      }
    } catch (error: unknown) {
      console.error('[EMBEDDINGS] Error checking enabled status:', error);
      this.enabled = false;
    }
  }

  /**
   * Initializes the service once. Concurrent callers share the in-flight
   * promise; calls after a successful initialization resolve immediately.
   *
   * @throws Whatever the underlying initialization throws (data loading,
   *   embeddings API, or file-system errors).
   */
  async initialize(): Promise<void> {
    if (this.initializationPromise) {
      return this.initializationPromise;
    }
    if (this.isInitialized) {
      return;
    }

    this.initializationPromise = this.performInitialization();
    return this.initializationPromise;
  }

  /**
   * Loads cached vectors when they match the current data hash, otherwise
   * regenerates and persists them.
   */
  private async performInitialization(): Promise<void> {
    const initStart = Date.now();

    try {
      // The enabled check lives inside the try so that the finally below also
      // clears the in-flight promise on the "disabled" early return; otherwise
      // a stale resolved promise would block every later initialization.
      // The await also guarantees the finally runs only after initialize()
      // has stored the promise.
      await this.checkEnabledStatus();
      if (!this.enabled) {
        console.log('[EMBEDDINGS] Embeddings disabled, skipping initialization');
        return;
      }

      console.log('[EMBEDDINGS] Initializing embeddings system…');

      // Make sure the data folder exists.
      await fs.mkdir(path.dirname(this.embeddingsPath), { recursive: true });

      const toolsData = await getCompressedToolsDataForAI();
      const currentDataHash = this.hashData(toolsData);
      const existing = await this.loadEmbeddings();

      if (
        existing !== null &&
        existing.version === currentDataHash &&
        Array.isArray(existing.embeddings) &&
        existing.embeddings.length > 0
      ) {
        console.log('[EMBEDDINGS] Using cached embeddings');
        this.embeddings = existing.embeddings;
      } else {
        console.log('[EMBEDDINGS] Generating new embeddings…');
        await this.generateEmbeddings(toolsData, currentDataHash);
      }

      this.isInitialized = true;
      console.log(`[EMBEDDINGS] Initialized with ${this.embeddings.length} embeddings in ${Date.now() - initStart} ms`);
    } catch (err: unknown) {
      console.error('[EMBEDDINGS] Failed to initialize:', err);
      this.isInitialized = false;
      throw err;
    } finally {
      // Always clear the promise so a later call can retry.
      this.initializationPromise = null;
    }
  }

  /**
   * Returns a SHA-256 hex digest of the JSON-serialised data.
   *
   * The digest covers the whole payload, so a change anywhere in the data
   * yields a different cache version.
   */
  private hashData(data: any): string {
    const serialised: string = JSON.stringify(data) ?? '';
    return createHash('sha256').update(serialised).digest('hex');
  }

  /**
   * Reads the cache file.
   *
   * @returns The parsed file content, or `null` when the file cannot be read
   *   or is not valid JSON. The shape is not validated here; the caller
   *   checks the fields it relies on.
   */
  private async loadEmbeddings(): Promise<EmbeddingsDatabase | null> {
    let raw: string;
    try {
      raw = await fs.readFile(this.embeddingsPath, 'utf8');
    } catch {
      console.log('[EMBEDDINGS] No existing embeddings found');
      return null;
    }

    try {
      return JSON.parse(raw) as EmbeddingsDatabase;
    } catch (error: unknown) {
      // Distinguish a corrupt cache from a missing one so it is visible in logs.
      console.warn('[EMBEDDINGS] Existing embeddings file is not valid JSON - ignoring it:', error);
      return null;
    }
  }

  /**
   * Writes the in-memory vectors to the cache file.
   *
   * @param version - Data hash stored alongside the vectors.
   */
  private async saveEmbeddings(version: string): Promise<void> {
    const database: EmbeddingsDatabase = {
      version,
      lastUpdated: Date.now(),
      embeddings: this.embeddings
    };

    await fs.writeFile(this.embeddingsPath, JSON.stringify(database, null, 2));
    console.log(`[EMBEDDINGS] Saved ${this.embeddings.length} embeddings to disk`);
  }

  /**
   * Builds the lower-cased text that represents an item for embedding:
   * name, description, tags, domains and phases joined by single spaces,
   * with empty parts dropped.
   */
  private createContentString(item: any): string {
    const parts = [
      item.name,
      item.description || '',
      ...(item.tags || []),
      ...(item.domains || []),
      ...(item.phases || [])
    ];

    return parts.filter(Boolean).join(' ').toLowerCase();
  }

  /**
   * Requests embeddings for a batch of texts from the configured endpoint.
   *
   * Accepts two response layouts: `{ embeddings: number[][] }` and
   * `{ data: [{ embedding: number[] }, ...] }`.
   *
   * @param contents - Texts to embed.
   * @returns One vector per response entry, in response order.
   * @throws Error when endpoint/model are not configured, the HTTP status is
   *   not OK, or the response has neither supported layout.
   */
  private async generateEmbeddingsBatch(contents: string[]): Promise<number[][]> {
    const endpoint = process.env.AI_EMBEDDINGS_ENDPOINT;
    const apiKey = process.env.AI_EMBEDDINGS_API_KEY;
    const model = process.env.AI_EMBEDDINGS_MODEL;

    if (!endpoint || !model) {
      const missing: string[] = [];
      if (!endpoint) missing.push('AI_EMBEDDINGS_ENDPOINT');
      if (!model) missing.push('AI_EMBEDDINGS_MODEL');
      throw new Error(`Missing embeddings API configuration: ${missing.join(', ')}`);
    }

    const headers: Record<string, string> = {
      'Content-Type': 'application/json'
    };
    // The API key is optional; the Authorization header is only sent when set.
    if (apiKey) {
      headers['Authorization'] = `Bearer ${apiKey}`;
    }

    const response = await fetch(endpoint, {
      method: 'POST',
      headers,
      body: JSON.stringify({ model, input: contents })
    });

    if (!response.ok) {
      const error = await response.text();
      throw new Error(`Embeddings API error: ${response.status} - ${error}`);
    }

    const payload: unknown = await response.json();
    const { embeddings, data } = (payload ?? {}) as { embeddings?: unknown; data?: unknown };

    if (Array.isArray(embeddings)) {
      return embeddings as number[][];
    }
    if (Array.isArray(data)) {
      return data.map((item: { embedding: number[] }) => item.embedding);
    }

    throw new Error('Unknown embeddings API response format');
  }

  /**
   * Embeds every tool and concept in batches, then stores and persists the
   * result.
   *
   * Vectors are collected in a local array and only published to
   * `this.embeddings` once every batch has succeeded, so a failure part-way
   * through never leaves a partial set behind.
   *
   * @param toolsData - Object with `tools` and `concepts` arrays.
   * @param version - Data hash to store with the vectors.
   * @throws Rethrows any batch failure after logging it.
   */
  private async generateEmbeddings(toolsData: any, version: string): Promise<void> {
    const allItems: any[] = [
      ...(toolsData.tools ?? []).map((tool: any) => ({ ...tool, type: 'tool' })),
      ...(toolsData.concepts ?? []).map((concept: any) => ({ ...concept, type: 'concept' }))
    ];

    const contents = allItems.map(item => this.createContentString(item));
    const totalBatches = Math.ceil(contents.length / this.batchSize);
    const generated: EmbeddingData[] = [];

    for (let i = 0; i < contents.length; i += this.batchSize) {
      const batch = contents.slice(i, i + this.batchSize);
      const batchItems = allItems.slice(i, i + this.batchSize);
      const batchNumber = Math.floor(i / this.batchSize) + 1;

      console.log(`[EMBEDDINGS] Processing batch ${batchNumber} of ${totalBatches}`);

      try {
        const vectors = await this.generateEmbeddingsBatch(batch);

        // Vectors are paired with items by position, so the counts must agree.
        if (vectors.length !== batch.length) {
          throw new Error(`Embeddings API returned ${vectors.length} vectors for ${batch.length} inputs`);
        }

        vectors.forEach((embedding, index) => {
          const item = batchItems[index];
          generated.push({
            id: `${item.type}_${item.name.replace(/[^a-zA-Z0-9]/g, '_')}`,
            type: item.type,
            name: item.name,
            content: batch[index],
            embedding,
            metadata: {
              domains: item.domains,
              phases: item.phases,
              tags: item.tags,
              skillLevel: item.skillLevel,
              type: item.type
            }
          });
        });

        // Pause between batches, but not after the last one.
        if (i + this.batchSize < contents.length) {
          await new Promise(resolve => setTimeout(resolve, this.batchDelay));
        }
      } catch (error: unknown) {
        console.error(`[EMBEDDINGS] Failed to process batch ${batchNumber}:`, error);
        throw error;
      }
    }

    this.embeddings = generated;
    await this.saveEmbeddings(version);
  }

  /**
   * Embeds a single text (lower-cased before sending).
   *
   * @throws Error when the service is disabled or not initialized, when the
   *   API call fails, or when the API returns no vector.
   */
  public async embedText(text: string): Promise<number[]> {
    if (!this.enabled || !this.isInitialized) {
      throw new Error('Embeddings service not available');
    }

    const [embedding] = await this.generateEmbeddingsBatch([text.toLowerCase()]);
    if (!embedding) {
      throw new Error('Embeddings API returned no vector');
    }
    return embedding;
  }

  /**
   * Re-checks the enabled status, then waits for (or starts) initialization.
   * Resolves immediately when the service is disabled or already initialized.
   */
  async waitForInitialization(): Promise<void> {
    // Re-check first in case environment variables were loaded after the
    // previous check.
    await this.checkEnabledStatus();

    if (!this.enabled || this.isInitialized) {
      return;
    }

    // initialize() joins an in-flight initialization if there is one.
    await this.initialize();
  }

  /** Resets the enabled/initialized flags and re-evaluates the environment. */
  async forceRecheckEnvironment(): Promise<void> {
    this.enabled = false;
    this.isInitialized = false;
    await this.checkEnabledStatus();
    console.log('[EMBEDDINGS] Environment status re-checked, enabled:', this.enabled);
  }

  /**
   * Cosine similarity of two vectors.
   *
   * @returns A value in [-1, 1], or 0 when the vectors differ in length or
   *   either has zero magnitude (where the similarity is undefined and the
   *   plain formula would yield NaN).
   */
  private cosineSimilarity(a: number[], b: number[]): number {
    if (a.length !== b.length) {
      return 0;
    }

    let dotProduct = 0;
    let normA = 0;
    let normB = 0;

    for (let i = 0; i < a.length; i++) {
      dotProduct += a[i] * b[i];
      normA += a[i] * a[i];
      normB += b[i] * b[i];
    }

    const denominator = Math.sqrt(normA) * Math.sqrt(normB);
    return denominator === 0 ? 0 : dotProduct / denominator;
  }

  /**
   * Finds the entries most similar to a query.
   *
   * Uses the stored vectors when the service is initialized and has data;
   * otherwise falls back to a text-matching heuristic over the tools list.
   *
   * @param query - Free-text query.
   * @param maxResults - Maximum number of results returned.
   * @param threshold - Minimum similarity a result must reach.
   * @returns Results sorted by descending similarity. Returns an empty array
   *   when the service is disabled, the query is blank, or any error occurs
   *   (errors are logged, not thrown).
   */
  async findSimilar(query: string, maxResults: number = 30, threshold: number = 0.3): Promise<SimilarityResult[]> {
    if (!this.enabled) {
      console.log('[EMBEDDINGS] Service disabled for similarity search');
      return [];
    }

    try {
      // A blank query would match every tool in the fallback path
      // (`includes('')` is always true), so treat it as "no results".
      if (query.trim().length === 0) {
        return [];
      }

      if (this.isInitialized && this.embeddings.length > 0) {
        return await this.findSimilarByEmbedding(query, maxResults, threshold);
      }
      return await this.findSimilarByText(query, maxResults, threshold);
    } catch (error: unknown) {
      console.error('[EMBEDDINGS] Failed to find similar items:', error);
      return [];
    }
  }

  /** Ranks the stored vectors by cosine similarity to the embedded query. */
  private async findSimilarByEmbedding(query: string, maxResults: number, threshold: number): Promise<SimilarityResult[]> {
    console.log(`[EMBEDDINGS] Using embeddings data for similarity search: ${query}`);

    const [queryEmbedding] = await this.generateEmbeddingsBatch([query.toLowerCase()]);
    if (!queryEmbedding) {
      throw new Error('Embeddings API returned no vector for the query');
    }

    console.log(`[EMBEDDINGS] Computing similarities for ${this.embeddings.length} items`);

    const results = this.embeddings
      .map((item): SimilarityResult => ({
        ...item,
        similarity: this.cosineSimilarity(queryEmbedding, item.embedding)
      }))
      .filter(item => item.similarity >= threshold)
      .sort((a, b) => b.similarity - a.similarity)
      .slice(0, maxResults);

    // Defensive sanity checks on the ordering; they only log.
    const orderingValid = results.every(
      (item, index) => index === 0 || item.similarity <= results[index - 1].similarity
    );
    if (!orderingValid) {
      console.error('[EMBEDDINGS] CRITICAL: Similarity ordering is broken!');
      results.forEach((item, idx) => {
        console.error(`  ${idx}: ${item.name} = ${item.similarity.toFixed(4)}`);
      });
    }

    console.log(`[EMBEDDINGS] Found ${results.length} similar items (threshold: ${threshold})`);
    if (results.length > 0) {
      console.log('[EMBEDDINGS] Top 10 similarity matches:');
      results.slice(0, 10).forEach((item, idx) => {
        console.log(`  ${idx + 1}. ${item.name} (${item.type}) = ${item.similarity.toFixed(4)}`);
      });

      const topSimilarity = results[0].similarity;
      if (results.some(item => item.similarity > topSimilarity)) {
        console.error('[EMBEDDINGS] CRITICAL: Top result is not actually the highest similarity!');
      }
    }

    return results;
  }

  /**
   * Scores tools with a text heuristic when no vectors are available.
   *
   * Score contributions: +0.8 when the name contains the query, +0.6 when the
   * description does, up to +0.4 for the share of matching tags and up to
   * +0.3 for the share of query words (longer than two characters) found in
   * the tool text; the total is capped at 1.0. Only tools are searched.
   *
   * NOTE(review): ids here are lower-cased while ids produced when generating
   * embeddings are not - confirm which form callers expect.
   */
  private async findSimilarByText(query: string, maxResults: number, threshold: number): Promise<SimilarityResult[]> {
    console.log(`[EMBEDDINGS] No embeddings data, using fallback text matching: ${query}`);

    const { getToolsData } = await import('./dataService.js');
    const toolsData = await getToolsData();

    const queryLower = query.toLowerCase();
    const queryWords = queryLower.split(/\s+/).filter(w => w.length > 2);

    const scored: SimilarityResult[] = toolsData.tools.map((tool: any): SimilarityResult => {
      let similarity = 0;

      // Name matching
      if (tool.name.toLowerCase().includes(queryLower)) {
        similarity += 0.8;
      }

      // Description matching
      if (tool.description && tool.description.toLowerCase().includes(queryLower)) {
        similarity += 0.6;
      }

      // Tag matching
      if (Array.isArray(tool.tags) && tool.tags.length > 0) {
        const matchingTags = tool.tags.filter((tag: string) =>
          tag.toLowerCase().includes(queryLower) || queryLower.includes(tag.toLowerCase())
        );
        similarity += (matchingTags.length / tool.tags.length) * 0.4;
      }

      // Word-level matching
      const toolText = `${tool.name} ${tool.description || ''} ${(tool.tags || []).join(' ')}`.toLowerCase();
      if (queryWords.length > 0) {
        const matchingWords = queryWords.filter(word => toolText.includes(word));
        similarity += (matchingWords.length / queryWords.length) * 0.3;
      }

      return {
        id: `tool_${tool.name.replace(/[^a-zA-Z0-9]/g, '_').toLowerCase()}`,
        type: 'tool',
        name: tool.name,
        content: toolText,
        embedding: [], // Empty for fallback
        metadata: {
          domains: tool.domains || [],
          phases: tool.phases || [],
          tags: tool.tags || [],
          skillLevel: tool.skillLevel,
          type: tool.type
        },
        similarity: Math.min(similarity, 1.0)
      };
    });

    const similarities = scored
      .filter(item => item.similarity >= threshold)
      .sort((a, b) => b.similarity - a.similarity)
      .slice(0, maxResults);

    console.log(`[EMBEDDINGS] Fallback found ${similarities.length} similar items`);
    return similarities;
  }

  /**
   * Returns the current enabled flag.
   *
   * When the service is neither enabled nor initialized, a background
   * re-check of the environment is triggered (not awaited), so a later call
   * may return a different value.
   */
  isEnabled(): boolean {
    if (!this.enabled && !this.isInitialized) {
      void this.checkEnabledStatus().catch(console.error);
    }

    return this.enabled;
  }

  /** Snapshot of the enabled flag, initialization state and vector count. */
  getStats(): { enabled: boolean; initialized: boolean; count: number } {
    return {
      enabled: this.enabled,
      initialized: this.isInitialized,
      count: this.embeddings.length
    };
  }
}
|
||
|
||
/** Shared module-level instance of the embeddings service. */
const embeddingsService = new EmbeddingsService();

export { embeddingsService, type EmbeddingData, type SimilarityResult };
|
||
|
||
/** Debugging helpers that operate on the shared embeddings service instance. */
export const debugEmbeddings = {
  /** Resets the service's enabled/initialized flags and re-reads the environment. */
  async recheckEnvironment(): Promise<void> {
    return embeddingsService.forceRecheckEnvironment();
  },

  /** Returns the service's current enabled/initialized/count snapshot. */
  getStatus(): { enabled: boolean; initialized: boolean; count: number } {
    return embeddingsService.getStats();
  }
};
|
||
|
||
// No auto-initialization here: the service initializes lazily when first needed.
|