// src/utils/embeddings.ts - Refactored
import { promises as fs } from 'fs';
import path from 'path';
import { getCompressedToolsDataForAI } from './dataService.js';
import 'dotenv/config';
import crypto from 'crypto';

/**
 * One embedded catalogue entry, as held in memory by the embeddings service
 * and persisted in the embeddings cache file.
 */
export interface EmbeddingData {
  /**
   * Identifier of the form `<type>_<name>`, where every non-alphanumeric
   * character of the name has been replaced by `_`.
   */
  id: string;
  /** Whether the entry was built from a tool or from a concept. */
  type: 'tool' | 'concept';
  /** Name of the source item. */
  name: string;
  /** Lower-cased text that was sent to the embeddings API for this entry. */
  content: string;
  /** Embedding vector returned by the embeddings API for `content`. */
  embedding: number[];
  /** Attributes copied from the source item when the entry is generated. */
  metadata: {
    domains?: string[];
    phases?: string[];
    tags?: string[];
    skillLevel?: string;
    /** Set to the same value as the top-level `type` when entries are generated in this file. */
    type?: string;
  };
}
/** An embedded entry together with its score against a search query. */
export interface SimilarityResult extends EmbeddingData {
  /** Cosine similarity between the query embedding and this entry's embedding. */
  similarity: number;
}
/** On-disk layout of the embeddings cache file. */
interface EmbeddingsDatabase {
  /** Hash of the tools data the embeddings were generated from; used to detect a stale cache. */
  version: string;
  /** `Date.now()` timestamp (milliseconds) taken when the cache was written. */
  lastUpdated: number;
  /** All embedded entries. */
  embeddings: EmbeddingData[];
}
/**
 * Runtime configuration of the embeddings service, read from the
 * `AI_EMBEDDINGS_*` environment variables.
 *
 * NOTE: the `enabled: boolean` flag is currently commented out throughout
 * this file and is therefore not part of the configuration.
 */
interface EmbeddingsConfig {
  /** URL the embedding requests are POSTed to. */
  endpoint?: string;
  /** Sent as a `Bearer` token in the `Authorization` header when set. */
  apiKey?: string;
  /** Model name passed in the request body. */
  model?: string;
  /** Number of texts sent per API request while generating the index. */
  batchSize: number;
  /** Pause between two consecutive batches, in milliseconds. */
  batchDelay: number;
}
/**
 * Builds, caches and queries embedding vectors for the tool/concept catalogue.
 *
 * Vectors are requested from the HTTP endpoint configured through the
 * `AI_EMBEDDINGS_*` environment variables and cached in `data/embeddings.json`
 * below the current working directory. The cache is keyed by the hash returned
 * from `getToolsFileHash()`; a different hash (or an unreadable/empty cache)
 * triggers a full regeneration.
 *
 * NOTE: the former `enabled` switch (`AI_EMBEDDINGS_ENABLED`) is not active in
 * this revision, so the service always attempts to initialize when asked.
 */
class EmbeddingsService {
  private static readonly DEFAULT_BATCH_SIZE = 20;
  private static readonly DEFAULT_BATCH_DELAY_MS = 1000;

  private embeddings: EmbeddingData[] = [];
  private isInitialized = false;
  /** Non-null only while an initialization run is in flight; shared by concurrent callers. */
  private initializationPromise: Promise<void> | null = null;
  private readonly embeddingsPath = path.join(process.cwd(), 'data', 'embeddings.json');
  private config: EmbeddingsConfig;

  constructor() {
    this.config = this.loadConfig();
    // Deliberately logs only presence flags, never the endpoint or API key.
    console.log('[EMBEDDINGS-SERVICE] Initialized:', {
      hasEndpoint: !!this.config.endpoint,
      hasModel: !!this.config.model
    });
  }

  /**
   * Reads the service configuration from the environment.
   *
   * Numeric settings that are missing, non-numeric or out of range fall back
   * to their defaults. This matters for `batchSize`: a value of `0` or `NaN`
   * would otherwise stall or silently skip the batching loop.
   */
  private loadConfig(): EmbeddingsConfig {
    return {
      endpoint: process.env.AI_EMBEDDINGS_ENDPOINT,
      apiKey: process.env.AI_EMBEDDINGS_API_KEY,
      model: process.env.AI_EMBEDDINGS_MODEL,
      batchSize: this.parseIntEnv(
        process.env.AI_EMBEDDINGS_BATCH_SIZE,
        EmbeddingsService.DEFAULT_BATCH_SIZE,
        1
      ),
      batchDelay: this.parseIntEnv(
        process.env.AI_EMBEDDINGS_BATCH_DELAY_MS,
        EmbeddingsService.DEFAULT_BATCH_DELAY_MS,
        0
      )
    };
  }

  /** Parses a base-10 integer setting, returning `fallback` unless the value is an integer >= `min`. */
  private parseIntEnv(raw: string | undefined, fallback: number, min: number): number {
    const parsed = parseInt(raw ?? '', 10);
    return Number.isFinite(parsed) && parsed >= min ? parsed : fallback;
  }

  /**
   * Loads or generates the embeddings index.
   *
   * Concurrent calls share one in-flight run; once initialized, further calls
   * return immediately.
   *
   * @throws Whatever the initialization run throws (file system, data loading
   *   or embeddings API errors).
   */
  async initialize(): Promise<void> {
    if (this.initializationPromise) {
      return this.initializationPromise;
    }
    if (this.isInitialized) {
      return;
    }

    this.initializationPromise = this.performInitialization();
    return this.initializationPromise;
  }

  /** Single initialization run: reuse the on-disk cache when it matches the data hash, else regenerate. */
  private async performInitialization(): Promise<void> {
    const initStart = Date.now();

    try {
      console.log('[EMBEDDINGS-SERVICE] Starting initialization');

      await fs.mkdir(path.dirname(this.embeddingsPath), { recursive: true });

      const toolsData = await getCompressedToolsDataForAI();
      const { getToolsFileHash } = await import('./hashUtils.js');
      const currentDataHash = await getToolsFileHash();

      const existing = await this.loadEmbeddings();
      const cacheIsUsable =
        existing !== null &&
        existing.version === currentDataHash &&
        existing.embeddings.length > 0;

      if (existing !== null && cacheIsUsable) {
        console.log('[EMBEDDINGS-SERVICE] Using cached embeddings');
        this.embeddings = existing.embeddings;
      } else {
        console.log('[EMBEDDINGS-SERVICE] Generating new embeddings');
        await this.generateEmbeddings(toolsData, currentDataHash);
      }

      this.isInitialized = true;
      console.log(`[EMBEDDINGS-SERVICE] Initialized successfully with ${this.embeddings.length} embeddings in ${Date.now() - initStart}ms`);
    } catch (error) {
      console.error('[EMBEDDINGS-SERVICE] Initialization failed:', error);
      this.isInitialized = false;
      throw error;
    } finally {
      // Clear the in-flight marker so a failed run can be retried by a later call.
      this.initializationPromise = null;
    }
  }

  /**
   * Reads the embeddings cache from disk.
   *
   * Best-effort: every failure yields `null` (which makes the caller
   * regenerate), but a missing file, an unreadable file, invalid JSON and an
   * unexpected shape are now reported separately instead of all being logged
   * as "file not found".
   */
  private async loadEmbeddings(): Promise<EmbeddingsDatabase | null> {
    let raw: string;
    try {
      raw = await fs.readFile(this.embeddingsPath, 'utf8');
    } catch (error: unknown) {
      const code =
        typeof error === 'object' && error !== null
          ? (error as { code?: unknown }).code
          : undefined;
      if (code === 'ENOENT') {
        console.log('[EMBEDDINGS-SERVICE] No existing embeddings file found');
      } else {
        console.warn('[EMBEDDINGS-SERVICE] Could not read embeddings file:', error);
      }
      return null;
    }

    try {
      const parsed: unknown = JSON.parse(raw);
      if (this.isEmbeddingsDatabase(parsed)) {
        return parsed;
      }
      console.warn('[EMBEDDINGS-SERVICE] Embeddings file has an unexpected shape, ignoring it');
    } catch (error: unknown) {
      console.warn('[EMBEDDINGS-SERVICE] Embeddings file is not valid JSON, ignoring it:', error);
    }
    return null;
  }

  /**
   * Structural sanity check for a parsed cache file: `version` must be a
   * string and every entry must carry an `embedding` array, which is what the
   * similarity search relies on.
   */
  private isEmbeddingsDatabase(value: unknown): value is EmbeddingsDatabase {
    if (typeof value !== 'object' || value === null) {
      return false;
    }
    const candidate = value as { version?: unknown; embeddings?: unknown };
    return (
      typeof candidate.version === 'string' &&
      Array.isArray(candidate.embeddings) &&
      candidate.embeddings.every(
        (entry: unknown) =>
          typeof entry === 'object' &&
          entry !== null &&
          Array.isArray((entry as { embedding?: unknown }).embedding)
      )
    );
  }

  /** Writes the current in-memory index to the cache file, tagged with `version`. */
  private async saveEmbeddings(version: string): Promise<void> {
    const database: EmbeddingsDatabase = {
      version,
      lastUpdated: Date.now(),
      embeddings: this.embeddings
    };

    await fs.writeFile(this.embeddingsPath, JSON.stringify(database, null, 2));
    console.log(`[EMBEDDINGS-SERVICE] Saved ${this.embeddings.length} embeddings to disk`);
  }

  /**
   * Builds the text that is embedded for an item: name, description, tags,
   * domains and phases joined by spaces and lower-cased. Empty parts are
   * dropped.
   */
  private createContentString(item: any): string {
    const parts = [
      item.name,
      item.description || '',
      ...(item.tags || []),
      ...(item.domains || []),
      ...(item.phases || [])
    ];

    return parts.filter(Boolean).join(' ').toLowerCase();
  }

  /**
   * Requests embeddings for `contents` from the configured endpoint.
   *
   * Accepts two response layouts: `{ embeddings: number[][] }` and
   * `{ data: [{ embedding: number[] }, ...] }`.
   *
   * @returns One vector per input, in input order.
   * @throws If endpoint/model are not configured, the HTTP status is not OK,
   *   the response layout is unknown, or the response does not contain exactly
   *   one numeric vector per input.
   */
  private async generateEmbeddingsBatch(contents: string[]): Promise<number[][]> {
    const { endpoint, model, apiKey } = this.config;
    if (!endpoint || !model) {
      throw new Error('Missing embeddings API configuration');
    }
    if (contents.length === 0) {
      // Nothing to embed; avoid a pointless round trip.
      return [];
    }

    const headers: Record<string, string> = {
      'Content-Type': 'application/json'
    };
    if (apiKey) {
      headers['Authorization'] = `Bearer ${apiKey}`;
    }

    const response = await fetch(endpoint, {
      method: 'POST',
      headers,
      body: JSON.stringify({ model, input: contents })
    });

    if (!response.ok) {
      const error = await response.text();
      throw new Error(`Embeddings API error: ${response.status} - ${error}`);
    }

    const payload: unknown = await response.json();
    const vectors = this.extractVectors(payload);

    // A count mismatch would silently pair vectors with the wrong items.
    if (vectors.length !== contents.length) {
      throw new Error(
        `Embeddings API returned ${vectors.length} embeddings for ${contents.length} inputs`
      );
    }
    const allNumeric = vectors.every(
      (vector): vector is number[] =>
        Array.isArray(vector) && vector.every(value => typeof value === 'number')
    );
    if (!allNumeric) {
      throw new Error('Embeddings API returned a malformed embedding vector');
    }
    return vectors;
  }

  /** Pulls the raw list of vectors out of either supported response layout. */
  private extractVectors(payload: unknown): unknown[] {
    const body = (payload ?? {}) as { embeddings?: unknown; data?: unknown };

    if (Array.isArray(body.embeddings)) {
      return body.embeddings;
    }
    if (Array.isArray(body.data)) {
      return body.data.map((entry: { embedding?: unknown } | null) => entry?.embedding);
    }
    throw new Error('Unknown embeddings API response format');
  }

  /**
   * Embeds every tool and concept in `toolsData` in batches and persists the
   * result under `version`.
   *
   * The new index is assembled in a local array and only swapped in once all
   * batches have succeeded, so a failing batch does not leave a partial index
   * behind in memory.
   *
   * @param toolsData Object with `tools` and `concepts` arrays (kept untyped
   *   because its declaration lives outside this file).
   * @throws Re-throws the first batch error after logging it.
   */
  private async generateEmbeddings(toolsData: any, version: string): Promise<void> {
    const allItems: any[] = [
      ...(toolsData?.tools ?? []).map((tool: any) => ({ ...tool, type: 'tool' })),
      ...(toolsData?.concepts ?? []).map((concept: any) => ({ ...concept, type: 'concept' }))
    ];
    const contents = allItems.map(item => this.createContentString(item));

    const { batchSize, batchDelay } = this.config;
    const totalBatches = Math.ceil(contents.length / batchSize);
    const generated: EmbeddingData[] = [];

    console.log(`[EMBEDDINGS-SERVICE] Generating embeddings for ${contents.length} items`);

    for (let start = 0; start < contents.length; start += batchSize) {
      const batch = contents.slice(start, start + batchSize);
      const batchItems = allItems.slice(start, start + batchSize);
      const batchNumber = Math.floor(start / batchSize) + 1;

      console.log(`[EMBEDDINGS-SERVICE] Processing batch ${batchNumber}/${totalBatches}`);

      try {
        const vectors = await this.generateEmbeddingsBatch(batch);

        vectors.forEach((embedding, index) => {
          const item = batchItems[index];
          generated.push({
            id: `${item.type}_${item.name.replace(/[^a-zA-Z0-9]/g, '_')}`,
            type: item.type,
            name: item.name,
            content: batch[index],
            embedding,
            metadata: {
              domains: item.domains,
              phases: item.phases,
              tags: item.tags,
              skillLevel: item.skillLevel,
              type: item.type
            }
          });
        });

        // Pause between batches (not after the last one).
        if (start + batchSize < contents.length) {
          await new Promise<void>(resolve => setTimeout(resolve, batchDelay));
        }
      } catch (error) {
        console.error(`[EMBEDDINGS-SERVICE] Batch ${batchNumber} failed:`, error);
        throw error;
      }
    }

    this.embeddings = generated;
    await this.saveEmbeddings(version);
  }

  /**
   * Embeds a single text (lower-cased first, like the indexed content).
   *
   * @throws If the service has not been initialized, or the API call fails.
   */
  async embedText(text: string): Promise<number[]> {
    if (!this.isInitialized) {
      throw new Error('Embeddings service not available');
    }

    const [embedding] = await this.generateEmbeddingsBatch([text.toLowerCase()]);
    return embedding;
  }

  /**
   * Resolves once the service is initialized, joining an in-flight
   * initialization or starting one if none is running.
   */
  async waitForInitialization(): Promise<void> {
    if (this.isInitialized) {
      return;
    }
    // initialize() returns the in-flight promise when there is one.
    await this.initialize();
  }

  /**
   * Cosine similarity of two vectors.
   *
   * Returns 0 instead of `NaN` when the vectors differ in length or either has
   * zero magnitude, so one degenerate vector cannot poison the ranking.
   */
  private cosineSimilarity(a: number[], b: number[]): number {
    if (a.length !== b.length) {
      return 0;
    }

    let dotProduct = 0;
    let normA = 0;
    let normB = 0;
    for (let i = 0; i < a.length; i++) {
      dotProduct += a[i] * b[i];
      normA += a[i] * a[i];
      normB += b[i] * b[i];
    }

    const denominator = Math.sqrt(normA) * Math.sqrt(normB);
    return denominator === 0 ? 0 : dotProduct / denominator;
  }

  /**
   * Returns the indexed entries most similar to `query`, best first.
   *
   * The effective cut-off is the larger of `threshold` and 85% of the best
   * score, so only entries close to the top hit are returned.
   *
   * @param query Free text; lower-cased before embedding.
   * @param maxResults Upper bound on the number of results (values below 0 are treated as 0).
   * @param threshold Minimum similarity an entry must reach.
   * @returns Matching entries, or `[]` when the service is not initialized,
   *   the index is empty, or the search fails (the error is logged).
   */
  async findSimilar(query: string, maxResults: number = 30, threshold: number = 0.3): Promise<SimilarityResult[]> {
    if (!this.isInitialized || this.embeddings.length === 0) {
      console.log('[EMBEDDINGS-SERVICE] Not initialized or no embeddings available');
      return [];
    }

    try {
      console.log(`[EMBEDDINGS-SERVICE] Finding similar items for query: "${query}"`);

      const [queryEmbedding] = await this.generateEmbeddingsBatch([query.toLowerCase()]);

      // Score first and copy entries only for the hits that survive the cut.
      const scored = this.embeddings.map(item => ({
        item,
        similarity: this.cosineSimilarity(queryEmbedding, item.embedding)
      }));

      // reduce instead of Math.max(...spread): no argument-count limit on large indexes.
      const topScore = scored.reduce(
        (best, entry) => (entry.similarity > best ? entry.similarity : best),
        -Infinity
      );
      const dynamicThreshold = Math.max(threshold, topScore * 0.85);

      const results: SimilarityResult[] = scored
        .filter(entry => entry.similarity >= dynamicThreshold)
        .sort((a, b) => b.similarity - a.similarity)
        .slice(0, Math.max(0, maxResults))
        .map(({ item, similarity }) => ({ ...item, similarity }));

      console.log(`[EMBEDDINGS-SERVICE] Found ${results.length} similar items (threshold: ${dynamicThreshold.toFixed(3)})`);

      if (results.length > 0) {
        console.log('[EMBEDDINGS-SERVICE] Top 5 matches:');
        results.slice(0, 5).forEach((item, idx) => {
          console.log(` ${idx + 1}. ${item.name} (${item.type}) = ${item.similarity.toFixed(4)}`);
        });
      }

      return results;
    } catch (error) {
      console.error('[EMBEDDINGS-SERVICE] Similarity search failed:', error);
      return [];
    }
  }

  /** Reports whether the service is initialized and how many entries are indexed. */
  getStats(): { initialized: boolean; count: number } {
    return {
      initialized: this.isInitialized,
      count: this.embeddings.length
    };
  }

  /** Returns a shallow copy of the configuration (this includes `apiKey`). */
  getConfig(): EmbeddingsConfig {
    return { ...this.config };
  }
}
/** Shared module-level instance of the embeddings service. */
export const embeddingsService = new EmbeddingsService();