forensic-pathways/src/utils/embeddings.ts
overcuriousity a0955c2e58 Progress
2025-08-04 21:05:15 +02:00

349 lines
10 KiB
TypeScript

// src/utils/embeddings.ts
import { promises as fs } from 'fs';
import path from 'path';
import { getCompressedToolsDataForAI } from './dataService.js';
interface EmbeddingData {
id: string;
type: 'tool' | 'concept';
name: string;
content: string;
embedding: number[];
metadata: {
domains?: string[];
phases?: string[];
tags?: string[];
skillLevel?: string;
type?: string;
};
}
interface EmbeddingsDatabase {
version: string;
lastUpdated: number;
embeddings: EmbeddingData[];
}
interface SimilarityResult extends EmbeddingData {
similarity: number;
}
class EmbeddingsService {
private embeddings: EmbeddingData[] = [];
private isInitialized = false;
private initializationPromise: Promise<void> | null = null; // ADD THIS LINE
private readonly embeddingsPath = path.join(process.cwd(), 'data', 'embeddings.json');
private readonly batchSize: number;
private readonly batchDelay: number;
private readonly enabled: boolean;
constructor() {
this.enabled = process.env.AI_EMBEDDINGS_ENABLED === 'true';
this.batchSize = parseInt(process.env.AI_EMBEDDINGS_BATCH_SIZE || '20', 10);
this.batchDelay = parseInt(process.env.AI_EMBEDDINGS_BATCH_DELAY_MS || '1000', 10);
}
// REPLACE the existing initialize method with this:
async initialize(): Promise<void> {
// If initialization is already in progress, wait for it
if (this.initializationPromise) {
return this.initializationPromise;
}
// If already initialized, return immediately
if (this.isInitialized) {
return Promise.resolve();
}
// Start initialization and store the promise
this.initializationPromise = this.performInitialization();
return this.initializationPromise;
}
// ADD THIS NEW METHOD:
private async performInitialization(): Promise<void> {
if (!this.enabled) {
console.log('[EMBEDDINGS] Embeddings disabled, skipping initialization');
return;
}
try {
console.log('[EMBEDDINGS] Initializing embeddings system...');
// Create data directory if it doesn't exist
await fs.mkdir(path.dirname(this.embeddingsPath), { recursive: true });
const toolsData = await getCompressedToolsDataForAI();
const currentDataHash = this.hashData(toolsData);
// Try to load existing embeddings
const existingEmbeddings = await this.loadEmbeddings();
if (existingEmbeddings && existingEmbeddings.version === currentDataHash) {
console.log('[EMBEDDINGS] Using cached embeddings');
this.embeddings = existingEmbeddings.embeddings;
} else {
console.log('[EMBEDDINGS] Generating new embeddings...');
await this.generateEmbeddings(toolsData, currentDataHash);
}
this.isInitialized = true;
console.log(`[EMBEDDINGS] Initialized with ${this.embeddings.length} embeddings`);
} catch (error) {
console.error('[EMBEDDINGS] Failed to initialize:', error);
this.isInitialized = false;
throw error;
} finally {
this.initializationPromise = null;
}
}
async waitForInitialization(): Promise<void> {
if (!this.enabled) {
return Promise.resolve();
}
if (this.isInitialized) {
return Promise.resolve();
}
if (this.initializationPromise) {
await this.initializationPromise;
return;
}
return this.initialize();
}
private hashData(data: any): string {
return Buffer.from(JSON.stringify(data)).toString('base64').slice(0, 32);
}
private async loadEmbeddings(): Promise<EmbeddingsDatabase | null> {
try {
const data = await fs.readFile(this.embeddingsPath, 'utf8');
return JSON.parse(data);
} catch (error) {
console.log('[EMBEDDINGS] No existing embeddings found');
return null;
}
}
private async saveEmbeddings(version: string): Promise<void> {
const database: EmbeddingsDatabase = {
version,
lastUpdated: Date.now(),
embeddings: this.embeddings
};
await fs.writeFile(this.embeddingsPath, JSON.stringify(database, null, 2));
console.log(`[EMBEDDINGS] Saved ${this.embeddings.length} embeddings to disk`);
}
private createContentString(item: any): string {
const parts = [
item.name,
item.description || '',
...(item.tags || []),
...(item.domains || []),
...(item.phases || [])
];
return parts.filter(Boolean).join(' ').toLowerCase();
}
private async generateEmbeddingsBatch(contents: string[]): Promise<number[][]> {
const endpoint = process.env.AI_EMBEDDINGS_ENDPOINT;
const apiKey = process.env.AI_EMBEDDINGS_API_KEY;
const model = process.env.AI_EMBEDDINGS_MODEL;
if (!endpoint || !model) {
throw new Error('Missing embeddings API configuration');
}
const headers: Record<string, string> = {
'Content-Type': 'application/json'
};
if (apiKey) {
headers['Authorization'] = `Bearer ${apiKey}`;
}
const response = await fetch(endpoint, {
method: 'POST',
headers,
body: JSON.stringify({
model,
input: contents
})
});
if (!response.ok) {
const error = await response.text();
throw new Error(`Embeddings API error: ${response.status} - ${error}`);
}
const data = await response.json();
if (Array.isArray(data.embeddings)) {
return data.embeddings;
}
if (Array.isArray(data.data)) {
return data.data.map((item: any) => item.embedding);
}
throw new Error('Unknown embeddings API response format');
}
private async generateEmbeddings(toolsData: any, version: string): Promise<void> {
const allItems = [
...toolsData.tools.map((tool: any) => ({ ...tool, type: 'tool' })),
...toolsData.concepts.map((concept: any) => ({ ...concept, type: 'concept' }))
];
const contents = allItems.map(item => this.createContentString(item));
this.embeddings = [];
for (let i = 0; i < contents.length; i += this.batchSize) {
const batch = contents.slice(i, i + this.batchSize);
const batchItems = allItems.slice(i, i + this.batchSize);
console.log(`[EMBEDDINGS] Processing batch ${Math.ceil((i + 1) / this.batchSize)} of ${Math.ceil(contents.length / this.batchSize)}`);
try {
const embeddings = await this.generateEmbeddingsBatch(batch);
embeddings.forEach((embedding, index) => {
const item = batchItems[index];
this.embeddings.push({
id: `${item.type}_${item.name.replace(/[^a-zA-Z0-9]/g, '_')}`,
type: item.type,
name: item.name,
content: batch[index],
embedding,
metadata: {
domains: item.domains,
phases: item.phases,
tags: item.tags,
skillLevel: item.skillLevel,
type: item.type
}
});
});
if (i + this.batchSize < contents.length) {
await new Promise(resolve => setTimeout(resolve, this.batchDelay));
}
} catch (error) {
console.error(`[EMBEDDINGS] Failed to process batch ${Math.ceil((i + 1) / this.batchSize)}:`, error);
throw error;
}
}
await this.saveEmbeddings(version);
}
public async embedText(text: string): Promise<number[]> {
const [embedding] = await this.generateEmbeddingsBatch([text.toLowerCase()]);
return embedding;
}
private cosineSimilarity(a: number[], b: number[]): number {
let dotProduct = 0;
let normA = 0;
let normB = 0;
for (let i = 0; i < a.length; i++) {
dotProduct += a[i] * b[i];
normA += a[i] * a[i];
normB += b[i] * b[i];
}
return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));
}
async findSimilar(query: string, maxResults: number = 30, threshold: number = 0.3): Promise<SimilarityResult[]> {
if (!this.enabled || !this.isInitialized || this.embeddings.length === 0) {
console.log('[EMBEDDINGS] Service not available for similarity search');
return [];
}
try {
const queryEmbeddings = await this.generateEmbeddingsBatch([query.toLowerCase()]);
const queryEmbedding = queryEmbeddings[0];
console.log(`[EMBEDDINGS] Computing similarities for ${this.embeddings.length} items`);
const similarities: SimilarityResult[] = this.embeddings.map(item => ({
...item,
similarity: this.cosineSimilarity(queryEmbedding, item.embedding)
}));
const results = similarities
.filter(item => item.similarity >= threshold)
.sort((a, b) => b.similarity - a.similarity)
.slice(0, maxResults);
const orderingValid = results.every((item, index) => {
if (index === 0) return true;
return item.similarity <= results[index - 1].similarity;
});
if (!orderingValid) {
console.error('[EMBEDDINGS] CRITICAL: Similarity ordering is broken!');
results.forEach((item, idx) => {
console.error(` ${idx}: ${item.name} = ${item.similarity.toFixed(4)}`);
});
}
console.log(`[EMBEDDINGS] Found ${results.length} similar items (threshold: ${threshold})`);
if (results.length > 0) {
console.log('[EMBEDDINGS] Top 10 similarity matches:');
results.slice(0, 10).forEach((item, idx) => {
console.log(` ${idx + 1}. ${item.name} (${item.type}) = ${item.similarity.toFixed(4)}`);
});
const topSimilarity = results[0].similarity;
const hasHigherSimilarity = results.some(item => item.similarity > topSimilarity);
if (hasHigherSimilarity) {
console.error('[EMBEDDINGS] CRITICAL: Top result is not actually the highest similarity!');
}
}
return results;
} catch (error) {
console.error('[EMBEDDINGS] Failed to find similar items:', error);
return [];
}
}
isEnabled(): boolean {
return this.enabled && this.isInitialized;
}
getStats(): { enabled: boolean; initialized: boolean; count: number } {
return {
enabled: this.enabled,
initialized: this.isInitialized,
count: this.embeddings.length
};
}
}
// Global instance
const embeddingsService = new EmbeddingsService();
export { embeddingsService, type EmbeddingData, type SimilarityResult };
// Auto-initialize on import in server environment
if (typeof window === 'undefined' && process.env.NODE_ENV !== 'test') {
embeddingsService.initialize().catch(error => {
console.error('[EMBEDDINGS] Auto-initialization failed:', error);
});
}