forensic-pathways/src/utils/embeddings.ts
2025-08-02 12:57:38 +02:00

266 lines
8.0 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// src/utils/embeddings.ts
import { createHash } from 'crypto';
import { promises as fs } from 'fs';
import path from 'path';
import { getCompressedToolsDataForAI } from './dataService.js';
/**
 * A single embedded catalogue entry as held in memory and persisted to disk.
 */
interface EmbeddingData {
  /** Identifier built from the entry type and its sanitized name. */
  id: string;
  /** Which catalogue collection the entry came from. */
  type: 'tool' | 'concept';
  /** Display name of the entry. */
  name: string;
  /** The text that was sent to the embeddings endpoint for this entry. */
  content: string;
  /** Vector returned by the embeddings endpoint for `content`. */
  embedding: number[];
  /** Attributes copied from the source entry; every field is optional. */
  metadata: {
    domains?: string[];
    phases?: string[];
    tags?: string[];
    skillLevel?: string;
    type?: string;
  };
}
/**
 * Shape of the JSON document written to the embeddings cache file.
 */
interface EmbeddingsDatabase {
  /** Fingerprint of the source data the embeddings were generated from. */
  version: string;
  /** Write time as returned by `Date.now()`. */
  lastUpdated: number;
  /** All embedded entries. */
  embeddings: EmbeddingData[];
}
/**
 * An embedded entry annotated with how closely it matched a search query.
 */
interface EmbeddingSearchResult extends EmbeddingData {
  /** Cosine similarity between the query vector and this entry's vector. */
  similarity: number;
}
/**
 * Builds, caches and queries vector embeddings for the tools/concepts catalogue.
 *
 * Configuration is read from environment variables:
 * - `AI_EMBEDDINGS_ENABLED` (`'true'` to enable the service)
 * - `AI_EMBEDDINGS_BATCH_SIZE` (items per API request, default 20)
 * - `AI_EMBEDDINGS_BATCH_DELAY_MS` (pause between requests, default 1000)
 * - `AI_EMBEDDINGS_ENDPOINT`, `AI_EMBEDDINGS_API_KEY`, `AI_EMBEDDINGS_MODEL`
 *
 * Generated embeddings are persisted to `<cwd>/data/embeddings.json` together
 * with a fingerprint of the source data, so they are only regenerated when the
 * catalogue changes.
 */
class EmbeddingsService {
  private embeddings: EmbeddingData[] = [];
  private isInitialized = false;
  private readonly embeddingsPath = path.join(process.cwd(), 'data', 'embeddings.json');
  private readonly batchSize: number;
  private readonly batchDelay: number;
  private readonly enabled: boolean;

  constructor() {
    this.enabled = process.env.AI_EMBEDDINGS_ENABLED === 'true';
    // A batch size below 1 (or NaN) would make the batching loop spin forever
    // or send empty requests, so invalid values fall back to the defaults.
    this.batchSize = EmbeddingsService.readIntEnv('AI_EMBEDDINGS_BATCH_SIZE', 20, 1);
    this.batchDelay = EmbeddingsService.readIntEnv('AI_EMBEDDINGS_BATCH_DELAY_MS', 1000, 0);
  }

  /**
   * Reads an integer environment variable.
   *
   * @param name - Environment variable name.
   * @param fallback - Value used when the variable is unset, not a number, or below `min`.
   * @param min - Smallest accepted value.
   */
  private static readIntEnv(name: string, fallback: number, min: number): number {
    const parsed = parseInt(process.env[name] || '', 10);
    return Number.isFinite(parsed) && parsed >= min ? parsed : fallback;
  }

  /**
   * Loads cached embeddings when they match the current catalogue data,
   * otherwise generates and persists fresh ones.
   *
   * Never rejects: failures are logged and leave the service uninitialized.
   * Does nothing when embeddings are disabled.
   */
  async initialize(): Promise<void> {
    if (!this.enabled) {
      console.log('[EMBEDDINGS] Embeddings disabled, skipping initialization');
      return;
    }
    try {
      console.log('[EMBEDDINGS] Initializing embeddings system...');
      // Create data directory if it doesn't exist
      await fs.mkdir(path.dirname(this.embeddingsPath), { recursive: true });
      const toolsData = await getCompressedToolsDataForAI();
      const currentDataHash = this.hashData(toolsData);
      // Try to load existing embeddings
      const existingEmbeddings = await this.loadEmbeddings();
      if (existingEmbeddings && existingEmbeddings.version === currentDataHash) {
        console.log('[EMBEDDINGS] Using cached embeddings');
        this.embeddings = existingEmbeddings.embeddings;
      } else {
        console.log('[EMBEDDINGS] Generating new embeddings...');
        await this.generateEmbeddings(toolsData, currentDataHash);
      }
      this.isInitialized = true;
      console.log(`[EMBEDDINGS] Initialized with ${this.embeddings.length} embeddings`);
    } catch (error) {
      console.error('[EMBEDDINGS] Failed to initialize:', error);
      this.isInitialized = false;
    }
  }

  /**
   * Fingerprints the catalogue data so cached embeddings can be invalidated.
   *
   * Uses a SHA-256 digest over the full JSON serialization. The previous
   * implementation kept only the first 32 base64 characters (24 bytes) of the
   * JSON, so changes beyond the very start of the data went undetected. Cache
   * files written with the old scheme will not match and are regenerated once.
   */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- catalogue shape is owned by dataService
  private hashData(data: any): string {
    // JSON.stringify(undefined) yields undefined; hash an empty string instead of throwing.
    const serialized = JSON.stringify(data) ?? '';
    return createHash('sha256').update(serialized).digest('hex');
  }

  /**
   * Reads the embeddings cache file.
   *
   * @returns The parsed database, or `null` when the file is missing,
   *          unreadable, not valid JSON, or lacks the expected fields.
   */
  private async loadEmbeddings(): Promise<EmbeddingsDatabase | null> {
    let raw: string;
    try {
      raw = await fs.readFile(this.embeddingsPath, 'utf8');
    } catch (error) {
      if ((error as { code?: string } | null)?.code === 'ENOENT') {
        console.log('[EMBEDDINGS] No existing embeddings found');
      } else {
        console.error('[EMBEDDINGS] Could not read cached embeddings:', error);
      }
      return null;
    }
    try {
      const parsed: unknown = JSON.parse(raw);
      const candidate = parsed as Partial<EmbeddingsDatabase> | null;
      if (
        typeof candidate === 'object' &&
        candidate !== null &&
        typeof candidate.version === 'string' &&
        Array.isArray(candidate.embeddings)
      ) {
        return candidate as EmbeddingsDatabase;
      }
      console.error('[EMBEDDINGS] Cached embeddings file has an unexpected format, ignoring it');
    } catch (error) {
      console.error('[EMBEDDINGS] Cached embeddings file is not valid JSON, ignoring it:', error);
    }
    return null;
  }

  /**
   * Persists the given embeddings with the fingerprint they were built from.
   *
   * @param version - Fingerprint of the source data.
   * @param embeddings - Entries to persist; defaults to the in-memory set.
   */
  private async saveEmbeddings(version: string, embeddings: EmbeddingData[] = this.embeddings): Promise<void> {
    const database: EmbeddingsDatabase = {
      version,
      lastUpdated: Date.now(),
      embeddings
    };
    await fs.writeFile(this.embeddingsPath, JSON.stringify(database, null, 2));
    console.log(`[EMBEDDINGS] Saved ${embeddings.length} embeddings to disk`);
  }

  /**
   * Flattens an item's name, description, tags, domains and phases into one
   * lower-cased, space-separated string used as embedding input.
   */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- catalogue shape is owned by dataService
  private createContentString(item: any): string {
    const parts = [
      item.name,
      item.description || '',
      ...(item.tags || []),
      ...(item.domains || []),
      ...(item.phases || [])
    ];
    return parts.filter(Boolean).join(' ').toLowerCase();
  }

  /**
   * Requests embeddings for a list of texts from the configured endpoint.
   *
   * @param contents - Texts to embed.
   * @returns One vector per input, in input order.
   * @throws Error when configuration is missing, the endpoint responds with a
   *         non-2xx status, or the response does not contain exactly one
   *         embedding array per input.
   */
  private async generateEmbeddingsBatch(contents: string[]): Promise<number[][]> {
    const endpoint = process.env.AI_EMBEDDINGS_ENDPOINT;
    const apiKey = process.env.AI_EMBEDDINGS_API_KEY;
    const model = process.env.AI_EMBEDDINGS_MODEL;
    if (!endpoint || !apiKey || !model) {
      throw new Error('Missing embeddings API configuration');
    }
    const response = await fetch(endpoint, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'Authorization': `Bearer ${apiKey}`
      },
      body: JSON.stringify({
        model,
        input: contents
      })
    });
    if (!response.ok) {
      const error = await response.text();
      throw new Error(`Embeddings API error: ${response.status} - ${error}`);
    }
    const payload: unknown = await response.json();
    const rows = (payload as { data?: unknown } | null)?.data;
    // A short or malformed response would otherwise pair vectors with the
    // wrong items (or store `undefined` vectors) without any error.
    if (!Array.isArray(rows) || rows.length !== contents.length) {
      const received = Array.isArray(rows) ? rows.length : 'none';
      throw new Error(
        `Embeddings API returned an unexpected payload: expected ${contents.length} embeddings, received ${received}`
      );
    }
    return rows.map((row: { embedding?: unknown } | null, position: number) => {
      const vector = row?.embedding;
      if (!Array.isArray(vector)) {
        throw new Error(`Embeddings API returned no embedding for input ${position}`);
      }
      return vector as number[];
    });
  }

  /**
   * Embeds every tool and concept in batches, then persists the result.
   *
   * The in-memory embeddings are replaced only after every batch succeeded,
   * so a failure part-way through does not leave a partial set behind.
   *
   * @param toolsData - Catalogue data with `tools` and `concepts` arrays.
   * @param version - Fingerprint stored alongside the embeddings.
   * @throws Re-throws the first batch failure after logging it.
   */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- catalogue shape is owned by dataService
  private async generateEmbeddings(toolsData: any, version: string): Promise<void> {
    const allItems = [
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      ...(toolsData?.tools ?? []).map((tool: any) => ({ ...tool, type: 'tool' })),
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      ...(toolsData?.concepts ?? []).map((concept: any) => ({ ...concept, type: 'concept' }))
    ];
    const contents = allItems.map(item => this.createContentString(item));
    const totalBatches = Math.ceil(contents.length / this.batchSize);
    const generated: EmbeddingData[] = [];
    // Process in batches, sequentially, to respect rate limits
    for (let i = 0; i < contents.length; i += this.batchSize) {
      const batchNumber = Math.floor(i / this.batchSize) + 1;
      const batch = contents.slice(i, i + this.batchSize);
      const batchItems = allItems.slice(i, i + this.batchSize);
      console.log(`[EMBEDDINGS] Processing batch ${batchNumber} of ${totalBatches}`);
      try {
        const vectors = await this.generateEmbeddingsBatch(batch);
        vectors.forEach((embedding, index) => {
          const item = batchItems[index];
          generated.push({
            id: `${item.type}_${item.name.replace(/[^a-zA-Z0-9]/g, '_')}`,
            type: item.type,
            name: item.name,
            content: batch[index],
            embedding,
            metadata: {
              domains: item.domains,
              phases: item.phases,
              tags: item.tags,
              skillLevel: item.skillLevel,
              type: item.type
            }
          });
        });
        // Rate limiting delay between batches (skipped after the last one)
        if (i + this.batchSize < contents.length) {
          await new Promise(resolve => setTimeout(resolve, this.batchDelay));
        }
      } catch (error) {
        console.error(`[EMBEDDINGS] Failed to process batch ${batchNumber}:`, error);
        throw error;
      }
    }
    this.embeddings = generated;
    await this.saveEmbeddings(version, generated);
  }

  /**
   * Embeds a single piece of text (lower-cased first, like the catalogue content).
   *
   * @throws Error when the embeddings API is not configured or the request fails.
   */
  public async embedText(text: string): Promise<number[]> {
    // Reuse the private batch helper to avoid auth duplication
    const [embedding] = await this.generateEmbeddingsBatch([text.toLowerCase()]);
    return embedding;
  }

  /**
   * Cosine similarity of two vectors.
   *
   * @returns A value in [-1, 1], or 0 when the vectors are empty, differ in
   *          length, or either has zero magnitude (instead of `NaN`).
   */
  private cosineSimilarity(a: number[], b: number[]): number {
    if (a.length === 0 || a.length !== b.length) {
      return 0;
    }
    let dotProduct = 0;
    let normA = 0;
    let normB = 0;
    for (let i = 0; i < a.length; i++) {
      dotProduct += a[i] * b[i];
      normA += a[i] * a[i];
      normB += b[i] * b[i];
    }
    const denominator = Math.sqrt(normA) * Math.sqrt(normB);
    return denominator === 0 ? 0 : dotProduct / denominator;
  }

  /**
   * Finds the catalogue entries most similar to a free-text query.
   *
   * @param query - Search text.
   * @param maxResults - Maximum number of results; non-positive values yield `[]`.
   * @param threshold - Minimum cosine similarity an entry must reach.
   * @returns Matches sorted by descending similarity. Returns `[]` when the
   *          service is disabled, uninitialized or empty, when the query is
   *          blank, or when embedding the query fails (the error is logged).
   */
  async findSimilar(query: string, maxResults: number = 30, threshold: number = 0.3): Promise<EmbeddingSearchResult[]> {
    if (!this.enabled || !this.isInitialized || this.embeddings.length === 0) {
      return [];
    }
    // slice(0, negative) would silently drop results from the tail instead of
    // returning none, and a blank query carries nothing to embed.
    if (maxResults <= 0 || query.trim().length === 0) {
      return [];
    }
    try {
      const queryEmbedding = await this.embedText(query);
      return this.embeddings
        .map(item => ({
          ...item,
          similarity: this.cosineSimilarity(queryEmbedding, item.embedding)
        }))
        .filter(item => item.similarity >= threshold)
        .sort((a, b) => b.similarity - a.similarity)
        .slice(0, maxResults);
    } catch (error) {
      console.error('[EMBEDDINGS] Failed to find similar items:', error);
      return [];
    }
  }

  /** True when embeddings are enabled by configuration and initialization succeeded. */
  isEnabled(): boolean {
    return this.enabled && this.isInitialized;
  }

  /** Snapshot of the service state: configuration flag, init status and entry count. */
  getStats(): { enabled: boolean; initialized: boolean; count: number } {
    return {
      enabled: this.enabled,
      initialized: this.isInitialized,
      count: this.embeddings.length
    };
  }
}
// Module-wide singleton shared by every importer.
const embeddingsService = new EmbeddingsService();

export { embeddingsService, type EmbeddingData, type EmbeddingSearchResult };

// Start initialization as soon as the module is loaded, but only when no
// `window` global exists and NODE_ENV is not 'test'. The outer check runs
// first so `process.env` is never touched when `window` is defined.
if (typeof window === 'undefined') {
  if (process.env.NODE_ENV !== 'test') {
    embeddingsService.initialize().catch((error) => {
      console.error('[EMBEDDINGS] Auto-initialization failed:', error);
    });
  }
}