embeddings-1 #2
File diff suppressed because one or more lines are too long
117075
data/embeddings.json
Normal file
117075
data/embeddings.json
Normal file
File diff suppressed because it is too large
Load Diff
22
src/pages/api/ai/embeddings.status.ts
Normal file
22
src/pages/api/ai/embeddings.status.ts
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
// src/pages/api/ai/embeddings-status.ts
|
||||||
|
import type { APIRoute } from 'astro';
|
||||||
|
import { embeddingsService } from '../../../utils/embeddings.js';
|
||||||
|
import { apiResponse, apiServerError } from '../../../utils/api.js';
|
||||||
|
|
||||||
|
export const prerender = false;
|
||||||
|
|
||||||
|
export const GET: APIRoute = async () => {
|
||||||
|
try {
|
||||||
|
const stats = embeddingsService.getStats();
|
||||||
|
|
||||||
|
return apiResponse.success({
|
||||||
|
embeddings: stats,
|
||||||
|
timestamp: new Date().toISOString(),
|
||||||
|
status: stats.enabled && stats.initialized ? 'ready' :
|
||||||
|
stats.enabled && !stats.initialized ? 'initializing' : 'disabled'
|
||||||
|
});
|
||||||
|
} catch (error) {
|
||||||
|
console.error('Embeddings status error:', error);
|
||||||
|
return apiServerError.internal('Failed to get embeddings status');
|
||||||
|
}
|
||||||
|
};
|
@ -14,7 +14,11 @@ function getEnv(key: string): string {
|
|||||||
return value;
|
return value;
|
||||||
}
|
}
|
||||||
|
|
||||||
const AI_MODEL = getEnv('AI_MODEL');
|
// Use the analyzer AI for smart prompting (smaller, faster model)
|
||||||
|
const AI_ENDPOINT = getEnv('AI_ANALYZER_ENDPOINT');
|
||||||
|
const AI_API_KEY = getEnv('AI_ANALYZER_API_KEY');
|
||||||
|
const AI_MODEL = getEnv('AI_ANALYZER_MODEL');
|
||||||
|
|
||||||
const rateLimitStore = new Map<string, { count: number; resetTime: number }>();
|
const rateLimitStore = new Map<string, { count: number; resetTime: number }>();
|
||||||
const RATE_LIMIT_WINDOW = 60 * 1000; // 1 minute
|
const RATE_LIMIT_WINDOW = 60 * 1000; // 1 minute
|
||||||
const RATE_LIMIT_MAX = 5; // 5 enhancement requests per minute per user
|
const RATE_LIMIT_MAX = 5; // 5 enhancement requests per minute per user
|
||||||
@ -59,29 +63,38 @@ function cleanupExpiredRateLimits() {
|
|||||||
setInterval(cleanupExpiredRateLimits, 5 * 60 * 1000);
|
setInterval(cleanupExpiredRateLimits, 5 * 60 * 1000);
|
||||||
|
|
||||||
function createEnhancementPrompt(input: string): string {
|
function createEnhancementPrompt(input: string): string {
|
||||||
return `
|
return `Du bist eine KI für digitale Forensik-Anfragen. Der Nutzer beschreibt ein forensisches Szenario oder Problem. Analysiere die Eingabe auf Vollständigkeit und Klarheit.
|
||||||
Du bist eine KI für digitale Forensik. Der Nutzer beschreibt ein forensisches Szenario. Analysiere die Eingabe.
|
|
||||||
|
|
||||||
Wenn die Beschreibung unvollständig oder vage ist, stelle bis zu drei präzise Rückfragen im JSON-Array-Format, um wichtige Details zu klären (z. B. Vorfalltyp, System, Ziel, Datenquellen, Zeit, Beteiligte, rechtlicher Rahmen).
|
ANALYSIERE DIESE KATEGORIEN:
|
||||||
|
1. **Vorfalltyp**: Was ist passiert? (Malware, Datendiebstahl, Compliance-Verstoß, etc.)
|
||||||
|
2. **Betroffene Systeme**: Welche Technologien/Plattformen? (Windows, Linux, Mobile, Cloud, etc.)
|
||||||
|
3. **Verfügbare Datenquellen**: Was kann untersucht werden? (Logs, Images, Memory Dumps, etc.)
|
||||||
|
4. **Untersuchungsziel**: Was soll erreicht werden? (IOCs finden, Timeline erstellen, etc.)
|
||||||
|
5. **Zeitrahmen & Dringlichkeit**: Wann ist etwas passiert? Wie dringend?
|
||||||
|
6. **Ressourcen & Constraints**: Budget, Skills, Tools, rechtliche Aspekte
|
||||||
|
7. **Beweisziele**: Dokumentation, Gerichtsverfahren, interne Aufklärung?
|
||||||
|
|
||||||
Wenn die Eingabe bereits klar, spezifisch und vollständig ist, gib stattdessen nur eine leere Liste [] zurück.
|
WENN die Beschreibung vollständig und spezifisch ist: Gib eine leere Liste [] zurück.
|
||||||
|
|
||||||
Antwortformat strikt:
|
WENN wichtige Details fehlen: Formuliere 2-3 präzise Fragen, die die kritischsten Lücken schließen. Fokussiere auf Details, die die Tool-/Methoden-Auswahl stark beeinflussen.
|
||||||
|
|
||||||
\`\`\`json
|
FRAGE-QUALITÄT:
|
||||||
|
- Spezifisch, nicht allgemein (❌ "Mehr Details?" ✅ "Welche Betriebssysteme sind betroffen?")
|
||||||
|
- Handlungsrelevant (❌ "Wann passierte das?" ✅ "Haben Sie Logs aus der Vorfallzeit verfügbar?")
|
||||||
|
- Priorisiert nach Wichtigkeit für die forensische Analyse
|
||||||
|
|
||||||
|
ANTWORTFORMAT (NUR JSON):
|
||||||
[
|
[
|
||||||
"Frage 1?",
|
"Spezifische Frage 1?",
|
||||||
"Frage 2?",
|
"Spezifische Frage 2?",
|
||||||
"Frage 3?"
|
"Spezifische Frage 3?"
|
||||||
]
|
]
|
||||||
\`\`\`
|
|
||||||
|
|
||||||
Nutzer-Eingabe:
|
NUTZER-EINGABE:
|
||||||
${input}
|
${input}
|
||||||
`.trim();
|
`.trim();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
export const POST: APIRoute = async ({ request }) => {
|
export const POST: APIRoute = async ({ request }) => {
|
||||||
try {
|
try {
|
||||||
const authResult = await withAPIAuth(request, 'ai');
|
const authResult = await withAPIAuth(request, 'ai');
|
||||||
@ -98,12 +111,12 @@ export const POST: APIRoute = async ({ request }) => {
|
|||||||
const body = await request.json();
|
const body = await request.json();
|
||||||
const { input } = body;
|
const { input } = body;
|
||||||
|
|
||||||
if (!input || typeof input !== 'string' || input.length < 20) {
|
if (!input || typeof input !== 'string' || input.length < 40) {
|
||||||
return apiError.badRequest('Input too short for enhancement');
|
return apiError.badRequest('Input too short for enhancement (minimum 40 characters)');
|
||||||
}
|
}
|
||||||
|
|
||||||
const sanitizedInput = sanitizeInput(input);
|
const sanitizedInput = sanitizeInput(input);
|
||||||
if (sanitizedInput.length < 20) {
|
if (sanitizedInput.length < 40) {
|
||||||
return apiError.badRequest('Input too short after sanitization');
|
return apiError.badRequest('Input too short after sanitization');
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -111,11 +124,11 @@ export const POST: APIRoute = async ({ request }) => {
|
|||||||
const taskId = `enhance_${userId}_${Date.now()}_${Math.random().toString(36).substr(2, 4)}`;
|
const taskId = `enhance_${userId}_${Date.now()}_${Math.random().toString(36).substr(2, 4)}`;
|
||||||
|
|
||||||
const aiResponse = await enqueueApiCall(() =>
|
const aiResponse = await enqueueApiCall(() =>
|
||||||
fetch(process.env.AI_API_ENDPOINT + '/v1/chat/completions', {
|
fetch(`${AI_ENDPOINT}/v1/chat/completions`, {
|
||||||
method: 'POST',
|
method: 'POST',
|
||||||
headers: {
|
headers: {
|
||||||
'Content-Type': 'application/json',
|
'Content-Type': 'application/json',
|
||||||
'Authorization': `Bearer ${process.env.AI_API_KEY}`
|
'Authorization': `Bearer ${AI_API_KEY}`
|
||||||
},
|
},
|
||||||
body: JSON.stringify({
|
body: JSON.stringify({
|
||||||
model: AI_MODEL,
|
model: AI_MODEL,
|
||||||
@ -125,7 +138,7 @@ export const POST: APIRoute = async ({ request }) => {
|
|||||||
content: systemPrompt
|
content: systemPrompt
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
max_tokens: 200,
|
max_tokens: 300,
|
||||||
temperature: 0.7
|
temperature: 0.7
|
||||||
})
|
})
|
||||||
}), taskId);
|
}), taskId);
|
||||||
@ -144,28 +157,32 @@ export const POST: APIRoute = async ({ request }) => {
|
|||||||
|
|
||||||
let questions;
|
let questions;
|
||||||
try {
|
try {
|
||||||
const cleanedContent = aiContent
|
const cleanedContent = aiContent
|
||||||
.replace(/^```json\s*/i, '')
|
.replace(/^```json\s*/i, '')
|
||||||
.replace(/\s*```\s*$/, '')
|
.replace(/\s*```\s*$/, '')
|
||||||
.trim();
|
.trim();
|
||||||
questions = JSON.parse(cleanedContent);
|
questions = JSON.parse(cleanedContent);
|
||||||
|
|
||||||
if (!Array.isArray(questions) || questions.length === 0) {
|
if (!Array.isArray(questions)) {
|
||||||
throw new Error('Invalid questions format');
|
throw new Error('Response is not an array');
|
||||||
}
|
}
|
||||||
|
|
||||||
// Validate and clean questions
|
// Enhanced validation and cleaning
|
||||||
questions = questions
|
questions = questions
|
||||||
.filter(q => typeof q === 'string' && q.length > 5 && q.length < 120)
|
.filter(q => typeof q === 'string' && q.length > 10 && q.length < 150) // More reasonable length limits
|
||||||
.slice(0, 3);
|
.filter(q => q.includes('?')) // Must be a question
|
||||||
|
.map(q => q.trim())
|
||||||
|
.slice(0, 3); // Max 3 questions
|
||||||
|
|
||||||
|
// If no valid questions, return empty array (means input is complete)
|
||||||
if (questions.length === 0) {
|
if (questions.length === 0) {
|
||||||
throw new Error('No valid questions found');
|
questions = [];
|
||||||
}
|
}
|
||||||
|
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error('Failed to parse enhancement response:', aiContent);
|
console.error('Failed to parse enhancement response:', aiContent);
|
||||||
return apiServerError.unavailable('Invalid enhancement response format');
|
// If parsing fails, assume input is complete enough
|
||||||
|
questions = [];
|
||||||
}
|
}
|
||||||
|
|
||||||
console.log(`[AI Enhancement] User: ${userId}, Questions: ${questions.length}, Input length: ${sanitizedInput.length}`);
|
console.log(`[AI Enhancement] User: ${userId}, Questions: ${questions.length}, Input length: ${sanitizedInput.length}`);
|
||||||
@ -173,7 +190,8 @@ export const POST: APIRoute = async ({ request }) => {
|
|||||||
return new Response(JSON.stringify({
|
return new Response(JSON.stringify({
|
||||||
success: true,
|
success: true,
|
||||||
questions,
|
questions,
|
||||||
taskId
|
taskId,
|
||||||
|
inputComplete: questions.length === 0 // Flag to indicate if input seems complete
|
||||||
}), {
|
}), {
|
||||||
status: 200,
|
status: 200,
|
||||||
headers: { 'Content-Type': 'application/json' }
|
headers: { 'Content-Type': 'application/json' }
|
||||||
|
@ -1,21 +1,12 @@
|
|||||||
// src/pages/api/ai/query.ts
|
// src/pages/api/ai/query.ts
|
||||||
import type { APIRoute } from 'astro';
|
import type { APIRoute } from 'astro';
|
||||||
import { withAPIAuth } from '../../../utils/auth.js';
|
import { withAPIAuth } from '../../../utils/auth.js';
|
||||||
import { getCompressedToolsDataForAI } from '../../../utils/dataService.js';
|
|
||||||
import { apiError, apiServerError, createAuthErrorResponse } from '../../../utils/api.js';
|
import { apiError, apiServerError, createAuthErrorResponse } from '../../../utils/api.js';
|
||||||
import { enqueueApiCall } from '../../../utils/rateLimitedQueue.js';
|
import { enqueueApiCall } from '../../../utils/rateLimitedQueue.js';
|
||||||
|
import { aiPipeline } from '../../../utils/aiPipeline.js';
|
||||||
|
|
||||||
export const prerender = false;
|
export const prerender = false;
|
||||||
|
|
||||||
function getEnv(key: string): string {
|
|
||||||
const value = process.env[key];
|
|
||||||
if (!value) {
|
|
||||||
throw new Error(`Missing environment variable: ${key}`);
|
|
||||||
}
|
|
||||||
return value;
|
|
||||||
}
|
|
||||||
|
|
||||||
const AI_MODEL = getEnv('AI_MODEL');
|
|
||||||
const rateLimitStore = new Map<string, { count: number; resetTime: number }>();
|
const rateLimitStore = new Map<string, { count: number; resetTime: number }>();
|
||||||
const RATE_LIMIT_WINDOW = 60 * 1000;
|
const RATE_LIMIT_WINDOW = 60 * 1000;
|
||||||
const RATE_LIMIT_MAX = 10;
|
const RATE_LIMIT_MAX = 10;
|
||||||
@ -33,13 +24,6 @@ function sanitizeInput(input: string): string {
|
|||||||
return sanitized;
|
return sanitized;
|
||||||
}
|
}
|
||||||
|
|
||||||
function stripMarkdownJson(content: string): string {
|
|
||||||
return content
|
|
||||||
.replace(/^```json\s*/i, '')
|
|
||||||
.replace(/\s*```\s*$/, '')
|
|
||||||
.trim();
|
|
||||||
}
|
|
||||||
|
|
||||||
function checkRateLimit(userId: string): boolean {
|
function checkRateLimit(userId: string): boolean {
|
||||||
const now = Date.now();
|
const now = Date.now();
|
||||||
const userLimit = rateLimitStore.get(userId);
|
const userLimit = rateLimitStore.get(userId);
|
||||||
@ -68,209 +52,6 @@ function cleanupExpiredRateLimits() {
|
|||||||
|
|
||||||
setInterval(cleanupExpiredRateLimits, 5 * 60 * 1000);
|
setInterval(cleanupExpiredRateLimits, 5 * 60 * 1000);
|
||||||
|
|
||||||
async function loadToolsDatabase() {
|
|
||||||
try {
|
|
||||||
return await getCompressedToolsDataForAI();
|
|
||||||
} catch (error) {
|
|
||||||
console.error('Failed to load tools database:', error);
|
|
||||||
throw new Error('Database unavailable');
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
function createWorkflowSystemPrompt(toolsData: any): string {
|
|
||||||
const toolsList = toolsData.tools.map((tool: any) => ({
|
|
||||||
name: tool.name,
|
|
||||||
description: tool.description,
|
|
||||||
domains: tool.domains,
|
|
||||||
phases: tool.phases,
|
|
||||||
domainAgnostic: tool['domain-agnostic-software'],
|
|
||||||
platforms: tool.platforms,
|
|
||||||
skillLevel: tool.skillLevel,
|
|
||||||
license: tool.license,
|
|
||||||
tags: tool.tags,
|
|
||||||
related_concepts: tool.related_concepts || []
|
|
||||||
}));
|
|
||||||
|
|
||||||
const conceptsList = toolsData.concepts.map((concept: any) => ({
|
|
||||||
name: concept.name,
|
|
||||||
description: concept.description,
|
|
||||||
domains: concept.domains,
|
|
||||||
phases: concept.phases,
|
|
||||||
skillLevel: concept.skillLevel,
|
|
||||||
tags: concept.tags
|
|
||||||
}));
|
|
||||||
|
|
||||||
const regularPhases = toolsData.phases || [];
|
|
||||||
|
|
||||||
const domainAgnosticSoftware = toolsData['domain-agnostic-software'] || [];
|
|
||||||
|
|
||||||
const allPhaseItems = [
|
|
||||||
...regularPhases,
|
|
||||||
...domainAgnosticSoftware
|
|
||||||
];
|
|
||||||
|
|
||||||
const phasesDescription = allPhaseItems.map((phase: any) =>
|
|
||||||
`- ${phase.id}: ${phase.name}`
|
|
||||||
).join('\n');
|
|
||||||
|
|
||||||
const domainsDescription = toolsData.domains.map((domain: any) =>
|
|
||||||
`- ${domain.id}: ${domain.name}`
|
|
||||||
).join('\n');
|
|
||||||
|
|
||||||
const phaseDescriptions = regularPhases.map((phase: any) =>
|
|
||||||
`- ${phase.name}: ${phase.description || 'Tools/Methods for this phase'}`
|
|
||||||
).join('\n');
|
|
||||||
|
|
||||||
const domainAgnosticDescriptions = domainAgnosticSoftware.map((section: any) =>
|
|
||||||
`- ${section.name}: ${section.description || 'Cross-cutting software and platforms'}`
|
|
||||||
).join('\n');
|
|
||||||
|
|
||||||
const validPhases = [
|
|
||||||
...regularPhases.map((p: any) => p.id),
|
|
||||||
...domainAgnosticSoftware.map((s: any) => s.id)
|
|
||||||
].join('|');
|
|
||||||
|
|
||||||
return `Du bist ein DFIR (Digital Forensics and Incident Response) Experte, der Ermittlern bei der Auswahl von Software und Methoden hilft.
|
|
||||||
|
|
||||||
VERFÜGBARE TOOLS/METHODEN:
|
|
||||||
${JSON.stringify(toolsList, null, 2)}
|
|
||||||
|
|
||||||
VERFÜGBARE HINTERGRUNDWISSEN-KONZEPTE:
|
|
||||||
${JSON.stringify(conceptsList, null, 2)}
|
|
||||||
|
|
||||||
UNTERSUCHUNGSPHASEN (NIST Framework):
|
|
||||||
${phasesDescription}
|
|
||||||
|
|
||||||
FORENSISCHE DOMÄNEN:
|
|
||||||
${domainsDescription}
|
|
||||||
|
|
||||||
WICHTIGE REGELN:
|
|
||||||
1. Pro Phase 2-3 Tools/Methoden empfehlen (immer mindestens 2 wenn verfügbar)
|
|
||||||
2. Tools/Methoden können in MEHREREN Phasen empfohlen werden wenn sinnvoll - versuche ein Tool/Methode für jede Phase zu empfehlen, selbst wenn die Priorität "low" ist.
|
|
||||||
3. Für Reporting-Phase: Visualisierungs- und Dokumentationssoftware einschließen
|
|
||||||
4. Gib stets dem spezieller für den Fall geeigneten Werkzeug den Vorzug.
|
|
||||||
5. Deutsche Antworten für deutsche Anfragen, English for English queries
|
|
||||||
6. Methoden haben, sofern für das SZENARIO passend, IMMER Vorrang vor Software.
|
|
||||||
7. Bevorzuge alles, was nicht proprietär ist (license != "Proprietary"), aber erkenne an, wenn proprietäre Software besser geeignet ist.
|
|
||||||
8. WICHTIG: Erwähne relevante Hintergrundwissen-Konzepte wenn Tools verwendet werden, die related_concepts haben
|
|
||||||
9. Konzepte sind NICHT Tools - empfehle sie nicht als actionable Schritte, sondern als Wissensbasis
|
|
||||||
|
|
||||||
ENHANCED CONTEXTUAL ANALYSIS:
|
|
||||||
10. Analysiere das Szenario detailliert und identifiziere Schlüsselelemente, Bedrohungen und forensische Herausforderungen
|
|
||||||
11. Entwickle einen strategischen Untersuchungsansatz basierend auf dem spezifischen Szenario
|
|
||||||
12. Identifiziere zeitkritische oder besonders wichtige Faktoren für diesen Fall
|
|
||||||
|
|
||||||
SOFTWARE/METHODEN-AUSWAHL NACH PHASE:
|
|
||||||
${phaseDescriptions}
|
|
||||||
|
|
||||||
DOMÄNENAGNOSTISCHE SOFTWARE/METHODEN:
|
|
||||||
${domainAgnosticDescriptions}
|
|
||||||
|
|
||||||
ANTWORT-FORMAT (strict JSON):
|
|
||||||
{
|
|
||||||
"scenario_analysis": "Detaillierte Analyse des Szenarios: Erkannte Schlüsselelemente, Art des Vorfalls, betroffene Systeme, potentielle Bedrohungen und forensische Herausforderungen",
|
|
||||||
"investigation_approach": "Strategischer Untersuchungsansatz für dieses spezifische Szenario: Prioritäten, Reihenfolge der Phasen, besondere Überlegungen",
|
|
||||||
"critical_considerations": "Zeitkritische Faktoren, wichtige Sicherheitsaspekte oder besondere Vorsichtsmaßnahmen für diesen Fall",
|
|
||||||
"recommended_tools": [
|
|
||||||
{
|
|
||||||
"name": "EXAKTER Name aus der Tools-Database",
|
|
||||||
"priority": "high|medium|low",
|
|
||||||
"phase": "${validPhases}",
|
|
||||||
"justification": "Warum diese Methode für diese Phase und dieses spezifische Szenario geeignet ist - mit Bezug zu den erkannten Schlüsselelementen"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"workflow_suggestion": "Vorgeschlagener Untersuchungsablauf mit konkreten Schritten für dieses Szenario",
|
|
||||||
"background_knowledge": [
|
|
||||||
{
|
|
||||||
"concept_name": "EXAKTER Name aus der Konzepte-Database",
|
|
||||||
"relevance": "Warum dieses Konzept für das Szenario relevant ist, und bei welchen der empfohlenen Methoden/Tools."
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"additional_notes": "Wichtige Überlegungen und Hinweise"
|
|
||||||
}
|
|
||||||
|
|
||||||
Antworte NUR mit validen JSON. Keine zusätzlichen Erklärungen außerhalb des JSON.`;
|
|
||||||
}
|
|
||||||
|
|
||||||
function createToolSystemPrompt(toolsData: any): string {
|
|
||||||
const toolsList = toolsData.tools.map((tool: any) => ({
|
|
||||||
name: tool.name,
|
|
||||||
description: tool.description,
|
|
||||||
domains: tool.domains,
|
|
||||||
phases: tool.phases,
|
|
||||||
platforms: tool.platforms,
|
|
||||||
skillLevel: tool.skillLevel,
|
|
||||||
license: tool.license,
|
|
||||||
tags: tool.tags,
|
|
||||||
url: tool.url,
|
|
||||||
projectUrl: tool.projectUrl,
|
|
||||||
related_concepts: tool.related_concepts || []
|
|
||||||
}));
|
|
||||||
|
|
||||||
const conceptsList = toolsData.concepts.map((concept: any) => ({
|
|
||||||
name: concept.name,
|
|
||||||
description: concept.description,
|
|
||||||
domains: concept.domains,
|
|
||||||
phases: concept.phases,
|
|
||||||
skillLevel: concept.skillLevel,
|
|
||||||
tags: concept.tags
|
|
||||||
}));
|
|
||||||
|
|
||||||
return `Du bist ein DFIR (Digital Forensics and Incident Response) Experte, der bei der Auswahl spezifischer Software/Methoden für konkrete Probleme hilft.
|
|
||||||
|
|
||||||
VERFÜGBARE TOOLS/METHODEN:
|
|
||||||
${JSON.stringify(toolsList, null, 2)}
|
|
||||||
|
|
||||||
VERFÜGBARE HINTERGRUNDWISSEN-KONZEPTE:
|
|
||||||
${JSON.stringify(conceptsList, null, 2)}
|
|
||||||
|
|
||||||
WICHTIGE REGELN:
|
|
||||||
1. Analysiere das spezifische Problem/die Anforderung sorgfältig
|
|
||||||
2. Empfehle 1-3 Methoden/Tools, sortiert nach Eignung (beste Empfehlung zuerst)
|
|
||||||
3. Gib detaillierte Erklärungen, WARUM und WIE jede Methode/Tool das Problem löst
|
|
||||||
4. Berücksichtige praktische Aspekte: Skill Level, Plattformen, Verfügbarkeit
|
|
||||||
5. Deutsche Antworten für deutsche Anfragen, English for English queries
|
|
||||||
6. Gib konkrete Anwendungshinweise, nicht nur allgemeine Beschreibungen - Methoden haben, sofern für das SZENARIO passend, IMMER Vorrang vor Software.
|
|
||||||
7. Erwähne sowohl Stärken als auch Schwächen/Limitationen
|
|
||||||
8. Schlage alternative Ansätze vor, wenn sinnvoll
|
|
||||||
9. Gib grundsätzliche Hinweise, WIE die Methode/Tool konkret eingesetzt wird
|
|
||||||
10. WICHTIG: Erwähne relevante Hintergrundwissen-Konzepte wenn Tools verwendet werden, die related_concepts haben
|
|
||||||
11. Konzepte sind NICHT Tools - empfehle sie nicht als actionable Schritte, sondern als Wissensbasis
|
|
||||||
|
|
||||||
ENHANCED CONTEXTUAL ANALYSIS:
|
|
||||||
12. Analysiere das Problem detailliert und identifiziere technische Anforderungen, Herausforderungen und Erfolgsfaktoren
|
|
||||||
13. Entwickle einen strategischen Lösungsansatz basierend auf dem spezifischen Problem
|
|
||||||
14. Identifiziere wichtige Voraussetzungen oder Warnungen für die Anwendung
|
|
||||||
|
|
||||||
ANTWORT-FORMAT (strict JSON):
|
|
||||||
{
|
|
||||||
"problem_analysis": "Detaillierte Analyse des Problems: Erkannte technische Anforderungen, Herausforderungen, benötigte Fähigkeiten und Erfolgsfaktoren",
|
|
||||||
"investigation_approach": "Strategischer Lösungsansatz für dieses spezifische Problem: Herangehensweise, Prioritäten, optimale Anwendungsreihenfolge",
|
|
||||||
"critical_considerations": "Wichtige Voraussetzungen, potentielle Fallstricke oder Warnungen für die Anwendung der empfohlenen Lösungen",
|
|
||||||
"recommended_tools": [
|
|
||||||
{
|
|
||||||
"name": "EXAKTER Name aus der Tools-Database",
|
|
||||||
"rank": 1,
|
|
||||||
"suitability_score": "high|medium|low",
|
|
||||||
"detailed_explanation": "Detaillierte Erklärung, warum dieses Tool/diese Methode das spezifische Problem löst - mit Bezug zu den erkannten Anforderungen",
|
|
||||||
"implementation_approach": "Konkrete Schritte/Ansatz zur Anwendung für dieses spezifische Problem",
|
|
||||||
"pros": ["Spezifische Vorteile für diesen Anwendungsfall", "Weitere Vorteile"],
|
|
||||||
"cons": ["Potentielle Nachteile oder Limitationen", "Weitere Einschränkungen"],
|
|
||||||
"alternatives": "Alternative Ansätze oder ergänzende Tools/Methoden, falls relevant"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"background_knowledge": [
|
|
||||||
{
|
|
||||||
"concept_name": "EXAKTER Name aus der Konzepte-Database",
|
|
||||||
"relevance": "Warum dieses Konzept für die empfohlenen Tools/das Problem relevant ist, und für welche der empfohlenen Methoden/Tools."
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"additional_considerations": "Wichtige Überlegungen, Voraussetzungen oder Warnungen"
|
|
||||||
}
|
|
||||||
|
|
||||||
Antworte NUR mit validen JSON. Keine zusätzlichen Erklärungen außerhalb des JSON.`;
|
|
||||||
}
|
|
||||||
|
|
||||||
export const POST: APIRoute = async ({ request }) => {
|
export const POST: APIRoute = async ({ request }) => {
|
||||||
try {
|
try {
|
||||||
const authResult = await withAPIAuth(request, 'ai');
|
const authResult = await withAPIAuth(request, 'ai');
|
||||||
@ -287,7 +68,6 @@ export const POST: APIRoute = async ({ request }) => {
|
|||||||
const body = await request.json();
|
const body = await request.json();
|
||||||
const { query, mode = 'workflow', taskId: clientTaskId } = body;
|
const { query, mode = 'workflow', taskId: clientTaskId } = body;
|
||||||
|
|
||||||
// ADD THIS DEBUG LOGGING
|
|
||||||
console.log(`[AI API] Received request - TaskId: ${clientTaskId}, Mode: ${mode}, Query length: ${query?.length || 0}`);
|
console.log(`[AI API] Received request - TaskId: ${clientTaskId}, Mode: ${mode}, Query length: ${query?.length || 0}`);
|
||||||
|
|
||||||
if (!query || typeof query !== 'string') {
|
if (!query || typeof query !== 'string') {
|
||||||
@ -306,128 +86,31 @@ export const POST: APIRoute = async ({ request }) => {
|
|||||||
return apiError.badRequest('Invalid input detected');
|
return apiError.badRequest('Invalid input detected');
|
||||||
}
|
}
|
||||||
|
|
||||||
const toolsData = await loadToolsDatabase();
|
|
||||||
|
|
||||||
const systemPrompt = mode === 'workflow'
|
|
||||||
? createWorkflowSystemPrompt(toolsData)
|
|
||||||
: createToolSystemPrompt(toolsData);
|
|
||||||
|
|
||||||
const taskId = clientTaskId || `ai_${userId}_${Date.now()}_${Math.random().toString(36).substr(2, 6)}`;
|
const taskId = clientTaskId || `ai_${userId}_${Date.now()}_${Math.random().toString(36).substr(2, 6)}`;
|
||||||
|
|
||||||
console.log(`[AI API] About to enqueue task ${taskId}`);
|
console.log(`[AI API] About to enqueue task ${taskId}`);
|
||||||
|
|
||||||
|
// Use the new AI pipeline instead of direct API calls
|
||||||
const aiResponse = await enqueueApiCall(() =>
|
const result = await enqueueApiCall(() =>
|
||||||
fetch(process.env.AI_API_ENDPOINT + '/v1/chat/completions', {
|
aiPipeline.processQuery(sanitizedQuery, mode)
|
||||||
method: 'POST',
|
|
||||||
headers: {
|
|
||||||
'Content-Type': 'application/json',
|
|
||||||
'Authorization': `Bearer ${process.env.AI_API_KEY}`
|
|
||||||
},
|
|
||||||
body: JSON.stringify({
|
|
||||||
model: AI_MODEL,
|
|
||||||
messages: [
|
|
||||||
{
|
|
||||||
role: 'system',
|
|
||||||
content: systemPrompt
|
|
||||||
},
|
|
||||||
{
|
|
||||||
role: 'user',
|
|
||||||
content: sanitizedQuery
|
|
||||||
}
|
|
||||||
],
|
|
||||||
max_tokens: 3500,
|
|
||||||
temperature: 0.3
|
|
||||||
})
|
|
||||||
})
|
|
||||||
, taskId);
|
, taskId);
|
||||||
|
|
||||||
if (!aiResponse.ok) {
|
if (!result || !result.recommendation) {
|
||||||
console.error('AI API error:', await aiResponse.text());
|
return apiServerError.unavailable('No response from AI pipeline');
|
||||||
return apiServerError.unavailable('AI service unavailable');
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const aiData = await aiResponse.json();
|
// Add processing statistics to the response for debugging/monitoring
|
||||||
const aiContent = aiData.choices?.[0]?.message?.content;
|
console.log(`[AI Query] Mode: ${mode}, User: ${userId}, Query length: ${sanitizedQuery.length}`);
|
||||||
|
console.log(`[AI Query] Processing stats:`, result.processingStats);
|
||||||
if (!aiContent) {
|
console.log(`[AI Query] Tools: ${result.recommendation.recommended_tools?.length || 0}, Concepts: ${result.recommendation.background_knowledge?.length || 0}`);
|
||||||
return apiServerError.unavailable('No response from AI');
|
|
||||||
}
|
|
||||||
|
|
||||||
let recommendation;
|
|
||||||
try {
|
|
||||||
const cleanedContent = stripMarkdownJson(aiContent);
|
|
||||||
recommendation = JSON.parse(cleanedContent);
|
|
||||||
} catch (error) {
|
|
||||||
console.error('Failed to parse AI response:', aiContent);
|
|
||||||
return apiServerError.unavailable('Invalid AI response format');
|
|
||||||
}
|
|
||||||
|
|
||||||
const validToolNames = new Set(toolsData.tools.map((t: any) => t.name));
|
|
||||||
const validConceptNames = new Set(toolsData.concepts.map((c: any) => c.name));
|
|
||||||
|
|
||||||
let validatedRecommendation;
|
|
||||||
|
|
||||||
if (mode === 'workflow') {
|
|
||||||
validatedRecommendation = {
|
|
||||||
...recommendation,
|
|
||||||
// Ensure all new fields are included with fallbacks
|
|
||||||
scenario_analysis: recommendation.scenario_analysis || recommendation.problem_analysis || '',
|
|
||||||
investigation_approach: recommendation.investigation_approach || '',
|
|
||||||
critical_considerations: recommendation.critical_considerations || '',
|
|
||||||
recommended_tools: recommendation.recommended_tools?.filter((tool: any) => {
|
|
||||||
if (!validToolNames.has(tool.name)) {
|
|
||||||
console.warn(`AI recommended unknown tool: ${tool.name}`);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}) || [],
|
|
||||||
background_knowledge: recommendation.background_knowledge?.filter((concept: any) => {
|
|
||||||
if (!validConceptNames.has(concept.concept_name)) {
|
|
||||||
console.warn(`AI referenced unknown concept: ${concept.concept_name}`);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}) || []
|
|
||||||
};
|
|
||||||
} else {
|
|
||||||
validatedRecommendation = {
|
|
||||||
...recommendation,
|
|
||||||
// Ensure all new fields are included with fallbacks
|
|
||||||
problem_analysis: recommendation.problem_analysis || recommendation.scenario_analysis || '',
|
|
||||||
investigation_approach: recommendation.investigation_approach || '',
|
|
||||||
critical_considerations: recommendation.critical_considerations || '',
|
|
||||||
recommended_tools: recommendation.recommended_tools?.filter((tool: any) => {
|
|
||||||
if (!validToolNames.has(tool.name)) {
|
|
||||||
console.warn(`AI recommended unknown tool: ${tool.name}`);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}).map((tool: any, index: number) => ({
|
|
||||||
...tool,
|
|
||||||
rank: tool.rank || (index + 1),
|
|
||||||
suitability_score: tool.suitability_score || 'medium',
|
|
||||||
pros: Array.isArray(tool.pros) ? tool.pros : [],
|
|
||||||
cons: Array.isArray(tool.cons) ? tool.cons : []
|
|
||||||
})) || [],
|
|
||||||
background_knowledge: recommendation.background_knowledge?.filter((concept: any) => {
|
|
||||||
if (!validConceptNames.has(concept.concept_name)) {
|
|
||||||
console.warn(`AI referenced unknown concept: ${concept.concept_name}`);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}) || []
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
console.log(`[AI Query] Mode: ${mode}, User: ${userId}, Query length: ${sanitizedQuery.length}, Tools: ${validatedRecommendation.recommended_tools.length}, Concepts: ${validatedRecommendation.background_knowledge?.length || 0}`);
|
|
||||||
|
|
||||||
return new Response(JSON.stringify({
|
return new Response(JSON.stringify({
|
||||||
success: true,
|
success: true,
|
||||||
mode,
|
mode,
|
||||||
taskId,
|
taskId,
|
||||||
recommendation: validatedRecommendation,
|
recommendation: result.recommendation,
|
||||||
query: sanitizedQuery
|
query: sanitizedQuery,
|
||||||
|
processingStats: result.processingStats // Include stats for monitoring
|
||||||
}), {
|
}), {
|
||||||
status: 200,
|
status: 200,
|
||||||
headers: { 'Content-Type': 'application/json' }
|
headers: { 'Content-Type': 'application/json' }
|
||||||
@ -435,6 +118,16 @@ export const POST: APIRoute = async ({ request }) => {
|
|||||||
|
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error('AI query error:', error);
|
console.error('AI query error:', error);
|
||||||
return apiServerError.internal('Internal server error');
|
|
||||||
|
// Provide more specific error messages based on error type
|
||||||
|
if (error.message.includes('embeddings')) {
|
||||||
|
return apiServerError.unavailable('Embeddings service error - falling back to basic processing');
|
||||||
|
} else if (error.message.includes('selector')) {
|
||||||
|
return apiServerError.unavailable('AI selector service error');
|
||||||
|
} else if (error.message.includes('analyzer')) {
|
||||||
|
return apiServerError.unavailable('AI analyzer service error');
|
||||||
|
} else {
|
||||||
|
return apiServerError.internal('Internal server error');
|
||||||
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
521
src/utils/aiPipeline.ts
Normal file
521
src/utils/aiPipeline.ts
Normal file
@ -0,0 +1,521 @@
|
|||||||
|
// src/utils/aiPipeline.ts
|
||||||
|
import { getCompressedToolsDataForAI } from './dataService.js';
|
||||||
|
import { embeddingsService, type EmbeddingData } from './embeddings.js';
|
||||||
|
|
||||||
|
interface AIConfig {
|
||||||
|
endpoint: string;
|
||||||
|
apiKey: string;
|
||||||
|
model: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
interface SelectionResult {
|
||||||
|
selectedTools: string[];
|
||||||
|
selectedConcepts: string[];
|
||||||
|
reasoning: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Final result of the pipeline: the parsed recommendation plus run metrics. */
interface AnalysisResult {
  // Parsed JSON from the analyzer AI; its shape differs between "workflow"
  // and "tool" mode (see the two analyzer prompt builders).
  recommendation: any;
  // Diagnostics describing how this query was processed.
  processingStats: {
    embeddingsUsed: boolean;          // true when the vector index pre-filtered candidates
    candidatesFromEmbeddings: number; // similar items returned by the index (0 when unused)
    finalSelectedItems: number;       // tools + concepts surviving validation
    processingTimeMs: number;         // wall-clock duration of processQuery
  };
}
|
||||||
|
|
||||||
|
/**
 * Two-stage AI pipeline for DFIR tool/concept recommendations.
 *
 * Stage 1 narrows the full dataset to candidates — via the embeddings index
 * when it is enabled, otherwise by asking a "selector" LLM to pick items.
 * Stage 2 sends the narrowed dataset to an "analyzer" LLM that produces the
 * final structured recommendation (strict JSON), which is then validated
 * against the dataset so the UI never shows hallucinated tool names.
 *
 * Both stages call OpenAI-compatible `/v1/chat/completions` endpoints whose
 * connection details come from environment variables (resolved eagerly in
 * the constructor; missing variables throw at instantiation time).
 */
class AIProcessingPipeline {
  // Stage-1 selector LLM connection settings.
  private selectorConfig: AIConfig;
  // Stage-2 analyzer LLM connection settings.
  private analyzerConfig: AIConfig;
  // Hard cap on tools + concepts the selector may keep.
  private maxSelectedItems: number;
  // How many nearest neighbours to request from the embeddings index.
  private embeddingCandidates: number;
  // Minimum similarity score for an embedding hit to count.
  private similarityThreshold: number;

  constructor() {
    this.selectorConfig = {
      endpoint: this.getEnv('AI_SELECTOR_ENDPOINT'),
      apiKey: this.getEnv('AI_SELECTOR_API_KEY'),
      model: this.getEnv('AI_SELECTOR_MODEL')
    };

    this.analyzerConfig = {
      endpoint: this.getEnv('AI_ANALYZER_ENDPOINT'),
      apiKey: this.getEnv('AI_ANALYZER_API_KEY'),
      model: this.getEnv('AI_ANALYZER_MODEL')
    };

    // Tunables with defaults; NaN is possible if the env vars are malformed
    // (parseInt/parseFloat are not validated here) — TODO confirm acceptable.
    this.maxSelectedItems = parseInt(process.env.AI_MAX_SELECTED_ITEMS || '15', 10);
    this.embeddingCandidates = parseInt(process.env.AI_EMBEDDING_CANDIDATES || '30', 10);
    this.similarityThreshold = parseFloat(process.env.AI_SIMILARITY_THRESHOLD || '0.3');
  }

  /** Read a required environment variable; throws if missing or empty. */
  private getEnv(key: string): string {
    const value = process.env[key];
    if (!value) {
      throw new Error(`Missing environment variable: ${key}`);
    }
    return value;
  }

  /**
   * POST a chat-completion request and return the first choice's content.
   *
   * @param config    Which endpoint/model to call.
   * @param messages  Chat messages in OpenAI format.
   * @param maxTokens Completion token budget (default 1000).
   * @throws Error on non-2xx responses or when the reply has no content.
   */
  private async callAI(config: AIConfig, messages: any[], maxTokens: number = 1000): Promise<string> {
    const response = await fetch(`${config.endpoint}/v1/chat/completions`, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'Authorization': `Bearer ${config.apiKey}`
      },
      body: JSON.stringify({
        model: config.model,
        messages,
        max_tokens: maxTokens,
        temperature: 0.3 // low temperature: we want deterministic, parseable JSON
      })
    });

    if (!response.ok) {
      const errorText = await response.text();
      throw new Error(`AI API error (${config.model}): ${response.status} - ${errorText}`);
    }

    const data = await response.json();
    const content = data.choices?.[0]?.message?.content;

    if (!content) {
      throw new Error(`No response from AI model: ${config.model}`);
    }

    return content;
  }

  /**
   * Build the stage-1 prompt: a trimmed listing of every tool and concept
   * (descriptions truncated to 200 chars, max 5 tags) plus instructions to
   * return a strict-JSON SelectionResult.
   */
  private createSelectorPrompt(toolsData: any, userQuery: string, mode: string): string {
    const toolsList = toolsData.tools.map((tool: any) => ({
      name: tool.name,
      type: tool.type,
      description: tool.description.slice(0, 200) + '...',
      domains: tool.domains,
      phases: tool.phases,
      tags: tool.tags?.slice(0, 5) || [],
      skillLevel: tool.skillLevel
    }));

    const conceptsList = toolsData.concepts.map((concept: any) => ({
      name: concept.name,
      type: 'concept',
      description: concept.description.slice(0, 200) + '...',
      domains: concept.domains,
      phases: concept.phases,
      tags: concept.tags?.slice(0, 5) || []
    }));

    const modeInstruction = mode === 'workflow'
      ? 'The user wants a COMPREHENSIVE WORKFLOW with multiple tools/methods across different phases.'
      : 'The user wants SPECIFIC TOOLS/METHODS that directly solve their particular problem.';

    return `You are a DFIR expert tasked with selecting the most relevant tools and concepts for a user query.

${modeInstruction}

AVAILABLE TOOLS:
${JSON.stringify(toolsList, null, 2)}

AVAILABLE CONCEPTS:
${JSON.stringify(conceptsList, null, 2)}

USER QUERY: "${userQuery}"

Select the most relevant items (max ${this.maxSelectedItems} total). For workflow mode, prioritize breadth across phases. For tool mode, prioritize specificity and direct relevance.

Respond with ONLY this JSON format:
{
  "selectedTools": ["Tool Name 1", "Tool Name 2", ...],
  "selectedConcepts": ["Concept Name 1", "Concept Name 2", ...],
  "reasoning": "Brief explanation of selection criteria and approach"
}`;
  }

  /**
   * Stage 1 (no-embeddings path): ask the selector AI which items to keep.
   * Strips optional ```json fences, validates the structure, and truncates
   * over-long selections to an ~80/20 tools/concepts split of the cap.
   * @throws Error when the model's reply is not parseable JSON.
   */
  private async selectRelevantItems(toolsData: any, userQuery: string, mode: string): Promise<SelectionResult> {
    const prompt = this.createSelectorPrompt(toolsData, userQuery, mode);

    const messages = [
      { role: 'user', content: prompt }
    ];

    const response = await this.callAI(this.selectorConfig, messages, 1500);

    try {
      // Models often wrap JSON in markdown code fences; remove them first.
      const cleaned = response.replace(/^```json\s*/i, '').replace(/\s*```\s*$/g, '').trim();
      const result = JSON.parse(cleaned);

      // Validate the structure
      if (!Array.isArray(result.selectedTools) || !Array.isArray(result.selectedConcepts)) {
        throw new Error('Invalid selection result structure');
      }

      // Limit selections
      const totalSelected = result.selectedTools.length + result.selectedConcepts.length;
      if (totalSelected > this.maxSelectedItems) {
        console.warn(`[AI PIPELINE] Selection exceeded limit (${totalSelected}), truncating`);
        result.selectedTools = result.selectedTools.slice(0, Math.floor(this.maxSelectedItems * 0.8));
        result.selectedConcepts = result.selectedConcepts.slice(0, Math.ceil(this.maxSelectedItems * 0.2));
      }

      return result;
    } catch (error) {
      console.error('[AI PIPELINE] Failed to parse selector response:', response);
      throw new Error('Invalid JSON response from selector AI');
    }
  }

  /**
   * Reduce the dataset to the items named in a SelectionResult (exact name
   * match); domains/phases/domain-agnostic-software pass through untouched.
   */
  private filterDataBySelection(toolsData: any, selection: SelectionResult): any {
    const selectedToolNames = new Set(selection.selectedTools);
    const selectedConceptNames = new Set(selection.selectedConcepts);

    return {
      tools: toolsData.tools.filter((tool: any) => selectedToolNames.has(tool.name)),
      concepts: toolsData.concepts.filter((concept: any) => selectedConceptNames.has(concept.name)),
      domains: toolsData.domains,
      phases: toolsData.phases,
      'domain-agnostic-software': toolsData['domain-agnostic-software']
    };
  }

  /**
   * Stage 1 (embeddings path): pre-filter the dataset via vector similarity.
   * Falls back to the FULL dataset (flagged in stats) when no item clears the
   * similarity threshold, so stage 2 always has something to work with.
   */
  private async processWithEmbeddings(userQuery: string, toolsData: any, mode: string): Promise<{ filteredData: any; stats: any }> {
    console.log('[AI PIPELINE] Using embeddings for initial filtering');

    const similarItems = await embeddingsService.findSimilar(
      userQuery,
      this.embeddingCandidates,
      this.similarityThreshold
    );

    if (similarItems.length === 0) {
      console.log('[AI PIPELINE] No similar items found with embeddings, using full dataset');
      return {
        filteredData: toolsData,
        stats: { embeddingsUsed: true, candidatesFromEmbeddings: 0, fallbackToFull: true }
      };
    }

    // Create filtered dataset from embedding results
    const similarToolNames = new Set();
    const similarConceptNames = new Set();

    similarItems.forEach(item => {
      if (item.type === 'tool') {
        similarToolNames.add(item.name);
      } else if (item.type === 'concept') {
        similarConceptNames.add(item.name);
      }
    });

    const embeddingFilteredData = {
      tools: toolsData.tools.filter((tool: any) => similarToolNames.has(tool.name)),
      concepts: toolsData.concepts.filter((concept: any) => similarConceptNames.has(concept.name)),
      domains: toolsData.domains,
      phases: toolsData.phases,
      'domain-agnostic-software': toolsData['domain-agnostic-software']
    };

    console.log(`[AI PIPELINE] Embeddings filtered to ${embeddingFilteredData.tools.length} tools, ${embeddingFilteredData.concepts.length} concepts`);

    return {
      filteredData: embeddingFilteredData,
      stats: { embeddingsUsed: true, candidatesFromEmbeddings: similarItems.length }
    };
  }

  /**
   * Stage 1 (fallback path): use the selector AI instead of embeddings.
   * Logs the model's reasoning and records it in the stats object.
   */
  private async processWithoutEmbeddings(userQuery: string, toolsData: any, mode: string): Promise<{ filteredData: any; stats: any }> {
    console.log('[AI PIPELINE] Processing without embeddings - using selector AI');

    const selection = await this.selectRelevantItems(toolsData, userQuery, mode);
    const filteredData = this.filterDataBySelection(toolsData, selection);

    console.log(`[AI PIPELINE] Selector chose ${selection.selectedTools.length} tools, ${selection.selectedConcepts.length} concepts`);
    console.log(`[AI PIPELINE] Selection reasoning: ${selection.reasoning}`);

    return {
      filteredData,
      stats: { embeddingsUsed: false, candidatesFromEmbeddings: 0, selectorReasoning: selection.reasoning }
    };
  }

  /** Dispatch to the mode-specific stage-2 prompt builder. */
  private createAnalyzerPrompt(filteredData: any, userQuery: string, mode: string): string {
    // Use existing prompt creation logic but with filtered data
    if (mode === 'workflow') {
      return this.createWorkflowAnalyzerPrompt(filteredData, userQuery);
    } else {
      return this.createToolAnalyzerPrompt(filteredData, userQuery);
    }
  }

  /**
   * Build the stage-2 "workflow" prompt (German instructions): full item
   * details plus phases/domains listings, asking for a multi-phase
   * investigation plan in strict JSON. `validPhases` enumerates the legal
   * values for each recommendation's "phase" field.
   */
  private createWorkflowAnalyzerPrompt(toolsData: any, userQuery: string): string {
    const toolsList = toolsData.tools.map((tool: any) => ({
      name: tool.name,
      description: tool.description,
      domains: tool.domains,
      phases: tool.phases,
      domainAgnostic: tool['domain-agnostic-software'],
      platforms: tool.platforms,
      skillLevel: tool.skillLevel,
      license: tool.license,
      tags: tool.tags,
      related_concepts: tool.related_concepts || []
    }));

    const conceptsList = toolsData.concepts.map((concept: any) => ({
      name: concept.name,
      description: concept.description,
      domains: concept.domains,
      phases: concept.phases,
      skillLevel: concept.skillLevel,
      tags: concept.tags
    }));

    // Domain-agnostic software entries are treated as additional "phases"
    // for the purposes of recommendation placement.
    const regularPhases = toolsData.phases || [];
    const domainAgnosticSoftware = toolsData['domain-agnostic-software'] || [];
    const allPhaseItems = [...regularPhases, ...domainAgnosticSoftware];

    const phasesDescription = allPhaseItems.map((phase: any) =>
      `- ${phase.id}: ${phase.name}`
    ).join('\n');

    const domainsDescription = toolsData.domains.map((domain: any) =>
      `- ${domain.id}: ${domain.name}`
    ).join('\n');

    const validPhases = [...regularPhases.map((p: any) => p.id), ...domainAgnosticSoftware.map((s: any) => s.id)].join('|');

    return `Du bist ein DFIR (Digital Forensics and Incident Response) Experte. Du erhältst eine vorgefilterte Auswahl relevanter Tools und Konzepte und sollst daraus eine optimale Empfehlung erstellen.

VERFÜGBARE TOOLS/METHODEN (VORGEFILTERT):
${JSON.stringify(toolsList, null, 2)}

VERFÜGBARE KONZEPTE (VORGEFILTERT):
${JSON.stringify(conceptsList, null, 2)}

UNTERSUCHUNGSPHASEN:
${phasesDescription}

FORENSISCHE DOMÄNEN:
${domainsDescription}

WICHTIGE REGELN:
1. Pro Phase 2-3 Tools/Methoden empfehlen (immer mindestens 2 wenn verfügbar)
2. Tools/Methoden können in MEHREREN Phasen empfohlen werden wenn sinnvoll
3. Für Reporting-Phase: Visualisierungs- und Dokumentationssoftware einschließen
4. Gib stets dem spezieller für den Fall geeigneten Werkzeug den Vorzug
5. Deutsche Antworten für deutsche Anfragen, English for English queries
6. Methoden haben, sofern für das SZENARIO passend, IMMER Vorrang vor Software
7. Bevorzuge alles, was nicht proprietär ist (license != "Proprietary"), aber erkenne an, wenn proprietäre Software besser geeignet ist
8. WICHTIG: Erwähne relevante Hintergrundwissen-Konzepte wenn Tools verwendet werden, die related_concepts haben
9. Konzepte sind NICHT Tools - empfehle sie nicht als actionable Schritte, sondern als Wissensbasis

ENHANCED CONTEXTUAL ANALYSIS:
10. Analysiere das Szenario detailliert und identifiziere Schlüsselelemente, Bedrohungen und forensische Herausforderungen
11. Entwickle einen strategischen Untersuchungsansatz basierend auf dem spezifischen Szenario
12. Identifiziere zeitkritische oder besonders wichtige Faktoren für diesen Fall

USER QUERY: "${userQuery}"

ANTWORT-FORMAT (strict JSON):
{
  "scenario_analysis": "Detaillierte Analyse des Szenarios: Erkannte Schlüsselelemente, Art des Vorfalls, betroffene Systeme, potentielle Bedrohungen und forensische Herausforderungen",
  "investigation_approach": "Strategischer Untersuchungsansatz für dieses spezifische Szenario: Prioritäten, Reihenfolge der Phasen, besondere Überlegungen",
  "critical_considerations": "Zeitkritische Faktoren, wichtige Sicherheitsaspekte oder besondere Vorsichtsmaßnahmen für diesen Fall",
  "recommended_tools": [
    {
      "name": "EXAKTER Name aus der Tools-Database",
      "priority": "high|medium|low",
      "phase": "${validPhases}",
      "justification": "Warum diese Methode für diese Phase und dieses spezifische Szenario geeignet ist - mit Bezug zu den erkannten Schlüsselelementen"
    }
  ],
  "workflow_suggestion": "Vorgeschlagener Untersuchungsablauf mit konkreten Schritten für dieses Szenario",
  "background_knowledge": [
    {
      "concept_name": "EXAKTER Name aus der Konzepte-Database",
      "relevance": "Warum dieses Konzept für das Szenario relevant ist, und bei welchen der empfohlenen Methoden/Tools."
    }
  ],
  "additional_notes": "Wichtige Überlegungen und Hinweise"
}

Antworte NUR mit validen JSON. Keine zusätzlichen Erklärungen außerhalb des JSON.`;
  }

  /**
   * Build the stage-2 "tool" prompt (German instructions): asks for 1-3
   * ranked recommendations with pros/cons/implementation notes, in strict
   * JSON. Includes tool URLs, unlike the workflow prompt.
   */
  private createToolAnalyzerPrompt(toolsData: any, userQuery: string): string {
    const toolsList = toolsData.tools.map((tool: any) => ({
      name: tool.name,
      description: tool.description,
      domains: tool.domains,
      phases: tool.phases,
      platforms: tool.platforms,
      skillLevel: tool.skillLevel,
      license: tool.license,
      tags: tool.tags,
      url: tool.url,
      projectUrl: tool.projectUrl,
      related_concepts: tool.related_concepts || []
    }));

    const conceptsList = toolsData.concepts.map((concept: any) => ({
      name: concept.name,
      description: concept.description,
      domains: concept.domains,
      phases: concept.phases,
      skillLevel: concept.skillLevel,
      tags: concept.tags
    }));

    return `Du bist ein DFIR (Digital Forensics and Incident Response) Experte. Du erhältst eine vorgefilterte Auswahl relevanter Tools und Konzepte und sollst daraus 1-3 optimale Empfehlungen für ein spezifisches Problem erstellen.

VERFÜGBARE TOOLS/METHODEN (VORGEFILTERT):
${JSON.stringify(toolsList, null, 2)}

VERFÜGBARE KONZEPTE (VORGEFILTERT):
${JSON.stringify(conceptsList, null, 2)}

WICHTIGE REGELN:
1. Analysiere das spezifische Problem/die Anforderung sorgfältig
2. Empfehle 1-3 Methoden/Tools, sortiert nach Eignung (beste Empfehlung zuerst)
3. Gib detaillierte Erklärungen, WARUM und WIE jede Methode/Tool das Problem löst
4. Berücksichtige praktische Aspekte: Skill Level, Plattformen, Verfügbarkeit
5. Deutsche Antworten für deutsche Anfragen, English for English queries
6. Gib konkrete Anwendungshinweise, nicht nur allgemeine Beschreibungen
7. Methoden haben, sofern für das SZENARIO passend, IMMER Vorrang vor Software
8. Erwähne sowohl Stärken als auch Schwächen/Limitationen
9. Schlage alternative Ansätze vor, wenn sinnvoll
10. Gib grundsätzliche Hinweise, WIE die Methode/Tool konkret eingesetzt wird
11. WICHTIG: Erwähne relevante Hintergrundwissen-Konzepte wenn Tools verwendet werden, die related_concepts haben
12. Konzepte sind NICHT Tools - empfehle sie nicht als actionable Schritte, sondern als Wissensbasis

ENHANCED CONTEXTUAL ANALYSIS:
13. Analysiere das Problem detailliert und identifiziere technische Anforderungen, Herausforderungen und Erfolgsfaktoren
14. Entwickle einen strategischen Lösungsansatz basierend auf dem spezifischen Problem
15. Identifiziere wichtige Voraussetzungen oder Warnungen für die Anwendung

USER QUERY: "${userQuery}"

ANTWORT-FORMAT (strict JSON):
{
  "problem_analysis": "Detaillierte Analyse des Problems: Erkannte technische Anforderungen, Herausforderungen, benötigte Fähigkeiten und Erfolgsfaktoren",
  "investigation_approach": "Strategischer Lösungsansatz für dieses spezifische Problem: Herangehensweise, Prioritäten, optimale Anwendungsreihenfolge",
  "critical_considerations": "Wichtige Voraussetzungen, potentielle Fallstricke oder Warnungen für die Anwendung der empfohlenen Lösungen",
  "recommended_tools": [
    {
      "name": "EXAKTER Name aus der Tools-Database",
      "rank": 1,
      "suitability_score": "high|medium|low",
      "detailed_explanation": "Detaillierte Erklärung, warum dieses Tool/diese Methode das spezifische Problem löst - mit Bezug zu den erkannten Anforderungen",
      "implementation_approach": "Konkrete Schritte/Ansatz zur Anwendung für dieses spezifische Problem",
      "pros": ["Spezifische Vorteile für diesen Anwendungsfall", "Weitere Vorteile"],
      "cons": ["Potentielle Nachteile oder Limitationen", "Weitere Einschränkungen"],
      "alternatives": "Alternative Ansätze oder ergänzende Tools/Methoden, falls relevant"
    }
  ],
  "background_knowledge": [
    {
      "concept_name": "EXAKTER Name aus der Konzepte-Database",
      "relevance": "Warum dieses Konzept für die empfohlenen Tools/das Problem relevant ist, und für welche der empfohlenen Methoden/Tools."
    }
  ],
  "additional_considerations": "Wichtige Überlegungen, Voraussetzungen oder Warnungen"
}

Antworte NUR mit validen JSON. Keine zusätzlichen Erklärungen außerhalb des JSON.`;
  }

  /**
   * Run the full pipeline for one user query.
   *
   * Steps: load dataset → stage-1 filtering (embeddings or selector AI) →
   * stage-2 analyzer call → parse strict-JSON reply → drop any recommended
   * tool/concept whose name is not present in the filtered dataset
   * (anti-hallucination guard) → attach timing/size stats.
   *
   * @param userQuery Raw user query text (sanitization happens in callers).
   * @param mode      'workflow' for multi-phase plans, anything else for
   *                  targeted tool recommendations.
   * @throws Error when any stage fails (network, JSON parsing, config).
   */
  async processQuery(userQuery: string, mode: string): Promise<AnalysisResult> {
    const startTime = Date.now();
    console.log(`[AI PIPELINE] Starting ${mode} query processing`);

    try {
      // Load full dataset
      const toolsData = await getCompressedToolsDataForAI();

      let filteredData: any;
      let processingStats: any = {
        embeddingsUsed: false,
        candidatesFromEmbeddings: 0,
        finalSelectedItems: 0,
        processingTimeMs: 0
      };

      // Stage 1: Filter candidates (embeddings or selector AI)
      if (embeddingsService.isEnabled()) {
        const result = await this.processWithEmbeddings(userQuery, toolsData, mode);
        filteredData = result.filteredData;
        processingStats = { ...processingStats, ...result.stats };
      } else {
        const result = await this.processWithoutEmbeddings(userQuery, toolsData, mode);
        filteredData = result.filteredData;
        processingStats = { ...processingStats, ...result.stats };
      }

      // Stage 2: Generate detailed analysis with analyzer AI
      console.log('[AI PIPELINE] Stage 2: Generating detailed analysis');
      const analyzerPrompt = this.createAnalyzerPrompt(filteredData, userQuery, mode);

      const messages = [
        { role: 'user', content: analyzerPrompt }
      ];

      const analysisResponse = await this.callAI(this.analyzerConfig, messages, 3500);

      // Parse the response
      let recommendation;
      try {
        const cleanedContent = analysisResponse.replace(/^```json\s*/i, '').replace(/\s*```\s*$/g, '').trim();
        recommendation = JSON.parse(cleanedContent);
      } catch (error) {
        console.error('[AI PIPELINE] Failed to parse analysis response:', analysisResponse);
        throw new Error('Invalid JSON response from analyzer AI');
      }

      // Validate tool/concept names exist in filtered data
      const validToolNames = new Set(filteredData.tools.map((t: any) => t.name));
      const validConceptNames = new Set(filteredData.concepts.map((c: any) => c.name));

      if (recommendation.recommended_tools) {
        recommendation.recommended_tools = recommendation.recommended_tools.filter((tool: any) => {
          if (!validToolNames.has(tool.name)) {
            console.warn(`[AI PIPELINE] Analyzer recommended unknown tool: ${tool.name}`);
            return false;
          }
          return true;
        });
      }

      if (recommendation.background_knowledge) {
        recommendation.background_knowledge = recommendation.background_knowledge.filter((concept: any) => {
          if (!validConceptNames.has(concept.concept_name)) {
            console.warn(`[AI PIPELINE] Analyzer referenced unknown concept: ${concept.concept_name}`);
            return false;
          }
          return true;
        });
      }

      processingStats.finalSelectedItems = (recommendation.recommended_tools?.length || 0) +
                                           (recommendation.background_knowledge?.length || 0);
      processingStats.processingTimeMs = Date.now() - startTime;

      console.log(`[AI PIPELINE] Completed in ${processingStats.processingTimeMs}ms`);
      console.log(`[AI PIPELINE] Final recommendations: ${recommendation.recommended_tools?.length || 0} tools, ${recommendation.background_knowledge?.length || 0} concepts`);

      return {
        recommendation,
        processingStats
      };

    } catch (error) {
      console.error('[AI PIPELINE] Processing failed:', error);
      throw error;
    }
  }
}
|
||||||
|
|
||||||
|
// Global instance: one shared pipeline per server process. Construction
// reads all AI_* environment variables and throws if any are missing.
const aiPipeline = new AIProcessingPipeline();

export { aiPipeline, type AnalysisResult };
|
@ -21,7 +21,7 @@ const ToolSchema = z.object({
|
|||||||
accessType: z.string().optional().nullable(),
|
accessType: z.string().optional().nullable(),
|
||||||
'domain-agnostic-software': z.array(z.string()).optional().nullable(),
|
'domain-agnostic-software': z.array(z.string()).optional().nullable(),
|
||||||
related_concepts: z.array(z.string()).optional().nullable().default([]),
|
related_concepts: z.array(z.string()).optional().nullable().default([]),
|
||||||
related_software: z.array(z.string()).optional().nullable().default([]), // Added this line
|
related_software: z.array(z.string()).optional().nullable().default([]),
|
||||||
});
|
});
|
||||||
|
|
||||||
const ToolsDataSchema = z.object({
|
const ToolsDataSchema = z.object({
|
||||||
@ -67,6 +67,7 @@ let cachedData: ToolsData | null = null;
|
|||||||
let cachedRandomizedData: ToolsData | null = null;
|
let cachedRandomizedData: ToolsData | null = null;
|
||||||
let cachedCompressedData: CompressedToolsData | null = null;
|
let cachedCompressedData: CompressedToolsData | null = null;
|
||||||
let lastRandomizationDate: string | null = null;
|
let lastRandomizationDate: string | null = null;
|
||||||
|
let dataVersion: string | null = null; // Add version tracking for embeddings
|
||||||
|
|
||||||
function seededRandom(seed: number): () => number {
|
function seededRandom(seed: number): () => number {
|
||||||
let x = Math.sin(seed) * 10000;
|
let x = Math.sin(seed) * 10000;
|
||||||
@ -91,6 +92,18 @@ function shuffleArray<T>(array: T[], randomFn: () => number): T[] {
|
|||||||
return shuffled;
|
return shuffled;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Generate a simple hash of the data for version tracking
|
||||||
|
function generateDataVersion(data: any): string {
|
||||||
|
const str = JSON.stringify(data, Object.keys(data).sort());
|
||||||
|
let hash = 0;
|
||||||
|
for (let i = 0; i < str.length; i++) {
|
||||||
|
const char = str.charCodeAt(i);
|
||||||
|
hash = ((hash << 5) - hash) + char;
|
||||||
|
hash = hash & hash; // Convert to 32-bit integer
|
||||||
|
}
|
||||||
|
return Math.abs(hash).toString(36);
|
||||||
|
}
|
||||||
|
|
||||||
async function loadRawData(): Promise<ToolsData> {
|
async function loadRawData(): Promise<ToolsData> {
|
||||||
if (!cachedData) {
|
if (!cachedData) {
|
||||||
const yamlPath = path.join(process.cwd(), 'src/data/tools.yaml');
|
const yamlPath = path.join(process.cwd(), 'src/data/tools.yaml');
|
||||||
@ -99,6 +112,11 @@ async function loadRawData(): Promise<ToolsData> {
|
|||||||
|
|
||||||
try {
|
try {
|
||||||
cachedData = ToolsDataSchema.parse(rawData);
|
cachedData = ToolsDataSchema.parse(rawData);
|
||||||
|
|
||||||
|
// Generate data version for embeddings tracking
|
||||||
|
dataVersion = generateDataVersion(cachedData);
|
||||||
|
console.log(`[DATA SERVICE] Loaded data version: ${dataVersion}`);
|
||||||
|
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error('YAML validation failed:', error);
|
console.error('YAML validation failed:', error);
|
||||||
throw new Error('Invalid tools.yaml structure');
|
throw new Error('Invalid tools.yaml structure');
|
||||||
@ -124,6 +142,7 @@ export async function getToolsData(): Promise<ToolsData> {
|
|||||||
|
|
||||||
lastRandomizationDate = today;
|
lastRandomizationDate = today;
|
||||||
|
|
||||||
|
// Clear compressed cache when randomized data changes
|
||||||
cachedCompressedData = null;
|
cachedCompressedData = null;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -156,14 +175,23 @@ export async function getCompressedToolsDataForAI(): Promise<CompressedToolsData
|
|||||||
'domain-agnostic-software': data['domain-agnostic-software']
|
'domain-agnostic-software': data['domain-agnostic-software']
|
||||||
// scenarios intentionally excluded from AI data
|
// scenarios intentionally excluded from AI data
|
||||||
};
|
};
|
||||||
|
|
||||||
|
console.log(`[DATA SERVICE] Generated compressed data: ${compressedTools.length} tools, ${concepts.length} concepts`);
|
||||||
}
|
}
|
||||||
|
|
||||||
return cachedCompressedData;
|
return cachedCompressedData;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Version hash of the currently loaded tools.yaml data, or null before the
 * first load. Added for embeddings cache-staleness tracking (see the
 * dataVersion declaration).
 */
export function getDataVersion(): string | null {
  return dataVersion;
}
|
||||||
|
|
||||||
/**
 * Reset every module-level cache (raw, randomized, compressed data and the
 * data version hash). The next read will re-load and re-validate tools.yaml
 * from disk and compute a fresh version.
 */
export function clearCache(): void {
  cachedData = null;
  cachedRandomizedData = null;
  cachedCompressedData = null;
  lastRandomizationDate = null;
  dataVersion = null;

  console.log('[DATA SERVICE] Cache cleared');
}
|
259
src/utils/embeddings.ts
Normal file
259
src/utils/embeddings.ts
Normal file
@ -0,0 +1,259 @@
|
|||||||
|
// src/utils/embeddings.ts
|
||||||
|
import { promises as fs } from 'fs';
|
||||||
|
import path from 'path';
|
||||||
|
import { getCompressedToolsDataForAI } from './dataService.js';
|
||||||
|
|
||||||
|
/** One embedded dataset item (a tool or a concept) stored in the index. */
interface EmbeddingData {
  // Unique identifier for the item within the embeddings database.
  id: string;
  // Which dataset collection the item came from.
  type: 'tool' | 'concept';
  // Display name; matched against dataset `name` fields elsewhere.
  name: string;
  // The text that was embedded (name/description/tags concatenated).
  content: string;
  // The embedding vector returned by the embeddings API.
  embedding: number[];
  // Dataset attributes carried along for filtering/diagnostics.
  metadata: {
    domains?: string[];
    phases?: string[];
    tags?: string[];
    skillLevel?: string;
    type?: string;
  };
}
|
||||||
|
|
||||||
|
/** On-disk format of data/embeddings.json. */
interface EmbeddingsDatabase {
  // Hash of the source dataset at generation time; a mismatch with the
  // current data triggers regeneration.
  version: string;
  // Epoch milliseconds of the last (re)generation.
  lastUpdated: number;
  // All embedded items.
  embeddings: EmbeddingData[];
}
|
||||||
|
|
||||||
|
class EmbeddingsService {
|
||||||
|
private embeddings: EmbeddingData[] = [];
|
||||||
|
private isInitialized = false;
|
||||||
|
private readonly embeddingsPath = path.join(process.cwd(), 'data', 'embeddings.json');
|
||||||
|
private readonly batchSize: number;
|
||||||
|
private readonly batchDelay: number;
|
||||||
|
private readonly enabled: boolean;
|
||||||
|
|
||||||
|
constructor() {
|
||||||
|
this.enabled = process.env.AI_EMBEDDINGS_ENABLED === 'true';
|
||||||
|
this.batchSize = parseInt(process.env.AI_EMBEDDINGS_BATCH_SIZE || '20', 10);
|
||||||
|
this.batchDelay = parseInt(process.env.AI_EMBEDDINGS_BATCH_DELAY_MS || '1000', 10);
|
||||||
|
}
|
||||||
|
|
||||||
|
async initialize(): Promise<void> {
|
||||||
|
if (!this.enabled) {
|
||||||
|
console.log('[EMBEDDINGS] Embeddings disabled, skipping initialization');
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
console.log('[EMBEDDINGS] Initializing embeddings system...');
|
||||||
|
|
||||||
|
// Create data directory if it doesn't exist
|
||||||
|
await fs.mkdir(path.dirname(this.embeddingsPath), { recursive: true });
|
||||||
|
|
||||||
|
const toolsData = await getCompressedToolsDataForAI();
|
||||||
|
const currentDataHash = this.hashData(toolsData);
|
||||||
|
|
||||||
|
// Try to load existing embeddings
|
||||||
|
const existingEmbeddings = await this.loadEmbeddings();
|
||||||
|
|
||||||
|
if (existingEmbeddings && existingEmbeddings.version === currentDataHash) {
|
||||||
|
console.log('[EMBEDDINGS] Using cached embeddings');
|
||||||
|
this.embeddings = existingEmbeddings.embeddings;
|
||||||
|
} else {
|
||||||
|
console.log('[EMBEDDINGS] Generating new embeddings...');
|
||||||
|
await this.generateEmbeddings(toolsData, currentDataHash);
|
||||||
|
}
|
||||||
|
|
||||||
|
this.isInitialized = true;
|
||||||
|
console.log(`[EMBEDDINGS] Initialized with ${this.embeddings.length} embeddings`);
|
||||||
|
|
||||||
|
} catch (error) {
|
||||||
|
console.error('[EMBEDDINGS] Failed to initialize:', error);
|
||||||
|
this.isInitialized = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private hashData(data: any): string {
|
||||||
|
return Buffer.from(JSON.stringify(data)).toString('base64').slice(0, 32);
|
||||||
|
}
|
||||||
|
|
||||||
|
private async loadEmbeddings(): Promise<EmbeddingsDatabase | null> {
|
||||||
|
try {
|
||||||
|
const data = await fs.readFile(this.embeddingsPath, 'utf8');
|
||||||
|
return JSON.parse(data);
|
||||||
|
} catch (error) {
|
||||||
|
console.log('[EMBEDDINGS] No existing embeddings found');
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private async saveEmbeddings(version: string): Promise<void> {
|
||||||
|
const database: EmbeddingsDatabase = {
|
||||||
|
version,
|
||||||
|
lastUpdated: Date.now(),
|
||||||
|
embeddings: this.embeddings
|
||||||
|
};
|
||||||
|
|
||||||
|
await fs.writeFile(this.embeddingsPath, JSON.stringify(database, null, 2));
|
||||||
|
console.log(`[EMBEDDINGS] Saved ${this.embeddings.length} embeddings to disk`);
|
||||||
|
}
|
||||||
|
|
||||||
|
private createContentString(item: any): string {
|
||||||
|
const parts = [
|
||||||
|
item.name,
|
||||||
|
item.description || '',
|
||||||
|
...(item.tags || []),
|
||||||
|
...(item.domains || []),
|
||||||
|
...(item.phases || [])
|
||||||
|
];
|
||||||
|
|
||||||
|
return parts.filter(Boolean).join(' ').toLowerCase();
|
||||||
|
}
|
||||||
|
|
||||||
|
private async generateEmbeddingsBatch(contents: string[]): Promise<number[][]> {
|
||||||
|
const endpoint = process.env.AI_EMBEDDINGS_ENDPOINT;
|
||||||
|
const apiKey = process.env.AI_EMBEDDINGS_API_KEY;
|
||||||
|
const model = process.env.AI_EMBEDDINGS_MODEL;
|
||||||
|
|
||||||
|
if (!endpoint || !apiKey || !model) {
|
||||||
|
throw new Error('Missing embeddings API configuration');
|
||||||
|
}
|
||||||
|
|
||||||
|
const response = await fetch(endpoint, {
|
||||||
|
method: 'POST',
|
||||||
|
headers: {
|
||||||
|
'Content-Type': 'application/json',
|
||||||
|
'Authorization': `Bearer ${apiKey}`
|
||||||
|
},
|
||||||
|
body: JSON.stringify({
|
||||||
|
model,
|
||||||
|
input: contents
|
||||||
|
})
|
||||||
|
});
|
||||||
|
|
||||||
|
if (!response.ok) {
|
||||||
|
const error = await response.text();
|
||||||
|
throw new Error(`Embeddings API error: ${response.status} - ${error}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
const data = await response.json();
|
||||||
|
return data.data.map((item: any) => item.embedding);
|
||||||
|
}
|
||||||
|
|
||||||
|
private async generateEmbeddings(toolsData: any, version: string): Promise<void> {
|
||||||
|
const allItems = [
|
||||||
|
...toolsData.tools.map((tool: any) => ({ ...tool, type: 'tool' })),
|
||||||
|
...toolsData.concepts.map((concept: any) => ({ ...concept, type: 'concept' }))
|
||||||
|
];
|
||||||
|
|
||||||
|
const contents = allItems.map(item => this.createContentString(item));
|
||||||
|
this.embeddings = [];
|
||||||
|
|
||||||
|
// Process in batches to respect rate limits
|
||||||
|
for (let i = 0; i < contents.length; i += this.batchSize) {
|
||||||
|
const batch = contents.slice(i, i + this.batchSize);
|
||||||
|
const batchItems = allItems.slice(i, i + this.batchSize);
|
||||||
|
|
||||||
|
console.log(`[EMBEDDINGS] Processing batch ${Math.ceil((i + 1) / this.batchSize)} of ${Math.ceil(contents.length / this.batchSize)}`);
|
||||||
|
|
||||||
|
try {
|
||||||
|
const embeddings = await this.generateEmbeddingsBatch(batch);
|
||||||
|
|
||||||
|
embeddings.forEach((embedding, index) => {
|
||||||
|
const item = batchItems[index];
|
||||||
|
this.embeddings.push({
|
||||||
|
id: `${item.type}_${item.name.replace(/[^a-zA-Z0-9]/g, '_')}`,
|
||||||
|
type: item.type,
|
||||||
|
name: item.name,
|
||||||
|
content: batch[index],
|
||||||
|
embedding,
|
||||||
|
metadata: {
|
||||||
|
domains: item.domains,
|
||||||
|
phases: item.phases,
|
||||||
|
tags: item.tags,
|
||||||
|
skillLevel: item.skillLevel,
|
||||||
|
type: item.type
|
||||||
|
}
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// Rate limiting delay between batches
|
||||||
|
if (i + this.batchSize < contents.length) {
|
||||||
|
await new Promise(resolve => setTimeout(resolve, this.batchDelay));
|
||||||
|
}
|
||||||
|
|
||||||
|
} catch (error) {
|
||||||
|
console.error(`[EMBEDDINGS] Failed to process batch ${Math.ceil((i + 1) / this.batchSize)}:`, error);
|
||||||
|
throw error;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
await this.saveEmbeddings(version);
|
||||||
|
}
|
||||||
|
|
||||||
|
private cosineSimilarity(a: number[], b: number[]): number {
|
||||||
|
let dotProduct = 0;
|
||||||
|
let normA = 0;
|
||||||
|
let normB = 0;
|
||||||
|
|
||||||
|
for (let i = 0; i < a.length; i++) {
|
||||||
|
dotProduct += a[i] * b[i];
|
||||||
|
normA += a[i] * a[i];
|
||||||
|
normB += b[i] * b[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));
|
||||||
|
}
|
||||||
|
|
||||||
|
async findSimilar(query: string, maxResults: number = 30, threshold: number = 0.3): Promise<EmbeddingData[]> {
|
||||||
|
if (!this.enabled || !this.isInitialized || this.embeddings.length === 0) {
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
// Generate embedding for query
|
||||||
|
const queryEmbeddings = await this.generateEmbeddingsBatch([query.toLowerCase()]);
|
||||||
|
const queryEmbedding = queryEmbeddings[0];
|
||||||
|
|
||||||
|
// Calculate similarities
|
||||||
|
const similarities = this.embeddings.map(item => ({
|
||||||
|
...item,
|
||||||
|
similarity: this.cosineSimilarity(queryEmbedding, item.embedding)
|
||||||
|
}));
|
||||||
|
|
||||||
|
// Filter by threshold and sort by similarity
|
||||||
|
return similarities
|
||||||
|
.filter(item => item.similarity >= threshold)
|
||||||
|
.sort((a, b) => b.similarity - a.similarity)
|
||||||
|
.slice(0, maxResults);
|
||||||
|
|
||||||
|
} catch (error) {
|
||||||
|
console.error('[EMBEDDINGS] Failed to find similar items:', error);
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
isEnabled(): boolean {
|
||||||
|
return this.enabled && this.isInitialized;
|
||||||
|
}
|
||||||
|
|
||||||
|
getStats(): { enabled: boolean; initialized: boolean; count: number } {
|
||||||
|
return {
|
||||||
|
enabled: this.enabled,
|
||||||
|
initialized: this.isInitialized,
|
||||||
|
count: this.embeddings.length
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Global instance
|
||||||
|
const embeddingsService = new EmbeddingsService();
|
||||||
|
|
||||||
|
export { embeddingsService, type EmbeddingData };
|
||||||
|
|
||||||
|
// Auto-initialize on import in server environment
|
||||||
|
if (typeof window === 'undefined' && process.env.NODE_ENV !== 'test') {
|
||||||
|
embeddingsService.initialize().catch(error => {
|
||||||
|
console.error('[EMBEDDINGS] Auto-initialization failed:', error);
|
||||||
|
});
|
||||||
|
}
|
Loading…
x
Reference in New Issue
Block a user