progress
This commit is contained in:
parent
78779fc8da
commit
895c476476
File diff suppressed because one or more lines are too long
117075
data/embeddings.json
Normal file
117075
data/embeddings.json
Normal file
File diff suppressed because it is too large
Load Diff
22
src/pages/api/ai/embeddings.status.ts
Normal file
22
src/pages/api/ai/embeddings.status.ts
Normal file
@ -0,0 +1,22 @@
|
||||
// src/pages/api/ai/embeddings-status.ts
|
||||
import type { APIRoute } from 'astro';
|
||||
import { embeddingsService } from '../../../utils/embeddings.js';
|
||||
import { apiResponse, apiServerError } from '../../../utils/api.js';
|
||||
|
||||
export const prerender = false;
|
||||
|
||||
export const GET: APIRoute = async () => {
|
||||
try {
|
||||
const stats = embeddingsService.getStats();
|
||||
|
||||
return apiResponse.success({
|
||||
embeddings: stats,
|
||||
timestamp: new Date().toISOString(),
|
||||
status: stats.enabled && stats.initialized ? 'ready' :
|
||||
stats.enabled && !stats.initialized ? 'initializing' : 'disabled'
|
||||
});
|
||||
} catch (error) {
|
||||
console.error('Embeddings status error:', error);
|
||||
return apiServerError.internal('Failed to get embeddings status');
|
||||
}
|
||||
};
|
@ -14,7 +14,11 @@ function getEnv(key: string): string {
|
||||
return value;
|
||||
}
|
||||
|
||||
const AI_MODEL = getEnv('AI_MODEL');
|
||||
// Use the analyzer AI for smart prompting (smaller, faster model)
|
||||
const AI_ENDPOINT = getEnv('AI_ANALYZER_ENDPOINT');
|
||||
const AI_API_KEY = getEnv('AI_ANALYZER_API_KEY');
|
||||
const AI_MODEL = getEnv('AI_ANALYZER_MODEL');
|
||||
|
||||
const rateLimitStore = new Map<string, { count: number; resetTime: number }>();
|
||||
const RATE_LIMIT_WINDOW = 60 * 1000; // 1 minute
|
||||
const RATE_LIMIT_MAX = 5; // 5 enhancement requests per minute per user
|
||||
@ -59,29 +63,38 @@ function cleanupExpiredRateLimits() {
|
||||
setInterval(cleanupExpiredRateLimits, 5 * 60 * 1000);
|
||||
|
||||
function createEnhancementPrompt(input: string): string {
|
||||
return `
|
||||
Du bist eine KI für digitale Forensik. Der Nutzer beschreibt ein forensisches Szenario. Analysiere die Eingabe.
|
||||
return `Du bist eine KI für digitale Forensik-Anfragen. Der Nutzer beschreibt ein forensisches Szenario oder Problem. Analysiere die Eingabe auf Vollständigkeit und Klarheit.
|
||||
|
||||
Wenn die Beschreibung unvollständig oder vage ist, stelle bis zu drei präzise Rückfragen im JSON-Array-Format, um wichtige Details zu klären (z. B. Vorfalltyp, System, Ziel, Datenquellen, Zeit, Beteiligte, rechtlicher Rahmen).
|
||||
ANALYSIERE DIESE KATEGORIEN:
|
||||
1. **Vorfalltyp**: Was ist passiert? (Malware, Datendiebstahl, Compliance-Verstoß, etc.)
|
||||
2. **Betroffene Systeme**: Welche Technologien/Plattformen? (Windows, Linux, Mobile, Cloud, etc.)
|
||||
3. **Verfügbare Datenquellen**: Was kann untersucht werden? (Logs, Images, Memory Dumps, etc.)
|
||||
4. **Untersuchungsziel**: Was soll erreicht werden? (IOCs finden, Timeline erstellen, etc.)
|
||||
5. **Zeitrahmen & Dringlichkeit**: Wann ist etwas passiert? Wie dringend?
|
||||
6. **Ressourcen & Constraints**: Budget, Skills, Tools, rechtliche Aspekte
|
||||
7. **Beweisziele**: Dokumentation, Gerichtsverfahren, interne Aufklärung?
|
||||
|
||||
Wenn die Eingabe bereits klar, spezifisch und vollständig ist, gib stattdessen nur eine leere Liste [] zurück.
|
||||
WENN die Beschreibung vollständig und spezifisch ist: Gib eine leere Liste [] zurück.
|
||||
|
||||
Antwortformat strikt:
|
||||
WENN wichtige Details fehlen: Formuliere 2-3 präzise Fragen, die die kritischsten Lücken schließen. Fokussiere auf Details, die die Tool-/Methoden-Auswahl stark beeinflussen.
|
||||
|
||||
\`\`\`json
|
||||
FRAGE-QUALITÄT:
|
||||
- Spezifisch, nicht allgemein (❌ "Mehr Details?" ✅ "Welche Betriebssysteme sind betroffen?")
|
||||
- Handlungsrelevant (❌ "Wann passierte das?" ✅ "Haben Sie Logs aus der Vorfallzeit verfügbar?")
|
||||
- Priorisiert nach Wichtigkeit für die forensische Analyse
|
||||
|
||||
ANTWORTFORMAT (NUR JSON):
|
||||
[
|
||||
"Frage 1?",
|
||||
"Frage 2?",
|
||||
"Frage 3?"
|
||||
"Spezifische Frage 1?",
|
||||
"Spezifische Frage 2?",
|
||||
"Spezifische Frage 3?"
|
||||
]
|
||||
\`\`\`
|
||||
|
||||
Nutzer-Eingabe:
|
||||
NUTZER-EINGABE:
|
||||
${input}
|
||||
`.trim();
|
||||
}
|
||||
|
||||
|
||||
export const POST: APIRoute = async ({ request }) => {
|
||||
try {
|
||||
const authResult = await withAPIAuth(request, 'ai');
|
||||
@ -98,12 +111,12 @@ export const POST: APIRoute = async ({ request }) => {
|
||||
const body = await request.json();
|
||||
const { input } = body;
|
||||
|
||||
if (!input || typeof input !== 'string' || input.length < 20) {
|
||||
return apiError.badRequest('Input too short for enhancement');
|
||||
if (!input || typeof input !== 'string' || input.length < 40) {
|
||||
return apiError.badRequest('Input too short for enhancement (minimum 40 characters)');
|
||||
}
|
||||
|
||||
const sanitizedInput = sanitizeInput(input);
|
||||
if (sanitizedInput.length < 20) {
|
||||
if (sanitizedInput.length < 40) {
|
||||
return apiError.badRequest('Input too short after sanitization');
|
||||
}
|
||||
|
||||
@ -111,11 +124,11 @@ export const POST: APIRoute = async ({ request }) => {
|
||||
const taskId = `enhance_${userId}_${Date.now()}_${Math.random().toString(36).substr(2, 4)}`;
|
||||
|
||||
const aiResponse = await enqueueApiCall(() =>
|
||||
fetch(process.env.AI_API_ENDPOINT + '/v1/chat/completions', {
|
||||
fetch(`${AI_ENDPOINT}/v1/chat/completions`, {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
'Authorization': `Bearer ${process.env.AI_API_KEY}`
|
||||
'Authorization': `Bearer ${AI_API_KEY}`
|
||||
},
|
||||
body: JSON.stringify({
|
||||
model: AI_MODEL,
|
||||
@ -125,7 +138,7 @@ export const POST: APIRoute = async ({ request }) => {
|
||||
content: systemPrompt
|
||||
}
|
||||
],
|
||||
max_tokens: 200,
|
||||
max_tokens: 300,
|
||||
temperature: 0.7
|
||||
})
|
||||
}), taskId);
|
||||
@ -144,28 +157,32 @@ export const POST: APIRoute = async ({ request }) => {
|
||||
|
||||
let questions;
|
||||
try {
|
||||
const cleanedContent = aiContent
|
||||
const cleanedContent = aiContent
|
||||
.replace(/^```json\s*/i, '')
|
||||
.replace(/\s*```\s*$/, '')
|
||||
.trim();
|
||||
questions = JSON.parse(cleanedContent);
|
||||
questions = JSON.parse(cleanedContent);
|
||||
|
||||
if (!Array.isArray(questions) || questions.length === 0) {
|
||||
throw new Error('Invalid questions format');
|
||||
if (!Array.isArray(questions)) {
|
||||
throw new Error('Response is not an array');
|
||||
}
|
||||
|
||||
// Validate and clean questions
|
||||
// Enhanced validation and cleaning
|
||||
questions = questions
|
||||
.filter(q => typeof q === 'string' && q.length > 5 && q.length < 120)
|
||||
.slice(0, 3);
|
||||
.filter(q => typeof q === 'string' && q.length > 10 && q.length < 150) // More reasonable length limits
|
||||
.filter(q => q.includes('?')) // Must be a question
|
||||
.map(q => q.trim())
|
||||
.slice(0, 3); // Max 3 questions
|
||||
|
||||
// If no valid questions, return empty array (means input is complete)
|
||||
if (questions.length === 0) {
|
||||
throw new Error('No valid questions found');
|
||||
questions = [];
|
||||
}
|
||||
|
||||
} catch (error) {
|
||||
console.error('Failed to parse enhancement response:', aiContent);
|
||||
return apiServerError.unavailable('Invalid enhancement response format');
|
||||
// If parsing fails, assume input is complete enough
|
||||
questions = [];
|
||||
}
|
||||
|
||||
console.log(`[AI Enhancement] User: ${userId}, Questions: ${questions.length}, Input length: ${sanitizedInput.length}`);
|
||||
@ -173,7 +190,8 @@ export const POST: APIRoute = async ({ request }) => {
|
||||
return new Response(JSON.stringify({
|
||||
success: true,
|
||||
questions,
|
||||
taskId
|
||||
taskId,
|
||||
inputComplete: questions.length === 0 // Flag to indicate if input seems complete
|
||||
}), {
|
||||
status: 200,
|
||||
headers: { 'Content-Type': 'application/json' }
|
||||
|
@ -1,21 +1,12 @@
|
||||
// src/pages/api/ai/query.ts
|
||||
import type { APIRoute } from 'astro';
|
||||
import { withAPIAuth } from '../../../utils/auth.js';
|
||||
import { getCompressedToolsDataForAI } from '../../../utils/dataService.js';
|
||||
import { apiError, apiServerError, createAuthErrorResponse } from '../../../utils/api.js';
|
||||
import { enqueueApiCall } from '../../../utils/rateLimitedQueue.js';
|
||||
import { aiPipeline } from '../../../utils/aiPipeline.js';
|
||||
|
||||
export const prerender = false;
|
||||
|
||||
function getEnv(key: string): string {
|
||||
const value = process.env[key];
|
||||
if (!value) {
|
||||
throw new Error(`Missing environment variable: ${key}`);
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
const AI_MODEL = getEnv('AI_MODEL');
|
||||
const rateLimitStore = new Map<string, { count: number; resetTime: number }>();
|
||||
const RATE_LIMIT_WINDOW = 60 * 1000;
|
||||
const RATE_LIMIT_MAX = 10;
|
||||
@ -33,13 +24,6 @@ function sanitizeInput(input: string): string {
|
||||
return sanitized;
|
||||
}
|
||||
|
||||
function stripMarkdownJson(content: string): string {
|
||||
return content
|
||||
.replace(/^```json\s*/i, '')
|
||||
.replace(/\s*```\s*$/, '')
|
||||
.trim();
|
||||
}
|
||||
|
||||
function checkRateLimit(userId: string): boolean {
|
||||
const now = Date.now();
|
||||
const userLimit = rateLimitStore.get(userId);
|
||||
@ -68,209 +52,6 @@ function cleanupExpiredRateLimits() {
|
||||
|
||||
setInterval(cleanupExpiredRateLimits, 5 * 60 * 1000);
|
||||
|
||||
async function loadToolsDatabase() {
|
||||
try {
|
||||
return await getCompressedToolsDataForAI();
|
||||
} catch (error) {
|
||||
console.error('Failed to load tools database:', error);
|
||||
throw new Error('Database unavailable');
|
||||
}
|
||||
}
|
||||
|
||||
function createWorkflowSystemPrompt(toolsData: any): string {
|
||||
const toolsList = toolsData.tools.map((tool: any) => ({
|
||||
name: tool.name,
|
||||
description: tool.description,
|
||||
domains: tool.domains,
|
||||
phases: tool.phases,
|
||||
domainAgnostic: tool['domain-agnostic-software'],
|
||||
platforms: tool.platforms,
|
||||
skillLevel: tool.skillLevel,
|
||||
license: tool.license,
|
||||
tags: tool.tags,
|
||||
related_concepts: tool.related_concepts || []
|
||||
}));
|
||||
|
||||
const conceptsList = toolsData.concepts.map((concept: any) => ({
|
||||
name: concept.name,
|
||||
description: concept.description,
|
||||
domains: concept.domains,
|
||||
phases: concept.phases,
|
||||
skillLevel: concept.skillLevel,
|
||||
tags: concept.tags
|
||||
}));
|
||||
|
||||
const regularPhases = toolsData.phases || [];
|
||||
|
||||
const domainAgnosticSoftware = toolsData['domain-agnostic-software'] || [];
|
||||
|
||||
const allPhaseItems = [
|
||||
...regularPhases,
|
||||
...domainAgnosticSoftware
|
||||
];
|
||||
|
||||
const phasesDescription = allPhaseItems.map((phase: any) =>
|
||||
`- ${phase.id}: ${phase.name}`
|
||||
).join('\n');
|
||||
|
||||
const domainsDescription = toolsData.domains.map((domain: any) =>
|
||||
`- ${domain.id}: ${domain.name}`
|
||||
).join('\n');
|
||||
|
||||
const phaseDescriptions = regularPhases.map((phase: any) =>
|
||||
`- ${phase.name}: ${phase.description || 'Tools/Methods for this phase'}`
|
||||
).join('\n');
|
||||
|
||||
const domainAgnosticDescriptions = domainAgnosticSoftware.map((section: any) =>
|
||||
`- ${section.name}: ${section.description || 'Cross-cutting software and platforms'}`
|
||||
).join('\n');
|
||||
|
||||
const validPhases = [
|
||||
...regularPhases.map((p: any) => p.id),
|
||||
...domainAgnosticSoftware.map((s: any) => s.id)
|
||||
].join('|');
|
||||
|
||||
return `Du bist ein DFIR (Digital Forensics and Incident Response) Experte, der Ermittlern bei der Auswahl von Software und Methoden hilft.
|
||||
|
||||
VERFÜGBARE TOOLS/METHODEN:
|
||||
${JSON.stringify(toolsList, null, 2)}
|
||||
|
||||
VERFÜGBARE HINTERGRUNDWISSEN-KONZEPTE:
|
||||
${JSON.stringify(conceptsList, null, 2)}
|
||||
|
||||
UNTERSUCHUNGSPHASEN (NIST Framework):
|
||||
${phasesDescription}
|
||||
|
||||
FORENSISCHE DOMÄNEN:
|
||||
${domainsDescription}
|
||||
|
||||
WICHTIGE REGELN:
|
||||
1. Pro Phase 2-3 Tools/Methoden empfehlen (immer mindestens 2 wenn verfügbar)
|
||||
2. Tools/Methoden können in MEHREREN Phasen empfohlen werden wenn sinnvoll - versuche ein Tool/Methode für jede Phase zu empfehlen, selbst wenn die Priorität "low" ist.
|
||||
3. Für Reporting-Phase: Visualisierungs- und Dokumentationssoftware einschließen
|
||||
4. Gib stets dem spezieller für den Fall geeigneten Werkzeug den Vorzug.
|
||||
5. Deutsche Antworten für deutsche Anfragen, English for English queries
|
||||
6. Methoden haben, sofern für das SZENARIO passend, IMMER Vorrang vor Software.
|
||||
7. Bevorzuge alles, was nicht proprietär ist (license != "Proprietary"), aber erkenne an, wenn proprietäre Software besser geeignet ist.
|
||||
8. WICHTIG: Erwähne relevante Hintergrundwissen-Konzepte wenn Tools verwendet werden, die related_concepts haben
|
||||
9. Konzepte sind NICHT Tools - empfehle sie nicht als actionable Schritte, sondern als Wissensbasis
|
||||
|
||||
ENHANCED CONTEXTUAL ANALYSIS:
|
||||
10. Analysiere das Szenario detailliert und identifiziere Schlüsselelemente, Bedrohungen und forensische Herausforderungen
|
||||
11. Entwickle einen strategischen Untersuchungsansatz basierend auf dem spezifischen Szenario
|
||||
12. Identifiziere zeitkritische oder besonders wichtige Faktoren für diesen Fall
|
||||
|
||||
SOFTWARE/METHODEN-AUSWAHL NACH PHASE:
|
||||
${phaseDescriptions}
|
||||
|
||||
DOMÄNENAGNOSTISCHE SOFTWARE/METHODEN:
|
||||
${domainAgnosticDescriptions}
|
||||
|
||||
ANTWORT-FORMAT (strict JSON):
|
||||
{
|
||||
"scenario_analysis": "Detaillierte Analyse des Szenarios: Erkannte Schlüsselelemente, Art des Vorfalls, betroffene Systeme, potentielle Bedrohungen und forensische Herausforderungen",
|
||||
"investigation_approach": "Strategischer Untersuchungsansatz für dieses spezifische Szenario: Prioritäten, Reihenfolge der Phasen, besondere Überlegungen",
|
||||
"critical_considerations": "Zeitkritische Faktoren, wichtige Sicherheitsaspekte oder besondere Vorsichtsmaßnahmen für diesen Fall",
|
||||
"recommended_tools": [
|
||||
{
|
||||
"name": "EXAKTER Name aus der Tools-Database",
|
||||
"priority": "high|medium|low",
|
||||
"phase": "${validPhases}",
|
||||
"justification": "Warum diese Methode für diese Phase und dieses spezifische Szenario geeignet ist - mit Bezug zu den erkannten Schlüsselelementen"
|
||||
}
|
||||
],
|
||||
"workflow_suggestion": "Vorgeschlagener Untersuchungsablauf mit konkreten Schritten für dieses Szenario",
|
||||
"background_knowledge": [
|
||||
{
|
||||
"concept_name": "EXAKTER Name aus der Konzepte-Database",
|
||||
"relevance": "Warum dieses Konzept für das Szenario relevant ist, und bei welchen der empfohlenen Methoden/Tools."
|
||||
}
|
||||
],
|
||||
"additional_notes": "Wichtige Überlegungen und Hinweise"
|
||||
}
|
||||
|
||||
Antworte NUR mit validen JSON. Keine zusätzlichen Erklärungen außerhalb des JSON.`;
|
||||
}
|
||||
|
||||
function createToolSystemPrompt(toolsData: any): string {
|
||||
const toolsList = toolsData.tools.map((tool: any) => ({
|
||||
name: tool.name,
|
||||
description: tool.description,
|
||||
domains: tool.domains,
|
||||
phases: tool.phases,
|
||||
platforms: tool.platforms,
|
||||
skillLevel: tool.skillLevel,
|
||||
license: tool.license,
|
||||
tags: tool.tags,
|
||||
url: tool.url,
|
||||
projectUrl: tool.projectUrl,
|
||||
related_concepts: tool.related_concepts || []
|
||||
}));
|
||||
|
||||
const conceptsList = toolsData.concepts.map((concept: any) => ({
|
||||
name: concept.name,
|
||||
description: concept.description,
|
||||
domains: concept.domains,
|
||||
phases: concept.phases,
|
||||
skillLevel: concept.skillLevel,
|
||||
tags: concept.tags
|
||||
}));
|
||||
|
||||
return `Du bist ein DFIR (Digital Forensics and Incident Response) Experte, der bei der Auswahl spezifischer Software/Methoden für konkrete Probleme hilft.
|
||||
|
||||
VERFÜGBARE TOOLS/METHODEN:
|
||||
${JSON.stringify(toolsList, null, 2)}
|
||||
|
||||
VERFÜGBARE HINTERGRUNDWISSEN-KONZEPTE:
|
||||
${JSON.stringify(conceptsList, null, 2)}
|
||||
|
||||
WICHTIGE REGELN:
|
||||
1. Analysiere das spezifische Problem/die Anforderung sorgfältig
|
||||
2. Empfehle 1-3 Methoden/Tools, sortiert nach Eignung (beste Empfehlung zuerst)
|
||||
3. Gib detaillierte Erklärungen, WARUM und WIE jede Methode/Tool das Problem löst
|
||||
4. Berücksichtige praktische Aspekte: Skill Level, Plattformen, Verfügbarkeit
|
||||
5. Deutsche Antworten für deutsche Anfragen, English for English queries
|
||||
6. Gib konkrete Anwendungshinweise, nicht nur allgemeine Beschreibungen - Methoden haben, sofern für das SZENARIO passend, IMMER Vorrang vor Software.
|
||||
7. Erwähne sowohl Stärken als auch Schwächen/Limitationen
|
||||
8. Schlage alternative Ansätze vor, wenn sinnvoll
|
||||
9. Gib grundsätzliche Hinweise, WIE die Methode/Tool konkret eingesetzt wird
|
||||
10. WICHTIG: Erwähne relevante Hintergrundwissen-Konzepte wenn Tools verwendet werden, die related_concepts haben
|
||||
11. Konzepte sind NICHT Tools - empfehle sie nicht als actionable Schritte, sondern als Wissensbasis
|
||||
|
||||
ENHANCED CONTEXTUAL ANALYSIS:
|
||||
12. Analysiere das Problem detailliert und identifiziere technische Anforderungen, Herausforderungen und Erfolgsfaktoren
|
||||
13. Entwickle einen strategischen Lösungsansatz basierend auf dem spezifischen Problem
|
||||
14. Identifiziere wichtige Voraussetzungen oder Warnungen für die Anwendung
|
||||
|
||||
ANTWORT-FORMAT (strict JSON):
|
||||
{
|
||||
"problem_analysis": "Detaillierte Analyse des Problems: Erkannte technische Anforderungen, Herausforderungen, benötigte Fähigkeiten und Erfolgsfaktoren",
|
||||
"investigation_approach": "Strategischer Lösungsansatz für dieses spezifische Problem: Herangehensweise, Prioritäten, optimale Anwendungsreihenfolge",
|
||||
"critical_considerations": "Wichtige Voraussetzungen, potentielle Fallstricke oder Warnungen für die Anwendung der empfohlenen Lösungen",
|
||||
"recommended_tools": [
|
||||
{
|
||||
"name": "EXAKTER Name aus der Tools-Database",
|
||||
"rank": 1,
|
||||
"suitability_score": "high|medium|low",
|
||||
"detailed_explanation": "Detaillierte Erklärung, warum dieses Tool/diese Methode das spezifische Problem löst - mit Bezug zu den erkannten Anforderungen",
|
||||
"implementation_approach": "Konkrete Schritte/Ansatz zur Anwendung für dieses spezifische Problem",
|
||||
"pros": ["Spezifische Vorteile für diesen Anwendungsfall", "Weitere Vorteile"],
|
||||
"cons": ["Potentielle Nachteile oder Limitationen", "Weitere Einschränkungen"],
|
||||
"alternatives": "Alternative Ansätze oder ergänzende Tools/Methoden, falls relevant"
|
||||
}
|
||||
],
|
||||
"background_knowledge": [
|
||||
{
|
||||
"concept_name": "EXAKTER Name aus der Konzepte-Database",
|
||||
"relevance": "Warum dieses Konzept für die empfohlenen Tools/das Problem relevant ist, und für welche der empfohlenen Methoden/Tools."
|
||||
}
|
||||
],
|
||||
"additional_considerations": "Wichtige Überlegungen, Voraussetzungen oder Warnungen"
|
||||
}
|
||||
|
||||
Antworte NUR mit validen JSON. Keine zusätzlichen Erklärungen außerhalb des JSON.`;
|
||||
}
|
||||
|
||||
export const POST: APIRoute = async ({ request }) => {
|
||||
try {
|
||||
const authResult = await withAPIAuth(request, 'ai');
|
||||
@ -287,7 +68,6 @@ export const POST: APIRoute = async ({ request }) => {
|
||||
const body = await request.json();
|
||||
const { query, mode = 'workflow', taskId: clientTaskId } = body;
|
||||
|
||||
// ADD THIS DEBUG LOGGING
|
||||
console.log(`[AI API] Received request - TaskId: ${clientTaskId}, Mode: ${mode}, Query length: ${query?.length || 0}`);
|
||||
|
||||
if (!query || typeof query !== 'string') {
|
||||
@ -306,128 +86,31 @@ export const POST: APIRoute = async ({ request }) => {
|
||||
return apiError.badRequest('Invalid input detected');
|
||||
}
|
||||
|
||||
const toolsData = await loadToolsDatabase();
|
||||
|
||||
const systemPrompt = mode === 'workflow'
|
||||
? createWorkflowSystemPrompt(toolsData)
|
||||
: createToolSystemPrompt(toolsData);
|
||||
|
||||
const taskId = clientTaskId || `ai_${userId}_${Date.now()}_${Math.random().toString(36).substr(2, 6)}`;
|
||||
|
||||
console.log(`[AI API] About to enqueue task ${taskId}`);
|
||||
|
||||
|
||||
const aiResponse = await enqueueApiCall(() =>
|
||||
fetch(process.env.AI_API_ENDPOINT + '/v1/chat/completions', {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
'Authorization': `Bearer ${process.env.AI_API_KEY}`
|
||||
},
|
||||
body: JSON.stringify({
|
||||
model: AI_MODEL,
|
||||
messages: [
|
||||
{
|
||||
role: 'system',
|
||||
content: systemPrompt
|
||||
},
|
||||
{
|
||||
role: 'user',
|
||||
content: sanitizedQuery
|
||||
}
|
||||
],
|
||||
max_tokens: 3500,
|
||||
temperature: 0.3
|
||||
})
|
||||
})
|
||||
// Use the new AI pipeline instead of direct API calls
|
||||
const result = await enqueueApiCall(() =>
|
||||
aiPipeline.processQuery(sanitizedQuery, mode)
|
||||
, taskId);
|
||||
|
||||
if (!aiResponse.ok) {
|
||||
console.error('AI API error:', await aiResponse.text());
|
||||
return apiServerError.unavailable('AI service unavailable');
|
||||
if (!result || !result.recommendation) {
|
||||
return apiServerError.unavailable('No response from AI pipeline');
|
||||
}
|
||||
|
||||
const aiData = await aiResponse.json();
|
||||
const aiContent = aiData.choices?.[0]?.message?.content;
|
||||
|
||||
if (!aiContent) {
|
||||
return apiServerError.unavailable('No response from AI');
|
||||
}
|
||||
|
||||
let recommendation;
|
||||
try {
|
||||
const cleanedContent = stripMarkdownJson(aiContent);
|
||||
recommendation = JSON.parse(cleanedContent);
|
||||
} catch (error) {
|
||||
console.error('Failed to parse AI response:', aiContent);
|
||||
return apiServerError.unavailable('Invalid AI response format');
|
||||
}
|
||||
|
||||
const validToolNames = new Set(toolsData.tools.map((t: any) => t.name));
|
||||
const validConceptNames = new Set(toolsData.concepts.map((c: any) => c.name));
|
||||
|
||||
let validatedRecommendation;
|
||||
|
||||
if (mode === 'workflow') {
|
||||
validatedRecommendation = {
|
||||
...recommendation,
|
||||
// Ensure all new fields are included with fallbacks
|
||||
scenario_analysis: recommendation.scenario_analysis || recommendation.problem_analysis || '',
|
||||
investigation_approach: recommendation.investigation_approach || '',
|
||||
critical_considerations: recommendation.critical_considerations || '',
|
||||
recommended_tools: recommendation.recommended_tools?.filter((tool: any) => {
|
||||
if (!validToolNames.has(tool.name)) {
|
||||
console.warn(`AI recommended unknown tool: ${tool.name}`);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}) || [],
|
||||
background_knowledge: recommendation.background_knowledge?.filter((concept: any) => {
|
||||
if (!validConceptNames.has(concept.concept_name)) {
|
||||
console.warn(`AI referenced unknown concept: ${concept.concept_name}`);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}) || []
|
||||
};
|
||||
} else {
|
||||
validatedRecommendation = {
|
||||
...recommendation,
|
||||
// Ensure all new fields are included with fallbacks
|
||||
problem_analysis: recommendation.problem_analysis || recommendation.scenario_analysis || '',
|
||||
investigation_approach: recommendation.investigation_approach || '',
|
||||
critical_considerations: recommendation.critical_considerations || '',
|
||||
recommended_tools: recommendation.recommended_tools?.filter((tool: any) => {
|
||||
if (!validToolNames.has(tool.name)) {
|
||||
console.warn(`AI recommended unknown tool: ${tool.name}`);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}).map((tool: any, index: number) => ({
|
||||
...tool,
|
||||
rank: tool.rank || (index + 1),
|
||||
suitability_score: tool.suitability_score || 'medium',
|
||||
pros: Array.isArray(tool.pros) ? tool.pros : [],
|
||||
cons: Array.isArray(tool.cons) ? tool.cons : []
|
||||
})) || [],
|
||||
background_knowledge: recommendation.background_knowledge?.filter((concept: any) => {
|
||||
if (!validConceptNames.has(concept.concept_name)) {
|
||||
console.warn(`AI referenced unknown concept: ${concept.concept_name}`);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}) || []
|
||||
};
|
||||
}
|
||||
|
||||
console.log(`[AI Query] Mode: ${mode}, User: ${userId}, Query length: ${sanitizedQuery.length}, Tools: ${validatedRecommendation.recommended_tools.length}, Concepts: ${validatedRecommendation.background_knowledge?.length || 0}`);
|
||||
// Add processing statistics to the response for debugging/monitoring
|
||||
console.log(`[AI Query] Mode: ${mode}, User: ${userId}, Query length: ${sanitizedQuery.length}`);
|
||||
console.log(`[AI Query] Processing stats:`, result.processingStats);
|
||||
console.log(`[AI Query] Tools: ${result.recommendation.recommended_tools?.length || 0}, Concepts: ${result.recommendation.background_knowledge?.length || 0}`);
|
||||
|
||||
return new Response(JSON.stringify({
|
||||
success: true,
|
||||
mode,
|
||||
taskId,
|
||||
recommendation: validatedRecommendation,
|
||||
query: sanitizedQuery
|
||||
recommendation: result.recommendation,
|
||||
query: sanitizedQuery,
|
||||
processingStats: result.processingStats // Include stats for monitoring
|
||||
}), {
|
||||
status: 200,
|
||||
headers: { 'Content-Type': 'application/json' }
|
||||
@ -435,6 +118,16 @@ export const POST: APIRoute = async ({ request }) => {
|
||||
|
||||
} catch (error) {
|
||||
console.error('AI query error:', error);
|
||||
return apiServerError.internal('Internal server error');
|
||||
|
||||
// Provide more specific error messages based on error type
|
||||
if (error.message.includes('embeddings')) {
|
||||
return apiServerError.unavailable('Embeddings service error - falling back to basic processing');
|
||||
} else if (error.message.includes('selector')) {
|
||||
return apiServerError.unavailable('AI selector service error');
|
||||
} else if (error.message.includes('analyzer')) {
|
||||
return apiServerError.unavailable('AI analyzer service error');
|
||||
} else {
|
||||
return apiServerError.internal('Internal server error');
|
||||
}
|
||||
}
|
||||
};
|
521
src/utils/aiPipeline.ts
Normal file
521
src/utils/aiPipeline.ts
Normal file
@ -0,0 +1,521 @@
|
||||
// src/utils/aiPipeline.ts
|
||||
import { getCompressedToolsDataForAI } from './dataService.js';
|
||||
import { embeddingsService, type EmbeddingData } from './embeddings.js';
|
||||
|
||||
interface AIConfig {
|
||||
endpoint: string;
|
||||
apiKey: string;
|
||||
model: string;
|
||||
}
|
||||
|
||||
interface SelectionResult {
|
||||
selectedTools: string[];
|
||||
selectedConcepts: string[];
|
||||
reasoning: string;
|
||||
}
|
||||
|
||||
interface AnalysisResult {
|
||||
recommendation: any;
|
||||
processingStats: {
|
||||
embeddingsUsed: boolean;
|
||||
candidatesFromEmbeddings: number;
|
||||
finalSelectedItems: number;
|
||||
processingTimeMs: number;
|
||||
};
|
||||
}
|
||||
|
||||
class AIProcessingPipeline {
|
||||
private selectorConfig: AIConfig;
|
||||
private analyzerConfig: AIConfig;
|
||||
private maxSelectedItems: number;
|
||||
private embeddingCandidates: number;
|
||||
private similarityThreshold: number;
|
||||
|
||||
constructor() {
|
||||
this.selectorConfig = {
|
||||
endpoint: this.getEnv('AI_SELECTOR_ENDPOINT'),
|
||||
apiKey: this.getEnv('AI_SELECTOR_API_KEY'),
|
||||
model: this.getEnv('AI_SELECTOR_MODEL')
|
||||
};
|
||||
|
||||
this.analyzerConfig = {
|
||||
endpoint: this.getEnv('AI_ANALYZER_ENDPOINT'),
|
||||
apiKey: this.getEnv('AI_ANALYZER_API_KEY'),
|
||||
model: this.getEnv('AI_ANALYZER_MODEL')
|
||||
};
|
||||
|
||||
this.maxSelectedItems = parseInt(process.env.AI_MAX_SELECTED_ITEMS || '15', 10);
|
||||
this.embeddingCandidates = parseInt(process.env.AI_EMBEDDING_CANDIDATES || '30', 10);
|
||||
this.similarityThreshold = parseFloat(process.env.AI_SIMILARITY_THRESHOLD || '0.3');
|
||||
}
|
||||
|
||||
private getEnv(key: string): string {
|
||||
const value = process.env[key];
|
||||
if (!value) {
|
||||
throw new Error(`Missing environment variable: ${key}`);
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
private async callAI(config: AIConfig, messages: any[], maxTokens: number = 1000): Promise<string> {
|
||||
const response = await fetch(`${config.endpoint}/v1/chat/completions`, {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
'Authorization': `Bearer ${config.apiKey}`
|
||||
},
|
||||
body: JSON.stringify({
|
||||
model: config.model,
|
||||
messages,
|
||||
max_tokens: maxTokens,
|
||||
temperature: 0.3
|
||||
})
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
const errorText = await response.text();
|
||||
throw new Error(`AI API error (${config.model}): ${response.status} - ${errorText}`);
|
||||
}
|
||||
|
||||
const data = await response.json();
|
||||
const content = data.choices?.[0]?.message?.content;
|
||||
|
||||
if (!content) {
|
||||
throw new Error(`No response from AI model: ${config.model}`);
|
||||
}
|
||||
|
||||
return content;
|
||||
}
|
||||
|
||||
private createSelectorPrompt(toolsData: any, userQuery: string, mode: string): string {
|
||||
const toolsList = toolsData.tools.map((tool: any) => ({
|
||||
name: tool.name,
|
||||
type: tool.type,
|
||||
description: tool.description.slice(0, 200) + '...',
|
||||
domains: tool.domains,
|
||||
phases: tool.phases,
|
||||
tags: tool.tags?.slice(0, 5) || [],
|
||||
skillLevel: tool.skillLevel
|
||||
}));
|
||||
|
||||
const conceptsList = toolsData.concepts.map((concept: any) => ({
|
||||
name: concept.name,
|
||||
type: 'concept',
|
||||
description: concept.description.slice(0, 200) + '...',
|
||||
domains: concept.domains,
|
||||
phases: concept.phases,
|
||||
tags: concept.tags?.slice(0, 5) || []
|
||||
}));
|
||||
|
||||
const modeInstruction = mode === 'workflow'
|
||||
? 'The user wants a COMPREHENSIVE WORKFLOW with multiple tools/methods across different phases.'
|
||||
: 'The user wants SPECIFIC TOOLS/METHODS that directly solve their particular problem.';
|
||||
|
||||
return `You are a DFIR expert tasked with selecting the most relevant tools and concepts for a user query.
|
||||
|
||||
${modeInstruction}
|
||||
|
||||
AVAILABLE TOOLS:
|
||||
${JSON.stringify(toolsList, null, 2)}
|
||||
|
||||
AVAILABLE CONCEPTS:
|
||||
${JSON.stringify(conceptsList, null, 2)}
|
||||
|
||||
USER QUERY: "${userQuery}"
|
||||
|
||||
Select the most relevant items (max ${this.maxSelectedItems} total). For workflow mode, prioritize breadth across phases. For tool mode, prioritize specificity and direct relevance.
|
||||
|
||||
Respond with ONLY this JSON format:
|
||||
{
|
||||
"selectedTools": ["Tool Name 1", "Tool Name 2", ...],
|
||||
"selectedConcepts": ["Concept Name 1", "Concept Name 2", ...],
|
||||
"reasoning": "Brief explanation of selection criteria and approach"
|
||||
}`;
|
||||
}
|
||||
|
||||
private async selectRelevantItems(toolsData: any, userQuery: string, mode: string): Promise<SelectionResult> {
|
||||
const prompt = this.createSelectorPrompt(toolsData, userQuery, mode);
|
||||
|
||||
const messages = [
|
||||
{ role: 'user', content: prompt }
|
||||
];
|
||||
|
||||
const response = await this.callAI(this.selectorConfig, messages, 1500);
|
||||
|
||||
try {
|
||||
const cleaned = response.replace(/^```json\s*/i, '').replace(/\s*```\s*$/g, '').trim();
|
||||
const result = JSON.parse(cleaned);
|
||||
|
||||
// Validate the structure
|
||||
if (!Array.isArray(result.selectedTools) || !Array.isArray(result.selectedConcepts)) {
|
||||
throw new Error('Invalid selection result structure');
|
||||
}
|
||||
|
||||
// Limit selections
|
||||
const totalSelected = result.selectedTools.length + result.selectedConcepts.length;
|
||||
if (totalSelected > this.maxSelectedItems) {
|
||||
console.warn(`[AI PIPELINE] Selection exceeded limit (${totalSelected}), truncating`);
|
||||
result.selectedTools = result.selectedTools.slice(0, Math.floor(this.maxSelectedItems * 0.8));
|
||||
result.selectedConcepts = result.selectedConcepts.slice(0, Math.ceil(this.maxSelectedItems * 0.2));
|
||||
}
|
||||
|
||||
return result;
|
||||
} catch (error) {
|
||||
console.error('[AI PIPELINE] Failed to parse selector response:', response);
|
||||
throw new Error('Invalid JSON response from selector AI');
|
||||
}
|
||||
}
|
||||
|
||||
// Reduce the full dataset to only the tools/concepts chosen by the selector.
// Taxonomy fields (domains, phases, domain-agnostic software) pass through unchanged.
private filterDataBySelection(toolsData: any, selection: SelectionResult): any {
  const keepTools = new Set(selection.selectedTools);
  const keepConcepts = new Set(selection.selectedConcepts);

  const tools = toolsData.tools.filter((t: any) => keepTools.has(t.name));
  const concepts = toolsData.concepts.filter((c: any) => keepConcepts.has(c.name));

  return {
    tools,
    concepts,
    domains: toolsData.domains,
    phases: toolsData.phases,
    'domain-agnostic-software': toolsData['domain-agnostic-software']
  };
}
|
||||
|
||||
private async processWithEmbeddings(userQuery: string, toolsData: any, mode: string): Promise<{ filteredData: any; stats: any }> {
|
||||
console.log('[AI PIPELINE] Using embeddings for initial filtering');
|
||||
|
||||
const similarItems = await embeddingsService.findSimilar(
|
||||
userQuery,
|
||||
this.embeddingCandidates,
|
||||
this.similarityThreshold
|
||||
);
|
||||
|
||||
if (similarItems.length === 0) {
|
||||
console.log('[AI PIPELINE] No similar items found with embeddings, using full dataset');
|
||||
return {
|
||||
filteredData: toolsData,
|
||||
stats: { embeddingsUsed: true, candidatesFromEmbeddings: 0, fallbackToFull: true }
|
||||
};
|
||||
}
|
||||
|
||||
// Create filtered dataset from embedding results
|
||||
const similarToolNames = new Set();
|
||||
const similarConceptNames = new Set();
|
||||
|
||||
similarItems.forEach(item => {
|
||||
if (item.type === 'tool') {
|
||||
similarToolNames.add(item.name);
|
||||
} else if (item.type === 'concept') {
|
||||
similarConceptNames.add(item.name);
|
||||
}
|
||||
});
|
||||
|
||||
const embeddingFilteredData = {
|
||||
tools: toolsData.tools.filter((tool: any) => similarToolNames.has(tool.name)),
|
||||
concepts: toolsData.concepts.filter((concept: any) => similarConceptNames.has(concept.name)),
|
||||
domains: toolsData.domains,
|
||||
phases: toolsData.phases,
|
||||
'domain-agnostic-software': toolsData['domain-agnostic-software']
|
||||
};
|
||||
|
||||
console.log(`[AI PIPELINE] Embeddings filtered to ${embeddingFilteredData.tools.length} tools, ${embeddingFilteredData.concepts.length} concepts`);
|
||||
|
||||
return {
|
||||
filteredData: embeddingFilteredData,
|
||||
stats: { embeddingsUsed: true, candidatesFromEmbeddings: similarItems.length }
|
||||
};
|
||||
}
|
||||
|
||||
private async processWithoutEmbeddings(userQuery: string, toolsData: any, mode: string): Promise<{ filteredData: any; stats: any }> {
|
||||
console.log('[AI PIPELINE] Processing without embeddings - using selector AI');
|
||||
|
||||
const selection = await this.selectRelevantItems(toolsData, userQuery, mode);
|
||||
const filteredData = this.filterDataBySelection(toolsData, selection);
|
||||
|
||||
console.log(`[AI PIPELINE] Selector chose ${selection.selectedTools.length} tools, ${selection.selectedConcepts.length} concepts`);
|
||||
console.log(`[AI PIPELINE] Selection reasoning: ${selection.reasoning}`);
|
||||
|
||||
return {
|
||||
filteredData,
|
||||
stats: { embeddingsUsed: false, candidatesFromEmbeddings: 0, selectorReasoning: selection.reasoning }
|
||||
};
|
||||
}
|
||||
|
||||
// Dispatch to the mode-specific analyzer prompt builder; both receive the same
// pre-filtered dataset.
private createAnalyzerPrompt(filteredData: any, userQuery: string, mode: string): string {
  return mode === 'workflow'
    ? this.createWorkflowAnalyzerPrompt(filteredData, userQuery)
    : this.createToolAnalyzerPrompt(filteredData, userQuery);
}
|
||||
|
||||
// Build the analyzer prompt for WORKFLOW mode: the model receives the pre-filtered
// tools/concepts plus the phase/domain taxonomy and must answer with a strict-JSON,
// phase-structured investigation plan. Prompt rules are intentionally in German;
// rule 5 instructs the model to match the query language.
private createWorkflowAnalyzerPrompt(toolsData: any, userQuery: string): string {
  // Full tool records — the analyzer needs descriptions and related_concepts
  // to justify its recommendations.
  const toolsList = toolsData.tools.map((tool: any) => ({
    name: tool.name,
    description: tool.description,
    domains: tool.domains,
    phases: tool.phases,
    domainAgnostic: tool['domain-agnostic-software'],
    platforms: tool.platforms,
    skillLevel: tool.skillLevel,
    license: tool.license,
    tags: tool.tags,
    related_concepts: tool.related_concepts || []
  }));

  const conceptsList = toolsData.concepts.map((concept: any) => ({
    name: concept.name,
    description: concept.description,
    domains: concept.domains,
    phases: concept.phases,
    skillLevel: concept.skillLevel,
    tags: concept.tags
  }));

  // Domain-agnostic software entries are treated as additional "phases" for the
  // prompt's phase list and for the valid phase-id enumeration below.
  const regularPhases = toolsData.phases || [];
  const domainAgnosticSoftware = toolsData['domain-agnostic-software'] || [];
  const allPhaseItems = [...regularPhases, ...domainAgnosticSoftware];

  const phasesDescription = allPhaseItems.map((phase: any) =>
    `- ${phase.id}: ${phase.name}`
  ).join('\n');

  const domainsDescription = toolsData.domains.map((domain: any) =>
    `- ${domain.id}: ${domain.name}`
  ).join('\n');

  // Pipe-separated id list embedded in the JSON schema as the allowed "phase" values.
  const validPhases = [...regularPhases.map((p: any) => p.id), ...domainAgnosticSoftware.map((s: any) => s.id)].join('|');

  return `Du bist ein DFIR (Digital Forensics and Incident Response) Experte. Du erhältst eine vorgefilterte Auswahl relevanter Tools und Konzepte und sollst daraus eine optimale Empfehlung erstellen.

VERFÜGBARE TOOLS/METHODEN (VORGEFILTERT):
${JSON.stringify(toolsList, null, 2)}

VERFÜGBARE KONZEPTE (VORGEFILTERT):
${JSON.stringify(conceptsList, null, 2)}

UNTERSUCHUNGSPHASEN:
${phasesDescription}

FORENSISCHE DOMÄNEN:
${domainsDescription}

WICHTIGE REGELN:
1. Pro Phase 2-3 Tools/Methoden empfehlen (immer mindestens 2 wenn verfügbar)
2. Tools/Methoden können in MEHREREN Phasen empfohlen werden wenn sinnvoll
3. Für Reporting-Phase: Visualisierungs- und Dokumentationssoftware einschließen
4. Gib stets dem spezieller für den Fall geeigneten Werkzeug den Vorzug
5. Deutsche Antworten für deutsche Anfragen, English for English queries
6. Methoden haben, sofern für das SZENARIO passend, IMMER Vorrang vor Software
7. Bevorzuge alles, was nicht proprietär ist (license != "Proprietary"), aber erkenne an, wenn proprietäre Software besser geeignet ist
8. WICHTIG: Erwähne relevante Hintergrundwissen-Konzepte wenn Tools verwendet werden, die related_concepts haben
9. Konzepte sind NICHT Tools - empfehle sie nicht als actionable Schritte, sondern als Wissensbasis

ENHANCED CONTEXTUAL ANALYSIS:
10. Analysiere das Szenario detailliert und identifiziere Schlüsselelemente, Bedrohungen und forensische Herausforderungen
11. Entwickle einen strategischen Untersuchungsansatz basierend auf dem spezifischen Szenario
12. Identifiziere zeitkritische oder besonders wichtige Faktoren für diesen Fall

USER QUERY: "${userQuery}"

ANTWORT-FORMAT (strict JSON):
{
  "scenario_analysis": "Detaillierte Analyse des Szenarios: Erkannte Schlüsselelemente, Art des Vorfalls, betroffene Systeme, potentielle Bedrohungen und forensische Herausforderungen",
  "investigation_approach": "Strategischer Untersuchungsansatz für dieses spezifische Szenario: Prioritäten, Reihenfolge der Phasen, besondere Überlegungen",
  "critical_considerations": "Zeitkritische Faktoren, wichtige Sicherheitsaspekte oder besondere Vorsichtsmaßnahmen für diesen Fall",
  "recommended_tools": [
    {
      "name": "EXAKTER Name aus der Tools-Database",
      "priority": "high|medium|low",
      "phase": "${validPhases}",
      "justification": "Warum diese Methode für diese Phase und dieses spezifische Szenario geeignet ist - mit Bezug zu den erkannten Schlüsselelementen"
    }
  ],
  "workflow_suggestion": "Vorgeschlagener Untersuchungsablauf mit konkreten Schritten für dieses Szenario",
  "background_knowledge": [
    {
      "concept_name": "EXAKTER Name aus der Konzepte-Database",
      "relevance": "Warum dieses Konzept für das Szenario relevant ist, und bei welchen der empfohlenen Methoden/Tools."
    }
  ],
  "additional_notes": "Wichtige Überlegungen und Hinweise"
}

Antworte NUR mit validen JSON. Keine zusätzlichen Erklärungen außerhalb des JSON.`;
}
|
||||
|
||||
// Build the analyzer prompt for TOOL mode: the model must return 1-3 ranked
// tool/method recommendations as strict JSON for a specific problem. Unlike the
// workflow prompt, tool records here also carry url/projectUrl for direct reference.
private createToolAnalyzerPrompt(toolsData: any, userQuery: string): string {
  const toolsList = toolsData.tools.map((tool: any) => ({
    name: tool.name,
    description: tool.description,
    domains: tool.domains,
    phases: tool.phases,
    platforms: tool.platforms,
    skillLevel: tool.skillLevel,
    license: tool.license,
    tags: tool.tags,
    url: tool.url,
    projectUrl: tool.projectUrl,
    related_concepts: tool.related_concepts || []
  }));

  const conceptsList = toolsData.concepts.map((concept: any) => ({
    name: concept.name,
    description: concept.description,
    domains: concept.domains,
    phases: concept.phases,
    skillLevel: concept.skillLevel,
    tags: concept.tags
  }));

  return `Du bist ein DFIR (Digital Forensics and Incident Response) Experte. Du erhältst eine vorgefilterte Auswahl relevanter Tools und Konzepte und sollst daraus 1-3 optimale Empfehlungen für ein spezifisches Problem erstellen.

VERFÜGBARE TOOLS/METHODEN (VORGEFILTERT):
${JSON.stringify(toolsList, null, 2)}

VERFÜGBARE KONZEPTE (VORGEFILTERT):
${JSON.stringify(conceptsList, null, 2)}

WICHTIGE REGELN:
1. Analysiere das spezifische Problem/die Anforderung sorgfältig
2. Empfehle 1-3 Methoden/Tools, sortiert nach Eignung (beste Empfehlung zuerst)
3. Gib detaillierte Erklärungen, WARUM und WIE jede Methode/Tool das Problem löst
4. Berücksichtige praktische Aspekte: Skill Level, Plattformen, Verfügbarkeit
5. Deutsche Antworten für deutsche Anfragen, English for English queries
6. Gib konkrete Anwendungshinweise, nicht nur allgemeine Beschreibungen
7. Methoden haben, sofern für das SZENARIO passend, IMMER Vorrang vor Software
8. Erwähne sowohl Stärken als auch Schwächen/Limitationen
9. Schlage alternative Ansätze vor, wenn sinnvoll
10. Gib grundsätzliche Hinweise, WIE die Methode/Tool konkret eingesetzt wird
11. WICHTIG: Erwähne relevante Hintergrundwissen-Konzepte wenn Tools verwendet werden, die related_concepts haben
12. Konzepte sind NICHT Tools - empfehle sie nicht als actionable Schritte, sondern als Wissensbasis

ENHANCED CONTEXTUAL ANALYSIS:
13. Analysiere das Problem detailliert und identifiziere technische Anforderungen, Herausforderungen und Erfolgsfaktoren
14. Entwickle einen strategischen Lösungsansatz basierend auf dem spezifischen Problem
15. Identifiziere wichtige Voraussetzungen oder Warnungen für die Anwendung

USER QUERY: "${userQuery}"

ANTWORT-FORMAT (strict JSON):
{
  "problem_analysis": "Detaillierte Analyse des Problems: Erkannte technische Anforderungen, Herausforderungen, benötigte Fähigkeiten und Erfolgsfaktoren",
  "investigation_approach": "Strategischer Lösungsansatz für dieses spezifische Problem: Herangehensweise, Prioritäten, optimale Anwendungsreihenfolge",
  "critical_considerations": "Wichtige Voraussetzungen, potentielle Fallstricke oder Warnungen für die Anwendung der empfohlenen Lösungen",
  "recommended_tools": [
    {
      "name": "EXAKTER Name aus der Tools-Database",
      "rank": 1,
      "suitability_score": "high|medium|low",
      "detailed_explanation": "Detaillierte Erklärung, warum dieses Tool/diese Methode das spezifische Problem löst - mit Bezug zu den erkannten Anforderungen",
      "implementation_approach": "Konkrete Schritte/Ansatz zur Anwendung für dieses spezifische Problem",
      "pros": ["Spezifische Vorteile für diesen Anwendungsfall", "Weitere Vorteile"],
      "cons": ["Potentielle Nachteile oder Limitationen", "Weitere Einschränkungen"],
      "alternatives": "Alternative Ansätze oder ergänzende Tools/Methoden, falls relevant"
    }
  ],
  "background_knowledge": [
    {
      "concept_name": "EXAKTER Name aus der Konzepte-Database",
      "relevance": "Warum dieses Konzept für die empfohlenen Tools/das Problem relevant ist, und für welche der empfohlenen Methoden/Tools."
    }
  ],
  "additional_considerations": "Wichtige Überlegungen, Voraussetzungen oder Warnungen"
}

Antworte NUR mit validen JSON. Keine zusätzlichen Erklärungen außerhalb des JSON.`;
}
|
||||
|
||||
async processQuery(userQuery: string, mode: string): Promise<AnalysisResult> {
|
||||
const startTime = Date.now();
|
||||
console.log(`[AI PIPELINE] Starting ${mode} query processing`);
|
||||
|
||||
try {
|
||||
// Load full dataset
|
||||
const toolsData = await getCompressedToolsDataForAI();
|
||||
|
||||
let filteredData: any;
|
||||
let processingStats: any = {
|
||||
embeddingsUsed: false,
|
||||
candidatesFromEmbeddings: 0,
|
||||
finalSelectedItems: 0,
|
||||
processingTimeMs: 0
|
||||
};
|
||||
|
||||
// Stage 1: Filter candidates (embeddings or selector AI)
|
||||
if (embeddingsService.isEnabled()) {
|
||||
const result = await this.processWithEmbeddings(userQuery, toolsData, mode);
|
||||
filteredData = result.filteredData;
|
||||
processingStats = { ...processingStats, ...result.stats };
|
||||
} else {
|
||||
const result = await this.processWithoutEmbeddings(userQuery, toolsData, mode);
|
||||
filteredData = result.filteredData;
|
||||
processingStats = { ...processingStats, ...result.stats };
|
||||
}
|
||||
|
||||
// Stage 2: Generate detailed analysis with analyzer AI
|
||||
console.log('[AI PIPELINE] Stage 2: Generating detailed analysis');
|
||||
const analyzerPrompt = this.createAnalyzerPrompt(filteredData, userQuery, mode);
|
||||
|
||||
const messages = [
|
||||
{ role: 'user', content: analyzerPrompt }
|
||||
];
|
||||
|
||||
const analysisResponse = await this.callAI(this.analyzerConfig, messages, 3500);
|
||||
|
||||
// Parse the response
|
||||
let recommendation;
|
||||
try {
|
||||
const cleanedContent = analysisResponse.replace(/^```json\s*/i, '').replace(/\s*```\s*$/g, '').trim();
|
||||
recommendation = JSON.parse(cleanedContent);
|
||||
} catch (error) {
|
||||
console.error('[AI PIPELINE] Failed to parse analysis response:', analysisResponse);
|
||||
throw new Error('Invalid JSON response from analyzer AI');
|
||||
}
|
||||
|
||||
// Validate tool/concept names exist in filtered data
|
||||
const validToolNames = new Set(filteredData.tools.map((t: any) => t.name));
|
||||
const validConceptNames = new Set(filteredData.concepts.map((c: any) => c.name));
|
||||
|
||||
if (recommendation.recommended_tools) {
|
||||
recommendation.recommended_tools = recommendation.recommended_tools.filter((tool: any) => {
|
||||
if (!validToolNames.has(tool.name)) {
|
||||
console.warn(`[AI PIPELINE] Analyzer recommended unknown tool: ${tool.name}`);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
});
|
||||
}
|
||||
|
||||
if (recommendation.background_knowledge) {
|
||||
recommendation.background_knowledge = recommendation.background_knowledge.filter((concept: any) => {
|
||||
if (!validConceptNames.has(concept.concept_name)) {
|
||||
console.warn(`[AI PIPELINE] Analyzer referenced unknown concept: ${concept.concept_name}`);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
});
|
||||
}
|
||||
|
||||
processingStats.finalSelectedItems = (recommendation.recommended_tools?.length || 0) +
|
||||
(recommendation.background_knowledge?.length || 0);
|
||||
processingStats.processingTimeMs = Date.now() - startTime;
|
||||
|
||||
console.log(`[AI PIPELINE] Completed in ${processingStats.processingTimeMs}ms`);
|
||||
console.log(`[AI PIPELINE] Final recommendations: ${recommendation.recommended_tools?.length || 0} tools, ${recommendation.background_knowledge?.length || 0} concepts`);
|
||||
|
||||
return {
|
||||
recommendation,
|
||||
processingStats
|
||||
};
|
||||
|
||||
} catch (error) {
|
||||
console.error('[AI PIPELINE] Processing failed:', error);
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Global instance
// Module-level singleton: the pipeline holds no per-request state, so one shared
// instance serves all API routes.
const aiPipeline = new AIProcessingPipeline();

export { aiPipeline, type AnalysisResult };
|
@ -21,7 +21,7 @@ const ToolSchema = z.object({
|
||||
accessType: z.string().optional().nullable(),
|
||||
'domain-agnostic-software': z.array(z.string()).optional().nullable(),
|
||||
related_concepts: z.array(z.string()).optional().nullable().default([]),
|
||||
related_software: z.array(z.string()).optional().nullable().default([]), // Added this line
|
||||
related_software: z.array(z.string()).optional().nullable().default([]),
|
||||
});
|
||||
|
||||
const ToolsDataSchema = z.object({
|
||||
@ -67,6 +67,7 @@ let cachedData: ToolsData | null = null;
|
||||
let cachedRandomizedData: ToolsData | null = null;
|
||||
let cachedCompressedData: CompressedToolsData | null = null;
|
||||
let lastRandomizationDate: string | null = null;
|
||||
let dataVersion: string | null = null; // Add version tracking for embeddings
|
||||
|
||||
function seededRandom(seed: number): () => number {
|
||||
let x = Math.sin(seed) * 10000;
|
||||
@ -91,6 +92,18 @@ function shuffleArray<T>(array: T[], randomFn: () => number): T[] {
|
||||
return shuffled;
|
||||
}
|
||||
|
||||
// Generate a simple hash of the data for version tracking
|
||||
function generateDataVersion(data: any): string {
|
||||
const str = JSON.stringify(data, Object.keys(data).sort());
|
||||
let hash = 0;
|
||||
for (let i = 0; i < str.length; i++) {
|
||||
const char = str.charCodeAt(i);
|
||||
hash = ((hash << 5) - hash) + char;
|
||||
hash = hash & hash; // Convert to 32-bit integer
|
||||
}
|
||||
return Math.abs(hash).toString(36);
|
||||
}
|
||||
|
||||
async function loadRawData(): Promise<ToolsData> {
|
||||
if (!cachedData) {
|
||||
const yamlPath = path.join(process.cwd(), 'src/data/tools.yaml');
|
||||
@ -99,6 +112,11 @@ async function loadRawData(): Promise<ToolsData> {
|
||||
|
||||
try {
|
||||
cachedData = ToolsDataSchema.parse(rawData);
|
||||
|
||||
// Generate data version for embeddings tracking
|
||||
dataVersion = generateDataVersion(cachedData);
|
||||
console.log(`[DATA SERVICE] Loaded data version: ${dataVersion}`);
|
||||
|
||||
} catch (error) {
|
||||
console.error('YAML validation failed:', error);
|
||||
throw new Error('Invalid tools.yaml structure');
|
||||
@ -124,6 +142,7 @@ export async function getToolsData(): Promise<ToolsData> {
|
||||
|
||||
lastRandomizationDate = today;
|
||||
|
||||
// Clear compressed cache when randomized data changes
|
||||
cachedCompressedData = null;
|
||||
}
|
||||
|
||||
@ -156,14 +175,23 @@ export async function getCompressedToolsDataForAI(): Promise<CompressedToolsData
|
||||
'domain-agnostic-software': data['domain-agnostic-software']
|
||||
// scenarios intentionally excluded from AI data
|
||||
};
|
||||
|
||||
console.log(`[DATA SERVICE] Generated compressed data: ${compressedTools.length} tools, ${concepts.length} concepts`);
|
||||
}
|
||||
|
||||
return cachedCompressedData;
|
||||
}
|
||||
|
||||
// Current dataset version hash, or null until the YAML has been loaded.
// Consumed by the embeddings layer to detect stale caches.
export function getDataVersion(): string | null {
  return dataVersion;
}
|
||||
|
||||
// Drop every cached view of the tools data (raw, randomized, compressed) plus the
// randomization date and version hash, forcing a full re-parse on next access.
export function clearCache(): void {
  cachedData = null;
  cachedRandomizedData = null;
  cachedCompressedData = null;
  lastRandomizationDate = null;
  dataVersion = null;

  console.log('[DATA SERVICE] Cache cleared');
}
|
259
src/utils/embeddings.ts
Normal file
259
src/utils/embeddings.ts
Normal file
@ -0,0 +1,259 @@
|
||||
// src/utils/embeddings.ts
|
||||
import { promises as fs } from 'fs';
|
||||
import path from 'path';
|
||||
import { getCompressedToolsDataForAI } from './dataService.js';
|
||||
|
||||
// A single embedded item (one tool or concept) plus metadata kept alongside the
// vector for later filtering/diagnostics.
interface EmbeddingData {
  id: string;                 // built as `${type}_${sanitized name}` at generation time
  type: 'tool' | 'concept';
  name: string;
  content: string;            // the lowercase text that was embedded
  embedding: number[];        // vector returned by the embeddings API
  metadata: {
    domains?: string[];
    phases?: string[];
    tags?: string[];
    skillLevel?: string;
    type?: string;
  };
}

// On-disk cache format for data/embeddings.json.
interface EmbeddingsDatabase {
  version: string;      // hash of the dataset the embeddings were generated from
  lastUpdated: number;  // epoch milliseconds
  embeddings: EmbeddingData[];
}
|
||||
|
||||
class EmbeddingsService {
|
||||
private embeddings: EmbeddingData[] = [];
|
||||
private isInitialized = false;
|
||||
private readonly embeddingsPath = path.join(process.cwd(), 'data', 'embeddings.json');
|
||||
private readonly batchSize: number;
|
||||
private readonly batchDelay: number;
|
||||
private readonly enabled: boolean;
|
||||
|
||||
constructor() {
|
||||
this.enabled = process.env.AI_EMBEDDINGS_ENABLED === 'true';
|
||||
this.batchSize = parseInt(process.env.AI_EMBEDDINGS_BATCH_SIZE || '20', 10);
|
||||
this.batchDelay = parseInt(process.env.AI_EMBEDDINGS_BATCH_DELAY_MS || '1000', 10);
|
||||
}
|
||||
|
||||
async initialize(): Promise<void> {
|
||||
if (!this.enabled) {
|
||||
console.log('[EMBEDDINGS] Embeddings disabled, skipping initialization');
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
console.log('[EMBEDDINGS] Initializing embeddings system...');
|
||||
|
||||
// Create data directory if it doesn't exist
|
||||
await fs.mkdir(path.dirname(this.embeddingsPath), { recursive: true });
|
||||
|
||||
const toolsData = await getCompressedToolsDataForAI();
|
||||
const currentDataHash = this.hashData(toolsData);
|
||||
|
||||
// Try to load existing embeddings
|
||||
const existingEmbeddings = await this.loadEmbeddings();
|
||||
|
||||
if (existingEmbeddings && existingEmbeddings.version === currentDataHash) {
|
||||
console.log('[EMBEDDINGS] Using cached embeddings');
|
||||
this.embeddings = existingEmbeddings.embeddings;
|
||||
} else {
|
||||
console.log('[EMBEDDINGS] Generating new embeddings...');
|
||||
await this.generateEmbeddings(toolsData, currentDataHash);
|
||||
}
|
||||
|
||||
this.isInitialized = true;
|
||||
console.log(`[EMBEDDINGS] Initialized with ${this.embeddings.length} embeddings`);
|
||||
|
||||
} catch (error) {
|
||||
console.error('[EMBEDDINGS] Failed to initialize:', error);
|
||||
this.isInitialized = false;
|
||||
}
|
||||
}
|
||||
|
||||
private hashData(data: any): string {
|
||||
return Buffer.from(JSON.stringify(data)).toString('base64').slice(0, 32);
|
||||
}
|
||||
|
||||
private async loadEmbeddings(): Promise<EmbeddingsDatabase | null> {
|
||||
try {
|
||||
const data = await fs.readFile(this.embeddingsPath, 'utf8');
|
||||
return JSON.parse(data);
|
||||
} catch (error) {
|
||||
console.log('[EMBEDDINGS] No existing embeddings found');
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private async saveEmbeddings(version: string): Promise<void> {
|
||||
const database: EmbeddingsDatabase = {
|
||||
version,
|
||||
lastUpdated: Date.now(),
|
||||
embeddings: this.embeddings
|
||||
};
|
||||
|
||||
await fs.writeFile(this.embeddingsPath, JSON.stringify(database, null, 2));
|
||||
console.log(`[EMBEDDINGS] Saved ${this.embeddings.length} embeddings to disk`);
|
||||
}
|
||||
|
||||
private createContentString(item: any): string {
|
||||
const parts = [
|
||||
item.name,
|
||||
item.description || '',
|
||||
...(item.tags || []),
|
||||
...(item.domains || []),
|
||||
...(item.phases || [])
|
||||
];
|
||||
|
||||
return parts.filter(Boolean).join(' ').toLowerCase();
|
||||
}
|
||||
|
||||
private async generateEmbeddingsBatch(contents: string[]): Promise<number[][]> {
|
||||
const endpoint = process.env.AI_EMBEDDINGS_ENDPOINT;
|
||||
const apiKey = process.env.AI_EMBEDDINGS_API_KEY;
|
||||
const model = process.env.AI_EMBEDDINGS_MODEL;
|
||||
|
||||
if (!endpoint || !apiKey || !model) {
|
||||
throw new Error('Missing embeddings API configuration');
|
||||
}
|
||||
|
||||
const response = await fetch(endpoint, {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
'Authorization': `Bearer ${apiKey}`
|
||||
},
|
||||
body: JSON.stringify({
|
||||
model,
|
||||
input: contents
|
||||
})
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
const error = await response.text();
|
||||
throw new Error(`Embeddings API error: ${response.status} - ${error}`);
|
||||
}
|
||||
|
||||
const data = await response.json();
|
||||
return data.data.map((item: any) => item.embedding);
|
||||
}
|
||||
|
||||
private async generateEmbeddings(toolsData: any, version: string): Promise<void> {
|
||||
const allItems = [
|
||||
...toolsData.tools.map((tool: any) => ({ ...tool, type: 'tool' })),
|
||||
...toolsData.concepts.map((concept: any) => ({ ...concept, type: 'concept' }))
|
||||
];
|
||||
|
||||
const contents = allItems.map(item => this.createContentString(item));
|
||||
this.embeddings = [];
|
||||
|
||||
// Process in batches to respect rate limits
|
||||
for (let i = 0; i < contents.length; i += this.batchSize) {
|
||||
const batch = contents.slice(i, i + this.batchSize);
|
||||
const batchItems = allItems.slice(i, i + this.batchSize);
|
||||
|
||||
console.log(`[EMBEDDINGS] Processing batch ${Math.ceil((i + 1) / this.batchSize)} of ${Math.ceil(contents.length / this.batchSize)}`);
|
||||
|
||||
try {
|
||||
const embeddings = await this.generateEmbeddingsBatch(batch);
|
||||
|
||||
embeddings.forEach((embedding, index) => {
|
||||
const item = batchItems[index];
|
||||
this.embeddings.push({
|
||||
id: `${item.type}_${item.name.replace(/[^a-zA-Z0-9]/g, '_')}`,
|
||||
type: item.type,
|
||||
name: item.name,
|
||||
content: batch[index],
|
||||
embedding,
|
||||
metadata: {
|
||||
domains: item.domains,
|
||||
phases: item.phases,
|
||||
tags: item.tags,
|
||||
skillLevel: item.skillLevel,
|
||||
type: item.type
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
// Rate limiting delay between batches
|
||||
if (i + this.batchSize < contents.length) {
|
||||
await new Promise(resolve => setTimeout(resolve, this.batchDelay));
|
||||
}
|
||||
|
||||
} catch (error) {
|
||||
console.error(`[EMBEDDINGS] Failed to process batch ${Math.ceil((i + 1) / this.batchSize)}:`, error);
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
await this.saveEmbeddings(version);
|
||||
}
|
||||
|
||||
private cosineSimilarity(a: number[], b: number[]): number {
|
||||
let dotProduct = 0;
|
||||
let normA = 0;
|
||||
let normB = 0;
|
||||
|
||||
for (let i = 0; i < a.length; i++) {
|
||||
dotProduct += a[i] * b[i];
|
||||
normA += a[i] * a[i];
|
||||
normB += b[i] * b[i];
|
||||
}
|
||||
|
||||
return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));
|
||||
}
|
||||
|
||||
async findSimilar(query: string, maxResults: number = 30, threshold: number = 0.3): Promise<EmbeddingData[]> {
|
||||
if (!this.enabled || !this.isInitialized || this.embeddings.length === 0) {
|
||||
return [];
|
||||
}
|
||||
|
||||
try {
|
||||
// Generate embedding for query
|
||||
const queryEmbeddings = await this.generateEmbeddingsBatch([query.toLowerCase()]);
|
||||
const queryEmbedding = queryEmbeddings[0];
|
||||
|
||||
// Calculate similarities
|
||||
const similarities = this.embeddings.map(item => ({
|
||||
...item,
|
||||
similarity: this.cosineSimilarity(queryEmbedding, item.embedding)
|
||||
}));
|
||||
|
||||
// Filter by threshold and sort by similarity
|
||||
return similarities
|
||||
.filter(item => item.similarity >= threshold)
|
||||
.sort((a, b) => b.similarity - a.similarity)
|
||||
.slice(0, maxResults);
|
||||
|
||||
} catch (error) {
|
||||
console.error('[EMBEDDINGS] Failed to find similar items:', error);
|
||||
return [];
|
||||
}
|
||||
}
|
||||
|
||||
isEnabled(): boolean {
|
||||
return this.enabled && this.isInitialized;
|
||||
}
|
||||
|
||||
getStats(): { enabled: boolean; initialized: boolean; count: number } {
|
||||
return {
|
||||
enabled: this.enabled,
|
||||
initialized: this.isInitialized,
|
||||
count: this.embeddings.length
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
// Global instance
// Module-level singleton so every importer shares one in-memory embeddings cache.
const embeddingsService = new EmbeddingsService();

export { embeddingsService, type EmbeddingData };

// Auto-initialize on import in server environment
// Fire-and-forget: an initialization failure is logged and leaves the service
// disabled (isEnabled() stays false), so callers fall back to the
// non-embeddings pipeline path. Skipped in the browser and under tests.
if (typeof window === 'undefined' && process.env.NODE_ENV !== 'test') {
  embeddingsService.initialize().catch(error => {
    console.error('[EMBEDDINGS] Auto-initialization failed:', error);
  });
}
|
Loading…
x
Reference in New Issue
Block a user