forensic-pathways/src/utils/dataService.ts
overcuriousity 8c9bdf0710 vector index
2025-08-01 13:24:43 +02:00

378 lines
12 KiB
TypeScript

// src/utils/dataService.ts - Enhanced for micro-task AI pipeline
import { promises as fs } from 'fs';
import { load } from 'js-yaml';
import path from 'path';
import { z } from 'zod';
/**
 * Zod schema for a single entry of the `tools` list.
 *
 * `type` must be one of 'software', 'method' or 'concept'. Array fields
 * declared with `.default([])` are replaced by an empty array when the key is
 * missing from the input.
 * NOTE(review): for fields declared `.optional().nullable().default([])` an
 * explicit `null` presumably still parses to `null` rather than `[]` — confirm
 * against the zod version in use. The consumers in this file null-check
 * `domains` and `phases` before using them.
 */
const ToolSchema = z.object({
name: z.string(),
icon: z.string().optional().nullable(),
type: z.enum(['software', 'method', 'concept']),
description: z.string(),
// Ids referencing the top-level `domains` / `phases` lists (matched by id in this file).
domains: z.array(z.string()).optional().nullable().default([]),
phases: z.array(z.string()).optional().nullable().default([]),
platforms: z.array(z.string()).default([]),
// Free-form string here; the rest of the file compares it against
// 'beginner', 'intermediate', 'advanced' and 'expert'.
skillLevel: z.string(),
url: z.string(),
license: z.string().optional().nullable(),
tags: z.array(z.string()).default([]),
projectUrl: z.string().optional().nullable(),
knowledgebase: z.boolean().optional().nullable(),
statusUrl: z.string().optional().nullable(),
accessType: z.string().optional().nullable(),
'domain-agnostic-software': z.array(z.string()).optional().nullable(),
related_concepts: z.array(z.string()).optional().nullable().default([]),
related_software: z.array(z.string()).optional().nullable().default([]),
});
/**
 * Zod schema for the whole tools document (parsed from `src/data/tools.yaml`
 * by `loadRawData`).
 *
 * `tools`, `domains` and `phases` are required; the remaining sections are
 * optional and fall back to an empty array / empty object when absent.
 */
const ToolsDataSchema = z.object({
tools: z.array(ToolSchema),
domains: z.array(z.object({
id: z.string(),
name: z.string(),
description: z.string().optional()
})),
// The array order of phases is meaningful: generatePhaseDependencies derives
// `order`, `depends_on` and `enables` from each phase's position.
phases: z.array(z.object({
id: z.string(),
name: z.string(),
description: z.string().optional(),
typical_tools: z.array(z.string()).optional().default([]),
key_activities: z.array(z.string()).optional().default([])
})),
'domain-agnostic-software': z.array(z.object({
id: z.string(),
name: z.string(),
description: z.string().optional(),
use_cases: z.array(z.string()).optional().default([])
})).optional().default([]),
scenarios: z.array(z.object({
id: z.string(),
icon: z.string(),
friendly_name: z.string(),
description: z.string().optional(),
typical_phases: z.array(z.string()).optional().default([]),
complexity: z.enum(['low', 'medium', 'high']).optional()
})).optional().default([]),
// Optional descriptions per skill level; loadRawData substitutes built-in
// texts when this object ends up empty.
skill_levels: z.object({
novice: z.string().optional(),
beginner: z.string().optional(),
intermediate: z.string().optional(),
advanced: z.string().optional(),
expert: z.string().optional()
}).optional().default({})
});
/**
 * Shape of the tools data as cached and returned by this module.
 *
 * Mirrors the top-level keys of `ToolsDataSchema`; the element types are
 * intentionally loose (`any`) here.
 */
interface ToolsData {
tools: any[];
domains: any[];
phases: any[];
'domain-agnostic-software': any[];
scenarios: any[];
// Map of skill level -> description; optional in this type.
skill_levels?: any;
}
/**
 * Result shape of `getCompressedToolsDataForAI`.
 *
 * `tools` holds the non-concept entries and `concepts` the entries whose
 * `type` is 'concept'; the last three fields are derived context computed by
 * the `generate*` helpers in this file.
 */
interface EnhancedCompressedToolsData {
tools: any[];
concepts: any[];
domains: any[];
phases: any[];
'domain-agnostic-software': any[];
scenarios?: any[]; // Optional for AI processing
skill_levels: any;
// Enhanced context for micro-tasks
domain_relationships: DomainRelationship[];
phase_dependencies: PhaseDependency[];
tool_compatibility_matrix: CompatibilityMatrix[];
}
/** Per-domain summary produced by `generateDomainRelationships`. */
interface DomainRelationship {
// Id of the domain being summarised.
domain_id: string;
// Number of tools whose `domains` include this domain id.
tool_count: number;
// Up to five most frequent tags among those tools.
common_tags: string[];
// Number of those tools per `skillLevel` value.
skill_distribution: Record<string, number>;
}
/** Per-phase ordering information produced by `generatePhaseDependencies`. */
interface PhaseDependency {
phase_id: string;
// 1-based position of the phase in the phases list.
order: number;
// Id of the preceding phase, or null for the first phase.
depends_on: string | null;
// Id of the following phase, or null for the last phase.
enables: string | null;
is_parallel_capable: boolean;
// Coarse label such as 'hours-days', 'hours-weeks' or 'days-weeks'.
typical_duration: string;
}
/** One grouping of tool names, produced by `generateToolCompatibilityMatrix`. */
interface CompatibilityMatrix {
// Kind of grouping; this file emits 'platform_compatibility' and 'phase_synergy'.
type: string;
// Group key (platform or phase) -> names of the tools listing that key.
groups: Record<string, string[]>;
}
// Module-level caches; all of them are reset by clearCache().
// Parsed and validated contents of tools.yaml (filled by loadRawData).
let cachedData: ToolsData | null = null;
// Copy of the data with `tools` shuffled (filled by getToolsData).
let cachedRandomizedData: ToolsData | null = null;
// AI-oriented view (filled by getCompressedToolsDataForAI, dropped by
// getToolsData whenever it reshuffles).
let cachedCompressedData: EnhancedCompressedToolsData | null = null;
// `Date#toDateString()` value of the day the last shuffle happened.
let lastRandomizationDate: string | null = null;
// Hash of the loaded data as computed by generateDataVersion.
let dataVersion: string | null = null;
/**
 * Builds a deterministic pseudo-random generator driven by `Math.sin`.
 *
 * Equal seeds produce equal sequences. This is a lightweight generator meant
 * for shuffling, not for anything security related.
 *
 * @param seed - Number the initial state is derived from.
 * @returns A function that advances the state on every call and returns its
 *   fractional part (`state - Math.floor(state)`).
 */
function seededRandom(seed: number): () => number {
  let state = Math.sin(seed) * 10000;
  const next = (): number => {
    state = Math.sin(state) * 10000;
    const wholePart = Math.floor(state);
    return state - wholePart;
  };
  return next;
}
/**
 * Computes the seed used for shuffling the tool list.
 *
 * The seed is the sum of the UTF-16 code units of today's
 * `Date#toDateString()` text plus the process uptime in whole seconds at the
 * time of the call.
 * NOTE(review): because of the uptime term the value is not purely a function
 * of the date — confirm whether that variation is intended.
 *
 * @returns The numeric seed.
 */
function getDailySeed(): number {
  const dateLabel = new Date().toDateString();
  const uptimeSeconds = process.uptime();
  let charSum = 0;
  for (const ch of dateLabel.split('')) {
    charSum += ch.charCodeAt(0);
  }
  return charSum + Math.floor(uptimeSeconds);
}
/**
 * Returns a shuffled copy of `array` (Fisher–Yates, walking from the end).
 *
 * The input array is not modified.
 *
 * @param array - Items to shuffle.
 * @param randomFn - Source of random numbers; each result is scaled to pick a
 *   swap index in `0..position`.
 * @returns A new array with the same items in shuffled order.
 */
function shuffleArray<T>(array: T[], randomFn: () => number): T[] {
  const result = Array.from(array);
  let position = result.length - 1;
  while (position > 0) {
    const pick = Math.floor(randomFn() * (position + 1));
    const held = result[position];
    result[position] = result[pick];
    result[pick] = held;
    position -= 1;
  }
  return result;
}
/**
 * Derives a short version string from the content of `data`.
 *
 * The data is serialised to JSON with object keys emitted in a deterministic
 * order at every nesting level, then folded into a 32-bit hash
 * (`hash * 31 + charCode`) that is returned in base 36.
 *
 * A replacer *function* is used on purpose: passing an array of the top-level
 * keys as replacer would act as a property allowlist for nested objects too,
 * silently dropping fields such as a tool's `name` from the hashed text.
 *
 * @param data - Any JSON-serialisable value.
 * @returns Base-36 representation of the absolute hash value.
 */
function generateDataVersion(data: any): string {
  const withOrderedKeys = (_key: string, value: unknown): unknown => {
    if (value === null || typeof value !== 'object' || Array.isArray(value)) {
      return value;
    }
    const source = value as Record<string, unknown>;
    // Object.fromEntries defines own properties, so even a '__proto__' key is kept.
    return Object.fromEntries(
      Object.keys(source)
        .sort()
        .map((key): [string, unknown] => [key, source[key]])
    );
  };
  // JSON.stringify yields undefined for non-serialisable roots (e.g. undefined).
  const serialized: string = JSON.stringify(data, withOrderedKeys) ?? '';

  let hash = 0;
  for (let i = 0; i < serialized.length; i++) {
    // `| 0` keeps the running value within signed 32-bit range.
    hash = ((hash << 5) - hash + serialized.charCodeAt(i)) | 0;
  }
  return Math.abs(hash).toString(36);
}
/**
 * Summarises, for every domain, which tools belong to it.
 *
 * Counting is done with `Map`s rather than plain objects so that tag or skill
 * names that coincide with `Object.prototype` members (for example
 * 'constructor') are counted correctly.
 *
 * @param domains - Domain entries; only `id` is read.
 * @param tools - Tool entries; `domains`, `tags` and `skillLevel` are read.
 * @returns One entry per domain, in input order, with the tool count, the up
 *   to five most frequent tags (ties keep first-seen order) and the number of
 *   tools per skill level.
 */
function generateDomainRelationships(domains: any[], tools: any[]): DomainRelationship[] {
  return domains.map((domain: any): DomainRelationship => {
    const domainTools = tools.filter(
      (tool: any) => tool.domains && tool.domains.includes(domain.id)
    );

    const tagCounts = new Map<string, number>();
    const skillCounts = new Map<string, number>();
    for (const tool of domainTools) {
      for (const tag of tool.tags || []) {
        const tagKey = String(tag);
        tagCounts.set(tagKey, (tagCounts.get(tagKey) ?? 0) + 1);
      }
      const skillKey = String(tool.skillLevel);
      skillCounts.set(skillKey, (skillCounts.get(skillKey) ?? 0) + 1);
    }

    const topTags = [...tagCounts.entries()]
      .sort(([, countA], [, countB]) => countB - countA)
      .slice(0, 5)
      .map(([tag]) => tag);

    return {
      domain_id: domain.id,
      tool_count: domainTools.length,
      common_tags: topTags,
      skill_distribution: Object.fromEntries(skillCounts)
    };
  });
}
/**
 * Describes how the phases follow each other, based purely on list order.
 *
 * @param phases - Phase entries in workflow order; only `id` is read.
 * @returns One entry per phase with its 1-based position, the ids of the
 *   neighbouring phases (null at either end), a parallel-capability flag
 *   (true for 'examination' and 'analysis') and a coarse duration label.
 */
function generatePhaseDependencies(phases: any[]): PhaseDependency[] {
  const parallelPhaseIds = ['examination', 'analysis'];

  const durationFor = (phaseId: any): string => {
    switch (phaseId) {
      case 'examination':
        return 'hours-weeks';
      case 'analysis':
        return 'days-weeks';
      default:
        // Covers 'data-collection' as well as every other phase.
        return 'hours-days';
    }
  };

  const result: PhaseDependency[] = [];
  for (const [position, current] of phases.entries()) {
    const before = phases[position - 1];
    const after = phases[position + 1];
    result.push({
      phase_id: current.id,
      order: position + 1,
      depends_on: before?.id || null,
      enables: after?.id || null,
      is_parallel_capable: parallelPhaseIds.includes(current.id),
      typical_duration: durationFor(current.id)
    });
  }
  return result;
}
/**
 * Groups tool names by the values listed in one of their array fields.
 *
 * A `Map` is used while collecting so that keys matching `Object.prototype`
 * members (for example 'constructor') are handled like any other key.
 */
function groupToolNamesBy(tools: any[], field: 'platforms' | 'phases'): Record<string, string[]> {
  const groups = new Map<string, string[]>();
  for (const tool of tools) {
    const keys = tool[field];
    if (!keys) {
      // Field missing or null: the tool does not contribute to this grouping.
      continue;
    }
    keys.forEach((key: string) => {
      const groupKey = String(key);
      const names = groups.get(groupKey);
      if (names) {
        names.push(tool.name);
      } else {
        groups.set(groupKey, [tool.name]);
      }
    });
  }
  return Object.fromEntries(groups);
}

/**
 * Builds the tool groupings handed to the AI as compatibility context.
 *
 * @param tools - Tool entries; `name`, `platforms` and `phases` are read.
 * @returns Two entries: 'platform_compatibility' (platform -> tool names) and
 *   'phase_synergy' (phase -> tool names), in that order.
 */
function generateToolCompatibilityMatrix(tools: any[]): CompatibilityMatrix[] {
  return [
    {
      type: 'platform_compatibility',
      groups: groupToolNamesBy(tools, 'platforms')
    },
    {
      type: 'phase_synergy',
      groups: groupToolNamesBy(tools, 'phases')
    }
  ];
}
/**
 * Loads, validates and caches the contents of `src/data/tools.yaml`.
 *
 * The file is resolved relative to the current working directory. On the
 * first successful call the parsed data is stored in `cachedData`, built-in
 * skill level descriptions are filled in when the file provides none, and
 * `dataVersion` is computed. Later calls return the cached object.
 *
 * @returns The validated tools data.
 * @throws Error('Invalid tools.yaml structure') when validation (or any later
 *   step inside the guarded section) fails; the underlying error is logged.
 *   Failures while reading or parsing the file propagate unchanged.
 */
async function loadRawData(): Promise<ToolsData> {
  if (cachedData) {
    return cachedData;
  }

  const sourceFile = path.join(process.cwd(), 'src/data/tools.yaml');
  const fileText = await fs.readFile(sourceFile, 'utf8');
  const document = load(fileText);

  try {
    cachedData = ToolsDataSchema.parse(document);

    // Fall back to built-in descriptions when the YAML defines no skill levels.
    const levels = cachedData.skill_levels;
    const hasLevels = Boolean(levels) && Object.keys(levels).length > 0;
    if (!hasLevels) {
      cachedData.skill_levels = {
        novice: "Minimal technical background required, guided interfaces",
        beginner: "Basic IT knowledge, some command-line familiarity helpful",
        intermediate: "Solid technical foundation, comfortable with various tools",
        advanced: "Extensive experience, deep technical understanding required",
        expert: "Specialist knowledge, cutting-edge techniques and complex scenarios"
      };
    }

    dataVersion = generateDataVersion(cachedData);
    console.log(`[DATA SERVICE] Loaded enhanced data version: ${dataVersion}`);
  } catch (error) {
    console.error('YAML validation failed:', error);
    throw new Error('Invalid tools.yaml structure');
  }

  return cachedData;
}
/**
 * Returns the tools data with the `tools` list in shuffled order.
 *
 * The shuffled copy is cached and reused until the calendar day (as reported
 * by `Date#toDateString()`) changes. Whenever a new shuffle is produced the
 * cached compressed AI data is dropped so it gets rebuilt from the new order.
 *
 * @returns The cached or freshly shuffled tools data.
 */
export async function getToolsData(): Promise<ToolsData> {
  const currentDay = new Date().toDateString();
  if (cachedRandomizedData && lastRandomizationDate === currentDay) {
    return cachedRandomizedData;
  }

  const source = await loadRawData();
  const nextRandom = seededRandom(getDailySeed());
  const reshuffled: ToolsData = {
    ...source,
    tools: shuffleArray(source.tools, nextRandom)
  };

  cachedRandomizedData = reshuffled;
  lastRandomizationDate = currentDay;
  // The compressed view depends on tool order, so it must be rebuilt.
  cachedCompressedData = null;
  return reshuffled;
}
/** Maps a skill level to a 1-5 score; unknown levels count as 1. */
function complexityScoreFor(skillLevel: unknown): number {
  switch (skillLevel) {
    case 'expert':
      return 5;
    case 'advanced':
      return 4;
    case 'intermediate':
      return 3;
    case 'beginner':
      return 2;
    default:
      return 1;
  }
}

/** Maps a skill level to a learning-difficulty label; unknown levels are 'low'. */
function learningComplexityFor(skillLevel: unknown): string {
  switch (skillLevel) {
    case 'expert':
      return 'very_high';
    case 'advanced':
      return 'high';
    case 'intermediate':
      return 'medium';
    default:
      return 'low';
  }
}

/**
 * Builds (and caches) the AI-oriented view of the tools data.
 *
 * Entries of type 'concept' are split off into `concepts`; all other entries
 * go to `tools`. URL-like fields that the AI does not need are removed and a
 * few computed hints are added:
 *  - `is_hosted`: true when `projectUrl` is a non-blank string,
 *  - `is_open_source`: true when a license is set and it is not 'Proprietary'
 *    (always a real boolean, also when the license is missing),
 *  - `complexity_score`: 1-5 derived from `skillLevel`,
 *  - `phase_suitability`: per phase, 'primary' when one of the tool's tags
 *    contains the phase id, otherwise 'secondary'.
 * Domain relationships, phase dependencies and the compatibility matrix are
 * computed from the non-concept tools.
 *
 * @returns The cached or freshly built compressed data.
 */
export async function getCompressedToolsDataForAI(): Promise<EnhancedCompressedToolsData> {
  if (cachedCompressedData) {
    return cachedCompressedData;
  }

  const data = await getToolsData();

  const compressedTools = data.tools
    .filter((tool: any) => tool.type !== 'concept')
    .map((tool: any) => {
      // projectUrl/statusUrl are pulled out only to keep them out of the result.
      const { projectUrl, statusUrl, ...compressedTool } = tool;
      return {
        ...compressedTool,
        is_hosted: typeof projectUrl === 'string' && projectUrl.trim() !== '',
        is_open_source: Boolean(tool.license) && tool.license !== 'Proprietary',
        complexity_score: complexityScoreFor(tool.skillLevel),
        phase_suitability:
          tool.phases?.map((phase: string) => ({
            phase,
            primary_use: tool.tags?.some((tag: string) => tag.includes(phase))
              ? 'primary'
              : 'secondary'
          })) ?? []
      };
    });

  const concepts = data.tools
    .filter((tool: any) => tool.type === 'concept')
    .map((concept: any) => {
      // Fields that only make sense for software/methods are dropped for concepts.
      const { projectUrl, statusUrl, platforms, accessType, license, ...compressedConcept } = concept;
      return {
        ...compressedConcept,
        learning_complexity: learningComplexityFor(concept.skillLevel)
      };
    });

  const domainRelationships = generateDomainRelationships(data.domains, compressedTools);
  const phaseDependencies = generatePhaseDependencies(data.phases);
  const toolCompatibilityMatrix = generateToolCompatibilityMatrix(compressedTools);

  const compressed: EnhancedCompressedToolsData = {
    tools: compressedTools,
    concepts: concepts,
    domains: data.domains,
    phases: data.phases,
    'domain-agnostic-software': data['domain-agnostic-software'],
    scenarios: data.scenarios,
    skill_levels: data.skill_levels || {},
    domain_relationships: domainRelationships,
    phase_dependencies: phaseDependencies,
    tool_compatibility_matrix: toolCompatibilityMatrix
  };
  cachedCompressedData = compressed;

  console.log(`[DATA SERVICE] Generated enhanced compressed data: ${compressedTools.length} tools, ${concepts.length} concepts`);
  console.log(`[DATA SERVICE] Added context: ${domainRelationships.length} domain relationships, ${phaseDependencies.length} phase dependencies`);

  return compressed;
}
/**
 * Returns the version hash of the currently loaded tools data.
 *
 * @returns The value computed by `loadRawData`, or null when no data has been
 *   loaded yet or the cache was cleared.
 */
export function getDataVersion(): string | null {
return dataVersion;
}
/**
 * Drops every cached value held by this module.
 *
 * After this call the next data request re-reads and re-validates the YAML
 * file, reshuffles the tools and rebuilds the compressed view.
 */
export function clearCache(): void {
  // Data caches.
  cachedData = cachedRandomizedData = cachedCompressedData = null;
  // Bookkeeping that belongs to those caches.
  lastRandomizationDate = dataVersion = null;
  console.log('[DATA SERVICE] Enhanced cache cleared');
}