diff --git a/.env.example b/.env.example
index e0a1ee0..76d8fea 100644
--- a/.env.example
+++ b/.env.example
@@ -60,7 +60,7 @@ FORENSIC_AUDIT_MAX_ENTRIES=50
 
 # === AI SEMANTIC SEARCH ===
 # Enable semantic search (highly recommended for better results)
-AI_EMBEDDINGS_ENABLED=true
+REMOVE_AI_EMBEDDINGS_ENABLED=true
 AI_EMBEDDINGS_ENDPOINT=https://api.mistral.ai/v1/embeddings
 AI_EMBEDDINGS_API_KEY=your-embeddings-api-key-here
 AI_EMBEDDINGS_MODEL=mistral-embed
@@ -122,8 +122,8 @@ AI_EMBEDDINGS_BATCH_SIZE=10
 AI_EMBEDDINGS_BATCH_DELAY_MS=1000
 
 # === Context Management ===
-AI_MAX_CONTEXT_TOKENS=4000
-AI_MAX_PROMPT_TOKENS=2500
+REMOVE_AI_MAX_CONTEXT_TOKENS=4000
+REMOVE_AI_MAX_PROMPT_TOKENS=2500
 
 # === Confidence Scoring ===
 CONFIDENCE_SEMANTIC_WEIGHT=0.5
diff --git a/find-duplicates.mjs b/find-duplicates.mjs
new file mode 100644
index 0000000..00dfa39
--- /dev/null
+++ b/find-duplicates.mjs
@@ -0,0 +1,395 @@
+#!/usr/bin/env node
+// find-duplicates.mjs
+// Reports functions whose bodies are duplicated across TypeScript/Astro sources.
+// Usage:
+//   node find-duplicates.mjs [rootDir] [--mode exact|struct] [--min-lines N] [--json]
+// Example:
+//   node find-duplicates.mjs . --mode struct --min-lines 3
+
+import crypto from "crypto";
+import fs from "fs";
+import path from "path";
+import * as url from "url";
+import ts from "typescript";
+
+const __dirname = path.dirname(url.fileURLToPath(import.meta.url));
+
+/** -------- CLI OPTIONS -------- */
+const args = process.argv.slice(2);
+let rootDir = ".";
+let mode = "struct"; // "exact" | "struct"
+let minLines = 3;
+let outputJson = false;
+
+for (let i = 0; i < args.length; i++) {
+  const a = args[i];
+  if (!a.startsWith("--") && rootDir === ".") {
+    // The first bare argument names the directory to scan.
+    rootDir = a;
+  } else if (a === "--mode") {
+    mode = (args[++i] || "struct").toLowerCase();
+    if (!["exact", "struct"].includes(mode)) {
+      console.error("Invalid --mode. Use 'exact' or 'struct'.");
+      process.exit(1);
+    }
+  } else if (a === "--min-lines") {
+    minLines = parseInt(args[++i] || "3", 10);
+    // NaN makes every "lines.length < minLines" comparison false, which would
+    // silently switch the filter off, so reject it like an invalid --mode.
+    if (Number.isNaN(minLines) || minLines < 0) {
+      console.error("Invalid --min-lines. Use a non-negative integer.");
+      process.exit(1);
+    }
+  } else if (a === "--json") {
+    outputJson = true;
+  }
+}
+
+/** -------- FILE DISCOVERY -------- */
+const DEFAULT_IGNORES = new Set([
+  "node_modules",
+  ".git",
+  ".next",
+  ".vercel",
+  "dist",
+  "build",
+  ".astro", // Astro's generated cache dir
+]);
+
+const VALID_EXTS = new Set([".ts", ".tsx", ".astro", ".mts", ".cts"]);
+
+/**
+ * Recursively collects the files under a directory whose extension is listed in
+ * VALID_EXTS, skipping any directory whose name is in DEFAULT_IGNORES.
+ * @param {string} dir - directory to scan
+ * @returns {string[]} matching file paths, each joined onto dir
+ */
+function walk(dir) {
+  /** @type {string[]} */
+  const out = [];
+  const entries = fs.readdirSync(dir, { withFileTypes: true });
+  for (const e of entries) {
+    const p = path.join(dir, e.name);
+    if (e.isDirectory()) {
+      if (DEFAULT_IGNORES.has(e.name)) continue;
+      out.push(...walk(p));
+    } else if (e.isFile() && VALID_EXTS.has(path.extname(e.name))) {
+      out.push(p);
+    }
+  }
+  return out;
+}
+
+/** -------- ASTRO CODE EXTRACTION --------
+ * Extract TS/JS code from:
+ *  - frontmatter: --- ... ---
+ *  - <script ...> ... </script> blocks
+ *
+ * Every block carries "offset": the 1-based line of source on which the block's
+ * first line sits, so line N of the block is line N + offset - 1 of the file.
+ * @param {string} source - full text of an .astro file
+ * @returns {{code:string, offset:number}[]} extracted blocks in document order
+ */
+function extractCodeFromAstro(source) {
+  /** @type {{code:string, offset:number}[]} */
+  const blocks = [];
+
+  // Frontmatter (must be at top in Astro)
+  // Match the FIRST pair of --- ... ---
+  if (source.startsWith("---")) {
+    const end = source.indexOf("\n---", 3);
+    if (end !== -1) {
+      // The slice starts right after the opening '---', i.e. still on line 1 of
+      // the file, so block lines and file lines coincide and the offset is 1.
+      const front = source.slice(3, end + 1); // include trailing \n
+      blocks.push({ code: front, offset: 1 });
+    }
+  }
+
+  // <script ...> blocks
+  const scriptRe = /<script\b[^>]*>([\s\S]*?)<\/script>/gi;
+  let m;
+  while ((m = scriptRe.exec(source))) {
+    const code = m[1] || "";
+    // Measure from where the script content begins (just past the first '>' of
+    // the match, which closes the opening tag) so an opening tag that spans
+    // several lines does not skew the reported line numbers.
+    const contentStart = m.index + m[0].indexOf(">") + 1;
+    blocks.push({ code, offset: indexToLine(source, contentStart) });
+  }
+
+  return blocks;
+}
+
+/** -------- UTIL: index -> 1-based line --------
+ * @param {string} text - text to measure
+ * @param {number} idx - character index into text
+ * @returns {number} 1 plus the number of line feeds before idx
+ */
+function indexToLine(text, idx) {
+  let line = 1;
+  for (let i = 0; i < idx && i < text.length; i++) {
+    if (text.charCodeAt(i) === 10) line++;
+  }
+  return line;
+}
+
+/** -------- AST HELPERS -------- */
+
+/**
+ * Parses code with the TypeScript compiler, with parent pointers set.
+ * @param {string} virtualPath - file name given to the parser; a ".tsx" suffix
+ *   selects the TSX script kind, anything else is parsed as plain TS
+ * @param {string} code - source text to parse
+ * @returns {ts.SourceFile}
+ */
+function createSourceFile(virtualPath, code) {
+  return ts.createSourceFile(
+    virtualPath,
+    code,
+    ts.ScriptTarget.Latest,
+    /*setParentNodes*/ true,
+    virtualPath.endsWith(".tsx") ? ts.ScriptKind.TSX : ts.ScriptKind.TS
+  );
+}
+
+/**
+ * Normalize AST to a structural signature string.
+ * Identifiers and literals collapse to fixed tokens (Id, PrivId, Str, Num, Bool,
+ * Nil) and their children are not visited; property-like nodes are tagged Prop;
+ * every other node contributes its SyntaxKind name.
+ * @param {ts.Node} node - root of the subtree to describe
+ * @returns {string} the tokens in visiting order, joined with "|"
+ */
+function structuralSignature(node) {
+  /** @type {string[]} */
+  const parts = [];
+  const visit = (n) => {
+    // Skip trivia: comments/whitespace are already not in AST
+    const kindName = ts.SyntaxKind[n.kind] || `K${n.kind}`;
+    switch (n.kind) {
+      case ts.SyntaxKind.Identifier:
+        parts.push("Id");
+        return;
+      case ts.SyntaxKind.PrivateIdentifier:
+        parts.push("PrivId");
+        return;
+      case ts.SyntaxKind.StringLiteral:
+      case ts.SyntaxKind.NoSubstitutionTemplateLiteral:
+      case ts.SyntaxKind.TemplateHead:
+      case ts.SyntaxKind.TemplateMiddle:
+      case ts.SyntaxKind.TemplateTail:
+        parts.push("Str");
+        return;
+      case ts.SyntaxKind.NumericLiteral:
+        parts.push("Num");
+        return;
+      case ts.SyntaxKind.TrueKeyword:
+      case ts.SyntaxKind.FalseKeyword:
+        parts.push("Bool");
+        return;
+      case ts.SyntaxKind.NullKeyword:
+      case ts.SyntaxKind.UndefinedKeyword:
+        parts.push("Nil");
+        return;
+      case ts.SyntaxKind.PropertyAssignment:
+      case ts.SyntaxKind.ShorthandPropertyAssignment:
+      case ts.SyntaxKind.MethodDeclaration:
+      case ts.SyntaxKind.MethodSignature:
+        parts.push("Prop");
+        break;
+      default:
+        parts.push(kindName);
+    }
+    n.forEachChild(visit);
+  };
+  visit(node);
+  return parts.join("|");
+}
+
+/**
+ * Collects function declarations, function expressions, arrow functions and
+ * methods whose body has at least minLines non-empty lines.
+ * @param {ts.SourceFile} sf - parsed source (parent pointers are required to
+ *   name functions assigned to variables or properties)
+ * @param {string} filePath - currently unused
+ * @returns {Array<{
+ *   name: string,
+ *   bodyText: string,
+ *   structKey: string,
+ *   start: number,
+ *   end: number,
+ *   startLine: number,
+ *   endLine: number
+ * }>} one record per function; startLine/endLine are 1-based within sf
+ */
+function getFunctionInfo(sf, filePath) {
+  /** @type {Array<{
+    name: string,
+    bodyText: string,
+    structKey: string,
+    start: number,
+    end: number,
+    startLine: number,
+    endLine: number
+  }>} */
+  const out = [];
+
+  const addFunc = (nameNode, bodyNode) => {
+    if (!bodyNode) return;
+    const bodyText = bodyNode.getText(sf).trim();
+    const start = bodyNode.getStart(sf);
+    const end = bodyNode.getEnd();
+    const { line: startLine } = sf.getLineAndCharacterOfPosition(start);
+    const { line: endLine } = sf.getLineAndCharacterOfPosition(end);
+    // Identifier.text is the name as written; escapedText carries an extra
+    // leading underscore for names that begin with "__".
+    const name =
+      nameNode && ts.isIdentifier(nameNode) ? nameNode.text : "(anonymous)";
+
+    // min-lines filter
+    const lines = bodyText.split(/\r?\n/).filter(Boolean);
+    if (lines.length < minLines) return;
+
+    // structural signature from the body
+    const structKey = structuralSignature(bodyNode);
+
+    out.push({
+      name,
+      bodyText,
+      structKey,
+      start,
+      end,
+      startLine: startLine + 1,
+      endLine: endLine + 1,
+    });
+  };
+
+  const visit = (node) => {
+    if (ts.isFunctionDeclaration(node) && node.body) {
+      addFunc(node.name ?? null, node.body);
+    } else if (
+      ts.isFunctionExpression(node) ||
+      ts.isArrowFunction(node)
+    ) {
+      // find the name if it is assigned: const foo = () => {}
+      let name = null;
+      if (node.parent && ts.isVariableDeclaration(node.parent) && node.parent.name) {
+        name = node.parent.name;
+      } else if (
+        node.parent &&
+        ts.isPropertyAssignment(node.parent) &&
+        ts.isIdentifier(node.parent.name)
+      ) {
+        name = node.parent.name;
+      } else if (node.name) {
+        name = node.name;
+      }
+      if (node.body) addFunc(name, node.body);
+    } else if (ts.isMethodDeclaration(node) && node.body) {
+      addFunc(node.name, node.body);
+    }
+    node.forEachChild(visit);
+  };
+
+  visit(sf);
+  return out;
+}
+
+/** -------- MAIN SCAN -------- */
+const files = walk(path.resolve(process.cwd(), rootDir));
+
+/** Maps from hash -> occurrences */
+const groups = new Map();
+/** Helper for exact hash */
+const exactHash = (text) =>
+  crypto.createHash("sha1").update(text.replace(/\s+/g, " ").trim()).digest("hex");
+
+for (const file of files) {
+  try {
+    const ext = path.extname(file).toLowerCase();
+    const raw = fs.readFileSync(file, "utf8");
+
+    /** @type {Array<{virtualPath:string, code:string, lineOffset:number}>} */
+    const codeUnits = [];
+
+    if (ext === ".astro") {
+      const blocks = extractCodeFromAstro(raw);
+      blocks.forEach((b, i) => {
+        codeUnits.push({
+          virtualPath: file + `#astro${i + 1}.ts`,
+          code: b.code,
+          lineOffset: b.offset || 1,
+        });
+      });
+    } else {
+      codeUnits.push({ virtualPath: file, code: raw, lineOffset: 1 });
+    }
+
+    for (const { virtualPath, code, lineOffset } of codeUnits) {
+      const sf = createSourceFile(virtualPath, code);
+      const funcs = getFunctionInfo(sf, file);
+      for (const f of funcs) {
+        const key =
+          mode === "exact" ? exactHash(f.bodyText) : crypto.createHash("sha1").update(f.structKey).digest("hex");
+        const item = {
+          file,
+          where:
+            ext === ".astro"
+              ? `${path.relative(process.cwd(), file)}:${f.startLine + lineOffset - 1}-${f.endLine + lineOffset - 1}`
+              : `${path.relative(process.cwd(), file)}:${f.startLine}-${f.endLine}`,
+          name: f.name,
+          lines: f.endLine - f.startLine + 1,
+          preview: f.bodyText.split(/\r?\n/).slice(0, 5).join("\n") + (f.endLine - f.startLine + 1 > 5 ? "\n..." : ""),
+        };
+        if (!groups.has(key)) groups.set(key, []);
+        groups.get(key).push(item);
+      }
+    }
+  } catch (e) {
+    console.warn(`⚠️ Skipping ${file}: ${e.message}`);
+  }
+}
+
+/** -------- REPORT -------- */
+const dupes = [...groups.entries()]
+  .map(([key, arr]) => ({ key, items: arr }))
+  .filter((g) => g.items.length > 1)
+  .sort((a, b) => b.items.length - a.items.length);
+
+if (outputJson) {
+  console.log(JSON.stringify({ mode, minLines, groups: dupes }, null, 2));
+  process.exit(0);
+}
+
+if (dupes.length === 0) {
+  console.log(`✅ No duplicate functions found (mode=${mode}, min-lines=${minLines}).`);
+  process.exit(0);
+}
+
+console.log(`\nFound ${dupes.length} duplicate group(s) (mode=${mode}, min-lines=${minLines}):\n`);
+dupes.forEach((g, i) => {
+  console.log(`== Group ${i + 1} (${g.items.length} matches) ==`);
+  const example = g.items[0];
+  console.log(` Sample (${example.lines} lines) from ${example.where}${example.name ? ` [${example.name}]` : ""}`);
+  console.log(" ---");
+  console.log(indent(example.preview, " "));
+  console.log(" ---");
+  g.items.forEach((it) => {
+    console.log(` • ${it.where}${it.name ? ` [${it.name}]` : ""} (${it.lines} lines)`);
+  });
+  console.log();
+});
+
+/**
+ * Prefixes every line of a string.
+ * @param {string} s - text whose lines are separated by "\n"
+ * @param {string} pre - prefix to put in front of each line
+ * @returns {string}
+ */
+function indent(s, pre) {
+  return s
+    .split("\n")
+    .map((l) => pre + l)
+    .join("\n");
+}
diff --git a/src/components/AIQueryInterface.astro b/src/components/AIQueryInterface.astro
index b513a6c..3053541 100644
--- a/src/components/AIQueryInterface.astro
+++ b/src/components/AIQueryInterface.astro
@@ -12,20 +12,20 @@ const domainAgnosticSoftware = data['domain-agnostic-software'] || [];