forensic-pathways/find-duplicates.mjs
2025-08-17 16:55:02 +02:00

334 lines
9.5 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env node
// find-duplicates.mjs
// Usage:
// node find-duplicates.mjs [rootDir] [--mode exact|struct] [--min-lines N] [--json]
// Example:
// node find-duplicates.mjs . --mode struct --min-lines 3
import fs from "fs";
import path from "path";
import * as url from "url";
import ts from "typescript";
// Directory that contains this script, derived from the module URL.
const __dirname = path.dirname(url.fileURLToPath(import.meta.url));
/** -------- CLI OPTIONS -------- */
/**
 * Turn raw command-line arguments into scanner options.
 *
 * Recognised arguments:
 *   [rootDir]           first non-flag argument (only taken while rootDir is still ".")
 *   --mode exact|struct comparison mode, case-insensitive (default "struct")
 *   --min-lines N       minimum body size for a function to be considered (default 3)
 *   --json              emit the report as JSON
 * Anything else is ignored.
 *
 * @param {string[]} argv Arguments without the node binary and script path.
 * @returns {{rootDir: string, mode: string, minLines: number, outputJson: boolean}}
 * @throws {Error} When --mode or --min-lines carries an unusable value.
 */
function parseCliArgs(argv) {
  const opts = { rootDir: ".", mode: "struct", minLines: 3, outputJson: false };
  for (let i = 0; i < argv.length; i++) {
    const a = argv[i];
    if (!a.startsWith("--") && opts.rootDir === ".") {
      opts.rootDir = a;
    } else if (a === "--mode") {
      opts.mode = (argv[++i] || "struct").toLowerCase();
      if (!["exact", "struct"].includes(opts.mode)) {
        throw new Error("Invalid --mode. Use 'exact' or 'struct'.");
      }
    } else if (a === "--min-lines") {
      const parsed = Number.parseInt(argv[++i] || "3", 10);
      // A NaN threshold would make every `count < minLines` comparison false
      // and silently switch the filter off, so reject it up front.
      if (Number.isNaN(parsed)) {
        throw new Error("Invalid --min-lines. Use an integer.");
      }
      opts.minLines = parsed;
    } else if (a === "--json") {
      opts.outputJson = true;
    }
  }
  return opts;
}
const args = process.argv.slice(2);
let rootDir = ".";
let mode = "struct"; // "exact" | "struct"
let minLines = 3;
let outputJson = false;
try {
  ({ rootDir, mode, minLines, outputJson } = parseCliArgs(args));
} catch (err) {
  // Bad CLI input: print the message and stop with a failure status.
  console.error(err.message);
  process.exit(1);
}
/** -------- FILE DISCOVERY -------- */
// Directory names that are never descended into.
const DEFAULT_IGNORES = new Set([
  "node_modules",
  ".git",
  ".next",
  ".vercel",
  "dist",
  "build",
  ".astro", // Astro's generated cache dir
]);
// File extensions (case-sensitive, including the dot) that get scanned.
const VALID_EXTS = new Set([".ts", ".tsx", ".astro", ".mts", ".cts"]);
/**
 * Recursively list the scannable files below a directory.
 *
 * @param {string} dir Directory to read.
 * @returns {string[]} Paths (dir joined with each entry name) of regular files
 *   whose extension is in VALID_EXTS, in directory-listing order, with the
 *   contents of each subdirectory placed where that subdirectory was listed.
 */
function walk(dir) {
  /** @type {string[]} */
  const found = [];
  for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
    const fullPath = path.join(dir, entry.name);
    if (entry.isDirectory()) {
      // Ignored directories are skipped entirely, everything else is descended into.
      if (!DEFAULT_IGNORES.has(entry.name)) {
        for (const nested of walk(fullPath)) {
          found.push(nested);
        }
      }
      continue;
    }
    if (entry.isFile() && VALID_EXTS.has(path.extname(entry.name))) {
      found.push(fullPath);
    }
  }
  return found;
}
/** -------- ASTRO CODE EXTRACTION --------
 * Extract TS/JS code from an Astro component:
 * - frontmatter: --- ... ---
 * - <script ...> ... </script>
 *
 * @param {string} source Full text of the .astro file.
 * @returns {{code:string, offset:number}[]} One entry per extracted block, in
 *   the order frontmatter first, then scripts as they appear. `offset` is the
 *   1-based line of `source` on which the block's first line sits, so a line
 *   number inside the block maps back with `fileLine = codeLine + offset - 1`.
 */
function extractCodeFromAstro(source) {
  /** @type {{code:string, offset:number}[]} */
  const blocks = [];
  // Frontmatter only counts when the file opens with the fence; it runs up to
  // the FIRST later line that begins with ---.
  if (source.startsWith("---")) {
    const end = source.indexOf("\n---", 3);
    if (end !== -1) {
      // The slice starts right after the opening fence, i.e. still on file
      // line 1 (it keeps that line's newline), so block line 1 is file line 1.
      blocks.push({ code: source.slice(3, end + 1), offset: 1 });
    }
  }
  // <script ...> ... </script>
  const scriptRe = /<script\b[^>]*>([\s\S]*?)<\/script>/gi;
  let m;
  while ((m = scriptRe.exec(source))) {
    const code = m[1] || "";
    // The opening tag is matched with [^>]*>, so it ends at the first ">" of
    // the match. Measuring from there keeps line numbers right even when the
    // opening tag itself is spread over several lines.
    const codeStart = m.index + m[0].indexOf(">") + 1;
    blocks.push({ code, offset: indexToLine(source, codeStart) });
  }
  return blocks;
}
/** -------- UTIL: index -> 1-based line -------- */
/**
 * Report which line a character index falls on.
 *
 * @param {string} text Text to inspect.
 * @param {number} idx Character index; only newlines strictly before it count.
 * @returns {number} 1 plus the number of "\n" characters located before `idx`
 *   (indices past the end of the text behave like the end of the text).
 */
function indexToLine(text, idx) {
  const limit = Math.min(idx, text.length);
  let newlinesSeen = 0;
  // Hop from newline to newline instead of inspecting every character.
  let pos = text.indexOf("\n");
  while (pos !== -1 && pos < limit) {
    newlinesSeen += 1;
    pos = text.indexOf("\n", pos + 1);
  }
  return newlinesSeen + 1;
}
/** -------- AST HELPERS -------- */
/**
 * Parse a piece of code into a TypeScript source file.
 *
 * @param {string} virtualPath File name handed to the parser; a name ending in
 *   ".tsx" selects the TSX script kind, anything else is parsed as TS.
 * @param {string} code Source text to parse.
 * @returns The value returned by `ts.createSourceFile`, created with the
 *   latest script target and parent pointers enabled.
 */
function createSourceFile(virtualPath, code) {
  const scriptKind = virtualPath.endsWith(".tsx") ? ts.ScriptKind.TSX : ts.ScriptKind.TS;
  const setParentNodes = true;
  return ts.createSourceFile(virtualPath, code, ts.ScriptTarget.Latest, setParentNodes, scriptKind);
}
/**
 * Normalize an AST subtree to a structural signature string.
 *
 * The tree is walked in pre-order and one token is emitted per node, joined
 * with "|". Identifiers and literals collapse to placeholder tokens, so two
 * subtrees that differ only in names or literal values get the same signature.
 * Comments and whitespace are not part of the AST and therefore never appear.
 *
 * @param node Root of the subtree (needs `kind` and `forEachChild`).
 * @returns {string} The "|"-joined token sequence.
 */
function structuralSignature(node) {
  const K = ts.SyntaxKind;
  // Kinds replaced by a placeholder; the walk does not descend below them.
  const leafTokens = new Map([
    [K.Identifier, "Id"],
    [K.PrivateIdentifier, "PrivId"],
    [K.StringLiteral, "Str"],
    [K.NoSubstitutionTemplateLiteral, "Str"],
    [K.TemplateHead, "Str"],
    [K.TemplateMiddle, "Str"],
    [K.TemplateTail, "Str"],
    [K.NumericLiteral, "Num"],
    [K.TrueKeyword, "Bool"],
    [K.FalseKeyword, "Bool"],
    [K.NullKeyword, "Nil"],
    [K.UndefinedKeyword, "Nil"],
  ]);
  // Member-like kinds share one token but their children are still visited.
  const memberKinds = new Set([
    K.PropertyAssignment,
    K.ShorthandPropertyAssignment,
    K.MethodDeclaration,
    K.MethodSignature,
  ]);
  /** @type {string[]} */
  const tokens = [];
  const descend = (current) => {
    const placeholder = leafTokens.get(current.kind);
    if (placeholder !== undefined) {
      tokens.push(placeholder);
      return;
    }
    if (memberKinds.has(current.kind)) {
      tokens.push("Prop");
    } else {
      // Fall back to a numeric tag when the enum has no name for this kind.
      tokens.push(K[current.kind] || `K${current.kind}`);
    }
    current.forEachChild(descend);
  };
  descend(node);
  return tokens.join("|");
}
/**
 * Collect every function-like body in a parsed source file.
 *
 * Covered: function declarations with a body, function expressions, arrow
 * functions and method declarations with a body. Bodies with fewer than
 * `minLines` non-empty lines (after trimming the body text) are dropped.
 *
 * @param sf Parsed source file (see createSourceFile).
 * @param {string} filePath Accepted for the caller's convenience; not used here.
 * @returns {Array<{
 *   name: string,
 *   bodyText: string,
 *   structKey: string,
 *   start: number,
 *   end: number,
 *   startLine: number,
 *   endLine: number
 * }>} One record per kept body, in traversal order. `start`/`end` are positions
 *   inside `sf`; `startLine`/`endLine` are 1-based lines inside `sf`.
 */
function getFunctionInfo(sf, filePath) {
  const collected = [];

  // Only plain identifiers yield a readable name; everything else is anonymous.
  const labelOf = (nameNode) =>
    nameNode && ts.isIdentifier(nameNode) ? nameNode.escapedText.toString() : "(anonymous)";

  const record = (nameNode, bodyNode) => {
    if (!bodyNode) return;
    const bodyText = bodyNode.getText(sf).trim();
    // min-lines filter: count the non-empty lines of the trimmed body
    const nonEmptyLineCount = bodyText.split(/\r?\n/).filter(Boolean).length;
    if (nonEmptyLineCount < minLines) return;
    const start = bodyNode.getStart(sf);
    const end = bodyNode.getEnd();
    collected.push({
      name: labelOf(nameNode),
      bodyText,
      // structural signature from the body
      structKey: structuralSignature(bodyNode),
      start,
      end,
      // the source file reports 0-based lines
      startLine: sf.getLineAndCharacterOfPosition(start).line + 1,
      endLine: sf.getLineAndCharacterOfPosition(end).line + 1,
    });
  };

  // Name for a function/arrow expression: prefer what it's assigned to
  // (const foo = () => {} or { foo: function () {} }), else its own name.
  const nameForExpression = (fn) => {
    const owner = fn.parent;
    if (owner && ts.isVariableDeclaration(owner) && owner.name) return owner.name;
    if (owner && ts.isPropertyAssignment(owner) && ts.isIdentifier(owner.name)) return owner.name;
    return fn.name ? fn.name : null;
  };

  const traverse = (node) => {
    if (ts.isFunctionExpression(node) || ts.isArrowFunction(node)) {
      if (node.body) record(nameForExpression(node), node.body);
    } else if ((ts.isFunctionDeclaration(node) || ts.isMethodDeclaration(node)) && node.body) {
      record(node.name ?? null, node.body);
    }
    // Always keep descending so nested functions are collected too.
    node.forEachChild(traverse);
  };
  traverse(sf);
  return collected;
}
/** -------- MAIN SCAN -------- */
// Import declarations are hoisted, so `crypto` is available to the whole module.
import crypto from "crypto";
/** Every candidate source file below the requested root (resolved against the cwd). */
const files = walk(path.resolve(process.cwd(), rootDir));
/** Maps from hash -> occurrences */
const groups = new Map();
/** Helper for exact hash: SHA-1 of the text with each whitespace run collapsed to one space. */
const exactHash = (text) => {
  const normalized = text.replace(/\s+/g, " ").trim();
  return crypto.createHash("sha1").update(normalized).digest("hex");
};
for (const file of files) {
  try {
    const isAstro = path.extname(file).toLowerCase() === ".astro";
    const raw = fs.readFileSync(file, "utf8");
    // Astro files contribute one unit per extracted block; any other file is a
    // single unit whose lines already match the file's lines (lineOffset 1).
    /** @type {Array<{virtualPath:string, code:string, lineOffset:number}>} */
    const codeUnits = isAstro
      ? extractCodeFromAstro(raw).map((block, idx) => ({
          virtualPath: `${file}#astro${idx + 1}.ts`,
          code: block.code,
          lineOffset: block.offset || 1,
        }))
      : [{ virtualPath: file, code: raw, lineOffset: 1 }];
    const displayPath = path.relative(process.cwd(), file);
    for (const unit of codeUnits) {
      const sf = createSourceFile(unit.virtualPath, unit.code);
      // Shift from unit-local line numbers to lines of the file on disk.
      const lineShift = unit.lineOffset - 1;
      for (const fn of getFunctionInfo(sf, file)) {
        const key =
          mode === "exact"
            ? exactHash(fn.bodyText)
            : crypto.createHash("sha1").update(fn.structKey).digest("hex");
        const lineCount = fn.endLine - fn.startLine + 1;
        // The preview shows at most the first five lines of the body.
        const previewHead = fn.bodyText.split(/\r?\n/).slice(0, 5).join("\n");
        const item = {
          file,
          where: `${displayPath}:${fn.startLine + lineShift}-${fn.endLine + lineShift}`,
          name: fn.name,
          lines: lineCount,
          preview: lineCount > 5 ? previewHead + "\n..." : previewHead,
        };
        const bucket = groups.get(key);
        if (bucket) {
          bucket.push(item);
        } else {
          groups.set(key, [item]);
        }
      }
    }
  } catch (e) {
    // Best effort: a file that cannot be read or parsed is reported and skipped.
    console.warn(`⚠️ Skipping ${file}: ${e.message}`);
  }
}
/** -------- REPORT -------- */
// Keep only buckets with more than one occurrence, largest groups first.
const dupes = [...groups.entries()]
  .map(([key, arr]) => ({ key, items: arr }))
  .filter((g) => g.items.length > 1)
  .sort((a, b) => b.items.length - a.items.length);
// The three outcomes are exclusive branches and the script simply runs to its
// end (exit status 0). Calling process.exit() right after console.log() can cut
// the output short when stdout is written asynchronously, which matters most
// for the potentially large --json report being piped to another program.
if (outputJson) {
  console.log(JSON.stringify({ mode, minLines, groups: dupes }, null, 2));
} else if (dupes.length === 0) {
  console.log(`✅ No duplicate functions found (mode=${mode}, min-lines=${minLines}).`);
} else {
  console.log(`\nFound ${dupes.length} duplicate group(s) (mode=${mode}, min-lines=${minLines}):\n`);
  for (const [i, g] of dupes.entries()) {
    console.log(`== Group ${i + 1} (${g.items.length} matches) ==`);
    // The first occurrence serves as the sample whose preview is printed.
    const example = g.items[0];
    console.log(` Sample (${example.lines} lines) from ${example.where}${example.name ? ` [${example.name}]` : ""}`);
    console.log(" ---");
    console.log(indent(example.preview, " "));
    console.log(" ---");
    for (const it of g.items) {
      console.log(`${it.where}${it.name ? ` [${it.name}]` : ""} (${it.lines} lines)`);
    }
    console.log();
  }
}
/**
 * Prefix every line of a string.
 *
 * @param {string} s Text whose lines are separated by "\n".
 * @param {string} pre Prefix placed in front of each line, empty lines included.
 * @returns {string} The prefixed text, still joined with "\n".
 */
function indent(s, pre) {
  // Prefix the first line, then re-join the rest with "newline + prefix".
  return pre + s.split("\n").join("\n" + pre);
}