/**
 * Checksum-based diff for incremental re-indexing (TRUEREF-0017).
 *
 * Takes the result of a fresh crawl plus the documents currently stored for a
 * repository and sorts each file into one of four buckets:
 *
 *   added     — crawled, but no stored document has that path
 *   modified  — stored, but the stored checksum differs from the crawled sha
 *   deleted   — stored, but missing from the new crawl
 *   unchanged — stored, and the stored checksum equals the crawled sha
 */

import type { CrawledFile } from '$lib/server/crawler/types.js';
import type { Document } from '$lib/types';

// ---------------------------------------------------------------------------
// Public types
// ---------------------------------------------------------------------------

/** Result of comparing a crawl against the stored documents. */
export interface FileDiff {
	/** Crawled files for which no stored document exists. */
	added: CrawledFile[];
	/** Crawled files whose sha no longer matches the stored checksum. */
	modified: CrawledFile[];
	/** Paths of stored documents that the crawl did not return. */
	deleted: string[];
	/** Paths whose stored checksum equals the crawled sha — nothing to do. */
	unchanged: string[];
}

// ---------------------------------------------------------------------------
// computeDiff
// ---------------------------------------------------------------------------

/**
 * Classify crawled files and stored documents into a {@link FileDiff}.
 *
 * Matching is done on `CrawledFile.path` versus `Document.filePath`, and a
 * change is detected by comparing `CrawledFile.sha` with `Document.checksum`
 * using strict equality.
 *
 * Inputs are not de-duplicated: every entry of `crawledFiles` is classified on
 * its own, and when several stored documents share a path the last one in
 * `existingDocs` is the one its checksum is taken from.
 *
 * @param crawledFiles - Files returned by the crawler for this run.
 * @param existingDocs - Documents currently in the DB for this repository
 *   (and optionally a specific version).
 * @returns The four buckets; `added`, `modified` and `unchanged` follow the
 *   order of `crawledFiles`, `deleted` follows the order of `existingDocs`.
 */
export function computeDiff(crawledFiles: CrawledFile[], existingDocs: Document[]): FileDiff {
	// Index stored documents by path; a later document overwrites an earlier
	// one with the same path.
	const storedByPath = new Map<string, Document>();
	for (const doc of existingDocs) {
		storedByPath.set(doc.filePath, doc);
	}

	const added: CrawledFile[] = [];
	const modified: CrawledFile[] = [];
	const unchanged: string[] = [];
	// Only the paths are needed on the crawl side, for the deletion pass below.
	const crawledPaths = new Set<string>();

	for (const file of crawledFiles) {
		crawledPaths.add(file.path);

		const stored = storedByPath.get(file.path);
		if (!stored) {
			added.push(file);
			continue;
		}

		if (stored.checksum === file.sha) {
			unchanged.push(file.path);
		} else {
			modified.push(file);
		}
	}

	// Anything stored that the crawl did not return is treated as deleted.
	const deleted: string[] = [];
	for (const doc of existingDocs) {
		if (!crawledPaths.has(doc.filePath)) {
			deleted.push(doc.filePath);
		}
	}

	return { added, modified, deleted, unchanged };
}