/**
 * Local Filesystem Crawler (TRUEREF-0004).
 *
 * Walks a directory tree and enumerates all files, applying the same
 * extension and size filters as the GitHub crawler (TRUEREF-0003).
 * Reads file contents as UTF-8 strings and computes SHA-256 checksums
 * for change detection.
 *
 * Design decisions:
 * - Uses Node.js `fs/promises` and `crypto` — no extra dependencies.
 * - Symlinks and special files (devices, sockets, FIFOs) are skipped.
 * - `trueref.json` / `context7.json` at the repo root are detected and
 *   parsed before any other file filtering runs, matching the GitHub crawler.
 * - File size for filtering is taken from `stat().size` so the size limit
 *   is applied before reading file content (saves I/O on large excluded files).
 * - `commitSha` is derived from a SHA-256 hash of all per-file checksums,
 *   giving a deterministic fingerprint of the crawled file set.
 */

import { execFile } from 'node:child_process';
import { createHash } from 'node:crypto';
import { promises as fs } from 'node:fs';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { promisify } from 'node:util';

import { detectLanguage, shouldIndexFile, shouldPruneDirectory } from './file-filter.js';
import { parseGitignore, type GitignoreFilter } from './gitignore-parser.js';
import { InvalidRefError, NotAGitRepositoryError } from './types.js';
import type { CrawledFile, CrawlResult, RepoConfig } from './types.js';

const execFileAsync = promisify(execFile);

// ---------------------------------------------------------------------------
// Public options type
// ---------------------------------------------------------------------------

export interface LocalCrawlOptions {
	/** Absolute path to the repository root directory. */
	rootPath: string;
	/**
	 * Git ref to check out before crawling — a tag name (e.g. "v2.1.0"),
	 * a branch name, or a commit SHA. When provided the crawler creates an
	 * isolated git worktree at that ref, crawls it, then removes the worktree.
	 * The original working tree is never modified.
	 * Requires `rootPath` to be inside a git repository.
	 */
	ref?: string;
	/** Pre-parsed trueref.json / context7.json configuration, if already loaded. */
	config?: RepoConfig;
	/** Progress callback invoked after each file is read. */
	onProgress?: (processed: number, total: number) => void;
}

// ---------------------------------------------------------------------------
// Internal helpers
// ---------------------------------------------------------------------------

/** Names of config files that control include/exclude rules. */
const CONFIG_FILE_NAMES = new Set(['trueref.json', 'context7.json']);

// ---------------------------------------------------------------------------
// Git helpers
// ---------------------------------------------------------------------------

/**
 * Run a git command inside `cwd` and return trimmed stdout.
 * Throws the child-process error on non-zero exit.
 *
 * @param cwd - Directory passed to `git -C`.
 * @param args - Arguments appended after `git -C <cwd>`.
 * @returns The command's stdout with surrounding whitespace removed.
 */
async function runGit(cwd: string, args: string[]): Promise<string> {
	const { stdout } = await execFileAsync('git', ['-C', cwd, ...args], { encoding: 'utf-8' });
	return stdout.trim();
}

/**
 * Compute a SHA-256 hex digest of a UTF-8 string.
 */
function computeSHA256(content: string): string {
	return createHash('sha256').update(content, 'utf-8').digest('hex');
}

/**
 * Attempt to read and JSON-parse a config file.
 *
 * Returns undefined (after logging a warning) if the file cannot be read,
 * is not valid JSON, or its top-level JSON value is not an object.
 *
 * @param absPath - Absolute path of the config file.
 */
async function parseConfigFile(absPath: string): Promise<RepoConfig | undefined> {
	try {
		const raw = await fs.readFile(absPath, 'utf-8');
		const parsed: unknown = JSON.parse(raw);
		// `null`, arrays and primitives are valid JSON but can never be a
		// config object; route them through the same warning path below.
		if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
			throw new TypeError('Config root must be a JSON object');
		}
		return parsed as RepoConfig;
	} catch {
		console.warn(`[LocalCrawler] Failed to parse config file: ${absPath}`);
		return undefined;
	}
}

// ---------------------------------------------------------------------------
// LocalCrawler
// ---------------------------------------------------------------------------

export class LocalCrawler {
	/**
	 * Crawl a local directory tree and return structured file objects.
	 *
	 * When `options.ref` is supplied the crawler creates an isolated git
	 * worktree checked out at that ref, crawls it, then removes the worktree.
	 * The caller's working tree is never modified.
	 *
	 * @param options - Root path, optional git ref, optional config, and progress callback.
	 * @returns CrawlResult with all read files and summary statistics.
	 * @throws NotAGitRepositoryError when `ref` is given and `rootPath` is not inside a git repository.
	 * @throws InvalidRefError when `ref` does not resolve to a commit or the worktree cannot be created.
	 */
	async crawl(options: LocalCrawlOptions): Promise<CrawlResult> {
		const { rootPath, ref, config, onProgress } = options;

		if (!ref) {
			// Fast path: crawl the working tree as-is.
			return this.crawlDirectory(rootPath, config, onProgress, 'local');
		}

		// Git-aware path: verify repo, resolve ref, create worktree, crawl, clean up.

		// Verify rootPath is inside a git repository.
		await runGit(rootPath, ['rev-parse', '--git-dir']).catch(() => {
			throw new NotAGitRepositoryError(`Not a git repository: ${rootPath}`);
		});

		// Resolve the ref to a concrete commit SHA (validates it exists).
		// `^{commit}` peels annotated tags, so the result is always the commit's
		// SHA rather than the tag object's.
		const commitSha = await runGit(rootPath, ['rev-parse', '--verify', `${ref}^{commit}`]).catch(
			() => {
				throw new InvalidRefError(`Invalid git ref "${ref}" in repository: ${rootPath}`);
			}
		);

		// Create a temporary directory to host the isolated worktree.
		const worktreePath = await fs.mkdtemp(join(tmpdir(), 'trueref-wt-'));

		try {
			// Check out the resolved SHA (not the raw ref) so the crawled tree is
			// exactly the commit reported in the result.
			await runGit(rootPath, ['worktree', 'add', '--detach', worktreePath, commitSha]).catch(
				(err: unknown) => {
					throw new InvalidRefError(
						`Cannot create worktree for ref "${ref}": ${err instanceof Error ? err.message : String(err)}`
					);
				}
			);

			// Crawl the worktree and stamp the result with the git-resolved metadata.
			const result = await this.crawlDirectory(worktreePath, config, onProgress, ref);
			return { ...result, commitSha };
		} finally {
			// Remove the worktree (git also deletes the directory). If git fails,
			// fall back to deleting the temp directory directly. The fallback is
			// returned so it is awaited, and its own failure is ignored because
			// cleanup is best-effort.
			await runGit(rootPath, ['worktree', 'remove', '--force', worktreePath]).catch(() =>
				fs.rm(worktreePath, { recursive: true, force: true }).catch(() => {})
			);
		}
	}

	// ---------------------------------------------------------------------------
	// Private — directory crawl
	// ---------------------------------------------------------------------------

	/**
	 * Walk `rootPath`, apply filters, read files, and build a CrawlResult.
	 * `branch` is embedded verbatim into the returned result.
	 *
	 * @param rootPath - Absolute path of the directory to crawl.
	 * @param callerConfig - Pre-parsed config; when set, any on-disk config file is not parsed.
	 * @param onProgress - Optional callback invoked after each read attempt.
	 * @param branch - Label stored in the result's `branch` field.
	 */
	private async crawlDirectory(
		rootPath: string,
		callerConfig: RepoConfig | undefined,
		onProgress: LocalCrawlOptions['onProgress'],
		branch: string
	): Promise<CrawlResult> {
		// Step 1: Load .gitignore from the repo root (if present).
		// When found, the filter drives file exclusion during the walk.
		// Built-in dependency / build-artifact pruning still applies so local
		// indexing stays focused on repository source, not vendored code.
		const gitignoreFilter = await this.loadGitignore(rootPath);

		// Step 2: Walk the directory tree and collect (relPath, size) pairs.
		// Directories are pruned early — their contents are never enumerated.
		const statCache = new Map<string, number>();
		const allRelPaths = await this.walkDirectory(rootPath, '', statCache, gitignoreFilter);

		// Step 3: Detect trueref.json / context7.json at the repo root first.
		// Only root-level config files are honoured (no directory prefix).
		const configRelPath = allRelPaths.find((p) => CONFIG_FILE_NAMES.has(p));
		let config = callerConfig;
		if (configRelPath && !config) {
			config = await parseConfigFile(join(rootPath, configRelPath));
		}

		// Step 4: Filter files according to extension, size, and config rules.
		const filteredPaths = allRelPaths.filter((relPath) => {
			const size = statCache.get(relPath) ?? 0;
			return shouldIndexFile(relPath, size, config);
		});

		// Step 5: Read file contents and build CrawledFile records.
		// Unreadable files are logged and skipped; progress is still reported.
		const crawledFiles: CrawledFile[] = [];
		for (const [i, relPath] of filteredPaths.entries()) {
			const absPath = join(rootPath, relPath);
			try {
				const content = await fs.readFile(absPath, 'utf-8');
				const sha = computeSHA256(content);
				crawledFiles.push({
					path: relPath,
					content,
					size: Buffer.byteLength(content, 'utf-8'),
					sha,
					language: detectLanguage(relPath)
				});
			} catch (err) {
				console.warn(
					`[LocalCrawler] Could not read file: ${relPath} — ${err instanceof Error ? err.message : String(err)}`
				);
			}
			onProgress?.(i + 1, filteredPaths.length);
		}

		// Step 6: Build a repo-level fingerprint from the file SHAs, in walk order.
		const commitSha = computeSHA256(crawledFiles.map((f) => f.sha).join(''));

		return {
			files: crawledFiles,
			totalFiles: filteredPaths.length,
			skippedFiles: allRelPaths.length - filteredPaths.length,
			branch,
			commitSha,
			// Surface the pre-parsed config so the indexing pipeline can read rules
			// without needing to find trueref.json inside crawledFiles (which fails
			// when a `folders` allowlist excludes the repo root).
			config: config ?? undefined
		};
	}

	// ---------------------------------------------------------------------------
	// Private — .gitignore loading
	// ---------------------------------------------------------------------------

	/**
	 * Attempt to read and parse the root .gitignore file.
	 * Returns null when the file does not exist or cannot be read.
	 *
	 * Only the repository root .gitignore is honoured. Nested .gitignore files
	 * inside subdirectories are not processed (they are rare and their absence
	 * only leads to over-indexing, never incorrect indexing).
	 */
	private async loadGitignore(rootPath: string): Promise<GitignoreFilter | null> {
		try {
			const content = await fs.readFile(join(rootPath, '.gitignore'), 'utf-8');
			return parseGitignore(content);
		} catch {
			// File absent or unreadable — fall back to IGNORED_DIR_NAMES.
			return null;
		}
	}

	// ---------------------------------------------------------------------------
	// Private — directory walk
	// ---------------------------------------------------------------------------

	/**
	 * Recursively walk a directory and collect relative paths of all regular files.
	 *
	 * Directories are pruned before recursion using the built-in ignored-directory
	 * list plus any matching root .gitignore rule. This avoids
	 * enumerating the contents of node_modules, dist, .venv, etc. entirely.
	 *
	 * Individual files are also tested against the gitignore filter when present,
	 * so patterns like *.log or /secrets.json are respected.
	 *
	 * Symlinks and special files (devices, sockets, FIFOs) are silently skipped.
	 * Populates `statCache` with file sizes so the caller can filter without a
	 * second `stat()` call.
	 *
	 * @param dir - Absolute path of the directory to read.
	 * @param rel - Relative path prefix accumulated during recursion.
	 * @param statCache - Mutable map from relative path → byte size.
	 * @param filter - Compiled gitignore filter, or null when absent.
	 * @returns Relative paths (joined with `/`) of every file that was kept.
	 */
	private async walkDirectory(
		dir: string,
		rel: string,
		statCache: Map<string, number>,
		filter: GitignoreFilter | null
	): Promise<string[]> {
		// Directory is unreadable (permissions, etc.) — skip silently.
		const entries = await fs.readdir(dir, { withFileTypes: true }).catch(() => null);
		if (entries === null) return [];

		const files: string[] = [];

		for (const entry of entries) {
			// Only descend into plain directories and collect plain files.
			// entry.isFile() / entry.isDirectory() return false for symlinks,
			// devices, sockets, and FIFOs, so those are all implicitly skipped.
			if (!entry.isFile() && !entry.isDirectory()) continue;

			const relPath = rel ? `${rel}/${entry.name}` : entry.name;

			if (entry.isDirectory()) {
				// Prune ignored directories before recursing — never enumerate
				// their contents. Built-in exclusions always apply, even when a
				// repo-level .gitignore exists but does not mention them.
				const ignored = shouldPruneDirectory(relPath) || filter?.isIgnored(relPath, true);
				if (ignored) continue;

				const children = await this.walkDirectory(
					join(dir, entry.name),
					relPath,
					statCache,
					filter
				);
				// Append one by one: spreading a very large array into push()
				// can exceed the engine's argument-count limit.
				for (const child of children) {
					files.push(child);
				}
			} else {
				// Apply gitignore file-level rules when a filter is loaded.
				if (filter?.isIgnored(relPath, false)) continue;

				// Capture file size from stat so shouldIndexFile can enforce
				// the size limit without reading the file content.
				try {
					const stat = await fs.stat(join(dir, entry.name));
					statCache.set(relPath, stat.size);
				} catch {
					// stat failed: record size 0 and keep the path.
					statCache.set(relPath, 0);
				}

				files.push(relPath);
			}
		}

		return files;
	}
}