When trueref.json specifies a `folders` allowlist (e.g. ["src/"]), shouldIndexFile() excludes trueref.json itself because it lives at the repo root. The indexing pipeline then searches crawlResult.files for the config file, finds nothing, and never writes rules to repository_configs. Fix (Option B): add a `config` field to CrawlResult so LocalCrawler returns the pre-parsed config directly. The indexing pipeline now reads crawlResult.config first instead of scanning files[], which resolves the regression for all repos with a folders allowlist. - Add `config?: RepoConfig` to CrawlResult in crawler/types.ts - Return `config` from LocalCrawler.crawlDirectory() - Update IndexingPipeline.crawl() to propagate CrawlResult.config - Update IndexingPipeline.run() to prefer crawlResult.config over files - Add regression tests covering the folders-allowlist exclusion scenario Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
343 lines
13 KiB
TypeScript
343 lines
13 KiB
TypeScript
/**
 * Local Filesystem Crawler (TRUEREF-0004).
 *
 * Walks a directory tree and enumerates all files, applying the same
 * extension and size filters as the GitHub crawler (TRUEREF-0003).
 * Reads file contents as UTF-8 strings and computes SHA-256 checksums
 * for change detection.
 *
 * Design decisions:
 * - Uses Node.js `fs/promises` and `crypto` — no extra dependencies.
 * - Symlinks and special files (devices, sockets, FIFOs) are skipped.
 * - `trueref.json` / `context7.json` at the repo root are detected and
 *   parsed before any other file filtering runs, matching the GitHub crawler.
 * - File size for filtering is taken from `stat().size` so the size limit
 *   is applied before reading file content (saves I/O on large excluded files).
 * - `commitSha` is derived from a SHA-256 hash of all per-file checksums,
 *   giving a deterministic fingerprint of the crawled file set.
 */
|
|
|
|
import { execFile } from 'node:child_process';
|
|
import { createHash } from 'node:crypto';
|
|
import { promises as fs } from 'node:fs';
|
|
import { tmpdir } from 'node:os';
|
|
import { join } from 'node:path';
|
|
import { promisify } from 'node:util';
|
|
|
|
import { detectLanguage, shouldIndexFile, shouldPruneDirectory } from './file-filter.js';
|
|
import { parseGitignore, type GitignoreFilter } from './gitignore-parser.js';
|
|
import { InvalidRefError, NotAGitRepositoryError } from './types.js';
|
|
import type { CrawledFile, CrawlResult, RepoConfig } from './types.js';
|
|
|
|
const execFileAsync = promisify(execFile);
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Public options type
|
|
// ---------------------------------------------------------------------------
|
|
|
|
export interface LocalCrawlOptions {
|
|
/** Absolute path to the repository root directory. */
|
|
rootPath: string;
|
|
/**
|
|
* Git ref to check out before crawling — a tag name (e.g. "v2.1.0"),
|
|
* a branch name, or a commit SHA. When provided the crawler creates an
|
|
* isolated git worktree at that ref, crawls it, then removes the worktree.
|
|
* The original working tree is never modified.
|
|
* Requires `rootPath` to be inside a git repository.
|
|
*/
|
|
ref?: string;
|
|
/** Pre-parsed trueref.json / context7.json configuration, if already loaded. */
|
|
config?: RepoConfig;
|
|
/** Progress callback invoked after each file is read. */
|
|
onProgress?: (processed: number, total: number) => void;
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Internal helpers
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/** Names of config files that control include/exclude rules. */
|
|
const CONFIG_FILE_NAMES = new Set(['trueref.json', 'context7.json']);
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Git helpers
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
|
|
* Run a git command inside `cwd` and return trimmed stdout.
|
|
* Throws the child-process error on non-zero exit.
|
|
*/
|
|
async function runGit(cwd: string, args: string[]): Promise<string> {
|
|
const { stdout } = await execFileAsync('git', ['-C', cwd, ...args], { encoding: 'utf-8' });
|
|
return stdout.trim();
|
|
}
|
|
|
|
/**
|
|
* Compute a SHA-256 hex digest of a UTF-8 string.
|
|
*/
|
|
function computeSHA256(content: string): string {
|
|
return createHash('sha256').update(content, 'utf-8').digest('hex');
|
|
}
|
|
|
|
/**
|
|
* Attempt to read and JSON-parse a config file.
|
|
* Returns undefined if the file cannot be read or parsed.
|
|
*/
|
|
async function parseConfigFile(absPath: string): Promise<RepoConfig | undefined> {
|
|
try {
|
|
const raw = await fs.readFile(absPath, 'utf-8');
|
|
return JSON.parse(raw) as RepoConfig;
|
|
} catch {
|
|
console.warn(`[LocalCrawler] Failed to parse config file: ${absPath}`);
|
|
return undefined;
|
|
}
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// LocalCrawler
|
|
// ---------------------------------------------------------------------------
|
|
|
|
export class LocalCrawler {
|
|
/**
|
|
* Crawl a local directory tree and return structured file objects.
|
|
*
|
|
* When `options.ref` is supplied the crawler creates an isolated git
|
|
* worktree checked out at that ref, crawls it, then removes the worktree.
|
|
* The caller's working tree is never modified.
|
|
*
|
|
* @param options - Root path, optional git ref, optional config, and progress callback.
|
|
* @returns CrawlResult with all read files and summary statistics.
|
|
*/
|
|
async crawl(options: LocalCrawlOptions): Promise<CrawlResult> {
|
|
const { rootPath, ref } = options;
|
|
|
|
if (!ref) {
|
|
// Fast path: crawl the working tree as-is.
|
|
return this.crawlDirectory(rootPath, options.config, options.onProgress, 'local');
|
|
}
|
|
|
|
// Git-aware path: verify repo, resolve ref, create worktree, crawl, clean up.
|
|
let worktreePath: string | undefined;
|
|
|
|
try {
|
|
// Verify rootPath is inside a git repository.
|
|
await runGit(rootPath, ['rev-parse', '--git-dir']).catch(() => {
|
|
throw new NotAGitRepositoryError(`Not a git repository: ${rootPath}`);
|
|
});
|
|
|
|
// Resolve the ref to a concrete commit SHA (validates it exists).
|
|
const commitSha = await runGit(rootPath, ['rev-parse', '--verify', ref]).catch(() => {
|
|
throw new InvalidRefError(`Invalid git ref "${ref}" in repository: ${rootPath}`);
|
|
});
|
|
|
|
// Create a temporary isolated worktree at the resolved ref.
|
|
const tmpDir = await fs.mkdtemp(join(tmpdir(), 'trueref-wt-'));
|
|
worktreePath = tmpDir;
|
|
|
|
await runGit(rootPath, ['worktree', 'add', '--detach', tmpDir, ref]).catch((err) => {
|
|
throw new InvalidRefError(
|
|
`Cannot create worktree for ref "${ref}": ${err instanceof Error ? err.message : String(err)}`
|
|
);
|
|
});
|
|
|
|
// Crawl the worktree and stamp the result with the git-resolved metadata.
|
|
const result = await this.crawlDirectory(
|
|
worktreePath,
|
|
options.config,
|
|
options.onProgress,
|
|
ref
|
|
);
|
|
|
|
return { ...result, commitSha };
|
|
} finally {
|
|
if (worktreePath) {
|
|
// Remove the worktree (git also deletes the directory).
|
|
await runGit(rootPath, ['worktree', 'remove', '--force', worktreePath]).catch(() => {
|
|
// Best-effort; leave the temp directory for the OS to clean up.
|
|
fs.rm(worktreePath!, { recursive: true, force: true }).catch(() => {});
|
|
});
|
|
}
|
|
}
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Private — directory crawl
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
|
|
* Walk `rootPath`, apply filters, read files, and build a CrawlResult.
|
|
* `branch` is embedded verbatim into the returned result.
|
|
*/
|
|
private async crawlDirectory(
|
|
rootPath: string,
|
|
callerConfig: RepoConfig | undefined,
|
|
onProgress: LocalCrawlOptions['onProgress'],
|
|
branch: string
|
|
): Promise<CrawlResult> {
|
|
// Step 1: Load .gitignore from the repo root (if present).
|
|
// When found, the filter drives file exclusion during the walk.
|
|
// Built-in dependency / build-artifact pruning still applies so local
|
|
// indexing stays focused on repository source, not vendored code.
|
|
const gitignoreFilter = await this.loadGitignore(rootPath);
|
|
|
|
// Step 2: Walk the directory tree and collect (relPath, size) pairs.
|
|
// Directories are pruned early — their contents are never enumerated.
|
|
const statCache = new Map<string, number>();
|
|
const allRelPaths = await this.walkDirectory(rootPath, '', statCache, gitignoreFilter);
|
|
|
|
// Step 3: Detect trueref.json / context7.json at the repo root first.
|
|
// Only root-level config files are honoured (no directory prefix).
|
|
const configRelPath = allRelPaths.find((p) => CONFIG_FILE_NAMES.has(p));
|
|
let config = callerConfig;
|
|
if (configRelPath && !config) {
|
|
config = await parseConfigFile(join(rootPath, configRelPath));
|
|
}
|
|
|
|
// Step 4: Filter files according to extension, size, and config rules.
|
|
const filteredPaths = allRelPaths.filter((relPath) => {
|
|
const size = statCache.get(relPath) ?? 0;
|
|
return shouldIndexFile(relPath, size, config);
|
|
});
|
|
|
|
// Step 5: Read file contents and build CrawledFile records.
|
|
const crawledFiles: CrawledFile[] = [];
|
|
|
|
for (const [i, relPath] of filteredPaths.entries()) {
|
|
const absPath = join(rootPath, relPath);
|
|
try {
|
|
const content = await fs.readFile(absPath, 'utf-8');
|
|
const sha = computeSHA256(content);
|
|
crawledFiles.push({
|
|
path: relPath,
|
|
content,
|
|
size: Buffer.byteLength(content, 'utf-8'),
|
|
sha,
|
|
language: detectLanguage(relPath)
|
|
});
|
|
} catch (err) {
|
|
console.warn(
|
|
`[LocalCrawler] Could not read file: ${relPath} — ${err instanceof Error ? err.message : String(err)}`
|
|
);
|
|
}
|
|
onProgress?.(i + 1, filteredPaths.length);
|
|
}
|
|
|
|
// Step 6: Build a deterministic repo-level fingerprint from file SHAs.
|
|
const commitSha = computeSHA256(crawledFiles.map((f) => f.sha).join(''));
|
|
|
|
return {
|
|
files: crawledFiles,
|
|
totalFiles: filteredPaths.length,
|
|
skippedFiles: allRelPaths.length - filteredPaths.length,
|
|
branch,
|
|
commitSha,
|
|
// Surface the pre-parsed config so the indexing pipeline can read rules
|
|
// without needing to find trueref.json inside crawledFiles (which fails
|
|
// when a `folders` allowlist excludes the repo root).
|
|
config: config ?? undefined
|
|
};
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Private — .gitignore loading
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
|
|
* Attempt to read and parse the root .gitignore file.
|
|
* Returns null when the file does not exist or cannot be read.
|
|
*
|
|
* Only the repository root .gitignore is honoured. Nested .gitignore files
|
|
* inside subdirectories are not processed (they are rare and their absence
|
|
* only leads to over-indexing, never incorrect indexing).
|
|
*/
|
|
private async loadGitignore(rootPath: string): Promise<GitignoreFilter | null> {
|
|
try {
|
|
const content = await fs.readFile(join(rootPath, '.gitignore'), 'utf-8');
|
|
return parseGitignore(content);
|
|
} catch {
|
|
// File absent or unreadable — fall back to IGNORED_DIR_NAMES.
|
|
return null;
|
|
}
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Private — directory walk
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
|
|
* Recursively walk a directory and collect relative paths of all regular files.
|
|
*
|
|
* Directories are pruned before recursion using the built-in ignored-directory
|
|
* list plus any matching root .gitignore rule. This avoids
|
|
* enumerating the contents of node_modules, dist, .venv, etc. entirely.
|
|
*
|
|
* Individual files are also tested against the gitignore filter when present,
|
|
* so patterns like *.log or /secrets.json are respected.
|
|
*
|
|
* Symlinks and special files (devices, sockets, FIFOs) are silently skipped.
|
|
* Populates `statCache` with file sizes so the caller can filter without a
|
|
* second `stat()` call.
|
|
*
|
|
* @param dir - Absolute path of the directory to read.
|
|
* @param rel - Relative path prefix accumulated during recursion.
|
|
* @param statCache - Mutable map from relative path → byte size.
|
|
* @param filter - Compiled gitignore filter, or null when absent.
|
|
*/
|
|
private async walkDirectory(
|
|
dir: string,
|
|
rel: string,
|
|
statCache: Map<string, number>,
|
|
filter: GitignoreFilter | null
|
|
): Promise<string[]> {
|
|
let entries;
|
|
try {
|
|
entries = await fs.readdir(dir, { withFileTypes: true });
|
|
} catch {
|
|
// Directory is unreadable (permissions, etc.) — skip silently.
|
|
return [];
|
|
}
|
|
|
|
const files: string[] = [];
|
|
|
|
for (const entry of entries) {
|
|
// Only descend into plain directories and collect plain files.
|
|
// entry.isFile() / entry.isDirectory() return false for symlinks,
|
|
// devices, sockets, and FIFOs, so those are all implicitly skipped.
|
|
if (!entry.isFile() && !entry.isDirectory()) continue;
|
|
|
|
const relPath = rel ? `${rel}/${entry.name}` : entry.name;
|
|
|
|
if (entry.isDirectory()) {
|
|
// Prune ignored directories before recursing — never enumerate
|
|
// their contents. Built-in exclusions always apply, even when a
|
|
// repo-level .gitignore exists but does not mention them.
|
|
const ignored = shouldPruneDirectory(relPath) || filter?.isIgnored(relPath, true);
|
|
if (ignored) continue;
|
|
|
|
const children = await this.walkDirectory(
|
|
join(dir, entry.name),
|
|
relPath,
|
|
statCache,
|
|
filter
|
|
);
|
|
files.push(...children);
|
|
} else {
|
|
// Apply gitignore file-level rules when a filter is loaded.
|
|
if (filter?.isIgnored(relPath, false)) continue;
|
|
|
|
// Capture file size from stat so shouldIndexFile can enforce
|
|
// the size limit without reading the file content.
|
|
try {
|
|
const stat = await fs.stat(join(dir, entry.name));
|
|
statCache.set(relPath, stat.size);
|
|
} catch {
|
|
statCache.set(relPath, 0);
|
|
}
|
|
files.push(relPath);
|
|
}
|
|
}
|
|
|
|
return files;
|
|
}
|
|
}
|