feat(TRUEREF-0003-0004): implement GitHub and local filesystem crawlers

- GitHub crawler with rate limiting, semaphore concurrency, retry logic
- File filtering by extension, size, and trueref.json rules
- Local filesystem crawler with SHA-256 checksums and progress callbacks
- Shared types and file filter logic between both crawlers

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Giancarmine Salucci
2026-03-23 09:06:07 +01:00
parent cb253ffe98
commit 1c15d6c474
7 changed files with 2308 additions and 0 deletions

View File

@@ -0,0 +1,275 @@
/**
* Local Filesystem Crawler (TRUEREF-0004).
*
* Walks a directory tree and enumerates all files, applying the same
* extension and size filters as the GitHub crawler (TRUEREF-0003).
* Reads file contents as UTF-8 strings and computes SHA-256 checksums
* for change detection.
*
* Design decisions:
* - Uses Node.js `fs/promises` and `crypto` — no extra dependencies.
* - Symlinks and special files (devices, sockets, FIFOs) are skipped.
* - `trueref.json` / `context7.json` at the repo root are detected and
* parsed before any other file filtering runs, matching the GitHub crawler.
* - File size for filtering is taken from `stat().size` so the size limit
* is applied before reading file content (saves I/O on large excluded files).
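* - Without a `ref`, `commitSha` is derived from a SHA-256 hash of all per-file
* checksums (in directory-walk order), giving a deterministic fingerprint of
* the crawled file set. With a `ref`, `commitSha` is the SHA git resolves
* that ref to.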
*/
import { execFile } from 'node:child_process';
import { createHash } from 'node:crypto';
import { promises as fs } from 'node:fs';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { promisify } from 'node:util';
import { detectLanguage, shouldIndexFile } from './file-filter.js';
import { InvalidRefError, NotAGitRepositoryError } from './types.js';
import type { CrawledFile, CrawlResult, RepoConfig } from './types.js';
/** Promise-returning form of `child_process.execFile`; used by `runGit` to invoke the git binary. */
const execFileAsync = promisify(execFile);
// ---------------------------------------------------------------------------
// Public options type
// ---------------------------------------------------------------------------
/**
* Options accepted by `LocalCrawler.crawl`.
*/
export interface LocalCrawlOptions {
/**
* Absolute path to the repository root directory.
* Also used as the working directory for git commands when `ref` is set.
*/
rootPath: string;
/**
* Git ref to check out before crawling — a tag name (e.g. "v2.1.0"),
* a branch name, or a commit SHA. When provided the crawler creates an
* isolated git worktree at that ref, crawls it, then removes the worktree.
* The original working tree is never modified.
* Requires `rootPath` to be inside a git repository.
* When omitted (or empty), `rootPath` is crawled as-is and the result's
* `branch` is reported as "local".
*/
ref?: string;
/**
* Pre-parsed trueref.json / context7.json configuration, if already loaded.
* When supplied, a root-level config file found during the crawl is not parsed.
*/
config?: RepoConfig;
/**
* Progress callback invoked once per file that passed filtering, after the
* read attempt (including attempts that failed). `processed` is the 1-based
* count so far; `total` is the number of files that passed filtering.
*/
onProgress?: (processed: number, total: number) => void;
}
// ---------------------------------------------------------------------------
// Internal helpers
// ---------------------------------------------------------------------------
/**
* Names of config files that control include/exclude rules.
* Matched against whole relative paths, so only a file sitting directly at the
* root of the crawled tree (no directory prefix) can match.
*/
const CONFIG_FILE_NAMES = new Set(['trueref.json', 'context7.json']);
// ---------------------------------------------------------------------------
// Git helpers
// ---------------------------------------------------------------------------
/**
 * Execute `git -C <cwd> <args...>` and hand back its standard output with
 * surrounding whitespace removed.
 *
 * @param cwd - Directory git should operate in (passed via `-C`).
 * @param args - Git sub-command and its arguments.
 * @returns Trimmed stdout of the git process.
 * @throws The rejection produced by the child process (e.g. non-zero exit).
 */
async function runGit(cwd: string, args: string[]): Promise<string> {
  const argv = ['-C', cwd].concat(args);
  const output = await execFileAsync('git', argv, { encoding: 'utf-8' });
  return output.stdout.trim();
}
/**
 * Hash a string with SHA-256.
 *
 * @param content - Text to hash; encoded as UTF-8 before hashing.
 * @returns The digest as a lowercase hexadecimal string.
 */
function computeSHA256(content: string): string {
  const hasher = createHash('sha256');
  hasher.update(content, 'utf-8');
  return hasher.digest('hex');
}
/**
 * Attempt to read and JSON-parse a config file.
 *
 * A leading UTF-8 byte-order mark is stripped before parsing, because
 * `JSON.parse` rejects it and some editors write one. The parsed value is only
 * accepted when its root is a JSON object; `null`, arrays and primitives are
 * treated like a parse failure so callers never receive a non-object config.
 *
 * @param absPath - Absolute path of the config file to load.
 * @returns The parsed configuration, or `undefined` if the file cannot be
 *          read, is not valid JSON, or does not contain a JSON object.
 */
async function parseConfigFile(absPath: string): Promise<RepoConfig | undefined> {
  try {
    const raw = await fs.readFile(absPath, 'utf-8');
    // Drop a BOM if present; JSON.parse would otherwise throw on it.
    const text = raw.charCodeAt(0) === 0xfeff ? raw.slice(1) : raw;
    const parsed: unknown = JSON.parse(text);
    if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
      // Route non-object roots through the same warning path as a parse error.
      throw new TypeError('config root is not a JSON object');
    }
    return parsed as RepoConfig;
  } catch {
    console.warn(`[LocalCrawler] Failed to parse config file: ${absPath}`);
    return undefined;
  }
}
// ---------------------------------------------------------------------------
// LocalCrawler
// ---------------------------------------------------------------------------
export class LocalCrawler {
  /**
   * Crawl a local directory tree and return structured file objects.
   *
   * Without `options.ref`, the directory at `options.rootPath` is crawled
   * as-is and the result's `branch` is `'local'`.
   *
   * With `options.ref`, the ref is resolved to a commit, a detached git
   * worktree is created at that commit inside a fresh temp directory, the
   * worktree is crawled, and the worktree is removed again before this method
   * settles. The caller's working tree is never modified. In that mode the
   * result's `branch` is the ref as given and `commitSha` is the resolved
   * commit SHA (annotated tags are peeled to the commit they point at).
   *
   * @param options - Root path, optional git ref, optional config, and progress callback.
   * @returns CrawlResult with all read files and summary statistics.
   * @throws NotAGitRepositoryError if `ref` is set and `rootPath` is not inside a git repository.
   * @throws InvalidRefError if `ref` cannot be resolved to a commit or the worktree cannot be created.
   */
  async crawl(options: LocalCrawlOptions): Promise<CrawlResult> {
    const { rootPath, ref } = options;

    if (!ref) {
      // Fast path: crawl the working tree as-is.
      return this.crawlDirectory(rootPath, options.config, options.onProgress, 'local');
    }

    // Git-aware path: verify repo, resolve ref, create worktree, crawl, clean up.
    let worktreePath: string | undefined;
    try {
      // Verify rootPath is inside a git repository.
      await runGit(rootPath, ['rev-parse', '--git-dir']).catch(() => {
        throw new NotAGitRepositoryError(`Not a git repository: ${rootPath}`);
      });

      // Validate that the ref exists. For an annotated tag this yields the
      // tag object's id, not the commit, hence the peel step below.
      const objectSha = await runGit(rootPath, ['rev-parse', '--verify', ref]).catch(() => {
        throw new InvalidRefError(`Invalid git ref "${ref}" in repository: ${rootPath}`);
      });

      // Peel to the underlying commit so `commitSha` really is a commit SHA.
      // Peeling the resolved id (rather than appending to the raw ref) keeps
      // every revision syntax that the first call accepted working.
      const commitSha = await runGit(rootPath, [
        'rev-parse',
        '--verify',
        `${objectSha}^{commit}`
      ]).catch(() => {
        throw new InvalidRefError(`Invalid git ref "${ref}" in repository: ${rootPath}`);
      });

      // Create a temporary isolated worktree. The resolved SHA is used instead
      // of the raw ref so the checkout matches `commitSha` exactly, even if
      // the ref moves in between.
      const tmpDir = await fs.mkdtemp(join(tmpdir(), 'trueref-wt-'));
      worktreePath = tmpDir;
      await runGit(rootPath, ['worktree', 'add', '--detach', tmpDir, commitSha]).catch(
        (err: unknown) => {
          throw new InvalidRefError(
            `Cannot create worktree for ref "${ref}": ${err instanceof Error ? err.message : String(err)}`
          );
        }
      );

      // Crawl the worktree and stamp the result with the git-resolved metadata.
      const result = await this.crawlDirectory(tmpDir, options.config, options.onProgress, ref);
      return { ...result, commitSha };
    } finally {
      if (worktreePath) {
        try {
          // Remove the worktree (git also deletes the directory).
          await runGit(rootPath, ['worktree', 'remove', '--force', worktreePath]);
        } catch {
          // Best-effort fallback: delete the temp directory ourselves. Awaited
          // so the directory is gone (or the attempt has failed) before crawl()
          // settles; failures are ignored so they never mask the real outcome.
          await fs.rm(worktreePath, { recursive: true, force: true }).catch(() => {});
        }
      }
    }
  }

  // ---------------------------------------------------------------------------
  // Private — directory crawl
  // ---------------------------------------------------------------------------

  /**
   * Walk `rootPath`, apply filters, read files, and build a CrawlResult.
   *
   * Files that pass filtering but cannot be read are logged and left out of
   * `files`; they still count towards `totalFiles` and still trigger
   * `onProgress`.
   *
   * @param rootPath - Directory to crawl.
   * @param callerConfig - Config supplied by the caller; when present, a
   *        root-level config file is not parsed.
   * @param onProgress - Optional callback invoked after each read attempt.
   * @param branch - Embedded verbatim into the returned result.
   * @returns The crawl result; `commitSha` is a SHA-256 over the concatenated
   *          per-file checksums in walk order.
   */
  private async crawlDirectory(
    rootPath: string,
    callerConfig: RepoConfig | undefined,
    onProgress: LocalCrawlOptions['onProgress'],
    branch: string
  ): Promise<CrawlResult> {
    // Step 1: Walk the directory tree and collect (relPath, size) pairs.
    const statCache = new Map<string, number>();
    const allRelPaths = await this.walkDirectory(rootPath, '', statCache);

    // Step 2: Detect trueref.json / context7.json at the repo root first.
    // Only root-level config files are honoured (no directory prefix).
    const configRelPath = allRelPaths.find((p) => CONFIG_FILE_NAMES.has(p));
    let config = callerConfig;
    if (configRelPath && !config) {
      config = await parseConfigFile(join(rootPath, configRelPath));
    }

    // Step 3: Filter files according to extension, size, and config rules.
    const filteredPaths = allRelPaths.filter((relPath) => {
      const size = statCache.get(relPath) ?? 0;
      return shouldIndexFile(relPath, size, config);
    });

    // Step 4: Read file contents and build CrawledFile records.
    const crawledFiles: CrawledFile[] = [];
    for (const [i, relPath] of filteredPaths.entries()) {
      const absPath = join(rootPath, relPath);
      try {
        const content = await fs.readFile(absPath, 'utf-8');
        const sha = computeSHA256(content);
        crawledFiles.push({
          path: relPath,
          content,
          size: Buffer.byteLength(content, 'utf-8'),
          sha,
          language: detectLanguage(relPath)
        });
      } catch (err) {
        console.warn(
          `[LocalCrawler] Could not read file: ${relPath}: ${err instanceof Error ? err.message : String(err)}`
        );
      }
      onProgress?.(i + 1, filteredPaths.length);
    }

    // Step 5: Build a repo-level fingerprint from file SHAs.
    // NOTE(review): the fingerprint follows walk order, which comes from
    // readdir; confirm that order is stable enough for change detection.
    const commitSha = computeSHA256(crawledFiles.map((f) => f.sha).join(''));

    return {
      files: crawledFiles,
      totalFiles: filteredPaths.length,
      skippedFiles: allRelPaths.length - filteredPaths.length,
      branch,
      commitSha
    };
  }

  /**
   * Recursively walk a directory and collect relative paths of all regular files.
   * Symlinks and special files (devices, sockets, FIFOs) are silently skipped,
   * as are directories that cannot be read.
   * Populates `statCache` with file sizes so the caller can filter without a
   * second `stat()` call; a file whose `stat()` fails is recorded with size 0.
   *
   * @param dir - Absolute path of the directory to read.
   * @param rel - Relative path prefix accumulated during recursion.
   * @param statCache - Mutable map from relative path → byte size.
   * @returns Relative paths (always `/`-separated) of every regular file found.
   */
  private async walkDirectory(
    dir: string,
    rel: string,
    statCache: Map<string, number>
  ): Promise<string[]> {
    const files: string[] = [];
    await this.collectFiles(dir, rel, statCache, files);
    return files;
  }

  /**
   * Recursive worker for `walkDirectory`: appends the files below `dir` to `out`.
   * A single shared accumulator is used instead of `push(...children)` because
   * spreading a very large array into a call can exceed the engine's argument
   * limit and throw a RangeError.
   */
  private async collectFiles(
    dir: string,
    rel: string,
    statCache: Map<string, number>,
    out: string[]
  ): Promise<void> {
    // Directory is unreadable (permissions, etc.) — skip silently.
    const entries = await fs.readdir(dir, { withFileTypes: true }).catch(() => null);
    if (!entries) return;

    for (const entry of entries) {
      // Only descend into plain directories and collect plain files.
      // entry.isFile() / entry.isDirectory() return false for symlinks,
      // devices, sockets, and FIFOs, so those are all implicitly skipped.
      if (!entry.isFile() && !entry.isDirectory()) continue;

      const relPath = rel ? `${rel}/${entry.name}` : entry.name;
      const absPath = join(dir, entry.name);

      if (entry.isDirectory()) {
        await this.collectFiles(absPath, relPath, statCache, out);
        continue;
      }

      // Capture file size from stat so shouldIndexFile can enforce the limit
      // without reading the file.
      try {
        const stat = await fs.stat(absPath);
        statCache.set(relPath, stat.size);
      } catch {
        statCache.set(relPath, 0);
      }
      out.push(relPath);
    }
  }
}