feat(TRUEREF-0003-0004): implement GitHub and local filesystem crawlers
- GitHub crawler with rate limiting, semaphore concurrency, retry logic - File filtering by extension, size, and trueref.json rules - Local filesystem crawler with SHA-256 checksums and progress callbacks - Shared types and file filter logic between both crawlers Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
275
src/lib/server/crawler/local.crawler.ts
Normal file
275
src/lib/server/crawler/local.crawler.ts
Normal file
@@ -0,0 +1,275 @@
|
||||
/**
|
||||
* Local Filesystem Crawler (TRUEREF-0004).
|
||||
*
|
||||
* Walks a directory tree and enumerates all files, applying the same
|
||||
* extension and size filters as the GitHub crawler (TRUEREF-0003).
|
||||
* Reads file contents as UTF-8 strings and computes SHA-256 checksums
|
||||
* for change detection.
|
||||
*
|
||||
* Design decisions:
|
||||
* - Uses Node.js `fs/promises` and `crypto` — no extra dependencies.
|
||||
* - Symlinks and special files (devices, sockets, FIFOs) are skipped.
|
||||
* - `trueref.json` / `context7.json` at the repo root are detected and
|
||||
* parsed before any other file filtering runs, matching the GitHub crawler.
|
||||
* - File size for filtering is taken from `stat().size` so the size limit
|
||||
* is applied before reading file content (saves I/O on large excluded files).
|
||||
* - `commitSha` is derived from a SHA-256 hash of all per-file checksums,
|
||||
* giving a deterministic fingerprint of the crawled file set.
|
||||
*/
|
||||
|
||||
import { execFile } from 'node:child_process';
|
||||
import { createHash } from 'node:crypto';
|
||||
import { promises as fs } from 'node:fs';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { promisify } from 'node:util';
|
||||
|
||||
import { detectLanguage, shouldIndexFile } from './file-filter.js';
|
||||
import { InvalidRefError, NotAGitRepositoryError } from './types.js';
|
||||
import type { CrawledFile, CrawlResult, RepoConfig } from './types.js';
|
||||
|
||||
// Promise-based wrapper around child_process.execFile; used by runGit below.
const execFileAsync = promisify(execFile);
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Public options type
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
export interface LocalCrawlOptions {
	/** Absolute path to the repository root directory. */
	rootPath: string;
	/**
	 * Git ref to check out before crawling — a tag name (e.g. "v2.1.0"),
	 * a branch name, or a commit SHA. When provided the crawler creates an
	 * isolated git worktree at that ref, crawls it, then removes the worktree.
	 * The original working tree is never modified.
	 * Requires `rootPath` to be inside a git repository.
	 */
	ref?: string;
	/**
	 * Pre-parsed trueref.json / context7.json configuration. When supplied it
	 * takes precedence over any config file found on disk at the repo root.
	 */
	config?: RepoConfig;
	/**
	 * Progress callback invoked once per filtered file, after the read attempt
	 * (successful or not). `processed` is 1-based; `total` is the number of
	 * files that survived filtering, not the raw file count.
	 */
	onProgress?: (processed: number, total: number) => void;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Internal helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Names of config files that control include/exclude rules.
 * Only honoured at the repo root (relative paths with no directory prefix).
 */
const CONFIG_FILE_NAMES = new Set(['trueref.json', 'context7.json']);
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Git helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Run a git command inside `cwd` and return trimmed stdout.
|
||||
* Throws the child-process error on non-zero exit.
|
||||
*/
|
||||
async function runGit(cwd: string, args: string[]): Promise<string> {
|
||||
const { stdout } = await execFileAsync('git', ['-C', cwd, ...args], { encoding: 'utf-8' });
|
||||
return stdout.trim();
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute a SHA-256 hex digest of a UTF-8 string.
|
||||
*/
|
||||
function computeSHA256(content: string): string {
|
||||
return createHash('sha256').update(content, 'utf-8').digest('hex');
|
||||
}
|
||||
|
||||
/**
|
||||
* Attempt to read and JSON-parse a config file.
|
||||
* Returns undefined if the file cannot be read or parsed.
|
||||
*/
|
||||
async function parseConfigFile(absPath: string): Promise<RepoConfig | undefined> {
|
||||
try {
|
||||
const raw = await fs.readFile(absPath, 'utf-8');
|
||||
return JSON.parse(raw) as RepoConfig;
|
||||
} catch {
|
||||
console.warn(`[LocalCrawler] Failed to parse config file: ${absPath}`);
|
||||
return undefined;
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// LocalCrawler
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
export class LocalCrawler {
|
||||
/**
|
||||
* Crawl a local directory tree and return structured file objects.
|
||||
*
|
||||
* When `options.ref` is supplied the crawler creates an isolated git
|
||||
* worktree checked out at that ref, crawls it, then removes the worktree.
|
||||
* The caller's working tree is never modified.
|
||||
*
|
||||
* @param options - Root path, optional git ref, optional config, and progress callback.
|
||||
* @returns CrawlResult with all read files and summary statistics.
|
||||
*/
|
||||
async crawl(options: LocalCrawlOptions): Promise<CrawlResult> {
|
||||
const { rootPath, ref } = options;
|
||||
|
||||
if (!ref) {
|
||||
// Fast path: crawl the working tree as-is.
|
||||
return this.crawlDirectory(rootPath, options.config, options.onProgress, 'local');
|
||||
}
|
||||
|
||||
// Git-aware path: verify repo, resolve ref, create worktree, crawl, clean up.
|
||||
let worktreePath: string | undefined;
|
||||
|
||||
try {
|
||||
// Verify rootPath is inside a git repository.
|
||||
await runGit(rootPath, ['rev-parse', '--git-dir']).catch(() => {
|
||||
throw new NotAGitRepositoryError(`Not a git repository: ${rootPath}`);
|
||||
});
|
||||
|
||||
// Resolve the ref to a concrete commit SHA (validates it exists).
|
||||
const commitSha = await runGit(rootPath, ['rev-parse', '--verify', ref]).catch(() => {
|
||||
throw new InvalidRefError(`Invalid git ref "${ref}" in repository: ${rootPath}`);
|
||||
});
|
||||
|
||||
// Create a temporary isolated worktree at the resolved ref.
|
||||
const tmpDir = await fs.mkdtemp(join(tmpdir(), 'trueref-wt-'));
|
||||
worktreePath = tmpDir;
|
||||
|
||||
await runGit(rootPath, ['worktree', 'add', '--detach', tmpDir, ref]).catch((err) => {
|
||||
throw new InvalidRefError(
|
||||
`Cannot create worktree for ref "${ref}": ${err instanceof Error ? err.message : String(err)}`
|
||||
);
|
||||
});
|
||||
|
||||
// Crawl the worktree and stamp the result with the git-resolved metadata.
|
||||
const result = await this.crawlDirectory(worktreePath, options.config, options.onProgress, ref);
|
||||
|
||||
return { ...result, commitSha };
|
||||
} finally {
|
||||
if (worktreePath) {
|
||||
// Remove the worktree (git also deletes the directory).
|
||||
await runGit(rootPath, ['worktree', 'remove', '--force', worktreePath]).catch(() => {
|
||||
// Best-effort; leave the temp directory for the OS to clean up.
|
||||
fs.rm(worktreePath!, { recursive: true, force: true }).catch(() => {});
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Private — directory crawl
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Walk `rootPath`, apply filters, read files, and build a CrawlResult.
|
||||
* `branch` is embedded verbatim into the returned result.
|
||||
*/
|
||||
private async crawlDirectory(
|
||||
rootPath: string,
|
||||
callerConfig: RepoConfig | undefined,
|
||||
onProgress: LocalCrawlOptions['onProgress'],
|
||||
branch: string
|
||||
): Promise<CrawlResult> {
|
||||
// Step 1: Walk the directory tree and collect (relPath, size) pairs.
|
||||
const statCache = new Map<string, number>();
|
||||
const allRelPaths = await this.walkDirectory(rootPath, '', statCache);
|
||||
|
||||
// Step 2: Detect trueref.json / context7.json at the repo root first.
|
||||
// Only root-level config files are honoured (no directory prefix).
|
||||
const configRelPath = allRelPaths.find((p) => CONFIG_FILE_NAMES.has(p));
|
||||
let config = callerConfig;
|
||||
if (configRelPath && !config) {
|
||||
config = await parseConfigFile(join(rootPath, configRelPath));
|
||||
}
|
||||
|
||||
// Step 3: Filter files according to extension, size, and config rules.
|
||||
const filteredPaths = allRelPaths.filter((relPath) => {
|
||||
const size = statCache.get(relPath) ?? 0;
|
||||
return shouldIndexFile(relPath, size, config);
|
||||
});
|
||||
|
||||
// Step 4: Read file contents and build CrawledFile records.
|
||||
const crawledFiles: CrawledFile[] = [];
|
||||
|
||||
for (const [i, relPath] of filteredPaths.entries()) {
|
||||
const absPath = join(rootPath, relPath);
|
||||
try {
|
||||
const content = await fs.readFile(absPath, 'utf-8');
|
||||
const sha = computeSHA256(content);
|
||||
crawledFiles.push({
|
||||
path: relPath,
|
||||
content,
|
||||
size: Buffer.byteLength(content, 'utf-8'),
|
||||
sha,
|
||||
language: detectLanguage(relPath)
|
||||
});
|
||||
} catch (err) {
|
||||
console.warn(
|
||||
`[LocalCrawler] Could not read file: ${relPath} — ${err instanceof Error ? err.message : String(err)}`
|
||||
);
|
||||
}
|
||||
onProgress?.(i + 1, filteredPaths.length);
|
||||
}
|
||||
|
||||
// Step 5: Build a deterministic repo-level fingerprint from file SHAs.
|
||||
const commitSha = computeSHA256(crawledFiles.map((f) => f.sha).join(''));
|
||||
|
||||
return {
|
||||
files: crawledFiles,
|
||||
totalFiles: filteredPaths.length,
|
||||
skippedFiles: allRelPaths.length - filteredPaths.length,
|
||||
branch,
|
||||
commitSha
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Recursively walk a directory and collect relative paths of all regular files.
|
||||
* Symlinks and special files (devices, sockets, FIFOs) are silently skipped.
|
||||
* Populates `statCache` with file sizes so the caller can filter without a
|
||||
* second `stat()` call.
|
||||
*
|
||||
* @param dir - Absolute path of the directory to read.
|
||||
* @param rel - Relative path prefix accumulated during recursion.
|
||||
* @param statCache - Mutable map from relative path → byte size.
|
||||
*/
|
||||
private async walkDirectory(
|
||||
dir: string,
|
||||
rel: string,
|
||||
statCache: Map<string, number>
|
||||
): Promise<string[]> {
|
||||
let entries;
|
||||
try {
|
||||
entries = await fs.readdir(dir, { withFileTypes: true });
|
||||
} catch {
|
||||
// Directory is unreadable (permissions, etc.) — skip silently.
|
||||
return [];
|
||||
}
|
||||
|
||||
const files: string[] = [];
|
||||
|
||||
for (const entry of entries) {
|
||||
// Only descend into plain directories and collect plain files.
|
||||
// entry.isFile() / entry.isDirectory() return false for symlinks,
|
||||
// devices, sockets, and FIFOs, so those are all implicitly skipped.
|
||||
if (!entry.isFile() && !entry.isDirectory()) continue;
|
||||
|
||||
const relPath = rel ? `${rel}/${entry.name}` : entry.name;
|
||||
|
||||
if (entry.isDirectory()) {
|
||||
const children = await this.walkDirectory(join(dir, entry.name), relPath, statCache);
|
||||
files.push(...children);
|
||||
} else {
|
||||
// Capture file size from stat so shouldIndexFile can enforce the limit
|
||||
// without reading the file.
|
||||
try {
|
||||
const stat = await fs.stat(join(dir, entry.name));
|
||||
statCache.set(relPath, stat.size);
|
||||
} catch {
|
||||
statCache.set(relPath, 0);
|
||||
}
|
||||
files.push(relPath);
|
||||
}
|
||||
}
|
||||
|
||||
return files;
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user