When trueref.json specifies a `folders` allowlist (e.g. ["src/"]), shouldIndexFile() excludes trueref.json itself because it lives at the repo root. The indexing pipeline then searches crawlResult.files for the config file, finds nothing, and never writes rules to repository_configs. Fix (Option B): add a `config` field to CrawlResult so LocalCrawler returns the pre-parsed config directly. The indexing pipeline now reads crawlResult.config first instead of scanning files[], which resolves the regression for all repos with a folders allowlist. - Add `config?: RepoConfig` to CrawlResult in crawler/types.ts - Return `config` from LocalCrawler.crawlDirectory() - Update IndexingPipeline.crawl() to propagate CrawlResult.config - Update IndexingPipeline.run() to prefer crawlResult.config over files - Add regression tests covering the folders-allowlist exclusion scenario Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
343 lines
13 KiB
TypeScript
343 lines
13 KiB
TypeScript
/**
 * Local Filesystem Crawler (TRUEREF-0004).
 *
 * Walks a directory tree and enumerates all files, applying the same
 * extension and size filters as the GitHub crawler (TRUEREF-0003).
 * Reads file contents as UTF-8 strings and computes SHA-256 checksums
 * for change detection.
 *
 * Design decisions:
 * - Uses Node.js `fs/promises` and `crypto` — no extra dependencies.
 * - Symlinks and special files (devices, sockets, FIFOs) are skipped.
 * - `trueref.json` / `context7.json` at the repo root are detected and
 *   parsed before any other file filtering runs, matching the GitHub crawler.
 * - File size for filtering is taken from `stat().size` so the size limit
 *   is applied before reading file content (saves I/O on large excluded files).
 * - `commitSha` is derived from a SHA-256 hash of all per-file checksums,
 *   giving a deterministic fingerprint of the crawled file set.
 */
|
|
|
|
import { execFile } from 'node:child_process';
|
|
import { createHash } from 'node:crypto';
|
|
import { promises as fs } from 'node:fs';
|
|
import { tmpdir } from 'node:os';
|
|
import { join } from 'node:path';
|
|
import { promisify } from 'node:util';
|
|
|
|
import { detectLanguage, shouldIndexFile, shouldPruneDirectory } from './file-filter.js';
|
|
import { parseGitignore, type GitignoreFilter } from './gitignore-parser.js';
|
|
import { InvalidRefError, NotAGitRepositoryError } from './types.js';
|
|
import type { CrawledFile, CrawlResult, RepoConfig } from './types.js';
|
|
|
|
const execFileAsync = promisify(execFile);
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Public options type
|
|
// ---------------------------------------------------------------------------
|
|
|
|
export interface LocalCrawlOptions {
|
|
/** Absolute path to the repository root directory. */
|
|
rootPath: string;
|
|
/**
|
|
* Git ref to check out before crawling — a tag name (e.g. "v2.1.0"),
|
|
* a branch name, or a commit SHA. When provided the crawler creates an
|
|
* isolated git worktree at that ref, crawls it, then removes the worktree.
|
|
* The original working tree is never modified.
|
|
* Requires `rootPath` to be inside a git repository.
|
|
*/
|
|
ref?: string;
|
|
/** Pre-parsed trueref.json / context7.json configuration, if already loaded. */
|
|
config?: RepoConfig;
|
|
/** Progress callback invoked after each file is read. */
|
|
onProgress?: (processed: number, total: number) => void;
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Internal helpers
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/** Names of config files that control include/exclude rules. */
|
|
const CONFIG_FILE_NAMES = new Set(['trueref.json', 'context7.json']);
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Git helpers
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
|
|
* Run a git command inside `cwd` and return trimmed stdout.
|
|
* Throws the child-process error on non-zero exit.
|
|
*/
|
|
async function runGit(cwd: string, args: string[]): Promise<string> {
|
|
const { stdout } = await execFileAsync('git', ['-C', cwd, ...args], { encoding: 'utf-8' });
|
|
return stdout.trim();
|
|
}
|
|
|
|
/**
|
|
* Compute a SHA-256 hex digest of a UTF-8 string.
|
|
*/
|
|
function computeSHA256(content: string): string {
|
|
return createHash('sha256').update(content, 'utf-8').digest('hex');
|
|
}
|
|
|
|
/**
|
|
* Attempt to read and JSON-parse a config file.
|
|
* Returns undefined if the file cannot be read or parsed.
|
|
*/
|
|
async function parseConfigFile(absPath: string): Promise<RepoConfig | undefined> {
|
|
try {
|
|
const raw = await fs.readFile(absPath, 'utf-8');
|
|
return JSON.parse(raw) as RepoConfig;
|
|
} catch {
|
|
console.warn(`[LocalCrawler] Failed to parse config file: ${absPath}`);
|
|
return undefined;
|
|
}
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// LocalCrawler
|
|
// ---------------------------------------------------------------------------
|
|
|
|
export class LocalCrawler {
|
|
/**
|
|
* Crawl a local directory tree and return structured file objects.
|
|
*
|
|
* When `options.ref` is supplied the crawler creates an isolated git
|
|
* worktree checked out at that ref, crawls it, then removes the worktree.
|
|
* The caller's working tree is never modified.
|
|
*
|
|
* @param options - Root path, optional git ref, optional config, and progress callback.
|
|
* @returns CrawlResult with all read files and summary statistics.
|
|
*/
|
|
async crawl(options: LocalCrawlOptions): Promise<CrawlResult> {
|
|
const { rootPath, ref } = options;
|
|
|
|
if (!ref) {
|
|
// Fast path: crawl the working tree as-is.
|
|
return this.crawlDirectory(rootPath, options.config, options.onProgress, 'local');
|
|
}
|
|
|
|
// Git-aware path: verify repo, resolve ref, create worktree, crawl, clean up.
|
|
let worktreePath: string | undefined;
|
|
|
|
try {
|
|
// Verify rootPath is inside a git repository.
|
|
await runGit(rootPath, ['rev-parse', '--git-dir']).catch(() => {
|
|
throw new NotAGitRepositoryError(`Not a git repository: ${rootPath}`);
|
|
});
|
|
|
|
// Resolve the ref to a concrete commit SHA (validates it exists).
|
|
const commitSha = await runGit(rootPath, ['rev-parse', '--verify', ref]).catch(() => {
|
|
throw new InvalidRefError(`Invalid git ref "${ref}" in repository: ${rootPath}`);
|
|
});
|
|
|
|
// Create a temporary isolated worktree at the resolved ref.
|
|
const tmpDir = await fs.mkdtemp(join(tmpdir(), 'trueref-wt-'));
|
|
worktreePath = tmpDir;
|
|
|
|
await runGit(rootPath, ['worktree', 'add', '--detach', tmpDir, ref]).catch((err) => {
|
|
throw new InvalidRefError(
|
|
`Cannot create worktree for ref "${ref}": ${err instanceof Error ? err.message : String(err)}`
|
|
);
|
|
});
|
|
|
|
// Crawl the worktree and stamp the result with the git-resolved metadata.
|
|
const result = await this.crawlDirectory(
|
|
worktreePath,
|
|
options.config,
|
|
options.onProgress,
|
|
ref
|
|
);
|
|
|
|
return { ...result, commitSha };
|
|
} finally {
|
|
if (worktreePath) {
|
|
// Remove the worktree (git also deletes the directory).
|
|
await runGit(rootPath, ['worktree', 'remove', '--force', worktreePath]).catch(() => {
|
|
// Best-effort; leave the temp directory for the OS to clean up.
|
|
fs.rm(worktreePath!, { recursive: true, force: true }).catch(() => {});
|
|
});
|
|
}
|
|
}
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Private — directory crawl
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
|
|
* Walk `rootPath`, apply filters, read files, and build a CrawlResult.
|
|
* `branch` is embedded verbatim into the returned result.
|
|
*/
|
|
private async crawlDirectory(
|
|
rootPath: string,
|
|
callerConfig: RepoConfig | undefined,
|
|
onProgress: LocalCrawlOptions['onProgress'],
|
|
branch: string
|
|
): Promise<CrawlResult> {
|
|
// Step 1: Load .gitignore from the repo root (if present).
|
|
// When found, the filter drives file exclusion during the walk.
|
|
// Built-in dependency / build-artifact pruning still applies so local
|
|
// indexing stays focused on repository source, not vendored code.
|
|
const gitignoreFilter = await this.loadGitignore(rootPath);
|
|
|
|
// Step 2: Walk the directory tree and collect (relPath, size) pairs.
|
|
// Directories are pruned early — their contents are never enumerated.
|
|
const statCache = new Map<string, number>();
|
|
const allRelPaths = await this.walkDirectory(rootPath, '', statCache, gitignoreFilter);
|
|
|
|
// Step 3: Detect trueref.json / context7.json at the repo root first.
|
|
// Only root-level config files are honoured (no directory prefix).
|
|
const configRelPath = allRelPaths.find((p) => CONFIG_FILE_NAMES.has(p));
|
|
let config = callerConfig;
|
|
if (configRelPath && !config) {
|
|
config = await parseConfigFile(join(rootPath, configRelPath));
|
|
}
|
|
|
|
// Step 4: Filter files according to extension, size, and config rules.
|
|
const filteredPaths = allRelPaths.filter((relPath) => {
|
|
const size = statCache.get(relPath) ?? 0;
|
|
return shouldIndexFile(relPath, size, config);
|
|
});
|
|
|
|
// Step 5: Read file contents and build CrawledFile records.
|
|
const crawledFiles: CrawledFile[] = [];
|
|
|
|
for (const [i, relPath] of filteredPaths.entries()) {
|
|
const absPath = join(rootPath, relPath);
|
|
try {
|
|
const content = await fs.readFile(absPath, 'utf-8');
|
|
const sha = computeSHA256(content);
|
|
crawledFiles.push({
|
|
path: relPath,
|
|
content,
|
|
size: Buffer.byteLength(content, 'utf-8'),
|
|
sha,
|
|
language: detectLanguage(relPath)
|
|
});
|
|
} catch (err) {
|
|
console.warn(
|
|
`[LocalCrawler] Could not read file: ${relPath} — ${err instanceof Error ? err.message : String(err)}`
|
|
);
|
|
}
|
|
onProgress?.(i + 1, filteredPaths.length);
|
|
}
|
|
|
|
// Step 6: Build a deterministic repo-level fingerprint from file SHAs.
|
|
const commitSha = computeSHA256(crawledFiles.map((f) => f.sha).join(''));
|
|
|
|
return {
|
|
files: crawledFiles,
|
|
totalFiles: filteredPaths.length,
|
|
skippedFiles: allRelPaths.length - filteredPaths.length,
|
|
branch,
|
|
commitSha,
|
|
// Surface the pre-parsed config so the indexing pipeline can read rules
|
|
// without needing to find trueref.json inside crawledFiles (which fails
|
|
// when a `folders` allowlist excludes the repo root).
|
|
config: config ?? undefined
|
|
};
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Private — .gitignore loading
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
|
|
* Attempt to read and parse the root .gitignore file.
|
|
* Returns null when the file does not exist or cannot be read.
|
|
*
|
|
* Only the repository root .gitignore is honoured. Nested .gitignore files
|
|
* inside subdirectories are not processed (they are rare and their absence
|
|
* only leads to over-indexing, never incorrect indexing).
|
|
*/
|
|
private async loadGitignore(rootPath: string): Promise<GitignoreFilter | null> {
|
|
try {
|
|
const content = await fs.readFile(join(rootPath, '.gitignore'), 'utf-8');
|
|
return parseGitignore(content);
|
|
} catch {
|
|
// File absent or unreadable — fall back to IGNORED_DIR_NAMES.
|
|
return null;
|
|
}
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Private — directory walk
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
|
|
* Recursively walk a directory and collect relative paths of all regular files.
|
|
*
|
|
* Directories are pruned before recursion using the built-in ignored-directory
|
|
* list plus any matching root .gitignore rule. This avoids
|
|
* enumerating the contents of node_modules, dist, .venv, etc. entirely.
|
|
*
|
|
* Individual files are also tested against the gitignore filter when present,
|
|
* so patterns like *.log or /secrets.json are respected.
|
|
*
|
|
* Symlinks and special files (devices, sockets, FIFOs) are silently skipped.
|
|
* Populates `statCache` with file sizes so the caller can filter without a
|
|
* second `stat()` call.
|
|
*
|
|
* @param dir - Absolute path of the directory to read.
|
|
* @param rel - Relative path prefix accumulated during recursion.
|
|
* @param statCache - Mutable map from relative path → byte size.
|
|
* @param filter - Compiled gitignore filter, or null when absent.
|
|
*/
|
|
private async walkDirectory(
|
|
dir: string,
|
|
rel: string,
|
|
statCache: Map<string, number>,
|
|
filter: GitignoreFilter | null
|
|
): Promise<string[]> {
|
|
let entries;
|
|
try {
|
|
entries = await fs.readdir(dir, { withFileTypes: true });
|
|
} catch {
|
|
// Directory is unreadable (permissions, etc.) — skip silently.
|
|
return [];
|
|
}
|
|
|
|
const files: string[] = [];
|
|
|
|
for (const entry of entries) {
|
|
// Only descend into plain directories and collect plain files.
|
|
// entry.isFile() / entry.isDirectory() return false for symlinks,
|
|
// devices, sockets, and FIFOs, so those are all implicitly skipped.
|
|
if (!entry.isFile() && !entry.isDirectory()) continue;
|
|
|
|
const relPath = rel ? `${rel}/${entry.name}` : entry.name;
|
|
|
|
if (entry.isDirectory()) {
|
|
// Prune ignored directories before recursing — never enumerate
|
|
// their contents. Built-in exclusions always apply, even when a
|
|
// repo-level .gitignore exists but does not mention them.
|
|
const ignored = shouldPruneDirectory(relPath) || filter?.isIgnored(relPath, true);
|
|
if (ignored) continue;
|
|
|
|
const children = await this.walkDirectory(
|
|
join(dir, entry.name),
|
|
relPath,
|
|
statCache,
|
|
filter
|
|
);
|
|
files.push(...children);
|
|
} else {
|
|
// Apply gitignore file-level rules when a filter is loaded.
|
|
if (filter?.isIgnored(relPath, false)) continue;
|
|
|
|
// Capture file size from stat so shouldIndexFile can enforce
|
|
// the size limit without reading the file content.
|
|
try {
|
|
const stat = await fs.stat(join(dir, entry.name));
|
|
statCache.set(relPath, stat.size);
|
|
} catch {
|
|
statCache.set(relPath, 0);
|
|
}
|
|
files.push(relPath);
|
|
}
|
|
}
|
|
|
|
return files;
|
|
}
|
|
}
|