feat(TRUEREF-0003-0004): implement GitHub and local filesystem crawlers

- GitHub crawler with rate limiting, semaphore concurrency, retry logic
- File filtering by extension, size, and trueref.json rules
- Local filesystem crawler with SHA-256 checksums and progress callbacks
- Shared types and file filter logic between both crawlers

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Giancarmine Salucci
2026-03-23 09:06:07 +01:00
parent cb253ffe98
commit 1c15d6c474
7 changed files with 2308 additions and 0 deletions

View File

@@ -0,0 +1,554 @@
/**
* Unit tests for the local filesystem crawler (TRUEREF-0004).
*
* Each test that needs a filesystem fixture creates a temporary directory via
* `fs.mkdtemp`, writes the required files, runs the crawler, then cleans up
* with `fs.rm` regardless of the test outcome.
*/
import { execFile } from 'node:child_process';
import { createHash } from 'node:crypto';
import { promises as fs } from 'node:fs';
import { join } from 'node:path';
import { tmpdir } from 'node:os';
import { promisify } from 'node:util';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import { LocalCrawler } from './local.crawler.js';
import type { LocalCrawlOptions } from './local.crawler.js';
import { InvalidRefError, NotAGitRepositoryError } from './types.js';
const execFileAsync = promisify(execFile);
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
/** Hex-encoded SHA-256 digest of a UTF-8 string, mirroring the crawler's per-file hashing. */
function sha256(content: string): string {
  const hasher = createHash('sha256');
  hasher.update(content, 'utf-8');
  return hasher.digest('hex');
}
/** Create a temp directory, write a map of relPath → content, return rootPath. */
async function makeTempRepo(files: Record<string, string>): Promise<string> {
  const rootPath = await fs.mkdtemp(join(tmpdir(), 'trueref-test-'));
  for (const [relPath, content] of Object.entries(files)) {
    const target = join(rootPath, relPath);
    // Create any intermediate directories before writing the leaf file.
    await fs.mkdir(join(target, '..'), { recursive: true });
    await fs.writeFile(target, content, 'utf-8');
  }
  return rootPath;
}
/** Remove a temporary directory tree created by makeTempRepo. */
async function cleanupTempRepo(root: string): Promise<void> {
  // force: true makes cleanup a no-op when the tree is already gone.
  await fs.rm(root, { force: true, recursive: true });
}
// ---------------------------------------------------------------------------
// Test state
// ---------------------------------------------------------------------------
// Shared fixture state: each describe block assigns `root` (usually in
// beforeEach) and removes the tree again in afterEach.
let root = '';
const crawler = new LocalCrawler();

/** Crawl the current fixture root with the shared crawler, applying per-test overrides. */
async function crawlRoot(opts: Partial<LocalCrawlOptions> = {}): Promise<ReturnType<LocalCrawler['crawl']>> {
  // `opts` is spread last so a test may override rootPath if it needs to.
  return crawler.crawl({ rootPath: root, ...opts });
}
// ---------------------------------------------------------------------------
// Basic crawl behaviour
// ---------------------------------------------------------------------------
describe('LocalCrawler.crawl() — basic file enumeration', () => {
beforeEach(async () => {
root = await makeTempRepo({
'README.md': '# Hello',
'src/index.ts': 'export const x = 1;',
'src/utils.ts': 'export const y = 2;',
'package.json': '{"name":"test"}'
});
});
afterEach(async () => {
await cleanupTempRepo(root);
});
it('returns all indexable files', async () => {
const result = await crawlRoot();
const paths = result.files.map((f) => f.path).sort();
expect(paths).toEqual(['README.md', 'package.json', 'src/index.ts', 'src/utils.ts'].sort());
});
it('populates content as a UTF-8 string', async () => {
const result = await crawlRoot();
const readme = result.files.find((f) => f.path === 'README.md');
expect(readme?.content).toBe('# Hello');
});
it('sets size equal to Buffer.byteLength of content', async () => {
const result = await crawlRoot();
for (const file of result.files) {
expect(file.size).toBe(Buffer.byteLength(file.content, 'utf-8'));
}
});
it('computes correct SHA-256 per file', async () => {
const result = await crawlRoot();
const readme = result.files.find((f) => f.path === 'README.md');
expect(readme?.sha).toBe(sha256('# Hello'));
});
it('detects language from extension', async () => {
const result = await crawlRoot();
const ts = result.files.find((f) => f.path === 'src/index.ts');
expect(ts?.language).toBe('typescript');
const md = result.files.find((f) => f.path === 'README.md');
expect(md?.language).toBe('markdown');
const json = result.files.find((f) => f.path === 'package.json');
expect(json?.language).toBe('json');
});
it('sets branch to "local"', async () => {
const result = await crawlRoot();
expect(result.branch).toBe('local');
});
it('sets totalFiles to the count of filtered files', async () => {
const result = await crawlRoot();
expect(result.totalFiles).toBe(result.files.length);
});
it('sets commitSha to a non-empty hex string', async () => {
const result = await crawlRoot();
expect(result.commitSha).toMatch(/^[0-9a-f]{64}$/);
});
it('produces a deterministic commitSha for the same file set', async () => {
const r1 = await crawlRoot();
const r2 = await crawlRoot();
expect(r1.commitSha).toBe(r2.commitSha);
});
});
// ---------------------------------------------------------------------------
// Filtering — default excludes and extension allow-list
// ---------------------------------------------------------------------------
describe('LocalCrawler.crawl() — default filtering', () => {
  beforeEach(async () => {
    root = await makeTempRepo({
      'src/index.ts': 'export {};',
      'dist/bundle.js': 'bundled',
      'node_modules/lodash/index.js': 'lodash',
      '.git/config': '[core]',
      'image.png': '\x89PNG',
      'README.md': '# Docs'
    });
  });

  afterEach(async () => {
    await cleanupTempRepo(root);
  });

  it('excludes files in dist/', async () => {
    const { files } = await crawlRoot();
    expect(files.some((file) => file.path.startsWith('dist/'))).toBe(false);
  });

  it('excludes files in node_modules/', async () => {
    const { files } = await crawlRoot();
    expect(files.some((file) => file.path.startsWith('node_modules/'))).toBe(false);
  });

  it('excludes files in .git/', async () => {
    const { files } = await crawlRoot();
    expect(files.some((file) => file.path.startsWith('.git/'))).toBe(false);
  });

  it('excludes non-indexable extensions like .png', async () => {
    const { files } = await crawlRoot();
    expect(files.some((file) => file.path.endsWith('.png'))).toBe(false);
  });

  it('reports skippedFiles = total enumerated filtered', async () => {
    const res = await crawlRoot();
    // Skipped: dist/bundle.js, node_modules/lodash/index.js, .git/config, image.png.
    // Kept:    src/index.ts, README.md.
    expect(res.skippedFiles).toBe(4);
    expect(res.totalFiles).toBe(2);
  });
});
// ---------------------------------------------------------------------------
// Size limit
// ---------------------------------------------------------------------------
describe('LocalCrawler.crawl() — size limit', () => {
  afterEach(async () => {
    await cleanupTempRepo(root);
  });

  it('excludes files larger than MAX_FILE_SIZE_BYTES (500 KB)', async () => {
    // One byte over the 500 KB limit.
    root = await makeTempRepo({
      'big.ts': 'x'.repeat(500_001),
      'small.ts': 'export const x = 1;'
    });
    const { files } = await crawlRoot();
    const paths = files.map((file) => file.path);
    expect(paths).not.toContain('big.ts');
    expect(paths).toContain('small.ts');
  });

  it('includes files exactly at MAX_FILE_SIZE_BYTES (500 KB)', async () => {
    // Exactly at the limit — must still be indexed (inclusive bound).
    root = await makeTempRepo({ 'edge.ts': 'a'.repeat(500_000) });
    const { files } = await crawlRoot();
    expect(files.map((file) => file.path)).toContain('edge.ts');
  });
});
// ---------------------------------------------------------------------------
// trueref.json / context7.json config detection
// ---------------------------------------------------------------------------
describe('LocalCrawler.crawl() — config file detection', () => {
  afterEach(async () => {
    await cleanupTempRepo(root);
  });

  it('auto-detects trueref.json and applies excludeFiles', async () => {
    root = await makeTempRepo({
      'trueref.json': JSON.stringify({ excludeFiles: ['package.json'] }),
      'src/index.ts': 'export {};',
      'package.json': '{"name":"test"}'
    });
    const { files } = await crawlRoot();
    const paths = files.map((file) => file.path);
    expect(paths).not.toContain('package.json');
    expect(paths).toContain('src/index.ts');
  });

  it('auto-detects context7.json and applies folders allowlist', async () => {
    root = await makeTempRepo({
      'context7.json': JSON.stringify({ folders: ['docs/'] }),
      'src/index.ts': 'export {};',
      'docs/guide.md': '# Guide'
    });
    const { files } = await crawlRoot();
    const paths = files.map((file) => file.path);
    expect(paths).not.toContain('src/index.ts');
    expect(paths).toContain('docs/guide.md');
  });

  it('caller-supplied config takes precedence over discovered config file', async () => {
    root = await makeTempRepo({
      'trueref.json': JSON.stringify({ excludeFiles: ['package.json'] }),
      'src/index.ts': 'export {};',
      'package.json': '{"name":"test"}'
    });
    // An explicit (empty) caller config must win over the on-disk trueref.json,
    // so package.json reappears in the crawl results.
    const { files } = await crawlRoot({ config: {} });
    expect(files.map((file) => file.path)).toContain('package.json');
  });

  it('applies excludeFolders from config', async () => {
    root = await makeTempRepo({
      'trueref.json': JSON.stringify({ excludeFolders: ['internal/'] }),
      'internal/secret.ts': 'secret',
      'src/public.ts': 'public'
    });
    const { files } = await crawlRoot();
    expect(files.some((file) => file.path.startsWith('internal/'))).toBe(false);
    expect(files.map((file) => file.path)).toContain('src/public.ts');
  });

  it('gracefully handles a malformed config file', async () => {
    root = await makeTempRepo({
      'trueref.json': 'NOT VALID JSON {{{',
      'src/index.ts': 'export {};'
    });
    // The crawler must fall back to "no config" rather than throwing.
    const { files } = await crawlRoot();
    expect(files.map((file) => file.path)).toContain('src/index.ts');
  });
});
// ---------------------------------------------------------------------------
// Progress callback
// ---------------------------------------------------------------------------
describe('LocalCrawler.crawl() — progress reporting', () => {
  beforeEach(async () => {
    root = await makeTempRepo({
      'src/a.ts': 'a',
      'src/b.ts': 'b',
      'src/c.ts': 'c'
    });
  });

  afterEach(async () => {
    await cleanupTempRepo(root);
  });

  it('calls onProgress once per filtered file', async () => {
    let invocations = 0;
    await crawlRoot({
      onProgress: () => {
        invocations += 1;
      }
    });
    expect(invocations).toBe(3);
  });

  it('increments processed from 1 to totalFiles', async () => {
    const processed: number[] = [];
    await crawlRoot({ onProgress: (p) => processed.push(p) });
    expect(processed).toEqual([1, 2, 3]);
  });

  it('keeps total constant across all callback invocations', async () => {
    const totals: number[] = [];
    await crawlRoot({ onProgress: (_, t) => totals.push(t) });
    // At most one distinct total value across every invocation.
    expect(new Set(totals).size).toBeLessThanOrEqual(1);
  });

  it('does not call onProgress when no files pass the filter', async () => {
    // Replace the fixture tree with one containing only non-indexable files.
    await fs.rm(root, { recursive: true, force: true });
    root = await makeTempRepo({ 'image.png': '\x89PNG' });
    let invoked = false;
    await crawlRoot({
      onProgress: () => {
        invoked = true;
      }
    });
    expect(invoked).toBe(false);
  });
});
// ---------------------------------------------------------------------------
// Git ref checkout
// ---------------------------------------------------------------------------
/**
 * Create a temp directory that is a valid git repo with one commit per entry
 * in `history`. Each entry is a map of relPath → content committed under the
 * given tag (if provided). Returns the repo root path.
 *
 * Layout of `history`:
 * [{ tag?: string, files: Record<string, string> }, ...]
 */
async function makeGitRepo(
  history: Array<{ tag?: string; files: Record<string, string> }>
): Promise<string> {
  const repoRoot = await fs.mkdtemp(join(tmpdir(), 'trueref-git-test-'));

  // Run a git subcommand inside the repo, failing the test on non-zero exit.
  const git = async (...args: string[]): Promise<void> => {
    await execFileAsync('git', ['-C', repoRoot, ...args]);
  };

  await git('init', '--initial-branch=main');
  // Local identity config so commits succeed on machines without a global one.
  await git('config', 'user.email', 'test@trueref.local');
  await git('config', 'user.name', 'TrueRef Test');

  for (const entry of history) {
    // Write (or overwrite) this step's files before committing.
    for (const [relPath, content] of Object.entries(entry.files)) {
      const target = join(repoRoot, relPath);
      await fs.mkdir(join(target, '..'), { recursive: true });
      await fs.writeFile(target, content, 'utf-8');
    }
    await git('add', '.');
    await git('commit', '--allow-empty', '-m', `commit for ${entry.tag ?? 'HEAD'}`);
    if (entry.tag) {
      await git('tag', entry.tag);
    }
  }

  return repoRoot;
}
describe('LocalCrawler.crawl() — git ref checkout', () => {
  let root = '';
  const crawler = new LocalCrawler();

  afterEach(async () => {
    if (root) await cleanupTempRepo(root);
  });

  /** Resolve a ref (tag, branch, SHA) to its full commit SHA inside `repo`. */
  async function revParse(repo: string, ref: string): Promise<string> {
    const { stdout } = await execFileAsync('git', ['-C', repo, 'rev-parse', ref], {
      encoding: 'utf-8'
    });
    return stdout.trim();
  }

  it('crawls files at a specific tag, not the HEAD state', async () => {
    root = await makeGitRepo([
      { tag: 'v1.0.0', files: { 'src/index.ts': 'export const version = 1;' } },
      { files: { 'src/index.ts': 'export const version = 2;' } }
    ]);
    const res = await crawler.crawl({ rootPath: root, ref: 'v1.0.0' });
    const indexFile = res.files.find((file) => file.path === 'src/index.ts');
    expect(indexFile?.content).toBe('export const version = 1;');
  });

  it('crawls files at a specific commit SHA', async () => {
    root = await makeGitRepo([
      { tag: 'v1.0.0', files: { 'api.ts': 'v1' } },
      { files: { 'api.ts': 'v2' } }
    ]);
    const sha = await revParse(root, 'v1.0.0');
    const res = await crawler.crawl({ rootPath: root, ref: sha });
    const api = res.files.find((file) => file.path === 'api.ts');
    expect(api?.content).toBe('v1');
  });

  it('sets branch to the ref string in the result', async () => {
    root = await makeGitRepo([{ tag: 'v2.3.1', files: { 'README.md': '# v2' } }]);
    const res = await crawler.crawl({ rootPath: root, ref: 'v2.3.1' });
    expect(res.branch).toBe('v2.3.1');
  });

  it('sets commitSha to the git-resolved SHA (not file-content hash)', async () => {
    root = await makeGitRepo([{ tag: 'v1.0.0', files: { 'a.ts': 'a' } }]);
    const expectedSha = await revParse(root, 'v1.0.0');
    const res = await crawler.crawl({ rootPath: root, ref: 'v1.0.0' });
    expect(res.commitSha).toBe(expectedSha);
  });

  it('does not modify the working tree', async () => {
    root = await makeGitRepo([
      { tag: 'v1.0.0', files: { 'src/index.ts': 'v1' } },
      { files: { 'src/index.ts': 'v2' } }
    ]);
    // HEAD is the second commit, so the working tree holds 'v2' throughout.
    const indexPath = join(root, 'src/index.ts');
    const before = await fs.readFile(indexPath, 'utf-8');
    await crawler.crawl({ rootPath: root, ref: 'v1.0.0' });
    const after = await fs.readFile(indexPath, 'utf-8');
    expect(before).toBe('v2');
    expect(after).toBe('v2');
  });

  it('removes the temporary worktree after crawling', async () => {
    root = await makeGitRepo([{ tag: 'v1.0.0', files: { 'f.ts': 'x' } }]);
    await crawler.crawl({ rootPath: root, ref: 'v1.0.0' });
    // Only the main worktree should be listed once the crawl completes.
    const { stdout } = await execFileAsync(
      'git',
      ['-C', root, 'worktree', 'list', '--porcelain'],
      { encoding: 'utf-8' }
    );
    const worktrees = stdout.split('\n').filter((line) => line.startsWith('worktree '));
    expect(worktrees).toHaveLength(1);
  });

  it('throws NotAGitRepositoryError for a plain directory', async () => {
    // Not a git repo — cleaned up in afterEach via `root`.
    root = await fs.mkdtemp(join(tmpdir(), 'trueref-plain-'));
    await expect(crawler.crawl({ rootPath: root, ref: 'v1.0.0' })).rejects.toThrow(
      NotAGitRepositoryError
    );
  });

  it('throws InvalidRefError for a ref that does not exist', async () => {
    root = await makeGitRepo([{ tag: 'v1.0.0', files: { 'f.ts': 'x' } }]);
    await expect(crawler.crawl({ rootPath: root, ref: 'v99.99.99' })).rejects.toThrow(
      InvalidRefError
    );
  });

  it('applies caller-supplied config at the checked-out ref', async () => {
    root = await makeGitRepo([
      {
        tag: 'v1.0.0',
        files: {
          'src/index.ts': 'export {};',
          'package.json': '{"name":"test"}'
        }
      }
    ]);
    // Exclude package.json via caller-supplied config.
    const res = await crawler.crawl({
      rootPath: root,
      ref: 'v1.0.0',
      config: { excludeFiles: ['package.json'] }
    });
    const paths = res.files.map((file) => file.path);
    expect(paths).not.toContain('package.json');
    expect(paths).toContain('src/index.ts');
  });

  it('reads trueref.json from the checked-out ref', async () => {
    root = await makeGitRepo([
      {
        tag: 'v1.0.0',
        files: {
          'trueref.json': JSON.stringify({ excludeFiles: ['package.json'] }),
          'src/index.ts': 'export {};',
          'package.json': '{"name":"test"}'
        }
      }
    ]);
    const res = await crawler.crawl({ rootPath: root, ref: 'v1.0.0' });
    const paths = res.files.map((file) => file.path);
    expect(paths).not.toContain('package.json');
    expect(paths).toContain('src/index.ts');
  });
});
// ---------------------------------------------------------------------------
// Edge cases
// ---------------------------------------------------------------------------
describe('LocalCrawler.crawl() — edge cases', () => {
  afterEach(async () => {
    await cleanupTempRepo(root);
  });

  it('returns empty result for an empty directory', async () => {
    root = await makeTempRepo({});
    const res = await crawlRoot();
    expect(res.files).toHaveLength(0);
    expect(res.totalFiles).toBe(0);
    expect(res.skippedFiles).toBe(0);
  });

  it('handles deeply nested directory structures', async () => {
    root = await makeTempRepo({ 'a/b/c/d/deep.ts': 'export const deep = true;' });
    const res = await crawlRoot();
    expect(res.files.map((file) => file.path)).toContain('a/b/c/d/deep.ts');
  });

  it('handles files with UTF-8 content correctly', async () => {
    const utf8Content = 'const greeting = "héllo wörld — 日本語";';
    root = await makeTempRepo({ 'src/unicode.ts': utf8Content });
    const res = await crawlRoot();
    const unicodeFile = res.files.find((file) => file.path === 'src/unicode.ts');
    expect(unicodeFile?.content).toBe(utf8Content);
    expect(unicodeFile?.sha).toBe(sha256(utf8Content));
  });

  it('commitSha differs when file content changes', async () => {
    root = await makeTempRepo({ 'src/index.ts': 'version 1' });
    const before = await crawlRoot();
    await fs.writeFile(join(root, 'src/index.ts'), 'version 2', 'utf-8');
    const after = await crawlRoot();
    expect(before.commitSha).not.toBe(after.commitSha);
  });

  it('commitSha is empty-string hash when no files are crawled', async () => {
    root = await makeTempRepo({ 'image.png': '\x89PNG' });
    const res = await crawlRoot();
    // SHA-256 of the empty string.
    expect(res.commitSha).toBe(sha256(''));
  });
});