feat(TRUEREF-0003-0004): implement GitHub and local filesystem crawlers

- GitHub crawler with rate limiting, semaphore concurrency, retry logic
- File filtering by extension, size, and trueref.json rules
- Local filesystem crawler with SHA-256 checksums and progress callbacks
- Shared types and file filter logic between both crawlers

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Giancarmine Salucci
2026-03-23 09:06:07 +01:00
parent cb253ffe98
commit 1c15d6c474
7 changed files with 2308 additions and 0 deletions

View File

@@ -0,0 +1,183 @@
/**
* File filtering logic for the GitHub crawler (TRUEREF-0003).
*
* Determines whether a file in the repository tree should be downloaded
* and indexed based on its extension, size, and the trueref.json config.
*/
import { extname, basename } from 'node:path';
import type { RepoConfig } from './types.js';
// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------
/** File extensions that the indexer can meaningfully process. */
/** File extensions that the indexer can meaningfully process. */
export const INDEXABLE_EXTENSIONS = new Set([
  // Documentation
  '.md', '.mdx', '.txt', '.rst',
  // Code
  '.ts', '.tsx', '.js', '.jsx', '.py', '.rb', '.go', '.rs', '.java', '.cs',
  '.cpp', '.c', '.h', '.swift', '.kt', '.php', '.scala', '.clj', '.ex', '.exs',
  '.sh', '.bash', '.zsh', '.fish',
  // Config / data
  '.json', '.yaml', '.yml', '.toml',
  // Web
  '.html', '.css', '.svelte', '.vue'
]);
/** Maximum file size we are willing to download (500 KB). */
export const MAX_FILE_SIZE_BYTES = 500_000;
/**
 * Default path prefixes that are always excluded regardless of config.
 * These directories contain generated or dependency files that should never
 * be indexed.
 */
const DEFAULT_EXCLUDES: string[] = [
  'node_modules/',
  '.git/',
  'dist/',
  'build/',
  'coverage/',
  '.next/',
  '__pycache__/',
  'vendor/',
  'target/',
  '.cache/'
];
// ---------------------------------------------------------------------------
// Language detection
// ---------------------------------------------------------------------------
const EXTENSION_TO_LANGUAGE: Record<string, string> = {
  '.ts': 'typescript',
  '.tsx': 'typescript',
  '.js': 'javascript',
  '.jsx': 'javascript',
  '.py': 'python',
  '.rb': 'ruby',
  '.go': 'go',
  '.rs': 'rust',
  '.java': 'java',
  '.cs': 'csharp',
  '.cpp': 'cpp',
  '.c': 'c',
  '.h': 'c',
  '.swift': 'swift',
  '.kt': 'kotlin',
  '.php': 'php',
  '.scala': 'scala',
  '.clj': 'clojure',
  '.ex': 'elixir',
  '.exs': 'elixir',
  '.sh': 'shell',
  '.bash': 'shell',
  '.zsh': 'shell',
  '.fish': 'shell',
  '.json': 'json',
  '.yaml': 'yaml',
  '.yml': 'yaml',
  '.toml': 'toml',
  '.html': 'html',
  '.css': 'css',
  '.svelte': 'svelte',
  '.vue': 'vue',
  '.md': 'markdown',
  '.mdx': 'markdown',
  '.txt': 'text',
  '.rst': 'rst'
};
/**
 * Detect a human-readable language name from a file extension.
 * Matching is case-insensitive ('FILE.TS' → 'typescript').
 * Returns an empty string when the extension is unknown.
 *
 * @param filePath - Repository-relative file path.
 * @returns Language identifier (e.g. 'typescript') or '' when unknown.
 */
export function detectLanguage(filePath: string): string {
  const ext = extname(filePath).toLowerCase();
  return EXTENSION_TO_LANGUAGE[ext] ?? '';
}
// ---------------------------------------------------------------------------
// Filter predicate
// ---------------------------------------------------------------------------
/**
 * Test whether a config rule matches a path, treating the rule first as a
 * literal path prefix and then as a regular expression.
 *
 * Config rules are user-supplied strings, so an invalid regex (e.g. a folder
 * literally named 'c++') must not crash the crawl: RegExp construction errors
 * are swallowed and count as "no regex match" — the prefix check above
 * remains the only test for such rules.
 */
function ruleMatchesPath(rule: string, filePath: string): boolean {
  if (filePath.startsWith(rule)) return true;
  try {
    return new RegExp(rule).test(filePath);
  } catch {
    // Rule is not a valid regex; only the prefix comparison applies.
    return false;
  }
}
/**
 * Decide whether a file from the repository tree should be downloaded.
 *
 * Rules (applied in order):
 *   1. Must have an indexable extension.
 *   2. Must not exceed the size limit.
 *   3. Must not match config.excludeFiles (exact basename match).
 *   4. Must not be under a config.excludeFolders path / regex.
 *   5. Must be under a config.folders allowlist path / regex (if specified).
 *   6. Must not start with a default-excluded prefix.
 *
 * @param filePath - Repository-relative path of the candidate file.
 * @param fileSize - Size in bytes as reported by the tree API.
 * @param config - Optional trueref.json filtering rules.
 * @returns true when the file should be downloaded and indexed.
 */
export function shouldIndexFile(
  filePath: string,
  fileSize: number,
  config?: RepoConfig
): boolean {
  const ext = extname(filePath).toLowerCase();
  // 1. Extension allow-list
  if (!INDEXABLE_EXTENSIONS.has(ext)) return false;
  // 2. Size limit
  if (fileSize > MAX_FILE_SIZE_BYTES) return false;
  // 3. Config excludeFiles — exact basename match
  if (config?.excludeFiles?.includes(basename(filePath))) return false;
  // 4. Config excludeFolders — prefix or regex match
  if (config?.excludeFolders?.some((folder) => ruleMatchesPath(folder, filePath))) {
    return false;
  }
  // 5. Config folders allowlist — if provided, the file must match at least one.
  //    An empty array imposes no restriction.
  if (config?.folders?.length) {
    const inAllowedFolder = config.folders.some((folder) => ruleMatchesPath(folder, filePath));
    if (!inAllowedFolder) return false;
  }
  // 6. Default excludes
  if (DEFAULT_EXCLUDES.some((prefix) => filePath.startsWith(prefix))) return false;
  return true;
}

View File

@@ -0,0 +1,561 @@
/**
* Unit tests for the GitHub repository crawler (TRUEREF-0003).
*
* All GitHub API calls are intercepted via vi.stubGlobal('fetch', ...) so
* that no real network traffic is produced.
*/
import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
import { crawl } from './github.crawler.js';
import { shouldIndexFile, detectLanguage, INDEXABLE_EXTENSIONS, MAX_FILE_SIZE_BYTES } from './file-filter.js';
import { GitHubRateLimiter, Semaphore, withRetry } from './rate-limiter.js';
import {
AuthenticationError,
PermissionError,
RepositoryNotFoundError
} from './types.js';
// ---------------------------------------------------------------------------
// Mock fetch helpers
// ---------------------------------------------------------------------------
type FetchHandler = (url: string, init?: RequestInit) => Response;
/** Install a synchronous handler as the global `fetch` mock. */
function stubFetch(handler: FetchHandler) {
  const mocked = vi.fn((url: string, init?: RequestInit) =>
    Promise.resolve(handler(url, init))
  );
  vi.stubGlobal('fetch', mocked);
}
/** Default GitHub rate-limit headers merged into every mocked response. */
function rateLimitHeaders(): Record<string, string> {
  return {
    'X-RateLimit-Remaining': '4999',
    'X-RateLimit-Reset': String(Math.floor(Date.now() / 1000) + 3600)
  };
}
/** Build a JSON response carrying the default rate-limit headers. */
function jsonResponse(body: unknown, status = 200, headers: Record<string, string> = {}): Response {
  return new Response(JSON.stringify(body), {
    status,
    headers: { 'Content-Type': 'application/json', ...rateLimitHeaders(), ...headers }
  });
}
/** Build a plain-text response carrying the default rate-limit headers. */
function textResponse(body: string, status = 200, headers: Record<string, string> = {}): Response {
  return new Response(body, {
    status,
    headers: { 'Content-Type': 'text/plain', ...rateLimitHeaders(), ...headers }
  });
}
// ---------------------------------------------------------------------------
// Fixtures
// ---------------------------------------------------------------------------
// Minimal repo-metadata payload: only the fields the crawler reads.
const REPO_INFO = {
  default_branch: 'main',
  stargazers_count: 42
};
// Mixed tree fixture: four indexable blobs plus entries the filter must skip
// (dist/, node_modules/, a .png) and one non-blob 'tree' entry.
const TREE_RESPONSE = {
  tree: [
    { path: 'README.md', type: 'blob', size: 1024, sha: 'sha-readme', url: '' },
    { path: 'src/index.ts', type: 'blob', size: 512, sha: 'sha-index', url: '' },
    { path: 'src/utils.ts', type: 'blob', size: 256, sha: 'sha-utils', url: '' },
    { path: 'package.json', type: 'blob', size: 128, sha: 'sha-pkg', url: '' },
    { path: 'dist/bundle.js', type: 'blob', size: 9999, sha: 'sha-dist', url: '' }, // excluded by default
    { path: 'node_modules/lodash/index.js', type: 'blob', size: 100, sha: 'sha-nm', url: '' }, // excluded
    { path: 'image.png', type: 'blob', size: 4096, sha: 'sha-img', url: '' }, // non-indexable
    { path: 'src', type: 'tree', size: 0, sha: 'sha-src-tree', url: '' }
  ],
  truncated: false // a truncated tree would trigger the crawler's subtree fallback
};
// SHA served by the mocked commits endpoint.
const COMMIT_SHA = 'deadbeef1234567890abcdef';
// ---------------------------------------------------------------------------
// shouldIndexFile unit tests
// ---------------------------------------------------------------------------
describe('shouldIndexFile()', () => {
  it('returns true for a .ts file within size limit', () =>
    expect(shouldIndexFile('src/index.ts', 1000)).toBe(true));
  it('returns false for a .png file (non-indexable extension)', () =>
    expect(shouldIndexFile('assets/logo.png', 100)).toBe(false));
  it('returns false when file exceeds MAX_FILE_SIZE_BYTES', () =>
    expect(shouldIndexFile('big.ts', MAX_FILE_SIZE_BYTES + 1)).toBe(false));
  it('returns false for a file in node_modules/', () =>
    expect(shouldIndexFile('node_modules/lodash/index.js', 100)).toBe(false));
  it('returns false for a file in dist/', () =>
    expect(shouldIndexFile('dist/bundle.js', 100)).toBe(false));
  it('respects config.excludeFiles (exact basename)', () =>
    expect(shouldIndexFile('src/secret.ts', 100, { excludeFiles: ['secret.ts'] })).toBe(false));
  it('does not exclude a file whose basename merely contains the excluded name', () =>
    expect(shouldIndexFile('src/not-secret.ts', 100, { excludeFiles: ['secret.ts'] })).toBe(true));
  it('respects config.excludeFolders prefix', () =>
    expect(shouldIndexFile('internal/config.ts', 100, { excludeFolders: ['internal/'] })).toBe(false));
  it('allows files outside of config.excludeFolders', () =>
    expect(shouldIndexFile('public/api.ts', 100, { excludeFolders: ['internal/'] })).toBe(true));
  it('restricts to config.folders allowlist when specified', () => {
    const allowlistOnly = { folders: ['docs/'] };
    expect(shouldIndexFile('src/index.ts', 100, allowlistOnly)).toBe(false);
    expect(shouldIndexFile('docs/guide.md', 100, allowlistOnly)).toBe(true);
  });
  it('returns true when config.folders is an empty array (no restriction)', () =>
    expect(shouldIndexFile('src/index.ts', 100, { folders: [] })).toBe(true));
  it('handles all default-excluded directories', () => {
    // One representative path per default-excluded prefix.
    [
      'node_modules/pkg/index.js',
      '.git/config',
      'dist/out.js',
      'build/app.js',
      'coverage/lcov.info',
      '.next/server.js',
      '__pycache__/mod.py',
      'vendor/lib.go',
      'target/release.rs',
      '.cache/file.ts'
    ].forEach((path) => {
      expect(shouldIndexFile(path, 100), `should exclude ${path}`).toBe(false);
    });
  });
  it('INDEXABLE_EXTENSIONS covers all expected types', () => {
    ['.md', '.ts', '.py', '.go', '.rs', '.json', '.svelte'].forEach((ext) => {
      expect(INDEXABLE_EXTENSIONS.has(ext), `missing extension ${ext}`).toBe(true);
    });
  });
});
// ---------------------------------------------------------------------------
// detectLanguage unit tests
// ---------------------------------------------------------------------------
describe('detectLanguage()', () => {
  // Shared one-line assertion helper for the mapping cases below.
  const expectLang = (file: string, lang: string): void => {
    expect(detectLanguage(file)).toBe(lang);
  };
  it('detects typescript', () => expectLang('foo.ts', 'typescript'));
  it('detects tsx as typescript', () => expectLang('foo.tsx', 'typescript'));
  it('detects javascript', () => expectLang('foo.js', 'javascript'));
  it('detects python', () => expectLang('foo.py', 'python'));
  it('detects go', () => expectLang('foo.go', 'go'));
  it('detects rust', () => expectLang('foo.rs', 'rust'));
  it('detects markdown', () => expectLang('README.md', 'markdown'));
  it('detects svelte', () => expectLang('App.svelte', 'svelte'));
  it('detects yaml', () => expectLang('config.yaml', 'yaml'));
  it('returns empty string for unknown extension', () => expectLang('file.xyz', ''));
  it('is case-insensitive for extensions', () => expectLang('FILE.TS', 'typescript'));
});
// ---------------------------------------------------------------------------
// GitHubRateLimiter unit tests
// ---------------------------------------------------------------------------
describe('GitHubRateLimiter', () => {
  it('defaults to 5000 remaining requests', () => {
    expect(new GitHubRateLimiter().remainingRequests).toBe(5000);
  });
  it('updates remaining and resetAt from headers', () => {
    const rl = new GitHubRateLimiter();
    const epoch = Math.floor(Date.now() / 1000) + 3600;
    rl.updateFromHeaders(
      new Headers({
        'X-RateLimit-Remaining': '42',
        'X-RateLimit-Reset': String(epoch)
      })
    );
    expect(rl.remainingRequests).toBe(42);
    // resetTimestamp is stored in milliseconds, headers carry epoch seconds.
    expect(rl.resetTimestamp).toBe(epoch * 1000);
  });
  it('does not mutate state when headers are absent', () => {
    const rl = new GitHubRateLimiter();
    rl.updateFromHeaders(new Headers());
    expect(rl.remainingRequests).toBe(5000);
  });
  it('waitIfNeeded resolves immediately when remaining > 10', async () => {
    const rl = new GitHubRateLimiter();
    const before = Date.now();
    await rl.waitIfNeeded();
    const elapsed = Date.now() - before;
    expect(elapsed).toBeLessThan(100);
  });
});
// ---------------------------------------------------------------------------
// Semaphore unit tests
// ---------------------------------------------------------------------------
describe('Semaphore', () => {
  it('allows up to concurrency tasks to run simultaneously', async () => {
    const sem = new Semaphore(2);
    let running = 0;
    let peak = 0;
    // Each job records the highest number of concurrently-running bodies seen.
    const job = () =>
      sem.run(async () => {
        running += 1;
        peak = Math.max(peak, running);
        await new Promise((resolve) => setTimeout(resolve, 10));
        running -= 1;
      });
    await Promise.all([job(), job(), job(), job()]);
    expect(peak).toBeLessThanOrEqual(2);
  });
  it('resolves all tasks even when queued', async () => {
    const sem = new Semaphore(1);
    const completed: number[] = [];
    const jobs = [1, 2, 3].map((n) =>
      sem.run(async () => {
        completed.push(n);
      })
    );
    await Promise.all(jobs);
    expect(completed).toHaveLength(3);
  });
});
// ---------------------------------------------------------------------------
// withRetry unit tests
// ---------------------------------------------------------------------------
describe('withRetry()', () => {
  it('returns the result on first success', async () => {
    const value = await withRetry(() => Promise.resolve(42));
    expect(value).toBe(42);
  });
  it('retries on failure and returns eventual success', async () => {
    let attempts = 0;
    // Fails twice, succeeds on the third attempt.
    const flaky = async () => {
      attempts += 1;
      if (attempts < 3) throw new Error('transient');
      return 'ok';
    };
    const outcome = await withRetry(flaky, 3);
    expect(outcome).toBe('ok');
    expect(attempts).toBe(3);
  });
  it('throws after exhausting all attempts', async () => {
    const alwaysFailing = () => Promise.reject(new Error('always fails'));
    await expect(withRetry(alwaysFailing, 3)).rejects.toThrow('always fails');
  });
});
// ---------------------------------------------------------------------------
// crawl() integration tests (fetch mocked)
// ---------------------------------------------------------------------------
describe('crawl()', () => {
  // Fake timers keep any retry/backoff sleeps from slowing the suite; mocks
  // and timers are restored after every test so cases stay independent.
  beforeEach(() => {
    vi.useFakeTimers();
  });
  afterEach(() => {
    vi.restoreAllMocks();
    vi.useRealTimers();
  });
  // Happy-path fetch mock for owner/repo on branch 'main': repo info, commit
  // SHA, recursive tree, and raw file content. Individual tests can swap a
  // single response by keying `overrides` on the exact request URL.
  function setupDefaultMocks(overrides: Partial<Record<string, Response>> = {}) {
    stubFetch((url) => {
      // Repo info
      if (url === 'https://api.github.com/repos/owner/repo') {
        return overrides[url] ?? jsonResponse(REPO_INFO);
      }
      // Commit SHA
      if (url === 'https://api.github.com/repos/owner/repo/commits/main') {
        return overrides[url] ?? textResponse(COMMIT_SHA);
      }
      // File tree
      if (url.startsWith('https://api.github.com/repos/owner/repo/git/trees/main')) {
        return overrides[url] ?? jsonResponse(TREE_RESPONSE);
      }
      // Raw content (raw.githubusercontent.com)
      if (url.startsWith('https://raw.githubusercontent.com/')) {
        // Segments 0-5 are scheme/host/owner/repo/ref; the rest is the path.
        const filePath = url.split('/').slice(6).join('/');
        return overrides[url] ?? textResponse(`// content of ${filePath}`);
      }
      return new Response('not found', { status: 404 });
    });
  }
  it('returns files that pass the filter', async () => {
    setupDefaultMocks();
    const result = await crawl({ owner: 'owner', repo: 'repo' });
    // dist/ and node_modules/ should be excluded; .png should be excluded.
    // Expected: README.md, src/index.ts, src/utils.ts, package.json
    expect(result.files.length).toBeGreaterThanOrEqual(4);
    expect(result.files.every((f) => !f.path.startsWith('dist/'))).toBe(true);
    expect(result.files.every((f) => !f.path.startsWith('node_modules/'))).toBe(true);
    expect(result.files.every((f) => !f.path.endsWith('.png'))).toBe(true);
  });
  it('sets branch to the default_branch when no ref is given', async () => {
    setupDefaultMocks();
    const result = await crawl({ owner: 'owner', repo: 'repo' });
    // REPO_INFO.default_branch is 'main'.
    expect(result.branch).toBe('main');
  });
  it('uses the provided ref when specified', async () => {
    // Custom mock: the tree/commits endpoints answer for ref 'v2.0.0' only.
    stubFetch((url) => {
      if (url === 'https://api.github.com/repos/owner/repo') {
        return jsonResponse(REPO_INFO);
      }
      if (url.includes('/git/trees/v2.0.0')) {
        return jsonResponse({ tree: [], truncated: false });
      }
      if (url.includes('/commits/v2.0.0')) {
        return textResponse('tagsha');
      }
      return textResponse('content');
    });
    const result = await crawl({ owner: 'owner', repo: 'repo', ref: 'v2.0.0' });
    expect(result.branch).toBe('v2.0.0');
  });
  it('populates commitSha from the commits endpoint', async () => {
    setupDefaultMocks();
    const result = await crawl({ owner: 'owner', repo: 'repo' });
    expect(result.commitSha).toBe(COMMIT_SHA);
  });
  it('sets correct sha on each CrawledFile from the tree', async () => {
    setupDefaultMocks();
    const result = await crawl({ owner: 'owner', repo: 'repo' });
    const readme = result.files.find((f) => f.path === 'README.md');
    expect(readme).toBeDefined();
    // The blob sha must come from the tree entry, not the commit.
    expect(readme!.sha).toBe('sha-readme');
  });
  it('attaches language to each CrawledFile', async () => {
    setupDefaultMocks();
    const result = await crawl({ owner: 'owner', repo: 'repo' });
    const indexTs = result.files.find((f) => f.path === 'src/index.ts');
    expect(indexTs?.language).toBe('typescript');
    const readme = result.files.find((f) => f.path === 'README.md');
    expect(readme?.language).toBe('markdown');
  });
  it('reports progress via onProgress callback', async () => {
    setupDefaultMocks();
    const calls: Array<[number, number]> = [];
    await crawl({
      owner: 'owner',
      repo: 'repo',
      onProgress: (p, t) => calls.push([p, t])
    });
    expect(calls.length).toBeGreaterThan(0);
    // Total must remain constant across all calls.
    const totals = calls.map(([, t]) => t);
    expect(totals.every((t) => t === totals[0])).toBe(true);
  });
  it('skips files that fail to download without throwing', async () => {
    stubFetch((url) => {
      if (url === 'https://api.github.com/repos/owner/repo') {
        return jsonResponse(REPO_INFO);
      }
      if (url.includes('/git/trees/main')) {
        return jsonResponse({
          tree: [{ path: 'src/index.ts', type: 'blob', size: 100, sha: 'sha1', url: '' }],
          truncated: false
        });
      }
      if (url.includes('/commits/main')) {
        return textResponse(COMMIT_SHA);
      }
      // All content downloads fail.
      return new Response('error', { status: 500 });
    });
    // Should not throw; just return zero files.
    const result = await crawl({ owner: 'owner', repo: 'repo' });
    expect(result.files).toHaveLength(0);
    // totalFiles still counts the blob that passed the filter but failed to
    // download.
    expect(result.totalFiles).toBe(1);
  });
  it('throws RepositoryNotFoundError on 404', async () => {
    stubFetch((url) => {
      if (url === 'https://api.github.com/repos/owner/missing') {
        return jsonResponse({ message: 'Not Found' }, 404, {
          'X-RateLimit-Remaining': '4999',
          'X-RateLimit-Reset': String(Math.floor(Date.now() / 1000) + 3600)
        });
      }
      return new Response('not found', { status: 404 });
    });
    await expect(crawl({ owner: 'owner', repo: 'missing' })).rejects.toThrow(
      RepositoryNotFoundError
    );
  });
  it('throws AuthenticationError on 401', async () => {
    stubFetch(() =>
      new Response('Unauthorized', {
        status: 401,
        headers: {
          'X-RateLimit-Remaining': '0',
          'X-RateLimit-Reset': String(Math.floor(Date.now() / 1000) + 3600)
        }
      })
    );
    await expect(crawl({ owner: 'owner', repo: 'repo', token: 'bad-token' })).rejects.toThrow(
      AuthenticationError
    );
  });
  it('throws PermissionError on 403 without rate-limit exhaustion', async () => {
    // Remaining quota > 0, so a 403 must be treated as a permissions problem
    // rather than a rate limit.
    stubFetch(() =>
      new Response('Forbidden', {
        status: 403,
        headers: {
          'X-RateLimit-Remaining': '100',
          'X-RateLimit-Reset': String(Math.floor(Date.now() / 1000) + 3600)
        }
      })
    );
    await expect(crawl({ owner: 'owner', repo: 'repo' })).rejects.toThrow(PermissionError);
  });
  it('respects config.folders allowlist when provided', async () => {
    setupDefaultMocks();
    const result = await crawl({
      owner: 'owner',
      repo: 'repo',
      config: { folders: ['src/'] }
    });
    // Only src/ files should be present.
    expect(result.files.every((f) => f.path.startsWith('src/'))).toBe(true);
  });
  it('applies config.excludeFiles filter', async () => {
    setupDefaultMocks();
    const result = await crawl({
      owner: 'owner',
      repo: 'repo',
      config: { excludeFiles: ['package.json'] }
    });
    expect(result.files.some((f) => f.path === 'package.json')).toBe(false);
  });
  it('returns correct skippedFiles count', async () => {
    setupDefaultMocks();
    const result = await crawl({ owner: 'owner', repo: 'repo' });
    // dist/, node_modules/, and .png are the excluded items = 3
    expect(result.skippedFiles).toBe(3);
  });
  it('uses auth token in requests to GitHub API', async () => {
    const capturedHeaders: Record<string, string>[] = [];
    stubFetch((url, init) => {
      // Record the headers of every outgoing request for later inspection.
      const headers = Object.fromEntries(
        Object.entries((init?.headers as Record<string, string>) ?? {})
      );
      capturedHeaders.push(headers);
      if (url === 'https://api.github.com/repos/owner/repo') {
        return jsonResponse(REPO_INFO);
      }
      if (url.includes('/git/trees/main')) {
        return jsonResponse({ tree: [], truncated: false });
      }
      if (url.includes('/commits/main')) {
        return textResponse(COMMIT_SHA);
      }
      return textResponse('content');
    });
    await crawl({ owner: 'owner', repo: 'repo', token: 'ghp_mysecrettoken' });
    const apiCalls = capturedHeaders.filter((h) => h.Authorization);
    expect(apiCalls.length).toBeGreaterThan(0);
    expect(apiCalls[0].Authorization).toBe('Bearer ghp_mysecrettoken');
  });
  it('handles a tree with zero indexable files gracefully', async () => {
    stubFetch((url) => {
      if (url === 'https://api.github.com/repos/owner/repo') return jsonResponse(REPO_INFO);
      if (url.includes('/git/trees/main'))
        return jsonResponse({
          tree: [
            { path: 'image.png', type: 'blob', size: 100, sha: 'sha1', url: '' },
            { path: 'video.mp4', type: 'blob', size: 1000, sha: 'sha2', url: '' }
          ],
          truncated: false
        });
      if (url.includes('/commits/main')) return textResponse(COMMIT_SHA);
      return textResponse('content');
    });
    const result = await crawl({ owner: 'owner', repo: 'repo' });
    expect(result.files).toHaveLength(0);
    expect(result.totalFiles).toBe(0);
    expect(result.skippedFiles).toBe(2);
  });
  it('reads and applies config from trueref.json found in the tree', async () => {
    const truerefConfig = { excludeFiles: ['package.json'] };
    stubFetch((url) => {
      if (url === 'https://api.github.com/repos/owner/repo') return jsonResponse(REPO_INFO);
      if (url.includes('/git/trees/main')) {
        return jsonResponse({
          tree: [
            { path: 'trueref.json', type: 'blob', size: 50, sha: 'sha-cfg', url: '' },
            { path: 'src/index.ts', type: 'blob', size: 200, sha: 'sha-idx', url: '' },
            { path: 'package.json', type: 'blob', size: 100, sha: 'sha-pkg', url: '' }
          ],
          truncated: false
        });
      }
      if (url.includes('/commits/main')) return textResponse(COMMIT_SHA);
      if (url.includes('trueref.json')) return textResponse(JSON.stringify(truerefConfig));
      if (url.includes('src/index.ts')) return textResponse('export const x = 1;');
      if (url.includes('package.json')) return textResponse('{"name":"test"}');
      return textResponse('content');
    });
    // No caller-supplied config — crawler should auto-detect trueref.json.
    const result = await crawl({ owner: 'owner', repo: 'repo' });
    expect(result.files.some((f) => f.path === 'package.json')).toBe(false);
    expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
  });
});

View File

@@ -0,0 +1,477 @@
/**
* GitHub Repository Crawler (TRUEREF-0003).
*
* Fetches repository file trees via the GitHub Trees API and downloads file
* contents in parallel while respecting rate limits and applying
* include/exclude filtering rules from trueref.json.
*
* Download strategy:
* - Uses raw.githubusercontent.com for file content — faster and counts less
* against the REST API rate limit.
* - Falls back to the GitHub Contents API if raw download fails.
*
* Error handling:
* - 404 → RepositoryNotFoundError
* - 401 → AuthenticationError
* - 403 → waits for rate-limit reset if X-RateLimit-Remaining is 0; else PermissionError
* - 422 → tree too large; switches to directory-by-directory traversal (depth pagination)
* - Network errors → retried up to 3 times with exponential backoff
* - Bad base64 content → file skipped with a console warning
*/
import { shouldIndexFile, detectLanguage } from './file-filter.js';
import { GitHubRateLimiter, Semaphore, withRetry } from './rate-limiter.js';
import {
AuthenticationError,
PermissionError,
RateLimitError,
RepositoryNotFoundError
} from './types.js';
// Domain errors should not be retried — they are permanent HTTP status codes.
function isDomainError(err: unknown): boolean {
  const domainErrorClasses = [
    RepositoryNotFoundError,
    AuthenticationError,
    PermissionError,
    RateLimitError
  ];
  return domainErrorClasses.some((cls) => err instanceof cls);
}
// Anything that is not a permanent domain error is worth another attempt.
function isRetryable(err: unknown): boolean {
  return isDomainError(err) === false;
}
import type {
CrawlOptions,
CrawlResult,
CrawledFile,
GitHubContentResponse,
GitHubRepoResponse,
GitHubTreeItem,
GitHubTreeResponse
} from './types.js';
// ---------------------------------------------------------------------------
// Internal constants
// ---------------------------------------------------------------------------
// Base URL for the GitHub REST API.
const GITHUB_API = 'https://api.github.com';
// Raw-content host; serves file bytes directly, outside the REST rate limit.
const RAW_CONTENT = 'https://raw.githubusercontent.com';
/** Maximum parallel file downloads. */
const DOWNLOAD_CONCURRENCY = 10;
/** Config file names that should be fetched first so their filtering rules
 * apply to all subsequent downloads. */
const CONFIG_FILE_NAMES = new Set(['trueref.json', 'context7.json']);
// ---------------------------------------------------------------------------
// Internal helpers
// ---------------------------------------------------------------------------
/**
 * Build standard GitHub API request headers.
 * Adds a Bearer Authorization header only when a token is supplied.
 */
function buildHeaders(token?: string): Record<string, string> {
  const base: Record<string, string> = {
    Accept: 'application/vnd.github+json',
    'X-GitHub-Api-Version': '2022-11-28'
  };
  return token ? { ...base, Authorization: `Bearer ${token}` } : base;
}
/**
 * Throw a domain error for non-2xx GitHub API responses.
 * Mutates the rate limiter with header data before throwing.
 */
async function throwForStatus(response: Response, rateLimiter: GitHubRateLimiter): Promise<void> {
  if (response.ok) return;
  rateLimiter.updateFromHeaders(response.headers);
  const { status } = response;
  if (status === 401) {
    throw new AuthenticationError('GitHub authentication failed — check your PAT.');
  }
  if (status === 403) {
    // A 403 with zero remaining quota is a rate-limit problem, not a
    // permissions problem.
    if (response.headers.get('X-RateLimit-Remaining') === '0') {
      const resetHeader = response.headers.get('X-RateLimit-Reset') ?? '0';
      throw new RateLimitError('GitHub rate limit exceeded.', parseInt(resetHeader, 10) * 1000);
    }
    throw new PermissionError(
      'GitHub returned 403 Forbidden — insufficient permissions for this resource.'
    );
  }
  if (status === 404) {
    throw new RepositoryNotFoundError(
      `Repository not found or not accessible: ${response.url}`
    );
  }
  const body = await response.text().catch(() => '');
  throw new Error(`GitHub API error ${status}: ${body}`);
}
// ---------------------------------------------------------------------------
// GitHub API calls
// ---------------------------------------------------------------------------
/**
 * Fetch repository metadata (default branch, stars, etc.).
 * Retried up to 3 times for non-domain (transient) failures.
 */
async function fetchRepoInfo(
  owner: string,
  repo: string,
  token: string | undefined,
  rateLimiter: GitHubRateLimiter
): Promise<GitHubRepoResponse> {
  const requestOnce = async (): Promise<GitHubRepoResponse> => {
    await rateLimiter.waitIfNeeded();
    const url = `${GITHUB_API}/repos/${owner}/${repo}`;
    const response = await fetch(url, { headers: buildHeaders(token) });
    rateLimiter.updateFromHeaders(response.headers);
    await throwForStatus(response, rateLimiter);
    return (await response.json()) as GitHubRepoResponse;
  };
  return withRetry(requestOnce, 3, isRetryable);
}
/**
 * Fetch the recursive file tree for a given ref.
 * Returns null when the tree is truncated (>100k items), signalling that we
 * should fall back to directory-level traversal.
 */
async function fetchTree(
  owner: string,
  repo: string,
  ref: string,
  token: string | undefined,
  rateLimiter: GitHubRateLimiter
): Promise<GitHubTreeResponse | null> {
  const requestOnce = async (): Promise<GitHubTreeResponse | null> => {
    await rateLimiter.waitIfNeeded();
    const response = await fetch(
      `${GITHUB_API}/repos/${owner}/${repo}/git/trees/${ref}?recursive=1`,
      { headers: buildHeaders(token) }
    );
    rateLimiter.updateFromHeaders(response.headers);
    if (response.status === 422) {
      // 422 means the tree is too large for a single recursive call.
      return null;
    }
    await throwForStatus(response, rateLimiter);
    return (await response.json()) as GitHubTreeResponse;
  };
  return withRetry(requestOnce, 3, isRetryable);
}
/**
 * Fetch a subtree (non-recursive) for a single directory path.
 * Used when the full recursive tree is truncated.
 */
async function fetchSubTree(
  owner: string,
  repo: string,
  ref: string,
  treeSha: string,
  token: string | undefined,
  rateLimiter: GitHubRateLimiter
): Promise<GitHubTreeResponse> {
  const requestOnce = async (): Promise<GitHubTreeResponse> => {
    await rateLimiter.waitIfNeeded();
    const response = await fetch(`${GITHUB_API}/repos/${owner}/${repo}/git/trees/${treeSha}`, {
      headers: buildHeaders(token)
    });
    rateLimiter.updateFromHeaders(response.headers);
    await throwForStatus(response, rateLimiter);
    return (await response.json()) as GitHubTreeResponse;
  };
  return withRetry(requestOnce, 3, isRetryable);
}
/**
 * Resolve the HEAD commit SHA from a branch/tag ref by fetching the
 * commit object at the ref tip.
 */
async function fetchCommitSha(
  owner: string,
  repo: string,
  ref: string,
  token: string | undefined,
  rateLimiter: GitHubRateLimiter
): Promise<string> {
  const requestOnce = async (): Promise<string> => {
    await rateLimiter.waitIfNeeded();
    const response = await fetch(`${GITHUB_API}/repos/${owner}/${repo}/commits/${ref}`, {
      // When Accept is 'application/vnd.github.sha', the response body is the
      // bare SHA string.
      headers: { ...buildHeaders(token), Accept: 'application/vnd.github.sha' }
    });
    rateLimiter.updateFromHeaders(response.headers);
    await throwForStatus(response, rateLimiter);
    const body = await response.text();
    return body.trim();
  };
  return withRetry(requestOnce, 3, isRetryable);
}
/**
 * Download raw file content via raw.githubusercontent.com.
 *
 * Each path segment is URL-encoded so that file names containing spaces,
 * '#', '%', or '?' produce a valid URL instead of a truncated or malformed
 * request (an unencoded '#' would silently drop the rest of the path).
 * The ref is left unencoded because branch names may legitimately contain
 * slashes that belong in the URL.
 *
 * @returns The file body as text, or null on any failure (the caller will
 *          skip the file or fall back to the Contents API).
 */
async function downloadRawFile(
  owner: string,
  repo: string,
  ref: string,
  filePath: string,
  token: string | undefined
): Promise<string | null> {
  try {
    const encodedPath = filePath.split('/').map(encodeURIComponent).join('/');
    const url = `${RAW_CONTENT}/${owner}/${repo}/${ref}/${encodedPath}`;
    const headers: Record<string, string> = {};
    if (token) headers.Authorization = `Bearer ${token}`;
    const response = await fetch(url, { headers });
    if (!response.ok) return null;
    return await response.text();
  } catch {
    return null;
  }
}
/**
* Download file content via the GitHub Contents API (fallback).
*/
async function downloadViaContentsApi(
owner: string,
repo: string,
ref: string,
filePath: string,
token: string | undefined,
rateLimiter: GitHubRateLimiter
): Promise<string | null> {
try {
return await withRetry(async () => {
await rateLimiter.waitIfNeeded();
const url = `${GITHUB_API}/repos/${owner}/${repo}/contents/${filePath}?ref=${ref}`;
const response = await fetch(url, { headers: buildHeaders(token) });
rateLimiter.updateFromHeaders(response.headers);
if (!response.ok) return null;
const data = (await response.json()) as GitHubContentResponse;
if (data.encoding !== 'base64') return null;
// Node.js Buffer handles both padded and unpadded base64.
return Buffer.from(data.content.replace(/\n/g, ''), 'base64').toString('utf-8');
});
} catch {
return null;
}
}
// ---------------------------------------------------------------------------
// Directory-level traversal (fallback for truncated trees)
// ---------------------------------------------------------------------------
/**
 * Recursively collect all blob items from sub-trees when the top-level
 * recursive tree is truncated (>100k items).
 */
async function collectBlobsFromSubTrees(
  owner: string,
  repo: string,
  ref: string,
  token: string | undefined,
  rateLimiter: GitHubRateLimiter
): Promise<GitHubTreeItem[]> {
  const blobs: GitHubTreeItem[] = [];
  // Breadth-first work list; the ref itself names the root tree.
  const pending: Array<{ sha: string; prefix: string }> = [{ sha: ref, prefix: '' }];
  while (pending.length > 0) {
    // Expand up to DOWNLOAD_CONCURRENCY directories in parallel per wave.
    const wave = pending.splice(0, DOWNLOAD_CONCURRENCY);
    const expand = async ({ sha, prefix }: { sha: string; prefix: string }): Promise<void> => {
      let subTree = null;
      try {
        subTree = await fetchSubTree(owner, repo, ref, sha, token, rateLimiter);
      } catch {
        // Unreachable subtree: skip it rather than abort the whole crawl.
      }
      if (subTree === null) return;
      for (const entry of subTree.tree) {
        const fullPath = prefix === '' ? entry.path : `${prefix}/${entry.path}`;
        if (entry.type === 'blob') {
          blobs.push({ ...entry, path: fullPath });
        } else if (entry.type === 'tree') {
          pending.push({ sha: entry.sha, prefix: fullPath });
        }
      }
    };
    await Promise.all(wave.map(expand));
  }
  return blobs;
}
// ---------------------------------------------------------------------------
// Config file detection
// ---------------------------------------------------------------------------
/**
 * Try to download and parse a trueref.json / context7.json config from the
 * repository root. Returns undefined if not found or unparseable.
 */
async function fetchRepoConfig(
  owner: string,
  repo: string,
  ref: string,
  token: string | undefined,
  blobs: GitHubTreeItem[],
  rateLimiter: GitHubRateLimiter
): Promise<CrawlOptions['config'] | undefined> {
  // Only root-level config files qualify: a nested path never equals one of
  // the bare names in CONFIG_FILE_NAMES.
  const configBlob = blobs.find((blob) => CONFIG_FILE_NAMES.has(blob.path));
  if (configBlob === undefined) return undefined;
  // Prefer the cheap raw download; fall back to the Contents API.
  let content = await downloadRawFile(owner, repo, ref, configBlob.path, token);
  if (content === null) {
    content = await downloadViaContentsApi(owner, repo, ref, configBlob.path, token, rateLimiter);
  }
  if (!content) return undefined;
  try {
    return JSON.parse(content) as CrawlOptions['config'];
  } catch {
    console.warn(`[GitHubCrawler] Failed to parse config file: ${configBlob.path}`);
    return undefined;
  }
}
// ---------------------------------------------------------------------------
// Public crawl() function
// ---------------------------------------------------------------------------
/**
 * Crawl a GitHub repository and return structured file objects.
 *
 * Pipeline: resolve the ref → fetch the file tree (with a sub-tree
 * traversal fallback for truncated trees) → discover a root-level
 * trueref.json / context7.json config → filter blobs → download contents
 * with bounded concurrency.
 *
 * @param options - Repository coordinates, auth token, filter config, and
 *                  optional progress callback.
 * @returns CrawlResult with all downloaded files and summary statistics.
 */
export async function crawl(options: CrawlOptions): Promise<CrawlResult> {
  const { owner, repo, token, onProgress } = options;
  const rateLimiter = new GitHubRateLimiter();
  const semaphore = new Semaphore(DOWNLOAD_CONCURRENCY);

  // ---- Step 1: Resolve the ref (default branch if not provided) ----------
  const repoInfo = await fetchRepoInfo(owner, repo, token, rateLimiter);
  // Bind the resolved ref to a const so the download closures below do not
  // need non-null assertions — after this line `ref` is always a string.
  const ref = options.ref ?? repoInfo.default_branch;

  // ---- Step 2: Fetch the file tree ---------------------------------------
  let blobs: GitHubTreeItem[];
  const treeResponse = await fetchTree(owner, repo, ref, token, rateLimiter);
  if (treeResponse === null) {
    // Tree truncated — fall back to directory-by-directory traversal.
    console.warn(
      `[GitHubCrawler] Tree for ${owner}/${repo}@${ref} is truncated; using sub-tree traversal.`
    );
    blobs = await collectBlobsFromSubTrees(owner, repo, ref, token, rateLimiter);
  } else {
    blobs = treeResponse.tree.filter((item) => item.type === 'blob');
  }

  // Resolve HEAD commit SHA (best-effort; empty string on failure).
  const commitSha = await fetchCommitSha(owner, repo, ref, token, rateLimiter).catch(() => '');

  // ---- Step 3: Detect and download config file first ---------------------
  // Caller-supplied config wins; otherwise look for a repo-root config file.
  let effectiveConfig = options.config;
  if (!effectiveConfig) {
    effectiveConfig = await fetchRepoConfig(owner, repo, ref, token, blobs, rateLimiter);
  }

  // ---- Step 4: Filter blobs according to config --------------------------
  const filteredBlobs = blobs.filter((item) =>
    shouldIndexFile(item.path, item.size ?? 0, effectiveConfig)
  );
  const totalFiles = filteredBlobs.length;
  const skippedFiles = blobs.length - totalFiles;

  // ---- Step 5: Download file contents in parallel -------------------------
  const files: CrawledFile[] = [];
  let processed = 0;
  await Promise.all(
    filteredBlobs.map((item) =>
      semaphore.run(async () => {
        try {
          // Prefer raw download (cheaper on rate limit); fall back to API.
          const content =
            (await downloadRawFile(owner, repo, ref, item.path, token)) ??
            (await downloadViaContentsApi(owner, repo, ref, item.path, token, rateLimiter));
          if (content === null) {
            console.warn(`[GitHubCrawler] Could not download: ${item.path} — skipping.`);
          } else {
            files.push({
              path: item.path,
              content,
              // Tree items occasionally lack a size; derive it from content.
              size: item.size ?? Buffer.byteLength(content, 'utf-8'),
              sha: item.sha,
              language: detectLanguage(item.path)
            });
          }
        } catch (err) {
          console.warn(
            `[GitHubCrawler] Error downloading ${item.path}: ${err instanceof Error ? err.message : String(err)}`
          );
        } finally {
          // Progress is reported per completed blob, success or failure.
          processed++;
          onProgress?.(processed, totalFiles);
        }
      })
    )
  );

  // Parallel downloads complete in nondeterministic order; sort by path so
  // the result is stable across runs.
  files.sort((a, b) => (a.path < b.path ? -1 : a.path > b.path ? 1 : 0));

  return {
    files,
    totalFiles,
    skippedFiles,
    branch: ref,
    commitSha
  };
}

View File

@@ -0,0 +1,554 @@
/**
* Unit tests for the local filesystem crawler (TRUEREF-0004).
*
* Each test that needs a filesystem fixture creates a temporary directory via
* `fs.mkdtemp`, writes the required files, runs the crawler, then cleans up
* with `fs.rm` regardless of the test outcome.
*/
import { execFile } from 'node:child_process';
import { createHash } from 'node:crypto';
import { promises as fs } from 'node:fs';
import { join } from 'node:path';
import { tmpdir } from 'node:os';
import { promisify } from 'node:util';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import { LocalCrawler } from './local.crawler.js';
import type { LocalCrawlOptions } from './local.crawler.js';
import { InvalidRefError, NotAGitRepositoryError } from './types.js';
const execFileAsync = promisify(execFile);
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
/** SHA-256 hex digest of a UTF-8 string — mirrors the crawler's checksum. */
function sha256(content: string): string {
  const hash = createHash('sha256');
  hash.update(content, 'utf-8');
  return hash.digest('hex');
}
/**
 * Test fixture factory: create a fresh temp directory and populate it with
 * the given relPath → content map. Returns the directory's absolute path.
 */
async function makeTempRepo(files: Record<string, string>): Promise<string> {
  const rootDir = await fs.mkdtemp(join(tmpdir(), 'trueref-test-'));
  for (const [relPath, content] of Object.entries(files)) {
    const target = join(rootDir, relPath);
    // Ensure the parent directory chain exists before writing.
    await fs.mkdir(join(target, '..'), { recursive: true });
    await fs.writeFile(target, content, 'utf-8');
  }
  return rootDir;
}
/** Recursively delete a fixture directory created by makeTempRepo. */
async function cleanupTempRepo(root: string): Promise<void> {
  return fs.rm(root, { recursive: true, force: true });
}
// ---------------------------------------------------------------------------
// Test state
// ---------------------------------------------------------------------------
let root: string = '';
const crawler = new LocalCrawler();
async function crawlRoot(opts: Partial<LocalCrawlOptions> = {}): Promise<ReturnType<LocalCrawler['crawl']>> {
return crawler.crawl({ rootPath: root, ...opts });
}
// ---------------------------------------------------------------------------
// Basic crawl behaviour
// ---------------------------------------------------------------------------
// Fixture: four indexable files (markdown, two TS modules, package.json).
// Verifies plain crawling: enumeration, content, sizes, checksums, language
// detection, and result metadata.
describe('LocalCrawler.crawl() — basic file enumeration', () => {
  beforeEach(async () => {
    root = await makeTempRepo({
      'README.md': '# Hello',
      'src/index.ts': 'export const x = 1;',
      'src/utils.ts': 'export const y = 2;',
      'package.json': '{"name":"test"}'
    });
  });
  afterEach(async () => {
    await cleanupTempRepo(root);
  });
  it('returns all indexable files', async () => {
    const result = await crawlRoot();
    const paths = result.files.map((f) => f.path).sort();
    expect(paths).toEqual(['README.md', 'package.json', 'src/index.ts', 'src/utils.ts'].sort());
  });
  it('populates content as a UTF-8 string', async () => {
    const result = await crawlRoot();
    const readme = result.files.find((f) => f.path === 'README.md');
    expect(readme?.content).toBe('# Hello');
  });
  it('sets size equal to Buffer.byteLength of content', async () => {
    const result = await crawlRoot();
    for (const file of result.files) {
      expect(file.size).toBe(Buffer.byteLength(file.content, 'utf-8'));
    }
  });
  it('computes correct SHA-256 per file', async () => {
    const result = await crawlRoot();
    const readme = result.files.find((f) => f.path === 'README.md');
    expect(readme?.sha).toBe(sha256('# Hello'));
  });
  it('detects language from extension', async () => {
    const result = await crawlRoot();
    const ts = result.files.find((f) => f.path === 'src/index.ts');
    expect(ts?.language).toBe('typescript');
    const md = result.files.find((f) => f.path === 'README.md');
    expect(md?.language).toBe('markdown');
    const json = result.files.find((f) => f.path === 'package.json');
    expect(json?.language).toBe('json');
  });
  it('sets branch to "local"', async () => {
    const result = await crawlRoot();
    expect(result.branch).toBe('local');
  });
  it('sets totalFiles to the count of filtered files', async () => {
    const result = await crawlRoot();
    expect(result.totalFiles).toBe(result.files.length);
  });
  // The local crawler synthesizes commitSha as a SHA-256 over the per-file
  // SHAs, so it is a 64-char hex string and stable for an unchanged file set.
  it('sets commitSha to a non-empty hex string', async () => {
    const result = await crawlRoot();
    expect(result.commitSha).toMatch(/^[0-9a-f]{64}$/);
  });
  it('produces a deterministic commitSha for the same file set', async () => {
    const r1 = await crawlRoot();
    const r2 = await crawlRoot();
    expect(r1.commitSha).toBe(r2.commitSha);
  });
});
// ---------------------------------------------------------------------------
// Filtering — default excludes and extension allow-list
// ---------------------------------------------------------------------------
// Fixture mixes indexable files with default-excluded directories and a
// binary extension; verifies DEFAULT_EXCLUDES and the extension allow-list.
describe('LocalCrawler.crawl() — default filtering', () => {
  beforeEach(async () => {
    root = await makeTempRepo({
      'src/index.ts': 'export {};',
      'dist/bundle.js': 'bundled',
      'node_modules/lodash/index.js': 'lodash',
      '.git/config': '[core]',
      'image.png': '\x89PNG',
      'README.md': '# Docs'
    });
  });
  afterEach(async () => {
    await cleanupTempRepo(root);
  });
  it('excludes files in dist/', async () => {
    const result = await crawlRoot();
    expect(result.files.every((f) => !f.path.startsWith('dist/'))).toBe(true);
  });
  it('excludes files in node_modules/', async () => {
    const result = await crawlRoot();
    expect(result.files.every((f) => !f.path.startsWith('node_modules/'))).toBe(true);
  });
  it('excludes files in .git/', async () => {
    const result = await crawlRoot();
    expect(result.files.every((f) => !f.path.startsWith('.git/'))).toBe(true);
  });
  it('excludes non-indexable extensions like .png', async () => {
    const result = await crawlRoot();
    expect(result.files.every((f) => !f.path.endsWith('.png'))).toBe(true);
  });
  // (Title means: skippedFiles = enumerated − kept.)
  it('reports skippedFiles = total enumerated filtered', async () => {
    const result = await crawlRoot();
    // dist/, node_modules/, .git/, .png = 4 skipped
    // src/index.ts + README.md = 2 kept
    expect(result.skippedFiles).toBe(4);
    expect(result.totalFiles).toBe(2);
  });
});
// ---------------------------------------------------------------------------
// Size limit
// ---------------------------------------------------------------------------
// Verifies the 500 KB MAX_FILE_SIZE_BYTES cutoff, including the inclusive
// boundary at exactly 500 000 bytes.
describe('LocalCrawler.crawl() — size limit', () => {
  afterEach(async () => {
    await cleanupTempRepo(root);
  });
  it('excludes files larger than MAX_FILE_SIZE_BYTES (500 KB)', async () => {
    // 500_001 bytes of 'x'
    const bigContent = 'x'.repeat(500_001);
    root = await makeTempRepo({
      'big.ts': bigContent,
      'small.ts': 'export const x = 1;'
    });
    const result = await crawlRoot();
    expect(result.files.some((f) => f.path === 'big.ts')).toBe(false);
    expect(result.files.some((f) => f.path === 'small.ts')).toBe(true);
  });
  it('includes files exactly at MAX_FILE_SIZE_BYTES (500 KB)', async () => {
    const edgeContent = 'a'.repeat(500_000);
    root = await makeTempRepo({ 'edge.ts': edgeContent });
    const result = await crawlRoot();
    expect(result.files.some((f) => f.path === 'edge.ts')).toBe(true);
  });
});
// ---------------------------------------------------------------------------
// trueref.json / context7.json config detection
// ---------------------------------------------------------------------------
// Verifies root-level trueref.json / context7.json auto-detection, the
// precedence of caller-supplied config, and graceful handling of bad JSON.
describe('LocalCrawler.crawl() — config file detection', () => {
  afterEach(async () => {
    await cleanupTempRepo(root);
  });
  it('auto-detects trueref.json and applies excludeFiles', async () => {
    root = await makeTempRepo({
      'trueref.json': JSON.stringify({ excludeFiles: ['package.json'] }),
      'src/index.ts': 'export {};',
      'package.json': '{"name":"test"}'
    });
    const result = await crawlRoot();
    expect(result.files.some((f) => f.path === 'package.json')).toBe(false);
    expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
  });
  it('auto-detects context7.json and applies folders allowlist', async () => {
    root = await makeTempRepo({
      'context7.json': JSON.stringify({ folders: ['docs/'] }),
      'src/index.ts': 'export {};',
      'docs/guide.md': '# Guide'
    });
    const result = await crawlRoot();
    expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(false);
    expect(result.files.some((f) => f.path === 'docs/guide.md')).toBe(true);
  });
  it('caller-supplied config takes precedence over discovered config file', async () => {
    root = await makeTempRepo({
      'trueref.json': JSON.stringify({ excludeFiles: ['package.json'] }),
      'src/index.ts': 'export {};',
      'package.json': '{"name":"test"}'
    });
    // Caller provides a config with no exclusions — package.json should appear.
    const result = await crawlRoot({ config: {} });
    expect(result.files.some((f) => f.path === 'package.json')).toBe(true);
  });
  it('applies excludeFolders from config', async () => {
    root = await makeTempRepo({
      'trueref.json': JSON.stringify({ excludeFolders: ['internal/'] }),
      'internal/secret.ts': 'secret',
      'src/public.ts': 'public'
    });
    const result = await crawlRoot();
    expect(result.files.some((f) => f.path.startsWith('internal/'))).toBe(false);
    expect(result.files.some((f) => f.path === 'src/public.ts')).toBe(true);
  });
  it('gracefully handles a malformed config file', async () => {
    root = await makeTempRepo({
      'trueref.json': 'NOT VALID JSON {{{',
      'src/index.ts': 'export {};'
    });
    // Should not throw; falls back to no config.
    const result = await crawlRoot();
    expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
  });
});
// ---------------------------------------------------------------------------
// Progress callback
// ---------------------------------------------------------------------------
// Verifies the onProgress contract: one call per filtered file, processed
// counts 1..total, total constant, and no calls when nothing is crawled.
describe('LocalCrawler.crawl() — progress reporting', () => {
  beforeEach(async () => {
    root = await makeTempRepo({
      'src/a.ts': 'a',
      'src/b.ts': 'b',
      'src/c.ts': 'c'
    });
  });
  afterEach(async () => {
    await cleanupTempRepo(root);
  });
  it('calls onProgress once per filtered file', async () => {
    const calls: Array<[number, number]> = [];
    await crawlRoot({ onProgress: (p, t) => calls.push([p, t]) });
    expect(calls).toHaveLength(3);
  });
  it('increments processed from 1 to totalFiles', async () => {
    const calls: Array<[number, number]> = [];
    await crawlRoot({ onProgress: (p, t) => calls.push([p, t]) });
    const processed = calls.map(([p]) => p);
    expect(processed).toEqual([1, 2, 3]);
  });
  it('keeps total constant across all callback invocations', async () => {
    const totals: number[] = [];
    await crawlRoot({ onProgress: (_, t) => totals.push(t) });
    expect(totals.every((t) => t === totals[0])).toBe(true);
  });
  it('does not call onProgress when no files pass the filter', async () => {
    // Overwrite root with only non-indexable files.
    await fs.rm(root, { recursive: true, force: true });
    root = await makeTempRepo({ 'image.png': '\x89PNG' });
    const calls: number[] = [];
    await crawlRoot({ onProgress: () => calls.push(1) });
    expect(calls).toHaveLength(0);
  });
});
// ---------------------------------------------------------------------------
// Git ref checkout
// ---------------------------------------------------------------------------
/**
 * Build a real git repository in a temp directory with one commit per
 * `history` entry; entries carrying a `tag` are tagged after committing.
 * Returns the repository root path.
 *
 * Layout of `history`:
 *   [{ tag?: string, files: Record<string, string> }, ...]
 */
async function makeGitRepo(
  history: Array<{ tag?: string; files: Record<string, string> }>
): Promise<string> {
  const repoRoot = await fs.mkdtemp(join(tmpdir(), 'trueref-git-test-'));
  const git = async (...args: string[]): Promise<void> => {
    await execFileAsync('git', ['-C', repoRoot, ...args]);
  };
  await git('init', '--initial-branch=main');
  await git('config', 'user.email', 'test@trueref.local');
  await git('config', 'user.name', 'TrueRef Test');
  for (const { tag, files } of history) {
    // Write this commit's files into the working tree.
    for (const [relPath, content] of Object.entries(files)) {
      const target = join(repoRoot, relPath);
      await fs.mkdir(join(target, '..'), { recursive: true });
      await fs.writeFile(target, content, 'utf-8');
    }
    await git('add', '.');
    // --allow-empty keeps the fixture valid even when a step changes nothing.
    await git('commit', '--allow-empty', '-m', `commit for ${tag ?? 'HEAD'}`);
    if (tag) await git('tag', tag);
  }
  return repoRoot;
}
// Exercises the git-worktree path of LocalCrawler.crawl(): checkout at a
// tag/SHA, metadata stamping, working-tree safety, cleanup, and errors.
describe('LocalCrawler.crawl() — git ref checkout', () => {
  // Block-local fixture state shadows the module-level `root`/`crawler` so
  // these tests manage their own git repositories.
  let root: string = '';
  const crawler = new LocalCrawler();
  afterEach(async () => {
    if (root) await cleanupTempRepo(root);
  });
  it('crawls files at a specific tag, not the HEAD state', async () => {
    root = await makeGitRepo([
      { tag: 'v1.0.0', files: { 'src/index.ts': 'export const version = 1;' } },
      { files: { 'src/index.ts': 'export const version = 2;' } }
    ]);
    const result = await crawler.crawl({ rootPath: root, ref: 'v1.0.0' });
    const indexFile = result.files.find((f) => f.path === 'src/index.ts');
    expect(indexFile?.content).toBe('export const version = 1;');
  });
  it('crawls files at a specific commit SHA', async () => {
    root = await makeGitRepo([
      { tag: 'v1.0.0', files: { 'api.ts': 'v1' } },
      { files: { 'api.ts': 'v2' } }
    ]);
    // Resolve the SHA of v1.0.0
    const { stdout } = await execFileAsync('git', ['-C', root, 'rev-parse', 'v1.0.0'], {
      encoding: 'utf-8'
    });
    const sha = stdout.trim();
    const result = await crawler.crawl({ rootPath: root, ref: sha });
    const api = result.files.find((f) => f.path === 'api.ts');
    expect(api?.content).toBe('v1');
  });
  it('sets branch to the ref string in the result', async () => {
    root = await makeGitRepo([{ tag: 'v2.3.1', files: { 'README.md': '# v2' } }]);
    const result = await crawler.crawl({ rootPath: root, ref: 'v2.3.1' });
    expect(result.branch).toBe('v2.3.1');
  });
  it('sets commitSha to the git-resolved SHA (not file-content hash)', async () => {
    root = await makeGitRepo([{ tag: 'v1.0.0', files: { 'a.ts': 'a' } }]);
    const { stdout } = await execFileAsync('git', ['-C', root, 'rev-parse', 'v1.0.0'], {
      encoding: 'utf-8'
    });
    const expectedSha = stdout.trim();
    const result = await crawler.crawl({ rootPath: root, ref: 'v1.0.0' });
    expect(result.commitSha).toBe(expectedSha);
  });
  it('does not modify the working tree', async () => {
    root = await makeGitRepo([
      { tag: 'v1.0.0', files: { 'src/index.ts': 'v1' } },
      { files: { 'src/index.ts': 'v2' } }
    ]);
    // Working tree is at HEAD (v2)
    const before = await fs.readFile(join(root, 'src/index.ts'), 'utf-8');
    await crawler.crawl({ rootPath: root, ref: 'v1.0.0' });
    const after = await fs.readFile(join(root, 'src/index.ts'), 'utf-8');
    expect(before).toBe('v2');
    expect(after).toBe('v2');
  });
  it('removes the temporary worktree after crawling', async () => {
    root = await makeGitRepo([{ tag: 'v1.0.0', files: { 'f.ts': 'x' } }]);
    await crawler.crawl({ rootPath: root, ref: 'v1.0.0' });
    // List remaining worktrees — only the main one should remain.
    const { stdout } = await execFileAsync('git', ['-C', root, 'worktree', 'list', '--porcelain'], {
      encoding: 'utf-8'
    });
    const worktreeCount = stdout.split('\n').filter((l) => l.startsWith('worktree ')).length;
    expect(worktreeCount).toBe(1);
  });
  it('throws NotAGitRepositoryError for a plain directory', async () => {
    const plainDir = await fs.mkdtemp(join(tmpdir(), 'trueref-plain-'));
    root = plainDir; // cleaned up in afterEach
    await expect(crawler.crawl({ rootPath: plainDir, ref: 'v1.0.0' })).rejects.toThrow(
      NotAGitRepositoryError
    );
  });
  it('throws InvalidRefError for a ref that does not exist', async () => {
    root = await makeGitRepo([{ tag: 'v1.0.0', files: { 'f.ts': 'x' } }]);
    await expect(crawler.crawl({ rootPath: root, ref: 'v99.99.99' })).rejects.toThrow(
      InvalidRefError
    );
  });
  it('applies caller-supplied config at the checked-out ref', async () => {
    root = await makeGitRepo([
      {
        tag: 'v1.0.0',
        files: {
          'src/index.ts': 'export {};',
          'package.json': '{"name":"test"}'
        }
      }
    ]);
    // Exclude package.json via caller config
    const result = await crawler.crawl({
      rootPath: root,
      ref: 'v1.0.0',
      config: { excludeFiles: ['package.json'] }
    });
    expect(result.files.some((f) => f.path === 'package.json')).toBe(false);
    expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
  });
  it('reads trueref.json from the checked-out ref', async () => {
    root = await makeGitRepo([
      {
        tag: 'v1.0.0',
        files: {
          'trueref.json': JSON.stringify({ excludeFiles: ['package.json'] }),
          'src/index.ts': 'export {};',
          'package.json': '{"name":"test"}'
        }
      }
    ]);
    const result = await crawler.crawl({ rootPath: root, ref: 'v1.0.0' });
    expect(result.files.some((f) => f.path === 'package.json')).toBe(false);
    expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
  });
});
// ---------------------------------------------------------------------------
// Edge cases
// ---------------------------------------------------------------------------
// Edge cases: empty directory, deep nesting, non-ASCII content, and the
// synthetic commitSha fingerprint's sensitivity to content changes.
describe('LocalCrawler.crawl() — edge cases', () => {
  afterEach(async () => {
    await cleanupTempRepo(root);
  });
  it('returns empty result for an empty directory', async () => {
    root = await makeTempRepo({});
    const result = await crawlRoot();
    expect(result.files).toHaveLength(0);
    expect(result.totalFiles).toBe(0);
    expect(result.skippedFiles).toBe(0);
  });
  it('handles deeply nested directory structures', async () => {
    root = await makeTempRepo({
      'a/b/c/d/deep.ts': 'export const deep = true;'
    });
    const result = await crawlRoot();
    expect(result.files.some((f) => f.path === 'a/b/c/d/deep.ts')).toBe(true);
  });
  it('handles files with UTF-8 content correctly', async () => {
    const utf8Content = 'const greeting = "héllo wörld — 日本語";';
    root = await makeTempRepo({ 'src/unicode.ts': utf8Content });
    const result = await crawlRoot();
    const file = result.files.find((f) => f.path === 'src/unicode.ts');
    expect(file?.content).toBe(utf8Content);
    expect(file?.sha).toBe(sha256(utf8Content));
  });
  it('commitSha differs when file content changes', async () => {
    root = await makeTempRepo({ 'src/index.ts': 'version 1' });
    const r1 = await crawlRoot();
    await fs.writeFile(join(root, 'src/index.ts'), 'version 2', 'utf-8');
    const r2 = await crawlRoot();
    expect(r1.commitSha).not.toBe(r2.commitSha);
  });
  // When nothing passes the filter, the fingerprint is the hash of ''.
  it('commitSha is empty-string hash when no files are crawled', async () => {
    root = await makeTempRepo({ 'image.png': '\x89PNG' });
    const result = await crawlRoot();
    // SHA-256 of an empty string
    expect(result.commitSha).toBe(sha256(''));
  });
});

View File

@@ -0,0 +1,275 @@
/**
* Local Filesystem Crawler (TRUEREF-0004).
*
* Walks a directory tree and enumerates all files, applying the same
* extension and size filters as the GitHub crawler (TRUEREF-0003).
* Reads file contents as UTF-8 strings and computes SHA-256 checksums
* for change detection.
*
* Design decisions:
* - Uses Node.js `fs/promises` and `crypto` — no extra dependencies.
* - Symlinks and special files (devices, sockets, FIFOs) are skipped.
* - `trueref.json` / `context7.json` at the repo root are detected and
* parsed before any other file filtering runs, matching the GitHub crawler.
* - File size for filtering is taken from `stat().size` so the size limit
* is applied before reading file content (saves I/O on large excluded files).
* - `commitSha` is derived from a SHA-256 hash of all per-file checksums,
* giving a deterministic fingerprint of the crawled file set.
*/
import { execFile } from 'node:child_process';
import { createHash } from 'node:crypto';
import { promises as fs } from 'node:fs';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { promisify } from 'node:util';
import { detectLanguage, shouldIndexFile } from './file-filter.js';
import { InvalidRefError, NotAGitRepositoryError } from './types.js';
import type { CrawledFile, CrawlResult, RepoConfig } from './types.js';
const execFileAsync = promisify(execFile);
// ---------------------------------------------------------------------------
// Public options type
// ---------------------------------------------------------------------------
/** Options accepted by LocalCrawler.crawl(). */
export interface LocalCrawlOptions {
  /** Absolute path to the repository root directory. */
  rootPath: string;
  /**
   * Git ref to check out before crawling — a tag name (e.g. "v2.1.0"),
   * a branch name, or a commit SHA. When provided the crawler creates an
   * isolated git worktree at that ref, crawls it, then removes the worktree.
   * The original working tree is never modified.
   * Requires `rootPath` to be inside a git repository.
   */
  ref?: string;
  /**
   * Pre-parsed trueref.json / context7.json configuration, if already loaded.
   * When supplied, auto-detection of a root-level config file is skipped.
   */
  config?: RepoConfig;
  /** Progress callback invoked after each file is read — (processed, total). */
  onProgress?: (processed: number, total: number) => void;
}
// ---------------------------------------------------------------------------
// Internal helpers
// ---------------------------------------------------------------------------
/**
 * Names of config files that control include/exclude rules.
 * Matched against bare relative paths, so only repo-root files qualify.
 */
const CONFIG_FILE_NAMES = new Set(['trueref.json', 'context7.json']);
// ---------------------------------------------------------------------------
// Git helpers
// ---------------------------------------------------------------------------
/**
 * Execute a git command with `cwd` as the repository directory and return
 * its trimmed stdout. A non-zero exit status rejects with the
 * child-process error.
 */
async function runGit(cwd: string, args: string[]): Promise<string> {
  const result = await execFileAsync('git', ['-C', cwd, ...args], { encoding: 'utf-8' });
  return result.stdout.trim();
}
/**
 * SHA-256 hex digest of a UTF-8 string.
 */
function computeSHA256(content: string): string {
  const hasher = createHash('sha256');
  hasher.update(content, 'utf-8');
  return hasher.digest('hex');
}
/**
 * Read `absPath` and parse it as a JSON config.
 * Any read or parse failure is logged and mapped to undefined so a broken
 * config file never aborts a crawl.
 */
async function parseConfigFile(absPath: string): Promise<RepoConfig | undefined> {
  try {
    const rawText = await fs.readFile(absPath, 'utf-8');
    const parsed: unknown = JSON.parse(rawText);
    return parsed as RepoConfig;
  } catch {
    console.warn(`[LocalCrawler] Failed to parse config file: ${absPath}`);
    return undefined;
  }
}
// ---------------------------------------------------------------------------
// LocalCrawler
// ---------------------------------------------------------------------------
export class LocalCrawler {
  /**
   * Crawl a local directory tree and return structured file objects.
   *
   * When `options.ref` is supplied the crawler creates an isolated git
   * worktree checked out at that ref, crawls it, then removes the worktree.
   * The caller's working tree is never modified.
   *
   * @param options - Root path, optional git ref, optional config, and progress callback.
   * @returns CrawlResult with all read files and summary statistics.
   * @throws NotAGitRepositoryError when `ref` is given but `rootPath` is not in a git repo.
   * @throws InvalidRefError when `ref` cannot be resolved or checked out.
   */
  async crawl(options: LocalCrawlOptions): Promise<CrawlResult> {
    const { rootPath, ref } = options;
    if (!ref) {
      // Fast path: crawl the working tree as-is.
      return this.crawlDirectory(rootPath, options.config, options.onProgress, 'local');
    }
    // Git-aware path: verify repo, resolve ref, create worktree, crawl, clean up.
    let worktreePath: string | undefined;
    try {
      // Verify rootPath is inside a git repository.
      await runGit(rootPath, ['rev-parse', '--git-dir']).catch(() => {
        throw new NotAGitRepositoryError(`Not a git repository: ${rootPath}`);
      });
      // Resolve the ref to a concrete commit SHA (validates it exists).
      const commitSha = await runGit(rootPath, ['rev-parse', '--verify', ref]).catch(() => {
        throw new InvalidRefError(`Invalid git ref "${ref}" in repository: ${rootPath}`);
      });
      // Create a temporary isolated worktree at the resolved ref.
      const tmpDir = await fs.mkdtemp(join(tmpdir(), 'trueref-wt-'));
      worktreePath = tmpDir;
      await runGit(rootPath, ['worktree', 'add', '--detach', tmpDir, ref]).catch((err) => {
        throw new InvalidRefError(
          `Cannot create worktree for ref "${ref}": ${err instanceof Error ? err.message : String(err)}`
        );
      });
      // Crawl the worktree and stamp the result with the git-resolved metadata.
      const result = await this.crawlDirectory(
        worktreePath,
        options.config,
        options.onProgress,
        ref
      );
      return { ...result, commitSha };
    } finally {
      if (worktreePath) {
        // Capture in a const so the fallback closure needs no non-null assertion.
        const staleWorktree = worktreePath;
        // Remove the worktree (git also deletes the directory). If git fails —
        // e.g. `worktree add` never succeeded — fall back to deleting the temp
        // directory directly. Both paths are awaited so cleanup completes
        // before crawl() returns instead of floating as an orphan promise.
        await runGit(rootPath, ['worktree', 'remove', '--force', staleWorktree]).catch(
          async () => {
            await fs.rm(staleWorktree, { recursive: true, force: true }).catch(() => {});
          }
        );
      }
    }
  }
  // ---------------------------------------------------------------------------
  // Private — directory crawl
  // ---------------------------------------------------------------------------
  /**
   * Walk `rootPath`, apply filters, read files, and build a CrawlResult.
   * `branch` is embedded verbatim into the returned result.
   */
  private async crawlDirectory(
    rootPath: string,
    callerConfig: RepoConfig | undefined,
    onProgress: LocalCrawlOptions['onProgress'],
    branch: string
  ): Promise<CrawlResult> {
    // Step 1: Walk the directory tree and collect (relPath, size) pairs.
    // Sizes come from stat(), so the size limit is enforced before any read.
    const statCache = new Map<string, number>();
    const allRelPaths = await this.walkDirectory(rootPath, '', statCache);
    // Step 2: Detect trueref.json / context7.json at the repo root first.
    // Only root-level config files are honoured (no directory prefix), and a
    // caller-supplied config takes precedence over a discovered file.
    const configRelPath = allRelPaths.find((p) => CONFIG_FILE_NAMES.has(p));
    let config = callerConfig;
    if (configRelPath && !config) {
      config = await parseConfigFile(join(rootPath, configRelPath));
    }
    // Step 3: Filter files according to extension, size, and config rules.
    const filteredPaths = allRelPaths.filter((relPath) => {
      const size = statCache.get(relPath) ?? 0;
      return shouldIndexFile(relPath, size, config);
    });
    // Step 4: Read file contents and build CrawledFile records. Unreadable
    // files are logged and skipped but still counted toward progress.
    const crawledFiles: CrawledFile[] = [];
    for (const [i, relPath] of filteredPaths.entries()) {
      const absPath = join(rootPath, relPath);
      try {
        const content = await fs.readFile(absPath, 'utf-8');
        const sha = computeSHA256(content);
        crawledFiles.push({
          path: relPath,
          content,
          size: Buffer.byteLength(content, 'utf-8'),
          sha,
          language: detectLanguage(relPath)
        });
      } catch (err) {
        // Separator added so the path and the error message don't run
        // together in the log line (was `${relPath}${err...}`).
        console.warn(
          `[LocalCrawler] Could not read file: ${relPath} — ${err instanceof Error ? err.message : String(err)}`
        );
      }
      onProgress?.(i + 1, filteredPaths.length);
    }
    // Step 5: Build a deterministic repo-level fingerprint from file SHAs.
    const commitSha = computeSHA256(crawledFiles.map((f) => f.sha).join(''));
    return {
      files: crawledFiles,
      totalFiles: filteredPaths.length,
      skippedFiles: allRelPaths.length - filteredPaths.length,
      branch,
      commitSha
    };
  }
  /**
   * Recursively walk a directory and collect relative paths of all regular files.
   * Symlinks and special files (devices, sockets, FIFOs) are silently skipped.
   * Populates `statCache` with file sizes so the caller can filter without a
   * second `stat()` call.
   *
   * @param dir - Absolute path of the directory to read.
   * @param rel - Relative path prefix accumulated during recursion.
   * @param statCache - Mutable map from relative path → byte size.
   */
  private async walkDirectory(
    dir: string,
    rel: string,
    statCache: Map<string, number>
  ): Promise<string[]> {
    let entries;
    try {
      entries = await fs.readdir(dir, { withFileTypes: true });
    } catch {
      // Directory is unreadable (permissions, etc.) — skip silently.
      return [];
    }
    const files: string[] = [];
    for (const entry of entries) {
      // Only descend into plain directories and collect plain files.
      // entry.isFile() / entry.isDirectory() return false for symlinks,
      // devices, sockets, and FIFOs, so those are all implicitly skipped.
      if (!entry.isFile() && !entry.isDirectory()) continue;
      const relPath = rel ? `${rel}/${entry.name}` : entry.name;
      if (entry.isDirectory()) {
        const children = await this.walkDirectory(join(dir, entry.name), relPath, statCache);
        files.push(...children);
      } else {
        // Capture file size from stat so shouldIndexFile can enforce the limit
        // without reading the file.
        try {
          const stat = await fs.stat(join(dir, entry.name));
          statCache.set(relPath, stat.size);
        } catch {
          statCache.set(relPath, 0);
        }
        files.push(relPath);
      }
    }
    return files;
  }
}

View File

@@ -0,0 +1,123 @@
/**
* GitHub API rate-limit tracker and backoff helper (TRUEREF-0003).
*
* Reads X-RateLimit-* headers from every API response and pauses outgoing
* requests when the remaining allowance drops to ≤ 10.
*/
/** Resolve after approximately `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((resolve) => {
    setTimeout(resolve, ms);
  });
}
export class GitHubRateLimiter {
  /** Requests left in the current window; GitHub's documented default ceiling. */
  private remaining = 5000;
  /** Unix epoch (ms) at which the current rate-limit window resets. */
  private resetAt = Date.now();

  /**
   * Update internal counters from the headers of a GitHub API response.
   *
   * Malformed header values are ignored: previously a non-numeric header
   * would store NaN, and `NaN <= 10` is false, silently disabling the
   * limiter. Guarding with Number.isNaN keeps the last known-good state.
   */
  updateFromHeaders(headers: Headers): void {
    const remaining = headers.get('X-RateLimit-Remaining');
    const reset = headers.get('X-RateLimit-Reset');
    if (remaining !== null) {
      const parsed = parseInt(remaining, 10);
      if (!Number.isNaN(parsed)) {
        this.remaining = parsed;
      }
    }
    if (reset !== null) {
      // GitHub returns a Unix epoch in seconds.
      const parsed = parseInt(reset, 10);
      if (!Number.isNaN(parsed)) {
        this.resetAt = parsed * 1000;
      }
    }
  }

  /**
   * If the remaining allowance is critically low (≤ 10), sleep until the
   * rate-limit window resets (plus a 1 s buffer).
   */
  async waitIfNeeded(): Promise<void> {
    if (this.remaining <= 10) {
      const waitMs = Math.max(0, this.resetAt - Date.now()) + 1000;
      await sleep(waitMs);
    }
  }

  /** Remaining requests in the current window (for testing). */
  get remainingRequests(): number {
    return this.remaining;
  }

  /** Reset timestamp as a Unix epoch in ms (for testing). */
  get resetTimestamp(): number {
    return this.resetAt;
  }
}
/**
 * Exponential-backoff retry wrapper for network-level errors.
 *
 * Retries up to `maxAttempts` times (default 3) with 1 s, 2 s, 4 s delays.
 *
 * @param fn - Async function to attempt.
 * @param maxAttempts - Maximum number of attempts (default 3); must be ≥ 1.
 * @param isRetryable - Optional predicate; when it returns false for a given
 *                      error the error is re-thrown immediately without further
 *                      retries. Defaults to retrying all errors.
 * @throws RangeError when `maxAttempts` is less than 1 (previously this
 *         rejected with `undefined`, which is hard to diagnose).
 */
export async function withRetry<T>(
  fn: () => Promise<T>,
  maxAttempts = 3,
  isRetryable: (err: unknown) => boolean = () => true
): Promise<T> {
  if (maxAttempts < 1) {
    throw new RangeError(`maxAttempts must be >= 1, got ${maxAttempts}`);
  }
  let lastError: unknown;
  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    try {
      return await fn();
    } catch (err) {
      if (!isRetryable(err)) throw err;
      lastError = err;
      // Back off 1 s, 2 s, 4 s, … but never sleep after the final attempt.
      if (attempt < maxAttempts - 1) {
        await sleep(1000 * 2 ** attempt);
      }
    }
  }
  throw lastError;
}
/**
 * Async semaphore — limits the number of concurrently executing promises.
 * Waiters are granted permits in FIFO order.
 */
export class Semaphore {
  /** Permits currently available. */
  private count: number;
  /** Resolvers for callers waiting on a permit, oldest first. */
  private readonly queue: Array<() => void> = [];

  constructor(concurrency: number) {
    this.count = concurrency;
  }

  /** Resolves once a permit is available; the caller must later release(). */
  async acquire(): Promise<void> {
    if (this.count <= 0) {
      // No permit free — park until release() hands one over.
      return new Promise((grant) => this.queue.push(grant));
    }
    this.count -= 1;
  }

  /** Hand the permit to the oldest waiter, or return it to the pool. */
  release(): void {
    const waiter = this.queue.shift();
    if (waiter === undefined) {
      this.count += 1;
    } else {
      waiter();
    }
  }

  /** Acquire, run `fn`, and always release — even if `fn` throws. */
  async run<T>(fn: () => Promise<T>): Promise<T> {
    await this.acquire();
    try {
      return await fn();
    } finally {
      this.release();
    }
  }
}

View File

@@ -0,0 +1,135 @@
/**
* Types for the GitHub repository crawler (TRUEREF-0003).
*/
import type { TrueRefConfig } from '$lib/types';
// Alias of the imported TrueRefConfig so crawler modules can reference the
// parsed trueref.json / context7.json configuration under one consistent name.
export type RepoConfig = TrueRefConfig;
// ---------------------------------------------------------------------------
// Core crawler data types
// ---------------------------------------------------------------------------
/** A single file downloaded (or read) by a crawler, ready for indexing. */
export interface CrawledFile {
  /** Relative path within the repo, e.g. "src/index.ts" */
  path: string;
  /** UTF-8 file content */
  content: string;
  /** File size in bytes */
  size: number;
  /**
   * Content checksum: the GitHub blob SHA for remote crawls, or a SHA-256
   * of the file content for local filesystem crawls.
   */
  sha: string;
  /** Programming language detected from the file extension */
  language: string;
}
/** Aggregate outcome of a single crawl run. */
export interface CrawlResult {
  /** Successfully downloaded files */
  files: CrawledFile[];
  /** Total files that matched filters */
  totalFiles: number;
  /** Files that were filtered out or too large */
  skippedFiles: number;
  /** Branch or tag that was crawled */
  branch: string;
  /**
   * HEAD commit SHA for GitHub crawls; local crawls instead store a
   * deterministic SHA-256 fingerprint built from the per-file SHAs.
   */
  commitSha: string;
}
/** Parameters controlling a GitHub repository crawl. */
export interface CrawlOptions {
  /** Repository owner (user or organization) */
  owner: string;
  /** Repository name */
  repo: string;
  /** Branch, tag, or commit SHA; defaults to repo default branch */
  ref?: string;
  /** GitHub PAT for private repos */
  token?: string;
  /** Parsed trueref.json / context7.json configuration */
  config?: RepoConfig;
  /** Progress callback invoked after each file is processed */
  onProgress?: (processed: number, total: number) => void;
}
// ---------------------------------------------------------------------------
// GitHub API response shapes (minimal — only fields we use)
// ---------------------------------------------------------------------------
/** Subset of the GET /repos/{owner}/{repo} response that we consume. */
export interface GitHubRepoResponse {
  /** Name of the repository's default branch, e.g. "main" */
  default_branch: string;
  /** Number of stars on the repository */
  stargazers_count: number;
}
/** One entry of a git tree listing. */
export interface GitHubTreeItem {
  /** Path relative to the repository root */
  path: string;
  /** "blob" for files, "tree" for directories */
  type: 'blob' | 'tree';
  /** Byte size — present for blobs, absent for trees */
  size?: number;
  /** Git object SHA of this entry */
  sha: string;
  /** API URL of the underlying git object */
  url: string;
}
/** Response shape of the GET /repos/{owner}/{repo}/git/trees/{sha} endpoint. */
export interface GitHubTreeResponse {
  /** Listing of tree entries */
  tree: GitHubTreeItem[];
  /** True when GitHub truncated the listing because the tree is too large */
  truncated: boolean;
}
/** Response shape for a single file from the GitHub contents/blobs API. */
export interface GitHubContentResponse {
  /** File content, encoded as described by `encoding` */
  content: string;
  /** Encoding of `content` — presumably "base64"; confirm at call sites */
  encoding: string;
  /** File size in bytes */
  size: number;
  /** Git blob SHA of the file */
  sha: string;
}
// ---------------------------------------------------------------------------
// Domain errors
// ---------------------------------------------------------------------------
/** The requested repository does not exist or is not visible to the caller. */
export class RepositoryNotFoundError extends Error {
  /** Stable machine-readable code for programmatic handling. */
  readonly code = 'REPOSITORY_NOT_FOUND';
  constructor(message: string) {
    super(message);
    this.name = 'RepositoryNotFoundError';
  }
}
/** Authentication failed — e.g. a missing or invalid token. */
export class AuthenticationError extends Error {
  /** Stable machine-readable code for programmatic handling. */
  readonly code = 'AUTHENTICATION_ERROR';
  constructor(message: string) {
    super(message);
    this.name = 'AuthenticationError';
  }
}
/** The caller is authenticated but lacks permission for the resource. */
export class PermissionError extends Error {
  /** Stable machine-readable code for programmatic handling. */
  readonly code = 'PERMISSION_ERROR';
  constructor(message: string) {
    super(message);
    this.name = 'PermissionError';
  }
}
/** The GitHub API rate limit has been exhausted. */
export class RateLimitError extends Error {
  /** Stable machine-readable code for programmatic handling. */
  readonly code = 'RATE_LIMIT_ERROR';
  constructor(
    message: string,
    // Epoch timestamp at which the rate-limit window resets — presumably ms,
    // matching GitHubRateLimiter.resetTimestamp; confirm at throw sites.
    public readonly resetAt: number
  ) {
    super(message);
    this.name = 'RateLimitError';
  }
}
/** The target path or URL does not point at a git repository. */
export class NotAGitRepositoryError extends Error {
  /** Stable machine-readable code for programmatic handling. */
  readonly code = 'NOT_A_GIT_REPOSITORY';
  constructor(message: string) {
    super(message);
    this.name = 'NotAGitRepositoryError';
  }
}
/** The requested ref (branch, tag, or commit SHA) could not be resolved. */
export class InvalidRefError extends Error {
  /** Stable machine-readable code for programmatic handling. */
  readonly code = 'INVALID_REF';
  constructor(message: string) {
    super(message);
    this.name = 'InvalidRefError';
  }
}