feat(TRUEREF-0003-0004): implement GitHub and local filesystem crawlers

- GitHub crawler with rate limiting, semaphore concurrency, retry logic
- File filtering by extension, size, and trueref.json rules
- Local filesystem crawler with SHA-256 checksums and progress callbacks
- Shared types and file filter logic between both crawlers

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Giancarmine Salucci
2026-03-23 09:06:07 +01:00
parent cb253ffe98
commit 1c15d6c474
7 changed files with 2308 additions and 0 deletions

View File

@@ -0,0 +1,183 @@
/**
* File filtering logic for the GitHub crawler (TRUEREF-0003).
*
* Determines whether a file in the repository tree should be downloaded
* and indexed based on its extension, size, and the trueref.json config.
*/
import { extname, basename } from 'node:path';
import type { RepoConfig } from './types.js';
// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------
/** File extensions that the indexer can meaningfully process. */
/** File extensions that the indexer can meaningfully process. */
export const INDEXABLE_EXTENSIONS = new Set([
  // Documentation
  '.md', '.mdx', '.txt', '.rst',
  // Code
  '.ts', '.tsx', '.js', '.jsx', '.py', '.rb', '.go', '.rs', '.java', '.cs',
  '.cpp', '.c', '.h', '.swift', '.kt', '.php', '.scala', '.clj', '.ex', '.exs',
  '.sh', '.bash', '.zsh', '.fish',
  // Config / data
  '.json', '.yaml', '.yml', '.toml',
  // Web
  '.html', '.css', '.svelte', '.vue'
]);
/** Maximum file size we are willing to download (500 KB). */
export const MAX_FILE_SIZE_BYTES = 500_000;
/**
 * Default path prefixes that are always excluded regardless of config.
 * These directories contain generated or dependency files that should never
 * be indexed.
 */
const DEFAULT_EXCLUDES: string[] = [
  'node_modules/',
  '.git/',
  'dist/',
  'build/',
  'coverage/',
  '.next/',
  '__pycache__/',
  'vendor/',
  'target/',
  '.cache/'
];
// ---------------------------------------------------------------------------
// Language detection
// ---------------------------------------------------------------------------
const EXTENSION_TO_LANGUAGE: Record<string, string> = {
  '.ts': 'typescript',
  '.tsx': 'typescript',
  '.js': 'javascript',
  '.jsx': 'javascript',
  '.py': 'python',
  '.rb': 'ruby',
  '.go': 'go',
  '.rs': 'rust',
  '.java': 'java',
  '.cs': 'csharp',
  '.cpp': 'cpp',
  '.c': 'c',
  '.h': 'c',
  '.swift': 'swift',
  '.kt': 'kotlin',
  '.php': 'php',
  '.scala': 'scala',
  '.clj': 'clojure',
  '.ex': 'elixir',
  '.exs': 'elixir',
  '.sh': 'shell',
  '.bash': 'shell',
  '.zsh': 'shell',
  '.fish': 'shell',
  '.json': 'json',
  '.yaml': 'yaml',
  '.yml': 'yaml',
  '.toml': 'toml',
  '.html': 'html',
  '.css': 'css',
  '.svelte': 'svelte',
  '.vue': 'vue',
  '.md': 'markdown',
  '.mdx': 'markdown',
  '.txt': 'text',
  '.rst': 'rst'
};
/**
 * Detect a human-readable language name from a file extension.
 * Matching is case-insensitive ('FILE.TS' → 'typescript').
 * Returns an empty string when the extension is unknown.
 *
 * @param filePath - Repository-relative file path.
 * @returns Language identifier (e.g. 'typescript') or '' when unknown.
 */
export function detectLanguage(filePath: string): string {
  const ext = extname(filePath).toLowerCase();
  return EXTENSION_TO_LANGUAGE[ext] ?? '';
}
// ---------------------------------------------------------------------------
// Filter predicate
// ---------------------------------------------------------------------------
/**
 * Test whether a config rule matches a path, treating the rule first as a
 * literal path prefix and then as a regular expression.
 *
 * Config rules are user-supplied strings, so an invalid regex (e.g. a folder
 * literally named 'c++') must not crash the crawl: RegExp construction errors
 * are swallowed and count as "no regex match" — the prefix check above
 * remains the only test for such rules.
 */
function ruleMatchesPath(rule: string, filePath: string): boolean {
  if (filePath.startsWith(rule)) return true;
  try {
    return new RegExp(rule).test(filePath);
  } catch {
    // Rule is not a valid regex; only the prefix comparison applies.
    return false;
  }
}
/**
 * Decide whether a file from the repository tree should be downloaded.
 *
 * Rules (applied in order):
 *   1. Must have an indexable extension.
 *   2. Must not exceed the size limit.
 *   3. Must not match config.excludeFiles (exact basename match).
 *   4. Must not be under a config.excludeFolders path / regex.
 *   5. Must be under a config.folders allowlist path / regex (if specified).
 *   6. Must not start with a default-excluded prefix.
 *
 * @param filePath - Repository-relative path of the candidate file.
 * @param fileSize - Size in bytes as reported by the tree API.
 * @param config - Optional trueref.json filtering rules.
 * @returns true when the file should be downloaded and indexed.
 */
export function shouldIndexFile(
  filePath: string,
  fileSize: number,
  config?: RepoConfig
): boolean {
  const ext = extname(filePath).toLowerCase();
  // 1. Extension allow-list
  if (!INDEXABLE_EXTENSIONS.has(ext)) return false;
  // 2. Size limit
  if (fileSize > MAX_FILE_SIZE_BYTES) return false;
  // 3. Config excludeFiles — exact basename match
  if (config?.excludeFiles?.includes(basename(filePath))) return false;
  // 4. Config excludeFolders — prefix or regex match
  if (config?.excludeFolders?.some((folder) => ruleMatchesPath(folder, filePath))) {
    return false;
  }
  // 5. Config folders allowlist — if provided, the file must match at least one.
  //    An empty array imposes no restriction.
  if (config?.folders?.length) {
    const inAllowedFolder = config.folders.some((folder) => ruleMatchesPath(folder, filePath));
    if (!inAllowedFolder) return false;
  }
  // 6. Default excludes
  if (DEFAULT_EXCLUDES.some((prefix) => filePath.startsWith(prefix))) return false;
  return true;
}

View File

@@ -0,0 +1,561 @@
/**
* Unit tests for the GitHub repository crawler (TRUEREF-0003).
*
* All GitHub API calls are intercepted via vi.stubGlobal('fetch', ...) so
* that no real network traffic is produced.
*/
import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
import { crawl } from './github.crawler.js';
import { shouldIndexFile, detectLanguage, INDEXABLE_EXTENSIONS, MAX_FILE_SIZE_BYTES } from './file-filter.js';
import { GitHubRateLimiter, Semaphore, withRetry } from './rate-limiter.js';
import {
AuthenticationError,
PermissionError,
RepositoryNotFoundError
} from './types.js';
// ---------------------------------------------------------------------------
// Mock fetch helpers
// ---------------------------------------------------------------------------
type FetchHandler = (url: string, init?: RequestInit) => Response;
/** Install a synchronous handler as the global `fetch` mock. */
function stubFetch(handler: FetchHandler) {
  const mocked = vi.fn((url: string, init?: RequestInit) =>
    Promise.resolve(handler(url, init))
  );
  vi.stubGlobal('fetch', mocked);
}
/** Default GitHub rate-limit headers merged into every mocked response. */
function rateLimitHeaders(): Record<string, string> {
  return {
    'X-RateLimit-Remaining': '4999',
    'X-RateLimit-Reset': String(Math.floor(Date.now() / 1000) + 3600)
  };
}
/** Build a JSON response carrying the default rate-limit headers. */
function jsonResponse(body: unknown, status = 200, headers: Record<string, string> = {}): Response {
  return new Response(JSON.stringify(body), {
    status,
    headers: { 'Content-Type': 'application/json', ...rateLimitHeaders(), ...headers }
  });
}
/** Build a plain-text response carrying the default rate-limit headers. */
function textResponse(body: string, status = 200, headers: Record<string, string> = {}): Response {
  return new Response(body, {
    status,
    headers: { 'Content-Type': 'text/plain', ...rateLimitHeaders(), ...headers }
  });
}
// ---------------------------------------------------------------------------
// Fixtures
// ---------------------------------------------------------------------------
// Minimal repo-metadata payload: only the fields the crawler reads.
const REPO_INFO = {
  default_branch: 'main',
  stargazers_count: 42
};
// Mixed tree fixture: four indexable blobs plus entries the filter must skip
// (dist/, node_modules/, a .png) and one non-blob 'tree' entry.
const TREE_RESPONSE = {
  tree: [
    { path: 'README.md', type: 'blob', size: 1024, sha: 'sha-readme', url: '' },
    { path: 'src/index.ts', type: 'blob', size: 512, sha: 'sha-index', url: '' },
    { path: 'src/utils.ts', type: 'blob', size: 256, sha: 'sha-utils', url: '' },
    { path: 'package.json', type: 'blob', size: 128, sha: 'sha-pkg', url: '' },
    { path: 'dist/bundle.js', type: 'blob', size: 9999, sha: 'sha-dist', url: '' }, // excluded by default
    { path: 'node_modules/lodash/index.js', type: 'blob', size: 100, sha: 'sha-nm', url: '' }, // excluded
    { path: 'image.png', type: 'blob', size: 4096, sha: 'sha-img', url: '' }, // non-indexable
    { path: 'src', type: 'tree', size: 0, sha: 'sha-src-tree', url: '' }
  ],
  truncated: false // a truncated tree would trigger the crawler's subtree fallback
};
// SHA served by the mocked commits endpoint.
const COMMIT_SHA = 'deadbeef1234567890abcdef';
// ---------------------------------------------------------------------------
// shouldIndexFile unit tests
// ---------------------------------------------------------------------------
describe('shouldIndexFile()', () => {
  it('returns true for a .ts file within size limit', () =>
    expect(shouldIndexFile('src/index.ts', 1000)).toBe(true));
  it('returns false for a .png file (non-indexable extension)', () =>
    expect(shouldIndexFile('assets/logo.png', 100)).toBe(false));
  it('returns false when file exceeds MAX_FILE_SIZE_BYTES', () =>
    expect(shouldIndexFile('big.ts', MAX_FILE_SIZE_BYTES + 1)).toBe(false));
  it('returns false for a file in node_modules/', () =>
    expect(shouldIndexFile('node_modules/lodash/index.js', 100)).toBe(false));
  it('returns false for a file in dist/', () =>
    expect(shouldIndexFile('dist/bundle.js', 100)).toBe(false));
  it('respects config.excludeFiles (exact basename)', () =>
    expect(shouldIndexFile('src/secret.ts', 100, { excludeFiles: ['secret.ts'] })).toBe(false));
  it('does not exclude a file whose basename merely contains the excluded name', () =>
    expect(shouldIndexFile('src/not-secret.ts', 100, { excludeFiles: ['secret.ts'] })).toBe(true));
  it('respects config.excludeFolders prefix', () =>
    expect(shouldIndexFile('internal/config.ts', 100, { excludeFolders: ['internal/'] })).toBe(false));
  it('allows files outside of config.excludeFolders', () =>
    expect(shouldIndexFile('public/api.ts', 100, { excludeFolders: ['internal/'] })).toBe(true));
  it('restricts to config.folders allowlist when specified', () => {
    const allowlistOnly = { folders: ['docs/'] };
    expect(shouldIndexFile('src/index.ts', 100, allowlistOnly)).toBe(false);
    expect(shouldIndexFile('docs/guide.md', 100, allowlistOnly)).toBe(true);
  });
  it('returns true when config.folders is an empty array (no restriction)', () =>
    expect(shouldIndexFile('src/index.ts', 100, { folders: [] })).toBe(true));
  it('handles all default-excluded directories', () => {
    // One representative path per default-excluded prefix.
    [
      'node_modules/pkg/index.js',
      '.git/config',
      'dist/out.js',
      'build/app.js',
      'coverage/lcov.info',
      '.next/server.js',
      '__pycache__/mod.py',
      'vendor/lib.go',
      'target/release.rs',
      '.cache/file.ts'
    ].forEach((path) => {
      expect(shouldIndexFile(path, 100), `should exclude ${path}`).toBe(false);
    });
  });
  it('INDEXABLE_EXTENSIONS covers all expected types', () => {
    ['.md', '.ts', '.py', '.go', '.rs', '.json', '.svelte'].forEach((ext) => {
      expect(INDEXABLE_EXTENSIONS.has(ext), `missing extension ${ext}`).toBe(true);
    });
  });
});
// ---------------------------------------------------------------------------
// detectLanguage unit tests
// ---------------------------------------------------------------------------
describe('detectLanguage()', () => {
  // Shared one-line assertion helper for the mapping cases below.
  const expectLang = (file: string, lang: string): void => {
    expect(detectLanguage(file)).toBe(lang);
  };
  it('detects typescript', () => expectLang('foo.ts', 'typescript'));
  it('detects tsx as typescript', () => expectLang('foo.tsx', 'typescript'));
  it('detects javascript', () => expectLang('foo.js', 'javascript'));
  it('detects python', () => expectLang('foo.py', 'python'));
  it('detects go', () => expectLang('foo.go', 'go'));
  it('detects rust', () => expectLang('foo.rs', 'rust'));
  it('detects markdown', () => expectLang('README.md', 'markdown'));
  it('detects svelte', () => expectLang('App.svelte', 'svelte'));
  it('detects yaml', () => expectLang('config.yaml', 'yaml'));
  it('returns empty string for unknown extension', () => expectLang('file.xyz', ''));
  it('is case-insensitive for extensions', () => expectLang('FILE.TS', 'typescript'));
});
// ---------------------------------------------------------------------------
// GitHubRateLimiter unit tests
// ---------------------------------------------------------------------------
describe('GitHubRateLimiter', () => {
  it('defaults to 5000 remaining requests', () => {
    expect(new GitHubRateLimiter().remainingRequests).toBe(5000);
  });
  it('updates remaining and resetAt from headers', () => {
    const rl = new GitHubRateLimiter();
    const epoch = Math.floor(Date.now() / 1000) + 3600;
    rl.updateFromHeaders(
      new Headers({
        'X-RateLimit-Remaining': '42',
        'X-RateLimit-Reset': String(epoch)
      })
    );
    expect(rl.remainingRequests).toBe(42);
    // resetTimestamp is stored in milliseconds, headers carry epoch seconds.
    expect(rl.resetTimestamp).toBe(epoch * 1000);
  });
  it('does not mutate state when headers are absent', () => {
    const rl = new GitHubRateLimiter();
    rl.updateFromHeaders(new Headers());
    expect(rl.remainingRequests).toBe(5000);
  });
  it('waitIfNeeded resolves immediately when remaining > 10', async () => {
    const rl = new GitHubRateLimiter();
    const before = Date.now();
    await rl.waitIfNeeded();
    const elapsed = Date.now() - before;
    expect(elapsed).toBeLessThan(100);
  });
});
// ---------------------------------------------------------------------------
// Semaphore unit tests
// ---------------------------------------------------------------------------
describe('Semaphore', () => {
  it('allows up to concurrency tasks to run simultaneously', async () => {
    const sem = new Semaphore(2);
    let running = 0;
    let peak = 0;
    // Each job records the highest number of concurrently-running bodies seen.
    const job = () =>
      sem.run(async () => {
        running += 1;
        peak = Math.max(peak, running);
        await new Promise((resolve) => setTimeout(resolve, 10));
        running -= 1;
      });
    await Promise.all([job(), job(), job(), job()]);
    expect(peak).toBeLessThanOrEqual(2);
  });
  it('resolves all tasks even when queued', async () => {
    const sem = new Semaphore(1);
    const completed: number[] = [];
    const jobs = [1, 2, 3].map((n) =>
      sem.run(async () => {
        completed.push(n);
      })
    );
    await Promise.all(jobs);
    expect(completed).toHaveLength(3);
  });
});
// ---------------------------------------------------------------------------
// withRetry unit tests
// ---------------------------------------------------------------------------
describe('withRetry()', () => {
  it('returns the result on first success', async () => {
    const value = await withRetry(() => Promise.resolve(42));
    expect(value).toBe(42);
  });
  it('retries on failure and returns eventual success', async () => {
    let attempts = 0;
    // Fails twice, succeeds on the third attempt.
    const flaky = async () => {
      attempts += 1;
      if (attempts < 3) throw new Error('transient');
      return 'ok';
    };
    const outcome = await withRetry(flaky, 3);
    expect(outcome).toBe('ok');
    expect(attempts).toBe(3);
  });
  it('throws after exhausting all attempts', async () => {
    const alwaysFailing = () => Promise.reject(new Error('always fails'));
    await expect(withRetry(alwaysFailing, 3)).rejects.toThrow('always fails');
  });
});
// ---------------------------------------------------------------------------
// crawl() integration tests (fetch mocked)
// ---------------------------------------------------------------------------
describe('crawl()', () => {
  // Fake timers keep any retry/backoff sleeps from slowing the suite; mocks
  // and timers are restored after every test so cases stay independent.
  beforeEach(() => {
    vi.useFakeTimers();
  });
  afterEach(() => {
    vi.restoreAllMocks();
    vi.useRealTimers();
  });
  // Happy-path fetch mock for owner/repo on branch 'main': repo info, commit
  // SHA, recursive tree, and raw file content. Individual tests can swap a
  // single response by keying `overrides` on the exact request URL.
  function setupDefaultMocks(overrides: Partial<Record<string, Response>> = {}) {
    stubFetch((url) => {
      // Repo info
      if (url === 'https://api.github.com/repos/owner/repo') {
        return overrides[url] ?? jsonResponse(REPO_INFO);
      }
      // Commit SHA
      if (url === 'https://api.github.com/repos/owner/repo/commits/main') {
        return overrides[url] ?? textResponse(COMMIT_SHA);
      }
      // File tree
      if (url.startsWith('https://api.github.com/repos/owner/repo/git/trees/main')) {
        return overrides[url] ?? jsonResponse(TREE_RESPONSE);
      }
      // Raw content (raw.githubusercontent.com)
      if (url.startsWith('https://raw.githubusercontent.com/')) {
        // Segments 0-5 are scheme/host/owner/repo/ref; the rest is the path.
        const filePath = url.split('/').slice(6).join('/');
        return overrides[url] ?? textResponse(`// content of ${filePath}`);
      }
      return new Response('not found', { status: 404 });
    });
  }
  it('returns files that pass the filter', async () => {
    setupDefaultMocks();
    const result = await crawl({ owner: 'owner', repo: 'repo' });
    // dist/ and node_modules/ should be excluded; .png should be excluded.
    // Expected: README.md, src/index.ts, src/utils.ts, package.json
    expect(result.files.length).toBeGreaterThanOrEqual(4);
    expect(result.files.every((f) => !f.path.startsWith('dist/'))).toBe(true);
    expect(result.files.every((f) => !f.path.startsWith('node_modules/'))).toBe(true);
    expect(result.files.every((f) => !f.path.endsWith('.png'))).toBe(true);
  });
  it('sets branch to the default_branch when no ref is given', async () => {
    setupDefaultMocks();
    const result = await crawl({ owner: 'owner', repo: 'repo' });
    // REPO_INFO.default_branch is 'main'.
    expect(result.branch).toBe('main');
  });
  it('uses the provided ref when specified', async () => {
    // Custom mock: the tree/commits endpoints answer for ref 'v2.0.0' only.
    stubFetch((url) => {
      if (url === 'https://api.github.com/repos/owner/repo') {
        return jsonResponse(REPO_INFO);
      }
      if (url.includes('/git/trees/v2.0.0')) {
        return jsonResponse({ tree: [], truncated: false });
      }
      if (url.includes('/commits/v2.0.0')) {
        return textResponse('tagsha');
      }
      return textResponse('content');
    });
    const result = await crawl({ owner: 'owner', repo: 'repo', ref: 'v2.0.0' });
    expect(result.branch).toBe('v2.0.0');
  });
  it('populates commitSha from the commits endpoint', async () => {
    setupDefaultMocks();
    const result = await crawl({ owner: 'owner', repo: 'repo' });
    expect(result.commitSha).toBe(COMMIT_SHA);
  });
  it('sets correct sha on each CrawledFile from the tree', async () => {
    setupDefaultMocks();
    const result = await crawl({ owner: 'owner', repo: 'repo' });
    const readme = result.files.find((f) => f.path === 'README.md');
    expect(readme).toBeDefined();
    // The blob sha must come from the tree entry, not the commit.
    expect(readme!.sha).toBe('sha-readme');
  });
  it('attaches language to each CrawledFile', async () => {
    setupDefaultMocks();
    const result = await crawl({ owner: 'owner', repo: 'repo' });
    const indexTs = result.files.find((f) => f.path === 'src/index.ts');
    expect(indexTs?.language).toBe('typescript');
    const readme = result.files.find((f) => f.path === 'README.md');
    expect(readme?.language).toBe('markdown');
  });
  it('reports progress via onProgress callback', async () => {
    setupDefaultMocks();
    const calls: Array<[number, number]> = [];
    await crawl({
      owner: 'owner',
      repo: 'repo',
      onProgress: (p, t) => calls.push([p, t])
    });
    expect(calls.length).toBeGreaterThan(0);
    // Total must remain constant across all calls.
    const totals = calls.map(([, t]) => t);
    expect(totals.every((t) => t === totals[0])).toBe(true);
  });
  it('skips files that fail to download without throwing', async () => {
    stubFetch((url) => {
      if (url === 'https://api.github.com/repos/owner/repo') {
        return jsonResponse(REPO_INFO);
      }
      if (url.includes('/git/trees/main')) {
        return jsonResponse({
          tree: [{ path: 'src/index.ts', type: 'blob', size: 100, sha: 'sha1', url: '' }],
          truncated: false
        });
      }
      if (url.includes('/commits/main')) {
        return textResponse(COMMIT_SHA);
      }
      // All content downloads fail.
      return new Response('error', { status: 500 });
    });
    // Should not throw; just return zero files.
    const result = await crawl({ owner: 'owner', repo: 'repo' });
    expect(result.files).toHaveLength(0);
    // totalFiles still counts the blob that passed the filter but failed to
    // download.
    expect(result.totalFiles).toBe(1);
  });
  it('throws RepositoryNotFoundError on 404', async () => {
    stubFetch((url) => {
      if (url === 'https://api.github.com/repos/owner/missing') {
        return jsonResponse({ message: 'Not Found' }, 404, {
          'X-RateLimit-Remaining': '4999',
          'X-RateLimit-Reset': String(Math.floor(Date.now() / 1000) + 3600)
        });
      }
      return new Response('not found', { status: 404 });
    });
    await expect(crawl({ owner: 'owner', repo: 'missing' })).rejects.toThrow(
      RepositoryNotFoundError
    );
  });
  it('throws AuthenticationError on 401', async () => {
    stubFetch(() =>
      new Response('Unauthorized', {
        status: 401,
        headers: {
          'X-RateLimit-Remaining': '0',
          'X-RateLimit-Reset': String(Math.floor(Date.now() / 1000) + 3600)
        }
      })
    );
    await expect(crawl({ owner: 'owner', repo: 'repo', token: 'bad-token' })).rejects.toThrow(
      AuthenticationError
    );
  });
  it('throws PermissionError on 403 without rate-limit exhaustion', async () => {
    // Remaining quota > 0, so a 403 must be treated as a permissions problem
    // rather than a rate limit.
    stubFetch(() =>
      new Response('Forbidden', {
        status: 403,
        headers: {
          'X-RateLimit-Remaining': '100',
          'X-RateLimit-Reset': String(Math.floor(Date.now() / 1000) + 3600)
        }
      })
    );
    await expect(crawl({ owner: 'owner', repo: 'repo' })).rejects.toThrow(PermissionError);
  });
  it('respects config.folders allowlist when provided', async () => {
    setupDefaultMocks();
    const result = await crawl({
      owner: 'owner',
      repo: 'repo',
      config: { folders: ['src/'] }
    });
    // Only src/ files should be present.
    expect(result.files.every((f) => f.path.startsWith('src/'))).toBe(true);
  });
  it('applies config.excludeFiles filter', async () => {
    setupDefaultMocks();
    const result = await crawl({
      owner: 'owner',
      repo: 'repo',
      config: { excludeFiles: ['package.json'] }
    });
    expect(result.files.some((f) => f.path === 'package.json')).toBe(false);
  });
  it('returns correct skippedFiles count', async () => {
    setupDefaultMocks();
    const result = await crawl({ owner: 'owner', repo: 'repo' });
    // dist/, node_modules/, and .png are the excluded items = 3
    expect(result.skippedFiles).toBe(3);
  });
  it('uses auth token in requests to GitHub API', async () => {
    const capturedHeaders: Record<string, string>[] = [];
    stubFetch((url, init) => {
      // Record the headers of every outgoing request for later inspection.
      const headers = Object.fromEntries(
        Object.entries((init?.headers as Record<string, string>) ?? {})
      );
      capturedHeaders.push(headers);
      if (url === 'https://api.github.com/repos/owner/repo') {
        return jsonResponse(REPO_INFO);
      }
      if (url.includes('/git/trees/main')) {
        return jsonResponse({ tree: [], truncated: false });
      }
      if (url.includes('/commits/main')) {
        return textResponse(COMMIT_SHA);
      }
      return textResponse('content');
    });
    await crawl({ owner: 'owner', repo: 'repo', token: 'ghp_mysecrettoken' });
    const apiCalls = capturedHeaders.filter((h) => h.Authorization);
    expect(apiCalls.length).toBeGreaterThan(0);
    expect(apiCalls[0].Authorization).toBe('Bearer ghp_mysecrettoken');
  });
  it('handles a tree with zero indexable files gracefully', async () => {
    stubFetch((url) => {
      if (url === 'https://api.github.com/repos/owner/repo') return jsonResponse(REPO_INFO);
      if (url.includes('/git/trees/main'))
        return jsonResponse({
          tree: [
            { path: 'image.png', type: 'blob', size: 100, sha: 'sha1', url: '' },
            { path: 'video.mp4', type: 'blob', size: 1000, sha: 'sha2', url: '' }
          ],
          truncated: false
        });
      if (url.includes('/commits/main')) return textResponse(COMMIT_SHA);
      return textResponse('content');
    });
    const result = await crawl({ owner: 'owner', repo: 'repo' });
    expect(result.files).toHaveLength(0);
    expect(result.totalFiles).toBe(0);
    expect(result.skippedFiles).toBe(2);
  });
  it('reads and applies config from trueref.json found in the tree', async () => {
    const truerefConfig = { excludeFiles: ['package.json'] };
    stubFetch((url) => {
      if (url === 'https://api.github.com/repos/owner/repo') return jsonResponse(REPO_INFO);
      if (url.includes('/git/trees/main')) {
        return jsonResponse({
          tree: [
            { path: 'trueref.json', type: 'blob', size: 50, sha: 'sha-cfg', url: '' },
            { path: 'src/index.ts', type: 'blob', size: 200, sha: 'sha-idx', url: '' },
            { path: 'package.json', type: 'blob', size: 100, sha: 'sha-pkg', url: '' }
          ],
          truncated: false
        });
      }
      if (url.includes('/commits/main')) return textResponse(COMMIT_SHA);
      if (url.includes('trueref.json')) return textResponse(JSON.stringify(truerefConfig));
      if (url.includes('src/index.ts')) return textResponse('export const x = 1;');
      if (url.includes('package.json')) return textResponse('{"name":"test"}');
      return textResponse('content');
    });
    // No caller-supplied config — crawler should auto-detect trueref.json.
    const result = await crawl({ owner: 'owner', repo: 'repo' });
    expect(result.files.some((f) => f.path === 'package.json')).toBe(false);
    expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
  });
});

View File

@@ -0,0 +1,477 @@
/**
* GitHub Repository Crawler (TRUEREF-0003).
*
* Fetches repository file trees via the GitHub Trees API and downloads file
* contents in parallel while respecting rate limits and applying
* include/exclude filtering rules from trueref.json.
*
* Download strategy:
* - Uses raw.githubusercontent.com for file content — faster and counts less
* against the REST API rate limit.
* - Falls back to the GitHub Contents API if raw download fails.
*
* Error handling:
* - 404 → RepositoryNotFoundError
* - 401 → AuthenticationError
* - 403 → waits for rate-limit reset if X-RateLimit-Remaining is 0; else PermissionError
* - 422 → tree too large; switches to directory-by-directory traversal (depth pagination)
* - Network errors → retried up to 3 times with exponential backoff
* - Bad base64 content → file skipped with a console warning
*/
import { shouldIndexFile, detectLanguage } from './file-filter.js';
import { GitHubRateLimiter, Semaphore, withRetry } from './rate-limiter.js';
import {
AuthenticationError,
PermissionError,
RateLimitError,
RepositoryNotFoundError
} from './types.js';
// Domain errors should not be retried — they are permanent HTTP status codes.
function isDomainError(err: unknown): boolean {
  const domainErrorClasses = [
    RepositoryNotFoundError,
    AuthenticationError,
    PermissionError,
    RateLimitError
  ];
  return domainErrorClasses.some((cls) => err instanceof cls);
}
// Anything that is not a permanent domain error is worth another attempt.
function isRetryable(err: unknown): boolean {
  return isDomainError(err) === false;
}
import type {
CrawlOptions,
CrawlResult,
CrawledFile,
GitHubContentResponse,
GitHubRepoResponse,
GitHubTreeItem,
GitHubTreeResponse
} from './types.js';
// ---------------------------------------------------------------------------
// Internal constants
// ---------------------------------------------------------------------------
// Base URL for the GitHub REST API.
const GITHUB_API = 'https://api.github.com';
// Raw-content host; serves file bytes directly, outside the REST rate limit.
const RAW_CONTENT = 'https://raw.githubusercontent.com';
/** Maximum parallel file downloads. */
const DOWNLOAD_CONCURRENCY = 10;
/** Config file names that should be fetched first so their filtering rules
 * apply to all subsequent downloads. */
const CONFIG_FILE_NAMES = new Set(['trueref.json', 'context7.json']);
// ---------------------------------------------------------------------------
// Internal helpers
// ---------------------------------------------------------------------------
/**
 * Build standard GitHub API request headers.
 * Adds a Bearer Authorization header only when a token is supplied.
 */
function buildHeaders(token?: string): Record<string, string> {
  const base: Record<string, string> = {
    Accept: 'application/vnd.github+json',
    'X-GitHub-Api-Version': '2022-11-28'
  };
  return token ? { ...base, Authorization: `Bearer ${token}` } : base;
}
/**
 * Throw a domain error for non-2xx GitHub API responses.
 * Mutates the rate limiter with header data before throwing.
 */
async function throwForStatus(response: Response, rateLimiter: GitHubRateLimiter): Promise<void> {
  if (response.ok) return;
  rateLimiter.updateFromHeaders(response.headers);
  const { status } = response;
  if (status === 401) {
    throw new AuthenticationError('GitHub authentication failed — check your PAT.');
  }
  if (status === 403) {
    // A 403 with zero remaining quota is a rate-limit problem, not a
    // permissions problem.
    if (response.headers.get('X-RateLimit-Remaining') === '0') {
      const resetHeader = response.headers.get('X-RateLimit-Reset') ?? '0';
      throw new RateLimitError('GitHub rate limit exceeded.', parseInt(resetHeader, 10) * 1000);
    }
    throw new PermissionError(
      'GitHub returned 403 Forbidden — insufficient permissions for this resource.'
    );
  }
  if (status === 404) {
    throw new RepositoryNotFoundError(
      `Repository not found or not accessible: ${response.url}`
    );
  }
  const body = await response.text().catch(() => '');
  throw new Error(`GitHub API error ${status}: ${body}`);
}
// ---------------------------------------------------------------------------
// GitHub API calls
// ---------------------------------------------------------------------------
/**
 * Fetch repository metadata (default branch, stars, etc.).
 * Retried up to 3 times for non-domain (transient) failures.
 */
async function fetchRepoInfo(
  owner: string,
  repo: string,
  token: string | undefined,
  rateLimiter: GitHubRateLimiter
): Promise<GitHubRepoResponse> {
  const requestOnce = async (): Promise<GitHubRepoResponse> => {
    await rateLimiter.waitIfNeeded();
    const url = `${GITHUB_API}/repos/${owner}/${repo}`;
    const response = await fetch(url, { headers: buildHeaders(token) });
    rateLimiter.updateFromHeaders(response.headers);
    await throwForStatus(response, rateLimiter);
    return (await response.json()) as GitHubRepoResponse;
  };
  return withRetry(requestOnce, 3, isRetryable);
}
/**
 * Fetch the recursive file tree for a given ref.
 * Returns null when the tree is truncated (>100k items), signalling that we
 * should fall back to directory-level traversal.
 */
async function fetchTree(
  owner: string,
  repo: string,
  ref: string,
  token: string | undefined,
  rateLimiter: GitHubRateLimiter
): Promise<GitHubTreeResponse | null> {
  const requestOnce = async (): Promise<GitHubTreeResponse | null> => {
    await rateLimiter.waitIfNeeded();
    const response = await fetch(
      `${GITHUB_API}/repos/${owner}/${repo}/git/trees/${ref}?recursive=1`,
      { headers: buildHeaders(token) }
    );
    rateLimiter.updateFromHeaders(response.headers);
    if (response.status === 422) {
      // 422 means the tree is too large for a single recursive call.
      return null;
    }
    await throwForStatus(response, rateLimiter);
    return (await response.json()) as GitHubTreeResponse;
  };
  return withRetry(requestOnce, 3, isRetryable);
}
/**
 * Fetch a subtree (non-recursive) for a single directory path.
 * Used when the full recursive tree is truncated.
 */
async function fetchSubTree(
  owner: string,
  repo: string,
  ref: string,
  treeSha: string,
  token: string | undefined,
  rateLimiter: GitHubRateLimiter
): Promise<GitHubTreeResponse> {
  const requestOnce = async (): Promise<GitHubTreeResponse> => {
    await rateLimiter.waitIfNeeded();
    const response = await fetch(`${GITHUB_API}/repos/${owner}/${repo}/git/trees/${treeSha}`, {
      headers: buildHeaders(token)
    });
    rateLimiter.updateFromHeaders(response.headers);
    await throwForStatus(response, rateLimiter);
    return (await response.json()) as GitHubTreeResponse;
  };
  return withRetry(requestOnce, 3, isRetryable);
}
/**
 * Resolve the HEAD commit SHA from a branch/tag ref by fetching the
 * commit object at the ref tip.
 */
async function fetchCommitSha(
  owner: string,
  repo: string,
  ref: string,
  token: string | undefined,
  rateLimiter: GitHubRateLimiter
): Promise<string> {
  const requestOnce = async (): Promise<string> => {
    await rateLimiter.waitIfNeeded();
    const response = await fetch(`${GITHUB_API}/repos/${owner}/${repo}/commits/${ref}`, {
      // When Accept is 'application/vnd.github.sha', the response body is the
      // bare SHA string.
      headers: { ...buildHeaders(token), Accept: 'application/vnd.github.sha' }
    });
    rateLimiter.updateFromHeaders(response.headers);
    await throwForStatus(response, rateLimiter);
    const body = await response.text();
    return body.trim();
  };
  return withRetry(requestOnce, 3, isRetryable);
}
/**
 * Download raw file content via raw.githubusercontent.com.
 *
 * Each path segment is URL-encoded so that file names containing spaces,
 * '#', '%', or '?' produce a valid URL instead of a truncated or malformed
 * request (an unencoded '#' would silently drop the rest of the path).
 * The ref is left unencoded because branch names may legitimately contain
 * slashes that belong in the URL.
 *
 * @returns The file body as text, or null on any failure (the caller will
 *          skip the file or fall back to the Contents API).
 */
async function downloadRawFile(
  owner: string,
  repo: string,
  ref: string,
  filePath: string,
  token: string | undefined
): Promise<string | null> {
  try {
    const encodedPath = filePath.split('/').map(encodeURIComponent).join('/');
    const url = `${RAW_CONTENT}/${owner}/${repo}/${ref}/${encodedPath}`;
    const headers: Record<string, string> = {};
    if (token) headers.Authorization = `Bearer ${token}`;
    const response = await fetch(url, { headers });
    if (!response.ok) return null;
    return await response.text();
  } catch {
    return null;
  }
}
/**
* Download file content via the GitHub Contents API (fallback).
*/
async function downloadViaContentsApi(
owner: string,
repo: string,
ref: string,
filePath: string,
token: string | undefined,
rateLimiter: GitHubRateLimiter
): Promise<string | null> {
try {
return await withRetry(async () => {
await rateLimiter.waitIfNeeded();
const url = `${GITHUB_API}/repos/${owner}/${repo}/contents/${filePath}?ref=${ref}`;
const response = await fetch(url, { headers: buildHeaders(token) });
rateLimiter.updateFromHeaders(response.headers);
if (!response.ok) return null;
const data = (await response.json()) as GitHubContentResponse;
if (data.encoding !== 'base64') return null;
// Node.js Buffer handles both padded and unpadded base64.
return Buffer.from(data.content.replace(/\n/g, ''), 'base64').toString('utf-8');
});
} catch {
return null;
}
}
// ---------------------------------------------------------------------------
// Directory-level traversal (fallback for truncated trees)
// ---------------------------------------------------------------------------
/**
 * Recursively collect all blob items from sub-trees when the top-level
 * recursive tree is truncated (>100k items).
 */
async function collectBlobsFromSubTrees(
  owner: string,
  repo: string,
  ref: string,
  token: string | undefined,
  rateLimiter: GitHubRateLimiter
): Promise<GitHubTreeItem[]> {
  const blobs: GitHubTreeItem[] = [];
  // Breadth-first work list; the ref itself names the root tree.
  const pending: Array<{ sha: string; prefix: string }> = [{ sha: ref, prefix: '' }];
  while (pending.length > 0) {
    // Expand up to DOWNLOAD_CONCURRENCY directories in parallel per wave.
    const wave = pending.splice(0, DOWNLOAD_CONCURRENCY);
    const expand = async ({ sha, prefix }: { sha: string; prefix: string }): Promise<void> => {
      let subTree = null;
      try {
        subTree = await fetchSubTree(owner, repo, ref, sha, token, rateLimiter);
      } catch {
        // Unreachable subtree: skip it rather than abort the whole crawl.
      }
      if (subTree === null) return;
      for (const entry of subTree.tree) {
        const fullPath = prefix === '' ? entry.path : `${prefix}/${entry.path}`;
        if (entry.type === 'blob') {
          blobs.push({ ...entry, path: fullPath });
        } else if (entry.type === 'tree') {
          pending.push({ sha: entry.sha, prefix: fullPath });
        }
      }
    };
    await Promise.all(wave.map(expand));
  }
  return blobs;
}
// ---------------------------------------------------------------------------
// Config file detection
// ---------------------------------------------------------------------------
/**
 * Try to download and parse a trueref.json / context7.json config from the
 * repository root. Returns undefined if not found or unparseable.
 */
async function fetchRepoConfig(
  owner: string,
  repo: string,
  ref: string,
  token: string | undefined,
  blobs: GitHubTreeItem[],
  rateLimiter: GitHubRateLimiter
): Promise<CrawlOptions['config'] | undefined> {
  // Only root-level config files qualify: a nested path never equals one of
  // the bare names in CONFIG_FILE_NAMES.
  const configBlob = blobs.find((blob) => CONFIG_FILE_NAMES.has(blob.path));
  if (configBlob === undefined) return undefined;
  // Prefer the cheap raw download; fall back to the Contents API.
  let content = await downloadRawFile(owner, repo, ref, configBlob.path, token);
  if (content === null) {
    content = await downloadViaContentsApi(owner, repo, ref, configBlob.path, token, rateLimiter);
  }
  if (!content) return undefined;
  try {
    return JSON.parse(content) as CrawlOptions['config'];
  } catch {
    console.warn(`[GitHubCrawler] Failed to parse config file: ${configBlob.path}`);
    return undefined;
  }
}
// ---------------------------------------------------------------------------
// Public crawl() function
// ---------------------------------------------------------------------------
/**
 * Crawl a GitHub repository and return structured file objects.
 *
 * Pipeline: resolve the ref → fetch the file tree (with a sub-tree
 * traversal fallback for truncated trees) → discover a root-level
 * trueref.json / context7.json config → filter blobs → download contents
 * with bounded concurrency.
 *
 * @param options - Repository coordinates, auth token, filter config, and
 *                  optional progress callback.
 * @returns CrawlResult with all downloaded files and summary statistics.
 */
export async function crawl(options: CrawlOptions): Promise<CrawlResult> {
  const { owner, repo, token, onProgress } = options;
  const rateLimiter = new GitHubRateLimiter();
  const semaphore = new Semaphore(DOWNLOAD_CONCURRENCY);

  // ---- Step 1: Resolve the ref (default branch if not provided) ----------
  const repoInfo = await fetchRepoInfo(owner, repo, token, rateLimiter);
  // Bind the resolved ref to a const so the download closures below do not
  // need non-null assertions — after this line `ref` is always a string.
  const ref = options.ref ?? repoInfo.default_branch;

  // ---- Step 2: Fetch the file tree ---------------------------------------
  let blobs: GitHubTreeItem[];
  const treeResponse = await fetchTree(owner, repo, ref, token, rateLimiter);
  if (treeResponse === null) {
    // Tree truncated — fall back to directory-by-directory traversal.
    console.warn(
      `[GitHubCrawler] Tree for ${owner}/${repo}@${ref} is truncated; using sub-tree traversal.`
    );
    blobs = await collectBlobsFromSubTrees(owner, repo, ref, token, rateLimiter);
  } else {
    blobs = treeResponse.tree.filter((item) => item.type === 'blob');
  }

  // Resolve HEAD commit SHA (best-effort; empty string on failure).
  const commitSha = await fetchCommitSha(owner, repo, ref, token, rateLimiter).catch(() => '');

  // ---- Step 3: Detect and download config file first ---------------------
  // Caller-supplied config wins; otherwise look for a repo-root config file.
  let effectiveConfig = options.config;
  if (!effectiveConfig) {
    effectiveConfig = await fetchRepoConfig(owner, repo, ref, token, blobs, rateLimiter);
  }

  // ---- Step 4: Filter blobs according to config --------------------------
  const filteredBlobs = blobs.filter((item) =>
    shouldIndexFile(item.path, item.size ?? 0, effectiveConfig)
  );
  const totalFiles = filteredBlobs.length;
  const skippedFiles = blobs.length - totalFiles;

  // ---- Step 5: Download file contents in parallel -------------------------
  const files: CrawledFile[] = [];
  let processed = 0;
  await Promise.all(
    filteredBlobs.map((item) =>
      semaphore.run(async () => {
        try {
          // Prefer raw download (cheaper on rate limit); fall back to API.
          const content =
            (await downloadRawFile(owner, repo, ref, item.path, token)) ??
            (await downloadViaContentsApi(owner, repo, ref, item.path, token, rateLimiter));
          if (content === null) {
            console.warn(`[GitHubCrawler] Could not download: ${item.path} — skipping.`);
          } else {
            files.push({
              path: item.path,
              content,
              // Tree items occasionally lack a size; derive it from content.
              size: item.size ?? Buffer.byteLength(content, 'utf-8'),
              sha: item.sha,
              language: detectLanguage(item.path)
            });
          }
        } catch (err) {
          console.warn(
            `[GitHubCrawler] Error downloading ${item.path}: ${err instanceof Error ? err.message : String(err)}`
          );
        } finally {
          // Progress is reported per completed blob, success or failure.
          processed++;
          onProgress?.(processed, totalFiles);
        }
      })
    )
  );

  // Parallel downloads complete in nondeterministic order; sort by path so
  // the result is stable across runs.
  files.sort((a, b) => (a.path < b.path ? -1 : a.path > b.path ? 1 : 0));

  return {
    files,
    totalFiles,
    skippedFiles,
    branch: ref,
    commitSha
  };
}

View File

@@ -0,0 +1,554 @@
/**
* Unit tests for the local filesystem crawler (TRUEREF-0004).
*
* Each test that needs a filesystem fixture creates a temporary directory via
* `fs.mkdtemp`, writes the required files, runs the crawler, then cleans up
* with `fs.rm` regardless of the test outcome.
*/
import { execFile } from 'node:child_process';
import { createHash } from 'node:crypto';
import { promises as fs } from 'node:fs';
import { join } from 'node:path';
import { tmpdir } from 'node:os';
import { promisify } from 'node:util';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import { LocalCrawler } from './local.crawler.js';
import type { LocalCrawlOptions } from './local.crawler.js';
import { InvalidRefError, NotAGitRepositoryError } from './types.js';
const execFileAsync = promisify(execFile);
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
/** SHA-256 hex digest of a UTF-8 string — mirrors the crawler's checksum. */
function sha256(content: string): string {
  const hash = createHash('sha256');
  hash.update(content, 'utf-8');
  return hash.digest('hex');
}
/**
 * Test fixture factory: create a fresh temp directory and populate it with
 * the given relPath → content map. Returns the directory's absolute path.
 */
async function makeTempRepo(files: Record<string, string>): Promise<string> {
  const rootDir = await fs.mkdtemp(join(tmpdir(), 'trueref-test-'));
  for (const [relPath, content] of Object.entries(files)) {
    const target = join(rootDir, relPath);
    // Ensure the parent directory chain exists before writing.
    await fs.mkdir(join(target, '..'), { recursive: true });
    await fs.writeFile(target, content, 'utf-8');
  }
  return rootDir;
}
/** Recursively delete a fixture directory created by makeTempRepo. */
async function cleanupTempRepo(root: string): Promise<void> {
  return fs.rm(root, { recursive: true, force: true });
}
// ---------------------------------------------------------------------------
// Test state
// ---------------------------------------------------------------------------
let root: string = '';
const crawler = new LocalCrawler();
async function crawlRoot(opts: Partial<LocalCrawlOptions> = {}): Promise<ReturnType<LocalCrawler['crawl']>> {
return crawler.crawl({ rootPath: root, ...opts });
}
// ---------------------------------------------------------------------------
// Basic crawl behaviour
// ---------------------------------------------------------------------------
// Fixture: four indexable files (markdown, two TS modules, package.json).
// Verifies plain crawling: enumeration, content, sizes, checksums, language
// detection, and result metadata.
describe('LocalCrawler.crawl() — basic file enumeration', () => {
  beforeEach(async () => {
    root = await makeTempRepo({
      'README.md': '# Hello',
      'src/index.ts': 'export const x = 1;',
      'src/utils.ts': 'export const y = 2;',
      'package.json': '{"name":"test"}'
    });
  });
  afterEach(async () => {
    await cleanupTempRepo(root);
  });
  it('returns all indexable files', async () => {
    const result = await crawlRoot();
    const paths = result.files.map((f) => f.path).sort();
    expect(paths).toEqual(['README.md', 'package.json', 'src/index.ts', 'src/utils.ts'].sort());
  });
  it('populates content as a UTF-8 string', async () => {
    const result = await crawlRoot();
    const readme = result.files.find((f) => f.path === 'README.md');
    expect(readme?.content).toBe('# Hello');
  });
  it('sets size equal to Buffer.byteLength of content', async () => {
    const result = await crawlRoot();
    for (const file of result.files) {
      expect(file.size).toBe(Buffer.byteLength(file.content, 'utf-8'));
    }
  });
  it('computes correct SHA-256 per file', async () => {
    const result = await crawlRoot();
    const readme = result.files.find((f) => f.path === 'README.md');
    expect(readme?.sha).toBe(sha256('# Hello'));
  });
  it('detects language from extension', async () => {
    const result = await crawlRoot();
    const ts = result.files.find((f) => f.path === 'src/index.ts');
    expect(ts?.language).toBe('typescript');
    const md = result.files.find((f) => f.path === 'README.md');
    expect(md?.language).toBe('markdown');
    const json = result.files.find((f) => f.path === 'package.json');
    expect(json?.language).toBe('json');
  });
  it('sets branch to "local"', async () => {
    const result = await crawlRoot();
    expect(result.branch).toBe('local');
  });
  it('sets totalFiles to the count of filtered files', async () => {
    const result = await crawlRoot();
    expect(result.totalFiles).toBe(result.files.length);
  });
  // The local crawler synthesizes commitSha as a SHA-256 over the per-file
  // SHAs, so it is a 64-char hex string and stable for an unchanged file set.
  it('sets commitSha to a non-empty hex string', async () => {
    const result = await crawlRoot();
    expect(result.commitSha).toMatch(/^[0-9a-f]{64}$/);
  });
  it('produces a deterministic commitSha for the same file set', async () => {
    const r1 = await crawlRoot();
    const r2 = await crawlRoot();
    expect(r1.commitSha).toBe(r2.commitSha);
  });
});
// ---------------------------------------------------------------------------
// Filtering — default excludes and extension allow-list
// ---------------------------------------------------------------------------
// Fixture mixes indexable files with default-excluded directories and a
// binary extension; verifies DEFAULT_EXCLUDES and the extension allow-list.
describe('LocalCrawler.crawl() — default filtering', () => {
  beforeEach(async () => {
    root = await makeTempRepo({
      'src/index.ts': 'export {};',
      'dist/bundle.js': 'bundled',
      'node_modules/lodash/index.js': 'lodash',
      '.git/config': '[core]',
      'image.png': '\x89PNG',
      'README.md': '# Docs'
    });
  });
  afterEach(async () => {
    await cleanupTempRepo(root);
  });
  it('excludes files in dist/', async () => {
    const result = await crawlRoot();
    expect(result.files.every((f) => !f.path.startsWith('dist/'))).toBe(true);
  });
  it('excludes files in node_modules/', async () => {
    const result = await crawlRoot();
    expect(result.files.every((f) => !f.path.startsWith('node_modules/'))).toBe(true);
  });
  it('excludes files in .git/', async () => {
    const result = await crawlRoot();
    expect(result.files.every((f) => !f.path.startsWith('.git/'))).toBe(true);
  });
  it('excludes non-indexable extensions like .png', async () => {
    const result = await crawlRoot();
    expect(result.files.every((f) => !f.path.endsWith('.png'))).toBe(true);
  });
  // (Title means: skippedFiles = enumerated − kept.)
  it('reports skippedFiles = total enumerated filtered', async () => {
    const result = await crawlRoot();
    // dist/, node_modules/, .git/, .png = 4 skipped
    // src/index.ts + README.md = 2 kept
    expect(result.skippedFiles).toBe(4);
    expect(result.totalFiles).toBe(2);
  });
});
// ---------------------------------------------------------------------------
// Size limit
// ---------------------------------------------------------------------------
// Verifies the 500 KB MAX_FILE_SIZE_BYTES cutoff, including the inclusive
// boundary at exactly 500 000 bytes.
describe('LocalCrawler.crawl() — size limit', () => {
  afterEach(async () => {
    await cleanupTempRepo(root);
  });
  it('excludes files larger than MAX_FILE_SIZE_BYTES (500 KB)', async () => {
    // 500_001 bytes of 'x'
    const bigContent = 'x'.repeat(500_001);
    root = await makeTempRepo({
      'big.ts': bigContent,
      'small.ts': 'export const x = 1;'
    });
    const result = await crawlRoot();
    expect(result.files.some((f) => f.path === 'big.ts')).toBe(false);
    expect(result.files.some((f) => f.path === 'small.ts')).toBe(true);
  });
  it('includes files exactly at MAX_FILE_SIZE_BYTES (500 KB)', async () => {
    const edgeContent = 'a'.repeat(500_000);
    root = await makeTempRepo({ 'edge.ts': edgeContent });
    const result = await crawlRoot();
    expect(result.files.some((f) => f.path === 'edge.ts')).toBe(true);
  });
});
// ---------------------------------------------------------------------------
// trueref.json / context7.json config detection
// ---------------------------------------------------------------------------
// Verifies root-level trueref.json / context7.json auto-detection, the
// precedence of caller-supplied config, and graceful handling of bad JSON.
describe('LocalCrawler.crawl() — config file detection', () => {
  afterEach(async () => {
    await cleanupTempRepo(root);
  });
  it('auto-detects trueref.json and applies excludeFiles', async () => {
    root = await makeTempRepo({
      'trueref.json': JSON.stringify({ excludeFiles: ['package.json'] }),
      'src/index.ts': 'export {};',
      'package.json': '{"name":"test"}'
    });
    const result = await crawlRoot();
    expect(result.files.some((f) => f.path === 'package.json')).toBe(false);
    expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
  });
  it('auto-detects context7.json and applies folders allowlist', async () => {
    root = await makeTempRepo({
      'context7.json': JSON.stringify({ folders: ['docs/'] }),
      'src/index.ts': 'export {};',
      'docs/guide.md': '# Guide'
    });
    const result = await crawlRoot();
    expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(false);
    expect(result.files.some((f) => f.path === 'docs/guide.md')).toBe(true);
  });
  it('caller-supplied config takes precedence over discovered config file', async () => {
    root = await makeTempRepo({
      'trueref.json': JSON.stringify({ excludeFiles: ['package.json'] }),
      'src/index.ts': 'export {};',
      'package.json': '{"name":"test"}'
    });
    // Caller provides a config with no exclusions — package.json should appear.
    const result = await crawlRoot({ config: {} });
    expect(result.files.some((f) => f.path === 'package.json')).toBe(true);
  });
  it('applies excludeFolders from config', async () => {
    root = await makeTempRepo({
      'trueref.json': JSON.stringify({ excludeFolders: ['internal/'] }),
      'internal/secret.ts': 'secret',
      'src/public.ts': 'public'
    });
    const result = await crawlRoot();
    expect(result.files.some((f) => f.path.startsWith('internal/'))).toBe(false);
    expect(result.files.some((f) => f.path === 'src/public.ts')).toBe(true);
  });
  it('gracefully handles a malformed config file', async () => {
    root = await makeTempRepo({
      'trueref.json': 'NOT VALID JSON {{{',
      'src/index.ts': 'export {};'
    });
    // Should not throw; falls back to no config.
    const result = await crawlRoot();
    expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
  });
});
// ---------------------------------------------------------------------------
// Progress callback
// ---------------------------------------------------------------------------
// Verifies the onProgress contract: one call per filtered file, processed
// counts 1..total, total constant, and no calls when nothing is crawled.
describe('LocalCrawler.crawl() — progress reporting', () => {
  beforeEach(async () => {
    root = await makeTempRepo({
      'src/a.ts': 'a',
      'src/b.ts': 'b',
      'src/c.ts': 'c'
    });
  });
  afterEach(async () => {
    await cleanupTempRepo(root);
  });
  it('calls onProgress once per filtered file', async () => {
    const calls: Array<[number, number]> = [];
    await crawlRoot({ onProgress: (p, t) => calls.push([p, t]) });
    expect(calls).toHaveLength(3);
  });
  it('increments processed from 1 to totalFiles', async () => {
    const calls: Array<[number, number]> = [];
    await crawlRoot({ onProgress: (p, t) => calls.push([p, t]) });
    const processed = calls.map(([p]) => p);
    expect(processed).toEqual([1, 2, 3]);
  });
  it('keeps total constant across all callback invocations', async () => {
    const totals: number[] = [];
    await crawlRoot({ onProgress: (_, t) => totals.push(t) });
    expect(totals.every((t) => t === totals[0])).toBe(true);
  });
  it('does not call onProgress when no files pass the filter', async () => {
    // Overwrite root with only non-indexable files.
    await fs.rm(root, { recursive: true, force: true });
    root = await makeTempRepo({ 'image.png': '\x89PNG' });
    const calls: number[] = [];
    await crawlRoot({ onProgress: () => calls.push(1) });
    expect(calls).toHaveLength(0);
  });
});
// ---------------------------------------------------------------------------
// Git ref checkout
// ---------------------------------------------------------------------------
/**
 * Build a real git repository in a temp directory with one commit per
 * `history` entry; entries carrying a `tag` are tagged after committing.
 * Returns the repository root path.
 *
 * Layout of `history`:
 *   [{ tag?: string, files: Record<string, string> }, ...]
 */
async function makeGitRepo(
  history: Array<{ tag?: string; files: Record<string, string> }>
): Promise<string> {
  const repoRoot = await fs.mkdtemp(join(tmpdir(), 'trueref-git-test-'));
  const git = async (...args: string[]): Promise<void> => {
    await execFileAsync('git', ['-C', repoRoot, ...args]);
  };
  await git('init', '--initial-branch=main');
  await git('config', 'user.email', 'test@trueref.local');
  await git('config', 'user.name', 'TrueRef Test');
  for (const { tag, files } of history) {
    // Write this commit's files into the working tree.
    for (const [relPath, content] of Object.entries(files)) {
      const target = join(repoRoot, relPath);
      await fs.mkdir(join(target, '..'), { recursive: true });
      await fs.writeFile(target, content, 'utf-8');
    }
    await git('add', '.');
    // --allow-empty keeps the fixture valid even when a step changes nothing.
    await git('commit', '--allow-empty', '-m', `commit for ${tag ?? 'HEAD'}`);
    if (tag) await git('tag', tag);
  }
  return repoRoot;
}
// Exercises the git-worktree path of LocalCrawler.crawl(): checkout at a
// tag/SHA, metadata stamping, working-tree safety, cleanup, and errors.
describe('LocalCrawler.crawl() — git ref checkout', () => {
  // Block-local fixture state shadows the module-level `root`/`crawler` so
  // these tests manage their own git repositories.
  let root: string = '';
  const crawler = new LocalCrawler();
  afterEach(async () => {
    if (root) await cleanupTempRepo(root);
  });
  it('crawls files at a specific tag, not the HEAD state', async () => {
    root = await makeGitRepo([
      { tag: 'v1.0.0', files: { 'src/index.ts': 'export const version = 1;' } },
      { files: { 'src/index.ts': 'export const version = 2;' } }
    ]);
    const result = await crawler.crawl({ rootPath: root, ref: 'v1.0.0' });
    const indexFile = result.files.find((f) => f.path === 'src/index.ts');
    expect(indexFile?.content).toBe('export const version = 1;');
  });
  it('crawls files at a specific commit SHA', async () => {
    root = await makeGitRepo([
      { tag: 'v1.0.0', files: { 'api.ts': 'v1' } },
      { files: { 'api.ts': 'v2' } }
    ]);
    // Resolve the SHA of v1.0.0
    const { stdout } = await execFileAsync('git', ['-C', root, 'rev-parse', 'v1.0.0'], {
      encoding: 'utf-8'
    });
    const sha = stdout.trim();
    const result = await crawler.crawl({ rootPath: root, ref: sha });
    const api = result.files.find((f) => f.path === 'api.ts');
    expect(api?.content).toBe('v1');
  });
  it('sets branch to the ref string in the result', async () => {
    root = await makeGitRepo([{ tag: 'v2.3.1', files: { 'README.md': '# v2' } }]);
    const result = await crawler.crawl({ rootPath: root, ref: 'v2.3.1' });
    expect(result.branch).toBe('v2.3.1');
  });
  it('sets commitSha to the git-resolved SHA (not file-content hash)', async () => {
    root = await makeGitRepo([{ tag: 'v1.0.0', files: { 'a.ts': 'a' } }]);
    const { stdout } = await execFileAsync('git', ['-C', root, 'rev-parse', 'v1.0.0'], {
      encoding: 'utf-8'
    });
    const expectedSha = stdout.trim();
    const result = await crawler.crawl({ rootPath: root, ref: 'v1.0.0' });
    expect(result.commitSha).toBe(expectedSha);
  });
  it('does not modify the working tree', async () => {
    root = await makeGitRepo([
      { tag: 'v1.0.0', files: { 'src/index.ts': 'v1' } },
      { files: { 'src/index.ts': 'v2' } }
    ]);
    // Working tree is at HEAD (v2)
    const before = await fs.readFile(join(root, 'src/index.ts'), 'utf-8');
    await crawler.crawl({ rootPath: root, ref: 'v1.0.0' });
    const after = await fs.readFile(join(root, 'src/index.ts'), 'utf-8');
    expect(before).toBe('v2');
    expect(after).toBe('v2');
  });
  it('removes the temporary worktree after crawling', async () => {
    root = await makeGitRepo([{ tag: 'v1.0.0', files: { 'f.ts': 'x' } }]);
    await crawler.crawl({ rootPath: root, ref: 'v1.0.0' });
    // List remaining worktrees — only the main one should remain.
    const { stdout } = await execFileAsync('git', ['-C', root, 'worktree', 'list', '--porcelain'], {
      encoding: 'utf-8'
    });
    const worktreeCount = stdout.split('\n').filter((l) => l.startsWith('worktree ')).length;
    expect(worktreeCount).toBe(1);
  });
  it('throws NotAGitRepositoryError for a plain directory', async () => {
    const plainDir = await fs.mkdtemp(join(tmpdir(), 'trueref-plain-'));
    root = plainDir; // cleaned up in afterEach
    await expect(crawler.crawl({ rootPath: plainDir, ref: 'v1.0.0' })).rejects.toThrow(
      NotAGitRepositoryError
    );
  });
  it('throws InvalidRefError for a ref that does not exist', async () => {
    root = await makeGitRepo([{ tag: 'v1.0.0', files: { 'f.ts': 'x' } }]);
    await expect(crawler.crawl({ rootPath: root, ref: 'v99.99.99' })).rejects.toThrow(
      InvalidRefError
    );
  });
  it('applies caller-supplied config at the checked-out ref', async () => {
    root = await makeGitRepo([
      {
        tag: 'v1.0.0',
        files: {
          'src/index.ts': 'export {};',
          'package.json': '{"name":"test"}'
        }
      }
    ]);
    // Exclude package.json via caller config
    const result = await crawler.crawl({
      rootPath: root,
      ref: 'v1.0.0',
      config: { excludeFiles: ['package.json'] }
    });
    expect(result.files.some((f) => f.path === 'package.json')).toBe(false);
    expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
  });
  it('reads trueref.json from the checked-out ref', async () => {
    root = await makeGitRepo([
      {
        tag: 'v1.0.0',
        files: {
          'trueref.json': JSON.stringify({ excludeFiles: ['package.json'] }),
          'src/index.ts': 'export {};',
          'package.json': '{"name":"test"}'
        }
      }
    ]);
    const result = await crawler.crawl({ rootPath: root, ref: 'v1.0.0' });
    expect(result.files.some((f) => f.path === 'package.json')).toBe(false);
    expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
  });
});
// ---------------------------------------------------------------------------
// Edge cases
// ---------------------------------------------------------------------------
// Edge cases: empty directory, deep nesting, non-ASCII content, and the
// synthetic commitSha fingerprint's sensitivity to content changes.
describe('LocalCrawler.crawl() — edge cases', () => {
  afterEach(async () => {
    await cleanupTempRepo(root);
  });
  it('returns empty result for an empty directory', async () => {
    root = await makeTempRepo({});
    const result = await crawlRoot();
    expect(result.files).toHaveLength(0);
    expect(result.totalFiles).toBe(0);
    expect(result.skippedFiles).toBe(0);
  });
  it('handles deeply nested directory structures', async () => {
    root = await makeTempRepo({
      'a/b/c/d/deep.ts': 'export const deep = true;'
    });
    const result = await crawlRoot();
    expect(result.files.some((f) => f.path === 'a/b/c/d/deep.ts')).toBe(true);
  });
  it('handles files with UTF-8 content correctly', async () => {
    const utf8Content = 'const greeting = "héllo wörld — 日本語";';
    root = await makeTempRepo({ 'src/unicode.ts': utf8Content });
    const result = await crawlRoot();
    const file = result.files.find((f) => f.path === 'src/unicode.ts');
    expect(file?.content).toBe(utf8Content);
    expect(file?.sha).toBe(sha256(utf8Content));
  });
  it('commitSha differs when file content changes', async () => {
    root = await makeTempRepo({ 'src/index.ts': 'version 1' });
    const r1 = await crawlRoot();
    await fs.writeFile(join(root, 'src/index.ts'), 'version 2', 'utf-8');
    const r2 = await crawlRoot();
    expect(r1.commitSha).not.toBe(r2.commitSha);
  });
  // When nothing passes the filter, the fingerprint is the hash of ''.
  it('commitSha is empty-string hash when no files are crawled', async () => {
    root = await makeTempRepo({ 'image.png': '\x89PNG' });
    const result = await crawlRoot();
    // SHA-256 of an empty string
    expect(result.commitSha).toBe(sha256(''));
  });
});

View File

@@ -0,0 +1,275 @@
/**
* Local Filesystem Crawler (TRUEREF-0004).
*
* Walks a directory tree and enumerates all files, applying the same
* extension and size filters as the GitHub crawler (TRUEREF-0003).
* Reads file contents as UTF-8 strings and computes SHA-256 checksums
* for change detection.
*
* Design decisions:
* - Uses Node.js `fs/promises` and `crypto` — no extra dependencies.
* - Symlinks and special files (devices, sockets, FIFOs) are skipped.
* - `trueref.json` / `context7.json` at the repo root are detected and
* parsed before any other file filtering runs, matching the GitHub crawler.
* - File size for filtering is taken from `stat().size` so the size limit
* is applied before reading file content (saves I/O on large excluded files).
* - `commitSha` is derived from a SHA-256 hash of all per-file checksums,
* giving a deterministic fingerprint of the crawled file set.
*/
import { execFile } from 'node:child_process';
import { createHash } from 'node:crypto';
import { promises as fs } from 'node:fs';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { promisify } from 'node:util';
import { detectLanguage, shouldIndexFile } from './file-filter.js';
import { InvalidRefError, NotAGitRepositoryError } from './types.js';
import type { CrawledFile, CrawlResult, RepoConfig } from './types.js';
const execFileAsync = promisify(execFile);
// ---------------------------------------------------------------------------
// Public options type
// ---------------------------------------------------------------------------
/** Options accepted by LocalCrawler.crawl(). */
export interface LocalCrawlOptions {
  /** Absolute path to the repository root directory. */
  rootPath: string;
  /**
   * Git ref to check out before crawling — a tag name (e.g. "v2.1.0"),
   * a branch name, or a commit SHA. When provided the crawler creates an
   * isolated git worktree at that ref, crawls it, then removes the worktree.
   * The original working tree is never modified.
   * Requires `rootPath` to be inside a git repository.
   */
  ref?: string;
  /**
   * Pre-parsed trueref.json / context7.json configuration, if already loaded.
   * When supplied, auto-detection of a root-level config file is skipped.
   */
  config?: RepoConfig;
  /** Progress callback invoked after each file is read — (processed, total). */
  onProgress?: (processed: number, total: number) => void;
}
// ---------------------------------------------------------------------------
// Internal helpers
// ---------------------------------------------------------------------------
/**
 * Names of config files that control include/exclude rules.
 * Matched against bare relative paths, so only repo-root files qualify.
 */
const CONFIG_FILE_NAMES = new Set(['trueref.json', 'context7.json']);
// ---------------------------------------------------------------------------
// Git helpers
// ---------------------------------------------------------------------------
/**
 * Execute a git command with `cwd` as the repository directory and return
 * its trimmed stdout. A non-zero exit status rejects with the
 * child-process error.
 */
async function runGit(cwd: string, args: string[]): Promise<string> {
  const result = await execFileAsync('git', ['-C', cwd, ...args], { encoding: 'utf-8' });
  return result.stdout.trim();
}
/**
 * SHA-256 hex digest of a UTF-8 string.
 */
function computeSHA256(content: string): string {
  const hasher = createHash('sha256');
  hasher.update(content, 'utf-8');
  return hasher.digest('hex');
}
/**
 * Read `absPath` and parse it as a JSON config.
 * Any read or parse failure is logged and mapped to undefined so a broken
 * config file never aborts a crawl.
 */
async function parseConfigFile(absPath: string): Promise<RepoConfig | undefined> {
  try {
    const rawText = await fs.readFile(absPath, 'utf-8');
    const parsed: unknown = JSON.parse(rawText);
    return parsed as RepoConfig;
  } catch {
    console.warn(`[LocalCrawler] Failed to parse config file: ${absPath}`);
    return undefined;
  }
}
// ---------------------------------------------------------------------------
// LocalCrawler
// ---------------------------------------------------------------------------
export class LocalCrawler {
  /**
   * Crawl a local directory tree and return structured file objects.
   *
   * When `options.ref` is supplied the crawler creates an isolated git
   * worktree checked out at that ref, crawls it, then removes the worktree.
   * The caller's working tree is never modified.
   *
   * @param options - Root path, optional git ref, optional config, and progress callback.
   * @returns CrawlResult with all read files and summary statistics.
   * @throws NotAGitRepositoryError when `ref` is given but `rootPath` is not in a git repo.
   * @throws InvalidRefError when `ref` cannot be resolved or checked out.
   */
  async crawl(options: LocalCrawlOptions): Promise<CrawlResult> {
    const { rootPath, ref } = options;
    if (!ref) {
      // Fast path: crawl the working tree as-is.
      return this.crawlDirectory(rootPath, options.config, options.onProgress, 'local');
    }
    // Git-aware path: verify repo, resolve ref, create worktree, crawl, clean up.
    let worktreePath: string | undefined;
    try {
      // Verify rootPath is inside a git repository.
      await runGit(rootPath, ['rev-parse', '--git-dir']).catch(() => {
        throw new NotAGitRepositoryError(`Not a git repository: ${rootPath}`);
      });
      // Resolve the ref to a concrete commit SHA (validates it exists).
      const commitSha = await runGit(rootPath, ['rev-parse', '--verify', ref]).catch(() => {
        throw new InvalidRefError(`Invalid git ref "${ref}" in repository: ${rootPath}`);
      });
      // Create a temporary isolated worktree at the resolved ref.
      const tmpDir = await fs.mkdtemp(join(tmpdir(), 'trueref-wt-'));
      worktreePath = tmpDir;
      await runGit(rootPath, ['worktree', 'add', '--detach', tmpDir, ref]).catch((err) => {
        throw new InvalidRefError(
          `Cannot create worktree for ref "${ref}": ${err instanceof Error ? err.message : String(err)}`
        );
      });
      // Crawl the worktree and stamp the result with the git-resolved metadata.
      const result = await this.crawlDirectory(
        worktreePath,
        options.config,
        options.onProgress,
        ref
      );
      return { ...result, commitSha };
    } finally {
      if (worktreePath) {
        // Capture in a const so the fallback closure needs no non-null assertion.
        const staleWorktree = worktreePath;
        // Remove the worktree (git also deletes the directory). If git fails —
        // e.g. `worktree add` never succeeded — fall back to deleting the temp
        // directory directly. Both paths are awaited so cleanup completes
        // before crawl() returns instead of floating as an orphan promise.
        await runGit(rootPath, ['worktree', 'remove', '--force', staleWorktree]).catch(
          async () => {
            await fs.rm(staleWorktree, { recursive: true, force: true }).catch(() => {});
          }
        );
      }
    }
  }
  // ---------------------------------------------------------------------------
  // Private — directory crawl
  // ---------------------------------------------------------------------------
  /**
   * Walk `rootPath`, apply filters, read files, and build a CrawlResult.
   * `branch` is embedded verbatim into the returned result.
   */
  private async crawlDirectory(
    rootPath: string,
    callerConfig: RepoConfig | undefined,
    onProgress: LocalCrawlOptions['onProgress'],
    branch: string
  ): Promise<CrawlResult> {
    // Step 1: Walk the directory tree and collect (relPath, size) pairs.
    // Sizes come from stat(), so the size limit is enforced before any read.
    const statCache = new Map<string, number>();
    const allRelPaths = await this.walkDirectory(rootPath, '', statCache);
    // Step 2: Detect trueref.json / context7.json at the repo root first.
    // Only root-level config files are honoured (no directory prefix), and a
    // caller-supplied config takes precedence over a discovered file.
    const configRelPath = allRelPaths.find((p) => CONFIG_FILE_NAMES.has(p));
    let config = callerConfig;
    if (configRelPath && !config) {
      config = await parseConfigFile(join(rootPath, configRelPath));
    }
    // Step 3: Filter files according to extension, size, and config rules.
    const filteredPaths = allRelPaths.filter((relPath) => {
      const size = statCache.get(relPath) ?? 0;
      return shouldIndexFile(relPath, size, config);
    });
    // Step 4: Read file contents and build CrawledFile records. Unreadable
    // files are logged and skipped but still counted toward progress.
    const crawledFiles: CrawledFile[] = [];
    for (const [i, relPath] of filteredPaths.entries()) {
      const absPath = join(rootPath, relPath);
      try {
        const content = await fs.readFile(absPath, 'utf-8');
        const sha = computeSHA256(content);
        crawledFiles.push({
          path: relPath,
          content,
          size: Buffer.byteLength(content, 'utf-8'),
          sha,
          language: detectLanguage(relPath)
        });
      } catch (err) {
        // Separator added so the path and the error message don't run
        // together in the log line (was `${relPath}${err...}`).
        console.warn(
          `[LocalCrawler] Could not read file: ${relPath} — ${err instanceof Error ? err.message : String(err)}`
        );
      }
      onProgress?.(i + 1, filteredPaths.length);
    }
    // Step 5: Build a deterministic repo-level fingerprint from file SHAs.
    const commitSha = computeSHA256(crawledFiles.map((f) => f.sha).join(''));
    return {
      files: crawledFiles,
      totalFiles: filteredPaths.length,
      skippedFiles: allRelPaths.length - filteredPaths.length,
      branch,
      commitSha
    };
  }
  /**
   * Recursively walk a directory and collect relative paths of all regular files.
   * Symlinks and special files (devices, sockets, FIFOs) are silently skipped.
   * Populates `statCache` with file sizes so the caller can filter without a
   * second `stat()` call.
   *
   * @param dir - Absolute path of the directory to read.
   * @param rel - Relative path prefix accumulated during recursion.
   * @param statCache - Mutable map from relative path → byte size.
   */
  private async walkDirectory(
    dir: string,
    rel: string,
    statCache: Map<string, number>
  ): Promise<string[]> {
    let entries;
    try {
      entries = await fs.readdir(dir, { withFileTypes: true });
    } catch {
      // Directory is unreadable (permissions, etc.) — skip silently.
      return [];
    }
    const files: string[] = [];
    for (const entry of entries) {
      // Only descend into plain directories and collect plain files.
      // entry.isFile() / entry.isDirectory() return false for symlinks,
      // devices, sockets, and FIFOs, so those are all implicitly skipped.
      if (!entry.isFile() && !entry.isDirectory()) continue;
      const relPath = rel ? `${rel}/${entry.name}` : entry.name;
      if (entry.isDirectory()) {
        const children = await this.walkDirectory(join(dir, entry.name), relPath, statCache);
        files.push(...children);
      } else {
        // Capture file size from stat so shouldIndexFile can enforce the limit
        // without reading the file.
        try {
          const stat = await fs.stat(join(dir, entry.name));
          statCache.set(relPath, stat.size);
        } catch {
          statCache.set(relPath, 0);
        }
        files.push(relPath);
      }
    }
    return files;
  }
}

View File

@@ -0,0 +1,123 @@
/**
* GitHub API rate-limit tracker and backoff helper (TRUEREF-0003).
*
* Reads X-RateLimit-* headers from every API response and pauses outgoing
* requests when the remaining allowance drops to ≤ 10.
*/
/** Resolve after approximately `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((resolve) => {
    setTimeout(resolve, ms);
  });
}
export class GitHubRateLimiter {
  /** Requests left in the current window; GitHub's documented default ceiling. */
  private remaining = 5000;
  /** Unix epoch (ms) at which the current rate-limit window resets. */
  private resetAt = Date.now();

  /**
   * Update internal counters from the headers of a GitHub API response.
   *
   * Malformed header values are ignored: previously a non-numeric header
   * would store NaN, and `NaN <= 10` is false, silently disabling the
   * limiter. Guarding with Number.isNaN keeps the last known-good state.
   */
  updateFromHeaders(headers: Headers): void {
    const remaining = headers.get('X-RateLimit-Remaining');
    const reset = headers.get('X-RateLimit-Reset');
    if (remaining !== null) {
      const parsed = parseInt(remaining, 10);
      if (!Number.isNaN(parsed)) {
        this.remaining = parsed;
      }
    }
    if (reset !== null) {
      // GitHub returns a Unix epoch in seconds.
      const parsed = parseInt(reset, 10);
      if (!Number.isNaN(parsed)) {
        this.resetAt = parsed * 1000;
      }
    }
  }

  /**
   * If the remaining allowance is critically low (≤ 10), sleep until the
   * rate-limit window resets (plus a 1 s buffer).
   */
  async waitIfNeeded(): Promise<void> {
    if (this.remaining <= 10) {
      const waitMs = Math.max(0, this.resetAt - Date.now()) + 1000;
      await sleep(waitMs);
    }
  }

  /** Remaining requests in the current window (for testing). */
  get remainingRequests(): number {
    return this.remaining;
  }

  /** Reset timestamp as a Unix epoch in ms (for testing). */
  get resetTimestamp(): number {
    return this.resetAt;
  }
}
/**
 * Exponential-backoff retry wrapper for network-level errors.
 *
 * Retries up to `maxAttempts` times (default 3) with 1 s, 2 s, 4 s delays.
 *
 * @param fn - Async function to attempt.
 * @param maxAttempts - Maximum number of attempts (default 3); must be ≥ 1.
 * @param isRetryable - Optional predicate; when it returns false for a given
 *                      error the error is re-thrown immediately without further
 *                      retries. Defaults to retrying all errors.
 * @throws RangeError when `maxAttempts` is less than 1 (previously this
 *         rejected with `undefined`, which is hard to diagnose).
 */
export async function withRetry<T>(
  fn: () => Promise<T>,
  maxAttempts = 3,
  isRetryable: (err: unknown) => boolean = () => true
): Promise<T> {
  if (maxAttempts < 1) {
    throw new RangeError(`maxAttempts must be >= 1, got ${maxAttempts}`);
  }
  let lastError: unknown;
  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    try {
      return await fn();
    } catch (err) {
      if (!isRetryable(err)) throw err;
      lastError = err;
      // Back off 1 s, 2 s, 4 s, … but never sleep after the final attempt.
      if (attempt < maxAttempts - 1) {
        await sleep(1000 * 2 ** attempt);
      }
    }
  }
  throw lastError;
}
/**
 * Async semaphore — limits the number of concurrently executing promises.
 * Waiters are granted permits in FIFO order.
 */
export class Semaphore {
  /** Permits currently available. */
  private count: number;
  /** Resolvers for callers waiting on a permit, oldest first. */
  private readonly queue: Array<() => void> = [];

  constructor(concurrency: number) {
    this.count = concurrency;
  }

  /** Resolves once a permit is available; the caller must later release(). */
  async acquire(): Promise<void> {
    if (this.count <= 0) {
      // No permit free — park until release() hands one over.
      return new Promise((grant) => this.queue.push(grant));
    }
    this.count -= 1;
  }

  /** Hand the permit to the oldest waiter, or return it to the pool. */
  release(): void {
    const waiter = this.queue.shift();
    if (waiter === undefined) {
      this.count += 1;
    } else {
      waiter();
    }
  }

  /** Acquire, run `fn`, and always release — even if `fn` throws. */
  async run<T>(fn: () => Promise<T>): Promise<T> {
    await this.acquire();
    try {
      return await fn();
    } finally {
      this.release();
    }
  }
}

View File

@@ -0,0 +1,135 @@
/**
* Types for the GitHub repository crawler (TRUEREF-0003).
*/
import type { TrueRefConfig } from '$lib/types';
// Alias of the imported TrueRefConfig so crawler modules can reference the
// parsed trueref.json / context7.json configuration under one consistent name.
export type RepoConfig = TrueRefConfig;
// ---------------------------------------------------------------------------
// Core crawler data types
// ---------------------------------------------------------------------------
/** A single file downloaded (or read) by a crawler, ready for indexing. */
export interface CrawledFile {
  /** Relative path within the repo, e.g. "src/index.ts" */
  path: string;
  /** UTF-8 file content */
  content: string;
  /** File size in bytes */
  size: number;
  /**
   * Content checksum: the GitHub blob SHA for remote crawls, or a SHA-256
   * of the file content for local filesystem crawls.
   */
  sha: string;
  /** Programming language detected from the file extension */
  language: string;
}
/** Aggregate outcome of a single crawl run. */
export interface CrawlResult {
  /** Successfully downloaded files */
  files: CrawledFile[];
  /** Total files that matched filters */
  totalFiles: number;
  /** Files that were filtered out or too large */
  skippedFiles: number;
  /** Branch or tag that was crawled */
  branch: string;
  /**
   * HEAD commit SHA for GitHub crawls; local crawls instead store a
   * deterministic SHA-256 fingerprint built from the per-file SHAs.
   */
  commitSha: string;
}
/** Parameters controlling a GitHub repository crawl. */
export interface CrawlOptions {
  /** Repository owner (user or organization) */
  owner: string;
  /** Repository name */
  repo: string;
  /** Branch, tag, or commit SHA; defaults to repo default branch */
  ref?: string;
  /** GitHub PAT for private repos */
  token?: string;
  /** Parsed trueref.json / context7.json configuration */
  config?: RepoConfig;
  /** Progress callback invoked after each file is processed */
  onProgress?: (processed: number, total: number) => void;
}
// ---------------------------------------------------------------------------
// GitHub API response shapes (minimal — only fields we use)
// ---------------------------------------------------------------------------
/** Subset of the GET /repos/{owner}/{repo} response that we consume. */
export interface GitHubRepoResponse {
  /** Name of the repository's default branch, e.g. "main" */
  default_branch: string;
  /** Number of stars on the repository */
  stargazers_count: number;
}
/** One entry of a git tree listing. */
export interface GitHubTreeItem {
  /** Path relative to the repository root */
  path: string;
  /** "blob" for files, "tree" for directories */
  type: 'blob' | 'tree';
  /** Byte size — present for blobs, absent for trees */
  size?: number;
  /** Git object SHA of this entry */
  sha: string;
  /** API URL of the underlying git object */
  url: string;
}
/** Response shape of the GET /repos/{owner}/{repo}/git/trees/{sha} endpoint. */
export interface GitHubTreeResponse {
  /** Listing of tree entries */
  tree: GitHubTreeItem[];
  /** True when GitHub truncated the listing because the tree is too large */
  truncated: boolean;
}
/** Response shape for a single file from the GitHub contents/blobs API. */
export interface GitHubContentResponse {
  /** File content, encoded as described by `encoding` */
  content: string;
  /** Encoding of `content` — presumably "base64"; confirm at call sites */
  encoding: string;
  /** File size in bytes */
  size: number;
  /** Git blob SHA of the file */
  sha: string;
}
// ---------------------------------------------------------------------------
// Domain errors
// ---------------------------------------------------------------------------
/** The requested repository does not exist or is not visible to the caller. */
export class RepositoryNotFoundError extends Error {
  /** Stable machine-readable code for programmatic handling. */
  readonly code = 'REPOSITORY_NOT_FOUND';
  constructor(message: string) {
    super(message);
    this.name = 'RepositoryNotFoundError';
  }
}
/** Authentication failed — e.g. a missing or invalid token. */
export class AuthenticationError extends Error {
  /** Stable machine-readable code for programmatic handling. */
  readonly code = 'AUTHENTICATION_ERROR';
  constructor(message: string) {
    super(message);
    this.name = 'AuthenticationError';
  }
}
/** The caller is authenticated but lacks permission for the resource. */
export class PermissionError extends Error {
  /** Stable machine-readable code for programmatic handling. */
  readonly code = 'PERMISSION_ERROR';
  constructor(message: string) {
    super(message);
    this.name = 'PermissionError';
  }
}
/** The GitHub API rate limit has been exhausted. */
export class RateLimitError extends Error {
  /** Stable machine-readable code for programmatic handling. */
  readonly code = 'RATE_LIMIT_ERROR';
  constructor(
    message: string,
    // Epoch timestamp at which the rate-limit window resets — presumably ms,
    // matching GitHubRateLimiter.resetTimestamp; confirm at throw sites.
    public readonly resetAt: number
  ) {
    super(message);
    this.name = 'RateLimitError';
  }
}
/** The target path or URL does not point at a git repository. */
export class NotAGitRepositoryError extends Error {
  /** Stable machine-readable code for programmatic handling. */
  readonly code = 'NOT_A_GIT_REPOSITORY';
  constructor(message: string) {
    super(message);
    this.name = 'NotAGitRepositoryError';
  }
}
/** The requested ref (branch, tag, or commit SHA) could not be resolved. */
export class InvalidRefError extends Error {
  /** Stable machine-readable code for programmatic handling. */
  readonly code = 'INVALID_REF';
  constructor(message: string) {
    super(message);
    this.name = 'InvalidRefError';
  }
}