diff --git a/src/lib/server/crawler/file-filter.ts b/src/lib/server/crawler/file-filter.ts new file mode 100644 index 0000000..dd8e048 --- /dev/null +++ b/src/lib/server/crawler/file-filter.ts @@ -0,0 +1,183 @@ +/** + * File filtering logic for the GitHub crawler (TRUEREF-0003). + * + * Determines whether a file in the repository tree should be downloaded + * and indexed based on its extension, size, and the trueref.json config. + */ + +import { extname, basename } from 'node:path'; +import type { RepoConfig } from './types.js'; + +// --------------------------------------------------------------------------- +// Constants +// --------------------------------------------------------------------------- + +/** File extensions that the indexer can meaningfully process. */ +export const INDEXABLE_EXTENSIONS = new Set([ + // Documentation + '.md', + '.mdx', + '.txt', + '.rst', + // Code + '.ts', + '.tsx', + '.js', + '.jsx', + '.py', + '.rb', + '.go', + '.rs', + '.java', + '.cs', + '.cpp', + '.c', + '.h', + '.swift', + '.kt', + '.php', + '.scala', + '.clj', + '.ex', + '.exs', + '.sh', + '.bash', + '.zsh', + '.fish', + // Config / data + '.json', + '.yaml', + '.yml', + '.toml', + // Web + '.html', + '.css', + '.svelte', + '.vue' +]); + +/** Maximum file size we are willing to download (500 KB). */ +export const MAX_FILE_SIZE_BYTES = 500_000; + +/** + * Default path prefixes that are always excluded regardless of config. + * These directories contain generated or dependency files that should never + * be indexed. 
+ */ +const DEFAULT_EXCLUDES: string[] = [ + 'node_modules/', + '.git/', + 'dist/', + 'build/', + 'coverage/', + '.next/', + '__pycache__/', + 'vendor/', + 'target/', + '.cache/' +]; + +// --------------------------------------------------------------------------- +// Language detection +// --------------------------------------------------------------------------- + +const EXTENSION_TO_LANGUAGE: Record = { + '.ts': 'typescript', + '.tsx': 'typescript', + '.js': 'javascript', + '.jsx': 'javascript', + '.py': 'python', + '.rb': 'ruby', + '.go': 'go', + '.rs': 'rust', + '.java': 'java', + '.cs': 'csharp', + '.cpp': 'cpp', + '.c': 'c', + '.h': 'c', + '.swift': 'swift', + '.kt': 'kotlin', + '.php': 'php', + '.scala': 'scala', + '.clj': 'clojure', + '.ex': 'elixir', + '.exs': 'elixir', + '.sh': 'shell', + '.bash': 'shell', + '.zsh': 'shell', + '.fish': 'shell', + '.json': 'json', + '.yaml': 'yaml', + '.yml': 'yaml', + '.toml': 'toml', + '.html': 'html', + '.css': 'css', + '.svelte': 'svelte', + '.vue': 'vue', + '.md': 'markdown', + '.mdx': 'markdown', + '.txt': 'text', + '.rst': 'rst' +}; + +/** + * Detect a human-readable language name from a file extension. + * Returns an empty string when the extension is unknown. + */ +export function detectLanguage(filePath: string): string { + const ext = extname(filePath).toLowerCase(); + return EXTENSION_TO_LANGUAGE[ext] ?? ''; +} + +// --------------------------------------------------------------------------- +// Filter predicate +// --------------------------------------------------------------------------- + +/** + * Decide whether a file from the repository tree should be downloaded. + * + * Rules (applied in order): + * 1. Must have an indexable extension. + * 2. Must not exceed the size limit. + * 3. Must not match config.excludeFiles (exact basename match). + * 4. Must not be under a config.excludeFolders path / regex. + * 5. Must be under a config.folders allowlist path / regex (if specified). + * 6. 
Must not start with a default-excluded prefix. + */ +export function shouldIndexFile( + filePath: string, + fileSize: number, + config?: RepoConfig +): boolean { + const ext = extname(filePath).toLowerCase(); + + // 1. Extension allow-list + if (!INDEXABLE_EXTENSIONS.has(ext)) return false; + + // 2. Size limit + if (fileSize > MAX_FILE_SIZE_BYTES) return false; + + // 3. Config excludeFiles — exact basename match + if (config?.excludeFiles?.includes(basename(filePath))) return false; + + // 4. Config excludeFolders — prefix or regex match + if ( + config?.excludeFolders?.some( + (folder) => filePath.startsWith(folder) || new RegExp(folder).test(filePath) + ) + ) + return false; + + // 5. Config folders allowlist — if provided, the file must match at least one + if (config?.folders?.length) { + const inAllowedFolder = config.folders.some( + (folder) => filePath.startsWith(folder) || new RegExp(folder).test(filePath) + ); + if (!inAllowedFolder) return false; + } + + // 6. Default excludes + if (DEFAULT_EXCLUDES.some((ex) => filePath.startsWith(ex))) return false; + + return true; +} diff --git a/src/lib/server/crawler/github.crawler.test.ts b/src/lib/server/crawler/github.crawler.test.ts new file mode 100644 index 0000000..33f0105 --- /dev/null +++ b/src/lib/server/crawler/github.crawler.test.ts @@ -0,0 +1,561 @@ +/** + * Unit tests for the GitHub repository crawler (TRUEREF-0003). + * + * All GitHub API calls are intercepted via vi.stubGlobal('fetch', ...) so + * that no real network traffic is produced. 
+ */ + +import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest'; + +import { crawl } from './github.crawler.js'; +import { shouldIndexFile, detectLanguage, INDEXABLE_EXTENSIONS, MAX_FILE_SIZE_BYTES } from './file-filter.js'; +import { GitHubRateLimiter, Semaphore, withRetry } from './rate-limiter.js'; +import { + AuthenticationError, + PermissionError, + RepositoryNotFoundError +} from './types.js'; + +// --------------------------------------------------------------------------- +// Mock fetch helpers +// --------------------------------------------------------------------------- + +type FetchHandler = (url: string, init?: RequestInit) => Response; + +function stubFetch(handler: FetchHandler) { + vi.stubGlobal( + 'fetch', + vi.fn((url: string, init?: RequestInit) => Promise.resolve(handler(url, init))) + ); +} + +function jsonResponse(body: unknown, status = 200, headers: Record = {}): Response { + return new Response(JSON.stringify(body), { + status, + headers: { + 'Content-Type': 'application/json', + 'X-RateLimit-Remaining': '4999', + 'X-RateLimit-Reset': String(Math.floor(Date.now() / 1000) + 3600), + ...headers + } + }); +} + +function textResponse(body: string, status = 200, headers: Record = {}): Response { + return new Response(body, { + status, + headers: { + 'Content-Type': 'text/plain', + 'X-RateLimit-Remaining': '4999', + 'X-RateLimit-Reset': String(Math.floor(Date.now() / 1000) + 3600), + ...headers + } + }); +} + +// --------------------------------------------------------------------------- +// Fixtures +// --------------------------------------------------------------------------- + +const REPO_INFO = { + default_branch: 'main', + stargazers_count: 42 +}; + +const TREE_RESPONSE = { + tree: [ + { path: 'README.md', type: 'blob', size: 1024, sha: 'sha-readme', url: '' }, + { path: 'src/index.ts', type: 'blob', size: 512, sha: 'sha-index', url: '' }, + { path: 'src/utils.ts', type: 'blob', size: 256, sha: 'sha-utils', url: '' }, + { 
path: 'package.json', type: 'blob', size: 128, sha: 'sha-pkg', url: '' }, + { path: 'dist/bundle.js', type: 'blob', size: 9999, sha: 'sha-dist', url: '' }, // excluded by default + { path: 'node_modules/lodash/index.js', type: 'blob', size: 100, sha: 'sha-nm', url: '' }, // excluded + { path: 'image.png', type: 'blob', size: 4096, sha: 'sha-img', url: '' }, // non-indexable + { path: 'src', type: 'tree', size: 0, sha: 'sha-src-tree', url: '' } + ], + truncated: false +}; + +const COMMIT_SHA = 'deadbeef1234567890abcdef'; + +// --------------------------------------------------------------------------- +// shouldIndexFile unit tests +// --------------------------------------------------------------------------- + +describe('shouldIndexFile()', () => { + it('returns true for a .ts file within size limit', () => { + expect(shouldIndexFile('src/index.ts', 1000)).toBe(true); + }); + + it('returns false for a .png file (non-indexable extension)', () => { + expect(shouldIndexFile('assets/logo.png', 100)).toBe(false); + }); + + it('returns false when file exceeds MAX_FILE_SIZE_BYTES', () => { + expect(shouldIndexFile('big.ts', MAX_FILE_SIZE_BYTES + 1)).toBe(false); + }); + + it('returns false for a file in node_modules/', () => { + expect(shouldIndexFile('node_modules/lodash/index.js', 100)).toBe(false); + }); + + it('returns false for a file in dist/', () => { + expect(shouldIndexFile('dist/bundle.js', 100)).toBe(false); + }); + + it('respects config.excludeFiles (exact basename)', () => { + expect(shouldIndexFile('src/secret.ts', 100, { excludeFiles: ['secret.ts'] })).toBe(false); + }); + + it('does not exclude a file whose basename merely contains the excluded name', () => { + expect(shouldIndexFile('src/not-secret.ts', 100, { excludeFiles: ['secret.ts'] })).toBe(true); + }); + + it('respects config.excludeFolders prefix', () => { + expect(shouldIndexFile('internal/config.ts', 100, { excludeFolders: ['internal/'] })).toBe(false); + }); + + it('allows files outside of 
config.excludeFolders', () => { + expect(shouldIndexFile('public/api.ts', 100, { excludeFolders: ['internal/'] })).toBe(true); + }); + + it('restricts to config.folders allowlist when specified', () => { + const config = { folders: ['docs/'] }; + expect(shouldIndexFile('src/index.ts', 100, config)).toBe(false); + expect(shouldIndexFile('docs/guide.md', 100, config)).toBe(true); + }); + + it('returns true when config.folders is an empty array (no restriction)', () => { + expect(shouldIndexFile('src/index.ts', 100, { folders: [] })).toBe(true); + }); + + it('handles all default-excluded directories', () => { + const excluded = [ + 'node_modules/pkg/index.js', + '.git/config', + 'dist/out.js', + 'build/app.js', + 'coverage/lcov.info', + '.next/server.js', + '__pycache__/mod.py', + 'vendor/lib.go', + 'target/release.rs', + '.cache/file.ts' + ]; + for (const path of excluded) { + expect(shouldIndexFile(path, 100), `should exclude ${path}`).toBe(false); + } + }); + + it('INDEXABLE_EXTENSIONS covers all expected types', () => { + const required = ['.md', '.ts', '.py', '.go', '.rs', '.json', '.svelte']; + for (const ext of required) { + expect(INDEXABLE_EXTENSIONS.has(ext), `missing extension ${ext}`).toBe(true); + } + }); +}); + +// --------------------------------------------------------------------------- +// detectLanguage unit tests +// --------------------------------------------------------------------------- + +describe('detectLanguage()', () => { + it('detects typescript', () => expect(detectLanguage('foo.ts')).toBe('typescript')); + it('detects tsx as typescript', () => expect(detectLanguage('foo.tsx')).toBe('typescript')); + it('detects javascript', () => expect(detectLanguage('foo.js')).toBe('javascript')); + it('detects python', () => expect(detectLanguage('foo.py')).toBe('python')); + it('detects go', () => expect(detectLanguage('foo.go')).toBe('go')); + it('detects rust', () => expect(detectLanguage('foo.rs')).toBe('rust')); + it('detects markdown', () => 
expect(detectLanguage('README.md')).toBe('markdown')); + it('detects svelte', () => expect(detectLanguage('App.svelte')).toBe('svelte')); + it('detects yaml', () => expect(detectLanguage('config.yaml')).toBe('yaml')); + it('returns empty string for unknown extension', () => expect(detectLanguage('file.xyz')).toBe('')); + it('is case-insensitive for extensions', () => expect(detectLanguage('FILE.TS')).toBe('typescript')); +}); + +// --------------------------------------------------------------------------- +// GitHubRateLimiter unit tests +// --------------------------------------------------------------------------- + +describe('GitHubRateLimiter', () => { + it('defaults to 5000 remaining requests', () => { + const limiter = new GitHubRateLimiter(); + expect(limiter.remainingRequests).toBe(5000); + }); + + it('updates remaining and resetAt from headers', () => { + const limiter = new GitHubRateLimiter(); + const resetEpoch = Math.floor(Date.now() / 1000) + 3600; + const headers = new Headers({ + 'X-RateLimit-Remaining': '42', + 'X-RateLimit-Reset': String(resetEpoch) + }); + limiter.updateFromHeaders(headers); + expect(limiter.remainingRequests).toBe(42); + expect(limiter.resetTimestamp).toBe(resetEpoch * 1000); + }); + + it('does not mutate state when headers are absent', () => { + const limiter = new GitHubRateLimiter(); + limiter.updateFromHeaders(new Headers()); + expect(limiter.remainingRequests).toBe(5000); + }); + + it('waitIfNeeded resolves immediately when remaining > 10', async () => { + const limiter = new GitHubRateLimiter(); + const start = Date.now(); + await limiter.waitIfNeeded(); + expect(Date.now() - start).toBeLessThan(100); + }); +}); + +// --------------------------------------------------------------------------- +// Semaphore unit tests +// --------------------------------------------------------------------------- + +describe('Semaphore', () => { + it('allows up to concurrency tasks to run simultaneously', async () => { + const sem = new 
Semaphore(2); + let active = 0; + let maxActive = 0; + + const task = () => + sem.run(async () => { + active++; + maxActive = Math.max(maxActive, active); + await new Promise((r) => setTimeout(r, 10)); + active--; + }); + + await Promise.all([task(), task(), task(), task()]); + expect(maxActive).toBeLessThanOrEqual(2); + }); + + it('resolves all tasks even when queued', async () => { + const sem = new Semaphore(1); + const results: number[] = []; + await Promise.all( + [1, 2, 3].map((n) => + sem.run(async () => { + results.push(n); + }) + ) + ); + expect(results).toHaveLength(3); + }); +}); + +// --------------------------------------------------------------------------- +// withRetry unit tests +// --------------------------------------------------------------------------- + +describe('withRetry()', () => { + it('returns the result on first success', async () => { + const result = await withRetry(() => Promise.resolve(42)); + expect(result).toBe(42); + }); + + it('retries on failure and returns eventual success', async () => { + let calls = 0; + const result = await withRetry(async () => { + calls++; + if (calls < 3) throw new Error('transient'); + return 'ok'; + }, 3); + expect(result).toBe('ok'); + expect(calls).toBe(3); + }); + + it('throws after exhausting all attempts', async () => { + await expect( + withRetry(() => Promise.reject(new Error('always fails')), 3) + ).rejects.toThrow('always fails'); + }); +}); + +// --------------------------------------------------------------------------- +// crawl() integration tests (fetch mocked) +// --------------------------------------------------------------------------- + +describe('crawl()', () => { + beforeEach(() => { + vi.useFakeTimers(); + }); + + afterEach(() => { + vi.restoreAllMocks(); + vi.useRealTimers(); + }); + + function setupDefaultMocks(overrides: Partial> = {}) { + stubFetch((url) => { + // Repo info + if (url === 'https://api.github.com/repos/owner/repo') { + return overrides[url] ?? 
jsonResponse(REPO_INFO); + } + // Commit SHA + if (url === 'https://api.github.com/repos/owner/repo/commits/main') { + return overrides[url] ?? textResponse(COMMIT_SHA); + } + // File tree + if (url.startsWith('https://api.github.com/repos/owner/repo/git/trees/main')) { + return overrides[url] ?? jsonResponse(TREE_RESPONSE); + } + // Raw content (raw.githubusercontent.com) + if (url.startsWith('https://raw.githubusercontent.com/')) { + const filePath = url.split('/').slice(6).join('/'); + return overrides[url] ?? textResponse(`// content of ${filePath}`); + } + return new Response('not found', { status: 404 }); + }); + } + + it('returns files that pass the filter', async () => { + setupDefaultMocks(); + + const result = await crawl({ owner: 'owner', repo: 'repo' }); + + // dist/ and node_modules/ should be excluded; .png should be excluded. + // Expected: README.md, src/index.ts, src/utils.ts, package.json + expect(result.files.length).toBeGreaterThanOrEqual(4); + expect(result.files.every((f) => !f.path.startsWith('dist/'))).toBe(true); + expect(result.files.every((f) => !f.path.startsWith('node_modules/'))).toBe(true); + expect(result.files.every((f) => !f.path.endsWith('.png'))).toBe(true); + }); + + it('sets branch to the default_branch when no ref is given', async () => { + setupDefaultMocks(); + const result = await crawl({ owner: 'owner', repo: 'repo' }); + expect(result.branch).toBe('main'); + }); + + it('uses the provided ref when specified', async () => { + stubFetch((url) => { + if (url === 'https://api.github.com/repos/owner/repo') { + return jsonResponse(REPO_INFO); + } + if (url.includes('/git/trees/v2.0.0')) { + return jsonResponse({ tree: [], truncated: false }); + } + if (url.includes('/commits/v2.0.0')) { + return textResponse('tagsha'); + } + return textResponse('content'); + }); + + const result = await crawl({ owner: 'owner', repo: 'repo', ref: 'v2.0.0' }); + expect(result.branch).toBe('v2.0.0'); + }); + + it('populates commitSha from the 
commits endpoint', async () => { + setupDefaultMocks(); + const result = await crawl({ owner: 'owner', repo: 'repo' }); + expect(result.commitSha).toBe(COMMIT_SHA); + }); + + it('sets correct sha on each CrawledFile from the tree', async () => { + setupDefaultMocks(); + const result = await crawl({ owner: 'owner', repo: 'repo' }); + const readme = result.files.find((f) => f.path === 'README.md'); + expect(readme).toBeDefined(); + expect(readme!.sha).toBe('sha-readme'); + }); + + it('attaches language to each CrawledFile', async () => { + setupDefaultMocks(); + const result = await crawl({ owner: 'owner', repo: 'repo' }); + const indexTs = result.files.find((f) => f.path === 'src/index.ts'); + expect(indexTs?.language).toBe('typescript'); + const readme = result.files.find((f) => f.path === 'README.md'); + expect(readme?.language).toBe('markdown'); + }); + + it('reports progress via onProgress callback', async () => { + setupDefaultMocks(); + const calls: Array<[number, number]> = []; + await crawl({ + owner: 'owner', + repo: 'repo', + onProgress: (p, t) => calls.push([p, t]) + }); + expect(calls.length).toBeGreaterThan(0); + // Total must remain constant across all calls. + const totals = calls.map(([, t]) => t); + expect(totals.every((t) => t === totals[0])).toBe(true); + }); + + it('skips files that fail to download without throwing', async () => { + stubFetch((url) => { + if (url === 'https://api.github.com/repos/owner/repo') { + return jsonResponse(REPO_INFO); + } + if (url.includes('/git/trees/main')) { + return jsonResponse({ + tree: [{ path: 'src/index.ts', type: 'blob', size: 100, sha: 'sha1', url: '' }], + truncated: false + }); + } + if (url.includes('/commits/main')) { + return textResponse(COMMIT_SHA); + } + // All content downloads fail. + return new Response('error', { status: 500 }); + }); + + // Should not throw; just return zero files. 
+ const result = await crawl({ owner: 'owner', repo: 'repo' }); + expect(result.files).toHaveLength(0); + expect(result.totalFiles).toBe(1); + }); + + it('throws RepositoryNotFoundError on 404', async () => { + stubFetch((url) => { + if (url === 'https://api.github.com/repos/owner/missing') { + return jsonResponse({ message: 'Not Found' }, 404, { + 'X-RateLimit-Remaining': '4999', + 'X-RateLimit-Reset': String(Math.floor(Date.now() / 1000) + 3600) + }); + } + return new Response('not found', { status: 404 }); + }); + + await expect(crawl({ owner: 'owner', repo: 'missing' })).rejects.toThrow( + RepositoryNotFoundError + ); + }); + + it('throws AuthenticationError on 401', async () => { + stubFetch(() => + new Response('Unauthorized', { + status: 401, + headers: { + 'X-RateLimit-Remaining': '0', + 'X-RateLimit-Reset': String(Math.floor(Date.now() / 1000) + 3600) + } + }) + ); + + await expect(crawl({ owner: 'owner', repo: 'repo', token: 'bad-token' })).rejects.toThrow( + AuthenticationError + ); + }); + + it('throws PermissionError on 403 without rate-limit exhaustion', async () => { + stubFetch(() => + new Response('Forbidden', { + status: 403, + headers: { + 'X-RateLimit-Remaining': '100', + 'X-RateLimit-Reset': String(Math.floor(Date.now() / 1000) + 3600) + } + }) + ); + + await expect(crawl({ owner: 'owner', repo: 'repo' })).rejects.toThrow(PermissionError); + }); + + it('respects config.folders allowlist when provided', async () => { + setupDefaultMocks(); + const result = await crawl({ + owner: 'owner', + repo: 'repo', + config: { folders: ['src/'] } + }); + // Only src/ files should be present. 
+ expect(result.files.every((f) => f.path.startsWith('src/'))).toBe(true); + }); + + it('applies config.excludeFiles filter', async () => { + setupDefaultMocks(); + const result = await crawl({ + owner: 'owner', + repo: 'repo', + config: { excludeFiles: ['package.json'] } + }); + expect(result.files.some((f) => f.path === 'package.json')).toBe(false); + }); + + it('returns correct skippedFiles count', async () => { + setupDefaultMocks(); + const result = await crawl({ owner: 'owner', repo: 'repo' }); + // dist/, node_modules/, and .png are the excluded items = 3 + expect(result.skippedFiles).toBe(3); + }); + + it('uses auth token in requests to GitHub API', async () => { + const capturedHeaders: Record[] = []; + + stubFetch((url, init) => { + const headers = Object.fromEntries( + Object.entries((init?.headers as Record) ?? {}) + ); + capturedHeaders.push(headers); + + if (url === 'https://api.github.com/repos/owner/repo') { + return jsonResponse(REPO_INFO); + } + if (url.includes('/git/trees/main')) { + return jsonResponse({ tree: [], truncated: false }); + } + if (url.includes('/commits/main')) { + return textResponse(COMMIT_SHA); + } + return textResponse('content'); + }); + + await crawl({ owner: 'owner', repo: 'repo', token: 'ghp_mysecrettoken' }); + + const apiCalls = capturedHeaders.filter((h) => h.Authorization); + expect(apiCalls.length).toBeGreaterThan(0); + expect(apiCalls[0].Authorization).toBe('Bearer ghp_mysecrettoken'); + }); + + it('handles a tree with zero indexable files gracefully', async () => { + stubFetch((url) => { + if (url === 'https://api.github.com/repos/owner/repo') return jsonResponse(REPO_INFO); + if (url.includes('/git/trees/main')) + return jsonResponse({ + tree: [ + { path: 'image.png', type: 'blob', size: 100, sha: 'sha1', url: '' }, + { path: 'video.mp4', type: 'blob', size: 1000, sha: 'sha2', url: '' } + ], + truncated: false + }); + if (url.includes('/commits/main')) return textResponse(COMMIT_SHA); + return 
textResponse('content'); + }); + + const result = await crawl({ owner: 'owner', repo: 'repo' }); + expect(result.files).toHaveLength(0); + expect(result.totalFiles).toBe(0); + expect(result.skippedFiles).toBe(2); + }); + + it('reads and applies config from trueref.json found in the tree', async () => { + const truerefConfig = { excludeFiles: ['package.json'] }; + + stubFetch((url) => { + if (url === 'https://api.github.com/repos/owner/repo') return jsonResponse(REPO_INFO); + if (url.includes('/git/trees/main')) { + return jsonResponse({ + tree: [ + { path: 'trueref.json', type: 'blob', size: 50, sha: 'sha-cfg', url: '' }, + { path: 'src/index.ts', type: 'blob', size: 200, sha: 'sha-idx', url: '' }, + { path: 'package.json', type: 'blob', size: 100, sha: 'sha-pkg', url: '' } + ], + truncated: false + }); + } + if (url.includes('/commits/main')) return textResponse(COMMIT_SHA); + if (url.includes('trueref.json')) return textResponse(JSON.stringify(truerefConfig)); + if (url.includes('src/index.ts')) return textResponse('export const x = 1;'); + if (url.includes('package.json')) return textResponse('{"name":"test"}'); + return textResponse('content'); + }); + + // No caller-supplied config — crawler should auto-detect trueref.json. + const result = await crawl({ owner: 'owner', repo: 'repo' }); + expect(result.files.some((f) => f.path === 'package.json')).toBe(false); + expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true); + }); +}); diff --git a/src/lib/server/crawler/github.crawler.ts b/src/lib/server/crawler/github.crawler.ts new file mode 100644 index 0000000..72dc957 --- /dev/null +++ b/src/lib/server/crawler/github.crawler.ts @@ -0,0 +1,477 @@ +/** + * GitHub Repository Crawler (TRUEREF-0003). + * + * Fetches repository file trees via the GitHub Trees API and downloads file + * contents in parallel while respecting rate limits and applying + * include/exclude filtering rules from trueref.json. 
+ * + * Download strategy: + * - Uses raw.githubusercontent.com for file content — faster and counts less + * against the REST API rate limit. + * - Falls back to the GitHub Contents API if raw download fails. + * + * Error handling: + * - 404 → RepositoryNotFoundError + * - 401 → AuthenticationError + * - 403 → waits for rate-limit reset if X-RateLimit-Remaining is 0; else PermissionError + * - 422 → tree too large; switches to directory-by-directory traversal (depth pagination) + * - Network errors → retried up to 3 times with exponential backoff + * - Bad base64 content → file skipped with a console warning + */ + +import { shouldIndexFile, detectLanguage } from './file-filter.js'; +import { GitHubRateLimiter, Semaphore, withRetry } from './rate-limiter.js'; +import { + AuthenticationError, + PermissionError, + RateLimitError, + RepositoryNotFoundError +} from './types.js'; + +// Domain errors should not be retried — they are permanent HTTP status codes. +function isDomainError(err: unknown): boolean { + return ( + err instanceof RepositoryNotFoundError || + err instanceof AuthenticationError || + err instanceof PermissionError || + err instanceof RateLimitError + ); +} + +function isRetryable(err: unknown): boolean { + return !isDomainError(err); +} +import type { + CrawlOptions, + CrawlResult, + CrawledFile, + GitHubContentResponse, + GitHubRepoResponse, + GitHubTreeItem, + GitHubTreeResponse +} from './types.js'; + +// --------------------------------------------------------------------------- +// Internal constants +// --------------------------------------------------------------------------- + +const GITHUB_API = 'https://api.github.com'; +const RAW_CONTENT = 'https://raw.githubusercontent.com'; + +/** Maximum parallel file downloads. */ +const DOWNLOAD_CONCURRENCY = 10; + +/** Config file names that should be fetched first so their filtering rules + * apply to all subsequent downloads. 
*/ +const CONFIG_FILE_NAMES = new Set(['trueref.json', 'context7.json']); + +// --------------------------------------------------------------------------- +// Internal helpers +// --------------------------------------------------------------------------- + +/** + * Build standard GitHub API request headers. + */ +function buildHeaders(token?: string): Record { + const headers: Record = { + Accept: 'application/vnd.github+json', + 'X-GitHub-Api-Version': '2022-11-28' + }; + if (token) { + headers.Authorization = `Bearer ${token}`; + } + return headers; +} + +/** + * Throw a domain error for non-2xx GitHub API responses. + * Mutates the rate limiter with header data before throwing. + */ +async function throwForStatus(response: Response, rateLimiter: GitHubRateLimiter): Promise { + if (response.ok) return; + + rateLimiter.updateFromHeaders(response.headers); + + switch (response.status) { + case 401: + throw new AuthenticationError('GitHub authentication failed — check your PAT.'); + case 403: { + const remaining = response.headers.get('X-RateLimit-Remaining'); + if (remaining === '0') { + const reset = parseInt(response.headers.get('X-RateLimit-Reset') ?? '0', 10) * 1000; + throw new RateLimitError('GitHub rate limit exceeded.', reset); + } + throw new PermissionError( + 'GitHub returned 403 Forbidden — insufficient permissions for this resource.' + ); + } + case 404: + throw new RepositoryNotFoundError( + `Repository not found or not accessible: ${response.url}` + ); + default: { + const body = await response.text().catch(() => ''); + throw new Error(`GitHub API error ${response.status}: ${body}`); + } + } +} + +// --------------------------------------------------------------------------- +// GitHub API calls +// --------------------------------------------------------------------------- + +/** + * Fetch repository metadata (default branch, stars, etc.). 
+ */ +async function fetchRepoInfo( + owner: string, + repo: string, + token: string | undefined, + rateLimiter: GitHubRateLimiter +): Promise { + return withRetry(async () => { + await rateLimiter.waitIfNeeded(); + + const response = await fetch(`${GITHUB_API}/repos/${owner}/${repo}`, { + headers: buildHeaders(token) + }); + + rateLimiter.updateFromHeaders(response.headers); + await throwForStatus(response, rateLimiter); + + return (await response.json()) as GitHubRepoResponse; + }, 3, isRetryable); +} + +/** + * Fetch the recursive file tree for a given ref. + * Returns null when the tree is truncated (>100k items), signalling that we + * should fall back to directory-level traversal. + */ +async function fetchTree( + owner: string, + repo: string, + ref: string, + token: string | undefined, + rateLimiter: GitHubRateLimiter +): Promise { + return withRetry(async () => { + await rateLimiter.waitIfNeeded(); + + const url = `${GITHUB_API}/repos/${owner}/${repo}/git/trees/${ref}?recursive=1`; + const response = await fetch(url, { headers: buildHeaders(token) }); + + rateLimiter.updateFromHeaders(response.headers); + + // 422 means the tree is too large for a single recursive call. + if (response.status === 422) return null; + + await throwForStatus(response, rateLimiter); + + return (await response.json()) as GitHubTreeResponse; + }, 3, isRetryable); +} + +/** + * Fetch a subtree (non-recursive) for a single directory path. + * Used when the full recursive tree is truncated. 
+ */ +async function fetchSubTree( + owner: string, + repo: string, + ref: string, + treeSha: string, + token: string | undefined, + rateLimiter: GitHubRateLimiter +): Promise { + return withRetry(async () => { + await rateLimiter.waitIfNeeded(); + + const url = `${GITHUB_API}/repos/${owner}/${repo}/git/trees/${treeSha}`; + const response = await fetch(url, { headers: buildHeaders(token) }); + + rateLimiter.updateFromHeaders(response.headers); + await throwForStatus(response, rateLimiter); + + return (await response.json()) as GitHubTreeResponse; + }, 3, isRetryable); +} + +/** + * Resolve the HEAD commit SHA from a branch/tag ref by fetching the + * commit object at the ref tip. + */ +async function fetchCommitSha( + owner: string, + repo: string, + ref: string, + token: string | undefined, + rateLimiter: GitHubRateLimiter +): Promise { + return withRetry(async () => { + await rateLimiter.waitIfNeeded(); + + const url = `${GITHUB_API}/repos/${owner}/${repo}/commits/${ref}`; + const response = await fetch(url, { + headers: { ...buildHeaders(token), Accept: 'application/vnd.github.sha' } + }); + + rateLimiter.updateFromHeaders(response.headers); + await throwForStatus(response, rateLimiter); + + // When Accept is 'application/vnd.github.sha', the response body is the + // bare SHA string. + return (await response.text()).trim(); + }, 3, isRetryable); +} + +/** + * Download raw file content via raw.githubusercontent.com. + * Returns null on any failure (the caller will skip or fall back). 
+ */ +async function downloadRawFile( + owner: string, + repo: string, + ref: string, + filePath: string, + token: string | undefined +): Promise { + try { + const url = `${RAW_CONTENT}/${owner}/${repo}/${ref}/${filePath}`; + const headers: Record = {}; + if (token) headers.Authorization = `Bearer ${token}`; + + const response = await fetch(url, { headers }); + if (!response.ok) return null; + + return await response.text(); + } catch { + return null; + } +} + +/** + * Download file content via the GitHub Contents API (fallback). + */ +async function downloadViaContentsApi( + owner: string, + repo: string, + ref: string, + filePath: string, + token: string | undefined, + rateLimiter: GitHubRateLimiter +): Promise { + try { + return await withRetry(async () => { + await rateLimiter.waitIfNeeded(); + + const url = `${GITHUB_API}/repos/${owner}/${repo}/contents/${filePath}?ref=${ref}`; + const response = await fetch(url, { headers: buildHeaders(token) }); + + rateLimiter.updateFromHeaders(response.headers); + if (!response.ok) return null; + + const data = (await response.json()) as GitHubContentResponse; + if (data.encoding !== 'base64') return null; + + // Node.js Buffer handles both padded and unpadded base64. + return Buffer.from(data.content.replace(/\n/g, ''), 'base64').toString('utf-8'); + }); + } catch { + return null; + } +} + +// --------------------------------------------------------------------------- +// Directory-level traversal (fallback for truncated trees) +// --------------------------------------------------------------------------- + +/** + * Recursively collect all blob items from sub-trees when the top-level + * recursive tree is truncated (>100k items). 
 */
async function collectBlobsFromSubTrees(
  owner: string,
  repo: string,
  ref: string,
  token: string | undefined,
  rateLimiter: GitHubRateLimiter
): Promise<GitHubTreeItem[]> {
  const allBlobs: GitHubTreeItem[] = [];
  // BFS queue of pending trees. Seeded with the ref name itself —
  // NOTE(review): this relies on the git/trees endpoint accepting a ref name
  // in the tree_sha position (fetchSubTree interpolates it verbatim); confirm
  // against the GitHub API docs for the supported API version.
  const queue: Array<{ sha: string; prefix: string }> = [{ sha: ref, prefix: '' }];

  while (queue.length > 0) {
    // Drain up to DOWNLOAD_CONCURRENCY trees per round and fetch them in parallel.
    const batch = queue.splice(0, DOWNLOAD_CONCURRENCY);

    await Promise.all(
      batch.map(async ({ sha, prefix }) => {
        // Failed sub-tree fetches are skipped silently (best-effort traversal).
        const subTree = await fetchSubTree(owner, repo, ref, sha, token, rateLimiter).catch(
          () => null
        );
        if (!subTree) return;

        for (const item of subTree.tree) {
          // Sub-tree item paths are relative to the sub-tree; re-root them.
          const fullPath = prefix ? `${prefix}/${item.path}` : item.path;
          if (item.type === 'blob') {
            allBlobs.push({ ...item, path: fullPath });
          } else if (item.type === 'tree') {
            queue.push({ sha: item.sha, prefix: fullPath });
          }
        }
      })
    );
  }

  return allBlobs;
}

// ---------------------------------------------------------------------------
// Config file detection
// ---------------------------------------------------------------------------

/**
 * Try to download and parse a trueref.json / context7.json config from the
 * repository root. Returns undefined if not found or unparseable.
 */
async function fetchRepoConfig(
  owner: string,
  repo: string,
  ref: string,
  token: string | undefined,
  blobs: GitHubTreeItem[],
  rateLimiter: GitHubRateLimiter
): Promise<CrawlOptions['config']> {
  // Look for config files only at the repo root (no directory prefix).
  const configItem = blobs.find((b) => CONFIG_FILE_NAMES.has(b.path));
  if (!configItem) return undefined;

  // Raw download first (cheaper on the API rate limit), Contents API second.
  const content =
    (await downloadRawFile(owner, repo, ref, configItem.path, token)) ??
    (await downloadViaContentsApi(
      owner,
      repo,
      ref,
      configItem.path,
      token,
      rateLimiter
    ));

  if (!content) return undefined;

  try {
    return JSON.parse(content) as CrawlOptions['config'];
  } catch {
    // Malformed config is ignored rather than failing the whole crawl.
    console.warn(`[GitHubCrawler] Failed to parse config file: ${configItem.path}`);
    return undefined;
  }
}

// ---------------------------------------------------------------------------
// Public crawl() function
// ---------------------------------------------------------------------------

/**
 * Crawl a GitHub repository and return structured file objects.
 *
 * @param options - Repository coordinates, auth token, filter config, and
 *                  optional progress callback.
 * @returns CrawlResult with all downloaded files and summary statistics.
 */
export async function crawl(options: CrawlOptions): Promise<CrawlResult> {
  const { owner, repo, token, onProgress } = options;
  const rateLimiter = new GitHubRateLimiter();
  const semaphore = new Semaphore(DOWNLOAD_CONCURRENCY);

  // ---- Step 1: Resolve the ref (default branch if not provided) ----------
  let ref = options.ref;
  let commitSha = '';

  const repoInfo = await fetchRepoInfo(owner, repo, token, rateLimiter);
  if (!ref) {
    ref = repoInfo.default_branch;
  }

  // ---- Step 2: Fetch the file tree ---------------------------------------
  let blobs: GitHubTreeItem[];

  const treeResponse = await fetchTree(owner, repo, ref, token, rateLimiter);

  if (treeResponse === null) {
    // Tree truncated — fall back to directory-by-directory traversal.
    console.warn(
      `[GitHubCrawler] Tree for ${owner}/${repo}@${ref} is truncated; using sub-tree traversal.`
    );
    blobs = await collectBlobsFromSubTrees(owner, repo, ref, token, rateLimiter);
  } else {
    blobs = treeResponse.tree.filter((item) => item.type === 'blob');
  }

  // Resolve HEAD commit SHA (best-effort; empty string on failure).
  commitSha = await fetchCommitSha(owner, repo, ref, token, rateLimiter).catch(() => '');

  // ---- Step 3: Detect and download config file first ---------------------
  // Caller-supplied config wins; only probe the repo when none was given.
  let effectiveConfig = options.config;
  if (!effectiveConfig) {
    effectiveConfig = await fetchRepoConfig(owner, repo, ref, token, blobs, rateLimiter);
  }

  // ---- Step 4: Filter blobs according to config --------------------------
  const filteredBlobs = blobs.filter((item) =>
    shouldIndexFile(item.path, item.size ?? 0, effectiveConfig)
  );

  const totalFiles = filteredBlobs.length;
  const skippedFiles = blobs.length - totalFiles;

  // ---- Step 5: Download file contents in parallel -------------------------
  // NOTE(review): `files` is appended in download-completion order, so the
  // result ordering is nondeterministic across runs — confirm no caller
  // depends on tree order (sort by path downstream if it matters).
  const files: CrawledFile[] = [];
  let processed = 0;

  await Promise.all(
    filteredBlobs.map((item) =>
      semaphore.run(async () => {
        try {
          // Prefer raw download (cheaper on rate limit); fall back to API.
          const content =
            (await downloadRawFile(owner, repo, ref!, item.path, token)) ??
            (await downloadViaContentsApi(
              owner,
              repo,
              ref!,
              item.path,
              token,
              rateLimiter
            ));

          if (content === null) {
            console.warn(`[GitHubCrawler] Could not download: ${item.path} — skipping.`);
          } else {
            files.push({
              path: item.path,
              content,
              size: item.size ?? Buffer.byteLength(content, 'utf-8'),
              sha: item.sha,
              language: detectLanguage(item.path)
            });
          }
        } catch (err) {
          console.warn(
            `[GitHubCrawler] Error downloading ${item.path}: ${err instanceof Error ? err.message : String(err)}`
          );
        } finally {
          // Progress counts attempts (including failures), not successes.
          processed++;
          onProgress?.(processed, totalFiles);
        }
      })
    )
  );

  return {
    files,
    totalFiles,
    skippedFiles,
    branch: ref,
    commitSha
  };
}
diff --git a/src/lib/server/crawler/local.crawler.test.ts b/src/lib/server/crawler/local.crawler.test.ts
new file mode 100644
index 0000000..fbae0d0
--- /dev/null
+++ b/src/lib/server/crawler/local.crawler.test.ts
@@ -0,0 +1,554 @@
/**
 * Unit tests for the local filesystem crawler (TRUEREF-0004).
 *
 * Each test that needs a filesystem fixture creates a temporary directory via
 * `fs.mkdtemp`, writes the required files, runs the crawler, then cleans up
 * with `fs.rm` regardless of the test outcome.
 */

import { execFile } from 'node:child_process';
import { createHash } from 'node:crypto';
import { promises as fs } from 'node:fs';
import { join } from 'node:path';
import { tmpdir } from 'node:os';
import { promisify } from 'node:util';

import { afterEach, beforeEach, describe, expect, it } from 'vitest';

import { LocalCrawler } from './local.crawler.js';
import type { LocalCrawlOptions } from './local.crawler.js';
import { InvalidRefError, NotAGitRepositoryError } from './types.js';

const execFileAsync = promisify(execFile);

// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------

// Mirrors the crawler's own per-file checksum so expectations can be computed
// independently in the tests below.
function sha256(content: string): string {
  return createHash('sha256').update(content, 'utf-8').digest('hex');
}

/** Create a temp directory, write a map of relPath → content, return rootPath.
 */
async function makeTempRepo(files: Record<string, string>): Promise<string> {
  const root = await fs.mkdtemp(join(tmpdir(), 'trueref-test-'));
  for (const [relPath, content] of Object.entries(files)) {
    const absPath = join(root, relPath);
    // join(absPath, '..') normalises to the parent directory of absPath.
    await fs.mkdir(join(absPath, '..'), { recursive: true });
    await fs.writeFile(absPath, content, 'utf-8');
  }
  return root;
}

/** Remove a temporary directory tree created by makeTempRepo. */
async function cleanupTempRepo(root: string): Promise<void> {
  await fs.rm(root, { recursive: true, force: true });
}

// ---------------------------------------------------------------------------
// Test state
// ---------------------------------------------------------------------------

// Shared fixture root; each suite's beforeEach reassigns it and afterEach
// removes it.
let root: string = '';
const crawler = new LocalCrawler();

// Convenience wrapper: crawl the current fixture root with optional overrides.
async function crawlRoot(
  opts: Partial<LocalCrawlOptions> = {}
): Promise<Awaited<ReturnType<LocalCrawler['crawl']>>> {
  return crawler.crawl({ rootPath: root, ...opts });
}

// ---------------------------------------------------------------------------
// Basic crawl behaviour
// ---------------------------------------------------------------------------

describe('LocalCrawler.crawl() — basic file enumeration', () => {
  beforeEach(async () => {
    root = await makeTempRepo({
      'README.md': '# Hello',
      'src/index.ts': 'export const x = 1;',
      'src/utils.ts': 'export const y = 2;',
      'package.json': '{"name":"test"}'
    });
  });

  afterEach(async () => {
    await cleanupTempRepo(root);
  });

  it('returns all indexable files', async () => {
    const result = await crawlRoot();
    const paths = result.files.map((f) => f.path).sort();
    expect(paths).toEqual(['README.md', 'package.json', 'src/index.ts', 'src/utils.ts'].sort());
  });

  it('populates content as a UTF-8 string', async () => {
    const result = await crawlRoot();
    const readme = result.files.find((f) => f.path === 'README.md');
    expect(readme?.content).toBe('# Hello');
  });

  it('sets size equal to Buffer.byteLength of content', async () => {
    const result = await crawlRoot();
    for (const file of result.files) {
      expect(file.size).toBe(Buffer.byteLength(file.content, 'utf-8'));
    }
  });

  it('computes correct SHA-256 per file', async () => {
    const result = await crawlRoot();
    const readme = result.files.find((f) => f.path === 'README.md');
    expect(readme?.sha).toBe(sha256('# Hello'));
  });

  it('detects language from extension', async () => {
    const result = await crawlRoot();
    const ts = result.files.find((f) => f.path === 'src/index.ts');
    expect(ts?.language).toBe('typescript');
    const md = result.files.find((f) => f.path === 'README.md');
    expect(md?.language).toBe('markdown');
    const json = result.files.find((f) => f.path === 'package.json');
    expect(json?.language).toBe('json');
  });

  it('sets branch to "local"', async () => {
    const result = await crawlRoot();
    expect(result.branch).toBe('local');
  });

  it('sets totalFiles to the count of filtered files', async () => {
    const result = await crawlRoot();
    expect(result.totalFiles).toBe(result.files.length);
  });

  it('sets commitSha to a non-empty hex string', async () => {
    const result = await crawlRoot();
    // SHA-256 hex digest is 64 lowercase hex chars.
    expect(result.commitSha).toMatch(/^[0-9a-f]{64}$/);
  });

  it('produces a deterministic commitSha for the same file set', async () => {
    const r1 = await crawlRoot();
    const r2 = await crawlRoot();
    expect(r1.commitSha).toBe(r2.commitSha);
  });
});

// ---------------------------------------------------------------------------
// Filtering — default excludes and extension allow-list
// ---------------------------------------------------------------------------

describe('LocalCrawler.crawl() — default filtering', () => {
  beforeEach(async () => {
    root = await makeTempRepo({
      'src/index.ts': 'export {};',
      'dist/bundle.js': 'bundled',
      'node_modules/lodash/index.js': 'lodash',
      '.git/config': '[core]',
      'image.png': '\x89PNG',
      'README.md': '# Docs'
    });
  });

  afterEach(async () => {
    await cleanupTempRepo(root);
  });

  it('excludes files in dist/', async () => {
    const result = await crawlRoot();
    expect(result.files.every((f) => !f.path.startsWith('dist/'))).toBe(true);
  });

  it('excludes files in node_modules/', async () => {
    const result = await crawlRoot();
    expect(result.files.every((f) => !f.path.startsWith('node_modules/'))).toBe(true);
  });

  it('excludes files in .git/', async () => {
    const result = await crawlRoot();
    expect(result.files.every((f) => !f.path.startsWith('.git/'))).toBe(true);
  });

  it('excludes non-indexable extensions like .png', async () => {
    const result = await crawlRoot();
    expect(result.files.every((f) => !f.path.endsWith('.png'))).toBe(true);
  });

  it('reports skippedFiles = total enumerated – filtered', async () => {
    const result = await crawlRoot();
    // dist/, node_modules/, .git/, .png = 4 skipped
    // src/index.ts + README.md = 2 kept
    expect(result.skippedFiles).toBe(4);
    expect(result.totalFiles).toBe(2);
  });
});

// ---------------------------------------------------------------------------
// Size limit
// ---------------------------------------------------------------------------

describe('LocalCrawler.crawl() — size limit', () => {
  afterEach(async () => {
    await cleanupTempRepo(root);
  });

  it('excludes files larger than MAX_FILE_SIZE_BYTES (500 KB)', async () => {
    // 500_001 bytes of 'x' — one byte over the limit.
    const bigContent = 'x'.repeat(500_001);
    root = await makeTempRepo({
      'big.ts': bigContent,
      'small.ts': 'export const x = 1;'
    });
    const result = await crawlRoot();
    expect(result.files.some((f) => f.path === 'big.ts')).toBe(false);
    expect(result.files.some((f) => f.path === 'small.ts')).toBe(true);
  });

  it('includes files exactly at MAX_FILE_SIZE_BYTES (500 KB)', async () => {
    // Boundary case: exactly at the limit must be kept (limit is inclusive).
    const edgeContent = 'a'.repeat(500_000);
    root = await makeTempRepo({ 'edge.ts': edgeContent });
    const result = await crawlRoot();
    expect(result.files.some((f) => f.path === 'edge.ts')).toBe(true);
  });
});

// ---------------------------------------------------------------------------
// trueref.json / context7.json config detection
// ---------------------------------------------------------------------------

describe('LocalCrawler.crawl() — config file detection', () => {
  afterEach(async () => {
    await cleanupTempRepo(root);
  });

  it('auto-detects trueref.json and applies excludeFiles', async () => {
    root = await makeTempRepo({
      'trueref.json': JSON.stringify({ excludeFiles: ['package.json'] }),
      'src/index.ts': 'export {};',
      'package.json': '{"name":"test"}'
    });
    const result = await crawlRoot();
    expect(result.files.some((f) => f.path === 'package.json')).toBe(false);
    expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
  });

  it('auto-detects context7.json and applies folders allowlist', async () => {
    root = await makeTempRepo({
      'context7.json': JSON.stringify({ folders: ['docs/'] }),
      'src/index.ts': 'export {};',
      'docs/guide.md': '# Guide'
    });
    const result = await crawlRoot();
    expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(false);
    expect(result.files.some((f) => f.path === 'docs/guide.md')).toBe(true);
  });

  it('caller-supplied config takes precedence over discovered config file', async () => {
    root = await makeTempRepo({
      'trueref.json': JSON.stringify({ excludeFiles: ['package.json'] }),
      'src/index.ts': 'export {};',
      'package.json': '{"name":"test"}'
    });
    // Caller provides a config with no exclusions — package.json should appear.
    const result = await crawlRoot({ config: {} });
    expect(result.files.some((f) => f.path === 'package.json')).toBe(true);
  });

  it('applies excludeFolders from config', async () => {
    root = await makeTempRepo({
      'trueref.json': JSON.stringify({ excludeFolders: ['internal/'] }),
      'internal/secret.ts': 'secret',
      'src/public.ts': 'public'
    });
    const result = await crawlRoot();
    expect(result.files.some((f) => f.path.startsWith('internal/'))).toBe(false);
    expect(result.files.some((f) => f.path === 'src/public.ts')).toBe(true);
  });

  it('gracefully handles a malformed config file', async () => {
    root = await makeTempRepo({
      'trueref.json': 'NOT VALID JSON {{{',
      'src/index.ts': 'export {};'
    });
    // Should not throw; falls back to no config.
    const result = await crawlRoot();
    expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
  });
});

// ---------------------------------------------------------------------------
// Progress callback
// ---------------------------------------------------------------------------

describe('LocalCrawler.crawl() — progress reporting', () => {
  beforeEach(async () => {
    root = await makeTempRepo({
      'src/a.ts': 'a',
      'src/b.ts': 'b',
      'src/c.ts': 'c'
    });
  });

  afterEach(async () => {
    await cleanupTempRepo(root);
  });

  it('calls onProgress once per filtered file', async () => {
    const calls: Array<[number, number]> = [];
    await crawlRoot({ onProgress: (p, t) => calls.push([p, t]) });
    expect(calls).toHaveLength(3);
  });

  it('increments processed from 1 to totalFiles', async () => {
    const calls: Array<[number, number]> = [];
    await crawlRoot({ onProgress: (p, t) => calls.push([p, t]) });
    const processed = calls.map(([p]) => p);
    expect(processed).toEqual([1, 2, 3]);
  });

  it('keeps total constant across all callback invocations', async () => {
    const totals: number[] = [];
    await crawlRoot({ onProgress: (_, t) => totals.push(t) });
    expect(totals.every((t) => t === totals[0])).toBe(true);
  });

  it('does not call onProgress when no files pass the filter', async () => {
    // Overwrite root with only non-indexable files.
    await fs.rm(root, { recursive: true, force: true });
    root = await makeTempRepo({ 'image.png': '\x89PNG' });
    const calls: number[] = [];
    await crawlRoot({ onProgress: () => calls.push(1) });
    expect(calls).toHaveLength(0);
  });
});

// ---------------------------------------------------------------------------
// Git ref checkout
// ---------------------------------------------------------------------------

/**
 * Create a temp directory that is a valid git repo with one commit per entry
 * in `history`. Each entry is a map of relPath → content committed under the
 * given tag (if provided). Returns the repo root path.
 *
 * Layout of `history`:
 *   [{ tag?: string, files: Record<string, string> }, ...]
 */
async function makeGitRepo(
  history: Array<{ tag?: string; files: Record<string, string> }>
): Promise<string> {
  const root = await fs.mkdtemp(join(tmpdir(), 'trueref-git-test-'));

  // Shadowing helper: runs git pinned to this repo's root.
  async function git(...args: string[]) {
    await execFileAsync('git', ['-C', root, ...args]);
  }

  await git('init', '--initial-branch=main');
  await git('config', 'user.email', 'test@trueref.local');
  await git('config', 'user.name', 'TrueRef Test');

  for (const { tag, files } of history) {
    // Write files
    for (const [relPath, content] of Object.entries(files)) {
      const absPath = join(root, relPath);
      await fs.mkdir(join(absPath, '..'), { recursive: true });
      await fs.writeFile(absPath, content, 'utf-8');
    }
    await git('add', '.');
    await git('commit', '--allow-empty', '-m', `commit for ${tag ??
'HEAD'}`);
    if (tag) {
      await git('tag', tag);
    }
  }

  return root;
}

describe('LocalCrawler.crawl() — git ref checkout', () => {
  // This suite uses its own root/crawler (shadows the module-level ones) so
  // afterEach can guard on a possibly-unassigned root.
  let root: string = '';
  const crawler = new LocalCrawler();

  afterEach(async () => {
    if (root) await cleanupTempRepo(root);
  });

  it('crawls files at a specific tag, not the HEAD state', async () => {
    root = await makeGitRepo([
      { tag: 'v1.0.0', files: { 'src/index.ts': 'export const version = 1;' } },
      { files: { 'src/index.ts': 'export const version = 2;' } }
    ]);

    const result = await crawler.crawl({ rootPath: root, ref: 'v1.0.0' });
    const indexFile = result.files.find((f) => f.path === 'src/index.ts');
    expect(indexFile?.content).toBe('export const version = 1;');
  });

  it('crawls files at a specific commit SHA', async () => {
    root = await makeGitRepo([
      { tag: 'v1.0.0', files: { 'api.ts': 'v1' } },
      { files: { 'api.ts': 'v2' } }
    ]);

    // Resolve the SHA of v1.0.0
    const { stdout } = await execFileAsync('git', ['-C', root, 'rev-parse', 'v1.0.0'], {
      encoding: 'utf-8'
    });
    const sha = stdout.trim();

    const result = await crawler.crawl({ rootPath: root, ref: sha });
    const api = result.files.find((f) => f.path === 'api.ts');
    expect(api?.content).toBe('v1');
  });

  it('sets branch to the ref string in the result', async () => {
    root = await makeGitRepo([{ tag: 'v2.3.1', files: { 'README.md': '# v2' } }]);

    const result = await crawler.crawl({ rootPath: root, ref: 'v2.3.1' });
    expect(result.branch).toBe('v2.3.1');
  });

  it('sets commitSha to the git-resolved SHA (not file-content hash)', async () => {
    root = await makeGitRepo([{ tag: 'v1.0.0', files: { 'a.ts': 'a' } }]);

    const { stdout } = await execFileAsync('git', ['-C', root, 'rev-parse', 'v1.0.0'], {
      encoding: 'utf-8'
    });
    const expectedSha = stdout.trim();

    const result = await crawler.crawl({ rootPath: root, ref: 'v1.0.0' });
    expect(result.commitSha).toBe(expectedSha);
  });

  it('does not modify the working tree', async () => {
    root = await makeGitRepo([
      { tag: 'v1.0.0', files: { 'src/index.ts': 'v1' } },
      { files: { 'src/index.ts': 'v2' } }
    ]);

    // Working tree is at HEAD (v2)
    const before = await fs.readFile(join(root, 'src/index.ts'), 'utf-8');
    await crawler.crawl({ rootPath: root, ref: 'v1.0.0' });
    const after = await fs.readFile(join(root, 'src/index.ts'), 'utf-8');

    expect(before).toBe('v2');
    expect(after).toBe('v2');
  });

  it('removes the temporary worktree after crawling', async () => {
    root = await makeGitRepo([{ tag: 'v1.0.0', files: { 'f.ts': 'x' } }]);

    await crawler.crawl({ rootPath: root, ref: 'v1.0.0' });

    // List remaining worktrees — only the main one should remain.
    const { stdout } = await execFileAsync('git', ['-C', root, 'worktree', 'list', '--porcelain'], {
      encoding: 'utf-8'
    });
    const worktreeCount = stdout.split('\n').filter((l) => l.startsWith('worktree ')).length;
    expect(worktreeCount).toBe(1);
  });

  it('throws NotAGitRepositoryError for a plain directory', async () => {
    const plainDir = await fs.mkdtemp(join(tmpdir(), 'trueref-plain-'));
    root = plainDir; // cleaned up in afterEach

    await expect(crawler.crawl({ rootPath: plainDir, ref: 'v1.0.0' })).rejects.toThrow(
      NotAGitRepositoryError
    );
  });

  it('throws InvalidRefError for a ref that does not exist', async () => {
    root = await makeGitRepo([{ tag: 'v1.0.0', files: { 'f.ts': 'x' } }]);

    await expect(crawler.crawl({ rootPath: root, ref: 'v99.99.99' })).rejects.toThrow(
      InvalidRefError
    );
  });

  it('applies caller-supplied config at the checked-out ref', async () => {
    root = await makeGitRepo([
      {
        tag: 'v1.0.0',
        files: {
          'src/index.ts': 'export {};',
          'package.json': '{"name":"test"}'
        }
      }
    ]);

    // Exclude package.json via caller config
    const result = await crawler.crawl({
      rootPath: root,
      ref: 'v1.0.0',
      config: { excludeFiles: ['package.json'] }
    });

    expect(result.files.some((f) => f.path === 'package.json')).toBe(false);
    expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
  });

  it('reads trueref.json from the checked-out ref', async () => {
    root = await makeGitRepo([
      {
        tag: 'v1.0.0',
        files: {
          'trueref.json': JSON.stringify({ excludeFiles: ['package.json'] }),
          'src/index.ts': 'export {};',
          'package.json': '{"name":"test"}'
        }
      }
    ]);

    const result = await crawler.crawl({ rootPath: root, ref: 'v1.0.0' });
    expect(result.files.some((f) => f.path === 'package.json')).toBe(false);
    expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
  });
});

// ---------------------------------------------------------------------------
// Edge cases
// ---------------------------------------------------------------------------

describe('LocalCrawler.crawl() — edge cases', () => {
  afterEach(async () => {
    await cleanupTempRepo(root);
  });

  it('returns empty result for an empty directory', async () => {
    root = await makeTempRepo({});
    const result = await crawlRoot();
    expect(result.files).toHaveLength(0);
    expect(result.totalFiles).toBe(0);
    expect(result.skippedFiles).toBe(0);
  });

  it('handles deeply nested directory structures', async () => {
    root = await makeTempRepo({
      'a/b/c/d/deep.ts': 'export const deep = true;'
    });
    const result = await crawlRoot();
    expect(result.files.some((f) => f.path === 'a/b/c/d/deep.ts')).toBe(true);
  });

  it('handles files with UTF-8 content correctly', async () => {
    const utf8Content = 'const greeting = "héllo wörld — 日本語";';
    root = await makeTempRepo({ 'src/unicode.ts': utf8Content });
    const result = await crawlRoot();
    const file = result.files.find((f) => f.path === 'src/unicode.ts');
    expect(file?.content).toBe(utf8Content);
    expect(file?.sha).toBe(sha256(utf8Content));
  });

  it('commitSha differs when file content changes', async () => {
    root = await makeTempRepo({ 'src/index.ts': 'version 1' });
    const r1 = await crawlRoot();

    await fs.writeFile(join(root, 'src/index.ts'), 'version 2', 'utf-8');
    const r2 = await crawlRoot();

    expect(r1.commitSha).not.toBe(r2.commitSha);
  });

  it('commitSha is empty-string hash when no files are crawled', async () => {
    root = await makeTempRepo({ 'image.png': '\x89PNG' });
    const result = await crawlRoot();
    // SHA-256 of an empty string
    expect(result.commitSha).toBe(sha256(''));
  });
});
diff --git a/src/lib/server/crawler/local.crawler.ts b/src/lib/server/crawler/local.crawler.ts
new file mode 100644
index 0000000..ba8d7e6
--- /dev/null
+++ b/src/lib/server/crawler/local.crawler.ts
@@ -0,0 +1,275 @@
/**
 * Local Filesystem Crawler (TRUEREF-0004).
 *
 * Walks a directory tree and enumerates all files, applying the same
 * extension and size filters as the GitHub crawler (TRUEREF-0003).
 * Reads file contents as UTF-8 strings and computes SHA-256 checksums
 * for change detection.
 *
 * Design decisions:
 * - Uses Node.js `fs/promises` and `crypto` — no extra dependencies.
 * - Symlinks and special files (devices, sockets, FIFOs) are skipped.
 * - `trueref.json` / `context7.json` at the repo root are detected and
 *   parsed before any other file filtering runs, matching the GitHub crawler.
 * - File size for filtering is taken from `stat().size` so the size limit
 *   is applied before reading file content (saves I/O on large excluded files).
 * - `commitSha` is derived from a SHA-256 hash of all per-file checksums,
 *   giving a deterministic fingerprint of the crawled file set.
+ */ + +import { execFile } from 'node:child_process'; +import { createHash } from 'node:crypto'; +import { promises as fs } from 'node:fs'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { promisify } from 'node:util'; + +import { detectLanguage, shouldIndexFile } from './file-filter.js'; +import { InvalidRefError, NotAGitRepositoryError } from './types.js'; +import type { CrawledFile, CrawlResult, RepoConfig } from './types.js'; + +const execFileAsync = promisify(execFile); + +// --------------------------------------------------------------------------- +// Public options type +// --------------------------------------------------------------------------- + +export interface LocalCrawlOptions { + /** Absolute path to the repository root directory. */ + rootPath: string; + /** + * Git ref to check out before crawling — a tag name (e.g. "v2.1.0"), + * a branch name, or a commit SHA. When provided the crawler creates an + * isolated git worktree at that ref, crawls it, then removes the worktree. + * The original working tree is never modified. + * Requires `rootPath` to be inside a git repository. + */ + ref?: string; + /** Pre-parsed trueref.json / context7.json configuration, if already loaded. */ + config?: RepoConfig; + /** Progress callback invoked after each file is read. */ + onProgress?: (processed: number, total: number) => void; +} + +// --------------------------------------------------------------------------- +// Internal helpers +// --------------------------------------------------------------------------- + +/** Names of config files that control include/exclude rules. */ +const CONFIG_FILE_NAMES = new Set(['trueref.json', 'context7.json']); + +// --------------------------------------------------------------------------- +// Git helpers +// --------------------------------------------------------------------------- + +/** + * Run a git command inside `cwd` and return trimmed stdout. 
+ * Throws the child-process error on non-zero exit. + */ +async function runGit(cwd: string, args: string[]): Promise { + const { stdout } = await execFileAsync('git', ['-C', cwd, ...args], { encoding: 'utf-8' }); + return stdout.trim(); +} + +/** + * Compute a SHA-256 hex digest of a UTF-8 string. + */ +function computeSHA256(content: string): string { + return createHash('sha256').update(content, 'utf-8').digest('hex'); +} + +/** + * Attempt to read and JSON-parse a config file. + * Returns undefined if the file cannot be read or parsed. + */ +async function parseConfigFile(absPath: string): Promise { + try { + const raw = await fs.readFile(absPath, 'utf-8'); + return JSON.parse(raw) as RepoConfig; + } catch { + console.warn(`[LocalCrawler] Failed to parse config file: ${absPath}`); + return undefined; + } +} + +// --------------------------------------------------------------------------- +// LocalCrawler +// --------------------------------------------------------------------------- + +export class LocalCrawler { + /** + * Crawl a local directory tree and return structured file objects. + * + * When `options.ref` is supplied the crawler creates an isolated git + * worktree checked out at that ref, crawls it, then removes the worktree. + * The caller's working tree is never modified. + * + * @param options - Root path, optional git ref, optional config, and progress callback. + * @returns CrawlResult with all read files and summary statistics. + */ + async crawl(options: LocalCrawlOptions): Promise { + const { rootPath, ref } = options; + + if (!ref) { + // Fast path: crawl the working tree as-is. + return this.crawlDirectory(rootPath, options.config, options.onProgress, 'local'); + } + + // Git-aware path: verify repo, resolve ref, create worktree, crawl, clean up. + let worktreePath: string | undefined; + + try { + // Verify rootPath is inside a git repository. 
+ await runGit(rootPath, ['rev-parse', '--git-dir']).catch(() => { + throw new NotAGitRepositoryError(`Not a git repository: ${rootPath}`); + }); + + // Resolve the ref to a concrete commit SHA (validates it exists). + const commitSha = await runGit(rootPath, ['rev-parse', '--verify', ref]).catch(() => { + throw new InvalidRefError(`Invalid git ref "${ref}" in repository: ${rootPath}`); + }); + + // Create a temporary isolated worktree at the resolved ref. + const tmpDir = await fs.mkdtemp(join(tmpdir(), 'trueref-wt-')); + worktreePath = tmpDir; + + await runGit(rootPath, ['worktree', 'add', '--detach', tmpDir, ref]).catch((err) => { + throw new InvalidRefError( + `Cannot create worktree for ref "${ref}": ${err instanceof Error ? err.message : String(err)}` + ); + }); + + // Crawl the worktree and stamp the result with the git-resolved metadata. + const result = await this.crawlDirectory(worktreePath, options.config, options.onProgress, ref); + + return { ...result, commitSha }; + } finally { + if (worktreePath) { + // Remove the worktree (git also deletes the directory). + await runGit(rootPath, ['worktree', 'remove', '--force', worktreePath]).catch(() => { + // Best-effort; leave the temp directory for the OS to clean up. + fs.rm(worktreePath!, { recursive: true, force: true }).catch(() => {}); + }); + } + } + } + + // --------------------------------------------------------------------------- + // Private — directory crawl + // --------------------------------------------------------------------------- + + /** + * Walk `rootPath`, apply filters, read files, and build a CrawlResult. + * `branch` is embedded verbatim into the returned result. + */ + private async crawlDirectory( + rootPath: string, + callerConfig: RepoConfig | undefined, + onProgress: LocalCrawlOptions['onProgress'], + branch: string + ): Promise { + // Step 1: Walk the directory tree and collect (relPath, size) pairs. 
+ const statCache = new Map(); + const allRelPaths = await this.walkDirectory(rootPath, '', statCache); + + // Step 2: Detect trueref.json / context7.json at the repo root first. + // Only root-level config files are honoured (no directory prefix). + const configRelPath = allRelPaths.find((p) => CONFIG_FILE_NAMES.has(p)); + let config = callerConfig; + if (configRelPath && !config) { + config = await parseConfigFile(join(rootPath, configRelPath)); + } + + // Step 3: Filter files according to extension, size, and config rules. + const filteredPaths = allRelPaths.filter((relPath) => { + const size = statCache.get(relPath) ?? 0; + return shouldIndexFile(relPath, size, config); + }); + + // Step 4: Read file contents and build CrawledFile records. + const crawledFiles: CrawledFile[] = []; + + for (const [i, relPath] of filteredPaths.entries()) { + const absPath = join(rootPath, relPath); + try { + const content = await fs.readFile(absPath, 'utf-8'); + const sha = computeSHA256(content); + crawledFiles.push({ + path: relPath, + content, + size: Buffer.byteLength(content, 'utf-8'), + sha, + language: detectLanguage(relPath) + }); + } catch (err) { + console.warn( + `[LocalCrawler] Could not read file: ${relPath} — ${err instanceof Error ? err.message : String(err)}` + ); + } + onProgress?.(i + 1, filteredPaths.length); + } + + // Step 5: Build a deterministic repo-level fingerprint from file SHAs. + const commitSha = computeSHA256(crawledFiles.map((f) => f.sha).join('')); + + return { + files: crawledFiles, + totalFiles: filteredPaths.length, + skippedFiles: allRelPaths.length - filteredPaths.length, + branch, + commitSha + }; + } + + /** + * Recursively walk a directory and collect relative paths of all regular files. + * Symlinks and special files (devices, sockets, FIFOs) are silently skipped. + * Populates `statCache` with file sizes so the caller can filter without a + * second `stat()` call. + * + * @param dir - Absolute path of the directory to read. 
+   * @param rel - Relative path prefix accumulated during recursion.
+   * @param statCache - Mutable map from relative path → byte size.
+   * @returns Relative paths of every regular file found under `dir`.
+   */
+  private async walkDirectory(
+    dir: string,
+    rel: string,
+    statCache: Map
+  ): Promise {
+    // NOTE(review): generic arguments appear stripped in this patch — presumably
+    // `Map<string, number>` and `Promise<string[]>`; restore before compiling.
+    let entries;
+    try {
+      entries = await fs.readdir(dir, { withFileTypes: true });
+    } catch {
+      // Directory is unreadable (permissions, etc.) — skip silently.
+      return [];
+    }
+
+    const files: string[] = [];
+
+    for (const entry of entries) {
+      // Only descend into plain directories and collect plain files.
+      // entry.isFile() / entry.isDirectory() return false for symlinks,
+      // devices, sockets, and FIFOs, so those are all implicitly skipped.
+      // (Skipping symlinked directories also prevents symlink cycles.)
+      if (!entry.isFile() && !entry.isDirectory()) continue;
+
+      // Always joined with '/' — relative paths are POSIX-style on every OS.
+      const relPath = rel ? `${rel}/${entry.name}` : entry.name;
+
+      if (entry.isDirectory()) {
+        const children = await this.walkDirectory(join(dir, entry.name), relPath, statCache);
+        files.push(...children);
+      } else {
+        // Capture file size from stat so shouldIndexFile can enforce the limit
+        // without reading the file.
+        try {
+          const stat = await fs.stat(join(dir, entry.name));
+          statCache.set(relPath, stat.size);
+        } catch {
+          // Stat failure: record size 0 so the file still reaches the filter.
+          statCache.set(relPath, 0);
+        }
+        files.push(relPath);
+      }
+    }
+
+    return files;
+  }
+}
diff --git a/src/lib/server/crawler/rate-limiter.ts b/src/lib/server/crawler/rate-limiter.ts
new file mode 100644
index 0000000..09236bd
--- /dev/null
+++ b/src/lib/server/crawler/rate-limiter.ts
@@ -0,0 +1,123 @@
+/**
+ * GitHub API rate-limit tracker and backoff helper (TRUEREF-0003).
+ *
+ * Reads X-RateLimit-* headers from every API response and pauses outgoing
+ * requests when the remaining allowance drops to ≤ 10.
+ */
+
+/** Resolve after `ms` milliseconds (promise-wrapped setTimeout). */
+function sleep(ms: number): Promise {
+  // NOTE(review): return type's generic argument appears stripped — presumably
+  // `Promise<void>`; restore before compiling.
+  return new Promise((resolve) => setTimeout(resolve, ms));
+}
+
+export class GitHubRateLimiter {
+  // Optimistic defaults: a fresh authenticated window (5000 requests) that has
+  // already "reset", so the first request is never delayed.
+  private remaining = 5000;
+  private resetAt = Date.now();
+
+  /**
+   * Update internal counters from the headers of a GitHub API response.
+   * Missing headers leave the current counters untouched.
+   */
+  updateFromHeaders(headers: Headers): void {
+    const remaining = headers.get('X-RateLimit-Remaining');
+    const reset = headers.get('X-RateLimit-Reset');
+
+    // NOTE(review): parseInt on a malformed header value yields NaN, and
+    // `NaN <= 10` is false — waitIfNeeded would then silently never pause.
+    // Consider guarding with Number.isFinite before assigning.
+    if (remaining !== null) {
+      this.remaining = parseInt(remaining, 10);
+    }
+    if (reset !== null) {
+      // GitHub returns a Unix epoch in seconds.
+      this.resetAt = parseInt(reset, 10) * 1000;
+    }
+  }
+
+  /**
+   * If the remaining allowance is critically low (≤ 10), sleep until the
+   * rate-limit window resets (plus a 1 s buffer).
+   * NOTE(review): `remaining` is only refreshed by updateFromHeaders, so after
+   * the sleep this method still sees the stale low value until the next
+   * response arrives — confirm callers always update before re-checking.
+   */
+  async waitIfNeeded(): Promise {
+    if (this.remaining <= 10) {
+      // Math.max guards against a resetAt already in the past.
+      const waitMs = Math.max(0, this.resetAt - Date.now()) + 1000;
+      await sleep(waitMs);
+    }
+  }
+
+  /** Remaining requests in the current window (for testing). */
+  get remainingRequests(): number {
+    return this.remaining;
+  }
+
+  /** Reset timestamp as a Unix epoch in ms (for testing). */
+  get resetTimestamp(): number {
+    return this.resetAt;
+  }
+}
+
+/**
+ * Exponential-backoff retry wrapper for network-level errors.
+ *
+ * Retries up to `maxAttempts` times (default 3) with 1 s, 2 s, 4 s delays.
+ *
+ * @param fn - Async function to attempt.
+ * @param maxAttempts - Maximum number of attempts (default 3).
+ * @param isRetryable - Optional predicate; when it returns false for a given
+ *                      error the error is re-thrown immediately without further
+ *                      retries. Defaults to retrying all errors.
+ * @returns The first successful result of `fn`.
+ * @throws The last error once all attempts are exhausted.
+ */
+export async function withRetry(
+  fn: () => Promise,
+  maxAttempts = 3,
+  isRetryable: (err: unknown) => boolean = () => true
+): Promise {
+  // NOTE(review): generic parameters appear stripped in this patch — presumably
+  // `withRetry<T>(fn: () => Promise<T>, ...): Promise<T>`; restore them.
+  // NOTE(review): maxAttempts < 1 skips the loop entirely and throws the
+  // still-undefined lastError — consider validating the argument.
+  let lastError: unknown;
+  for (let attempt = 0; attempt < maxAttempts; attempt++) {
+    try {
+      return await fn();
+    } catch (err) {
+      if (!isRetryable(err)) throw err;
+      lastError = err;
+      // No delay after the final failed attempt — fail fast.
+      if (attempt < maxAttempts - 1) {
+        await sleep(1000 * Math.pow(2, attempt));
+      }
+    }
+  }
+  throw lastError;
+}
+
+/**
+ * Async semaphore — limits the number of concurrently executing promises.
+ */
+export class Semaphore {
+  // Invariant: `count` is the number of free slots; it is only positive while
+  // `queue` is empty (release hands a freed slot directly to the next waiter).
+  private count: number;
+  private readonly queue: Array<() => void> = [];
+
+  /**
+   * @param concurrency - Maximum number of holders at once.
+   * NOTE(review): not validated — a value < 1 makes every acquire() queue
+   * forever; consider asserting concurrency >= 1.
+   */
+  constructor(concurrency: number) {
+    this.count = concurrency;
+  }
+
+  /** Resolve when a slot is available; otherwise wait in FIFO order. */
+  async acquire(): Promise {
+    // NOTE(review): return types' generic arguments appear stripped in this
+    // patch — presumably `Promise<void>` here (and on the inner Promise).
+    if (this.count > 0) {
+      this.count--;
+      return;
+    }
+    return new Promise((resolve) => {
+      this.queue.push(resolve);
+    });
+  }
+
+  /** Free a slot, waking the longest-waiting acquirer if any. */
+  release(): void {
+    const next = this.queue.shift();
+    if (next) {
+      // Transfer the slot directly; count stays unchanged.
+      next();
+    } else {
+      this.count++;
+    }
+  }
+
+  /** Run `fn` while holding a slot; the slot is released even on throw. */
+  async run(fn: () => Promise): Promise {
+    // NOTE(review): presumably `run<T>(fn: () => Promise<T>): Promise<T>` —
+    // generic parameters appear stripped in this patch.
+    await this.acquire();
+    try {
+      return await fn();
+    } finally {
+      this.release();
+    }
+  }
+}
diff --git a/src/lib/server/crawler/types.ts b/src/lib/server/crawler/types.ts
new file mode 100644
index 0000000..a40eba6
--- /dev/null
+++ b/src/lib/server/crawler/types.ts
@@ -0,0 +1,135 @@
+/**
+ * Types for the GitHub repository crawler (TRUEREF-0003).
+ */
+
+import type { TrueRefConfig } from '$lib/types';
+
+// Re-export RepoConfig alias so crawler modules can reference it consistently.
+export type RepoConfig = TrueRefConfig;
+
+// ---------------------------------------------------------------------------
+// Core crawler data types
+// ---------------------------------------------------------------------------
+
+/** One file downloaded and prepared for indexing. */
+export interface CrawledFile {
+  /** Relative path within the repo, e.g.
"src/index.ts" */
+  path: string;
+  /** UTF-8 file content */
+  content: string;
+  /** File size in bytes */
+  size: number;
+  /** GitHub blob SHA (used as checksum) */
+  sha: string;
+  /** Programming language detected from extension */
+  language: string;
+}
+
+/** Aggregate outcome of one crawl run. */
+export interface CrawlResult {
+  /** Successfully downloaded files */
+  files: CrawledFile[];
+  /** Total files that matched filters */
+  totalFiles: number;
+  /** Files that were filtered out or too large */
+  skippedFiles: number;
+  /** Branch or tag that was crawled */
+  branch: string;
+  /** HEAD commit SHA */
+  commitSha: string;
+}
+
+/** Caller-supplied parameters for a crawl. */
+export interface CrawlOptions {
+  owner: string;
+  repo: string;
+  /** Branch, tag, or commit SHA; defaults to repo default branch */
+  ref?: string;
+  /** GitHub PAT for private repos */
+  token?: string;
+  /** Parsed trueref.json / context7.json configuration */
+  config?: RepoConfig;
+  /** Progress callback invoked after each file is processed */
+  onProgress?: (processed: number, total: number) => void;
+}
+
+// ---------------------------------------------------------------------------
+// GitHub API response shapes (minimal — only fields we use)
+// ---------------------------------------------------------------------------
+
+export interface GitHubRepoResponse {
+  default_branch: string;
+  stargazers_count: number;
+}
+
+export interface GitHubTreeItem {
+  path: string;
+  type: 'blob' | 'tree';
+  /** NOTE(review): presumably omitted for 'tree' entries — confirm against the API. */
+  size?: number;
+  sha: string;
+  url: string;
+}
+
+export interface GitHubTreeResponse {
+  tree: GitHubTreeItem[];
+  /** True when the tree was too large and the listing is incomplete. */
+  truncated: boolean;
+}
+
+export interface GitHubContentResponse {
+  content: string;
+  encoding: string;
+  size: number;
+  sha: string;
+}
+
+// ---------------------------------------------------------------------------
+// Domain errors
+// ---------------------------------------------------------------------------
+
+// Each error carries a readonly literal `code` so callers can discriminate on
+// `err.code` without instanceof checks (e.g. across serialization boundaries).
+export class RepositoryNotFoundError extends Error {
+  readonly code = 'REPOSITORY_NOT_FOUND';
+  constructor(message: string) {
+    
super(message);
+    // Explicit name so stack traces / logs show the subclass, not "Error".
+    this.name = 'RepositoryNotFoundError';
+  }
+}
+
+export class AuthenticationError extends Error {
+  readonly code = 'AUTHENTICATION_ERROR';
+  constructor(message: string) {
+    super(message);
+    this.name = 'AuthenticationError';
+  }
+}
+
+export class PermissionError extends Error {
+  readonly code = 'PERMISSION_ERROR';
+  constructor(message: string) {
+    super(message);
+    this.name = 'PermissionError';
+  }
+}
+
+export class RateLimitError extends Error {
+  readonly code = 'RATE_LIMIT_ERROR';
+  constructor(
+    message: string,
+    // When the rate-limit window resets.
+    // NOTE(review): units are unstated — presumably a Unix epoch in ms to match
+    // GitHubRateLimiter.resetTimestamp; confirm and document at the throw site.
+    public readonly resetAt: number
+  ) {
+    super(message);
+    this.name = 'RateLimitError';
+  }
+}
+
+export class NotAGitRepositoryError extends Error {
+  readonly code = 'NOT_A_GIT_REPOSITORY';
+  constructor(message: string) {
+    super(message);
+    this.name = 'NotAGitRepositoryError';
+  }
+}
+
+export class InvalidRefError extends Error {
+  readonly code = 'INVALID_REF';
+  constructor(message: string) {
+    super(message);
+    this.name = 'InvalidRefError';
+  }
+}