chore: initial project scaffold
This commit is contained in:
218
docs/features/TRUEREF-0003.md
Normal file
218
docs/features/TRUEREF-0003.md
Normal file
@@ -0,0 +1,218 @@
# TRUEREF-0003 — GitHub Repository Crawler

**Priority:** P0

**Status:** Pending

**Depends On:** TRUEREF-0001

**Blocks:** TRUEREF-0009, TRUEREF-0013

---

## Overview

Implement the GitHub crawler that fetches repository file trees and downloads file contents using the GitHub REST API. The crawler respects rate limits, supports private repos via PAT, and applies include/exclude filtering from `trueref.json` configuration.

---

## Acceptance Criteria

- [ ] Fetch complete file tree for a GitHub repo (default branch or specific tag/branch)
- [ ] Filter files by extension (only index relevant file types)
- [ ] Apply `trueref.json` folder/file include/exclude rules
- [ ] Download file contents in parallel (with concurrency limit)
- [ ] Handle GitHub API rate limiting (respect `X-RateLimit-*` headers, exponential backoff)
- [ ] Support private repositories via GitHub Personal Access Token (PAT)
- [ ] Return structured `CrawledFile` objects for each fetched file
- [ ] Report progress via callback (for job tracking)
- [ ] Unit tests with mocked GitHub API responses

---

## Indexable File Types

The crawler only downloads files with these extensions:

```typescript
const INDEXABLE_EXTENSIONS = new Set([
  // Documentation
  '.md', '.mdx', '.txt', '.rst',
  // Code
  '.ts', '.tsx', '.js', '.jsx',
  '.py', '.rb', '.go', '.rs', '.java', '.cs', '.cpp', '.c', '.h',
  '.swift', '.kt', '.php', '.scala', '.clj', '.ex', '.exs',
  '.sh', '.bash', '.zsh', '.fish',
  // Config / data
  '.json', '.yaml', '.yml', '.toml',
  // Web
  '.html', '.css', '.svelte', '.vue',
]);

const MAX_FILE_SIZE_BYTES = 500_000; // 500 KB — skip large generated files
```

---

## Data Types

```typescript
export interface CrawledFile {
  path: string;     // relative path within repo, e.g. "src/index.ts"
  content: string;  // UTF-8 file content
  size: number;     // bytes
  sha: string;      // GitHub blob SHA (used as checksum)
  language: string; // detected from extension
}

export interface CrawlResult {
  files: CrawledFile[];
  totalFiles: number;   // files matching filters
  skippedFiles: number; // filtered out or too large
  branch: string;       // branch/tag that was crawled
  commitSha: string;    // HEAD commit SHA
}

export interface CrawlOptions {
  owner: string;
  repo: string;
  ref?: string;       // branch, tag, or commit SHA; defaults to repo default branch
  token?: string;     // GitHub PAT for private repos
  config?: RepoConfig; // parsed trueref.json
  onProgress?: (processed: number, total: number) => void;
}
```

---

## GitHub API Usage

### Step 1: Get default branch (if ref not specified)

```
GET https://api.github.com/repos/{owner}/{repo}
→ { default_branch: "main", stargazers_count: 12345 }
```

### Step 2: Fetch file tree (recursive)

```
GET https://api.github.com/repos/{owner}/{repo}/git/trees/{ref}?recursive=1
→ {
  tree: [
    { path: "src/index.ts", type: "blob", size: 1234, sha: "abc123", url: "..." },
    ...
  ],
  truncated: false
}
```

If `truncated: true`, the tree has >100k items. The Git Trees API does not paginate recursive results, so fall back to directory-by-directory traversal: filter to the relevant top-level directories first, then fetch each subtree non-recursively.

### Step 3: Download file contents (parallel)

```
GET https://api.github.com/repos/{owner}/{repo}/contents/{path}?ref={ref}
→ { content: "<base64>", encoding: "base64", size: 1234, sha: "abc123" }
```

Alternative for large repos: use raw content URL:

```
GET https://raw.githubusercontent.com/{owner}/{repo}/{ref}/{path}
```

---

## Filtering Logic

```typescript
function shouldIndexFile(
  filePath: string,
  fileSize: number,
  config?: RepoConfig
): boolean {
  const ext = path.extname(filePath).toLowerCase();

  // 1. Must have indexable extension
  if (!INDEXABLE_EXTENSIONS.has(ext)) return false;

  // 2. Must not exceed size limit
  if (fileSize > MAX_FILE_SIZE_BYTES) return false;

  // 3. Apply config excludeFiles (exact filename match)
  if (config?.excludeFiles?.includes(path.basename(filePath))) return false;

  // 4. Apply config excludeFolders (regex or prefix match)
  if (config?.excludeFolders?.some(folder =>
    filePath.startsWith(folder) || new RegExp(folder).test(filePath)
  )) return false;

  // 5. Apply config folders allowlist (if specified, only index those paths)
  if (config?.folders?.length) {
    const inAllowedFolder = config.folders.some(folder =>
      filePath.startsWith(folder) || new RegExp(folder).test(filePath)
    );
    if (!inAllowedFolder) return false;
  }

  // 6. Default excludes: node_modules, .git, dist, build, coverage
  const defaultExcludes = [
    'node_modules/', '.git/', 'dist/', 'build/', 'coverage/',
    '.next/', '__pycache__/', 'vendor/', 'target/', '.cache/',
  ];
  if (defaultExcludes.some(ex => filePath.startsWith(ex))) return false;

  return true;
}
```

---

## Rate Limiting

```typescript
class GitHubRateLimiter {
  private remaining = 5000;
  private resetAt = Date.now();

  updateFromHeaders(headers: Headers): void {
    this.remaining = parseInt(headers.get('X-RateLimit-Remaining') ?? '5000', 10);
    this.resetAt = parseInt(headers.get('X-RateLimit-Reset') ?? '0', 10) * 1000;
  }

  async waitIfNeeded(): Promise<void> {
    if (this.remaining <= 10) {
      const waitMs = Math.max(0, this.resetAt - Date.now()) + 1000;
      await sleep(waitMs);
    }
  }
}
```

Requests are made with a concurrency limit of 10 parallel downloads using a semaphore/pool pattern.

---

## Error Handling

| Scenario | Behavior |
|----------|---------|
| 404 Not Found | Throw `RepositoryNotFoundError` |
| 401 Unauthorized | Throw `AuthenticationError` (invalid or missing token) |
| 403 Forbidden | If `X-RateLimit-Remaining: 0`, wait and retry; else throw `PermissionError` |
| 422 Unprocessable | Tree too large; switch to directory-by-directory traversal |
| Network error | Retry up to 3 times with exponential backoff |
| File content decode error | Skip file, log warning |

---

## Implementation Notes

- Prefer `raw.githubusercontent.com` for file downloads — faster and doesn't count against rate limit as heavily as API.
- Cache the file tree in memory during a single crawl run to avoid redundant requests.
- The `sha` field from the tree response is the Git blob SHA (hashed over the `blob <size>\0` header plus content) — use it as the document checksum rather than hashing the raw file bytes yourself.
- Detect `trueref.json` / `context7.json` in the tree before downloading other files, so filtering rules apply to the rest of the crawl.

---

## Files to Create

- `src/lib/server/crawler/github.crawler.ts`
- `src/lib/server/crawler/rate-limiter.ts`
- `src/lib/server/crawler/file-filter.ts`
- `src/lib/server/crawler/types.ts`
- `src/lib/server/crawler/github.crawler.test.ts`
||||
Reference in New Issue
Block a user