chore(FEEDBACK-0001): linting
This commit is contained in:
@@ -37,17 +37,46 @@ The crawler only downloads files with these extensions:
|
||||
|
||||
```typescript
|
||||
const INDEXABLE_EXTENSIONS = new Set([
|
||||
// Documentation
|
||||
'.md', '.mdx', '.txt', '.rst',
|
||||
// Code
|
||||
'.ts', '.tsx', '.js', '.jsx',
|
||||
'.py', '.rb', '.go', '.rs', '.java', '.cs', '.cpp', '.c', '.h',
|
||||
'.swift', '.kt', '.php', '.scala', '.clj', '.ex', '.exs',
|
||||
'.sh', '.bash', '.zsh', '.fish',
|
||||
// Config / data
|
||||
'.json', '.yaml', '.yml', '.toml',
|
||||
// Web
|
||||
'.html', '.css', '.svelte', '.vue',
|
||||
// Documentation
|
||||
'.md',
|
||||
'.mdx',
|
||||
'.txt',
|
||||
'.rst',
|
||||
// Code
|
||||
'.ts',
|
||||
'.tsx',
|
||||
'.js',
|
||||
'.jsx',
|
||||
'.py',
|
||||
'.rb',
|
||||
'.go',
|
||||
'.rs',
|
||||
'.java',
|
||||
'.cs',
|
||||
'.cpp',
|
||||
'.c',
|
||||
'.h',
|
||||
'.swift',
|
||||
'.kt',
|
||||
'.php',
|
||||
'.scala',
|
||||
'.clj',
|
||||
'.ex',
|
||||
'.exs',
|
||||
'.sh',
|
||||
'.bash',
|
||||
'.zsh',
|
||||
'.fish',
|
||||
// Config / data
|
||||
'.json',
|
||||
'.yaml',
|
||||
'.yml',
|
||||
'.toml',
|
||||
// Web
|
||||
'.html',
|
||||
'.css',
|
||||
'.svelte',
|
||||
'.vue'
|
||||
]);
|
||||
|
||||
const MAX_FILE_SIZE_BYTES = 500_000; // 500 KB — skip large generated files
|
||||
@@ -59,28 +88,28 @@ const MAX_FILE_SIZE_BYTES = 500_000; // 500 KB — skip large generated files
|
||||
|
||||
```typescript
|
||||
export interface CrawledFile {
|
||||
path: string; // relative path within repo, e.g. "src/index.ts"
|
||||
content: string; // UTF-8 file content
|
||||
size: number; // bytes
|
||||
sha: string; // GitHub blob SHA (used as checksum)
|
||||
language: string; // detected from extension
|
||||
path: string; // relative path within repo, e.g. "src/index.ts"
|
||||
content: string; // UTF-8 file content
|
||||
size: number; // bytes
|
||||
sha: string; // GitHub blob SHA (used as checksum)
|
||||
language: string; // detected from extension
|
||||
}
|
||||
|
||||
export interface CrawlResult {
|
||||
files: CrawledFile[];
|
||||
totalFiles: number; // files matching filters
|
||||
skippedFiles: number; // filtered out or too large
|
||||
branch: string; // branch/tag that was crawled
|
||||
commitSha: string; // HEAD commit SHA
|
||||
files: CrawledFile[];
|
||||
totalFiles: number; // files matching filters
|
||||
skippedFiles: number; // filtered out or too large
|
||||
branch: string; // branch/tag that was crawled
|
||||
commitSha: string; // HEAD commit SHA
|
||||
}
|
||||
|
||||
export interface CrawlOptions {
|
||||
owner: string;
|
||||
repo: string;
|
||||
ref?: string; // branch, tag, or commit SHA; defaults to repo default branch
|
||||
token?: string; // GitHub PAT for private repos
|
||||
config?: RepoConfig; // parsed trueref.json
|
||||
onProgress?: (processed: number, total: number) => void;
|
||||
owner: string;
|
||||
repo: string;
|
||||
ref?: string; // branch, tag, or commit SHA; defaults to repo default branch
|
||||
token?: string; // GitHub PAT for private repos
|
||||
config?: RepoConfig; // parsed trueref.json
|
||||
onProgress?: (processed: number, total: number) => void;
|
||||
}
|
||||
```
|
||||
|
||||
@@ -89,12 +118,14 @@ export interface CrawlOptions {
|
||||
## GitHub API Usage
|
||||
|
||||
### Step 1: Get default branch (if ref not specified)
|
||||
|
||||
```
|
||||
GET https://api.github.com/repos/{owner}/{repo}
|
||||
→ { default_branch: "main", stargazers_count: 12345 }
|
||||
```
|
||||
|
||||
### Step 2: Fetch file tree (recursive)
|
||||
|
||||
```
|
||||
GET https://api.github.com/repos/{owner}/{repo}/git/trees/{ref}?recursive=1
|
||||
→ {
|
||||
@@ -109,12 +140,14 @@ GET https://api.github.com/repos/{owner}/{repo}/git/trees/{ref}?recursive=1
|
||||
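If `truncated: true`, the response exceeded GitHub's tree limit (100,000 entries or 7 MB) and is incomplete. Fall back to non-recursive requests and fetch one sub-tree (directory) at a time, filtering top-level directories first.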
|
||||
|
||||
### Step 3: Download file contents (parallel)
|
||||
|
||||
```
|
||||
GET https://api.github.com/repos/{owner}/{repo}/contents/{path}?ref={ref}
|
||||
→ { content: "<base64>", encoding: "base64", size: 1234, sha: "abc123" }
|
||||
```
|
||||
|
||||
Alternative for large repos: use the raw content URL instead:
|
||||
|
||||
```
|
||||
GET https://raw.githubusercontent.com/{owner}/{repo}/{ref}/{path}
|
||||
```
|
||||
@@ -124,48 +157,47 @@ GET https://raw.githubusercontent.com/{owner}/{repo}/{ref}/{path}
|
||||
## Filtering Logic
|
||||
|
||||
```typescript
|
||||
function shouldIndexFile(
|
||||
filePath: string,
|
||||
fileSize: number,
|
||||
config?: RepoConfig
|
||||
): boolean {
|
||||
const ext = path.extname(filePath).toLowerCase();
|
||||
const base = path.basename(filePath);
|
||||
function shouldIndexFile(filePath: string, fileSize: number, config?: RepoConfig): boolean {
|
||||
const ext = path.extname(filePath).toLowerCase();
|
||||
const base = path.basename(filePath);
|
||||
|
||||
// 1. Must have indexable extension
|
||||
if (!INDEXABLE_EXTENSIONS.has(ext)) return false;
|
||||
// 1. Must have indexable extension
|
||||
if (!INDEXABLE_EXTENSIONS.has(ext)) return false;
|
||||
|
||||
// 2. Must not exceed size limit
|
||||
if (fileSize > MAX_FILE_SIZE_BYTES) return false;
|
||||
// 2. Must not exceed size limit
|
||||
if (fileSize > MAX_FILE_SIZE_BYTES) return false;
|
||||
|
||||
// 3. Exclude lockfiles and other non-source artifacts
|
||||
if (IGNORED_FILE_NAMES.has(base)) return false;
|
||||
// 3. Exclude lockfiles and other non-source artifacts
|
||||
if (IGNORED_FILE_NAMES.has(base)) return false;
|
||||
|
||||
// 4. Exclude minified and bundled assets
|
||||
if (base.includes('.min.') || base.endsWith('.bundle.js') || base.endsWith('.bundle.css')) {
|
||||
return false;
|
||||
}
|
||||
// 4. Exclude minified and bundled assets
|
||||
if (base.includes('.min.') || base.endsWith('.bundle.js') || base.endsWith('.bundle.css')) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// 5. Apply config excludeFiles (exact filename match)
|
||||
if (config?.excludeFiles?.includes(base)) return false;
|
||||
// 5. Apply config excludeFiles (exact filename match)
|
||||
if (config?.excludeFiles?.includes(base)) return false;
|
||||
|
||||
// 6. Exclude common dependency/build/cache directories at any depth
|
||||
if (isInIgnoredDirectory(filePath)) return false;
|
||||
// 6. Exclude common dependency/build/cache directories at any depth
|
||||
if (isInIgnoredDirectory(filePath)) return false;
|
||||
|
||||
// 7. Apply config excludeFolders (regex or prefix match)
|
||||
if (config?.excludeFolders?.some(folder =>
|
||||
filePath.startsWith(folder) || new RegExp(folder).test(filePath)
|
||||
)) return false;
|
||||
// 7. Apply config excludeFolders (regex or prefix match)
|
||||
if (
|
||||
config?.excludeFolders?.some(
|
||||
(folder) => filePath.startsWith(folder) || new RegExp(folder).test(filePath)
|
||||
)
|
||||
)
|
||||
return false;
|
||||
|
||||
// 8. Apply config folders allowlist (if specified, only index those paths)
|
||||
if (config?.folders?.length) {
|
||||
const inAllowedFolder = config.folders.some(folder =>
|
||||
filePath.startsWith(folder) || new RegExp(folder).test(filePath)
|
||||
);
|
||||
if (!inAllowedFolder) return false;
|
||||
}
|
||||
// 8. Apply config folders allowlist (if specified, only index those paths)
|
||||
if (config?.folders?.length) {
|
||||
const inAllowedFolder = config.folders.some(
|
||||
(folder) => filePath.startsWith(folder) || new RegExp(folder).test(filePath)
|
||||
);
|
||||
if (!inAllowedFolder) return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
return true;
|
||||
}
|
||||
```
|
||||
|
||||
@@ -177,20 +209,20 @@ The shared ignored-directory list is intentionally broader than the original bas
|
||||
|
||||
```typescript
|
||||
class GitHubRateLimiter {
|
||||
private remaining = 5000;
|
||||
private resetAt = Date.now();
|
||||
private remaining = 5000;
|
||||
private resetAt = Date.now();
|
||||
|
||||
updateFromHeaders(headers: Headers): void {
|
||||
this.remaining = parseInt(headers.get('X-RateLimit-Remaining') ?? '5000');
|
||||
this.resetAt = parseInt(headers.get('X-RateLimit-Reset') ?? '0') * 1000;
|
||||
}
|
||||
updateFromHeaders(headers: Headers): void {
|
||||
this.remaining = parseInt(headers.get('X-RateLimit-Remaining') ?? '5000');
|
||||
this.resetAt = parseInt(headers.get('X-RateLimit-Reset') ?? '0') * 1000;
|
||||
}
|
||||
|
||||
async waitIfNeeded(): Promise<void> {
|
||||
if (this.remaining <= 10) {
|
||||
const waitMs = Math.max(0, this.resetAt - Date.now()) + 1000;
|
||||
await sleep(waitMs);
|
||||
}
|
||||
}
|
||||
async waitIfNeeded(): Promise<void> {
|
||||
if (this.remaining <= 10) {
|
||||
const waitMs = Math.max(0, this.resetAt - Date.now()) + 1000;
|
||||
await sleep(waitMs);
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
@@ -200,14 +232,14 @@ Requests are made with a concurrency limit of 10 parallel downloads using a sema
|
||||
|
||||
## Error Handling
|
||||
|
||||
| Scenario | Behavior |
|
||||
|----------|---------|
|
||||
| 404 Not Found | Throw `RepositoryNotFoundError` |
|
||||
| 401 Unauthorized | Throw `AuthenticationError` (invalid or missing token) |
|
||||
| 403 Forbidden | If `X-RateLimit-Remaining: 0`, wait and retry; else throw `PermissionError` |
|
||||
| 422 Unprocessable | Tree too large; switch to directory-by-directory traversal |
|
||||
| Network error | Retry up to 3 times with exponential backoff |
|
||||
| File content decode error | Skip file, log warning |
|
||||
| Scenario | Behavior |
|
||||
| ------------------------- | --------------------------------------------------------------------------- |
|
||||
| 404 Not Found | Throw `RepositoryNotFoundError` |
|
||||
| 401 Unauthorized | Throw `AuthenticationError` (invalid or missing token) |
|
||||
| 403 Forbidden | If `X-RateLimit-Remaining: 0`, wait and retry; else throw `PermissionError` |
|
||||
| 422 Unprocessable | Tree too large; switch to directory-by-directory traversal |
|
||||
| Network error | Retry up to 3 times with exponential backoff |
|
||||
| File content decode error | Skip file, log warning |
|
||||
|
||||
---
|
||||
|
||||
|
||||
Reference in New Issue
Block a user