chore(FEEDBACK-0001): linting

This commit is contained in:
Giancarmine Salucci
2026-03-27 02:23:01 +01:00
parent 16436bfab2
commit 5a3c27224d
102 changed files with 5108 additions and 4976 deletions

View File

@@ -37,17 +37,46 @@ The crawler only downloads files with these extensions:
```typescript
const INDEXABLE_EXTENSIONS = new Set([
// Documentation
'.md', '.mdx', '.txt', '.rst',
// Code
'.ts', '.tsx', '.js', '.jsx',
'.py', '.rb', '.go', '.rs', '.java', '.cs', '.cpp', '.c', '.h',
'.swift', '.kt', '.php', '.scala', '.clj', '.ex', '.exs',
'.sh', '.bash', '.zsh', '.fish',
// Config / data
'.json', '.yaml', '.yml', '.toml',
// Web
'.html', '.css', '.svelte', '.vue',
// Documentation
'.md',
'.mdx',
'.txt',
'.rst',
// Code
'.ts',
'.tsx',
'.js',
'.jsx',
'.py',
'.rb',
'.go',
'.rs',
'.java',
'.cs',
'.cpp',
'.c',
'.h',
'.swift',
'.kt',
'.php',
'.scala',
'.clj',
'.ex',
'.exs',
'.sh',
'.bash',
'.zsh',
'.fish',
// Config / data
'.json',
'.yaml',
'.yml',
'.toml',
// Web
'.html',
'.css',
'.svelte',
'.vue'
]);
const MAX_FILE_SIZE_BYTES = 500_000; // 500 KB — skip large generated files
@@ -59,28 +88,28 @@ const MAX_FILE_SIZE_BYTES = 500_000; // 500 KB — skip large generated files
```typescript
export interface CrawledFile {
path: string; // relative path within repo, e.g. "src/index.ts"
content: string; // UTF-8 file content
size: number; // bytes
sha: string; // GitHub blob SHA (used as checksum)
language: string; // detected from extension
path: string; // relative path within repo, e.g. "src/index.ts"
content: string; // UTF-8 file content
size: number; // bytes
sha: string; // GitHub blob SHA (used as checksum)
language: string; // detected from extension
}
export interface CrawlResult {
files: CrawledFile[];
totalFiles: number; // files matching filters
skippedFiles: number; // filtered out or too large
branch: string; // branch/tag that was crawled
commitSha: string; // HEAD commit SHA
files: CrawledFile[];
totalFiles: number; // files matching filters
skippedFiles: number; // filtered out or too large
branch: string; // branch/tag that was crawled
commitSha: string; // HEAD commit SHA
}
export interface CrawlOptions {
owner: string;
repo: string;
ref?: string; // branch, tag, or commit SHA; defaults to repo default branch
token?: string; // GitHub PAT for private repos
config?: RepoConfig; // parsed trueref.json
onProgress?: (processed: number, total: number) => void;
owner: string;
repo: string;
ref?: string; // branch, tag, or commit SHA; defaults to repo default branch
token?: string; // GitHub PAT for private repos
config?: RepoConfig; // parsed trueref.json
onProgress?: (processed: number, total: number) => void;
}
```
@@ -89,12 +118,14 @@ export interface CrawlOptions {
## GitHub API Usage
### Step 1: Get default branch (if ref not specified)
```
GET https://api.github.com/repos/{owner}/{repo}
→ { default_branch: "main", stargazers_count: 12345 }
```
### Step 2: Fetch file tree (recursive)
```
GET https://api.github.com/repos/{owner}/{repo}/git/trees/{ref}?recursive=1
→ {
@@ -109,12 +140,14 @@ GET https://api.github.com/repos/{owner}/{repo}/git/trees/{ref}?recursive=1
If `truncated: true`, the tree exceeds GitHub's entry limit (~100k items). The trees API offers no pagination, so fall back to non-recursive, directory-by-directory traversal (the same strategy used for 422 responses) or filter to top-level directories first.
### Step 3: Download file contents (parallel)
```
GET https://api.github.com/repos/{owner}/{repo}/contents/{path}?ref={ref}
→ { content: "<base64>", encoding: "base64", size: 1234, sha: "abc123" }
```
Alternative for large repos: use raw content URL:
```
GET https://raw.githubusercontent.com/{owner}/{repo}/{ref}/{path}
```
@@ -124,48 +157,47 @@ GET https://raw.githubusercontent.com/{owner}/{repo}/{ref}/{path}
## Filtering Logic
```typescript
function shouldIndexFile(
filePath: string,
fileSize: number,
config?: RepoConfig
): boolean {
const ext = path.extname(filePath).toLowerCase();
const base = path.basename(filePath);
function shouldIndexFile(filePath: string, fileSize: number, config?: RepoConfig): boolean {
const ext = path.extname(filePath).toLowerCase();
const base = path.basename(filePath);
// 1. Must have indexable extension
if (!INDEXABLE_EXTENSIONS.has(ext)) return false;
// 1. Must have indexable extension
if (!INDEXABLE_EXTENSIONS.has(ext)) return false;
// 2. Must not exceed size limit
if (fileSize > MAX_FILE_SIZE_BYTES) return false;
// 2. Must not exceed size limit
if (fileSize > MAX_FILE_SIZE_BYTES) return false;
// 3. Exclude lockfiles and other non-source artifacts
if (IGNORED_FILE_NAMES.has(base)) return false;
// 3. Exclude lockfiles and other non-source artifacts
if (IGNORED_FILE_NAMES.has(base)) return false;
// 4. Exclude minified and bundled assets
if (base.includes('.min.') || base.endsWith('.bundle.js') || base.endsWith('.bundle.css')) {
return false;
}
// 4. Exclude minified and bundled assets
if (base.includes('.min.') || base.endsWith('.bundle.js') || base.endsWith('.bundle.css')) {
return false;
}
// 5. Apply config excludeFiles (exact filename match)
if (config?.excludeFiles?.includes(base)) return false;
// 5. Apply config excludeFiles (exact filename match)
if (config?.excludeFiles?.includes(base)) return false;
// 6. Exclude common dependency/build/cache directories at any depth
if (isInIgnoredDirectory(filePath)) return false;
// 6. Exclude common dependency/build/cache directories at any depth
if (isInIgnoredDirectory(filePath)) return false;
// 7. Apply config excludeFolders (regex or prefix match)
if (config?.excludeFolders?.some(folder =>
filePath.startsWith(folder) || new RegExp(folder).test(filePath)
)) return false;
// 7. Apply config excludeFolders (regex or prefix match)
if (
config?.excludeFolders?.some(
(folder) => filePath.startsWith(folder) || new RegExp(folder).test(filePath)
)
)
return false;
// 8. Apply config folders allowlist (if specified, only index those paths)
if (config?.folders?.length) {
const inAllowedFolder = config.folders.some(folder =>
filePath.startsWith(folder) || new RegExp(folder).test(filePath)
);
if (!inAllowedFolder) return false;
}
// 8. Apply config folders allowlist (if specified, only index those paths)
if (config?.folders?.length) {
const inAllowedFolder = config.folders.some(
(folder) => filePath.startsWith(folder) || new RegExp(folder).test(filePath)
);
if (!inAllowedFolder) return false;
}
return true;
return true;
}
```
@@ -177,20 +209,20 @@ The shared ignored-directory list is intentionally broader than the original bas
```typescript
class GitHubRateLimiter {
private remaining = 5000;
private resetAt = Date.now();
private remaining = 5000;
private resetAt = Date.now();
updateFromHeaders(headers: Headers): void {
this.remaining = parseInt(headers.get('X-RateLimit-Remaining') ?? '5000');
this.resetAt = parseInt(headers.get('X-RateLimit-Reset') ?? '0') * 1000;
}
updateFromHeaders(headers: Headers): void {
this.remaining = parseInt(headers.get('X-RateLimit-Remaining') ?? '5000');
this.resetAt = parseInt(headers.get('X-RateLimit-Reset') ?? '0') * 1000;
}
async waitIfNeeded(): Promise<void> {
if (this.remaining <= 10) {
const waitMs = Math.max(0, this.resetAt - Date.now()) + 1000;
await sleep(waitMs);
}
}
async waitIfNeeded(): Promise<void> {
if (this.remaining <= 10) {
const waitMs = Math.max(0, this.resetAt - Date.now()) + 1000;
await sleep(waitMs);
}
}
}
```
@@ -200,14 +232,14 @@ Requests are made with a concurrency limit of 10 parallel downloads using a sema
## Error Handling
| Scenario | Behavior |
|----------|---------|
| 404 Not Found | Throw `RepositoryNotFoundError` |
| 401 Unauthorized | Throw `AuthenticationError` (invalid or missing token) |
| 403 Forbidden | If `X-RateLimit-Remaining: 0`, wait and retry; else throw `PermissionError` |
| 422 Unprocessable | Tree too large; switch to directory-by-directory traversal |
| Network error | Retry up to 3 times with exponential backoff |
| File content decode error | Skip file, log warning |
| Scenario | Behavior |
| ------------------------- | --------------------------------------------------------------------------- |
| 404 Not Found | Throw `RepositoryNotFoundError` |
| 401 Unauthorized | Throw `AuthenticationError` (invalid or missing token) |
| 403 Forbidden | If `X-RateLimit-Remaining: 0`, wait and retry; else throw `PermissionError` |
| 422 Unprocessable | Tree too large; switch to directory-by-directory traversal |
| Network error | Retry up to 3 times with exponential backoff |
| File content decode error | Skip file, log warning |
---