feat(TRUEREF-0003-0004): implement GitHub and local filesystem crawlers

- GitHub crawler with rate limiting, semaphore concurrency, retry logic
- File filtering by extension, size, and trueref.json rules
- Local filesystem crawler with SHA-256 checksums and progress callbacks
- Shared types and file filter logic between both crawlers

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Giancarmine Salucci
2026-03-23 09:06:07 +01:00
parent cb253ffe98
commit 1c15d6c474
7 changed files with 2308 additions and 0 deletions

View File

@@ -0,0 +1,123 @@
/**
* GitHub API rate-limit tracker and backoff helper (TRUEREF-0003).
*
* Reads X-RateLimit-* headers from every API response and pauses outgoing
* requests when the remaining allowance drops to ≤ 10.
*/
/**
 * Promise-based wrapper around `setTimeout`.
 *
 * @param ms - Delay, in milliseconds, handed straight to `setTimeout`.
 * @returns A promise that resolves (with no value) once the timer fires.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
/**
 * Tracks the rate-limit state reported by GitHub API responses and pauses
 * callers while the remaining allowance is critically low.
 *
 * Typical use: call {@link GitHubRateLimiter.waitIfNeeded} before each
 * request and {@link GitHubRateLimiter.updateFromHeaders} after each response.
 */
export class GitHubRateLimiter {
  /** At or below this many remaining requests, `waitIfNeeded` sleeps. */
  private static readonly LOW_WATERMARK = 10;
  /** Extra delay added on top of the time left until the window resets. */
  private static readonly RESET_BUFFER_MS = 1000;

  /** Starts at 5000 and is overwritten as soon as a response reports a value. */
  private remaining = 5000;
  /** Reset instant as a Unix epoch in milliseconds. */
  private resetAt = Date.now();

  /**
   * Update internal counters from the headers of a GitHub API response.
   *
   * Missing headers leave the corresponding counter untouched. Values that do
   * not parse as an integer are ignored as well: storing `NaN` would make
   * every `remaining <= threshold` comparison false and silently disable the
   * limiter.
   *
   * @param headers - Response headers; `X-RateLimit-Remaining` and
   *                  `X-RateLimit-Reset` are read.
   */
  updateFromHeaders(headers: Headers): void {
    const remaining = GitHubRateLimiter.parseHeaderInt(headers.get('X-RateLimit-Remaining'));
    const resetSeconds = GitHubRateLimiter.parseHeaderInt(headers.get('X-RateLimit-Reset'));

    if (remaining !== undefined) {
      this.remaining = remaining;
    }
    if (resetSeconds !== undefined) {
      // GitHub returns a Unix epoch in seconds.
      this.resetAt = resetSeconds * 1000;
    }
  }

  /**
   * If the remaining allowance is critically low (≤ 10), sleep until the
   * rate-limit window resets (plus a 1 s buffer). Otherwise return without
   * sleeping.
   */
  async waitIfNeeded(): Promise<void> {
    if (this.remaining <= GitHubRateLimiter.LOW_WATERMARK) {
      // A reset instant already in the past still incurs the buffer delay.
      const untilReset = Math.max(0, this.resetAt - Date.now());
      await sleep(untilReset + GitHubRateLimiter.RESET_BUFFER_MS);
    }
  }

  /** Remaining requests in the current window (for testing). */
  get remainingRequests(): number {
    return this.remaining;
  }

  /** Reset timestamp as a Unix epoch in ms (for testing). */
  get resetTimestamp(): number {
    return this.resetAt;
  }

  /**
   * Parse a base-10 integer header value.
   *
   * @returns The parsed number, or `undefined` when the header is absent or
   *          is not parseable as an integer.
   */
  private static parseHeaderInt(raw: string | null): number | undefined {
    if (raw === null) return undefined;
    const value = parseInt(raw, 10);
    return Number.isNaN(value) ? undefined : value;
  }
}
/**
 * Exponential-backoff retry wrapper for network-level errors.
 *
 * Calls `fn` up to `maxAttempts` times. After each failed attempt except the
 * last, it waits `baseDelayMs * 2^attempt` before trying again. With the
 * defaults (3 attempts, 1 s base) that means waits of 1 s and then 2 s.
 *
 * @param fn - Async function to attempt.
 * @param maxAttempts - Maximum number of attempts (default 3). Must be
 *                      greater than 0.
 * @param isRetryable - Optional predicate; when it returns false for a given
 *                      error the error is re-thrown immediately without further
 *                      retries. Defaults to retrying all errors.
 * @param baseDelayMs - Delay before the first retry in milliseconds
 *                      (default 1000); doubled for each subsequent retry.
 * @returns The value resolved by the first successful call to `fn`.
 * @throws RangeError when `maxAttempts` is not greater than 0 (including
 *         `NaN`). Without this guard the loop would never run and the
 *         function would reject with `undefined`.
 * @throws The error from the last attempt once all attempts are exhausted, or
 *         the first error for which `isRetryable` returns false.
 */
export async function withRetry<T>(
  fn: () => Promise<T>,
  maxAttempts = 3,
  isRetryable: (err: unknown) => boolean = () => true,
  baseDelayMs = 1000
): Promise<T> {
  // Written as a negated comparison so that NaN is rejected too.
  if (!(maxAttempts > 0)) {
    throw new RangeError(
      `withRetry: maxAttempts must be greater than 0 (received ${maxAttempts})`
    );
  }

  let lastError: unknown;
  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    try {
      return await fn();
    } catch (err) {
      if (!isRetryable(err)) throw err;
      lastError = err;

      // No point in sleeping after the final attempt; fall through and rethrow.
      const hasAttemptsLeft = attempt < maxAttempts - 1;
      if (hasAttemptsLeft) {
        await sleep(baseDelayMs * 2 ** attempt);
      }
    }
  }
  throw lastError;
}
/**
 * Counting semaphore for async code: caps how many holders may be inside a
 * guarded section at once. Callers that cannot get a slot are parked in a
 * FIFO list and resumed one at a time as slots are released.
 */
export class Semaphore {
  /** Slots that can still be handed out without waiting. */
  private available: number;
  /** Resolvers of parked `acquire()` calls, oldest first. */
  private readonly waiters: Array<() => void> = [];

  /**
   * @param concurrency - Initial number of free slots.
   */
  constructor(concurrency: number) {
    this.available = concurrency;
  }

  /**
   * Take a slot. Resolves right away when one is free; otherwise resolves
   * when a later `release()` hands one over.
   */
  async acquire(): Promise<void> {
    const slotIsFree = this.available > 0;
    if (slotIsFree) {
      this.available -= 1;
      return;
    }
    return new Promise<void>((wake) => {
      this.waiters.push(wake);
    });
  }

  /**
   * Give a slot back. When somebody is parked, the slot goes straight to the
   * oldest waiter (the free-slot counter is left alone); otherwise the
   * counter is incremented.
   */
  release(): void {
    const wake = this.waiters.shift();
    if (wake === undefined) {
      this.available += 1;
      return;
    }
    wake();
  }

  /**
   * Run `fn` while holding a slot. The slot is released whether `fn`
   * resolves or rejects.
   *
   * @param fn - Async work to execute under the semaphore.
   * @returns Whatever `fn` resolves with; rejections from `fn` propagate.
   */
  async run<T>(fn: () => Promise<T>): Promise<T> {
    await this.acquire();
    try {
      const outcome = await fn();
      return outcome;
    } finally {
      this.release();
    }
  }
}