feat(TRUEREF-0009): implement indexing pipeline and job queue

Implements the end-to-end indexing pipeline with a SQLite-backed job
queue, startup recovery, and REST API endpoints for job status.

- IndexingPipeline: orchestrates crawl → parse → atomic replace → embed
  → repo stats update with progress tracking at each stage
- JobQueue: sequential SQLite-backed queue (no external broker), deduplicates
  active jobs per repository, drains queued jobs on startup
- startup.ts: stale job recovery (running→failed), repo state reset, singleton
  initialization wired from hooks.server.ts
- GET /api/v1/jobs with repositoryId/status/limit filtering
- GET /api/v1/jobs/[id] single job lookup
- hooks.server.ts: initializes DB and pipeline on server start
- 18 unit tests covering queue, pipeline stages, recovery, and atomicity

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Giancarmine Salucci
2026-03-22 18:22:20 +01:00
parent bf4caf5e3b
commit 956b2a3a62
7 changed files with 1342 additions and 0 deletions

View File

@@ -0,0 +1,203 @@
/**
* SQLite-backed job queue for indexing jobs (TRUEREF-0009).
*
* Jobs are processed sequentially (one at a time) to avoid SQLite write
* contention. The queue uses setImmediate to yield to the event loop between
* jobs so that API requests remain responsive.
*/
import type Database from 'better-sqlite3';
import type { IndexingJob, NewIndexingJob } from '$lib/types';
import type { IndexingPipeline } from './indexing.pipeline.js';
export class JobQueue {
  /** True while a job is actively being run through the pipeline. */
  private isRunning = false;
  /** Injected after construction via setPipeline() (breaks the circular dependency). */
  private pipeline: IndexingPipeline | null = null;

  constructor(private readonly db: Database.Database) {}

  /**
   * Inject the pipeline dependency (avoids circular construction order).
   */
  setPipeline(pipeline: IndexingPipeline): void {
    this.pipeline = pipeline;
  }

  /**
   * Enqueue a new indexing job for the given repository.
   *
   * If a job for this repository is already queued or running, the existing
   * job is returned instead of creating a duplicate, so repeated calls are
   * idempotent while work is pending.
   *
   * @param repositoryId - Repository to index.
   * @param versionId - Optional specific version; stored as null when omitted.
   * @returns The freshly inserted job row, or the already-active job.
   */
  enqueue(repositoryId: string, versionId?: string): IndexingJob {
    // Return early if there's already an active job for this repo.
    const active = this.db
      .prepare<[string], IndexingJob>(
        `SELECT * FROM indexing_jobs
         WHERE repository_id = ? AND status IN ('queued', 'running')
         ORDER BY created_at DESC LIMIT 1`
      )
      .get(repositoryId);
    if (active) return active;

    // Timestamps are stored as unix seconds in the DB.
    const now = Math.floor(Date.now() / 1000);
    const job: NewIndexingJob = {
      id: crypto.randomUUID(),
      repositoryId,
      versionId: versionId ?? null,
      status: 'queued',
      progress: 0,
      totalFiles: 0,
      processedFiles: 0,
      error: null,
      startedAt: null,
      completedAt: null,
      createdAt: new Date(now * 1000)
    };
    this.db
      .prepare(
        `INSERT INTO indexing_jobs
         (id, repository_id, version_id, status, progress, total_files,
          processed_files, error, started_at, completed_at, created_at)
         VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
      )
      .run(
        job.id,
        job.repositoryId,
        job.versionId,
        job.status,
        job.progress,
        job.totalFiles,
        job.processedFiles,
        job.error,
        job.startedAt,
        job.completedAt,
        now
      );

    // Kick off sequential processing if not already running.
    this.scheduleProcessing();

    // Re-read the row so the caller gets exactly what the DB stored.
    return this.db
      .prepare<[string], IndexingJob>(`SELECT * FROM indexing_jobs WHERE id = ?`)
      .get(job.id)!;
  }

  /**
   * Schedule a queue pass on the next event-loop turn.
   *
   * Centralizes the setImmediate call so processNext()'s promise is never
   * left floating: a rejection thrown outside its own try/catch (e.g. a DB
   * error while selecting the next job) would otherwise surface as an
   * unhandled promise rejection and can crash the Node process.
   * No-op when a job is already being processed.
   */
  private scheduleProcessing(): void {
    if (this.isRunning) return;
    setImmediate(() => {
      void this.processNext().catch((err) => {
        console.error(
          `[JobQueue] Queue processing error: ${err instanceof Error ? err.message : String(err)}`
        );
      });
    });
  }

  /**
   * Pick the oldest queued job and run it through the pipeline.
   * Re-scheduled via scheduleProcessing() so the event loop stays unblocked
   * between jobs and API requests remain responsive.
   */
  private async processNext(): Promise<void> {
    if (this.isRunning) return;
    if (!this.pipeline) {
      console.warn('[JobQueue] No pipeline configured — cannot process jobs.');
      return;
    }
    const job = this.db
      .prepare<[], IndexingJob>(
        `SELECT * FROM indexing_jobs
         WHERE status = 'queued'
         ORDER BY created_at ASC LIMIT 1`
      )
      .get();
    if (!job) return;

    this.isRunning = true;
    try {
      await this.pipeline.run(job);
    } catch (err) {
      console.error(
        `[JobQueue] Job ${job.id} failed: ${err instanceof Error ? err.message : String(err)}`
      );
      // Defensive: if the pipeline threw before moving this job out of
      // 'queued'/'running', mark it failed here. Otherwise the finally
      // block below would find the very same job still 'queued' and
      // reschedule it forever (infinite crash-retry loop).
      this.db
        .prepare(
          `UPDATE indexing_jobs
           SET status = 'failed', error = ?, completed_at = ?
           WHERE id = ? AND status IN ('queued', 'running')`
        )
        .run(
          err instanceof Error ? err.message : String(err),
          Math.floor(Date.now() / 1000),
          job.id
        );
    } finally {
      this.isRunning = false;
      // Check whether another job was queued while this one ran.
      const next = this.db
        .prepare<[], { id: string }>(
          `SELECT id FROM indexing_jobs WHERE status = 'queued' LIMIT 1`
        )
        .get();
      if (next) {
        this.scheduleProcessing();
      }
    }
  }

  /**
   * Retrieve a single job by ID.
   *
   * @returns The job row, or null when no job with that ID exists.
   */
  getJob(id: string): IndexingJob | null {
    return (
      this.db
        .prepare<[string], IndexingJob>(`SELECT * FROM indexing_jobs WHERE id = ?`)
        .get(id) ?? null
    );
  }

  /**
   * Build a WHERE clause and bound parameters from optional job filters.
   * Shared by listJobs() and countJobs() so the two stay in sync.
   */
  private buildFilter(options?: {
    repositoryId?: string;
    status?: IndexingJob['status'];
  }): { where: string; params: unknown[] } {
    const conditions: string[] = [];
    const params: unknown[] = [];
    if (options?.repositoryId) {
      conditions.push('repository_id = ?');
      params.push(options.repositoryId);
    }
    if (options?.status) {
      conditions.push('status = ?');
      params.push(options.status);
    }
    const where = conditions.length > 0 ? `WHERE ${conditions.join(' AND ')}` : '';
    return { where, params };
  }

  /**
   * List recent jobs (newest first), optionally filtered by repository
   * and/or status.
   *
   * @param options.limit - Max rows; defaults to 20, clamped to [1, 200].
   *   (Clamping the lower bound matters: a negative value would otherwise
   *   reach SQLite as `LIMIT -1`, which means "no limit".)
   */
  listJobs(options?: {
    repositoryId?: string;
    status?: IndexingJob['status'];
    limit?: number;
  }): IndexingJob[] {
    const limit = Math.max(1, Math.min(options?.limit ?? 20, 200));
    const { where, params } = this.buildFilter(options);
    const sql = `SELECT * FROM indexing_jobs ${where} ORDER BY created_at DESC LIMIT ?`;
    params.push(limit);
    return this.db.prepare<unknown[], IndexingJob>(sql).all(...params);
  }

  /**
   * Trigger processing of any queued jobs (e.g. after server restart).
   * Safe to call multiple times; a no-op if the queue is already running.
   */
  drainQueued(): void {
    this.scheduleProcessing();
  }

  /**
   * Count all jobs matching optional filters.
   */
  countJobs(options?: { repositoryId?: string; status?: IndexingJob['status'] }): number {
    const { where, params } = this.buildFilter(options);
    const sql = `SELECT COUNT(*) as n FROM indexing_jobs ${where}`;
    const row = this.db.prepare<unknown[], { n: number }>(sql).get(...params);
    return row?.n ?? 0;
  }
}