feat(TRUEREF-0009): implement indexing pipeline and job queue

Implements the end-to-end indexing pipeline with a SQLite-backed job
queue, startup recovery, and REST API endpoints for job status.

- IndexingPipeline: orchestrates crawl → parse → atomic replace → embed
  → repo stats update with progress tracking at each stage
- JobQueue: sequential SQLite-backed queue (no external broker), deduplicates
  active jobs per repository, drains queued jobs on startup
- startup.ts: stale job recovery (running→failed), repo state reset, singleton
  initialization wired from hooks.server.ts
- GET /api/v1/jobs with repositoryId/status/limit filtering
- GET /api/v1/jobs/[id] single job lookup
- hooks.server.ts: initializes DB and pipeline on server start
- 18 unit tests covering queue, pipeline stages, recovery, and atomicity

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Giancarmine Salucci
2026-03-22 18:22:20 +01:00
parent bf4caf5e3b
commit 956b2a3a62
7 changed files with 1342 additions and 0 deletions

View File

@@ -0,0 +1,203 @@
/**
* SQLite-backed job queue for indexing jobs (TRUEREF-0009).
*
* Jobs are processed sequentially (one at a time) to avoid SQLite write
* contention. The queue uses setImmediate to yield to the event loop between
* jobs so that API requests remain responsive.
*/
import type Database from 'better-sqlite3';
import type { IndexingJob, NewIndexingJob } from '$lib/types';
import type { IndexingPipeline } from './indexing.pipeline.js';
export class JobQueue {
  /** True while a job is actively being run through the pipeline. */
  private isRunning = false;
  /** Injected after construction via setPipeline() (breaks the circular dependency). */
  private pipeline: IndexingPipeline | null = null;

  constructor(private readonly db: Database.Database) {}

  /**
   * Inject the pipeline dependency (avoids circular construction order).
   */
  setPipeline(pipeline: IndexingPipeline): void {
    this.pipeline = pipeline;
  }

  /**
   * Enqueue a new indexing job for the given repository.
   *
   * If a job for this repository is already queued or running, the existing
   * job is returned instead of creating a duplicate, so repeated calls are
   * idempotent while work is pending.
   *
   * @param repositoryId - Repository to index.
   * @param versionId - Optional specific version; stored as null when omitted.
   * @returns The freshly inserted job row, or the already-active job.
   */
  enqueue(repositoryId: string, versionId?: string): IndexingJob {
    // Return early if there's already an active job for this repo.
    const active = this.db
      .prepare<[string], IndexingJob>(
        `SELECT * FROM indexing_jobs
         WHERE repository_id = ? AND status IN ('queued', 'running')
         ORDER BY created_at DESC LIMIT 1`
      )
      .get(repositoryId);
    if (active) return active;

    // Timestamps are stored as unix seconds in the DB.
    const now = Math.floor(Date.now() / 1000);
    const job: NewIndexingJob = {
      id: crypto.randomUUID(),
      repositoryId,
      versionId: versionId ?? null,
      status: 'queued',
      progress: 0,
      totalFiles: 0,
      processedFiles: 0,
      error: null,
      startedAt: null,
      completedAt: null,
      createdAt: new Date(now * 1000)
    };
    this.db
      .prepare(
        `INSERT INTO indexing_jobs
         (id, repository_id, version_id, status, progress, total_files,
          processed_files, error, started_at, completed_at, created_at)
         VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
      )
      .run(
        job.id,
        job.repositoryId,
        job.versionId,
        job.status,
        job.progress,
        job.totalFiles,
        job.processedFiles,
        job.error,
        job.startedAt,
        job.completedAt,
        now
      );

    // Kick off sequential processing if not already running.
    this.scheduleProcessing();

    // Re-read the row so the caller gets exactly what the DB stored.
    return this.db
      .prepare<[string], IndexingJob>(`SELECT * FROM indexing_jobs WHERE id = ?`)
      .get(job.id)!;
  }

  /**
   * Schedule a queue pass on the next event-loop turn.
   *
   * Centralizes the setImmediate call so processNext()'s promise is never
   * left floating: a rejection thrown outside its own try/catch (e.g. a DB
   * error while selecting the next job) would otherwise surface as an
   * unhandled promise rejection and can crash the Node process.
   * No-op when a job is already being processed.
   */
  private scheduleProcessing(): void {
    if (this.isRunning) return;
    setImmediate(() => {
      void this.processNext().catch((err) => {
        console.error(
          `[JobQueue] Queue processing error: ${err instanceof Error ? err.message : String(err)}`
        );
      });
    });
  }

  /**
   * Pick the oldest queued job and run it through the pipeline.
   * Re-scheduled via scheduleProcessing() so the event loop stays unblocked
   * between jobs and API requests remain responsive.
   */
  private async processNext(): Promise<void> {
    if (this.isRunning) return;
    if (!this.pipeline) {
      console.warn('[JobQueue] No pipeline configured — cannot process jobs.');
      return;
    }
    const job = this.db
      .prepare<[], IndexingJob>(
        `SELECT * FROM indexing_jobs
         WHERE status = 'queued'
         ORDER BY created_at ASC LIMIT 1`
      )
      .get();
    if (!job) return;

    this.isRunning = true;
    try {
      await this.pipeline.run(job);
    } catch (err) {
      console.error(
        `[JobQueue] Job ${job.id} failed: ${err instanceof Error ? err.message : String(err)}`
      );
      // Defensive: if the pipeline threw before moving this job out of
      // 'queued'/'running', mark it failed here. Otherwise the finally
      // block below would find the very same job still 'queued' and
      // reschedule it forever (infinite crash-retry loop).
      this.db
        .prepare(
          `UPDATE indexing_jobs
           SET status = 'failed', error = ?, completed_at = ?
           WHERE id = ? AND status IN ('queued', 'running')`
        )
        .run(
          err instanceof Error ? err.message : String(err),
          Math.floor(Date.now() / 1000),
          job.id
        );
    } finally {
      this.isRunning = false;
      // Check whether another job was queued while this one ran.
      const next = this.db
        .prepare<[], { id: string }>(
          `SELECT id FROM indexing_jobs WHERE status = 'queued' LIMIT 1`
        )
        .get();
      if (next) {
        this.scheduleProcessing();
      }
    }
  }

  /**
   * Retrieve a single job by ID.
   *
   * @returns The job row, or null when no job with that ID exists.
   */
  getJob(id: string): IndexingJob | null {
    return (
      this.db
        .prepare<[string], IndexingJob>(`SELECT * FROM indexing_jobs WHERE id = ?`)
        .get(id) ?? null
    );
  }

  /**
   * Build a WHERE clause and bound parameters from optional job filters.
   * Shared by listJobs() and countJobs() so the two stay in sync.
   */
  private buildFilter(options?: {
    repositoryId?: string;
    status?: IndexingJob['status'];
  }): { where: string; params: unknown[] } {
    const conditions: string[] = [];
    const params: unknown[] = [];
    if (options?.repositoryId) {
      conditions.push('repository_id = ?');
      params.push(options.repositoryId);
    }
    if (options?.status) {
      conditions.push('status = ?');
      params.push(options.status);
    }
    const where = conditions.length > 0 ? `WHERE ${conditions.join(' AND ')}` : '';
    return { where, params };
  }

  /**
   * List recent jobs (newest first), optionally filtered by repository
   * and/or status.
   *
   * @param options.limit - Max rows; defaults to 20, clamped to [1, 200].
   *   (Clamping the lower bound matters: a negative value would otherwise
   *   reach SQLite as `LIMIT -1`, which means "no limit".)
   */
  listJobs(options?: {
    repositoryId?: string;
    status?: IndexingJob['status'];
    limit?: number;
  }): IndexingJob[] {
    const limit = Math.max(1, Math.min(options?.limit ?? 20, 200));
    const { where, params } = this.buildFilter(options);
    const sql = `SELECT * FROM indexing_jobs ${where} ORDER BY created_at DESC LIMIT ?`;
    params.push(limit);
    return this.db.prepare<unknown[], IndexingJob>(sql).all(...params);
  }

  /**
   * Trigger processing of any queued jobs (e.g. after server restart).
   * Safe to call multiple times; a no-op if the queue is already running.
   */
  drainQueued(): void {
    this.scheduleProcessing();
  }

  /**
   * Count all jobs matching optional filters.
   */
  countJobs(options?: { repositoryId?: string; status?: IndexingJob['status'] }): number {
    const { where, params } = this.buildFilter(options);
    const sql = `SELECT COUNT(*) as n FROM indexing_jobs ${where}`;
    const row = this.db.prepare<unknown[], { n: number }>(sql).get(...params);
    return row?.n ?? 0;
  }
}