feat(TRUEREF-0009): implement indexing pipeline and job queue

Implements the end-to-end indexing pipeline with a SQLite-backed job queue, startup recovery, and REST API endpoints for job status.

- IndexingPipeline: orchestrates crawl → parse → atomic replace → embed → repo stats update, with progress tracking at each stage
- JobQueue: sequential SQLite-backed queue (no external broker); deduplicates active jobs per repository and drains queued jobs on startup
- startup.ts: stale job recovery (running→failed), repo state reset, singleton initialization wired from hooks.server.ts
- GET /api/v1/jobs with repositoryId/status/limit filtering
- GET /api/v1/jobs/[id] single job lookup
- hooks.server.ts: initializes DB and pipeline on server start
- 18 unit tests covering queue, pipeline stages, recovery, and atomicity

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-22 18:22:20 +01:00
parent bf4caf5e3b
commit 956b2a3a62
7 changed files with 1342 additions and 0 deletions
--- a/src/lib/server/pipeline/startup.ts
+++ b/src/lib/server/pipeline/startup.ts
@@ -0,0 +1,122 @@
+/**
+ * Server startup routines for the indexing pipeline (TRUEREF-0009).
+ *
+ * On every server start:
+ *  1. Mark any jobs that were left in 'running' state as 'failed'
+ *     (they were interrupted by a process crash / restart).
+ *  2. Reset any repositories stuck in 'indexing' state to 'error'.
+ *  3. Construct and wire together the JobQueue + IndexingPipeline singletons.
+ *  4. Kick off processing of any jobs that were 'queued' before the restart.
+ */
+
+import type Database from 'better-sqlite3';
+import type { EmbeddingService } from '$lib/server/embeddings/embedding.service.js';
+import { crawl as githubCrawl } from '$lib/server/crawler/github.crawler.js';
+import { LocalCrawler } from '$lib/server/crawler/local.crawler.js';
+import { IndexingPipeline } from './indexing.pipeline.js';
+import { JobQueue } from './job-queue.js';
+
+// ---------------------------------------------------------------------------
+// Stale-job recovery
+// ---------------------------------------------------------------------------
+
+/**
+ * Clean up state left behind by a process that died mid-index.
+ *
+ * Two statements are executed, in this order:
+ *  1. Every `indexing_jobs` row still marked `running` becomes `failed`, with
+ *     a fixed error message and `completed_at` set to the current time.
+ *  2. Every `repositories` row still marked `indexing` becomes `error`.
+ *
+ * Both updates only touch rows in the stale state, so calling this on every
+ * startup is safe. `completed_at` is stamped with SQLite's `unixepoch()`
+ * (integer seconds) so it stays in the same format as the schema's other
+ * timestamp columns.
+ *
+ * @param db - Raw better-sqlite3 Database instance to run the updates on.
+ */
+export function recoverStaleJobs(db: Database.Database): void {
+	const failInterruptedJobs = `UPDATE indexing_jobs
+     SET status      = 'failed',
+         error       = 'Server restarted while job was running',
+         completed_at = unixepoch()
+     WHERE status = 'running'`;
+
+	const unstickRepositories = `UPDATE repositories
+     SET state = 'error'
+     WHERE state = 'indexing'`;
+
+	// Jobs first, then repositories; each statement is prepared and executed
+	// before the next one is prepared.
+	for (const sql of [failInterruptedJobs, unstickRepositories]) {
+		db.prepare(sql).run();
+	}
+}
+
+// ---------------------------------------------------------------------------
+// Singleton instances
+// ---------------------------------------------------------------------------
+
+// Module-level singletons. Both start as `null`, are assigned by
+// initializePipeline(), and are set back to `null` by _resetSingletons().
+let _queue: JobQueue | null = null;
+let _pipeline: IndexingPipeline | null = null;
+
+/**
+ * Build the JobQueue + IndexingPipeline pair on first use and hand back the
+ * same pair on every later call.
+ *
+ * First call, in order:
+ *  1. `recoverStaleJobs(db)` runs, so interrupted work is flagged before the
+ *     queue exists.
+ *  2. The pipeline and the queue are constructed and wired via `setPipeline`.
+ *  3. Both are stored in the module-level singletons.
+ *  4. A `setImmediate` callback is scheduled that resumes jobs still marked
+ *     `queued` from before the restart.
+ *
+ * Later calls return the stored pair untouched; their `db` and
+ * `embeddingService` arguments are ignored.
+ *
+ * Intended to be called once at server startup (e.g. from `hooks.server.ts`).
+ *
+ * @param db               - Raw better-sqlite3 Database instance.
+ * @param embeddingService - Optional embedding service; pass null to disable.
+ * @returns The `queue` and `pipeline` singletons.
+ */
+export function initializePipeline(
+	db: Database.Database,
+	embeddingService: EmbeddingService | null = null
+): { queue: JobQueue; pipeline: IndexingPipeline } {
+	if (_queue !== null && _pipeline !== null) {
+		return { queue: _queue, pipeline: _pipeline };
+	}
+
+	// Stale jobs must be failed first so nothing half-finished gets resumed.
+	recoverStaleJobs(db);
+
+	const indexer = new IndexingPipeline(db, githubCrawl, new LocalCrawler(), embeddingService);
+	const jobs = new JobQueue(db);
+	jobs.setPipeline(indexer);
+
+	_queue = jobs;
+	_pipeline = indexer;
+
+	// Resume work that was queued before the restart. Scheduled with
+	// setImmediate so it runs after the current synchronous startup code.
+	const resumeQueuedJobs = (): void => {
+		const firstQueued = db
+			.prepare<[], { id: string }>(`SELECT id FROM indexing_jobs WHERE status = 'queued' LIMIT 1`)
+			.get();
+		if (!firstQueued) {
+			return;
+		}
+		// enqueue() deduplicates, so it cannot be used to restart existing
+		// jobs; drainQueued() is the public helper that triggers processing
+		// without exposing the queue's private methods.
+		jobs.drainQueued();
+	};
+	setImmediate(resumeQueuedJobs);
+
+	return { queue: jobs, pipeline: indexer };
+}
+
+/**
+ * Return the current JobQueue singleton.
+ *
+ * @returns The queue stored by `initializePipeline`, or `null` if it has not
+ *          been initialised yet (or `_resetSingletons` has cleared it).
+ */
+export function getQueue(): JobQueue | null {
+	return _queue;
+}
+
+/**
+ * Return the current IndexingPipeline singleton.
+ *
+ * @returns The pipeline stored by `initializePipeline`, or `null` if it has
+ *          not been initialised yet (or `_resetSingletons` has cleared it).
+ */
+export function getPipeline(): IndexingPipeline | null {
+	return _pipeline;
+}
+
+/**
+ * Reset singletons — intended for use in tests only.
+ *
+ * Only clears the module-level references so the next `initializePipeline`
+ * call builds a fresh pair. It does not touch the previous instances, and a
+ * drain callback already scheduled by `initializePipeline` will still run
+ * against the old queue.
+ */
+export function _resetSingletons(): void {
+	_queue = null;
+	_pipeline = null;
+}