feat(TRUEREF-0009): implement indexing pipeline and job queue
Implements the end-to-end indexing pipeline with a SQLite-backed job queue, startup recovery, and REST API endpoints for job status.

- IndexingPipeline: orchestrates crawl → parse → atomic replace → embed → repo stats update with progress tracking at each stage
- JobQueue: sequential SQLite-backed queue (no external broker), deduplicates active jobs per repository, drains queued jobs on startup
- startup.ts: stale job recovery (running→failed), repo state reset, singleton initialization wired from hooks.server.ts
- GET /api/v1/jobs with repositoryId/status/limit filtering
- GET /api/v1/jobs/[id] single job lookup
- hooks.server.ts: initializes DB and pipeline on server start
- 18 unit tests covering queue, pipeline stages, recovery, and atomicity

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
122
src/lib/server/pipeline/startup.ts
Normal file
122
src/lib/server/pipeline/startup.ts
Normal file
@@ -0,0 +1,122 @@
|
||||
/**
|
||||
* Server startup routines for the indexing pipeline (TRUEREF-0009).
|
||||
*
|
||||
* On every server start:
|
||||
* 1. Mark any jobs that were left in 'running' state as 'failed'
|
||||
* (they were interrupted by a process crash / restart).
|
||||
* 2. Reset any repositories stuck in 'indexing' state to 'error'.
|
||||
* 3. Construct and wire together the JobQueue + IndexingPipeline singletons.
|
||||
* 4. Kick off processing of any jobs that were 'queued' before the restart.
|
||||
*/
|
||||
|
||||
import type Database from 'better-sqlite3';
|
||||
import type { EmbeddingService } from '$lib/server/embeddings/embedding.service.js';
|
||||
import { crawl as githubCrawl } from '$lib/server/crawler/github.crawler.js';
|
||||
import { LocalCrawler } from '$lib/server/crawler/local.crawler.js';
|
||||
import { IndexingPipeline } from './indexing.pipeline.js';
|
||||
import { JobQueue } from './job-queue.js';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Stale-job recovery
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Recover from an unclean shutdown.
 *
 * Marks every job still in `running` state as `failed` (recording an error
 * message and a completion timestamp) and moves every repository still in
 * `indexing` state to `error`.
 *
 * Both updates run inside a single transaction, so either both tables are
 * recovered or neither is; a failure in the second statement can no longer
 * leave jobs marked failed while their repositories still claim to be
 * indexing.
 *
 * Safe to call on every startup: when nothing is stale the statements match
 * no rows. Timestamps are written with SQLite's `unixepoch()` (integer
 * seconds).
 *
 * @param db - Raw better-sqlite3 Database instance.
 * @throws Whatever better-sqlite3 throws if a statement cannot be prepared
 *   or executed; in that case the transaction is rolled back.
 */
export function recoverStaleJobs(db: Database.Database): void {
	const failRunningJobs = db.prepare(
		`UPDATE indexing_jobs
		 SET status = 'failed',
		     error = 'Server restarted while job was running',
		     completed_at = unixepoch()
		 WHERE status = 'running'`
	);

	const resetIndexingRepositories = db.prepare(
		`UPDATE repositories
		 SET state = 'error'
		 WHERE state = 'indexing'`
	);

	// db.transaction() returns a function that wraps the callback in
	// BEGIN/COMMIT and rolls back if the callback throws.
	const recover = db.transaction(() => {
		failRunningJobs.run();
		resetIndexingRepositories.run();
	});

	recover();
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Singleton instances
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/** Module-level IndexingPipeline singleton; null until initializePipeline() assigns it. */
let _pipeline: IndexingPipeline | null = null;

/** Module-level JobQueue singleton; null until initializePipeline() assigns it. */
let _queue: JobQueue | null = null;
|
||||
|
||||
/**
 * Initialise (or return the existing) JobQueue + IndexingPipeline pair.
 *
 * Intended to be called once at server startup (e.g. from `hooks.server.ts`).
 * The first call:
 *   1. runs stale-job recovery via `recoverStaleJobs(db)`,
 *   2. constructs the pipeline and the queue and wires them together,
 *   3. stores both as module-level singletons, and
 *   4. schedules a deferred check (via `setImmediate`) that asks the queue to
 *      drain if at least one job is still in `queued` state.
 *
 * Later calls are no-ops that return the already-constructed singletons;
 * the arguments passed to those later calls are ignored.
 *
 * @param db - Raw better-sqlite3 Database instance.
 * @param embeddingService - Optional embedding service; pass null to disable.
 * @returns An object with the `queue` and `pipeline` singletons.
 */
export function initializePipeline(
	db: Database.Database,
	embeddingService: EmbeddingService | null = null
): { queue: JobQueue; pipeline: IndexingPipeline } {
	if (_queue && _pipeline) {
		return { queue: _queue, pipeline: _pipeline };
	}

	// Recover before constructing so no stale job gets picked up.
	recoverStaleJobs(db);

	const pipeline = new IndexingPipeline(db, githubCrawl, new LocalCrawler(), embeddingService);
	const queue = new JobQueue(db);
	queue.setPipeline(pipeline);

	_queue = queue;
	_pipeline = pipeline;

	// Resume jobs that were queued before the restart. This runs after the
	// current tick, outside any caller's try/catch, so errors must be handled
	// here: an exception escaping a setImmediate callback is an uncaught
	// exception and would terminate the server process.
	setImmediate(() => {
		try {
			const pending = db
				.prepare<[], { id: string }>(`SELECT id FROM indexing_jobs WHERE status = 'queued' LIMIT 1`)
				.get();

			if (pending) {
				// enqueue() deduplicates, so it cannot be used to restart
				// processing; the queue's public drainQueued() helper is used instead.
				queue.drainQueued();
			}
		} catch (err: unknown) {
			console.error('[pipeline] Failed to drain queued indexing jobs on startup:', err);
		}
	});

	return { queue, pipeline };
}
|
||||
|
||||
/**
 * Accessor for the module-level JobQueue singleton.
 *
 * @returns The queue stored by `initializePipeline()`, or null when no
 *   queue is currently stored.
 */
export function getQueue(): JobQueue | null {
	return _queue;
}
|
||||
|
||||
/**
 * Accessor for the module-level IndexingPipeline singleton.
 *
 * @returns The pipeline stored by `initializePipeline()`, or null when no
 *   pipeline is currently stored.
 */
export function getPipeline(): IndexingPipeline | null {
	return _pipeline;
}
|
||||
|
||||
/**
 * Clear both module-level singletons so the next `initializePipeline()` call
 * constructs a fresh queue/pipeline pair.
 *
 * Intended for use in tests only.
 */
export function _resetSingletons(): void {
	_pipeline = null;
	_queue = null;
}
|
||||
Reference in New Issue
Block a user