feat(TRUEREF-0009): implement indexing pipeline and job queue

Implements the end-to-end indexing pipeline with a SQLite-backed job
queue, startup recovery, and REST API endpoints for job status.

- IndexingPipeline: orchestrates crawl → parse → atomic replace → embed
  → repo stats update with progress tracking at each stage
- JobQueue: sequential SQLite-backed queue (no external broker), deduplicates
  active jobs per repository, drains queued jobs on startup
- startup.ts: stale job recovery (running→failed), repo state reset, singleton
  initialization wired from hooks.server.ts
- GET /api/v1/jobs with repositoryId/status/limit filtering
- GET /api/v1/jobs/[id] single job lookup
- hooks.server.ts: initializes DB and pipeline on server start
- 18 unit tests covering queue, pipeline stages, recovery, and atomicity

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Giancarmine Salucci
2026-03-22 18:22:20 +01:00
parent bf4caf5e3b
commit 956b2a3a62
7 changed files with 1342 additions and 0 deletions

View File

@@ -0,0 +1,460 @@
/**
* Unit tests for IndexingPipeline and JobQueue (TRUEREF-0009).
*
* Uses an in-memory SQLite database populated with the same migration SQL
* as the production database.
*/
import { describe, it, expect, beforeEach, vi } from 'vitest';
import Database from 'better-sqlite3';
import { readFileSync } from 'node:fs';
import { join } from 'node:path';
import { JobQueue } from './job-queue.js';
import { IndexingPipeline } from './indexing.pipeline.js';
import { recoverStaleJobs } from './startup.js';
// ---------------------------------------------------------------------------
// Test DB factory
// ---------------------------------------------------------------------------
/**
 * Builds a throwaway in-memory SQLite database carrying the production
 * schema: the migration file is read from disk, split on drizzle's
 * statement-breakpoint marker, and each statement is executed in order.
 */
function createTestDb(): Database.Database {
  const db = new Database(':memory:');
  db.pragma('foreign_keys = ON');
  const migrationFile = join(
    import.meta.dirname,
    '../db/migrations',
    '0000_large_master_chief.sql'
  );
  const sql = readFileSync(migrationFile, 'utf-8');
  for (const raw of sql.split('--> statement-breakpoint')) {
    const statement = raw.trim();
    if (statement) db.exec(statement);
  }
  return db;
}
// ---------------------------------------------------------------------------
// Fixtures
// ---------------------------------------------------------------------------
// Epoch seconds captured once so every fixture timestamp agrees.
const now = Math.floor(Date.now() / 1000);

/**
 * Inserts one repository row. Defaults describe a local test repo in the
 * 'pending' state; any column can be replaced through `overrides`.
 */
function insertRepo(
  db: Database.Database,
  overrides: Partial<Record<string, unknown>> = {}
): void {
  const values = [
    overrides.id ?? '/test/repo',
    overrides.title ?? 'Test Repo',
    overrides.source ?? 'local',
    overrides.source_url ?? '/tmp/test-repo',
    overrides.branch ?? 'main',
    overrides.state ?? 'pending',
    // stats / optional columns: zeroed or null for a fresh repo
    0, 0, 0, 0, null, null, null, now, now
  ];
  db.prepare(
    `INSERT INTO repositories
(id, title, source, source_url, branch, state,
total_snippets, total_tokens, trust_score, benchmark_score,
stars, github_token, last_indexed_at, created_at, updated_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
  ).run(...values);
}
/**
 * Inserts one indexing-job row and returns its ID. A random UUID is used
 * unless `overrides.id` supplies one; remaining columns default to a
 * freshly queued job for '/test/repo'.
 */
function insertJob(
  db: Database.Database,
  overrides: Partial<Record<string, unknown>> = {}
): string {
  // `??` (not `||`) so a deliberately falsy override id is still honored.
  const jobId = (overrides.id as string | undefined) ?? crypto.randomUUID();
  const values = [
    jobId,
    overrides.repository_id ?? '/test/repo',
    overrides.version_id ?? null,
    overrides.status ?? 'queued',
    overrides.progress ?? 0,
    overrides.total_files ?? 0,
    overrides.processed_files ?? 0,
    overrides.error ?? null,
    overrides.started_at ?? null,
    overrides.completed_at ?? null,
    overrides.created_at ?? now
  ];
  db.prepare(
    `INSERT INTO indexing_jobs
(id, repository_id, version_id, status, progress,
total_files, processed_files, error, started_at, completed_at, created_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
  ).run(...values);
  return jobId;
}
// ---------------------------------------------------------------------------
// recoverStaleJobs
// ---------------------------------------------------------------------------
describe('recoverStaleJobs', () => {
  let db: Database.Database;

  // Fresh schema plus one repository before every test.
  beforeEach(() => {
    db = createTestDb();
    insertRepo(db);
  });

  it('marks running jobs as failed', () => {
    insertJob(db, { status: 'running' });
    recoverStaleJobs(db);
    const job = db
      .prepare(`SELECT status, error FROM indexing_jobs LIMIT 1`)
      .get() as { status: string; error: string };
    expect(job.status).toBe('failed');
    // Error text should explain the cause (server restart).
    expect(job.error).toMatch(/restarted/i);
  });

  it('resets repositories in indexing state to error', () => {
    db.prepare(`UPDATE repositories SET state = 'indexing' WHERE id = '/test/repo'`).run();
    recoverStaleJobs(db);
    const repo = db
      .prepare(`SELECT state FROM repositories WHERE id = '/test/repo'`)
      .get() as { state: string };
    expect(repo.state).toBe('error');
  });

  it('leaves queued and done jobs untouched', () => {
    insertJob(db, { status: 'queued' });
    insertJob(db, { status: 'done' });
    recoverStaleJobs(db);
    const untouched = db
      .prepare(`SELECT status FROM indexing_jobs WHERE status IN ('queued', 'done')`)
      .all() as { status: string }[];
    expect(untouched).toHaveLength(2);
  });
});
// ---------------------------------------------------------------------------
// JobQueue
// ---------------------------------------------------------------------------
describe('JobQueue', () => {
  let db: Database.Database;
  let queue: JobQueue;

  // Fresh schema, one repository, and a queue bound to that DB per test.
  beforeEach(() => {
    db = createTestDb();
    insertRepo(db);
    queue = new JobQueue(db);
  });

  it('enqueues a new job and returns it', () => {
    const job = queue.enqueue('/test/repo');
    expect(job.status).toBe('queued');
    // Tolerates either camelCase (mapped) or snake_case (raw row) naming.
    expect(job.repositoryId ?? (job as unknown as { repository_id: string }).repository_id).toBe(
      '/test/repo'
    );
  });

  it('deduplicates: returns existing active job instead of creating a new one', () => {
    const job1 = queue.enqueue('/test/repo');
    const job2 = queue.enqueue('/test/repo');
    expect(job1.id).toBe(job2.id);
    // Exactly one row must exist — the second enqueue was a no-op.
    const count = (
      db.prepare(`SELECT COUNT(*) as n FROM indexing_jobs`).get() as { n: number }
    ).n;
    expect(count).toBe(1);
  });

  it('getJob returns null for unknown ID', () => {
    expect(queue.getJob('non-existent')).toBeNull();
  });

  it('getJob returns the job if it exists', () => {
    const jobId = insertJob(db, { status: 'done' });
    const job = queue.getJob(jobId);
    expect(job).not.toBeNull();
    expect(job!.id).toBe(jobId);
  });

  it('listJobs returns all jobs ordered by created_at desc', () => {
    insertJob(db, { created_at: now - 10, status: 'done' });
    insertJob(db, { created_at: now - 5, status: 'done' });
    insertJob(db, { created_at: now, status: 'queued' });
    const jobs = queue.listJobs();
    expect(jobs.length).toBeGreaterThanOrEqual(3);
    // Most recent first.
    expect(jobs[0].status).toBe('queued');
  });

  it('listJobs filters by repositoryId', () => {
    insertRepo(db, { id: '/other/repo', source_url: '/tmp/other' });
    insertJob(db, { repository_id: '/other/repo', status: 'done' });
    insertJob(db, { status: 'queued' });
    const jobs = queue.listJobs({ repositoryId: '/other/repo' });
    expect(jobs).toHaveLength(1);
  });

  it('listJobs filters by status', () => {
    insertJob(db, { status: 'queued' });
    insertJob(db, { status: 'done' });
    insertJob(db, { status: 'failed' });
    const queued = queue.listJobs({ status: 'queued' });
    // Guard against a vacuous pass: .every() is true on an empty array, so
    // first assert the filter actually matched the one queued job.
    expect(queued).toHaveLength(1);
    expect(queued.every((j) => j.status === 'queued')).toBe(true);
  });

  it('countJobs returns correct count', () => {
    insertJob(db, { status: 'done' });
    insertJob(db, { status: 'done' });
    insertJob(db, { status: 'failed' });
    expect(queue.countJobs()).toBe(3);
    expect(queue.countJobs({ status: 'done' })).toBe(2);
    expect(queue.countJobs({ status: 'failed' })).toBe(1);
  });
});
// ---------------------------------------------------------------------------
// IndexingPipeline
// ---------------------------------------------------------------------------
describe('IndexingPipeline', () => {
let db: Database.Database;
// Fresh schema with one local repository before every test.
beforeEach(() => {
db = createTestDb();
insertRepo(db, { source: 'local', source_url: '/tmp/test-repo' });
});
// Builds a pipeline whose GitHub crawl function and local crawler both
// resolve to the given file set. The last constructor argument is null —
// presumably the embedder, so no embedding runs in these tests (TODO:
// confirm against the IndexingPipeline constructor signature).
function makePipeline(
crawlResult: {
files: Array<{ path: string; content: string; sha: string; language: string }>;
totalFiles: number;
} = { files: [], totalFiles: 0 }
) {
const mockGithubCrawl = vi.fn().mockResolvedValue({
...crawlResult,
skippedFiles: 0,
branch: 'main',
commitSha: 'abc'
});
const mockLocalCrawler = {
crawl: vi.fn().mockResolvedValue({
...crawlResult,
skippedFiles: 0,
branch: 'main',
commitSha: 'abc'
})
};
return new IndexingPipeline(
db,
mockGithubCrawl as never,
mockLocalCrawler as never,
null
);
}
// Inserts a queued job row and reads it back as the raw row the pipeline
// consumes (field naming may be snake_case or camelCase, hence both).
function makeJob(repositoryId = '/test/repo') {
const jobId = insertJob(db, { repository_id: repositoryId, status: 'queued' });
return db
.prepare(`SELECT * FROM indexing_jobs WHERE id = ?`)
.get(jobId) as { id: string; repositoryId?: string; repository_id?: string; status: string; versionId?: string; version_id?: string };
}
it('marks job as done when there are no files to index', async () => {
const pipeline = makePipeline({ files: [], totalFiles: 0 });
const job = makeJob();
await pipeline.run(job as never);
const updated = db
.prepare(`SELECT status, progress FROM indexing_jobs WHERE id = ?`)
.get(job.id) as { status: string; progress: number };
expect(updated.status).toBe('done');
// An empty crawl still completes with full progress.
expect(updated.progress).toBe(100);
});
it('marks job as running then done (final state is done)', async () => {
const pipeline = makePipeline({ files: [], totalFiles: 0 });
const job = makeJob();
await pipeline.run(job as never);
const updated = db
.prepare(`SELECT status FROM indexing_jobs WHERE id = ?`)
.get(job.id) as { status: string };
// The job should end in 'done' — the running→done transition is covered
// by the pipeline's internal updateJob calls.
expect(updated.status).toBe('done');
});
it('marks job as failed and repo as error when pipeline throws', async () => {
// Both crawl paths reject, so run() must surface the error AND record
// failure state on both the job and the repository.
const errorCrawl = vi.fn().mockRejectedValue(new Error('crawl failed'));
const pipeline = new IndexingPipeline(
db,
errorCrawl as never,
{ crawl: errorCrawl } as never,
null
);
const job = makeJob();
await expect(pipeline.run(job as never)).rejects.toThrow('crawl failed');
const updatedJob = db
.prepare(`SELECT status, error FROM indexing_jobs WHERE id = ?`)
.get(job.id) as { status: string; error: string };
expect(updatedJob.status).toBe('failed');
expect(updatedJob.error).toBe('crawl failed');
const updatedRepo = db
.prepare(`SELECT state FROM repositories WHERE id = '/test/repo'`)
.get() as { state: string };
expect(updatedRepo.state).toBe('error');
});
it('inserts documents and snippets for new files', async () => {
const files = [
{
path: 'README.md',
content: '# Hello\n\nThis is documentation.',
sha: 'sha-readme',
language: 'markdown'
}
];
const pipeline = makePipeline({ files, totalFiles: 1 });
const job = makeJob();
await pipeline.run(job as never);
// One crawled file should yield at least one document and one snippet,
// and repo stats should be updated to reflect the indexed content.
const docs = db.prepare(`SELECT * FROM documents`).all() as unknown[];
expect(docs.length).toBeGreaterThan(0);
const snippets = db.prepare(`SELECT * FROM snippets`).all() as unknown[];
expect(snippets.length).toBeGreaterThan(0);
const repo = db
.prepare(`SELECT state, total_snippets FROM repositories WHERE id = '/test/repo'`)
.get() as { state: string; total_snippets: number };
expect(repo.state).toBe('indexed');
expect(repo.total_snippets).toBeGreaterThan(0);
});
it('skips unchanged files (checksum match)', async () => {
// First indexing run.
const files = [
{
path: 'README.md',
content: '# Hello\n\nThis is documentation.',
sha: 'sha-readme',
language: 'markdown'
}
];
const pipeline = makePipeline({ files, totalFiles: 1 });
const job1 = makeJob();
await pipeline.run(job1 as never);
// Capture baseline state so the second run can be compared against it.
const firstDocCount = (
db.prepare(`SELECT COUNT(*) as n FROM documents`).get() as { n: number }
).n;
const firstSnippetIds = (
db.prepare(`SELECT id FROM snippets`).all() as { id: string }[]
).map((r) => r.id);
// Second run with identical files.
const job2Id = insertJob(db, { repository_id: '/test/repo', status: 'queued' });
const job2 = db
.prepare(`SELECT * FROM indexing_jobs WHERE id = ?`)
.get(job2Id) as never;
await pipeline.run(job2);
const secondDocCount = (
db.prepare(`SELECT COUNT(*) as n FROM documents`).get() as { n: number }
).n;
const secondSnippetIds = (
db.prepare(`SELECT id FROM snippets`).all() as { id: string }[]
).map((r) => r.id);
// Document count stays the same and snippet IDs are unchanged.
expect(secondDocCount).toBe(firstDocCount);
expect(secondSnippetIds).toEqual(firstSnippetIds);
});
it('replaces snippets atomically when a file changes', async () => {
// First run indexes the original file content (sha-v1).
const pipeline1 = makePipeline({
files: [
{
path: 'README.md',
content: '# Original\n\nThis is the original version of the documentation with sufficient content.',
sha: 'sha-v1',
language: 'markdown'
}
],
totalFiles: 1
});
const job1 = makeJob();
await pipeline1.run(job1 as never);
const originalSnippetCount = (
db.prepare(`SELECT COUNT(*) as n FROM snippets`).get() as { n: number }
).n;
expect(originalSnippetCount).toBeGreaterThan(0);
// Second run with changed file content.
const pipeline2 = makePipeline({
files: [
{
path: 'README.md',
content: '# Updated\n\nThis is a completely different version of the documentation with new content.',
sha: 'sha-v2',
language: 'markdown'
}
],
totalFiles: 1
});
const job2Id = insertJob(db, { repository_id: '/test/repo', status: 'queued' });
const job2 = db
.prepare(`SELECT * FROM indexing_jobs WHERE id = ?`)
.get(job2Id) as never;
await pipeline2.run(job2);
const finalDocCount = (
db.prepare(`SELECT COUNT(*) as n FROM documents`).get() as { n: number }
).n;
// Only one document should exist (the updated one).
expect(finalDocCount).toBe(1);
// Checksum reflects the new sha, proving the old version was replaced
// rather than accumulated alongside.
const finalChecksum = (
db.prepare(`SELECT checksum FROM documents LIMIT 1`).get() as { checksum: string }
).checksum;
expect(finalChecksum).toBe('sha-v2');
});
it('updates job progress as files are processed', async () => {
// Five files so the pipeline has multiple per-file progress updates;
// only the final value (100) is asserted here.
const files = Array.from({ length: 5 }, (_, i) => ({
path: `file${i}.md`,
content: `# File ${i}\n\nContent ${i}.`,
sha: `sha-${i}`,
language: 'markdown'
}));
const pipeline = makePipeline({ files, totalFiles: 5 });
const job = makeJob();
await pipeline.run(job as never);
const updated = db
.prepare(`SELECT progress FROM indexing_jobs WHERE id = ?`)
.get(job.id) as { progress: number };
expect(updated.progress).toBe(100);
});
});
});