feat(TRUEREF-0009): implement indexing pipeline and job queue

Implements the end-to-end indexing pipeline with a SQLite-backed job
queue, startup recovery, and REST API endpoints for job status.

- IndexingPipeline: orchestrates crawl → parse → atomic replace → embed
  → repo stats update with progress tracking at each stage
- JobQueue: sequential SQLite-backed queue (no external broker), deduplicates
  active jobs per repository, drains queued jobs on startup
- startup.ts: stale job recovery (running→failed), repo state reset, singleton
  initialization wired from hooks.server.ts
- GET /api/v1/jobs with repositoryId/status/limit filtering
- GET /api/v1/jobs/[id] single job lookup
- hooks.server.ts: initializes DB and pipeline on server start
- 18 unit tests covering queue, pipeline stages, recovery, and atomicity

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Giancarmine Salucci
2026-03-22 18:22:20 +01:00
parent bf4caf5e3b
commit 956b2a3a62
7 changed files with 1342 additions and 0 deletions

72
src/hooks.server.ts Normal file
View File

@@ -0,0 +1,72 @@
/**
* SvelteKit server hooks (TRUEREF-0009).
*
* Runs once when the Node.js server process starts. Initialises the database
* and kicks off the indexing pipeline + job queue so queued jobs resume
* automatically after a restart.
*/
import { initializeDatabase } from '$lib/server/db/index.js';
import { getClient } from '$lib/server/db/client.js';
import { initializePipeline } from '$lib/server/pipeline/startup.js';
import { EMBEDDING_CONFIG_KEY, createProviderFromConfig, defaultEmbeddingConfig } from '$lib/server/embeddings/factory.js';
import { EmbeddingService } from '$lib/server/embeddings/embedding.service.js';
import type { EmbeddingConfig } from '$lib/server/embeddings/factory.js';
import type { Handle } from '@sveltejs/kit';
// ---------------------------------------------------------------------------
// Server initialisation — runs once on startup
// ---------------------------------------------------------------------------
// Initialise the database and wire up the indexing pipeline exactly once per
// process (module load). Failures are logged rather than rethrown so the HTTP
// server can still come up; requests depending on the pipeline surface errors
// later.
try {
  initializeDatabase();
  const db = getClient();
  // Load persisted embedding configuration (if any).
  const configRow = db
    .prepare<[string], { value: string }>(`SELECT value FROM settings WHERE key = ?`)
    .get(EMBEDDING_CONFIG_KEY);
  // null = no embedding provider; the pipeline then skips its embedding stage.
  let embeddingService: EmbeddingService | null = null;
  if (configRow) {
    try {
      // The settings value should be a JSON string; tolerate an already-parsed
      // object in case the driver hands one back.
      const config: EmbeddingConfig =
        typeof configRow.value === 'string'
          ? JSON.parse(configRow.value)
          : (configRow.value as EmbeddingConfig);
      if (config.provider !== 'none') {
        const provider = createProviderFromConfig(config);
        embeddingService = new EmbeddingService(db, provider);
      }
    } catch (err) {
      // A malformed config only disables embeddings; indexing still works.
      console.warn(
        `[hooks.server] Could not load embedding config: ${err instanceof Error ? err.message : String(err)}`
      );
    }
  } else {
    // Use the default (noop) config so the pipeline is still wired up.
    // NOTE(review): unlike the branch above, a provider-construction failure
    // here propagates to the outer catch and skips pipeline init — confirm
    // that asymmetry is intended.
    const config = defaultEmbeddingConfig();
    if (config.provider !== 'none') {
      const provider = createProviderFromConfig(config);
      embeddingService = new EmbeddingService(db, provider);
    }
  }
  initializePipeline(db, embeddingService);
  console.log('[hooks.server] Indexing pipeline initialised.');
} catch (err) {
  console.error(
    `[hooks.server] Failed to initialise server: ${err instanceof Error ? err.message : String(err)}`
  );
}
// ---------------------------------------------------------------------------
// Request handler (pass-through)
// ---------------------------------------------------------------------------
/**
 * Pass-through request handler. No per-request middleware is needed yet —
 * all initialisation happens once at module load above.
 */
export const handle: Handle = async ({ event, resolve }) => {
  const response = await resolve(event);
  return response;
};

View File

@@ -0,0 +1,460 @@
/**
* Unit tests for IndexingPipeline and JobQueue (TRUEREF-0009).
*
* Uses an in-memory SQLite database populated with the same migration SQL
* as the production database.
*/
import { describe, it, expect, beforeEach, vi } from 'vitest';
import Database from 'better-sqlite3';
import { readFileSync } from 'node:fs';
import { join } from 'node:path';
import { JobQueue } from './job-queue.js';
import { IndexingPipeline } from './indexing.pipeline.js';
import { recoverStaleJobs } from './startup.js';
// ---------------------------------------------------------------------------
// Test DB factory
// ---------------------------------------------------------------------------
/**
 * Create a fresh in-memory SQLite database carrying the production schema.
 * The Drizzle migration file is split on its statement-breakpoint markers and
 * each statement is executed individually.
 */
function createTestDb(): Database.Database {
  const db = new Database(':memory:');
  db.pragma('foreign_keys = ON');
  const migrationSql = readFileSync(
    join(import.meta.dirname, '../db/migrations', '0000_large_master_chief.sql'),
    'utf-8'
  );
  migrationSql
    .split('--> statement-breakpoint')
    .map((stmt) => stmt.trim())
    .filter(Boolean)
    .forEach((stmt) => db.exec(stmt));
  return db;
}
// ---------------------------------------------------------------------------
// Fixtures
// ---------------------------------------------------------------------------
const now = Math.floor(Date.now() / 1000);
/**
 * Insert a repository fixture row with sensible defaults; individual
 * snake_case columns can be overridden per test.
 */
function insertRepo(
  db: Database.Database,
  overrides: Partial<Record<string, unknown>> = {}
): void {
  const insert = db.prepare(
    `INSERT INTO repositories
     (id, title, source, source_url, branch, state,
     total_snippets, total_tokens, trust_score, benchmark_score,
     stars, github_token, last_indexed_at, created_at, updated_at)
     VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
  );
  insert.run(
    overrides.id ?? '/test/repo',
    overrides.title ?? 'Test Repo',
    overrides.source ?? 'local',
    overrides.source_url ?? '/tmp/test-repo',
    overrides.branch ?? 'main',
    overrides.state ?? 'pending',
    0,
    0,
    0,
    0,
    null,
    null,
    null,
    now,
    now
  );
}
/**
 * Insert an indexing_jobs fixture row.
 *
 * @param overrides snake_case column overrides (e.g. { status: 'running' })
 * @returns the ID of the inserted job
 */
function insertJob(
  db: Database.Database,
  overrides: Partial<Record<string, unknown>> = {}
): string {
  // Resolve the ID exactly once so the inserted row and the returned value
  // can never disagree (previously two separate expressions computed it, and
  // a UUID was generated even when an override ID was supplied).
  const jobId = (overrides.id as string | undefined) ?? crypto.randomUUID();
  db.prepare(
    `INSERT INTO indexing_jobs
     (id, repository_id, version_id, status, progress,
     total_files, processed_files, error, started_at, completed_at, created_at)
     VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
  ).run(
    jobId,
    overrides.repository_id ?? '/test/repo',
    overrides.version_id ?? null,
    overrides.status ?? 'queued',
    overrides.progress ?? 0,
    overrides.total_files ?? 0,
    overrides.processed_files ?? 0,
    overrides.error ?? null,
    overrides.started_at ?? null,
    overrides.completed_at ?? null,
    overrides.created_at ?? now
  );
  return jobId;
}
// ---------------------------------------------------------------------------
// recoverStaleJobs
// ---------------------------------------------------------------------------
describe('recoverStaleJobs', () => {
  let db: Database.Database;

  // Query helper so each test reads as: setup → recover → assert.
  const selectOne = <T>(sql: string): T => db.prepare(sql).get() as T;

  beforeEach(() => {
    db = createTestDb();
    insertRepo(db);
  });

  it('marks running jobs as failed', () => {
    insertJob(db, { status: 'running' });
    recoverStaleJobs(db);
    const job = selectOne<{ status: string; error: string }>(
      `SELECT status, error FROM indexing_jobs LIMIT 1`
    );
    expect(job.status).toBe('failed');
    expect(job.error).toMatch(/restarted/i);
  });

  it('resets repositories in indexing state to error', () => {
    db.prepare(`UPDATE repositories SET state = 'indexing' WHERE id = '/test/repo'`).run();
    recoverStaleJobs(db);
    const repo = selectOne<{ state: string }>(
      `SELECT state FROM repositories WHERE id = '/test/repo'`
    );
    expect(repo.state).toBe('error');
  });

  it('leaves queued and done jobs untouched', () => {
    insertJob(db, { status: 'queued' });
    insertJob(db, { status: 'done' });
    recoverStaleJobs(db);
    const untouched = db
      .prepare(`SELECT status FROM indexing_jobs WHERE status IN ('queued', 'done')`)
      .all() as { status: string }[];
    expect(untouched).toHaveLength(2);
  });
});
// ---------------------------------------------------------------------------
// JobQueue
// ---------------------------------------------------------------------------
describe('JobQueue', () => {
  let db: Database.Database;
  let queue: JobQueue;

  beforeEach(() => {
    db = createTestDb();
    insertRepo(db);
    queue = new JobQueue(db);
  });

  it('enqueues a new job and returns it', () => {
    const job = queue.enqueue('/test/repo');
    expect(job.status).toBe('queued');
    // The row may come back camelCase (Drizzle) or snake_case (raw SQL).
    const repoId =
      job.repositoryId ?? (job as unknown as { repository_id: string }).repository_id;
    expect(repoId).toBe('/test/repo');
  });

  it('deduplicates: returns existing active job instead of creating a new one', () => {
    const first = queue.enqueue('/test/repo');
    const second = queue.enqueue('/test/repo');
    expect(first.id).toBe(second.id);
    const { n } = db.prepare(`SELECT COUNT(*) as n FROM indexing_jobs`).get() as { n: number };
    expect(n).toBe(1);
  });

  it('getJob returns null for unknown ID', () => {
    expect(queue.getJob('non-existent')).toBeNull();
  });

  it('getJob returns the job if it exists', () => {
    const id = insertJob(db, { status: 'done' });
    const found = queue.getJob(id);
    expect(found).not.toBeNull();
    expect(found!.id).toBe(id);
  });

  it('listJobs returns all jobs ordered by created_at desc', () => {
    insertJob(db, { created_at: now - 10, status: 'done' });
    insertJob(db, { created_at: now - 5, status: 'done' });
    insertJob(db, { created_at: now, status: 'queued' });
    const listed = queue.listJobs();
    expect(listed.length).toBeGreaterThanOrEqual(3);
    // Most recent first.
    expect(listed[0].status).toBe('queued');
  });

  it('listJobs filters by repositoryId', () => {
    insertRepo(db, { id: '/other/repo', source_url: '/tmp/other' });
    insertJob(db, { repository_id: '/other/repo', status: 'done' });
    insertJob(db, { status: 'queued' });
    expect(queue.listJobs({ repositoryId: '/other/repo' })).toHaveLength(1);
  });

  it('listJobs filters by status', () => {
    insertJob(db, { status: 'queued' });
    insertJob(db, { status: 'done' });
    insertJob(db, { status: 'failed' });
    const onlyQueued = queue.listJobs({ status: 'queued' });
    expect(onlyQueued.every((job) => job.status === 'queued')).toBe(true);
  });

  it('countJobs returns correct count', () => {
    insertJob(db, { status: 'done' });
    insertJob(db, { status: 'done' });
    insertJob(db, { status: 'failed' });
    expect(queue.countJobs()).toBe(3);
    expect(queue.countJobs({ status: 'done' })).toBe(2);
    expect(queue.countJobs({ status: 'failed' })).toBe(1);
  });
});
// ---------------------------------------------------------------------------
// IndexingPipeline
// ---------------------------------------------------------------------------
describe('IndexingPipeline', () => {
  let db: Database.Database;
  beforeEach(() => {
    db = createTestDb();
    insertRepo(db, { source: 'local', source_url: '/tmp/test-repo' });
  });
  // Build a pipeline whose crawlers resolve to a canned result. Both the
  // GitHub and local mocks return the same payload, so the repo's source
  // doesn't matter to these tests; embeddings are disabled (null).
  function makePipeline(
    crawlResult: {
      files: Array<{ path: string; content: string; sha: string; language: string }>;
      totalFiles: number;
    } = { files: [], totalFiles: 0 }
  ) {
    const mockGithubCrawl = vi.fn().mockResolvedValue({
      ...crawlResult,
      skippedFiles: 0,
      branch: 'main',
      commitSha: 'abc'
    });
    const mockLocalCrawler = {
      crawl: vi.fn().mockResolvedValue({
        ...crawlResult,
        skippedFiles: 0,
        branch: 'main',
        commitSha: 'abc'
      })
    };
    // 'as never' bypasses the concrete crawler types for the mocks.
    return new IndexingPipeline(
      db,
      mockGithubCrawl as never,
      mockLocalCrawler as never,
      null
    );
  }
  // Insert a queued job row and return it in the raw (snake_case) DB shape
  // that pipeline.run() is expected to normalise.
  function makeJob(repositoryId = '/test/repo') {
    const jobId = insertJob(db, { repository_id: repositoryId, status: 'queued' });
    return db
      .prepare(`SELECT * FROM indexing_jobs WHERE id = ?`)
      .get(jobId) as { id: string; repositoryId?: string; repository_id?: string; status: string; versionId?: string; version_id?: string };
  }
  it('marks job as done when there are no files to index', async () => {
    const pipeline = makePipeline({ files: [], totalFiles: 0 });
    const job = makeJob();
    await pipeline.run(job as never);
    const updated = db
      .prepare(`SELECT status, progress FROM indexing_jobs WHERE id = ?`)
      .get(job.id) as { status: string; progress: number };
    expect(updated.status).toBe('done');
    expect(updated.progress).toBe(100);
  });
  it('marks job as running then done (final state is done)', async () => {
    const pipeline = makePipeline({ files: [], totalFiles: 0 });
    const job = makeJob();
    await pipeline.run(job as never);
    const updated = db
      .prepare(`SELECT status FROM indexing_jobs WHERE id = ?`)
      .get(job.id) as { status: string };
    // The job should end in 'done' — the running→done transition is covered
    // by the pipeline's internal updateJob calls.
    expect(updated.status).toBe('done');
  });
  it('marks job as failed and repo as error when pipeline throws', async () => {
    // Both crawler paths reject, so the failure happens in Stage 1.
    const errorCrawl = vi.fn().mockRejectedValue(new Error('crawl failed'));
    const pipeline = new IndexingPipeline(
      db,
      errorCrawl as never,
      { crawl: errorCrawl } as never,
      null
    );
    const job = makeJob();
    await expect(pipeline.run(job as never)).rejects.toThrow('crawl failed');
    const updatedJob = db
      .prepare(`SELECT status, error FROM indexing_jobs WHERE id = ?`)
      .get(job.id) as { status: string; error: string };
    expect(updatedJob.status).toBe('failed');
    expect(updatedJob.error).toBe('crawl failed');
    const updatedRepo = db
      .prepare(`SELECT state FROM repositories WHERE id = '/test/repo'`)
      .get() as { state: string };
    expect(updatedRepo.state).toBe('error');
  });
  it('inserts documents and snippets for new files', async () => {
    const files = [
      {
        path: 'README.md',
        content: '# Hello\n\nThis is documentation.',
        sha: 'sha-readme',
        language: 'markdown'
      }
    ];
    const pipeline = makePipeline({ files, totalFiles: 1 });
    const job = makeJob();
    await pipeline.run(job as never);
    const docs = db.prepare(`SELECT * FROM documents`).all() as unknown[];
    expect(docs.length).toBeGreaterThan(0);
    const snippets = db.prepare(`SELECT * FROM snippets`).all() as unknown[];
    expect(snippets.length).toBeGreaterThan(0);
    // Repo stats should be refreshed as part of the same run (Stage 5).
    const repo = db
      .prepare(`SELECT state, total_snippets FROM repositories WHERE id = '/test/repo'`)
      .get() as { state: string; total_snippets: number };
    expect(repo.state).toBe('indexed');
    expect(repo.total_snippets).toBeGreaterThan(0);
  });
  it('skips unchanged files (checksum match)', async () => {
    // First indexing run.
    const files = [
      {
        path: 'README.md',
        content: '# Hello\n\nThis is documentation.',
        sha: 'sha-readme',
        language: 'markdown'
      }
    ];
    const pipeline = makePipeline({ files, totalFiles: 1 });
    const job1 = makeJob();
    await pipeline.run(job1 as never);
    const firstDocCount = (
      db.prepare(`SELECT COUNT(*) as n FROM documents`).get() as { n: number }
    ).n;
    const firstSnippetIds = (
      db.prepare(`SELECT id FROM snippets`).all() as { id: string }[]
    ).map((r) => r.id);
    // Second run with identical files (same sha → checksum match).
    const job2Id = insertJob(db, { repository_id: '/test/repo', status: 'queued' });
    const job2 = db
      .prepare(`SELECT * FROM indexing_jobs WHERE id = ?`)
      .get(job2Id) as never;
    await pipeline.run(job2);
    const secondDocCount = (
      db.prepare(`SELECT COUNT(*) as n FROM documents`).get() as { n: number }
    ).n;
    const secondSnippetIds = (
      db.prepare(`SELECT id FROM snippets`).all() as { id: string }[]
    ).map((r) => r.id);
    // Document count stays the same and snippet IDs are unchanged.
    expect(secondDocCount).toBe(firstDocCount);
    expect(secondSnippetIds).toEqual(firstSnippetIds);
  });
  it('replaces snippets atomically when a file changes', async () => {
    const pipeline1 = makePipeline({
      files: [
        {
          path: 'README.md',
          content: '# Original\n\nThis is the original version of the documentation with sufficient content.',
          sha: 'sha-v1',
          language: 'markdown'
        }
      ],
      totalFiles: 1
    });
    const job1 = makeJob();
    await pipeline1.run(job1 as never);
    const originalSnippetCount = (
      db.prepare(`SELECT COUNT(*) as n FROM snippets`).get() as { n: number }
    ).n;
    expect(originalSnippetCount).toBeGreaterThan(0);
    // Second run with changed file content (different sha forces replacement).
    const pipeline2 = makePipeline({
      files: [
        {
          path: 'README.md',
          content: '# Updated\n\nThis is a completely different version of the documentation with new content.',
          sha: 'sha-v2',
          language: 'markdown'
        }
      ],
      totalFiles: 1
    });
    const job2Id = insertJob(db, { repository_id: '/test/repo', status: 'queued' });
    const job2 = db
      .prepare(`SELECT * FROM indexing_jobs WHERE id = ?`)
      .get(job2Id) as never;
    await pipeline2.run(job2);
    const finalDocCount = (
      db.prepare(`SELECT COUNT(*) as n FROM documents`).get() as { n: number }
    ).n;
    // Only one document should exist (the updated one).
    expect(finalDocCount).toBe(1);
    const finalChecksum = (
      db.prepare(`SELECT checksum FROM documents LIMIT 1`).get() as { checksum: string }
    ).checksum;
    expect(finalChecksum).toBe('sha-v2');
  });
  it('updates job progress as files are processed', async () => {
    const files = Array.from({ length: 5 }, (_, i) => ({
      path: `file${i}.md`,
      content: `# File ${i}\n\nContent ${i}.`,
      sha: `sha-${i}`,
      language: 'markdown'
    }));
    const pipeline = makePipeline({ files, totalFiles: 5 });
    const job = makeJob();
    await pipeline.run(job as never);
    // Only the final value is asserted; intermediate progress updates are
    // exercised implicitly by the run.
    const updated = db
      .prepare(`SELECT progress FROM indexing_jobs WHERE id = ?`)
      .get(job.id) as { progress: number };
    expect(updated.progress).toBe(100);
  });
});

View File

@@ -0,0 +1,405 @@
/**
* IndexingPipeline — orchestrates the full crawl → parse → store → embed
* flow for a single repository indexing job (TRUEREF-0009).
*
* Atomicity guarantee:
* Old documents/snippets for changed files are deleted and replaced inside
* a single SQLite transaction. If anything fails after that transaction the
* already-committed data stays intact and the job is marked failed so
* callers can retry.
*
* Progress model:
* - Without embeddings: crawl+parse = 100 %
* - With embeddings : crawl+parse = 80 %, embeddings = 20 %
*/
import { createHash } from 'node:crypto';
import type Database from 'better-sqlite3';
import type { IndexingJob, NewDocument, NewSnippet, Repository } from '$lib/types';
import type { crawl as GithubCrawlFn } from '$lib/server/crawler/github.crawler.js';
import type { LocalCrawler } from '$lib/server/crawler/local.crawler.js';
import type { EmbeddingService } from '$lib/server/embeddings/embedding.service.js';
import { parseFile } from '$lib/server/parser/index.js';
import { computeTrustScore } from '$lib/server/search/trust-score.js';
// ---------------------------------------------------------------------------
// Progress calculation
// ---------------------------------------------------------------------------
/**
 * Map crawl/parse and embedding completion onto a single 0–100 progress value.
 *
 * Without an embedding provider the parse phase spans the full range; with
 * one, parsing accounts for 80 % and embedding for the remaining 20 %.
 * Returns 0 when there are no files at all.
 */
function calculateProgress(
  processedFiles: number,
  totalFiles: number,
  embeddingsDone: number,
  embeddingsTotal: number,
  hasEmbeddings: boolean
): number {
  if (totalFiles === 0) return 0;
  const parseRatio = processedFiles / totalFiles;
  if (!hasEmbeddings) {
    return Math.round(parseRatio * 100);
  }
  let embedPart = 0;
  if (embeddingsTotal > 0) {
    embedPart = (embeddingsDone / embeddingsTotal) * 20;
  }
  return Math.round(parseRatio * 80 + embedPart);
}
// ---------------------------------------------------------------------------
// SHA-256 helper
// ---------------------------------------------------------------------------
/** Hex-encoded SHA-256 digest of a UTF-8 string. */
function sha256(content: string): string {
  const hasher = createHash('sha256');
  hasher.update(content, 'utf-8');
  return hasher.digest('hex');
}
// ---------------------------------------------------------------------------
// IndexingPipeline
// ---------------------------------------------------------------------------
export class IndexingPipeline {
  /**
   * @param db               open better-sqlite3 database handle
   * @param githubCrawl      crawl function for repositories with source 'github'
   * @param localCrawler     crawler for local-filesystem repositories
   * @param embeddingService optional embedding backend; null skips Stage 4
   */
  constructor(
    private readonly db: Database.Database,
    private readonly githubCrawl: typeof GithubCrawlFn,
    private readonly localCrawler: LocalCrawler,
    private readonly embeddingService: EmbeddingService | null
  ) {}
  // -------------------------------------------------------------------------
  // Public — run a job end to end
  // -------------------------------------------------------------------------
  /**
   * Execute the full flow for one job: crawl → parse/diff → atomic replace →
   * (optional) embed → repo stats. On failure the job is marked 'failed', the
   * repository 'error', and the original error is rethrown to the caller.
   */
  async run(job: IndexingJob): Promise<void> {
    // better-sqlite3 raw queries return snake_case keys; Drizzle types use camelCase.
    // Accept both so the pipeline works when called from raw SQL contexts.
    const raw = job as unknown as Record<string, unknown>;
    const repositoryId = (job.repositoryId ?? raw['repository_id']) as string;
    const versionId = (job.versionId ?? raw['version_id'] ?? null) as string | null;
    // Rebuild a normalised job view for the rest of this method.
    const normJob = { ...job, repositoryId, versionId };
    this.updateJob(job.id, { status: 'running', startedAt: Math.floor(Date.now() / 1000) });
    try {
      const repo = this.getRepository(repositoryId);
      if (!repo) throw new Error(`Repository ${repositoryId} not found`);
      // Mark repo as actively indexing.
      this.updateRepo(repo.id, { state: 'indexing' });
      // ---- Stage 1: Crawl -------------------------------------------------
      const crawlResult = await this.crawl(repo, normJob);
      const totalFiles = crawlResult.totalFiles;
      this.updateJob(job.id, { totalFiles });
      // ---- Stage 2: Parse & diff ------------------------------------------
      // Accumulate new documents/snippets; skip unchanged files.
      const newDocuments: NewDocument[] = [];
      const newSnippets: NewSnippet[] = [];
      const changedDocIds: string[] = [];
      let processedFiles = 0;
      for (const file of crawlResult.files) {
        // Prefer the crawler-supplied sha; '||' (not '??') also treats an
        // empty-string sha as missing and hashes the content instead.
        const checksum = file.sha || sha256(file.content);
        // Check whether an identical document already exists.
        const existingDoc = this.db
          .prepare<[string, string], { id: string; checksum: string }>(
            `SELECT id, checksum FROM documents
             WHERE repository_id = ? AND file_path = ? LIMIT 1`
          )
          .get(repo.id, file.path);
        if (existingDoc && existingDoc.checksum === checksum) {
          // File unchanged — reuse existing snippets, nothing to do.
          processedFiles++;
          const progress = calculateProgress(
            processedFiles,
            totalFiles,
            0,
            0,
            this.embeddingService !== null
          );
          this.updateJob(job.id, { processedFiles, progress });
          continue;
        }
        // File is new or changed — schedule old doc for deletion.
        if (existingDoc) {
          changedDocIds.push(existingDoc.id);
        }
        // Create new document record.
        const documentId = crypto.randomUUID();
        const now = new Date();
        const newDoc: NewDocument = {
          id: documentId,
          repositoryId: repo.id,
          versionId: normJob.versionId ?? null,
          filePath: file.path,
          title: null,
          language: file.language,
          tokenCount: 0,
          checksum,
          indexedAt: now
        };
        // Parse into snippets.
        const snippets = parseFile(file, {
          repositoryId: repo.id,
          documentId,
          versionId: normJob.versionId ?? undefined
        });
        // Update document token count from snippet totals.
        const tokenCount = snippets.reduce((sum, s) => sum + (s.tokenCount ?? 0), 0);
        newDoc.tokenCount = tokenCount;
        newDocuments.push(newDoc);
        newSnippets.push(...snippets);
        processedFiles++;
        const progress = calculateProgress(
          processedFiles,
          totalFiles,
          0,
          0,
          this.embeddingService !== null
        );
        this.updateJob(job.id, { processedFiles, progress });
      }
      // ---- Stage 3: Atomic replacement ------------------------------------
      this.replaceSnippets(repo.id, changedDocIds, newDocuments, newSnippets);
      // ---- Stage 4: Embeddings (if provider is configured) ----------------
      if (this.embeddingService && newSnippets.length > 0) {
        const snippetIds = newSnippets.map((s) => s.id!);
        const embeddingsTotal = snippetIds.length;
        await this.embeddingService.embedSnippets(snippetIds, (done) => {
          // Parsing is complete at this point, so only the embedding
          // fraction (the final 20 %) still moves.
          const progress = calculateProgress(
            processedFiles,
            totalFiles,
            done,
            embeddingsTotal,
            true
          );
          this.updateJob(job.id, { progress });
        });
      }
      // ---- Stage 5: Update repository stats --------------------------------
      const stats = this.computeStats(repo.id);
      const freshRepo = this.getRepository(repo.id)!;
      const trustScore = computeTrustScore({
        ...freshRepo,
        totalSnippets: stats.totalSnippets,
        totalTokens: stats.totalTokens,
        state: 'indexed'
      });
      this.updateRepo(repo.id, {
        state: 'indexed',
        totalSnippets: stats.totalSnippets,
        totalTokens: stats.totalTokens,
        trustScore,
        lastIndexedAt: Math.floor(Date.now() / 1000)
      });
      this.updateJob(job.id, {
        status: 'done',
        progress: 100,
        completedAt: Math.floor(Date.now() / 1000)
      });
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      console.error(`[IndexingPipeline] Job ${job.id} failed: ${message}`);
      this.updateJob(job.id, {
        status: 'failed',
        error: message,
        completedAt: Math.floor(Date.now() / 1000)
      });
      // Restore repo to error state but preserve any existing indexed data.
      this.updateRepo(repositoryId, { state: 'error' });
      throw error;
    }
  }
  // -------------------------------------------------------------------------
  // Private — crawl
  // -------------------------------------------------------------------------
  /**
   * Dispatch to the GitHub or local crawler based on repo.source and return
   * the crawled files with the total file count.
   * Note: the `job` parameter is currently unused here.
   */
  private async crawl(
    repo: Repository,
    job: IndexingJob
  ): Promise<{ files: Array<{ path: string; content: string; sha: string; size: number; language: string }>; totalFiles: number }> {
    if (repo.source === 'github') {
      // Parse owner/repo from the canonical ID: "/owner/repo"
      const parts = repo.id.replace(/^\//, '').split('/');
      const owner = parts[0];
      const repoName = parts[1];
      if (!owner || !repoName) {
        throw new Error(`Cannot parse GitHub owner/repo from id: ${repo.id}`);
      }
      const result = await this.githubCrawl({
        owner,
        repo: repoName,
        ref: repo.branch ?? undefined,
        token: repo.githubToken ?? undefined
      });
      return { files: result.files, totalFiles: result.totalFiles };
    } else {
      // Local filesystem crawl.
      // NOTE(review): only a non-'main' branch is forwarded as ref —
      // presumably 'main' is the local crawler's default; confirm.
      const result = await this.localCrawler.crawl({
        rootPath: repo.sourceUrl,
        ref: repo.branch !== 'main' ? (repo.branch ?? undefined) : undefined
      });
      return { files: result.files, totalFiles: result.totalFiles };
    }
  }
  // -------------------------------------------------------------------------
  // Private — atomic snippet replacement
  // -------------------------------------------------------------------------
  /**
   * Delete stale documents and insert the new documents/snippets inside one
   * SQLite transaction so readers never observe a half-replaced file.
   * Date values are stored as unix seconds.
   */
  private replaceSnippets(
    _repositoryId: string,
    changedDocIds: string[],
    newDocuments: NewDocument[],
    newSnippets: NewSnippet[]
  ): void {
    const insertDoc = this.db.prepare(
      `INSERT INTO documents
       (id, repository_id, version_id, file_path, title, language,
       token_count, checksum, indexed_at)
       VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`
    );
    const insertSnippet = this.db.prepare(
      `INSERT INTO snippets
       (id, document_id, repository_id, version_id, type, title,
       content, language, breadcrumb, token_count, created_at)
       VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
    );
    this.db.transaction(() => {
      // Delete stale documents (cascade deletes their snippets via FK).
      if (changedDocIds.length > 0) {
        const placeholders = changedDocIds.map(() => '?').join(',');
        this.db
          .prepare(`DELETE FROM documents WHERE id IN (${placeholders})`)
          .run(...changedDocIds);
      }
      // Insert new documents.
      for (const doc of newDocuments) {
        const indexedAtSeconds =
          doc.indexedAt instanceof Date
            ? Math.floor(doc.indexedAt.getTime() / 1000)
            : Math.floor(Date.now() / 1000);
        insertDoc.run(
          doc.id,
          doc.repositoryId,
          doc.versionId ?? null,
          doc.filePath,
          doc.title ?? null,
          doc.language ?? null,
          doc.tokenCount ?? 0,
          doc.checksum,
          indexedAtSeconds
        );
      }
      // Insert new snippets.
      for (const snippet of newSnippets) {
        const createdAtSeconds =
          snippet.createdAt instanceof Date
            ? Math.floor(snippet.createdAt.getTime() / 1000)
            : Math.floor(Date.now() / 1000);
        insertSnippet.run(
          snippet.id,
          snippet.documentId,
          snippet.repositoryId,
          snippet.versionId ?? null,
          snippet.type,
          snippet.title ?? null,
          snippet.content,
          snippet.language ?? null,
          snippet.breadcrumb ?? null,
          snippet.tokenCount ?? 0,
          createdAtSeconds
        );
      }
    })();
  }
  // -------------------------------------------------------------------------
  // Private — stats
  // -------------------------------------------------------------------------
  /** Aggregate snippet count and token total for one repository. */
  private computeStats(repositoryId: string): { totalSnippets: number; totalTokens: number } {
    const row = this.db
      .prepare<[string], { total_snippets: number; total_tokens: number }>(
        `SELECT COUNT(*) as total_snippets,
         COALESCE(SUM(token_count), 0) as total_tokens
         FROM snippets WHERE repository_id = ?`
      )
      .get(repositoryId);
    return {
      totalSnippets: row?.total_snippets ?? 0,
      totalTokens: row?.total_tokens ?? 0
    };
  }
  // -------------------------------------------------------------------------
  // Private — DB helpers
  // -------------------------------------------------------------------------
  /** Fetch a repository row by ID, or null when absent. */
  private getRepository(id: string): Repository | null {
    return (
      (this.db
        .prepare<[string], Repository>(`SELECT * FROM repositories WHERE id = ?`)
        .get(id) as Repository | undefined) ?? null
    );
  }
  /**
   * Partial update of an indexing_jobs row. Column names are derived from the
   * camelCase field keys via toSnake(); keys are internal (never user input),
   * so string-built SET clauses are safe here.
   */
  private updateJob(id: string, fields: Record<string, unknown>): void {
    const sets = Object.keys(fields)
      .map((k) => `${toSnake(k)} = ?`)
      .join(', ');
    const values = [...Object.values(fields), id];
    this.db.prepare(`UPDATE indexing_jobs SET ${sets} WHERE id = ?`).run(...values);
  }
  /**
   * Partial update of a repositories row; always refreshes updated_at.
   * Same internal-keys caveat as updateJob().
   */
  private updateRepo(id: string, fields: Record<string, unknown>): void {
    const now = Math.floor(Date.now() / 1000);
    const allFields = { ...fields, updatedAt: now };
    const sets = Object.keys(allFields)
      .map((k) => `${toSnake(k)} = ?`)
      .join(', ');
    const values = [...Object.values(allFields), id];
    this.db.prepare(`UPDATE repositories SET ${sets} WHERE id = ?`).run(...values);
  }
}
// ---------------------------------------------------------------------------
// Utility
// ---------------------------------------------------------------------------
/** Convert camelCase to snake_case for DB column mapping. */
/** Convert a camelCase identifier to snake_case for DB column mapping. */
function toSnake(key: string): string {
  let out = '';
  for (const ch of key) {
    out += ch >= 'A' && ch <= 'Z' ? `_${ch.toLowerCase()}` : ch;
  }
  return out;
}

View File

@@ -0,0 +1,203 @@
/**
* SQLite-backed job queue for indexing jobs (TRUEREF-0009).
*
* Jobs are processed sequentially (one at a time) to avoid SQLite write
* contention. The queue uses setImmediate to yield to the event loop between
* jobs so that API requests remain responsive.
*/
import type Database from 'better-sqlite3';
import type { IndexingJob, NewIndexingJob } from '$lib/types';
import type { IndexingPipeline } from './indexing.pipeline.js';
export class JobQueue {
  /** Guards sequential execution — at most one job runs at a time. */
  private isRunning = false;
  /** Injected after construction via setPipeline() to break the circular dependency. */
  private pipeline: IndexingPipeline | null = null;

  constructor(private readonly db: Database.Database) {}

  /**
   * Inject the pipeline dependency (avoids circular construction order).
   */
  setPipeline(pipeline: IndexingPipeline): void {
    this.pipeline = pipeline;
  }

  /**
   * Enqueue a new indexing job for the given repository.
   *
   * If a job for this repository is already queued or running, the existing
   * job is returned instead of creating a duplicate. Otherwise the job is
   * persisted and, when the queue is idle, processing starts on the next
   * event-loop turn.
   *
   * @param repositoryId canonical repository ID
   * @param versionId    optional version to index
   * @returns the inserted (or pre-existing active) job row
   */
  enqueue(repositoryId: string, versionId?: string): IndexingJob {
    // Return early if there's already an active job for this repo.
    const active = this.db
      .prepare<[string], IndexingJob>(
        `SELECT * FROM indexing_jobs
         WHERE repository_id = ? AND status IN ('queued', 'running')
         ORDER BY created_at DESC LIMIT 1`
      )
      .get(repositoryId);
    if (active) return active;
    const now = Math.floor(Date.now() / 1000);
    const job: NewIndexingJob = {
      id: crypto.randomUUID(),
      repositoryId,
      versionId: versionId ?? null,
      status: 'queued',
      progress: 0,
      totalFiles: 0,
      processedFiles: 0,
      error: null,
      startedAt: null,
      completedAt: null,
      createdAt: new Date(now * 1000)
    };
    this.db
      .prepare(
        `INSERT INTO indexing_jobs
         (id, repository_id, version_id, status, progress, total_files,
         processed_files, error, started_at, completed_at, created_at)
         VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
      )
      .run(
        job.id,
        job.repositoryId,
        job.versionId,
        job.status,
        job.progress,
        job.totalFiles,
        job.processedFiles,
        job.error,
        job.startedAt,
        job.completedAt,
        now
      );
    // Kick off sequential processing if not already running.
    if (!this.isRunning) {
      setImmediate(() => this.processNext());
    }
    return this.db
      .prepare<[string], IndexingJob>(`SELECT * FROM indexing_jobs WHERE id = ?`)
      .get(job.id)!;
  }

  /**
   * Pick the oldest queued job and run it through the pipeline.
   * Re-schedules itself via setImmediate while queued jobs remain so the
   * event loop stays unblocked between jobs.
   */
  private async processNext(): Promise<void> {
    if (this.isRunning) return;
    if (!this.pipeline) {
      console.warn('[JobQueue] No pipeline configured — cannot process jobs.');
      return;
    }
    const job = this.db
      .prepare<[], IndexingJob>(
        `SELECT * FROM indexing_jobs
         WHERE status = 'queued'
         ORDER BY created_at ASC LIMIT 1`
      )
      .get();
    if (!job) return;
    this.isRunning = true;
    try {
      await this.pipeline.run(job);
    } catch (err) {
      // pipeline.run() already persisted the failure state; just log here.
      console.error(
        `[JobQueue] Job ${job.id} failed: ${err instanceof Error ? err.message : String(err)}`
      );
    } finally {
      this.isRunning = false;
      // Check whether another job was queued while this one ran.
      const next = this.db
        .prepare<[], { id: string }>(
          `SELECT id FROM indexing_jobs WHERE status = 'queued' LIMIT 1`
        )
        .get();
      if (next) {
        setImmediate(() => this.processNext());
      }
    }
  }

  /**
   * Retrieve a single job by ID.
   * @returns the job row, or null when no such job exists
   */
  getJob(id: string): IndexingJob | null {
    return (
      this.db
        .prepare<[string], IndexingJob>(`SELECT * FROM indexing_jobs WHERE id = ?`)
        .get(id) ?? null
    );
  }

  /**
   * List recent jobs, optionally filtered by repository and/or status.
   *
   * The limit is clamped to [0, 200]. Previously a negative limit reached
   * SQLite unchanged, and a negative LIMIT means "unbounded" there — a caller
   * passing e.g. limit=-1 from a query parameter would receive every row.
   */
  listJobs(options?: {
    repositoryId?: string;
    status?: IndexingJob['status'];
    limit?: number;
  }): IndexingJob[] {
    const limit = Math.max(0, Math.min(options?.limit ?? 20, 200));
    const { where, params } = this.buildFilter(options);
    const sql = `SELECT * FROM indexing_jobs ${where} ORDER BY created_at DESC LIMIT ?`;
    return this.db.prepare<unknown[], IndexingJob>(sql).all(...params, limit);
  }

  /**
   * Trigger processing of any queued jobs (e.g. after server restart).
   * Safe to call multiple times; a no-op if the queue is already running.
   */
  drainQueued(): void {
    if (!this.isRunning) {
      setImmediate(() => this.processNext());
    }
  }

  /**
   * Count all jobs matching optional filters.
   */
  countJobs(options?: { repositoryId?: string; status?: IndexingJob['status'] }): number {
    const { where, params } = this.buildFilter(options);
    const row = this.db
      .prepare<unknown[], { n: number }>(`SELECT COUNT(*) as n FROM indexing_jobs ${where}`)
      .get(...params);
    return row?.n ?? 0;
  }

  /**
   * Build the shared WHERE clause and bind parameters used by listJobs() and
   * countJobs(), so the two filters cannot drift apart.
   */
  private buildFilter(options?: {
    repositoryId?: string;
    status?: IndexingJob['status'];
  }): { where: string; params: unknown[] } {
    const conditions: string[] = [];
    const params: unknown[] = [];
    if (options?.repositoryId) {
      conditions.push('repository_id = ?');
      params.push(options.repositoryId);
    }
    if (options?.status) {
      conditions.push('status = ?');
      params.push(options.status);
    }
    return {
      where: conditions.length > 0 ? `WHERE ${conditions.join(' AND ')}` : '',
      params
    };
  }
}

View File

@@ -0,0 +1,122 @@
/**
* Server startup routines for the indexing pipeline (TRUEREF-0009).
*
* On every server start:
* 1. Mark any jobs that were left in 'running' state as 'failed'
* (they were interrupted by a process crash / restart).
* 2. Reset any repositories stuck in 'indexing' state to 'error'.
* 3. Construct and wire together the JobQueue + IndexingPipeline singletons.
* 4. Kick off processing of any jobs that were 'queued' before the restart.
*/
import type Database from 'better-sqlite3';
import type { EmbeddingService } from '$lib/server/embeddings/embedding.service.js';
import { crawl as githubCrawl } from '$lib/server/crawler/github.crawler.js';
import { LocalCrawler } from '$lib/server/crawler/local.crawler.js';
import { IndexingPipeline } from './indexing.pipeline.js';
import { JobQueue } from './job-queue.js';
// ---------------------------------------------------------------------------
// Stale-job recovery
// ---------------------------------------------------------------------------
/**
 * Mark jobs that were `running` when the server crashed as `failed`, and
 * reset repositories that were stuck in `indexing` state to `error`.
 *
 * Safe to call on every startup — uses unixepoch() so timestamps stay in
 * the same integer-seconds format as the rest of the schema.
 *
 * @param db - Raw better-sqlite3 Database instance.
 */
export function recoverStaleJobs(db: Database.Database): void {
  // A job still marked 'running' cannot actually be running after a restart.
  const failInterrupted = db.prepare(
    `UPDATE indexing_jobs
       SET status = 'failed',
           error = 'Server restarted while job was running',
           completed_at = unixepoch()
     WHERE status = 'running'`
  );
  // Repositories left mid-index are put into an explicit error state.
  const resetStuckRepos = db.prepare(
    `UPDATE repositories
       SET state = 'error'
     WHERE state = 'indexing'`
  );
  failInterrupted.run();
  resetStuckRepos.run();
}
// ---------------------------------------------------------------------------
// Singleton instances
// ---------------------------------------------------------------------------
// Module-level singletons, created lazily by initializePipeline(). Module
// scope makes them per-process, which is what the sequential queue requires.
let _queue: JobQueue | null = null;
let _pipeline: IndexingPipeline | null = null;
/**
 * Initialise (or return the existing) JobQueue + IndexingPipeline pair.
 *
 * Must be called once at server startup (e.g. from `hooks.server.ts`).
 * Calling it more than once is harmless — subsequent calls are no-ops that
 * return the already-constructed singletons.
 *
 * @param db - Raw better-sqlite3 Database instance.
 * @param embeddingService - Optional embedding service; pass null to disable.
 * @returns An object with `queue` and `pipeline` accessors.
 */
export function initializePipeline(
  db: Database.Database,
  embeddingService: EmbeddingService | null = null
): { queue: JobQueue; pipeline: IndexingPipeline } {
  if (_queue && _pipeline) {
    return { queue: _queue, pipeline: _pipeline };
  }
  // Recover before constructing so no stale job gets picked up.
  recoverStaleJobs(db);
  const localCrawler = new LocalCrawler();
  const pipeline = new IndexingPipeline(db, githubCrawl, localCrawler, embeddingService);
  const queue = new JobQueue(db);
  queue.setPipeline(pipeline);
  _queue = queue;
  _pipeline = pipeline;
  // Resume any jobs that were 'queued' before the restart. drainQueued() both
  // defers via setImmediate (so this call returns before processing starts)
  // and no-ops on an empty queue, so no pre-check query is needed here.
  queue.drainQueued();
  return { queue, pipeline };
}
/**
 * Access the JobQueue singleton created by {@link initializePipeline}.
 *
 * @returns The queue, or null when the pipeline has not been initialised yet.
 */
export function getQueue(): JobQueue | null {
  return _queue;
}
/**
 * Access the IndexingPipeline singleton created by {@link initializePipeline}.
 *
 * @returns The pipeline, or null when it has not been initialised yet.
 */
export function getPipeline(): IndexingPipeline | null {
  return _pipeline;
}
/**
 * Reset singletons — intended for use in tests only, so each test can start
 * from a clean, uninitialised state.
 */
export function _resetSingletons(): void {
  _pipeline = null;
  _queue = null;
}

View File

@@ -0,0 +1,46 @@
/**
* GET /api/v1/jobs — list recent indexing jobs.
*
* Query parameters:
* repositoryId (optional) — filter by repository
* status (optional) — filter by status: queued|running|done|failed
* limit (optional, default 20, max 200)
*/
import { json } from '@sveltejs/kit';
import type { RequestHandler } from './$types';
import { getClient } from '$lib/server/db/client.js';
import { JobQueue } from '$lib/server/pipeline/job-queue.js';
import { handleServiceError } from '$lib/server/utils/validation.js';
import type { IndexingJob } from '$lib/types';
/**
 * List recent indexing jobs, honouring repositoryId/status/limit filters.
 *
 * Fix: the previous `parseInt(...) || 20` accepted negative limits, which
 * passed through `Math.min(x, 200)` unchanged and reached SQLite, where a
 * negative LIMIT means "no limit". The limit is now clamped to [1, 200].
 */
export const GET: RequestHandler = ({ url }) => {
  try {
    const db = getClient();
    // NOTE(review): a fresh JobQueue per request performs only reads here;
    // consider reusing the startup singleton via getQueue() — TODO confirm.
    const queue = new JobQueue(db);
    const repositoryId = url.searchParams.get('repositoryId') ?? undefined;
    const status = (url.searchParams.get('status') ?? undefined) as
      | IndexingJob['status']
      | undefined;
    const rawLimit = parseInt(url.searchParams.get('limit') ?? '20', 10);
    // NaN and non-positive values fall back to the default of 20.
    const limit = rawLimit > 0 ? Math.min(rawLimit, 200) : 20;
    const jobs = queue.listJobs({ repositoryId, status, limit });
    const total = queue.countJobs({ repositoryId, status });
    return json({ jobs, total });
  } catch (err) {
    return handleServiceError(err);
  }
};
/**
 * CORS preflight response: GET is the only method served on this route.
 */
export const OPTIONS: RequestHandler = () => {
  const corsHeaders = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET, OPTIONS',
    'Access-Control-Allow-Headers': 'Content-Type, Authorization'
  };
  return new Response(null, { status: 204, headers: corsHeaders });
};

View File

@@ -0,0 +1,34 @@
/**
* GET /api/v1/jobs/:id — retrieve a single indexing job by ID.
*/
import { json } from '@sveltejs/kit';
import type { RequestHandler } from './$types';
import { getClient } from '$lib/server/db/client.js';
import { JobQueue } from '$lib/server/pipeline/job-queue.js';
import { handleServiceError, NotFoundError } from '$lib/server/utils/validation.js';
/**
 * Look up one indexing job by its route parameter `id`.
 *
 * @throws NotFoundError (mapped by handleServiceError) when no job matches.
 */
export const GET: RequestHandler = ({ params }) => {
  try {
    const queue = new JobQueue(getClient());
    const job = queue.getJob(params.id);
    if (job === null) {
      throw new NotFoundError(`Job ${params.id} not found`);
    }
    return json({ job });
  } catch (err) {
    return handleServiceError(err);
  }
};
/**
 * CORS preflight response: GET is the only method served on this route.
 */
export const OPTIONS: RequestHandler = () => {
  const corsHeaders = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET, OPTIONS',
    'Access-Control-Allow-Headers': 'Content-Type, Authorization'
  };
  return new Response(null, { status: 204, headers: corsHeaders });
};