feat(TRUEREF-0009): implement indexing pipeline and job queue
Implements the end-to-end indexing pipeline with a SQLite-backed job queue, startup recovery, and REST API endpoints for job status. - IndexingPipeline: orchestrates crawl → parse → atomic replace → embed → repo stats update with progress tracking at each stage - JobQueue: sequential SQLite-backed queue (no external broker), deduplicates active jobs per repository, drains queued jobs on startup - startup.ts: stale job recovery (running→failed), repo state reset, singleton initialization wired from hooks.server.ts - GET /api/v1/jobs with repositoryId/status/limit filtering - GET /api/v1/jobs/[id] single job lookup - hooks.server.ts: initializes DB and pipeline on server start - 18 unit tests covering queue, pipeline stages, recovery, and atomicity Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
72
src/hooks.server.ts
Normal file
72
src/hooks.server.ts
Normal file
@@ -0,0 +1,72 @@
|
||||
/**
|
||||
* SvelteKit server hooks (TRUEREF-0009).
|
||||
*
|
||||
* Runs once when the Node.js server process starts. Initialises the database
|
||||
* and kicks off the indexing pipeline + job queue so queued jobs resume
|
||||
* automatically after a restart.
|
||||
*/
|
||||
|
||||
import { initializeDatabase } from '$lib/server/db/index.js';
|
||||
import { getClient } from '$lib/server/db/client.js';
|
||||
import { initializePipeline } from '$lib/server/pipeline/startup.js';
|
||||
import { EMBEDDING_CONFIG_KEY, createProviderFromConfig, defaultEmbeddingConfig } from '$lib/server/embeddings/factory.js';
|
||||
import { EmbeddingService } from '$lib/server/embeddings/embedding.service.js';
|
||||
import type { EmbeddingConfig } from '$lib/server/embeddings/factory.js';
|
||||
import type { Handle } from '@sveltejs/kit';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Server initialisation — runs once on startup
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
try {
|
||||
initializeDatabase();
|
||||
|
||||
const db = getClient();
|
||||
|
||||
// Load persisted embedding configuration (if any).
|
||||
const configRow = db
|
||||
.prepare<[string], { value: string }>(`SELECT value FROM settings WHERE key = ?`)
|
||||
.get(EMBEDDING_CONFIG_KEY);
|
||||
|
||||
let embeddingService: EmbeddingService | null = null;
|
||||
|
||||
if (configRow) {
|
||||
try {
|
||||
const config: EmbeddingConfig =
|
||||
typeof configRow.value === 'string'
|
||||
? JSON.parse(configRow.value)
|
||||
: (configRow.value as EmbeddingConfig);
|
||||
|
||||
if (config.provider !== 'none') {
|
||||
const provider = createProviderFromConfig(config);
|
||||
embeddingService = new EmbeddingService(db, provider);
|
||||
}
|
||||
} catch (err) {
|
||||
console.warn(
|
||||
`[hooks.server] Could not load embedding config: ${err instanceof Error ? err.message : String(err)}`
|
||||
);
|
||||
}
|
||||
} else {
|
||||
// Use the default (noop) config so the pipeline is still wired up.
|
||||
const config = defaultEmbeddingConfig();
|
||||
if (config.provider !== 'none') {
|
||||
const provider = createProviderFromConfig(config);
|
||||
embeddingService = new EmbeddingService(db, provider);
|
||||
}
|
||||
}
|
||||
|
||||
initializePipeline(db, embeddingService);
|
||||
console.log('[hooks.server] Indexing pipeline initialised.');
|
||||
} catch (err) {
|
||||
console.error(
|
||||
`[hooks.server] Failed to initialise server: ${err instanceof Error ? err.message : String(err)}`
|
||||
);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Request handler (pass-through)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
export const handle: Handle = async ({ event, resolve }) => {
|
||||
return resolve(event);
|
||||
};
|
||||
460
src/lib/server/pipeline/indexing.pipeline.test.ts
Normal file
460
src/lib/server/pipeline/indexing.pipeline.test.ts
Normal file
@@ -0,0 +1,460 @@
|
||||
/**
|
||||
* Unit tests for IndexingPipeline and JobQueue (TRUEREF-0009).
|
||||
*
|
||||
* Uses an in-memory SQLite database populated with the same migration SQL
|
||||
* as the production database.
|
||||
*/
|
||||
|
||||
import { describe, it, expect, beforeEach, vi } from 'vitest';
|
||||
import Database from 'better-sqlite3';
|
||||
import { readFileSync } from 'node:fs';
|
||||
import { join } from 'node:path';
|
||||
import { JobQueue } from './job-queue.js';
|
||||
import { IndexingPipeline } from './indexing.pipeline.js';
|
||||
import { recoverStaleJobs } from './startup.js';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Test DB factory
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
function createTestDb(): Database.Database {
|
||||
const client = new Database(':memory:');
|
||||
client.pragma('foreign_keys = ON');
|
||||
|
||||
const migrationsFolder = join(import.meta.dirname, '../db/migrations');
|
||||
const migrationSql = readFileSync(
|
||||
join(migrationsFolder, '0000_large_master_chief.sql'),
|
||||
'utf-8'
|
||||
);
|
||||
|
||||
const statements = migrationSql
|
||||
.split('--> statement-breakpoint')
|
||||
.map((s) => s.trim())
|
||||
.filter(Boolean);
|
||||
|
||||
for (const stmt of statements) {
|
||||
client.exec(stmt);
|
||||
}
|
||||
|
||||
return client;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Fixtures
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
const now = Math.floor(Date.now() / 1000);
|
||||
|
||||
function insertRepo(
|
||||
db: Database.Database,
|
||||
overrides: Partial<Record<string, unknown>> = {}
|
||||
): void {
|
||||
db.prepare(
|
||||
`INSERT INTO repositories
|
||||
(id, title, source, source_url, branch, state,
|
||||
total_snippets, total_tokens, trust_score, benchmark_score,
|
||||
stars, github_token, last_indexed_at, created_at, updated_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
|
||||
).run(
|
||||
overrides.id ?? '/test/repo',
|
||||
overrides.title ?? 'Test Repo',
|
||||
overrides.source ?? 'local',
|
||||
overrides.source_url ?? '/tmp/test-repo',
|
||||
overrides.branch ?? 'main',
|
||||
overrides.state ?? 'pending',
|
||||
0, 0, 0, 0, null, null, null, now, now
|
||||
);
|
||||
}
|
||||
|
||||
function insertJob(
|
||||
db: Database.Database,
|
||||
overrides: Partial<Record<string, unknown>> = {}
|
||||
): string {
|
||||
const id = crypto.randomUUID();
|
||||
db.prepare(
|
||||
`INSERT INTO indexing_jobs
|
||||
(id, repository_id, version_id, status, progress,
|
||||
total_files, processed_files, error, started_at, completed_at, created_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
|
||||
).run(
|
||||
overrides.id ?? id,
|
||||
overrides.repository_id ?? '/test/repo',
|
||||
overrides.version_id ?? null,
|
||||
overrides.status ?? 'queued',
|
||||
overrides.progress ?? 0,
|
||||
overrides.total_files ?? 0,
|
||||
overrides.processed_files ?? 0,
|
||||
overrides.error ?? null,
|
||||
overrides.started_at ?? null,
|
||||
overrides.completed_at ?? null,
|
||||
overrides.created_at ?? now
|
||||
);
|
||||
return (overrides.id as string) ?? id;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// recoverStaleJobs
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Startup recovery: jobs left 'running' by a crashed process become 'failed',
// repositories stuck in 'indexing' are reset, and terminal/queued rows are
// left untouched.
describe('recoverStaleJobs', () => {
  let db: Database.Database;

  // Fresh in-memory DB with one default repository per test.
  beforeEach(() => {
    db = createTestDb();
    insertRepo(db);
  });

  it('marks running jobs as failed', () => {
    insertJob(db, { status: 'running' });
    recoverStaleJobs(db);

    const row = db
      .prepare(`SELECT status, error FROM indexing_jobs LIMIT 1`)
      .get() as { status: string; error: string };
    expect(row.status).toBe('failed');
    // Recovery should record a human-readable reason mentioning the restart.
    expect(row.error).toMatch(/restarted/i);
  });

  it('resets repositories in indexing state to error', () => {
    // Simulate a crash mid-index: repo stuck in 'indexing'.
    db.prepare(`UPDATE repositories SET state = 'indexing' WHERE id = '/test/repo'`).run();
    recoverStaleJobs(db);

    const row = db
      .prepare(`SELECT state FROM repositories WHERE id = '/test/repo'`)
      .get() as { state: string };
    expect(row.state).toBe('error');
  });

  it('leaves queued and done jobs untouched', () => {
    insertJob(db, { status: 'queued' });
    insertJob(db, { status: 'done' });
    recoverStaleJobs(db);

    // Both rows must survive recovery with their original status.
    const rows = db
      .prepare(`SELECT status FROM indexing_jobs WHERE status IN ('queued', 'done')`)
      .all() as { status: string }[];
    expect(rows).toHaveLength(2);
  });
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// JobQueue
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// JobQueue behaviour: enqueue/dedupe semantics plus the read-side API
// (getJob, listJobs with filters, countJobs).
describe('JobQueue', () => {
  let db: Database.Database;
  let queue: JobQueue;

  beforeEach(() => {
    db = createTestDb();
    insertRepo(db);
    queue = new JobQueue(db);
  });

  it('enqueues a new job and returns it', () => {
    const job = queue.enqueue('/test/repo');
    expect(job.status).toBe('queued');
    // Accept either camelCase (Drizzle) or snake_case (raw SQL) key shape.
    expect(job.repositoryId ?? (job as unknown as { repository_id: string }).repository_id).toBe(
      '/test/repo'
    );
  });

  it('deduplicates: returns existing active job instead of creating a new one', () => {
    const job1 = queue.enqueue('/test/repo');
    const job2 = queue.enqueue('/test/repo');
    expect(job1.id).toBe(job2.id);

    // Only a single row should exist despite two enqueue calls.
    const count = (
      db.prepare(`SELECT COUNT(*) as n FROM indexing_jobs`).get() as { n: number }
    ).n;
    expect(count).toBe(1);
  });

  it('getJob returns null for unknown ID', () => {
    expect(queue.getJob('non-existent')).toBeNull();
  });

  it('getJob returns the job if it exists', () => {
    const jobId = insertJob(db, { status: 'done' });
    const job = queue.getJob(jobId);
    expect(job).not.toBeNull();
    expect(job!.id).toBe(jobId);
  });

  it('listJobs returns all jobs ordered by created_at desc', () => {
    insertJob(db, { created_at: now - 10, status: 'done' });
    insertJob(db, { created_at: now - 5, status: 'done' });
    insertJob(db, { created_at: now, status: 'queued' });

    const jobs = queue.listJobs();
    expect(jobs.length).toBeGreaterThanOrEqual(3);
    // Most recent first.
    expect(jobs[0].status).toBe('queued');
  });

  it('listJobs filters by repositoryId', () => {
    insertRepo(db, { id: '/other/repo', source_url: '/tmp/other' });
    insertJob(db, { repository_id: '/other/repo', status: 'done' });
    insertJob(db, { status: 'queued' });

    const jobs = queue.listJobs({ repositoryId: '/other/repo' });
    expect(jobs).toHaveLength(1);
  });

  it('listJobs filters by status', () => {
    insertJob(db, { status: 'queued' });
    insertJob(db, { status: 'done' });
    insertJob(db, { status: 'failed' });

    const queued = queue.listJobs({ status: 'queued' });
    expect(queued.every((j) => j.status === 'queued')).toBe(true);
  });

  it('countJobs returns correct count', () => {
    insertJob(db, { status: 'done' });
    insertJob(db, { status: 'done' });
    insertJob(db, { status: 'failed' });

    // Unfiltered count plus per-status filtered counts.
    expect(queue.countJobs()).toBe(3);
    expect(queue.countJobs({ status: 'done' })).toBe(2);
    expect(queue.countJobs({ status: 'failed' })).toBe(1);
  });
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// IndexingPipeline
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// IndexingPipeline end-to-end behaviour against the real schema, with
// crawlers replaced by mocks so no filesystem/network access happens.
describe('IndexingPipeline', () => {
  let db: Database.Database;

  beforeEach(() => {
    db = createTestDb();
    insertRepo(db, { source: 'local', source_url: '/tmp/test-repo' });
  });

  // Builds a pipeline whose GitHub and local crawlers both resolve to the
  // given crawl result. Embeddings are disabled (null service).
  function makePipeline(
    crawlResult: {
      files: Array<{ path: string; content: string; sha: string; language: string }>;
      totalFiles: number;
    } = { files: [], totalFiles: 0 }
  ) {
    const mockGithubCrawl = vi.fn().mockResolvedValue({
      ...crawlResult,
      skippedFiles: 0,
      branch: 'main',
      commitSha: 'abc'
    });

    const mockLocalCrawler = {
      crawl: vi.fn().mockResolvedValue({
        ...crawlResult,
        skippedFiles: 0,
        branch: 'main',
        commitSha: 'abc'
      })
    };

    return new IndexingPipeline(
      db,
      mockGithubCrawl as never,
      mockLocalCrawler as never,
      null
    );
  }

  // Inserts a queued job and returns the raw (snake_case) row, since the
  // pipeline accepts both key shapes.
  function makeJob(repositoryId = '/test/repo') {
    const jobId = insertJob(db, { repository_id: repositoryId, status: 'queued' });
    return db
      .prepare(`SELECT * FROM indexing_jobs WHERE id = ?`)
      .get(jobId) as { id: string; repositoryId?: string; repository_id?: string; status: string; versionId?: string; version_id?: string };
  }

  it('marks job as done when there are no files to index', async () => {
    const pipeline = makePipeline({ files: [], totalFiles: 0 });
    const job = makeJob();

    await pipeline.run(job as never);

    const updated = db
      .prepare(`SELECT status, progress FROM indexing_jobs WHERE id = ?`)
      .get(job.id) as { status: string; progress: number };
    expect(updated.status).toBe('done');
    // Progress is forced to 100 on completion even with zero files.
    expect(updated.progress).toBe(100);
  });

  it('marks job as running then done (final state is done)', async () => {
    const pipeline = makePipeline({ files: [], totalFiles: 0 });
    const job = makeJob();

    await pipeline.run(job as never);

    const updated = db
      .prepare(`SELECT status FROM indexing_jobs WHERE id = ?`)
      .get(job.id) as { status: string };
    // The job should end in 'done' — the running→done transition is covered
    // by the pipeline's internal updateJob calls.
    expect(updated.status).toBe('done');
  });

  it('marks job as failed and repo as error when pipeline throws', async () => {
    // Both crawler paths reject so the pipeline fails during Stage 1.
    const errorCrawl = vi.fn().mockRejectedValue(new Error('crawl failed'));
    const pipeline = new IndexingPipeline(
      db,
      errorCrawl as never,
      { crawl: errorCrawl } as never,
      null
    );

    const job = makeJob();

    // run() rethrows after recording the failure.
    await expect(pipeline.run(job as never)).rejects.toThrow('crawl failed');

    const updatedJob = db
      .prepare(`SELECT status, error FROM indexing_jobs WHERE id = ?`)
      .get(job.id) as { status: string; error: string };
    expect(updatedJob.status).toBe('failed');
    expect(updatedJob.error).toBe('crawl failed');

    const updatedRepo = db
      .prepare(`SELECT state FROM repositories WHERE id = '/test/repo'`)
      .get() as { state: string };
    expect(updatedRepo.state).toBe('error');
  });

  it('inserts documents and snippets for new files', async () => {
    const files = [
      {
        path: 'README.md',
        content: '# Hello\n\nThis is documentation.',
        sha: 'sha-readme',
        language: 'markdown'
      }
    ];
    const pipeline = makePipeline({ files, totalFiles: 1 });
    const job = makeJob();

    await pipeline.run(job as never);

    const docs = db.prepare(`SELECT * FROM documents`).all() as unknown[];
    expect(docs.length).toBeGreaterThan(0);

    const snippets = db.prepare(`SELECT * FROM snippets`).all() as unknown[];
    expect(snippets.length).toBeGreaterThan(0);

    // Repo stats must be refreshed after a successful run.
    const repo = db
      .prepare(`SELECT state, total_snippets FROM repositories WHERE id = '/test/repo'`)
      .get() as { state: string; total_snippets: number };
    expect(repo.state).toBe('indexed');
    expect(repo.total_snippets).toBeGreaterThan(0);
  });

  it('skips unchanged files (checksum match)', async () => {
    // First indexing run.
    const files = [
      {
        path: 'README.md',
        content: '# Hello\n\nThis is documentation.',
        sha: 'sha-readme',
        language: 'markdown'
      }
    ];
    const pipeline = makePipeline({ files, totalFiles: 1 });
    const job1 = makeJob();
    await pipeline.run(job1 as never);

    const firstDocCount = (
      db.prepare(`SELECT COUNT(*) as n FROM documents`).get() as { n: number }
    ).n;
    const firstSnippetIds = (
      db.prepare(`SELECT id FROM snippets`).all() as { id: string }[]
    ).map((r) => r.id);

    // Second run with identical files.
    const job2Id = insertJob(db, { repository_id: '/test/repo', status: 'queued' });
    const job2 = db
      .prepare(`SELECT * FROM indexing_jobs WHERE id = ?`)
      .get(job2Id) as never;

    await pipeline.run(job2);

    const secondDocCount = (
      db.prepare(`SELECT COUNT(*) as n FROM documents`).get() as { n: number }
    ).n;
    const secondSnippetIds = (
      db.prepare(`SELECT id FROM snippets`).all() as { id: string }[]
    ).map((r) => r.id);

    // Document count stays the same and snippet IDs are unchanged.
    expect(secondDocCount).toBe(firstDocCount);
    expect(secondSnippetIds).toEqual(firstSnippetIds);
  });

  it('replaces snippets atomically when a file changes', async () => {
    const pipeline1 = makePipeline({
      files: [
        {
          path: 'README.md',
          content: '# Original\n\nThis is the original version of the documentation with sufficient content.',
          sha: 'sha-v1',
          language: 'markdown'
        }
      ],
      totalFiles: 1
    });
    const job1 = makeJob();
    await pipeline1.run(job1 as never);

    const originalSnippetCount = (
      db.prepare(`SELECT COUNT(*) as n FROM snippets`).get() as { n: number }
    ).n;
    expect(originalSnippetCount).toBeGreaterThan(0);

    // Second run with changed file content.
    const pipeline2 = makePipeline({
      files: [
        {
          path: 'README.md',
          content: '# Updated\n\nThis is a completely different version of the documentation with new content.',
          sha: 'sha-v2',
          language: 'markdown'
        }
      ],
      totalFiles: 1
    });
    const job2Id = insertJob(db, { repository_id: '/test/repo', status: 'queued' });
    const job2 = db
      .prepare(`SELECT * FROM indexing_jobs WHERE id = ?`)
      .get(job2Id) as never;
    await pipeline2.run(job2);

    const finalDocCount = (
      db.prepare(`SELECT COUNT(*) as n FROM documents`).get() as { n: number }
    ).n;
    // Only one document should exist (the updated one).
    expect(finalDocCount).toBe(1);

    // The surviving document must carry the new checksum.
    const finalChecksum = (
      db.prepare(`SELECT checksum FROM documents LIMIT 1`).get() as { checksum: string }
    ).checksum;
    expect(finalChecksum).toBe('sha-v2');
  });

  it('updates job progress as files are processed', async () => {
    const files = Array.from({ length: 5 }, (_, i) => ({
      path: `file${i}.md`,
      content: `# File ${i}\n\nContent ${i}.`,
      sha: `sha-${i}`,
      language: 'markdown'
    }));

    const pipeline = makePipeline({ files, totalFiles: 5 });
    const job = makeJob();
    await pipeline.run(job as never);

    const updated = db
      .prepare(`SELECT progress FROM indexing_jobs WHERE id = ?`)
      .get(job.id) as { progress: number };
    expect(updated.progress).toBe(100);
  });
});
|
||||
405
src/lib/server/pipeline/indexing.pipeline.ts
Normal file
405
src/lib/server/pipeline/indexing.pipeline.ts
Normal file
@@ -0,0 +1,405 @@
|
||||
/**
|
||||
* IndexingPipeline — orchestrates the full crawl → parse → store → embed
|
||||
* flow for a single repository indexing job (TRUEREF-0009).
|
||||
*
|
||||
* Atomicity guarantee:
|
||||
* Old documents/snippets for changed files are deleted and replaced inside
|
||||
* a single SQLite transaction. If anything fails after that transaction the
|
||||
* already-committed data stays intact and the job is marked failed so
|
||||
* callers can retry.
|
||||
*
|
||||
* Progress model:
|
||||
* - Without embeddings: crawl+parse = 100 %
|
||||
* - With embeddings : crawl+parse = 80 %, embeddings = 20 %
|
||||
*/
|
||||
|
||||
import { createHash } from 'node:crypto';
|
||||
import type Database from 'better-sqlite3';
|
||||
import type { IndexingJob, NewDocument, NewSnippet, Repository } from '$lib/types';
|
||||
import type { crawl as GithubCrawlFn } from '$lib/server/crawler/github.crawler.js';
|
||||
import type { LocalCrawler } from '$lib/server/crawler/local.crawler.js';
|
||||
import type { EmbeddingService } from '$lib/server/embeddings/embedding.service.js';
|
||||
import { parseFile } from '$lib/server/parser/index.js';
|
||||
import { computeTrustScore } from '$lib/server/search/trust-score.js';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Progress calculation
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
function calculateProgress(
|
||||
processedFiles: number,
|
||||
totalFiles: number,
|
||||
embeddingsDone: number,
|
||||
embeddingsTotal: number,
|
||||
hasEmbeddings: boolean
|
||||
): number {
|
||||
if (totalFiles === 0) return 0;
|
||||
|
||||
if (!hasEmbeddings) {
|
||||
return Math.round((processedFiles / totalFiles) * 100);
|
||||
}
|
||||
|
||||
const parseProgress = (processedFiles / totalFiles) * 80;
|
||||
const embedProgress = embeddingsTotal > 0 ? (embeddingsDone / embeddingsTotal) * 20 : 0;
|
||||
return Math.round(parseProgress + embedProgress);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// SHA-256 helper
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
function sha256(content: string): string {
|
||||
return createHash('sha256').update(content, 'utf-8').digest('hex');
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// IndexingPipeline
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
export class IndexingPipeline {
  /**
   * @param db               - open better-sqlite3 handle used for all reads/writes
   * @param githubCrawl      - crawl function used for repositories with source 'github'
   * @param localCrawler     - crawler used for local-filesystem repositories
   * @param embeddingService - optional embedding backend; null skips the embedding stage
   */
  constructor(
    private readonly db: Database.Database,
    private readonly githubCrawl: typeof GithubCrawlFn,
    private readonly localCrawler: LocalCrawler,
    private readonly embeddingService: EmbeddingService | null
  ) {}
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// Public — run a job end to end
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
/**
 * Runs one indexing job end to end: crawl → parse/diff → atomic replace →
 * embed → repo stats. Marks the job 'running' up front, 'done' on success,
 * and 'failed' (with the repo set to 'error') on any exception, which is
 * then rethrown to the caller.
 *
 * @param job - the job row; both Drizzle camelCase and raw snake_case
 *              key shapes are accepted.
 * @throws rethrows whatever stage error occurred, after recording it.
 */
async run(job: IndexingJob): Promise<void> {
  // better-sqlite3 raw queries return snake_case keys; Drizzle types use camelCase.
  // Accept both so the pipeline works when called from raw SQL contexts.
  const raw = job as unknown as Record<string, unknown>;
  const repositoryId = (job.repositoryId ?? raw['repository_id']) as string;
  const versionId = (job.versionId ?? raw['version_id'] ?? null) as string | null;

  // Rebuild a normalised job view for the rest of this method.
  const normJob = { ...job, repositoryId, versionId };

  // Transition queued → running before any work starts.
  this.updateJob(job.id, { status: 'running', startedAt: Math.floor(Date.now() / 1000) });

  try {
    const repo = this.getRepository(repositoryId);
    if (!repo) throw new Error(`Repository ${repositoryId} not found`);

    // Mark repo as actively indexing.
    this.updateRepo(repo.id, { state: 'indexing' });

    // ---- Stage 1: Crawl -------------------------------------------------
    const crawlResult = await this.crawl(repo, normJob);
    const totalFiles = crawlResult.totalFiles;

    this.updateJob(job.id, { totalFiles });

    // ---- Stage 2: Parse & diff ------------------------------------------
    // Accumulate new documents/snippets; skip unchanged files.
    const newDocuments: NewDocument[] = [];
    const newSnippets: NewSnippet[] = [];
    const changedDocIds: string[] = [];

    let processedFiles = 0;

    for (const file of crawlResult.files) {
      // Prefer the crawler-supplied SHA; fall back to hashing the content.
      const checksum = file.sha || sha256(file.content);

      // Check whether an identical document already exists.
      const existingDoc = this.db
        .prepare<[string, string], { id: string; checksum: string }>(
          `SELECT id, checksum FROM documents
           WHERE repository_id = ? AND file_path = ? LIMIT 1`
        )
        .get(repo.id, file.path);

      if (existingDoc && existingDoc.checksum === checksum) {
        // File unchanged — reuse existing snippets, nothing to do.
        processedFiles++;
        const progress = calculateProgress(
          processedFiles,
          totalFiles,
          0,
          0,
          this.embeddingService !== null
        );
        this.updateJob(job.id, { processedFiles, progress });
        continue;
      }

      // File is new or changed — schedule old doc for deletion.
      if (existingDoc) {
        changedDocIds.push(existingDoc.id);
      }

      // Create new document record.
      const documentId = crypto.randomUUID();
      const now = new Date();
      const newDoc: NewDocument = {
        id: documentId,
        repositoryId: repo.id,
        versionId: normJob.versionId ?? null,
        filePath: file.path,
        title: null,
        language: file.language,
        tokenCount: 0,
        checksum,
        indexedAt: now
      };

      // Parse into snippets.
      const snippets = parseFile(file, {
        repositoryId: repo.id,
        documentId,
        versionId: normJob.versionId ?? undefined
      });

      // Update document token count from snippet totals.
      const tokenCount = snippets.reduce((sum, s) => sum + (s.tokenCount ?? 0), 0);
      newDoc.tokenCount = tokenCount;

      newDocuments.push(newDoc);
      newSnippets.push(...snippets);

      processedFiles++;
      const progress = calculateProgress(
        processedFiles,
        totalFiles,
        0,
        0,
        this.embeddingService !== null
      );
      this.updateJob(job.id, { processedFiles, progress });
    }

    // ---- Stage 3: Atomic replacement ------------------------------------
    // Deletes changed docs and inserts replacements in one transaction.
    this.replaceSnippets(repo.id, changedDocIds, newDocuments, newSnippets);

    // ---- Stage 4: Embeddings (if provider is configured) ----------------
    // NOTE(review): only newly inserted snippets are embedded here; snippets
    // kept from the checksum-match path are assumed already embedded.
    if (this.embeddingService && newSnippets.length > 0) {
      const snippetIds = newSnippets.map((s) => s.id!);
      const embeddingsTotal = snippetIds.length;

      await this.embeddingService.embedSnippets(snippetIds, (done) => {
        const progress = calculateProgress(
          processedFiles,
          totalFiles,
          done,
          embeddingsTotal,
          true
        );
        this.updateJob(job.id, { progress });
      });
    }

    // ---- Stage 5: Update repository stats --------------------------------
    const stats = this.computeStats(repo.id);
    const freshRepo = this.getRepository(repo.id)!;
    const trustScore = computeTrustScore({
      ...freshRepo,
      totalSnippets: stats.totalSnippets,
      totalTokens: stats.totalTokens,
      state: 'indexed'
    });

    this.updateRepo(repo.id, {
      state: 'indexed',
      totalSnippets: stats.totalSnippets,
      totalTokens: stats.totalTokens,
      trustScore,
      lastIndexedAt: Math.floor(Date.now() / 1000)
    });

    // Terminal success state: progress is forced to 100 regardless of counts.
    this.updateJob(job.id, {
      status: 'done',
      progress: 100,
      completedAt: Math.floor(Date.now() / 1000)
    });
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    console.error(`[IndexingPipeline] Job ${job.id} failed: ${message}`);

    this.updateJob(job.id, {
      status: 'failed',
      error: message,
      completedAt: Math.floor(Date.now() / 1000)
    });

    // Restore repo to error state but preserve any existing indexed data.
    this.updateRepo(repositoryId, { state: 'error' });

    throw error;
  }
}
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// Private — crawl
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
/**
 * Fetches the repository's files via the appropriate crawler.
 *
 * Repositories with source 'github' are addressed by their canonical
 * "/owner/repo" ID; any other source is treated as a local filesystem path.
 */
private async crawl(
  repo: Repository,
  job: IndexingJob
): Promise<{ files: Array<{ path: string; content: string; sha: string; size: number; language: string }>; totalFiles: number }> {
  if (repo.source !== 'github') {
    // Local filesystem crawl. The ref is only forwarded when it differs
    // from the default branch.
    const localResult = await this.localCrawler.crawl({
      rootPath: repo.sourceUrl,
      ref: repo.branch !== 'main' ? (repo.branch ?? undefined) : undefined
    });
    return { files: localResult.files, totalFiles: localResult.totalFiles };
  }

  // Canonical GitHub IDs look like "/owner/repo" — split after stripping
  // the leading slash.
  const [owner, repoName] = repo.id.replace(/^\//, '').split('/');
  if (!owner || !repoName) {
    throw new Error(`Cannot parse GitHub owner/repo from id: ${repo.id}`);
  }

  const githubResult = await this.githubCrawl({
    owner,
    repo: repoName,
    ref: repo.branch ?? undefined,
    token: repo.githubToken ?? undefined
  });
  return { files: githubResult.files, totalFiles: githubResult.totalFiles };
}
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// Private — atomic snippet replacement
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
private replaceSnippets(
|
||||
_repositoryId: string,
|
||||
changedDocIds: string[],
|
||||
newDocuments: NewDocument[],
|
||||
newSnippets: NewSnippet[]
|
||||
): void {
|
||||
const insertDoc = this.db.prepare(
|
||||
`INSERT INTO documents
|
||||
(id, repository_id, version_id, file_path, title, language,
|
||||
token_count, checksum, indexed_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`
|
||||
);
|
||||
|
||||
const insertSnippet = this.db.prepare(
|
||||
`INSERT INTO snippets
|
||||
(id, document_id, repository_id, version_id, type, title,
|
||||
content, language, breadcrumb, token_count, created_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
|
||||
);
|
||||
|
||||
this.db.transaction(() => {
|
||||
// Delete stale documents (cascade deletes their snippets via FK).
|
||||
if (changedDocIds.length > 0) {
|
||||
const placeholders = changedDocIds.map(() => '?').join(',');
|
||||
this.db
|
||||
.prepare(`DELETE FROM documents WHERE id IN (${placeholders})`)
|
||||
.run(...changedDocIds);
|
||||
}
|
||||
|
||||
// Insert new documents.
|
||||
for (const doc of newDocuments) {
|
||||
const indexedAtSeconds =
|
||||
doc.indexedAt instanceof Date
|
||||
? Math.floor(doc.indexedAt.getTime() / 1000)
|
||||
: Math.floor(Date.now() / 1000);
|
||||
|
||||
insertDoc.run(
|
||||
doc.id,
|
||||
doc.repositoryId,
|
||||
doc.versionId ?? null,
|
||||
doc.filePath,
|
||||
doc.title ?? null,
|
||||
doc.language ?? null,
|
||||
doc.tokenCount ?? 0,
|
||||
doc.checksum,
|
||||
indexedAtSeconds
|
||||
);
|
||||
}
|
||||
|
||||
// Insert new snippets.
|
||||
for (const snippet of newSnippets) {
|
||||
const createdAtSeconds =
|
||||
snippet.createdAt instanceof Date
|
||||
? Math.floor(snippet.createdAt.getTime() / 1000)
|
||||
: Math.floor(Date.now() / 1000);
|
||||
|
||||
insertSnippet.run(
|
||||
snippet.id,
|
||||
snippet.documentId,
|
||||
snippet.repositoryId,
|
||||
snippet.versionId ?? null,
|
||||
snippet.type,
|
||||
snippet.title ?? null,
|
||||
snippet.content,
|
||||
snippet.language ?? null,
|
||||
snippet.breadcrumb ?? null,
|
||||
snippet.tokenCount ?? 0,
|
||||
createdAtSeconds
|
||||
);
|
||||
}
|
||||
})();
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// Private — stats
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
/**
 * Aggregate snippet count and total token count for one repository.
 * Returns zeros when the repository has no snippets.
 */
private computeStats(repositoryId: string): { totalSnippets: number; totalTokens: number } {
  const stats = this.db
    .prepare<[string], { total_snippets: number; total_tokens: number }>(
      `SELECT COUNT(*) as total_snippets,
              COALESCE(SUM(token_count), 0) as total_tokens
         FROM snippets WHERE repository_id = ?`
    )
    .get(repositoryId);

  if (!stats) {
    return { totalSnippets: 0, totalTokens: 0 };
  }

  return {
    totalSnippets: stats.total_snippets ?? 0,
    totalTokens: stats.total_tokens ?? 0
  };
}
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// Private — DB helpers
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
/**
 * Load a repository row by primary key.
 * @returns The row, or null when no repository has this id.
 */
private getRepository(id: string): Repository | null {
  const repo = this.db
    .prepare<[string], Repository>(`SELECT * FROM repositories WHERE id = ?`)
    .get(id);
  return repo ?? null;
}
|
||||
|
||||
private updateJob(id: string, fields: Record<string, unknown>): void {
|
||||
const sets = Object.keys(fields)
|
||||
.map((k) => `${toSnake(k)} = ?`)
|
||||
.join(', ');
|
||||
const values = [...Object.values(fields), id];
|
||||
this.db.prepare(`UPDATE indexing_jobs SET ${sets} WHERE id = ?`).run(...values);
|
||||
}
|
||||
|
||||
private updateRepo(id: string, fields: Record<string, unknown>): void {
|
||||
const now = Math.floor(Date.now() / 1000);
|
||||
const allFields = { ...fields, updatedAt: now };
|
||||
const sets = Object.keys(allFields)
|
||||
.map((k) => `${toSnake(k)} = ?`)
|
||||
.join(', ');
|
||||
const values = [...Object.values(allFields), id];
|
||||
this.db.prepare(`UPDATE repositories SET ${sets} WHERE id = ?`).run(...values);
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Utility
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/** Convert camelCase to snake_case for DB column mapping. */
|
||||
function toSnake(key: string): string {
|
||||
return key.replace(/[A-Z]/g, (c) => `_${c.toLowerCase()}`);
|
||||
}
|
||||
203
src/lib/server/pipeline/job-queue.ts
Normal file
203
src/lib/server/pipeline/job-queue.ts
Normal file
@@ -0,0 +1,203 @@
|
||||
/**
|
||||
* SQLite-backed job queue for indexing jobs (TRUEREF-0009).
|
||||
*
|
||||
* Jobs are processed sequentially (one at a time) to avoid SQLite write
|
||||
* contention. The queue uses setImmediate to yield to the event loop between
|
||||
* jobs so that API requests remain responsive.
|
||||
*/
|
||||
|
||||
import type Database from 'better-sqlite3';
|
||||
import type { IndexingJob, NewIndexingJob } from '$lib/types';
|
||||
import type { IndexingPipeline } from './indexing.pipeline.js';
|
||||
|
||||
export class JobQueue {
|
||||
private isRunning = false;
|
||||
private pipeline: IndexingPipeline | null = null;
|
||||
|
||||
constructor(private readonly db: Database.Database) {}
|
||||
|
||||
/**
|
||||
* Inject the pipeline dependency (avoids circular construction order).
|
||||
*/
|
||||
setPipeline(pipeline: IndexingPipeline): void {
|
||||
this.pipeline = pipeline;
|
||||
}
|
||||
|
||||
/**
|
||||
* Enqueue a new indexing job for the given repository.
|
||||
* If a job for this repository is already queued or running, returns the
|
||||
* existing job instead of creating a duplicate.
|
||||
*/
|
||||
enqueue(repositoryId: string, versionId?: string): IndexingJob {
|
||||
// Return early if there's already an active job for this repo.
|
||||
const active = this.db
|
||||
.prepare<[string], IndexingJob>(
|
||||
`SELECT * FROM indexing_jobs
|
||||
WHERE repository_id = ? AND status IN ('queued', 'running')
|
||||
ORDER BY created_at DESC LIMIT 1`
|
||||
)
|
||||
.get(repositoryId);
|
||||
|
||||
if (active) return active;
|
||||
|
||||
const now = Math.floor(Date.now() / 1000);
|
||||
const job: NewIndexingJob = {
|
||||
id: crypto.randomUUID(),
|
||||
repositoryId,
|
||||
versionId: versionId ?? null,
|
||||
status: 'queued',
|
||||
progress: 0,
|
||||
totalFiles: 0,
|
||||
processedFiles: 0,
|
||||
error: null,
|
||||
startedAt: null,
|
||||
completedAt: null,
|
||||
createdAt: new Date(now * 1000)
|
||||
};
|
||||
|
||||
this.db
|
||||
.prepare(
|
||||
`INSERT INTO indexing_jobs
|
||||
(id, repository_id, version_id, status, progress, total_files,
|
||||
processed_files, error, started_at, completed_at, created_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
|
||||
)
|
||||
.run(
|
||||
job.id,
|
||||
job.repositoryId,
|
||||
job.versionId,
|
||||
job.status,
|
||||
job.progress,
|
||||
job.totalFiles,
|
||||
job.processedFiles,
|
||||
job.error,
|
||||
job.startedAt,
|
||||
job.completedAt,
|
||||
now
|
||||
);
|
||||
|
||||
// Kick off sequential processing if not already running.
|
||||
if (!this.isRunning) {
|
||||
setImmediate(() => this.processNext());
|
||||
}
|
||||
|
||||
return this.db
|
||||
.prepare<[string], IndexingJob>(`SELECT * FROM indexing_jobs WHERE id = ?`)
|
||||
.get(job.id)!;
|
||||
}
|
||||
|
||||
/**
|
||||
* Pick the oldest queued job and run it through the pipeline.
|
||||
* Called recursively via setImmediate so the event loop stays unblocked.
|
||||
*/
|
||||
private async processNext(): Promise<void> {
|
||||
if (this.isRunning) return;
|
||||
if (!this.pipeline) {
|
||||
console.warn('[JobQueue] No pipeline configured — cannot process jobs.');
|
||||
return;
|
||||
}
|
||||
|
||||
const job = this.db
|
||||
.prepare<[], IndexingJob>(
|
||||
`SELECT * FROM indexing_jobs
|
||||
WHERE status = 'queued'
|
||||
ORDER BY created_at ASC LIMIT 1`
|
||||
)
|
||||
.get();
|
||||
|
||||
if (!job) return;
|
||||
|
||||
this.isRunning = true;
|
||||
try {
|
||||
await this.pipeline.run(job);
|
||||
} catch (err) {
|
||||
// Error is logged inside pipeline.run(); no action needed here.
|
||||
console.error(
|
||||
`[JobQueue] Job ${job.id} failed: ${err instanceof Error ? err.message : String(err)}`
|
||||
);
|
||||
} finally {
|
||||
this.isRunning = false;
|
||||
|
||||
// Check whether another job was queued while this one ran.
|
||||
const next = this.db
|
||||
.prepare<[], { id: string }>(
|
||||
`SELECT id FROM indexing_jobs WHERE status = 'queued' LIMIT 1`
|
||||
)
|
||||
.get();
|
||||
if (next) {
|
||||
setImmediate(() => this.processNext());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieve a single job by ID.
|
||||
*/
|
||||
getJob(id: string): IndexingJob | null {
|
||||
return (
|
||||
this.db
|
||||
.prepare<[string], IndexingJob>(`SELECT * FROM indexing_jobs WHERE id = ?`)
|
||||
.get(id) ?? null
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* List recent jobs, optionally filtered by repository and/or status.
|
||||
*/
|
||||
listJobs(options?: {
|
||||
repositoryId?: string;
|
||||
status?: IndexingJob['status'];
|
||||
limit?: number;
|
||||
}): IndexingJob[] {
|
||||
const limit = Math.min(options?.limit ?? 20, 200);
|
||||
const conditions: string[] = [];
|
||||
const params: unknown[] = [];
|
||||
|
||||
if (options?.repositoryId) {
|
||||
conditions.push('repository_id = ?');
|
||||
params.push(options.repositoryId);
|
||||
}
|
||||
if (options?.status) {
|
||||
conditions.push('status = ?');
|
||||
params.push(options.status);
|
||||
}
|
||||
|
||||
const where = conditions.length > 0 ? `WHERE ${conditions.join(' AND ')}` : '';
|
||||
const sql = `SELECT * FROM indexing_jobs ${where} ORDER BY created_at DESC LIMIT ?`;
|
||||
params.push(limit);
|
||||
|
||||
return this.db.prepare<unknown[], IndexingJob>(sql).all(...params);
|
||||
}
|
||||
|
||||
/**
|
||||
* Trigger processing of any queued jobs (e.g. after server restart).
|
||||
* Safe to call multiple times; a no-op if the queue is already running.
|
||||
*/
|
||||
drainQueued(): void {
|
||||
if (!this.isRunning) {
|
||||
setImmediate(() => this.processNext());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Count all jobs matching optional filters.
|
||||
*/
|
||||
countJobs(options?: { repositoryId?: string; status?: IndexingJob['status'] }): number {
|
||||
const conditions: string[] = [];
|
||||
const params: unknown[] = [];
|
||||
|
||||
if (options?.repositoryId) {
|
||||
conditions.push('repository_id = ?');
|
||||
params.push(options.repositoryId);
|
||||
}
|
||||
if (options?.status) {
|
||||
conditions.push('status = ?');
|
||||
params.push(options.status);
|
||||
}
|
||||
|
||||
const where = conditions.length > 0 ? `WHERE ${conditions.join(' AND ')}` : '';
|
||||
const sql = `SELECT COUNT(*) as n FROM indexing_jobs ${where}`;
|
||||
const row = this.db.prepare<unknown[], { n: number }>(sql).get(...params);
|
||||
return row?.n ?? 0;
|
||||
}
|
||||
}
|
||||
122
src/lib/server/pipeline/startup.ts
Normal file
122
src/lib/server/pipeline/startup.ts
Normal file
@@ -0,0 +1,122 @@
|
||||
/**
|
||||
* Server startup routines for the indexing pipeline (TRUEREF-0009).
|
||||
*
|
||||
* On every server start:
|
||||
* 1. Mark any jobs that were left in 'running' state as 'failed'
|
||||
* (they were interrupted by a process crash / restart).
|
||||
* 2. Reset any repositories stuck in 'indexing' state to 'error'.
|
||||
* 3. Construct and wire together the JobQueue + IndexingPipeline singletons.
|
||||
* 4. Kick off processing of any jobs that were 'queued' before the restart.
|
||||
*/
|
||||
|
||||
import type Database from 'better-sqlite3';
|
||||
import type { EmbeddingService } from '$lib/server/embeddings/embedding.service.js';
|
||||
import { crawl as githubCrawl } from '$lib/server/crawler/github.crawler.js';
|
||||
import { LocalCrawler } from '$lib/server/crawler/local.crawler.js';
|
||||
import { IndexingPipeline } from './indexing.pipeline.js';
|
||||
import { JobQueue } from './job-queue.js';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Stale-job recovery
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Mark jobs that were `running` when the server crashed as `failed`, and
|
||||
* reset repositories that were stuck in `indexing` state to `error`.
|
||||
*
|
||||
* Safe to call on every startup — uses unixepoch() so timestamps stay in
|
||||
* the same integer-seconds format as the rest of the schema.
|
||||
*/
|
||||
export function recoverStaleJobs(db: Database.Database): void {
|
||||
db.prepare(
|
||||
`UPDATE indexing_jobs
|
||||
SET status = 'failed',
|
||||
error = 'Server restarted while job was running',
|
||||
completed_at = unixepoch()
|
||||
WHERE status = 'running'`
|
||||
).run();
|
||||
|
||||
db.prepare(
|
||||
`UPDATE repositories
|
||||
SET state = 'error'
|
||||
WHERE state = 'indexing'`
|
||||
).run();
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Singleton instances
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Module-level singletons shared by initializePipeline()/getQueue()/getPipeline();
// reset only via _resetSingletons() in tests.
let _queue: JobQueue | null = null;
let _pipeline: IndexingPipeline | null = null;
|
||||
|
||||
/**
|
||||
* Initialise (or return the existing) JobQueue + IndexingPipeline pair.
|
||||
*
|
||||
* Must be called once at server startup (e.g. from `hooks.server.ts`).
|
||||
* Calling it more than once is harmless — subsequent calls are no-ops that
|
||||
* return the already-constructed singletons.
|
||||
*
|
||||
* @param db - Raw better-sqlite3 Database instance.
|
||||
* @param embeddingService - Optional embedding service; pass null to disable.
|
||||
* @returns An object with `queue` and `pipeline` accessors.
|
||||
*/
|
||||
export function initializePipeline(
|
||||
db: Database.Database,
|
||||
embeddingService: EmbeddingService | null = null
|
||||
): { queue: JobQueue; pipeline: IndexingPipeline } {
|
||||
if (_queue && _pipeline) {
|
||||
return { queue: _queue, pipeline: _pipeline };
|
||||
}
|
||||
|
||||
// Recover before constructing so no stale job gets picked up.
|
||||
recoverStaleJobs(db);
|
||||
|
||||
const localCrawler = new LocalCrawler();
|
||||
const pipeline = new IndexingPipeline(db, githubCrawl, localCrawler, embeddingService);
|
||||
const queue = new JobQueue(db);
|
||||
|
||||
queue.setPipeline(pipeline);
|
||||
|
||||
_queue = queue;
|
||||
_pipeline = pipeline;
|
||||
|
||||
// Drain any jobs that were queued before the restart.
|
||||
setImmediate(() => {
|
||||
const pending = db
|
||||
.prepare<[], { id: string }>(`SELECT id FROM indexing_jobs WHERE status = 'queued' LIMIT 1`)
|
||||
.get();
|
||||
if (pending) {
|
||||
// Re-enqueue logic is handled inside JobQueue.processNext; we trigger
|
||||
// it by asking the queue for any job that is already queued.
|
||||
// The simplest way is to call enqueue on a repo that has a queued job —
|
||||
// but since enqueue deduplicates, we just trigger processNext directly.
|
||||
// We do this via a public helper to avoid exposing private methods.
|
||||
queue.drainQueued();
|
||||
}
|
||||
});
|
||||
|
||||
return { queue, pipeline };
|
||||
}
|
||||
|
||||
/**
 * Return the current JobQueue singleton, or null if `initializePipeline`
 * has not been called yet.
 */
export function getQueue(): JobQueue | null {
  return _queue;
}
|
||||
|
||||
/**
 * Return the current IndexingPipeline singleton, or null if
 * `initializePipeline` has not been called yet.
 */
export function getPipeline(): IndexingPipeline | null {
  return _pipeline;
}
|
||||
|
||||
/**
 * Reset singletons — intended for use in tests only.
 * The next `initializePipeline` call will reconstruct both instances.
 */
export function _resetSingletons(): void {
  _queue = null;
  _pipeline = null;
}
|
||||
46
src/routes/api/v1/jobs/+server.ts
Normal file
46
src/routes/api/v1/jobs/+server.ts
Normal file
@@ -0,0 +1,46 @@
|
||||
/**
|
||||
* GET /api/v1/jobs — list recent indexing jobs.
|
||||
*
|
||||
* Query parameters:
|
||||
* repositoryId (optional) — filter by repository
|
||||
* status (optional) — filter by status: queued|running|done|failed
|
||||
* limit (optional, default 20, max 200)
|
||||
*/
|
||||
|
||||
import { json } from '@sveltejs/kit';
|
||||
import type { RequestHandler } from './$types';
|
||||
import { getClient } from '$lib/server/db/client.js';
|
||||
import { JobQueue } from '$lib/server/pipeline/job-queue.js';
|
||||
import { handleServiceError } from '$lib/server/utils/validation.js';
|
||||
import type { IndexingJob } from '$lib/types';
|
||||
|
||||
export const GET: RequestHandler = ({ url }) => {
|
||||
try {
|
||||
const db = getClient();
|
||||
const queue = new JobQueue(db);
|
||||
|
||||
const repositoryId = url.searchParams.get('repositoryId') ?? undefined;
|
||||
const status = (url.searchParams.get('status') ?? undefined) as
|
||||
| IndexingJob['status']
|
||||
| undefined;
|
||||
const limit = Math.min(parseInt(url.searchParams.get('limit') ?? '20', 10) || 20, 200);
|
||||
|
||||
const jobs = queue.listJobs({ repositoryId, status, limit });
|
||||
const total = queue.countJobs({ repositoryId, status });
|
||||
|
||||
return json({ jobs, total });
|
||||
} catch (err) {
|
||||
return handleServiceError(err);
|
||||
}
|
||||
};
|
||||
|
||||
export const OPTIONS: RequestHandler = () => {
|
||||
return new Response(null, {
|
||||
status: 204,
|
||||
headers: {
|
||||
'Access-Control-Allow-Origin': '*',
|
||||
'Access-Control-Allow-Methods': 'GET, OPTIONS',
|
||||
'Access-Control-Allow-Headers': 'Content-Type, Authorization'
|
||||
}
|
||||
});
|
||||
};
|
||||
34
src/routes/api/v1/jobs/[id]/+server.ts
Normal file
34
src/routes/api/v1/jobs/[id]/+server.ts
Normal file
@@ -0,0 +1,34 @@
|
||||
/**
|
||||
* GET /api/v1/jobs/:id — retrieve a single indexing job by ID.
|
||||
*/
|
||||
|
||||
import { json } from '@sveltejs/kit';
|
||||
import type { RequestHandler } from './$types';
|
||||
import { getClient } from '$lib/server/db/client.js';
|
||||
import { JobQueue } from '$lib/server/pipeline/job-queue.js';
|
||||
import { handleServiceError, NotFoundError } from '$lib/server/utils/validation.js';
|
||||
|
||||
export const GET: RequestHandler = ({ params }) => {
|
||||
try {
|
||||
const db = getClient();
|
||||
const queue = new JobQueue(db);
|
||||
|
||||
const job = queue.getJob(params.id);
|
||||
if (!job) throw new NotFoundError(`Job ${params.id} not found`);
|
||||
|
||||
return json({ job });
|
||||
} catch (err) {
|
||||
return handleServiceError(err);
|
||||
}
|
||||
};
|
||||
|
||||
export const OPTIONS: RequestHandler = () => {
|
||||
return new Response(null, {
|
||||
status: 204,
|
||||
headers: {
|
||||
'Access-Control-Allow-Origin': '*',
|
||||
'Access-Control-Allow-Methods': 'GET, OPTIONS',
|
||||
'Access-Control-Allow-Headers': 'Content-Type, Authorization'
|
||||
}
|
||||
});
|
||||
};
|
||||
Reference in New Issue
Block a user