feat(TRUEREF-0022): complete iteration 0 — worker-thread indexing, parallel jobs, SSE progress
- Move IndexingPipeline.run() into Worker Threads via WorkerPool
- Add dedicated embedding worker thread with single model instance
- Add stage/stageDetail columns to indexing_jobs schema
- Create ProgressBroadcaster for SSE channel management
- Add SSE endpoints: GET /api/v1/jobs/:id/stream, GET /api/v1/jobs/stream
- Replace UI polling with EventSource on repo detail and admin pages
- Add concurrency settings UI and API endpoint
- Build worker entries separately via esbuild
This commit is contained in:
@@ -15,6 +15,11 @@ import { crawl as githubCrawl } from '$lib/server/crawler/github.crawler.js';
|
||||
import { LocalCrawler } from '$lib/server/crawler/local.crawler.js';
|
||||
import { IndexingPipeline } from './indexing.pipeline.js';
|
||||
import { JobQueue } from './job-queue.js';
|
||||
import { WorkerPool } from './worker-pool.js';
|
||||
import { initBroadcaster, getBroadcaster as getBroadcasterFn } from './progress-broadcaster.js';
|
||||
import type { ProgressBroadcaster } from './progress-broadcaster.js';
|
||||
import path from 'node:path';
|
||||
import { fileURLToPath } from 'node:url';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Stale-job recovery
|
||||
@@ -49,6 +54,8 @@ export function recoverStaleJobs(db: Database.Database): void {
|
||||
|
||||
// Module-level singletons. Each starts as null; initializePipeline assigns
// them and _resetSingletons sets them back to null.
let _queue: JobQueue | null = null;
|
||||
let _pipeline: IndexingPipeline | null = null;
|
||||
// Only assigned when initializePipeline receives `options.dbPath`.
let _pool: WorkerPool | null = null;
|
||||
// Only assigned when initializePipeline receives `options.dbPath`.
let _broadcaster: ProgressBroadcaster | null = null;
|
||||
|
||||
/**
|
||||
* Initialise (or return the existing) JobQueue + IndexingPipeline pair.
|
||||
@@ -59,11 +66,13 @@ let _pipeline: IndexingPipeline | null = null;
|
||||
*
|
||||
* @param db - Raw better-sqlite3 Database instance.
|
||||
* @param embeddingService - Optional embedding service; pass null to disable.
|
||||
* @param options - Optional configuration for worker pool (concurrency, dbPath).
|
||||
* @returns An object with `queue` and `pipeline` accessors.
|
||||
*/
|
||||
export function initializePipeline(
|
||||
db: Database.Database,
|
||||
embeddingService: EmbeddingService | null = null
|
||||
embeddingService: EmbeddingService | null = null,
|
||||
options?: { concurrency?: number; dbPath?: string }
|
||||
): { queue: JobQueue; pipeline: IndexingPipeline } {
|
||||
if (_queue && _pipeline) {
|
||||
return { queue: _queue, pipeline: _pipeline };
|
||||
@@ -76,7 +85,76 @@ export function initializePipeline(
|
||||
const pipeline = new IndexingPipeline(db, githubCrawl, localCrawler, embeddingService);
|
||||
const queue = new JobQueue(db);
|
||||
|
||||
queue.setPipeline(pipeline);
|
||||
// If worker pool options are provided, create and wire the pool
|
||||
if (options?.dbPath) {
|
||||
_broadcaster = initBroadcaster();
|
||||
|
||||
// Resolve worker script paths relative to this file (build/workers/ directory)
|
||||
const __filename = fileURLToPath(import.meta.url);
|
||||
const __dirname = path.dirname(__filename);
|
||||
const workerScript = path.join(__dirname, '../../../build/workers/worker-entry.mjs');
|
||||
const embedWorkerScript = path.join(__dirname, '../../../build/workers/embed-worker-entry.mjs');
|
||||
|
||||
try {
|
||||
_pool = new WorkerPool({
|
||||
concurrency: options.concurrency ?? 2,
|
||||
workerScript,
|
||||
embedWorkerScript,
|
||||
dbPath: options.dbPath,
|
||||
onProgress: (jobId: string, msg: any) => {
|
||||
// Update DB with progress
|
||||
db.prepare(
|
||||
`UPDATE indexing_jobs
|
||||
SET stage = ?, stage_detail = ?, progress = ?, processed_files = ?, total_files = ?
|
||||
WHERE id = ?`
|
||||
).run(msg.stage, msg.stageDetail ?? null, msg.progress, msg.processedFiles, msg.totalFiles, jobId);
|
||||
|
||||
// Broadcast progress event
|
||||
if (_broadcaster) {
|
||||
_broadcaster.broadcast(jobId, '', 'progress', msg);
|
||||
}
|
||||
},
|
||||
onJobDone: (jobId: string) => {
|
||||
// Update job status to done
|
||||
db.prepare(`UPDATE indexing_jobs SET status = 'done', completed_at = unixepoch() WHERE id = ?`).run(
|
||||
jobId
|
||||
);
|
||||
|
||||
// Broadcast done event
|
||||
if (_broadcaster) {
|
||||
_broadcaster.broadcast(jobId, '', 'job-done', { jobId });
|
||||
}
|
||||
},
|
||||
onJobFailed: (jobId: string, error: string) => {
|
||||
// Update job status to failed with error message
|
||||
db.prepare(
|
||||
`UPDATE indexing_jobs SET status = 'failed', error = ?, completed_at = unixepoch() WHERE id = ?`
|
||||
).run(error, jobId);
|
||||
|
||||
// Broadcast failed event
|
||||
if (_broadcaster) {
|
||||
_broadcaster.broadcast(jobId, '', 'job-failed', { jobId, error });
|
||||
}
|
||||
},
|
||||
onEmbedReady: () => {
|
||||
console.log('[WorkerPool] Embedding worker ready');
|
||||
},
|
||||
onEmbedDone: (jobId: string) => {
|
||||
console.log('[WorkerPool] Embedding complete for job:', jobId);
|
||||
},
|
||||
onEmbedFailed: (jobId: string, error: string) => {
|
||||
console.error('[WorkerPool] Embedding failed for job:', jobId, error);
|
||||
}
|
||||
});
|
||||
|
||||
queue.setWorkerPool(_pool);
|
||||
} catch (err) {
|
||||
console.warn(
|
||||
'[startup] Failed to create WorkerPool (worker scripts may not exist yet):',
|
||||
err instanceof Error ? err.message : String(err)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
_queue = queue;
|
||||
_pipeline = pipeline;
|
||||
@@ -87,11 +165,7 @@ export function initializePipeline(
|
||||
.prepare<[], { id: string }>(`SELECT id FROM indexing_jobs WHERE status = 'queued' LIMIT 1`)
|
||||
.get();
|
||||
if (pending) {
|
||||
// Re-enqueue logic is handled inside JobQueue.processNext; we trigger
|
||||
// it by asking the queue for any job that is already queued.
|
||||
// The simplest way is to call enqueue on a repo that has a queued job —
|
||||
// but since enqueue deduplicates, we just trigger processNext directly.
|
||||
// We do this via a public helper to avoid exposing private methods.
|
||||
// Re-enqueue logic is handled inside JobQueue.drainQueued; we trigger it here.
|
||||
queue.drainQueued();
|
||||
}
|
||||
});
|
||||
@@ -100,23 +174,41 @@ export function initializePipeline(
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the current JobQueue singleton, or null if not yet initialised.
|
||||
* @returns The value of the module-level `_queue` variable, which is assigned by initializePipeline and cleared by _resetSingletons.
|
||||
*/
|
||||
export function getQueue(): JobQueue | null {
|
||||
return _queue;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the current IndexingPipeline singleton, or null if not yet initialised.
|
||||
* @returns The value of the module-level `_pipeline` variable, which is assigned by initializePipeline and cleared by _resetSingletons.
|
||||
*/
|
||||
export function getPipeline(): IndexingPipeline | null {
|
||||
return _pipeline;
|
||||
}
|
||||
|
||||
/**
|
||||
* Accessor for the WorkerPool singleton.
|
||||
* @returns The value of the module-level `_pool` variable; null unless initializePipeline was given `options.dbPath` and constructing the WorkerPool did not throw.
|
||||
*/
|
||||
export function getPool(): WorkerPool | null {
|
||||
return _pool;
|
||||
}
|
||||
|
||||
/**
|
||||
* Accessor for the ProgressBroadcaster singleton.
* @returns The value of the module-level `_broadcaster` variable; null until initializePipeline assigns it, which it only does when `options.dbPath` is provided.
|
||||
*/
|
||||
export function getBroadcaster(): ProgressBroadcaster | null {
|
||||
return _broadcaster;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reset singletons (for testing).
* Sets the four module-level references (`_queue`, `_pipeline`, `_pool`, `_broadcaster`) back to null.
* It only drops the references; nothing is called on the previous instances.
* NOTE(review): if WorkerPool owns live worker threads, confirm whether it needs an explicit shutdown before being dropped here.
|
||||
*/
|
||||
export function _resetSingletons(): void {
|
||||
_queue = null;
|
||||
_pipeline = null;
|
||||
_pool = null;
|
||||
_broadcaster = null;
|
||||
}
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user