feat(TRUEREF-0022): complete iteration 0 — worker-thread indexing, parallel jobs, SSE progress

- Move IndexingPipeline.run() into Worker Threads via WorkerPool
- Add dedicated embedding worker thread with single model instance
- Add stage/stageDetail columns to indexing_jobs schema
- Create ProgressBroadcaster for SSE channel management
- Add SSE endpoints: GET /api/v1/jobs/:id/stream, GET /api/v1/jobs/stream
- Replace UI polling with EventSource on repo detail and admin pages
- Add concurrency settings UI and API endpoint
- Build worker entries separately via esbuild
This commit is contained in:
Giancarmine Salucci
2026-03-30 17:08:23 +02:00
parent 6f3f4db19b
commit 7630740403
30 changed files with 2659 additions and 991 deletions

View File

@@ -15,7 +15,7 @@
import { createHash, randomUUID } from 'node:crypto';
import type Database from 'better-sqlite3';
import type { Document, NewDocument, NewSnippet, TrueRefConfig } from '$lib/types';
import type { Document, NewDocument, NewSnippet, TrueRefConfig, IndexingStage } from '$lib/types';
import type { crawl as GithubCrawlFn } from '$lib/server/crawler/github.crawler.js';
import type { LocalCrawler } from '$lib/server/crawler/local.crawler.js';
import type { EmbeddingService } from '$lib/server/embeddings/embedding.service.js';
@@ -74,7 +74,16 @@ export class IndexingPipeline {
// Public — run a job end to end
// -------------------------------------------------------------------------
async run(job: IndexingJob): Promise<void> {
async run(
job: IndexingJob,
onStageChange?: (
stage: IndexingStage,
detail?: string,
progress?: number,
processedFiles?: number,
totalFiles?: number
) => void
): Promise<void> {
// better-sqlite3 raw queries return snake_case keys; Drizzle types use camelCase.
// Accept both so the pipeline works when called from raw SQL contexts.
const raw = job as unknown as Record<string, unknown>;
@@ -84,6 +93,18 @@ export class IndexingPipeline {
// Rebuild a normalised job view for the rest of this method.
const normJob = { ...job, repositoryId, versionId };
// Helper to report stage transitions and invoke optional callback.
const reportStage = (
stage: IndexingStage,
detail?: string,
progress?: number,
processed?: number,
total?: number
) => {
this.updateJob(job.id, { stage, stageDetail: detail ?? null });
onStageChange?.(stage, detail, progress, processed, total);
};
this.updateJob(job.id, { status: 'running', startedAt: Math.floor(Date.now() / 1000) });
try {
@@ -105,6 +126,7 @@ export class IndexingPipeline {
// from an already-indexed ancestor version instead of crawling everything.
let differentialPlan: DifferentialPlan | null = null;
if (normJob.versionId && versionTag) {
reportStage('differential');
differentialPlan = await buildDifferentialPlan({
repo,
targetTag: versionTag,
@@ -119,6 +141,7 @@ export class IndexingPipeline {
// If a differential plan exists, clone unchanged files from ancestor.
if (differentialPlan && differentialPlan.unchangedPaths.size > 0) {
reportStage('cloning');
this.cloneFromAncestor(
differentialPlan.ancestorVersionId,
normJob.versionId!,
@@ -132,6 +155,7 @@ export class IndexingPipeline {
// ---- Stage 1: Crawl -------------------------------------------------
// Pass changedPaths as allowlist so crawl only fetches/returns changed files.
reportStage('crawling');
const crawlAllowedPaths = differentialPlan ? differentialPlan.changedPaths : undefined;
const crawlResult = await this.crawl(repo, versionTag, crawlAllowedPaths);
@@ -219,6 +243,8 @@ export class IndexingPipeline {
// Lower = more responsive UI; higher = less overhead.
const YIELD_EVERY = 20;
reportStage('parsing', `0 / ${totalFiles} files`);
for (const [i, file] of filesToProcess.entries()) {
// Yield the Node.js event loop periodically so the HTTP server can
// handle incoming requests (navigation, polling) between file parses.
@@ -272,6 +298,7 @@ export class IndexingPipeline {
this.embeddingService !== null
);
this.updateJob(job.id, { processedFiles: totalProcessed, progress });
reportStage('parsing', `${totalProcessed} / ${totalFiles} files`, progress, totalProcessed, totalFiles);
}
}
@@ -279,10 +306,12 @@ export class IndexingPipeline {
processedFiles = diff.unchanged.length + filesToProcess.length;
// ---- Stage 3: Atomic replacement ------------------------------------
reportStage('storing');
this.replaceSnippets(repo.id, changedDocIds, newDocuments, newSnippets);
// ---- Stage 4: Embeddings (if provider is configured) ----------------
if (this.embeddingService) {
reportStage('embedding');
const snippetIds = this.embeddingService.findSnippetIdsMissingEmbeddings(
repo.id,
normJob.versionId
@@ -346,6 +375,7 @@ export class IndexingPipeline {
}
}
reportStage('done');
this.updateJob(job.id, {
status: 'done',
progress: 100,
@@ -355,6 +385,7 @@ export class IndexingPipeline {
const message = error instanceof Error ? error.message : String(error);
console.error(`[IndexingPipeline] Job ${job.id} failed: ${message}`);
reportStage('failed');
this.updateJob(job.id, {
status: 'failed',
error: message,