feat(TRUEREF-0022): complete iteration 0 — worker-thread indexing, parallel jobs, SSE progress
- Move IndexingPipeline.run() into Worker Threads via WorkerPool
- Add dedicated embedding worker thread with single model instance
- Add stage/stageDetail columns to indexing_jobs schema
- Create ProgressBroadcaster for SSE channel management
- Add SSE endpoints: GET /api/v1/jobs/:id/stream, GET /api/v1/jobs/stream
- Replace UI polling with EventSource on repo detail and admin pages
- Add concurrency settings UI and API endpoint
- Build worker entries separately via esbuild
This commit is contained in:
@@ -15,7 +15,7 @@
|
||||
|
||||
import { createHash, randomUUID } from 'node:crypto';
|
||||
import type Database from 'better-sqlite3';
|
||||
import type { Document, NewDocument, NewSnippet, TrueRefConfig } from '$lib/types';
|
||||
import type { Document, NewDocument, NewSnippet, TrueRefConfig, IndexingStage } from '$lib/types';
|
||||
import type { crawl as GithubCrawlFn } from '$lib/server/crawler/github.crawler.js';
|
||||
import type { LocalCrawler } from '$lib/server/crawler/local.crawler.js';
|
||||
import type { EmbeddingService } from '$lib/server/embeddings/embedding.service.js';
|
||||
@@ -74,7 +74,16 @@ export class IndexingPipeline {
|
||||
// Public — run a job end to end
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
async run(job: IndexingJob): Promise<void> {
|
||||
async run(
|
||||
job: IndexingJob,
|
||||
onStageChange?: (
|
||||
stage: IndexingStage,
|
||||
detail?: string,
|
||||
progress?: number,
|
||||
processedFiles?: number,
|
||||
totalFiles?: number
|
||||
) => void
|
||||
): Promise<void> {
|
||||
// better-sqlite3 raw queries return snake_case keys; Drizzle types use camelCase.
|
||||
// Accept both so the pipeline works when called from raw SQL contexts.
|
||||
const raw = job as unknown as Record<string, unknown>;
|
||||
@@ -84,6 +93,18 @@ export class IndexingPipeline {
|
||||
// Rebuild a normalised job view for the rest of this method.
|
||||
const normJob = { ...job, repositoryId, versionId };
|
||||
|
||||
// Helper to report stage transitions and invoke optional callback.
|
||||
const reportStage = (
|
||||
stage: IndexingStage,
|
||||
detail?: string,
|
||||
progress?: number,
|
||||
processed?: number,
|
||||
total?: number
|
||||
) => {
|
||||
this.updateJob(job.id, { stage, stageDetail: detail ?? null });
|
||||
onStageChange?.(stage, detail, progress, processed, total);
|
||||
};
|
||||
|
||||
this.updateJob(job.id, { status: 'running', startedAt: Math.floor(Date.now() / 1000) });
|
||||
|
||||
try {
|
||||
@@ -105,6 +126,7 @@ export class IndexingPipeline {
|
||||
// from an already-indexed ancestor version instead of crawling everything.
|
||||
let differentialPlan: DifferentialPlan | null = null;
|
||||
if (normJob.versionId && versionTag) {
|
||||
reportStage('differential');
|
||||
differentialPlan = await buildDifferentialPlan({
|
||||
repo,
|
||||
targetTag: versionTag,
|
||||
@@ -119,6 +141,7 @@ export class IndexingPipeline {
|
||||
|
||||
// If a differential plan exists, clone unchanged files from ancestor.
|
||||
if (differentialPlan && differentialPlan.unchangedPaths.size > 0) {
|
||||
reportStage('cloning');
|
||||
this.cloneFromAncestor(
|
||||
differentialPlan.ancestorVersionId,
|
||||
normJob.versionId!,
|
||||
@@ -132,6 +155,7 @@ export class IndexingPipeline {
|
||||
|
||||
// ---- Stage 1: Crawl -------------------------------------------------
|
||||
// Pass changedPaths as allowlist so crawl only fetches/returns changed files.
|
||||
reportStage('crawling');
|
||||
const crawlAllowedPaths = differentialPlan ? differentialPlan.changedPaths : undefined;
|
||||
const crawlResult = await this.crawl(repo, versionTag, crawlAllowedPaths);
|
||||
|
||||
@@ -219,6 +243,8 @@ export class IndexingPipeline {
|
||||
// Lower = more responsive UI; higher = less overhead.
|
||||
const YIELD_EVERY = 20;
|
||||
|
||||
reportStage('parsing', `0 / ${totalFiles} files`);
|
||||
|
||||
for (const [i, file] of filesToProcess.entries()) {
|
||||
// Yield the Node.js event loop periodically so the HTTP server can
|
||||
// handle incoming requests (navigation, polling) between file parses.
|
||||
@@ -272,6 +298,7 @@ export class IndexingPipeline {
|
||||
this.embeddingService !== null
|
||||
);
|
||||
this.updateJob(job.id, { processedFiles: totalProcessed, progress });
|
||||
reportStage('parsing', `${totalProcessed} / ${totalFiles} files`, progress, totalProcessed, totalFiles);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -279,10 +306,12 @@ export class IndexingPipeline {
|
||||
processedFiles = diff.unchanged.length + filesToProcess.length;
|
||||
|
||||
// ---- Stage 3: Atomic replacement ------------------------------------
|
||||
reportStage('storing');
|
||||
this.replaceSnippets(repo.id, changedDocIds, newDocuments, newSnippets);
|
||||
|
||||
// ---- Stage 4: Embeddings (if provider is configured) ----------------
|
||||
if (this.embeddingService) {
|
||||
reportStage('embedding');
|
||||
const snippetIds = this.embeddingService.findSnippetIdsMissingEmbeddings(
|
||||
repo.id,
|
||||
normJob.versionId
|
||||
@@ -346,6 +375,7 @@ export class IndexingPipeline {
|
||||
}
|
||||
}
|
||||
|
||||
reportStage('done');
|
||||
this.updateJob(job.id, {
|
||||
status: 'done',
|
||||
progress: 100,
|
||||
@@ -355,6 +385,7 @@ export class IndexingPipeline {
|
||||
const message = error instanceof Error ? error.message : String(error);
|
||||
console.error(`[IndexingPipeline] Job ${job.id} failed: ${message}`);
|
||||
|
||||
reportStage('failed');
|
||||
this.updateJob(job.id, {
|
||||
status: 'failed',
|
||||
error: message,
|
||||
|
||||
Reference in New Issue
Block a user