feat(TRUEREF-0023): add sqlite-vec search pipeline

This commit is contained in:
Giancarmine Salucci
2026-04-01 14:09:19 +02:00
parent 0752636847
commit 9525c58e9a
45 changed files with 4009 additions and 614 deletions

View File

@@ -21,6 +21,11 @@ const db = new Database(dbPath);
db.pragma('journal_mode = WAL');
db.pragma('foreign_keys = ON');
db.pragma('busy_timeout = 5000');
db.pragma('synchronous = NORMAL');
db.pragma('cache_size = -65536');
db.pragma('temp_store = MEMORY');
db.pragma('mmap_size = 268435456');
db.pragma('wal_autocheckpoint = 1000');
// Load the embedding profile from DB
const rawProfile = db.prepare('SELECT * FROM embedding_profiles WHERE id = ?').get(embeddingProfileId);

View File

@@ -13,6 +13,9 @@ import { JobQueue } from './job-queue.js';
import { IndexingPipeline } from './indexing.pipeline.js';
import { recoverStaleJobs } from './startup.js';
import { EmbeddingService } from '$lib/server/embeddings/embedding.service.js';
import { loadSqliteVec } from '$lib/server/db/sqlite-vec.js';
import { SqliteVecStore } from '$lib/server/search/sqlite-vec.store.js';
import { sqliteVecRowidTableName, sqliteVecTableName } from '$lib/server/db/sqlite-vec.js';
import * as diffStrategy from './differential-strategy.js';
// ---------------------------------------------------------------------------
@@ -22,6 +25,7 @@ import * as diffStrategy from './differential-strategy.js';
function createTestDb(): Database.Database {
const client = new Database(':memory:');
client.pragma('foreign_keys = ON');
loadSqliteVec(client);
const migrationsFolder = join(import.meta.dirname, '../db/migrations');
for (const migrationFile of [
@@ -29,7 +33,9 @@ function createTestDb(): Database.Database {
'0001_quick_nighthawk.sql',
'0002_silky_stellaris.sql',
'0003_multiversion_config.sql',
'0004_complete_sentry.sql'
'0004_complete_sentry.sql',
'0005_fix_stage_defaults.sql',
'0006_yielding_centennial.sql'
]) {
const migrationSql = readFileSync(join(migrationsFolder, migrationFile), 'utf-8');
@@ -539,6 +545,52 @@ describe('IndexingPipeline', () => {
expect(finalChecksum).toBe('sha-v2');
});
it('removes derived vec rows when changed documents are replaced', async () => {
const docId = crypto.randomUUID();
const snippetId = crypto.randomUUID();
const embedding = Float32Array.from([1, 0, 0]);
const vecStore = new SqliteVecStore(db);
db.prepare(
`INSERT INTO documents (id, repository_id, version_id, file_path, checksum, indexed_at)
VALUES (?, '/test/repo', NULL, 'README.md', 'stale-doc', ?)`
).run(docId, now);
db.prepare(
`INSERT INTO snippets (id, document_id, repository_id, version_id, type, content, created_at)
VALUES (?, ?, '/test/repo', NULL, 'info', 'stale snippet', ?)`
).run(snippetId, docId, now);
db.prepare(
`INSERT INTO snippet_embeddings (snippet_id, profile_id, model, dimensions, embedding, created_at)
VALUES (?, 'local-default', 'test-model', 3, ?, ?)`
).run(snippetId, Buffer.from(embedding.buffer), now);
vecStore.upsertEmbedding('local-default', snippetId, embedding);
const pipeline = makePipeline({
files: [
{
path: 'README.md',
content: '# Updated\n\nFresh content.',
sha: 'sha-fresh',
language: 'markdown'
}
],
totalFiles: 1
});
const job = makeJob();
await pipeline.run(job as never);
const vecTable = sqliteVecTableName('local-default');
const rowidTable = sqliteVecRowidTableName('local-default');
const vecCount = db.prepare(`SELECT COUNT(*) as n FROM "${vecTable}"`).get() as { n: number };
const rowidCount = db.prepare(`SELECT COUNT(*) as n FROM "${rowidTable}"`).get() as {
n: number;
};
expect(vecCount.n).toBe(0);
expect(rowidCount.n).toBe(0);
});
it('updates job progress as files are processed', async () => {
const files = Array.from({ length: 5 }, (_, i) => ({
path: `file${i}.md`,
@@ -700,6 +752,60 @@ describe('IndexingPipeline', () => {
expect(version.indexed_at).not.toBeNull();
});
it('clones ancestor embeddings into the derived vec store for differential indexing', async () => {
const ancestorVersionId = insertVersion(db, { tag: 'v1.0.0', state: 'indexed' });
const targetVersionId = insertVersion(db, { tag: 'v1.1.0', state: 'pending' });
const vecStore = new SqliteVecStore(db);
const docId = crypto.randomUUID();
const snippetId = crypto.randomUUID();
const embedding = Float32Array.from([0.2, 0.4, 0.6]);
db.prepare(
`INSERT INTO documents (id, repository_id, version_id, file_path, checksum, indexed_at)
VALUES (?, '/test/repo', ?, 'README.md', 'ancestor-doc', ?)`
).run(docId, ancestorVersionId, now);
db.prepare(
`INSERT INTO snippets (id, document_id, repository_id, version_id, type, content, created_at)
VALUES (?, ?, '/test/repo', ?, 'info', 'ancestor snippet', ?)`
).run(snippetId, docId, ancestorVersionId, now);
db.prepare(
`INSERT INTO snippet_embeddings (snippet_id, profile_id, model, dimensions, embedding, created_at)
VALUES (?, 'local-default', 'test-model', 3, ?, ?)`
).run(snippetId, Buffer.from(embedding.buffer), now);
vecStore.upsertEmbedding('local-default', snippetId, embedding);
vi.spyOn(diffStrategy, 'buildDifferentialPlan').mockResolvedValue({
ancestorTag: 'v1.0.0',
ancestorVersionId,
changedPaths: new Set<string>(),
unchangedPaths: new Set<string>(['README.md'])
});
const pipeline = makePipeline({ files: [], totalFiles: 0 });
const job = makeJob('/test/repo', targetVersionId);
await pipeline.run(job as never);
const targetRows = db
.prepare(
`SELECT se.snippet_id, se.embedding
FROM snippet_embeddings se
INNER JOIN snippets s ON s.id = se.snippet_id
WHERE s.version_id = ?`
)
.all(targetVersionId) as Array<{ snippet_id: string; embedding: Buffer }>;
expect(targetRows).toHaveLength(1);
const matches = vecStore.queryNearestNeighbors(embedding, {
repositoryId: '/test/repo',
versionId: targetVersionId,
profileId: 'local-default',
limit: 5
});
expect(matches[0]?.snippetId).toBe(targetRows[0].snippet_id);
});
it('updates repository_versions state to error when pipeline throws and job has versionId', async () => {
const versionId = insertVersion(db, { tag: 'v1.0.0', state: 'pending' });
const errorCrawl = vi.fn().mockRejectedValue(new Error('crawl failed'));

View File

@@ -22,6 +22,7 @@ import type { EmbeddingService } from '$lib/server/embeddings/embedding.service.
import { RepositoryMapper } from '$lib/server/mappers/repository.mapper.js';
import { IndexingJob } from '$lib/server/models/indexing-job.js';
import { Repository, RepositoryEntity } from '$lib/server/models/repository.js';
import { SqliteVecStore } from '$lib/server/search/sqlite-vec.store.js';
import { resolveConfig, type ParsedConfig } from '$lib/server/config/config-parser.js';
import { parseFile } from '$lib/server/parser/index.js';
import { computeTrustScore } from '$lib/server/search/trust-score.js';
@@ -63,12 +64,16 @@ function sha256(content: string): string {
// ---------------------------------------------------------------------------
export class IndexingPipeline {
private readonly sqliteVecStore: SqliteVecStore;
constructor(
private readonly db: Database.Database,
private readonly githubCrawl: typeof GithubCrawlFn,
private readonly localCrawler: LocalCrawler,
private readonly embeddingService: EmbeddingService | null
) {}
) {
this.sqliteVecStore = new SqliteVecStore(db);
}
// -------------------------------------------------------------------------
// Public — run a job end to end
@@ -593,6 +598,12 @@ export class IndexingPipeline {
emb.embedding,
emb.created_at
);
this.sqliteVecStore.upsertEmbeddingBuffer(
emb.profile_id,
newSnippetId,
emb.embedding,
emb.dimensions
);
}
}
})();
@@ -623,6 +634,8 @@ export class IndexingPipeline {
);
this.db.transaction(() => {
this.sqliteVecStore.deleteEmbeddingsForDocumentIds(changedDocIds);
// Delete stale documents (cascade deletes their snippets via FK).
if (changedDocIds.length > 0) {
const placeholders = changedDocIds.map(() => '?').join(',');

View File

@@ -17,6 +17,54 @@ import type { WorkerPool } from './worker-pool.js';
const JOB_SELECT = `SELECT * FROM indexing_jobs`;
type JobStatusFilter = IndexingJob['status'] | Array<IndexingJob['status']>;
function escapeLikePattern(value: string): string {
return value.replaceAll('\\', '\\\\').replaceAll('%', '\\%').replaceAll('_', '\\_');
}
function isSpecificRepositoryId(repositoryId: string): boolean {
return repositoryId.split('/').filter(Boolean).length >= 2;
}
function normalizeStatuses(status?: JobStatusFilter): Array<IndexingJob['status']> {
if (!status) {
return [];
}
const statuses = Array.isArray(status) ? status : [status];
return [...new Set(statuses)];
}
function buildJobFilterQuery(options?: {
repositoryId?: string;
status?: JobStatusFilter;
}): { where: string; params: unknown[] } {
const conditions: string[] = [];
const params: unknown[] = [];
if (options?.repositoryId) {
if (isSpecificRepositoryId(options.repositoryId)) {
conditions.push('repository_id = ?');
params.push(options.repositoryId);
} else {
conditions.push(`(repository_id = ? OR repository_id LIKE ? ESCAPE '\\')`);
params.push(options.repositoryId, `${escapeLikePattern(options.repositoryId)}/%`);
}
}
const statuses = normalizeStatuses(options?.status);
if (statuses.length > 0) {
conditions.push(`status IN (${statuses.map(() => '?').join(', ')})`);
params.push(...statuses);
}
return {
where: conditions.length > 0 ? `WHERE ${conditions.join(' AND ')}` : '',
params
};
}
export class JobQueue {
private workerPool: WorkerPool | null = null;
@@ -144,23 +192,11 @@ export class JobQueue {
*/
listJobs(options?: {
repositoryId?: string;
status?: IndexingJob['status'];
status?: JobStatusFilter;
limit?: number;
}): IndexingJob[] {
const limit = Math.min(options?.limit ?? 20, 200);
const conditions: string[] = [];
const params: unknown[] = [];
if (options?.repositoryId) {
conditions.push('repository_id = ?');
params.push(options.repositoryId);
}
if (options?.status) {
conditions.push('status = ?');
params.push(options.status);
}
const where = conditions.length > 0 ? `WHERE ${conditions.join(' AND ')}` : '';
const { where, params } = buildJobFilterQuery(options);
const sql = `${JOB_SELECT} ${where} ORDER BY created_at DESC LIMIT ?`;
params.push(limit);
@@ -194,19 +230,7 @@ export class JobQueue {
* Count all jobs matching optional filters.
*/
countJobs(options?: { repositoryId?: string; status?: IndexingJob['status'] }): number {
const conditions: string[] = [];
const params: unknown[] = [];
if (options?.repositoryId) {
conditions.push('repository_id = ?');
params.push(options.repositoryId);
}
if (options?.status) {
conditions.push('status = ?');
params.push(options.status);
}
const where = conditions.length > 0 ? `WHERE ${conditions.join(' AND')}` : '';
const { where, params } = buildJobFilterQuery(options);
const sql = `SELECT COUNT(*) as n FROM indexing_jobs ${where}`;
const row = this.db.prepare<unknown[], { n: number }>(sql).get(...params);
return row?.n ?? 0;

View File

@@ -171,4 +171,25 @@ describe('ProgressBroadcaster', () => {
reader1.cancel();
reader2.cancel();
});
it('broadcastWorkerStatus sends worker-status events to global subscribers', async () => {
const broadcaster = new ProgressBroadcaster();
const stream = broadcaster.subscribeAll();
const reader = stream.getReader();
broadcaster.broadcastWorkerStatus({
concurrency: 2,
active: 1,
idle: 1,
workers: [{ index: 0, state: 'running', jobId: 'job-1', repositoryId: '/repo/1', versionId: null }]
});
const { value } = await reader.read();
const text = value as string;
expect(text).toContain('event: worker-status');
expect(text).toContain('"active":1');
reader.cancel();
});
});

View File

@@ -10,6 +10,7 @@ export class ProgressBroadcaster {
private allSubscribers = new Set<ReadableStreamDefaultController<string>>();
private lastEventCache = new Map<string, SSEEvent>();
private eventCounters = new Map<string, number>();
private globalEventCounter = 0;
subscribe(jobId: string): ReadableStream<string> {
return new ReadableStream({
@@ -135,6 +136,24 @@ export class ProgressBroadcaster {
}
}
broadcastWorkerStatus(data: object): void {
this.globalEventCounter += 1;
const event: SSEEvent = {
id: this.globalEventCounter,
event: 'worker-status',
data: JSON.stringify(data)
};
const sse = this.formatSSE(event);
for (const controller of this.allSubscribers) {
try {
controller.enqueue(sse);
} catch {
// Controller might be closed or errored
}
}
}
getLastEvent(jobId: string): SSEEvent | null {
return this.lastEventCache.get(jobId) ?? null;
}

View File

@@ -16,7 +16,6 @@ import { LocalCrawler } from '$lib/server/crawler/local.crawler.js';
import { IndexingPipeline } from './indexing.pipeline.js';
import { JobQueue } from './job-queue.js';
import { WorkerPool } from './worker-pool.js';
import type { ParseWorkerResponse } from './worker-types.js';
import { initBroadcaster } from './progress-broadcaster.js';
import type { ProgressBroadcaster } from './progress-broadcaster.js';
import path from 'node:path';
@@ -90,17 +89,28 @@ export function initializePipeline(
if (options?.dbPath) {
_broadcaster = initBroadcaster();
const getRepositoryIdForJob = (jobId: string): string => {
const row = db
.prepare<[string], { repository_id: string }>(
`SELECT repository_id FROM indexing_jobs WHERE id = ?`
)
.get(jobId);
return row?.repository_id ?? '';
};
// Resolve worker script paths relative to this file (build/workers/ directory)
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
const workerScript = path.join(__dirname, '../../../build/workers/worker-entry.mjs');
const embedWorkerScript = path.join(__dirname, '../../../build/workers/embed-worker-entry.mjs');
const writeWorkerScript = path.join(__dirname, '../../../build/workers/write-worker-entry.mjs');
try {
_pool = new WorkerPool({
concurrency: options.concurrency ?? 2,
workerScript,
embedWorkerScript,
writeWorkerScript,
dbPath: options.dbPath,
onProgress: (jobId, msg) => {
// Update DB with progress
@@ -112,7 +122,10 @@ export function initializePipeline(
// Broadcast progress event
if (_broadcaster) {
_broadcaster.broadcast(jobId, '', 'progress', msg);
_broadcaster.broadcast(jobId, getRepositoryIdForJob(jobId), 'job-progress', {
...msg,
status: 'running'
});
}
},
onJobDone: (jobId: string) => {
@@ -123,7 +136,10 @@ export function initializePipeline(
// Broadcast done event
if (_broadcaster) {
_broadcaster.broadcast(jobId, '', 'job-done', { jobId });
_broadcaster.broadcast(jobId, getRepositoryIdForJob(jobId), 'job-done', {
jobId,
status: 'done'
});
}
},
onJobFailed: (jobId: string, error: string) => {
@@ -134,7 +150,11 @@ export function initializePipeline(
// Broadcast failed event
if (_broadcaster) {
_broadcaster.broadcast(jobId, '', 'job-failed', { jobId, error });
_broadcaster.broadcast(jobId, getRepositoryIdForJob(jobId), 'job-failed', {
jobId,
status: 'failed',
error
});
}
},
onEmbedDone: (jobId: string) => {
@@ -142,6 +162,9 @@ export function initializePipeline(
},
onEmbedFailed: (jobId: string, error: string) => {
console.error('[WorkerPool] Embedding failed for job:', jobId, error);
},
onWorkerStatus: (status) => {
_broadcaster?.broadcastWorkerStatus(status);
}
});

View File

@@ -13,6 +13,11 @@ const db = new Database(dbPath);
db.pragma('journal_mode = WAL');
db.pragma('foreign_keys = ON');
db.pragma('busy_timeout = 5000');
db.pragma('synchronous = NORMAL');
db.pragma('cache_size = -65536');
db.pragma('temp_store = MEMORY');
db.pragma('mmap_size = 268435456');
db.pragma('wal_autocheckpoint = 1000');
const pipeline = new IndexingPipeline(db, githubCrawl, new LocalCrawler(), null);
let currentJobId: string | null = null;

View File

@@ -1,11 +1,19 @@
import { Worker } from 'node:worker_threads';
import { existsSync } from 'node:fs';
import type { ParseWorkerRequest, ParseWorkerResponse, EmbedWorkerRequest, EmbedWorkerResponse, WorkerInitData } from './worker-types.js';
import type {
ParseWorkerRequest,
ParseWorkerResponse,
EmbedWorkerRequest,
EmbedWorkerResponse,
WorkerInitData,
WriteWorkerResponse
} from './worker-types.js';
export interface WorkerPoolOptions {
concurrency: number;
workerScript: string;
embedWorkerScript: string;
writeWorkerScript?: string;
dbPath: string;
embeddingProfileId?: string;
onProgress: (jobId: string, msg: Extract<ParseWorkerResponse, { type: 'progress' }>) => void;
@@ -13,6 +21,22 @@ export interface WorkerPoolOptions {
onJobFailed: (jobId: string, error: string) => void;
onEmbedDone: (jobId: string) => void;
onEmbedFailed: (jobId: string, error: string) => void;
onWorkerStatus?: (status: WorkerPoolStatus) => void;
}
export interface WorkerStatusEntry {
index: number;
state: 'idle' | 'running';
jobId: string | null;
repositoryId: string | null;
versionId: string | null;
}
export interface WorkerPoolStatus {
concurrency: number;
active: number;
idle: number;
workers: WorkerStatusEntry[];
}
interface QueuedJob {
@@ -24,6 +48,7 @@ interface QueuedJob {
interface RunningJob {
jobId: string;
repositoryId: string;
versionId?: string | null;
}
interface EmbedQueuedJob {
@@ -36,10 +61,12 @@ export class WorkerPool {
private workers: Worker[] = [];
private idleWorkers: Worker[] = [];
private embedWorker: Worker | null = null;
private writeWorker: Worker | null = null;
private embedReady = false;
private writeReady = false;
private jobQueue: QueuedJob[] = [];
private runningJobs = new Map<Worker, RunningJob>();
private runningRepoIds = new Set<string>();
private runningJobKeys = new Set<string>();
private embedQueue: EmbedQueuedJob[] = [];
private options: WorkerPoolOptions;
private fallbackMode = false;
@@ -66,6 +93,12 @@ export class WorkerPool {
if (options.embeddingProfileId && existsSync(options.embedWorkerScript)) {
this.embedWorker = this.spawnEmbedWorker();
}
if (options.writeWorkerScript && existsSync(options.writeWorkerScript)) {
this.writeWorker = this.spawnWriteWorker(options.writeWorkerScript);
}
this.emitStatusChanged();
}
private spawnParseWorker(): Worker {
@@ -94,6 +127,22 @@ export class WorkerPool {
return worker;
}
private spawnWriteWorker(writeWorkerScript: string): Worker {
const worker = new Worker(writeWorkerScript, {
workerData: {
dbPath: this.options.dbPath
} satisfies WorkerInitData
});
worker.on('message', (msg: WriteWorkerResponse) => this.onWriteWorkerMessage(msg));
worker.on('exit', () => {
this.writeReady = false;
this.writeWorker = null;
});
return worker;
}
public enqueue(jobId: string, repositoryId: string, versionId?: string | null): void {
if (this.shuttingDown) {
console.warn('WorkerPool is shutting down, ignoring enqueue request');
@@ -109,10 +158,18 @@ export class WorkerPool {
this.dispatch();
}
private static jobKey(repositoryId: string, versionId?: string | null): string {
return `${repositoryId}:${versionId ?? ''}`;
}
private dispatch(): void {
let statusChanged = false;
while (this.idleWorkers.length > 0 && this.jobQueue.length > 0) {
// Find first job whose repositoryId is not currently running
const jobIdx = this.jobQueue.findIndex((j) => !this.runningRepoIds.has(j.repositoryId));
// Find first job whose (repositoryId, versionId) compound key is not currently running
const jobIdx = this.jobQueue.findIndex(
(j) => !this.runningJobKeys.has(WorkerPool.jobKey(j.repositoryId, j.versionId))
);
if (jobIdx === -1) {
// No eligible job found (all repos have running jobs)
@@ -122,12 +179,17 @@ export class WorkerPool {
const job = this.jobQueue.splice(jobIdx, 1)[0];
const worker = this.idleWorkers.pop()!;
this.runningJobs.set(worker, { jobId: job.jobId, repositoryId: job.repositoryId });
this.runningRepoIds.add(job.repositoryId);
this.runningJobs.set(worker, { jobId: job.jobId, repositoryId: job.repositoryId, versionId: job.versionId });
this.runningJobKeys.add(WorkerPool.jobKey(job.repositoryId, job.versionId));
statusChanged = true;
const msg: ParseWorkerRequest = { type: 'run', jobId: job.jobId };
worker.postMessage(msg);
}
if (statusChanged) {
this.emitStatusChanged();
}
}
private onWorkerMessage(worker: Worker, msg: ParseWorkerResponse): void {
@@ -137,15 +199,20 @@ export class WorkerPool {
const runningJob = this.runningJobs.get(worker);
if (runningJob) {
this.runningJobs.delete(worker);
this.runningRepoIds.delete(runningJob.repositoryId);
this.runningJobKeys.delete(WorkerPool.jobKey(runningJob.repositoryId, runningJob.versionId));
}
this.idleWorkers.push(worker);
this.options.onJobDone(msg.jobId);
this.emitStatusChanged();
// If embedding configured, enqueue embed request
if (this.embedWorker && this.options.embeddingProfileId) {
const runningJobData = runningJob || { jobId: msg.jobId, repositoryId: '' };
this.enqueueEmbed(msg.jobId, runningJobData.repositoryId, null);
const runningJobData = runningJob || { jobId: msg.jobId, repositoryId: '', versionId: null };
this.enqueueEmbed(
msg.jobId,
runningJobData.repositoryId,
runningJobData.versionId ?? null
);
}
this.dispatch();
@@ -153,10 +220,11 @@ export class WorkerPool {
const runningJob = this.runningJobs.get(worker);
if (runningJob) {
this.runningJobs.delete(worker);
this.runningRepoIds.delete(runningJob.repositoryId);
this.runningJobKeys.delete(WorkerPool.jobKey(runningJob.repositoryId, runningJob.versionId));
}
this.idleWorkers.push(worker);
this.options.onJobFailed(msg.jobId, msg.error);
this.emitStatusChanged();
this.dispatch();
}
}
@@ -176,13 +244,15 @@ export class WorkerPool {
const runningJob = this.runningJobs.get(worker);
if (runningJob && code !== 0) {
this.runningJobs.delete(worker);
this.runningRepoIds.delete(runningJob.repositoryId);
this.runningJobKeys.delete(WorkerPool.jobKey(runningJob.repositoryId, runningJob.versionId));
this.options.onJobFailed(runningJob.jobId, `Worker crashed with code ${code}`);
} else if (runningJob) {
this.runningJobs.delete(worker);
this.runningRepoIds.delete(runningJob.repositoryId);
this.runningJobKeys.delete(WorkerPool.jobKey(runningJob.repositoryId, runningJob.versionId));
}
this.emitStatusChanged();
// Remove from workers array
const workerIdx = this.workers.indexOf(worker);
if (workerIdx !== -1) {
@@ -212,6 +282,17 @@ export class WorkerPool {
}
}
private onWriteWorkerMessage(msg: WriteWorkerResponse): void {
if (msg.type === 'ready') {
this.writeReady = true;
return;
}
if (msg.type === 'write_error') {
console.error('[WorkerPool] Write worker failed for job:', msg.jobId, msg.error);
}
}
private processEmbedQueue(): void {
if (!this.embedWorker || !this.embedReady) {
return;
@@ -250,6 +331,7 @@ export class WorkerPool {
}
public setMaxConcurrency(n: number): void {
this.options.concurrency = n;
const current = this.workers.length;
if (n > current) {
@@ -274,6 +356,8 @@ export class WorkerPool {
}
}
}
this.emitStatusChanged();
}
public async shutdown(): Promise<void> {
@@ -300,6 +384,14 @@ export class WorkerPool {
}
}
if (this.writeWorker) {
try {
this.writeWorker.postMessage({ type: 'shutdown' });
} catch {
// Worker might already be exited
}
}
// Wait for workers to exit with timeout
const timeout = 5000;
const startTime = Date.now();
@@ -329,9 +421,41 @@ export class WorkerPool {
}
}
if (this.writeWorker) {
try {
this.writeWorker.terminate();
} catch {
// Already terminated
}
}
this.workers = [];
this.idleWorkers = [];
this.embedWorker = null;
this.writeWorker = null;
this.emitStatusChanged();
}
public getStatus(): WorkerPoolStatus {
return {
concurrency: this.options.concurrency,
active: this.runningJobs.size,
idle: this.idleWorkers.length,
workers: this.workers.map((worker, index) => {
const runningJob = this.runningJobs.get(worker);
return {
index,
state: runningJob ? 'running' : 'idle',
jobId: runningJob?.jobId ?? null,
repositoryId: runningJob?.repositoryId ?? null,
versionId: runningJob?.versionId ?? null
};
})
};
}
private emitStatusChanged(): void {
this.options.onWorkerStatus?.(this.getStatus());
}
public get isFallbackMode(): boolean {

View File

@@ -19,7 +19,61 @@ export type EmbedWorkerResponse =
| { type: 'embed-done'; jobId: string }
| { type: 'embed-failed'; jobId: string; error: string };
export type WriteWorkerRequest = WriteRequest | { type: 'shutdown' };
export type WriteWorkerResponse =
| { type: 'ready' }
| WriteAck
| WriteError;
export interface WorkerInitData {
dbPath: string;
embeddingProfileId?: string;
}
// Write worker message types (Phase 6)
export interface SerializedDocument {
id: string;
repositoryId: string;
versionId: string | null;
filePath: string;
title: string | null;
language: string | null;
tokenCount: number;
checksum: string;
indexedAt: number;
}
export interface SerializedSnippet {
id: string;
documentId: string;
repositoryId: string;
versionId: string | null;
type: 'code' | 'info';
title: string | null;
content: string;
language: string | null;
breadcrumb: string | null;
tokenCount: number;
createdAt: number;
}
export type WriteRequest = {
type: 'write';
jobId: string;
documents: SerializedDocument[];
snippets: SerializedSnippet[];
};
export type WriteAck = {
type: 'write_ack';
jobId: string;
documentCount: number;
snippetCount: number;
};
export type WriteError = {
type: 'write_error';
jobId: string;
error: string;
};

View File

@@ -0,0 +1,93 @@
import { workerData, parentPort } from 'node:worker_threads';
import Database from 'better-sqlite3';
import type {
SerializedDocument,
SerializedSnippet,
WorkerInitData,
WriteWorkerRequest,
WriteWorkerResponse
} from './worker-types.js';
const { dbPath } = workerData as WorkerInitData;
const db = new Database(dbPath);
db.pragma('journal_mode = WAL');
db.pragma('foreign_keys = ON');
db.pragma('busy_timeout = 5000');
db.pragma('synchronous = NORMAL');
db.pragma('cache_size = -65536');
db.pragma('temp_store = MEMORY');
db.pragma('mmap_size = 268435456');
db.pragma('wal_autocheckpoint = 1000');
const insertDocument = db.prepare(
`INSERT OR REPLACE INTO documents
(id, repository_id, version_id, file_path, title, language, token_count, checksum, indexed_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`
);
const insertSnippet = db.prepare(
`INSERT OR REPLACE INTO snippets
(id, document_id, repository_id, version_id, type, title, content, language, breadcrumb, token_count, created_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
);
const writeBatch = db.transaction((documents: SerializedDocument[], snippets: SerializedSnippet[]) => {
for (const document of documents) {
insertDocument.run(
document.id,
document.repositoryId,
document.versionId,
document.filePath,
document.title,
document.language,
document.tokenCount,
document.checksum,
document.indexedAt
);
}
for (const snippet of snippets) {
insertSnippet.run(
snippet.id,
snippet.documentId,
snippet.repositoryId,
snippet.versionId,
snippet.type,
snippet.title,
snippet.content,
snippet.language,
snippet.breadcrumb,
snippet.tokenCount,
snippet.createdAt
);
}
});
parentPort?.postMessage({ type: 'ready' } satisfies WriteWorkerResponse);
parentPort?.on('message', (msg: WriteWorkerRequest) => {
if (msg.type === 'shutdown') {
db.close();
process.exit(0);
}
if (msg.type !== 'write') {
return;
}
try {
writeBatch(msg.documents, msg.snippets);
parentPort?.postMessage({
type: 'write_ack',
jobId: msg.jobId,
documentCount: msg.documents.length,
snippetCount: msg.snippets.length
} satisfies WriteWorkerResponse);
} catch (error) {
parentPort?.postMessage({
type: 'write_error',
jobId: msg.jobId,
error: error instanceof Error ? error.message : String(error)
} satisfies WriteWorkerResponse);
}
});