feat(TRUEREF-0023): add sqlite-vec search pipeline
This commit is contained in:
@@ -21,6 +21,11 @@ const db = new Database(dbPath);
|
||||
db.pragma('journal_mode = WAL');
|
||||
db.pragma('foreign_keys = ON');
|
||||
db.pragma('busy_timeout = 5000');
|
||||
db.pragma('synchronous = NORMAL');
|
||||
db.pragma('cache_size = -65536');
|
||||
db.pragma('temp_store = MEMORY');
|
||||
db.pragma('mmap_size = 268435456');
|
||||
db.pragma('wal_autocheckpoint = 1000');
|
||||
|
||||
// Load the embedding profile from DB
|
||||
const rawProfile = db.prepare('SELECT * FROM embedding_profiles WHERE id = ?').get(embeddingProfileId);
|
||||
|
||||
@@ -13,6 +13,9 @@ import { JobQueue } from './job-queue.js';
|
||||
import { IndexingPipeline } from './indexing.pipeline.js';
|
||||
import { recoverStaleJobs } from './startup.js';
|
||||
import { EmbeddingService } from '$lib/server/embeddings/embedding.service.js';
|
||||
import { loadSqliteVec } from '$lib/server/db/sqlite-vec.js';
|
||||
import { SqliteVecStore } from '$lib/server/search/sqlite-vec.store.js';
|
||||
import { sqliteVecRowidTableName, sqliteVecTableName } from '$lib/server/db/sqlite-vec.js';
|
||||
import * as diffStrategy from './differential-strategy.js';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
@@ -22,6 +25,7 @@ import * as diffStrategy from './differential-strategy.js';
|
||||
function createTestDb(): Database.Database {
|
||||
const client = new Database(':memory:');
|
||||
client.pragma('foreign_keys = ON');
|
||||
loadSqliteVec(client);
|
||||
|
||||
const migrationsFolder = join(import.meta.dirname, '../db/migrations');
|
||||
for (const migrationFile of [
|
||||
@@ -29,7 +33,9 @@ function createTestDb(): Database.Database {
|
||||
'0001_quick_nighthawk.sql',
|
||||
'0002_silky_stellaris.sql',
|
||||
'0003_multiversion_config.sql',
|
||||
'0004_complete_sentry.sql'
|
||||
'0004_complete_sentry.sql',
|
||||
'0005_fix_stage_defaults.sql',
|
||||
'0006_yielding_centennial.sql'
|
||||
]) {
|
||||
const migrationSql = readFileSync(join(migrationsFolder, migrationFile), 'utf-8');
|
||||
|
||||
@@ -539,6 +545,52 @@ describe('IndexingPipeline', () => {
|
||||
expect(finalChecksum).toBe('sha-v2');
|
||||
});
|
||||
|
||||
it('removes derived vec rows when changed documents are replaced', async () => {
|
||||
const docId = crypto.randomUUID();
|
||||
const snippetId = crypto.randomUUID();
|
||||
const embedding = Float32Array.from([1, 0, 0]);
|
||||
const vecStore = new SqliteVecStore(db);
|
||||
|
||||
db.prepare(
|
||||
`INSERT INTO documents (id, repository_id, version_id, file_path, checksum, indexed_at)
|
||||
VALUES (?, '/test/repo', NULL, 'README.md', 'stale-doc', ?)`
|
||||
).run(docId, now);
|
||||
db.prepare(
|
||||
`INSERT INTO snippets (id, document_id, repository_id, version_id, type, content, created_at)
|
||||
VALUES (?, ?, '/test/repo', NULL, 'info', 'stale snippet', ?)`
|
||||
).run(snippetId, docId, now);
|
||||
db.prepare(
|
||||
`INSERT INTO snippet_embeddings (snippet_id, profile_id, model, dimensions, embedding, created_at)
|
||||
VALUES (?, 'local-default', 'test-model', 3, ?, ?)`
|
||||
).run(snippetId, Buffer.from(embedding.buffer), now);
|
||||
vecStore.upsertEmbedding('local-default', snippetId, embedding);
|
||||
|
||||
const pipeline = makePipeline({
|
||||
files: [
|
||||
{
|
||||
path: 'README.md',
|
||||
content: '# Updated\n\nFresh content.',
|
||||
sha: 'sha-fresh',
|
||||
language: 'markdown'
|
||||
}
|
||||
],
|
||||
totalFiles: 1
|
||||
});
|
||||
const job = makeJob();
|
||||
|
||||
await pipeline.run(job as never);
|
||||
|
||||
const vecTable = sqliteVecTableName('local-default');
|
||||
const rowidTable = sqliteVecRowidTableName('local-default');
|
||||
const vecCount = db.prepare(`SELECT COUNT(*) as n FROM "${vecTable}"`).get() as { n: number };
|
||||
const rowidCount = db.prepare(`SELECT COUNT(*) as n FROM "${rowidTable}"`).get() as {
|
||||
n: number;
|
||||
};
|
||||
|
||||
expect(vecCount.n).toBe(0);
|
||||
expect(rowidCount.n).toBe(0);
|
||||
});
|
||||
|
||||
it('updates job progress as files are processed', async () => {
|
||||
const files = Array.from({ length: 5 }, (_, i) => ({
|
||||
path: `file${i}.md`,
|
||||
@@ -700,6 +752,60 @@ describe('IndexingPipeline', () => {
|
||||
expect(version.indexed_at).not.toBeNull();
|
||||
});
|
||||
|
||||
it('clones ancestor embeddings into the derived vec store for differential indexing', async () => {
|
||||
const ancestorVersionId = insertVersion(db, { tag: 'v1.0.0', state: 'indexed' });
|
||||
const targetVersionId = insertVersion(db, { tag: 'v1.1.0', state: 'pending' });
|
||||
const vecStore = new SqliteVecStore(db);
|
||||
const docId = crypto.randomUUID();
|
||||
const snippetId = crypto.randomUUID();
|
||||
const embedding = Float32Array.from([0.2, 0.4, 0.6]);
|
||||
|
||||
db.prepare(
|
||||
`INSERT INTO documents (id, repository_id, version_id, file_path, checksum, indexed_at)
|
||||
VALUES (?, '/test/repo', ?, 'README.md', 'ancestor-doc', ?)`
|
||||
).run(docId, ancestorVersionId, now);
|
||||
db.prepare(
|
||||
`INSERT INTO snippets (id, document_id, repository_id, version_id, type, content, created_at)
|
||||
VALUES (?, ?, '/test/repo', ?, 'info', 'ancestor snippet', ?)`
|
||||
).run(snippetId, docId, ancestorVersionId, now);
|
||||
db.prepare(
|
||||
`INSERT INTO snippet_embeddings (snippet_id, profile_id, model, dimensions, embedding, created_at)
|
||||
VALUES (?, 'local-default', 'test-model', 3, ?, ?)`
|
||||
).run(snippetId, Buffer.from(embedding.buffer), now);
|
||||
vecStore.upsertEmbedding('local-default', snippetId, embedding);
|
||||
|
||||
vi.spyOn(diffStrategy, 'buildDifferentialPlan').mockResolvedValue({
|
||||
ancestorTag: 'v1.0.0',
|
||||
ancestorVersionId,
|
||||
changedPaths: new Set<string>(),
|
||||
unchangedPaths: new Set<string>(['README.md'])
|
||||
});
|
||||
|
||||
const pipeline = makePipeline({ files: [], totalFiles: 0 });
|
||||
const job = makeJob('/test/repo', targetVersionId);
|
||||
|
||||
await pipeline.run(job as never);
|
||||
|
||||
const targetRows = db
|
||||
.prepare(
|
||||
`SELECT se.snippet_id, se.embedding
|
||||
FROM snippet_embeddings se
|
||||
INNER JOIN snippets s ON s.id = se.snippet_id
|
||||
WHERE s.version_id = ?`
|
||||
)
|
||||
.all(targetVersionId) as Array<{ snippet_id: string; embedding: Buffer }>;
|
||||
|
||||
expect(targetRows).toHaveLength(1);
|
||||
const matches = vecStore.queryNearestNeighbors(embedding, {
|
||||
repositoryId: '/test/repo',
|
||||
versionId: targetVersionId,
|
||||
profileId: 'local-default',
|
||||
limit: 5
|
||||
});
|
||||
|
||||
expect(matches[0]?.snippetId).toBe(targetRows[0].snippet_id);
|
||||
});
|
||||
|
||||
it('updates repository_versions state to error when pipeline throws and job has versionId', async () => {
|
||||
const versionId = insertVersion(db, { tag: 'v1.0.0', state: 'pending' });
|
||||
const errorCrawl = vi.fn().mockRejectedValue(new Error('crawl failed'));
|
||||
|
||||
@@ -22,6 +22,7 @@ import type { EmbeddingService } from '$lib/server/embeddings/embedding.service.
|
||||
import { RepositoryMapper } from '$lib/server/mappers/repository.mapper.js';
|
||||
import { IndexingJob } from '$lib/server/models/indexing-job.js';
|
||||
import { Repository, RepositoryEntity } from '$lib/server/models/repository.js';
|
||||
import { SqliteVecStore } from '$lib/server/search/sqlite-vec.store.js';
|
||||
import { resolveConfig, type ParsedConfig } from '$lib/server/config/config-parser.js';
|
||||
import { parseFile } from '$lib/server/parser/index.js';
|
||||
import { computeTrustScore } from '$lib/server/search/trust-score.js';
|
||||
@@ -63,12 +64,16 @@ function sha256(content: string): string {
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
export class IndexingPipeline {
|
||||
private readonly sqliteVecStore: SqliteVecStore;
|
||||
|
||||
constructor(
|
||||
private readonly db: Database.Database,
|
||||
private readonly githubCrawl: typeof GithubCrawlFn,
|
||||
private readonly localCrawler: LocalCrawler,
|
||||
private readonly embeddingService: EmbeddingService | null
|
||||
) {}
|
||||
) {
|
||||
this.sqliteVecStore = new SqliteVecStore(db);
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// Public — run a job end to end
|
||||
@@ -593,6 +598,12 @@ export class IndexingPipeline {
|
||||
emb.embedding,
|
||||
emb.created_at
|
||||
);
|
||||
this.sqliteVecStore.upsertEmbeddingBuffer(
|
||||
emb.profile_id,
|
||||
newSnippetId,
|
||||
emb.embedding,
|
||||
emb.dimensions
|
||||
);
|
||||
}
|
||||
}
|
||||
})();
|
||||
@@ -623,6 +634,8 @@ export class IndexingPipeline {
|
||||
);
|
||||
|
||||
this.db.transaction(() => {
|
||||
this.sqliteVecStore.deleteEmbeddingsForDocumentIds(changedDocIds);
|
||||
|
||||
// Delete stale documents (cascade deletes their snippets via FK).
|
||||
if (changedDocIds.length > 0) {
|
||||
const placeholders = changedDocIds.map(() => '?').join(',');
|
||||
|
||||
@@ -17,6 +17,54 @@ import type { WorkerPool } from './worker-pool.js';
|
||||
|
||||
const JOB_SELECT = `SELECT * FROM indexing_jobs`;
|
||||
|
||||
type JobStatusFilter = IndexingJob['status'] | Array<IndexingJob['status']>;
|
||||
|
||||
function escapeLikePattern(value: string): string {
|
||||
return value.replaceAll('\\', '\\\\').replaceAll('%', '\\%').replaceAll('_', '\\_');
|
||||
}
|
||||
|
||||
function isSpecificRepositoryId(repositoryId: string): boolean {
|
||||
return repositoryId.split('/').filter(Boolean).length >= 2;
|
||||
}
|
||||
|
||||
function normalizeStatuses(status?: JobStatusFilter): Array<IndexingJob['status']> {
|
||||
if (!status) {
|
||||
return [];
|
||||
}
|
||||
|
||||
const statuses = Array.isArray(status) ? status : [status];
|
||||
return [...new Set(statuses)];
|
||||
}
|
||||
|
||||
function buildJobFilterQuery(options?: {
|
||||
repositoryId?: string;
|
||||
status?: JobStatusFilter;
|
||||
}): { where: string; params: unknown[] } {
|
||||
const conditions: string[] = [];
|
||||
const params: unknown[] = [];
|
||||
|
||||
if (options?.repositoryId) {
|
||||
if (isSpecificRepositoryId(options.repositoryId)) {
|
||||
conditions.push('repository_id = ?');
|
||||
params.push(options.repositoryId);
|
||||
} else {
|
||||
conditions.push(`(repository_id = ? OR repository_id LIKE ? ESCAPE '\\')`);
|
||||
params.push(options.repositoryId, `${escapeLikePattern(options.repositoryId)}/%`);
|
||||
}
|
||||
}
|
||||
|
||||
const statuses = normalizeStatuses(options?.status);
|
||||
if (statuses.length > 0) {
|
||||
conditions.push(`status IN (${statuses.map(() => '?').join(', ')})`);
|
||||
params.push(...statuses);
|
||||
}
|
||||
|
||||
return {
|
||||
where: conditions.length > 0 ? `WHERE ${conditions.join(' AND ')}` : '',
|
||||
params
|
||||
};
|
||||
}
|
||||
|
||||
export class JobQueue {
|
||||
private workerPool: WorkerPool | null = null;
|
||||
|
||||
@@ -144,23 +192,11 @@ export class JobQueue {
|
||||
*/
|
||||
listJobs(options?: {
|
||||
repositoryId?: string;
|
||||
status?: IndexingJob['status'];
|
||||
status?: JobStatusFilter;
|
||||
limit?: number;
|
||||
}): IndexingJob[] {
|
||||
const limit = Math.min(options?.limit ?? 20, 200);
|
||||
const conditions: string[] = [];
|
||||
const params: unknown[] = [];
|
||||
|
||||
if (options?.repositoryId) {
|
||||
conditions.push('repository_id = ?');
|
||||
params.push(options.repositoryId);
|
||||
}
|
||||
if (options?.status) {
|
||||
conditions.push('status = ?');
|
||||
params.push(options.status);
|
||||
}
|
||||
|
||||
const where = conditions.length > 0 ? `WHERE ${conditions.join(' AND ')}` : '';
|
||||
const { where, params } = buildJobFilterQuery(options);
|
||||
const sql = `${JOB_SELECT} ${where} ORDER BY created_at DESC LIMIT ?`;
|
||||
params.push(limit);
|
||||
|
||||
@@ -194,19 +230,7 @@ export class JobQueue {
|
||||
* Count all jobs matching optional filters.
|
||||
*/
|
||||
countJobs(options?: { repositoryId?: string; status?: IndexingJob['status'] }): number {
|
||||
const conditions: string[] = [];
|
||||
const params: unknown[] = [];
|
||||
|
||||
if (options?.repositoryId) {
|
||||
conditions.push('repository_id = ?');
|
||||
params.push(options.repositoryId);
|
||||
}
|
||||
if (options?.status) {
|
||||
conditions.push('status = ?');
|
||||
params.push(options.status);
|
||||
}
|
||||
|
||||
const where = conditions.length > 0 ? `WHERE ${conditions.join(' AND')}` : '';
|
||||
const { where, params } = buildJobFilterQuery(options);
|
||||
const sql = `SELECT COUNT(*) as n FROM indexing_jobs ${where}`;
|
||||
const row = this.db.prepare<unknown[], { n: number }>(sql).get(...params);
|
||||
return row?.n ?? 0;
|
||||
|
||||
@@ -171,4 +171,25 @@ describe('ProgressBroadcaster', () => {
|
||||
reader1.cancel();
|
||||
reader2.cancel();
|
||||
});
|
||||
|
||||
it('broadcastWorkerStatus sends worker-status events to global subscribers', async () => {
|
||||
const broadcaster = new ProgressBroadcaster();
|
||||
const stream = broadcaster.subscribeAll();
|
||||
const reader = stream.getReader();
|
||||
|
||||
broadcaster.broadcastWorkerStatus({
|
||||
concurrency: 2,
|
||||
active: 1,
|
||||
idle: 1,
|
||||
workers: [{ index: 0, state: 'running', jobId: 'job-1', repositoryId: '/repo/1', versionId: null }]
|
||||
});
|
||||
|
||||
const { value } = await reader.read();
|
||||
const text = value as string;
|
||||
|
||||
expect(text).toContain('event: worker-status');
|
||||
expect(text).toContain('"active":1');
|
||||
|
||||
reader.cancel();
|
||||
});
|
||||
});
|
||||
|
||||
@@ -10,6 +10,7 @@ export class ProgressBroadcaster {
|
||||
private allSubscribers = new Set<ReadableStreamDefaultController<string>>();
|
||||
private lastEventCache = new Map<string, SSEEvent>();
|
||||
private eventCounters = new Map<string, number>();
|
||||
private globalEventCounter = 0;
|
||||
|
||||
subscribe(jobId: string): ReadableStream<string> {
|
||||
return new ReadableStream({
|
||||
@@ -135,6 +136,24 @@ export class ProgressBroadcaster {
|
||||
}
|
||||
}
|
||||
|
||||
broadcastWorkerStatus(data: object): void {
|
||||
this.globalEventCounter += 1;
|
||||
const event: SSEEvent = {
|
||||
id: this.globalEventCounter,
|
||||
event: 'worker-status',
|
||||
data: JSON.stringify(data)
|
||||
};
|
||||
const sse = this.formatSSE(event);
|
||||
|
||||
for (const controller of this.allSubscribers) {
|
||||
try {
|
||||
controller.enqueue(sse);
|
||||
} catch {
|
||||
// Controller might be closed or errored
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
getLastEvent(jobId: string): SSEEvent | null {
|
||||
return this.lastEventCache.get(jobId) ?? null;
|
||||
}
|
||||
|
||||
@@ -16,7 +16,6 @@ import { LocalCrawler } from '$lib/server/crawler/local.crawler.js';
|
||||
import { IndexingPipeline } from './indexing.pipeline.js';
|
||||
import { JobQueue } from './job-queue.js';
|
||||
import { WorkerPool } from './worker-pool.js';
|
||||
import type { ParseWorkerResponse } from './worker-types.js';
|
||||
import { initBroadcaster } from './progress-broadcaster.js';
|
||||
import type { ProgressBroadcaster } from './progress-broadcaster.js';
|
||||
import path from 'node:path';
|
||||
@@ -90,17 +89,28 @@ export function initializePipeline(
|
||||
if (options?.dbPath) {
|
||||
_broadcaster = initBroadcaster();
|
||||
|
||||
const getRepositoryIdForJob = (jobId: string): string => {
|
||||
const row = db
|
||||
.prepare<[string], { repository_id: string }>(
|
||||
`SELECT repository_id FROM indexing_jobs WHERE id = ?`
|
||||
)
|
||||
.get(jobId);
|
||||
return row?.repository_id ?? '';
|
||||
};
|
||||
|
||||
// Resolve worker script paths relative to this file (build/workers/ directory)
|
||||
const __filename = fileURLToPath(import.meta.url);
|
||||
const __dirname = path.dirname(__filename);
|
||||
const workerScript = path.join(__dirname, '../../../build/workers/worker-entry.mjs');
|
||||
const embedWorkerScript = path.join(__dirname, '../../../build/workers/embed-worker-entry.mjs');
|
||||
const writeWorkerScript = path.join(__dirname, '../../../build/workers/write-worker-entry.mjs');
|
||||
|
||||
try {
|
||||
_pool = new WorkerPool({
|
||||
concurrency: options.concurrency ?? 2,
|
||||
workerScript,
|
||||
embedWorkerScript,
|
||||
writeWorkerScript,
|
||||
dbPath: options.dbPath,
|
||||
onProgress: (jobId, msg) => {
|
||||
// Update DB with progress
|
||||
@@ -112,7 +122,10 @@ export function initializePipeline(
|
||||
|
||||
// Broadcast progress event
|
||||
if (_broadcaster) {
|
||||
_broadcaster.broadcast(jobId, '', 'progress', msg);
|
||||
_broadcaster.broadcast(jobId, getRepositoryIdForJob(jobId), 'job-progress', {
|
||||
...msg,
|
||||
status: 'running'
|
||||
});
|
||||
}
|
||||
},
|
||||
onJobDone: (jobId: string) => {
|
||||
@@ -123,7 +136,10 @@ export function initializePipeline(
|
||||
|
||||
// Broadcast done event
|
||||
if (_broadcaster) {
|
||||
_broadcaster.broadcast(jobId, '', 'job-done', { jobId });
|
||||
_broadcaster.broadcast(jobId, getRepositoryIdForJob(jobId), 'job-done', {
|
||||
jobId,
|
||||
status: 'done'
|
||||
});
|
||||
}
|
||||
},
|
||||
onJobFailed: (jobId: string, error: string) => {
|
||||
@@ -134,7 +150,11 @@ export function initializePipeline(
|
||||
|
||||
// Broadcast failed event
|
||||
if (_broadcaster) {
|
||||
_broadcaster.broadcast(jobId, '', 'job-failed', { jobId, error });
|
||||
_broadcaster.broadcast(jobId, getRepositoryIdForJob(jobId), 'job-failed', {
|
||||
jobId,
|
||||
status: 'failed',
|
||||
error
|
||||
});
|
||||
}
|
||||
},
|
||||
onEmbedDone: (jobId: string) => {
|
||||
@@ -142,6 +162,9 @@ export function initializePipeline(
|
||||
},
|
||||
onEmbedFailed: (jobId: string, error: string) => {
|
||||
console.error('[WorkerPool] Embedding failed for job:', jobId, error);
|
||||
},
|
||||
onWorkerStatus: (status) => {
|
||||
_broadcaster?.broadcastWorkerStatus(status);
|
||||
}
|
||||
});
|
||||
|
||||
|
||||
@@ -13,6 +13,11 @@ const db = new Database(dbPath);
|
||||
db.pragma('journal_mode = WAL');
|
||||
db.pragma('foreign_keys = ON');
|
||||
db.pragma('busy_timeout = 5000');
|
||||
db.pragma('synchronous = NORMAL');
|
||||
db.pragma('cache_size = -65536');
|
||||
db.pragma('temp_store = MEMORY');
|
||||
db.pragma('mmap_size = 268435456');
|
||||
db.pragma('wal_autocheckpoint = 1000');
|
||||
|
||||
const pipeline = new IndexingPipeline(db, githubCrawl, new LocalCrawler(), null);
|
||||
let currentJobId: string | null = null;
|
||||
|
||||
@@ -1,11 +1,19 @@
|
||||
import { Worker } from 'node:worker_threads';
|
||||
import { existsSync } from 'node:fs';
|
||||
import type { ParseWorkerRequest, ParseWorkerResponse, EmbedWorkerRequest, EmbedWorkerResponse, WorkerInitData } from './worker-types.js';
|
||||
import type {
|
||||
ParseWorkerRequest,
|
||||
ParseWorkerResponse,
|
||||
EmbedWorkerRequest,
|
||||
EmbedWorkerResponse,
|
||||
WorkerInitData,
|
||||
WriteWorkerResponse
|
||||
} from './worker-types.js';
|
||||
|
||||
export interface WorkerPoolOptions {
|
||||
concurrency: number;
|
||||
workerScript: string;
|
||||
embedWorkerScript: string;
|
||||
writeWorkerScript?: string;
|
||||
dbPath: string;
|
||||
embeddingProfileId?: string;
|
||||
onProgress: (jobId: string, msg: Extract<ParseWorkerResponse, { type: 'progress' }>) => void;
|
||||
@@ -13,6 +21,22 @@ export interface WorkerPoolOptions {
|
||||
onJobFailed: (jobId: string, error: string) => void;
|
||||
onEmbedDone: (jobId: string) => void;
|
||||
onEmbedFailed: (jobId: string, error: string) => void;
|
||||
onWorkerStatus?: (status: WorkerPoolStatus) => void;
|
||||
}
|
||||
|
||||
export interface WorkerStatusEntry {
|
||||
index: number;
|
||||
state: 'idle' | 'running';
|
||||
jobId: string | null;
|
||||
repositoryId: string | null;
|
||||
versionId: string | null;
|
||||
}
|
||||
|
||||
export interface WorkerPoolStatus {
|
||||
concurrency: number;
|
||||
active: number;
|
||||
idle: number;
|
||||
workers: WorkerStatusEntry[];
|
||||
}
|
||||
|
||||
interface QueuedJob {
|
||||
@@ -24,6 +48,7 @@ interface QueuedJob {
|
||||
interface RunningJob {
|
||||
jobId: string;
|
||||
repositoryId: string;
|
||||
versionId?: string | null;
|
||||
}
|
||||
|
||||
interface EmbedQueuedJob {
|
||||
@@ -36,10 +61,12 @@ export class WorkerPool {
|
||||
private workers: Worker[] = [];
|
||||
private idleWorkers: Worker[] = [];
|
||||
private embedWorker: Worker | null = null;
|
||||
private writeWorker: Worker | null = null;
|
||||
private embedReady = false;
|
||||
private writeReady = false;
|
||||
private jobQueue: QueuedJob[] = [];
|
||||
private runningJobs = new Map<Worker, RunningJob>();
|
||||
private runningRepoIds = new Set<string>();
|
||||
private runningJobKeys = new Set<string>();
|
||||
private embedQueue: EmbedQueuedJob[] = [];
|
||||
private options: WorkerPoolOptions;
|
||||
private fallbackMode = false;
|
||||
@@ -66,6 +93,12 @@ export class WorkerPool {
|
||||
if (options.embeddingProfileId && existsSync(options.embedWorkerScript)) {
|
||||
this.embedWorker = this.spawnEmbedWorker();
|
||||
}
|
||||
|
||||
if (options.writeWorkerScript && existsSync(options.writeWorkerScript)) {
|
||||
this.writeWorker = this.spawnWriteWorker(options.writeWorkerScript);
|
||||
}
|
||||
|
||||
this.emitStatusChanged();
|
||||
}
|
||||
|
||||
private spawnParseWorker(): Worker {
|
||||
@@ -94,6 +127,22 @@ export class WorkerPool {
|
||||
return worker;
|
||||
}
|
||||
|
||||
private spawnWriteWorker(writeWorkerScript: string): Worker {
|
||||
const worker = new Worker(writeWorkerScript, {
|
||||
workerData: {
|
||||
dbPath: this.options.dbPath
|
||||
} satisfies WorkerInitData
|
||||
});
|
||||
|
||||
worker.on('message', (msg: WriteWorkerResponse) => this.onWriteWorkerMessage(msg));
|
||||
worker.on('exit', () => {
|
||||
this.writeReady = false;
|
||||
this.writeWorker = null;
|
||||
});
|
||||
|
||||
return worker;
|
||||
}
|
||||
|
||||
public enqueue(jobId: string, repositoryId: string, versionId?: string | null): void {
|
||||
if (this.shuttingDown) {
|
||||
console.warn('WorkerPool is shutting down, ignoring enqueue request');
|
||||
@@ -109,10 +158,18 @@ export class WorkerPool {
|
||||
this.dispatch();
|
||||
}
|
||||
|
||||
private static jobKey(repositoryId: string, versionId?: string | null): string {
|
||||
return `${repositoryId}:${versionId ?? ''}`;
|
||||
}
|
||||
|
||||
private dispatch(): void {
|
||||
let statusChanged = false;
|
||||
|
||||
while (this.idleWorkers.length > 0 && this.jobQueue.length > 0) {
|
||||
// Find first job whose repositoryId is not currently running
|
||||
const jobIdx = this.jobQueue.findIndex((j) => !this.runningRepoIds.has(j.repositoryId));
|
||||
// Find first job whose (repositoryId, versionId) compound key is not currently running
|
||||
const jobIdx = this.jobQueue.findIndex(
|
||||
(j) => !this.runningJobKeys.has(WorkerPool.jobKey(j.repositoryId, j.versionId))
|
||||
);
|
||||
|
||||
if (jobIdx === -1) {
|
||||
// No eligible job found (all repos have running jobs)
|
||||
@@ -122,12 +179,17 @@ export class WorkerPool {
|
||||
const job = this.jobQueue.splice(jobIdx, 1)[0];
|
||||
const worker = this.idleWorkers.pop()!;
|
||||
|
||||
this.runningJobs.set(worker, { jobId: job.jobId, repositoryId: job.repositoryId });
|
||||
this.runningRepoIds.add(job.repositoryId);
|
||||
this.runningJobs.set(worker, { jobId: job.jobId, repositoryId: job.repositoryId, versionId: job.versionId });
|
||||
this.runningJobKeys.add(WorkerPool.jobKey(job.repositoryId, job.versionId));
|
||||
statusChanged = true;
|
||||
|
||||
const msg: ParseWorkerRequest = { type: 'run', jobId: job.jobId };
|
||||
worker.postMessage(msg);
|
||||
}
|
||||
|
||||
if (statusChanged) {
|
||||
this.emitStatusChanged();
|
||||
}
|
||||
}
|
||||
|
||||
private onWorkerMessage(worker: Worker, msg: ParseWorkerResponse): void {
|
||||
@@ -137,15 +199,20 @@ export class WorkerPool {
|
||||
const runningJob = this.runningJobs.get(worker);
|
||||
if (runningJob) {
|
||||
this.runningJobs.delete(worker);
|
||||
this.runningRepoIds.delete(runningJob.repositoryId);
|
||||
this.runningJobKeys.delete(WorkerPool.jobKey(runningJob.repositoryId, runningJob.versionId));
|
||||
}
|
||||
this.idleWorkers.push(worker);
|
||||
this.options.onJobDone(msg.jobId);
|
||||
this.emitStatusChanged();
|
||||
|
||||
// If embedding configured, enqueue embed request
|
||||
if (this.embedWorker && this.options.embeddingProfileId) {
|
||||
const runningJobData = runningJob || { jobId: msg.jobId, repositoryId: '' };
|
||||
this.enqueueEmbed(msg.jobId, runningJobData.repositoryId, null);
|
||||
const runningJobData = runningJob || { jobId: msg.jobId, repositoryId: '', versionId: null };
|
||||
this.enqueueEmbed(
|
||||
msg.jobId,
|
||||
runningJobData.repositoryId,
|
||||
runningJobData.versionId ?? null
|
||||
);
|
||||
}
|
||||
|
||||
this.dispatch();
|
||||
@@ -153,10 +220,11 @@ export class WorkerPool {
|
||||
const runningJob = this.runningJobs.get(worker);
|
||||
if (runningJob) {
|
||||
this.runningJobs.delete(worker);
|
||||
this.runningRepoIds.delete(runningJob.repositoryId);
|
||||
this.runningJobKeys.delete(WorkerPool.jobKey(runningJob.repositoryId, runningJob.versionId));
|
||||
}
|
||||
this.idleWorkers.push(worker);
|
||||
this.options.onJobFailed(msg.jobId, msg.error);
|
||||
this.emitStatusChanged();
|
||||
this.dispatch();
|
||||
}
|
||||
}
|
||||
@@ -176,13 +244,15 @@ export class WorkerPool {
|
||||
const runningJob = this.runningJobs.get(worker);
|
||||
if (runningJob && code !== 0) {
|
||||
this.runningJobs.delete(worker);
|
||||
this.runningRepoIds.delete(runningJob.repositoryId);
|
||||
this.runningJobKeys.delete(WorkerPool.jobKey(runningJob.repositoryId, runningJob.versionId));
|
||||
this.options.onJobFailed(runningJob.jobId, `Worker crashed with code ${code}`);
|
||||
} else if (runningJob) {
|
||||
this.runningJobs.delete(worker);
|
||||
this.runningRepoIds.delete(runningJob.repositoryId);
|
||||
this.runningJobKeys.delete(WorkerPool.jobKey(runningJob.repositoryId, runningJob.versionId));
|
||||
}
|
||||
|
||||
this.emitStatusChanged();
|
||||
|
||||
// Remove from workers array
|
||||
const workerIdx = this.workers.indexOf(worker);
|
||||
if (workerIdx !== -1) {
|
||||
@@ -212,6 +282,17 @@ export class WorkerPool {
|
||||
}
|
||||
}
|
||||
|
||||
private onWriteWorkerMessage(msg: WriteWorkerResponse): void {
|
||||
if (msg.type === 'ready') {
|
||||
this.writeReady = true;
|
||||
return;
|
||||
}
|
||||
|
||||
if (msg.type === 'write_error') {
|
||||
console.error('[WorkerPool] Write worker failed for job:', msg.jobId, msg.error);
|
||||
}
|
||||
}
|
||||
|
||||
private processEmbedQueue(): void {
|
||||
if (!this.embedWorker || !this.embedReady) {
|
||||
return;
|
||||
@@ -250,6 +331,7 @@ export class WorkerPool {
|
||||
}
|
||||
|
||||
public setMaxConcurrency(n: number): void {
|
||||
this.options.concurrency = n;
|
||||
const current = this.workers.length;
|
||||
|
||||
if (n > current) {
|
||||
@@ -274,6 +356,8 @@ export class WorkerPool {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
this.emitStatusChanged();
|
||||
}
|
||||
|
||||
public async shutdown(): Promise<void> {
|
||||
@@ -300,6 +384,14 @@ export class WorkerPool {
|
||||
}
|
||||
}
|
||||
|
||||
if (this.writeWorker) {
|
||||
try {
|
||||
this.writeWorker.postMessage({ type: 'shutdown' });
|
||||
} catch {
|
||||
// Worker might already be exited
|
||||
}
|
||||
}
|
||||
|
||||
// Wait for workers to exit with timeout
|
||||
const timeout = 5000;
|
||||
const startTime = Date.now();
|
||||
@@ -329,9 +421,41 @@ export class WorkerPool {
|
||||
}
|
||||
}
|
||||
|
||||
if (this.writeWorker) {
|
||||
try {
|
||||
this.writeWorker.terminate();
|
||||
} catch {
|
||||
// Already terminated
|
||||
}
|
||||
}
|
||||
|
||||
this.workers = [];
|
||||
this.idleWorkers = [];
|
||||
this.embedWorker = null;
|
||||
this.writeWorker = null;
|
||||
this.emitStatusChanged();
|
||||
}
|
||||
|
||||
public getStatus(): WorkerPoolStatus {
|
||||
return {
|
||||
concurrency: this.options.concurrency,
|
||||
active: this.runningJobs.size,
|
||||
idle: this.idleWorkers.length,
|
||||
workers: this.workers.map((worker, index) => {
|
||||
const runningJob = this.runningJobs.get(worker);
|
||||
return {
|
||||
index,
|
||||
state: runningJob ? 'running' : 'idle',
|
||||
jobId: runningJob?.jobId ?? null,
|
||||
repositoryId: runningJob?.repositoryId ?? null,
|
||||
versionId: runningJob?.versionId ?? null
|
||||
};
|
||||
})
|
||||
};
|
||||
}
|
||||
|
||||
private emitStatusChanged(): void {
|
||||
this.options.onWorkerStatus?.(this.getStatus());
|
||||
}
|
||||
|
||||
public get isFallbackMode(): boolean {
|
||||
|
||||
@@ -19,7 +19,61 @@ export type EmbedWorkerResponse =
|
||||
| { type: 'embed-done'; jobId: string }
|
||||
| { type: 'embed-failed'; jobId: string; error: string };
|
||||
|
||||
export type WriteWorkerRequest = WriteRequest | { type: 'shutdown' };
|
||||
|
||||
export type WriteWorkerResponse =
|
||||
| { type: 'ready' }
|
||||
| WriteAck
|
||||
| WriteError;
|
||||
|
||||
export interface WorkerInitData {
|
||||
dbPath: string;
|
||||
embeddingProfileId?: string;
|
||||
}
|
||||
|
||||
// Write worker message types (Phase 6)
|
||||
export interface SerializedDocument {
|
||||
id: string;
|
||||
repositoryId: string;
|
||||
versionId: string | null;
|
||||
filePath: string;
|
||||
title: string | null;
|
||||
language: string | null;
|
||||
tokenCount: number;
|
||||
checksum: string;
|
||||
indexedAt: number;
|
||||
}
|
||||
|
||||
export interface SerializedSnippet {
|
||||
id: string;
|
||||
documentId: string;
|
||||
repositoryId: string;
|
||||
versionId: string | null;
|
||||
type: 'code' | 'info';
|
||||
title: string | null;
|
||||
content: string;
|
||||
language: string | null;
|
||||
breadcrumb: string | null;
|
||||
tokenCount: number;
|
||||
createdAt: number;
|
||||
}
|
||||
|
||||
export type WriteRequest = {
|
||||
type: 'write';
|
||||
jobId: string;
|
||||
documents: SerializedDocument[];
|
||||
snippets: SerializedSnippet[];
|
||||
};
|
||||
|
||||
export type WriteAck = {
|
||||
type: 'write_ack';
|
||||
jobId: string;
|
||||
documentCount: number;
|
||||
snippetCount: number;
|
||||
};
|
||||
|
||||
export type WriteError = {
|
||||
type: 'write_error';
|
||||
jobId: string;
|
||||
error: string;
|
||||
};
|
||||
|
||||
93
src/lib/server/pipeline/write-worker-entry.ts
Normal file
93
src/lib/server/pipeline/write-worker-entry.ts
Normal file
@@ -0,0 +1,93 @@
|
||||
import { workerData, parentPort } from 'node:worker_threads';
|
||||
import Database from 'better-sqlite3';
|
||||
import type {
|
||||
SerializedDocument,
|
||||
SerializedSnippet,
|
||||
WorkerInitData,
|
||||
WriteWorkerRequest,
|
||||
WriteWorkerResponse
|
||||
} from './worker-types.js';
|
||||
|
||||
const { dbPath } = workerData as WorkerInitData;
|
||||
const db = new Database(dbPath);
|
||||
db.pragma('journal_mode = WAL');
|
||||
db.pragma('foreign_keys = ON');
|
||||
db.pragma('busy_timeout = 5000');
|
||||
db.pragma('synchronous = NORMAL');
|
||||
db.pragma('cache_size = -65536');
|
||||
db.pragma('temp_store = MEMORY');
|
||||
db.pragma('mmap_size = 268435456');
|
||||
db.pragma('wal_autocheckpoint = 1000');
|
||||
|
||||
const insertDocument = db.prepare(
|
||||
`INSERT OR REPLACE INTO documents
|
||||
(id, repository_id, version_id, file_path, title, language, token_count, checksum, indexed_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`
|
||||
);
|
||||
|
||||
const insertSnippet = db.prepare(
|
||||
`INSERT OR REPLACE INTO snippets
|
||||
(id, document_id, repository_id, version_id, type, title, content, language, breadcrumb, token_count, created_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
|
||||
);
|
||||
|
||||
const writeBatch = db.transaction((documents: SerializedDocument[], snippets: SerializedSnippet[]) => {
|
||||
for (const document of documents) {
|
||||
insertDocument.run(
|
||||
document.id,
|
||||
document.repositoryId,
|
||||
document.versionId,
|
||||
document.filePath,
|
||||
document.title,
|
||||
document.language,
|
||||
document.tokenCount,
|
||||
document.checksum,
|
||||
document.indexedAt
|
||||
);
|
||||
}
|
||||
|
||||
for (const snippet of snippets) {
|
||||
insertSnippet.run(
|
||||
snippet.id,
|
||||
snippet.documentId,
|
||||
snippet.repositoryId,
|
||||
snippet.versionId,
|
||||
snippet.type,
|
||||
snippet.title,
|
||||
snippet.content,
|
||||
snippet.language,
|
||||
snippet.breadcrumb,
|
||||
snippet.tokenCount,
|
||||
snippet.createdAt
|
||||
);
|
||||
}
|
||||
});
|
||||
|
||||
parentPort?.postMessage({ type: 'ready' } satisfies WriteWorkerResponse);
|
||||
|
||||
parentPort?.on('message', (msg: WriteWorkerRequest) => {
|
||||
if (msg.type === 'shutdown') {
|
||||
db.close();
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
if (msg.type !== 'write') {
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
writeBatch(msg.documents, msg.snippets);
|
||||
parentPort?.postMessage({
|
||||
type: 'write_ack',
|
||||
jobId: msg.jobId,
|
||||
documentCount: msg.documents.length,
|
||||
snippetCount: msg.snippets.length
|
||||
} satisfies WriteWorkerResponse);
|
||||
} catch (error) {
|
||||
parentPort?.postMessage({
|
||||
type: 'write_error',
|
||||
jobId: msg.jobId,
|
||||
error: error instanceof Error ? error.message : String(error)
|
||||
} satisfies WriteWorkerResponse);
|
||||
}
|
||||
});
|
||||
Reference in New Issue
Block a user