TRUEREF-0023 rewrite indexing pipeline - parallel reads - serialized writes

This commit is contained in:
Giancarmine Salucci
2026-04-02 09:49:38 +02:00
parent 9525c58e9a
commit f86be4106b
68 changed files with 5042 additions and 3131 deletions

View File

@@ -6,6 +6,10 @@
import type Database from 'better-sqlite3';
import type { EmbeddingProvider } from './provider.js';
import { SqliteVecStore } from '$lib/server/search/sqlite-vec.store.js';
import {
upsertEmbeddings,
type PersistedEmbedding
} from '$lib/server/pipeline/write-operations.js';
interface SnippetRow {
id: string;
@@ -23,7 +27,10 @@ export class EmbeddingService {
/**
 * Builds the service around a database handle, an embedding provider and a
 * profile id, and creates the SqliteVecStore bound to the same database.
 *
 * @param db - better-sqlite3 database handle; also handed to SqliteVecStore.
 * @param provider - Embedding provider whose `embed()` is called per batch.
 * @param profileId - Profile id recorded with each persisted embedding;
 *   defaults to 'local-default'.
 * @param persistenceDelegate - Optional hook object. When it supplies
 *   `persistEmbeddings`, each batch of embeddings is awaited through that
 *   callback; otherwise the batch is written with `upsertEmbeddings` on `db`.
 */
constructor(
private readonly db: Database.Database,
private readonly provider: EmbeddingProvider,
private readonly profileId: string = 'local-default'
// NOTE(review): the line above and the line below both declare `profileId`;
// this looks like the pre-change and post-change lines of a diff shown
// together — confirm which one is live before compiling.
private readonly profileId: string = 'local-default',
private readonly persistenceDelegate?: {
// Receives one batch of ready-to-store embeddings at a time.
persistEmbeddings?: (embeddings: PersistedEmbedding[]) => Promise<void>;
}
) {
// The vector store shares the same database handle as this service.
this.sqliteVecStore = new SqliteVecStore(db);
}
@@ -94,37 +101,31 @@ export class EmbeddingService {
[s.title, s.breadcrumb, s.content].filter(Boolean).join('\n').slice(0, TEXT_MAX_CHARS)
);
const insert = this.db.prepare<[string, string, string, number, Buffer]>(`
INSERT OR REPLACE INTO snippet_embeddings (snippet_id, profile_id, model, dimensions, embedding, created_at)
VALUES (?, ?, ?, ?, ?, unixepoch())
`);
for (let i = 0; i < snippets.length; i += BATCH_SIZE) {
const batchSnippets = snippets.slice(i, i + BATCH_SIZE);
const batchTexts = texts.slice(i, i + BATCH_SIZE);
const embeddings = await this.provider.embed(batchTexts);
const insertMany = this.db.transaction(() => {
for (let j = 0; j < batchSnippets.length; j++) {
const snippet = batchSnippets[j];
const embedding = embeddings[j];
insert.run(
snippet.id,
this.profileId,
embedding.model,
embedding.dimensions,
Buffer.from(
embedding.values.buffer,
embedding.values.byteOffset,
embedding.values.byteLength
)
);
this.sqliteVecStore.upsertEmbedding(this.profileId, snippet.id, embedding.values);
}
const persistedEmbeddings: PersistedEmbedding[] = batchSnippets.map((snippet, index) => {
const embedding = embeddings[index];
return {
snippetId: snippet.id,
profileId: this.profileId,
model: embedding.model,
dimensions: embedding.dimensions,
embedding: Buffer.from(
embedding.values.buffer,
embedding.values.byteOffset,
embedding.values.byteLength
)
};
});
insertMany();
if (this.persistenceDelegate?.persistEmbeddings) {
await this.persistenceDelegate.persistEmbeddings(persistedEmbeddings);
} else {
upsertEmbeddings(this.db, persistedEmbeddings);
}
onProgress?.(Math.min(i + BATCH_SIZE, snippets.length), snippets.length);
}