TRUEREF-0023 rewrite indexing pipeline - parallel reads - serialized writes
This commit is contained in:
@@ -6,6 +6,10 @@
|
||||
import type Database from 'better-sqlite3';
|
||||
import type { EmbeddingProvider } from './provider.js';
|
||||
import { SqliteVecStore } from '$lib/server/search/sqlite-vec.store.js';
|
||||
import {
|
||||
upsertEmbeddings,
|
||||
type PersistedEmbedding
|
||||
} from '$lib/server/pipeline/write-operations.js';
|
||||
|
||||
interface SnippetRow {
|
||||
id: string;
|
||||
@@ -23,7 +27,10 @@ export class EmbeddingService {
|
||||
/**
 * Builds an embedding service around a single database connection.
 *
 * @param db - better-sqlite3 connection; it is also handed to the
 *   `SqliteVecStore` created here.
 * @param provider - Embedding provider whose `embed` call produces the vectors.
 * @param profileId - Embedding profile identifier; defaults to `'local-default'`.
 * @param persistenceDelegate - Optional hook object. When it supplies
 *   `persistEmbeddings`, the batch-embedding code in this class awaits that
 *   callback with each batch instead of calling `upsertEmbeddings` on `db`
 *   directly.
 */
constructor(
  private readonly db: Database.Database,
  private readonly provider: EmbeddingProvider,
  // Only one `profileId` parameter property may exist; the stale copy without
  // a trailing comma made the parameter list invalid.
  private readonly profileId: string = 'local-default',
  private readonly persistenceDelegate?: {
    persistEmbeddings?: (embeddings: PersistedEmbedding[]) => Promise<void>;
  }
) {
  this.sqliteVecStore = new SqliteVecStore(db);
}
|
||||
@@ -94,37 +101,31 @@ export class EmbeddingService {
|
||||
[s.title, s.breadcrumb, s.content].filter(Boolean).join('\n').slice(0, TEXT_MAX_CHARS)
|
||||
);
|
||||
|
||||
const insert = this.db.prepare<[string, string, string, number, Buffer]>(`
|
||||
INSERT OR REPLACE INTO snippet_embeddings (snippet_id, profile_id, model, dimensions, embedding, created_at)
|
||||
VALUES (?, ?, ?, ?, ?, unixepoch())
|
||||
`);
|
||||
|
||||
for (let i = 0; i < snippets.length; i += BATCH_SIZE) {
|
||||
const batchSnippets = snippets.slice(i, i + BATCH_SIZE);
|
||||
const batchTexts = texts.slice(i, i + BATCH_SIZE);
|
||||
|
||||
const embeddings = await this.provider.embed(batchTexts);
|
||||
|
||||
const insertMany = this.db.transaction(() => {
|
||||
for (let j = 0; j < batchSnippets.length; j++) {
|
||||
const snippet = batchSnippets[j];
|
||||
const embedding = embeddings[j];
|
||||
|
||||
insert.run(
|
||||
snippet.id,
|
||||
this.profileId,
|
||||
embedding.model,
|
||||
embedding.dimensions,
|
||||
Buffer.from(
|
||||
embedding.values.buffer,
|
||||
embedding.values.byteOffset,
|
||||
embedding.values.byteLength
|
||||
)
|
||||
);
|
||||
this.sqliteVecStore.upsertEmbedding(this.profileId, snippet.id, embedding.values);
|
||||
}
|
||||
const persistedEmbeddings: PersistedEmbedding[] = batchSnippets.map((snippet, index) => {
|
||||
const embedding = embeddings[index];
|
||||
return {
|
||||
snippetId: snippet.id,
|
||||
profileId: this.profileId,
|
||||
model: embedding.model,
|
||||
dimensions: embedding.dimensions,
|
||||
embedding: Buffer.from(
|
||||
embedding.values.buffer,
|
||||
embedding.values.byteOffset,
|
||||
embedding.values.byteLength
|
||||
)
|
||||
};
|
||||
});
|
||||
insertMany();
|
||||
|
||||
if (this.persistenceDelegate?.persistEmbeddings) {
|
||||
await this.persistenceDelegate.persistEmbeddings(persistedEmbeddings);
|
||||
} else {
|
||||
upsertEmbeddings(this.db, persistedEmbeddings);
|
||||
}
|
||||
|
||||
onProgress?.(Math.min(i + BATCH_SIZE, snippets.length), snippets.length);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user