/**
 * EmbeddingService — batches embedding requests and persists results to
 * the snippet_embeddings table.
 */

import type Database from 'better-sqlite3';
import type { EmbeddingProvider } from './provider.js';
import { SqliteVecStore } from '$lib/server/search/sqlite-vec.store.js';
import { upsertEmbeddings, type PersistedEmbedding } from '$lib/server/pipeline/write-operations.js';

/** Columns read from the `snippets` table when building embedding input text. */
interface SnippetRow {
	id: string;
	title: string | null;
	breadcrumb: string | null;
	content: string;
}

/** Number of texts sent to the embedding provider per `embed()` call. */
const BATCH_SIZE = 50;

/** Maximum number of UTF-16 code units of text submitted per snippet. */
const TEXT_MAX_CHARS = 2048;

/**
 * Maximum number of IDs bound into a single `IN (...)` lookup.
 * Kept below 999, the historical default of SQLITE_MAX_VARIABLE_NUMBER, so the
 * lookup also works on SQLite builds that still use the lower limit.
 */
const ID_LOOKUP_CHUNK_SIZE = 900;

export class EmbeddingService {
	// NOTE(review): not referenced by any method visible in this file. It is
	// still constructed because the SqliteVecStore constructor may have side
	// effects on the database handle — confirm before removing.
	private readonly sqliteVecStore: SqliteVecStore;

	/**
	 * @param db - better-sqlite3 handle used for all reads and the default write path.
	 * @param provider - Produces embeddings for batches of texts.
	 * @param profileId - Embedding profile that written rows are tagged with and
	 *   that `findSnippetIdsMissingEmbeddings` filters on.
	 * @param persistenceDelegate - When it supplies `persistEmbeddings`, each batch
	 *   is handed to that callback instead of being written through
	 *   `upsertEmbeddings` on `db`.
	 */
	constructor(
		private readonly db: Database.Database,
		private readonly provider: EmbeddingProvider,
		private readonly profileId: string = 'local-default',
		private readonly persistenceDelegate?: {
			persistEmbeddings?: (embeddings: PersistedEmbedding[]) => Promise<void>;
		}
	) {
		this.sqliteVecStore = new SqliteVecStore(db);
	}

	/**
	 * List the IDs of snippets in a repository that have no embedding row for
	 * this service's profile, ordered by snippet ID.
	 *
	 * @param repositoryId - Repository whose snippets are inspected.
	 * @param versionId - Version to restrict to. Any falsy value (`null` or an
	 *   empty string) selects snippets whose `version_id` IS NULL.
	 * @returns Snippet IDs that still need to be embedded.
	 */
	findSnippetIdsMissingEmbeddings(repositoryId: string, versionId: string | null): string[] {
		// Only the version predicate differs between the two cases, so a single
		// statement is built and the bound parameters are adjusted to match.
		const versionPredicate = versionId
			? 'snippets.version_id = ?'
			: 'snippets.version_id IS NULL';
		const params = versionId
			? [this.profileId, repositoryId, versionId]
			: [this.profileId, repositoryId];

		const rows = this.db
			.prepare<string[], { id: string }>(
				`SELECT snippets.id
				 FROM snippets
				 LEFT JOIN snippet_embeddings
				   ON snippet_embeddings.snippet_id = snippets.id
				  AND snippet_embeddings.profile_id = ?
				 WHERE snippets.repository_id = ?
				   AND ${versionPredicate}
				   AND snippet_embeddings.snippet_id IS NULL
				 ORDER BY snippets.id`
			)
			.all(...params);

		return rows.map((row) => row.id);
	}

	/**
	 * Embed the given snippet IDs and store the results in snippet_embeddings.
	 *
	 * Only snippets that actually exist in the database are processed, and
	 * duplicate IDs are embedded once. Snippets are sent to the provider in
	 * batches of `BATCH_SIZE`; each batch is persisted before the next one is
	 * requested, so a failure part-way through leaves earlier batches stored.
	 *
	 * Each batch is persisted through the persistence delegate when one was
	 * supplied, otherwise through `upsertEmbeddings` on this service's database.
	 *
	 * @param snippetIds - Array of snippet UUIDs to embed.
	 * @param onProgress - Optional callback invoked after each batch with
	 *   (completedCount, totalCount), where totalCount is the number of
	 *   snippets found in the database.
	 * @throws Error when the provider returns no embedding for a submitted text.
	 */
	async embedSnippets(
		snippetIds: string[],
		onProgress?: (done: number, total: number) => void
	): Promise<void> {
		if (snippetIds.length === 0) return;

		const snippets = this.loadSnippets(snippetIds);
		if (snippets.length === 0) return;

		// Title, breadcrumb and content are joined with newlines (empty or null
		// parts are dropped) and truncated to the per-snippet character budget.
		const texts = snippets.map((s) =>
			[s.title, s.breadcrumb, s.content].filter(Boolean).join('\n').slice(0, TEXT_MAX_CHARS)
		);

		for (let i = 0; i < snippets.length; i += BATCH_SIZE) {
			const batchSnippets = snippets.slice(i, i + BATCH_SIZE);
			const batchTexts = texts.slice(i, i + BATCH_SIZE);

			const embeddings = await this.provider.embed(batchTexts);

			const persistedEmbeddings: PersistedEmbedding[] = batchSnippets.map((snippet, index) => {
				const embedding = embeddings[index];
				if (!embedding) {
					// Fail with context rather than a TypeError on `undefined.model`.
					throw new Error(
						`Embedding provider returned no embedding for snippet ${snippet.id} ` +
							`(batch offset ${i}, index ${index})`
					);
				}
				return {
					snippetId: snippet.id,
					profileId: this.profileId,
					model: embedding.model,
					dimensions: embedding.dimensions,
					// Wrap exactly the bytes backing `values`, honouring its offset
					// into a possibly larger underlying buffer.
					embedding: Buffer.from(
						embedding.values.buffer,
						embedding.values.byteOffset,
						embedding.values.byteLength
					)
				};
			});

			if (this.persistenceDelegate?.persistEmbeddings) {
				await this.persistenceDelegate.persistEmbeddings(persistedEmbeddings);
			} else {
				upsertEmbeddings(this.db, persistedEmbeddings);
			}

			onProgress?.(Math.min(i + BATCH_SIZE, snippets.length), snippets.length);
		}
	}

	/**
	 * Retrieve a stored embedding for a snippet as a Float32Array.
	 * Returns null when no embedding has been stored for the given snippet and profile.
	 *
	 * The returned array is a copy, so it does not alias the memory of the
	 * Buffer returned by the database driver.
	 *
	 * @param snippetId - Snippet UUID
	 * @param profileId - Embedding profile ID (default: 'local-default'). Note
	 *   that this default is the literal 'local-default', not the profile the
	 *   service was constructed with.
	 * @throws RangeError when the stored blob holds fewer bytes than
	 *   `dimensions` 32-bit floats require.
	 */
	getEmbedding(snippetId: string, profileId: string = 'local-default'): Float32Array | null {
		const row = this.db
			.prepare<
				[string, string],
				{ embedding: Buffer; dimensions: number }
			>(`SELECT embedding, dimensions FROM snippet_embeddings WHERE snippet_id = ? AND profile_id = ?`)
			.get(snippetId, profileId);

		if (!row) return null;

		const byteLength = row.dimensions * Float32Array.BYTES_PER_ELEMENT;
		if (row.embedding.byteLength < byteLength) {
			throw new RangeError(
				`Stored embedding for snippet ${snippetId} has ${row.embedding.byteLength} bytes; ` +
					`expected at least ${byteLength} for ${row.dimensions} dimensions`
			);
		}

		// A Float32Array view requires a byte offset that is a multiple of 4,
		// which a Buffer does not guarantee. Copying the bytes into a freshly
		// allocated array sidesteps the alignment requirement.
		const values = new Float32Array(row.dimensions);
		new Uint8Array(values.buffer).set(row.embedding.subarray(0, byteLength));
		return values;
	}

	/**
	 * Load the snippet rows for the given IDs, skipping IDs that do not exist.
	 * IDs are de-duplicated and looked up in bounded chunks so the number of
	 * bound parameters per statement stays below SQLite's limit.
	 */
	private loadSnippets(snippetIds: string[]): SnippetRow[] {
		const uniqueIds = [...new Set(snippetIds)];
		const snippets: SnippetRow[] = [];

		for (let start = 0; start < uniqueIds.length; start += ID_LOOKUP_CHUNK_SIZE) {
			const chunk = uniqueIds.slice(start, start + ID_LOOKUP_CHUNK_SIZE);
			const placeholders = chunk.map(() => '?').join(',');
			const rows = this.db
				.prepare<
					string[],
					SnippetRow
				>(`SELECT id, title, breadcrumb, content FROM snippets WHERE id IN (${placeholders})`)
				.all(...chunk);
			snippets.push(...rows);
		}

		return snippets;
	}
}