Files
trueref-legacy/src/lib/server/embeddings/embedding.service.ts

154 lines
4.6 KiB
TypeScript

/**
* EmbeddingService — batches embedding requests and persists results to
* the snippet_embeddings table.
*/
import type Database from 'better-sqlite3';
import type { EmbeddingProvider } from './provider.js';
import { SqliteVecStore } from '$lib/server/search/sqlite-vec.store.js';
import {
upsertEmbeddings,
type PersistedEmbedding
} from '$lib/server/pipeline/write-operations.js';
/**
 * Shape of a row selected from the `snippets` table when preparing text
 * for embedding.
 */
interface SnippetRow {
  /** Snippet identifier; used as the key when the embedding is persisted. */
  id: string;
  /** Snippet body text. */
  content: string;
  /** Optional title; `null` when the snippet has none. */
  title: string | null;
  /** Optional breadcrumb; `null` when the snippet has none. */
  breadcrumb: string | null;
}
/** Maximum number of characters of combined snippet text sent to the provider. */
const TEXT_MAX_CHARS = 2048;
/** Number of snippets passed to the embedding provider in a single call. */
const BATCH_SIZE = 50;
/**
 * Generates embeddings for snippets and stores them per embedding profile.
 *
 * Snippet text is read from the `snippets` table, sent to the configured
 * provider in batches of `BATCH_SIZE`, and the resulting vectors are handed
 * to the optional persistence delegate or, when none is supplied, to
 * `upsertEmbeddings`.
 */
export class EmbeddingService {
  /**
   * Maximum number of IDs bound into one `IN (...)` lookup. SQLite rejects
   * statements with more bound parameters than SQLITE_MAX_VARIABLE_NUMBER
   * (999 in builds older than 3.32.0), so lookups are split into chunks
   * that stay well under that limit.
   */
  private static readonly ID_LOOKUP_CHUNK_SIZE = 500;

  // Constructed eagerly in the constructor; no method visible in this class reads it.
  private readonly sqliteVecStore: SqliteVecStore;

  /**
   * @param db - better-sqlite3 connection holding `snippets` and `snippet_embeddings`.
   * @param provider - Provider that turns texts into embedding vectors.
   * @param profileId - Embedding profile the vectors are stored under.
   * @param persistenceDelegate - Optional override for persistence; when its
   *   `persistEmbeddings` is present it is awaited instead of calling
   *   `upsertEmbeddings` directly.
   */
  constructor(
    private readonly db: Database.Database,
    private readonly provider: EmbeddingProvider,
    private readonly profileId: string = 'local-default',
    private readonly persistenceDelegate?: {
      persistEmbeddings?: (embeddings: PersistedEmbedding[]) => Promise<void>;
    }
  ) {
    this.sqliteVecStore = new SqliteVecStore(db);
  }

  /**
   * List the IDs of snippets that have no embedding for this service's profile.
   *
   * @param repositoryId - Repository whose snippets are inspected.
   * @param versionId - Version to restrict to. A falsy value (`null` or an
   *   empty string) selects snippets whose `version_id` is NULL.
   * @returns Snippet IDs ordered by ID.
   */
  findSnippetIdsMissingEmbeddings(repositoryId: string, versionId: string | null): string[] {
    // Only the version predicate differs between the two cases, so build one query.
    const versionPredicate = versionId
      ? 'snippets.version_id = ?'
      : 'snippets.version_id IS NULL';
    const params = versionId
      ? [this.profileId, repositoryId, versionId]
      : [this.profileId, repositoryId];

    const rows = this.db
      .prepare<string[], { id: string }>(
        `SELECT snippets.id
         FROM snippets
         LEFT JOIN snippet_embeddings
           ON snippet_embeddings.snippet_id = snippets.id
          AND snippet_embeddings.profile_id = ?
         WHERE snippets.repository_id = ?
           AND ${versionPredicate}
           AND snippet_embeddings.snippet_id IS NULL
         ORDER BY snippets.id`
      )
      .all(...params);

    return rows.map((row) => row.id);
  }

  /**
   * Embed the given snippet IDs and persist the results.
   *
   * Only snippets that actually exist in the database are processed;
   * duplicate IDs are embedded once. Persistence goes through the delegate
   * when one was supplied, otherwise through `upsertEmbeddings`.
   *
   * @param snippetIds - Snippet IDs to embed.
   * @param onProgress - Optional callback invoked after each batch with
   *   (completedCount, totalCount), where totalCount is the number of
   *   snippets found in the database.
   * @throws Error when the provider returns fewer embeddings than the
   *   number of texts in a batch.
   */
  async embedSnippets(
    snippetIds: string[],
    onProgress?: (done: number, total: number) => void
  ): Promise<void> {
    if (snippetIds.length === 0) return;

    const snippets = this.loadSnippets(snippetIds);
    if (snippets.length === 0) return;

    const total = snippets.length;
    for (let start = 0; start < total; start += BATCH_SIZE) {
      const batch = snippets.slice(start, start + BATCH_SIZE);
      const embeddings = await this.provider.embed(
        batch.map((snippet) => EmbeddingService.toEmbeddingText(snippet))
      );

      // Fail loudly instead of dereferencing `undefined` on a short response.
      if (embeddings.length < batch.length) {
        throw new Error(
          `Embedding provider returned ${embeddings.length} embedding(s) for a batch of ${batch.length} snippet(s)`
        );
      }

      const persistedEmbeddings: PersistedEmbedding[] = batch.map((snippet, index) => {
        const { model, dimensions, values } = embeddings[index];
        return {
          snippetId: snippet.id,
          profileId: this.profileId,
          model,
          dimensions,
          // View over the vector's bytes (no copy); honours byteOffset/byteLength
          // in case `values` is itself a view into a larger buffer.
          embedding: Buffer.from(values.buffer, values.byteOffset, values.byteLength)
        };
      });

      if (this.persistenceDelegate?.persistEmbeddings) {
        await this.persistenceDelegate.persistEmbeddings(persistedEmbeddings);
      } else {
        upsertEmbeddings(this.db, persistedEmbeddings);
      }

      onProgress?.(Math.min(start + BATCH_SIZE, total), total);
    }
  }

  /**
   * Retrieve a stored embedding for a snippet as a Float32Array.
   * Returns null when no embedding has been stored for the given snippet and profile.
   *
   * NOTE(review): the default profile is the literal 'local-default', not the
   * profile this service was constructed with — confirm callers of services
   * with a custom profile pass `profileId` explicitly.
   *
   * @param snippetId - Snippet ID.
   * @param profileId - Embedding profile ID (default: 'local-default').
   */
  getEmbedding(snippetId: string, profileId: string = 'local-default'): Float32Array | null {
    const row = this.db
      .prepare<[string, string], { embedding: Buffer; dimensions: number }>(
        `SELECT embedding, dimensions FROM snippet_embeddings WHERE snippet_id = ? AND profile_id = ?`
      )
      .get(snippetId, profileId);
    if (!row) return null;

    const blob = row.embedding;
    const bytesPerFloat = Float32Array.BYTES_PER_ELEMENT;
    if (blob.byteOffset % bytesPerFloat === 0) {
      return new Float32Array(blob.buffer, blob.byteOffset, row.dimensions);
    }

    // A Float32Array view needs a 4-byte-aligned offset, which a Buffer does not
    // guarantee. Copy the bytes into a fresh, aligned ArrayBuffer instead.
    const aligned = blob.buffer.slice(
      blob.byteOffset,
      blob.byteOffset + row.dimensions * bytesPerFloat
    );
    return new Float32Array(aligned);
  }

  /** Load the rows for the given IDs, de-duplicated and fetched in bounded chunks. */
  private loadSnippets(snippetIds: readonly string[]): SnippetRow[] {
    const uniqueIds = [...new Set(snippetIds)];
    const chunkSize = EmbeddingService.ID_LOOKUP_CHUNK_SIZE;
    const rows: SnippetRow[] = [];

    for (let start = 0; start < uniqueIds.length; start += chunkSize) {
      const chunk = uniqueIds.slice(start, start + chunkSize);
      const placeholders = chunk.map(() => '?').join(',');
      const found = this.db
        .prepare<
          string[],
          SnippetRow
        >(`SELECT id, title, breadcrumb, content FROM snippets WHERE id IN (${placeholders})`)
        .all(...chunk);
      rows.push(...found);
    }

    return rows;
  }

  /** Join title, breadcrumb and content (skipping empty parts) and cap the length. */
  private static toEmbeddingText(snippet: SnippetRow): string {
    return [snippet.title, snippet.breadcrumb, snippet.content]
      .filter(Boolean)
      .join('\n')
      .slice(0, TEXT_MAX_CHARS);
  }
}