154 lines
4.6 KiB
TypeScript
154 lines
4.6 KiB
TypeScript
/**
 * EmbeddingService — batches embedding requests and persists results to
 * the snippet_embeddings table.
 */
|
|
|
|
import type Database from 'better-sqlite3';
|
|
import type { EmbeddingProvider } from './provider.js';
|
|
import { SqliteVecStore } from '$lib/server/search/sqlite-vec.store.js';
|
|
import {
|
|
upsertEmbeddings,
|
|
type PersistedEmbedding
|
|
} from '$lib/server/pipeline/write-operations.js';
|
|
|
|
/**
 * Shape of a row read from the `snippets` table when preparing text for
 * embedding (`SELECT id, title, breadcrumb, content FROM snippets ...`).
 */
interface SnippetRow {
	/** Snippet identifier; used as `snippetId` on the persisted embedding. */
	id: string;
	/** Optional title; skipped when null or empty while building the embedding text. */
	title: string | null;
	/** Optional breadcrumb; skipped when null or empty while building the embedding text. */
	breadcrumb: string | null;
	/** Snippet body text. */
	content: string;
}
|
|
|
|
/** Maximum number of snippets sent to the embedding provider in one `embed` call. */
const BATCH_SIZE = 50;

/**
 * Maximum length of the text built for one snippet, applied with
 * `String.prototype.slice`, so it is measured in UTF-16 code units.
 */
const TEXT_MAX_CHARS = 2048;
|
|
|
|
/**
 * Generates embeddings for snippets through an {@link EmbeddingProvider} and
 * stores them per embedding profile.
 *
 * Snippets are sent to the provider in batches of `BATCH_SIZE`. Each batch is
 * persisted either through the optional `persistenceDelegate` or, when no
 * delegate is supplied, directly with `upsertEmbeddings` on the given database.
 */
export class EmbeddingService {
	/**
	 * Maximum number of IDs bound into a single `IN (...)` lookup. Kept well
	 * below 999, the bound-parameter limit of older SQLite builds, so large ID
	 * lists never fail with "too many SQL variables".
	 */
	private static readonly LOOKUP_CHUNK_SIZE = 500;

	/** Vector store created on the same connection; no method of this class reads it. */
	private readonly sqliteVecStore: SqliteVecStore;

	/**
	 * @param db - better-sqlite3 connection used for all reads and for the default write path.
	 * @param provider - Produces one embedding per input text.
	 * @param profileId - Embedding profile the results are stored under.
	 * @param persistenceDelegate - Optional override for persisting a batch; when its
	 *   `persistEmbeddings` is present it is awaited instead of calling `upsertEmbeddings`.
	 */
	constructor(
		private readonly db: Database.Database,
		private readonly provider: EmbeddingProvider,
		private readonly profileId: string = 'local-default',
		private readonly persistenceDelegate?: {
			persistEmbeddings?: (embeddings: PersistedEmbedding[]) => Promise<void>;
		}
	) {
		this.sqliteVecStore = new SqliteVecStore(db);
	}

	/**
	 * List the IDs of snippets in a repository that have no embedding stored
	 * for this service's profile, ordered by snippet ID.
	 *
	 * @param repositoryId - Repository whose snippets are inspected.
	 * @param versionId - Version to restrict to. `null` (or an empty string)
	 *   selects snippets whose `version_id` is NULL.
	 * @returns Snippet IDs that still need an embedding.
	 */
	findSnippetIdsMissingEmbeddings(repositoryId: string, versionId: string | null): string[] {
		// Truthiness check on purpose: an empty string is treated like null.
		const versionPredicate = versionId
			? 'snippets.version_id = ?'
			: 'snippets.version_id IS NULL';
		const params = versionId
			? [this.profileId, repositoryId, versionId]
			: [this.profileId, repositoryId];

		// The LEFT JOIN plus `snippet_id IS NULL` keeps only snippets without a
		// matching embedding row for this profile.
		const rows = this.db
			.prepare<string[], { id: string }>(
				`SELECT snippets.id
				 FROM snippets
				 LEFT JOIN snippet_embeddings
				   ON snippet_embeddings.snippet_id = snippets.id
				  AND snippet_embeddings.profile_id = ?
				 WHERE snippets.repository_id = ?
				   AND ${versionPredicate}
				   AND snippet_embeddings.snippet_id IS NULL
				 ORDER BY snippets.id`
			)
			.all(...params);

		return rows.map((row) => row.id);
	}

	/**
	 * Embed the given snippet IDs and store the results in snippet_embeddings.
	 *
	 * Only snippets that actually exist in the database are processed, and
	 * duplicate IDs are embedded once.
	 *
	 * @param snippetIds - Array of snippet UUIDs to embed.
	 * @param onProgress - Optional callback invoked after each batch with
	 *   (completedCount, totalCount), where totalCount is the number of
	 *   snippets found in the database.
	 * @throws Error when the provider returns a different number of embeddings
	 *   than texts it was given; nothing from that batch is persisted.
	 */
	async embedSnippets(
		snippetIds: string[],
		onProgress?: (done: number, total: number) => void
	): Promise<void> {
		if (snippetIds.length === 0) return;

		const snippets = this.loadSnippets(snippetIds);
		if (snippets.length === 0) return;

		for (let start = 0; start < snippets.length; start += BATCH_SIZE) {
			const batchSnippets = snippets.slice(start, start + BATCH_SIZE);
			// Embedding text: non-empty title, breadcrumb and content joined by
			// newlines, truncated to the provider input budget.
			const batchTexts = batchSnippets.map((snippet) =>
				[snippet.title, snippet.breadcrumb, snippet.content]
					.filter(Boolean)
					.join('\n')
					.slice(0, TEXT_MAX_CHARS)
			);

			const embeddings = await this.provider.embed(batchTexts);
			// Results are paired with snippets by position, so any length
			// mismatch would attach vectors to the wrong snippets.
			if (embeddings.length !== batchTexts.length) {
				throw new Error(
					`Embedding provider returned ${embeddings.length} embeddings for ${batchTexts.length} texts`
				);
			}

			const persistedEmbeddings: PersistedEmbedding[] = batchSnippets.map((snippet, index) => {
				const embedding = embeddings[index];
				return {
					snippetId: snippet.id,
					profileId: this.profileId,
					model: embedding.model,
					dimensions: embedding.dimensions,
					// View over exactly the vector's bytes; the typed array may be
					// a window into a larger ArrayBuffer.
					embedding: Buffer.from(
						embedding.values.buffer,
						embedding.values.byteOffset,
						embedding.values.byteLength
					)
				};
			});

			if (this.persistenceDelegate?.persistEmbeddings) {
				await this.persistenceDelegate.persistEmbeddings(persistedEmbeddings);
			} else {
				upsertEmbeddings(this.db, persistedEmbeddings);
			}

			onProgress?.(Math.min(start + BATCH_SIZE, snippets.length), snippets.length);
		}
	}

	/**
	 * Retrieve a stored embedding for a snippet as a Float32Array.
	 * Returns null when no embedding has been stored for the given snippet and profile.
	 *
	 * The default profile is the literal 'local-default', not the profile this
	 * service was constructed with; pass `profileId` explicitly for other profiles.
	 *
	 * @param snippetId - Snippet UUID
	 * @param profileId - Embedding profile ID (default: 'local-default')
	 * @throws RangeError when the stored blob is shorter than `dimensions` float32 values.
	 */
	getEmbedding(snippetId: string, profileId: string = 'local-default'): Float32Array | null {
		const row = this.db
			.prepare<
				[string, string],
				{ embedding: Buffer; dimensions: number }
			>(`SELECT embedding, dimensions FROM snippet_embeddings WHERE snippet_id = ? AND profile_id = ?`)
			.get(snippetId, profileId);

		if (!row) return null;

		const { embedding, dimensions } = row;
		const byteLength = dimensions * Float32Array.BYTES_PER_ELEMENT;
		// Guard against reading past this Buffer into unrelated bytes of a
		// shared ArrayBuffer.
		if (byteLength > embedding.byteLength) {
			throw new RangeError(
				`Stored embedding for snippet ${snippetId} holds ${embedding.byteLength} bytes but ${dimensions} dimensions need ${byteLength}`
			);
		}

		// Fast path: a 4-byte aligned Buffer can be viewed in place.
		if (embedding.byteOffset % Float32Array.BYTES_PER_ELEMENT === 0) {
			return new Float32Array(embedding.buffer, embedding.byteOffset, dimensions);
		}

		// A Float32Array view requires a 4-byte aligned offset; copy the bytes
		// into a fresh, aligned buffer instead of throwing.
		const aligned = new Uint8Array(byteLength);
		aligned.set(embedding.subarray(0, byteLength));
		return new Float32Array(aligned.buffer);
	}

	/** Load the existing snippet rows for the given IDs, de-duplicated and fetched in bounded chunks. */
	private loadSnippets(snippetIds: readonly string[]): SnippetRow[] {
		const uniqueIds = [...new Set(snippetIds)];
		const chunkSize = EmbeddingService.LOOKUP_CHUNK_SIZE;
		const rows: SnippetRow[] = [];

		for (let start = 0; start < uniqueIds.length; start += chunkSize) {
			const chunk = uniqueIds.slice(start, start + chunkSize);
			const placeholders = chunk.map(() => '?').join(',');
			const found = this.db
				.prepare<
					string[],
					SnippetRow
				>(`SELECT id, title, breadcrumb, content FROM snippets WHERE id IN (${placeholders})`)
				.all(...chunk);
			for (const row of found) rows.push(row);
		}

		return rows;
	}
}
|