Files
trueref-legacy/src/lib/server/search/sqlite-vec.store.ts
2026-04-01 14:09:19 +02:00

394 lines
10 KiB
TypeScript

import type Database from 'better-sqlite3';
import {
loadSqliteVec,
quoteSqliteIdentifier,
sqliteVecRowidTableName,
sqliteVecTableName
} from '$lib/server/db/sqlite-vec.js';
/** Filters and limits accepted by `SqliteVecStore.queryNearestNeighbors`. */
export interface SqliteVecQueryOptions {
/** Only snippets whose `repository_id` equals this value are returned. */
repositoryId: string;
/** When set, additionally restricts results to snippets with this `version_id`. */
versionId?: string;
/** Embedding profile whose vector table is searched; defaults to `'local-default'`. */
profileId?: string;
/** Maximum number of results; defaults to 50. Values <= 0 produce an empty result. */
limit?: number;
}
/** One match returned by `SqliteVecStore.queryNearestNeighbors`. */
export interface SqliteVecQueryResult {
/** Id of the matching snippet (`snippet_id` in the rowid-mapping table). */
snippetId: string;
/** Similarity derived from `distance` via `distanceToScore`, i.e. `1 / (1 + distance)`. */
score: number;
/** Raw `distance` column reported by the vector table for this match. */
distance: number;
}
/** Row shape of `SELECT dimensions FROM embedding_profiles WHERE id = ?`. */
interface ProfileDimensionsRow {
/** Vector width configured on the embedding profile. */
dimensions: number;
}
/** Aggregate over the `snippet_embeddings` rows of a single profile. */
interface StoredDimensionsRow {
/** Number of stored embeddings for the profile (`COUNT(*)`). */
count: number;
/** Smallest stored `dimensions` value; typed nullable because `MIN` over zero rows is NULL. */
min_dimensions: number | null;
/** Largest stored `dimensions` value; typed nullable because `MAX` over zero rows is NULL. */
max_dimensions: number | null;
}
/** Row shape of the `SELECT rowid ... WHERE snippet_id = ?` lookup on the rowid-mapping table. */
interface SnippetRowidRow {
/** Rowid shared between the mapping table and the vector table. */
rowid: number;
}
/** Row shape produced by the KNN query before it is mapped to `SqliteVecQueryResult`. */
interface RawKnnRow {
/** Snippet id taken from the rowid-mapping table. */
snippet_id: string;
/** `distance` column of the vector table for this row. */
distance: number;
}
/** A `snippet_embeddings` row that still has to be copied into the vector table. */
interface CanonicalEmbeddingRow {
/** Snippet the embedding belongs to. */
snippet_id: string;
/** Stored embedding bytes; read back as float32 values during backfill. */
embedding: Buffer;
}
/** Identifies one stored embedding by the (profile, snippet) pair selected from `snippet_embeddings`. */
interface StoredEmbeddingRef {
/** Embedding profile the vector was stored under. */
profile_id: string;
/** Snippet the vector belongs to. */
snippet_id: string;
}
/** Resolved table names and vector width for one embedding profile. */
interface ProfileStoreTables {
/** Result of `sqliteVecTableName(profileId)`: the `vec0` virtual table. */
vectorTableName: string;
/** Result of `sqliteVecRowidTableName(profileId)`: the rowid-to-snippet mapping table. */
rowidTableName: string;
/** `vectorTableName` passed through `quoteSqliteIdentifier`, for interpolation into SQL. */
quotedVectorTableName: string;
/** `rowidTableName` passed through `quoteSqliteIdentifier`, for interpolation into SQL. */
quotedRowidTableName: string;
/** Vector width resolved for the profile. */
dimensions: number;
}
/**
 * Exposes the bytes behind a Float32Array as a Buffer.
 *
 * The Buffer is built over the same ArrayBuffer region (offset and length are
 * taken from the typed array), so sub-array views are honoured.
 */
function toEmbeddingBuffer(values: Float32Array): Buffer {
  const { buffer, byteOffset, byteLength } = values;
  return Buffer.from(buffer, byteOffset, byteLength);
}
/**
 * Maps a distance to a similarity score: 0 distance gives 1, and the score
 * shrinks towards 0 as the distance grows.
 */
function distanceToScore(distance: number): number {
  const denominator = 1 + distance;
  return 1 / denominator;
}
/**
 * Vector index for snippet embeddings, kept in per-profile sqlite-vec tables.
 *
 * Every embedding profile gets two tables: a `vec0` virtual table that holds
 * the vectors and a companion table that maps the virtual table's rowids to
 * snippet ids. Before each nearest-neighbour query the two tables are
 * reconciled against `snippet_embeddings`: mappings without a matching row
 * there are dropped and rows missing from the index are backfilled.
 */
export class SqliteVecStore {
  /** Maximum number of document ids bound into a single `IN (...)` lookup. */
  private static readonly DOCUMENT_ID_BATCH_SIZE = 500;

  constructor(private readonly db: Database.Database) {}

  /**
   * Creates the profile's rowid-mapping table and `vec0` table when they do not exist yet.
   *
   * @param profileId - Id of a row in `embedding_profiles`.
   * @param preferredDimensions - Vector width to use when the profile has no stored embeddings.
   * @returns The vector width resolved for the profile.
   * @throws Error if the profile does not exist or the dimensions conflict with stored embeddings.
   */
  ensureProfileStore(profileId: string, preferredDimensions?: number): number {
    const tables = this.getProfileStoreTables(profileId, preferredDimensions);
    this.createProfileStore(tables);
    return tables.dimensions;
  }

  /**
   * Inserts or replaces the vector stored for a snippet under the given profile.
   *
   * @param profileId - Id of a row in `embedding_profiles`.
   * @param snippetId - Snippet the vector belongs to.
   * @param embedding - Vector to store; its length must agree with embeddings already stored.
   * @throws Error if the profile does not exist or the vector length conflicts with stored embeddings.
   */
  upsertEmbedding(profileId: string, snippetId: string, embedding: Float32Array): void {
    const tables = this.getProfileStoreTables(profileId, embedding.length);
    this.createProfileStore(tables);
    this.writeEmbedding(tables, snippetId, embedding);
  }

  /**
   * Same as `upsertEmbedding`, for a vector supplied as raw float32 bytes.
   *
   * @param profileId - Id of a row in `embedding_profiles`.
   * @param snippetId - Snippet the vector belongs to.
   * @param embedding - Raw bytes of the vector.
   * @param dimensions - Number of float32 values to read; defaults to as many as fit in `embedding`.
   * @throws RangeError if `dimensions` needs more bytes than `embedding` holds.
   */
  upsertEmbeddingBuffer(
    profileId: string,
    snippetId: string,
    embedding: Buffer,
    dimensions?: number
  ): void {
    const length =
      dimensions ?? Math.floor(embedding.byteLength / Float32Array.BYTES_PER_ELEMENT);
    this.upsertEmbedding(profileId, snippetId, SqliteVecStore.toFloat32Array(embedding, length));
  }

  /**
   * Removes the vector stored for a snippet under the given profile, if any.
   *
   * @throws Error if the profile does not exist.
   */
  deleteEmbedding(profileId: string, snippetId: string): void {
    const tables = this.getProfileStoreTables(profileId);
    // The tables are created on demand so the lookup below cannot hit a missing table.
    this.createProfileStore(tables);
    this.removeEmbedding(tables, snippetId);
  }

  /**
   * Removes the indexed vectors of every snippet that belongs to one of the given documents.
   *
   * The ids are looked up in batches so an arbitrarily long list never exceeds
   * SQLite's bound-parameter limit; the deletions still run in one transaction.
   */
  deleteEmbeddingsForDocumentIds(documentIds: string[]): void {
    if (documentIds.length === 0) {
      return;
    }

    const refs: StoredEmbeddingRef[] = [];
    const batchSize = SqliteVecStore.DOCUMENT_ID_BATCH_SIZE;
    for (let start = 0; start < documentIds.length; start += batchSize) {
      const batch = documentIds.slice(start, start + batchSize);
      const placeholders = batch.map(() => '?').join(', ');
      const rows = this.db
        .prepare<unknown[], StoredEmbeddingRef>(
          `SELECT DISTINCT se.profile_id, se.snippet_id
           FROM snippet_embeddings se
           INNER JOIN snippets s ON s.id = se.snippet_id
           WHERE s.document_id IN (${placeholders})`
        )
        .all(...batch);
      for (const row of rows) {
        refs.push(row);
      }
    }

    this.deleteEmbeddingRefs(refs);
  }

  /** Removes the indexed vectors of every snippet in the given repository. */
  deleteEmbeddingsForRepository(repositoryId: string): void {
    const rows = this.db
      .prepare<[string], StoredEmbeddingRef>(
        `SELECT DISTINCT se.profile_id, se.snippet_id
         FROM snippet_embeddings se
         INNER JOIN snippets s ON s.id = se.snippet_id
         WHERE s.repository_id = ?`
      )
      .all(repositoryId);
    this.deleteEmbeddingRefs(rows);
  }

  /** Removes the indexed vectors of every snippet in the given repository version. */
  deleteEmbeddingsForVersion(repositoryId: string, versionId: string): void {
    const rows = this.db
      .prepare<[string, string], StoredEmbeddingRef>(
        `SELECT DISTINCT se.profile_id, se.snippet_id
         FROM snippet_embeddings se
         INNER JOIN snippets s ON s.id = se.snippet_id
         WHERE s.repository_id = ? AND s.version_id = ?`
      )
      .all(repositoryId, versionId);
    this.deleteEmbeddingRefs(rows);
  }

  /**
   * Finds the snippets whose stored vectors are closest to `queryEmbedding`.
   *
   * @param queryEmbedding - Query vector; its length must agree with the profile's stored embeddings.
   * @param options - Repository/version filter, profile and result limit.
   * @returns At most `limit` matches ordered by ascending distance; empty when `limit <= 0`
   *   or the profile's index is empty.
   * @throws Error if the profile does not exist or the query vector length conflicts with stored embeddings.
   */
  queryNearestNeighbors(
    queryEmbedding: Float32Array,
    options: SqliteVecQueryOptions
  ): SqliteVecQueryResult[] {
    const { repositoryId, versionId, profileId = 'local-default', limit = 50 } = options;
    if (limit <= 0) {
      return [];
    }

    const tables = this.getProfileStoreTables(profileId, queryEmbedding.length);
    this.createProfileStore(tables);
    const totalRows = this.synchronizeProfileStore(profileId, tables);
    if (totalRows === 0) {
      return [];
    }

    // k is the number of indexed vectors rather than `limit`: the repository and
    // version predicates apply to the joined rows, and LIMIT trims the result afterwards.
    // NOTE(review): sqlite-vec caps k (believed to be 4096) — confirm how this
    // behaves for profiles that index more vectors than that.
    let sql = `
      SELECT rowids.snippet_id, vec.distance
      FROM ${tables.quotedVectorTableName} vec
      JOIN ${tables.quotedRowidTableName} rowids ON rowids.rowid = vec.rowid
      JOIN snippets s ON s.id = rowids.snippet_id
      WHERE vec.embedding MATCH ?
        AND vec.k = ?
        AND s.repository_id = ?
    `;
    const params: unknown[] = [toEmbeddingBuffer(queryEmbedding), totalRows, repositoryId];
    if (versionId !== undefined) {
      sql += ' AND s.version_id = ?';
      params.push(versionId);
    }
    sql += ' ORDER BY vec.distance ASC LIMIT ?';
    params.push(limit);

    const rows = this.db.prepare<unknown[], RawKnnRow>(sql).all(...params);
    return rows.map((row) => ({
      snippetId: row.snippet_id,
      score: distanceToScore(row.distance),
      distance: row.distance
    }));
  }

  /**
   * Reads `length` float32 values from the start of `bytes`.
   *
   * A typed array can only be laid over memory whose offset is a multiple of its
   * element size, and a Buffer may start anywhere inside its ArrayBuffer, so
   * unaligned input is copied to a fresh allocation first.
   *
   * @throws RangeError if `bytes` holds fewer than `length` float32 values.
   */
  private static toFloat32Array(bytes: Buffer, length: number): Float32Array {
    const elementSize = Float32Array.BYTES_PER_ELEMENT;
    if (length * elementSize > bytes.byteLength) {
      // Without this check an aligned view could silently read past the end of
      // `bytes` into unrelated memory of a shared ArrayBuffer.
      throw new RangeError(
        `Embedding buffer holds ${bytes.byteLength} bytes, too few for ${length} float32 values`
      );
    }
    if (bytes.byteOffset % elementSize === 0) {
      return new Float32Array(bytes.buffer, bytes.byteOffset, length);
    }
    const aligned = new Uint8Array(bytes);
    return new Float32Array(aligned.buffer, 0, length);
  }

  /** Issues the `CREATE ... IF NOT EXISTS` statements for both tables of a profile. */
  private createProfileStore(tables: ProfileStoreTables): void {
    this.db.exec(`
      CREATE TABLE IF NOT EXISTS ${tables.quotedRowidTableName} (
        rowid INTEGER PRIMARY KEY,
        snippet_id TEXT NOT NULL UNIQUE REFERENCES snippets(id) ON DELETE CASCADE
      );
    `);
    this.db.exec(`
      CREATE VIRTUAL TABLE IF NOT EXISTS ${tables.quotedVectorTableName}
      USING vec0(embedding float[${tables.dimensions}]);
    `);
  }

  /**
   * Stores `embedding` for `snippetId` in already-resolved, already-created tables.
   *
   * An existing mapping is updated in place; otherwise the vector is inserted
   * first and the rowid it was given is recorded in the mapping table.
   */
  private writeEmbedding(
    tables: ProfileStoreTables,
    snippetId: string,
    embedding: Float32Array
  ): void {
    const existingRow = this.db
      .prepare<[string], SnippetRowidRow>(
        `SELECT rowid FROM ${tables.quotedRowidTableName} WHERE snippet_id = ?`
      )
      .get(snippetId);
    const embeddingBuffer = toEmbeddingBuffer(embedding);

    if (existingRow) {
      this.db
        .prepare<[Buffer, number]>(
          `UPDATE ${tables.quotedVectorTableName} SET embedding = ? WHERE rowid = ?`
        )
        .run(embeddingBuffer, existingRow.rowid);
      return;
    }

    const insertResult = this.db
      .prepare<[Buffer]>(`INSERT INTO ${tables.quotedVectorTableName} (embedding) VALUES (?)`)
      .run(embeddingBuffer);
    // If this second insert fails, the vector row above is left without a mapping;
    // synchronizeProfileStore deletes such rows on the next query.
    this.db
      .prepare<[number, string]>(
        `INSERT INTO ${tables.quotedRowidTableName} (rowid, snippet_id) VALUES (?, ?)`
      )
      .run(Number(insertResult.lastInsertRowid), snippetId);
  }

  /** Deletes the vector and mapping of `snippetId` from already-resolved tables, if present. */
  private removeEmbedding(tables: ProfileStoreTables, snippetId: string): void {
    const existingRow = this.db
      .prepare<[string], SnippetRowidRow>(
        `SELECT rowid FROM ${tables.quotedRowidTableName} WHERE snippet_id = ?`
      )
      .get(snippetId);
    if (!existingRow) {
      return;
    }

    this.db
      .prepare<[number]>(`DELETE FROM ${tables.quotedVectorTableName} WHERE rowid = ?`)
      .run(existingRow.rowid);
    this.db
      .prepare<[string]>(`DELETE FROM ${tables.quotedRowidTableName} WHERE snippet_id = ?`)
      .run(snippetId);
  }

  /**
   * Reconciles the profile's index with `snippet_embeddings`.
   *
   * @returns Number of vectors that have a mapping row after reconciliation.
   */
  private synchronizeProfileStore(profileId: string, tables: ProfileStoreTables): number {
    // 1. Drop mappings with no matching `snippet_embeddings` row (same profile and
    //    dimensions) or with no vector row behind them.
    this.db
      .prepare<[string, number]>(
        `DELETE FROM ${tables.quotedRowidTableName}
         WHERE rowid IN (
           SELECT rowids.rowid
           FROM ${tables.quotedRowidTableName} rowids
           LEFT JOIN snippet_embeddings se
             ON se.snippet_id = rowids.snippet_id
            AND se.profile_id = ?
            AND se.dimensions = ?
           LEFT JOIN ${tables.quotedVectorTableName} vec ON vec.rowid = rowids.rowid
           WHERE se.snippet_id IS NULL OR vec.rowid IS NULL
         )`
      )
      .run(profileId, tables.dimensions);

    // 2. Drop vectors that no mapping row points at.
    this.db
      .prepare(
        `DELETE FROM ${tables.quotedVectorTableName}
         WHERE rowid NOT IN (SELECT rowid FROM ${tables.quotedRowidTableName})`
      )
      .run();

    // 3. Backfill embeddings that exist in `snippet_embeddings` but not in the index.
    const missingRows = this.db
      .prepare<[string, number], CanonicalEmbeddingRow>(
        `SELECT se.snippet_id, se.embedding
         FROM snippet_embeddings se
         LEFT JOIN ${tables.quotedRowidTableName} rowids ON rowids.snippet_id = se.snippet_id
         WHERE se.profile_id = ?
           AND se.dimensions = ?
           AND rowids.snippet_id IS NULL`
      )
      .all(profileId, tables.dimensions);

    if (missingRows.length > 0) {
      const backfill = this.db.transaction((rows: CanonicalEmbeddingRow[]) => {
        for (const row of rows) {
          // The tables are already resolved and created, and every row was selected
          // with `dimensions = tables.dimensions`, so write directly.
          this.writeEmbedding(
            tables,
            row.snippet_id,
            SqliteVecStore.toFloat32Array(row.embedding, tables.dimensions)
          );
        }
      });
      backfill(missingRows);
    }

    // 4. Count the vectors that are reachable through a mapping row.
    return (
      this.db
        .prepare<[], { count: number }>(
          `SELECT COUNT(*) AS count
           FROM ${tables.quotedVectorTableName} vec
           JOIN ${tables.quotedRowidTableName} rowids ON rowids.rowid = vec.rowid`
        )
        .get()?.count ?? 0
    );
  }

  /** Deletes the given (profile, snippet) pairs from their indexes inside one transaction. */
  private deleteEmbeddingRefs(rows: StoredEmbeddingRef[]): void {
    if (rows.length === 0) {
      return;
    }

    const removeRows = this.db.transaction((refs: StoredEmbeddingRef[]) => {
      // Resolve and create each profile's tables once per batch instead of once per row.
      const tablesByProfile = new Map<string, ProfileStoreTables>();
      for (const ref of refs) {
        let tables = tablesByProfile.get(ref.profile_id);
        if (tables === undefined) {
          tables = this.getProfileStoreTables(ref.profile_id);
          this.createProfileStore(tables);
          tablesByProfile.set(ref.profile_id, tables);
        }
        this.removeEmbedding(tables, ref.snippet_id);
      }
    });
    removeRows(rows);
  }

  /**
   * Resolves the table names and vector width for a profile.
   *
   * Calls `loadSqliteVec` on the connection before anything else.
   *
   * @throws Error if the profile does not exist or the dimensions cannot be resolved.
   */
  private getProfileStoreTables(
    profileId: string,
    preferredDimensions?: number
  ): ProfileStoreTables {
    loadSqliteVec(this.db);

    const dimensionsRow = this.db
      .prepare<[string], ProfileDimensionsRow>(
        'SELECT dimensions FROM embedding_profiles WHERE id = ?'
      )
      .get(profileId);
    if (!dimensionsRow) {
      throw new Error(`Embedding profile not found: ${profileId}`);
    }

    const storedDimensions = this.db
      .prepare<[string], StoredDimensionsRow>(
        `SELECT
           COUNT(*) AS count,
           MIN(dimensions) AS min_dimensions,
           MAX(dimensions) AS max_dimensions
         FROM snippet_embeddings
         WHERE profile_id = ?`
      )
      .get(profileId);

    const effectiveDimensions = this.resolveDimensions(
      profileId,
      dimensionsRow.dimensions,
      storedDimensions,
      preferredDimensions
    );
    const vectorTableName = sqliteVecTableName(profileId);
    const rowidTableName = sqliteVecRowidTableName(profileId);

    return {
      vectorTableName,
      rowidTableName,
      quotedVectorTableName: quoteSqliteIdentifier(vectorTableName),
      quotedRowidTableName: quoteSqliteIdentifier(rowidTableName),
      dimensions: effectiveDimensions
    };
  }

  /**
   * Picks the vector width for a profile.
   *
   * Stored embeddings win: when any exist they must all share one width, and a
   * caller-supplied width has to match it. With nothing stored, the caller's
   * width is used, falling back to the width configured on the profile.
   *
   * @throws Error if stored widths are inconsistent or missing, or differ from `preferredDimensions`.
   */
  private resolveDimensions(
    profileId: string,
    profileDimensions: number,
    storedDimensions: StoredDimensionsRow | undefined,
    preferredDimensions?: number
  ): number {
    if (storedDimensions && storedDimensions.count > 0) {
      if (storedDimensions.min_dimensions !== storedDimensions.max_dimensions) {
        throw new Error(`Stored embedding dimensions are inconsistent for profile ${profileId}`);
      }

      const canonicalDimensions = storedDimensions.min_dimensions;
      if (canonicalDimensions === null) {
        throw new Error(`Stored embedding dimensions are missing for profile ${profileId}`);
      }

      if (preferredDimensions !== undefined && preferredDimensions !== canonicalDimensions) {
        throw new Error(
          `Embedding dimension mismatch for profile ${profileId}: expected ${canonicalDimensions}, received ${preferredDimensions}`
        );
      }
      return canonicalDimensions;
    }

    return preferredDimensions ?? profileDimensions;
  }
}