feat(TRUEREF-0023): add sqlite-vec search pipeline
This commit is contained in:
394
src/lib/server/search/sqlite-vec.store.ts
Normal file
394
src/lib/server/search/sqlite-vec.store.ts
Normal file
@@ -0,0 +1,394 @@
|
||||
import type Database from 'better-sqlite3';
|
||||
import {
|
||||
loadSqliteVec,
|
||||
quoteSqliteIdentifier,
|
||||
sqliteVecRowidTableName,
|
||||
sqliteVecTableName
|
||||
} from '$lib/server/db/sqlite-vec.js';
|
||||
|
||||
/**
 * Filters and limits accepted by `SqliteVecStore.queryNearestNeighbors`.
 */
export interface SqliteVecQueryOptions {
  /** Only snippets whose `repository_id` equals this value are returned. */
  repositoryId: string;
  /** When provided, results are additionally restricted to this `version_id`. */
  versionId?: string;
  /** Embedding profile to search; the store falls back to `'local-default'` when omitted. */
  profileId?: string;
  /** Maximum number of rows to return; the store uses 50 when omitted and returns nothing for values <= 0. */
  limit?: number;
}
|
||||
|
||||
/**
 * One nearest-neighbour hit produced by `SqliteVecStore.queryNearestNeighbors`.
 */
export interface SqliteVecQueryResult {
  /** Identifier of the matched snippet. */
  snippetId: string;
  /** Similarity derived from `distance` as `1 / (1 + distance)`; larger means closer. */
  score: number;
  /** Raw distance value reported by the vector table for this row. */
  distance: number;
}
|
||||
|
||||
/** Row shape of `SELECT dimensions FROM embedding_profiles WHERE id = ?`. */
interface ProfileDimensionsRow {
  /** Vector dimensionality configured on the embedding profile. */
  dimensions: number;
}
|
||||
|
||||
/**
 * Aggregate over the `snippet_embeddings` rows of a single profile, used to
 * detect which dimensionality is actually stored.
 */
interface StoredDimensionsRow {
  /** Number of stored embeddings for the profile. */
  count: number;
  /** Smallest stored `dimensions` value; `null` when the aggregate produced no value. */
  min_dimensions: number | null;
  /** Largest stored `dimensions` value; `null` when the aggregate produced no value. */
  max_dimensions: number | null;
}
|
||||
|
||||
/** Row shape returned when looking up a snippet in a profile's rowid mapping table. */
interface SnippetRowidRow {
  /** Rowid shared by the mapping row and the matching vector-table row. */
  rowid: number;
}
|
||||
|
||||
/** Untransformed row produced by the nearest-neighbour SQL query. */
interface RawKnnRow {
  /** Snippet identifier resolved through the rowid mapping table. */
  snippet_id: string;
  /** Distance column reported by the vector table. */
  distance: number;
}
|
||||
|
||||
/** Embedding row read from `snippet_embeddings` when backfilling the vector table. */
interface CanonicalEmbeddingRow {
  /** Snippet the embedding belongs to. */
  snippet_id: string;
  /** Stored embedding bytes; the store reinterprets them as 32-bit floats. */
  embedding: Buffer;
}
|
||||
|
||||
/** (profile, snippet) pair identifying one stored embedding that should be removed. */
interface StoredEmbeddingRef {
  /** Embedding profile that owns the vector. */
  profile_id: string;
  /** Snippet the vector was computed for. */
  snippet_id: string;
}
|
||||
|
||||
/** Resolved table names and dimensionality for one embedding profile's vector store. */
interface ProfileStoreTables {
  /** Unquoted name of the profile's vector table. */
  vectorTableName: string;
  /** Unquoted name of the profile's rowid-to-snippet mapping table. */
  rowidTableName: string;
  /** `vectorTableName` quoted for direct interpolation into SQL. */
  quotedVectorTableName: string;
  /** `rowidTableName` quoted for direct interpolation into SQL. */
  quotedRowidTableName: string;
  /** Effective vector dimensionality used for this profile. */
  dimensions: number;
}
|
||||
|
||||
/**
 * Exposes the bytes of a `Float32Array` as a Node `Buffer` without copying.
 *
 * The returned buffer is a view over exactly the region covered by `values`
 * (its `byteOffset`/`byteLength` are honoured), so it shares memory with it.
 *
 * @param values - Vector whose raw bytes should be bound to a SQL statement.
 * @returns A `Buffer` aliasing the same bytes as `values`.
 */
function toEmbeddingBuffer(values: Float32Array): Buffer {
  const { buffer, byteOffset, byteLength } = values;
  return Buffer.from(buffer, byteOffset, byteLength);
}
|
||||
|
||||
/**
 * Maps a distance onto a similarity score via `1 / (1 + distance)`.
 *
 * A distance of 0 yields 1, and the score shrinks towards 0 as the distance grows.
 *
 * @param distance - Distance value reported by the vector search.
 * @returns The derived similarity score.
 */
function distanceToScore(distance: number): number {
  const denominator = 1 + distance;
  return 1 / denominator;
}
|
||||
|
||||
/**
 * Per-profile vector index stored in sqlite-vec `vec0` virtual tables.
 *
 * Each embedding profile owns two tables: a `vec0` virtual table that holds the
 * vectors, and a plain mapping table that links each vector rowid to a snippet
 * id. `snippet_embeddings` is treated as the canonical source; the vector
 * tables are reconciled against it before every nearest-neighbour query.
 */
export class SqliteVecStore {
  /**
   * Largest value passed as the KNN `k` constraint.
   *
   * NOTE(review): sqlite-vec rejects KNN queries whose `k` exceeds its built-in
   * ceiling (4096 in current releases) — confirm against the bundled build.
   */
  private static readonly MAX_KNN_K = 4096;

  constructor(private readonly db: Database.Database) {}

  /**
   * Creates the vector and mapping tables for a profile when they do not exist.
   *
   * @param profileId - Embedding profile whose store should exist.
   * @param preferredDimensions - Dimensionality to use when no embeddings are stored yet.
   * @returns The effective dimensionality of the profile's store.
   * @throws Error when the profile is unknown or the dimensions conflict with stored data.
   */
  ensureProfileStore(profileId: string, preferredDimensions?: number): number {
    const tables = this.getProfileStoreTables(profileId, preferredDimensions);
    this.createTablesIfMissing(tables);
    return tables.dimensions;
  }

  /**
   * Inserts or replaces the vector stored for a snippet.
   *
   * The vector row and its mapping row are written in one transaction so a
   * failure cannot leave a vector without a mapping.
   *
   * @param profileId - Embedding profile the vector belongs to.
   * @param snippetId - Snippet the vector was computed for.
   * @param embedding - Vector values; its length must match the profile's stored dimensionality.
   */
  upsertEmbedding(profileId: string, snippetId: string, embedding: Float32Array): void {
    const tables = this.getProfileStoreTables(profileId, embedding.length);
    this.createTablesIfMissing(tables);

    const write = this.db.transaction(() => {
      this.writeEmbedding(tables, snippetId, embedding);
    });
    write();
  }

  /**
   * Same as {@link upsertEmbedding}, but accepts the raw float32 bytes.
   *
   * @param profileId - Embedding profile the vector belongs to.
   * @param snippetId - Snippet the vector was computed for.
   * @param embedding - Raw bytes of consecutive 32-bit floats.
   * @param dimensions - Number of floats to read; defaults to every whole float in `embedding`.
   * @throws RangeError when `embedding` holds fewer bytes than `dimensions` floats require.
   */
  upsertEmbeddingBuffer(
    profileId: string,
    snippetId: string,
    embedding: Buffer,
    dimensions?: number
  ): void {
    const length =
      dimensions ?? Math.floor(embedding.byteLength / Float32Array.BYTES_PER_ELEMENT);
    this.upsertEmbedding(profileId, snippetId, SqliteVecStore.toFloat32Array(embedding, length));
  }

  /**
   * Removes the vector stored for a snippet, if any.
   *
   * @param profileId - Embedding profile the vector belongs to.
   * @param snippetId - Snippet whose vector should be removed.
   */
  deleteEmbedding(profileId: string, snippetId: string): void {
    const tables = this.getProfileStoreTables(profileId);
    this.createTablesIfMissing(tables);

    const remove = this.db.transaction(() => {
      this.removeEmbedding(tables, snippetId);
    });
    remove();
  }

  /**
   * Removes the vectors of every snippet that belongs to one of the given documents.
   *
   * @param documentIds - Document identifiers; an empty list is a no-op.
   */
  deleteEmbeddingsForDocumentIds(documentIds: string[]): void {
    if (documentIds.length === 0) {
      return;
    }

    const placeholders = documentIds.map(() => '?').join(', ');
    const refs = this.db
      .prepare<unknown[], StoredEmbeddingRef>(
        `SELECT DISTINCT se.profile_id, se.snippet_id
         FROM snippet_embeddings se
         INNER JOIN snippets s ON s.id = se.snippet_id
         WHERE s.document_id IN (${placeholders})`
      )
      .all(...documentIds);

    this.deleteEmbeddingRefs(refs);
  }

  /**
   * Removes the vectors of every snippet in a repository.
   *
   * @param repositoryId - Repository whose vectors should be removed.
   */
  deleteEmbeddingsForRepository(repositoryId: string): void {
    const refs = this.db
      .prepare<[string], StoredEmbeddingRef>(
        `SELECT DISTINCT se.profile_id, se.snippet_id
         FROM snippet_embeddings se
         INNER JOIN snippets s ON s.id = se.snippet_id
         WHERE s.repository_id = ?`
      )
      .all(repositoryId);

    this.deleteEmbeddingRefs(refs);
  }

  /**
   * Removes the vectors of every snippet in one version of a repository.
   *
   * @param repositoryId - Repository that owns the version.
   * @param versionId - Version whose vectors should be removed.
   */
  deleteEmbeddingsForVersion(repositoryId: string, versionId: string): void {
    const refs = this.db
      .prepare<[string, string], StoredEmbeddingRef>(
        `SELECT DISTINCT se.profile_id, se.snippet_id
         FROM snippet_embeddings se
         INNER JOIN snippets s ON s.id = se.snippet_id
         WHERE s.repository_id = ? AND s.version_id = ?`
      )
      .all(repositoryId, versionId);

    this.deleteEmbeddingRefs(refs);
  }

  /**
   * Returns the snippets closest to `queryEmbedding`, nearest first.
   *
   * The profile's vector tables are reconciled with `snippet_embeddings` before
   * searching. Repository/version filters are applied to the candidates the KNN
   * search returns, so the search asks for as many candidates as the store
   * holds, capped at the `k` ceiling; beyond that ceiling fewer than `limit`
   * rows may be returned.
   *
   * @param queryEmbedding - Query vector; its length must match the profile's dimensionality.
   * @param options - Repository/version/profile filters and the result limit.
   * @returns Matching snippets with their distance and derived score.
   */
  queryNearestNeighbors(
    queryEmbedding: Float32Array,
    options: SqliteVecQueryOptions
  ): SqliteVecQueryResult[] {
    const { repositoryId, versionId, profileId = 'local-default', limit = 50 } = options;
    if (limit <= 0) {
      return [];
    }

    const tables = this.getProfileStoreTables(profileId, queryEmbedding.length);
    this.createTablesIfMissing(tables);

    const totalRows = this.synchronizeProfileStore(profileId, tables);
    if (totalRows === 0) {
      return [];
    }

    // Asking for more than the ceiling makes the KNN query fail outright.
    const k = Math.min(totalRows, SqliteVecStore.MAX_KNN_K);

    const conditions = ['vec.embedding MATCH ?', 'vec.k = ?', 's.repository_id = ?'];
    const params: unknown[] = [toEmbeddingBuffer(queryEmbedding), k, repositoryId];
    if (versionId !== undefined) {
      conditions.push('s.version_id = ?');
      params.push(versionId);
    }
    params.push(limit);

    const sql = `
      SELECT rowids.snippet_id, vec.distance
      FROM ${tables.quotedVectorTableName} vec
      JOIN ${tables.quotedRowidTableName} rowids ON rowids.rowid = vec.rowid
      JOIN snippets s ON s.id = rowids.snippet_id
      WHERE ${conditions.join(' AND ')}
      ORDER BY vec.distance ASC LIMIT ?`;

    const rows = this.db.prepare<unknown[], RawKnnRow>(sql).all(...params);
    return rows.map((row) => ({
      snippetId: row.snippet_id,
      score: distanceToScore(row.distance),
      distance: row.distance
    }));
  }

  /**
   * Reconciles a profile's vector tables with `snippet_embeddings`.
   *
   * Drops mapping rows without a canonical embedding or vector, drops vectors
   * without a mapping, then backfills canonical embeddings that have no
   * mapping. All steps run in one transaction so a failure leaves the tables
   * as they were.
   *
   * @returns The number of vectors that have a mapping row after reconciliation.
   */
  private synchronizeProfileStore(profileId: string, tables: ProfileStoreTables): number {
    const reconcile = this.db.transaction(() => {
      this.db
        .prepare<[string, number]>(
          `DELETE FROM ${tables.quotedRowidTableName}
           WHERE rowid IN (
             SELECT rowids.rowid
             FROM ${tables.quotedRowidTableName} rowids
             LEFT JOIN snippet_embeddings se
               ON se.snippet_id = rowids.snippet_id
              AND se.profile_id = ?
              AND se.dimensions = ?
             LEFT JOIN ${tables.quotedVectorTableName} vec ON vec.rowid = rowids.rowid
             WHERE se.snippet_id IS NULL OR vec.rowid IS NULL
           )`
        )
        .run(profileId, tables.dimensions);

      this.db
        .prepare(
          `DELETE FROM ${tables.quotedVectorTableName}
           WHERE rowid NOT IN (SELECT rowid FROM ${tables.quotedRowidTableName})`
        )
        .run();

      const missingRows = this.db
        .prepare<[string, number], CanonicalEmbeddingRow>(
          `SELECT se.snippet_id, se.embedding
           FROM snippet_embeddings se
           LEFT JOIN ${tables.quotedRowidTableName} rowids ON rowids.snippet_id = se.snippet_id
           WHERE se.profile_id = ?
             AND se.dimensions = ?
             AND rowids.snippet_id IS NULL`
        )
        .all(profileId, tables.dimensions);

      for (const row of missingRows) {
        this.writeEmbedding(
          tables,
          row.snippet_id,
          SqliteVecStore.toFloat32Array(row.embedding, tables.dimensions)
        );
      }
    });
    reconcile();

    const countRow = this.db
      .prepare<[], { count: number }>(
        `SELECT COUNT(*) AS count
         FROM ${tables.quotedVectorTableName} vec
         JOIN ${tables.quotedRowidTableName} rowids ON rowids.rowid = vec.rowid`
      )
      .get();
    return countRow?.count ?? 0;
  }

  /**
   * Removes every referenced vector inside a single transaction.
   *
   * Table resolution is cached per profile because it scans
   * `snippet_embeddings` and the result cannot change while only vector
   * tables are being modified.
   */
  private deleteEmbeddingRefs(rows: StoredEmbeddingRef[]): void {
    if (rows.length === 0) {
      return;
    }

    const removeRows = this.db.transaction((refs: StoredEmbeddingRef[]) => {
      const tablesByProfile = new Map<string, ProfileStoreTables>();
      for (const ref of refs) {
        let tables = tablesByProfile.get(ref.profile_id);
        if (tables === undefined) {
          tables = this.getProfileStoreTables(ref.profile_id);
          this.createTablesIfMissing(tables);
          tablesByProfile.set(ref.profile_id, tables);
        }
        this.removeEmbedding(tables, ref.snippet_id);
      }
    });

    removeRows(rows);
  }

  /** Issues the idempotent DDL for a profile's mapping table and `vec0` table. */
  private createTablesIfMissing(tables: ProfileStoreTables): void {
    this.db.exec(`
      CREATE TABLE IF NOT EXISTS ${tables.quotedRowidTableName} (
        rowid INTEGER PRIMARY KEY,
        snippet_id TEXT NOT NULL UNIQUE REFERENCES snippets(id) ON DELETE CASCADE
      );
    `);
    this.db.exec(`
      CREATE VIRTUAL TABLE IF NOT EXISTS ${tables.quotedVectorTableName}
      USING vec0(embedding float[${tables.dimensions}]);
    `);
  }

  /** Updates the snippet's existing vector, or inserts a vector plus its mapping row. */
  private writeEmbedding(
    tables: ProfileStoreTables,
    snippetId: string,
    embedding: Float32Array
  ): void {
    const existingRow = this.db
      .prepare<[string], SnippetRowidRow>(
        `SELECT rowid FROM ${tables.quotedRowidTableName} WHERE snippet_id = ?`
      )
      .get(snippetId);
    const embeddingBuffer = toEmbeddingBuffer(embedding);

    if (existingRow) {
      this.db
        .prepare<[Buffer, number]>(
          `UPDATE ${tables.quotedVectorTableName} SET embedding = ? WHERE rowid = ?`
        )
        .run(embeddingBuffer, existingRow.rowid);
      return;
    }

    const insertResult = this.db
      .prepare<[Buffer]>(`INSERT INTO ${tables.quotedVectorTableName} (embedding) VALUES (?)`)
      .run(embeddingBuffer);
    // The mapping row reuses the rowid the vector table assigned to the new vector.
    this.db
      .prepare<[number, string]>(
        `INSERT INTO ${tables.quotedRowidTableName} (rowid, snippet_id) VALUES (?, ?)`
      )
      .run(Number(insertResult.lastInsertRowid), snippetId);
  }

  /** Deletes the snippet's vector and mapping row; does nothing when no mapping exists. */
  private removeEmbedding(tables: ProfileStoreTables, snippetId: string): void {
    const existingRow = this.db
      .prepare<[string], SnippetRowidRow>(
        `SELECT rowid FROM ${tables.quotedRowidTableName} WHERE snippet_id = ?`
      )
      .get(snippetId);
    if (!existingRow) {
      return;
    }

    this.db
      .prepare<[number]>(`DELETE FROM ${tables.quotedVectorTableName} WHERE rowid = ?`)
      .run(existingRow.rowid);
    this.db
      .prepare<[string]>(`DELETE FROM ${tables.quotedRowidTableName} WHERE snippet_id = ?`)
      .run(snippetId);
  }

  /**
   * Reinterprets the first `length` 32-bit floats of `bytes` as a `Float32Array`.
   *
   * A `Float32Array` view requires a byte offset that is a multiple of 4, which
   * a `Buffer` does not guarantee, so misaligned input is copied into a fresh
   * allocation. Aligned input is viewed in place.
   *
   * @throws RangeError when `bytes` is too short to hold `length` floats.
   */
  private static toFloat32Array(bytes: Buffer, length: number): Float32Array {
    const requiredBytes = length * Float32Array.BYTES_PER_ELEMENT;
    if (requiredBytes > bytes.byteLength) {
      // Without this check a view could extend past the Buffer into unrelated
      // bytes of a larger underlying ArrayBuffer.
      throw new RangeError(
        `Embedding buffer too small: expected at least ${requiredBytes} bytes, received ${bytes.byteLength}`
      );
    }

    if (bytes.byteOffset % Float32Array.BYTES_PER_ELEMENT === 0) {
      return new Float32Array(bytes.buffer, bytes.byteOffset, length);
    }

    const aligned = new Uint8Array(requiredBytes);
    aligned.set(bytes.subarray(0, requiredBytes));
    return new Float32Array(aligned.buffer, 0, length);
  }

  /**
   * Resolves the table names and effective dimensionality for a profile.
   *
   * @throws Error when the profile does not exist or its dimensions cannot be resolved.
   */
  private getProfileStoreTables(
    profileId: string,
    preferredDimensions?: number
  ): ProfileStoreTables {
    loadSqliteVec(this.db);

    const dimensionsRow = this.db
      .prepare<[string], ProfileDimensionsRow>(
        'SELECT dimensions FROM embedding_profiles WHERE id = ?'
      )
      .get(profileId);
    if (!dimensionsRow) {
      throw new Error(`Embedding profile not found: ${profileId}`);
    }

    const storedDimensions = this.db
      .prepare<[string], StoredDimensionsRow>(
        `SELECT
           COUNT(*) AS count,
           MIN(dimensions) AS min_dimensions,
           MAX(dimensions) AS max_dimensions
         FROM snippet_embeddings
         WHERE profile_id = ?`
      )
      .get(profileId);

    const dimensions = this.resolveDimensions(
      profileId,
      dimensionsRow.dimensions,
      storedDimensions,
      preferredDimensions
    );

    const vectorTableName = sqliteVecTableName(profileId);
    const rowidTableName = sqliteVecRowidTableName(profileId);

    return {
      vectorTableName,
      rowidTableName,
      quotedVectorTableName: quoteSqliteIdentifier(vectorTableName),
      quotedRowidTableName: quoteSqliteIdentifier(rowidTableName),
      dimensions
    };
  }

  /**
   * Picks the dimensionality to use for a profile.
   *
   * Stored embeddings win: when any exist they must all share one
   * dimensionality, and a caller-supplied preference must agree with it.
   * Without stored embeddings the preference is used, falling back to the
   * dimensionality configured on the profile.
   *
   * @throws Error when stored dimensions are inconsistent or missing, or when
   *   `preferredDimensions` disagrees with the stored dimensionality.
   */
  private resolveDimensions(
    profileId: string,
    profileDimensions: number,
    storedDimensions: StoredDimensionsRow | undefined,
    preferredDimensions?: number
  ): number {
    if (!storedDimensions || storedDimensions.count <= 0) {
      return preferredDimensions ?? profileDimensions;
    }

    if (storedDimensions.min_dimensions !== storedDimensions.max_dimensions) {
      throw new Error(`Stored embedding dimensions are inconsistent for profile ${profileId}`);
    }

    const canonicalDimensions = storedDimensions.min_dimensions;
    if (canonicalDimensions === null) {
      throw new Error(`Stored embedding dimensions are missing for profile ${profileId}`);
    }

    if (preferredDimensions !== undefined && preferredDimensions !== canonicalDimensions) {
      throw new Error(
        `Embedding dimension mismatch for profile ${profileId}: expected ${canonicalDimensions}, received ${preferredDimensions}`
      );
    }

    return canonicalDimensions;
  }
}
|
||||
Reference in New Issue
Block a user