394 lines
10 KiB
TypeScript
394 lines
10 KiB
TypeScript
import type Database from 'better-sqlite3';
|
|
import {
|
|
loadSqliteVec,
|
|
quoteSqliteIdentifier,
|
|
sqliteVecRowidTableName,
|
|
sqliteVecTableName
|
|
} from '$lib/server/db/sqlite-vec.js';
|
|
|
|
/** Filters and limits accepted by `SqliteVecStore.queryNearestNeighbors`. */
export interface SqliteVecQueryOptions {
	/** Only snippets whose `repository_id` equals this value are returned. */
	repositoryId: string;

	/** When provided, results are additionally restricted to this `version_id`. */
	versionId?: string;

	/** Embedding profile to search; the store uses `'local-default'` when omitted. */
	profileId?: string;

	/** Maximum number of results; the store defaults to 50 and returns nothing for values <= 0. */
	limit?: number;
}
|
|
|
|
/** One nearest-neighbour hit returned by `SqliteVecStore.queryNearestNeighbors`. */
export interface SqliteVecQueryResult {
	/** Identifier of the matched snippet. */
	snippetId: string;

	/** Similarity derived from `distance` as `1 / (1 + distance)`; larger means closer. */
	score: number;

	/** Raw distance reported by the vector table for this hit. */
	distance: number;
}
|
|
|
|
/** Row shape of `SELECT dimensions FROM embedding_profiles WHERE id = ?`. */
interface ProfileDimensionsRow {
	/** Vector length configured on the embedding profile. */
	dimensions: number;
}
|
|
|
|
/** Aggregate over the `snippet_embeddings` rows that belong to one profile. */
interface StoredDimensionsRow {
	/** Number of stored embeddings (`COUNT(*)`). */
	count: number;

	/** Smallest stored `dimensions` value; `null` when the aggregate has no input rows. */
	min_dimensions: number | null;

	/** Largest stored `dimensions` value; `null` when the aggregate has no input rows. */
	max_dimensions: number | null;
}
|
|
|
|
/** Row read from a profile's rowid mapping table when looking a snippet up by id. */
interface SnippetRowidRow {
	/** Rowid shared by the mapping row and the snippet's row in the vector table. */
	rowid: number;
}
|
|
|
|
/** Row produced by the nearest-neighbour SQL query, before mapping to the public result type. */
interface RawKnnRow {
	/** Snippet identifier taken from the rowid mapping table. */
	snippet_id: string;

	/** Distance column reported by the vector table. */
	distance: number;
}
|
|
|
|
/** An embedding as stored in `snippet_embeddings`, used to backfill the vector table. */
interface CanonicalEmbeddingRow {
	/** Snippet the embedding belongs to. */
	snippet_id: string;

	/** Embedding bytes; the store reinterprets them as 32-bit floats. */
	embedding: Buffer;
}
|
|
|
|
/** A (profile, snippet) pair selected from `snippet_embeddings` to identify a vector to delete. */
interface StoredEmbeddingRef {
	/** Embedding profile that owns the vector. */
	profile_id: string;

	/** Snippet whose vector should be removed. */
	snippet_id: string;
}
|
|
|
|
/** Resolved table names and vector length for one embedding profile. */
interface ProfileStoreTables {
	/** Unquoted name of the profile's `vec0` virtual table. */
	vectorTableName: string;

	/** Unquoted name of the profile's rowid-to-snippet mapping table. */
	rowidTableName: string;

	/** `vectorTableName` passed through `quoteSqliteIdentifier`, for interpolation into SQL. */
	quotedVectorTableName: string;

	/** `rowidTableName` passed through `quoteSqliteIdentifier`, for interpolation into SQL. */
	quotedRowidTableName: string;

	/** Vector length used for the profile's `vec0` column. */
	dimensions: number;
}
|
|
|
|
/**
 * Exposes the bytes behind a `Float32Array` as a Node `Buffer` so they can be bound as a BLOB.
 *
 * The Buffer is a view over the same memory (no copy) and honours the array's own
 * offset and length, so sub-array views are handled correctly.
 *
 * @param values - Vector whose raw bytes are needed.
 * @returns A Buffer covering exactly `values.byteLength` bytes.
 */
function toEmbeddingBuffer(values: Float32Array): Buffer {
	const { buffer, byteOffset, byteLength } = values;
	return Buffer.from(buffer, byteOffset, byteLength);
}
|
|
|
|
/**
 * Maps a distance onto a similarity score using `1 / (1 + distance)`.
 *
 * A distance of 0 gives a score of 1, and the score shrinks towards 0 as distance grows.
 *
 * @param distance - Distance reported by the vector search.
 * @returns The derived similarity score.
 */
function distanceToScore(distance: number): number {
	const denominator = 1 + distance;
	return 1 / denominator;
}
|
|
|
|
/**
 * Vector index for snippet embeddings, kept in sqlite-vec `vec0` virtual tables.
 *
 * Every embedding profile gets two tables, named by `sqliteVecTableName` and
 * `sqliteVecRowidTableName`: a `vec0` table that holds the vectors, and a mapping
 * table that links each vector rowid to a `snippets.id`. Before every query the
 * index is reconciled against `snippet_embeddings`, which is treated as the
 * canonical copy of the data.
 */
export class SqliteVecStore {
	/**
	 * Upper bound applied to the `k` constraint of KNN queries.
	 * NOTE(review): sqlite-vec is understood to reject `k` values above 4096 — confirm
	 * against the sqlite-vec version this project ships.
	 */
	private static readonly MAX_KNN_K = 4096;

	/** Number of document ids bound per `IN (...)` query, to stay below SQLite's parameter limit. */
	private static readonly DOCUMENT_ID_BATCH_SIZE = 500;

	constructor(private readonly db: Database.Database) {}

	/**
	 * Creates the profile's mapping table and `vec0` table if they do not exist yet.
	 *
	 * @param profileId - Embedding profile id; must exist in `embedding_profiles`.
	 * @param preferredDimensions - Vector length the caller intends to use, if known.
	 * @returns The vector length resolved for the profile.
	 * @throws Error when the profile is unknown or the dimensions cannot be reconciled.
	 */
	ensureProfileStore(profileId: string, preferredDimensions?: number): number {
		const tables = this.getProfileStoreTables(profileId, preferredDimensions);
		this.createProfileTables(tables);
		return tables.dimensions;
	}

	/**
	 * Inserts the vector for a snippet, or overwrites it when the snippet is already indexed.
	 *
	 * @param profileId - Embedding profile id.
	 * @param snippetId - Snippet the vector belongs to.
	 * @param embedding - Vector to store; its length must agree with the profile's stored vectors.
	 * @throws Error when the profile is unknown or the vector length does not match.
	 */
	upsertEmbedding(profileId: string, snippetId: string, embedding: Float32Array): void {
		// Resolve once and reuse: resolving runs two queries, so doing it again inside
		// ensureProfileStore() would only repeat the same work with the same outcome.
		const tables = this.getProfileStoreTables(profileId, embedding.length);
		this.createProfileTables(tables);
		this.writeEmbedding(tables, snippetId, embedding);
	}

	/**
	 * Same as {@link upsertEmbedding}, but takes the vector as raw float32 bytes.
	 *
	 * @param profileId - Embedding profile id.
	 * @param snippetId - Snippet the vector belongs to.
	 * @param embedding - Bytes of the vector (32-bit floats, platform byte order).
	 * @param dimensions - Number of floats to read; defaults to every whole float in the buffer.
	 * @throws RangeError when `dimensions` is invalid or the buffer holds fewer floats than requested.
	 */
	upsertEmbeddingBuffer(
		profileId: string,
		snippetId: string,
		embedding: Buffer,
		dimensions?: number
	): void {
		const vector = SqliteVecStore.bufferToVector(embedding, dimensions);
		this.upsertEmbedding(profileId, snippetId, vector);
	}

	/**
	 * Removes a snippet's vector and its mapping row; does nothing when the snippet is not indexed.
	 *
	 * @param profileId - Embedding profile id.
	 * @param snippetId - Snippet whose vector should be removed.
	 * @throws Error when the profile is unknown or its stored dimensions are inconsistent.
	 */
	deleteEmbedding(profileId: string, snippetId: string): void {
		const tables = this.getProfileStoreTables(profileId);
		this.createProfileTables(tables);

		const existingRow = this.db
			.prepare<[string], SnippetRowidRow>(
				`SELECT rowid FROM ${tables.quotedRowidTableName} WHERE snippet_id = ?`
			)
			.get(snippetId);
		if (!existingRow) {
			return;
		}

		const { rowid } = existingRow;
		// Both deletes succeed or fail together, so no half-removed entry is left behind.
		// better-sqlite3 transactions nest, so this is safe inside deleteEmbeddingRefs().
		const removeEntry = this.db.transaction(() => {
			this.db
				.prepare<[number]>(`DELETE FROM ${tables.quotedVectorTableName} WHERE rowid = ?`)
				.run(rowid);
			this.db
				.prepare<[string]>(`DELETE FROM ${tables.quotedRowidTableName} WHERE snippet_id = ?`)
				.run(snippetId);
		});
		removeEntry();
	}

	/**
	 * Removes the vectors of every snippet that belongs to one of the given documents.
	 *
	 * @param documentIds - Document ids to purge; an empty list is a no-op.
	 */
	deleteEmbeddingsForDocumentIds(documentIds: string[]): void {
		if (documentIds.length === 0) {
			return;
		}

		// Look the ids up in batches so a long list cannot exceed SQLite's bound-parameter
		// limit, then delete everything in one transaction as before.
		const refs: StoredEmbeddingRef[] = [];
		const batchSize = SqliteVecStore.DOCUMENT_ID_BATCH_SIZE;
		for (let start = 0; start < documentIds.length; start += batchSize) {
			const batch = documentIds.slice(start, start + batchSize);
			const placeholders = batch.map(() => '?').join(', ');
			const rows = this.db
				.prepare<unknown[], StoredEmbeddingRef>(
					`SELECT DISTINCT se.profile_id, se.snippet_id
					 FROM snippet_embeddings se
					 INNER JOIN snippets s ON s.id = se.snippet_id
					 WHERE s.document_id IN (${placeholders})`
				)
				.all(...batch);
			for (const row of rows) {
				refs.push(row);
			}
		}

		this.deleteEmbeddingRefs(refs);
	}

	/**
	 * Removes the vectors of every snippet in a repository.
	 *
	 * @param repositoryId - Repository whose vectors should be purged.
	 */
	deleteEmbeddingsForRepository(repositoryId: string): void {
		const rows = this.db
			.prepare<[string], StoredEmbeddingRef>(
				`SELECT DISTINCT se.profile_id, se.snippet_id
				 FROM snippet_embeddings se
				 INNER JOIN snippets s ON s.id = se.snippet_id
				 WHERE s.repository_id = ?`
			)
			.all(repositoryId);

		this.deleteEmbeddingRefs(rows);
	}

	/**
	 * Removes the vectors of every snippet in one version of a repository.
	 *
	 * @param repositoryId - Repository that owns the version.
	 * @param versionId - Version whose vectors should be purged.
	 */
	deleteEmbeddingsForVersion(repositoryId: string, versionId: string): void {
		const rows = this.db
			.prepare<[string, string], StoredEmbeddingRef>(
				`SELECT DISTINCT se.profile_id, se.snippet_id
				 FROM snippet_embeddings se
				 INNER JOIN snippets s ON s.id = se.snippet_id
				 WHERE s.repository_id = ? AND s.version_id = ?`
			)
			.all(repositoryId, versionId);

		this.deleteEmbeddingRefs(rows);
	}

	/**
	 * Returns the snippets closest to `queryEmbedding`, nearest first.
	 *
	 * The profile's index is created if needed and reconciled with `snippet_embeddings`
	 * before the search runs.
	 *
	 * @param queryEmbedding - Query vector; its length must match the profile's stored vectors.
	 * @param options - Repository/version filters, profile id (default `'local-default'`)
	 *   and result limit (default 50).
	 * @returns At most `limit` hits ordered by ascending distance; empty when `limit <= 0`
	 *   or the index holds no vectors.
	 * @throws Error when the profile is unknown or the query vector length does not match.
	 */
	queryNearestNeighbors(
		queryEmbedding: Float32Array,
		options: SqliteVecQueryOptions
	): SqliteVecQueryResult[] {
		const { repositoryId, versionId, profileId = 'local-default', limit = 50 } = options;
		if (limit <= 0) {
			return [];
		}

		const tables = this.getProfileStoreTables(profileId, queryEmbedding.length);
		this.createProfileTables(tables);

		const totalRows = this.synchronizeProfileStore(profileId, tables);
		if (totalRows === 0) {
			return [];
		}

		// k spans the whole index, presumably so the repository/version filters on the
		// joined rows cannot starve the result set. It is capped because sqlite-vec
		// rejects oversized k values; beyond the cap only the globally nearest
		// MAX_KNN_K vectors are considered before filtering.
		const k = Math.min(totalRows, SqliteVecStore.MAX_KNN_K);

		let sql = `
			SELECT rowids.snippet_id, vec.distance
			FROM ${tables.quotedVectorTableName} vec
			JOIN ${tables.quotedRowidTableName} rowids ON rowids.rowid = vec.rowid
			JOIN snippets s ON s.id = rowids.snippet_id
			WHERE vec.embedding MATCH ?
				AND vec.k = ?
				AND s.repository_id = ?
		`;
		const params: unknown[] = [toEmbeddingBuffer(queryEmbedding), k, repositoryId];

		if (versionId !== undefined) {
			sql += ' AND s.version_id = ?';
			params.push(versionId);
		}

		sql += ' ORDER BY vec.distance ASC LIMIT ?';
		params.push(limit);

		const rows = this.db.prepare<unknown[], RawKnnRow>(sql).all(...params);
		return rows.map((row) => ({
			snippetId: row.snippet_id,
			score: distanceToScore(row.distance),
			distance: row.distance
		}));
	}

	/**
	 * Reconciles the profile's index with `snippet_embeddings`:
	 * drops mapping rows without a canonical embedding or without a vector, drops
	 * vectors without a mapping row, then backfills embeddings that are not indexed yet.
	 *
	 * @returns Number of vectors that have a mapping row after reconciliation.
	 */
	private synchronizeProfileStore(profileId: string, tables: ProfileStoreTables): number {
		// 1. Mapping rows whose embedding is gone (or has other dimensions), or whose vector is missing.
		this.db
			.prepare<[string, number]>(
				`DELETE FROM ${tables.quotedRowidTableName}
				 WHERE rowid IN (
					SELECT rowids.rowid
					FROM ${tables.quotedRowidTableName} rowids
					LEFT JOIN snippet_embeddings se
						ON se.snippet_id = rowids.snippet_id
						AND se.profile_id = ?
						AND se.dimensions = ?
					LEFT JOIN ${tables.quotedVectorTableName} vec ON vec.rowid = rowids.rowid
					WHERE se.snippet_id IS NULL OR vec.rowid IS NULL
				 )`
			)
			.run(profileId, tables.dimensions);

		// 2. Vectors that no mapping row points at any more.
		this.db
			.prepare(
				`DELETE FROM ${tables.quotedVectorTableName}
				 WHERE rowid NOT IN (SELECT rowid FROM ${tables.quotedRowidTableName})`
			)
			.run();

		// 3. Canonical embeddings that are not in the index yet.
		const missingRows = this.db
			.prepare<[string, number], CanonicalEmbeddingRow>(
				`SELECT se.snippet_id, se.embedding
				 FROM snippet_embeddings se
				 LEFT JOIN ${tables.quotedRowidTableName} rowids ON rowids.snippet_id = se.snippet_id
				 WHERE se.profile_id = ?
					AND se.dimensions = ?
					AND rowids.snippet_id IS NULL`
			)
			.all(profileId, tables.dimensions);

		if (missingRows.length > 0) {
			const backfill = this.db.transaction((rows: CanonicalEmbeddingRow[]) => {
				for (const row of rows) {
					// Tables and dimensions are already resolved for this profile, so write
					// directly instead of re-resolving them for every row.
					this.writeEmbedding(
						tables,
						row.snippet_id,
						SqliteVecStore.bufferToVector(row.embedding, tables.dimensions)
					);
				}
			});
			backfill(missingRows);
		}

		return (
			this.db
				.prepare<[], { count: number }>(
					`SELECT COUNT(*) AS count
					 FROM ${tables.quotedVectorTableName} vec
					 JOIN ${tables.quotedRowidTableName} rowids ON rowids.rowid = vec.rowid`
				)
				.get()?.count ?? 0
		);
	}

	/** Deletes every referenced vector inside a single transaction; a no-op for an empty list. */
	private deleteEmbeddingRefs(rows: StoredEmbeddingRef[]): void {
		if (rows.length === 0) {
			return;
		}

		const removeRows = this.db.transaction((refs: StoredEmbeddingRef[]) => {
			for (const ref of refs) {
				this.deleteEmbedding(ref.profile_id, ref.snippet_id);
			}
		});

		removeRows(rows);
	}

	/** Issues the `CREATE ... IF NOT EXISTS` statements for a profile's mapping and `vec0` tables. */
	private createProfileTables(tables: ProfileStoreTables): void {
		this.db.exec(`
			CREATE TABLE IF NOT EXISTS ${tables.quotedRowidTableName} (
				rowid INTEGER PRIMARY KEY,
				snippet_id TEXT NOT NULL UNIQUE REFERENCES snippets(id) ON DELETE CASCADE
			);
		`);
		this.db.exec(`
			CREATE VIRTUAL TABLE IF NOT EXISTS ${tables.quotedVectorTableName}
			USING vec0(embedding float[${tables.dimensions}]);
		`);
	}

	/**
	 * Stores `embedding` for `snippetId` in tables that are already resolved and created.
	 * Updates the vector in place when a mapping row exists; otherwise inserts the vector
	 * and its mapping row as one atomic step.
	 */
	private writeEmbedding(
		tables: ProfileStoreTables,
		snippetId: string,
		embedding: Float32Array
	): void {
		const existingRow = this.db
			.prepare<[string], SnippetRowidRow>(
				`SELECT rowid FROM ${tables.quotedRowidTableName} WHERE snippet_id = ?`
			)
			.get(snippetId);

		const embeddingBuffer = toEmbeddingBuffer(embedding);
		if (existingRow) {
			this.db
				.prepare<[Buffer, number]>(
					`UPDATE ${tables.quotedVectorTableName} SET embedding = ? WHERE rowid = ?`
				)
				.run(embeddingBuffer, existingRow.rowid);
			return;
		}

		// If the mapping insert fails (for example on a constraint), the vector insert is
		// rolled back too, so no orphan vector remains.
		const insertEntry = this.db.transaction(() => {
			const insertResult = this.db
				.prepare<[Buffer]>(`INSERT INTO ${tables.quotedVectorTableName} (embedding) VALUES (?)`)
				.run(embeddingBuffer);
			this.db
				.prepare<[number, string]>(
					`INSERT INTO ${tables.quotedRowidTableName} (rowid, snippet_id) VALUES (?, ?)`
				)
				.run(Number(insertResult.lastInsertRowid), snippetId);
		});
		insertEntry();
	}

	/**
	 * Reinterprets raw bytes as a `Float32Array` of `dimensions` elements.
	 *
	 * A typed-array view requires a byte offset that is a multiple of 4, which a sliced
	 * Buffer does not guarantee; in that case the bytes are copied to an aligned buffer.
	 * The requested length is checked against the Buffer itself so the view can never
	 * extend into unrelated bytes of a larger backing ArrayBuffer.
	 *
	 * @throws RangeError when `dimensions` is not a non-negative integer or exceeds the data.
	 */
	private static bufferToVector(bytes: Buffer, dimensions?: number): Float32Array {
		const elementSize = Float32Array.BYTES_PER_ELEMENT;
		const length = dimensions ?? Math.floor(bytes.byteLength / elementSize);
		if (!Number.isInteger(length) || length < 0) {
			throw new RangeError(`Invalid embedding dimensions: ${length}`);
		}

		const requiredBytes = length * elementSize;
		if (requiredBytes > bytes.byteLength) {
			throw new RangeError(
				`Embedding buffer too small: expected at least ${requiredBytes} bytes, received ${bytes.byteLength}`
			);
		}

		if (bytes.byteOffset % elementSize === 0) {
			return new Float32Array(bytes.buffer, bytes.byteOffset, length);
		}

		const aligned = bytes.buffer.slice(bytes.byteOffset, bytes.byteOffset + requiredBytes);
		return new Float32Array(aligned);
	}

	/**
	 * Loads sqlite-vec, then resolves the profile's table names and effective vector length.
	 *
	 * @param profileId - Embedding profile id.
	 * @param preferredDimensions - Vector length the caller intends to use, if known.
	 * @throws Error when the profile does not exist or the dimensions cannot be reconciled.
	 */
	private getProfileStoreTables(
		profileId: string,
		preferredDimensions?: number
	): ProfileStoreTables {
		loadSqliteVec(this.db);

		const dimensionsRow = this.db
			.prepare<[string], ProfileDimensionsRow>(
				'SELECT dimensions FROM embedding_profiles WHERE id = ?'
			)
			.get(profileId);
		if (!dimensionsRow) {
			throw new Error(`Embedding profile not found: ${profileId}`);
		}

		const storedDimensions = this.db
			.prepare<[string], StoredDimensionsRow>(
				`SELECT
					COUNT(*) AS count,
					MIN(dimensions) AS min_dimensions,
					MAX(dimensions) AS max_dimensions
				 FROM snippet_embeddings
				 WHERE profile_id = ?`
			)
			.get(profileId);

		const effectiveDimensions = this.resolveDimensions(
			profileId,
			dimensionsRow.dimensions,
			storedDimensions,
			preferredDimensions
		);

		const vectorTableName = sqliteVecTableName(profileId);
		const rowidTableName = sqliteVecRowidTableName(profileId);

		return {
			vectorTableName,
			rowidTableName,
			quotedVectorTableName: quoteSqliteIdentifier(vectorTableName),
			quotedRowidTableName: quoteSqliteIdentifier(rowidTableName),
			dimensions: effectiveDimensions
		};
	}

	/**
	 * Picks the vector length to use for a profile.
	 *
	 * Stored embeddings win: when any exist they must all share one length, and a
	 * caller-supplied length must equal it. With nothing stored, the caller's length is
	 * used, falling back to the length configured on the profile.
	 *
	 * @throws Error when stored lengths disagree, are missing, or differ from `preferredDimensions`.
	 */
	private resolveDimensions(
		profileId: string,
		profileDimensions: number,
		storedDimensions: StoredDimensionsRow | undefined,
		preferredDimensions?: number
	): number {
		if (storedDimensions && storedDimensions.count > 0) {
			if (storedDimensions.min_dimensions !== storedDimensions.max_dimensions) {
				throw new Error(`Stored embedding dimensions are inconsistent for profile ${profileId}`);
			}

			const canonicalDimensions = storedDimensions.min_dimensions;
			if (canonicalDimensions === null) {
				throw new Error(`Stored embedding dimensions are missing for profile ${profileId}`);
			}

			if (preferredDimensions !== undefined && preferredDimensions !== canonicalDimensions) {
				throw new Error(
					`Embedding dimension mismatch for profile ${profileId}: expected ${canonicalDimensions}, received ${preferredDimensions}`
				);
			}

			return canonicalDimensions;
		}

		return preferredDimensions ?? profileDimensions;
	}
}