feat(TRUEREF-0023): add sqlite-vec search pipeline

This commit is contained in:
Giancarmine Salucci
2026-04-01 14:09:19 +02:00
parent 0752636847
commit 9525c58e9a
45 changed files with 4009 additions and 614 deletions

View File

@@ -1,16 +1,12 @@
/**
* Vector similarity search over stored snippet embeddings.
*
* SQLite does not natively support vector operations, so cosine similarity is
* computed in JavaScript after loading candidate embeddings from the
* snippet_embeddings table.
*
* Performance note: For repositories with > 50k snippets, pre-filtering by
* FTS5 candidates before computing cosine similarity is recommended. For v1,
* in-memory computation is acceptable.
* Uses sqlite-vec vector_top_k() for ANN search instead of in-memory cosine
* similarity computation over all embeddings.
*/
import type Database from 'better-sqlite3';
import { SqliteVecStore } from './sqlite-vec.store.js';
// ---------------------------------------------------------------------------
// Types
@@ -28,12 +24,6 @@ export interface VectorSearchOptions {
limit?: number;
}
/**
 * Shape of a single raw database row obtained from snippet_embeddings
 * joined with snippets.
 */
interface RawEmbeddingRow {
  /** Embedding bytes for the snippet, exactly as read from the row. */
  embedding: Buffer;
  /** Identifier of the snippet this embedding belongs to. */
  snippet_id: string;
}
// ---------------------------------------------------------------------------
// Math helpers
// ---------------------------------------------------------------------------
@@ -69,46 +59,26 @@ export function cosineSimilarity(a: Float32Array, b: Float32Array): number {
// ---------------------------------------------------------------------------
/**
* Vector similarity search over stored snippet embeddings, bound to one
* better-sqlite3 database handle.
*
* NOTE(review): this listing sits under a diff hunk header and appears to
* interleave two revisions of the class: there are two constructor
* declarations and two `return` statements in vectorSearch(). Confirm which
* lines are live in the checked-in file before relying on either path.
*/
export class VectorSearch {
// Constructor form that only captures the database handle as `this.db`.
constructor(private readonly db: Database.Database) {}
// Store that the sqlite-vec query path in vectorSearch() delegates to.
private readonly sqliteVecStore: SqliteVecStore;
// Constructor form that captures the handle and also builds a SqliteVecStore
// on that same handle.
constructor(private readonly db: Database.Database) {
this.sqliteVecStore = new SqliteVecStore(db);
}
/**
* Search stored embeddings by cosine similarity to the query embedding.
*
* Uses in-memory cosine similarity computation. The vec_embedding column
* stores raw Float32 bytes for forward compatibility with vector-capable
* libSQL builds; scoring is performed in JS using the same bytes.
*
* Two paths are visible in this listing:
* - In-memory path: selects snippet_id/embedding rows for the repository and
*   profile (narrowed by version when versionId is truthy), views each blob
*   as a Float32Array, scores it with cosineSimilarity(), sorts by descending
*   score and keeps the first `limit` entries. Defaults on this path are
*   profileId = 'local-default' and limit = 50.
* - sqlite-vec path: forwards queryEmbedding and the caller's `options`
*   object unchanged to SqliteVecStore.queryNearestNeighbors() and copies
*   snippetId and score from each result. Whether the defaults above apply
*   there, and how results are ordered, depends on SqliteVecStore, which is
*   not visible here.
*
* @param queryEmbedding - The embedded representation of the search query.
* @param options - Search options including repositoryId, optional versionId, profileId, and limit.
* @returns Results sorted by descending cosine similarity score (in-memory
*          path); presumably nearest-first on the sqlite-vec path — TODO confirm.
*/
vectorSearch(queryEmbedding: Float32Array, options: VectorSearchOptions): VectorSearchResult[] {
// Apply defaults for the in-memory path.
const { repositoryId, versionId, profileId = 'local-default', limit = 50 } = options;
// Base query: every embedding of the requested profile whose snippet belongs
// to the requested repository. Values are bound via placeholders.
let sql = `
SELECT se.snippet_id, se.embedding
FROM snippet_embeddings se
JOIN snippets s ON s.id = se.snippet_id
WHERE s.repository_id = ?
AND se.profile_id = ?
`;
// Positional parameters, kept in the same order as the `?` placeholders.
const params: unknown[] = [repositoryId, profileId];
// Version filter is optional; a falsy versionId leaves all versions in scope.
if (versionId) {
sql += ' AND s.version_id = ?';
params.push(versionId);
}
// Loads every matching row at once; scoring below happens in JS.
const rows = this.db.prepare<unknown[], RawEmbeddingRow>(sql).all(...params);
const scored: VectorSearchResult[] = rows.map((row) => {
// View the blob's bytes as 32-bit floats without copying: byteOffset locates
// the blob inside its backing ArrayBuffer and byteLength / 4 is the element
// count (4 bytes per float32).
const embedding = new Float32Array(
row.embedding.buffer,
row.embedding.byteOffset,
row.embedding.byteLength / 4
);
return {
snippetId: row.snippet_id,
score: cosineSimilarity(queryEmbedding, embedding)
};
});
// Highest score first, then truncate to the requested number of results.
return scored.sort((a, b) => b.score - a.score).slice(0, limit);
// sqlite-vec path: delegate the nearest-neighbour lookup to the store and
// keep only snippetId and score from each result.
// NOTE(review): as listed, this statement follows an unconditional return.
return this.sqliteVecStore
.queryNearestNeighbors(queryEmbedding, options)
.map((result) => ({ snippetId: result.snippetId, score: result.score }));
}
}