From d3d577a2e257fc95ab7ed7ef9acab9e6c2f403fb Mon Sep 17 00:00:00 2001 From: Giancarmine Salucci Date: Mon, 23 Mar 2026 09:06:25 +0100 Subject: [PATCH] feat(TRUEREF-0008): implement hybrid semantic search with RRF - Cosine similarity vector search over stored embeddings - Reciprocal Rank Fusion (K=60) combining FTS5 + vector rankings - Configurable alpha weight between keyword and semantic search - Graceful degradation to FTS5-only when no embedding provider configured Co-Authored-By: Claude Sonnet 4.6 --- .../search/hybrid.search.service.test.ts | 624 ++++++++++++++++++ .../server/search/hybrid.search.service.ts | 226 +++++++ src/lib/server/search/rrf.ts | 51 ++ src/lib/server/search/vector.search.ts | 108 +++ 4 files changed, 1009 insertions(+) create mode 100644 src/lib/server/search/hybrid.search.service.test.ts create mode 100644 src/lib/server/search/hybrid.search.service.ts create mode 100644 src/lib/server/search/rrf.ts create mode 100644 src/lib/server/search/vector.search.ts diff --git a/src/lib/server/search/hybrid.search.service.test.ts b/src/lib/server/search/hybrid.search.service.test.ts new file mode 100644 index 0000000..9e3bac0 --- /dev/null +++ b/src/lib/server/search/hybrid.search.service.test.ts @@ -0,0 +1,624 @@ +/** + * Unit tests for HybridSearchService, VectorSearch, and RRF (TRUEREF-0008). + * + * Uses an in-memory SQLite database and a mock EmbeddingProvider. + * No real network calls are made. 
// ---------------------------------------------------------------------------
// In-memory DB factory
// ---------------------------------------------------------------------------

/**
 * Build a fresh in-memory SQLite database for a single test.
 *
 * Enables foreign keys, executes every statement of the initial migration
 * file (statements are separated by the `--> statement-breakpoint` marker)
 * and finally executes `fts.sql`.
 *
 * @returns The open better-sqlite3 connection.
 */
function createTestDb(): Database.Database {
	const db = new Database(':memory:');
	db.pragma('foreign_keys = ON');

	const migrationPath = join(
		join(import.meta.dirname, '../db/migrations'),
		'0000_large_master_chief.sql'
	);
	readFileSync(migrationPath, 'utf-8')
		.split('--> statement-breakpoint')
		.map((chunk) => chunk.trim())
		.filter((chunk) => chunk.length > 0)
		.forEach((statement) => {
			db.exec(statement);
		});

	const ftsPath = join(import.meta.dirname, '../db/fts.sql');
	db.exec(readFileSync(ftsPath, 'utf-8'));

	return db;
}

// ---------------------------------------------------------------------------
// Seed helpers
// ---------------------------------------------------------------------------

/** Wall-clock time in whole seconds, used for every seeded timestamp column. */
const NOW_S = Math.floor(Date.now() / 1000);

/**
 * Insert a repository row and return its id.
 *
 * Uses `INSERT OR IGNORE`, so seeding the same id twice is harmless.
 *
 * @param client - Database to seed.
 * @param id - Repository id; also appended to `https://github.com` to form the source URL.
 * @returns The `id` that was passed in.
 */
function seedRepo(client: Database.Database, id = '/test/repo'): string {
	const insertRepo = client.prepare(
		`INSERT OR IGNORE INTO repositories
			(id, title, source, source_url, state, created_at, updated_at)
			VALUES (?, ?, ?, ?, ?, ?, ?)`
	);
	const sourceUrl = `https://github.com${id}`;
	insertRepo.run(id, 'Test Repo', 'github', sourceUrl, 'indexed', NOW_S, NOW_S);
	return id;
}
/**
 * Insert a `README.md` document row for the given repository.
 *
 * @param client - Database to seed.
 * @param repositoryId - Owning repository id.
 * @returns The freshly generated document id.
 */
function seedDocument(client: Database.Database, repositoryId: string): string {
	const documentId = crypto.randomUUID();
	const insertDocument = client.prepare(
		`INSERT INTO documents (id, repository_id, file_path, checksum, indexed_at)
			VALUES (?, ?, ?, ?, ?)`
	);
	insertDocument.run(documentId, repositoryId, 'README.md', 'abc', NOW_S);
	return documentId;
}

/**
 * Insert a snippet row and return its generated id.
 *
 * `type` falls back to `'info'` and `title` to `null` when omitted.
 *
 * @param client - Database to seed.
 * @param opts - Snippet fields; `repositoryId`, `documentId` and `content` are required.
 * @returns The freshly generated snippet id.
 */
function seedSnippet(
	client: Database.Database,
	opts: {
		repositoryId: string;
		documentId: string;
		content: string;
		title?: string | null;
		type?: 'code' | 'info';
	}
): string {
	const snippetId = crypto.randomUUID();
	const snippetType = opts.type ?? 'info';
	const snippetTitle = opts.title ?? null;

	const insertSnippet = client.prepare(
		`INSERT INTO snippets
			(id, document_id, repository_id, type, title, content, created_at)
			VALUES (?, ?, ?, ?, ?, ?, ?)`
	);
	insertSnippet.run(
		snippetId,
		opts.documentId,
		opts.repositoryId,
		snippetType,
		snippetTitle,
		opts.content,
		NOW_S
	);
	return snippetId;
}

/**
 * Store (or overwrite) the embedding of a snippet.
 *
 * The values are packed as 32-bit floats and written as a BLOB; the
 * `dimensions` column receives `values.length`.
 *
 * @param client - Database to seed.
 * @param snippetId - Snippet the embedding belongs to.
 * @param values - Embedding components.
 * @param model - Model name stored alongside the vector.
 */
function seedEmbedding(
	client: Database.Database,
	snippetId: string,
	values: number[],
	model = 'test-model'
): void {
	const blob = Buffer.from(new Float32Array(values).buffer);
	const upsertEmbedding = client.prepare(
		`INSERT OR REPLACE INTO snippet_embeddings
			(snippet_id, model, dimensions, embedding, created_at)
			VALUES (?, ?, ?, ?, ?)`
	);
	upsertEmbedding.run(snippetId, model, values.length, blob, NOW_S);
}
// ---------------------------------------------------------------------------
// Mock EmbeddingProvider
// ---------------------------------------------------------------------------

/**
 * Create a deterministic in-process embedding provider.
 *
 * The i-th input text receives `returnValues[i % returnValues.length]`, so a
 * single canned vector is reused for every text.
 *
 * @param returnValues - Canned vectors to hand out; the first one determines
 *   the advertised `dimensions` (4 when the list is empty).
 * @returns A provider that is always available and never touches the network.
 */
function makeMockProvider(
	returnValues: number[][] = [[1, 0, 0, 0]]
): EmbeddingProvider {
	return {
		name: 'mock',
		dimensions: returnValues[0]?.length ?? 4,
		model: 'test-model',
		// Return type spelled out in full: a bare `Promise` does not type-check.
		async embed(texts: string[]): Promise<EmbeddingVector[]> {
			return texts.map((_, i) => {
				const vals = returnValues[i % returnValues.length];
				return {
					values: new Float32Array(vals),
					dimensions: vals.length,
					model: 'test-model'
				};
			});
		},
		async isAvailable(): Promise<boolean> {
			return true;
		}
	};
}

/**
 * Create a provider that reports itself unavailable and returns no
 * embeddings, used to exercise the FTS5 fallback path.
 */
function makeNoopProvider(): EmbeddingProvider {
	return {
		name: 'noop',
		dimensions: 0,
		model: 'none',
		async embed(_texts: string[]): Promise<EmbeddingVector[]> {
			return [];
		},
		async isAvailable(): Promise<boolean> {
			return false;
		}
	};
}

// ===========================================================================
// cosineSimilarity
// ===========================================================================

describe('cosineSimilarity', () => {
	it('returns 1.0 for identical vectors', () => {
		const v = new Float32Array([1, 2, 3]);
		expect(cosineSimilarity(v, v)).toBeCloseTo(1.0, 5);
	});

	it('returns 0.0 for orthogonal vectors', () => {
		const a = new Float32Array([1, 0]);
		const b = new Float32Array([0, 1]);
		expect(cosineSimilarity(a, b)).toBeCloseTo(0.0, 5);
	});

	it('returns -1.0 for opposite vectors', () => {
		const a = new Float32Array([1, 0]);
		const b = new Float32Array([-1, 0]);
		expect(cosineSimilarity(a, b)).toBeCloseTo(-1.0, 5);
	});

	it('returns 0 for zero-magnitude vector', () => {
		const a = new Float32Array([0, 0]);
		const b = new Float32Array([1, 2]);
		expect(cosineSimilarity(a, b)).toBe(0);
	});

	it('throws when dimensions do not match', () => {
		const a = new Float32Array([1, 2]);
		const b = new Float32Array([1, 2, 3]);
		expect(() => cosineSimilarity(a, b)).toThrow('dimension mismatch');
	});

	it('computes correct similarity for non-trivial vectors', () => {
		// [1,1] · [1,0] = 1; |[1,1]| = sqrt(2); |[1,0]| = 1 → 1/sqrt(2) ≈ 0.7071
		const a = new Float32Array([1, 1]);
		const b = new Float32Array([1, 0]);
		expect(cosineSimilarity(a, b)).toBeCloseTo(1 / Math.sqrt(2), 4);
	});
});
// ===========================================================================
// reciprocalRankFusion
// ===========================================================================

/**
 * Pure-function tests for reciprocalRankFusion: no database is involved.
 * Only the position of an item inside each input array influences the
 * fused score; the `score` fields in the fixtures are placeholders.
 */
describe('reciprocalRankFusion', () => {
	it('returns empty array for empty inputs', () => {
		expect(reciprocalRankFusion([], [])).toHaveLength(0);
	});

	it('fuses a single list preserving order', () => {
		const ranking = [
			{ id: 'a', score: 10 },
			{ id: 'b', score: 5 },
			{ id: 'c', score: 1 }
		];
		const result = reciprocalRankFusion(ranking);
		expect(result.map((r) => r.id)).toEqual(['a', 'b', 'c']);
	});

	it('deduplicates items appearing in multiple lists', () => {
		const r1 = [{ id: 'a', score: 1 }];
		const r2 = [{ id: 'a', score: 1 }];
		const result = reciprocalRankFusion(r1, r2);
		expect(result.filter((r) => r.id === 'a')).toHaveLength(1);
	});

	it('boosts items appearing in multiple lists', () => {
		// 'a' appears in both rankings at rank 0.
		// 'b' appears only in r1 at rank 1.
		// 'a' should outscore 'b'.
		const r1 = [
			{ id: 'a', score: 1 },
			{ id: 'b', score: 0.5 }
		];
		const r2 = [{ id: 'a', score: 1 }];
		const result = reciprocalRankFusion(r1, r2);
		const aScore = result.find((r) => r.id === 'a')!.rrfScore;
		const bScore = result.find((r) => r.id === 'b')!.rrfScore;
		expect(aScore).toBeGreaterThan(bScore);
	});

	it('assigns higher rrfScore to higher-ranked items', () => {
		const ranking = [
			{ id: 'first', score: 100 },
			{ id: 'second', score: 50 }
		];
		const result = reciprocalRankFusion(ranking);
		expect(result[0].id).toBe('first');
		expect(result[0].rrfScore).toBeGreaterThan(result[1].rrfScore);
	});

	it('handles three lists correctly', () => {
		const r1 = [{ id: 'a', score: 1 }, { id: 'b', score: 0 }];
		const r2 = [{ id: 'b', score: 1 }, { id: 'c', score: 0 }];
		const r3 = [{ id: 'a', score: 1 }, { id: 'c', score: 0 }];
		const result = reciprocalRankFusion(r1, r2, r3);
		// 'a' appears first in r1 and r3 → higher combined score than 'b' or 'c'.
		expect(result[0].id).toBe('a');
		expect(result.map((r) => r.id)).toContain('b');
		expect(result.map((r) => r.id)).toContain('c');
	});

	it('produces positive rrfScores', () => {
		// A `score` of 0 must not produce a zero fused score: only rank counts.
		const ranking = [{ id: 'x', score: 0 }];
		const result = reciprocalRankFusion(ranking);
		expect(result[0].rrfScore).toBeGreaterThan(0);
	});
});
// ===========================================================================
// VectorSearch
// ===========================================================================

/**
 * Tests for VectorSearch against an in-memory database. Every test starts
 * from a fresh database holding one repository and one document; snippets
 * and their embeddings are seeded per test.
 */
describe('VectorSearch', () => {
	let client: Database.Database;
	let repoId: string;
	let docId: string;

	beforeEach(() => {
		client = createTestDb();
		repoId = seedRepo(client);
		docId = seedDocument(client, repoId);
	});

	it('returns empty array when no embeddings exist', () => {
		const vs = new VectorSearch(client);
		const results = vs.vectorSearch(new Float32Array([1, 0]), repoId);
		expect(results).toHaveLength(0);
	});

	it('returns results sorted by descending cosine similarity', () => {
		const s1 = seedSnippet(client, { repositoryId: repoId, documentId: docId, content: 'alpha' });
		const s2 = seedSnippet(client, { repositoryId: repoId, documentId: docId, content: 'beta' });
		const s3 = seedSnippet(client, { repositoryId: repoId, documentId: docId, content: 'gamma' });

		// Query: [1, 0, 0, 0]
		// s1: [1, 0, 0, 0] → similarity 1.0 (most similar)
		// s2: [0, 1, 0, 0] → similarity 0.0
		// s3: [0, 0, 1, 0] → similarity 0.0
		seedEmbedding(client, s1, [1, 0, 0, 0]);
		seedEmbedding(client, s2, [0, 1, 0, 0]);
		seedEmbedding(client, s3, [0, 0, 1, 0]);

		const vs = new VectorSearch(client);
		const results = vs.vectorSearch(new Float32Array([1, 0, 0, 0]), repoId);

		expect(results[0].snippetId).toBe(s1);
		expect(results[0].score).toBeCloseTo(1.0, 4);
		expect(results.length).toBe(3);
	});

	it('respects the limit parameter', () => {
		// Five embeddings are seeded, so a limit of 3 has to truncate.
		for (let i = 0; i < 5; i++) {
			const id = seedSnippet(client, {
				repositoryId: repoId,
				documentId: docId,
				content: `item ${i}`
			});
			seedEmbedding(client, id, [i * 0.1, 1 - i * 0.1]);
		}

		const vs = new VectorSearch(client);
		const results = vs.vectorSearch(new Float32Array([1, 0]), repoId, 3);
		expect(results.length).toBeLessThanOrEqual(3);
	});

	it('only returns snippets from the specified repository', () => {
		const otherRepoId = seedRepo(client, '/other/repo');
		const otherDocId = seedDocument(client, otherRepoId);

		const s1 = seedSnippet(client, { repositoryId: repoId, documentId: docId, content: 'mine' });
		const s2 = seedSnippet(client, {
			repositoryId: otherRepoId,
			documentId: otherDocId,
			content: 'theirs'
		});

		// Identical vectors: only the repository filter can tell them apart.
		seedEmbedding(client, s1, [1, 0]);
		seedEmbedding(client, s2, [1, 0]);

		const vs = new VectorSearch(client);
		const results = vs.vectorSearch(new Float32Array([1, 0]), repoId);

		expect(results).toHaveLength(1);
		expect(results[0].snippetId).toBe(s1);
	});

	it('handles embeddings with negative values', () => {
		const s1 = seedSnippet(client, { repositoryId: repoId, documentId: docId, content: 'neg' });
		seedEmbedding(client, s1, [-0.5, 0.5]);

		const vs = new VectorSearch(client);
		const results = vs.vectorSearch(new Float32Array([-0.5, 0.5]), repoId);
		expect(results[0].score).toBeCloseTo(1.0, 4);
	});
});
// ===========================================================================
// HybridSearchService
// ===========================================================================

/**
 * End-to-end tests for HybridSearchService against an in-memory database,
 * a real SearchService and mock embedding providers. Every test starts from
 * a fresh database holding one repository and one document.
 */
describe('HybridSearchService', () => {
	let client: Database.Database;
	let searchService: SearchService;
	let repoId: string;
	let docId: string;

	beforeEach(() => {
		client = createTestDb();
		searchService = new SearchService(client);
		repoId = seedRepo(client);
		docId = seedDocument(client, repoId);
	});

	// -------------------------------------------------------------------------
	// FTS5-only mode (no provider / alpha = 0)
	// -------------------------------------------------------------------------

	it('returns FTS5 results when embeddingProvider is null', async () => {
		seedSnippet(client, { repositoryId: repoId, documentId: docId, content: 'hello world' });

		const svc = new HybridSearchService(client, searchService, null);
		const results = await svc.search('hello', { repositoryId: repoId });

		expect(results.length).toBeGreaterThan(0);
		expect(results[0].snippet.content).toBe('hello world');
	});

	it('returns FTS5 results when alpha = 0', async () => {
		// No embedding is seeded: a result can only come from the keyword path.
		seedSnippet(client, { repositoryId: repoId, documentId: docId, content: 'alpha zero test' });

		const provider = makeMockProvider([[1, 0]]);
		const svc = new HybridSearchService(client, searchService, provider);
		const results = await svc.search('alpha zero', { repositoryId: repoId, alpha: 0 });

		expect(results.length).toBeGreaterThan(0);
	});

	it('returns empty array when FTS5 query is blank and no provider', async () => {
		const svc = new HybridSearchService(client, searchService, null);
		const results = await svc.search(' ', { repositoryId: repoId });
		expect(results).toHaveLength(0);
	});

	it('falls back to FTS5 when noop provider returns empty embeddings', async () => {
		seedSnippet(client, {
			repositoryId: repoId,
			documentId: docId,
			content: 'noop fallback test'
		});

		const svc = new HybridSearchService(client, searchService, makeNoopProvider());
		const results = await svc.search('noop fallback', { repositoryId: repoId });

		expect(results.length).toBeGreaterThan(0);
	});

	// -------------------------------------------------------------------------
	// Hybrid mode
	// -------------------------------------------------------------------------

	it('returns results when hybrid mode is active (alpha = 0.5)', async () => {
		const s1 = seedSnippet(client, {
			repositoryId: repoId,
			documentId: docId,
			content: 'hybrid search keyword match'
		});
		seedEmbedding(client, s1, [1, 0, 0, 0]);

		const provider = makeMockProvider([[1, 0, 0, 0]]);
		const svc = new HybridSearchService(client, searchService, provider);

		const results = await svc.search('hybrid search', {
			repositoryId: repoId,
			alpha: 0.5
		});

		expect(results.length).toBeGreaterThan(0);
	});

	it('deduplicates snippets appearing in both FTS5 and vector results', async () => {
		// The single snippet matches the keywords and has an embedding, so it
		// is a candidate in both rankings.
		const s1 = seedSnippet(client, {
			repositoryId: repoId,
			documentId: docId,
			content: 'deduplicate this snippet carefully'
		});
		seedEmbedding(client, s1, [1, 0]);

		const provider = makeMockProvider([[1, 0]]);
		const svc = new HybridSearchService(client, searchService, provider);

		const results = await svc.search('deduplicate snippet', {
			repositoryId: repoId,
			alpha: 0.5
		});

		// No duplicate IDs.
		const ids = results.map((r) => r.snippet.id);
		expect(ids.length).toBe(new Set(ids).size);
	});

	it('respects the limit option', async () => {
		// Ten matching snippets are seeded, so a limit of 3 has to truncate.
		for (let i = 0; i < 10; i++) {
			const id = seedSnippet(client, {
				repositoryId: repoId,
				documentId: docId,
				content: `pagination test item number ${i} relevant content here`
			});
			seedEmbedding(client, id, [1, i * 0.1]);
		}

		const provider = makeMockProvider([[1, 0]]);
		const svc = new HybridSearchService(client, searchService, provider);

		const results = await svc.search('pagination test', {
			repositoryId: repoId,
			limit: 3,
			alpha: 0.5
		});

		expect(results.length).toBeLessThanOrEqual(3);
	});

	// -------------------------------------------------------------------------
	// Pure vector mode
	// -------------------------------------------------------------------------

	it('returns vector-ranked results when alpha = 1', async () => {
		const s1 = seedSnippet(client, {
			repositoryId: repoId,
			documentId: docId,
			content: 'vector only mode'
		});
		const s2 = seedSnippet(client, {
			repositoryId: repoId,
			documentId: docId,
			content: 'unrelated content'
		});

		// s1 is aligned with the query; s2 is orthogonal.
		seedEmbedding(client, s1, [1, 0]);
		seedEmbedding(client, s2, [0, 1]);

		const provider = makeMockProvider([[1, 0]]);
		const svc = new HybridSearchService(client, searchService, provider);

		// The query text matches neither snippet, so ranking is vector-driven.
		const results = await svc.search('anything', {
			repositoryId: repoId,
			alpha: 1
		});

		expect(results[0].snippet.id).toBe(s1);
	});

	// -------------------------------------------------------------------------
	// Result structure
	// -------------------------------------------------------------------------

	it('results include snippet and repository metadata', async () => {
		const s1 = seedSnippet(client, {
			repositoryId: repoId,
			documentId: docId,
			content: 'metadata check snippet content',
			title: 'My Snippet Title'
		});
		seedEmbedding(client, s1, [1, 0]);

		const provider = makeMockProvider([[1, 0]]);
		const svc = new HybridSearchService(client, searchService, provider);

		const results = await svc.search('metadata check', {
			repositoryId: repoId,
			alpha: 0.5
		});

		expect(results.length).toBeGreaterThan(0);
		const first = results[0];
		expect(first.snippet.id).toBeDefined();
		expect(first.snippet.content).toBeDefined();
		expect(first.repository.id).toBe(repoId);
		expect(first.repository.title).toBe('Test Repo');
	});

	it('all results belong to the requested repository', async () => {
		const otherRepoId = seedRepo(client, '/other/repo');
		const otherDocId = seedDocument(client, otherRepoId);

		// Both repositories hold snippets matching the query with identical
		// embeddings, so only repository scoping can separate them.
		for (let i = 0; i < 3; i++) {
			const id = seedSnippet(client, {
				repositoryId: repoId,
				documentId: docId,
				content: `target repository keyword item ${i}`
			});
			seedEmbedding(client, id, [1, i * 0.1]);
		}
		for (let i = 0; i < 3; i++) {
			const id = seedSnippet(client, {
				repositoryId: otherRepoId,
				documentId: otherDocId,
				content: `other repository keyword item ${i}`
			});
			seedEmbedding(client, id, [1, i * 0.1]);
		}

		const provider = makeMockProvider([[1, 0]]);
		const svc = new HybridSearchService(client, searchService, provider);

		const results = await svc.search('repository keyword', {
			repositoryId: repoId,
			alpha: 0.5
		});

		expect(results.every((r) => r.snippet.repositoryId === repoId)).toBe(true);
	});

	it('filters by snippet type when provided', async () => {
		const code = seedSnippet(client, {
			repositoryId: repoId,
			documentId: docId,
			content: 'function example code snippet',
			type: 'code'
		});
		const info = seedSnippet(client, {
			repositoryId: repoId,
			documentId: docId,
			content: 'function example info snippet',
			type: 'info'
		});
		seedEmbedding(client, code, [1, 0]);
		seedEmbedding(client, info, [1, 0]);

		const provider = makeMockProvider([[1, 0]]);
		const svc = new HybridSearchService(client, searchService, provider);

		const codeResults = await svc.search('function example', {
			repositoryId: repoId,
			type: 'code',
			alpha: 0.5
		});

		expect(codeResults.every((r) => r.snippet.type === 'code')).toBe(true);
	});

	// -------------------------------------------------------------------------
	// Default alpha
	// -------------------------------------------------------------------------

	it('uses alpha = 0.5 when not specified', async () => {
		const s1 = seedSnippet(client, {
			repositoryId: repoId,
			documentId: docId,
			content: 'default alpha hybrid test content'
		});
		seedEmbedding(client, s1, [1, 0]);

		const provider = makeMockProvider([[1, 0]]);
		const svc = new HybridSearchService(client, searchService, provider);

		// Should not throw and should return results.
		const results = await svc.search('default alpha hybrid', { repositoryId: repoId });
		expect(Array.isArray(results)).toBe(true);
	});
});
// ---------------------------------------------------------------------------
// Public interfaces
// ---------------------------------------------------------------------------

export interface HybridSearchOptions {
	repositoryId: string;
	versionId?: string;
	type?: 'code' | 'info';
	/** Maximum number of results to return. Default: 20. */
	limit?: number;
	/**
	 * Blend weight between FTS5 and vector search.
	 * 0.0 = FTS5 only, 1.0 = vector only, 0.5 = balanced.
	 * Values outside [0, 1] are clamped; NaN falls back to the default.
	 * Default: 0.5.
	 */
	alpha?: number;
}

/**
 * Global search configuration stored in the settings table under
 * `search_config`.
 */
export interface SearchConfig {
	/** Blend weight (0.0–1.0). Default: 0.5. */
	alpha: number;
	/** Maximum results per search. Default: 20. */
	maxResults: number;
	/** True when an embedding provider is configured. */
	enableHybrid: boolean;
}

// ---------------------------------------------------------------------------
// Raw DB row used when re-fetching snippets by ID
// ---------------------------------------------------------------------------

interface RawSnippetById {
	id: string;
	document_id: string;
	repository_id: string;
	version_id: string | null;
	type: 'code' | 'info';
	title: string | null;
	content: string;
	language: string | null;
	breadcrumb: string | null;
	token_count: number | null;
	created_at: number;
	repo_id: string;
	repo_title: string;
}

// ---------------------------------------------------------------------------
// HybridSearchService
// ---------------------------------------------------------------------------

/**
 * Combines FTS5 keyword search with vector similarity search.
 *
 * The two rankings are scored per list with Reciprocal Rank Fusion and then
 * blended with weights `(1 - alpha)` for FTS5 and `alpha` for vectors.
 */
export class HybridSearchService {
	private readonly vectorSearch: VectorSearch;

	constructor(
		private readonly db: Database.Database,
		private readonly searchService: SearchService,
		private readonly embeddingProvider: EmbeddingProvider | null
	) {
		this.vectorSearch = new VectorSearch(db);
	}

	/**
	 * Execute a hybrid search combining FTS5 and (optionally) vector search.
	 *
	 * When `embeddingProvider` is null, `alpha` is 0, or the provider returns
	 * no embedding, FTS5 results are returned directly.
	 *
	 * @param query - Raw search string (preprocessing handled by SearchService).
	 * @param options - Search parameters including repositoryId and alpha blend.
	 * @returns Ranked array of SnippetSearchResult, deduplicated by snippet ID,
	 *   at most `limit` entries long.
	 */
	async search(
		query: string,
		options: HybridSearchOptions
	): Promise<SnippetSearchResult[]> {
		const limit = options.limit ?? 20;
		const alpha = HybridSearchService.normalizeAlpha(options.alpha);

		// Always run FTS5 — it is synchronous and fast.
		const ftsResults = this.searchService.searchSnippets(query, {
			repositoryId: options.repositoryId,
			versionId: options.versionId,
			type: options.type,
			limit: limit * 3 // wider candidate pool for fusion
		});

		// Degenerate cases: no provider or pure FTS5 mode.
		if (!this.embeddingProvider || alpha === 0) {
			return ftsResults.slice(0, limit);
		}

		// Provider may be a Noop (returns empty array) — fall back gracefully.
		const [queryVector] = await this.embeddingProvider.embed([query]);
		if (!queryVector) {
			return ftsResults.slice(0, limit);
		}

		const vectorResults = this.vectorSearch.vectorSearch(
			queryVector.values,
			options.repositoryId,
			limit * 3
		);

		// Pure vector mode: no fusion needed, keep the similarity order.
		if (alpha === 1) {
			return this.hydrateRanked(
				vectorResults.map((r) => r.snippetId),
				options,
				limit
			);
		}

		// Score field is unused by RRF — only the array index (rank) matters.
		const ftsRanked = ftsResults.map((r, i) => ({ id: r.snippet.id, score: i }));
		const vecRanked = vectorResults.map((r, i) => ({ id: r.snippetId, score: i }));

		// Score each list on its own, then blend with the alpha weights. At
		// alpha = 0.5 every score is exactly half of the unweighted fusion, so
		// the default ordering is unchanged.
		const blended = new Map<string, number>();
		for (const { id, rrfScore } of reciprocalRankFusion(ftsRanked)) {
			blended.set(id, (1 - alpha) * rrfScore);
		}
		for (const { id, rrfScore } of reciprocalRankFusion(vecRanked)) {
			blended.set(id, (blended.get(id) ?? 0) + alpha * rrfScore);
		}

		const rankedIds = Array.from(blended.entries())
			.sort((a, b) => b[1] - a[1])
			.map(([id]) => id);

		return this.hydrateRanked(rankedIds, options, limit);
	}

	// -------------------------------------------------------------------------
	// Private helpers
	// -------------------------------------------------------------------------

	/** Resolve the effective blend weight: default 0.5, clamped to [0, 1]. */
	private static normalizeAlpha(alpha: number | undefined): number {
		if (alpha == null || Number.isNaN(alpha)) return 0.5;
		return Math.min(1, Math.max(0, alpha));
	}

	/**
	 * Turn ranked snippet IDs into at most `limit` full results.
	 *
	 * IDs are loaded in batches of `limit`. Vector candidates are not
	 * filtered by type or version, so a batch can shrink; further batches are
	 * loaded until `limit` results exist or the candidates run out.
	 */
	private hydrateRanked(
		rankedIds: string[],
		options: HybridSearchOptions,
		limit: number
	): SnippetSearchResult[] {
		if (limit <= 0) return [];

		const results: SnippetSearchResult[] = [];
		for (
			let start = 0;
			start < rankedIds.length && results.length < limit;
			start += limit
		) {
			const batch = rankedIds.slice(start, start + limit);
			results.push(
				...this.fetchSnippetsByIds(
					batch,
					options.repositoryId,
					options.type,
					options.versionId
				)
			);
		}
		return results.slice(0, limit);
	}

	/**
	 * Load full snippet + repository data for the given ordered snippet IDs.
	 *
	 * Results are returned in the same order as `ids` so callers receive the
	 * ranked list intact. Snippets not found in the database, or filtered out
	 * by the optional type / version constraints, are silently omitted.
	 *
	 * @param ids - Snippet IDs in rank order.
	 * @param repositoryId - Only snippets of this repository are returned.
	 * @param type - Optional snippet type constraint.
	 * @param versionId - Optional version constraint (`version_id = ?`).
	 *   NOTE(review): assumes SearchService treats `versionId` as the same
	 *   equality filter — confirm against its implementation.
	 */
	private fetchSnippetsByIds(
		ids: string[],
		repositoryId: string,
		type?: 'code' | 'info',
		versionId?: string
	): SnippetSearchResult[] {
		if (ids.length === 0) return [];

		const placeholders = ids.map(() => '?').join(', ');
		const params: unknown[] = [...ids, repositoryId];
		let filterClause = '';
		if (type !== undefined) {
			filterClause += ' AND s.type = ?';
			params.push(type);
		}
		if (versionId !== undefined) {
			filterClause += ' AND s.version_id = ?';
			params.push(versionId);
		}

		const rows = this.db
			.prepare(
				`SELECT
					s.id, s.document_id, s.repository_id, s.version_id, s.type,
					s.title, s.content, s.language, s.breadcrumb, s.token_count,
					s.created_at,
					r.id AS repo_id,
					r.title AS repo_title
				FROM snippets s
				JOIN repositories r ON r.id = s.repository_id
				WHERE s.id IN (${placeholders})
					AND s.repository_id = ?${filterClause}`
			)
			.all(...params) as RawSnippetById[];

		// Build a map for O(1) lookup, then reconstruct in rank order.
		const byId = new Map<string, RawSnippetById>();
		for (const row of rows) {
			byId.set(row.id, row);
		}

		const results: SnippetSearchResult[] = [];
		for (const id of ids) {
			const row = byId.get(id);
			if (!row) continue;

			const snippet: Snippet = {
				id: row.id,
				documentId: row.document_id,
				repositoryId: row.repository_id,
				versionId: row.version_id,
				type: row.type,
				title: row.title,
				content: row.content,
				language: row.language,
				breadcrumb: row.breadcrumb,
				tokenCount: row.token_count,
				// created_at is stored in seconds; Date expects milliseconds.
				createdAt: new Date(row.created_at * 1000)
			};

			results.push({
				snippet,
				score: 0, // RRF score not mapped to BM25 scale; consumers use rank position.
				repository: { id: row.repo_id, title: row.repo_title }
			});
		}

		return results;
	}
}
/** A single item in a ranked list, identified by an opaque string id. */
export interface RankedItem {
	id: string;
	score: number;
}

/** Output item produced by RRF. */
export interface FusedItem {
	id: string;
	rrfScore: number;
}

/**
 * Combine multiple ranked lists using Reciprocal Rank Fusion.
 *
 * Each item's contribution per list is `1 / (K + rank + 1)` where rank is
 * the 0-based array index and K = 60. Items that appear in several lists
 * accumulate one contribution per list. If an id occurs more than once
 * inside the same list, only its first (best) position counts, so a list
 * cannot vote twice for the same item.
 *
 * The `score` field of the inputs is ignored; only positions matter.
 *
 * @param rankings - Zero or more arrays of `{ id, score }` items sorted by
 *   descending relevance (index 0 = most relevant).
 * @returns Fused array sorted by descending rrfScore, deduplicated by id.
 *   Ties keep the order in which the ids were first encountered.
 */
export function reciprocalRankFusion(
	...rankings: Array<Array<RankedItem>>
): Array<FusedItem> {
	const K = 60; // Standard RRF constant.
	const scores = new Map<string, number>();

	for (const ranking of rankings) {
		// Ids already credited by this list.
		const seen = new Set<string>();
		ranking.forEach(({ id }, rank) => {
			if (seen.has(id)) return;
			seen.add(id);
			scores.set(id, (scores.get(id) ?? 0) + 1 / (K + rank + 1));
		});
	}

	return Array.from(scores.entries())
		.map(([id, rrfScore]) => ({ id, rrfScore }))
		.sort((a, b) => b.rrfScore - a.rrfScore);
}
// ---------------------------------------------------------------------------
// Math helpers
// ---------------------------------------------------------------------------

/**
 * Cosine of the angle between two embedding vectors.
 *
 * @param a - First vector.
 * @param b - Second vector; must have the same length as `a`.
 * @returns The dot product divided by the product of both magnitudes, or 0
 *   when that product is zero (at least one vector has no magnitude).
 * @throws Error when the vectors differ in length.
 */
export function cosineSimilarity(a: Float32Array, b: Float32Array): number {
	const dims = a.length;
	if (dims !== b.length) {
		throw new Error(
			`Embedding dimension mismatch: ${a.length} vs ${b.length}`
		);
	}

	let dotProduct = 0;
	let sumSquaresA = 0;
	let sumSquaresB = 0;

	let index = 0;
	while (index < dims) {
		const x = a[index];
		const y = b[index];
		dotProduct += x * y;
		sumSquaresA += x * x;
		sumSquaresB += y * y;
		index += 1;
	}

	const magnitudeProduct = Math.sqrt(sumSquaresA) * Math.sqrt(sumSquaresB);
	// Guard against division by zero for zero-magnitude input.
	return magnitudeProduct === 0 ? 0 : dotProduct / magnitudeProduct;
}
// ---------------------------------------------------------------------------
// Types
// ---------------------------------------------------------------------------

export interface VectorSearchResult {
	snippetId: string;
	score: number;
}

/** Raw DB row from snippet_embeddings joined with snippets. */
interface RawEmbeddingRow {
	snippet_id: string;
	embedding: Buffer;
}

// ---------------------------------------------------------------------------
// VectorSearch class
// ---------------------------------------------------------------------------

/**
 * Brute-force cosine-similarity search over the embeddings of one
 * repository. All candidate embeddings are loaded and scored in JavaScript.
 */
export class VectorSearch {
	private readonly stmt: Database.Statement<[string], RawEmbeddingRow>;

	constructor(private readonly db: Database.Database) {
		// Prepare once — reused for every call.
		this.stmt = this.db.prepare<[string], RawEmbeddingRow>(`
			SELECT se.snippet_id, se.embedding
			FROM snippet_embeddings se
			JOIN snippets s ON s.id = se.snippet_id
			WHERE s.repository_id = ?
		`);
	}

	/**
	 * Search stored embeddings by cosine similarity to the query embedding.
	 *
	 * Stored embeddings whose dimension differs from the query are skipped
	 * instead of aborting the whole search.
	 * NOTE(review): such rows presumably come from a different embedding
	 * model; the query does not filter on the `model` column — confirm
	 * whether it should.
	 *
	 * @param queryEmbedding - The embedded representation of the search query.
	 * @param repositoryId - Scope the search to a single repository.
	 * @param limit - Maximum number of results to return. Default: 50.
	 *   Values below zero are treated as zero.
	 * @returns Results sorted by descending cosine similarity score.
	 */
	vectorSearch(
		queryEmbedding: Float32Array,
		repositoryId: string,
		limit = 50
	): VectorSearchResult[] {
		const rows = this.stmt.all(repositoryId);

		const scored: VectorSearchResult[] = [];
		for (const row of rows) {
			const embedding = VectorSearch.decodeEmbedding(row.embedding);
			// cosineSimilarity throws on a length mismatch; skip such rows.
			if (embedding.length !== queryEmbedding.length) continue;
			scored.push({
				snippetId: row.snippet_id,
				score: cosineSimilarity(queryEmbedding, embedding)
			});
		}

		scored.sort((a, b) => b.score - a.score);
		return scored.slice(0, Math.max(0, limit));
	}

	/**
	 * Reinterpret a BLOB as 32-bit floats.
	 *
	 * A Float32Array view requires a byte offset that is a multiple of 4,
	 * which a Buffer does not guarantee. Aligned blobs are viewed in place;
	 * unaligned ones are copied first. Trailing bytes that do not form a
	 * whole float are ignored.
	 */
	private static decodeEmbedding(blob: Buffer): Float32Array {
		const bytesPerFloat = Float32Array.BYTES_PER_ELEMENT;
		const floatCount = Math.floor(blob.byteLength / bytesPerFloat);

		if (blob.byteOffset % bytesPerFloat === 0) {
			return new Float32Array(blob.buffer, blob.byteOffset, floatCount);
		}

		const aligned = new Uint8Array(floatCount * bytesPerFloat);
		aligned.set(blob.subarray(0, floatCount * bytesPerFloat));
		return new Float32Array(aligned.buffer);
	}
}