feat(TRUEREF-0008): implement hybrid semantic search with RRF

- Cosine similarity vector search over stored embeddings
- Reciprocal Rank Fusion (K=60) combining FTS5 + vector rankings
- Configurable alpha weight between keyword and semantic search
- Graceful degradation to FTS5-only when no embedding provider configured

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Giancarmine Salucci
2026-03-23 09:06:25 +01:00
parent 33bdf30709
commit d3d577a2e2
4 changed files with 1009 additions and 0 deletions

View File

@@ -0,0 +1,624 @@
/**
* Unit tests for HybridSearchService, VectorSearch, and RRF (TRUEREF-0008).
*
* Uses an in-memory SQLite database and a mock EmbeddingProvider.
* No real network calls are made.
*/
import { describe, it, expect, beforeEach } from 'vitest';
import Database from 'better-sqlite3';
import { readFileSync } from 'node:fs';
import { join } from 'node:path';
import { SearchService } from './search.service.js';
import { HybridSearchService } from './hybrid.search.service.js';
import { VectorSearch, cosineSimilarity } from './vector.search.js';
import { reciprocalRankFusion } from './rrf.js';
import type { EmbeddingProvider, EmbeddingVector } from '../embeddings/provider.js';
// ---------------------------------------------------------------------------
// In-memory DB factory
// ---------------------------------------------------------------------------
/**
 * Builds a fresh in-memory SQLite database for a single test.
 *
 * Foreign keys are switched on, every statement of the initial migration file
 * (separated by the `--> statement-breakpoint` marker) is executed in order,
 * and finally the contents of `fts.sql` are executed.
 *
 * @returns The opened in-memory database handle.
 */
function createTestDb(): Database.Database {
  const dbDir = join(import.meta.dirname, '../db');
  const db = new Database(':memory:');
  db.pragma('foreign_keys = ON');

  const migration = readFileSync(
    join(dbDir, 'migrations', '0000_large_master_chief.sql'),
    'utf-8'
  );
  // Run each non-empty chunk between breakpoint markers as its own statement.
  migration.split('--> statement-breakpoint').forEach((chunk) => {
    const sql = chunk.trim();
    if (sql) {
      db.exec(sql);
    }
  });

  db.exec(readFileSync(join(dbDir, 'fts.sql'), 'utf-8'));
  return db;
}
// ---------------------------------------------------------------------------
// Seed helpers
// ---------------------------------------------------------------------------
/** Wall-clock time at module load, in whole seconds; used for every seeded timestamp column. */
const NOW_S = Math.trunc(Date.now() / 1000);
/**
 * Inserts a repository row, ignoring the insert when the id already exists.
 *
 * @param client Database to insert into.
 * @param id Repository id; also appended to `https://github.com` to form the source URL.
 * @returns The repository id that was passed in.
 */
function seedRepo(client: Database.Database, id = '/test/repo'): string {
  const insertRepo = client.prepare(
    `INSERT OR IGNORE INTO repositories
(id, title, source, source_url, state, created_at, updated_at)
VALUES (?, ?, ?, ?, ?, ?, ?)`
  );
  const sourceUrl = `https://github.com${id}`;
  insertRepo.run(id, 'Test Repo', 'github', sourceUrl, 'indexed', NOW_S, NOW_S);
  return id;
}
/**
 * Inserts a `README.md` document row for the given repository.
 *
 * @param client Database to insert into.
 * @param repositoryId Id of the owning repository.
 * @returns The randomly generated document id.
 */
function seedDocument(client: Database.Database, repositoryId: string): string {
  const documentId = crypto.randomUUID();
  const insertDocument = client.prepare(
    `INSERT INTO documents (id, repository_id, file_path, checksum, indexed_at)
VALUES (?, ?, ?, ?, ?)`
  );
  insertDocument.run(documentId, repositoryId, 'README.md', 'abc', NOW_S);
  return documentId;
}
/**
 * Inserts a snippet row and returns its randomly generated id.
 *
 * @param client Database to insert into.
 * @param opts Snippet fields; `type` falls back to `'info'` and `title` to `null`
 *   when they are not supplied.
 * @returns The id of the inserted snippet.
 */
function seedSnippet(
  client: Database.Database,
  opts: {
    repositoryId: string;
    documentId: string;
    content: string;
    title?: string | null;
    type?: 'code' | 'info';
  }
): string {
  const snippetId = crypto.randomUUID();
  const snippetType = opts.type ?? 'info';
  const snippetTitle = opts.title ?? null;
  const insertSnippet = client.prepare(
    `INSERT INTO snippets
(id, document_id, repository_id, type, title, content, created_at)
VALUES (?, ?, ?, ?, ?, ?, ?)`
  );
  insertSnippet.run(
    snippetId,
    opts.documentId,
    opts.repositoryId,
    snippetType,
    snippetTitle,
    opts.content,
    NOW_S
  );
  return snippetId;
}
/**
 * Stores (or replaces) the embedding row for a snippet.
 *
 * The values are converted to 32-bit floats and written as a raw byte buffer;
 * `dimensions` is recorded as the number of values supplied.
 *
 * @param client Database to insert into.
 * @param snippetId Id of the snippet the embedding belongs to.
 * @param values Embedding components.
 * @param model Model name stored alongside the embedding.
 */
function seedEmbedding(
  client: Database.Database,
  snippetId: string,
  values: number[],
  model = 'test-model'
): void {
  const vector = Float32Array.from(values);
  const blob = Buffer.from(vector.buffer);
  const upsertEmbedding = client.prepare(
    `INSERT OR REPLACE INTO snippet_embeddings
(snippet_id, model, dimensions, embedding, created_at)
VALUES (?, ?, ?, ?, ?)`
  );
  upsertEmbedding.run(snippetId, model, values.length, blob, NOW_S);
}
// ---------------------------------------------------------------------------
// Mock EmbeddingProvider
// ---------------------------------------------------------------------------
/**
 * Creates an always-available mock EmbeddingProvider that returns canned vectors.
 *
 * `embed()` returns one vector per input text, cycling through `returnValues`
 * when there are more texts than vectors.
 *
 * An empty `returnValues` is treated like the default (`[[1, 0, 0, 0]]`).
 * Previously `dimensions` reported 4 in that case while `embed()` threw a
 * TypeError on the first text, because indexing an empty list yields `undefined`.
 *
 * @param returnValues Vectors handed out by `embed()`, in order.
 * @returns A provider named `'mock'` using the model name `'test-model'`.
 */
function makeMockProvider(
  returnValues: number[][] = [[1, 0, 0, 0]]
): EmbeddingProvider {
  // Guard against an empty list so the modulo below never divides by zero.
  const vectors = returnValues.length > 0 ? returnValues : [[1, 0, 0, 0]];
  return {
    name: 'mock',
    dimensions: vectors[0]?.length ?? 4,
    model: 'test-model',
    async embed(texts: string[]): Promise<EmbeddingVector[]> {
      return texts.map((_, i) => {
        const vals = vectors[i % vectors.length];
        return {
          values: new Float32Array(vals),
          dimensions: vals.length,
          model: 'test-model'
        };
      });
    },
    async isAvailable(): Promise<boolean> {
      return true;
    }
  };
}
/**
 * Creates an EmbeddingProvider stub that is never available and embeds nothing.
 *
 * `embed()` always resolves to an empty array and `isAvailable()` to `false`.
 */
function makeNoopProvider(): EmbeddingProvider {
  const embed = async (_texts: string[]): Promise<EmbeddingVector[]> => [];
  const isAvailable = async (): Promise<boolean> => false;
  return { name: 'noop', dimensions: 0, model: 'none', embed, isAvailable };
}
// ===========================================================================
// cosineSimilarity
// ===========================================================================
describe('cosineSimilarity', () => {
  /** Shorthand for building a Float32Array from its components. */
  const vec = (...components: number[]): Float32Array => new Float32Array(components);

  it('returns 1.0 for identical vectors', () => {
    const same = vec(1, 2, 3);
    expect(cosineSimilarity(same, same)).toBeCloseTo(1.0, 5);
  });

  it('returns 0.0 for orthogonal vectors', () => {
    const similarity = cosineSimilarity(vec(1, 0), vec(0, 1));
    expect(similarity).toBeCloseTo(0.0, 5);
  });

  it('returns -1.0 for opposite vectors', () => {
    const similarity = cosineSimilarity(vec(1, 0), vec(-1, 0));
    expect(similarity).toBeCloseTo(-1.0, 5);
  });

  it('returns 0 for zero-magnitude vector', () => {
    const similarity = cosineSimilarity(vec(0, 0), vec(1, 2));
    expect(similarity).toBe(0);
  });

  it('throws when dimensions do not match', () => {
    const compute = (): number => cosineSimilarity(vec(1, 2), vec(1, 2, 3));
    expect(compute).toThrow('dimension mismatch');
  });

  it('computes correct similarity for non-trivial vectors', () => {
    // dot([1,1],[1,0]) = 1, norms are sqrt(2) and 1, so the result is 1/sqrt(2) ≈ 0.7071.
    const expected = 1 / Math.sqrt(2);
    expect(cosineSimilarity(vec(1, 1), vec(1, 0))).toBeCloseTo(expected, 4);
  });
});
// ===========================================================================
// reciprocalRankFusion
// ===========================================================================
describe('reciprocalRankFusion', () => {
  /** Extracts the ids of a fused result list, in order. */
  const idsOf = (items: ReadonlyArray<{ id: string }>): string[] =>
    items.map((item) => item.id);

  it('returns empty array for empty inputs', () => {
    const fused = reciprocalRankFusion([], []);
    expect(fused).toHaveLength(0);
  });

  it('fuses a single list preserving order', () => {
    const fused = reciprocalRankFusion([
      { id: 'a', score: 10 },
      { id: 'b', score: 5 },
      { id: 'c', score: 1 }
    ]);
    expect(idsOf(fused)).toEqual(['a', 'b', 'c']);
  });

  it('deduplicates items appearing in multiple lists', () => {
    const fused = reciprocalRankFusion([{ id: 'a', score: 1 }], [{ id: 'a', score: 1 }]);
    const occurrences = fused.filter((entry) => entry.id === 'a');
    expect(occurrences).toHaveLength(1);
  });

  it('boosts items appearing in multiple lists', () => {
    // 'a' is ranked first in both lists while 'b' is second in one list only,
    // so the fused score of 'a' must be the larger one.
    const keywordRanking = [
      { id: 'a', score: 1 },
      { id: 'b', score: 0.5 }
    ];
    const vectorRanking = [{ id: 'a', score: 1 }];
    const fused = reciprocalRankFusion(keywordRanking, vectorRanking);
    const scoreOfA = fused.find((entry) => entry.id === 'a')!.rrfScore;
    const scoreOfB = fused.find((entry) => entry.id === 'b')!.rrfScore;
    expect(scoreOfA).toBeGreaterThan(scoreOfB);
  });

  it('assigns higher rrfScore to higher-ranked items', () => {
    const fused = reciprocalRankFusion([
      { id: 'first', score: 100 },
      { id: 'second', score: 50 }
    ]);
    const [top, runnerUp] = [fused[0], fused[1]];
    expect(top.id).toBe('first');
    expect(top.rrfScore).toBeGreaterThan(runnerUp.rrfScore);
  });

  it('handles three lists correctly', () => {
    const fused = reciprocalRankFusion(
      [{ id: 'a', score: 1 }, { id: 'b', score: 0 }],
      [{ id: 'b', score: 1 }, { id: 'c', score: 0 }],
      [{ id: 'a', score: 1 }, { id: 'c', score: 0 }]
    );
    // 'a' leads two of the three lists, so it must come out on top.
    expect(fused[0].id).toBe('a');
    expect(idsOf(fused)).toContain('b');
    expect(idsOf(fused)).toContain('c');
  });

  it('produces positive rrfScores', () => {
    const fused = reciprocalRankFusion([{ id: 'x', score: 0 }]);
    expect(fused[0].rrfScore).toBeGreaterThan(0);
  });
});
// ===========================================================================
// VectorSearch
// ===========================================================================
describe('VectorSearch', () => {
  let client: Database.Database;
  let repoId: string;
  let docId: string;

  // Every test gets its own in-memory database with one repository and one document.
  beforeEach(() => {
    client = createTestDb();
    repoId = seedRepo(client);
    docId = seedDocument(client, repoId);
  });

  it('returns empty array when no embeddings exist', () => {
    const vs = new VectorSearch(client);
    const results = vs.vectorSearch(new Float32Array([1, 0]), repoId);
    expect(results).toHaveLength(0);
  });

  it('returns results sorted by descending cosine similarity', () => {
    const s1 = seedSnippet(client, { repositoryId: repoId, documentId: docId, content: 'alpha' });
    const s2 = seedSnippet(client, { repositoryId: repoId, documentId: docId, content: 'beta' });
    const s3 = seedSnippet(client, { repositoryId: repoId, documentId: docId, content: 'gamma' });
    // Query: [1, 0, 0, 0]
    // s1: [1, 0, 0, 0] → similarity 1.0 (most similar)
    // s2: [0, 1, 0, 0] → similarity 0.0
    // s3: [0, 0, 1, 0] → similarity 0.0
    seedEmbedding(client, s1, [1, 0, 0, 0]);
    seedEmbedding(client, s2, [0, 1, 0, 0]);
    seedEmbedding(client, s3, [0, 0, 1, 0]);
    const vs = new VectorSearch(client);
    const results = vs.vectorSearch(new Float32Array([1, 0, 0, 0]), repoId);
    expect(results).toHaveLength(3);
    expect(results[0].snippetId).toBe(s1);
    expect(results[0].score).toBeCloseTo(1.0, 4);
    // Check the whole list, not just the head: scores must be non-increasing.
    const scores = results.map((r) => r.score);
    expect(scores).toEqual([...scores].sort((a, b) => b - a));
  });

  it('respects the limit parameter', () => {
    const ids: string[] = [];
    for (let i = 0; i < 5; i++) {
      const id = seedSnippet(client, {
        repositoryId: repoId,
        documentId: docId,
        content: `item ${i}`
      });
      seedEmbedding(client, id, [i * 0.1, 1 - i * 0.1]);
      ids.push(id);
    }
    const vs = new VectorSearch(client);
    const results = vs.vectorSearch(new Float32Array([1, 0]), repoId, 3);
    // Five candidates exist, so the limit must yield exactly three results
    // (a `<= 3` check would also pass on an empty result).
    expect(results).toHaveLength(3);
    // Similarity to [1, 0] grows with i, so the three best are items 4, 3, 2.
    // This confirms the limit is applied after ranking, not before.
    expect(results.map((r) => r.snippetId)).toEqual([ids[4], ids[3], ids[2]]);
  });

  it('only returns snippets from the specified repository', () => {
    const otherRepoId = seedRepo(client, '/other/repo');
    const otherDocId = seedDocument(client, otherRepoId);
    const s1 = seedSnippet(client, { repositoryId: repoId, documentId: docId, content: 'mine' });
    const s2 = seedSnippet(client, {
      repositoryId: otherRepoId,
      documentId: otherDocId,
      content: 'theirs'
    });
    seedEmbedding(client, s1, [1, 0]);
    seedEmbedding(client, s2, [1, 0]);
    const vs = new VectorSearch(client);
    const results = vs.vectorSearch(new Float32Array([1, 0]), repoId);
    expect(results).toHaveLength(1);
    expect(results[0].snippetId).toBe(s1);
  });

  it('handles embeddings with negative values', () => {
    const s1 = seedSnippet(client, { repositoryId: repoId, documentId: docId, content: 'neg' });
    seedEmbedding(client, s1, [-0.5, 0.5]);
    const vs = new VectorSearch(client);
    const results = vs.vectorSearch(new Float32Array([-0.5, 0.5]), repoId);
    // Assert the length first so an empty result fails as an assertion
    // instead of a TypeError on `results[0]`.
    expect(results).toHaveLength(1);
    expect(results[0].score).toBeCloseTo(1.0, 4);
  });
});
// ===========================================================================
// HybridSearchService
// ===========================================================================
describe('HybridSearchService', () => {
  let client: Database.Database;
  let searchService: SearchService;
  let repoId: string;
  let docId: string;

  // Every test gets its own in-memory database, a SearchService bound to it,
  // and one seeded repository with one document.
  beforeEach(() => {
    client = createTestDb();
    searchService = new SearchService(client);
    repoId = seedRepo(client);
    docId = seedDocument(client, repoId);
  });

  // -------------------------------------------------------------------------
  // FTS5-only mode (no provider / alpha = 0)
  // -------------------------------------------------------------------------
  it('returns FTS5 results when embeddingProvider is null', async () => {
    seedSnippet(client, { repositoryId: repoId, documentId: docId, content: 'hello world' });
    const svc = new HybridSearchService(client, searchService, null);
    const results = await svc.search('hello', { repositoryId: repoId });
    expect(results.length).toBeGreaterThan(0);
    expect(results[0].snippet.content).toBe('hello world');
  });

  it('returns FTS5 results when alpha = 0', async () => {
    seedSnippet(client, { repositoryId: repoId, documentId: docId, content: 'alpha zero test' });
    const provider = makeMockProvider([[1, 0]]);
    const svc = new HybridSearchService(client, searchService, provider);
    const results = await svc.search('alpha zero', { repositoryId: repoId, alpha: 0 });
    expect(results.length).toBeGreaterThan(0);
  });

  it('returns empty array when FTS5 query is blank and no provider', async () => {
    const svc = new HybridSearchService(client, searchService, null);
    const results = await svc.search(' ', { repositoryId: repoId });
    expect(results).toHaveLength(0);
  });

  it('falls back to FTS5 when noop provider returns empty embeddings', async () => {
    seedSnippet(client, {
      repositoryId: repoId,
      documentId: docId,
      content: 'noop fallback test'
    });
    const svc = new HybridSearchService(client, searchService, makeNoopProvider());
    const results = await svc.search('noop fallback', { repositoryId: repoId });
    expect(results.length).toBeGreaterThan(0);
  });

  // -------------------------------------------------------------------------
  // Hybrid mode
  // -------------------------------------------------------------------------
  it('returns results when hybrid mode is active (alpha = 0.5)', async () => {
    const s1 = seedSnippet(client, {
      repositoryId: repoId,
      documentId: docId,
      content: 'hybrid search keyword match'
    });
    seedEmbedding(client, s1, [1, 0, 0, 0]);
    const provider = makeMockProvider([[1, 0, 0, 0]]);
    const svc = new HybridSearchService(client, searchService, provider);
    const results = await svc.search('hybrid search', {
      repositoryId: repoId,
      alpha: 0.5
    });
    expect(results.length).toBeGreaterThan(0);
  });

  it('deduplicates snippets appearing in both FTS5 and vector results', async () => {
    const s1 = seedSnippet(client, {
      repositoryId: repoId,
      documentId: docId,
      content: 'deduplicate this snippet carefully'
    });
    seedEmbedding(client, s1, [1, 0]);
    const provider = makeMockProvider([[1, 0]]);
    const svc = new HybridSearchService(client, searchService, provider);
    const results = await svc.search('deduplicate snippet', {
      repositoryId: repoId,
      alpha: 0.5
    });
    // An empty result has no duplicates either, so require at least one hit first.
    expect(results.length).toBeGreaterThan(0);
    // No duplicate IDs.
    const ids = results.map((r) => r.snippet.id);
    expect(ids.length).toBe(new Set(ids).size);
  });

  it('respects the limit option', async () => {
    for (let i = 0; i < 10; i++) {
      const id = seedSnippet(client, {
        repositoryId: repoId,
        documentId: docId,
        content: `pagination test item number ${i} relevant content here`
      });
      seedEmbedding(client, id, [1, i * 0.1]);
    }
    const provider = makeMockProvider([[1, 0]]);
    const svc = new HybridSearchService(client, searchService, provider);
    const results = await svc.search('pagination test', {
      repositoryId: repoId,
      limit: 3,
      alpha: 0.5
    });
    // Ten snippets match, so the upper bound alone would also accept an empty result.
    expect(results.length).toBeGreaterThan(0);
    expect(results.length).toBeLessThanOrEqual(3);
  });

  // -------------------------------------------------------------------------
  // Pure vector mode
  // -------------------------------------------------------------------------
  it('returns vector-ranked results when alpha = 1', async () => {
    const s1 = seedSnippet(client, {
      repositoryId: repoId,
      documentId: docId,
      content: 'vector only mode'
    });
    const s2 = seedSnippet(client, {
      repositoryId: repoId,
      documentId: docId,
      content: 'unrelated content'
    });
    // s1 is aligned with the query; s2 is orthogonal.
    seedEmbedding(client, s1, [1, 0]);
    seedEmbedding(client, s2, [0, 1]);
    const provider = makeMockProvider([[1, 0]]);
    const svc = new HybridSearchService(client, searchService, provider);
    const results = await svc.search('anything', {
      repositoryId: repoId,
      alpha: 1
    });
    expect(results.length).toBeGreaterThan(0);
    expect(results[0].snippet.id).toBe(s1);
  });

  // -------------------------------------------------------------------------
  // Result structure
  // -------------------------------------------------------------------------
  it('results include snippet and repository metadata', async () => {
    const s1 = seedSnippet(client, {
      repositoryId: repoId,
      documentId: docId,
      content: 'metadata check snippet content',
      title: 'My Snippet Title'
    });
    seedEmbedding(client, s1, [1, 0]);
    const provider = makeMockProvider([[1, 0]]);
    const svc = new HybridSearchService(client, searchService, provider);
    const results = await svc.search('metadata check', {
      repositoryId: repoId,
      alpha: 0.5
    });
    expect(results.length).toBeGreaterThan(0);
    const first = results[0];
    expect(first.snippet.id).toBeDefined();
    expect(first.snippet.content).toBeDefined();
    expect(first.repository.id).toBe(repoId);
    expect(first.repository.title).toBe('Test Repo');
  });

  it('all results belong to the requested repository', async () => {
    const otherRepoId = seedRepo(client, '/other/repo');
    const otherDocId = seedDocument(client, otherRepoId);
    for (let i = 0; i < 3; i++) {
      const id = seedSnippet(client, {
        repositoryId: repoId,
        documentId: docId,
        content: `target repository keyword item ${i}`
      });
      seedEmbedding(client, id, [1, i * 0.1]);
    }
    for (let i = 0; i < 3; i++) {
      const id = seedSnippet(client, {
        repositoryId: otherRepoId,
        documentId: otherDocId,
        content: `other repository keyword item ${i}`
      });
      seedEmbedding(client, id, [1, i * 0.1]);
    }
    const provider = makeMockProvider([[1, 0]]);
    const svc = new HybridSearchService(client, searchService, provider);
    const results = await svc.search('repository keyword', {
      repositoryId: repoId,
      alpha: 0.5
    });
    // `every` is vacuously true on an empty array, so require hits first.
    expect(results.length).toBeGreaterThan(0);
    expect(results.every((r) => r.snippet.repositoryId === repoId)).toBe(true);
  });

  it('filters by snippet type when provided', async () => {
    const code = seedSnippet(client, {
      repositoryId: repoId,
      documentId: docId,
      content: 'function example code snippet',
      type: 'code'
    });
    const info = seedSnippet(client, {
      repositoryId: repoId,
      documentId: docId,
      content: 'function example info snippet',
      type: 'info'
    });
    seedEmbedding(client, code, [1, 0]);
    seedEmbedding(client, info, [1, 0]);
    const provider = makeMockProvider([[1, 0]]);
    const svc = new HybridSearchService(client, searchService, provider);
    const codeResults = await svc.search('function example', {
      repositoryId: repoId,
      type: 'code',
      alpha: 0.5
    });
    // `every` is vacuously true on an empty array, so require hits first.
    expect(codeResults.length).toBeGreaterThan(0);
    expect(codeResults.every((r) => r.snippet.type === 'code')).toBe(true);
  });

  // -------------------------------------------------------------------------
  // Default alpha
  // -------------------------------------------------------------------------
  it('uses alpha = 0.5 when not specified', async () => {
    const s1 = seedSnippet(client, {
      repositoryId: repoId,
      documentId: docId,
      content: 'default alpha hybrid test content'
    });
    seedEmbedding(client, s1, [1, 0]);
    const provider = makeMockProvider([[1, 0]]);
    const svc = new HybridSearchService(client, searchService, provider);
    // Should not throw and should return results.
    const results = await svc.search('default alpha hybrid', { repositoryId: repoId });
    expect(Array.isArray(results)).toBe(true);
    // The seeded snippet matches both keyword and vector search, so a hit is expected.
    expect(results.length).toBeGreaterThan(0);
  });
});