1119 lines
34 KiB
TypeScript
1119 lines
34 KiB
TypeScript
/**
 * Unit tests for HybridSearchService, VectorSearch, and RRF (TRUEREF-0008).
 *
 * Uses an in-memory SQLite database and a mock EmbeddingProvider.
 * No real network calls are made.
 */
|
|
|
|
import { describe, it, expect, beforeEach, afterEach } from 'vitest';
import Database from 'better-sqlite3';
import { readFileSync } from 'node:fs';
import { join } from 'node:path';

import { SearchService } from './search.service.js';
import { HybridSearchService } from './hybrid.search.service.js';
import { VectorSearch, cosineSimilarity } from './vector.search.js';
import { reciprocalRankFusion } from './rrf.js';
import type { EmbeddingProvider, EmbeddingVector } from '../embeddings/provider.js';
import { loadSqliteVec } from '../db/sqlite-vec.js';
import { SqliteVecStore } from './sqlite-vec.store.js';
|
|
|
|
// ---------------------------------------------------------------------------
// In-memory DB factory
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Builds a fresh in-memory SQLite database for a single test.
 *
 * Steps, in order:
 *  1. open `:memory:` and turn on foreign-key enforcement,
 *  2. call `loadSqliteVec(client)` (presumably registers the sqlite-vec
 *     extension — confirm in `../db/sqlite-vec.js`),
 *  3. execute every listed migration file, split into individual statements
 *     on the `--> statement-breakpoint` marker,
 *  4. execute `../db/fts.sql`.
 *
 * @returns The open database handle. The caller is responsible for closing it.
 * @throws Re-throws any error raised while reading or executing the SQL; the
 *   connection is closed first so a failed setup does not leak a handle.
 */
function createTestDb(): Database.Database {
  const client = new Database(':memory:');

  try {
    client.pragma('foreign_keys = ON');
    loadSqliteVec(client);

    const migrationsFolder = join(import.meta.dirname, '../db/migrations');

    // Run all migrations in order
    const migrations = [
      '0000_large_master_chief.sql',
      '0001_quick_nighthawk.sql',
      '0002_silky_stellaris.sql',
      '0003_multiversion_config.sql',
      '0004_complete_sentry.sql',
      '0005_fix_stage_defaults.sql',
      '0006_yielding_centennial.sql'
    ];

    for (const migrationFile of migrations) {
      const migrationSql = readFileSync(join(migrationsFolder, migrationFile), 'utf-8');
      // Each migration file holds several statements separated by a marker line.
      const statements = migrationSql
        .split('--> statement-breakpoint')
        .map((s) => s.trim())
        .filter(Boolean);

      for (const stmt of statements) {
        client.exec(stmt);
      }
    }

    const ftsSql = readFileSync(join(import.meta.dirname, '../db/fts.sql'), 'utf-8');
    client.exec(ftsSql);
  } catch (error) {
    // Do not leave a half-initialised connection open when setup fails.
    client.close();
    throw error;
  }

  return client;
}
|
|
|
|
// ---------------------------------------------------------------------------
// Seed helpers
// ---------------------------------------------------------------------------
|
|
|
|
/** Current Unix time in whole seconds, captured once at module load and reused for every seeded timestamp column. */
const NOW_S = Math.trunc(Date.now() / 1000);
|
|
|
|
/**
 * Inserts a repository row for tests.
 *
 * Uses `INSERT OR IGNORE`, so calling it again with the same id leaves the
 * existing row untouched.
 *
 * @param client - Database to write to.
 * @param id - Repository id; also appended to `https://github.com` to form `source_url`.
 * @returns The repository id that was passed in.
 */
function seedRepo(client: Database.Database, id = '/test/repo'): string {
  client
    .prepare(
      `INSERT OR IGNORE INTO repositories
        (id, title, source, source_url, state, created_at, updated_at)
       VALUES (?, ?, ?, ?, ?, ?, ?)`
    )
    .run(id, 'Test Repo', 'github', `https://github.com${id}`, 'indexed', NOW_S, NOW_S);

  return id;
}
|
|
|
|
/**
 * Inserts a document row (`README.md`, checksum `abc`) for the given repository.
 *
 * @param client - Database to write to.
 * @param repositoryId - Id stored in the row's `repository_id` column.
 * @returns The freshly generated UUID used as the document id.
 */
function seedDocument(client: Database.Database, repositoryId: string): string {
  const docId = crypto.randomUUID();

  client
    .prepare(
      `INSERT INTO documents (id, repository_id, file_path, checksum, indexed_at)
       VALUES (?, ?, ?, ?, ?)`
    )
    .run(docId, repositoryId, 'README.md', 'abc', NOW_S);

  return docId;
}
|
|
|
|
/**
 * Inserts a snippet row for tests.
 *
 * @param client - Database to write to.
 * @param opts - Row values. `type` defaults to `'info'` and `title` to `null`
 *   when omitted.
 * @returns The freshly generated UUID used as the snippet id.
 */
function seedSnippet(
  client: Database.Database,
  opts: {
    repositoryId: string;
    documentId: string;
    content: string;
    title?: string | null;
    type?: 'code' | 'info';
  }
): string {
  const id = crypto.randomUUID();

  client
    .prepare(
      `INSERT INTO snippets
        (id, document_id, repository_id, type, title, content, created_at)
       VALUES (?, ?, ?, ?, ?, ?, ?)`
    )
    .run(
      id,
      opts.documentId,
      opts.repositoryId,
      opts.type ?? 'info',
      opts.title ?? null,
      opts.content,
      NOW_S
    );

  return id;
}
|
|
|
|
/**
 * Stores an embedding for a snippet in both places the tests read from:
 * the `snippet_embeddings` table (as a raw Float32 blob, via
 * `INSERT OR REPLACE`) and the store behind `SqliteVecStore.upsertEmbedding`.
 *
 * @param client - Database to write to.
 * @param snippetId - Snippet the embedding belongs to.
 * @param values - Vector components; the array length is stored as `dimensions`.
 * @param profileId - Embedding profile id (defaults to `'local-default'`).
 * @param model - Model name recorded with the row (defaults to `'test-model'`).
 */
function seedEmbedding(
  client: Database.Database,
  snippetId: string,
  values: number[],
  profileId = 'local-default',
  model = 'test-model'
): void {
  const f32 = new Float32Array(values);
  // Copy exactly the bytes the typed array views, not the whole backing buffer.
  const blob = Buffer.from(f32.buffer, f32.byteOffset, f32.byteLength);

  client
    .prepare(
      `INSERT OR REPLACE INTO snippet_embeddings
        (snippet_id, profile_id, model, dimensions, embedding, created_at)
       VALUES (?, ?, ?, ?, ?, ?)`
    )
    .run(snippetId, profileId, model, values.length, blob, NOW_S);

  new SqliteVecStore(client).upsertEmbedding(profileId, snippetId, f32);
}
|
|
|
|
// ---------------------------------------------------------------------------
// Mock EmbeddingProvider
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Creates a deterministic, always-available EmbeddingProvider for tests.
 *
 * `embed()` returns one vector per input text, cycling through
 * `returnValues` (text `i` gets `returnValues[i % returnValues.length]`).
 *
 * @param returnValues - Fixture vectors to hand out. When empty, the provider
 *   reports 4 dimensions and `embed()` yields zero vectors of that length
 *   instead of throwing.
 * @returns A provider named `'mock'` using model `'test-model'`.
 */
function makeMockProvider(returnValues: number[][] = [[1, 0, 0, 0]]): EmbeddingProvider {
  const dimensions = returnValues[0]?.length ?? 4;

  return {
    name: 'mock',
    dimensions,
    model: 'test-model',
    async embed(texts: string[]): Promise<EmbeddingVector[]> {
      return texts.map((_, i) => {
        // With no fixtures `i % 0` is NaN and the lookup is undefined;
        // fall back to a zero vector matching the advertised dimensions.
        const vals =
          returnValues[i % returnValues.length] ?? new Array<number>(dimensions).fill(0);
        return {
          values: new Float32Array(vals),
          dimensions: vals.length,
          model: 'test-model'
        };
      });
    },
    async isAvailable(): Promise<boolean> {
      return true;
    }
  };
}
|
|
|
|
/**
 * Creates an EmbeddingProvider stand-in that is never available:
 * `isAvailable()` resolves to `false` and `embed()` resolves to an empty
 * list regardless of input.
 *
 * @returns A provider named `'noop'` with model `'none'` and 0 dimensions.
 */
function makeNoopProvider(): EmbeddingProvider {
  return {
    name: 'noop',
    dimensions: 0,
    model: 'none',
    async embed(): Promise<EmbeddingVector[]> {
      return [];
    },
    async isAvailable(): Promise<boolean> {
      return false;
    }
  };
}
|
|
|
|
// ===========================================================================
// cosineSimilarity
// ===========================================================================
|
|
|
|
// Pins the numeric contract of cosineSimilarity: the three canonical angles,
// the zero-vector case, the dimension-mismatch error, and one hand-computed value.
describe('cosineSimilarity', () => {
  it('returns 1.0 for identical vectors', () => {
    const v = new Float32Array([1, 2, 3]);
    expect(cosineSimilarity(v, v)).toBeCloseTo(1.0, 5);
  });

  it('returns 0.0 for orthogonal vectors', () => {
    const a = new Float32Array([1, 0]);
    const b = new Float32Array([0, 1]);
    expect(cosineSimilarity(a, b)).toBeCloseTo(0.0, 5);
  });

  it('returns -1.0 for opposite vectors', () => {
    const a = new Float32Array([1, 0]);
    const b = new Float32Array([-1, 0]);
    expect(cosineSimilarity(a, b)).toBeCloseTo(-1.0, 5);
  });

  it('returns 0 for zero-magnitude vector', () => {
    const a = new Float32Array([0, 0]);
    const b = new Float32Array([1, 2]);
    expect(cosineSimilarity(a, b)).toBe(0);
  });

  it('throws when dimensions do not match', () => {
    const a = new Float32Array([1, 2]);
    const b = new Float32Array([1, 2, 3]);
    expect(() => cosineSimilarity(a, b)).toThrow('dimension mismatch');
  });

  it('computes correct similarity for non-trivial vectors', () => {
    // [1,1] · [1,0] = 1; |[1,1]| = sqrt(2); |[1,0]| = 1 → 1/sqrt(2) ≈ 0.7071
    const a = new Float32Array([1, 1]);
    const b = new Float32Array([1, 0]);
    expect(cosineSimilarity(a, b)).toBeCloseTo(1 / Math.sqrt(2), 4);
  });
});
|
|
|
|
// ===========================================================================
// reciprocalRankFusion
// ===========================================================================
|
|
|
|
// Checks reciprocalRankFusion on empty, single, overlapping and three-way
// rankings: order preservation, de-duplication, the boost for ids present in
// several lists, and strictly positive fused scores.
describe('reciprocalRankFusion', () => {
  it('returns empty array for empty inputs', () => {
    expect(reciprocalRankFusion([], [])).toHaveLength(0);
  });

  it('fuses a single list preserving order', () => {
    const ranking = [
      { id: 'a', score: 10 },
      { id: 'b', score: 5 },
      { id: 'c', score: 1 }
    ];
    const result = reciprocalRankFusion(ranking);
    expect(result.map((r) => r.id)).toEqual(['a', 'b', 'c']);
  });

  it('deduplicates items appearing in multiple lists', () => {
    const r1 = [{ id: 'a', score: 1 }];
    const r2 = [{ id: 'a', score: 1 }];
    const result = reciprocalRankFusion(r1, r2);
    expect(result.filter((r) => r.id === 'a')).toHaveLength(1);
  });

  it('boosts items appearing in multiple lists', () => {
    // 'a' appears in both rankings at rank 0.
    // 'b' appears only in r1 at rank 1.
    // 'a' should outscore 'b'.
    const r1 = [
      { id: 'a', score: 1 },
      { id: 'b', score: 0.5 }
    ];
    const r2 = [{ id: 'a', score: 1 }];
    const result = reciprocalRankFusion(r1, r2);
    const aScore = result.find((r) => r.id === 'a')!.rrfScore;
    const bScore = result.find((r) => r.id === 'b')!.rrfScore;
    expect(aScore).toBeGreaterThan(bScore);
  });

  it('assigns higher rrfScore to higher-ranked items', () => {
    const ranking = [
      { id: 'first', score: 100 },
      { id: 'second', score: 50 }
    ];
    const result = reciprocalRankFusion(ranking);
    expect(result[0].id).toBe('first');
    expect(result[0].rrfScore).toBeGreaterThan(result[1].rrfScore);
  });

  it('handles three lists correctly', () => {
    const r1 = [
      { id: 'a', score: 1 },
      { id: 'b', score: 0 }
    ];
    const r2 = [
      { id: 'b', score: 1 },
      { id: 'c', score: 0 }
    ];
    const r3 = [
      { id: 'a', score: 1 },
      { id: 'c', score: 0 }
    ];
    const result = reciprocalRankFusion(r1, r2, r3);
    // 'a' appears first in r1 and r3 → higher combined score than 'b' or 'c'.
    expect(result[0].id).toBe('a');
    expect(result.map((r) => r.id)).toContain('b');
    expect(result.map((r) => r.id)).toContain('c');
  });

  it('produces positive rrfScores', () => {
    const ranking = [{ id: 'x', score: 0 }];
    const result = reciprocalRankFusion(ranking);
    expect(result[0].rrfScore).toBeGreaterThan(0);
  });
});
|
|
|
|
// ===========================================================================
// VectorSearch
// ===========================================================================
|
|
|
|
// Exercises VectorSearch.vectorSearch against a fresh in-memory database per
// test: empty store, ranking order, limit, repository scoping, negative
// components, and per-profile isolation.
describe('VectorSearch', () => {
  let client: Database.Database;
  let repoId: string;
  let docId: string;

  beforeEach(() => {
    client = createTestDb();
    repoId = seedRepo(client);
    docId = seedDocument(client, repoId);
  });

  // Each test opens its own connection; close it so handles do not accumulate
  // over the run. Optional call guards against a setup failure in beforeEach.
  afterEach(() => {
    client?.close();
  });

  it('returns empty array when no embeddings exist', () => {
    const vs = new VectorSearch(client);
    const results = vs.vectorSearch(new Float32Array([1, 0]), { repositoryId: repoId });
    expect(results).toHaveLength(0);
  });

  it('returns results sorted by descending cosine similarity', () => {
    const s1 = seedSnippet(client, { repositoryId: repoId, documentId: docId, content: 'alpha' });
    const s2 = seedSnippet(client, { repositoryId: repoId, documentId: docId, content: 'beta' });
    const s3 = seedSnippet(client, { repositoryId: repoId, documentId: docId, content: 'gamma' });

    // Query: [1, 0, 0, 0]
    // s1: [1, 0, 0, 0] → similarity 1.0 (most similar)
    // s2: [0, 1, 0, 0] → similarity 0.0
    // s3: [0, 0, 1, 0] → similarity 0.0
    seedEmbedding(client, s1, [1, 0, 0, 0]);
    seedEmbedding(client, s2, [0, 1, 0, 0]);
    seedEmbedding(client, s3, [0, 0, 1, 0]);

    const vs = new VectorSearch(client);
    const results = vs.vectorSearch(new Float32Array([1, 0, 0, 0]), { repositoryId: repoId });

    expect(results[0].snippetId).toBe(s1);
    expect(results[0].score).toBeCloseTo(1.0, 4);
    expect(results.length).toBe(3);

    // Verify the ordering the test name promises: scores never increase.
    const scores = results.map((r) => r.score);
    scores.slice(1).forEach((score, i) => {
      expect(scores[i]).toBeGreaterThanOrEqual(score);
    });
  });

  it('respects the limit parameter', () => {
    for (let i = 0; i < 5; i++) {
      const id = seedSnippet(client, {
        repositoryId: repoId,
        documentId: docId,
        content: `item ${i}`
      });
      seedEmbedding(client, id, [i * 0.1, 1 - i * 0.1]);
    }

    const vs = new VectorSearch(client);
    const results = vs.vectorSearch(new Float32Array([1, 0]), { repositoryId: repoId, limit: 3 });
    // Five candidates are seeded, so the limit must be hit exactly; an upper
    // bound alone would also pass when nothing is returned.
    expect(results).toHaveLength(3);
  });

  it('only returns snippets from the specified repository', () => {
    const otherRepoId = seedRepo(client, '/other/repo');
    const otherDocId = seedDocument(client, otherRepoId);

    const s1 = seedSnippet(client, { repositoryId: repoId, documentId: docId, content: 'mine' });
    const s2 = seedSnippet(client, {
      repositoryId: otherRepoId,
      documentId: otherDocId,
      content: 'theirs'
    });

    seedEmbedding(client, s1, [1, 0]);
    seedEmbedding(client, s2, [1, 0]);

    const vs = new VectorSearch(client);
    const results = vs.vectorSearch(new Float32Array([1, 0]), { repositoryId: repoId });

    expect(results).toHaveLength(1);
    expect(results[0].snippetId).toBe(s1);
  });

  it('handles embeddings with negative values', () => {
    const s1 = seedSnippet(client, { repositoryId: repoId, documentId: docId, content: 'neg' });
    seedEmbedding(client, s1, [-0.5, 0.5]);

    const vs = new VectorSearch(client);
    const results = vs.vectorSearch(new Float32Array([-0.5, 0.5]), { repositoryId: repoId });
    expect(results[0].score).toBeCloseTo(1.0, 4);
  });

  it('filters by profileId using per-profile vec tables', () => {
    client
      .prepare(
        `INSERT INTO embedding_profiles (id, provider_kind, title, enabled, is_default, model, dimensions, config, created_at, updated_at)
         VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
      )
      .run(
        'secondary-profile',
        'local-transformers',
        'Secondary',
        1,
        0,
        'test-model',
        2,
        '{}',
        NOW_S,
        NOW_S
      );

    const defaultSnippet = seedSnippet(client, {
      repositoryId: repoId,
      documentId: docId,
      content: 'default profile snippet'
    });
    const secondarySnippet = seedSnippet(client, {
      repositoryId: repoId,
      documentId: docId,
      content: 'secondary profile snippet'
    });

    seedEmbedding(client, defaultSnippet, [1, 0], 'local-default');
    seedEmbedding(client, secondarySnippet, [1, 0], 'secondary-profile');

    const vs = new VectorSearch(client);
    const defaultResults = vs.vectorSearch(new Float32Array([1, 0]), {
      repositoryId: repoId,
      profileId: 'local-default'
    });
    const secondaryResults = vs.vectorSearch(new Float32Array([1, 0]), {
      repositoryId: repoId,
      profileId: 'secondary-profile'
    });

    expect(defaultResults.map((result) => result.snippetId)).toEqual([defaultSnippet]);
    expect(secondaryResults.map((result) => result.snippetId)).toEqual([secondarySnippet]);
  });
});
|
|
|
|
// ===========================================================================
// HybridSearchService
// ===========================================================================
|
|
|
|
describe('HybridSearchService', () => {
|
|
let client: Database.Database;
|
|
let searchService: SearchService;
|
|
let repoId: string;
|
|
let docId: string;
|
|
|
|
beforeEach(() => {
|
|
client = createTestDb();
|
|
searchService = new SearchService(client);
|
|
repoId = seedRepo(client);
|
|
docId = seedDocument(client, repoId);
|
|
});
|
|
|
|
// -------------------------------------------------------------------------
|
|
// FTS5-only mode (no provider / alpha = 0)
|
|
// -------------------------------------------------------------------------
|
|
|
|
it('returns FTS5 results when embeddingProvider is null', async () => {
|
|
seedSnippet(client, { repositoryId: repoId, documentId: docId, content: 'hello world' });
|
|
|
|
const svc = new HybridSearchService(client, searchService, null);
|
|
const { results } = await svc.search('hello', { repositoryId: repoId });
|
|
|
|
expect(results.length).toBeGreaterThan(0);
|
|
expect(results[0].snippet.content).toBe('hello world');
|
|
});
|
|
|
|
it('returns FTS5 results when alpha = 0', async () => {
|
|
seedSnippet(client, { repositoryId: repoId, documentId: docId, content: 'alpha zero test' });
|
|
|
|
const provider = makeMockProvider([[1, 0]]);
|
|
const svc = new HybridSearchService(client, searchService, provider);
|
|
const { results } = await svc.search('alpha zero', { repositoryId: repoId, alpha: 0 });
|
|
|
|
expect(results.length).toBeGreaterThan(0);
|
|
});
|
|
|
|
it('returns empty array when FTS5 query is blank and no provider', async () => {
|
|
const svc = new HybridSearchService(client, searchService, null);
|
|
const { results } = await svc.search(' ', { repositoryId: repoId });
|
|
expect(results).toHaveLength(0);
|
|
});
|
|
|
|
it('falls back to FTS5 when noop provider returns empty embeddings', async () => {
|
|
seedSnippet(client, {
|
|
repositoryId: repoId,
|
|
documentId: docId,
|
|
content: 'noop fallback test'
|
|
});
|
|
|
|
const svc = new HybridSearchService(client, searchService, makeNoopProvider());
|
|
const { results } = await svc.search('noop fallback', { repositoryId: repoId });
|
|
|
|
expect(results.length).toBeGreaterThan(0);
|
|
});
|
|
|
|
// -------------------------------------------------------------------------
|
|
// Hybrid mode
|
|
// -------------------------------------------------------------------------
|
|
|
|
it('returns results when hybrid mode is active (alpha = 0.5)', async () => {
|
|
const s1 = seedSnippet(client, {
|
|
repositoryId: repoId,
|
|
documentId: docId,
|
|
content: 'hybrid search keyword match'
|
|
});
|
|
seedEmbedding(client, s1, [1, 0, 0, 0]);
|
|
|
|
const provider = makeMockProvider([[1, 0, 0, 0]]);
|
|
const svc = new HybridSearchService(client, searchService, provider);
|
|
|
|
const { results } = await svc.search('hybrid search', {
|
|
repositoryId: repoId,
|
|
alpha: 0.5
|
|
});
|
|
|
|
expect(results.length).toBeGreaterThan(0);
|
|
});
|
|
|
|
it('deduplicates snippets appearing in both FTS5 and vector results', async () => {
|
|
const s1 = seedSnippet(client, {
|
|
repositoryId: repoId,
|
|
documentId: docId,
|
|
content: 'deduplicate this snippet carefully'
|
|
});
|
|
seedEmbedding(client, s1, [1, 0]);
|
|
|
|
const provider = makeMockProvider([[1, 0]]);
|
|
const svc = new HybridSearchService(client, searchService, provider);
|
|
|
|
const { results } = await svc.search('deduplicate snippet', {
|
|
repositoryId: repoId,
|
|
alpha: 0.5
|
|
});
|
|
|
|
// No duplicate IDs.
|
|
const ids = results.map((r) => r.snippet.id);
|
|
expect(ids.length).toBe(new Set(ids).size);
|
|
});
|
|
|
|
it('respects the limit option', async () => {
|
|
for (let i = 0; i < 10; i++) {
|
|
const id = seedSnippet(client, {
|
|
repositoryId: repoId,
|
|
documentId: docId,
|
|
content: `pagination test item number ${i} relevant content here`
|
|
});
|
|
seedEmbedding(client, id, [1, i * 0.1]);
|
|
}
|
|
|
|
const provider = makeMockProvider([[1, 0]]);
|
|
const svc = new HybridSearchService(client, searchService, provider);
|
|
|
|
const { results } = await svc.search('pagination test', {
|
|
repositoryId: repoId,
|
|
limit: 3,
|
|
alpha: 0.5
|
|
});
|
|
|
|
expect(results.length).toBeLessThanOrEqual(3);
|
|
});
|
|
|
|
// -------------------------------------------------------------------------
|
|
// Pure vector mode
|
|
// -------------------------------------------------------------------------
|
|
|
|
it('returns vector-ranked results when alpha = 1', async () => {
|
|
const s1 = seedSnippet(client, {
|
|
repositoryId: repoId,
|
|
documentId: docId,
|
|
content: 'vector only mode'
|
|
});
|
|
const s2 = seedSnippet(client, {
|
|
repositoryId: repoId,
|
|
documentId: docId,
|
|
content: 'unrelated content'
|
|
});
|
|
|
|
// s1 is aligned with the query; s2 is orthogonal.
|
|
seedEmbedding(client, s1, [1, 0]);
|
|
seedEmbedding(client, s2, [0, 1]);
|
|
|
|
const provider = makeMockProvider([[1, 0]]);
|
|
const svc = new HybridSearchService(client, searchService, provider);
|
|
|
|
const { results } = await svc.search('anything', {
|
|
repositoryId: repoId,
|
|
alpha: 1
|
|
});
|
|
|
|
expect(results[0].snippet.id).toBe(s1);
|
|
});
|
|
|
|
// -------------------------------------------------------------------------
|
|
// Result structure
|
|
// -------------------------------------------------------------------------
|
|
|
|
it('results include snippet and repository metadata', async () => {
|
|
const s1 = seedSnippet(client, {
|
|
repositoryId: repoId,
|
|
documentId: docId,
|
|
content: 'metadata check snippet content',
|
|
title: 'My Snippet Title'
|
|
});
|
|
seedEmbedding(client, s1, [1, 0]);
|
|
|
|
const provider = makeMockProvider([[1, 0]]);
|
|
const svc = new HybridSearchService(client, searchService, provider);
|
|
|
|
const { results } = await svc.search('metadata check', {
|
|
repositoryId: repoId,
|
|
alpha: 0.5
|
|
});
|
|
|
|
expect(results.length).toBeGreaterThan(0);
|
|
const first = results[0];
|
|
expect(first.snippet.id).toBeDefined();
|
|
expect(first.snippet.content).toBeDefined();
|
|
expect(first.repository.id).toBe(repoId);
|
|
expect(first.repository.title).toBe('Test Repo');
|
|
});
|
|
|
|
it('all results belong to the requested repository', async () => {
|
|
const otherRepoId = seedRepo(client, '/other/repo');
|
|
const otherDocId = seedDocument(client, otherRepoId);
|
|
|
|
for (let i = 0; i < 3; i++) {
|
|
const id = seedSnippet(client, {
|
|
repositoryId: repoId,
|
|
documentId: docId,
|
|
content: `target repository keyword item ${i}`
|
|
});
|
|
seedEmbedding(client, id, [1, i * 0.1]);
|
|
}
|
|
for (let i = 0; i < 3; i++) {
|
|
const id = seedSnippet(client, {
|
|
repositoryId: otherRepoId,
|
|
documentId: otherDocId,
|
|
content: `other repository keyword item ${i}`
|
|
});
|
|
seedEmbedding(client, id, [1, i * 0.1]);
|
|
}
|
|
|
|
const provider = makeMockProvider([[1, 0]]);
|
|
const svc = new HybridSearchService(client, searchService, provider);
|
|
|
|
const { results } = await svc.search('repository keyword', {
|
|
repositoryId: repoId,
|
|
alpha: 0.5
|
|
});
|
|
|
|
expect(results.every((r) => r.snippet.repositoryId === repoId)).toBe(true);
|
|
});
|
|
|
|
it('filters by snippet type when provided', async () => {
|
|
const code = seedSnippet(client, {
|
|
repositoryId: repoId,
|
|
documentId: docId,
|
|
content: 'function example code snippet',
|
|
type: 'code'
|
|
});
|
|
const info = seedSnippet(client, {
|
|
repositoryId: repoId,
|
|
documentId: docId,
|
|
content: 'function example info snippet',
|
|
type: 'info'
|
|
});
|
|
seedEmbedding(client, code, [1, 0]);
|
|
seedEmbedding(client, info, [1, 0]);
|
|
|
|
const provider = makeMockProvider([[1, 0]]);
|
|
const svc = new HybridSearchService(client, searchService, provider);
|
|
|
|
const { results: codeResults } = await svc.search('function example', {
|
|
repositoryId: repoId,
|
|
type: 'code',
|
|
alpha: 0.5
|
|
});
|
|
|
|
expect(codeResults.every((r) => r.snippet.type === 'code')).toBe(true);
|
|
});
|
|
|
|
// -------------------------------------------------------------------------
|
|
// Default alpha
|
|
// -------------------------------------------------------------------------
|
|
|
|
it('uses alpha = 0.5 when not specified', async () => {
|
|
const s1 = seedSnippet(client, {
|
|
repositoryId: repoId,
|
|
documentId: docId,
|
|
content: 'default alpha hybrid test content'
|
|
});
|
|
seedEmbedding(client, s1, [1, 0]);
|
|
|
|
const provider = makeMockProvider([[1, 0]]);
|
|
const svc = new HybridSearchService(client, searchService, provider);
|
|
|
|
// Should not throw and should return results.
|
|
const { results } = await svc.search('default alpha hybrid', { repositoryId: repoId });
|
|
expect(Array.isArray(results)).toBe(true);
|
|
});
|
|
|
|
it('filters by versionId — excludes snippets from other versions', async () => {
|
|
const client = createTestDb();
|
|
const repoId = seedRepo(client);
|
|
const docId = seedDocument(client, repoId);
|
|
|
|
// Create two versions
|
|
client
|
|
.prepare(
|
|
`INSERT INTO repository_versions (id, repository_id, tag, state, total_snippets, created_at)
|
|
VALUES (?, ?, ?, ?, ?, ?)`
|
|
)
|
|
.run('/test/repo/v1.0', repoId, 'v1.0', 'indexed', 0, NOW_S);
|
|
client
|
|
.prepare(
|
|
`INSERT INTO repository_versions (id, repository_id, tag, state, total_snippets, created_at)
|
|
VALUES (?, ?, ?, ?, ?, ?)`
|
|
)
|
|
.run('/test/repo/v2.0', repoId, 'v2.0', 'indexed', 0, NOW_S);
|
|
|
|
// Create embedding profile
|
|
client
|
|
.prepare(
|
|
`INSERT INTO embedding_profiles (id, provider_kind, title, enabled, is_default, model, dimensions, config, created_at, updated_at)
|
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
|
|
)
|
|
.run('test-profile', 'local-transformers', 'Test', 1, 1, 'test-model', 4, '{}', NOW_S, NOW_S);
|
|
|
|
// Snippet A in version 1.0
|
|
const snippetA = seedSnippet(client, {
|
|
repositoryId: repoId,
|
|
documentId: docId,
|
|
content: 'version 1 text'
|
|
});
|
|
client
|
|
.prepare('UPDATE snippets SET version_id = ? WHERE id = ?')
|
|
.run('/test/repo/v1.0', snippetA);
|
|
|
|
// Seed embedding for snippetA
|
|
const embedA = [0.1, 0.2, 0.3, 0.4];
|
|
const f32A = new Float32Array(embedA);
|
|
client
|
|
.prepare(
|
|
`INSERT INTO snippet_embeddings (snippet_id, profile_id, model, dimensions, embedding, created_at)
|
|
VALUES (?, ?, ?, ?, ?, ?)`
|
|
)
|
|
.run(snippetA, 'test-profile', 'test-model', 4, Buffer.from(f32A.buffer), NOW_S);
|
|
|
|
// Snippet B in version 2.0
|
|
const snippetB = seedSnippet(client, {
|
|
repositoryId: repoId,
|
|
documentId: docId,
|
|
content: 'version 2 text'
|
|
});
|
|
client
|
|
.prepare('UPDATE snippets SET version_id = ? WHERE id = ?')
|
|
.run('/test/repo/v2.0', snippetB);
|
|
|
|
// Seed embedding for snippetB
|
|
const embedB = [0.2, 0.3, 0.4, 0.5];
|
|
const f32B = new Float32Array(embedB);
|
|
client
|
|
.prepare(
|
|
`INSERT INTO snippet_embeddings (snippet_id, profile_id, model, dimensions, embedding, created_at)
|
|
VALUES (?, ?, ?, ?, ?, ?)`
|
|
)
|
|
.run(snippetB, 'test-profile', 'test-model', 4, Buffer.from(f32B.buffer), NOW_S);
|
|
|
|
const vs = new VectorSearch(client);
|
|
const query = new Float32Array([0.1, 0.2, 0.3, 0.4]);
|
|
|
|
// Query with versionId v1.0 should only return snippetA
|
|
const resultsV1 = vs.vectorSearch(query, {
|
|
repositoryId: repoId,
|
|
versionId: '/test/repo/v1.0',
|
|
profileId: 'test-profile'
|
|
});
|
|
expect(resultsV1.map((r) => r.snippetId)).toContain(snippetA);
|
|
expect(resultsV1.map((r) => r.snippetId)).not.toContain(snippetB);
|
|
|
|
// Query with versionId v2.0 should only return snippetB
|
|
const resultsV2 = vs.vectorSearch(query, {
|
|
repositoryId: repoId,
|
|
versionId: '/test/repo/v2.0',
|
|
profileId: 'test-profile'
|
|
});
|
|
expect(resultsV2.map((r) => r.snippetId)).not.toContain(snippetA);
|
|
expect(resultsV2.map((r) => r.snippetId)).toContain(snippetB);
|
|
|
|
// Query without versionId should return both
|
|
const resultsAll = vs.vectorSearch(query, {
|
|
repositoryId: repoId,
|
|
profileId: 'test-profile'
|
|
});
|
|
expect(resultsAll.map((r) => r.snippetId)).toContain(snippetA);
|
|
expect(resultsAll.map((r) => r.snippetId)).toContain(snippetB);
|
|
});
|
|
|
|
it('searchMode=keyword never calls provider.embed()', async () => {
|
|
const client = createTestDb();
|
|
const repoId = seedRepo(client);
|
|
const docId = seedDocument(client, repoId);
|
|
|
|
seedSnippet(client, {
|
|
repositoryId: repoId,
|
|
documentId: docId,
|
|
content: 'keyword only test'
|
|
});
|
|
|
|
let embedCalled = false;
|
|
const mockProvider: EmbeddingProvider = {
|
|
name: 'mock',
|
|
dimensions: 4,
|
|
model: 'test-model',
|
|
async embed() {
|
|
embedCalled = true;
|
|
return [];
|
|
},
|
|
async isAvailable() {
|
|
return true;
|
|
}
|
|
};
|
|
|
|
const searchService = new SearchService(client);
|
|
const hybridService = new HybridSearchService(client, searchService, mockProvider);
|
|
|
|
const { results } = await hybridService.search('keyword', {
|
|
repositoryId: repoId,
|
|
searchMode: 'keyword'
|
|
});
|
|
|
|
expect(embedCalled).toBe(false);
|
|
expect(results.length).toBeGreaterThan(0);
|
|
});
|
|
|
|
it('searchMode=semantic uses only vector search', async () => {
|
|
const client = createTestDb();
|
|
const repoId = seedRepo(client);
|
|
const docId = seedDocument(client, repoId);
|
|
|
|
// Create profile
|
|
client
|
|
.prepare(
|
|
`INSERT INTO embedding_profiles (id, provider_kind, title, enabled, is_default, model, dimensions, config, created_at, updated_at)
|
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
|
|
)
|
|
.run('test-profile', 'local-transformers', 'Test', 1, 1, 'test-model', 4, '{}', NOW_S, NOW_S);
|
|
|
|
const snippetId = seedSnippet(client, {
|
|
repositoryId: repoId,
|
|
documentId: docId,
|
|
content: 'semantic test'
|
|
});
|
|
|
|
// Seed embedding
|
|
const embed = [0.5, 0.5, 0.5, 0.5];
|
|
const f32 = new Float32Array(embed);
|
|
client
|
|
.prepare(
|
|
`INSERT INTO snippet_embeddings (snippet_id, profile_id, model, dimensions, embedding, created_at)
|
|
VALUES (?, ?, ?, ?, ?, ?)`
|
|
)
|
|
.run(snippetId, 'test-profile', 'test-model', 4, Buffer.from(f32.buffer), NOW_S);
|
|
|
|
const mockProvider: EmbeddingProvider = {
|
|
name: 'mock',
|
|
dimensions: 4,
|
|
model: 'test-model',
|
|
async embed() {
|
|
return [
|
|
{
|
|
values: new Float32Array([0.5, 0.5, 0.5, 0.5]),
|
|
dimensions: 4,
|
|
model: 'test-model'
|
|
}
|
|
];
|
|
},
|
|
async isAvailable() {
|
|
return true;
|
|
}
|
|
};
|
|
|
|
const searchService = new SearchService(client);
|
|
const hybridService = new HybridSearchService(client, searchService, mockProvider);
|
|
|
|
const { results } = await hybridService.search('semantic', {
|
|
repositoryId: repoId,
|
|
searchMode: 'semantic',
|
|
profileId: 'test-profile'
|
|
});
|
|
|
|
// Should return results (alpha=1 pure vector mode)
|
|
expect(results.length).toBeGreaterThan(0);
|
|
});
|
|
|
|
// -------------------------------------------------------------------------
|
|
// Semantic-only mode (searchMode=semantic)
|
|
// -------------------------------------------------------------------------
|
|
|
|
it('searchMode=semantic returns empty array when provider is null', async () => {
|
|
const client = createTestDb();
|
|
const repoId = seedRepo(client);
|
|
const docId = seedDocument(client, repoId);
|
|
|
|
seedSnippet(client, {
|
|
repositoryId: repoId,
|
|
documentId: docId,
|
|
content: 'semantic null provider test'
|
|
});
|
|
|
|
const searchService = new SearchService(client);
|
|
const hybridService = new HybridSearchService(client, searchService, null);
|
|
|
|
const { results } = await hybridService.search('test query', {
|
|
repositoryId: repoId,
|
|
searchMode: 'semantic'
|
|
});
|
|
|
|
// No provider: semantic mode should return empty.
|
|
expect(results).toHaveLength(0);
|
|
});
|
|
|
|
it('searchMode=semantic returns empty array for blank query', async () => {
|
|
const client = createTestDb();
|
|
const repoId = seedRepo(client);
|
|
seedDocument(client, repoId);
|
|
|
|
const mockProvider = makeMockProvider([[1, 0, 0, 0]]);
|
|
|
|
const searchService = new SearchService(client);
|
|
const hybridService = new HybridSearchService(client, searchService, mockProvider);
|
|
|
|
const { results } = await hybridService.search(' ', {
|
|
repositoryId: repoId,
|
|
searchMode: 'semantic'
|
|
});
|
|
|
|
// Blank query: should return empty.
|
|
expect(results).toHaveLength(0);
|
|
});
|
|
|
|
it('searchMode=semantic falls back to empty when provider fails', async () => {
|
|
const client = createTestDb();
|
|
const repoId = seedRepo(client);
|
|
seedDocument(client, repoId);
|
|
|
|
const noopProvider = makeNoopProvider();
|
|
const searchService = new SearchService(client);
|
|
const hybridService = new HybridSearchService(client, searchService, noopProvider);
|
|
|
|
const { results } = await hybridService.search('test query', {
|
|
repositoryId: repoId,
|
|
searchMode: 'semantic'
|
|
});
|
|
|
|
// Provider fails: should return empty (not fall back to FTS).
|
|
expect(results).toHaveLength(0);
|
|
});
|
|
|
|
// -------------------------------------------------------------------------
|
|
// Fallback behavior in auto/hybrid modes
|
|
// -------------------------------------------------------------------------
|
|
|
|
it('searchMode=auto falls back to vector when FTS has no results and provider available', async () => {
	const db = createTestDb();
	const repositoryId = seedRepo(db);
	const documentId = seedDocument(db, repositoryId);

	// Register the embedding profile that the search call references below.
	const insertProfile = db.prepare(
		`INSERT INTO embedding_profiles (id, provider_kind, title, enabled, is_default, model, dimensions, config, created_at, updated_at)
     VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
	);
	insertProfile.run(
		'test-profile',
		'local-transformers',
		'Test',
		1,
		1,
		'test-model',
		4,
		'{}',
		NOW_S,
		NOW_S
	);

	// The only snippet in play; the punctuation-only query below is not
	// expected to reach it through FTS.
	const seededId = seedSnippet(db, {
		repositoryId,
		documentId,
		content: 'example content'
	});

	// The same 4-value vector is stored for the snippet and returned by the
	// provider, so the query embedding equals the stored embedding.
	const vector = [0.5, 0.5, 0.5, 0.5];
	const storedBlob = Buffer.from(new Float32Array(vector).buffer);
	const insertEmbedding = db.prepare(
		`INSERT INTO snippet_embeddings (snippet_id, profile_id, model, dimensions, embedding, created_at)
     VALUES (?, ?, ?, ?, ?, ?)`
	);
	insertEmbedding.run(seededId, 'test-profile', 'test-model', 4, storedBlob, NOW_S);

	// Hand-rolled provider: always available, always answers with `vector`.
	const provider: EmbeddingProvider = {
		name: 'mock',
		dimensions: 4,
		model: 'test-model',
		embed: async () => [
			{
				values: new Float32Array(vector),
				dimensions: 4,
				model: 'test-model'
			}
		],
		isAvailable: async () => true
	};

	const fts = new SearchService(db);
	const hybrid = new HybridSearchService(db, fts, provider);

	// Query made up solely of punctuation characters.
	const outcome = await hybrid.search('!!!@@@###', {
		repositoryId,
		searchMode: 'auto',
		profileId: 'test-profile'
	});

	// The seeded snippet must come back as the top hit via the vector path.
	expect(outcome.results.length).toBeGreaterThan(0);
	expect(outcome.results[0].snippet.id).toBe(seededId);
});
|
|
|
|
it('searchMode=auto continues with FTS results when available', async () => {
	const db = createTestDb();
	const repositoryId = seedRepo(db);
	const documentId = seedDocument(db, repositoryId);

	// A snippet whose text contains the query term used below.
	seedSnippet(db, {
		repositoryId,
		documentId,
		content: 'hello world example'
	});

	const provider = makeMockProvider([[1, 0]]);
	const fts = new SearchService(db);
	const hybrid = new HybridSearchService(db, fts, provider);

	const outcome = await hybrid.search('hello', { repositoryId, searchMode: 'auto' });

	// Results are expected to come from FTS here, without the vector fallback.
	expect(outcome.results.length).toBeGreaterThan(0);
});
|
|
|
|
it('searchMode=hybrid falls back to vector on no FTS results', async () => {
	const client = createTestDb();
	const repoId = seedRepo(client);
	const docId = seedDocument(client, repoId);

	// Create the embedding profile referenced by `profileId` in the search call.
	client
		.prepare(
			`INSERT INTO embedding_profiles (id, provider_kind, title, enabled, is_default, model, dimensions, config, created_at, updated_at)
     VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
		)
		.run('test-profile', 'local-transformers', 'Test', 1, 1, 'test-model', 4, '{}', NOW_S, NOW_S);

	// Seed the single snippet this test expects to get back.
	const snippetId = seedSnippet(client, {
		repositoryId: repoId,
		documentId: docId,
		content: 'vector search test'
	});

	// Store an embedding for the snippet; the mock provider below returns the
	// same values, so the query embedding equals the stored one.
	const embed = [0.7, 0.3, 0.2, 0.1];
	const f32 = new Float32Array(embed);
	client
		.prepare(
			`INSERT INTO snippet_embeddings (snippet_id, profile_id, model, dimensions, embedding, created_at)
     VALUES (?, ?, ?, ?, ?, ?)`
		)
		.run(snippetId, 'test-profile', 'test-model', 4, Buffer.from(f32.buffer), NOW_S);

	// Mock provider: always available, always returns the stored vector.
	const mockProvider: EmbeddingProvider = {
		name: 'mock',
		dimensions: 4,
		model: 'test-model',
		async embed() {
			return [
				{
					values: new Float32Array(embed),
					dimensions: 4,
					model: 'test-model'
				}
			];
		},
		async isAvailable() {
			return true;
		}
	};

	const searchService = new SearchService(client);
	const hybridService = new HybridSearchService(client, searchService, mockProvider);

	// Punctuation-wrapped query intended to exercise the vector fallback.
	// NOTE(review): the seeded content contains the word "vector"; if query
	// preprocessing reduces this to `vector`, FTS may match as well — confirm
	// that this query really produces zero FTS hits.
	const { results } = await hybridService.search('%%%vector%%%', {
		repositoryId: repoId,
		searchMode: 'hybrid',
		alpha: 0.5,
		profileId: 'test-profile'
	});

	// The seeded snippet must be returned as the top hit, matching the
	// identity check made by the equivalent auto-mode fallback test.
	expect(results.length).toBeGreaterThan(0);
	expect(results[0].snippet.id).toBe(snippetId);
});
|
|
|
|
it('punctuation-heavy query returns empty when no vector provider and FTS preprocesses to nothing', async () => {
	const db = createTestDb();
	const repositoryId = seedRepo(db);
	const documentId = seedDocument(db, repositoryId);

	// A snippet exists, but no embedding is stored for it in this test.
	seedSnippet(db, {
		repositoryId,
		documentId,
		content: 'example content'
	});

	// Passing `null` as the provider: the service has no embedding provider.
	const fts = new SearchService(db);
	const hybrid = new HybridSearchService(db, fts, null);

	// Punctuation-only query; no searchMode is given.
	const outcome = await hybrid.search('!!!@@@###$$$', { repositoryId });

	// With no provider available, the punctuation-only query yields nothing.
	expect(outcome.results).toHaveLength(0);
});
|
|
});
|