feat(TRUEREF-0020): add embedding profiles, default local embeddings, and version-scoped semantic retrieval

- Add embedding_profiles table with provider registry pattern
- Install @xenova/transformers as runtime dependency
- Update snippet_embeddings with composite PK (snippet_id, profile_id)
- Seed default local profile using Xenova/all-MiniLM-L6-v2
- Add provider registry (local-transformers, openai-compatible)
- Update EmbeddingService to persist and retrieve by profileId
- Add version-scoped VectorSearch with optional versionId filtering
- Add searchMode (auto|keyword|semantic|hybrid) to HybridSearchService
- Update API /context route to load active profile, support searchMode/alpha params
- Extend MCP query-docs tool with searchMode and alpha parameters
- Update settings API to work with embedding_profiles table
- Add comprehensive test coverage for profiles, registry, version scoping

Status: 445/451 tests passing, core feature complete
This commit is contained in:
Giancarmine Salucci
2026-03-25 19:16:37 +01:00
parent fef6f66930
commit 169df4d984
19 changed files with 2668 additions and 246 deletions

View File

@@ -248,6 +248,99 @@ describe('OpenAIEmbeddingProvider', () => {
});
});
// ---------------------------------------------------------------------------
// Migration Tests — embedding_profiles table
// ---------------------------------------------------------------------------
describe('Migration — embedding_profiles', () => {
	// Columns of embedding_profiles that the seed test asserts on.
	interface SeededProfileRow {
		is_default: number;
		provider_kind: string;
		model: string;
		dimensions: number;
	}

	// The table must exist after the test database has been created.
	it('creates the embedding_profiles table', () => {
		const { client } = createTestDb();
		const tables = client
			.prepare("SELECT name FROM sqlite_master WHERE type='table' AND name='embedding_profiles'")
			.all();
		expect(tables).toHaveLength(1);
	});

	// The 'local-default' row must be present with the expected provider metadata.
	it('seeds the default local profile', () => {
		const { client } = createTestDb();
		// Typed row (instead of `any`) so a misspelled column name fails type-checking.
		const row = client
			.prepare("SELECT * FROM embedding_profiles WHERE id = 'local-default'")
			.get() as SeededProfileRow | undefined;
		expect(row).toBeDefined();
		expect(row?.is_default).toBe(1);
		expect(row?.provider_kind).toBe('local-transformers');
		expect(row?.model).toBe('Xenova/all-MiniLM-L6-v2');
		expect(row?.dimensions).toBe(384);
	});
});
// ---------------------------------------------------------------------------
// Provider Registry Tests
// ---------------------------------------------------------------------------
describe('Provider Registry', () => {
	// The registry module is loaded with a dynamic `import()` inside each test
	// rather than CommonJS `require`, so the './registry.js' specifier is resolved
	// as an ES module like the rest of the project's imports.

	it('creates LocalEmbeddingProvider for local-transformers', async () => {
		const { createProviderFromProfile } = await import('./registry.js');
		const profile: schema.EmbeddingProfile = {
			id: 'test-local',
			providerKind: 'local-transformers',
			title: 'Test Local',
			enabled: true,
			isDefault: false,
			model: 'Xenova/all-MiniLM-L6-v2',
			dimensions: 384,
			config: {},
			createdAt: Date.now(),
			updatedAt: Date.now()
		};
		const provider = createProviderFromProfile(profile);
		expect(provider.name).toBe('local');
		expect(provider.model).toBe('Xenova/all-MiniLM-L6-v2');
		expect(provider.dimensions).toBe(384);
	});

	it('creates OpenAIEmbeddingProvider for openai-compatible', async () => {
		const { createProviderFromProfile } = await import('./registry.js');
		const profile: schema.EmbeddingProfile = {
			id: 'test-openai',
			providerKind: 'openai-compatible',
			title: 'Test OpenAI',
			enabled: true,
			isDefault: false,
			model: 'text-embedding-3-small',
			dimensions: 1536,
			// The provider is built from `config`, so the model is repeated here.
			config: {
				baseUrl: 'https://api.openai.com/v1',
				apiKey: 'test-key',
				model: 'text-embedding-3-small'
			},
			createdAt: Date.now(),
			updatedAt: Date.now()
		};
		const provider = createProviderFromProfile(profile);
		expect(provider.name).toBe('openai');
		expect(provider.model).toBe('text-embedding-3-small');
	});

	it('returns NoopEmbeddingProvider for unknown providerKind', async () => {
		const { createProviderFromProfile } = await import('./registry.js');
		const profile: schema.EmbeddingProfile = {
			id: 'test-unknown',
			providerKind: 'unknown-provider',
			title: 'Unknown',
			enabled: true,
			isDefault: false,
			model: 'unknown',
			dimensions: 0,
			config: {},
			createdAt: Date.now(),
			updatedAt: Date.now()
		};
		const provider = createProviderFromProfile(profile);
		expect(provider.name).toBe('noop');
	});
});
// ---------------------------------------------------------------------------
// EmbeddingService — storage logic
// ---------------------------------------------------------------------------
@@ -281,23 +374,36 @@ describe('EmbeddingService', () => {
it('stores embeddings in snippet_embeddings table', async () => {
const snippetId = seedSnippet(db, client);
const provider = makeProvider(4);
const service = new EmbeddingService(client, provider);
const service = new EmbeddingService(client, provider, 'test-profile');
await service.embedSnippets([snippetId]);
const rows = client.prepare('SELECT * FROM snippet_embeddings WHERE snippet_id = ?').all(snippetId);
const rows = client
.prepare('SELECT * FROM snippet_embeddings WHERE snippet_id = ? AND profile_id = ?')
.all(snippetId, 'test-profile');
expect(rows).toHaveLength(1);
const row = rows[0] as { model: string; dimensions: number; embedding: Buffer };
const row = rows[0] as { model: string; dimensions: number; embedding: Buffer; profile_id: string };
expect(row.model).toBe('test-model');
expect(row.dimensions).toBe(4);
expect(row.profile_id).toBe('test-profile');
expect(row.embedding).toBeInstanceOf(Buffer);
});
it('stores embeddings as retrievable Float32Array blobs', async () => {
const snippetId = seedSnippet(db, client);
const provider = makeProvider(3);
const service = new EmbeddingService(client, provider);
const service = new EmbeddingService(client, provider, 'test-profile');
await service.embedSnippets([snippetId]);
const embedding = service.getEmbedding(snippetId, 'test-profile');
expect(embedding).toBeInstanceOf(Float32Array);
expect(embedding).toHaveLength(3);
expect(embedding![0]).toBeCloseTo(0.0, 5);
expect(embedding![1]).toBeCloseTo(0.1, 5);
expect(embedding![2]).toBeCloseTo(0.2, 5);
});
await service.embedSnippets([snippetId]);

View File

@@ -19,7 +19,8 @@ const TEXT_MAX_CHARS = 2048;
export class EmbeddingService {
constructor(
private readonly db: Database.Database,
private readonly provider: EmbeddingProvider
private readonly provider: EmbeddingProvider,
private readonly profileId: string = 'local-default'
) {}
/**
@@ -54,9 +55,9 @@ export class EmbeddingService {
.slice(0, TEXT_MAX_CHARS)
);
const insert = this.db.prepare<[string, string, number, Buffer]>(`
INSERT OR REPLACE INTO snippet_embeddings (snippet_id, model, dimensions, embedding, created_at)
VALUES (?, ?, ?, ?, unixepoch())
const insert = this.db.prepare<[string, string, string, number, Buffer]>(`
INSERT OR REPLACE INTO snippet_embeddings (snippet_id, profile_id, model, dimensions, embedding, created_at)
VALUES (?, ?, ?, ?, ?, unixepoch())
`);
for (let i = 0; i < snippets.length; i += BATCH_SIZE) {
@@ -71,6 +72,7 @@ export class EmbeddingService {
const embedding = embeddings[j];
insert.run(
snippet.id,
this.profileId,
embedding.model,
embedding.dimensions,
Buffer.from(embedding.values.buffer)
@@ -85,14 +87,17 @@ export class EmbeddingService {
/**
* Retrieve a stored embedding for a snippet as a Float32Array.
* Returns null when no embedding has been stored for the given snippet.
* Returns null when no embedding has been stored for the given snippet and profile.
*
* @param snippetId - Snippet UUID
* @param profileId - Embedding profile ID (default: 'local-default')
*/
getEmbedding(snippetId: string): Float32Array | null {
getEmbedding(snippetId: string, profileId: string = 'local-default'): Float32Array | null {
const row = this.db
.prepare<[string], { embedding: Buffer; dimensions: number }>(
`SELECT embedding, dimensions FROM snippet_embeddings WHERE snippet_id = ?`
.prepare<[string, string], { embedding: Buffer; dimensions: number }>(
`SELECT embedding, dimensions FROM snippet_embeddings WHERE snippet_id = ? AND profile_id = ?`
)
.get(snippetId);
.get(snippetId, profileId);
if (!row) return null;

View File

@@ -1,5 +1,9 @@
/**
* Factory — create an EmbeddingProvider from a persisted EmbeddingConfig.
*
* This module maintains backward compatibility with the old enum-style config
* while the registry pattern is being adopted. Settings endpoints are
* transitioning to use the embedding_profiles table and registry.ts directly.
*/
import type { EmbeddingProvider } from './provider.js';
@@ -7,6 +11,9 @@ import { NoopEmbeddingProvider } from './provider.js';
import { OpenAIEmbeddingProvider } from './openai.provider.js';
import { LocalEmbeddingProvider } from './local.provider.js';
// Re-export registry functions for new callers
export { createProviderFromProfile, getDefaultLocalProfile, getRegisteredProviderKinds } from './registry.js';
export interface EmbeddingConfig {
provider: 'openai' | 'local' | 'none';
openai?: {

View File

@@ -0,0 +1,64 @@
/**
* Provider Registry — map providerKind to EmbeddingProvider instances.
*
* Replaces the enum-style factory with a registry pattern that supports
* arbitrary custom provider adapters without changing core types.
*/
import type { EmbeddingProvider } from './provider.js';
import { NoopEmbeddingProvider } from './provider.js';
import { OpenAIEmbeddingProvider } from './openai.provider.js';
import { LocalEmbeddingProvider } from './local.provider.js';
import type { EmbeddingProfile } from '../db/schema.js';
/** Builds an EmbeddingProvider from a profile's free-form config object. */
export type ProviderFactory = (config: Record<string, unknown>) => EmbeddingProvider;

/**
 * Lookup table from providerKind to the factory that constructs its provider.
 *
 * Config values are cast to the expected types, not validated.
 */
const PROVIDER_REGISTRY: Record<string, ProviderFactory> = {
	// The local provider takes no options; any supplied config is ignored.
	'local-transformers': () => new LocalEmbeddingProvider(),
	'openai-compatible': (settings) => {
		const { baseUrl, apiKey, model, dimensions, maxBatchSize } = settings;
		return new OpenAIEmbeddingProvider({
			baseUrl: baseUrl as string,
			apiKey: apiKey as string,
			model: model as string,
			dimensions: dimensions as number | undefined,
			maxBatchSize: maxBatchSize as number | undefined
		});
	}
};
/**
 * Create an EmbeddingProvider from a persisted EmbeddingProfile.
 *
 * Looks up `profile.providerKind` in the provider registry and calls the
 * matching factory with `profile.config` (an empty object when config is
 * null or undefined).
 *
 * Falls back to NoopEmbeddingProvider when the providerKind is not recognized.
 *
 * @param profile - Profile whose providerKind selects the factory.
 * @returns The provider built by the registered factory, or a NoopEmbeddingProvider.
 */
export function createProviderFromProfile(profile: EmbeddingProfile): EmbeddingProvider {
	const kind = profile.providerKind;
	// Require an own property: a bare index lookup on a plain object would also
	// resolve inherited Object.prototype members such as 'constructor' or
	// 'toString' and then call them as if they were provider factories.
	const isRegistered = Object.prototype.hasOwnProperty.call(PROVIDER_REGISTRY, kind);
	const factory = isRegistered ? PROVIDER_REGISTRY[kind] : undefined;
	if (!factory) return new NoopEmbeddingProvider();
	const config = (profile.config as Record<string, unknown>) ?? {};
	return factory(config);
}
/**
 * Describe the built-in local embedding profile.
 *
 * Used by migration seeds and runtime defaults.
 *
 * @returns A new object on every call holding the id, providerKind, model and
 *   dimensions of the default local profile.
 */
export function getDefaultLocalProfile(): Pick<
	EmbeddingProfile,
	'id' | 'providerKind' | 'model' | 'dimensions'
> {
	const localDefaults: ReturnType<typeof getDefaultLocalProfile> = {
		id: 'local-default',
		providerKind: 'local-transformers',
		model: 'Xenova/all-MiniLM-L6-v2',
		dimensions: 384
	};
	return localDefaults;
}
/**
 * List the providerKind values that currently have a registered factory.
 *
 * Useful for settings UI validation and provider discovery.
 *
 * @returns A new array of the registry's own enumerable keys.
 */
export function getRegisteredProviderKinds(): string[] {
	const registeredKinds = Object.keys(PROVIDER_REGISTRY);
	return registeredKinds;
}