fix(FEEDBACK-0001): complete iteration 0 - harden context search

This commit is contained in:
Giancarmine Salucci
2026-03-27 01:25:46 +01:00
parent e7a2a83cdb
commit 16436bfab2
15 changed files with 1469 additions and 44 deletions

View File

@@ -16,6 +16,7 @@ import { LibrarySearchResult, SnippetSearchResult } from '$lib/server/models/sea
import { Repository } from '$lib/server/models/repository';
import { RepositoryVersion } from '$lib/server/models/repository-version';
import { Snippet } from '$lib/server/models/snippet';
import type { ContextResponseMetadata } from '$lib/server/mappers/context-response.mapper';
// ---------------------------------------------------------------------------
// Helpers
@@ -82,6 +83,25 @@ function makeSnippetResult(snippet: Snippet): SnippetSearchResult {
});
}
/**
 * Build a ContextResponseMetadata fixture for tests.
 *
 * Produces a fully-populated metadata object pointing at the
 * facebook/react repository; individual top-level fields can be
 * replaced via `overrides` (shallow merge).
 */
function makeMetadata(
	overrides: Partial<ContextResponseMetadata> = {}
): ContextResponseMetadata {
	const defaults: ContextResponseMetadata = {
		localSource: false,
		resultCount: 1,
		repository: {
			id: '/facebook/react',
			title: 'React',
			source: 'github',
			sourceUrl: 'https://github.com/facebook/react',
			branch: 'main'
		},
		version: null,
		snippetVersions: {}
	};
	return { ...defaults, ...overrides };
}
// ---------------------------------------------------------------------------
// mapState
// ---------------------------------------------------------------------------
@@ -220,6 +240,46 @@ describe('formatContextJson', () => {
const response = formatContextJson(snippets, []);
expect(response.totalTokens).toBe(0);
});
it('adds repository, version, resultCount, and origin metadata additively', () => {
const snippet = makeSnippet({ versionId: '/facebook/react/v18.3.0' });
const response = formatContextJson(
[makeSnippetResult(snippet)],
[],
makeMetadata({
resultCount: 1,
version: {
requested: 'v18.3.0',
resolved: 'v18.3.0',
id: '/facebook/react/v18.3.0'
},
snippetVersions: {
'/facebook/react/v18.3.0': 'v18.3.0'
}
})
);
expect(response.localSource).toBe(false);
expect(response.resultCount).toBe(1);
expect(response.repository?.id).toBe('/facebook/react');
expect(response.repository?.sourceUrl).toBe('https://github.com/facebook/react');
expect(response.version).toEqual({
requested: 'v18.3.0',
resolved: 'v18.3.0',
id: '/facebook/react/v18.3.0'
});
const resultSnippet = response.snippets[0];
expect(resultSnippet.origin).toEqual({
repositoryId: '/facebook/react',
repositoryTitle: 'React',
source: 'github',
sourceUrl: 'https://github.com/facebook/react',
version: 'v18.3.0',
versionId: '/facebook/react/v18.3.0',
isLocal: false
});
});
});
// ---------------------------------------------------------------------------
@@ -272,8 +332,40 @@ describe('formatContextTxt', () => {
expect(txt).toContain('---');
});
it('returns empty string for empty inputs with no rules', () => {
const txt = formatContextTxt([], []);
expect(txt).toBe('');
it('includes origin lines when metadata is provided', () => {
const snippet = makeSnippet({ versionId: '/facebook/react/v18.3.0' });
const txt = formatContextTxt(
[makeSnippetResult(snippet)],
[],
makeMetadata({
snippetVersions: {
'/facebook/react/v18.3.0': 'v18.3.0'
}
})
);
expect(txt).toContain('Origin: React (/facebook/react) | github | version v18.3.0');
});
it('returns a readable no-results section for empty inputs', () => {
const txt = formatContextTxt(
[],
[],
makeMetadata({
resultCount: 0,
version: {
requested: 'v18.3.0',
resolved: 'v18.3.0',
id: '/facebook/react/v18.3.0'
}
})
);
expect(txt).toContain('## Context Results');
expect(txt).toContain('No matching snippets found');
expect(txt).toContain('Repository: React (/facebook/react)');
expect(txt).toContain('Requested version: v18.3.0');
expect(txt).toContain('Resolved version: v18.3.0');
expect(txt).toContain('Result count: 0');
});
});

View File

@@ -13,6 +13,7 @@
*/
import { ContextResponseMapper } from '$lib/server/mappers/context-response.mapper.js';
import type { ContextResponseMetadata } from '$lib/server/mappers/context-response.mapper.js';
import { LibrarySearchResult, SnippetSearchResult } from '$lib/server/models/search-result.js';
import {
ContextJsonResponseDto,
@@ -77,9 +78,10 @@ export function formatLibrarySearchJson(results: LibrarySearchResult[]): Library
*/
export function formatContextJson(
snippets: SnippetSearchResult[],
rules: string[]
rules: string[],
metadata?: ContextResponseMetadata
): ContextJsonResponseDto {
return ContextResponseMapper.toContextJson(snippets, rules);
return ContextResponseMapper.toContextJson(snippets, rules, metadata);
}
// ---------------------------------------------------------------------------
@@ -92,7 +94,27 @@ export function formatContextJson(
* @param snippets - Ranked snippet search results (already token-budget trimmed).
* @param rules - Rules from `trueref.json` / `repository_configs`.
*/
export function formatContextTxt(snippets: SnippetSearchResult[], rules: string[]): string {
/**
 * Render a one-line provenance string for a snippet, e.g.
 * `Origin: React (/facebook/react) | github | version v18.3.0`.
 *
 * @param result - Snippet search result to describe.
 * @param metadata - Optional response metadata carrying repository/version info.
 * @returns The formatted origin line, or null when no repository metadata is available.
 */
function formatOriginLine(
	result: SnippetSearchResult,
	metadata?: ContextResponseMetadata
): string | null {
	// Without repository metadata there is nothing meaningful to render.
	if (!metadata?.repository) return null;

	const sourceLabel = metadata.localSource ? 'local' : metadata.repository.source;
	const segments = [
		`Origin: ${metadata.repository.title} (${result.snippet.repositoryId})`,
		sourceLabel
	];

	const versionId = result.snippet.versionId;
	if (versionId) {
		// Prefer the human-readable tag; fall back to the raw version id.
		const tag = metadata.snippetVersions[versionId];
		segments.push(`version ${tag ?? versionId}`);
	}

	return segments.join(' | ');
}
export function formatContextTxt(
snippets: SnippetSearchResult[],
rules: string[],
metadata?: ContextResponseMetadata
): string {
const parts: string[] = [];
if (rules.length > 0) {
@@ -100,16 +122,41 @@ export function formatContextTxt(snippets: SnippetSearchResult[], rules: string[
parts.push('---');
}
for (const { snippet } of snippets) {
if (snippets.length === 0) {
const noResults = ['## Context Results', '_No matching snippets found for this request._'];
if (metadata?.repository) {
noResults.push(`Repository: ${metadata.repository.title} (${metadata.repository.id})`);
}
if (metadata?.version?.requested) {
noResults.push(`Requested version: ${metadata.version.requested}`);
}
if (metadata?.version?.resolved) {
noResults.push(`Resolved version: ${metadata.version.resolved}`);
}
noResults.push(`Result count: ${metadata?.resultCount ?? 0}`);
parts.push(noResults.join('\n'));
return parts.join('\n\n');
}
for (const result of snippets) {
const { snippet } = result;
const section: string[] = [];
const originLine = formatOriginLine(result, metadata);
if (snippet.type === 'code') {
if (snippet.title) section.push(`### ${snippet.title}`);
if (snippet.breadcrumb) section.push(`*${snippet.breadcrumb}*`);
if (originLine) section.push(originLine);
section.push(`\`\`\`${snippet.language ?? ''}\n${snippet.content}\n\`\`\``);
} else {
if (snippet.title) section.push(`### ${snippet.title}`);
if (snippet.breadcrumb) section.push(`*${snippet.breadcrumb}*`);
if (originLine) section.push(originLine);
section.push(snippet.content);
}

View File

@@ -39,10 +39,9 @@ describe('selectSnippetsWithinBudget', () => {
it('stops adding when next snippet exceeds the budget', () => {
const snippets = [makeSnippet('a', 100), makeSnippet('b', 500), makeSnippet('c', 200)];
// budget = 550 → a (100) + b (500) = 600 exceeds; only a fits then b would push over
// budget = 550 → a fits, b is skipped, c still fits
const result = selectSnippetsWithinBudget(snippets, 550);
// a(100) fits; a+b=600 > 550, stop
expect(result.map((s) => s.id)).toEqual(['a']);
expect(result.map((s) => s.id)).toEqual(['a', 'c']);
});
it('includes exactly one snippet when it fits the budget precisely', () => {
@@ -51,10 +50,10 @@ describe('selectSnippetsWithinBudget', () => {
expect(result.map((s) => s.id)).toEqual(['a']);
});
it('returns empty array when first snippet already exceeds budget', () => {
it('skips an oversized first snippet and keeps scanning later ones', () => {
const snippets = [makeSnippet('a', 200), makeSnippet('b', 50)];
const result = selectSnippetsWithinBudget(snippets, 100);
expect(result).toHaveLength(0);
expect(result.map((s) => s.id)).toEqual(['b']);
});
it('treats null tokenCount as 0', () => {

View File

@@ -11,8 +11,8 @@ import type { Snippet } from '$lib/types';
* Select snippets from a ranked list up to a maximum token budget.
*
* Snippets are evaluated in order. A snippet is included when its token count
* does not push the running total past `maxTokens`. The loop halts at the
* first snippet that would exceed the budget.
* does not push the running total past `maxTokens`. Oversized snippets are
* skipped so lower-ranked results can still be considered.
*
* @param snippets - Ranked list of snippets (best first).
* @param maxTokens - Inclusive upper bound on total token count.
@@ -24,7 +24,7 @@ export function selectSnippetsWithinBudget(snippets: Snippet[], maxTokens: numbe
let usedTokens = 0;
for (const snippet of snippets) {
if (usedTokens + (snippet.tokenCount ?? 0) > maxTokens) break;
if (usedTokens + (snippet.tokenCount ?? 0) > maxTokens) continue;
selected.push(snippet);
usedTokens += snippet.tokenCount ?? 0;
}

View File

@@ -1,14 +1,35 @@
import {
CodeListItemDto,
CodeSnippetJsonDto,
ContextRepositoryJsonDto,
ContextJsonResponseDto,
ContextVersionJsonDto,
InfoSnippetJsonDto,
LibrarySearchJsonResponseDto,
LibrarySearchJsonResultDto,
SnippetOriginJsonDto,
type SnippetJsonDto
} from '$lib/server/models/context-response.js';
import { LibrarySearchResult, SnippetSearchResult } from '$lib/server/models/search-result.js';
export interface ContextResponseMetadata {
localSource: boolean;
resultCount: number;
repository: {
id: string;
title: string;
source: 'github' | 'local';
sourceUrl: string;
branch: string | null;
} | null;
version: {
requested: string | null;
resolved: string | null;
id: string | null;
} | null;
snippetVersions: Record<string, string>;
}
export class ContextResponseMapper {
static toLibrarySearchJson(results: LibrarySearchResult[]): LibrarySearchJsonResponseDto {
return new LibrarySearchJsonResponseDto(
@@ -35,8 +56,24 @@ export class ContextResponseMapper {
);
}
static toContextJson(snippets: SnippetSearchResult[], rules: string[]): ContextJsonResponseDto {
static toContextJson(
snippets: SnippetSearchResult[],
rules: string[],
metadata?: ContextResponseMetadata
): ContextJsonResponseDto {
const mapped: SnippetJsonDto[] = snippets.map(({ snippet }) => {
const origin = metadata?.repository
? new SnippetOriginJsonDto({
repositoryId: snippet.repositoryId,
repositoryTitle: metadata.repository.title,
source: metadata.repository.source,
sourceUrl: metadata.repository.sourceUrl,
version: snippet.versionId ? metadata.snippetVersions[snippet.versionId] ?? null : null,
versionId: snippet.versionId,
isLocal: metadata.localSource
})
: null;
if (snippet.type === 'code') {
return new CodeSnippetJsonDto({
title: snippet.title ?? null,
@@ -50,7 +87,8 @@ export class ContextResponseMapper {
],
id: snippet.id,
tokenCount: snippet.tokenCount ?? null,
pageTitle: snippet.breadcrumb ? snippet.breadcrumb.split('>')[0].trim() || null : null
pageTitle: snippet.breadcrumb ? snippet.breadcrumb.split('>')[0].trim() || null : null,
origin
});
}
@@ -58,14 +96,34 @@ export class ContextResponseMapper {
text: snippet.content,
breadcrumb: snippet.breadcrumb ?? null,
pageId: snippet.id,
tokenCount: snippet.tokenCount ?? null
tokenCount: snippet.tokenCount ?? null,
origin
});
});
return new ContextJsonResponseDto({
snippets: mapped,
rules,
totalTokens: snippets.reduce((sum, result) => sum + (result.snippet.tokenCount ?? 0), 0)
totalTokens: snippets.reduce((sum, result) => sum + (result.snippet.tokenCount ?? 0), 0),
localSource: metadata?.localSource ?? false,
repository: metadata?.repository
? new ContextRepositoryJsonDto({
id: metadata.repository.id,
title: metadata.repository.title,
source: metadata.repository.source,
sourceUrl: metadata.repository.sourceUrl,
branch: metadata.repository.branch,
isLocal: metadata.localSource
})
: null,
version: metadata?.version
? new ContextVersionJsonDto({
requested: metadata.version.requested,
resolved: metadata.version.resolved,
id: metadata.version.id
})
: null,
resultCount: metadata?.resultCount ?? snippets.length
});
}
}

View File

@@ -1,3 +1,78 @@
/** Constructor props for {@link ContextRepositoryJsonDto}. */
export interface ContextRepositoryJsonDtoProps {
	id: string;
	title: string;
	source: 'github' | 'local';
	sourceUrl: string;
	branch: string | null;
	isLocal: boolean;
}

/** Repository-level provenance attached to a JSON context response. */
export class ContextRepositoryJsonDto {
	id: string;
	title: string;
	source: 'github' | 'local';
	sourceUrl: string;
	branch: string | null;
	isLocal: boolean;

	constructor({ id, title, source, sourceUrl, branch, isLocal }: ContextRepositoryJsonDtoProps) {
		this.id = id;
		this.title = title;
		this.source = source;
		this.sourceUrl = sourceUrl;
		this.branch = branch;
		this.isLocal = isLocal;
	}
}
/** Constructor props for {@link ContextVersionJsonDto}. */
export interface ContextVersionJsonDtoProps {
	requested: string | null;
	resolved: string | null;
	id: string | null;
}

/** Requested vs. resolved version info attached to a JSON context response. */
export class ContextVersionJsonDto {
	requested: string | null;
	resolved: string | null;
	id: string | null;

	constructor({ requested, resolved, id }: ContextVersionJsonDtoProps) {
		this.requested = requested;
		this.resolved = resolved;
		this.id = id;
	}
}
/** Constructor props for {@link SnippetOriginJsonDto}. */
export interface SnippetOriginJsonDtoProps {
	repositoryId: string;
	repositoryTitle: string;
	source: 'github' | 'local';
	sourceUrl: string;
	version: string | null;
	versionId: string | null;
	isLocal: boolean;
}

/** Per-snippet provenance (repository, source, version) in a JSON context response. */
export class SnippetOriginJsonDto {
	repositoryId: string;
	repositoryTitle: string;
	source: 'github' | 'local';
	sourceUrl: string;
	version: string | null;
	versionId: string | null;
	isLocal: boolean;

	constructor({
		repositoryId,
		repositoryTitle,
		source,
		sourceUrl,
		version,
		versionId,
		isLocal
	}: SnippetOriginJsonDtoProps) {
		this.repositoryId = repositoryId;
		this.repositoryTitle = repositoryTitle;
		this.source = source;
		this.sourceUrl = sourceUrl;
		this.version = version;
		this.versionId = versionId;
		this.isLocal = isLocal;
	}
}
export class LibrarySearchJsonResultDto {
id: string;
title: string;
@@ -57,6 +132,7 @@ export class CodeSnippetJsonDto {
id: string;
tokenCount: number | null;
pageTitle: string | null;
origin: SnippetOriginJsonDto | null;
constructor(props: Omit<CodeSnippetJsonDto, 'type'>) {
this.title = props.title;
@@ -66,6 +142,7 @@ export class CodeSnippetJsonDto {
this.id = props.id;
this.tokenCount = props.tokenCount;
this.pageTitle = props.pageTitle;
this.origin = props.origin;
}
}
@@ -75,12 +152,14 @@ export class InfoSnippetJsonDto {
breadcrumb: string | null;
pageId: string;
tokenCount: number | null;
origin: SnippetOriginJsonDto | null;
constructor(props: Omit<InfoSnippetJsonDto, 'type'>) {
this.text = props.text;
this.breadcrumb = props.breadcrumb;
this.pageId = props.pageId;
this.tokenCount = props.tokenCount;
this.origin = props.origin;
}
}
@@ -90,10 +169,18 @@ export class ContextJsonResponseDto {
snippets: SnippetJsonDto[];
rules: string[];
totalTokens: number;
localSource: boolean;
repository: ContextRepositoryJsonDto | null;
version: ContextVersionJsonDto | null;
resultCount: number;
constructor(props: ContextJsonResponseDto) {
this.snippets = props.snippets;
this.rules = props.rules;
this.totalTokens = props.totalTokens;
this.localSource = props.localSource;
this.repository = props.repository;
this.version = props.version;
this.resultCount = props.resultCount;
}
}

View File

@@ -818,4 +818,246 @@ describe('HybridSearchService', () => {
// Should return results (alpha=1 pure vector mode)
expect(results.length).toBeGreaterThan(0);
});
// -------------------------------------------------------------------------
// Semantic-only mode (searchMode=semantic)
// -------------------------------------------------------------------------
it('searchMode=semantic returns empty array when provider is null', async () => {
const client = createTestDb();
const repoId = seedRepo(client);
const docId = seedDocument(client, repoId);
seedSnippet(client, {
repositoryId: repoId,
documentId: docId,
content: 'semantic null provider test'
});
const searchService = new SearchService(client);
const hybridService = new HybridSearchService(client, searchService, null);
const results = await hybridService.search('test query', {
repositoryId: repoId,
searchMode: 'semantic'
});
// No provider: semantic mode should return empty.
expect(results).toHaveLength(0);
});
it('searchMode=semantic returns empty array for blank query', async () => {
const client = createTestDb();
const repoId = seedRepo(client);
const docId = seedDocument(client, repoId);
const mockProvider = makeMockProvider([[1, 0, 0, 0]]);
const searchService = new SearchService(client);
const hybridService = new HybridSearchService(client, searchService, mockProvider);
const results = await hybridService.search(' ', {
repositoryId: repoId,
searchMode: 'semantic'
});
// Blank query: should return empty.
expect(results).toHaveLength(0);
});
it('searchMode=semantic falls back to empty when provider fails', async () => {
const client = createTestDb();
const repoId = seedRepo(client);
const docId = seedDocument(client, repoId);
const noopProvider = makeNoopProvider();
const searchService = new SearchService(client);
const hybridService = new HybridSearchService(client, searchService, noopProvider);
const results = await hybridService.search('test query', {
repositoryId: repoId,
searchMode: 'semantic'
});
// Provider fails: should return empty (not fall back to FTS).
expect(results).toHaveLength(0);
});
// -------------------------------------------------------------------------
// Fallback behavior in auto/hybrid modes
// -------------------------------------------------------------------------
it('searchMode=auto falls back to vector when FTS has no results and provider available', async () => {
const client = createTestDb();
const repoId = seedRepo(client);
const docId = seedDocument(client, repoId);
// Create profile
client
.prepare(
`INSERT INTO embedding_profiles (id, provider_kind, title, enabled, is_default, model, dimensions, config, created_at, updated_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
)
.run('test-profile', 'local-transformers', 'Test', 1, 1, 'test-model', 4, '{}', NOW_S, NOW_S);
// Seed a snippet that won't match punctuation-heavy query through FTS.
const snippetId = seedSnippet(client, {
repositoryId: repoId,
documentId: docId,
content: 'example content'
});
// Seed embedding for the snippet.
const embed = [0.5, 0.5, 0.5, 0.5];
const f32 = new Float32Array(embed);
client
.prepare(
`INSERT INTO snippet_embeddings (snippet_id, profile_id, model, dimensions, embedding, created_at)
VALUES (?, ?, ?, ?, ?, ?)`
)
.run(snippetId, 'test-profile', 'test-model', 4, Buffer.from(f32.buffer), NOW_S);
// Mock provider that always returns a matching embedding.
const mockProvider: EmbeddingProvider = {
name: 'mock',
dimensions: 4,
model: 'test-model',
async embed() {
return [
{
values: new Float32Array([0.5, 0.5, 0.5, 0.5]),
dimensions: 4,
model: 'test-model'
}
];
},
async isAvailable() {
return true;
}
};
const searchService = new SearchService(client);
const hybridService = new HybridSearchService(client, searchService, mockProvider);
// Query with heavy punctuation that preprocesses to nothing.
const results = await hybridService.search('!!!@@@###', {
repositoryId: repoId,
searchMode: 'auto',
profileId: 'test-profile'
});
// Should have fallen back to vector search and found the snippet.
expect(results.length).toBeGreaterThan(0);
expect(results[0].snippet.id).toBe(snippetId);
});
it('searchMode=auto continues with FTS results when available', async () => {
const client = createTestDb();
const repoId = seedRepo(client);
const docId = seedDocument(client, repoId);
// Seed FTS-matchable snippet.
seedSnippet(client, {
repositoryId: repoId,
documentId: docId,
content: 'hello world example'
});
const mockProvider = makeMockProvider([[1, 0]]);
const searchService = new SearchService(client);
const hybridService = new HybridSearchService(client, searchService, mockProvider);
const results = await hybridService.search('hello', {
repositoryId: repoId,
searchMode: 'auto'
});
// Should find results through FTS (not fallback to vector).
expect(results.length).toBeGreaterThan(0);
});
it('searchMode=hybrid falls back to vector on no FTS results', async () => {
const client = createTestDb();
const repoId = seedRepo(client);
const docId = seedDocument(client, repoId);
// Create profile
client
.prepare(
`INSERT INTO embedding_profiles (id, provider_kind, title, enabled, is_default, model, dimensions, config, created_at, updated_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
)
.run('test-profile', 'local-transformers', 'Test', 1, 1, 'test-model', 4, '{}', NOW_S, NOW_S);
// Seed snippet with vector embedding only.
const snippetId = seedSnippet(client, {
repositoryId: repoId,
documentId: docId,
content: 'vector search test'
});
const embed = [0.7, 0.3, 0.2, 0.1];
const f32 = new Float32Array(embed);
client
.prepare(
`INSERT INTO snippet_embeddings (snippet_id, profile_id, model, dimensions, embedding, created_at)
VALUES (?, ?, ?, ?, ?, ?)`
)
.run(snippetId, 'test-profile', 'test-model', 4, Buffer.from(f32.buffer), NOW_S);
const mockProvider: EmbeddingProvider = {
name: 'mock',
dimensions: 4,
model: 'test-model',
async embed() {
return [
{
values: new Float32Array([0.7, 0.3, 0.2, 0.1]),
dimensions: 4,
model: 'test-model'
}
];
},
async isAvailable() {
return true;
}
};
const searchService = new SearchService(client);
const hybridService = new HybridSearchService(client, searchService, mockProvider);
// Query that won't match through FTS after punctuation normalization.
const results = await hybridService.search('%%%vector%%%', {
repositoryId: repoId,
searchMode: 'hybrid',
alpha: 0.5,
profileId: 'test-profile'
});
// Should fall back to vector and find the snippet.
expect(results.length).toBeGreaterThan(0);
});
it('punctuation-heavy query returns empty when no vector provider and FTS preprocesses to nothing', async () => {
const client = createTestDb();
const repoId = seedRepo(client);
const docId = seedDocument(client, repoId);
// No embeddings or provider.
seedSnippet(client, {
repositoryId: repoId,
documentId: docId,
content: 'example content'
});
const searchService = new SearchService(client);
const hybridService = new HybridSearchService(client, searchService, null);
const results = await hybridService.search('!!!@@@###$$$', {
repositoryId: repoId
});
// No provider and FTS preprocesses to empty: should return empty.
expect(results).toHaveLength(0);
});
});

View File

@@ -88,8 +88,16 @@ export class HybridSearchService {
/**
* Execute a hybrid search combining FTS5 and (optionally) vector search.
*
* When `embeddingProvider` is null or `alpha` is 0, the method returns
* FTS5 results directly without embedding the query.
* Search modes:
* - 'keyword' : FTS5-only (alpha = 0)
* - 'semantic' : Vector-only (alpha = 1), skips FTS entirely
* - 'hybrid' : Balanced RRF fusion (alpha = 0.5 by default)
 * - 'auto'     : FTS first; when FTS yields no results (e.g. punctuation-heavy
 *                queries that preprocess to nothing) and an embedding provider
 *                is available, falls back to vector search.
*
* When embeddingProvider is null or alpha is 0, the method returns FTS5 results
* directly without embedding the query.
*
* @param query - Raw search string (preprocessing handled by SearchService).
* @param options - Search parameters including repositoryId and alpha blend.
@@ -119,7 +127,30 @@ export class HybridSearchService {
alpha = options.alpha ?? 0.5;
}
// Always run FTS5 — it is synchronous and fast.
// Semantic mode: skip FTS entirely and use vector search only.
if (mode === 'semantic') {
if (!this.embeddingProvider || !query.trim()) {
return [];
}
const embeddings = await this.embeddingProvider.embed([query]);
if (embeddings.length === 0) {
return [];
}
const queryEmbedding = embeddings[0].values;
const vectorResults = this.vectorSearch.vectorSearch(queryEmbedding, {
repositoryId: options.repositoryId,
versionId: options.versionId,
profileId: options.profileId,
limit
});
const topIds = vectorResults.slice(0, limit).map((r) => r.snippetId);
return this.fetchSnippetsByIds(topIds, options.repositoryId, options.type);
}
// FTS5 mode (keyword) or hybrid/auto modes: try FTS first.
const ftsResults = this.searchService.searchSnippets(query, {
repositoryId: options.repositoryId,
versionId: options.versionId,
@@ -132,10 +163,40 @@ export class HybridSearchService {
return ftsResults.slice(0, limit);
}
// Embed query and run vector search.
// For auto/hybrid modes: if FTS yielded results, use them; otherwise try vector.
// This handles punctuation-heavy queries that normalize to empty after preprocessing.
const hasFtsResults = ftsResults.length > 0;
if (!hasFtsResults) {
// No FTS results: try vector search as a fallback in auto/hybrid modes.
if (!query.trim()) {
// Query is empty; no point embedding it.
return [];
}
const embeddings = await this.embeddingProvider.embed([query]);
// If provider fails (Noop returns empty array), we're done.
if (embeddings.length === 0) {
return [];
}
const queryEmbedding = embeddings[0].values;
const vectorResults = this.vectorSearch.vectorSearch(queryEmbedding, {
repositoryId: options.repositoryId,
versionId: options.versionId,
profileId: options.profileId,
limit
});
const topIds = vectorResults.slice(0, limit).map((r) => r.snippetId);
return this.fetchSnippetsByIds(topIds, options.repositoryId, options.type);
}
// FTS has results: use RRF to blend with vector search (if alpha < 1).
const embeddings = await this.embeddingProvider.embed([query]);
// Provider may be a Noop (returns empty array) — fall back gracefully.
// Provider may be a Noop (returns empty array) — fall back to FTS gracefully.
if (embeddings.length === 0) {
return ftsResults.slice(0, limit);
}

View File

@@ -2,7 +2,8 @@
* Query preprocessor for FTS5 search queries.
*
* Normalizes raw user input into an FTS5-compatible MATCH expression
* with prefix wildcard expansion on the last token.
* with prefix wildcard expansion on the last token. Handles punctuation-heavy
* and code-like queries by extracting searchable alphanumeric/underscore terms.
*/
/**
@@ -10,25 +11,104 @@
*
* Steps:
* 1. Trim and normalize internal whitespace.
* 2. Strip FTS5 grouping characters `(` and `)` that would cause parse errors.
* 3. Append a prefix wildcard `*` to the last token when it is >= 3 characters
* and does not already end with `*`. This gives a "typing as you go" feel.
* 2. Preserve FTS5 operators (AND, OR, NOT) and extract alphanumeric/underscore terms.
* 3. Strip punctuation that breaks FTS5 parsing (parentheses, brackets, special chars).
* 4. Preserve searchable code-like patterns (snake_case, dot notation parts, etc.).
* 5. Return empty string if no searchable terms remain; otherwise, append a prefix
* wildcard `*` to the last token when it is >= 3 characters and does not already
* end with `*`.
*/
export function preprocessQuery(raw: string): string {
// 1. Trim and collapse whitespace.
let q = raw.trim().replace(/\s+/g, ' ');
// 2. Remove parentheses (not valid in simple FTS5 queries without explicit operators).
q = q.replace(/[()]/g, ' ').replace(/\s+/g, ' ').trim();
if (!q) return q;
// 3. Add prefix wildcard to the last token.
const tokens = q.split(' ');
const lastToken = tokens.at(-1) ?? '';
if (lastToken.length >= 3 && !lastToken.endsWith('*')) {
tokens[tokens.length - 1] = lastToken + '*';
// 2. Split into tokens while preserving FTS operators and extracting searchable terms.
const tokens = q.split(/\s+/);
const processedTokens: string[] = [];
for (const token of tokens) {
// Preserve FTS operators as-is.
if (['AND', 'OR', 'NOT'].includes(token)) {
processedTokens.push(token);
continue;
}
// Extract searchable terms from the token:
// - Keep alphanumeric sequences and underscores
// - Skip pure punctuation
// - Handle code-like patterns (foo_bar, foo.bar.baz, etc.)
const searchableTerms: string[] = [];
// Replace common separators with spaces, then split.
const sanitized = token
.replace(/[()[\]{}]/g, ' ') // Remove grouping characters
.replace(/[;:,!?]/g, ' ') // Remove punctuation that breaks FTS
.replace(/[<>|]/g, ' ') // Remove comparison/pipe chars
.replace(/[\-+*/%]/g, ' ') // Remove operators (but keep underscores)
.replace(/[@#$&^\\~\`]/g, ' '); // Remove special chars
// Split on remaining punctuation (like dots and slashes) but preserve alphanumeric/underscore.
const parts = sanitized.split(/[./\s]+/).filter(Boolean);
for (const part of parts) {
// Keep parts that contain at least one alphanumeric character.
if (/[a-zA-Z0-9_]/.test(part)) {
// Remove leading/trailing non-alphanumeric/underscore characters
const cleaned = part.replace(/^[^a-zA-Z0-9_]+|[^a-zA-Z0-9_]+$/g, '');
if (cleaned) {
searchableTerms.push(cleaned);
}
}
}
// Add unique searchable terms (avoid duplicates from same token).
for (const term of searchableTerms) {
if (!processedTokens.includes(term)) {
processedTokens.push(term);
}
}
}
return tokens.join(' ');
// 3. Separate operators from searchable terms.
const searchableTerms = processedTokens.filter((t) => !['AND', 'OR', 'NOT'].includes(t));
if (searchableTerms.length === 0) return '';
// 4. Reconstruct final tokens keeping operators between searchable terms.
const finalTokens: string[] = [];
for (const token of processedTokens) {
// Keep operators only if we have searchable terms
if (['AND', 'OR', 'NOT'].includes(token)) {
// Only keep if surrounded by searchable terms or at the boundary
if (finalTokens.length > 0) {
finalTokens.push(token);
}
} else {
finalTokens.push(token);
}
}
// Remove trailing operators
while (finalTokens.length > 0 && ['AND', 'OR', 'NOT'].includes(finalTokens[finalTokens.length - 1])) {
finalTokens.pop();
}
if (finalTokens.length === 0) return '';
// 5. Add prefix wildcard to the last non-operator token.
let lastIdx = finalTokens.length - 1;
while (lastIdx >= 0 && ['AND', 'OR', 'NOT'].includes(finalTokens[lastIdx])) {
lastIdx--;
}
if (lastIdx >= 0) {
const lastToken = finalTokens[lastIdx];
if (lastToken.length >= 3 && !lastToken.endsWith('*')) {
finalTokens[lastIdx] = lastToken + '*';
}
}
return finalTokens.join(' ');
}

View File

@@ -177,6 +177,85 @@ describe('preprocessQuery', () => {
it('handles single short token without wildcard', () => {
expect(preprocessQuery('ab')).toBe('ab');
});
// Punctuation-heavy and code-like queries
it('normalizes code-like queries with slashes', () => {
  // Slash-separated paths should be broken into individual searchable terms.
  const normalized = preprocessQuery('foo/bar/baz');
  for (const term of ['foo', 'bar', 'baz']) {
    expect(normalized).toContain(term);
  }
});
it('extracts terms from dot-notation queries', () => {
  // Dotted identifiers should yield each component as a searchable part.
  const normalized = preprocessQuery('object.method.name');
  for (const term of ['object', 'method', 'name']) {
    expect(normalized).toContain(term);
  }
});
it('handles snake_case identifiers', () => {
  // Underscored identifiers must survive preprocessing intact.
  expect(preprocessQuery('my_function_name')).toContain('my_function_name');
});
it('removes punctuation from parenthesized expressions', () => {
  // "(hello world)" → "hello world*"
  const normalized = preprocessQuery('(hello world)');
  for (const term of ['hello', 'world']) {
    expect(normalized).toContain(term);
  }
});
it('handles bracket-enclosed content', () => {
  // "[foo bar]" → "foo bar*"
  const normalized = preprocessQuery('[foo bar]');
  for (const term of ['foo', 'bar']) {
    expect(normalized).toContain(term);
  }
});
it('returns empty string for pure punctuation', () => {
  expect(preprocessQuery('!@#$%^&*()')).toBe('');
});
it('returns empty string for punctuation with operators only', () => {
  expect(preprocessQuery('!!! AND *** OR ((()))')).toBe('');
});
it('normalizes C++ style template syntax', () => {
  // "vector<int>" → "vector int*"
  const normalized = preprocessQuery('vector<int>');
  for (const term of ['vector', 'int']) {
    expect(normalized).toContain(term);
  }
});
it('handles colons and semicolons in code snippets', () => {
  // URLs and call syntax should decompose into their word components.
  const normalized = preprocessQuery('http://example.com; function()');
  for (const term of ['http', 'example', 'com', 'function']) {
    expect(normalized).toContain(term);
  }
});
it('normalizes arithmetic operators', () => {
  // "a + b * c" → terms only; the operators themselves are dropped.
  const normalized = preprocessQuery('a + b * c');
  // Filter out FTS boolean keywords before counting surviving terms.
  const plainTerms = normalized
    .split(/\s+/)
    .filter((token) => !['AND', 'OR', 'NOT'].includes(token));
  expect(plainTerms.length).toBeGreaterThan(0);
});
it('returns single searchable term with wildcard when >=3 chars', () => {
  expect(preprocessQuery('!!!hello!!!')).toBe('hello*');
});
it('returns single short term without wildcard', () => {
  expect(preprocessQuery('!!!ab!!!')).toBe('ab');
});
});
// ---------------------------------------------------------------------------

View File

@@ -2,6 +2,7 @@ import { beforeEach, describe, expect, it, vi } from 'vitest';
import Database from 'better-sqlite3';
import { readFileSync } from 'node:fs';
import { join } from 'node:path';
import crypto from 'node:crypto';
import { RepositoryService } from '$lib/server/services/repository.service';
import { VersionService } from '$lib/server/services/version.service';
@@ -24,21 +25,34 @@ vi.mock('$lib/server/pipeline/startup.js', () => ({
getQueue: () => queue
}));
// Disable embedding providers so context search exercises the FTS5-only
// fallback path. Both the extensionless and the '.js' module specifiers are
// mocked — presumably different importers resolve the registry either way;
// TODO confirm both specifiers are still in use.
vi.mock('$lib/server/embeddings/registry', () => ({
createProviderFromProfile: () => null
}));
vi.mock('$lib/server/embeddings/registry.js', () => ({
createProviderFromProfile: () => null
}));
import { POST as postLibraries } from './libs/+server.js';
import { GET as getLibrary } from './libs/[id]/+server.js';
import { GET as getJobs } from './jobs/+server.js';
import { GET as getJob } from './jobs/[id]/+server.js';
import { GET as getVersions, POST as postVersions } from './libs/[id]/versions/+server.js';
import { GET as getContext } from './context/+server.js';
// Single unix-seconds timestamp shared by every created_at/updated_at/indexed_at
// value seeded in this file, so rows are mutually consistent.
const NOW_S = Math.floor(Date.now() / 1000);
function createTestDb(): Database.Database {
const client = new Database(':memory:');
client.pragma('foreign_keys = ON');
const migrationsFolder = join(import.meta.dirname, '../../../lib/server/db/migrations');
const ftsFile = join(import.meta.dirname, '../../../lib/server/db/fts.sql');
// Apply all migration files in order
const migration0 = readFileSync(join(migrationsFolder, '0000_large_master_chief.sql'), 'utf-8');
const migration1 = readFileSync(join(migrationsFolder, '0001_quick_nighthawk.sql'), 'utf-8');
const migration2 = readFileSync(join(migrationsFolder, '0002_silky_stellaris.sql'), 'utf-8');
// Apply first migration
const statements0 = migration0
@@ -60,9 +74,126 @@ function createTestDb(): Database.Database {
client.exec(statement);
}
const statements2 = migration2
.split('--> statement-breakpoint')
.map((statement) => statement.trim())
.filter(Boolean);
for (const statement of statements2) {
client.exec(statement);
}
client.exec(readFileSync(ftsFile, 'utf-8'));
return client;
}
/**
 * Inserts a repository row with React-flavored defaults and returns its id.
 * Individual columns can be overridden per test via `overrides`.
 */
function seedRepo(
  client: Database.Database,
  overrides: {
    id?: string;
    title?: string;
    source?: 'github' | 'local';
    sourceUrl?: string;
    state?: 'pending' | 'indexing' | 'indexed' | 'error';
  } = {}
): string {
  // Resolve every column up front so the insert below is a plain value list.
  const row = {
    id: overrides.id ?? '/facebook/react',
    title: overrides.title ?? 'React',
    source: overrides.source ?? 'github',
    sourceUrl: overrides.sourceUrl ?? 'https://github.com/facebook/react',
    state: overrides.state ?? 'indexed'
  };
  const insert = client.prepare(
    `INSERT INTO repositories
      (id, title, source, source_url, state, created_at, updated_at)
      VALUES (?, ?, ?, ?, ?, ?, ?)`
  );
  insert.run(row.id, row.title, row.source, row.sourceUrl, row.state, NOW_S, NOW_S);
  return row.id;
}
/**
 * Inserts an already-'indexed' repository_versions row for `tag` and returns
 * its id (convention: `<repositoryId>/<tag>`).
 */
function seedVersion(client: Database.Database, repositoryId: string, tag: string): string {
  const id = `${repositoryId}/${tag}`;
  const insert = client.prepare(
    `INSERT INTO repository_versions
      (id, repository_id, tag, state, total_snippets, indexed_at, created_at)
      VALUES (?, ?, ?, 'indexed', 0, ?, ?)`
  );
  insert.run(id, repositoryId, tag, NOW_S, NOW_S);
  return id;
}
/**
 * Inserts a README.md document row, optionally pinned to a version, and
 * returns its freshly generated id.
 */
function seedDocument(
  client: Database.Database,
  repositoryId: string,
  versionId: string | null = null
): string {
  const id = crypto.randomUUID();
  const insert = client.prepare(
    `INSERT INTO documents (id, repository_id, version_id, file_path, checksum, indexed_at)
      VALUES (?, ?, ?, ?, ?, ?)`
  );
  insert.run(id, repositoryId, versionId, 'README.md', 'checksum', NOW_S);
  return id;
}
/**
 * Inserts a snippet row and returns its generated id. Only `documentId`,
 * `repositoryId`, and `content` are required; every other column falls back
 * to a neutral default (no version, 'info' type, zero tokens).
 */
function seedSnippet(
  client: Database.Database,
  options: {
    documentId: string;
    repositoryId: string;
    versionId?: string | null;
    type?: 'code' | 'info';
    title?: string | null;
    content: string;
    language?: string | null;
    breadcrumb?: string | null;
    tokenCount?: number;
  }
): string {
  const snippetId = crypto.randomUUID();
  // Destructure with defaults so the insert reads as a straight column list.
  const {
    documentId,
    repositoryId,
    versionId = null,
    type = 'info',
    title = null,
    content,
    language = null,
    breadcrumb = null,
    tokenCount = 0
  } = options;
  const insert = client.prepare(
    `INSERT INTO snippets
      (id, document_id, repository_id, version_id, type, title, content, language, breadcrumb, token_count, created_at)
      VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
  );
  insert.run(
    snippetId,
    documentId,
    repositoryId,
    versionId,
    type,
    title,
    content,
    language,
    breadcrumb,
    tokenCount,
    NOW_S
  );
  return snippetId;
}
/** Stores the per-repository rules array as JSON in repository_configs. */
function seedRules(client: Database.Database, repositoryId: string, rules: string[]) {
  const insert = client.prepare(
    `INSERT INTO repository_configs (repository_id, rules, updated_at)
      VALUES (?, ?, ?)`
  );
  insert.run(repositoryId, JSON.stringify(rules), NOW_S);
}
describe('API contract integration', () => {
beforeEach(() => {
db = createTestDb();
@@ -174,4 +305,78 @@ describe('API contract integration', () => {
expect(getBody.versions[0]).not.toHaveProperty('repository_id');
expect(getBody.versions[0]).not.toHaveProperty('total_snippets');
});
it('GET /api/v1/context returns informative txt output for empty results', async () => {
  const repositoryId = seedRepo(db);

  const target = new URL(
    `http://test/api/v1/context?libraryId=${encodeURIComponent(repositoryId)}&query=${encodeURIComponent('no matches here')}&type=txt`
  );
  const response = await getContext({ url: target } as never);

  // txt mode must answer 200 with a plain-text body even when nothing matched.
  expect(response.status).toBe(200);
  expect(response.headers.get('content-type')).toContain('text/plain');

  const body = await response.text();
  const expectedFragments = [
    '## Context Results',
    'No matching snippets found',
    'Repository: React (/facebook/react)',
    'Result count: 0'
  ];
  for (const fragment of expectedFragments) {
    expect(body).toContain(fragment);
  }
});
it('GET /api/v1/context returns additive repository and version metadata for versioned results', async () => {
  // Arrange: one indexed repo with a tagged version, a document, rules,
  // and a single matching code snippet pinned to that version.
  const repositoryId = seedRepo(db);
  const versionId = seedVersion(db, repositoryId, 'v18.3.0');
  const documentId = seedDocument(db, repositoryId, versionId);
  seedRules(db, repositoryId, ['Prefer hooks over classes']);
  seedSnippet(db, {
    documentId,
    repositoryId,
    versionId,
    type: 'code',
    title: 'useThing',
    content: 'export function useThing() { return true; }',
    language: 'ts',
    breadcrumb: 'Hooks > useThing',
    tokenCount: 42
  });

  const target = new URL(
    `http://test/api/v1/context?libraryId=${encodeURIComponent(`${repositoryId}/v18.3.0`)}&query=${encodeURIComponent('useThing')}`
  );
  const response = await getContext({ url: target } as never);
  expect(response.status).toBe(200);

  const body = await response.json();
  // Pre-existing contract fields stay intact…
  expect(body.snippets).toHaveLength(1);
  expect(body.rules).toEqual(['Prefer hooks over classes']);
  expect(body.totalTokens).toBe(42);
  // …while the additive fields describe source, repository, version, and origin.
  expect(body.localSource).toBe(false);
  expect(body.resultCount).toBe(1);
  const expectedRepository = {
    id: '/facebook/react',
    title: 'React',
    source: 'github',
    sourceUrl: 'https://github.com/facebook/react',
    branch: 'main',
    isLocal: false
  };
  expect(body.repository).toEqual(expectedRepository);
  expect(body.version).toEqual({
    requested: 'v18.3.0',
    resolved: 'v18.3.0',
    id: '/facebook/react/v18.3.0'
  });
  expect(body.snippets[0].origin).toEqual({
    repositoryId: '/facebook/react',
    repositoryTitle: 'React',
    source: 'github',
    sourceUrl: 'https://github.com/facebook/react',
    version: 'v18.3.0',
    versionId: '/facebook/react/v18.3.0',
    isLocal: false
  });
});
});

View File

@@ -25,6 +25,7 @@ import {
formatContextTxt,
CORS_HEADERS
} from '$lib/server/api/formatters';
import type { ContextResponseMetadata } from '$lib/server/mappers/context-response.mapper';
// ---------------------------------------------------------------------------
// Helpers
@@ -67,7 +68,32 @@ function getRules(db: ReturnType<typeof getClient>, repositoryId: string): strin
// Raw row shape returned by the repositories lookup in this handler.
// Field names mirror the snake_case DB columns (hence `source_url`).
interface RawRepoState {
  state: 'pending' | 'indexing' | 'indexed' | 'error';
  id: string;
  title: string;
  source: 'github' | 'local';
  source_url: string;
  // NOTE(review): nullable in the schema — presumably absent for local
  // sources without a default branch; confirm against the migrations.
  branch: string | null;
}
// Raw repository_versions row: the version's primary key plus its
// human-readable tag (e.g. 'v18.3.0').
interface RawVersionRow {
  id: string;
  tag: string;
}
/**
 * Resolves repository_versions ids to their human-readable tags.
 *
 * The lookup is batched so the dynamically built `IN (?, ?, …)` list can
 * never exceed SQLite's bound-parameter cap (SQLITE_MAX_VARIABLE_NUMBER);
 * the original single-statement version would throw on very large inputs.
 *
 * @param db - Open database client.
 * @param versionIds - Version primary keys to look up (may be empty).
 * @returns Map of version id -> tag; ids without a matching row are omitted.
 */
function getSnippetVersionTags(
  db: ReturnType<typeof getClient>,
  versionIds: string[]
): Record<string, string> {
  if (versionIds.length === 0) return {};
  // Stay well below SQLite's per-statement parameter limit.
  const CHUNK_SIZE = 500;
  const tags: Record<string, string> = {};
  for (let start = 0; start < versionIds.length; start += CHUNK_SIZE) {
    const chunk = versionIds.slice(start, start + CHUNK_SIZE);
    const placeholders = chunk.map(() => '?').join(', ');
    const rows = db
      .prepare<string[], RawVersionRow>(
        `SELECT id, tag FROM repository_versions WHERE id IN (${placeholders})`
      )
      .all(...chunk);
    for (const row of rows) {
      tags[row.id] = row.tag;
    }
  }
  return tags;
}
// ---------------------------------------------------------------------------
@@ -131,7 +157,9 @@ export const GET: RequestHandler = async ({ url }) => {
// Verify the repository exists and check its state.
const repo = db
.prepare<[string], RawRepoState>(`SELECT state, title FROM repositories WHERE id = ?`)
.prepare<[string], RawRepoState>(
`SELECT id, state, title, source, source_url, branch FROM repositories WHERE id = ?`
)
.get(parsed.repositoryId);
if (!repo) {
@@ -162,15 +190,16 @@ export const GET: RequestHandler = async ({ url }) => {
// Resolve version ID if a specific version was requested.
let versionId: string | undefined;
let resolvedVersion: RawVersionRow | undefined;
if (parsed.version) {
const versionRow = db
.prepare<[string, string], { id: string }>(
`SELECT id FROM repository_versions WHERE repository_id = ? AND tag = ?`
resolvedVersion = db
.prepare<[string, string], RawVersionRow>(
`SELECT id, tag FROM repository_versions WHERE repository_id = ? AND tag = ?`
)
.get(parsed.repositoryId, parsed.version);
// Version not found is not fatal — fall back to default branch.
versionId = versionRow?.id;
versionId = resolvedVersion?.id;
}
// Execute hybrid search (falls back to FTS5 when no embedding provider is set).
@@ -193,11 +222,39 @@ export const GET: RequestHandler = async ({ url }) => {
return found;
});
const snippetVersionIds = Array.from(
new Set(
selectedResults
.map((result) => result.snippet.versionId)
.filter((value): value is string => Boolean(value))
)
);
const snippetVersions = getSnippetVersionTags(db, snippetVersionIds);
const metadata: ContextResponseMetadata = {
localSource: repo.source === 'local',
resultCount: selectedResults.length,
repository: {
id: repo.id,
title: repo.title,
source: repo.source,
sourceUrl: repo.source_url,
branch: repo.branch
},
version: parsed.version || resolvedVersion
? {
requested: parsed.version ?? null,
resolved: resolvedVersion?.tag ?? null,
id: resolvedVersion?.id ?? null
}
: null,
snippetVersions
};
// Load rules from repository_configs.
const rules = getRules(db, parsed.repositoryId);
if (responseType === 'txt') {
const text = formatContextTxt(selectedResults, rules);
const text = formatContextTxt(selectedResults, rules, metadata);
return new Response(text, {
status: 200,
headers: {
@@ -208,7 +265,7 @@ export const GET: RequestHandler = async ({ url }) => {
}
// Default: JSON
const body = formatContextJson(selectedResults, rules);
const body = formatContextJson(selectedResults, rules, metadata);
return dtoJsonResponse(body, {
status: 200,
headers: CORS_HEADERS