feat(TRUEREF-0006): implement SQLite FTS5 full-text search engine
- BM25 ranking via SQLite FTS5 bm25() function
- Query preprocessor with wildcard expansion and special char escaping
- Library search with composite scoring (name match, trust score, snippet count)
- Trust score computation from stars, coverage, and source type
- Response formatters for library and snippet results

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
80
src/lib/server/search/formatters.ts
Normal file
80
src/lib/server/search/formatters.ts
Normal file
@@ -0,0 +1,80 @@
|
||||
/**
|
||||
* Result formatters for search responses.
|
||||
*
|
||||
* These produce human-readable (Markdown) strings for use in REST API
|
||||
* responses and MCP tool outputs.
|
||||
*/
|
||||
|
||||
import type { LibrarySearchResult, SnippetSearchResult } from './search.service';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Library search formatter (`resolve-library-id`)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Render library search results as a numbered plain-text list for the
 * `resolve-library-id` MCP tool.
 *
 * Each entry lists the repository title, its library ID, description,
 * snippet count, trust score (one decimal place, out of 10) and the tags of
 * its versions. Entries are separated by a blank line.
 *
 * @param results - Library search results, in the order they should be listed.
 * @returns The formatted list, or a fixed "no libraries found" message when
 *   `results` is empty.
 */
export function formatLibraryResults(results: LibrarySearchResult[]): string {
	if (results.length === 0) return 'No libraries found matching your search.';

	const entries: string[] = [];
	results.forEach(({ repository, versions }, index) => {
		const tagList = versions.map((version) => version.tag).join(', ');
		const snippetCount = repository.totalSnippets ?? 0;
		const trust = (repository.trustScore ?? 0).toFixed(1);

		const lines = [
			`${index + 1}. ${repository.title}`,
			` Library ID: ${repository.id}`,
			` Description: ${repository.description ?? 'No description'}`,
			` Snippets: ${snippetCount} | Trust Score: ${trust}/10`,
			// An empty tag list is reported as the default branch.
			` Available Versions: ${tagList || 'default branch'}`
		];
		entries.push(lines.join('\n'));
	});

	return entries.join('\n\n');
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Snippet search formatter (`query-docs`)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/** Length of the longest run of consecutive backticks in `text` (0 if none). */
function longestBacktickRun(text: string): number {
	let longest = 0;
	for (const run of text.match(/`+/g) ?? []) {
		if (run.length > longest) longest = run.length;
	}
	return longest;
}

/**
 * Wrap `content` in a Markdown fenced code block.
 *
 * The fence is at least three backticks and always longer than any backtick
 * run inside `content`, so embedded fences cannot close the block early.
 */
function fenceCode(content: string, language: string | null | undefined): string {
	const fence = '`'.repeat(Math.max(3, longestBacktickRun(content) + 1));
	return `${fence}${language ?? ''}\n${content}\n${fence}`;
}

/**
 * Format snippet search results for the `query-docs` MCP tool.
 *
 * Output is Markdown: an optional `## Library Rules` bullet list followed by
 * one section per snippet, all separated by `---` horizontal rules. Each
 * snippet section has an optional `###` title, an optional italic breadcrumb,
 * and the content; `code` snippets are wrapped in a fenced block tagged with
 * the snippet language. Empty or missing title/breadcrumb lines are omitted.
 *
 * @param results - Ranked snippet search results.
 * @param rules - Optional repository rules injected before the snippets.
 * @returns The Markdown document, or an empty string when there are neither
 *   rules nor results.
 */
export function formatSnippetResults(results: SnippetSearchResult[], rules?: string[]): string {
	const parts: string[] = [];

	// Prepend repository rules when present.
	if (rules?.length) {
		parts.push('## Library Rules\n' + rules.map((r) => `- ${r}`).join('\n'));
	}

	for (const { snippet } of results) {
		const body =
			snippet.type === 'code' ? fenceCode(snippet.content, snippet.language) : snippet.content;

		parts.push(
			[
				snippet.title ? `### ${snippet.title}` : '',
				snippet.breadcrumb ? `*${snippet.breadcrumb}*` : '',
				body
			]
				// Drop the placeholder lines for missing title/breadcrumb (and empty info content).
				.filter(Boolean)
				.join('\n')
		);
	}

	return parts.join('\n\n---\n\n');
}
|
||||
34
src/lib/server/search/query-preprocessor.ts
Normal file
34
src/lib/server/search/query-preprocessor.ts
Normal file
@@ -0,0 +1,34 @@
|
||||
/**
|
||||
* Query preprocessor for FTS5 search queries.
|
||||
*
|
||||
* Normalizes raw user input into an FTS5-compatible MATCH expression
|
||||
* with prefix wildcard expansion on the last token.
|
||||
*/
|
||||
|
||||
/** FTS5 boolean operators. FTS5 only treats these as operators in upper case. */
const FTS5_OPERATORS: ReadonlySet<string> = new Set(['AND', 'OR', 'NOT']);

/**
 * Preprocess a raw search query string for FTS5 MATCH.
 *
 * Steps:
 * 1. Strip FTS5 grouping characters `(` and `)` that would cause parse errors.
 * 2. Strip double quotes when they are unbalanced (an odd count), since an
 *    unterminated string is an FTS5 syntax error. Balanced quotes are kept so
 *    phrase queries still work.
 * 3. Trim and collapse whitespace into single spaces.
 * 4. Drop trailing `AND` / `OR` / `NOT` operators: a dangling operator (e.g.
 *    while the user is still typing) is a syntax error, and must never
 *    receive a wildcard.
 * 5. Append a prefix wildcard `*` to the last token when it is >= 3 characters
 *    and does not already end with `*`. This gives a "typing as you go" feel.
 *
 * @param raw - Raw user input.
 * @returns The normalized MATCH expression, or an empty string when nothing
 *   searchable remains.
 */
export function preprocessQuery(raw: string): string {
	// 1. Parentheses are only meaningful with explicit operators; replace with spaces.
	let q = raw.replace(/[()]/g, ' ');

	// 2. An odd number of quotes cannot form valid phrases; treat them as separators.
	const quoteCount = (q.match(/"/g) ?? []).length;
	if (quoteCount % 2 !== 0) {
		q = q.replace(/"/g, ' ');
	}

	// 3. Tokenize on whitespace (this also trims and collapses runs of spaces).
	const tokens = q.split(/\s+/).filter(Boolean);

	// 4. Discard dangling trailing operators.
	let last = tokens.pop();
	while (last !== undefined && FTS5_OPERATORS.has(last)) {
		last = tokens.pop();
	}
	if (last === undefined) return '';

	// 5. Add prefix wildcard to the last token.
	tokens.push(last.length >= 3 && !last.endsWith('*') ? `${last}*` : last);

	return tokens.join(' ');
}
|
||||
762
src/lib/server/search/search.service.test.ts
Normal file
762
src/lib/server/search/search.service.test.ts
Normal file
@@ -0,0 +1,762 @@
|
||||
/**
|
||||
* Unit tests for SearchService (TRUEREF-0006).
|
||||
*
|
||||
* Uses an in-memory SQLite database seeded with known data to verify
|
||||
* BM25 snippet search, library search, query preprocessing, and
|
||||
* response formatting.
|
||||
*/
|
||||
|
||||
import { describe, it, expect, beforeEach } from 'vitest';
|
||||
import Database from 'better-sqlite3';
|
||||
import { readFileSync } from 'node:fs';
|
||||
import { join } from 'node:path';
|
||||
import { SearchService } from './search.service';
|
||||
import { preprocessQuery } from './query-preprocessor';
|
||||
import { computeTrustScore } from './trust-score';
|
||||
import { formatLibraryResults, formatSnippetResults } from './formatters';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// In-memory test DB factory
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Build a fresh in-memory SQLite database for a test.
 *
 * Enables foreign-key enforcement, executes every statement of the initial
 * migration file, then executes `fts.sql` (FTS5 virtual table + triggers).
 *
 * @returns The open better-sqlite3 connection.
 */
function createTestDb(): Database.Database {
	const db = new Database(':memory:');
	db.pragma('foreign_keys = ON');

	// The migration file holds several statements separated by a drizzle marker;
	// run each non-empty one in order.
	const migrationsFolder = join(import.meta.dirname, '../db/migrations');
	const migrationFile = join(migrationsFolder, '0000_large_master_chief.sql');
	for (const chunk of readFileSync(migrationFile, 'utf-8').split('--> statement-breakpoint')) {
		const statement = chunk.trim();
		if (statement) {
			db.exec(statement);
		}
	}

	// Apply FTS5 virtual table + triggers.
	db.exec(readFileSync(join(import.meta.dirname, '../db/fts.sql'), 'utf-8'));

	return db;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Seed helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Current time in whole Unix seconds, captured once at module load and used
// for every timestamp column written by the seed helpers below.
const NOW_S = Math.floor(Date.now() / 1000);
|
||||
|
||||
/**
 * Insert one row into `repositories` and return its id.
 *
 * Any field missing from `overrides` gets a default: id `/test/repo`, title
 * `Test Repo`, source `github`, state `indexed`, zero snippets and trust
 * score, and `null` description/stars. `source_url` is derived from the id.
 */
function seedRepo(
	client: Database.Database,
	overrides: {
		id?: string;
		title?: string;
		description?: string | null;
		source?: string;
		state?: string;
		total_snippets?: number;
		trust_score?: number;
		stars?: number | null;
	} = {}
) {
	const id = overrides.id ?? '/test/repo';

	const insert = client.prepare(
		`INSERT INTO repositories
			(id, title, description, source, source_url, state, total_snippets, trust_score, stars, created_at, updated_at)
		VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
	);

	// Positional values, in the same order as the column list above.
	const title = overrides.title ?? 'Test Repo';
	const description = overrides.description ?? null;
	const source = overrides.source ?? 'github';
	const sourceUrl = `https://github.com${id}`;
	const state = overrides.state ?? 'indexed';
	const totalSnippets = overrides.total_snippets ?? 0;
	const trustScore = overrides.trust_score ?? 0;
	const stars = overrides.stars ?? null;

	insert.run(
		id,
		title,
		description,
		source,
		sourceUrl,
		state,
		totalSnippets,
		trustScore,
		stars,
		NOW_S,
		NOW_S
	);

	return id;
}
|
||||
|
||||
/**
 * Insert a `documents` row (`README.md`, checksum `abc`) for the given
 * repository and return its freshly generated UUID.
 */
function seedDocument(client: Database.Database, repositoryId: string): string {
	const documentId = crypto.randomUUID();

	const insert = client.prepare(
		`INSERT INTO documents (id, repository_id, file_path, checksum, indexed_at)
		VALUES (?, ?, ?, ?, ?)`
	);
	insert.run(documentId, repositoryId, 'README.md', 'abc', NOW_S);

	return documentId;
}
|
||||
|
||||
/**
 * Insert one row into `snippets` and return its freshly generated UUID.
 *
 * `type` defaults to `info`; `versionId`, `title`, `language` and
 * `breadcrumb` default to `null`.
 */
function seedSnippet(
	client: Database.Database,
	opts: {
		repositoryId: string;
		documentId: string;
		content: string;
		title?: string | null;
		breadcrumb?: string | null;
		type?: 'code' | 'info';
		language?: string | null;
		versionId?: string | null;
	}
): string {
	const snippetId = crypto.randomUUID();

	const insert = client.prepare(
		`INSERT INTO snippets
			(id, document_id, repository_id, version_id, type, title, content, language, breadcrumb, created_at)
		VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
	);

	// Positional values, in the same order as the column list above.
	const versionId = opts.versionId ?? null;
	const type = opts.type ?? 'info';
	const title = opts.title ?? null;
	const language = opts.language ?? null;
	const breadcrumb = opts.breadcrumb ?? null;

	insert.run(
		snippetId,
		opts.documentId,
		opts.repositoryId,
		versionId,
		type,
		title,
		opts.content,
		language,
		breadcrumb,
		NOW_S
	);

	return snippetId;
}
|
||||
|
||||
/**
 * Insert an `indexed` row into `repository_versions` for the given tag.
 *
 * @returns The version id, built as `<repositoryId>/<tag>`.
 */
function seedVersion(client: Database.Database, repositoryId: string, tag: string): string {
	const versionId = `${repositoryId}/${tag}`;

	const insert = client.prepare(
		`INSERT INTO repository_versions (id, repository_id, tag, state, created_at)
		VALUES (?, ?, ?, ?, ?)`
	);
	insert.run(versionId, repositoryId, tag, 'indexed', NOW_S);

	return versionId;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// preprocessQuery
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
describe('preprocessQuery', () => {
	// Each row: [test title, raw input, expected MATCH expression].
	const cases: Array<[string, string, string]> = [
		['trims and collapses whitespace', ' hello world ', 'hello world*'],
		['removes parentheses', '(hello)', 'hello*'],
		['appends wildcard to last token when >= 3 chars', 'foo bar baz', 'foo bar baz*'],
		['does not append wildcard when last token is < 3 chars', 'foo ba', 'foo ba'],
		['does not double-append wildcard', 'hello*', 'hello*'],
		['preserves AND / OR / NOT operators', 'hello AND world', 'hello AND world*'],
		['returns empty string for blank input', ' ', ''],
		['handles single short token without wildcard', 'ab', 'ab']
	];

	for (const [title, raw, expected] of cases) {
		it(title, () => {
			expect(preprocessQuery(raw)).toBe(expected);
		});
	}
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// computeTrustScore
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
describe('computeTrustScore', () => {
	const timestamp = new Date();

	/** Build a repository object with defaults, overridden by `overrides`. */
	function makeRepo(overrides: Record<string, unknown> = {}) {
		return {
			id: '/test/repo',
			title: 'Test',
			description: null,
			source: 'github' as const,
			sourceUrl: 'https://github.com/test/repo',
			branch: 'main',
			state: 'indexed' as const,
			totalSnippets: 0,
			totalTokens: 0,
			trustScore: 0,
			benchmarkScore: 0,
			stars: null,
			githubToken: null,
			lastIndexedAt: null,
			createdAt: timestamp,
			updatedAt: timestamp,
			...overrides
		};
	}

	/**
	 * A repo that earns no points on its own (local, pending, no description,
	 * no stars), with `extra` layered on top to isolate a single factor.
	 */
	function baseline(extra: Record<string, unknown> = {}) {
		return makeRepo({
			source: 'local',
			state: 'pending',
			description: null,
			stars: null,
			...extra
		});
	}

	it('returns 0 for a repo with no qualifying attributes', () => {
		expect(computeTrustScore(baseline())).toBe(0);
	});

	it('awards 1 point for github source', () => {
		expect(computeTrustScore(baseline({ source: 'github' }))).toBe(1);
	});

	it('awards 1 point for indexed state', () => {
		expect(computeTrustScore(baseline({ state: 'indexed' }))).toBe(1);
	});

	it('awards 1 point for having a description', () => {
		expect(computeTrustScore(baseline({ description: 'A library' }))).toBe(1);
	});

	it('caps score at 10', () => {
		const maxedOut = makeRepo({
			source: 'github',
			state: 'indexed',
			description: 'A great library',
			stars: 1_000_000,
			totalSnippets: 10_000
		});
		expect(computeTrustScore(maxedOut)).toBeLessThanOrEqual(10);
	});

	it('computes star score on log10 scale', () => {
		// 9999 stars: log10(10000) = 4 → min(4, 4) = 4
		const starred = baseline({ stars: 9999 });
		const expected = Math.min(4, Math.log10(10000));
		expect(computeTrustScore(starred)).toBeCloseTo(expected, 1);
	});

	it('awards documentation coverage proportionally (500 snippets = 1 pt, 1500 = 3 pts)', () => {
		// 500 snippets → min(3, 500/500) = 1.0
		expect(computeTrustScore(baseline({ totalSnippets: 500 }))).toBeCloseTo(1, 1);

		// 1500 snippets → min(3, 1500/500) = 3.0
		expect(computeTrustScore(baseline({ totalSnippets: 1500 }))).toBeCloseTo(3, 1);
	});
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// SearchService.searchSnippets
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
describe('SearchService.searchSnippets', () => {
	let client: Database.Database;
	let service: SearchService;
	let repoId: string;
	let docId: string;

	/** Seed a snippet into the default repository/document created in `beforeEach`. */
	const addSnippet = (
		fields: Omit<Parameters<typeof seedSnippet>[1], 'repositoryId' | 'documentId'>
	): string => seedSnippet(client, { repositoryId: repoId, documentId: docId, ...fields });

	beforeEach(() => {
		// Fresh database per test, with one repository and one document.
		client = createTestDb();
		service = new SearchService(client);

		repoId = seedRepo(client);
		docId = seedDocument(client, repoId);
	});

	it('returns results matching a simple keyword', () => {
		addSnippet({ content: 'The quick brown fox jumps over the lazy dog', title: 'Fox story' });

		const hits = service.searchSnippets('fox', { repositoryId: repoId });
		expect(hits.length).toBeGreaterThan(0);
		expect(hits[0].snippet.title).toBe('Fox story');
	});

	it('returns empty array for a blank query', () => {
		expect(service.searchSnippets(' ', { repositoryId: repoId })).toHaveLength(0);
	});

	it('returns empty array when no snippets match', () => {
		addSnippet({ content: 'Hello world' });

		expect(service.searchSnippets('zzznomatch', { repositoryId: repoId })).toHaveLength(0);
	});

	it('filters by repositoryId — does not return snippets from other repos', () => {
		const otherRepoId = seedRepo(client, { id: '/other/repo', title: 'Other Repo' });
		const otherDocId = seedDocument(client, otherRepoId);

		addSnippet({ content: 'TypeScript generics tutorial' });
		seedSnippet(client, {
			repositoryId: otherRepoId,
			documentId: otherDocId,
			content: 'TypeScript generics advanced'
		});

		const hits = service.searchSnippets('TypeScript generics', { repositoryId: repoId });
		expect(hits.every((hit) => hit.snippet.repositoryId === repoId)).toBe(true);
	});

	it('filters by type when provided', () => {
		addSnippet({ content: 'TypeScript interface definition', type: 'info' });
		addSnippet({ content: 'TypeScript interface example', type: 'code', language: 'typescript' });

		const codeHits = service.searchSnippets('TypeScript interface', {
			repositoryId: repoId,
			type: 'code'
		});
		expect(codeHits.every((hit) => hit.snippet.type === 'code')).toBe(true);

		const infoHits = service.searchSnippets('TypeScript interface', {
			repositoryId: repoId,
			type: 'info'
		});
		expect(infoHits.every((hit) => hit.snippet.type === 'info')).toBe(true);
	});

	it('filters by versionId when provided', () => {
		const versionId = seedVersion(client, repoId, 'v1.0.0');

		addSnippet({ content: 'Versioned React hooks documentation', versionId });
		addSnippet({ content: 'React hooks documentation (unversioned)', versionId: null });

		const hits = service.searchSnippets('React hooks', { repositoryId: repoId, versionId });
		expect(hits.every((hit) => hit.snippet.versionId === versionId)).toBe(true);
	});

	it('respects limit and offset', () => {
		for (let i = 0; i < 5; i++) {
			addSnippet({ content: `pagination content item number ${i} relevant` });
		}

		const firstPage = service.searchSnippets('pagination content', {
			repositoryId: repoId,
			limit: 2,
			offset: 0
		});
		const secondPage = service.searchSnippets('pagination content', {
			repositoryId: repoId,
			limit: 2,
			offset: 2
		});

		expect(firstPage.length).toBeLessThanOrEqual(2);
		expect(secondPage.length).toBeLessThanOrEqual(2);
		if (firstPage.length > 0 && secondPage.length > 0) {
			// Pages must not overlap.
			const firstPageIds = new Set(firstPage.map((hit) => hit.snippet.id));
			expect(secondPage.some((hit) => firstPageIds.has(hit.snippet.id))).toBe(false);
		}
	});

	it('returns scores (negative BM25 values)', () => {
		addSnippet({ content: 'SQLite full text search tutorial' });

		const hits = service.searchSnippets('SQLite full text search', { repositoryId: repoId });
		expect(hits.length).toBeGreaterThan(0);
		// BM25 returns negative values for matched documents.
		expect(hits[0].score).toBeLessThan(0);
	});

	it('includes repository metadata in results', () => {
		addSnippet({ content: 'repository metadata check' });

		const hits = service.searchSnippets('metadata check', { repositoryId: repoId });
		expect(hits.length).toBeGreaterThan(0);

		const { repository } = hits[0];
		expect(repository.id).toBe(repoId);
		expect(repository.title).toBe('Test Repo');
	});

	it('uses porter stemmer — matches stemmed forms', () => {
		addSnippet({ content: 'running tests efficiently' });

		// "run" should match "running" via porter stemmer.
		const hits = service.searchSnippets('run', { repositoryId: repoId });
		expect(hits.length).toBeGreaterThan(0);
	});

	it('uses prefix wildcard — partial word matches', () => {
		addSnippet({ content: 'authentication middleware pattern' });

		// preprocessQuery appends '*' to tokens >= 3 chars.
		const hits = service.searchSnippets('authen', { repositoryId: repoId });
		expect(hits.length).toBeGreaterThan(0);
	});
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// SearchService.searchRepositories
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
describe('SearchService.searchRepositories', () => {
	let client: Database.Database;
	let service: SearchService;

	/** Seed the `/facebook/react` repository in the given state; returns its id. */
	const seedReact = (state: string): string =>
		seedRepo(client, { id: '/facebook/react', title: 'React', state });

	beforeEach(() => {
		// Fresh, empty database per test.
		client = createTestDb();
		service = new SearchService(client);
	});

	it('returns empty array when no indexed repos match', () => {
		seedRepo(client, { id: '/unrelated/lib', title: 'Unrelated Library' });

		expect(service.searchRepositories({ libraryName: 'react' })).toHaveLength(0);
	});

	it('finds a repo by title', () => {
		seedReact('indexed');

		const matches = service.searchRepositories({ libraryName: 'react' });
		expect(matches.length).toBeGreaterThan(0);
		expect(matches[0].repository.id).toBe('/facebook/react');
	});

	it('exact match ranks above prefix match', () => {
		seedReact('indexed');
		seedRepo(client, { id: '/some/reactive', title: 'Reactive Lib', state: 'indexed' });

		const [top] = service.searchRepositories({ libraryName: 'React' });
		expect(top.repository.title).toBe('React');
	});

	it('excludes non-indexed repositories', () => {
		seedReact('pending');

		expect(service.searchRepositories({ libraryName: 'react' })).toHaveLength(0);
	});

	it('includes versions in results', () => {
		const repoId = seedReact('indexed');
		for (const tag of ['v18.0.0', 'v17.0.0']) {
			seedVersion(client, repoId, tag);
		}

		const matches = service.searchRepositories({ libraryName: 'react' });
		expect(matches.length).toBeGreaterThan(0);
		expect(matches[0].versions.length).toBe(2);
	});

	it('respects the limit option', () => {
		for (let i = 0; i < 5; i++) {
			seedRepo(client, { id: `/test/lib${i}`, title: `Test Library ${i}`, state: 'indexed' });
		}

		const matches = service.searchRepositories({ libraryName: 'library', limit: 2 });
		expect(matches.length).toBeLessThanOrEqual(2);
	});

	it('returns a composite score for each result', () => {
		seedReact('indexed');

		const matches = service.searchRepositories({ libraryName: 'react' });
		expect(matches.length).toBeGreaterThan(0);

		const { score } = matches[0];
		expect(typeof score).toBe('number');
		expect(score).toBeGreaterThan(0);
	});

	it('matches on repository description', () => {
		seedRepo(client, {
			id: '/some/lib',
			title: 'Some Library',
			description: 'A react-compatible UI toolkit',
			state: 'indexed'
		});

		const matches = service.searchRepositories({ libraryName: 'react-compatible' });
		expect(matches.length).toBeGreaterThan(0);
	});
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// formatLibraryResults
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
describe('formatLibraryResults', () => {
	type LibraryResult = Parameters<typeof formatLibraryResults>[0][number];

	const now = new Date();

	/**
	 * Build one library result. The repository starts as a bare local library
	 * (no description, zero counts) and is customised via `repository`.
	 */
	function makeResult(
		repository: Partial<LibraryResult['repository']>,
		versions: LibraryResult['versions'],
		score: number
	): LibraryResult {
		return {
			repository: {
				id: '/test/lib',
				title: 'Test Lib',
				description: null,
				source: 'local',
				sourceUrl: '/path/to/lib',
				branch: 'main',
				state: 'indexed',
				totalSnippets: 0,
				totalTokens: 0,
				trustScore: 0,
				benchmarkScore: 0,
				stars: null,
				githubToken: null,
				lastIndexedAt: null,
				createdAt: now,
				updatedAt: now,
				...repository
			},
			versions,
			score
		};
	}

	it('returns no-match message for empty results', () => {
		expect(formatLibraryResults([])).toBe('No libraries found matching your search.');
	});

	it('formats a single result with versions', () => {
		const react = makeResult(
			{
				id: '/facebook/react',
				title: 'React',
				description: 'A JavaScript library for building user interfaces',
				source: 'github',
				sourceUrl: 'https://github.com/facebook/react',
				totalSnippets: 1000,
				totalTokens: 50000,
				trustScore: 8.5,
				stars: 200000
			},
			[
				{
					id: '/facebook/react/v18',
					repositoryId: '/facebook/react',
					tag: 'v18',
					title: 'React 18',
					state: 'indexed',
					totalSnippets: 1000,
					indexedAt: null,
					createdAt: now
				}
			],
			150
		);

		const output = formatLibraryResults([react]);
		for (const fragment of [
			'1. React',
			'Library ID: /facebook/react',
			'Snippets: 1000',
			'Trust Score: 8.5/10',
			'v18'
		]) {
			expect(output).toContain(fragment);
		}
	});

	it('shows "default branch" when no versions are present', () => {
		const output = formatLibraryResults([makeResult({}, [], 50)]);
		expect(output).toContain('default branch');
	});
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// formatSnippetResults
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
describe('formatSnippetResults', () => {
	type SnippetResult = Parameters<typeof formatSnippetResults>[0][number];

	const createdAt = new Date();

	/** Build a default `info` snippet result, shallowly overridden by `overrides`. */
	function makeSnippetResult(overrides: Partial<SnippetResult> = {}): SnippetResult {
		const defaults: SnippetResult = {
			snippet: {
				id: crypto.randomUUID(),
				documentId: crypto.randomUUID(),
				repositoryId: '/test/repo',
				versionId: null,
				type: 'info',
				title: 'My Title',
				content: 'Some content here.',
				language: null,
				breadcrumb: null,
				tokenCount: 10,
				createdAt
			},
			score: -1.5,
			repository: { id: '/test/repo', title: 'Test Repo' }
		};
		return { ...defaults, ...overrides };
	}

	/** Default result whose snippet has the given fields replaced. */
	function withSnippet(fields: Partial<SnippetResult['snippet']>): SnippetResult {
		const base = makeSnippetResult();
		return { ...base, snippet: { ...base.snippet, ...fields } };
	}

	it('returns empty string for no results and no rules', () => {
		expect(formatSnippetResults([])).toBe('');
	});

	it('prepends library rules when provided', () => {
		const output = formatSnippetResults([], ['Use TypeScript', 'Prefer const']);
		for (const fragment of ['## Library Rules', '- Use TypeScript', '- Prefer const']) {
			expect(output).toContain(fragment);
		}
	});

	it('formats an info snippet with title and breadcrumb', () => {
		const infoResult = withSnippet({
			type: 'info',
			title: 'Getting Started',
			content: 'Install the package using npm.',
			breadcrumb: 'Docs > Intro',
			tokenCount: 5
		});

		const output = formatSnippetResults([infoResult]);
		expect(output).toContain('### Getting Started');
		expect(output).toContain('*Docs > Intro*');
		expect(output).toContain('Install the package using npm.');
	});

	it('formats a code snippet with fenced code block', () => {
		const codeResult = withSnippet({
			type: 'code',
			title: 'Example',
			content: 'const x = 1;',
			language: 'typescript',
			tokenCount: 5
		});

		const output = formatSnippetResults([codeResult]);
		expect(output).toContain('```typescript');
		expect(output).toContain('const x = 1;');
		expect(output).toContain('```');
	});

	it('separates multiple results with horizontal rules', () => {
		const output = formatSnippetResults([makeSnippetResult(), makeSnippetResult()]);
		expect(output).toContain('---');
	});

	it('omits title/breadcrumb lines when they are null', () => {
		const untitled = withSnippet({
			type: 'info',
			title: null,
			content: 'Bare content.',
			breadcrumb: null,
			tokenCount: 3
		});

		const output = formatSnippetResults([untitled]);
		expect(output).not.toContain('###');
		expect(output).toContain('Bare content.');
	});
});
|
||||
310
src/lib/server/search/search.service.ts
Normal file
310
src/lib/server/search/search.service.ts
Normal file
@@ -0,0 +1,310 @@
|
||||
/**
|
||||
* SearchService — FTS5-backed full-text search over snippets and repositories.
|
||||
*
|
||||
* Implements keyword search using SQLite's built-in BM25 ranking via the
|
||||
* `bm25()` function exposed by FTS5 virtual tables. Library search uses
|
||||
* LIKE-based matching on the `repositories` table with a composite relevance
|
||||
* score.
|
||||
*/
|
||||
|
||||
import type Database from 'better-sqlite3';
|
||||
import type { Repository, RepositoryVersion, Snippet } from '$lib/types';
|
||||
import { preprocessQuery } from './query-preprocessor';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Public interface types
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/** Filters and paging controls for a snippet search. */
export interface SnippetSearchOptions {
	/** Repository whose snippets are searched. */
	repositoryId: string;
	/** When set, filter results by this version id. */
	versionId?: string;
	/** When set, filter results by snippet type. */
	type?: 'code' | 'info';
	/** Number of results to return. Default: 20. */
	limit?: number;
	/** Number of results to skip. Default: 0. */
	offset?: number;
}
|
||||
|
||||
/** One ranked hit from a snippet search. */
export interface SnippetSearchResult {
	/** The matching snippet. */
	snippet: Snippet;
	/** BM25 rank — negative value; lower (more negative) = more relevant. */
	score: number;
	/** Identifying fields of the repository the snippet belongs to. */
	repository: Pick<Repository, 'id' | 'title'>;
}
|
||||
|
||||
/** Input accepted by `SearchService.searchRepositories`. */
export interface LibrarySearchOptions {
	/** Name fragment matched against repository title, id and description. */
	libraryName: string;
	/** Semantic relevance hint (reserved for future hybrid use). */
	query?: string;
	/** Maximum number of results to return. Default: 10. */
	limit?: number;
}
|
||||
|
||||
/** One hit produced by `SearchService.searchRepositories`. */
export interface LibrarySearchResult {
	/** The matched repository, mapped to the camelCase domain type. */
	repository: Repository;
	/** Versions recorded for the repository, newest `created_at` first. */
	versions: RepositoryVersion[];
	/** Composite relevance score. Higher = more relevant. */
	score: number;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Raw DB row types
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Shape of one row produced by the snippet FTS query (snake_case column
 * names, exactly as selected in `SearchService.searchSnippets`).
 */
interface RawSnippetRow {
	// Columns selected from `snippets`.
	id: string;
	document_id: string;
	repository_id: string;
	version_id: string | null;
	type: 'code' | 'info';
	title: string | null;
	content: string;
	language: string | null;
	breadcrumb: string | null;
	token_count: number | null;
	/** Multiplied by 1000 before being turned into a `Date` by the mapper. */
	created_at: number;

	// Columns selected from the joined `repositories` row.
	repo_id: string;
	repo_title: string;

	/** Value of `bm25(snippets_fts)` for this row. */
	score: number;
}
|
||||
|
||||
/**
 * Shape of one row produced by the library search query: every column of
 * `repositories` (`r.*`) plus the computed scoring columns.
 */
interface RawRepoRow {
	// Columns of `repositories`.
	id: string;
	title: string;
	description: string | null;
	source: 'github' | 'local';
	source_url: string;
	branch: string | null;
	state: 'pending' | 'indexing' | 'indexed' | 'error';
	total_snippets: number | null;
	total_tokens: number | null;
	trust_score: number | null;
	benchmark_score: number | null;
	stars: number | null;
	github_token: string | null;
	/** Multiplied by 1000 before being turned into a `Date` by the mapper. */
	last_indexed_at: number | null;
	created_at: number;
	updated_at: number;

	// Scoring columns computed by the query.
	/** 100 when the title equals the search term (case-insensitive), else 0. */
	exact_match: number;
	/** 50 when the title starts with the search term, else 0. */
	prefix_match: number;
	/** 20 when the description contains the search term, else 0. */
	desc_match: number;
	/** `total_snippets / 100`, with NULL treated as 0. */
	snippet_score: number;
	/** `trust_score * 10`, with NULL treated as 0. */
	trust_component: number;
}
|
||||
|
||||
/**
 * Shape of one row produced by the version query
 * (`SELECT * FROM repository_versions`), snake_case column names.
 */
interface RawVersionRow {
	id: string;
	repository_id: string;
	tag: string;
	title: string | null;
	state: 'pending' | 'indexing' | 'indexed' | 'error';
	total_snippets: number | null;
	/** Multiplied by 1000 before being turned into a `Date` by the mapper. */
	indexed_at: number | null;
	created_at: number;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Mappers: raw DB rows → domain types
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Translate a raw snippet query row into the camelCase `Snippet` domain type.
 *
 * The join-only columns (`repo_id`, `repo_title`, `score`) are dropped, and
 * `created_at` is multiplied by 1000 before being wrapped in a `Date`.
 */
function mapSnippet(row: RawSnippetRow): Snippet {
	const {
		id,
		document_id: documentId,
		repository_id: repositoryId,
		version_id: versionId,
		type,
		title,
		content,
		language,
		breadcrumb,
		token_count: tokenCount,
		created_at: createdAtSeconds
	} = row;

	const createdAt = new Date(createdAtSeconds * 1000);

	return {
		id,
		documentId,
		repositoryId,
		versionId,
		type,
		title,
		content,
		language,
		breadcrumb,
		tokenCount,
		createdAt
	};
}
|
||||
|
||||
/**
 * Translate a raw library-search row into the camelCase `Repository` domain
 * type. The computed scoring columns on the row are not copied over.
 *
 * Timestamp columns are multiplied by 1000 before being wrapped in a `Date`.
 * `last_indexed_at` maps to `null` only when the column is actually
 * NULL/absent; a stored value of `0` is a valid timestamp and is preserved
 * rather than being mistaken for "never indexed".
 */
function mapRepository(row: RawRepoRow): Repository {
	// Explicit null check: a truthiness test would also discard a legitimate 0.
	const lastIndexedAt = row.last_indexed_at != null ? new Date(row.last_indexed_at * 1000) : null;

	return {
		id: row.id,
		title: row.title,
		description: row.description,
		source: row.source,
		sourceUrl: row.source_url,
		branch: row.branch,
		state: row.state,
		totalSnippets: row.total_snippets,
		totalTokens: row.total_tokens,
		trustScore: row.trust_score,
		benchmarkScore: row.benchmark_score,
		stars: row.stars,
		githubToken: row.github_token,
		lastIndexedAt,
		createdAt: new Date(row.created_at * 1000),
		updatedAt: new Date(row.updated_at * 1000)
	};
}
|
||||
|
||||
/**
 * Translate a raw `repository_versions` row into the camelCase
 * `RepositoryVersion` domain type.
 *
 * Timestamp columns are multiplied by 1000 before being wrapped in a `Date`.
 * `indexed_at` maps to `null` only when the column is actually NULL/absent;
 * a stored value of `0` is a valid timestamp and is preserved.
 */
function mapVersion(row: RawVersionRow): RepositoryVersion {
	// Explicit null check: a truthiness test would also discard a legitimate 0.
	const indexedAt = row.indexed_at != null ? new Date(row.indexed_at * 1000) : null;

	return {
		id: row.id,
		repositoryId: row.repository_id,
		tag: row.tag,
		title: row.title,
		state: row.state,
		totalSnippets: row.total_snippets,
		indexedAt,
		createdAt: new Date(row.created_at * 1000)
	};
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// SearchService
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Read-only search facade over the SQLite database: BM25-ranked snippet
 * search via the `snippets_fts` FTS5 table, and LIKE-based repository lookup
 * with a composite relevance score.
 */
export class SearchService {
	constructor(private readonly db: Database.Database) {}

	// -------------------------------------------------------------------------
	// searchSnippets
	// -------------------------------------------------------------------------

	/**
	 * Search snippets within a repository using FTS5 BM25 ranking.
	 *
	 * The query is run through `preprocessQuery` before being bound to the
	 * MATCH expression; when preprocessing yields an empty value no SQL is
	 * executed and an empty array is returned. Results are ordered by BM25
	 * score ascending (lower = more relevant).
	 *
	 * @param query   Raw user query.
	 * @param options Repository scope, optional version/type filters, paging.
	 * @returns Matching snippets with their BM25 score and owning repository.
	 */
	searchSnippets(query: string, options: SnippetSearchOptions): SnippetSearchResult[] {
		const { repositoryId, versionId, type, limit = 20, offset = 0 } = options;

		const processedQuery = preprocessQuery(query);
		if (!processedQuery) return [];

		// Build the WHERE clause dynamically based on optional filters. Only
		// fixed SQL fragments are concatenated; every value is a bound parameter.
		const conditions: string[] = ['snippets_fts MATCH ?', 's.repository_id = ?'];
		const params: unknown[] = [processedQuery, repositoryId];

		if (versionId !== undefined) {
			conditions.push('s.version_id = ?');
			params.push(versionId);
		}

		if (type !== undefined) {
			conditions.push('s.type = ?');
			params.push(type);
		}

		// LIMIT / OFFSET placeholders come last in the statement.
		params.push(limit, offset);

		const sql = `
			SELECT
				s.id,
				s.document_id,
				s.repository_id,
				s.version_id,
				s.type,
				s.title,
				s.content,
				s.language,
				s.breadcrumb,
				s.token_count,
				s.created_at,
				r.id AS repo_id,
				r.title AS repo_title,
				bm25(snippets_fts) AS score
			FROM snippets_fts
			JOIN snippets s ON s.rowid = snippets_fts.rowid
			JOIN repositories r ON r.id = s.repository_id
			WHERE ${conditions.join(' AND ')}
			ORDER BY score ASC
			LIMIT ? OFFSET ?
		`;

		const rows = this.db.prepare(sql).all(...params) as RawSnippetRow[];

		return rows.map((row) => ({
			snippet: mapSnippet(row),
			score: row.score,
			repository: { id: row.repo_id, title: row.repo_title }
		}));
	}

	// -------------------------------------------------------------------------
	// searchRepositories
	// -------------------------------------------------------------------------

	/**
	 * Search indexed repositories by library name using LIKE-based matching.
	 *
	 * A repository qualifies when its title, id or description contains
	 * `libraryName`. LIKE metacharacters (`%`, `_`) in `libraryName` are
	 * escaped, so they match literally instead of acting as wildcards.
	 *
	 * Applies a composite scoring model:
	 *   - Exact title match  : 100 pts
	 *   - Prefix title match : 50 pts (an exact match also counts as a prefix match)
	 *   - Description match  : 20 pts
	 *   - Snippet density    : total_snippets / 100
	 *   - Trust score        : trust_score * 10
	 *
	 * @param options Library name and optional result limit (default 10).
	 * @returns Repositories ordered by composite score, highest first, each
	 *          with its versions attached.
	 */
	searchRepositories(options: LibrarySearchOptions): LibrarySearchResult[] {
		const { libraryName, limit = 10 } = options;

		const literalName = SearchService.escapeLikePattern(libraryName);
		const prefixPattern = `${literalName}%`;
		const containsPattern = `%${literalName}%`;

		const rows = this.db
			.prepare(
				`
				SELECT r.*,
					CASE WHEN LOWER(r.title) = LOWER(?) THEN 100 ELSE 0 END AS exact_match,
					CASE WHEN LOWER(r.title) LIKE LOWER(?) ESCAPE '\\' THEN 50 ELSE 0 END AS prefix_match,
					CASE WHEN LOWER(r.description) LIKE LOWER(?) ESCAPE '\\' THEN 20 ELSE 0 END AS desc_match,
					(COALESCE(r.total_snippets, 0) / 100.0) AS snippet_score,
					COALESCE(r.trust_score, 0) * 10 AS trust_component
				FROM repositories r
				WHERE r.state = 'indexed'
					AND (
						LOWER(r.title) LIKE LOWER(?) ESCAPE '\\'
						OR LOWER(r.id) LIKE LOWER(?) ESCAPE '\\'
						OR LOWER(r.description) LIKE LOWER(?) ESCAPE '\\'
					)
				ORDER BY (exact_match + prefix_match + desc_match + snippet_score + trust_component) DESC
				LIMIT ?
				`
			)
			.all(
				libraryName, // exact_match (plain equality, so no escaping)
				prefixPattern, // prefix_match
				containsPattern, // desc_match
				containsPattern, // WHERE title LIKE
				containsPattern, // WHERE id LIKE
				containsPattern, // WHERE description LIKE
				limit
			) as RawRepoRow[];

		return rows.map((row) => {
			const repository = mapRepository(row);
			// Recomputed from the row so the reported score matches the ORDER BY expression.
			const compositeScore =
				row.exact_match + row.prefix_match + row.desc_match + row.snippet_score + row.trust_component;
			return {
				repository,
				versions: this.getVersions(row.id),
				score: compositeScore
			};
		});
	}

	// -------------------------------------------------------------------------
	// Private helpers
	// -------------------------------------------------------------------------

	/**
	 * Escape `\`, `%` and `_` so the text is matched literally by a
	 * `LIKE ... ESCAPE '\'` expression.
	 */
	private static escapeLikePattern(text: string): string {
		return text.replace(/[\\%_]/g, '\\$&');
	}

	/** Load every version row of a repository, newest `created_at` first. */
	private getVersions(repositoryId: string): RepositoryVersion[] {
		const rows = this.db
			.prepare(
				`SELECT * FROM repository_versions WHERE repository_id = ? ORDER BY created_at DESC`
			)
			.all(repositoryId) as RawVersionRow[];
		return rows.map(mapVersion);
	}
}
|
||||
41
src/lib/server/search/trust-score.ts
Normal file
41
src/lib/server/search/trust-score.ts
Normal file
@@ -0,0 +1,41 @@
|
||||
/**
|
||||
* Trust score computation for repositories.
|
||||
*
|
||||
* Produces a composite score in [0, 10] that reflects the credibility and
|
||||
* completeness of a repository's documentation.
|
||||
*/
|
||||
|
||||
import type { Repository } from '$lib/types';
|
||||
|
||||
/**
 * Compute a trust score (0–10) for a repository.
 *
 * Score components:
 *   - Stars        : up to 4 points, `log10(stars + 1)` (≈10 k stars = 4 pts)
 *   - Doc coverage : up to 3 points, 1 point per 500 snippets (1 500 snippets = 3 pts)
 *   - Source type  : 1 point for GitHub repos
 *   - Indexed state: 1 point when state is "indexed"
 *   - Description  : 1 point when a description is present
 *
 * Missing, negative or NaN star/snippet counts contribute nothing, so the
 * result is always a finite number within [0, 10].
 *
 * @param repo Repository whose metadata is scored.
 * @returns The score rounded to one decimal place.
 */
export function computeTrustScore(repo: Repository): number {
	let score = 0;

	// Stars (up to 4 points): log scale, 10k stars ≈ 4 pts.
	// The `> 0` guard also rejects NaN and negatives, which would make log10 return NaN.
	const stars = repo.stars ?? 0;
	if (stars > 0) {
		score += Math.min(4, Math.log10(stars + 1));
	}

	// Documentation coverage (up to 3 points): one point per 500 snippets.
	// Negative or NaN counts are ignored rather than dragging the score below zero.
	const snippetCount = repo.totalSnippets ?? 0;
	if (snippetCount > 0) {
		score += Math.min(3, snippetCount / 500);
	}

	// Source type (1 point for GitHub).
	if (repo.source === 'github') score += 1;

	// Successful indexing (1 point).
	if (repo.state === 'indexed') score += 1;

	// Has description (1 point).
	if (repo.description) score += 1;

	return Math.min(10, parseFloat(score.toFixed(1)));
}
|
||||
Reference in New Issue
Block a user