feat(TRUEREF-0006): implement SQLite FTS5 full-text search engine

- BM25 ranking via SQLite FTS5 bm25() function
- Query preprocessor with wildcard expansion and special char escaping
- Library search with composite scoring (name match, trust score, snippet count)
- Trust score computation from stars, coverage, and source type
- Response formatters for library and snippet results

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Giancarmine Salucci
2026-03-23 09:06:18 +01:00
parent f6be3cfd47
commit 33bdf30709
5 changed files with 1227 additions and 0 deletions

View File

@@ -0,0 +1,80 @@
/**
* Result formatters for search responses.
*
* These produce human-readable (Markdown) strings for use in REST API
* responses and MCP tool outputs.
*/
import type { LibrarySearchResult, SnippetSearchResult } from './search.service';
// ---------------------------------------------------------------------------
// Library search formatter (`resolve-library-id`)
// ---------------------------------------------------------------------------
/**
 * Render library search hits as a numbered plain-text list for the
 * `resolve-library-id` MCP tool.
 *
 * @param results - Library matches in the order they should be displayed.
 * @returns One five-line entry per library, entries separated by a blank
 *   line, or a fixed "no match" sentence when `results` is empty.
 */
export function formatLibraryResults(results: LibrarySearchResult[]): string {
  if (results.length === 0) {
    return 'No libraries found matching your search.';
  }

  const entries: string[] = [];
  results.forEach((result, index) => {
    const { repository, versions } = result;

    // An empty tag list (or one that joins to '') falls back to the branch label.
    const joinedTags = versions.map((version) => version.tag).join(', ');
    const versionLabel = joinedTags || 'default branch';

    const description = repository.description ?? 'No description';
    const snippetCount = repository.totalSnippets ?? 0;
    const trust = (repository.trustScore ?? 0).toFixed(1);

    entries.push(
      `${index + 1}. ${repository.title}` +
        '\n' +
        ` Library ID: ${repository.id}` +
        '\n' +
        ` Description: ${description}` +
        '\n' +
        ` Snippets: ${snippetCount} | Trust Score: ${trust}/10` +
        '\n' +
        ` Available Versions: ${versionLabel}`
    );
  });

  return entries.join('\n\n');
}
// ---------------------------------------------------------------------------
// Snippet search formatter (`query-docs`)
// ---------------------------------------------------------------------------
/**
 * Wrap `content` in a Markdown fenced code block.
 *
 * The fence is at least three backticks and always one backtick longer than
 * the longest backtick run inside `content`, so content that itself contains
 * a fence (e.g. a snippet taken from Markdown documentation) cannot close the
 * block early.
 */
function fenceCodeBlock(content: string, language?: string | null): string {
  let longestRun = 0;
  for (const run of content.match(/`+/g) ?? []) {
    longestRun = Math.max(longestRun, run.length);
  }
  const fence = '`'.repeat(Math.max(3, longestRun + 1));
  return `${fence}${language ?? ''}\n${content}\n${fence}`;
}

/**
 * Format snippet search results for the `query-docs` MCP tool.
 *
 * Each snippet becomes an optional `### title` line, an optional italic
 * breadcrumb line, and its body. Code snippets are wrapped in a fenced code
 * block tagged with the snippet language; all other snippets are emitted
 * verbatim. Sections are separated by a Markdown horizontal rule.
 *
 * @param results - Ranked snippet search results.
 * @param rules - Optional repository rules injected before the snippets.
 * @returns The Markdown document, or an empty string when there is nothing
 *   to render.
 */
export function formatSnippetResults(results: SnippetSearchResult[], rules?: string[]): string {
  const parts: string[] = [];

  // Prepend repository rules when present.
  if (rules?.length) {
    parts.push('## Library Rules\n' + rules.map((rule) => `- ${rule}`).join('\n'));
  }

  for (const { snippet } of results) {
    const body =
      snippet.type === 'code'
        ? fenceCodeBlock(snippet.content, snippet.language)
        : snippet.content;

    parts.push(
      [
        snippet.title ? `### ${snippet.title}` : '',
        snippet.breadcrumb ? `*${snippet.breadcrumb}*` : '',
        body
      ]
        // Drop the placeholders of absent title/breadcrumb (and an empty info body).
        .filter(Boolean)
        .join('\n')
    );
  }

  return parts.join('\n\n---\n\n');
}

View File

@@ -0,0 +1,34 @@
/**
* Query preprocessor for FTS5 search queries.
*
* Normalizes raw user input into an FTS5-compatible MATCH expression
* with prefix wildcard expansion on the last token.
*/
/**
 * Turn raw user input into the string handed to FTS5 `MATCH`.
 *
 * The input is broken into whitespace-separated tokens, with `(` and `)`
 * treated as separators so they never reach the FTS5 parser. If the final
 * token has at least three characters and does not already end in `*`, a
 * prefix wildcard is appended to it so partially typed words still match.
 *
 * @param raw - Query text as typed by the user.
 * @returns The tokens joined by single spaces, or `''` when nothing remains.
 */
export function preprocessQuery(raw: string): string {
  // Parentheses become separators; empty fragments from leading/trailing
  // whitespace are discarded.
  const tokens = raw
    .replace(/[()]/g, ' ')
    .split(/\s+/)
    .filter((token) => token.length > 0);

  if (tokens.length === 0) {
    return '';
  }

  const lastIndex = tokens.length - 1;
  const tail = tokens[lastIndex] ?? '';
  const needsWildcard = tail.length >= 3 && !tail.endsWith('*');
  if (needsWildcard) {
    tokens[lastIndex] = `${tail}*`;
  }

  return tokens.join(' ');
}

View File

@@ -0,0 +1,762 @@
/**
* Unit tests for SearchService (TRUEREF-0006).
*
* Uses an in-memory SQLite database seeded with known data to verify
* BM25 snippet search, library search, query preprocessing, and
* response formatting.
*/
import { describe, it, expect, beforeEach } from 'vitest';
import Database from 'better-sqlite3';
import { readFileSync } from 'node:fs';
import { join } from 'node:path';
import { SearchService } from './search.service';
import { preprocessQuery } from './query-preprocessor';
import { computeTrustScore } from './trust-score';
import { formatLibraryResults, formatSnippetResults } from './formatters';
// ---------------------------------------------------------------------------
// In-memory test DB factory
// ---------------------------------------------------------------------------
/**
 * Build a fresh in-memory SQLite database with foreign keys enabled, the
 * base migration applied statement by statement, and the FTS5 script from
 * `../db/fts.sql` executed on top.
 */
function createTestDb(): Database.Database {
  const db = new Database(':memory:');
  db.pragma('foreign_keys = ON');

  // The migration file separates statements with a drizzle breakpoint marker.
  const migrationPath = join(
    join(import.meta.dirname, '../db/migrations'),
    '0000_large_master_chief.sql'
  );
  for (const chunk of readFileSync(migrationPath, 'utf-8').split('--> statement-breakpoint')) {
    const statement = chunk.trim();
    if (statement) {
      db.exec(statement);
    }
  }

  // FTS5 virtual table + triggers are applied as a single script.
  db.exec(readFileSync(join(import.meta.dirname, '../db/fts.sql'), 'utf-8'));

  return db;
}
// ---------------------------------------------------------------------------
// Seed helpers
// ---------------------------------------------------------------------------
/** Unix timestamp (whole seconds) captured once when the module loads. */
const NOW_S = Math.floor(Date.now() / 1000);

/**
 * Insert one row into `repositories` and return its id.
 *
 * Any field missing from `overrides` falls back to a fixed default: id
 * `/test/repo`, title `Test Repo`, source `github`, state `indexed`, zero
 * snippets, zero trust score, and `null` description and stars. The source
 * URL is derived from the id, and both timestamps are `NOW_S`.
 */
function seedRepo(
  client: Database.Database,
  overrides: {
    id?: string;
    title?: string;
    description?: string | null;
    source?: string;
    state?: string;
    total_snippets?: number;
    trust_score?: number;
    stars?: number | null;
  } = {}
) {
  const repoId = overrides.id ?? '/test/repo';

  const insertRepo = client.prepare(
    `INSERT INTO repositories
(id, title, description, source, source_url, state, total_snippets, trust_score, stars, created_at, updated_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
  );

  // Values in the same order as the column list above.
  const row = [
    repoId,
    overrides.title ?? 'Test Repo',
    overrides.description ?? null,
    overrides.source ?? 'github',
    `https://github.com${repoId}`,
    overrides.state ?? 'indexed',
    overrides.total_snippets ?? 0,
    overrides.trust_score ?? 0,
    overrides.stars ?? null,
    NOW_S,
    NOW_S
  ];
  insertRepo.run(...row);

  return repoId;
}
/**
 * Insert a `documents` row for `repositoryId` with a random UUID id, the
 * fixed path `README.md`, the fixed checksum `abc`, and `NOW_S` as the
 * indexed timestamp. Returns the generated document id.
 */
function seedDocument(client: Database.Database, repositoryId: string): string {
  const documentId = crypto.randomUUID();
  const insertDocument = client.prepare(
    `INSERT INTO documents (id, repository_id, file_path, checksum, indexed_at)
VALUES (?, ?, ?, ?, ?)`
  );
  insertDocument.run(documentId, repositoryId, 'README.md', 'abc', NOW_S);
  return documentId;
}
/**
 * Insert a `snippets` row and return its generated UUID.
 *
 * Optional fields default to `null`, except `type`, which defaults to
 * `'info'`. The creation timestamp is always `NOW_S`.
 */
function seedSnippet(
  client: Database.Database,
  opts: {
    repositoryId: string;
    documentId: string;
    content: string;
    title?: string | null;
    breadcrumb?: string | null;
    type?: 'code' | 'info';
    language?: string | null;
    versionId?: string | null;
  }
): string {
  const snippetId = crypto.randomUUID();

  const insertSnippet = client.prepare(
    `INSERT INTO snippets
(id, document_id, repository_id, version_id, type, title, content, language, breadcrumb, created_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
  );

  // Values in the same order as the column list above.
  const row = [
    snippetId,
    opts.documentId,
    opts.repositoryId,
    opts.versionId ?? null,
    opts.type ?? 'info',
    opts.title ?? null,
    opts.content,
    opts.language ?? null,
    opts.breadcrumb ?? null,
    NOW_S
  ];
  insertSnippet.run(...row);

  return snippetId;
}
/**
 * Insert a `repository_versions` row in the `indexed` state and return its
 * id, which is the repository id and the tag joined by `/`.
 */
function seedVersion(client: Database.Database, repositoryId: string, tag: string): string {
  const versionId = `${repositoryId}/${tag}`;
  const insertVersion = client.prepare(
    `INSERT INTO repository_versions (id, repository_id, tag, state, created_at)
VALUES (?, ?, ?, ?, ?)`
  );
  insertVersion.run(versionId, repositoryId, tag, 'indexed', NOW_S);
  return versionId;
}
// ---------------------------------------------------------------------------
// preprocessQuery
// ---------------------------------------------------------------------------
// Pure-function checks for preprocessQuery: whitespace normalization,
// parenthesis stripping, and the last-token prefix wildcard rule.
describe('preprocessQuery', () => {
it('trims and collapses whitespace', () => {
expect(preprocessQuery(' hello world ')).toBe('hello world*');
});
it('removes parentheses', () => {
expect(preprocessQuery('(hello)')).toBe('hello*');
});
it('appends wildcard to last token when >= 3 chars', () => {
expect(preprocessQuery('foo bar baz')).toBe('foo bar baz*');
});
it('does not append wildcard when last token is < 3 chars', () => {
expect(preprocessQuery('foo ba')).toBe('foo ba');
});
it('does not double-append wildcard', () => {
expect(preprocessQuery('hello*')).toBe('hello*');
});
// NOTE(review): only the AND operator is exercised here; OR and NOT are
// named in the title but not asserted.
it('preserves AND / OR / NOT operators', () => {
const result = preprocessQuery('hello AND world');
expect(result).toBe('hello AND world*');
});
it('returns empty string for blank input', () => {
expect(preprocessQuery(' ')).toBe('');
});
it('handles single short token without wildcard', () => {
expect(preprocessQuery('ab')).toBe('ab');
});
});
// ---------------------------------------------------------------------------
// computeTrustScore
// ---------------------------------------------------------------------------
// Checks each trust-score component in isolation by starting from a repo
// that scores 0 (local source, pending state, no description, no stars,
// no snippets) and switching on one attribute at a time.
describe('computeTrustScore', () => {
const now = new Date();
// Builds a repository object with neutral defaults; `overrides` replaces
// individual fields via spread.
function makeRepo(overrides: Record<string, unknown> = {}) {
return {
id: '/test/repo',
title: 'Test',
description: null,
source: 'github' as const,
sourceUrl: 'https://github.com/test/repo',
branch: 'main',
state: 'indexed' as const,
totalSnippets: 0,
totalTokens: 0,
trustScore: 0,
benchmarkScore: 0,
stars: null,
githubToken: null,
lastIndexedAt: null,
createdAt: now,
updatedAt: now,
...overrides
};
}
it('returns 0 for a repo with no qualifying attributes', () => {
const repo = makeRepo({ source: 'local', state: 'pending', description: null, stars: null });
expect(computeTrustScore(repo)).toBe(0);
});
it('awards 1 point for github source', () => {
const repo = makeRepo({ source: 'github', state: 'pending', description: null, stars: null });
expect(computeTrustScore(repo)).toBe(1);
});
it('awards 1 point for indexed state', () => {
const repo = makeRepo({ source: 'local', state: 'indexed', description: null, stars: null });
expect(computeTrustScore(repo)).toBe(1);
});
it('awards 1 point for having a description', () => {
const repo = makeRepo({
source: 'local',
state: 'pending',
description: 'A library',
stars: null
});
expect(computeTrustScore(repo)).toBe(1);
});
// With every component maxed the sum is 4 + 3 + 1 + 1 + 1 = 10, so this
// only asserts the upper bound, not the exact value.
it('caps score at 10', () => {
const repo = makeRepo({
source: 'github',
state: 'indexed',
description: 'A great library',
stars: 1_000_000,
totalSnippets: 10_000
});
expect(computeTrustScore(repo)).toBeLessThanOrEqual(10);
});
it('computes star score on log10 scale', () => {
// 9999 stars: log10(9999 + 1) = log10(10000) = 4 → min(4, 4) = 4
const repo = makeRepo({ source: 'local', state: 'pending', description: null, stars: 9999 });
const score = computeTrustScore(repo);
expect(score).toBeCloseTo(Math.min(4, Math.log10(10000)), 1);
});
it('awards documentation coverage proportionally (500 snippets = 1 pt, 1500 = 3 pts)', () => {
// 500 snippets → min(3, 500/500) = 1.0
const repo500 = makeRepo({
source: 'local',
state: 'pending',
description: null,
stars: null,
totalSnippets: 500
});
expect(computeTrustScore(repo500)).toBeCloseTo(1, 1);
// 1500 snippets → min(3, 1500/500) = 3.0
const repo1500 = makeRepo({
source: 'local',
state: 'pending',
description: null,
stars: null,
totalSnippets: 1500
});
expect(computeTrustScore(repo1500)).toBeCloseTo(3, 1);
});
});
// ---------------------------------------------------------------------------
// SearchService.searchSnippets
// ---------------------------------------------------------------------------
// Integration-style tests for SearchService.searchSnippets against a fresh
// in-memory database. Every test starts with one default repository and one
// document; snippets are seeded per test.
describe('SearchService.searchSnippets', () => {
let client: Database.Database;
let service: SearchService;
let repoId: string;
let docId: string;
beforeEach(() => {
client = createTestDb();
service = new SearchService(client);
repoId = seedRepo(client);
docId = seedDocument(client, repoId);
});
it('returns results matching a simple keyword', () => {
seedSnippet(client, {
repositoryId: repoId,
documentId: docId,
content: 'The quick brown fox jumps over the lazy dog',
title: 'Fox story'
});
const results = service.searchSnippets('fox', { repositoryId: repoId });
expect(results.length).toBeGreaterThan(0);
expect(results[0].snippet.title).toBe('Fox story');
});
it('returns empty array for a blank query', () => {
const results = service.searchSnippets(' ', { repositoryId: repoId });
expect(results).toHaveLength(0);
});
it('returns empty array when no snippets match', () => {
seedSnippet(client, {
repositoryId: repoId,
documentId: docId,
content: 'Hello world'
});
const results = service.searchSnippets('zzznomatch', { repositoryId: repoId });
expect(results).toHaveLength(0);
});
// NOTE(review): `.every` is true for an empty array, so this assertion
// would also pass if the search returned nothing at all.
it('filters by repositoryId — does not return snippets from other repos', () => {
const otherRepoId = seedRepo(client, { id: '/other/repo', title: 'Other Repo' });
const otherDocId = seedDocument(client, otherRepoId);
seedSnippet(client, {
repositoryId: repoId,
documentId: docId,
content: 'TypeScript generics tutorial'
});
seedSnippet(client, {
repositoryId: otherRepoId,
documentId: otherDocId,
content: 'TypeScript generics advanced'
});
const results = service.searchSnippets('TypeScript generics', { repositoryId: repoId });
expect(results.every((r) => r.snippet.repositoryId === repoId)).toBe(true);
});
// NOTE(review): both `.every` checks below pass vacuously on empty results.
it('filters by type when provided', () => {
seedSnippet(client, {
repositoryId: repoId,
documentId: docId,
content: 'TypeScript interface definition',
type: 'info'
});
seedSnippet(client, {
repositoryId: repoId,
documentId: docId,
content: 'TypeScript interface example',
type: 'code',
language: 'typescript'
});
const codeResults = service.searchSnippets('TypeScript interface', {
repositoryId: repoId,
type: 'code'
});
expect(codeResults.every((r) => r.snippet.type === 'code')).toBe(true);
const infoResults = service.searchSnippets('TypeScript interface', {
repositoryId: repoId,
type: 'info'
});
expect(infoResults.every((r) => r.snippet.type === 'info')).toBe(true);
});
// NOTE(review): the `.every` check passes vacuously on empty results.
it('filters by versionId when provided', () => {
const versionId = seedVersion(client, repoId, 'v1.0.0');
seedSnippet(client, {
repositoryId: repoId,
documentId: docId,
content: 'Versioned React hooks documentation',
versionId
});
seedSnippet(client, {
repositoryId: repoId,
documentId: docId,
content: 'React hooks documentation (unversioned)',
versionId: null
});
const results = service.searchSnippets('React hooks', {
repositoryId: repoId,
versionId
});
expect(results.every((r) => r.snippet.versionId === versionId)).toBe(true);
});
// Seeds five snippets that all contain the query terms, then requests two
// pages of two. Only upper bounds and non-overlap are asserted.
it('respects limit and offset', () => {
for (let i = 0; i < 5; i++) {
seedSnippet(client, {
repositoryId: repoId,
documentId: docId,
content: `pagination content item number ${i} relevant`
});
}
const page1 = service.searchSnippets('pagination content', {
repositoryId: repoId,
limit: 2,
offset: 0
});
const page2 = service.searchSnippets('pagination content', {
repositoryId: repoId,
limit: 2,
offset: 2
});
expect(page1.length).toBeLessThanOrEqual(2);
expect(page2.length).toBeLessThanOrEqual(2);
if (page1.length > 0 && page2.length > 0) {
// Pages must not overlap.
const ids1 = new Set(page1.map((r) => r.snippet.id));
expect(page2.some((r) => ids1.has(r.snippet.id))).toBe(false);
}
});
it('returns scores (negative BM25 values)', () => {
seedSnippet(client, {
repositoryId: repoId,
documentId: docId,
content: 'SQLite full text search tutorial'
});
const results = service.searchSnippets('SQLite full text search', { repositoryId: repoId });
expect(results.length).toBeGreaterThan(0);
// BM25 returns negative values for matched documents.
expect(results[0].score).toBeLessThan(0);
});
it('includes repository metadata in results', () => {
seedSnippet(client, {
repositoryId: repoId,
documentId: docId,
content: 'repository metadata check'
});
const results = service.searchSnippets('metadata check', { repositoryId: repoId });
expect(results.length).toBeGreaterThan(0);
expect(results[0].repository.id).toBe(repoId);
expect(results[0].repository.title).toBe('Test Repo');
});
it('uses porter stemmer — matches stemmed forms', () => {
seedSnippet(client, {
repositoryId: repoId,
documentId: docId,
content: 'running tests efficiently'
});
// "run" should match "running" via porter stemmer.
// NOTE(review): "run" is 3 characters, so preprocessQuery turns it into
// "run*"; this match may succeed through prefix expansion alone and does
// not by itself prove the stemmer is configured.
const results = service.searchSnippets('run', { repositoryId: repoId });
expect(results.length).toBeGreaterThan(0);
});
it('uses prefix wildcard — partial word matches', () => {
seedSnippet(client, {
repositoryId: repoId,
documentId: docId,
content: 'authentication middleware pattern'
});
// preprocessQuery appends '*' to the last token when it is >= 3 chars.
const results = service.searchSnippets('authen', { repositoryId: repoId });
expect(results.length).toBeGreaterThan(0);
});
});
// ---------------------------------------------------------------------------
// SearchService.searchRepositories
// ---------------------------------------------------------------------------
// Tests for SearchService.searchRepositories against a fresh in-memory
// database; repositories are seeded per test (seedRepo defaults to the
// 'indexed' state unless overridden).
describe('SearchService.searchRepositories', () => {
let client: Database.Database;
let service: SearchService;
beforeEach(() => {
client = createTestDb();
service = new SearchService(client);
});
// The seeded repo is indexed but neither its title nor id contains "react".
it('returns empty array when no indexed repos match', () => {
seedRepo(client, { id: '/unrelated/lib', title: 'Unrelated Library' });
const results = service.searchRepositories({ libraryName: 'react' });
expect(results).toHaveLength(0);
});
it('finds a repo by title', () => {
seedRepo(client, { id: '/facebook/react', title: 'React', state: 'indexed' });
const results = service.searchRepositories({ libraryName: 'react' });
expect(results.length).toBeGreaterThan(0);
expect(results[0].repository.id).toBe('/facebook/react');
});
// "React" earns exact (100) + prefix (50) points; "Reactive Lib" only prefix.
it('exact match ranks above prefix match', () => {
seedRepo(client, { id: '/facebook/react', title: 'React', state: 'indexed' });
seedRepo(client, { id: '/some/reactive', title: 'Reactive Lib', state: 'indexed' });
const results = service.searchRepositories({ libraryName: 'React' });
expect(results[0].repository.title).toBe('React');
});
it('excludes non-indexed repositories', () => {
seedRepo(client, { id: '/facebook/react', title: 'React', state: 'pending' });
const results = service.searchRepositories({ libraryName: 'react' });
expect(results).toHaveLength(0);
});
it('includes versions in results', () => {
const repoId = seedRepo(client, { id: '/facebook/react', title: 'React', state: 'indexed' });
seedVersion(client, repoId, 'v18.0.0');
seedVersion(client, repoId, 'v17.0.0');
const results = service.searchRepositories({ libraryName: 'react' });
expect(results.length).toBeGreaterThan(0);
expect(results[0].versions.length).toBe(2);
});
// Five matching repos are seeded; only the upper bound is asserted.
it('respects the limit option', () => {
for (let i = 0; i < 5; i++) {
seedRepo(client, {
id: `/test/lib${i}`,
title: `Test Library ${i}`,
state: 'indexed'
});
}
const results = service.searchRepositories({ libraryName: 'library', limit: 2 });
expect(results.length).toBeLessThanOrEqual(2);
});
it('returns a composite score for each result', () => {
seedRepo(client, { id: '/facebook/react', title: 'React', state: 'indexed' });
const results = service.searchRepositories({ libraryName: 'react' });
expect(results.length).toBeGreaterThan(0);
expect(typeof results[0].score).toBe('number');
expect(results[0].score).toBeGreaterThan(0);
});
// Neither the title nor the id contains the search text, so a hit here
// can only come from the description column.
it('matches on repository description', () => {
seedRepo(client, {
id: '/some/lib',
title: 'Some Library',
description: 'A react-compatible UI toolkit',
state: 'indexed'
});
const results = service.searchRepositories({ libraryName: 'react-compatible' });
expect(results.length).toBeGreaterThan(0);
});
});
// ---------------------------------------------------------------------------
// formatLibraryResults
// ---------------------------------------------------------------------------
// Output checks for formatLibraryResults using hand-built result objects
// (no database involved).
describe('formatLibraryResults', () => {
it('returns no-match message for empty results', () => {
expect(formatLibraryResults([])).toBe('No libraries found matching your search.');
});
it('formats a single result with versions', () => {
const now = new Date();
// The parameter type is derived from the function signature so the
// fixture stays in sync with the formatter's input type.
const results: Parameters<typeof formatLibraryResults>[0] = [
{
repository: {
id: '/facebook/react',
title: 'React',
description: 'A JavaScript library for building user interfaces',
source: 'github',
sourceUrl: 'https://github.com/facebook/react',
branch: 'main',
state: 'indexed',
totalSnippets: 1000,
totalTokens: 50000,
trustScore: 8.5,
benchmarkScore: 0,
stars: 200000,
githubToken: null,
lastIndexedAt: null,
createdAt: now,
updatedAt: now
},
versions: [
{
id: '/facebook/react/v18',
repositoryId: '/facebook/react',
tag: 'v18',
title: 'React 18',
state: 'indexed',
totalSnippets: 1000,
indexedAt: null,
createdAt: now
}
],
score: 150
}
];
const output = formatLibraryResults(results);
expect(output).toContain('1. React');
expect(output).toContain('Library ID: /facebook/react');
expect(output).toContain('Snippets: 1000');
expect(output).toContain('Trust Score: 8.5/10');
expect(output).toContain('v18');
});
// An empty versions array makes the formatter fall back to 'default branch'.
it('shows "default branch" when no versions are present', () => {
const now = new Date();
const results: Parameters<typeof formatLibraryResults>[0] = [
{
repository: {
id: '/test/lib',
title: 'Test Lib',
description: null,
source: 'local',
sourceUrl: '/path/to/lib',
branch: 'main',
state: 'indexed',
totalSnippets: 0,
totalTokens: 0,
trustScore: 0,
benchmarkScore: 0,
stars: null,
githubToken: null,
lastIndexedAt: null,
createdAt: now,
updatedAt: now
},
versions: [],
score: 50
}
];
const output = formatLibraryResults(results);
expect(output).toContain('default branch');
});
});
// ---------------------------------------------------------------------------
// formatSnippetResults
// ---------------------------------------------------------------------------
// Output checks for formatSnippetResults using hand-built result objects
// (no database involved).
describe('formatSnippetResults', () => {
const now = new Date();
// Builds one search result with an 'info' snippet titled 'My Title'.
// `overrides` replaces top-level keys only, so a test that overrides
// `snippet` must supply the complete snippet object.
function makeSnippetResult(overrides: Partial<Parameters<typeof formatSnippetResults>[0][number]> = {}): Parameters<typeof formatSnippetResults>[0][number] {
return {
snippet: {
id: crypto.randomUUID(),
documentId: crypto.randomUUID(),
repositoryId: '/test/repo',
versionId: null,
type: 'info',
title: 'My Title',
content: 'Some content here.',
language: null,
breadcrumb: null,
tokenCount: 10,
createdAt: now
},
score: -1.5,
repository: { id: '/test/repo', title: 'Test Repo' },
...overrides
};
}
it('returns empty string for no results and no rules', () => {
expect(formatSnippetResults([])).toBe('');
});
it('prepends library rules when provided', () => {
const output = formatSnippetResults([], ['Use TypeScript', 'Prefer const']);
expect(output).toContain('## Library Rules');
expect(output).toContain('- Use TypeScript');
expect(output).toContain('- Prefer const');
});
it('formats an info snippet with title and breadcrumb', () => {
const result = makeSnippetResult({
snippet: {
id: crypto.randomUUID(),
documentId: crypto.randomUUID(),
repositoryId: '/test/repo',
versionId: null,
type: 'info',
title: 'Getting Started',
content: 'Install the package using npm.',
language: null,
breadcrumb: 'Docs > Intro',
tokenCount: 5,
createdAt: now
}
});
const output = formatSnippetResults([result]);
expect(output).toContain('### Getting Started');
expect(output).toContain('*Docs > Intro*');
expect(output).toContain('Install the package using npm.');
});
it('formats a code snippet with fenced code block', () => {
const result = makeSnippetResult({
snippet: {
id: crypto.randomUUID(),
documentId: crypto.randomUUID(),
repositoryId: '/test/repo',
versionId: null,
type: 'code',
title: 'Example',
content: 'const x = 1;',
language: 'typescript',
breadcrumb: null,
tokenCount: 5,
createdAt: now
}
});
const output = formatSnippetResults([result]);
expect(output).toContain('```typescript');
expect(output).toContain('const x = 1;');
// NOTE(review): already implied by the '```typescript' check above, so
// this does not independently verify the closing fence.
expect(output).toContain('```');
});
it('separates multiple results with horizontal rules', () => {
const r1 = makeSnippetResult();
const r2 = makeSnippetResult();
const output = formatSnippetResults([r1, r2]);
expect(output).toContain('---');
});
// Only the title is checked for absence; the breadcrumb is null too but
// has no dedicated assertion.
it('omits title/breadcrumb lines when they are null', () => {
const result = makeSnippetResult({
snippet: {
id: crypto.randomUUID(),
documentId: crypto.randomUUID(),
repositoryId: '/test/repo',
versionId: null,
type: 'info',
title: null,
content: 'Bare content.',
language: null,
breadcrumb: null,
tokenCount: 3,
createdAt: now
}
});
const output = formatSnippetResults([result]);
expect(output).not.toContain('###');
expect(output).toContain('Bare content.');
});
});

View File

@@ -0,0 +1,310 @@
/**
* SearchService — FTS5-backed full-text search over snippets and repositories.
*
* Implements keyword search using SQLite's built-in BM25 ranking via the
* `bm25()` function exposed by FTS5 virtual tables. Library search uses
* LIKE-based matching on the `repositories` table with a composite relevance
* score.
*/
import type Database from 'better-sqlite3';
import type { Repository, RepositoryVersion, Snippet } from '$lib/types';
import { preprocessQuery } from './query-preprocessor';
// ---------------------------------------------------------------------------
// Public interface types
// ---------------------------------------------------------------------------
/** Filters and paging controls accepted by `SearchService.searchSnippets`. */
export interface SnippetSearchOptions {
/** Repository whose snippets are searched (compared with `snippets.repository_id`). */
repositoryId: string;
/** When set, only snippets whose `version_id` equals this value are returned. */
versionId?: string;
/** When set, restricts results to snippets of this type. */
type?: 'code' | 'info';
/** Number of results to return. Default: 20. */
limit?: number;
/** Number of results to skip. Default: 0. */
offset?: number;
}
/** One hit returned by `SearchService.searchSnippets`. */
export interface SnippetSearchResult {
/** The matched snippet, mapped from the database row to the domain type. */
snippet: Snippet;
/** BM25 rank — negative value; lower (more negative) = more relevant. */
score: number;
/** Id and title of the repository the snippet belongs to. */
repository: Pick<Repository, 'id' | 'title'>;
}
/** Input accepted by `SearchService.searchRepositories`. */
export interface LibrarySearchOptions {
/** Text matched case-insensitively against repository title, id and description. */
libraryName: string;
/** Semantic relevance hint (reserved for future hybrid use). */
query?: string;
/** Number of results to return. Default: 10. */
limit?: number;
}
/** One hit returned by `SearchService.searchRepositories`. */
export interface LibrarySearchResult {
/** The matched repository. */
repository: Repository;
/** All version rows recorded for the repository, newest `created_at` first. */
versions: RepositoryVersion[];
/** Composite relevance score. Higher = more relevant. */
score: number;
}
// ---------------------------------------------------------------------------
// Raw DB row types
// ---------------------------------------------------------------------------
/**
* Raw row returned by the snippet FTS query (snake_case column names).
*
* Combines the selected `snippets` columns with two aliased repository
* columns and the BM25 score computed by the query.
*/
interface RawSnippetRow {
id: string;
document_id: string;
repository_id: string;
version_id: string | null;
type: 'code' | 'info';
title: string | null;
content: string;
language: string | null;
breadcrumb: string | null;
token_count: number | null;
// Multiplied by 1000 when mapped to a Date, i.e. treated as seconds.
created_at: number;
// Aliases of `repositories.id` / `repositories.title` from the join.
repo_id: string;
repo_title: string;
// Value of `bm25(snippets_fts)` for this row.
score: number;
}
/**
* Raw row returned by the library search query.
*
* Holds every `repositories` column (`r.*`) plus the five score components
* computed in the SELECT list.
*/
interface RawRepoRow {
id: string;
title: string;
description: string | null;
source: 'github' | 'local';
source_url: string;
branch: string | null;
state: 'pending' | 'indexing' | 'indexed' | 'error';
total_snippets: number | null;
total_tokens: number | null;
trust_score: number | null;
benchmark_score: number | null;
stars: number | null;
github_token: string | null;
// Timestamps are multiplied by 1000 when mapped to Date, i.e. treated as seconds.
last_indexed_at: number | null;
created_at: number;
updated_at: number;
// 100 when the title equals the search text (case-insensitive), else 0.
exact_match: number;
// 50 when the title starts with the search text, else 0.
prefix_match: number;
// 20 when the description contains the search text, else 0.
desc_match: number;
// total_snippets / 100.0 (NULL counted as 0).
snippet_score: number;
// trust_score * 10 (NULL counted as 0).
trust_component: number;
}
/** Raw row returned by the version query (`SELECT *` on `repository_versions`). */
interface RawVersionRow {
id: string;
repository_id: string;
tag: string;
title: string | null;
state: 'pending' | 'indexing' | 'indexed' | 'error';
total_snippets: number | null;
// Timestamps are multiplied by 1000 when mapped to Date, i.e. treated as seconds.
indexed_at: number | null;
created_at: number;
}
// ---------------------------------------------------------------------------
// Mappers: raw DB rows → domain types
// ---------------------------------------------------------------------------
/**
 * Convert a raw snippet row into the camelCase `Snippet` domain object.
 *
 * Join-only columns (`repo_id`, `repo_title`, `score`) are dropped, and
 * `created_at` (seconds) becomes a `Date`.
 */
function mapSnippet(row: RawSnippetRow): Snippet {
  const { id, type, title, content, language, breadcrumb } = row;
  const createdAt = new Date(row.created_at * 1000);

  return {
    id,
    documentId: row.document_id,
    repositoryId: row.repository_id,
    versionId: row.version_id,
    type,
    title,
    content,
    language,
    breadcrumb,
    tokenCount: row.token_count,
    createdAt
  };
}
/**
 * Convert a raw library-search row into the camelCase `Repository` domain
 * object.
 *
 * The computed score columns are dropped. Second-based timestamps become
 * `Date` values; a falsy `last_indexed_at` (null or 0) maps to `null`.
 */
function mapRepository(row: RawRepoRow): Repository {
  const fromSeconds = (seconds: number): Date => new Date(seconds * 1000);
  const { id, title, description, source, branch, state, stars } = row;

  return {
    id,
    title,
    description,
    source,
    sourceUrl: row.source_url,
    branch,
    state,
    totalSnippets: row.total_snippets,
    totalTokens: row.total_tokens,
    trustScore: row.trust_score,
    benchmarkScore: row.benchmark_score,
    stars,
    githubToken: row.github_token,
    lastIndexedAt: row.last_indexed_at ? fromSeconds(row.last_indexed_at) : null,
    createdAt: fromSeconds(row.created_at),
    updatedAt: fromSeconds(row.updated_at)
  };
}
/**
 * Convert a raw `repository_versions` row into the camelCase
 * `RepositoryVersion` domain object.
 *
 * Second-based timestamps become `Date` values; a falsy `indexed_at`
 * (null or 0) maps to `null`.
 */
function mapVersion(row: RawVersionRow): RepositoryVersion {
  const { id, tag, title, state } = row;
  const indexedAt = row.indexed_at ? new Date(row.indexed_at * 1000) : null;
  const createdAt = new Date(row.created_at * 1000);

  return {
    id,
    repositoryId: row.repository_id,
    tag,
    title,
    state,
    totalSnippets: row.total_snippets,
    indexedAt,
    createdAt
  };
}
// ---------------------------------------------------------------------------
// SearchService
// ---------------------------------------------------------------------------
/**
 * Search facade over a better-sqlite3 connection.
 *
 * Snippet search goes through the `snippets_fts` FTS5 table and is ranked by
 * `bm25()`. Library search uses LIKE matching on `repositories` and a
 * composite score computed in SQL.
 */
export class SearchService {
  constructor(private readonly db: Database.Database) {}

  // -------------------------------------------------------------------------
  // searchSnippets
  // -------------------------------------------------------------------------

  /**
   * Search snippets within a repository using FTS5 BM25 ranking.
   *
   * The query is preprocessed (whitespace normalization + prefix wildcard)
   * before being passed to the MATCH expression. Results are ordered by BM25
   * score ascending (lower = more relevant).
   *
   * @param query - Raw user query; a query that preprocesses to an empty
   *   string returns `[]` without touching the database.
   * @param options - Repository scope plus optional version/type filters and
   *   paging (`limit` defaults to 20, `offset` to 0).
   * @returns Matching snippets with their BM25 score and repository id/title.
   */
  searchSnippets(query: string, options: SnippetSearchOptions): SnippetSearchResult[] {
    const { repositoryId, versionId, type, limit = 20, offset = 0 } = options;

    const processedQuery = preprocessQuery(query);
    if (!processedQuery) return [];

    // Build the WHERE clause dynamically based on optional filters. Only
    // fixed SQL fragments are concatenated; every value is a bound parameter.
    const conditions: string[] = ['snippets_fts MATCH ?', 's.repository_id = ?'];
    const params: unknown[] = [processedQuery, repositoryId];

    if (versionId !== undefined) {
      conditions.push('s.version_id = ?');
      params.push(versionId);
    }
    if (type !== undefined) {
      conditions.push('s.type = ?');
      params.push(type);
    }
    params.push(limit, offset);

    const sql = `
SELECT
s.id,
s.document_id,
s.repository_id,
s.version_id,
s.type,
s.title,
s.content,
s.language,
s.breadcrumb,
s.token_count,
s.created_at,
r.id AS repo_id,
r.title AS repo_title,
bm25(snippets_fts) AS score
FROM snippets_fts
JOIN snippets s ON s.rowid = snippets_fts.rowid
JOIN repositories r ON r.id = s.repository_id
WHERE ${conditions.join(' AND ')}
ORDER BY score ASC
LIMIT ? OFFSET ?
`;

    const rows = this.db.prepare(sql).all(...params) as RawSnippetRow[];
    return rows.map((row) => ({
      snippet: mapSnippet(row),
      score: row.score,
      repository: { id: row.repo_id, title: row.repo_title }
    }));
  }

  // -------------------------------------------------------------------------
  // searchRepositories
  // -------------------------------------------------------------------------

  /**
   * Search repositories by library name using LIKE-based matching.
   *
   * Only repositories in the `indexed` state are considered. The library name
   * is matched literally: `%`, `_` and `\` typed by the user are escaped so
   * they are not interpreted as LIKE wildcards.
   *
   * Applies a composite scoring model:
   * - Exact title match  : 100 pts
   * - Prefix title match : 50 pts
   * - Description match  : 20 pts
   * - Snippet density    : total_snippets / 100
   * - Trust score        : trust_score * 10
   *
   * @param options - Library name to look for and optional `limit`
   *   (default 10).
   * @returns Matching repositories, highest composite score first, each with
   *   its versions.
   */
  searchRepositories(options: LibrarySearchOptions): LibrarySearchResult[] {
    const { libraryName, limit = 10 } = options;

    const literal = SearchService.escapeLikePattern(libraryName);
    const prefixPattern = `${literal}%`;
    const containsPattern = `%${literal}%`;

    const rows = this.db
      .prepare(
        `
SELECT r.*,
CASE WHEN LOWER(r.title) = LOWER(?) THEN 100 ELSE 0 END AS exact_match,
CASE WHEN LOWER(r.title) LIKE LOWER(?) ESCAPE '\\' THEN 50 ELSE 0 END AS prefix_match,
CASE WHEN LOWER(r.description) LIKE LOWER(?) ESCAPE '\\' THEN 20 ELSE 0 END AS desc_match,
(COALESCE(r.total_snippets, 0) / 100.0) AS snippet_score,
COALESCE(r.trust_score, 0) * 10 AS trust_component
FROM repositories r
WHERE r.state = 'indexed'
AND (
LOWER(r.title) LIKE LOWER(?) ESCAPE '\\'
OR LOWER(r.id) LIKE LOWER(?) ESCAPE '\\'
OR LOWER(r.description) LIKE LOWER(?) ESCAPE '\\'
)
ORDER BY (exact_match + prefix_match + desc_match + snippet_score + trust_component) DESC
LIMIT ?
`
      )
      .all(
        libraryName, // exact_match (plain equality, so no escaping)
        prefixPattern, // prefix_match
        containsPattern, // desc_match
        containsPattern, // WHERE title LIKE
        containsPattern, // WHERE id LIKE
        containsPattern, // WHERE description LIKE
        limit
      ) as RawRepoRow[];

    return rows.map((row) => {
      const repository = mapRepository(row);
      const compositeScore =
        row.exact_match +
        row.prefix_match +
        row.desc_match +
        row.snippet_score +
        row.trust_component;
      return {
        repository,
        versions: this.getVersions(row.id),
        score: compositeScore
      };
    });
  }

  // -------------------------------------------------------------------------
  // Private helpers
  // -------------------------------------------------------------------------

  /**
   * Escape the LIKE metacharacters (`%`, `_`) and the escape character itself
   * (`\`) so `value` matches literally in a `LIKE ? ESCAPE '\'` expression.
   */
  private static escapeLikePattern(value: string): string {
    return value.replace(/[\\%_]/g, '\\$&');
  }

  /** Load every version row of a repository, newest `created_at` first. */
  private getVersions(repositoryId: string): RepositoryVersion[] {
    const rows = this.db
      .prepare(
        `SELECT * FROM repository_versions WHERE repository_id = ? ORDER BY created_at DESC`
      )
      .all(repositoryId) as RawVersionRow[];
    return rows.map(mapVersion);
  }
}

View File

@@ -0,0 +1,41 @@
/**
* Trust score computation for repositories.
*
* Produces a composite score in [0, 10] that reflects the credibility and
* completeness of a repository's documentation.
*/
import type { Repository } from '$lib/types';
/**
 * Compute a trust score (0–10) for a repository.
 *
 * Score components:
 * - Stars        : up to 4 points, `log10(stars + 1)` (≈10 k stars = 4 pts)
 * - Doc coverage : up to 3 points, 1 point per 500 snippets (1500 = 3 pts)
 * - Source type  : 1 point for GitHub repos
 * - Indexed state: 1 point when state is "indexed"
 * - Description  : 1 point when a description is present
 *
 * Missing, zero, negative or NaN star/snippet counts contribute nothing, so
 * the result never leaves the [0, 10] range.
 *
 * @param repo - Repository whose attributes are scored.
 * @returns The score rounded to one decimal place.
 */
export function computeTrustScore(repo: Repository): number {
  let score = 0;

  // Stars (up to 4 points): log scale, 10k stars ≈ 4 pts. The `> 0` guard
  // keeps log10 away from zero/negative arguments (-Infinity / NaN).
  const stars = repo.stars ?? 0;
  if (stars > 0) {
    score += Math.min(4, Math.log10(stars + 1));
  }

  // Documentation coverage (up to 3 points): 1 point per 500 snippets.
  const snippets = repo.totalSnippets ?? 0;
  if (snippets > 0) {
    score += Math.min(3, snippets / 500);
  }

  // Source type (1 point for GitHub).
  if (repo.source === 'github') score += 1;

  // Successful indexing (1 point).
  if (repo.state === 'indexed') score += 1;

  // Has description (1 point).
  if (repo.description) score += 1;

  return Math.min(10, parseFloat(score.toFixed(1)));
}