/**
 * Format library search results for the `resolve-library-id` MCP tool.
 *
 * Every result becomes a numbered entry listing the library ID, description,
 * snippet count, trust score (one decimal, out of 10) and the available
 * version tags. Entries are separated by a blank line.
 *
 * @param results - Library search results, in the order they should be listed.
 * @returns The formatted listing, or a fixed "no libraries found" message when
 *   `results` is empty.
 */
export function formatLibraryResults(results: LibrarySearchResult[]): string {
	if (results.length === 0) {
		return 'No libraries found matching your search.';
	}

	return results
		.map(({ repository: repo, versions }, index) => {
			// `??` alone would let an empty or whitespace-only description through
			// and render a dangling "Description:" label, so blank text also falls
			// back to the placeholder. Non-blank descriptions are emitted untouched.
			const description = repo.description?.trim() ? repo.description : 'No description';
			// An empty tag list means only the default branch has been indexed.
			const versionTags = versions.map((version) => version.tag).join(', ') || 'default branch';
			const trustScore = (repo.trustScore ?? 0).toFixed(1);

			return [
				`${index + 1}. ${repo.title}`,
				` Library ID: ${repo.id}`,
				` Description: ${description}`,
				` Snippets: ${repo.totalSnippets ?? 0} | Trust Score: ${trustScore}/10`,
				` Available Versions: ${versionTags}`
			].join('\n');
		})
		.join('\n\n');
}
/**
 * Wrap `content` in a Markdown fenced code block.
 *
 * The fence is three backticks unless the content itself contains a run of
 * three or more backticks; in that case the fence is made one backtick longer
 * than the longest run so the content cannot terminate the block early.
 */
function fenceCode(content: string, language: string): string {
	let longestRun = 0;
	for (const run of content.match(/`+/g) ?? []) {
		longestRun = Math.max(longestRun, run.length);
	}
	const fence = '`'.repeat(Math.max(3, longestRun + 1));
	return `${fence}${language}\n${content}\n${fence}`;
}

/**
 * Format snippet search results for the `query-docs` MCP tool.
 *
 * Output layout: an optional "## Library Rules" bullet list, followed by one
 * section per snippet (optional `### title`, optional `*breadcrumb*`, then the
 * content). Code snippets are wrapped in a fenced block tagged with the
 * snippet language; all other snippets are emitted as-is. Sections are joined
 * with a Markdown horizontal rule.
 *
 * @param results - Ranked snippet search results.
 * @param rules - Optional repository rules injected before the snippets.
 * @returns The Markdown document; an empty string when there is nothing to show.
 */
export function formatSnippetResults(results: SnippetSearchResult[], rules?: string[]): string {
	const parts: string[] = [];

	// Prepend repository rules when present.
	if (rules?.length) {
		parts.push('## Library Rules\n' + rules.map((rule) => `- ${rule}`).join('\n'));
	}

	for (const { snippet } of results) {
		const body =
			snippet.type === 'code'
				? fenceCode(snippet.content, snippet.language ?? '')
				: snippet.content;

		const lines = [
			snippet.title ? `### ${snippet.title}` : '',
			snippet.breadcrumb ? `*${snippet.breadcrumb}*` : '',
			body
		];

		// Drop the heading/breadcrumb placeholders (and empty prose) before joining.
		parts.push(lines.filter(Boolean).join('\n'));
	}

	return parts.join('\n\n---\n\n');
}
/** FTS5 boolean operators. They are case-sensitive: lowercase `and` is an ordinary term. */
const FTS5_BOOLEAN_OPERATORS = new Set(['AND', 'OR', 'NOT']);

/**
 * Preprocess a raw search query string for FTS5 MATCH.
 *
 * Steps:
 *  1. Treat the FTS5 grouping characters `(` and `)` as separators and split
 *     the input on whitespace, which also trims and collapses whitespace runs.
 *  2. Drop boolean operators left dangling at the end of the query
 *     (`hello AND` while the user is still typing). An operator without a
 *     right-hand operand is always an FTS5 syntax error, and appending `*`
 *     to it (`AND*`) does not make it valid.
 *  3. Append a prefix wildcard `*` to the last token when it is >= 3 characters
 *     and does not already end with `*`. This gives a "typing as you go" feel.
 *
 * @param raw - Unprocessed user input.
 * @returns The normalized MATCH expression, or an empty string when no
 *   searchable token remains.
 */
export function preprocessQuery(raw: string): string {
	// 1. Parentheses become separators; split() + filter() removes all blank pieces.
	const tokens = raw
		.replace(/[()]/g, ' ')
		.split(/\s+/)
		.filter(Boolean);

	// 2. Trim dangling trailing operators.
	let end = tokens.length;
	while (end > 0 && FTS5_BOOLEAN_OPERATORS.has(tokens[end - 1] ?? '')) {
		end--;
	}

	const kept = tokens.slice(0, end);
	const lastToken = kept.pop();
	if (lastToken === undefined) return '';

	// 3. Add prefix wildcard to the last token.
	const needsWildcard = lastToken.length >= 3 && !lastToken.endsWith('*');
	kept.push(needsWildcard ? `${lastToken}*` : lastToken);

	return kept.join(' ');
}
/**
 * Insert one row into `repositories` and return its id.
 *
 * Any column not supplied through `overrides` gets a fixed default
 * (id `/test/repo`, title `Test Repo`, source `github`, state `indexed`,
 * zero counters, null description/stars). `source_url` is derived from the id,
 * and both timestamps are set to `NOW_S`.
 */
function seedRepo(
	client: Database.Database,
	overrides: {
		id?: string;
		title?: string;
		description?: string | null;
		source?: string;
		state?: string;
		total_snippets?: number;
		trust_score?: number;
		stars?: number | null;
	} = {}
) {
	const id = overrides.id ?? '/test/repo';

	const insert = client.prepare(
		`INSERT INTO repositories
       (id, title, description, source, source_url, state, total_snippets, trust_score, stars, created_at, updated_at)
     VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
	);

	// Values are listed in the same order as the column list above.
	const values = [
		id,
		overrides.title ?? 'Test Repo',
		overrides.description ?? null,
		overrides.source ?? 'github',
		`https://github.com${id}`,
		overrides.state ?? 'indexed',
		overrides.total_snippets ?? 0,
		overrides.trust_score ?? 0,
		overrides.stars ?? null,
		NOW_S,
		NOW_S
	];
	insert.run(...values);

	return id;
}
/**
 * Build a repository fixture for `computeTrustScore`.
 *
 * The defaults describe an indexed GitHub repository with no description,
 * no stars and no snippets; `overrides` is spread last so any field can be
 * replaced per test.
 */
function makeRepo(overrides: Record<string, unknown> = {}) {
	return {
		id: '/test/repo',
		title: 'Test',
		description: null,
		source: 'github' as const,
		sourceUrl: 'https://github.com/test/repo',
		branch: 'main',
		state: 'indexed' as const,
		totalSnippets: 0,
		totalTokens: 0,
		trustScore: 0,
		benchmarkScore: 0,
		stars: null,
		githubToken: null,
		lastIndexedAt: null,
		createdAt: now,
		updatedAt: now,
		...overrides
	};
}
const repo1500 = makeRepo({ + source: 'local', + state: 'pending', + description: null, + stars: null, + totalSnippets: 1500 + }); + expect(computeTrustScore(repo1500)).toBeCloseTo(3, 1); + }); +}); + +// --------------------------------------------------------------------------- +// SearchService.searchSnippets +// --------------------------------------------------------------------------- + +describe('SearchService.searchSnippets', () => { + let client: Database.Database; + let service: SearchService; + let repoId: string; + let docId: string; + + beforeEach(() => { + client = createTestDb(); + service = new SearchService(client); + + repoId = seedRepo(client); + docId = seedDocument(client, repoId); + }); + + it('returns results matching a simple keyword', () => { + seedSnippet(client, { + repositoryId: repoId, + documentId: docId, + content: 'The quick brown fox jumps over the lazy dog', + title: 'Fox story' + }); + + const results = service.searchSnippets('fox', { repositoryId: repoId }); + expect(results.length).toBeGreaterThan(0); + expect(results[0].snippet.title).toBe('Fox story'); + }); + + it('returns empty array for a blank query', () => { + const results = service.searchSnippets(' ', { repositoryId: repoId }); + expect(results).toHaveLength(0); + }); + + it('returns empty array when no snippets match', () => { + seedSnippet(client, { + repositoryId: repoId, + documentId: docId, + content: 'Hello world' + }); + + const results = service.searchSnippets('zzznomatch', { repositoryId: repoId }); + expect(results).toHaveLength(0); + }); + + it('filters by repositoryId — does not return snippets from other repos', () => { + const otherRepoId = seedRepo(client, { id: '/other/repo', title: 'Other Repo' }); + const otherDocId = seedDocument(client, otherRepoId); + + seedSnippet(client, { + repositoryId: repoId, + documentId: docId, + content: 'TypeScript generics tutorial' + }); + seedSnippet(client, { + repositoryId: otherRepoId, + documentId: otherDocId, + 
content: 'TypeScript generics advanced' + }); + + const results = service.searchSnippets('TypeScript generics', { repositoryId: repoId }); + expect(results.every((r) => r.snippet.repositoryId === repoId)).toBe(true); + }); + + it('filters by type when provided', () => { + seedSnippet(client, { + repositoryId: repoId, + documentId: docId, + content: 'TypeScript interface definition', + type: 'info' + }); + seedSnippet(client, { + repositoryId: repoId, + documentId: docId, + content: 'TypeScript interface example', + type: 'code', + language: 'typescript' + }); + + const codeResults = service.searchSnippets('TypeScript interface', { + repositoryId: repoId, + type: 'code' + }); + expect(codeResults.every((r) => r.snippet.type === 'code')).toBe(true); + + const infoResults = service.searchSnippets('TypeScript interface', { + repositoryId: repoId, + type: 'info' + }); + expect(infoResults.every((r) => r.snippet.type === 'info')).toBe(true); + }); + + it('filters by versionId when provided', () => { + const versionId = seedVersion(client, repoId, 'v1.0.0'); + + seedSnippet(client, { + repositoryId: repoId, + documentId: docId, + content: 'Versioned React hooks documentation', + versionId + }); + seedSnippet(client, { + repositoryId: repoId, + documentId: docId, + content: 'React hooks documentation (unversioned)', + versionId: null + }); + + const results = service.searchSnippets('React hooks', { + repositoryId: repoId, + versionId + }); + expect(results.every((r) => r.snippet.versionId === versionId)).toBe(true); + }); + + it('respects limit and offset', () => { + for (let i = 0; i < 5; i++) { + seedSnippet(client, { + repositoryId: repoId, + documentId: docId, + content: `pagination content item number ${i} relevant` + }); + } + + const page1 = service.searchSnippets('pagination content', { + repositoryId: repoId, + limit: 2, + offset: 0 + }); + const page2 = service.searchSnippets('pagination content', { + repositoryId: repoId, + limit: 2, + offset: 2 + }); + + 
expect(page1.length).toBeLessThanOrEqual(2); + expect(page2.length).toBeLessThanOrEqual(2); + if (page1.length > 0 && page2.length > 0) { + // Pages must not overlap. + const ids1 = new Set(page1.map((r) => r.snippet.id)); + expect(page2.some((r) => ids1.has(r.snippet.id))).toBe(false); + } + }); + + it('returns scores (negative BM25 values)', () => { + seedSnippet(client, { + repositoryId: repoId, + documentId: docId, + content: 'SQLite full text search tutorial' + }); + + const results = service.searchSnippets('SQLite full text search', { repositoryId: repoId }); + expect(results.length).toBeGreaterThan(0); + // BM25 returns negative values for matched documents. + expect(results[0].score).toBeLessThan(0); + }); + + it('includes repository metadata in results', () => { + seedSnippet(client, { + repositoryId: repoId, + documentId: docId, + content: 'repository metadata check' + }); + + const results = service.searchSnippets('metadata check', { repositoryId: repoId }); + expect(results.length).toBeGreaterThan(0); + expect(results[0].repository.id).toBe(repoId); + expect(results[0].repository.title).toBe('Test Repo'); + }); + + it('uses porter stemmer — matches stemmed forms', () => { + seedSnippet(client, { + repositoryId: repoId, + documentId: docId, + content: 'running tests efficiently' + }); + + // "run" should match "running" via porter stemmer. + const results = service.searchSnippets('run', { repositoryId: repoId }); + expect(results.length).toBeGreaterThan(0); + }); + + it('uses prefix wildcard — partial word matches', () => { + seedSnippet(client, { + repositoryId: repoId, + documentId: docId, + content: 'authentication middleware pattern' + }); + + // preprocessQuery appends '*' to tokens >= 3 chars. 
+ const results = service.searchSnippets('authen', { repositoryId: repoId }); + expect(results.length).toBeGreaterThan(0); + }); +}); + +// --------------------------------------------------------------------------- +// SearchService.searchRepositories +// --------------------------------------------------------------------------- + +describe('SearchService.searchRepositories', () => { + let client: Database.Database; + let service: SearchService; + + beforeEach(() => { + client = createTestDb(); + service = new SearchService(client); + }); + + it('returns empty array when no indexed repos match', () => { + seedRepo(client, { id: '/unrelated/lib', title: 'Unrelated Library' }); + + const results = service.searchRepositories({ libraryName: 'react' }); + expect(results).toHaveLength(0); + }); + + it('finds a repo by title', () => { + seedRepo(client, { id: '/facebook/react', title: 'React', state: 'indexed' }); + + const results = service.searchRepositories({ libraryName: 'react' }); + expect(results.length).toBeGreaterThan(0); + expect(results[0].repository.id).toBe('/facebook/react'); + }); + + it('exact match ranks above prefix match', () => { + seedRepo(client, { id: '/facebook/react', title: 'React', state: 'indexed' }); + seedRepo(client, { id: '/some/reactive', title: 'Reactive Lib', state: 'indexed' }); + + const results = service.searchRepositories({ libraryName: 'React' }); + expect(results[0].repository.title).toBe('React'); + }); + + it('excludes non-indexed repositories', () => { + seedRepo(client, { id: '/facebook/react', title: 'React', state: 'pending' }); + + const results = service.searchRepositories({ libraryName: 'react' }); + expect(results).toHaveLength(0); + }); + + it('includes versions in results', () => { + const repoId = seedRepo(client, { id: '/facebook/react', title: 'React', state: 'indexed' }); + seedVersion(client, repoId, 'v18.0.0'); + seedVersion(client, repoId, 'v17.0.0'); + + const results = service.searchRepositories({ 
// Verifies the Markdown listing produced by formatLibraryResults.
// Fixtures are typed via `Parameters<typeof formatLibraryResults>[0]` so they
// stay in sync with the formatter's input type without importing it by name.
describe('formatLibraryResults', () => {
	it('returns no-match message for empty results', () => {
		expect(formatLibraryResults([])).toBe('No libraries found matching your search.');
	});

	it('formats a single result with versions', () => {
		const now = new Date();
		const results: Parameters<typeof formatLibraryResults>[0] = [
			{
				repository: {
					id: '/facebook/react',
					title: 'React',
					description: 'A JavaScript library for building user interfaces',
					source: 'github',
					sourceUrl: 'https://github.com/facebook/react',
					branch: 'main',
					state: 'indexed',
					totalSnippets: 1000,
					totalTokens: 50000,
					trustScore: 8.5,
					benchmarkScore: 0,
					stars: 200000,
					githubToken: null,
					lastIndexedAt: null,
					createdAt: now,
					updatedAt: now
				},
				versions: [
					{
						id: '/facebook/react/v18',
						repositoryId: '/facebook/react',
						tag: 'v18',
						title: 'React 18',
						state: 'indexed',
						totalSnippets: 1000,
						indexedAt: null,
						createdAt: now
					}
				],
				score: 150
			}
		];

		const output = formatLibraryResults(results);
		expect(output).toContain('1. React');
		expect(output).toContain('Library ID: /facebook/react');
		expect(output).toContain('Snippets: 1000');
		expect(output).toContain('Trust Score: 8.5/10');
		expect(output).toContain('v18');
	});

	it('shows "default branch" when no versions are present', () => {
		const now = new Date();
		const results: Parameters<typeof formatLibraryResults>[0] = [
			{
				repository: {
					id: '/test/lib',
					title: 'Test Lib',
					description: null,
					source: 'local',
					sourceUrl: '/path/to/lib',
					branch: 'main',
					state: 'indexed',
					totalSnippets: 0,
					totalTokens: 0,
					trustScore: 0,
					benchmarkScore: 0,
					stars: null,
					githubToken: null,
					lastIndexedAt: null,
					createdAt: now,
					updatedAt: now
				},
				versions: [],
				score: 50
			}
		];

		const output = formatLibraryResults(results);
		expect(output).toContain('default branch');
	});
});
/**
 * Build a snippet search result fixture for `formatSnippetResults`.
 *
 * Defaults to an untyped-language `info` snippet titled "My Title" with no
 * breadcrumb. `overrides` replaces whole top-level fields (`snippet`, `score`,
 * `repository`) — it is a shallow spread, so a custom `snippet` must be complete.
 */
function makeSnippetResult(
	overrides: Partial<Parameters<typeof formatSnippetResults>[0][number]> = {}
): Parameters<typeof formatSnippetResults>[0][number] {
	return {
		snippet: {
			id: crypto.randomUUID(),
			documentId: crypto.randomUUID(),
			repositoryId: '/test/repo',
			versionId: null,
			type: 'info',
			title: 'My Title',
			content: 'Some content here.',
			language: null,
			breadcrumb: null,
			tokenCount: 10,
			createdAt: now
		},
		score: -1.5,
		repository: { id: '/test/repo', title: 'Test Repo' },
		...overrides
	};
}
expect(formatSnippetResults([])).toBe(''); + }); + + it('prepends library rules when provided', () => { + const output = formatSnippetResults([], ['Use TypeScript', 'Prefer const']); + expect(output).toContain('## Library Rules'); + expect(output).toContain('- Use TypeScript'); + expect(output).toContain('- Prefer const'); + }); + + it('formats an info snippet with title and breadcrumb', () => { + const result = makeSnippetResult({ + snippet: { + id: crypto.randomUUID(), + documentId: crypto.randomUUID(), + repositoryId: '/test/repo', + versionId: null, + type: 'info', + title: 'Getting Started', + content: 'Install the package using npm.', + language: null, + breadcrumb: 'Docs > Intro', + tokenCount: 5, + createdAt: now + } + }); + + const output = formatSnippetResults([result]); + expect(output).toContain('### Getting Started'); + expect(output).toContain('*Docs > Intro*'); + expect(output).toContain('Install the package using npm.'); + }); + + it('formats a code snippet with fenced code block', () => { + const result = makeSnippetResult({ + snippet: { + id: crypto.randomUUID(), + documentId: crypto.randomUUID(), + repositoryId: '/test/repo', + versionId: null, + type: 'code', + title: 'Example', + content: 'const x = 1;', + language: 'typescript', + breadcrumb: null, + tokenCount: 5, + createdAt: now + } + }); + + const output = formatSnippetResults([result]); + expect(output).toContain('```typescript'); + expect(output).toContain('const x = 1;'); + expect(output).toContain('```'); + }); + + it('separates multiple results with horizontal rules', () => { + const r1 = makeSnippetResult(); + const r2 = makeSnippetResult(); + const output = formatSnippetResults([r1, r2]); + expect(output).toContain('---'); + }); + + it('omits title/breadcrumb lines when they are null', () => { + const result = makeSnippetResult({ + snippet: { + id: crypto.randomUUID(), + documentId: crypto.randomUUID(), + repositoryId: '/test/repo', + versionId: null, + type: 'info', + title: null, 
/** A single ranked hit returned by `SearchService.searchSnippets`. */
export interface SnippetSearchResult {
	/** The matched snippet. */
	snippet: Snippet;
	/** BM25 rank — negative value; lower (more negative) = more relevant. */
	score: number;
	/** Identifying fields (`id` and `title`) of the repository that owns the snippet. */
	repository: Pick<Repository, 'id' | 'title'>;
}
*/ + score: number; +} + +// --------------------------------------------------------------------------- +// Raw DB row types +// --------------------------------------------------------------------------- + +/** Raw row returned by the snippet FTS query (snake_case column names). */ +interface RawSnippetRow { + id: string; + document_id: string; + repository_id: string; + version_id: string | null; + type: 'code' | 'info'; + title: string | null; + content: string; + language: string | null; + breadcrumb: string | null; + token_count: number | null; + created_at: number; + repo_id: string; + repo_title: string; + score: number; +} + +/** Raw row returned by the library search query. */ +interface RawRepoRow { + id: string; + title: string; + description: string | null; + source: 'github' | 'local'; + source_url: string; + branch: string | null; + state: 'pending' | 'indexing' | 'indexed' | 'error'; + total_snippets: number | null; + total_tokens: number | null; + trust_score: number | null; + benchmark_score: number | null; + stars: number | null; + github_token: string | null; + last_indexed_at: number | null; + created_at: number; + updated_at: number; + exact_match: number; + prefix_match: number; + desc_match: number; + snippet_score: number; + trust_component: number; +} + +/** Raw row returned by the version query. 
/** Convert a Unix timestamp expressed in seconds (as stored in the DB rows) to a `Date`. */
function fromEpochSeconds(seconds: number): Date {
	return new Date(seconds * 1000);
}

/** Map a raw snippet FTS row (snake_case columns) to the `Snippet` domain type. */
function mapSnippet(row: RawSnippetRow): Snippet {
	return {
		id: row.id,
		documentId: row.document_id,
		repositoryId: row.repository_id,
		versionId: row.version_id,
		type: row.type,
		title: row.title,
		content: row.content,
		language: row.language,
		breadcrumb: row.breadcrumb,
		tokenCount: row.token_count,
		createdAt: fromEpochSeconds(row.created_at)
	};
}

/**
 * Map a raw library-search row to the `Repository` domain type.
 *
 * The scoring columns carried by the row (`exact_match`, `prefix_match`, …)
 * are intentionally not copied onto the result.
 */
function mapRepository(row: RawRepoRow): Repository {
	return {
		id: row.id,
		title: row.title,
		description: row.description,
		source: row.source,
		sourceUrl: row.source_url,
		branch: row.branch,
		state: row.state,
		totalSnippets: row.total_snippets,
		totalTokens: row.total_tokens,
		trustScore: row.trust_score,
		benchmarkScore: row.benchmark_score,
		stars: row.stars,
		githubToken: row.github_token,
		// Explicit null check: a truthiness test would turn the valid timestamp 0 into null.
		lastIndexedAt: row.last_indexed_at == null ? null : fromEpochSeconds(row.last_indexed_at),
		createdAt: fromEpochSeconds(row.created_at),
		updatedAt: fromEpochSeconds(row.updated_at)
	};
}

/** Map a raw `repository_versions` row to the `RepositoryVersion` domain type. */
function mapVersion(row: RawVersionRow): RepositoryVersion {
	return {
		id: row.id,
		repositoryId: row.repository_id,
		tag: row.tag,
		title: row.title,
		state: row.state,
		totalSnippets: row.total_snippets,
		// Explicit null check: a truthiness test would turn the valid timestamp 0 into null.
		indexedAt: row.indexed_at == null ? null : fromEpochSeconds(row.indexed_at),
		createdAt: fromEpochSeconds(row.created_at)
	};
}
/**
 * Escape the LIKE metacharacters `%` and `_` (and the escape character itself)
 * so that user input is matched literally. Must be paired with `ESCAPE '\'`
 * in the SQL statement.
 */
function escapeLikePattern(value: string): string {
	return value.replace(/[\\%_]/g, '\\$&');
}

/**
 * FTS5-backed search over snippets plus LIKE-based search over repositories.
 *
 * All queries run synchronously against the database handle supplied to the
 * constructor.
 */
export class SearchService {
	constructor(private readonly db: Database.Database) {}

	// -------------------------------------------------------------------------
	// searchSnippets
	// -------------------------------------------------------------------------

	/**
	 * Search snippets within a repository using FTS5 BM25 ranking.
	 *
	 * The query is preprocessed by `preprocessQuery` before being passed to the
	 * MATCH expression. Results are ordered by BM25 score ascending
	 * (lower = more relevant).
	 *
	 * @param query - Raw user query.
	 * @param options - Repository scope, optional version/type filters and paging
	 *   (`limit` defaults to 20, `offset` to 0).
	 * @returns Matching snippets with their score and owning repository; an empty
	 *   array when the preprocessed query is empty.
	 */
	searchSnippets(query: string, options: SnippetSearchOptions): SnippetSearchResult[] {
		const { repositoryId, versionId, type, limit = 20, offset = 0 } = options;

		const processedQuery = preprocessQuery(query);
		if (!processedQuery) return [];

		// Build the WHERE clause dynamically based on optional filters.
		// `conditions` and `params` must be extended in lockstep.
		const conditions: string[] = ['snippets_fts MATCH ?', 's.repository_id = ?'];
		const params: unknown[] = [processedQuery, repositoryId];

		if (versionId !== undefined) {
			conditions.push('s.version_id = ?');
			params.push(versionId);
		}

		if (type !== undefined) {
			conditions.push('s.type = ?');
			params.push(type);
		}

		// LIMIT / OFFSET placeholders come last in the statement.
		params.push(limit, offset);

		const sql = `
      SELECT
        s.id,
        s.document_id,
        s.repository_id,
        s.version_id,
        s.type,
        s.title,
        s.content,
        s.language,
        s.breadcrumb,
        s.token_count,
        s.created_at,
        r.id    AS repo_id,
        r.title AS repo_title,
        bm25(snippets_fts) AS score
      FROM snippets_fts
      JOIN snippets s     ON s.rowid = snippets_fts.rowid
      JOIN repositories r ON r.id = s.repository_id
      WHERE ${conditions.join(' AND ')}
      ORDER BY score ASC
      LIMIT ? OFFSET ?
    `;

		const rows = this.db.prepare(sql).all(...params) as RawSnippetRow[];

		return rows.map((row) => ({
			snippet: mapSnippet(row),
			score: row.score,
			repository: { id: row.repo_id, title: row.repo_title }
		}));
	}

	// -------------------------------------------------------------------------
	// searchRepositories
	// -------------------------------------------------------------------------

	/**
	 * Search indexed repositories by library name using LIKE-based matching.
	 *
	 * `libraryName` is matched literally: `%`, `_` and `\` in the input are
	 * escaped before being embedded in the LIKE patterns.
	 *
	 * Applies a composite scoring model:
	 *  - Exact title match  : 100 pts
	 *  - Prefix title match :  50 pts
	 *  - Description match  :  20 pts
	 *  - Snippet density    :  total_snippets / 100
	 *  - Trust score        :  trust_score * 10
	 *
	 * @param options - Library name and optional `limit` (default 10).
	 * @returns Repositories whose title, id or description contains the name,
	 *   highest composite score first, each with its versions attached.
	 */
	searchRepositories(options: LibrarySearchOptions): LibrarySearchResult[] {
		const { libraryName, limit = 10 } = options;

		const literal = escapeLikePattern(libraryName);
		const prefixPattern = `${literal}%`;
		const containsPattern = `%${literal}%`;

		const rows = this.db
			.prepare(
				`
        SELECT r.*,
          CASE WHEN LOWER(r.title) = LOWER(?) THEN 100 ELSE 0 END AS exact_match,
          CASE WHEN LOWER(r.title) LIKE LOWER(?) ESCAPE '\\' THEN 50 ELSE 0 END AS prefix_match,
          CASE WHEN LOWER(r.description) LIKE LOWER(?) ESCAPE '\\' THEN 20 ELSE 0 END AS desc_match,
          (COALESCE(r.total_snippets, 0) / 100.0) AS snippet_score,
          COALESCE(r.trust_score, 0) * 10 AS trust_component
        FROM repositories r
        WHERE r.state = 'indexed'
          AND (
            LOWER(r.title) LIKE LOWER(?) ESCAPE '\\'
            OR LOWER(r.id) LIKE LOWER(?) ESCAPE '\\'
            OR LOWER(r.description) LIKE LOWER(?) ESCAPE '\\'
          )
        ORDER BY (exact_match + prefix_match + desc_match + snippet_score + trust_component) DESC
        LIMIT ?
      `
			)
			.all(
				libraryName, // exact_match (plain equality, so no escaping)
				prefixPattern, // prefix_match
				containsPattern, // desc_match
				containsPattern, // WHERE title LIKE
				containsPattern, // WHERE id LIKE
				containsPattern, // WHERE description LIKE
				limit
			) as RawRepoRow[];

		return rows.map((row) => {
			// Recomputed from the row's score columns so the value reported to the
			// caller is the same expression the query ordered by.
			const compositeScore =
				row.exact_match +
				row.prefix_match +
				row.desc_match +
				row.snippet_score +
				row.trust_component;
			return {
				repository: mapRepository(row),
				versions: this.getVersions(row.id),
				score: compositeScore
			};
		});
	}

	// -------------------------------------------------------------------------
	// Private helpers
	// -------------------------------------------------------------------------

	/** Load all versions of a repository, newest (`created_at`) first. */
	private getVersions(repositoryId: string): RepositoryVersion[] {
		const rows = this.db
			.prepare(
				`SELECT * FROM repository_versions WHERE repository_id = ? ORDER BY created_at DESC`
			)
			.all(repositoryId) as RawVersionRow[];
		return rows.map(mapVersion);
	}
}
+ if (repo.stars) { + score += Math.min(4, Math.log10(repo.stars + 1)); + } + + // Documentation coverage (up to 3 points). + score += Math.min(3, (repo.totalSnippets ?? 0) / 500); + + // Source type (1 point for GitHub). + if (repo.source === 'github') score += 1; + + // Successful indexing (1 point). + if (repo.state === 'indexed') score += 1; + + // Has description (1 point). + if (repo.description) score += 1; + + return Math.min(10, parseFloat(score.toFixed(1))); +}