Files
trueref/src/lib/server/search/search.service.ts
Giancarmine Salucci 33bdf30709 feat(TRUEREF-0006): implement SQLite FTS5 full-text search engine
- BM25 ranking via SQLite FTS5 bm25() function
- Query preprocessor with wildcard expansion and special char escaping
- Library search with composite scoring (name match, trust score, snippet count)
- Trust score computation from stars, coverage, and source type
- Response formatters for library and snippet results

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-23 09:06:18 +01:00

311 lines
9.0 KiB
TypeScript

/**
* SearchService — FTS5-backed full-text search over snippets and repositories.
*
* Implements keyword search using SQLite's built-in BM25 ranking via the
* `bm25()` function exposed by FTS5 virtual tables. Library search uses
* LIKE-based matching on the `repositories` table with a composite relevance
* score.
*/
import type Database from 'better-sqlite3';
import type { Repository, RepositoryVersion, Snippet } from '$lib/types';
import { preprocessQuery } from './query-preprocessor';
// ---------------------------------------------------------------------------
// Public interface types
// ---------------------------------------------------------------------------
/**
* Filters and paging controls accepted by `SearchService.searchSnippets`.
*/
export interface SnippetSearchOptions {
/** Repository whose snippets are searched; always applied as a filter. */
repositoryId: string;
/** When provided, only snippets with this exact version id are returned. */
versionId?: string;
/** When provided, only snippets of this type are returned. */
type?: 'code' | 'info';
/** Number of results to return. Default: 20. */
limit?: number;
/** Number of results to skip. Default: 0. */
offset?: number;
}
/**
* One hit returned by `SearchService.searchSnippets`.
*/
export interface SnippetSearchResult {
/** The matching snippet, mapped from its database row. */
snippet: Snippet;
/** BM25 rank — negative value; lower (more negative) = more relevant. */
score: number;
/** Id and title of the repository row joined to the snippet. */
repository: Pick<Repository, 'id' | 'title'>;
}
/**
* Input accepted by `SearchService.searchRepositories`.
*/
export interface LibrarySearchOptions {
/** Text matched (case-insensitively, via LIKE) against repository title, id and description. */
libraryName: string;
/**
* Semantic relevance hint (reserved for future hybrid use).
* Not read by `searchRepositories` in this file.
*/
query?: string;
/** Number of results to return. Default: 10. */
limit?: number;
}
/**
* One hit returned by `SearchService.searchRepositories`.
*/
export interface LibrarySearchResult {
/** The matching repository, mapped from its database row. */
repository: Repository;
/** Every `repository_versions` row for this repository, newest `created_at` first. */
versions: RepositoryVersion[];
/** Composite relevance score. Higher = more relevant. */
score: number;
}
// ---------------------------------------------------------------------------
// Raw DB row types
// ---------------------------------------------------------------------------
/**
* Raw row returned by the snippet FTS query (snake_case column names).
*
* The first eleven fields are columns of the `snippets` table; `repo_id` and
* `repo_title` are aliases for columns of the joined `repositories` row, and
* `score` is the value of `bm25(snippets_fts)`.
*/
interface RawSnippetRow {
id: string;
document_id: string;
repository_id: string;
version_id: string | null;
type: 'code' | 'info';
title: string | null;
content: string;
language: string | null;
breadcrumb: string | null;
token_count: number | null;
/** Multiplied by 1000 by `mapSnippet` before building a `Date` (i.e. treated as epoch seconds). */
created_at: number;
/** `repositories.id` of the joined repository. */
repo_id: string;
/** `repositories.title` of the joined repository. */
repo_title: string;
/** BM25 rank computed by FTS5 for this row. */
score: number;
}
/**
* Raw row returned by the library search query.
*
* Consists of every `repositories` column (`SELECT r.*`) plus the five
* computed score components appended by the query.
*/
interface RawRepoRow {
id: string;
title: string;
description: string | null;
source: 'github' | 'local';
source_url: string;
branch: string | null;
state: 'pending' | 'indexing' | 'indexed' | 'error';
total_snippets: number | null;
total_tokens: number | null;
trust_score: number | null;
benchmark_score: number | null;
stars: number | null;
github_token: string | null;
/** Multiplied by 1000 by `mapRepository` before building a `Date`; may be NULL. */
last_indexed_at: number | null;
/** Multiplied by 1000 by `mapRepository` before building a `Date`. */
created_at: number;
/** Multiplied by 1000 by `mapRepository` before building a `Date`. */
updated_at: number;
/** 100 when the lowercased title equals the lowercased search term, else 0. */
exact_match: number;
/** 50 when the lowercased title matches the prefix pattern, else 0. */
prefix_match: number;
/** 20 when the lowercased description matches the contains pattern, else 0. */
desc_match: number;
/** `COALESCE(total_snippets, 0) / 100.0`. */
snippet_score: number;
/** `COALESCE(trust_score, 0) * 10`. */
trust_component: number;
}
/**
* Raw row returned by the version query
* (`SELECT * FROM repository_versions WHERE repository_id = ?`).
*/
interface RawVersionRow {
id: string;
repository_id: string;
tag: string;
title: string | null;
state: 'pending' | 'indexing' | 'indexed' | 'error';
total_snippets: number | null;
/** Multiplied by 1000 by `mapVersion` before building a `Date`; may be NULL. */
indexed_at: number | null;
/** Multiplied by 1000 by `mapVersion` before building a `Date`. */
created_at: number;
}
// ---------------------------------------------------------------------------
// Mappers: raw DB rows → domain types
// ---------------------------------------------------------------------------
/**
 * Convert a raw snippet row (snake_case columns) into the `Snippet` domain shape.
 *
 * `created_at` is multiplied by 1000 before being handed to `Date`, i.e. the
 * stored value is treated as epoch seconds. The join/score columns carried by
 * the row (`repo_id`, `repo_title`, `score`) are not copied.
 *
 * @param row - Row produced by the snippet FTS query.
 * @returns A new `Snippet` object.
 */
function mapSnippet(row: RawSnippetRow): Snippet {
  const {
    id,
    document_id: documentId,
    repository_id: repositoryId,
    version_id: versionId,
    type,
    title,
    content,
    language,
    breadcrumb,
    token_count: tokenCount,
    created_at: createdAtSeconds
  } = row;

  // Date expects milliseconds; the column value is scaled up from seconds.
  const createdAt = new Date(createdAtSeconds * 1000);

  return {
    id,
    documentId,
    repositoryId,
    versionId,
    type,
    title,
    content,
    language,
    breadcrumb,
    tokenCount,
    createdAt
  };
}
/**
 * Convert a raw library-search row into the `Repository` domain shape.
 *
 * Timestamp columns are multiplied by 1000 before being handed to `Date`
 * (i.e. treated as epoch seconds). The computed score columns on the row are
 * not copied.
 *
 * NOTE(review): `github_token` is copied through as `githubToken`; callers
 * presumably need to strip it before exposing results to clients — confirm.
 *
 * @param row - Row produced by the library search query.
 * @returns A new `Repository` object.
 */
function mapRepository(row: RawRepoRow): Repository {
  return {
    id: row.id,
    title: row.title,
    description: row.description,
    source: row.source,
    sourceUrl: row.source_url,
    branch: row.branch,
    state: row.state,
    totalSnippets: row.total_snippets,
    totalTokens: row.total_tokens,
    trustScore: row.trust_score,
    benchmarkScore: row.benchmark_score,
    stars: row.stars,
    githubToken: row.github_token,
    // Explicit nullish check: a truthiness test would turn a stored timestamp
    // of 0 into null.
    lastIndexedAt: row.last_indexed_at == null ? null : new Date(row.last_indexed_at * 1000),
    createdAt: new Date(row.created_at * 1000),
    updatedAt: new Date(row.updated_at * 1000)
  };
}
/**
 * Convert a raw `repository_versions` row into the `RepositoryVersion` domain shape.
 *
 * Timestamp columns are multiplied by 1000 before being handed to `Date`
 * (i.e. treated as epoch seconds).
 *
 * @param row - Row produced by the version query.
 * @returns A new `RepositoryVersion` object.
 */
function mapVersion(row: RawVersionRow): RepositoryVersion {
  return {
    id: row.id,
    repositoryId: row.repository_id,
    tag: row.tag,
    title: row.title,
    state: row.state,
    totalSnippets: row.total_snippets,
    // Explicit nullish check: a truthiness test would turn a stored timestamp
    // of 0 into null.
    indexedAt: row.indexed_at == null ? null : new Date(row.indexed_at * 1000),
    createdAt: new Date(row.created_at * 1000)
  };
}
// ---------------------------------------------------------------------------
// SearchService
// ---------------------------------------------------------------------------
/**
 * Read-only search facade over a better-sqlite3 connection.
 *
 * - `searchSnippets` runs an FTS5 `MATCH` against `snippets_fts`, ranked by `bm25()`.
 * - `searchRepositories` runs LIKE-based matching on `repositories` with a
 *   composite score computed in SQL.
 */
export class SearchService {
  /**
   * @param db - better-sqlite3 database handle used for every query.
   */
  constructor(private readonly db: Database.Database) {}

  // -------------------------------------------------------------------------
  // searchSnippets
  // -------------------------------------------------------------------------

  /**
   * Search snippets within a repository using FTS5 BM25 ranking.
   *
   * The query is passed through `preprocessQuery` before being bound to the
   * MATCH expression; when the preprocessed query is empty (falsy) no SQL is
   * run and an empty array is returned. Results are ordered by BM25 score
   * ascending (lower = more relevant).
   *
   * @param query - Raw user query text.
   * @param options - Repository filter (required), optional version/type
   *   filters, and paging (`limit` default 20, `offset` default 0).
   * @returns Matching snippets with their BM25 score and owning repository.
   */
  searchSnippets(query: string, options: SnippetSearchOptions): SnippetSearchResult[] {
    const { repositoryId, versionId, type, limit = 20, offset = 0 } = options;

    const processedQuery = preprocessQuery(query);
    if (!processedQuery) return [];

    // Build the WHERE clause dynamically based on optional filters. Only
    // fixed SQL fragments are concatenated; every value is bound via `?`.
    const conditions: string[] = ['snippets_fts MATCH ?', 's.repository_id = ?'];
    const params: unknown[] = [processedQuery, repositoryId];

    if (versionId !== undefined) {
      conditions.push('s.version_id = ?');
      params.push(versionId);
    }
    if (type !== undefined) {
      conditions.push('s.type = ?');
      params.push(type);
    }

    // LIMIT / OFFSET placeholders come last in the statement.
    params.push(limit, offset);

    const sql = `
      SELECT
        s.id,
        s.document_id,
        s.repository_id,
        s.version_id,
        s.type,
        s.title,
        s.content,
        s.language,
        s.breadcrumb,
        s.token_count,
        s.created_at,
        r.id AS repo_id,
        r.title AS repo_title,
        bm25(snippets_fts) AS score
      FROM snippets_fts
      JOIN snippets s ON s.rowid = snippets_fts.rowid
      JOIN repositories r ON r.id = s.repository_id
      WHERE ${conditions.join(' AND ')}
      ORDER BY score ASC
      LIMIT ? OFFSET ?
    `;

    const rows = this.db.prepare(sql).all(...params) as RawSnippetRow[];

    return rows.map((row) => ({
      snippet: mapSnippet(row),
      score: row.score,
      repository: { id: row.repo_id, title: row.repo_title }
    }));
  }

  // -------------------------------------------------------------------------
  // searchRepositories
  // -------------------------------------------------------------------------

  /**
   * Search repositories by library name using LIKE-based matching.
   *
   * Only repositories in state `'indexed'` whose title, id or description
   * contains `libraryName` (case-insensitively) are returned. `libraryName`
   * is matched literally: `%`, `_` and `\` in it are escaped so they are not
   * interpreted as LIKE wildcards.
   *
   * Applies a composite scoring model (components are summed):
   * - Exact title match   : 100 pts
   * - Prefix title match  : 50 pts (an exact match also satisfies this)
   * - Description match   : 20 pts
   * - Snippet density     : total_snippets / 100
   * - Trust score         : trust_score * 10
   *
   * NOTE(review): the query selects `r.*`, so `github_token` is carried into
   * the returned `Repository` objects — confirm callers strip it before
   * exposing results.
   *
   * @param options - Library name to look for and optional `limit` (default 10).
   *   `options.query` is not used.
   * @returns Repositories ordered by composite score descending, each with
   *   its versions (one extra query per returned repository).
   */
  searchRepositories(options: LibrarySearchOptions): LibrarySearchResult[] {
    const { libraryName, limit = 10 } = options;

    // Neutralise LIKE metacharacters in user input; paired with ESCAPE '\'
    // in the SQL below.
    const literalName = SearchService.escapeLikePattern(libraryName);
    const prefixPattern = `${literalName}%`;
    const containsPattern = `%${literalName}%`;

    // `'\\'` in this template literal yields the one-character SQL string '\'.
    const rows = this.db
      .prepare(
        `
        SELECT r.*,
          CASE WHEN LOWER(r.title) = LOWER(?) THEN 100 ELSE 0 END AS exact_match,
          CASE WHEN LOWER(r.title) LIKE LOWER(?) ESCAPE '\\' THEN 50 ELSE 0 END AS prefix_match,
          CASE WHEN LOWER(r.description) LIKE LOWER(?) ESCAPE '\\' THEN 20 ELSE 0 END AS desc_match,
          (COALESCE(r.total_snippets, 0) / 100.0) AS snippet_score,
          COALESCE(r.trust_score, 0) * 10 AS trust_component
        FROM repositories r
        WHERE r.state = 'indexed'
          AND (
            LOWER(r.title) LIKE LOWER(?) ESCAPE '\\'
            OR LOWER(r.id) LIKE LOWER(?) ESCAPE '\\'
            OR LOWER(r.description) LIKE LOWER(?) ESCAPE '\\'
          )
        ORDER BY (exact_match + prefix_match + desc_match + snippet_score + trust_component) DESC
        LIMIT ?
        `
      )
      .all(
        libraryName, // exact_match (plain equality — no escaping needed)
        prefixPattern, // prefix_match
        containsPattern, // desc_match
        containsPattern, // WHERE title LIKE
        containsPattern, // WHERE id LIKE
        containsPattern, // WHERE description LIKE
        limit
      ) as RawRepoRow[];

    return rows.map((row) => {
      const repository = mapRepository(row);
      // Recomputed from the row's components; same expression as the ORDER BY.
      const compositeScore =
        row.exact_match +
        row.prefix_match +
        row.desc_match +
        row.snippet_score +
        row.trust_component;
      return {
        repository,
        versions: this.getVersions(row.id),
        score: compositeScore
      };
    });
  }

  // -------------------------------------------------------------------------
  // Private helpers
  // -------------------------------------------------------------------------

  /**
   * Escape `\`, `%` and `_` with a leading backslash so the text matches
   * literally inside a `LIKE ... ESCAPE '\'` pattern.
   */
  private static escapeLikePattern(text: string): string {
    return text.replace(/[\\%_]/g, '\\$&');
  }

  /**
   * Load every version row for a repository, newest `created_at` first.
   *
   * @param repositoryId - Value matched against `repository_versions.repository_id`.
   */
  private getVersions(repositoryId: string): RepositoryVersion[] {
    const rows = this.db
      .prepare(
        `SELECT * FROM repository_versions WHERE repository_id = ? ORDER BY created_at DESC`
      )
      .all(repositoryId) as RawVersionRow[];
    return rows.map(mapVersion);
  }
}