feat(TRUEREF-0006): implement SQLite FTS5 full-text search engine
- BM25 ranking via SQLite FTS5 bm25() function
- Query preprocessor with wildcard expansion and special char escaping
- Library search with composite scoring (name match, trust score, snippet count)
- Trust score computation from stars, coverage, and source type
- Response formatters for library and snippet results

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
310
src/lib/server/search/search.service.ts
Normal file
310
src/lib/server/search/search.service.ts
Normal file
@@ -0,0 +1,310 @@
|
||||
/**
 * SearchService — FTS5-backed full-text search over snippets, plus LIKE-based
 * repository (library) lookup.
 *
 * Snippet search implements keyword search using SQLite's built-in BM25
 * ranking via the `bm25()` function exposed by FTS5 virtual tables. Library
 * search uses LIKE-based matching on the `repositories` table with a
 * composite relevance score.
 */
|
||||
|
||||
import type Database from 'better-sqlite3';
|
||||
import type { Repository, RepositoryVersion, Snippet } from '$lib/types';
|
||||
import { preprocessQuery } from './query-preprocessor';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Public interface types
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/** Filters and paging controls accepted by `SearchService.searchSnippets`. */
export interface SnippetSearchOptions {
	/** Repository whose snippets are searched (compared against `snippets.repository_id`). */
	repositoryId: string;
	/** When provided, only snippets with this `version_id` are returned. */
	versionId?: string;
	/** When provided, only snippets of this type are returned. */
	type?: 'code' | 'info';
	/** Number of results to return. Default: 20. */
	limit?: number;
	/** Number of results to skip. Default: 0. */
	offset?: number;
}
|
||||
|
||||
/** One hit produced by `SearchService.searchSnippets`. */
export interface SnippetSearchResult {
	/** The matched snippet, mapped to the domain type. */
	snippet: Snippet;
	/** BM25 rank — negative value; lower (more negative) = more relevant. */
	score: number;
	/** Identity of the repository that owns the snippet. */
	repository: Pick<Repository, 'id' | 'title'>;
}
|
||||
|
||||
/** Input accepted by `SearchService.searchRepositories`. */
export interface LibrarySearchOptions {
	/** Name (or fragment of a name) to look for in repository title, id and description. */
	libraryName: string;
	/** Semantic relevance hint (reserved for future hybrid use). */
	query?: string;
	/** Number of results to return. Default: 10. */
	limit?: number;
}
|
||||
|
||||
/** One hit produced by `SearchService.searchRepositories`. */
export interface LibrarySearchResult {
	/** The matched repository, mapped to the domain type. */
	repository: Repository;
	/** Versions recorded for the repository. */
	versions: RepositoryVersion[];
	/** Composite relevance score. Higher = more relevant. */
	score: number;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Raw DB row types
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Raw row returned by the snippet FTS query (snake_case column names).
 *
 * Combines the selected `snippets` columns with the joined repository's
 * id/title and the computed `bm25()` score.
 */
interface RawSnippetRow {
	id: string;
	document_id: string;
	repository_id: string;
	version_id: string | null;
	type: 'code' | 'info';
	title: string | null;
	content: string;
	language: string | null;
	breadcrumb: string | null;
	token_count: number | null;
	/** Numeric timestamp; the mapper multiplies it by 1000 before building a `Date`. */
	created_at: number;
	/** `repositories.id` of the joined repository (aliased in the query). */
	repo_id: string;
	/** `repositories.title` of the joined repository (aliased in the query). */
	repo_title: string;
	/** Value of `bm25(snippets_fts)` for this row. */
	score: number;
}
|
||||
|
||||
/**
 * Raw row returned by the library search query.
 *
 * Holds the `repositories` columns (snake_case) followed by the score
 * components that the query computes per row.
 */
interface RawRepoRow {
	id: string;
	title: string;
	description: string | null;
	source: 'github' | 'local';
	source_url: string;
	branch: string | null;
	state: 'pending' | 'indexing' | 'indexed' | 'error';
	total_snippets: number | null;
	total_tokens: number | null;
	trust_score: number | null;
	benchmark_score: number | null;
	stars: number | null;
	github_token: string | null;
	/** Numeric timestamp or null; the mapper multiplies it by 1000 before building a `Date`. */
	last_indexed_at: number | null;
	created_at: number;
	updated_at: number;
	/** 100 when the title equals the search term (case-insensitive), else 0. */
	exact_match: number;
	/** 50 when the title starts with the search term, else 0. */
	prefix_match: number;
	/** 20 when the description contains the search term, else 0. */
	desc_match: number;
	/** `total_snippets / 100` (null counted as 0). */
	snippet_score: number;
	/** `trust_score * 10` (null counted as 0). */
	trust_component: number;
}
|
||||
|
||||
/** Raw row returned by the version query on `repository_versions` (snake_case column names). */
interface RawVersionRow {
	id: string;
	repository_id: string;
	tag: string;
	title: string | null;
	state: 'pending' | 'indexing' | 'indexed' | 'error';
	total_snippets: number | null;
	/** Numeric timestamp or null; the mapper multiplies it by 1000 before building a `Date`. */
	indexed_at: number | null;
	created_at: number;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Mappers: raw DB rows → domain types
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Converts a raw snippet row (snake_case columns) into the `Snippet` domain type.
 *
 * The extra join/score columns on the row (`repo_id`, `repo_title`, `score`)
 * are intentionally not copied; callers read them from the row directly.
 *
 * @param row - Row produced by the snippet FTS query.
 * @returns The snippet with camelCase fields and `createdAt` as a `Date`.
 */
function mapSnippet(row: RawSnippetRow): Snippet {
	return {
		id: row.id,
		documentId: row.document_id,
		repositoryId: row.repository_id,
		versionId: row.version_id,
		type: row.type,
		title: row.title,
		content: row.content,
		language: row.language,
		breadcrumb: row.breadcrumb,
		tokenCount: row.token_count,
		// The stored value is scaled by 1000 to get the milliseconds `Date` expects.
		createdAt: new Date(row.created_at * 1000)
	};
}
|
||||
|
||||
/**
 * Converts a raw repository row (snake_case columns) into the `Repository` domain type.
 *
 * The computed scoring columns on the row are not part of the result.
 *
 * @param row - Row produced by the library search query.
 * @returns The repository with camelCase fields and timestamps as `Date` objects.
 */
function mapRepository(row: RawRepoRow): Repository {
	return {
		id: row.id,
		title: row.title,
		description: row.description,
		source: row.source,
		sourceUrl: row.source_url,
		branch: row.branch,
		state: row.state,
		totalSnippets: row.total_snippets,
		totalTokens: row.total_tokens,
		trustScore: row.trust_score,
		benchmarkScore: row.benchmark_score,
		stars: row.stars,
		githubToken: row.github_token,
		// Explicit nullish check: a truthiness test would also turn a stored 0 into null.
		lastIndexedAt: row.last_indexed_at != null ? new Date(row.last_indexed_at * 1000) : null,
		// Stored values are scaled by 1000 to get the milliseconds `Date` expects.
		createdAt: new Date(row.created_at * 1000),
		updatedAt: new Date(row.updated_at * 1000)
	};
}
|
||||
|
||||
/**
 * Converts a raw `repository_versions` row into the `RepositoryVersion` domain type.
 *
 * @param row - Row produced by the version query.
 * @returns The version with camelCase fields and timestamps as `Date` objects.
 */
function mapVersion(row: RawVersionRow): RepositoryVersion {
	return {
		id: row.id,
		repositoryId: row.repository_id,
		tag: row.tag,
		title: row.title,
		state: row.state,
		totalSnippets: row.total_snippets,
		// Explicit nullish check: a truthiness test would also turn a stored 0 into null.
		indexedAt: row.indexed_at != null ? new Date(row.indexed_at * 1000) : null,
		// The stored value is scaled by 1000 to get the milliseconds `Date` expects.
		createdAt: new Date(row.created_at * 1000)
	};
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// SearchService
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Search facade over the SQLite database handle passed to the constructor.
 *
 * - `searchSnippets` runs an FTS5 `MATCH` query ranked by `bm25()`.
 * - `searchRepositories` runs a LIKE-based lookup with a composite score.
 */
export class SearchService {
	constructor(private readonly db: Database.Database) {}

	// -------------------------------------------------------------------------
	// searchSnippets
	// -------------------------------------------------------------------------

	/**
	 * Search snippets within a repository using FTS5 BM25 ranking.
	 *
	 * The query is run through `preprocessQuery` before being bound to the
	 * MATCH expression. Results are ordered by BM25 score ascending
	 * (lower = more relevant).
	 *
	 * @param query - Raw user query.
	 * @param options - Repository scope, optional version/type filters and paging.
	 * @returns Matching snippets with their score and owning repository; an
	 *   empty array when the preprocessed query is empty.
	 */
	searchSnippets(query: string, options: SnippetSearchOptions): SnippetSearchResult[] {
		const { repositoryId, versionId, type, limit = 20, offset = 0 } = options;

		const processedQuery = preprocessQuery(query);
		if (!processedQuery) return [];

		// Build the WHERE clause dynamically based on optional filters. Only
		// fixed SQL fragments are concatenated; every value is bound as a parameter.
		const conditions: string[] = ['snippets_fts MATCH ?', 's.repository_id = ?'];
		const params: unknown[] = [processedQuery, repositoryId];

		if (versionId !== undefined) {
			conditions.push('s.version_id = ?');
			params.push(versionId);
		}

		if (type !== undefined) {
			conditions.push('s.type = ?');
			params.push(type);
		}

		// LIMIT / OFFSET placeholders come last in the statement.
		params.push(limit, offset);

		const sql = `
			SELECT
				s.id,
				s.document_id,
				s.repository_id,
				s.version_id,
				s.type,
				s.title,
				s.content,
				s.language,
				s.breadcrumb,
				s.token_count,
				s.created_at,
				r.id    AS repo_id,
				r.title AS repo_title,
				bm25(snippets_fts) AS score
			FROM snippets_fts
			JOIN snippets     s ON s.rowid = snippets_fts.rowid
			JOIN repositories r ON r.id    = s.repository_id
			WHERE ${conditions.join(' AND ')}
			ORDER BY score ASC
			LIMIT ? OFFSET ?
		`;

		const rows = this.db.prepare(sql).all(...params) as RawSnippetRow[];

		return rows.map((row) => ({
			snippet: mapSnippet(row),
			score: row.score,
			repository: { id: row.repo_id, title: row.repo_title }
		}));
	}

	// -------------------------------------------------------------------------
	// searchRepositories
	// -------------------------------------------------------------------------

	/**
	 * Search repositories by library name using LIKE-based matching.
	 *
	 * Only repositories in state `indexed` whose title, id or description
	 * contains the name are considered. `%`, `_` and `\` in the name are
	 * matched literally rather than as LIKE wildcards.
	 *
	 * Applies a composite scoring model (components are summed):
	 *  - Exact title match  : 100 pts
	 *  - Prefix title match :  50 pts (an exact match also counts as a prefix match)
	 *  - Description match  :  20 pts
	 *  - Snippet density    : total_snippets / 100
	 *  - Trust score        : trust_score * 10
	 *
	 * @param options - Library name and optional result limit (default 10).
	 * @returns Matching repositories with their versions, highest score first.
	 */
	searchRepositories(options: LibrarySearchOptions): LibrarySearchResult[] {
		const { libraryName, limit = 10 } = options;

		// User input must not be able to inject LIKE wildcards into the patterns.
		const literalName = SearchService.escapeLikePattern(libraryName);
		const prefixPattern = `${literalName}%`;
		const containsPattern = `%${literalName}%`;

		// `ESCAPE '\\'` in this template literal reaches SQLite as ESCAPE '\'.
		const sql = `
			SELECT r.*,
				CASE WHEN LOWER(r.title) = LOWER(?) THEN 100 ELSE 0 END AS exact_match,
				CASE WHEN LOWER(r.title) LIKE LOWER(?) ESCAPE '\\' THEN 50 ELSE 0 END AS prefix_match,
				CASE WHEN LOWER(r.description) LIKE LOWER(?) ESCAPE '\\' THEN 20 ELSE 0 END AS desc_match,
				(COALESCE(r.total_snippets, 0) / 100.0) AS snippet_score,
				COALESCE(r.trust_score, 0) * 10 AS trust_component
			FROM repositories r
			WHERE r.state = 'indexed'
				AND (
					LOWER(r.title) LIKE LOWER(?) ESCAPE '\\'
					OR LOWER(r.id) LIKE LOWER(?) ESCAPE '\\'
					OR LOWER(r.description) LIKE LOWER(?) ESCAPE '\\'
				)
			ORDER BY (exact_match + prefix_match + desc_match + snippet_score + trust_component) DESC
			LIMIT ?
		`;

		const rows = this.db.prepare(sql).all(
			libraryName, // exact_match (plain equality, so no escaping)
			prefixPattern, // prefix_match
			containsPattern, // desc_match
			containsPattern, // WHERE title LIKE
			containsPattern, // WHERE id LIKE
			containsPattern, // WHERE description LIKE
			limit
		) as RawRepoRow[];

		return rows.map((row) => {
			const repository = mapRepository(row);
			// Same sum the query uses in its ORDER BY.
			const compositeScore =
				row.exact_match + row.prefix_match + row.desc_match + row.snippet_score + row.trust_component;
			return {
				repository,
				versions: this.getVersions(row.id),
				score: compositeScore
			};
		});
	}

	// -------------------------------------------------------------------------
	// Private helpers
	// -------------------------------------------------------------------------

	/**
	 * Prefixes the LIKE metacharacters `%` and `_`, and the escape character
	 * `\` itself, with a backslash so the value matches literally when the
	 * pattern is used with `ESCAPE '\'`.
	 */
	private static escapeLikePattern(value: string): string {
		return value.replace(/[\\%_]/g, '\\$&');
	}

	/** Loads all versions of a repository, newest `created_at` first. */
	private getVersions(repositoryId: string): RepositoryVersion[] {
		const rows = this.db
			.prepare(
				`SELECT * FROM repository_versions WHERE repository_id = ? ORDER BY created_at DESC`
			)
			.all(repositoryId) as RawVersionRow[];
		return rows.map(mapVersion);
	}
}
|
||||
Reference in New Issue
Block a user