- BM25 ranking via SQLite FTS5 bm25() function - Query preprocessor with wildcard expansion and special char escaping - Library search with composite scoring (name match, trust score, snippet count) - Trust score computation from stars, coverage, and source type - Response formatters for library and snippet results Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
311 lines
9.0 KiB
TypeScript
311 lines
9.0 KiB
TypeScript
/**
 * SearchService — FTS5-backed full-text search over snippets and repositories.
 *
 * Implements keyword search using SQLite's built-in BM25 ranking via the
 * `bm25()` function exposed by FTS5 virtual tables. Library search uses
 * LIKE-based matching on the `repositories` table with a composite relevance
 * score.
 */
|
|
|
|
import type Database from 'better-sqlite3';
|
|
import type { Repository, RepositoryVersion, Snippet } from '$lib/types';
|
|
import { preprocessQuery } from './query-preprocessor';
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Public interface types
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Filters and paging controls accepted by `SearchService.searchSnippets`.
 */
export interface SnippetSearchOptions {
	/** Repository whose snippets are searched. Always applied as a filter. */
	repositoryId: string;

	/** When provided, only snippets with this version id are returned. */
	versionId?: string;

	/** When provided, only snippets of this type are returned. */
	type?: 'code' | 'info';

	/** Number of results to return. Default: 20. */
	limit?: number;

	/** Number of results to skip. Default: 0. */
	offset?: number;
}
|
|
|
|
/**
 * A single hit produced by a snippet search.
 */
export interface SnippetSearchResult {
	/** The matched snippet, mapped to the domain type. */
	snippet: Snippet;

	/** BM25 rank — negative value; lower (more negative) = more relevant. */
	score: number;

	/** Id and title of the repository the snippet belongs to. */
	repository: Pick<Repository, 'id' | 'title'>;
}
|
|
|
|
/**
 * Input accepted by `SearchService.searchRepositories`.
 */
export interface LibrarySearchOptions {
	/** Library name matched against repository title, id and description. */
	libraryName: string;

	/** Semantic relevance hint (reserved for future hybrid use). */
	query?: string;

	/** Number of results to return. Default: 10. */
	limit?: number;
}
|
|
|
|
/**
 * A single repository hit produced by a library search.
 */
export interface LibrarySearchResult {
	/** The matched repository, mapped to the domain type. */
	repository: Repository;

	/** Versions recorded for the matched repository. */
	versions: RepositoryVersion[];

	/** Composite relevance score. Higher = more relevant. */
	score: number;
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Raw DB row types
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Shape of one row produced by the snippet FTS query (snake_case column
 * names, exactly as selected in the SQL).
 */
interface RawSnippetRow {
	// --- columns selected from `snippets` -----------------------------------
	id: string;
	document_id: string;
	repository_id: string;
	version_id: string | null;
	type: 'code' | 'info';
	title: string | null;
	content: string;
	language: string | null;
	breadcrumb: string | null;
	token_count: number | null;
	/** Creation time; the mapper multiplies it by 1000 before building a Date. */
	created_at: number;

	// --- columns selected from the joined `repositories` row -----------------
	repo_id: string;
	repo_title: string;

	// --- computed column -----------------------------------------------------
	/** Value of `bm25(snippets_fts)` for this row. */
	score: number;
}
|
|
|
|
/**
 * Shape of one row produced by the library search query: the `repositories`
 * columns (selected via `r.*`) plus the computed score components.
 */
interface RawRepoRow {
	// --- columns of `repositories` ------------------------------------------
	id: string;
	title: string;
	description: string | null;
	source: 'github' | 'local';
	source_url: string;
	branch: string | null;
	state: 'pending' | 'indexing' | 'indexed' | 'error';
	total_snippets: number | null;
	total_tokens: number | null;
	trust_score: number | null;
	benchmark_score: number | null;
	stars: number | null;
	github_token: string | null;
	/** Null when unset; otherwise multiplied by 1000 by the mapper to build a Date. */
	last_indexed_at: number | null;
	created_at: number;
	updated_at: number;

	// --- score components computed by the query -----------------------------
	/** 100 when the title equals the search term (case-insensitive), else 0. */
	exact_match: number;
	/** 50 when the title starts with the search term, else 0. */
	prefix_match: number;
	/** 20 when the description contains the search term, else 0. */
	desc_match: number;
	/** `total_snippets / 100.0`, with a null count treated as 0. */
	snippet_score: number;
	/** `trust_score * 10`, with a null trust score treated as 0. */
	trust_component: number;
}
|
|
|
|
/**
 * Shape of one row read from `repository_versions` by the version query
 * (snake_case column names).
 */
interface RawVersionRow {
	id: string;
	repository_id: string;
	tag: string;
	title: string | null;
	state: 'pending' | 'indexing' | 'indexed' | 'error';
	total_snippets: number | null;
	/** Null when unset; otherwise multiplied by 1000 by the mapper to build a Date. */
	indexed_at: number | null;
	/** Creation time; the version query sorts on it, newest first. */
	created_at: number;
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Mappers: raw DB rows → domain types
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Converts a raw snippet search row into the camelCase `Snippet` domain object.
 *
 * Only snippet columns are copied; the joined repository columns and the
 * BM25 score carried by the row are left out.
 *
 * @param row - Row returned by the snippet FTS query.
 * @returns The snippet, with `created_at` converted to a `Date`.
 */
function mapSnippet(row: RawSnippetRow): Snippet {
	const {
		id,
		document_id: documentId,
		repository_id: repositoryId,
		version_id: versionId,
		type,
		title,
		content,
		language,
		breadcrumb,
		token_count: tokenCount,
		created_at: createdAtSeconds
	} = row;

	// The stored value is scaled by 1000 to get the milliseconds Date expects.
	const createdAt = new Date(createdAtSeconds * 1000);

	return {
		id,
		documentId,
		repositoryId,
		versionId,
		type,
		title,
		content,
		language,
		breadcrumb,
		tokenCount,
		createdAt
	};
}
|
|
|
|
/**
 * Converts a raw library search row into the camelCase `Repository` domain
 * object.
 *
 * The computed score columns on the row (`exact_match`, `prefix_match`, …)
 * are not copied; callers read them from the row directly.
 *
 * @param row - Row returned by the library search query.
 * @returns The repository, with timestamp columns converted to `Date`s.
 */
function mapRepository(row: RawRepoRow): Repository {
	return {
		id: row.id,
		title: row.title,
		description: row.description,
		source: row.source,
		sourceUrl: row.source_url,
		branch: row.branch,
		state: row.state,
		totalSnippets: row.total_snippets,
		totalTokens: row.total_tokens,
		trustScore: row.trust_score,
		benchmarkScore: row.benchmark_score,
		stars: row.stars,
		// NOTE(review): the stored token is copied through unchanged — confirm
		// that callers strip it before a repository leaves the server.
		githubToken: row.github_token,
		// Test for a number instead of truthiness: a stored 0 is a valid
		// timestamp and must not collapse to null.
		lastIndexedAt:
			typeof row.last_indexed_at === 'number' ? new Date(row.last_indexed_at * 1000) : null,
		// Stored values are scaled by 1000 to get the milliseconds Date expects.
		createdAt: new Date(row.created_at * 1000),
		updatedAt: new Date(row.updated_at * 1000)
	};
}
|
|
|
|
/**
 * Converts a raw `repository_versions` row into the camelCase
 * `RepositoryVersion` domain object.
 *
 * @param row - Row returned by the version query.
 * @returns The version, with timestamp columns converted to `Date`s.
 */
function mapVersion(row: RawVersionRow): RepositoryVersion {
	return {
		id: row.id,
		repositoryId: row.repository_id,
		tag: row.tag,
		title: row.title,
		state: row.state,
		totalSnippets: row.total_snippets,
		// Test for a number instead of truthiness: a stored 0 is a valid
		// timestamp and must not collapse to null.
		indexedAt: typeof row.indexed_at === 'number' ? new Date(row.indexed_at * 1000) : null,
		// Stored value is scaled by 1000 to get the milliseconds Date expects.
		createdAt: new Date(row.created_at * 1000)
	};
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// SearchService
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Keyword search over indexed snippets (FTS5 + BM25) and name-based search
 * over repositories (LIKE matching with a composite score).
 *
 * Every query goes through the database handle supplied to the constructor.
 */
export class SearchService {
	constructor(private readonly db: Database.Database) {}

	// -------------------------------------------------------------------------
	// searchSnippets
	// -------------------------------------------------------------------------

	/**
	 * Search snippets within a repository using FTS5 BM25 ranking.
	 *
	 * The query is run through `preprocessQuery` before being bound to the
	 * MATCH expression. Results are ordered by BM25 score ascending
	 * (lower = more relevant).
	 *
	 * @param query - Raw user query.
	 * @param options - Repository to search plus optional version/type filters
	 *   and paging (`limit` defaults to 20, `offset` to 0).
	 * @returns Matching snippets with their BM25 score and owning repository;
	 *   an empty array when the preprocessed query is empty.
	 */
	searchSnippets(query: string, options: SnippetSearchOptions): SnippetSearchResult[] {
		const { repositoryId, versionId, type, limit = 20, offset = 0 } = options;

		const processedQuery = preprocessQuery(query);
		if (!processedQuery) return [];

		// Build the WHERE clause dynamically based on optional filters. Only
		// fixed SQL fragments are concatenated; every value is a bound parameter.
		const conditions: string[] = ['snippets_fts MATCH ?', 's.repository_id = ?'];
		const params: unknown[] = [processedQuery, repositoryId];

		if (versionId !== undefined) {
			conditions.push('s.version_id = ?');
			params.push(versionId);
		}

		if (type !== undefined) {
			conditions.push('s.type = ?');
			params.push(type);
		}

		// LIMIT and OFFSET are the last two placeholders in the statement.
		params.push(limit, offset);

		const sql = `
			SELECT
				s.id,
				s.document_id,
				s.repository_id,
				s.version_id,
				s.type,
				s.title,
				s.content,
				s.language,
				s.breadcrumb,
				s.token_count,
				s.created_at,
				r.id AS repo_id,
				r.title AS repo_title,
				bm25(snippets_fts) AS score
			FROM snippets_fts
			JOIN snippets s ON s.rowid = snippets_fts.rowid
			JOIN repositories r ON r.id = s.repository_id
			WHERE ${conditions.join(' AND ')}
			ORDER BY score ASC
			LIMIT ? OFFSET ?
		`;

		const rows = this.db.prepare(sql).all(...params) as RawSnippetRow[];

		return rows.map((row) => ({
			snippet: mapSnippet(row),
			score: row.score,
			repository: { id: row.repo_id, title: row.repo_title }
		}));
	}

	// -------------------------------------------------------------------------
	// searchRepositories
	// -------------------------------------------------------------------------

	/**
	 * Search repositories by library name using LIKE-based matching.
	 *
	 * Only repositories in state `indexed` are considered. A repository is a
	 * candidate when the name occurs in its title, id or description.
	 *
	 * Applies a composite scoring model (components are summed):
	 * - Exact title match  : 100 pts
	 * - Prefix title match : 50 pts
	 * - Description match  : 20 pts
	 * - Snippet density    : total_snippets / 100
	 * - Trust score        : trust_score * 10
	 *
	 * An exact title match also satisfies the prefix pattern, so it earns both
	 * the exact and the prefix points.
	 *
	 * `libraryName` is matched literally: `%`, `_` and `\` in it are escaped so
	 * they are not interpreted as LIKE wildcards.
	 *
	 * @param options - Library name to look for and an optional `limit`
	 *   (default 10). `options.query` is currently not used.
	 * @returns Matching repositories, highest score first, each with its versions.
	 */
	searchRepositories(options: LibrarySearchOptions): LibrarySearchResult[] {
		const { libraryName, limit = 10 } = options;

		const escapedName = SearchService.escapeLikePattern(libraryName);
		const prefixPattern = `${escapedName}%`;
		const containsPattern = `%${escapedName}%`;

		// `'\\'` in this template literal reaches SQLite as the one-character
		// string '\', which is the escape character used by escapeLikePattern.
		const rows = this.db
			.prepare(
				`
				SELECT r.*,
					CASE WHEN LOWER(r.title) = LOWER(?) THEN 100 ELSE 0 END AS exact_match,
					CASE WHEN LOWER(r.title) LIKE LOWER(?) ESCAPE '\\' THEN 50 ELSE 0 END AS prefix_match,
					CASE WHEN LOWER(r.description) LIKE LOWER(?) ESCAPE '\\' THEN 20 ELSE 0 END AS desc_match,
					(COALESCE(r.total_snippets, 0) / 100.0) AS snippet_score,
					COALESCE(r.trust_score, 0) * 10 AS trust_component
				FROM repositories r
				WHERE r.state = 'indexed'
					AND (
						LOWER(r.title) LIKE LOWER(?) ESCAPE '\\'
						OR LOWER(r.id) LIKE LOWER(?) ESCAPE '\\'
						OR LOWER(r.description) LIKE LOWER(?) ESCAPE '\\'
					)
				ORDER BY (exact_match + prefix_match + desc_match + snippet_score + trust_component) DESC
				LIMIT ?
				`
			)
			.all(
				libraryName, // exact_match (plain equality, so no escaping)
				prefixPattern, // prefix_match
				containsPattern, // desc_match
				containsPattern, // WHERE title LIKE
				containsPattern, // WHERE id LIKE
				containsPattern, // WHERE description LIKE
				limit
			) as RawRepoRow[];

		return rows.map((row) => {
			const repository = mapRepository(row);
			// Same sum the query sorts by, recomputed from the returned components.
			const compositeScore =
				row.exact_match +
				row.prefix_match +
				row.desc_match +
				row.snippet_score +
				row.trust_component;
			return {
				repository,
				// One extra query per matched repository.
				versions: this.getVersions(row.id),
				score: compositeScore
			};
		});
	}

	// -------------------------------------------------------------------------
	// Private helpers
	// -------------------------------------------------------------------------

	/**
	 * Escapes LIKE metacharacters so that `value` is matched literally when
	 * the pattern is used with `ESCAPE '\'`.
	 *
	 * @param value - Untrusted text to embed in a LIKE pattern.
	 * @returns `value` with every `\`, `%` and `_` prefixed by a backslash.
	 */
	private static escapeLikePattern(value: string): string {
		return value.replace(/[\\%_]/g, '\\$&');
	}

	/**
	 * Loads every version row of a repository, newest `created_at` first.
	 *
	 * @param repositoryId - Id of the repository whose versions are read.
	 * @returns The versions mapped to domain objects.
	 */
	private getVersions(repositoryId: string): RepositoryVersion[] {
		const rows = this.db
			.prepare(
				`SELECT * FROM repository_versions WHERE repository_id = ? ORDER BY created_at DESC`
			)
			.all(repositoryId) as RawVersionRow[];
		return rows.map(mapVersion);
	}
}
|