chore: initial project scaffold

This commit is contained in:
Giancarmine Salucci
2026-03-22 17:08:15 +01:00
commit 18437dfa7c
53 changed files with 12002 additions and 0 deletions

View File

@@ -0,0 +1,269 @@
# TRUEREF-0006 — SQLite FTS5 Full-Text Search
**Priority:** P0
**Status:** Pending
**Depends On:** TRUEREF-0001, TRUEREF-0005
**Blocks:** TRUEREF-0010
---
## Overview
Implement the full-text search engine using SQLite's built-in FTS5 extension. This provides keyword-based BM25 search over all indexed snippets without requiring any external search service. It serves as both the primary search backend (when embeddings are not configured) and the keyword component of the hybrid search engine (TRUEREF-0008).
---
## Acceptance Criteria
- [ ] FTS5 virtual table created and kept in sync via triggers (defined in TRUEREF-0001)
- [ ] `SearchService.searchSnippets(query, options)` method implemented (`options.repositoryId` is required)
- [ ] `SearchService.searchRepositories(options)` method implemented (`options.libraryName` required, `options.query` optional)
- [ ] Results ranked by BM25 relevance score
- [ ] Filter by `repositoryId` (required), `type` (optional), `versionId` (optional)
- [ ] Limit and offset support for pagination
- [ ] Query preprocessing: tokenization, stop-word handling, wildcard expansion
- [ ] Library search matches on title, description, and snippet content
- [ ] Unit tests with seeded test data
---
## Search Service Interface
```typescript
// src/lib/server/search/search.service.ts
/** Filters and pagination settings accepted by `SearchService.searchSnippets`. */
export interface SnippetSearchOptions {
/** Repository whose snippets are searched (always applied as a filter). */
repositoryId: string;
/** When set, restricts results to snippets of this repository version. */
versionId?: string;
/** When set, restricts results to snippets of this type. */
type?: 'code' | 'info';
/** Maximum number of results to return. */
limit?: number; // default: 20
/** Number of leading results to skip, for pagination. */
offset?: number; // default: 0
}
/** One hit returned by `SearchService.searchSnippets`. */
export interface SnippetSearchResult {
/** The matched snippet record. */
snippet: Snippet;
/** Relevance score; results are ordered ascending, so the most relevant hit comes first. */
score: number; // BM25 rank (negative, lower = better)
/** Identifier and title of the repository that owns the snippet. */
repository: Pick<Repository, 'id' | 'title'>;
}
/** Input accepted by `SearchService.searchRepositories`. */
export interface LibrarySearchOptions {
/** Library name to look up. */
libraryName: string;
/** Optional free-text hint describing what the caller is looking for. */
query?: string; // semantic relevance hint
/** Maximum number of libraries to return. */
limit?: number; // default: 10
}
/** One library returned by `SearchService.searchRepositories`. */
export interface LibrarySearchResult {
/** The matched repository record. */
repository: Repository;
/** Versions associated with the repository. */
versions: RepositoryVersion[];
/** Ranking score for this library; higher means more relevant. */
score: number; // composite relevance score
}
/**
 * Search entry point for snippets and libraries, backed by a SQLite database handle.
 *
 * This is an interface sketch: the method signatures below are shown without bodies.
 */
export class SearchService {
/** @param db - Database handle used for all search queries. */
constructor(private db: BetterSQLite3.Database) {}
/**
 * Searches snippets for `query`, filtered and paginated according to `options`.
 *
 * @param query - Search text entered by the user.
 * @param options - Repository filter (required), optional version/type filters, limit and offset.
 * @returns Matching snippets with their scores and owning repository.
 */
searchSnippets(
query: string,
options: SnippetSearchOptions
): SnippetSearchResult[]
/**
 * Searches repositories by library name.
 *
 * @param options - Library name, optional relevance hint, and result limit.
 * @returns Matching repositories with their versions and scores.
 */
searchRepositories(
options: LibrarySearchOptions
): LibrarySearchResult[]
}
```
---
## FTS5 Snippet Search Query
```sql
-- Keyword search over snippets through the snippets_fts index, ranked by BM25.
-- Clauses in [brackets] are optional filters: include each one (and bind its
-- parameter) only when the corresponding option is supplied.
-- Bind order: MATCH expression, repository id, [version id], [type], limit, offset.
SELECT
s.*,
r.id AS repo_id,
r.title AS repo_title,
bm25(snippets_fts) AS score
FROM snippets_fts
-- Map each FTS hit back to its snippet row, then to the owning repository.
JOIN snippets s ON s.rowid = snippets_fts.rowid
JOIN repositories r ON r.id = s.repository_id
WHERE snippets_fts MATCH ?
AND s.repository_id = ?
[AND s.version_id = ?]
[AND s.type = ?]
ORDER BY score ASC -- bm25() returns negative values; lower = more relevant
LIMIT ? OFFSET ?;
```
The FTS5 MATCH query uses the porter stemmer and unicode61 tokenizer (configured in the virtual table definition).
---
## Query Preprocessing
```typescript
/**
 * Normalizes a raw user query into an FTS5 MATCH expression.
 *
 * Steps:
 * 1. Replace parentheses with spaces (grouping is not passed through).
 * 2. Trim and collapse whitespace runs into single spaces.
 * 3. Append `*` to the final token (when it has at least 3 characters and does
 *    not already end in `*`) for a "typing as you go" prefix match.
 *
 * `*`, double-quoted phrases, and the AND / OR / NOT keywords are left untouched.
 *
 * @param raw - Query text as typed by the user.
 * @returns The cleaned query, or an empty string when `raw` contains no tokens.
 */
function preprocessQuery(raw: string): string {
  // Parentheses must be removed BEFORE whitespace is normalized. Doing it the
  // other way round leaves double/trailing spaces behind ("foo (bar)" ->
  // "foo  bar "), which makes the last token empty and skips the wildcard.
  const q = raw.replace(/[()]/g, ' ').trim().replace(/\s+/g, ' ');
  if (q === '') {
    return '';
  }

  const tokens = q.split(' ');
  const lastIndex = tokens.length - 1;
  const lastToken = tokens[lastIndex] ?? '';

  // Very short tokens are not expanded: a 1-2 character prefix matches too much.
  if (lastToken.length >= 3 && !lastToken.endsWith('*')) {
    tokens[lastIndex] = `${lastToken}*`;
  }
  return tokens.join(' ');
}
```
---
## Library Search
Library search operates on the `repositories` table (not FTS5) because it matches against library titles, IDs, and descriptions rather than snippet content:
```typescript
/**
 * Finds indexed repositories whose title, id, or description contains
 * `options.libraryName` (case-insensitive), best match first.
 *
 * The score is the sum of:
 * - 100 when the title equals the name exactly,
 * - 50 when the title starts with the name,
 * - 20 when the description contains the name,
 * - `total_snippets / 100`,
 * - `trust_score * 10` (a missing trust score counts as 0).
 *
 * @param options - Library name and optional result limit (default 10).
 *   `options.query` is accepted but does not influence ranking here.
 * @returns Matching repositories with their versions and composite score.
 */
searchRepositories(options: LibrarySearchOptions): LibrarySearchResult[] {
  const { libraryName, limit = 10 } = options;

  // Shape of each result row: the repository columns plus the score components
  // computed by the SELECT below.
  type ScoredRepositoryRow = Repository & {
    id: string;
    exact_match: number;
    prefix_match: number;
    desc_match: number;
    snippet_score: number;
    trust_component: number;
  };

  // Escape LIKE metacharacters so a name such as "my_lib" or "100%" is matched
  // literally instead of `_` / `%` acting as wildcards. The backslash is the
  // escape character declared by the ESCAPE clauses in the SQL.
  const escapedName = libraryName.replace(/[\\%_]/g, '\\$&');
  const prefixPattern = `${escapedName}%`;
  const containsPattern = `%${escapedName}%`;

  const rows = this.db.prepare(`
    SELECT r.*,
      -- Score components
      CASE WHEN LOWER(r.title) = LOWER(?) THEN 100 ELSE 0 END AS exact_match,
      CASE WHEN LOWER(r.title) LIKE LOWER(?) ESCAPE '\\' THEN 50 ELSE 0 END AS prefix_match,
      CASE WHEN LOWER(r.description) LIKE LOWER(?) ESCAPE '\\' THEN 20 ELSE 0 END AS desc_match,
      (r.total_snippets / 100.0) AS snippet_score,
      COALESCE(r.trust_score, 0) * 10 AS trust_component
    FROM repositories r
    WHERE r.state = 'indexed'
      AND (
        LOWER(r.title) LIKE LOWER(?) ESCAPE '\\'
        OR LOWER(r.id) LIKE LOWER(?) ESCAPE '\\'
        OR LOWER(r.description) LIKE LOWER(?) ESCAPE '\\'
      )
    ORDER BY (exact_match + prefix_match + desc_match + snippet_score + trust_component) DESC
    LIMIT ?
  `).all(
    libraryName, // exact_match compares with '=', so the raw name is used
    prefixPattern,
    containsPattern,
    containsPattern,
    containsPattern,
    containsPattern,
    limit
  ) as ScoredRepositoryRow[];

  return rows.map(row => ({
    repository: row as Repository,
    versions: this.getVersions(row.id),
    // Same expression as the ORDER BY clause, recomputed so callers can see it.
    score: row.exact_match + row.prefix_match + row.desc_match +
      row.snippet_score + row.trust_component,
  }));
}
```
---
## Response Formatting
The search results must be formatted for the REST API and MCP tool responses:
### Library search response (for `resolve-library-id`):
```typescript
/**
 * Renders library search results as a numbered plain-text list for the
 * `resolve-library-id` response.
 *
 * Each entry shows title, library ID, description, snippet count, trust score
 * and available version tags. Entries are separated by a blank line.
 *
 * @param results - Libraries to render, already in display order.
 * @returns The formatted list, or a fixed "no results" message when `results` is empty.
 */
function formatLibraryResults(results: LibrarySearchResult[]): string {
  if (results.length === 0) {
    return 'No libraries found matching your search.';
  }
  return results.map((r, i) => {
    const repo = r.repository;
    const versions = r.versions.map(v => v.tag).join(', ') || 'default branch';
    // A repository may not have a trust score or snippet count yet; fall back to
    // 0 instead of throwing on `.toFixed` or printing "undefined".
    const trustScore = repo.trustScore ?? 0;
    const totalSnippets = repo.totalSnippets ?? 0;
    return [
      `${i + 1}. ${repo.title}`,
      ` Library ID: ${repo.id}`,
      ` Description: ${repo.description ?? 'No description'}`,
      ` Snippets: ${totalSnippets} | Trust Score: ${trustScore.toFixed(1)}/10`,
      ` Available Versions: ${versions}`,
    ].join('\n');
  }).join('\n\n');
}
```
### Snippet search response (for `query-docs`):
```typescript
/**
 * Renders snippet search results as Markdown for the `query-docs` response.
 *
 * Output sections are separated by a horizontal rule (`---`). When `rules` is
 * non-empty, a "Library Rules" bullet list comes first. Each snippet then
 * contributes an optional `###` title, an optional italic breadcrumb, and its
 * content: fenced (with the snippet language) for code, verbatim for
 * everything else.
 *
 * @param results - Search hits to render, already in display order.
 * @param rules - Optional repository rules to prepend.
 * @returns The Markdown document; an empty string when there is nothing to render.
 */
function formatSnippetResults(
  results: SnippetSearchResult[],
  rules?: string[]
): string {
  const parts: string[] = [];

  // Prepend repository rules if present
  if (rules?.length) {
    parts.push('## Library Rules\n' + rules.map(r => `- ${r}`).join('\n'));
  }

  for (const { snippet } of results) {
    let body: string = snippet.content;
    if (snippet.type === 'code') {
      // The fence must be longer than any backtick run inside the content,
      // otherwise a snippet that itself contains ``` closes the block early.
      const backtickRuns: string[] = snippet.content.match(/`+/g) ?? [];
      const longestRun = backtickRuns.reduce((max, run) => Math.max(max, run.length), 0);
      const fence = '`'.repeat(Math.max(3, longestRun + 1));
      body = `${fence}${snippet.language ?? ''}\n${snippet.content}\n${fence}`;
    }
    parts.push([
      snippet.title ? `### ${snippet.title}` : '',
      snippet.breadcrumb ? `*${snippet.breadcrumb}*` : '',
      body,
    ].filter(Boolean).join('\n'));
  }

  return parts.join('\n\n---\n\n');
}
```
---
## Trust Score Computation
Compute `trustScore` (0–10) when a repository is first indexed:
```typescript
/**
 * Derives a repository's trust score from its popularity, documentation
 * coverage and metadata.
 *
 * Components (maximum 10 in total):
 * - stars: `log10(stars + 1)`, capped at 4 (reached at roughly 10k stars);
 * - documentation coverage: `totalSnippets / 500`, capped at 3;
 * - 1 point each for a GitHub source, an `indexed` state, and a description.
 *
 * @param repo - Repository to score.
 * @returns The score rounded to one decimal place and capped at 10.
 */
function computeTrustScore(repo: Repository): number {
  const starPoints = repo.stars ? Math.min(4, Math.log10(repo.stars + 1)) : 0;
  const coveragePoints = Math.min(3, repo.totalSnippets / 500);
  const sourcePoints = repo.source === 'github' ? 1 : 0;
  const indexedPoints = repo.state === 'indexed' ? 1 : 0;
  const descriptionPoints = repo.description ? 1 : 0;

  // Summed left to right, starting from 0, in the order listed above.
  const total = [
    starPoints,
    coveragePoints,
    sourcePoints,
    indexedPoints,
    descriptionPoints,
  ].reduce((sum, points) => sum + points, 0);

  const rounded = parseFloat(total.toFixed(1));
  return Math.min(10, rounded);
}
```
---
## Files to Create
- `src/lib/server/search/search.service.ts`
- `src/lib/server/search/query-preprocessor.ts`
- `src/lib/server/search/formatters.ts`
- `src/lib/server/search/trust-score.ts`
- `src/lib/server/search/search.service.test.ts`