chore: initial project scaffold

2026-03-22 17:08:15 +01:00
commit 18437dfa7c
53 changed files with 12002 additions and 0 deletions
--- a/docs/features/TRUEREF-0006.md
+++ b/docs/features/TRUEREF-0006.md
@@ -0,0 +1,269 @@
+# TRUEREF-0006 — SQLite FTS5 Full-Text Search
+
+**Priority:** P0
+**Status:** Pending
+**Depends On:** TRUEREF-0001, TRUEREF-0005
+**Blocks:** TRUEREF-0010
+
+---
+
+## Overview
+
+Implement the full-text search engine using SQLite's built-in FTS5 extension. This provides keyword-based BM25 search over all indexed snippets without requiring any external search service. It serves as both the primary search backend (when embeddings are not configured) and the keyword component of the hybrid search engine (TRUEREF-0008).
+
+---
+
+## Acceptance Criteria
+
+- [ ] FTS5 virtual table created and kept in sync via triggers (defined in TRUEREF-0001)
+- [ ] `SearchService.searchSnippets(query, options)` method implemented (`options.repositoryId` is required)
+- [ ] `SearchService.searchRepositories(options)` method implemented (`options.libraryName` is required, `options.query` is optional)
+- [ ] Results ranked by BM25 relevance score
+- [ ] Filter by `repositoryId` (required), `type` (optional), `versionId` (optional)
+- [ ] Limit and offset support for pagination
+- [ ] Query preprocessing: tokenization, stop-word handling, wildcard expansion
+- [ ] Library search matches on title, description, and snippet content
+- [ ] Unit tests with seeded test data
+
+---
+
+## Search Service Interface
+
+```typescript
+// src/lib/server/search/search.service.ts
+
+/** Filters and paging controls accepted by `SearchService.searchSnippets`. */
+export interface SnippetSearchOptions {
+  /** Repository whose snippets are searched. */
+  repositoryId: string;
+  /** Optional filter on the snippet's version. */
+  versionId?: string;
+  /** Optional filter on the snippet type. */
+  type?: 'code' | 'info';
+  limit?: number;          // default: 20
+  offset?: number;         // default: 0
+}
+
+/** One snippet hit together with its relevance score and owning repository. */
+export interface SnippetSearchResult {
+  /** The matched snippet. */
+  snippet: Snippet;
+  score: number;           // BM25 rank (negative, lower = better)
+  /** Only the `id` and `title` of the repository the snippet belongs to. */
+  repository: Pick<Repository, 'id' | 'title'>;
+}
+
+/** Input accepted by `SearchService.searchRepositories`. */
+export interface LibrarySearchOptions {
+  /** Library name to look for. */
+  libraryName: string;
+  query?: string;          // semantic relevance hint
+  limit?: number;          // default: 10
+}
+
+/** One repository hit together with its versions and relevance score. */
+export interface LibrarySearchResult {
+  /** The matched repository. */
+  repository: Repository;
+  /** Versions associated with the matched repository. */
+  versions: RepositoryVersion[];
+  score: number;           // composite relevance score
+}
+
+/**
+ * Search entry points over a `BetterSQLite3.Database` handle.
+ *
+ * Only the method signatures are listed in this block; the method bodies are
+ * not shown here.
+ */
+export class SearchService {
+  constructor(private db: BetterSQLite3.Database) {}
+
+  /**
+   * Search snippets for `query`, constrained by `options`.
+   *
+   * @param query - Search text.
+   * @param options - Repository, optional version/type filters and paging.
+   * @returns Matching snippets with their scores.
+   */
+  searchSnippets(
+    query: string,
+    options: SnippetSearchOptions
+  ): SnippetSearchResult[]
+
+  /**
+   * Search repositories by library name.
+   *
+   * @param options - Library name, optional query hint and limit.
+   * @returns Matching repositories with their versions and scores.
+   */
+  searchRepositories(
+    options: LibrarySearchOptions
+  ): LibrarySearchResult[]
+}
+```
+
+---
+
+## FTS5 Snippet Search Query
+
+```sql
+-- Snippet search: FTS5 match joined back to the snippet row and its repository.
+SELECT
+  s.*,
+  r.id AS repo_id,
+  r.title AS repo_title,
+  bm25(snippets_fts) AS score
+FROM snippets_fts
+JOIN snippets s ON s.rowid = snippets_fts.rowid
+JOIN repositories r ON r.id = s.repository_id
+WHERE snippets_fts MATCH ?
+  AND s.repository_id = ?
+  -- The bracketed clauses below are template placeholders, not SQL syntax;
+  -- presumably each is emitted only when the matching option is supplied.
+  [AND s.version_id = ?]
+  [AND s.type = ?]
+ORDER BY score ASC          -- bm25() returns negative values; lower = more relevant
+LIMIT ? OFFSET ?;
+```
+
+MATCH queries are tokenized with the `porter` stemmer layered on top of the `unicode61` tokenizer (both configured in the virtual table definition), so query terms are stemmed the same way as the indexed content.
+
+---
+
+## Query Preprocessing
+
+```typescript
+/**
+ * Turn raw user input into an FTS5 MATCH expression.
+ *
+ * Parentheses are removed, whitespace is collapsed, and the final token gets
+ * a trailing `*` (prefix match) when it is at least three characters long.
+ * `*`, double-quoted phrases and the AND / OR / NOT keywords pass through
+ * unchanged; other punctuation is not escaped here.
+ *
+ * @param raw - Query text exactly as the user typed it.
+ * @returns The preprocessed expression, or `''` when no token is left after
+ *   cleaning. Callers should skip the MATCH for an empty result.
+ */
+function preprocessQuery(raw: string): string {
+  // Strip parentheses BEFORE collapsing whitespace. Doing it the other way
+  // round leaves empty tokens behind ("foo (bar)" -> "foo  bar "), and the
+  // prefix wildcard then targets an empty last token, i.e. nothing.
+  const tokens = raw
+    .replace(/[()]/g, ' ')
+    .split(/\s+/)
+    .filter((token) => token.length > 0);
+
+  if (tokens.length === 0) {
+    return '';
+  }
+
+  // Add a prefix wildcard to the last token for a "typing as you go" feel.
+  const lastIndex = tokens.length - 1;
+  const lastToken = tokens[lastIndex] ?? '';
+  // AND / OR / NOT are operators, not search terms; never wildcard them.
+  const isOperator = lastToken === 'AND' || lastToken === 'OR' || lastToken === 'NOT';
+  if (lastToken.length >= 3 && !isOperator && !lastToken.endsWith('*')) {
+    tokens[lastIndex] = `${lastToken}*`;
+  }
+
+  return tokens.join(' ');
+}
+```
+
+---
+
+## Library Search
+
+Library search operates on the `repositories` table (not FTS5) since it's matching library names and descriptions:
+
+```typescript
+/**
+ * Find indexed repositories whose title, id or description contains
+ * `libraryName`, best match first.
+ *
+ * Score = 100 (title equals the name) + 50 (title starts with the name)
+ *       + 20 (description contains the name) + total_snippets / 100
+ *       + trust_score * 10.
+ *
+ * @param options - `libraryName` to look for and an optional `limit`
+ *   (default 10). `options.query` is accepted but not used by this
+ *   LIKE-based search.
+ * @returns Matching repositories with their versions and composite score.
+ */
+searchRepositories(options: LibrarySearchOptions): LibrarySearchResult[] {
+  const { libraryName, limit = 10 } = options;
+
+  type ScoredRow = Repository & {
+    exact_match: number;
+    prefix_match: number;
+    desc_match: number;
+    snippet_score: number;
+    trust_component: number;
+  };
+
+  // Escape LIKE metacharacters so "%" / "_" typed by the user match literally
+  // instead of acting as wildcards (paired with ESCAPE '\' in the SQL below).
+  const literal = libraryName.replace(/[\\%_]/g, '\\$&');
+
+  // Named parameters: each value is bound once instead of repeating the same
+  // pattern positionally, which was easy to get out of order.
+  const rows = this.db.prepare(`
+    SELECT r.*,
+      -- Score components
+      CASE WHEN LOWER(r.title) = LOWER(@exact) THEN 100 ELSE 0 END AS exact_match,
+      CASE WHEN LOWER(r.title) LIKE LOWER(@prefix) ESCAPE '\\' THEN 50 ELSE 0 END AS prefix_match,
+      CASE WHEN LOWER(r.description) LIKE LOWER(@contains) ESCAPE '\\' THEN 20 ELSE 0 END AS desc_match,
+      (COALESCE(r.total_snippets, 0) / 100.0) AS snippet_score,
+      COALESCE(r.trust_score, 0) * 10 AS trust_component
+    FROM repositories r
+    WHERE r.state = 'indexed'
+      AND (
+        LOWER(r.title) LIKE LOWER(@contains) ESCAPE '\\'
+        OR LOWER(r.id) LIKE LOWER(@contains) ESCAPE '\\'
+        OR LOWER(r.description) LIKE LOWER(@contains) ESCAPE '\\'
+      )
+    ORDER BY (exact_match + prefix_match + desc_match + snippet_score + trust_component) DESC
+    LIMIT @limit
+  `).all({
+    exact: libraryName,
+    prefix: `${literal}%`,
+    contains: `%${literal}%`,
+    limit,
+  }) as ScoredRow[];
+
+  // NOTE(review): the query selects snake_case columns (total_snippets,
+  // trust_score) while formatLibraryResults reads totalSnippets / trustScore.
+  // Confirm Repository's field names and map the row here if they differ.
+  return rows.map((row) => ({
+    repository: row as Repository,
+    versions: this.getVersions(row.id),
+    score:
+      row.exact_match +
+      row.prefix_match +
+      row.desc_match +
+      row.snippet_score +
+      row.trust_component,
+  }));
+}
+```
+
+---
+
+## Response Formatting
+
+The search results must be formatted for the REST API and MCP tool responses:
+
+### Library search response (for `resolve-library-id`):
+```typescript
+/**
+ * Render library search results as a numbered plain-text list.
+ *
+ * @param results - Library hits in the order they should be listed.
+ * @returns One five-line entry per result, entries separated by a blank
+ *   line, or a fixed "no libraries found" sentence when `results` is empty.
+ */
+function formatLibraryResults(results: LibrarySearchResult[]): string {
+  if (results.length === 0) {
+    return 'No libraries found matching your search.';
+  }
+
+  return results
+    .map(({ repository: repo, versions }, index) => {
+      const versionList = versions.map((v) => v.tag).join(', ') || 'default branch';
+      // A repository may not have a trust score or snippet count yet; fall
+      // back to 0 instead of throwing on toFixed() or printing "undefined".
+      const trustScore = (repo.trustScore ?? 0).toFixed(1);
+      const snippetCount = repo.totalSnippets ?? 0;
+      return [
+        `${index + 1}. ${repo.title}`,
+        `   Library ID: ${repo.id}`,
+        `   Description: ${repo.description ?? 'No description'}`,
+        `   Snippets: ${snippetCount} | Trust Score: ${trustScore}/10`,
+        `   Available Versions: ${versionList}`,
+      ].join('\n');
+    })
+    .join('\n\n');
+}
+```
+
+### Snippet search response (for `query-docs`):
+```typescript
+/**
+ * Render snippet search results as Markdown sections.
+ *
+ * Each snippet becomes an optional `###` title, an optional italic
+ * breadcrumb and its content; `code` snippets are wrapped in a fenced block
+ * tagged with the snippet language. Sections are separated by a horizontal
+ * rule.
+ *
+ * @param results - Snippet hits in the order they should appear.
+ * @param rules - Optional repository rules, rendered first as a bullet list.
+ * @returns The combined Markdown document (empty string when there is
+ *   nothing to render).
+ */
+function formatSnippetResults(
+  results: SnippetSearchResult[],
+  rules?: string[]
+): string {
+  const parts: string[] = [];
+
+  // Prepend repository rules if present
+  if (rules?.length) {
+    parts.push('## Library Rules\n' + rules.map((rule) => `- ${rule}`).join('\n'));
+  }
+
+  for (const { snippet } of results) {
+    let body: string = snippet.content;
+    if (snippet.type === 'code') {
+      const code: string = snippet.content ?? '';
+      // The fence must be longer than any backtick run inside the code,
+      // otherwise a ``` in the content closes the block early.
+      const runs: string[] = code.match(/`+/g) ?? [];
+      const longestRun = runs.reduce((max, run) => Math.max(max, run.length), 0);
+      const fence = '`'.repeat(Math.max(3, longestRun + 1));
+      body = `${fence}${snippet.language ?? ''}\n${code}\n${fence}`;
+    }
+
+    parts.push(
+      [
+        snippet.title ? `### ${snippet.title}` : '',
+        snippet.breadcrumb ? `*${snippet.breadcrumb}*` : '',
+        body,
+      ]
+        .filter(Boolean)
+        .join('\n')
+    );
+  }
+
+  return parts.join('\n\n---\n\n');
+}
+```
+
+---
+
+## Trust Score Computation
+
+Compute `trustScore` (0–10) when a repository is first indexed:
+
+```typescript
+/**
+ * Derive a repository's trust score from its metadata.
+ *
+ * Components: stars (log10 scale, at most 4), snippet count (one point per
+ * 500 snippets, at most 3), GitHub source (1), indexed state (1) and a
+ * non-empty description (1).
+ *
+ * @param repo - Repository whose stars, totalSnippets, source, state and
+ *   description are read.
+ * @returns The sum of the components rounded to one decimal, capped at 10.
+ */
+function computeTrustScore(repo: Repository): number {
+  // Popularity: log scale, roughly 10k stars for the full 4 points.
+  const starPoints = repo.stars ? Math.min(4, Math.log10(repo.stars + 1)) : 0;
+  // Documentation coverage: 1500 snippets or more earn the full 3 points.
+  const coveragePoints = Math.min(3, repo.totalSnippets / 500);
+  // GitHub sources earn a point; anything else earns none.
+  const sourcePoints = repo.source === 'github' ? 1 : 0;
+  const indexedPoints = repo.state === 'indexed' ? 1 : 0;
+  const descriptionPoints = repo.description ? 1 : 0;
+
+  const total =
+    starPoints + coveragePoints + sourcePoints + indexedPoints + descriptionPoints;
+  const rounded = Number.parseFloat(total.toFixed(1));
+  return rounded > 10 ? 10 : rounded;
+}
+```
+
+---
+
+## Files to Create
+
+- `src/lib/server/search/search.service.ts`
+- `src/lib/server/search/query-preprocessor.ts`
+- `src/lib/server/search/formatters.ts`
+- `src/lib/server/search/trust-score.ts`
+- `src/lib/server/search/search.service.test.ts`