# TRUEREF-0005 — Document Parser & Chunker

**Priority:** P0
**Status:** Pending
**Depends On:** TRUEREF-0001
**Blocks:** TRUEREF-0006, TRUEREF-0007, TRUEREF-0009

---

## Overview

Implement the document parsing and chunking pipeline that transforms raw file contents (from the crawlers) into structured, searchable `Snippet` records. This is the core intellectual layer of TrueRef — the quality of the chunks directly determines the quality of documentation retrieval.

---

## Acceptance Criteria

- [ ] Parse Markdown files into heading-based sections (info snippets)
- [ ] Extract fenced code blocks from Markdown as separate code snippets
- [ ] Parse standalone code files into function/class-level chunks
- [ ] Respect token limits per chunk (max 512 tokens, with 50-token overlap)
- [ ] Assign breadcrumb paths based on heading hierarchy (Markdown) or file path (code)
- [ ] Detect programming language from file extension
- [ ] Produce both `code` and `info` type snippets
- [ ] Calculate approximate token counts using character-based estimation
- [ ] Skip empty or trivially short content (< 20 chars)
- [ ] Unit tests with representative samples of each file type

---

## Supported File Types

| Extension | Parser Strategy |
|-----------|----------------|
| `.md`, `.mdx` | Heading-based section splitting + code block extraction |
| `.txt`, `.rst` | Paragraph-based splitting |
| `.ts`, `.tsx`, `.js`, `.jsx` | AST-free: function/class boundary detection via regex |
| `.py` | `def`/`class` boundary detection |
| `.go` | `func`/`type` boundary detection |
| `.rs` | `fn`/`impl`/`struct` boundary detection |
| `.java`, `.cs`, `.kt`, `.swift` | Class/method boundary detection |
| `.rb` | `def`/`class` boundary detection |
| `.json`, `.yaml`, `.yml`, `.toml` | Structural chunking (top-level keys) |
| `.html`, `.svelte`, `.vue` | Text content extraction + script block splitting |
| Other code | Line-count-based sliding window (200 lines per chunk) |

---

## Token Counting

Use a simple character-based approximation (no tokenizer library needed for v1):

```typescript
function estimateTokens(text: string): number {
|
||||
// Empirically: ~4 chars per token for English prose
|
||||
// ~3 chars per token for code (more symbols)
|
||||
return Math.ceil(text.length / 3.5);
|
||||
}
```

---

## Markdown Parser

The Markdown parser is the most important one, since most documentation is written in Markdown.

### Algorithm

1. Split the file into lines.
2. Track the current heading stack (H1 > H2 > H3 > H4).
3. When a new heading is encountered, emit the accumulated content as an info snippet.
4. Fenced code blocks (` ``` `) within sections are extracted as separate code snippets.
5. The breadcrumb is built from the heading stack: `"Getting Started > Installation"`.

```typescript
interface MarkdownSection {
|
||||
headings: string[]; // heading stack at this point
|
||||
content: string; // text content (sans code blocks)
|
||||
codeBlocks: { language: string; code: string }[];
|
||||
}
|
||||
|
||||
function parseMarkdown(content: string, filePath: string): Snippet[] {
|
||||
const sections = splitIntoSections(content);
|
||||
const snippets: Snippet[] = [];
|
||||
|
||||
for (const section of sections) {
|
||||
const breadcrumb = section.headings.join(' > ');
|
||||
const title = section.headings.at(-1) ?? path.basename(filePath);
|
||||
|
||||
// Emit info snippet for text content
|
||||
if (section.content.trim().length >= 20) {
|
||||
const chunks = chunkText(section.content, MAX_TOKENS, OVERLAP_TOKENS);
|
||||
for (const chunk of chunks) {
|
||||
snippets.push({
|
||||
type: 'info',
|
||||
title,
|
||||
content: chunk,
|
||||
breadcrumb,
|
||||
tokenCount: estimateTokens(chunk),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Emit code snippets for each code block
|
||||
for (const block of section.codeBlocks) {
|
||||
if (block.code.trim().length >= 20) {
|
||||
snippets.push({
|
||||
type: 'code',
|
||||
title,
|
||||
content: block.code,
|
||||
language: block.language || detectLanguage('.' + block.language),
|
||||
breadcrumb,
|
||||
tokenCount: estimateTokens(block.code),
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return snippets;
|
||||
}
```

---

## Code File Parser

For non-Markdown code files, use regex-based function/class boundary detection.

### Algorithm

1. Detect language-specific top-level declaration patterns.
2. Split the file at those boundaries.
3. Each chunk consists of the declaration line(s) plus the body up to the next declaration.
4. If a chunk exceeds `MAX_TOKENS`, apply sliding-window splitting with overlap.

```typescript
const BOUNDARY_PATTERNS: Record<string, RegExp> = {
|
||||
typescript: /^(export\s+)?(async\s+)?(function|class|interface|type|const|let|var)\s+\w+/m,
|
||||
python: /^(async\s+)?(def|class)\s+\w+/m,
|
||||
go: /^(func|type|var|const)\s+\w+/m,
|
||||
rust: /^(pub\s+)?(fn|impl|struct|enum|trait)\s+\w+/m,
|
||||
java: /^(public|private|protected|static).*?(class|interface|enum|void|\w+)\s+\w+\s*[({]/m,
|
||||
};
|
||||
|
||||
function parseCodeFile(
|
||||
content: string,
|
||||
filePath: string,
|
||||
language: string
|
||||
): Snippet[] {
|
||||
const pattern = BOUNDARY_PATTERNS[language];
|
||||
const breadcrumb = filePath;
|
||||
const title = path.basename(filePath);
|
||||
|
||||
if (!pattern) {
|
||||
// Fallback: sliding window
|
||||
return slidingWindowChunks(content, filePath, language);
|
||||
}
|
||||
|
||||
const chunks = splitAtBoundaries(content, pattern);
|
||||
return chunks
|
||||
.filter(chunk => chunk.trim().length >= 20)
|
||||
.flatMap(chunk => {
|
||||
if (estimateTokens(chunk) <= MAX_TOKENS) {
|
||||
return [{
|
||||
type: 'code' as const,
|
||||
title,
|
||||
content: chunk,
|
||||
language,
|
||||
breadcrumb,
|
||||
tokenCount: estimateTokens(chunk),
|
||||
}];
|
||||
}
|
||||
return slidingWindowChunks(chunk, filePath, language);
|
||||
});
|
||||
}
```

---

## Chunking Constants

```typescript
const MAX_TOKENS = 512;
|
||||
const OVERLAP_TOKENS = 50;
|
||||
const MIN_CONTENT_LENGTH = 20; // characters
```

### Sliding Window Chunker

```typescript
function chunkText(
|
||||
text: string,
|
||||
maxTokens: number,
|
||||
overlapTokens: number
|
||||
): string[] {
|
||||
const words = text.split(/\s+/);
|
||||
const wordsPerToken = 0.75; // ~0.75 words per token
|
||||
const maxWords = Math.floor(maxTokens * wordsPerToken);
|
||||
const overlapWords = Math.floor(overlapTokens * wordsPerToken);
|
||||
|
||||
const chunks: string[] = [];
|
||||
let start = 0;
|
||||
|
||||
while (start < words.length) {
|
||||
const end = Math.min(start + maxWords, words.length);
|
||||
chunks.push(words.slice(start, end).join(' '));
|
||||
if (end === words.length) break;
|
||||
start = end - overlapWords;
|
||||
}
|
||||
|
||||
return chunks;
|
||||
}
```

---

## Language Detection

```typescript
const LANGUAGE_MAP: Record<string, string> = {
|
||||
'.ts': 'typescript', '.tsx': 'typescript',
|
||||
'.js': 'javascript', '.jsx': 'javascript',
|
||||
'.py': 'python',
|
||||
'.rb': 'ruby',
|
||||
'.go': 'go',
|
||||
'.rs': 'rust',
|
||||
'.java': 'java',
|
||||
'.cs': 'csharp',
|
||||
'.cpp': 'cpp', '.c': 'c', '.h': 'c',
|
||||
'.swift': 'swift',
|
||||
'.kt': 'kotlin',
|
||||
'.php': 'php',
|
||||
'.scala': 'scala',
|
||||
'.sh': 'bash', '.bash': 'bash', '.zsh': 'bash',
|
||||
'.md': 'markdown', '.mdx': 'markdown',
|
||||
'.json': 'json',
|
||||
'.yaml': 'yaml', '.yml': 'yaml',
|
||||
'.toml': 'toml',
|
||||
'.html': 'html',
|
||||
'.css': 'css',
|
||||
'.svelte': 'svelte',
|
||||
'.vue': 'vue',
|
||||
'.sql': 'sql',
|
||||
};
|
||||
|
||||
function detectLanguage(filePath: string): string {
|
||||
const ext = path.extname(filePath).toLowerCase();
|
||||
return LANGUAGE_MAP[ext] ?? 'text';
|
||||
}
```

---

## Main Entry Point

```typescript
export interface ParseOptions {
|
||||
repositoryId: string;
|
||||
documentId: string;
|
||||
versionId?: string;
|
||||
}
|
||||
|
||||
export function parseFile(
|
||||
file: CrawledFile,
|
||||
options: ParseOptions
|
||||
): NewSnippet[] {
|
||||
const language = detectLanguage(file.path);
|
||||
let rawSnippets: Omit<NewSnippet, 'id' | 'repositoryId' | 'documentId' | 'versionId' | 'createdAt'>[];
|
||||
|
||||
if (language === 'markdown') {
|
||||
rawSnippets = parseMarkdown(file.content, file.path);
|
||||
} else {
|
||||
rawSnippets = parseCodeFile(file.content, file.path, language);
|
||||
}
|
||||
|
||||
return rawSnippets.map(s => ({
|
||||
...s,
|
||||
id: crypto.randomUUID(),
|
||||
repositoryId: options.repositoryId,
|
||||
documentId: options.documentId,
|
||||
versionId: options.versionId ?? null,
|
||||
createdAt: new Date(),
|
||||
}));
|
||||
}
```

---

## Files to Create

- `src/lib/server/parser/markdown.parser.ts`
- `src/lib/server/parser/code.parser.ts`
- `src/lib/server/parser/chunker.ts`
- `src/lib/server/parser/language.ts`
- `src/lib/server/parser/index.ts` — exports `parseFile`
- `src/lib/server/parser/markdown.parser.test.ts`
- `src/lib/server/parser/code.parser.test.ts`