# TRUEREF-0005 — Document Parser & Chunker

**Priority:** P0
**Status:** Pending
**Depends On:** TRUEREF-0001
**Blocks:** TRUEREF-0006, TRUEREF-0007, TRUEREF-0009

---

## Overview

Implement the document parsing and chunking pipeline that transforms raw file contents (from the crawlers) into structured, searchable `Snippet` records. This is the core intellectual layer of TrueRef — the quality of the chunks directly determines the quality of documentation retrieval.

---

## Acceptance Criteria

- [ ] Parse Markdown files into heading-based sections (info snippets)
- [ ] Extract fenced code blocks from Markdown as separate code snippets
- [ ] Parse standalone code files into function/class-level chunks
- [ ] Respect token limits per chunk (max 512 tokens, with 50-token overlap)
- [ ] Assign breadcrumb paths based on heading hierarchy (Markdown) or file path (code)
- [ ] Detect programming language from file extension
- [ ] Produce both `code` and `info` type snippets
- [ ] Calculate approximate token counts using character-based estimation
- [ ] Skip empty or trivially short content (< 20 chars)
- [ ] Unit tests with representative samples of each file type

---

## Supported File Types

| Extension | Parser Strategy |
|-----------|----------------|
| `.md`, `.mdx` | Heading-based section splitting + code block extraction |
| `.txt`, `.rst` | Paragraph-based splitting |
| `.ts`, `.tsx`, `.js`, `.jsx` | AST-free: function/class boundary detection via regex |
| `.py` | `def`/`class` boundary detection |
| `.go` | `func`/`type` boundary detection |
| `.rs` | `fn`/`impl`/`struct` boundary detection |
| `.java`, `.cs`, `.kt`, `.swift` | Class/method boundary detection |
| `.rb` | `def`/`class` boundary detection |
| `.json`, `.yaml`, `.yml`, `.toml` | Structural chunking (top-level keys) |
| `.html`, `.svelte`, `.vue` | Text content extraction + script block splitting |
| Other code | Line-count-based sliding window (200 lines per chunk) |

---

## Token Counting

Use a simple character-based approximation (no tokenizer library needed for v1):

```typescript
function estimateTokens(text: string): number {
|
||||
// Empirically: ~4 chars per token for English prose
|
||||
// ~3 chars per token for code (more symbols)
|
||||
return Math.ceil(text.length / 3.5);
|
||||
}
```

---

## Markdown Parser

The Markdown parser is the most important one, since most documentation is written in Markdown.

### Algorithm

1. Split the file into lines.
2. Track the current heading stack (H1 > H2 > H3 > H4).
3. When a new heading is encountered, emit the accumulated content as an info snippet.
4. Fenced code blocks (` ``` `) within sections are extracted as separate code snippets.
5. The breadcrumb is built from the heading stack: `"Getting Started > Installation"`.

```typescript
interface MarkdownSection {
|
||||
headings: string[]; // heading stack at this point
|
||||
content: string; // text content (sans code blocks)
|
||||
codeBlocks: { language: string; code: string }[];
|
||||
}
|
||||
|
||||
function parseMarkdown(content: string, filePath: string): Snippet[] {
|
||||
const sections = splitIntoSections(content);
|
||||
const snippets: Snippet[] = [];
|
||||
|
||||
for (const section of sections) {
|
||||
const breadcrumb = section.headings.join(' > ');
|
||||
const title = section.headings.at(-1) ?? path.basename(filePath);
|
||||
|
||||
// Emit info snippet for text content
|
||||
if (section.content.trim().length >= 20) {
|
||||
const chunks = chunkText(section.content, MAX_TOKENS, OVERLAP_TOKENS);
|
||||
for (const chunk of chunks) {
|
||||
snippets.push({
|
||||
type: 'info',
|
||||
title,
|
||||
content: chunk,
|
||||
breadcrumb,
|
||||
tokenCount: estimateTokens(chunk),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Emit code snippets for each code block
|
||||
for (const block of section.codeBlocks) {
|
||||
if (block.code.trim().length >= 20) {
|
||||
snippets.push({
|
||||
type: 'code',
|
||||
title,
|
||||
content: block.code,
|
||||
language: block.language || detectLanguage('.' + block.language),
|
||||
breadcrumb,
|
||||
tokenCount: estimateTokens(block.code),
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return snippets;
|
||||
}
```

---

## Code File Parser

For non-Markdown code files, use regex-based function/class boundary detection.

### Algorithm

1. Detect language-specific top-level declaration patterns.
2. Split the file at those boundaries.
3. Each chunk consists of the declaration line(s) plus the body up to the next declaration.
4. If a chunk exceeds `MAX_TOKENS`, apply sliding-window splitting with overlap.

```typescript
const BOUNDARY_PATTERNS: Record<string, RegExp> = {
|
||||
typescript: /^(export\s+)?(async\s+)?(function|class|interface|type|const|let|var)\s+\w+/m,
|
||||
python: /^(async\s+)?(def|class)\s+\w+/m,
|
||||
go: /^(func|type|var|const)\s+\w+/m,
|
||||
rust: /^(pub\s+)?(fn|impl|struct|enum|trait)\s+\w+/m,
|
||||
java: /^(public|private|protected|static).*?(class|interface|enum|void|\w+)\s+\w+\s*[({]/m,
|
||||
};
|
||||
|
||||
function parseCodeFile(
|
||||
content: string,
|
||||
filePath: string,
|
||||
language: string
|
||||
): Snippet[] {
|
||||
const pattern = BOUNDARY_PATTERNS[language];
|
||||
const breadcrumb = filePath;
|
||||
const title = path.basename(filePath);
|
||||
|
||||
if (!pattern) {
|
||||
// Fallback: sliding window
|
||||
return slidingWindowChunks(content, filePath, language);
|
||||
}
|
||||
|
||||
const chunks = splitAtBoundaries(content, pattern);
|
||||
return chunks
|
||||
.filter(chunk => chunk.trim().length >= 20)
|
||||
.flatMap(chunk => {
|
||||
if (estimateTokens(chunk) <= MAX_TOKENS) {
|
||||
return [{
|
||||
type: 'code' as const,
|
||||
title,
|
||||
content: chunk,
|
||||
language,
|
||||
breadcrumb,
|
||||
tokenCount: estimateTokens(chunk),
|
||||
}];
|
||||
}
|
||||
return slidingWindowChunks(chunk, filePath, language);
|
||||
});
|
||||
}
```

---

## Chunking Constants

```typescript
const MAX_TOKENS = 512;
|
||||
const OVERLAP_TOKENS = 50;
|
||||
const MIN_CONTENT_LENGTH = 20; // characters
```

### Sliding Window Chunker

```typescript
function chunkText(
|
||||
text: string,
|
||||
maxTokens: number,
|
||||
overlapTokens: number
|
||||
): string[] {
|
||||
const words = text.split(/\s+/);
|
||||
const wordsPerToken = 0.75; // ~0.75 words per token
|
||||
const maxWords = Math.floor(maxTokens * wordsPerToken);
|
||||
const overlapWords = Math.floor(overlapTokens * wordsPerToken);
|
||||
|
||||
const chunks: string[] = [];
|
||||
let start = 0;
|
||||
|
||||
while (start < words.length) {
|
||||
const end = Math.min(start + maxWords, words.length);
|
||||
chunks.push(words.slice(start, end).join(' '));
|
||||
if (end === words.length) break;
|
||||
start = end - overlapWords;
|
||||
}
|
||||
|
||||
return chunks;
|
||||
}
```

---

## Language Detection

```typescript
const LANGUAGE_MAP: Record<string, string> = {
|
||||
'.ts': 'typescript', '.tsx': 'typescript',
|
||||
'.js': 'javascript', '.jsx': 'javascript',
|
||||
'.py': 'python',
|
||||
'.rb': 'ruby',
|
||||
'.go': 'go',
|
||||
'.rs': 'rust',
|
||||
'.java': 'java',
|
||||
'.cs': 'csharp',
|
||||
'.cpp': 'cpp', '.c': 'c', '.h': 'c',
|
||||
'.swift': 'swift',
|
||||
'.kt': 'kotlin',
|
||||
'.php': 'php',
|
||||
'.scala': 'scala',
|
||||
'.sh': 'bash', '.bash': 'bash', '.zsh': 'bash',
|
||||
'.md': 'markdown', '.mdx': 'markdown',
|
||||
'.json': 'json',
|
||||
'.yaml': 'yaml', '.yml': 'yaml',
|
||||
'.toml': 'toml',
|
||||
'.html': 'html',
|
||||
'.css': 'css',
|
||||
'.svelte': 'svelte',
|
||||
'.vue': 'vue',
|
||||
'.sql': 'sql',
|
||||
};
|
||||
|
||||
function detectLanguage(filePath: string): string {
|
||||
const ext = path.extname(filePath).toLowerCase();
|
||||
return LANGUAGE_MAP[ext] ?? 'text';
|
||||
}
```

---

## Main Entry Point

```typescript
export interface ParseOptions {
|
||||
repositoryId: string;
|
||||
documentId: string;
|
||||
versionId?: string;
|
||||
}
|
||||
|
||||
export function parseFile(
|
||||
file: CrawledFile,
|
||||
options: ParseOptions
|
||||
): NewSnippet[] {
|
||||
const language = detectLanguage(file.path);
|
||||
let rawSnippets: Omit<NewSnippet, 'id' | 'repositoryId' | 'documentId' | 'versionId' | 'createdAt'>[];
|
||||
|
||||
if (language === 'markdown') {
|
||||
rawSnippets = parseMarkdown(file.content, file.path);
|
||||
} else {
|
||||
rawSnippets = parseCodeFile(file.content, file.path, language);
|
||||
}
|
||||
|
||||
return rawSnippets.map(s => ({
|
||||
...s,
|
||||
id: crypto.randomUUID(),
|
||||
repositoryId: options.repositoryId,
|
||||
documentId: options.documentId,
|
||||
versionId: options.versionId ?? null,
|
||||
createdAt: new Date(),
|
||||
}));
|
||||
}
```

---

## Files to Create

- `src/lib/server/parser/markdown.parser.ts`
- `src/lib/server/parser/code.parser.ts`
- `src/lib/server/parser/chunker.ts`
- `src/lib/server/parser/language.ts`
- `src/lib/server/parser/index.ts` — exports `parseFile`
- `src/lib/server/parser/markdown.parser.test.ts`
- `src/lib/server/parser/code.parser.test.ts`