# TRUEREF-0005 — Document Parser & Chunker
|
|
|
|
**Priority:** P0
|
|
**Status:** Pending
|
|
**Depends On:** TRUEREF-0001
|
|
**Blocks:** TRUEREF-0006, TRUEREF-0007, TRUEREF-0009
|
|
|
|
---
|
|
|
|
## Overview
|
|
|
|
Implement the document parsing and chunking pipeline that transforms raw file contents (from the crawlers) into structured, searchable `Snippet` records. This is the core intellectual layer of TrueRef — the quality of the chunks directly determines the quality of documentation retrieval.
|
|
|
|
---
|
|
|
|
## Acceptance Criteria
|
|
|
|
- [ ] Parse Markdown files into heading-based sections (info snippets)
|
|
- [ ] Extract fenced code blocks from Markdown as separate code snippets
|
|
- [ ] Parse standalone code files into function/class-level chunks
|
|
- [ ] Respect token limits per chunk (max 512 tokens, with 50-token overlap)
|
|
- [ ] Assign breadcrumb paths based on heading hierarchy (Markdown) or file path (code)
|
|
- [ ] Detect programming language from file extension
|
|
- [ ] Produce both `code` and `info` type snippets
|
|
- [ ] Calculate approximate token counts using character-based estimation
|
|
- [ ] Skip empty or trivially short content (< 20 chars)
|
|
- [ ] Unit tests with representative samples of each file type
|
|
|
|
---
|
|
|
|
## Supported File Types
|
|
|
|
| Extension | Parser Strategy |
|
|
| --------------------------------- | ------------------------------------------------------- |
|
|
| `.md`, `.mdx` | Heading-based section splitting + code block extraction |
|
|
| `.txt`, `.rst` | Paragraph-based splitting |
|
|
| `.ts`, `.tsx`, `.js`, `.jsx` | AST-free: function/class boundary detection via regex |
|
|
| `.py` | `def`/`class` boundary detection |
|
|
| `.go` | `func`/`type` boundary detection |
|
|
| `.rs` | `fn`/`impl`/`struct` boundary detection |
|
|
| `.java`, `.cs`, `.kt`, `.swift` | Class/method boundary detection |
|
|
| `.rb` | `def`/`class` boundary detection |
|
|
| `.json`, `.yaml`, `.yml`, `.toml` | Structural chunking (top-level keys) |
|
|
| `.html`, `.svelte`, `.vue` | Text content extraction + script block splitting |
|
|
| Other code | Line-count-based sliding window (200 lines per chunk) |
|
|
|
|
---
|
|
|
|
## Token Counting
|
|
|
|
Use a simple character-based approximation (no tokenizer library needed for v1):
|
|
|
|
```typescript
|
|
function estimateTokens(text: string): number {
|
|
// Empirically: ~4 chars per token for English prose
|
|
// ~3 chars per token for code (more symbols)
|
|
return Math.ceil(text.length / 3.5);
|
|
}
|
|
```
|
|
|
|
---
|
|
|
|
## Markdown Parser
|
|
|
|
The Markdown parser matters most, since the majority of documentation is written in Markdown.
|
|
|
|
### Algorithm
|
|
|
|
1. Split the file into lines.
|
|
2. Track current heading stack (H1 > H2 > H3 > H4).
|
|
3. When a new heading is encountered, emit the accumulated content as an info snippet.
|
|
4. Fenced code blocks (` ``` `) within sections are extracted as separate code snippets.
|
|
5. The breadcrumb is built from the heading stack: `"Getting Started > Installation"`.
|
|
|
|
```typescript
|
|
// A contiguous region of a Markdown document delimited by headings.
// Produced by splitIntoSections and consumed by parseMarkdown.
interface MarkdownSection {
  headings: string[]; // heading stack at this point (outermost H1 first)
  content: string; // text content (sans code blocks)
  // Fenced blocks found in this section; `language` is the fence info
  // string and may be empty when the fence is untagged.
  codeBlocks: { language: string; code: string }[];
}
|
|
|
|
function parseMarkdown(content: string, filePath: string): Snippet[] {
|
|
const sections = splitIntoSections(content);
|
|
const snippets: Snippet[] = [];
|
|
|
|
for (const section of sections) {
|
|
const breadcrumb = section.headings.join(' > ');
|
|
const title = section.headings.at(-1) ?? path.basename(filePath);
|
|
|
|
// Emit info snippet for text content
|
|
if (section.content.trim().length >= 20) {
|
|
const chunks = chunkText(section.content, MAX_TOKENS, OVERLAP_TOKENS);
|
|
for (const chunk of chunks) {
|
|
snippets.push({
|
|
type: 'info',
|
|
title,
|
|
content: chunk,
|
|
breadcrumb,
|
|
tokenCount: estimateTokens(chunk)
|
|
});
|
|
}
|
|
}
|
|
|
|
// Emit code snippets for each code block
|
|
for (const block of section.codeBlocks) {
|
|
if (block.code.trim().length >= 20) {
|
|
snippets.push({
|
|
type: 'code',
|
|
title,
|
|
content: block.code,
|
|
language: block.language || detectLanguage('.' + block.language),
|
|
breadcrumb,
|
|
tokenCount: estimateTokens(block.code)
|
|
});
|
|
}
|
|
}
|
|
}
|
|
|
|
return snippets;
|
|
}
|
|
```
|
|
|
|
---
|
|
|
|
## Code File Parser
|
|
|
|
For non-Markdown code files, use regex-based function/class boundary detection.
|
|
|
|
### Algorithm
|
|
|
|
1. Detect language-specific top-level declaration patterns.
|
|
2. Split the file at those boundaries.
|
|
3. Each chunk: the declaration line(s) + body up to the next declaration.
|
|
4. If a chunk exceeds `MAX_TOKENS`, apply sliding window splitting with overlap.
|
|
|
|
```typescript
|
|
const BOUNDARY_PATTERNS: Record<string, RegExp> = {
|
|
typescript: /^(export\s+)?(async\s+)?(function|class|interface|type|const|let|var)\s+\w+/m,
|
|
python: /^(async\s+)?(def|class)\s+\w+/m,
|
|
go: /^(func|type|var|const)\s+\w+/m,
|
|
rust: /^(pub\s+)?(fn|impl|struct|enum|trait)\s+\w+/m,
|
|
java: /^(public|private|protected|static).*?(class|interface|enum|void|\w+)\s+\w+\s*[({]/m
|
|
};
|
|
|
|
function parseCodeFile(content: string, filePath: string, language: string): Snippet[] {
|
|
const pattern = BOUNDARY_PATTERNS[language];
|
|
const breadcrumb = filePath;
|
|
const title = path.basename(filePath);
|
|
|
|
if (!pattern) {
|
|
// Fallback: sliding window
|
|
return slidingWindowChunks(content, filePath, language);
|
|
}
|
|
|
|
const chunks = splitAtBoundaries(content, pattern);
|
|
return chunks
|
|
.filter((chunk) => chunk.trim().length >= 20)
|
|
.flatMap((chunk) => {
|
|
if (estimateTokens(chunk) <= MAX_TOKENS) {
|
|
return [
|
|
{
|
|
type: 'code' as const,
|
|
title,
|
|
content: chunk,
|
|
language,
|
|
breadcrumb,
|
|
tokenCount: estimateTokens(chunk)
|
|
}
|
|
];
|
|
}
|
|
return slidingWindowChunks(chunk, filePath, language);
|
|
});
|
|
}
|
|
```
|
|
|
|
---
|
|
|
|
## Chunking Constants
|
|
|
|
```typescript
|
|
// Chunking budget shared by all parsers.
const MAX_TOKENS = 512; // hard cap per snippet (estimated tokens)
const OVERLAP_TOKENS = 50; // estimated-token overlap between adjacent sliding-window chunks
const MIN_CONTENT_LENGTH = 20; // characters; shorter content is skipped
|
|
```
|
|
|
|
### Sliding Window Chunker
|
|
|
|
```typescript
|
|
function chunkText(text: string, maxTokens: number, overlapTokens: number): string[] {
|
|
const words = text.split(/\s+/);
|
|
const wordsPerToken = 0.75; // ~0.75 words per token
|
|
const maxWords = Math.floor(maxTokens * wordsPerToken);
|
|
const overlapWords = Math.floor(overlapTokens * wordsPerToken);
|
|
|
|
const chunks: string[] = [];
|
|
let start = 0;
|
|
|
|
while (start < words.length) {
|
|
const end = Math.min(start + maxWords, words.length);
|
|
chunks.push(words.slice(start, end).join(' '));
|
|
if (end === words.length) break;
|
|
start = end - overlapWords;
|
|
}
|
|
|
|
return chunks;
|
|
}
|
|
```
|
|
|
|
---
|
|
|
|
## Language Detection
|
|
|
|
```typescript
|
|
const LANGUAGE_MAP: Record<string, string> = {
|
|
'.ts': 'typescript',
|
|
'.tsx': 'typescript',
|
|
'.js': 'javascript',
|
|
'.jsx': 'javascript',
|
|
'.py': 'python',
|
|
'.rb': 'ruby',
|
|
'.go': 'go',
|
|
'.rs': 'rust',
|
|
'.java': 'java',
|
|
'.cs': 'csharp',
|
|
'.cpp': 'cpp',
|
|
'.c': 'c',
|
|
'.h': 'c',
|
|
'.swift': 'swift',
|
|
'.kt': 'kotlin',
|
|
'.php': 'php',
|
|
'.scala': 'scala',
|
|
'.sh': 'bash',
|
|
'.bash': 'bash',
|
|
'.zsh': 'bash',
|
|
'.md': 'markdown',
|
|
'.mdx': 'markdown',
|
|
'.json': 'json',
|
|
'.yaml': 'yaml',
|
|
'.yml': 'yaml',
|
|
'.toml': 'toml',
|
|
'.html': 'html',
|
|
'.css': 'css',
|
|
'.svelte': 'svelte',
|
|
'.vue': 'vue',
|
|
'.sql': 'sql'
|
|
};
|
|
|
|
function detectLanguage(filePath: string): string {
|
|
const ext = path.extname(filePath).toLowerCase();
|
|
return LANGUAGE_MAP[ext] ?? 'text';
|
|
}
|
|
```
|
|
|
|
---
|
|
|
|
## Main Entry Point
|
|
|
|
```typescript
|
|
/** Identifiers attached to every snippet produced from one crawled file. */
export interface ParseOptions {
  repositoryId: string; // owning repository record
  documentId: string; // source document record
  versionId?: string; // optional doc version; stored as null when absent
}
|
|
|
|
export function parseFile(file: CrawledFile, options: ParseOptions): NewSnippet[] {
|
|
const language = detectLanguage(file.path);
|
|
let rawSnippets: Omit<
|
|
NewSnippet,
|
|
'id' | 'repositoryId' | 'documentId' | 'versionId' | 'createdAt'
|
|
>[];
|
|
|
|
if (language === 'markdown') {
|
|
rawSnippets = parseMarkdown(file.content, file.path);
|
|
} else {
|
|
rawSnippets = parseCodeFile(file.content, file.path, language);
|
|
}
|
|
|
|
return rawSnippets.map((s) => ({
|
|
...s,
|
|
id: crypto.randomUUID(),
|
|
repositoryId: options.repositoryId,
|
|
documentId: options.documentId,
|
|
versionId: options.versionId ?? null,
|
|
createdAt: new Date()
|
|
}));
|
|
}
|
|
```
|
|
|
|
---
|
|
|
|
## Files to Create
|
|
|
|
- `src/lib/server/parser/markdown.parser.ts`
|
|
- `src/lib/server/parser/code.parser.ts`
|
|
- `src/lib/server/parser/chunker.ts`
|
|
- `src/lib/server/parser/language.ts`
|
|
- `src/lib/server/parser/index.ts` — exports `parseFile`
|
|
- `src/lib/server/parser/markdown.parser.test.ts`
|
|
- `src/lib/server/parser/code.parser.test.ts`
|