chore: initial project scaffold

This commit is contained in:
Giancarmine Salucci
2026-03-22 17:08:15 +01:00
commit 18437dfa7c
53 changed files with 12002 additions and 0 deletions

View File

@@ -0,0 +1,297 @@
# TRUEREF-0005 — Document Parser & Chunker
**Priority:** P0
**Status:** Pending
**Depends On:** TRUEREF-0001
**Blocks:** TRUEREF-0006, TRUEREF-0007, TRUEREF-0009
---
## Overview
Implement the document parsing and chunking pipeline that transforms raw file contents (from the crawlers) into structured, searchable `Snippet` records. This is the core intellectual layer of TrueRef — the quality of the chunks directly determines the quality of documentation retrieval.
---
## Acceptance Criteria
- [ ] Parse Markdown files into heading-based sections (info snippets)
- [ ] Extract fenced code blocks from Markdown as separate code snippets
- [ ] Parse standalone code files into function/class-level chunks
- [ ] Respect token limits per chunk (max 512 tokens, with 50-token overlap)
- [ ] Assign breadcrumb paths based on heading hierarchy (Markdown) or file path (code)
- [ ] Detect programming language from file extension
- [ ] Produce both `code` and `info` type snippets
- [ ] Calculate approximate token counts using character-based estimation
- [ ] Skip empty or trivially short content (< 20 chars)
- [ ] Unit tests with representative samples of each file type
---
## Supported File Types
| Extension | Parser Strategy |
|-----------|----------------|
| `.md`, `.mdx` | Heading-based section splitting + code block extraction |
| `.txt`, `.rst` | Paragraph-based splitting |
| `.ts`, `.tsx`, `.js`, `.jsx` | AST-free: function/class boundary detection via regex |
| `.py` | `def`/`class` boundary detection |
| `.go` | `func`/`type` boundary detection |
| `.rs` | `fn`/`impl`/`struct` boundary detection |
| `.java`, `.cs`, `.kt`, `.swift` | Class/method boundary detection |
| `.rb` | `def`/`class` boundary detection |
| `.json`, `.yaml`, `.yml`, `.toml` | Structural chunking (top-level keys) |
| `.html`, `.svelte`, `.vue` | Text content extraction + script block splitting |
| Other code | Line-count-based sliding window (200 lines per chunk) |
---
## Token Counting
Use a simple character-based approximation (no tokenizer library needed for v1):
```typescript
/**
 * Approximate the token count of a string with a character-based heuristic.
 * Empirically English prose runs ~4 chars/token and code ~3 chars/token
 * (more symbols), so 3.5 is used as a compromise — no tokenizer library
 * is needed for v1.
 */
function estimateTokens(text: string): number {
  const CHARS_PER_TOKEN = 3.5;
  return Math.ceil(text.length / CHARS_PER_TOKEN);
}
```
---
## Markdown Parser
The Markdown parser is the most important one, since the majority of documentation is written in Markdown.
### Algorithm
1. Split the file into lines.
2. Track the current heading stack (H1 > H2 > H3 > H4).
3. When a new heading is encountered, emit the accumulated content as an info snippet.
4. Fenced code blocks (` ``` `) within sections are extracted as separate code snippets.
5. The breadcrumb is built from the heading stack: `"Getting Started > Installation"`.
```typescript
/** One heading-delimited section of a Markdown document, as produced by splitIntoSections. */
interface MarkdownSection {
  headings: string[]; // heading stack at this point (outermost heading first)
  content: string; // text content (sans code blocks)
  codeBlocks: { language: string; code: string }[]; // fenced blocks found in this section; language is the fence info string (may be '')
}
function parseMarkdown(content: string, filePath: string): Snippet[] {
const sections = splitIntoSections(content);
const snippets: Snippet[] = [];
for (const section of sections) {
const breadcrumb = section.headings.join(' > ');
const title = section.headings.at(-1) ?? path.basename(filePath);
// Emit info snippet for text content
if (section.content.trim().length >= 20) {
const chunks = chunkText(section.content, MAX_TOKENS, OVERLAP_TOKENS);
for (const chunk of chunks) {
snippets.push({
type: 'info',
title,
content: chunk,
breadcrumb,
tokenCount: estimateTokens(chunk),
});
}
}
// Emit code snippets for each code block
for (const block of section.codeBlocks) {
if (block.code.trim().length >= 20) {
snippets.push({
type: 'code',
title,
content: block.code,
language: block.language || detectLanguage('.' + block.language),
breadcrumb,
tokenCount: estimateTokens(block.code),
});
}
}
}
return snippets;
}
```
---
## Code File Parser
For non-Markdown code files, use regex-based function/class boundary detection.
### Algorithm
1. Detect language-specific top-level declaration patterns.
2. Split the file at those boundaries.
3. Each chunk: the declaration line(s) + body up to the next declaration.
4. If a chunk exceeds `MAX_TOKENS`, apply sliding window splitting with overlap.
```typescript
// Language → regex matching a top-level declaration line (multiline mode).
// Used by parseCodeFile to find chunk boundaries; languages absent from this
// map fall back to sliding-window chunking.
const BOUNDARY_PATTERNS: Record<string, RegExp> = {
  typescript: /^(export\s+)?(async\s+)?(function|class|interface|type|const|let|var)\s+\w+/m,
  python: /^(async\s+)?(def|class)\s+\w+/m,
  go: /^(func|type|var|const)\s+\w+/m,
  rust: /^(pub\s+)?(fn|impl|struct|enum|trait)\s+\w+/m,
  java: /^(public|private|protected|static).*?(class|interface|enum|void|\w+)\s+\w+\s*[({]/m,
};
function parseCodeFile(
content: string,
filePath: string,
language: string
): Snippet[] {
const pattern = BOUNDARY_PATTERNS[language];
const breadcrumb = filePath;
const title = path.basename(filePath);
if (!pattern) {
// Fallback: sliding window
return slidingWindowChunks(content, filePath, language);
}
const chunks = splitAtBoundaries(content, pattern);
return chunks
.filter(chunk => chunk.trim().length >= 20)
.flatMap(chunk => {
if (estimateTokens(chunk) <= MAX_TOKENS) {
return [{
type: 'code' as const,
title,
content: chunk,
language,
breadcrumb,
tokenCount: estimateTokens(chunk),
}];
}
return slidingWindowChunks(chunk, filePath, language);
});
}
```
---
## Chunking Constants
```typescript
const MAX_TOKENS = 512; // hard cap per chunk (approximate tokens; see estimateTokens)
const OVERLAP_TOKENS = 50; // overlap carried between consecutive sliding-window chunks
const MIN_CONTENT_LENGTH = 20; // characters; shorter content is skipped as trivial
```
### Sliding Window Chunker
```typescript
/**
 * Split text into word-based chunks of at most maxTokens (approximate),
 * carrying overlapTokens of overlap between consecutive chunks.
 *
 * Fixes over the naive version: `"".split(/\s+/)` yields `[""]`, so the
 * original emitted one junk chunk for empty/whitespace-only input and an
 * empty leading word for text starting with whitespace; tiny maxTokens or
 * an overlap >= the window could stall the loop.
 *
 * @returns [] for empty/whitespace-only input; otherwise one or more chunks.
 */
function chunkText(
  text: string,
  maxTokens: number,
  overlapTokens: number
): string[] {
  // Drop empty strings produced by leading/trailing whitespace or "".
  const words = text.split(/\s+/).filter(word => word.length > 0);
  if (words.length === 0) return [];
  const wordsPerToken = 0.75; // ~0.75 words per token
  // Clamp so degenerate inputs (tiny maxTokens, huge overlap) always make progress.
  const maxWords = Math.max(1, Math.floor(maxTokens * wordsPerToken));
  const overlapWords = Math.min(
    Math.floor(overlapTokens * wordsPerToken),
    maxWords - 1
  );
  const chunks: string[] = [];
  let start = 0;
  while (start < words.length) {
    const end = Math.min(start + maxWords, words.length);
    chunks.push(words.slice(start, end).join(' '));
    if (end === words.length) break;
    start = end - overlapWords; // always > start because overlapWords < maxWords
  }
  return chunks;
}
```
---
## Language Detection
```typescript
/** File extension (lowercase, dot included) → language identifier. */
const LANGUAGE_MAP: Record<string, string> = {
  // Programming languages
  '.ts': 'typescript',
  '.tsx': 'typescript',
  '.js': 'javascript',
  '.jsx': 'javascript',
  '.py': 'python',
  '.rb': 'ruby',
  '.go': 'go',
  '.rs': 'rust',
  '.java': 'java',
  '.cs': 'csharp',
  '.cpp': 'cpp',
  '.c': 'c',
  '.h': 'c',
  '.swift': 'swift',
  '.kt': 'kotlin',
  '.php': 'php',
  '.scala': 'scala',
  '.sh': 'bash',
  '.bash': 'bash',
  '.zsh': 'bash',
  '.sql': 'sql',
  // Markup, config, and styling
  '.md': 'markdown',
  '.mdx': 'markdown',
  '.json': 'json',
  '.yaml': 'yaml',
  '.yml': 'yaml',
  '.toml': 'toml',
  '.html': 'html',
  '.css': 'css',
  '.svelte': 'svelte',
  '.vue': 'vue',
};

/**
 * Map a file path to a language identifier via its extension.
 * Unknown or missing extensions yield 'text'.
 */
function detectLanguage(filePath: string): string {
  const extension = path.extname(filePath).toLowerCase();
  const language = LANGUAGE_MAP[extension];
  return language ?? 'text';
}
```
---
## Main Entry Point
```typescript
/** Identifiers attached to every snippet produced by parseFile. */
export interface ParseOptions {
  repositoryId: string; // owning repository
  documentId: string; // source document the snippets belong to
  versionId?: string; // optional version; stored as null when absent
}
/**
 * Parse a crawled file into snippet records ready for insertion.
 * Markdown files are routed to the Markdown parser; everything else is
 * treated as code and chunked at declaration boundaries.
 */
export function parseFile(
  file: CrawledFile,
  options: ParseOptions
): NewSnippet[] {
  const language = detectLanguage(file.path);
  const parsed: Omit<NewSnippet, 'id' | 'repositoryId' | 'documentId' | 'versionId' | 'createdAt'>[] =
    language === 'markdown'
      ? parseMarkdown(file.content, file.path)
      : parseCodeFile(file.content, file.path, language);
  // Attach identifiers and bookkeeping fields to each raw snippet.
  return parsed.map(snippet => ({
    ...snippet,
    id: crypto.randomUUID(),
    repositoryId: options.repositoryId,
    documentId: options.documentId,
    versionId: options.versionId ?? null,
    createdAt: new Date(),
  }));
}
```
---
## Files to Create
- `src/lib/server/parser/markdown.parser.ts`
- `src/lib/server/parser/code.parser.ts`
- `src/lib/server/parser/chunker.ts`
- `src/lib/server/parser/language.ts`
- `src/lib/server/parser/index.ts` — exports `parseFile`
- `src/lib/server/parser/markdown.parser.test.ts`
- `src/lib/server/parser/code.parser.test.ts`