Files
trueref/docs/features/TRUEREF-0005.md
2026-03-27 02:23:01 +01:00

8.5 KiB

TRUEREF-0005 — Document Parser & Chunker

Priority: P0 · Status: Pending · Depends On: TRUEREF-0001 · Blocks: TRUEREF-0006, TRUEREF-0007, TRUEREF-0009


Overview

Implement the document parsing and chunking pipeline that transforms raw file contents (from the crawlers) into structured, searchable Snippet records. This is the core intellectual layer of TrueRef — the quality of the chunks directly determines the quality of documentation retrieval.


Acceptance Criteria

  • Parse Markdown files into heading-based sections (info snippets)
  • Extract fenced code blocks from Markdown as separate code snippets
  • Parse standalone code files into function/class-level chunks
  • Respect token limits per chunk (max 512 tokens, with 50-token overlap)
  • Assign breadcrumb paths based on heading hierarchy (Markdown) or file path (code)
  • Detect programming language from file extension
  • Produce both code and info type snippets
  • Calculate approximate token counts using character-based estimation
  • Skip empty or trivially short content (< 20 chars)
  • Unit tests with representative samples of each file type

Supported File Types

| Extension | Parser Strategy |
| --- | --- |
| `.md`, `.mdx` | Heading-based section splitting + code block extraction |
| `.txt`, `.rst` | Paragraph-based splitting |
| `.ts`, `.tsx`, `.js`, `.jsx` | AST-free: function/class boundary detection via regex |
| `.py` | `def`/`class` boundary detection |
| `.go` | `func`/`type` boundary detection |
| `.rs` | `fn`/`impl`/`struct` boundary detection |
| `.java`, `.cs`, `.kt`, `.swift` | Class/method boundary detection |
| `.rb` | `def`/`class` boundary detection |
| `.json`, `.yaml`, `.yml`, `.toml` | Structural chunking (top-level keys) |
| `.html`, `.svelte`, `.vue` | Text content extraction + script block splitting |
| Other code | Line-count-based sliding window (200 lines per chunk) |

Token Counting

Use a simple character-based approximation (no tokenizer library needed for v1):

/**
 * Rough token-count heuristic for v1 — no tokenizer dependency.
 * English prose averages ~4 chars/token and code ~3 chars/token,
 * so 3.5 splits the difference for mixed documentation content.
 */
function estimateTokens(text: string): number {
	const charsPerToken = 3.5;
	return Math.ceil(text.length / charsPerToken);
}

Markdown Parser

The Markdown parser is the most important parser as most documentation is Markdown.

Algorithm

  1. Split the file into lines.
  2. Track current heading stack (H1 > H2 > H3 > H4).
  3. When a new heading is encountered, emit the accumulated content as an info snippet.
  4. Fenced code blocks (```) within sections are extracted as separate code snippets.
  5. The breadcrumb is built from the heading stack: "Getting Started > Installation".
/**
 * One heading-delimited section of a Markdown document, as produced by
 * splitIntoSections and consumed by parseMarkdown.
 */
interface MarkdownSection {
	headings: string[]; // heading stack at this point
	content: string; // text content (sans code blocks)
	codeBlocks: { language: string; code: string }[]; // fenced blocks; language is the fence info string (may be '')
}

/**
 * Parse a Markdown document into snippets.
 *
 * Each heading-delimited section yields:
 *  - zero or more 'info' snippets (the section's prose, chunked to MAX_TOKENS
 *    with OVERLAP_TOKENS of overlap), and
 *  - one 'code' snippet per fenced code block.
 *
 * Content shorter than 20 characters is skipped (MIN_CONTENT_LENGTH policy).
 *
 * @param content  Raw Markdown text.
 * @param filePath Source path; its basename titles sections with no heading.
 * @returns Snippets in document order.
 */
function parseMarkdown(content: string, filePath: string): Snippet[] {
	const sections = splitIntoSections(content);
	const snippets: Snippet[] = [];

	for (const section of sections) {
		// Breadcrumb mirrors the heading stack, e.g. "Getting Started > Installation".
		const breadcrumb = section.headings.join(' > ');
		const title = section.headings.at(-1) ?? path.basename(filePath);

		// Emit info snippet(s) for the section's prose.
		if (section.content.trim().length >= 20) {
			const chunks = chunkText(section.content, MAX_TOKENS, OVERLAP_TOKENS);
			for (const chunk of chunks) {
				snippets.push({
					type: 'info',
					title,
					content: chunk,
					breadcrumb,
					tokenCount: estimateTokens(chunk)
				});
			}
		}

		// Emit one code snippet per fenced block.
		for (const block of section.codeBlocks) {
			if (block.code.trim().length >= 20) {
				snippets.push({
					type: 'code',
					title,
					content: block.code,
					// The fence info string ("```ts") is a language name, not a file
					// extension, so the extension-based detectLanguage does not apply
					// here. Fall back to 'text' when the fence has no info string —
					// the old `detectLanguage('.' + block.language)` only produced
					// 'text' by accident (extname('.') === '') and was dead code
					// whenever block.language was truthy.
					language: block.language || 'text',
					breadcrumb,
					tokenCount: estimateTokens(block.code)
				});
			}
		}
	}

	return snippets;
}

Code File Parser

For non-Markdown code files, use regex-based function/class boundary detection.

Algorithm

  1. Detect language-specific top-level declaration patterns.
  2. Split the file at those boundaries.
  3. Each chunk: the declaration line(s) + body up to the next declaration.
  4. If a chunk exceeds MAX_TOKENS, apply sliding window splitting with overlap.
/**
 * Regex patterns marking top-level declaration boundaries, keyed by the
 * language name produced by detectLanguage. Languages absent from this map
 * fall back to sliding-window chunking in parseCodeFile.
 */
const BOUNDARY_PATTERNS: Record<string, RegExp> = {
	typescript: /^(export\s+)?(async\s+)?(function|class|interface|type|const|let|var)\s+\w+/m,
	// Previously missing: the supported-file-types table promises boundary
	// detection for .js/.jsx, but without this key JavaScript files wrongly
	// fell through to the sliding-window fallback.
	javascript: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/m,
	python: /^(async\s+)?(def|class)\s+\w+/m,
	// Previously missing: .rb def/class detection is promised by the table.
	ruby: /^(def|class|module)\s+\w+/m,
	go: /^(func|type|var|const)\s+\w+/m,
	rust: /^(pub\s+)?(fn|impl|struct|enum|trait)\s+\w+/m,
	java: /^(public|private|protected|static).*?(class|interface|enum|void|\w+)\s+\w+\s*[({]/m
};

/**
 * Split a non-Markdown source file into code snippets.
 *
 * Uses the language's boundary regex when one exists; otherwise (and for any
 * individual chunk that overflows MAX_TOKENS) falls back to sliding-window
 * chunking. Chunks shorter than 20 characters are dropped.
 */
function parseCodeFile(content: string, filePath: string, language: string): Snippet[] {
	const pattern = BOUNDARY_PATTERNS[language];

	// No boundary pattern for this language: window over the whole file.
	if (!pattern) {
		return slidingWindowChunks(content, filePath, language);
	}

	const title = path.basename(filePath);
	const breadcrumb = filePath; // code files use the path itself as breadcrumb
	const snippets: Snippet[] = [];

	for (const chunk of splitAtBoundaries(content, pattern)) {
		if (chunk.trim().length < 20) continue; // skip trivially short chunks

		const tokenCount = estimateTokens(chunk);
		if (tokenCount > MAX_TOKENS) {
			// Oversized declaration: re-chunk it with the sliding window.
			snippets.push(...slidingWindowChunks(chunk, filePath, language));
		} else {
			snippets.push({
				type: 'code' as const,
				title,
				content: chunk,
				language,
				breadcrumb,
				tokenCount
			});
		}
	}

	return snippets;
}

Chunking Constants

const MAX_TOKENS = 512; // hard cap per chunk (see acceptance criteria)
const OVERLAP_TOKENS = 50; // context carried between adjacent chunks
const MIN_CONTENT_LENGTH = 20; // characters; shorter content is skipped as trivial

Sliding Window Chunker

/**
 * Split prose into word-window chunks of at most `maxTokens`, where adjacent
 * chunks share roughly `overlapTokens` of trailing/leading context.
 *
 * Token→word conversion uses the ~0.75 words-per-token heuristic, matching
 * estimateTokens' character-based approximation.
 *
 * @param text          Prose to split (whitespace-tokenized).
 * @param maxTokens     Maximum tokens per chunk; values that round to zero
 *                      words are clamped to a one-word window.
 * @param overlapTokens Desired overlap; clamped strictly below the window
 *                      size so the cursor always advances — the unclamped
 *                      original looped forever when overlap >= window or
 *                      when maxWords rounded to 0.
 * @returns Chunks in order; the final chunk may be shorter than the window.
 */
function chunkText(text: string, maxTokens: number, overlapTokens: number): string[] {
	const words = text.split(/\s+/);
	const wordsPerToken = 0.75; // ~0.75 words per token
	// Clamp so the window is at least one word and the overlap is smaller
	// than the window — otherwise `start` never advances (infinite loop).
	const maxWords = Math.max(1, Math.floor(maxTokens * wordsPerToken));
	const overlapWords = Math.max(0, Math.min(Math.floor(overlapTokens * wordsPerToken), maxWords - 1));

	const chunks: string[] = [];
	let start = 0;

	while (start < words.length) {
		const end = Math.min(start + maxWords, words.length);
		chunks.push(words.slice(start, end).join(' '));
		if (end === words.length) break;
		start = end - overlapWords;
	}

	return chunks;
}

Language Detection

/** File extension → canonical language name. Keys are lowercase with leading dot. */
const LANGUAGE_MAP: Record<string, string> = {
	// Scripting & dynamic languages
	'.js': 'javascript',
	'.jsx': 'javascript',
	'.ts': 'typescript',
	'.tsx': 'typescript',
	'.py': 'python',
	'.rb': 'ruby',
	'.php': 'php',
	'.sh': 'bash',
	'.bash': 'bash',
	'.zsh': 'bash',
	// Compiled languages
	'.go': 'go',
	'.rs': 'rust',
	'.java': 'java',
	'.cs': 'csharp',
	'.kt': 'kotlin',
	'.scala': 'scala',
	'.swift': 'swift',
	'.c': 'c',
	'.h': 'c',
	'.cpp': 'cpp',
	// Markup, styles & data
	'.md': 'markdown',
	'.mdx': 'markdown',
	'.html': 'html',
	'.css': 'css',
	'.svelte': 'svelte',
	'.vue': 'vue',
	'.json': 'json',
	'.yaml': 'yaml',
	'.yml': 'yaml',
	'.toml': 'toml',
	'.sql': 'sql'
};

/**
 * Map a file path to its language name via the extension table.
 * Unknown or missing extensions resolve to 'text'.
 */
function detectLanguage(filePath: string): string {
	const extension = path.extname(filePath).toLowerCase();
	const language = LANGUAGE_MAP[extension];
	return language === undefined ? 'text' : language;
}

Main Entry Point

/**
 * Identifiers stamped onto every snippet produced by parseFile.
 */
export interface ParseOptions {
	repositoryId: string; // owning repository
	documentId: string; // source document the snippets came from
	versionId?: string; // optional docs version; stored as null when absent
}

/**
 * Parse a crawled file into persistable snippet records.
 *
 * Markdown routes through the heading-aware parser; everything else routes
 * through the code-file parser. Each raw snippet is then stamped with a
 * fresh UUID, the owning repository/document ids, the (optional) version
 * id, and a creation timestamp.
 */
export function parseFile(file: CrawledFile, options: ParseOptions): NewSnippet[] {
	const language = detectLanguage(file.path);

	const rawSnippets: Omit<
		NewSnippet,
		'id' | 'repositoryId' | 'documentId' | 'versionId' | 'createdAt'
	>[] =
		language === 'markdown'
			? parseMarkdown(file.content, file.path)
			: parseCodeFile(file.content, file.path, language);

	return rawSnippets.map((snippet) => ({
		...snippet,
		id: crypto.randomUUID(),
		repositoryId: options.repositoryId,
		documentId: options.documentId,
		versionId: options.versionId ?? null,
		createdAt: new Date()
	}));
}

Files to Create

  • src/lib/server/parser/markdown.parser.ts
  • src/lib/server/parser/code.parser.ts
  • src/lib/server/parser/chunker.ts
  • src/lib/server/parser/language.ts
  • src/lib/server/parser/index.ts — exports parseFile
  • src/lib/server/parser/markdown.parser.test.ts
  • src/lib/server/parser/code.parser.test.ts