feat(TRUEREF-0005): implement document parser and chunker
- Markdown parser with heading-based section splitting and code block extraction
- Code file parser with regex boundary detection for 10+ languages
- Sliding window chunker with configurable token limits and overlap
- Language detection from file extensions

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
92
src/lib/server/parser/chunker.ts
Normal file
92
src/lib/server/parser/chunker.ts
Normal file
@@ -0,0 +1,92 @@
|
|||||||
|
/**
|
||||||
|
* Text chunking utilities for the document parser (TRUEREF-0005).
|
||||||
|
*
|
||||||
|
* Provides sliding-window chunking with overlap and token estimation.
|
||||||
|
*/
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Constants
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
export const MAX_TOKENS = 512;
|
||||||
|
export const OVERLAP_TOKENS = 50;
|
||||||
|
export const MIN_CONTENT_LENGTH = 20; // characters
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Token estimation
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Estimate the token count for a piece of text using a character-based
|
||||||
|
* approximation (~3.5 chars per token on average for mixed prose/code).
|
||||||
|
*/
|
||||||
|
export function estimateTokens(text: string): number {
|
||||||
|
return Math.ceil(text.length / 3.5);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Sliding-window chunker
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Split `text` into overlapping word-based chunks that stay within the token
|
||||||
|
* budget. Returns at least one chunk even when the text fits in a single
|
||||||
|
* window.
|
||||||
|
*/
|
||||||
|
export function chunkText(
|
||||||
|
text: string,
|
||||||
|
maxTokens: number = MAX_TOKENS,
|
||||||
|
overlapTokens: number = OVERLAP_TOKENS
|
||||||
|
): string[] {
|
||||||
|
const words = text.split(/\s+/).filter((w) => w.length > 0);
|
||||||
|
if (words.length === 0) return [];
|
||||||
|
|
||||||
|
// ~0.75 words per token
|
||||||
|
const maxWords = Math.max(1, Math.floor(maxTokens * 0.75));
|
||||||
|
const overlapWords = Math.max(0, Math.floor(overlapTokens * 0.75));
|
||||||
|
|
||||||
|
if (words.length <= maxWords) {
|
||||||
|
return [words.join(' ')];
|
||||||
|
}
|
||||||
|
|
||||||
|
const chunks: string[] = [];
|
||||||
|
let start = 0;
|
||||||
|
|
||||||
|
while (start < words.length) {
|
||||||
|
const end = Math.min(start + maxWords, words.length);
|
||||||
|
chunks.push(words.slice(start, end).join(' '));
|
||||||
|
if (end === words.length) break;
|
||||||
|
start = end - overlapWords;
|
||||||
|
// Guard against infinite loop when overlapWords >= maxWords
|
||||||
|
if (start <= 0) start = end;
|
||||||
|
}
|
||||||
|
|
||||||
|
return chunks;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Line-count sliding window (for code files without recognised boundaries)
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/** Split `lines` into groups of at most `maxLines` with `overlapLines` overlap. */
|
||||||
|
export function chunkLines(
|
||||||
|
lines: string[],
|
||||||
|
maxLines: number = 200,
|
||||||
|
overlapLines: number = 20
|
||||||
|
): string[] {
|
||||||
|
if (lines.length === 0) return [];
|
||||||
|
if (lines.length <= maxLines) return [lines.join('\n')];
|
||||||
|
|
||||||
|
const chunks: string[] = [];
|
||||||
|
let start = 0;
|
||||||
|
|
||||||
|
while (start < lines.length) {
|
||||||
|
const end = Math.min(start + maxLines, lines.length);
|
||||||
|
chunks.push(lines.slice(start, end).join('\n'));
|
||||||
|
if (end === lines.length) break;
|
||||||
|
start = end - overlapLines;
|
||||||
|
if (start <= 0) start = end;
|
||||||
|
}
|
||||||
|
|
||||||
|
return chunks;
|
||||||
|
}
|
||||||
404
src/lib/server/parser/code.parser.test.ts
Normal file
404
src/lib/server/parser/code.parser.test.ts
Normal file
@@ -0,0 +1,404 @@
|
|||||||
|
/**
|
||||||
|
* Unit tests for the code file parser (TRUEREF-0005).
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { describe, it, expect } from 'vitest';
|
||||||
|
import { parseCodeFile, BOUNDARY_PATTERNS } from './code.parser.js';
|
||||||
|
import { estimateTokens, MAX_TOKENS } from './chunker.js';
|
||||||
|
import { parseFile } from './index.js';
|
||||||
|
import type { CrawledFile } from '$lib/server/crawler/types.js';
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Helpers
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
function makeFile(path: string, content: string, language = 'typescript'): CrawledFile {
|
||||||
|
return { path, content, size: content.length, sha: 'abc123', language };
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// TypeScript / JavaScript boundary detection
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
describe('parseCodeFile — TypeScript', () => {
|
||||||
|
it('splits at function boundaries', () => {
|
||||||
|
const content = `
|
||||||
|
export function foo(): string {
|
||||||
|
return 'foo';
|
||||||
|
}
|
||||||
|
|
||||||
|
export function bar(x: number): number {
|
||||||
|
return x * 2;
|
||||||
|
}
|
||||||
|
`.trim();
|
||||||
|
|
||||||
|
const snippets = parseCodeFile(content, 'utils.ts', 'typescript');
|
||||||
|
expect(snippets.length).toBeGreaterThanOrEqual(2);
|
||||||
|
expect(snippets.every((s) => s.type === 'code')).toBe(true);
|
||||||
|
expect(snippets.some((s) => s.content.includes('function foo'))).toBe(true);
|
||||||
|
expect(snippets.some((s) => s.content.includes('function bar'))).toBe(true);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('splits at class boundaries', () => {
|
||||||
|
const content = `
|
||||||
|
export class Greeter {
|
||||||
|
greet(name: string) {
|
||||||
|
return \`Hello, \${name}\`;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
export class Farewell {
|
||||||
|
bye(name: string) {
|
||||||
|
return \`Goodbye, \${name}\`;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
`.trim();
|
||||||
|
|
||||||
|
const snippets = parseCodeFile(content, 'greet.ts', 'typescript');
|
||||||
|
expect(snippets.length).toBeGreaterThanOrEqual(2);
|
||||||
|
expect(snippets.some((s) => s.content.includes('class Greeter'))).toBe(true);
|
||||||
|
expect(snippets.some((s) => s.content.includes('class Farewell'))).toBe(true);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('sets correct metadata on snippets', () => {
|
||||||
|
const content = `
|
||||||
|
export function example(): void {
|
||||||
|
console.log('example function body here');
|
||||||
|
}
|
||||||
|
`.trim();
|
||||||
|
|
||||||
|
const snippets = parseCodeFile(content, 'src/utils.ts', 'typescript');
|
||||||
|
expect(snippets.length).toBeGreaterThanOrEqual(1);
|
||||||
|
const s = snippets[0]!;
|
||||||
|
expect(s.type).toBe('code');
|
||||||
|
expect(s.language).toBe('typescript');
|
||||||
|
expect(s.title).toBe('utils.ts');
|
||||||
|
expect(s.breadcrumb).toBe('src/utils.ts');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('produces at least one snippet from a file with many small declarations', () => {
|
||||||
|
// Each block: a multi-line function — boundary detection fires but chunks are
|
||||||
|
// large enough to survive the MIN_CONTENT_LENGTH filter.
|
||||||
|
const blocks = Array.from(
|
||||||
|
{ length: 10 },
|
||||||
|
(_, i) => `export function helper${i}(x: number): number {\n return x + ${i};\n}`
|
||||||
|
);
|
||||||
|
const content = blocks.join('\n\n');
|
||||||
|
|
||||||
|
const snippets = parseCodeFile(content, 'generated.ts', 'typescript');
|
||||||
|
expect(snippets.length).toBeGreaterThanOrEqual(1);
|
||||||
|
expect(snippets.every((s) => s.type === 'code')).toBe(true);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Python
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
describe('parseCodeFile — Python', () => {
|
||||||
|
it('splits at def and class boundaries', () => {
|
||||||
|
const content = `
|
||||||
|
def greet(name):
|
||||||
|
return f"Hello, {name}"
|
||||||
|
|
||||||
|
class MyClass:
|
||||||
|
def __init__(self):
|
||||||
|
self.value = 0
|
||||||
|
|
||||||
|
def increment(self):
|
||||||
|
self.value += 1
|
||||||
|
|
||||||
|
async def fetch_data(url):
|
||||||
|
return await http.get(url)
|
||||||
|
`.trim();
|
||||||
|
|
||||||
|
const snippets = parseCodeFile(content, 'app.py', 'python');
|
||||||
|
expect(snippets.some((s) => s.content.includes('def greet'))).toBe(true);
|
||||||
|
expect(snippets.some((s) => s.content.includes('class MyClass'))).toBe(true);
|
||||||
|
expect(snippets.some((s) => s.content.includes('async def fetch_data'))).toBe(true);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Go
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
describe('parseCodeFile — Go', () => {
|
||||||
|
it('splits at func boundaries', () => {
|
||||||
|
const content = `
|
||||||
|
package main
|
||||||
|
|
||||||
|
import "fmt"
|
||||||
|
|
||||||
|
func greet(name string) string {
|
||||||
|
return fmt.Sprintf("Hello, %s", name)
|
||||||
|
}
|
||||||
|
|
||||||
|
func main() {
|
||||||
|
fmt.Println(greet("world"))
|
||||||
|
}
|
||||||
|
`.trim();
|
||||||
|
|
||||||
|
const snippets = parseCodeFile(content, 'main.go', 'go');
|
||||||
|
expect(snippets.some((s) => s.content.includes('func greet'))).toBe(true);
|
||||||
|
expect(snippets.some((s) => s.content.includes('func main'))).toBe(true);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Rust
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
describe('parseCodeFile — Rust', () => {
|
||||||
|
it('splits at fn and struct boundaries', () => {
|
||||||
|
const content = `
|
||||||
|
pub struct Config {
|
||||||
|
pub name: String,
|
||||||
|
pub value: u32,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn create_config(name: &str, value: u32) -> Config {
|
||||||
|
Config { name: name.to_string(), value }
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Config {
|
||||||
|
pub fn new() -> Self {
|
||||||
|
Config { name: String::new(), value: 0 }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
`.trim();
|
||||||
|
|
||||||
|
const snippets = parseCodeFile(content, 'config.rs', 'rust');
|
||||||
|
expect(snippets.some((s) => s.content.includes('pub struct Config'))).toBe(true);
|
||||||
|
expect(snippets.some((s) => s.content.includes('pub fn create_config'))).toBe(true);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Ruby
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
describe('parseCodeFile — Ruby', () => {
|
||||||
|
it('splits at def and class boundaries', () => {
|
||||||
|
const content = `
|
||||||
|
class Greeter
|
||||||
|
def initialize(name)
|
||||||
|
@name = name
|
||||||
|
end
|
||||||
|
|
||||||
|
def greet
|
||||||
|
"Hello, #{@name}!"
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
def standalone_helper
|
||||||
|
puts "helper"
|
||||||
|
end
|
||||||
|
`.trim();
|
||||||
|
|
||||||
|
const snippets = parseCodeFile(content, 'greeter.rb', 'ruby');
|
||||||
|
expect(snippets.some((s) => s.content.includes('class Greeter'))).toBe(true);
|
||||||
|
expect(snippets.some((s) => s.content.includes('def standalone_helper'))).toBe(true);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Config / data files
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
describe('parseCodeFile — JSON', () => {
|
||||||
|
it('produces at least one code snippet from a JSON object', () => {
|
||||||
|
const content = JSON.stringify(
|
||||||
|
{
|
||||||
|
name: 'my-package',
|
||||||
|
version: '1.0.0',
|
||||||
|
dependencies: { lodash: '^4.17.21' }
|
||||||
|
},
|
||||||
|
null,
|
||||||
|
2
|
||||||
|
);
|
||||||
|
|
||||||
|
const snippets = parseCodeFile(content, 'package.json', 'json');
|
||||||
|
expect(snippets.length).toBeGreaterThanOrEqual(1);
|
||||||
|
expect(snippets.every((s) => s.type === 'code')).toBe(true);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('parseCodeFile — YAML', () => {
|
||||||
|
it('splits a YAML file at top-level keys', () => {
|
||||||
|
const content = `
|
||||||
|
name: my-project
|
||||||
|
version: 1.0.0
|
||||||
|
scripts:
|
||||||
|
build: tsc
|
||||||
|
test: vitest
|
||||||
|
dependencies:
|
||||||
|
lodash: ^4.17.21
|
||||||
|
`.trim();
|
||||||
|
|
||||||
|
const snippets = parseCodeFile(content, 'config.yaml', 'yaml');
|
||||||
|
expect(snippets.length).toBeGreaterThanOrEqual(1);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// HTML-like files
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
describe('parseCodeFile — HTML', () => {
|
||||||
|
it('extracts script block and text content', () => {
|
||||||
|
const content = `
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head><title>Test Page</title></head>
|
||||||
|
<body>
|
||||||
|
<p>This is the page body content with enough text for an info snippet.</p>
|
||||||
|
<script>
|
||||||
|
function init() {
|
||||||
|
console.log('page loaded and ready for interaction');
|
||||||
|
}
|
||||||
|
</script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
`.trim();
|
||||||
|
|
||||||
|
const snippets = parseCodeFile(content, 'index.html', 'html');
|
||||||
|
expect(snippets.some((s) => s.type === 'code')).toBe(true);
|
||||||
|
expect(snippets.some((s) => s.type === 'info')).toBe(true);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Plain text
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
describe('parseCodeFile — plain text', () => {
|
||||||
|
it('splits on paragraph boundaries', () => {
|
||||||
|
const content = `
|
||||||
|
This is the first paragraph with enough content to pass the minimum length check.
|
||||||
|
|
||||||
|
This is the second paragraph that also has enough content to be included here.
|
||||||
|
`.trim();
|
||||||
|
|
||||||
|
const snippets = parseCodeFile(content, 'notes.txt', 'text');
|
||||||
|
expect(snippets.length).toBeGreaterThanOrEqual(2);
|
||||||
|
expect(snippets.every((s) => s.type === 'info')).toBe(true);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('skips paragraphs shorter than 20 characters', () => {
|
||||||
|
const content = 'Short.\n\nThis is a much longer paragraph that definitely passes the minimum length filter.';
|
||||||
|
const snippets = parseCodeFile(content, 'notes.txt', 'text');
|
||||||
|
expect(snippets.length).toBe(1);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Unknown language fallback
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
describe('parseCodeFile — unknown language', () => {
|
||||||
|
it('falls back to sliding window for unrecognised languages', () => {
|
||||||
|
const lines = Array.from({ length: 50 }, (_, i) => `line ${i}: some code content here`);
|
||||||
|
const content = lines.join('\n');
|
||||||
|
|
||||||
|
const snippets = parseCodeFile(content, 'script.lua', 'lua');
|
||||||
|
expect(snippets.length).toBeGreaterThanOrEqual(1);
|
||||||
|
expect(snippets.every((s) => s.type === 'code')).toBe(true);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Min content filter
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
describe('parseCodeFile — minimum content filter', () => {
|
||||||
|
it('skips segments shorter than 20 characters', () => {
|
||||||
|
const content = `
|
||||||
|
export function realFunction(): string {
|
||||||
|
// A function with enough content to be included in the output snippets.
|
||||||
|
return 'result value from the function that does the operation here';
|
||||||
|
}
|
||||||
|
`.trim();
|
||||||
|
|
||||||
|
const snippets = parseCodeFile(content, 'test.ts', 'typescript');
|
||||||
|
expect(snippets.every((s) => s.content.length >= 20)).toBe(true);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Token count cap
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
describe('parseCodeFile — token count', () => {
|
||||||
|
it('all snippets have tokenCount within MAX_TOKENS', () => {
|
||||||
|
const lines = Array.from({ length: 300 }, (_, i) => `// comment line number ${i} here\nconst x${i} = ${i};`);
|
||||||
|
const content = lines.join('\n');
|
||||||
|
|
||||||
|
const snippets = parseCodeFile(content, 'large.ts', 'typescript');
|
||||||
|
for (const s of snippets) {
|
||||||
|
expect(estimateTokens(s.content)).toBeLessThanOrEqual(MAX_TOKENS + 50); // slight tolerance for boundary chunks
|
||||||
|
}
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// parseFile integration
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
describe('parseFile — integration', () => {
|
||||||
|
const opts = { repositoryId: 'repo-1', documentId: 'doc-1', versionId: 'v1' };
|
||||||
|
|
||||||
|
it('returns NewSnippet records with all required fields for a .ts file', () => {
|
||||||
|
const file = makeFile(
|
||||||
|
'src/utils.ts',
|
||||||
|
`export function add(a: number, b: number): number {\n return a + b;\n}\n`
|
||||||
|
);
|
||||||
|
|
||||||
|
const snippets = parseFile(file, opts);
|
||||||
|
expect(snippets.length).toBeGreaterThanOrEqual(1);
|
||||||
|
|
||||||
|
for (const s of snippets) {
|
||||||
|
expect(s.id).toBeTruthy();
|
||||||
|
expect(s.repositoryId).toBe('repo-1');
|
||||||
|
expect(s.documentId).toBe('doc-1');
|
||||||
|
expect(s.versionId).toBe('v1');
|
||||||
|
expect(s.createdAt).toBeInstanceOf(Date);
|
||||||
|
expect(s.content).toBeTruthy();
|
||||||
|
expect(s.type).toMatch(/^(code|info)$/);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
it('returns NewSnippet records for a .md file', () => {
|
||||||
|
const file = makeFile(
|
||||||
|
'README.md',
|
||||||
|
`# Hello\n\nThis is a long enough paragraph to pass the minimum content length filter.\n`,
|
||||||
|
'markdown'
|
||||||
|
);
|
||||||
|
|
||||||
|
const snippets = parseFile(file, opts);
|
||||||
|
expect(snippets.length).toBeGreaterThanOrEqual(1);
|
||||||
|
expect(snippets[0]?.type).toBe('info');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('uses null for versionId when not provided', () => {
|
||||||
|
const file = makeFile('src/index.ts', `export function noop(): void {}\n`);
|
||||||
|
const snippets = parseFile(file, { repositoryId: 'r', documentId: 'd' });
|
||||||
|
|
||||||
|
// noop is too short; file may return 0 snippets — just verify no error thrown
|
||||||
|
expect(Array.isArray(snippets)).toBe(true);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// BOUNDARY_PATTERNS export
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
describe('BOUNDARY_PATTERNS', () => {
|
||||||
|
it('contains entries for core languages', () => {
|
||||||
|
expect(BOUNDARY_PATTERNS['typescript']).toBeInstanceOf(RegExp);
|
||||||
|
expect(BOUNDARY_PATTERNS['python']).toBeInstanceOf(RegExp);
|
||||||
|
expect(BOUNDARY_PATTERNS['go']).toBeInstanceOf(RegExp);
|
||||||
|
expect(BOUNDARY_PATTERNS['rust']).toBeInstanceOf(RegExp);
|
||||||
|
expect(BOUNDARY_PATTERNS['ruby']).toBeInstanceOf(RegExp);
|
||||||
|
});
|
||||||
|
});
|
||||||
302
src/lib/server/parser/code.parser.ts
Normal file
302
src/lib/server/parser/code.parser.ts
Normal file
@@ -0,0 +1,302 @@
|
|||||||
|
/**
|
||||||
|
* Code file parser for TRUEREF-0005.
|
||||||
|
*
|
||||||
|
* Splits source-code files into function/class-level chunks using
|
||||||
|
* language-specific regex boundary detection. Falls back to a line-count
|
||||||
|
* sliding window for unrecognised languages.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { basename } from 'node:path';
|
||||||
|
import type { NewSnippet } from '$lib/server/db/schema.js';
|
||||||
|
import {
|
||||||
|
estimateTokens,
|
||||||
|
chunkLines,
|
||||||
|
chunkText,
|
||||||
|
MAX_TOKENS,
|
||||||
|
OVERLAP_TOKENS,
|
||||||
|
MIN_CONTENT_LENGTH
|
||||||
|
} from './chunker.js';
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Boundary patterns per language
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Each pattern must match the START of a top-level declaration line.
|
||||||
|
* The regex is tested line-by-line (multiline flag not needed).
|
||||||
|
*/
|
||||||
|
export const BOUNDARY_PATTERNS: Record<string, RegExp> = {
|
||||||
|
typescript: /^(export\s+)?(declare\s+)?(async\s+)?(function|class|interface|type|enum|const|let|var)\s+\w+/,
|
||||||
|
javascript: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,
|
||||||
|
python: /^(async\s+)?(def|class)\s+\w+/,
|
||||||
|
go: /^(func|type|var|const)\s+\w+/,
|
||||||
|
rust: /^(pub(\s*\(crate\))?\s+)?(async\s+)?(fn|impl|struct|enum|trait|type|const|static)\s+\w+/,
|
||||||
|
java: /^(\s*(public|private|protected|static|final|abstract|synchronized)\s+)+[\w<>\[\]]+\s+\w+\s*[({]/,
|
||||||
|
csharp: /^(\s*(public|private|protected|internal|static|override|virtual|abstract|sealed)\s+)+[\w<>\[\]]+\s+\w+\s*[({]/,
|
||||||
|
kotlin: /^(\s*(public|private|protected|internal|override|suspend|inline|open|abstract|sealed)\s+)*(fun|class|object|interface|data class|sealed class|enum class)\s+\w+/,
|
||||||
|
swift: /^(\s*(public|private|internal|fileprivate|open|override|static|final|class)\s+)*(func|class|struct|enum|protocol|extension)\s+\w+/,
|
||||||
|
ruby: /^(def|class|module)\s+\w+/
|
||||||
|
};
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Internal types
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
type RawSnippet = Omit<NewSnippet, 'id' | 'repositoryId' | 'documentId' | 'versionId' | 'createdAt'>;
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Helpers
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Split `content` at lines that match `pattern`, returning the segments
|
||||||
|
* between boundaries (each segment includes its opening boundary line).
|
||||||
|
*/
|
||||||
|
function splitAtBoundaries(content: string, pattern: RegExp): string[] {
|
||||||
|
const lines = content.split('\n');
|
||||||
|
const segments: string[] = [];
|
||||||
|
let current: string[] = [];
|
||||||
|
|
||||||
|
for (const line of lines) {
|
||||||
|
if (pattern.test(line) && current.length > 0) {
|
||||||
|
// Emit what we have, start a new segment from this boundary line
|
||||||
|
segments.push(current.join('\n'));
|
||||||
|
current = [line];
|
||||||
|
} else {
|
||||||
|
current.push(line);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (current.length > 0) {
|
||||||
|
segments.push(current.join('\n'));
|
||||||
|
}
|
||||||
|
|
||||||
|
return segments;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Sliding-window fallback for code
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
function slidingWindowChunks(content: string, filePath: string, language: string): RawSnippet[] {
|
||||||
|
const lines = content.split('\n');
|
||||||
|
const windowedChunks = chunkLines(lines, 200, 20);
|
||||||
|
return windowedChunks
|
||||||
|
.filter((chunk) => chunk.trim().length >= MIN_CONTENT_LENGTH)
|
||||||
|
.map((chunk) => ({
|
||||||
|
type: 'code' as const,
|
||||||
|
title: basename(filePath),
|
||||||
|
content: chunk,
|
||||||
|
language,
|
||||||
|
breadcrumb: filePath,
|
||||||
|
tokenCount: estimateTokens(chunk)
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Config / data file parser (JSON, YAML, TOML)
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Chunk config/data files by splitting on top-level keys.
|
||||||
|
*
|
||||||
|
* Strategy: find lines that look like top-level keys (zero indentation,
|
||||||
|
* followed by colon/equals/brace) and treat each as a boundary.
|
||||||
|
*/
|
||||||
|
function parseConfigFile(content: string, filePath: string, language: string): RawSnippet[] {
|
||||||
|
const topLevelKey = /^[\w"'\-]+\s*[:=\[{]/;
|
||||||
|
const lines = content.split('\n');
|
||||||
|
const segments: string[] = [];
|
||||||
|
let current: string[] = [];
|
||||||
|
|
||||||
|
for (const line of lines) {
|
||||||
|
if (topLevelKey.test(line) && current.length > 0) {
|
||||||
|
segments.push(current.join('\n'));
|
||||||
|
current = [line];
|
||||||
|
} else {
|
||||||
|
current.push(line);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (current.length > 0) segments.push(current.join('\n'));
|
||||||
|
|
||||||
|
// If we got only one segment (no structure detected), fall back to sliding window
|
||||||
|
if (segments.length <= 1) {
|
||||||
|
return slidingWindowChunks(content, filePath, language);
|
||||||
|
}
|
||||||
|
|
||||||
|
return segments
|
||||||
|
.filter((seg) => seg.trim().length >= MIN_CONTENT_LENGTH)
|
||||||
|
.flatMap((seg) => {
|
||||||
|
if (estimateTokens(seg) <= MAX_TOKENS) {
|
||||||
|
return [
|
||||||
|
{
|
||||||
|
type: 'code' as const,
|
||||||
|
title: basename(filePath),
|
||||||
|
content: seg.trim(),
|
||||||
|
language,
|
||||||
|
breadcrumb: filePath,
|
||||||
|
tokenCount: estimateTokens(seg.trim())
|
||||||
|
}
|
||||||
|
];
|
||||||
|
}
|
||||||
|
return slidingWindowChunks(seg, filePath, language);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// HTML / Svelte / Vue parser
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extract script blocks and text content from HTML-like files.
|
||||||
|
*/
|
||||||
|
function parseHtmlLikeFile(content: string, filePath: string, language: string): RawSnippet[] {
|
||||||
|
const snippets: RawSnippet[] = [];
|
||||||
|
const title = basename(filePath);
|
||||||
|
|
||||||
|
// Extract <script> blocks (including <script lang="ts">)
|
||||||
|
const scriptPattern = /<script(?:\s[^>]*)?>[\s\S]*?<\/script>/gi;
|
||||||
|
let match: RegExpExecArray | null;
|
||||||
|
const scriptBlocks: string[] = [];
|
||||||
|
|
||||||
|
while ((match = scriptPattern.exec(content)) !== null) {
|
||||||
|
// Strip the outer tags, keep just the code
|
||||||
|
const inner = match[0].replace(/^<script[^>]*>/, '').replace(/<\/script>$/, '').trim();
|
||||||
|
if (inner.length >= MIN_CONTENT_LENGTH) {
|
||||||
|
scriptBlocks.push(inner);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (const block of scriptBlocks) {
|
||||||
|
if (estimateTokens(block) <= MAX_TOKENS) {
|
||||||
|
snippets.push({
|
||||||
|
type: 'code',
|
||||||
|
title,
|
||||||
|
content: block,
|
||||||
|
language,
|
||||||
|
breadcrumb: filePath,
|
||||||
|
tokenCount: estimateTokens(block)
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
snippets.push(...slidingWindowChunks(block, filePath, language));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Strip tags and extract text content for info snippets
|
||||||
|
const text = content
|
||||||
|
.replace(/<script(?:\s[^>]*)?>[\s\S]*?<\/script>/gi, '')
|
||||||
|
.replace(/<style(?:\s[^>]*)?>[\s\S]*?<\/style>/gi, '')
|
||||||
|
.replace(/<[^>]+>/g, ' ')
|
||||||
|
.replace(/\s{2,}/g, ' ')
|
||||||
|
.trim();
|
||||||
|
|
||||||
|
if (text.length >= MIN_CONTENT_LENGTH) {
|
||||||
|
const chunks = chunkText(text, MAX_TOKENS, OVERLAP_TOKENS);
|
||||||
|
for (const chunk of chunks) {
|
||||||
|
snippets.push({
|
||||||
|
type: 'info',
|
||||||
|
title,
|
||||||
|
content: chunk,
|
||||||
|
language: null,
|
||||||
|
breadcrumb: filePath,
|
||||||
|
tokenCount: estimateTokens(chunk)
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return snippets;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Plain-text / RST parser
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
function parsePlainText(content: string, filePath: string): RawSnippet[] {
|
||||||
|
// Split on blank lines (paragraph boundaries)
|
||||||
|
const paragraphs = content.split(/\n{2,}/).filter((p) => p.trim().length >= MIN_CONTENT_LENGTH);
|
||||||
|
|
||||||
|
if (paragraphs.length === 0) return [];
|
||||||
|
|
||||||
|
const title = basename(filePath);
|
||||||
|
const snippets: RawSnippet[] = [];
|
||||||
|
|
||||||
|
for (const para of paragraphs) {
|
||||||
|
const chunks = chunkText(para.trim(), MAX_TOKENS, OVERLAP_TOKENS);
|
||||||
|
for (const chunk of chunks) {
|
||||||
|
snippets.push({
|
||||||
|
type: 'info',
|
||||||
|
title,
|
||||||
|
content: chunk,
|
||||||
|
language: null,
|
||||||
|
breadcrumb: filePath,
|
||||||
|
tokenCount: estimateTokens(chunk)
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return snippets;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Public parser
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Parse a non-Markdown code or data file into raw snippets.
|
||||||
|
*/
|
||||||
|
export function parseCodeFile(content: string, filePath: string, language: string): RawSnippet[] {
|
||||||
|
// Plain text / RST
|
||||||
|
if (language === 'text') {
|
||||||
|
return parsePlainText(content, filePath);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Config / data files
|
||||||
|
if (['json', 'yaml', 'toml'].includes(language)) {
|
||||||
|
return parseConfigFile(content, filePath, language);
|
||||||
|
}
|
||||||
|
|
||||||
|
// HTML-like files
|
||||||
|
if (['html', 'svelte', 'vue'].includes(language)) {
|
||||||
|
return parseHtmlLikeFile(content, filePath, language);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Normalise csharp alias
|
||||||
|
const normalisedLang = language === 'csharp' ? 'csharp' : language;
|
||||||
|
|
||||||
|
const pattern = BOUNDARY_PATTERNS[normalisedLang];
|
||||||
|
const title = basename(filePath);
|
||||||
|
const breadcrumb = filePath;
|
||||||
|
|
||||||
|
if (!pattern) {
|
||||||
|
// Fallback: line-count sliding window
|
||||||
|
return slidingWindowChunks(content, filePath, language);
|
||||||
|
}
|
||||||
|
|
||||||
|
const segments = splitAtBoundaries(content, pattern);
|
||||||
|
|
||||||
|
// If boundary detection produced only one segment covering the whole file,
|
||||||
|
// it means no boundaries matched — fall back to sliding window.
|
||||||
|
if (segments.length === 1 && !pattern.test(content.split('\n')[0])) {
|
||||||
|
return slidingWindowChunks(content, filePath, language);
|
||||||
|
}
|
||||||
|
|
||||||
|
return segments
|
||||||
|
.filter((seg) => seg.trim().length >= MIN_CONTENT_LENGTH)
|
||||||
|
.flatMap((seg) => {
|
||||||
|
const trimmed = seg.trim();
|
||||||
|
if (estimateTokens(trimmed) <= MAX_TOKENS) {
|
||||||
|
return [
|
||||||
|
{
|
||||||
|
type: 'code' as const,
|
||||||
|
title,
|
||||||
|
content: trimmed,
|
||||||
|
language,
|
||||||
|
breadcrumb,
|
||||||
|
tokenCount: estimateTokens(trimmed)
|
||||||
|
}
|
||||||
|
];
|
||||||
|
}
|
||||||
|
// Chunk oversized segments with sliding window
|
||||||
|
return slidingWindowChunks(trimmed, filePath, language);
|
||||||
|
});
|
||||||
|
}
|
||||||
53
src/lib/server/parser/index.ts
Normal file
53
src/lib/server/parser/index.ts
Normal file
@@ -0,0 +1,53 @@
|
|||||||
|
/**
|
||||||
|
* Document parser entry point for TRUEREF-0005.
|
||||||
|
*
|
||||||
|
* Exposes `parseFile` which transforms a `CrawledFile` into an array of
|
||||||
|
* `NewSnippet` records ready for database insertion.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import type { CrawledFile } from '$lib/server/crawler/types.js';
|
||||||
|
import type { NewSnippet } from '$lib/server/db/schema.js';
|
||||||
|
import { detectLanguage } from './language.js';
|
||||||
|
import { parseMarkdown } from './markdown.parser.js';
|
||||||
|
import { parseCodeFile } from './code.parser.js';
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Public API
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/** Database identifiers attached to every snippet produced by `parseFile`. */
export interface ParseOptions {
	/** Repository the parsed file belongs to. */
	repositoryId: string;
	/** Document record the snippets are derived from. */
	documentId: string;
	/** Optional version; stored as `null` on the snippet when omitted. */
	versionId?: string;
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Parse a crawled file into an array of `NewSnippet` records.
|
||||||
|
*
|
||||||
|
* The language is detected from the file extension. Markdown/MDX files are
|
||||||
|
* split by heading hierarchy; all other files use language-specific boundary
|
||||||
|
* detection or a sliding-window fallback.
|
||||||
|
*/
|
||||||
|
export function parseFile(file: CrawledFile, options: ParseOptions): NewSnippet[] {
|
||||||
|
const language = detectLanguage(file.path);
|
||||||
|
|
||||||
|
const rawSnippets =
|
||||||
|
language === 'markdown'
|
||||||
|
? parseMarkdown(file.content, file.path)
|
||||||
|
: parseCodeFile(file.content, file.path, language);
|
||||||
|
|
||||||
|
return rawSnippets.map((s) => ({
|
||||||
|
...s,
|
||||||
|
id: crypto.randomUUID(),
|
||||||
|
repositoryId: options.repositoryId,
|
||||||
|
documentId: options.documentId,
|
||||||
|
versionId: options.versionId ?? null,
|
||||||
|
createdAt: new Date()
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Re-export helpers for consumers that need them individually
|
||||||
|
export { detectLanguage } from './language.js';
|
||||||
|
export { estimateTokens, chunkText, chunkLines, MAX_TOKENS, OVERLAP_TOKENS, MIN_CONTENT_LENGTH } from './chunker.js';
|
||||||
|
export { parseMarkdown } from './markdown.parser.js';
|
||||||
|
export { parseCodeFile, BOUNDARY_PATTERNS } from './code.parser.js';
|
||||||
56
src/lib/server/parser/language.ts
Normal file
56
src/lib/server/parser/language.ts
Normal file
@@ -0,0 +1,56 @@
|
|||||||
|
/**
|
||||||
|
* Language detection for the document parser (TRUEREF-0005).
|
||||||
|
*
|
||||||
|
* Maps file extensions to canonical language names used throughout the parser.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { extname } from 'node:path';
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Language map
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
export const LANGUAGE_MAP: Record<string, string> = {
|
||||||
|
'.ts': 'typescript',
|
||||||
|
'.tsx': 'typescript',
|
||||||
|
'.js': 'javascript',
|
||||||
|
'.jsx': 'javascript',
|
||||||
|
'.py': 'python',
|
||||||
|
'.rb': 'ruby',
|
||||||
|
'.go': 'go',
|
||||||
|
'.rs': 'rust',
|
||||||
|
'.java': 'java',
|
||||||
|
'.cs': 'csharp',
|
||||||
|
'.cpp': 'cpp',
|
||||||
|
'.c': 'c',
|
||||||
|
'.h': 'c',
|
||||||
|
'.swift': 'swift',
|
||||||
|
'.kt': 'kotlin',
|
||||||
|
'.php': 'php',
|
||||||
|
'.scala': 'scala',
|
||||||
|
'.sh': 'bash',
|
||||||
|
'.bash': 'bash',
|
||||||
|
'.zsh': 'bash',
|
||||||
|
'.md': 'markdown',
|
||||||
|
'.mdx': 'markdown',
|
||||||
|
'.json': 'json',
|
||||||
|
'.yaml': 'yaml',
|
||||||
|
'.yml': 'yaml',
|
||||||
|
'.toml': 'toml',
|
||||||
|
'.html': 'html',
|
||||||
|
'.css': 'css',
|
||||||
|
'.svelte': 'svelte',
|
||||||
|
'.vue': 'vue',
|
||||||
|
'.sql': 'sql',
|
||||||
|
'.txt': 'text',
|
||||||
|
'.rst': 'text'
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Detect the canonical language name from a file path.
|
||||||
|
* Returns 'text' when the extension is unknown.
|
||||||
|
*/
|
||||||
|
export function detectLanguage(filePath: string): string {
|
||||||
|
const ext = extname(filePath).toLowerCase();
|
||||||
|
return LANGUAGE_MAP[ext] ?? 'text';
|
||||||
|
}
|
||||||
272
src/lib/server/parser/markdown.parser.test.ts
Normal file
272
src/lib/server/parser/markdown.parser.test.ts
Normal file
@@ -0,0 +1,272 @@
|
|||||||
|
/**
|
||||||
|
* Unit tests for the Markdown parser (TRUEREF-0005).
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { describe, it, expect } from 'vitest';
|
||||||
|
import { parseMarkdown } from './markdown.parser.js';
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Helpers
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/** Build a fenced code block string without nesting backticks in template literals. */
|
||||||
|
function fence(lang: string, code: string): string {
|
||||||
|
return '```' + lang + '\n' + code + '\n' + '```';
|
||||||
|
}
|
||||||
|
|
||||||
|
function tildeFence(lang: string, code: string): string {
|
||||||
|
return '~~~' + lang + '\n' + code + '\n' + '~~~';
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Basic section splitting
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// Heading-based section splitting: titles, breadcrumb construction, and the
// MIN_CONTENT_LENGTH (20-char) filter.
describe('parseMarkdown — section splitting', () => {
	it('produces no snippets for empty content', () => {
		expect(parseMarkdown('', 'README.md')).toHaveLength(0);
	});

	it('skips content shorter than 20 characters', () => {
		// 'Short.' is below the 20-character minimum, so the section is dropped.
		const result = parseMarkdown('# Title\n\nShort.\n', 'README.md');
		expect(result).toHaveLength(0);
	});

	it('parses a single heading section into an info snippet', () => {
		const source = [
			'# Introduction',
			'',
			'This is a paragraph with enough content to pass the minimum length check.'
		].join('\n');

		const snippets = parseMarkdown(source, 'README.md');
		expect(snippets.length).toBeGreaterThanOrEqual(1);
		const info = snippets.find((s) => s.type === 'info');
		expect(info).toBeDefined();
		expect(info?.title).toBe('Introduction');
		// A single-level heading yields a one-element breadcrumb.
		expect(info?.breadcrumb).toBe('Introduction');
	});

	it('builds correct breadcrumb for nested headings', () => {
		const source = [
			'# Getting Started',
			'',
			'Intro text that is long enough to be included here.',
			'',
			'## Installation',
			'',
			'Install by running the command shown below in your terminal.'
		].join('\n');

		const snippets = parseMarkdown(source, 'README.md');
		const installation = snippets.find((s) => s.title === 'Installation');
		expect(installation).toBeDefined();
		// Breadcrumb joins the heading stack with ' > '.
		expect(installation?.breadcrumb).toBe('Getting Started > Installation');
	});

	it('resets heading stack correctly when headings ascend', () => {
		// H1 → H2 → H1 again: the second H1 must not inherit the earlier H2.
		const source = [
			'# H1',
			'',
			'Some introductory prose that is longer than twenty characters.',
			'',
			'## H2',
			'',
			'More content here, also long enough to pass the threshold check.',
			'',
			'# Second H1',
			'',
			'Content for second top-level heading, long enough to be included.'
		].join('\n');

		const snippets = parseMarkdown(source, 'doc.md');
		const secondH1 = snippets.find((s) => s.title === 'Second H1');
		expect(secondH1).toBeDefined();
		expect(secondH1?.breadcrumb).toBe('Second H1');
	});

	it('falls back to filename when no heading is present', () => {
		const source = 'This is some standalone prose content that is long enough to pass.';
		const snippets = parseMarkdown(source, 'notes.md');
		expect(snippets.length).toBeGreaterThanOrEqual(1);
		// With an empty heading stack the basename becomes the title.
		expect(snippets[0]?.title).toBe('notes.md');
	});
});
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Fenced code block extraction
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// Fenced-code-block extraction: backtick and tilde fences, language tags,
// the 20-char minimum for code blocks, and breadcrumb propagation.
describe('parseMarkdown — code block extraction', () => {
	it('extracts a fenced code block as a code snippet', () => {
		const codeBlock = fence('typescript', 'function hello(name: string): string {\n return `Hello, ${name}!`;\n}');
		const source = [
			'# Example',
			'',
			'Some prose here that is long enough to pass the minimum check.',
			'',
			codeBlock
		].join('\n');

		const snippets = parseMarkdown(source, 'README.md');
		const code = snippets.find((s) => s.type === 'code');
		expect(code).toBeDefined();
		expect(code?.language).toBe('typescript');
		expect(code?.content).toContain('function hello');
	});

	it('extracts multiple code blocks from the same section', () => {
		const bashBlock = fence('bash', 'npm install my-library --save-dev');
		const jsBlock = fence('javascript', "const lib = require('my-lib');\nlib.doSomething();");
		const source = [
			'# Usage',
			'',
			'Description of the usage pattern with enough text here.',
			'',
			bashBlock,
			'',
			'More text in between the two code blocks, just enough.',
			'',
			jsBlock
		].join('\n');

		const snippets = parseMarkdown(source, 'README.md');
		const codeSnippets = snippets.filter((s) => s.type === 'code');
		expect(codeSnippets.length).toBe(2);
		// Each block keeps its own fence language tag.
		const langs = codeSnippets.map((s) => s.language);
		expect(langs).toContain('bash');
		expect(langs).toContain('javascript');
	});

	it('skips code blocks shorter than 20 characters', () => {
		// 'x = 1' is under the minimum, so only the prose snippet survives.
		const shortBlock = fence('', 'x = 1');
		const source = [
			'# Example',
			'',
			'Some prose here that is long enough to pass.',
			'',
			shortBlock
		].join('\n');

		const snippets = parseMarkdown(source, 'README.md');
		expect(snippets.every((s) => s.type === 'info')).toBe(true);
	});

	it('handles tilde-fenced code blocks', () => {
		const pyBlock = tildeFence('python', 'def greet(name):\n return f"Hello, {name}"');
		const source = [
			'# Section',
			'',
			'Long enough prose content for the section to be included here.',
			'',
			pyBlock
		].join('\n');

		const snippets = parseMarkdown(source, 'README.md');
		const code = snippets.find((s) => s.type === 'code');
		expect(code).toBeDefined();
		expect(code?.language).toBe('python');
	});

	it('preserves breadcrumb on code snippets', () => {
		const codeBlock = fence(
			'typescript',
			'function connect(url: string): Promise<void> {\n return Promise.resolve();\n}'
		);
		const source = [
			'# API Reference',
			'',
			'## Methods',
			'',
			'Overview of the methods available in this library.',
			'',
			codeBlock
		].join('\n');

		const snippets = parseMarkdown(source, 'API.md');
		const code = snippets.find((s) => s.type === 'code');
		expect(code).toBeDefined();
		// Code snippets carry the same heading breadcrumb as prose snippets.
		expect(code?.breadcrumb).toBe('API Reference > Methods');
	});
});
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Token counting
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// Every emitted snippet must carry a positive tokenCount estimate.
describe('parseMarkdown — token counting', () => {
	it('attaches a non-zero tokenCount to every snippet', () => {
		const source = [
			'# Overview',
			'',
			'This section contains enough text to produce an info snippet for the test.'
		].join('\n');

		const snippets = parseMarkdown(source, 'README.md');
		for (const s of snippets) {
			expect(s.tokenCount).toBeGreaterThan(0);
		}
	});
});
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Large content chunking
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// Prose sections that exceed the token window must be split into several
// overlapping chunks rather than emitted as one oversized snippet.
describe('parseMarkdown — large content chunking', () => {
	it('splits a very large prose section into multiple snippets', () => {
		// Generate ~4 000 characters of prose (well above the ~1 800-char window)
		const longParagraph = 'word '.repeat(800).trim();
		const source = `# Big Section\n\n${longParagraph}`;

		const snippets = parseMarkdown(source, 'big.md');
		const infoSnippets = snippets.filter((s) => s.type === 'info');
		expect(infoSnippets.length).toBeGreaterThan(1);
	});
});
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Real-world sample
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// End-to-end smoke test over a realistic README: mixed prose/code output
// and a three-level breadcrumb (H1 > H2 > H3).
describe('parseMarkdown — real-world sample', () => {
	it('correctly parses a realistic README excerpt', () => {
		const bashInstall = fence('bash', 'npm install my-library')
		const tsUsage = fence('typescript', "import { doTheThing } from 'my-library';\n\ndoTheThing({ verbose: true });");

		const source = [
			'# My Library',
			'',
			'A handy library for doing things quickly and efficiently.',
			'',
			'## Installation',
			'',
			'Install via npm using the following command in your project directory:',
			'',
			bashInstall,
			'',
			'## Usage',
			'',
			'Import the library and call the main function as shown below:',
			'',
			tsUsage,
			'',
			'## API',
			'',
			'### doTheThing(options)',
			'',
			'Performs the main operation. Options are passed as a plain object.'
		].join('\n');

		const snippets = parseMarkdown(source, 'README.md');

		// Should have both info and code snippets
		expect(snippets.some((s) => s.type === 'info')).toBe(true);
		expect(snippets.some((s) => s.type === 'code')).toBe(true);

		// Breadcrumb depth check
		const apiSnippet = snippets.find((s) => s.title === 'doTheThing(options)');
		expect(apiSnippet).toBeDefined();
		expect(apiSnippet?.breadcrumb).toBe('My Library > API > doTheThing(options)');
	});
});
|
||||||
171
src/lib/server/parser/markdown.parser.ts
Normal file
171
src/lib/server/parser/markdown.parser.ts
Normal file
@@ -0,0 +1,171 @@
|
|||||||
|
/**
|
||||||
|
* Markdown document parser for TRUEREF-0005.
|
||||||
|
*
|
||||||
|
* Splits Markdown/MDX files into heading-based sections and extracts fenced
|
||||||
|
* code blocks as separate code snippets.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { basename } from 'node:path';
|
||||||
|
import type { NewSnippet } from '$lib/server/db/schema.js';
|
||||||
|
import { estimateTokens, chunkText, MAX_TOKENS, OVERLAP_TOKENS, MIN_CONTENT_LENGTH } from './chunker.js';
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Internal types
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/** A fenced code block extracted from a Markdown section. */
interface CodeBlock {
	/** Language tag from the opening fence, lowercased; may be empty. */
	language: string;
	/** Raw code between the fence lines (fences themselves excluded). */
	code: string;
}
|
||||||
|
|
||||||
|
/** One heading-delimited slice of a Markdown document. */
interface MarkdownSection {
	/** Heading stack at this point, e.g. ["Getting Started", "Installation"] */
	headings: string[];
	/** Prose text content (code blocks stripped out) */
	content: string;
	/** Fenced code blocks found within this section */
	codeBlocks: CodeBlock[];
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Section splitting
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
 * Split the full Markdown source into sections delimited by ATX headings
 * (# … ####). Code blocks inside headings are extracted separately.
 *
 * Single-pass line scanner with two pieces of state: the current heading
 * stack (depth-indexed, H1–H4) and an open-fence tracker. Headings inside a
 * fenced block are ignored; an unclosed fence at EOF is demoted to prose.
 */
function splitIntoSections(source: string): MarkdownSection[] {
	const lines = source.split('\n');
	const sections: MarkdownSection[] = [];

	// Heading stack: index 0 = H1, 1 = H2, … (we track up to H4)
	const headingStack: string[] = [];

	// Accumulator for the current section
	let textLines: string[] = [];
	const codeBlocks: CodeBlock[] = [];

	// Fenced-code-block tracking
	let inCodeBlock = false;
	let codeFence = '';
	let codeLanguage = '';
	let codeLines: string[] = [];

	// Emit the accumulated prose + code blocks under the CURRENT heading
	// stack, then reset the accumulators (the stack itself is untouched).
	function flushSection() {
		sections.push({
			headings: [...headingStack],
			content: textLines.join('\n'),
			codeBlocks: [...codeBlocks]
		});
		textLines = [];
		codeBlocks.length = 0;
	}

	for (const line of lines) {
		// ---- Fenced code block handling ----
		if (!inCodeBlock) {
			// Opening fence: 3+ backticks or tildes, optional language word.
			const fenceMatch = line.match(/^(`{3,}|~{3,})([\w-]*)/);
			if (fenceMatch) {
				inCodeBlock = true;
				codeFence = fenceMatch[1].charAt(0).repeat(fenceMatch[1].length);
				codeLanguage = fenceMatch[2].trim().toLowerCase();
				codeLines = [];
				continue;
			}
		} else {
			// Check for closing fence (must be same char and at least same length)
			const closingFence = new RegExp(`^${codeFence[0]}{${codeFence.length},}\\s*$`);
			if (closingFence.test(line)) {
				inCodeBlock = false;
				const code = codeLines.join('\n');
				// Too-short blocks are dropped rather than emitted.
				if (code.trim().length >= MIN_CONTENT_LENGTH) {
					codeBlocks.push({ language: codeLanguage, code });
				}
				codeLines = [];
				continue;
			}
			codeLines.push(line);
			continue;
		}

		// ---- Heading detection (ATX only, H1–H4) ----
		const headingMatch = line.match(/^(#{1,4})\s+(.*)/);
		if (headingMatch) {
			// Emit whatever has accumulated before this heading
			// (flushed BEFORE the stack is updated, so the accumulated
			// content keeps the headings it was written under).
			flushSection();

			const level = headingMatch[1].length; // 1–4
			const title = headingMatch[2].trim();

			// Trim the stack to the depth above this heading and push the new title
			headingStack.splice(level - 1, headingStack.length - (level - 1), title);
			continue;
		}

		// ---- Ordinary prose line ----
		textLines.push(line);
	}

	// Flush any trailing content (unclosed fence treated as prose)
	if (inCodeBlock) {
		// Treat remaining code lines as prose if the fence was never closed
		textLines.push(...codeLines);
	}
	flushSection();

	return sections;
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Public parser
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/** Snippet fields produced by the parser, before DB identifiers and timestamps are attached. */
type RawSnippet = Omit<NewSnippet, 'id' | 'repositoryId' | 'documentId' | 'versionId' | 'createdAt'>;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Parse a Markdown/MDX file into raw snippets (before IDs and DB fields are
|
||||||
|
* attached).
|
||||||
|
*/
|
||||||
|
export function parseMarkdown(content: string, filePath: string): RawSnippet[] {
|
||||||
|
const sections = splitIntoSections(content);
|
||||||
|
const snippets: RawSnippet[] = [];
|
||||||
|
|
||||||
|
for (const section of sections) {
|
||||||
|
const breadcrumb = section.headings.join(' > ') || undefined;
|
||||||
|
const title = section.headings.at(-1) ?? basename(filePath);
|
||||||
|
|
||||||
|
// ---- Info snippet for prose content ----
|
||||||
|
const prose = section.content.trim();
|
||||||
|
if (prose.length >= MIN_CONTENT_LENGTH) {
|
||||||
|
const chunks = chunkText(prose, MAX_TOKENS, OVERLAP_TOKENS);
|
||||||
|
for (const chunk of chunks) {
|
||||||
|
snippets.push({
|
||||||
|
type: 'info',
|
||||||
|
title,
|
||||||
|
content: chunk,
|
||||||
|
breadcrumb: breadcrumb ?? null,
|
||||||
|
language: null,
|
||||||
|
tokenCount: estimateTokens(chunk)
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Code snippets for each fenced code block ----
|
||||||
|
for (const block of section.codeBlocks) {
|
||||||
|
const code = block.code.trim();
|
||||||
|
if (code.length < MIN_CONTENT_LENGTH) continue;
|
||||||
|
|
||||||
|
snippets.push({
|
||||||
|
type: 'code',
|
||||||
|
title,
|
||||||
|
content: code,
|
||||||
|
language: block.language || null,
|
||||||
|
breadcrumb: breadcrumb ?? null,
|
||||||
|
tokenCount: estimateTokens(code)
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return snippets;
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user