feat(TRUEREF-0005): implement document parser and chunker

- Markdown parser with heading-based section splitting and code block extraction
- Code file parser with regex boundary detection for 10+ languages
- Sliding window chunker with configurable token limits and overlap
- Language detection from file extensions

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Giancarmine Salucci
2026-03-23 09:06:12 +01:00
parent 1c15d6c474
commit f6be3cfd47
7 changed files with 1350 additions and 0 deletions

View File

@@ -0,0 +1,92 @@
/**
* Text chunking utilities for the document parser (TRUEREF-0005).
*
* Provides sliding-window chunking with overlap and token estimation.
*/
// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------
/** Hard upper bound on the estimated token count of a single chunk. */
export const MAX_TOKENS = 512;
/** Estimated tokens shared between neighbouring chunks. */
export const OVERLAP_TOKENS = 50;
/** Minimum segment length, in characters, worth keeping as a snippet. */
export const MIN_CONTENT_LENGTH = 20;
// ---------------------------------------------------------------------------
// Token estimation
// ---------------------------------------------------------------------------
/**
 * Roughly estimate how many tokens `text` occupies.
 *
 * Character-count heuristic: ~3.5 characters per token on average for mixed
 * prose and source code; always rounds up, so non-empty text is >= 1 token.
 */
export function estimateTokens(text: string): number {
  const CHARS_PER_TOKEN = 3.5;
  return Math.ceil(text.length / CHARS_PER_TOKEN);
}
// ---------------------------------------------------------------------------
// Sliding-window chunker
// ---------------------------------------------------------------------------
/**
 * Split `text` into overlapping word-based chunks that stay within the token
 * budget. Returns at least one chunk when the text fits in a single window.
 *
 * @param text          source text; split on runs of whitespace
 * @param maxTokens     token budget per chunk (converted to a word budget)
 * @param overlapTokens tokens repeated between consecutive chunks
 * @returns one string per window; empty array for whitespace-only input
 */
export function chunkText(
  text: string,
  maxTokens: number = MAX_TOKENS,
  overlapTokens: number = OVERLAP_TOKENS
): string[] {
  const words = text.split(/\s+/).filter((w) => w.length > 0);
  if (words.length === 0) return [];
  // ~0.75 words per token
  const maxWords = Math.max(1, Math.floor(maxTokens * 0.75));
  const overlapWords = Math.max(0, Math.floor(overlapTokens * 0.75));
  if (words.length <= maxWords) {
    return [words.join(' ')];
  }
  const chunks: string[] = [];
  let start = 0;
  while (start < words.length) {
    const end = Math.min(start + maxWords, words.length);
    chunks.push(words.slice(start, end).join(' '));
    if (end === words.length) break;
    // Step back by the overlap but always advance past the previous window
    // start. The old guard (`if (start <= 0) start = end`) only protected the
    // first window, so overlapWords >= maxWords looped forever afterwards
    // (start recomputed to the same value on every later iteration).
    const next = end - overlapWords;
    start = next > start ? next : end;
  }
  return chunks;
}
// ---------------------------------------------------------------------------
// Line-count sliding window (for code files without recognised boundaries)
// ---------------------------------------------------------------------------
/**
 * Split `lines` into groups of at most `maxLines`, with `overlapLines` lines
 * repeated between consecutive groups.
 *
 * @returns one string per window (lines re-joined with '\n'); empty array for
 *          empty input.
 */
export function chunkLines(
  lines: string[],
  maxLines: number = 200,
  overlapLines: number = 20
): string[] {
  if (lines.length === 0) return [];
  if (lines.length <= maxLines) return [lines.join('\n')];
  const chunks: string[] = [];
  let start = 0;
  while (start < lines.length) {
    const end = Math.min(start + maxLines, lines.length);
    chunks.push(lines.slice(start, end).join('\n'));
    if (end === lines.length) break;
    // Step back by the overlap but always advance past the previous window
    // start; the old `start <= 0` guard only protected the first window, so
    // overlapLines >= maxLines looped forever on subsequent iterations.
    const next = end - overlapLines;
    start = next > start ? next : end;
  }
  return chunks;
}

View File

@@ -0,0 +1,404 @@
/**
* Unit tests for the code file parser (TRUEREF-0005).
*/
import { describe, it, expect } from 'vitest';
import { parseCodeFile, BOUNDARY_PATTERNS } from './code.parser.js';
import { estimateTokens, MAX_TOKENS } from './chunker.js';
import { parseFile } from './index.js';
import type { CrawledFile } from '$lib/server/crawler/types.js';
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
/** Build a minimal CrawledFile fixture with a fixed sha and derived size. */
function makeFile(path: string, content: string, language = 'typescript'): CrawledFile {
  const size = content.length;
  return { path, content, size, sha: 'abc123', language };
}
// ---------------------------------------------------------------------------
// TypeScript / JavaScript boundary detection
// ---------------------------------------------------------------------------
// TypeScript boundary detection: each exported declaration should open a new
// snippet, and snippet metadata should reflect the source path.
describe('parseCodeFile — TypeScript', () => {
  it('splits at function boundaries', () => {
    const content = `
export function foo(): string {
return 'foo';
}
export function bar(x: number): number {
return x * 2;
}
`.trim();
    const snippets = parseCodeFile(content, 'utils.ts', 'typescript');
    expect(snippets.length).toBeGreaterThanOrEqual(2);
    expect(snippets.every((s) => s.type === 'code')).toBe(true);
    expect(snippets.some((s) => s.content.includes('function foo'))).toBe(true);
    expect(snippets.some((s) => s.content.includes('function bar'))).toBe(true);
  });
  it('splits at class boundaries', () => {
    const content = `
export class Greeter {
greet(name: string) {
return \`Hello, \${name}\`;
}
}
export class Farewell {
bye(name: string) {
return \`Goodbye, \${name}\`;
}
}
`.trim();
    const snippets = parseCodeFile(content, 'greet.ts', 'typescript');
    expect(snippets.length).toBeGreaterThanOrEqual(2);
    expect(snippets.some((s) => s.content.includes('class Greeter'))).toBe(true);
    expect(snippets.some((s) => s.content.includes('class Farewell'))).toBe(true);
  });
  it('sets correct metadata on snippets', () => {
    const content = `
export function example(): void {
console.log('example function body here');
}
`.trim();
    const snippets = parseCodeFile(content, 'src/utils.ts', 'typescript');
    expect(snippets.length).toBeGreaterThanOrEqual(1);
    const s = snippets[0]!;
    expect(s.type).toBe('code');
    expect(s.language).toBe('typescript');
    // title is the basename; breadcrumb keeps the full relative path
    expect(s.title).toBe('utils.ts');
    expect(s.breadcrumb).toBe('src/utils.ts');
  });
  it('produces at least one snippet from a file with many small declarations', () => {
    // Each block: a multi-line function — boundary detection fires but chunks are
    // large enough to survive the MIN_CONTENT_LENGTH filter.
    const blocks = Array.from(
      { length: 10 },
      (_, i) => `export function helper${i}(x: number): number {\n return x + ${i};\n}`
    );
    const content = blocks.join('\n\n');
    const snippets = parseCodeFile(content, 'generated.ts', 'typescript');
    expect(snippets.length).toBeGreaterThanOrEqual(1);
    expect(snippets.every((s) => s.type === 'code')).toBe(true);
  });
});
// ---------------------------------------------------------------------------
// Python
// ---------------------------------------------------------------------------
// Python: `def`/`class` boundaries, including `async def`.
describe('parseCodeFile — Python', () => {
  it('splits at def and class boundaries', () => {
    const content = `
def greet(name):
return f"Hello, {name}"
class MyClass:
def __init__(self):
self.value = 0
def increment(self):
self.value += 1
async def fetch_data(url):
return await http.get(url)
`.trim();
    const snippets = parseCodeFile(content, 'app.py', 'python');
    expect(snippets.some((s) => s.content.includes('def greet'))).toBe(true);
    expect(snippets.some((s) => s.content.includes('class MyClass'))).toBe(true);
    expect(snippets.some((s) => s.content.includes('async def fetch_data'))).toBe(true);
  });
});
// ---------------------------------------------------------------------------
// Go
// ---------------------------------------------------------------------------
// Go: `func`/`type`/`var`/`const` boundaries.
describe('parseCodeFile — Go', () => {
  it('splits at func boundaries', () => {
    const content = `
package main
import "fmt"
func greet(name string) string {
return fmt.Sprintf("Hello, %s", name)
}
func main() {
fmt.Println(greet("world"))
}
`.trim();
    const snippets = parseCodeFile(content, 'main.go', 'go');
    expect(snippets.some((s) => s.content.includes('func greet'))).toBe(true);
    expect(snippets.some((s) => s.content.includes('func main'))).toBe(true);
  });
});
// ---------------------------------------------------------------------------
// Rust
// ---------------------------------------------------------------------------
// Rust: item boundaries including optional `pub` visibility and `impl` blocks.
describe('parseCodeFile — Rust', () => {
  it('splits at fn and struct boundaries', () => {
    const content = `
pub struct Config {
pub name: String,
pub value: u32,
}
pub fn create_config(name: &str, value: u32) -> Config {
Config { name: name.to_string(), value }
}
impl Config {
pub fn new() -> Self {
Config { name: String::new(), value: 0 }
}
}
`.trim();
    const snippets = parseCodeFile(content, 'config.rs', 'rust');
    expect(snippets.some((s) => s.content.includes('pub struct Config'))).toBe(true);
    expect(snippets.some((s) => s.content.includes('pub fn create_config'))).toBe(true);
  });
});
// ---------------------------------------------------------------------------
// Ruby
// ---------------------------------------------------------------------------
// Ruby: `def`/`class`/`module` boundaries.
describe('parseCodeFile — Ruby', () => {
  it('splits at def and class boundaries', () => {
    const content = `
class Greeter
def initialize(name)
@name = name
end
def greet
"Hello, #{@name}!"
end
end
def standalone_helper
puts "helper"
end
`.trim();
    const snippets = parseCodeFile(content, 'greeter.rb', 'ruby');
    expect(snippets.some((s) => s.content.includes('class Greeter'))).toBe(true);
    expect(snippets.some((s) => s.content.includes('def standalone_helper'))).toBe(true);
  });
});
// ---------------------------------------------------------------------------
// Config / data files
// ---------------------------------------------------------------------------
// Config/data files are chunked by top-level keys; every snippet is code.
describe('parseCodeFile — JSON', () => {
  it('produces at least one code snippet from a JSON object', () => {
    const content = JSON.stringify(
      {
        name: 'my-package',
        version: '1.0.0',
        dependencies: { lodash: '^4.17.21' }
      },
      null,
      2
    );
    const snippets = parseCodeFile(content, 'package.json', 'json');
    expect(snippets.length).toBeGreaterThanOrEqual(1);
    expect(snippets.every((s) => s.type === 'code')).toBe(true);
  });
});
describe('parseCodeFile — YAML', () => {
  it('splits a YAML file at top-level keys', () => {
    const content = `
name: my-project
version: 1.0.0
scripts:
build: tsc
test: vitest
dependencies:
lodash: ^4.17.21
`.trim();
    const snippets = parseCodeFile(content, 'config.yaml', 'yaml');
    expect(snippets.length).toBeGreaterThanOrEqual(1);
  });
});
// ---------------------------------------------------------------------------
// HTML-like files
// ---------------------------------------------------------------------------
// HTML-like files yield both code (script contents) and info (visible text).
describe('parseCodeFile — HTML', () => {
  it('extracts script block and text content', () => {
    const content = `
<!DOCTYPE html>
<html>
<head><title>Test Page</title></head>
<body>
<p>This is the page body content with enough text for an info snippet.</p>
<script>
function init() {
console.log('page loaded and ready for interaction');
}
</script>
</body>
</html>
`.trim();
    const snippets = parseCodeFile(content, 'index.html', 'html');
    expect(snippets.some((s) => s.type === 'code')).toBe(true);
    expect(snippets.some((s) => s.type === 'info')).toBe(true);
  });
});
// ---------------------------------------------------------------------------
// Plain text
// ---------------------------------------------------------------------------
// Plain text is split on paragraph boundaries; every snippet is informational.
describe('parseCodeFile — plain text', () => {
  it('splits on paragraph boundaries', () => {
    const content = `
This is the first paragraph with enough content to pass the minimum length check.
This is the second paragraph that also has enough content to be included here.
`.trim();
    const snippets = parseCodeFile(content, 'notes.txt', 'text');
    expect(snippets.length).toBeGreaterThanOrEqual(2);
    expect(snippets.every((s) => s.type === 'info')).toBe(true);
  });
  it('skips paragraphs shorter than 20 characters', () => {
    const content = 'Short.\n\nThis is a much longer paragraph that definitely passes the minimum length filter.';
    const snippets = parseCodeFile(content, 'notes.txt', 'text');
    expect(snippets.length).toBe(1);
  });
});
// ---------------------------------------------------------------------------
// Unknown language fallback
// ---------------------------------------------------------------------------
// Languages without a BOUNDARY_PATTERNS entry use the line-count window.
describe('parseCodeFile — unknown language', () => {
  it('falls back to sliding window for unrecognised languages', () => {
    const lines = Array.from({ length: 50 }, (_, i) => `line ${i}: some code content here`);
    const content = lines.join('\n');
    const snippets = parseCodeFile(content, 'script.lua', 'lua');
    expect(snippets.length).toBeGreaterThanOrEqual(1);
    expect(snippets.every((s) => s.type === 'code')).toBe(true);
  });
});
// ---------------------------------------------------------------------------
// Min content filter
// ---------------------------------------------------------------------------
// Output snippets should never be shorter than MIN_CONTENT_LENGTH characters.
describe('parseCodeFile — minimum content filter', () => {
  it('skips segments shorter than 20 characters', () => {
    const content = `
export function realFunction(): string {
// A function with enough content to be included in the output snippets.
return 'result value from the function that does the operation here';
}
`.trim();
    const snippets = parseCodeFile(content, 'test.ts', 'typescript');
    expect(snippets.every((s) => s.content.length >= 20)).toBe(true);
  });
});
// ---------------------------------------------------------------------------
// Token count cap
// ---------------------------------------------------------------------------
// Oversized segments are re-chunked, so every snippet stays near the cap.
describe('parseCodeFile — token count', () => {
  it('all snippets have tokenCount within MAX_TOKENS', () => {
    const lines = Array.from({ length: 300 }, (_, i) => `// comment line number ${i} here\nconst x${i} = ${i};`);
    const content = lines.join('\n');
    const snippets = parseCodeFile(content, 'large.ts', 'typescript');
    for (const s of snippets) {
      expect(estimateTokens(s.content)).toBeLessThanOrEqual(MAX_TOKENS + 50); // slight tolerance for boundary chunks
    }
  });
});
// ---------------------------------------------------------------------------
// parseFile integration
// ---------------------------------------------------------------------------
// Integration through the public `parseFile` entry point: language detection,
// parser dispatch, and the identity fields stamped onto every snippet.
describe('parseFile — integration', () => {
  const opts = { repositoryId: 'repo-1', documentId: 'doc-1', versionId: 'v1' };
  it('returns NewSnippet records with all required fields for a .ts file', () => {
    const file = makeFile(
      'src/utils.ts',
      `export function add(a: number, b: number): number {\n return a + b;\n}\n`
    );
    const snippets = parseFile(file, opts);
    expect(snippets.length).toBeGreaterThanOrEqual(1);
    for (const s of snippets) {
      expect(s.id).toBeTruthy();
      expect(s.repositoryId).toBe('repo-1');
      expect(s.documentId).toBe('doc-1');
      expect(s.versionId).toBe('v1');
      expect(s.createdAt).toBeInstanceOf(Date);
      expect(s.content).toBeTruthy();
      expect(s.type).toMatch(/^(code|info)$/);
    }
  });
  it('returns NewSnippet records for a .md file', () => {
    const file = makeFile(
      'README.md',
      `# Hello\n\nThis is a long enough paragraph to pass the minimum content length filter.\n`,
      'markdown'
    );
    const snippets = parseFile(file, opts);
    expect(snippets.length).toBeGreaterThanOrEqual(1);
    // Markdown prose sections become info snippets
    expect(snippets[0]?.type).toBe('info');
  });
  it('uses null for versionId when not provided', () => {
    const file = makeFile('src/index.ts', `export function noop(): void {}\n`);
    const snippets = parseFile(file, { repositoryId: 'r', documentId: 'd' });
    // noop is too short; file may return 0 snippets — just verify no error thrown
    expect(Array.isArray(snippets)).toBe(true);
  });
});
// ---------------------------------------------------------------------------
// BOUNDARY_PATTERNS export
// ---------------------------------------------------------------------------
// Sanity check on the exported pattern table rather than its behaviour.
describe('BOUNDARY_PATTERNS', () => {
  it('contains entries for core languages', () => {
    expect(BOUNDARY_PATTERNS['typescript']).toBeInstanceOf(RegExp);
    expect(BOUNDARY_PATTERNS['python']).toBeInstanceOf(RegExp);
    expect(BOUNDARY_PATTERNS['go']).toBeInstanceOf(RegExp);
    expect(BOUNDARY_PATTERNS['rust']).toBeInstanceOf(RegExp);
    expect(BOUNDARY_PATTERNS['ruby']).toBeInstanceOf(RegExp);
  });
});

View File

@@ -0,0 +1,302 @@
/**
* Code file parser for TRUEREF-0005.
*
* Splits source-code files into function/class-level chunks using
* language-specific regex boundary detection. Falls back to a line-count
* sliding window for unrecognised languages.
*/
import { basename } from 'node:path';
import type { NewSnippet } from '$lib/server/db/schema.js';
import {
estimateTokens,
chunkLines,
chunkText,
MAX_TOKENS,
OVERLAP_TOKENS,
MIN_CONTENT_LENGTH
} from './chunker.js';
// ---------------------------------------------------------------------------
// Boundary patterns per language
// ---------------------------------------------------------------------------
/**
 * Language-specific regexes that mark the START of a top-level declaration
 * line. Each pattern is tested line-by-line by `splitAtBoundaries` (so the
 * multiline flag is not needed); a matching line begins a new segment.
 */
export const BOUNDARY_PATTERNS: Record<string, RegExp> = {
  // Optional export/declare/async prefixes before any declaration keyword.
  typescript: /^(export\s+)?(declare\s+)?(async\s+)?(function|class|interface|type|enum|const|let|var)\s+\w+/,
  javascript: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,
  // `def`/`class`, optionally prefixed by `async`.
  python: /^(async\s+)?(def|class)\s+\w+/,
  go: /^(func|type|var|const)\s+\w+/,
  // Optional `pub` / `pub(crate)` visibility before the item keyword.
  rust: /^(pub(\s*\(crate\))?\s+)?(async\s+)?(fn|impl|struct|enum|trait|type|const|static)\s+\w+/,
  // Java/C#: one or more modifiers, a (possibly generic/array) type, a name,
  // then `(` or `{` — i.e. a method or member signature.
  java: /^(\s*(public|private|protected|static|final|abstract|synchronized)\s+)+[\w<>\[\]]+\s+\w+\s*[({]/,
  csharp: /^(\s*(public|private|protected|internal|static|override|virtual|abstract|sealed)\s+)+[\w<>\[\]]+\s+\w+\s*[({]/,
  // Kotlin/Swift: zero or more modifiers, then a declaration keyword.
  kotlin: /^(\s*(public|private|protected|internal|override|suspend|inline|open|abstract|sealed)\s+)*(fun|class|object|interface|data class|sealed class|enum class)\s+\w+/,
  swift: /^(\s*(public|private|internal|fileprivate|open|override|static|final|class)\s+)*(func|class|struct|enum|protocol|extension)\s+\w+/,
  ruby: /^(def|class|module)\s+\w+/
};
// ---------------------------------------------------------------------------
// Internal types
// ---------------------------------------------------------------------------
/**
 * Parser-produced snippet fields. The identity/ownership columns (`id`,
 * `repositoryId`, `documentId`, `versionId`, `createdAt`) are stamped on
 * later by `parseFile`.
 */
type RawSnippet = Omit<NewSnippet, 'id' | 'repositoryId' | 'documentId' | 'versionId' | 'createdAt'>;
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
/**
 * Split `content` at lines matching `pattern`, returning the segments between
 * boundaries. Each segment includes its opening boundary line, and the first
 * segment always starts at line 0 (so a non-matching preamble is kept).
 */
function splitAtBoundaries(content: string, pattern: RegExp): string[] {
  const lines = content.split('\n');
  // Record the index of every line that starts a new segment. Index 0 always
  // does; afterwards, any line matching `pattern` opens a fresh segment.
  const starts = [0];
  lines.forEach((line, i) => {
    if (i > 0 && pattern.test(line)) starts.push(i);
  });
  // Materialise the segments between consecutive start indices.
  return starts.map((from, s) => lines.slice(from, starts[s + 1] ?? lines.length).join('\n'));
}
// ---------------------------------------------------------------------------
// Sliding-window fallback for code
// ---------------------------------------------------------------------------
/**
 * Fallback chunker: slice the file into 200-line windows with a 20-line
 * overlap, keeping every window that survives the minimum-length filter.
 */
function slidingWindowChunks(content: string, filePath: string, language: string): RawSnippet[] {
  const title = basename(filePath);
  const snippets: RawSnippet[] = [];
  for (const chunk of chunkLines(content.split('\n'), 200, 20)) {
    if (chunk.trim().length < MIN_CONTENT_LENGTH) continue;
    snippets.push({
      type: 'code',
      title,
      content: chunk,
      language,
      breadcrumb: filePath,
      tokenCount: estimateTokens(chunk)
    });
  }
  return snippets;
}
// ---------------------------------------------------------------------------
// Config / data file parser (JSON, YAML, TOML)
// ---------------------------------------------------------------------------
/**
* Chunk config/data files by splitting on top-level keys.
*
* Strategy: find lines that look like top-level keys (zero indentation,
* followed by colon/equals/brace) and treat each as a boundary.
*/
function parseConfigFile(content: string, filePath: string, language: string): RawSnippet[] {
const topLevelKey = /^[\w"'\-]+\s*[:=\[{]/;
const lines = content.split('\n');
const segments: string[] = [];
let current: string[] = [];
for (const line of lines) {
if (topLevelKey.test(line) && current.length > 0) {
segments.push(current.join('\n'));
current = [line];
} else {
current.push(line);
}
}
if (current.length > 0) segments.push(current.join('\n'));
// If we got only one segment (no structure detected), fall back to sliding window
if (segments.length <= 1) {
return slidingWindowChunks(content, filePath, language);
}
return segments
.filter((seg) => seg.trim().length >= MIN_CONTENT_LENGTH)
.flatMap((seg) => {
if (estimateTokens(seg) <= MAX_TOKENS) {
return [
{
type: 'code' as const,
title: basename(filePath),
content: seg.trim(),
language,
breadcrumb: filePath,
tokenCount: estimateTokens(seg.trim())
}
];
}
return slidingWindowChunks(seg, filePath, language);
});
}
// ---------------------------------------------------------------------------
// HTML / Svelte / Vue parser
// ---------------------------------------------------------------------------
/**
 * Extract script blocks (as code snippets) and visible text content (as info
 * snippets) from HTML-like files (HTML, Svelte, Vue).
 */
function parseHtmlLikeFile(content: string, filePath: string, language: string): RawSnippet[] {
  const title = basename(filePath);
  const snippets: RawSnippet[] = [];
  // Code snippets: the inside of every <script> block (incl. <script lang="ts">).
  const scriptPattern = /<script(?:\s[^>]*)?>[\s\S]*?<\/script>/gi;
  for (const found of content.matchAll(scriptPattern)) {
    // Strip the outer tags, keep just the code.
    const inner = found[0].replace(/^<script[^>]*>/, '').replace(/<\/script>$/, '').trim();
    if (inner.length < MIN_CONTENT_LENGTH) continue;
    if (estimateTokens(inner) <= MAX_TOKENS) {
      snippets.push({
        type: 'code',
        title,
        content: inner,
        language,
        breadcrumb: filePath,
        tokenCount: estimateTokens(inner)
      });
    } else {
      snippets.push(...slidingWindowChunks(inner, filePath, language));
    }
  }
  // Info snippets: page text with scripts, styles and all tags stripped.
  const text = content
    .replace(/<script(?:\s[^>]*)?>[\s\S]*?<\/script>/gi, '')
    .replace(/<style(?:\s[^>]*)?>[\s\S]*?<\/style>/gi, '')
    .replace(/<[^>]+>/g, ' ')
    .replace(/\s{2,}/g, ' ')
    .trim();
  if (text.length >= MIN_CONTENT_LENGTH) {
    for (const chunk of chunkText(text, MAX_TOKENS, OVERLAP_TOKENS)) {
      snippets.push({
        type: 'info',
        title,
        content: chunk,
        language: null,
        breadcrumb: filePath,
        tokenCount: estimateTokens(chunk)
      });
    }
  }
  return snippets;
}
// ---------------------------------------------------------------------------
// Plain-text / RST parser
// ---------------------------------------------------------------------------
/**
 * Split plain-text/RST content into info snippets, one or more per paragraph.
 * Paragraphs are runs of text separated by one or more blank lines.
 */
function parsePlainText(content: string, filePath: string): RawSnippet[] {
  const title = basename(filePath);
  return content
    .split(/\n{2,}/)
    .map((paragraph) => paragraph.trim())
    .filter((paragraph) => paragraph.length >= MIN_CONTENT_LENGTH)
    .flatMap((paragraph) =>
      chunkText(paragraph, MAX_TOKENS, OVERLAP_TOKENS).map((chunk) => ({
        type: 'info' as const,
        title,
        content: chunk,
        language: null,
        breadcrumb: filePath,
        tokenCount: estimateTokens(chunk)
      }))
    );
}
// ---------------------------------------------------------------------------
// Public parser
// ---------------------------------------------------------------------------
/**
 * Parse a non-Markdown code or data file into raw snippets.
 *
 * Dispatch order: plain text -> config/data -> HTML-like -> regex boundary
 * detection for known languages -> line-count sliding window fallback.
 */
export function parseCodeFile(content: string, filePath: string, language: string): RawSnippet[] {
  // Plain text / RST: paragraph-based info snippets.
  if (language === 'text') {
    return parsePlainText(content, filePath);
  }
  // Config / data files: split on top-level keys.
  if (['json', 'yaml', 'toml'].includes(language)) {
    return parseConfigFile(content, filePath, language);
  }
  // HTML-like files: script blocks plus stripped text content.
  if (['html', 'svelte', 'vue'].includes(language)) {
    return parseHtmlLikeFile(content, filePath, language);
  }
  // (The former `normalisedLang` ternary mapped 'csharp' to itself — a no-op —
  // so the language is used directly for the pattern lookup.)
  const pattern = BOUNDARY_PATTERNS[language];
  const title = basename(filePath);
  const breadcrumb = filePath;
  if (!pattern) {
    // Fallback: line-count sliding window for unrecognised languages.
    return slidingWindowChunks(content, filePath, language);
  }
  const segments = splitAtBoundaries(content, pattern);
  // A single segment means no boundary matched anywhere — unless the very
  // first line was itself a boundary, fall back to the sliding window.
  const firstLine = content.split('\n', 1)[0] ?? '';
  if (segments.length === 1 && !pattern.test(firstLine)) {
    return slidingWindowChunks(content, filePath, language);
  }
  return segments
    .filter((seg) => seg.trim().length >= MIN_CONTENT_LENGTH)
    .flatMap((seg) => {
      const trimmed = seg.trim();
      const tokenCount = estimateTokens(trimmed);
      if (tokenCount <= MAX_TOKENS) {
        return [
          {
            type: 'code' as const,
            title,
            content: trimmed,
            language,
            breadcrumb,
            tokenCount
          }
        ];
      }
      // Oversized declaration: re-chunk with the sliding window.
      return slidingWindowChunks(trimmed, filePath, language);
    });
}

View File

@@ -0,0 +1,53 @@
/**
* Document parser entry point for TRUEREF-0005.
*
* Exposes `parseFile` which transforms a `CrawledFile` into an array of
* `NewSnippet` records ready for database insertion.
*/
import type { CrawledFile } from '$lib/server/crawler/types.js';
import type { NewSnippet } from '$lib/server/db/schema.js';
import { detectLanguage } from './language.js';
import { parseMarkdown } from './markdown.parser.js';
import { parseCodeFile } from './code.parser.js';
// ---------------------------------------------------------------------------
// Public API
// ---------------------------------------------------------------------------
export interface ParseOptions {
  /** Repository the resulting snippets belong to. */
  repositoryId: string;
  /** Document (crawled file) record the snippets are derived from. */
  documentId: string;
  /** Optional version identifier; persisted as `null` when omitted. */
  versionId?: string;
}
/**
 * Parse a crawled file into an array of `NewSnippet` records.
 *
 * The language is detected from the file extension. Markdown/MDX files are
 * split by heading hierarchy; every other file goes through the code parser
 * (language-specific boundary detection or a sliding-window fallback). Each
 * raw snippet is stamped with a fresh UUID, the ownership ids from `options`,
 * and a creation timestamp.
 */
export function parseFile(file: CrawledFile, options: ParseOptions): NewSnippet[] {
  const language = detectLanguage(file.path);
  let raw;
  if (language === 'markdown') {
    raw = parseMarkdown(file.content, file.path);
  } else {
    raw = parseCodeFile(file.content, file.path, language);
  }
  const snippets: NewSnippet[] = [];
  for (const snippet of raw) {
    snippets.push({
      ...snippet,
      id: crypto.randomUUID(),
      repositoryId: options.repositoryId,
      documentId: options.documentId,
      versionId: options.versionId ?? null,
      createdAt: new Date()
    });
  }
  return snippets;
}
// Re-export helpers for consumers that need them individually
export { detectLanguage } from './language.js';
export { estimateTokens, chunkText, chunkLines, MAX_TOKENS, OVERLAP_TOKENS, MIN_CONTENT_LENGTH } from './chunker.js';
export { parseMarkdown } from './markdown.parser.js';
export { parseCodeFile, BOUNDARY_PATTERNS } from './code.parser.js';

View File

@@ -0,0 +1,56 @@
/**
* Language detection for the document parser (TRUEREF-0005).
*
* Maps file extensions to canonical language names used throughout the parser.
*/
import { extname } from 'node:path';
// ---------------------------------------------------------------------------
// Language map
// ---------------------------------------------------------------------------
/**
 * Canonical extension -> language table. Keys are lower-case and include the
 * leading dot, matching the output of `extname`.
 */
export const LANGUAGE_MAP: Record<string, string> = {
  // Web / scripting
  '.ts': 'typescript',
  '.tsx': 'typescript',
  '.js': 'javascript',
  '.jsx': 'javascript',
  '.py': 'python',
  '.rb': 'ruby',
  // Compiled languages
  '.go': 'go',
  '.rs': 'rust',
  '.java': 'java',
  '.cs': 'csharp',
  '.cpp': 'cpp',
  '.c': 'c',
  '.h': 'c',
  '.swift': 'swift',
  '.kt': 'kotlin',
  '.php': 'php',
  '.scala': 'scala',
  // Shells
  '.sh': 'bash',
  '.bash': 'bash',
  '.zsh': 'bash',
  // Docs and data
  '.md': 'markdown',
  '.mdx': 'markdown',
  '.json': 'json',
  '.yaml': 'yaml',
  '.yml': 'yaml',
  '.toml': 'toml',
  // Markup / styles / components
  '.html': 'html',
  '.css': 'css',
  '.svelte': 'svelte',
  '.vue': 'vue',
  '.sql': 'sql',
  '.txt': 'text',
  '.rst': 'text'
};
/**
 * Detect the canonical language name for `filePath` from its extension
 * (case-insensitive). Unknown or missing extensions map to 'text'.
 */
export function detectLanguage(filePath: string): string {
  const extension = extname(filePath).toLowerCase();
  const mapped = LANGUAGE_MAP[extension];
  return mapped === undefined ? 'text' : mapped;
}

View File

@@ -0,0 +1,272 @@
/**
* Unit tests for the Markdown parser (TRUEREF-0005).
*/
import { describe, it, expect } from 'vitest';
import { parseMarkdown } from './markdown.parser.js';
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
/** Build a backtick-fenced code block without nesting backticks in template literals. */
function fence(lang: string, code: string): string {
  return ['```' + lang, code, '```'].join('\n');
}
/** Build a tilde-fenced code block (the alternative CommonMark fence syntax). */
function tildeFence(lang: string, code: string): string {
  return ['~~~' + lang, code, '~~~'].join('\n');
}
// ---------------------------------------------------------------------------
// Basic section splitting
// ---------------------------------------------------------------------------
// Heading-based section splitting and breadcrumb construction.
describe('parseMarkdown — section splitting', () => {
  it('produces no snippets for empty content', () => {
    expect(parseMarkdown('', 'README.md')).toHaveLength(0);
  });
  it('skips content shorter than 20 characters', () => {
    const result = parseMarkdown('# Title\n\nShort.\n', 'README.md');
    expect(result).toHaveLength(0);
  });
  it('parses a single heading section into an info snippet', () => {
    const source = [
      '# Introduction',
      '',
      'This is a paragraph with enough content to pass the minimum length check.'
    ].join('\n');
    const snippets = parseMarkdown(source, 'README.md');
    expect(snippets.length).toBeGreaterThanOrEqual(1);
    const info = snippets.find((s) => s.type === 'info');
    expect(info).toBeDefined();
    expect(info?.title).toBe('Introduction');
    expect(info?.breadcrumb).toBe('Introduction');
  });
  it('builds correct breadcrumb for nested headings', () => {
    const source = [
      '# Getting Started',
      '',
      'Intro text that is long enough to be included here.',
      '',
      '## Installation',
      '',
      'Install by running the command shown below in your terminal.'
    ].join('\n');
    const snippets = parseMarkdown(source, 'README.md');
    const installation = snippets.find((s) => s.title === 'Installation');
    expect(installation).toBeDefined();
    // breadcrumb chains parent headings with ' > '
    expect(installation?.breadcrumb).toBe('Getting Started > Installation');
  });
  it('resets heading stack correctly when headings ascend', () => {
    const source = [
      '# H1',
      '',
      'Some introductory prose that is longer than twenty characters.',
      '',
      '## H2',
      '',
      'More content here, also long enough to pass the threshold check.',
      '',
      '# Second H1',
      '',
      'Content for second top-level heading, long enough to be included.'
    ].join('\n');
    const snippets = parseMarkdown(source, 'doc.md');
    const secondH1 = snippets.find((s) => s.title === 'Second H1');
    expect(secondH1).toBeDefined();
    // a new H1 clears the stack, so the breadcrumb is just the heading itself
    expect(secondH1?.breadcrumb).toBe('Second H1');
  });
  it('falls back to filename when no heading is present', () => {
    const source = 'This is some standalone prose content that is long enough to pass.';
    const snippets = parseMarkdown(source, 'notes.md');
    expect(snippets.length).toBeGreaterThanOrEqual(1);
    expect(snippets[0]?.title).toBe('notes.md');
  });
});
// ---------------------------------------------------------------------------
// Fenced code block extraction
// ---------------------------------------------------------------------------
// Fenced code blocks become code snippets carrying the fence's language tag.
describe('parseMarkdown — code block extraction', () => {
  it('extracts a fenced code block as a code snippet', () => {
    const codeBlock = fence('typescript', 'function hello(name: string): string {\n return `Hello, ${name}!`;\n}');
    const source = [
      '# Example',
      '',
      'Some prose here that is long enough to pass the minimum check.',
      '',
      codeBlock
    ].join('\n');
    const snippets = parseMarkdown(source, 'README.md');
    const code = snippets.find((s) => s.type === 'code');
    expect(code).toBeDefined();
    expect(code?.language).toBe('typescript');
    expect(code?.content).toContain('function hello');
  });
  it('extracts multiple code blocks from the same section', () => {
    const bashBlock = fence('bash', 'npm install my-library --save-dev');
    const jsBlock = fence('javascript', "const lib = require('my-lib');\nlib.doSomething();");
    const source = [
      '# Usage',
      '',
      'Description of the usage pattern with enough text here.',
      '',
      bashBlock,
      '',
      'More text in between the two code blocks, just enough.',
      '',
      jsBlock
    ].join('\n');
    const snippets = parseMarkdown(source, 'README.md');
    const codeSnippets = snippets.filter((s) => s.type === 'code');
    expect(codeSnippets.length).toBe(2);
    const langs = codeSnippets.map((s) => s.language);
    expect(langs).toContain('bash');
    expect(langs).toContain('javascript');
  });
  it('skips code blocks shorter than 20 characters', () => {
    // 'x = 1' is under the minimum length, so only info snippets remain
    const shortBlock = fence('', 'x = 1');
    const source = [
      '# Example',
      '',
      'Some prose here that is long enough to pass.',
      '',
      shortBlock
    ].join('\n');
    const snippets = parseMarkdown(source, 'README.md');
    expect(snippets.every((s) => s.type === 'info')).toBe(true);
  });
  it('handles tilde-fenced code blocks', () => {
    const pyBlock = tildeFence('python', 'def greet(name):\n return f"Hello, {name}"');
    const source = [
      '# Section',
      '',
      'Long enough prose content for the section to be included here.',
      '',
      pyBlock
    ].join('\n');
    const snippets = parseMarkdown(source, 'README.md');
    const code = snippets.find((s) => s.type === 'code');
    expect(code).toBeDefined();
    expect(code?.language).toBe('python');
  });
  it('preserves breadcrumb on code snippets', () => {
    const codeBlock = fence(
      'typescript',
      'function connect(url: string): Promise<void> {\n return Promise.resolve();\n}'
    );
    const source = [
      '# API Reference',
      '',
      '## Methods',
      '',
      'Overview of the methods available in this library.',
      '',
      codeBlock
    ].join('\n');
    const snippets = parseMarkdown(source, 'API.md');
    const code = snippets.find((s) => s.type === 'code');
    expect(code).toBeDefined();
    // code snippets inherit the enclosing section's heading breadcrumb
    expect(code?.breadcrumb).toBe('API Reference > Methods');
  });
});
// ---------------------------------------------------------------------------
// Token counting
// ---------------------------------------------------------------------------
describe('parseMarkdown — token counting', () => {
  it('attaches a non-zero tokenCount to every snippet', () => {
    const doc = [
      '# Overview',
      '',
      'This section contains enough text to produce an info snippet for the test.'
    ].join('\n');
    const parsed = parseMarkdown(doc, 'README.md');
    // Every emitted snippet carries a positive token estimate.
    expect(parsed.every((snippet) => snippet.tokenCount > 0)).toBe(true);
  });
});
// ---------------------------------------------------------------------------
// Large content chunking
// ---------------------------------------------------------------------------
describe('parseMarkdown — large content chunking', () => {
  it('splits a very large prose section into multiple snippets', () => {
    // ~4 000 characters of repeated words — far larger than a single
    // ~1 800-character chunk window, so chunking must kick in.
    const longParagraph = Array(800).fill('word').join(' ');
    const parsed = parseMarkdown(`# Big Section\n\n${longParagraph}`, 'big.md');
    const infoOnly = parsed.filter((snippet) => snippet.type === 'info');
    expect(infoOnly.length).toBeGreaterThan(1);
  });
});
// ---------------------------------------------------------------------------
// Real-world sample
// ---------------------------------------------------------------------------
describe('parseMarkdown — real-world sample', () => {
  it('correctly parses a realistic README excerpt', () => {
    const installBlock = fence('bash', 'npm install my-library');
    const usageBlock = fence('typescript', "import { doTheThing } from 'my-library';\n\ndoTheThing({ verbose: true });");
    const readme = [
      '# My Library',
      '',
      'A handy library for doing things quickly and efficiently.',
      '',
      '## Installation',
      '',
      'Install via npm using the following command in your project directory:',
      '',
      installBlock,
      '',
      '## Usage',
      '',
      'Import the library and call the main function as shown below:',
      '',
      usageBlock,
      '',
      '## API',
      '',
      '### doTheThing(options)',
      '',
      'Performs the main operation. Options are passed as a plain object.'
    ].join('\n');
    const parsed = parseMarkdown(readme, 'README.md');
    // Both snippet types should be present in a mixed document.
    expect(parsed.some((snippet) => snippet.type === 'info')).toBe(true);
    expect(parsed.some((snippet) => snippet.type === 'code')).toBe(true);
    // Breadcrumbs should reflect the full H1 > H2 > H3 trail.
    const apiEntry = parsed.find((snippet) => snippet.title === 'doTheThing(options)');
    expect(apiEntry).toBeDefined();
    expect(apiEntry?.breadcrumb).toBe('My Library > API > doTheThing(options)');
  });
});

View File

@@ -0,0 +1,171 @@
/**
* Markdown document parser for TRUEREF-0005.
*
* Splits Markdown/MDX files into heading-based sections and extracts fenced
* code blocks as separate code snippets.
*/
import { basename } from 'node:path';
import type { NewSnippet } from '$lib/server/db/schema.js';
import { estimateTokens, chunkText, MAX_TOKENS, OVERLAP_TOKENS, MIN_CONTENT_LENGTH } from './chunker.js';
// ---------------------------------------------------------------------------
// Internal types
// ---------------------------------------------------------------------------
/** A fenced code block extracted from within a Markdown section. */
interface CodeBlock {
  /** Lower-cased language tag from the fence info string ('' when absent). */
  language: string;
  /** Raw code between the fences, excluding the fence lines themselves. */
  code: string;
}

/** One heading-delimited slice of the document. */
interface MarkdownSection {
  /** Heading stack at this point, e.g. ["Getting Started", "Installation"] */
  headings: string[];
  /** Prose text content (code blocks stripped out) */
  content: string;
  /** Fenced code blocks found within this section */
  codeBlocks: CodeBlock[];
}

// ---------------------------------------------------------------------------
// Section splitting
// ---------------------------------------------------------------------------

/**
 * Split the full Markdown source into sections delimited by ATX headings
 * (# … ####). Fenced code blocks are extracted into `codeBlocks` and removed
 * from the prose `content`.
 *
 * Notes on fence/heading handling:
 * - The language tag may be separated from the fence by spaces or tabs
 *   ("``` ruby") and may contain non-word characters ("```c++", "```c#").
 * - ATX closing sequences are stripped from titles ("## Title ##" → "Title").
 * - An unclosed fence at EOF is treated as prose rather than dropped.
 */
function splitIntoSections(source: string): MarkdownSection[] {
  const lines = source.split('\n');
  const sections: MarkdownSection[] = [];

  // Heading stack: index 0 = H1, 1 = H2, … (tracked up to H4).
  const headingStack: string[] = [];

  // Accumulators for the section currently being built.
  let textLines: string[] = [];
  const codeBlocks: CodeBlock[] = [];

  // Fenced-code-block tracking state.
  let inCodeBlock = false;
  let codeFence = '';
  let codeLanguage = '';
  let codeLines: string[] = [];

  /** Emit the accumulated prose and code blocks as one section, then reset. */
  function flushSection() {
    sections.push({
      headings: [...headingStack],
      content: textLines.join('\n'),
      codeBlocks: [...codeBlocks]
    });
    textLines = [];
    codeBlocks.length = 0;
  }

  for (const line of lines) {
    // ---- Fenced code block handling ----
    if (!inCodeBlock) {
      // Opening fence: 3+ backticks or tildes, optionally followed by an
      // info string. `[^\s`]*` (rather than `[\w-]*`) lets languages like
      // "c++" / "c#" through, and the `[ \t]*` allows "``` ruby".
      const fenceMatch = line.match(/^(`{3,}|~{3,})[ \t]*([^\s`]*)/);
      if (fenceMatch) {
        inCodeBlock = true;
        codeFence = fenceMatch[1];
        codeLanguage = fenceMatch[2].toLowerCase();
        codeLines = [];
        continue;
      }
    } else {
      // Closing fence: same character, at least as long as the opener.
      const closingFence = new RegExp(`^${codeFence[0]}{${codeFence.length},}\\s*$`);
      if (closingFence.test(line)) {
        inCodeBlock = false;
        const code = codeLines.join('\n');
        // Skip trivially short blocks (e.g. "x = 1").
        if (code.trim().length >= MIN_CONTENT_LENGTH) {
          codeBlocks.push({ language: codeLanguage, code });
        }
        codeLines = [];
        continue;
      }
      codeLines.push(line);
      continue;
    }

    // ---- Heading detection (ATX only, H1–H4) ----
    const headingMatch = line.match(/^(#{1,4})\s+(.*)/);
    if (headingMatch) {
      // Emit whatever has accumulated before this heading.
      flushSection();
      const level = headingMatch[1].length; // 1–4
      // Strip an optional ATX closing sequence ("## Title ##" → "Title").
      const title = headingMatch[2].trim().replace(/\s+#+$/, '');
      // Trim the stack to the depth above this heading and push the new title.
      headingStack.splice(level - 1, headingStack.length - (level - 1), title);
      continue;
    }

    // ---- Ordinary prose line ----
    textLines.push(line);
  }

  // Flush any trailing content (unclosed fence treated as prose).
  if (inCodeBlock) {
    textLines.push(...codeLines);
  }
  flushSection();

  return sections;
}
// ---------------------------------------------------------------------------
// Public parser
// ---------------------------------------------------------------------------
type RawSnippet = Omit<NewSnippet, 'id' | 'repositoryId' | 'documentId' | 'versionId' | 'createdAt'>;

/**
 * Parse a Markdown/MDX file into raw snippets (before IDs and DB fields are
 * attached).
 *
 * Each heading-delimited section yields zero or more chunked 'info' snippets
 * for its prose, plus one 'code' snippet per fenced block that meets the
 * minimum-length threshold.
 */
export function parseMarkdown(content: string, filePath: string): RawSnippet[] {
  const fallbackTitle = basename(filePath);
  const result: RawSnippet[] = [];

  for (const section of splitIntoSections(content)) {
    // Breadcrumb is the joined heading trail; null when there are no headings.
    const trail = section.headings.join(' > ');
    const crumb = trail.length > 0 ? trail : null;
    // Title is the deepest heading, falling back to the file name.
    const title = section.headings.at(-1) ?? fallbackTitle;

    // ---- Prose → chunked 'info' snippets ----
    const prose = section.content.trim();
    if (prose.length >= MIN_CONTENT_LENGTH) {
      for (const piece of chunkText(prose, MAX_TOKENS, OVERLAP_TOKENS)) {
        result.push({
          type: 'info',
          title,
          content: piece,
          breadcrumb: crumb,
          language: null,
          tokenCount: estimateTokens(piece)
        });
      }
    }

    // ---- Fenced blocks → 'code' snippets ----
    for (const { language, code } of section.codeBlocks) {
      const body = code.trim();
      if (body.length < MIN_CONTENT_LENGTH) continue;
      result.push({
        type: 'code',
        title,
        content: body,
        language: language || null,
        breadcrumb: crumb,
        tokenCount: estimateTokens(body)
      });
    }
  }

  return result;
}