feat(TRUEREF-0005): implement document parser and chunker
- Markdown parser with heading-based section splitting and code block extraction
- Code file parser with regex boundary detection for 10+ languages
- Sliding window chunker with configurable token limits and overlap
- Language detection from file extensions

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
302
src/lib/server/parser/code.parser.ts
Normal file
302
src/lib/server/parser/code.parser.ts
Normal file
@@ -0,0 +1,302 @@
|
||||
/**
|
||||
* Code file parser for TRUEREF-0005.
|
||||
*
|
||||
* Splits source-code files into function/class-level chunks using
|
||||
* language-specific regex boundary detection. Falls back to a line-count
|
||||
* sliding window for unrecognised languages.
|
||||
*/
|
||||
|
||||
import { basename } from 'node:path';
|
||||
import type { NewSnippet } from '$lib/server/db/schema.js';
|
||||
import {
|
||||
estimateTokens,
|
||||
chunkLines,
|
||||
chunkText,
|
||||
MAX_TOKENS,
|
||||
OVERLAP_TOKENS,
|
||||
MIN_CONTENT_LENGTH
|
||||
} from './chunker.js';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Boundary patterns per language
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Each pattern must match the START of a top-level declaration line.
|
||||
* The regex is tested line-by-line (multiline flag not needed).
|
||||
*/
|
||||
export const BOUNDARY_PATTERNS: Record<string, RegExp> = {
|
||||
typescript: /^(export\s+)?(declare\s+)?(async\s+)?(function|class|interface|type|enum|const|let|var)\s+\w+/,
|
||||
javascript: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,
|
||||
python: /^(async\s+)?(def|class)\s+\w+/,
|
||||
go: /^(func|type|var|const)\s+\w+/,
|
||||
rust: /^(pub(\s*\(crate\))?\s+)?(async\s+)?(fn|impl|struct|enum|trait|type|const|static)\s+\w+/,
|
||||
java: /^(\s*(public|private|protected|static|final|abstract|synchronized)\s+)+[\w<>\[\]]+\s+\w+\s*[({]/,
|
||||
csharp: /^(\s*(public|private|protected|internal|static|override|virtual|abstract|sealed)\s+)+[\w<>\[\]]+\s+\w+\s*[({]/,
|
||||
kotlin: /^(\s*(public|private|protected|internal|override|suspend|inline|open|abstract|sealed)\s+)*(fun|class|object|interface|data class|sealed class|enum class)\s+\w+/,
|
||||
swift: /^(\s*(public|private|internal|fileprivate|open|override|static|final|class)\s+)*(func|class|struct|enum|protocol|extension)\s+\w+/,
|
||||
ruby: /^(def|class|module)\s+\w+/
|
||||
};
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Internal types
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Snippet shape produced by the parsers in this module: a NewSnippet with
// the persistence metadata (ids, version, timestamp) stripped — those keys
// are attached later by the database layer before insertion.
type RawSnippet = Omit<NewSnippet, 'id' | 'repositoryId' | 'documentId' | 'versionId' | 'createdAt'>;
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Split `content` at lines that match `pattern`, returning the segments
|
||||
* between boundaries (each segment includes its opening boundary line).
|
||||
*/
|
||||
function splitAtBoundaries(content: string, pattern: RegExp): string[] {
|
||||
const lines = content.split('\n');
|
||||
const segments: string[] = [];
|
||||
let current: string[] = [];
|
||||
|
||||
for (const line of lines) {
|
||||
if (pattern.test(line) && current.length > 0) {
|
||||
// Emit what we have, start a new segment from this boundary line
|
||||
segments.push(current.join('\n'));
|
||||
current = [line];
|
||||
} else {
|
||||
current.push(line);
|
||||
}
|
||||
}
|
||||
|
||||
if (current.length > 0) {
|
||||
segments.push(current.join('\n'));
|
||||
}
|
||||
|
||||
return segments;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Sliding-window fallback for code
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
function slidingWindowChunks(content: string, filePath: string, language: string): RawSnippet[] {
|
||||
const lines = content.split('\n');
|
||||
const windowedChunks = chunkLines(lines, 200, 20);
|
||||
return windowedChunks
|
||||
.filter((chunk) => chunk.trim().length >= MIN_CONTENT_LENGTH)
|
||||
.map((chunk) => ({
|
||||
type: 'code' as const,
|
||||
title: basename(filePath),
|
||||
content: chunk,
|
||||
language,
|
||||
breadcrumb: filePath,
|
||||
tokenCount: estimateTokens(chunk)
|
||||
}));
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Config / data file parser (JSON, YAML, TOML)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Chunk config/data files by splitting on top-level keys.
|
||||
*
|
||||
* Strategy: find lines that look like top-level keys (zero indentation,
|
||||
* followed by colon/equals/brace) and treat each as a boundary.
|
||||
*/
|
||||
function parseConfigFile(content: string, filePath: string, language: string): RawSnippet[] {
|
||||
const topLevelKey = /^[\w"'\-]+\s*[:=\[{]/;
|
||||
const lines = content.split('\n');
|
||||
const segments: string[] = [];
|
||||
let current: string[] = [];
|
||||
|
||||
for (const line of lines) {
|
||||
if (topLevelKey.test(line) && current.length > 0) {
|
||||
segments.push(current.join('\n'));
|
||||
current = [line];
|
||||
} else {
|
||||
current.push(line);
|
||||
}
|
||||
}
|
||||
if (current.length > 0) segments.push(current.join('\n'));
|
||||
|
||||
// If we got only one segment (no structure detected), fall back to sliding window
|
||||
if (segments.length <= 1) {
|
||||
return slidingWindowChunks(content, filePath, language);
|
||||
}
|
||||
|
||||
return segments
|
||||
.filter((seg) => seg.trim().length >= MIN_CONTENT_LENGTH)
|
||||
.flatMap((seg) => {
|
||||
if (estimateTokens(seg) <= MAX_TOKENS) {
|
||||
return [
|
||||
{
|
||||
type: 'code' as const,
|
||||
title: basename(filePath),
|
||||
content: seg.trim(),
|
||||
language,
|
||||
breadcrumb: filePath,
|
||||
tokenCount: estimateTokens(seg.trim())
|
||||
}
|
||||
];
|
||||
}
|
||||
return slidingWindowChunks(seg, filePath, language);
|
||||
});
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// HTML / Svelte / Vue parser
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Extract script blocks and text content from HTML-like files.
|
||||
*/
|
||||
function parseHtmlLikeFile(content: string, filePath: string, language: string): RawSnippet[] {
|
||||
const snippets: RawSnippet[] = [];
|
||||
const title = basename(filePath);
|
||||
|
||||
// Extract <script> blocks (including <script lang="ts">)
|
||||
const scriptPattern = /<script(?:\s[^>]*)?>[\s\S]*?<\/script>/gi;
|
||||
let match: RegExpExecArray | null;
|
||||
const scriptBlocks: string[] = [];
|
||||
|
||||
while ((match = scriptPattern.exec(content)) !== null) {
|
||||
// Strip the outer tags, keep just the code
|
||||
const inner = match[0].replace(/^<script[^>]*>/, '').replace(/<\/script>$/, '').trim();
|
||||
if (inner.length >= MIN_CONTENT_LENGTH) {
|
||||
scriptBlocks.push(inner);
|
||||
}
|
||||
}
|
||||
|
||||
for (const block of scriptBlocks) {
|
||||
if (estimateTokens(block) <= MAX_TOKENS) {
|
||||
snippets.push({
|
||||
type: 'code',
|
||||
title,
|
||||
content: block,
|
||||
language,
|
||||
breadcrumb: filePath,
|
||||
tokenCount: estimateTokens(block)
|
||||
});
|
||||
} else {
|
||||
snippets.push(...slidingWindowChunks(block, filePath, language));
|
||||
}
|
||||
}
|
||||
|
||||
// Strip tags and extract text content for info snippets
|
||||
const text = content
|
||||
.replace(/<script(?:\s[^>]*)?>[\s\S]*?<\/script>/gi, '')
|
||||
.replace(/<style(?:\s[^>]*)?>[\s\S]*?<\/style>/gi, '')
|
||||
.replace(/<[^>]+>/g, ' ')
|
||||
.replace(/\s{2,}/g, ' ')
|
||||
.trim();
|
||||
|
||||
if (text.length >= MIN_CONTENT_LENGTH) {
|
||||
const chunks = chunkText(text, MAX_TOKENS, OVERLAP_TOKENS);
|
||||
for (const chunk of chunks) {
|
||||
snippets.push({
|
||||
type: 'info',
|
||||
title,
|
||||
content: chunk,
|
||||
language: null,
|
||||
breadcrumb: filePath,
|
||||
tokenCount: estimateTokens(chunk)
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return snippets;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Plain-text / RST parser
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
function parsePlainText(content: string, filePath: string): RawSnippet[] {
|
||||
// Split on blank lines (paragraph boundaries)
|
||||
const paragraphs = content.split(/\n{2,}/).filter((p) => p.trim().length >= MIN_CONTENT_LENGTH);
|
||||
|
||||
if (paragraphs.length === 0) return [];
|
||||
|
||||
const title = basename(filePath);
|
||||
const snippets: RawSnippet[] = [];
|
||||
|
||||
for (const para of paragraphs) {
|
||||
const chunks = chunkText(para.trim(), MAX_TOKENS, OVERLAP_TOKENS);
|
||||
for (const chunk of chunks) {
|
||||
snippets.push({
|
||||
type: 'info',
|
||||
title,
|
||||
content: chunk,
|
||||
language: null,
|
||||
breadcrumb: filePath,
|
||||
tokenCount: estimateTokens(chunk)
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return snippets;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Public parser
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Parse a non-Markdown code or data file into raw snippets.
|
||||
*/
|
||||
export function parseCodeFile(content: string, filePath: string, language: string): RawSnippet[] {
|
||||
// Plain text / RST
|
||||
if (language === 'text') {
|
||||
return parsePlainText(content, filePath);
|
||||
}
|
||||
|
||||
// Config / data files
|
||||
if (['json', 'yaml', 'toml'].includes(language)) {
|
||||
return parseConfigFile(content, filePath, language);
|
||||
}
|
||||
|
||||
// HTML-like files
|
||||
if (['html', 'svelte', 'vue'].includes(language)) {
|
||||
return parseHtmlLikeFile(content, filePath, language);
|
||||
}
|
||||
|
||||
// Normalise csharp alias
|
||||
const normalisedLang = language === 'csharp' ? 'csharp' : language;
|
||||
|
||||
const pattern = BOUNDARY_PATTERNS[normalisedLang];
|
||||
const title = basename(filePath);
|
||||
const breadcrumb = filePath;
|
||||
|
||||
if (!pattern) {
|
||||
// Fallback: line-count sliding window
|
||||
return slidingWindowChunks(content, filePath, language);
|
||||
}
|
||||
|
||||
const segments = splitAtBoundaries(content, pattern);
|
||||
|
||||
// If boundary detection produced only one segment covering the whole file,
|
||||
// it means no boundaries matched — fall back to sliding window.
|
||||
if (segments.length === 1 && !pattern.test(content.split('\n')[0])) {
|
||||
return slidingWindowChunks(content, filePath, language);
|
||||
}
|
||||
|
||||
return segments
|
||||
.filter((seg) => seg.trim().length >= MIN_CONTENT_LENGTH)
|
||||
.flatMap((seg) => {
|
||||
const trimmed = seg.trim();
|
||||
if (estimateTokens(trimmed) <= MAX_TOKENS) {
|
||||
return [
|
||||
{
|
||||
type: 'code' as const,
|
||||
title,
|
||||
content: trimmed,
|
||||
language,
|
||||
breadcrumb,
|
||||
tokenCount: estimateTokens(trimmed)
|
||||
}
|
||||
];
|
||||
}
|
||||
// Chunk oversized segments with sliding window
|
||||
return slidingWindowChunks(trimmed, filePath, language);
|
||||
});
|
||||
}
|
||||
Reference in New Issue
Block a user