/**
 * Code file parser for TRUEREF-0005.
 *
 * Splits source-code files into function/class-level chunks using
 * language-specific regex boundary detection. Falls back to a line-count
 * sliding window for unrecognised languages.
 */

import { basename } from 'node:path';
import type { NewSnippet } from '$lib/server/db/schema.js';
import {
	estimateTokens,
	chunkLines,
	chunkText,
	MAX_TOKENS,
	OVERLAP_TOKENS,
	MIN_CONTENT_LENGTH
} from './chunker.js';

// ---------------------------------------------------------------------------
// Boundary patterns per language
// ---------------------------------------------------------------------------

/**
 * Boundary regexes keyed by language name.
 *
 * Each pattern must match the START of a declaration line and is meant to be
 * tested against one line at a time (no multiline flag needed). None of the
 * patterns carries the `g` or `y` flag, so `RegExp.prototype.test` keeps no
 * state between calls.
 *
 * The typescript, javascript, python, go, rust and ruby patterns are anchored
 * at column 0 and therefore only match unindented declarations. The java,
 * csharp, kotlin and swift patterns accept leading whitespace before a
 * modifier, so indented members match as well.
 */
export const BOUNDARY_PATTERNS: Record<string, RegExp> = {
	typescript:
		/^(export\s+)?(declare\s+)?(async\s+)?(function|class|interface|type|enum|const|let|var)\s+\w+/,
	javascript: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,
	python: /^(async\s+)?(def|class)\s+\w+/,
	go: /^(func|type|var|const)\s+\w+/,
	rust: /^(pub(\s*\(crate\))?\s+)?(async\s+)?(fn|impl|struct|enum|trait|type|const|static)\s+\w+/,
	java: /^(\s*(public|private|protected|static|final|abstract|synchronized)\s+)+[\w<>\[\]]+\s+\w+\s*[({]/,
	csharp:
		/^(\s*(public|private|protected|internal|static|override|virtual|abstract|sealed)\s+)+[\w<>\[\]]+\s+\w+\s*[({]/,
	kotlin:
		/^(\s*(public|private|protected|internal|override|suspend|inline|open|abstract|sealed)\s+)*(fun|class|object|interface|data class|sealed class|enum class)\s+\w+/,
	swift:
		/^(\s*(public|private|internal|fileprivate|open|override|static|final|class)\s+)*(func|class|struct|enum|protocol|extension)\s+\w+/,
	ruby: /^(def|class|module)\s+\w+/
};

// ---------------------------------------------------------------------------
// Internal types
// ---------------------------------------------------------------------------

/**
 * Snippet shape produced by the parsers in this file.
 *
 * NOTE(review): the type arguments of the original `Omit<...>` alias were
 * lost from this source. The alias is rebuilt here from the fields the
 * parsers visibly populate (`type`, `title`, `content`, `language`,
 * `breadcrumb`, `tokenCount`). Confirm against the `NewSnippet` schema and
 * restore the original `Omit<NewSnippet, ...>` key list if it differs.
 */
type RawSnippet = Pick<
	NewSnippet,
	'type' | 'title' | 'content' | 'language' | 'breadcrumb' | 'tokenCount'
>;
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------

/**
 * Split `content` at lines that match `pattern`, returning the segments
 * between boundaries (each segment includes its opening boundary line).
 *
 * Lines before the first boundary form their own leading segment. The first
 * line never triggers a split, so no empty leading segment is produced.
 * Lines are split on `\n` only and re-joined with `\n`.
 *
 * @param content - Full text to split.
 * @param pattern - Regex tested against each individual line. Its `lastIndex`
 *   is reset before every test, so `g`/`y` patterns behave like plain ones.
 * @returns The segments in source order; an empty `content` yields `['']`.
 */
function splitAtBoundaries(content: string, pattern: RegExp): string[] {
	const segments: string[] = [];
	let current: string[] = [];

	for (const line of content.split('\n')) {
		// `test` advances `lastIndex` on global/sticky regexes; without this
		// reset a match on one line would make the next line's test start
		// mid-string and miss a real boundary.
		pattern.lastIndex = 0;

		if (current.length > 0 && pattern.test(line)) {
			// Emit what we have, start a new segment from this boundary line.
			segments.push(current.join('\n'));
			current = [line];
		} else {
			current.push(line);
		}
	}

	if (current.length > 0) {
		segments.push(current.join('\n'));
	}

	return segments;
}

// ---------------------------------------------------------------------------
// Sliding-window fallback for code
// ---------------------------------------------------------------------------

/**
 * Fallback chunker used when no structural boundaries are available.
 *
 * Feeds the lines of `content` to `chunkLines` with the arguments `200` and
 * `20` (presumably window size and overlap in lines; confirm against
 * `chunker.js`). Chunks whose trimmed length is below `MIN_CONTENT_LENGTH`
 * are dropped, and each remaining chunk is wrapped as a `'code'` snippet.
 *
 * @param content - Text to chunk.
 * @param filePath - Used verbatim as the breadcrumb; its basename is the title.
 * @param language - Language tag copied onto every snippet.
 * @returns One snippet per retained chunk, content left untrimmed.
 */
function slidingWindowChunks(content: string, filePath: string, language: string): RawSnippet[] {
	// Same for every chunk, so compute it once.
	const title = basename(filePath);

	return chunkLines(content.split('\n'), 200, 20)
		.filter((chunk) => chunk.trim().length >= MIN_CONTENT_LENGTH)
		.map((chunk) => ({
			type: 'code' as const,
			title,
			content: chunk,
			language,
			breadcrumb: filePath,
			tokenCount: estimateTokens(chunk)
		}));
}

// ---------------------------------------------------------------------------
// Config / data file parser (JSON, YAML, TOML)
// ---------------------------------------------------------------------------
/**
 * Chunk config/data files by splitting on top-level keys.
 *
 * Strategy: find lines that look like top-level keys (zero indentation,
 * followed by colon/equals/bracket/brace) and treat each as a boundary.
 *
 * - If at most one segment results (no structure detected), the whole file
 *   goes through the sliding-window fallback.
 * - Segments whose trimmed length is below `MIN_CONTENT_LENGTH` are dropped.
 * - A segment within `MAX_TOKENS` becomes one trimmed `'code'` snippet;
 *   a larger one is re-chunked with the sliding window.
 *
 * NOTE(review): the key pattern requires the line to start with a word
 * character, quote or hyphen, so lines beginning with `[` (e.g. TOML table
 * headers) are not boundaries. Confirm this is intended.
 *
 * @param content - Raw file text.
 * @param filePath - Used verbatim as the breadcrumb; its basename is the title.
 * @param language - Language tag copied onto every snippet.
 * @returns Snippets in source order.
 */
function parseConfigFile(content: string, filePath: string, language: string): RawSnippet[] {
	const topLevelKey = /^[\w"'\-]+\s*[:=\[{]/;

	// Same boundary-splitting rule as the code parser, so reuse the helper
	// instead of repeating the loop here.
	const segments = splitAtBoundaries(content, topLevelKey);

	// If we got only one segment (no structure detected), fall back to sliding window
	if (segments.length <= 1) {
		return slidingWindowChunks(content, filePath, language);
	}

	const title = basename(filePath);
	const snippets: RawSnippet[] = [];

	for (const segment of segments) {
		const trimmed = segment.trim();
		if (trimmed.length < MIN_CONTENT_LENGTH) continue;

		// The size decision is made on the untrimmed segment; the stored
		// content and token count use the trimmed text.
		if (estimateTokens(segment) <= MAX_TOKENS) {
			snippets.push({
				type: 'code' as const,
				title,
				content: trimmed,
				language,
				breadcrumb: filePath,
				tokenCount: estimateTokens(trimmed)
			});
		} else {
			snippets.push(...slidingWindowChunks(segment, filePath, language));
		}
	}

	return snippets;
}

// ---------------------------------------------------------------------------
// HTML / Svelte / Vue parser
// ---------------------------------------------------------------------------

/**
 * Extract script blocks and text content from HTML-like files.
 */
function parseHtmlLikeFile(content: string, filePath: string, language: string): RawSnippet[] {
	const snippets: RawSnippet[] = [];
	const title = basename(filePath);

	// Extract