Files
trueref/src/lib/server/parser/code.parser.ts
Giancarmine Salucci f6be3cfd47 feat(TRUEREF-0005): implement document parser and chunker
- Markdown parser with heading-based section splitting and code block extraction
- Code file parser with regex boundary detection for 10+ languages
- Sliding window chunker with configurable token limits and overlap
- Language detection from file extensions

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-23 09:06:12 +01:00

303 lines
9.4 KiB
TypeScript

/**
* Code file parser for TRUEREF-0005.
*
* Splits source-code files into function/class-level chunks using
* language-specific regex boundary detection. Falls back to a line-count
* sliding window for unrecognised languages.
*/
import { basename } from 'node:path';
import type { NewSnippet } from '$lib/server/db/schema.js';
import {
estimateTokens,
chunkLines,
chunkText,
MAX_TOKENS,
OVERLAP_TOKENS,
MIN_CONTENT_LENGTH
} from './chunker.js';
// ---------------------------------------------------------------------------
// Boundary patterns per language
// ---------------------------------------------------------------------------
/**
 * Regexes that recognise the opening line of a declaration, keyed by language.
 *
 * Every pattern is applied to one line at a time with `RegExp.prototype.test`,
 * so none of them carries the `g`, `y` or `m` flag. Most are anchored at
 * column 0 and therefore only see unindented declarations; the java, csharp,
 * kotlin and swift patterns tolerate leading whitespace before a modifier.
 */
export const BOUNDARY_PATTERNS: Record<string, RegExp> = {
  // Also accepts `export default …` and `abstract class …`.
  typescript:
    /^(export\s+)?(?:default\s+)?(declare\s+)?(?:abstract\s+)?(async\s+)?(function|class|interface|type|enum|const|let|var)\s+\w+/,
  // Also accepts `export default …`.
  javascript: /^(export\s+)?(?:default\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,
  python: /^(async\s+)?(def|class)\s+\w+/,
  // The optional parenthesised part is a method receiver: `func (s *Server) Start()`.
  go: /^(func(?:\s*\([^)]*\))?|type|var|const)\s+\w+/,
  // Accepts any `pub(...)` restriction, an `unsafe` qualifier, and generic
  // parameters directly after the keyword (`impl<T> Stack<T>`).
  rust: /^(pub(\s*\([^)]*\))?\s+)?(async\s+)?(?:unsafe\s+)?(fn|impl|struct|enum|trait|type|const|static)(?:<.*?>)?\s+\w+/,
  // First alternative: type declarations, whatever follows the name
  // (`extends`, generics, brace on the next line). Second alternative:
  // `<type> <name>` immediately followed by `(` or `{`.
  java: /^(\s*(public|private|protected|static|final|abstract|synchronized)\s+)+(?:(?:class|interface|enum|record)\s+\w+|[\w<>\[\]]+\s+\w+\s*[({])/,
  // Same two alternatives as java, with the C# modifier and type keywords.
  csharp:
    /^(\s*(public|private|protected|internal|static|override|virtual|abstract|sealed)\s+)+(?:(?:class|interface|enum|struct|record)\s+\w+|[\w<>\[\]]+\s+\w+\s*[({])/,
  kotlin:
    /^(\s*(public|private|protected|internal|override|suspend|inline|open|abstract|sealed)\s+)*(fun|class|object|interface|data class|sealed class|enum class)\s+\w+/,
  swift:
    /^(\s*(public|private|internal|fileprivate|open|override|static|final|class)\s+)*(func|class|struct|enum|protocol|extension)\s+\w+/,
  ruby: /^(def|class|module)\s+\w+/
};
// ---------------------------------------------------------------------------
// Internal types
// ---------------------------------------------------------------------------
/**
 * Snippet shape returned by the parsers in this file: a `NewSnippet` without
 * `id`, `repositoryId`, `documentId`, `versionId` and `createdAt`.
 * NOTE(review): presumably the caller supplies the omitted fields before the
 * row is persisted — confirm at the call site.
 */
type RawSnippet = Omit<NewSnippet, 'id' | 'repositoryId' | 'documentId' | 'versionId' | 'createdAt'>;
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
/**
 * Split `content` into segments that each begin at a line matching `pattern`.
 *
 * Lines preceding the first boundary form a leading segment of their own, and
 * every boundary line opens the segment it belongs to. Lines are separated on
 * `\n` and re-joined with `\n`, so concatenating the result with `\n`
 * reproduces `content`.
 *
 * @param content - Full text to split.
 * @param pattern - Regex tested against each individual line. It is never
 *   mutated; `g`/`y` flags are ignored.
 * @returns At least one segment (a single empty string for empty input).
 */
function splitAtBoundaries(content: string, pattern: RegExp): string[] {
  // A regex with the `g` or `y` flag remembers `lastIndex` between `test()`
  // calls, which would make a match on one line suppress the match on the
  // next. Test with a flag-stripped copy so each line is judged on its own.
  const matcher =
    pattern.global || pattern.sticky
      ? new RegExp(pattern.source, pattern.flags.replace(/[gy]/g, ''))
      : pattern;

  const segments: string[] = [];
  let current: string[] = [];

  for (const line of content.split('\n')) {
    if (current.length > 0 && matcher.test(line)) {
      // Close the running segment; the boundary line starts the next one.
      segments.push(current.join('\n'));
      current = [line];
    } else {
      current.push(line);
    }
  }

  if (current.length > 0) {
    segments.push(current.join('\n'));
  }
  return segments;
}
// ---------------------------------------------------------------------------
// Sliding-window fallback for code
// ---------------------------------------------------------------------------
/**
 * Fallback chunker: hand the file's lines to `chunkLines` and wrap every
 * resulting chunk in a `code` snippet.
 *
 * Chunks whose trimmed length is below `MIN_CONTENT_LENGTH` are discarded;
 * kept chunks are stored untrimmed.
 *
 * NOTE(review): the arguments 200 and 20 are presumably the window size and
 * the overlap, both in lines — confirm against `chunkLines` in the chunker.
 */
function slidingWindowChunks(content: string, filePath: string, language: string): RawSnippet[] {
  const title = basename(filePath);
  const snippets: RawSnippet[] = [];

  for (const piece of chunkLines(content.split('\n'), 200, 20)) {
    if (piece.trim().length < MIN_CONTENT_LENGTH) {
      continue;
    }
    snippets.push({
      type: 'code',
      title,
      content: piece,
      language,
      breadcrumb: filePath,
      tokenCount: estimateTokens(piece)
    });
  }

  return snippets;
}
// ---------------------------------------------------------------------------
// Config / data file parser (JSON, YAML, TOML)
// ---------------------------------------------------------------------------
/**
 * Break a config/data file (JSON, YAML, TOML) into one snippet per top-level key.
 *
 * A line counts as a top-level key when it starts at column 0 with word
 * characters, quotes or dashes and is followed by `:`, `=`, `[` or `{`.
 * Each such line opens a new segment; anything before the first key stays
 * in a leading segment. When no split happens the whole file goes through
 * the sliding-window fallback, as does any single segment whose token
 * estimate exceeds `MAX_TOKENS`.
 */
function parseConfigFile(content: string, filePath: string, language: string): RawSnippet[] {
  const keyLine = /^[\w"'\-]+\s*[:=\[{]/;
  const allLines = content.split('\n');

  // Line indices at which a segment starts. Index 0 always starts one; the
  // first line never closes a segment because nothing precedes it.
  const starts: number[] = [0];
  allLines.forEach((line, idx) => {
    if (idx > 0 && keyLine.test(line)) {
      starts.push(idx);
    }
  });

  // No structure detected: treat the file as an opaque block of lines.
  if (starts.length <= 1) {
    return slidingWindowChunks(content, filePath, language);
  }

  const title = basename(filePath);
  const snippets: RawSnippet[] = [];

  starts.forEach((from, i) => {
    // `starts[i + 1]` is undefined for the last segment, so slice runs to the end.
    const segment = allLines.slice(from, starts[i + 1]).join('\n');
    if (segment.trim().length < MIN_CONTENT_LENGTH) {
      return;
    }

    // The size check deliberately looks at the untrimmed segment.
    if (estimateTokens(segment) > MAX_TOKENS) {
      snippets.push(...slidingWindowChunks(segment, filePath, language));
      return;
    }

    const body = segment.trim();
    snippets.push({
      type: 'code',
      title,
      content: body,
      language,
      breadcrumb: filePath,
      tokenCount: estimateTokens(body)
    });
  });

  return snippets;
}
// ---------------------------------------------------------------------------
// HTML / Svelte / Vue parser
// ---------------------------------------------------------------------------
/**
 * Parse an HTML-like file (HTML, Svelte, Vue) into snippets.
 *
 * Produces, in this order:
 *  1. one `code` snippet per `<script>` block (tags removed), or several
 *     sliding-window snippets when a block exceeds `MAX_TOKENS`;
 *  2. `info` snippets holding the remaining text after `<script>` and
 *     `<style>` blocks and all other tags have been stripped.
 *
 * @param content - Raw file contents.
 * @param filePath - Used as the breadcrumb; its basename becomes the title.
 * @param language - Stored on the `code` snippets; `info` snippets get `null`.
 */
function parseHtmlLikeFile(content: string, filePath: string, language: string): RawSnippet[] {
  const snippets: RawSnippet[] = [];
  const title = basename(filePath);

  // The capture group yields the script body directly. Matching and tag
  // removal therefore share one case-insensitive pattern, so `<SCRIPT>` is
  // unwrapped the same way as `<script lang="ts">`.
  const scriptPattern = /<script(?:\s[^>]*)?>([\s\S]*?)<\/script>/gi;
  let match: RegExpExecArray | null;
  while ((match = scriptPattern.exec(content)) !== null) {
    const inner = (match[1] ?? '').trim();
    if (inner.length < MIN_CONTENT_LENGTH) {
      continue;
    }

    if (estimateTokens(inner) <= MAX_TOKENS) {
      snippets.push({
        type: 'code',
        title,
        content: inner,
        language,
        breadcrumb: filePath,
        tokenCount: estimateTokens(inner)
      });
    } else {
      snippets.push(...slidingWindowChunks(inner, filePath, language));
    }
  }

  // Drop script/style blocks wholesale, replace every remaining tag with a
  // space, then collapse whitespace runs to get the plain text content.
  const text = content
    .replace(/<script(?:\s[^>]*)?>[\s\S]*?<\/script>/gi, '')
    .replace(/<style(?:\s[^>]*)?>[\s\S]*?<\/style>/gi, '')
    .replace(/<[^>]+>/g, ' ')
    .replace(/\s{2,}/g, ' ')
    .trim();

  if (text.length >= MIN_CONTENT_LENGTH) {
    for (const chunk of chunkText(text, MAX_TOKENS, OVERLAP_TOKENS)) {
      snippets.push({
        type: 'info',
        title,
        content: chunk,
        language: null,
        breadcrumb: filePath,
        tokenCount: estimateTokens(chunk)
      });
    }
  }

  return snippets;
}
// ---------------------------------------------------------------------------
// Plain-text / RST parser
// ---------------------------------------------------------------------------
/**
 * Parse a plain-text / RST file into `info` snippets, one or more per paragraph.
 *
 * Paragraphs are separated by blank lines. Paragraphs whose trimmed length is
 * below `MIN_CONTENT_LENGTH` are dropped; the rest are trimmed and passed
 * through `chunkText` with `MAX_TOKENS` / `OVERLAP_TOKENS`.
 *
 * @param content - Raw file contents.
 * @param filePath - Used as the breadcrumb; its basename becomes the title.
 * @returns Snippets in document order, or an empty array when nothing qualifies.
 */
function parsePlainText(content: string, filePath: string): RawSnippet[] {
  // A paragraph break is a line ending followed by one or more lines that are
  // empty or contain only spaces/tabs. Accepting `\r\n` keeps files with
  // Windows line endings from collapsing into a single paragraph.
  const paragraphs = content
    .split(/\r?\n(?:[ \t]*\r?\n)+/)
    .filter((p) => p.trim().length >= MIN_CONTENT_LENGTH);
  if (paragraphs.length === 0) return [];

  const title = basename(filePath);
  const snippets: RawSnippet[] = [];

  for (const para of paragraphs) {
    for (const chunk of chunkText(para.trim(), MAX_TOKENS, OVERLAP_TOKENS)) {
      snippets.push({
        type: 'info',
        title,
        content: chunk,
        language: null,
        breadcrumb: filePath,
        tokenCount: estimateTokens(chunk)
      });
    }
  }

  return snippets;
}
// ---------------------------------------------------------------------------
// Public parser
// ---------------------------------------------------------------------------
/**
 * Parse a non-Markdown code or data file into raw snippets.
 *
 * Dispatch by `language`:
 *  - `text` → paragraph-based parser;
 *  - `json` / `yaml` / `toml` → top-level-key parser;
 *  - `html` / `svelte` / `vue` → script/text extraction;
 *  - a language with an entry in `BOUNDARY_PATTERNS` → split at declaration
 *    boundaries, one `code` snippet per segment;
 *  - anything else → line-count sliding window.
 *
 * Segments whose trimmed length is below `MIN_CONTENT_LENGTH` are dropped and
 * segments above `MAX_TOKENS` are re-chunked with the sliding window.
 *
 * @param content - Raw file contents.
 * @param filePath - Used as the breadcrumb; its basename becomes the title.
 * @param language - Language identifier; stored unchanged on the snippets.
 */
export function parseCodeFile(content: string, filePath: string, language: string): RawSnippet[] {
  // Plain text / RST
  if (language === 'text') {
    return parsePlainText(content, filePath);
  }
  // Config / data files
  if (['json', 'yaml', 'toml'].includes(language)) {
    return parseConfigFile(content, filePath, language);
  }
  // HTML-like files
  if (['html', 'svelte', 'vue'].includes(language)) {
    return parseHtmlLikeFile(content, filePath, language);
  }

  // Map alternative spellings onto the key used in BOUNDARY_PATTERNS.
  // NOTE(review): `cs` and `c#` are the assumed aliases — confirm which
  // identifier the language detector actually emits for C# files.
  const normalisedLang = language === 'cs' || language === 'c#' ? 'csharp' : language;

  // Own-property lookup: a plain index would also return members inherited
  // from Object.prototype (e.g. for the language name "constructor").
  const pattern = Object.prototype.hasOwnProperty.call(BOUNDARY_PATTERNS, normalisedLang)
    ? BOUNDARY_PATTERNS[normalisedLang]
    : undefined;

  if (!pattern) {
    // Fallback: line-count sliding window
    return slidingWindowChunks(content, filePath, language);
  }

  const title = basename(filePath);
  const breadcrumb = filePath;
  const segments = splitAtBoundaries(content, pattern);

  // A single segment whose first line is not itself a boundary means the
  // pattern matched nowhere in the file — use the sliding window instead.
  const firstLine = content.split('\n', 1)[0] ?? '';
  if (segments.length === 1 && !pattern.test(firstLine)) {
    return slidingWindowChunks(content, filePath, language);
  }

  return segments
    .filter((seg) => seg.trim().length >= MIN_CONTENT_LENGTH)
    .flatMap((seg) => {
      const trimmed = seg.trim();
      if (estimateTokens(trimmed) <= MAX_TOKENS) {
        return [
          {
            type: 'code' as const,
            title,
            content: trimmed,
            language,
            breadcrumb,
            tokenCount: estimateTokens(trimmed)
          }
        ];
      }
      // Chunk oversized segments with sliding window
      return slidingWindowChunks(trimmed, filePath, language);
    });
}