feat(TRUEREF-0005): implement document parser and chunker
- Markdown parser with heading-based section splitting and code block extraction
- Code file parser with regex boundary detection for 10+ languages
- Sliding window chunker with configurable token limits and overlap
- Language detection from file extensions

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
302
src/lib/server/parser/code.parser.ts
Normal file
302
src/lib/server/parser/code.parser.ts
Normal file
@@ -0,0 +1,302 @@
|
||||
/**
|
||||
* Code file parser for TRUEREF-0005.
|
||||
*
|
||||
* Splits source-code files into function/class-level chunks using
|
||||
* language-specific regex boundary detection. Falls back to a line-count
|
||||
* sliding window for unrecognised languages.
|
||||
*/
|
||||
|
||||
import { basename } from 'node:path';
|
||||
import type { NewSnippet } from '$lib/server/db/schema.js';
|
||||
import {
|
||||
estimateTokens,
|
||||
chunkLines,
|
||||
chunkText,
|
||||
MAX_TOKENS,
|
||||
OVERLAP_TOKENS,
|
||||
MIN_CONTENT_LENGTH
|
||||
} from './chunker.js';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Boundary patterns per language
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Each pattern must match the START of a top-level declaration line.
|
||||
* The regex is tested line-by-line (multiline flag not needed).
|
||||
*/
|
||||
export const BOUNDARY_PATTERNS: Record<string, RegExp> = {
|
||||
typescript: /^(export\s+)?(declare\s+)?(async\s+)?(function|class|interface|type|enum|const|let|var)\s+\w+/,
|
||||
javascript: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,
|
||||
python: /^(async\s+)?(def|class)\s+\w+/,
|
||||
go: /^(func|type|var|const)\s+\w+/,
|
||||
rust: /^(pub(\s*\(crate\))?\s+)?(async\s+)?(fn|impl|struct|enum|trait|type|const|static)\s+\w+/,
|
||||
java: /^(\s*(public|private|protected|static|final|abstract|synchronized)\s+)+[\w<>\[\]]+\s+\w+\s*[({]/,
|
||||
csharp: /^(\s*(public|private|protected|internal|static|override|virtual|abstract|sealed)\s+)+[\w<>\[\]]+\s+\w+\s*[({]/,
|
||||
kotlin: /^(\s*(public|private|protected|internal|override|suspend|inline|open|abstract|sealed)\s+)*(fun|class|object|interface|data class|sealed class|enum class)\s+\w+/,
|
||||
swift: /^(\s*(public|private|internal|fileprivate|open|override|static|final|class)\s+)*(func|class|struct|enum|protocol|extension)\s+\w+/,
|
||||
ruby: /^(def|class|module)\s+\w+/
|
||||
};
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Internal types
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Snippet shape produced by the parsers in this module: a NewSnippet with
// the persistence metadata (ids, version, timestamp) stripped — those keys
// are attached later by the database layer before insertion.
type RawSnippet = Omit<NewSnippet, 'id' | 'repositoryId' | 'documentId' | 'versionId' | 'createdAt'>;
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Split `content` at lines that match `pattern`, returning the segments
|
||||
* between boundaries (each segment includes its opening boundary line).
|
||||
*/
|
||||
function splitAtBoundaries(content: string, pattern: RegExp): string[] {
|
||||
const lines = content.split('\n');
|
||||
const segments: string[] = [];
|
||||
let current: string[] = [];
|
||||
|
||||
for (const line of lines) {
|
||||
if (pattern.test(line) && current.length > 0) {
|
||||
// Emit what we have, start a new segment from this boundary line
|
||||
segments.push(current.join('\n'));
|
||||
current = [line];
|
||||
} else {
|
||||
current.push(line);
|
||||
}
|
||||
}
|
||||
|
||||
if (current.length > 0) {
|
||||
segments.push(current.join('\n'));
|
||||
}
|
||||
|
||||
return segments;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Sliding-window fallback for code
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
function slidingWindowChunks(content: string, filePath: string, language: string): RawSnippet[] {
|
||||
const lines = content.split('\n');
|
||||
const windowedChunks = chunkLines(lines, 200, 20);
|
||||
return windowedChunks
|
||||
.filter((chunk) => chunk.trim().length >= MIN_CONTENT_LENGTH)
|
||||
.map((chunk) => ({
|
||||
type: 'code' as const,
|
||||
title: basename(filePath),
|
||||
content: chunk,
|
||||
language,
|
||||
breadcrumb: filePath,
|
||||
tokenCount: estimateTokens(chunk)
|
||||
}));
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Config / data file parser (JSON, YAML, TOML)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Chunk config/data files by splitting on top-level keys.
|
||||
*
|
||||
* Strategy: find lines that look like top-level keys (zero indentation,
|
||||
* followed by colon/equals/brace) and treat each as a boundary.
|
||||
*/
|
||||
function parseConfigFile(content: string, filePath: string, language: string): RawSnippet[] {
|
||||
const topLevelKey = /^[\w"'\-]+\s*[:=\[{]/;
|
||||
const lines = content.split('\n');
|
||||
const segments: string[] = [];
|
||||
let current: string[] = [];
|
||||
|
||||
for (const line of lines) {
|
||||
if (topLevelKey.test(line) && current.length > 0) {
|
||||
segments.push(current.join('\n'));
|
||||
current = [line];
|
||||
} else {
|
||||
current.push(line);
|
||||
}
|
||||
}
|
||||
if (current.length > 0) segments.push(current.join('\n'));
|
||||
|
||||
// If we got only one segment (no structure detected), fall back to sliding window
|
||||
if (segments.length <= 1) {
|
||||
return slidingWindowChunks(content, filePath, language);
|
||||
}
|
||||
|
||||
return segments
|
||||
.filter((seg) => seg.trim().length >= MIN_CONTENT_LENGTH)
|
||||
.flatMap((seg) => {
|
||||
if (estimateTokens(seg) <= MAX_TOKENS) {
|
||||
return [
|
||||
{
|
||||
type: 'code' as const,
|
||||
title: basename(filePath),
|
||||
content: seg.trim(),
|
||||
language,
|
||||
breadcrumb: filePath,
|
||||
tokenCount: estimateTokens(seg.trim())
|
||||
}
|
||||
];
|
||||
}
|
||||
return slidingWindowChunks(seg, filePath, language);
|
||||
});
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// HTML / Svelte / Vue parser
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Extract script blocks and text content from HTML-like files.
|
||||
*/
|
||||
function parseHtmlLikeFile(content: string, filePath: string, language: string): RawSnippet[] {
|
||||
const snippets: RawSnippet[] = [];
|
||||
const title = basename(filePath);
|
||||
|
||||
// Extract <script> blocks (including <script lang="ts">)
|
||||
const scriptPattern = /<script(?:\s[^>]*)?>[\s\S]*?<\/script>/gi;
|
||||
let match: RegExpExecArray | null;
|
||||
const scriptBlocks: string[] = [];
|
||||
|
||||
while ((match = scriptPattern.exec(content)) !== null) {
|
||||
// Strip the outer tags, keep just the code
|
||||
const inner = match[0].replace(/^<script[^>]*>/, '').replace(/<\/script>$/, '').trim();
|
||||
if (inner.length >= MIN_CONTENT_LENGTH) {
|
||||
scriptBlocks.push(inner);
|
||||
}
|
||||
}
|
||||
|
||||
for (const block of scriptBlocks) {
|
||||
if (estimateTokens(block) <= MAX_TOKENS) {
|
||||
snippets.push({
|
||||
type: 'code',
|
||||
title,
|
||||
content: block,
|
||||
language,
|
||||
breadcrumb: filePath,
|
||||
tokenCount: estimateTokens(block)
|
||||
});
|
||||
} else {
|
||||
snippets.push(...slidingWindowChunks(block, filePath, language));
|
||||
}
|
||||
}
|
||||
|
||||
// Strip tags and extract text content for info snippets
|
||||
const text = content
|
||||
.replace(/<script(?:\s[^>]*)?>[\s\S]*?<\/script>/gi, '')
|
||||
.replace(/<style(?:\s[^>]*)?>[\s\S]*?<\/style>/gi, '')
|
||||
.replace(/<[^>]+>/g, ' ')
|
||||
.replace(/\s{2,}/g, ' ')
|
||||
.trim();
|
||||
|
||||
if (text.length >= MIN_CONTENT_LENGTH) {
|
||||
const chunks = chunkText(text, MAX_TOKENS, OVERLAP_TOKENS);
|
||||
for (const chunk of chunks) {
|
||||
snippets.push({
|
||||
type: 'info',
|
||||
title,
|
||||
content: chunk,
|
||||
language: null,
|
||||
breadcrumb: filePath,
|
||||
tokenCount: estimateTokens(chunk)
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return snippets;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Plain-text / RST parser
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
function parsePlainText(content: string, filePath: string): RawSnippet[] {
|
||||
// Split on blank lines (paragraph boundaries)
|
||||
const paragraphs = content.split(/\n{2,}/).filter((p) => p.trim().length >= MIN_CONTENT_LENGTH);
|
||||
|
||||
if (paragraphs.length === 0) return [];
|
||||
|
||||
const title = basename(filePath);
|
||||
const snippets: RawSnippet[] = [];
|
||||
|
||||
for (const para of paragraphs) {
|
||||
const chunks = chunkText(para.trim(), MAX_TOKENS, OVERLAP_TOKENS);
|
||||
for (const chunk of chunks) {
|
||||
snippets.push({
|
||||
type: 'info',
|
||||
title,
|
||||
content: chunk,
|
||||
language: null,
|
||||
breadcrumb: filePath,
|
||||
tokenCount: estimateTokens(chunk)
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return snippets;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Public parser
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Parse a non-Markdown code or data file into raw snippets.
|
||||
*/
|
||||
export function parseCodeFile(content: string, filePath: string, language: string): RawSnippet[] {
|
||||
// Plain text / RST
|
||||
if (language === 'text') {
|
||||
return parsePlainText(content, filePath);
|
||||
}
|
||||
|
||||
// Config / data files
|
||||
if (['json', 'yaml', 'toml'].includes(language)) {
|
||||
return parseConfigFile(content, filePath, language);
|
||||
}
|
||||
|
||||
// HTML-like files
|
||||
if (['html', 'svelte', 'vue'].includes(language)) {
|
||||
return parseHtmlLikeFile(content, filePath, language);
|
||||
}
|
||||
|
||||
// Normalise csharp alias
|
||||
const normalisedLang = language === 'csharp' ? 'csharp' : language;
|
||||
|
||||
const pattern = BOUNDARY_PATTERNS[normalisedLang];
|
||||
const title = basename(filePath);
|
||||
const breadcrumb = filePath;
|
||||
|
||||
if (!pattern) {
|
||||
// Fallback: line-count sliding window
|
||||
return slidingWindowChunks(content, filePath, language);
|
||||
}
|
||||
|
||||
const segments = splitAtBoundaries(content, pattern);
|
||||
|
||||
// If boundary detection produced only one segment covering the whole file,
|
||||
// it means no boundaries matched — fall back to sliding window.
|
||||
if (segments.length === 1 && !pattern.test(content.split('\n')[0])) {
|
||||
return slidingWindowChunks(content, filePath, language);
|
||||
}
|
||||
|
||||
return segments
|
||||
.filter((seg) => seg.trim().length >= MIN_CONTENT_LENGTH)
|
||||
.flatMap((seg) => {
|
||||
const trimmed = seg.trim();
|
||||
if (estimateTokens(trimmed) <= MAX_TOKENS) {
|
||||
return [
|
||||
{
|
||||
type: 'code' as const,
|
||||
title,
|
||||
content: trimmed,
|
||||
language,
|
||||
breadcrumb,
|
||||
tokenCount: estimateTokens(trimmed)
|
||||
}
|
||||
];
|
||||
}
|
||||
// Chunk oversized segments with sliding window
|
||||
return slidingWindowChunks(trimmed, filePath, language);
|
||||
});
|
||||
}
|
||||
Reference in New Issue
Block a user