- Markdown parser with heading-based section splitting and code block extraction
- Code file parser with regex boundary detection for 10+ languages
- Sliding window chunker with configurable token limits and overlap
- Language detection from file extensions
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
303 lines · 9.4 KiB · TypeScript
/**
|
|
* Code file parser for TRUEREF-0005.
|
|
*
|
|
* Splits source-code files into function/class-level chunks using
|
|
* language-specific regex boundary detection. Falls back to a line-count
|
|
* sliding window for unrecognised languages.
|
|
*/
|
|
|
|
import { basename } from 'node:path';
|
|
import type { NewSnippet } from '$lib/server/db/schema.js';
|
|
import {
|
|
estimateTokens,
|
|
chunkLines,
|
|
chunkText,
|
|
MAX_TOKENS,
|
|
OVERLAP_TOKENS,
|
|
MIN_CONTENT_LENGTH
|
|
} from './chunker.js';
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Boundary patterns per language
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
|
|
* Each pattern must match the START of a top-level declaration line.
|
|
* The regex is tested line-by-line (multiline flag not needed).
|
|
*/
|
|
export const BOUNDARY_PATTERNS: Record<string, RegExp> = {
  // Optional export/declare/async modifiers, then any top-level declaration keyword.
  typescript: /^(export\s+)?(declare\s+)?(async\s+)?(function|class|interface|type|enum|const|let|var)\s+\w+/,
  // Same as typescript minus TS-only keywords (declare/interface/type/enum).
  javascript: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,
  // Zero-indent def/class only — nested (indented) defs are intentionally not boundaries.
  python: /^(async\s+)?(def|class)\s+\w+/,
  go: /^(func|type|var|const)\s+\w+/,
  // `pub` and `pub(crate)` visibility prefixes are optional.
  rust: /^(pub(\s*\(crate\))?\s+)?(async\s+)?(fn|impl|struct|enum|trait|type|const|static)\s+\w+/,
  // Java/C#: one or more modifiers, a (possibly generic/array) type, a name,
  // then '(' (method) or '{' (type body). Requires at least one modifier.
  java: /^(\s*(public|private|protected|static|final|abstract|synchronized)\s+)+[\w<>\[\]]+\s+\w+\s*[({]/,
  csharp: /^(\s*(public|private|protected|internal|static|override|virtual|abstract|sealed)\s+)+[\w<>\[\]]+\s+\w+\s*[({]/,
  // Kotlin/Swift: modifiers are optional (`*` not `+`), declaration keyword required.
  kotlin: /^(\s*(public|private|protected|internal|override|suspend|inline|open|abstract|sealed)\s+)*(fun|class|object|interface|data class|sealed class|enum class)\s+\w+/,
  swift: /^(\s*(public|private|internal|fileprivate|open|override|static|final|class)\s+)*(func|class|struct|enum|protocol|extension)\s+\w+/,
  ruby: /^(def|class|module)\s+\w+/
};
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Internal types
|
|
// ---------------------------------------------------------------------------
|
|
|
|
// Snippet fields produced by parsers, before the DB layer attaches the
// identifier/timestamp columns (id, repositoryId, documentId, versionId, createdAt).
type RawSnippet = Omit<NewSnippet, 'id' | 'repositoryId' | 'documentId' | 'versionId' | 'createdAt'>;
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Helpers
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
|
|
* Split `content` at lines that match `pattern`, returning the segments
|
|
* between boundaries (each segment includes its opening boundary line).
|
|
*/
|
|
function splitAtBoundaries(content: string, pattern: RegExp): string[] {
|
|
const lines = content.split('\n');
|
|
const segments: string[] = [];
|
|
let current: string[] = [];
|
|
|
|
for (const line of lines) {
|
|
if (pattern.test(line) && current.length > 0) {
|
|
// Emit what we have, start a new segment from this boundary line
|
|
segments.push(current.join('\n'));
|
|
current = [line];
|
|
} else {
|
|
current.push(line);
|
|
}
|
|
}
|
|
|
|
if (current.length > 0) {
|
|
segments.push(current.join('\n'));
|
|
}
|
|
|
|
return segments;
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Sliding-window fallback for code
|
|
// ---------------------------------------------------------------------------
|
|
|
|
function slidingWindowChunks(content: string, filePath: string, language: string): RawSnippet[] {
|
|
const lines = content.split('\n');
|
|
const windowedChunks = chunkLines(lines, 200, 20);
|
|
return windowedChunks
|
|
.filter((chunk) => chunk.trim().length >= MIN_CONTENT_LENGTH)
|
|
.map((chunk) => ({
|
|
type: 'code' as const,
|
|
title: basename(filePath),
|
|
content: chunk,
|
|
language,
|
|
breadcrumb: filePath,
|
|
tokenCount: estimateTokens(chunk)
|
|
}));
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Config / data file parser (JSON, YAML, TOML)
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
|
|
* Chunk config/data files by splitting on top-level keys.
|
|
*
|
|
* Strategy: find lines that look like top-level keys (zero indentation,
|
|
* followed by colon/equals/brace) and treat each as a boundary.
|
|
*/
|
|
function parseConfigFile(content: string, filePath: string, language: string): RawSnippet[] {
|
|
const topLevelKey = /^[\w"'\-]+\s*[:=\[{]/;
|
|
const lines = content.split('\n');
|
|
const segments: string[] = [];
|
|
let current: string[] = [];
|
|
|
|
for (const line of lines) {
|
|
if (topLevelKey.test(line) && current.length > 0) {
|
|
segments.push(current.join('\n'));
|
|
current = [line];
|
|
} else {
|
|
current.push(line);
|
|
}
|
|
}
|
|
if (current.length > 0) segments.push(current.join('\n'));
|
|
|
|
// If we got only one segment (no structure detected), fall back to sliding window
|
|
if (segments.length <= 1) {
|
|
return slidingWindowChunks(content, filePath, language);
|
|
}
|
|
|
|
return segments
|
|
.filter((seg) => seg.trim().length >= MIN_CONTENT_LENGTH)
|
|
.flatMap((seg) => {
|
|
if (estimateTokens(seg) <= MAX_TOKENS) {
|
|
return [
|
|
{
|
|
type: 'code' as const,
|
|
title: basename(filePath),
|
|
content: seg.trim(),
|
|
language,
|
|
breadcrumb: filePath,
|
|
tokenCount: estimateTokens(seg.trim())
|
|
}
|
|
];
|
|
}
|
|
return slidingWindowChunks(seg, filePath, language);
|
|
});
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// HTML / Svelte / Vue parser
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
|
|
* Extract script blocks and text content from HTML-like files.
|
|
*/
|
|
function parseHtmlLikeFile(content: string, filePath: string, language: string): RawSnippet[] {
|
|
const snippets: RawSnippet[] = [];
|
|
const title = basename(filePath);
|
|
|
|
// Extract <script> blocks (including <script lang="ts">)
|
|
const scriptPattern = /<script(?:\s[^>]*)?>[\s\S]*?<\/script>/gi;
|
|
let match: RegExpExecArray | null;
|
|
const scriptBlocks: string[] = [];
|
|
|
|
while ((match = scriptPattern.exec(content)) !== null) {
|
|
// Strip the outer tags, keep just the code
|
|
const inner = match[0].replace(/^<script[^>]*>/, '').replace(/<\/script>$/, '').trim();
|
|
if (inner.length >= MIN_CONTENT_LENGTH) {
|
|
scriptBlocks.push(inner);
|
|
}
|
|
}
|
|
|
|
for (const block of scriptBlocks) {
|
|
if (estimateTokens(block) <= MAX_TOKENS) {
|
|
snippets.push({
|
|
type: 'code',
|
|
title,
|
|
content: block,
|
|
language,
|
|
breadcrumb: filePath,
|
|
tokenCount: estimateTokens(block)
|
|
});
|
|
} else {
|
|
snippets.push(...slidingWindowChunks(block, filePath, language));
|
|
}
|
|
}
|
|
|
|
// Strip tags and extract text content for info snippets
|
|
const text = content
|
|
.replace(/<script(?:\s[^>]*)?>[\s\S]*?<\/script>/gi, '')
|
|
.replace(/<style(?:\s[^>]*)?>[\s\S]*?<\/style>/gi, '')
|
|
.replace(/<[^>]+>/g, ' ')
|
|
.replace(/\s{2,}/g, ' ')
|
|
.trim();
|
|
|
|
if (text.length >= MIN_CONTENT_LENGTH) {
|
|
const chunks = chunkText(text, MAX_TOKENS, OVERLAP_TOKENS);
|
|
for (const chunk of chunks) {
|
|
snippets.push({
|
|
type: 'info',
|
|
title,
|
|
content: chunk,
|
|
language: null,
|
|
breadcrumb: filePath,
|
|
tokenCount: estimateTokens(chunk)
|
|
});
|
|
}
|
|
}
|
|
|
|
return snippets;
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Plain-text / RST parser
|
|
// ---------------------------------------------------------------------------
|
|
|
|
function parsePlainText(content: string, filePath: string): RawSnippet[] {
|
|
// Split on blank lines (paragraph boundaries)
|
|
const paragraphs = content.split(/\n{2,}/).filter((p) => p.trim().length >= MIN_CONTENT_LENGTH);
|
|
|
|
if (paragraphs.length === 0) return [];
|
|
|
|
const title = basename(filePath);
|
|
const snippets: RawSnippet[] = [];
|
|
|
|
for (const para of paragraphs) {
|
|
const chunks = chunkText(para.trim(), MAX_TOKENS, OVERLAP_TOKENS);
|
|
for (const chunk of chunks) {
|
|
snippets.push({
|
|
type: 'info',
|
|
title,
|
|
content: chunk,
|
|
language: null,
|
|
breadcrumb: filePath,
|
|
tokenCount: estimateTokens(chunk)
|
|
});
|
|
}
|
|
}
|
|
|
|
return snippets;
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Public parser
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
|
|
* Parse a non-Markdown code or data file into raw snippets.
|
|
*/
|
|
export function parseCodeFile(content: string, filePath: string, language: string): RawSnippet[] {
|
|
// Plain text / RST
|
|
if (language === 'text') {
|
|
return parsePlainText(content, filePath);
|
|
}
|
|
|
|
// Config / data files
|
|
if (['json', 'yaml', 'toml'].includes(language)) {
|
|
return parseConfigFile(content, filePath, language);
|
|
}
|
|
|
|
// HTML-like files
|
|
if (['html', 'svelte', 'vue'].includes(language)) {
|
|
return parseHtmlLikeFile(content, filePath, language);
|
|
}
|
|
|
|
// Normalise csharp alias
|
|
const normalisedLang = language === 'csharp' ? 'csharp' : language;
|
|
|
|
const pattern = BOUNDARY_PATTERNS[normalisedLang];
|
|
const title = basename(filePath);
|
|
const breadcrumb = filePath;
|
|
|
|
if (!pattern) {
|
|
// Fallback: line-count sliding window
|
|
return slidingWindowChunks(content, filePath, language);
|
|
}
|
|
|
|
const segments = splitAtBoundaries(content, pattern);
|
|
|
|
// If boundary detection produced only one segment covering the whole file,
|
|
// it means no boundaries matched — fall back to sliding window.
|
|
if (segments.length === 1 && !pattern.test(content.split('\n')[0])) {
|
|
return slidingWindowChunks(content, filePath, language);
|
|
}
|
|
|
|
return segments
|
|
.filter((seg) => seg.trim().length >= MIN_CONTENT_LENGTH)
|
|
.flatMap((seg) => {
|
|
const trimmed = seg.trim();
|
|
if (estimateTokens(trimmed) <= MAX_TOKENS) {
|
|
return [
|
|
{
|
|
type: 'code' as const,
|
|
title,
|
|
content: trimmed,
|
|
language,
|
|
breadcrumb,
|
|
tokenCount: estimateTokens(trimmed)
|
|
}
|
|
];
|
|
}
|
|
// Chunk oversized segments with sliding window
|
|
return slidingWindowChunks(trimmed, filePath, language);
|
|
});
|
|
}
|