- Markdown parser with heading-based section splitting and code block extraction
- Code file parser with regex boundary detection for 10+ languages
- Sliding window chunker with configurable token limits and overlap
- Language detection from file extensions
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
303 lines · 9.4 KiB · TypeScript
/**
|
|
* Code file parser for TRUEREF-0005.
|
|
*
|
|
* Splits source-code files into function/class-level chunks using
|
|
* language-specific regex boundary detection. Falls back to a line-count
|
|
* sliding window for unrecognised languages.
|
|
*/
|
|
|
|
import { basename } from 'node:path';
|
|
import type { NewSnippet } from '$lib/server/db/schema.js';
|
|
import {
|
|
estimateTokens,
|
|
chunkLines,
|
|
chunkText,
|
|
MAX_TOKENS,
|
|
OVERLAP_TOKENS,
|
|
MIN_CONTENT_LENGTH
|
|
} from './chunker.js';
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Boundary patterns per language
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
|
|
* Each pattern must match the START of a top-level declaration line.
|
|
* The regex is tested line-by-line (multiline flag not needed).
|
|
*/
|
|
export const BOUNDARY_PATTERNS: Record<string, RegExp> = {
  // Optional export/declare/async modifiers, then any top-level declaration keyword.
  typescript: /^(export\s+)?(declare\s+)?(async\s+)?(function|class|interface|type|enum|const|let|var)\s+\w+/,
  // Same as typescript minus TS-only keywords (declare/interface/type/enum).
  javascript: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,
  // Zero-indent def/class only — nested (indented) defs are intentionally not boundaries.
  python: /^(async\s+)?(def|class)\s+\w+/,
  go: /^(func|type|var|const)\s+\w+/,
  // `pub` and `pub(crate)` visibility prefixes are optional.
  rust: /^(pub(\s*\(crate\))?\s+)?(async\s+)?(fn|impl|struct|enum|trait|type|const|static)\s+\w+/,
  // Java/C#: one or more modifiers, a (possibly generic/array) type, a name,
  // then '(' (method) or '{' (type body). Requires at least one modifier.
  java: /^(\s*(public|private|protected|static|final|abstract|synchronized)\s+)+[\w<>\[\]]+\s+\w+\s*[({]/,
  csharp: /^(\s*(public|private|protected|internal|static|override|virtual|abstract|sealed)\s+)+[\w<>\[\]]+\s+\w+\s*[({]/,
  // Kotlin/Swift: modifiers are optional (`*` not `+`), declaration keyword required.
  kotlin: /^(\s*(public|private|protected|internal|override|suspend|inline|open|abstract|sealed)\s+)*(fun|class|object|interface|data class|sealed class|enum class)\s+\w+/,
  swift: /^(\s*(public|private|internal|fileprivate|open|override|static|final|class)\s+)*(func|class|struct|enum|protocol|extension)\s+\w+/,
  ruby: /^(def|class|module)\s+\w+/
};
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Internal types
|
|
// ---------------------------------------------------------------------------
|
|
|
|
// Snippet fields produced by parsers, before the DB layer attaches the
// identifier/timestamp columns (id, repositoryId, documentId, versionId, createdAt).
type RawSnippet = Omit<NewSnippet, 'id' | 'repositoryId' | 'documentId' | 'versionId' | 'createdAt'>;
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Helpers
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
|
|
* Split `content` at lines that match `pattern`, returning the segments
|
|
* between boundaries (each segment includes its opening boundary line).
|
|
*/
|
|
function splitAtBoundaries(content: string, pattern: RegExp): string[] {
|
|
const lines = content.split('\n');
|
|
const segments: string[] = [];
|
|
let current: string[] = [];
|
|
|
|
for (const line of lines) {
|
|
if (pattern.test(line) && current.length > 0) {
|
|
// Emit what we have, start a new segment from this boundary line
|
|
segments.push(current.join('\n'));
|
|
current = [line];
|
|
} else {
|
|
current.push(line);
|
|
}
|
|
}
|
|
|
|
if (current.length > 0) {
|
|
segments.push(current.join('\n'));
|
|
}
|
|
|
|
return segments;
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Sliding-window fallback for code
|
|
// ---------------------------------------------------------------------------
|
|
|
|
function slidingWindowChunks(content: string, filePath: string, language: string): RawSnippet[] {
|
|
const lines = content.split('\n');
|
|
const windowedChunks = chunkLines(lines, 200, 20);
|
|
return windowedChunks
|
|
.filter((chunk) => chunk.trim().length >= MIN_CONTENT_LENGTH)
|
|
.map((chunk) => ({
|
|
type: 'code' as const,
|
|
title: basename(filePath),
|
|
content: chunk,
|
|
language,
|
|
breadcrumb: filePath,
|
|
tokenCount: estimateTokens(chunk)
|
|
}));
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Config / data file parser (JSON, YAML, TOML)
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
|
|
* Chunk config/data files by splitting on top-level keys.
|
|
*
|
|
* Strategy: find lines that look like top-level keys (zero indentation,
|
|
* followed by colon/equals/brace) and treat each as a boundary.
|
|
*/
|
|
function parseConfigFile(content: string, filePath: string, language: string): RawSnippet[] {
|
|
const topLevelKey = /^[\w"'\-]+\s*[:=\[{]/;
|
|
const lines = content.split('\n');
|
|
const segments: string[] = [];
|
|
let current: string[] = [];
|
|
|
|
for (const line of lines) {
|
|
if (topLevelKey.test(line) && current.length > 0) {
|
|
segments.push(current.join('\n'));
|
|
current = [line];
|
|
} else {
|
|
current.push(line);
|
|
}
|
|
}
|
|
if (current.length > 0) segments.push(current.join('\n'));
|
|
|
|
// If we got only one segment (no structure detected), fall back to sliding window
|
|
if (segments.length <= 1) {
|
|
return slidingWindowChunks(content, filePath, language);
|
|
}
|
|
|
|
return segments
|
|
.filter((seg) => seg.trim().length >= MIN_CONTENT_LENGTH)
|
|
.flatMap((seg) => {
|
|
if (estimateTokens(seg) <= MAX_TOKENS) {
|
|
return [
|
|
{
|
|
type: 'code' as const,
|
|
title: basename(filePath),
|
|
content: seg.trim(),
|
|
language,
|
|
breadcrumb: filePath,
|
|
tokenCount: estimateTokens(seg.trim())
|
|
}
|
|
];
|
|
}
|
|
return slidingWindowChunks(seg, filePath, language);
|
|
});
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// HTML / Svelte / Vue parser
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
|
|
* Extract script blocks and text content from HTML-like files.
|
|
*/
|
|
function parseHtmlLikeFile(content: string, filePath: string, language: string): RawSnippet[] {
|
|
const snippets: RawSnippet[] = [];
|
|
const title = basename(filePath);
|
|
|
|
// Extract <script> blocks (including <script lang="ts">)
|
|
const scriptPattern = /<script(?:\s[^>]*)?>[\s\S]*?<\/script>/gi;
|
|
let match: RegExpExecArray | null;
|
|
const scriptBlocks: string[] = [];
|
|
|
|
while ((match = scriptPattern.exec(content)) !== null) {
|
|
// Strip the outer tags, keep just the code
|
|
const inner = match[0].replace(/^<script[^>]*>/, '').replace(/<\/script>$/, '').trim();
|
|
if (inner.length >= MIN_CONTENT_LENGTH) {
|
|
scriptBlocks.push(inner);
|
|
}
|
|
}
|
|
|
|
for (const block of scriptBlocks) {
|
|
if (estimateTokens(block) <= MAX_TOKENS) {
|
|
snippets.push({
|
|
type: 'code',
|
|
title,
|
|
content: block,
|
|
language,
|
|
breadcrumb: filePath,
|
|
tokenCount: estimateTokens(block)
|
|
});
|
|
} else {
|
|
snippets.push(...slidingWindowChunks(block, filePath, language));
|
|
}
|
|
}
|
|
|
|
// Strip tags and extract text content for info snippets
|
|
const text = content
|
|
.replace(/<script(?:\s[^>]*)?>[\s\S]*?<\/script>/gi, '')
|
|
.replace(/<style(?:\s[^>]*)?>[\s\S]*?<\/style>/gi, '')
|
|
.replace(/<[^>]+>/g, ' ')
|
|
.replace(/\s{2,}/g, ' ')
|
|
.trim();
|
|
|
|
if (text.length >= MIN_CONTENT_LENGTH) {
|
|
const chunks = chunkText(text, MAX_TOKENS, OVERLAP_TOKENS);
|
|
for (const chunk of chunks) {
|
|
snippets.push({
|
|
type: 'info',
|
|
title,
|
|
content: chunk,
|
|
language: null,
|
|
breadcrumb: filePath,
|
|
tokenCount: estimateTokens(chunk)
|
|
});
|
|
}
|
|
}
|
|
|
|
return snippets;
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Plain-text / RST parser
|
|
// ---------------------------------------------------------------------------
|
|
|
|
function parsePlainText(content: string, filePath: string): RawSnippet[] {
|
|
// Split on blank lines (paragraph boundaries)
|
|
const paragraphs = content.split(/\n{2,}/).filter((p) => p.trim().length >= MIN_CONTENT_LENGTH);
|
|
|
|
if (paragraphs.length === 0) return [];
|
|
|
|
const title = basename(filePath);
|
|
const snippets: RawSnippet[] = [];
|
|
|
|
for (const para of paragraphs) {
|
|
const chunks = chunkText(para.trim(), MAX_TOKENS, OVERLAP_TOKENS);
|
|
for (const chunk of chunks) {
|
|
snippets.push({
|
|
type: 'info',
|
|
title,
|
|
content: chunk,
|
|
language: null,
|
|
breadcrumb: filePath,
|
|
tokenCount: estimateTokens(chunk)
|
|
});
|
|
}
|
|
}
|
|
|
|
return snippets;
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Public parser
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
|
|
* Parse a non-Markdown code or data file into raw snippets.
|
|
*/
|
|
export function parseCodeFile(content: string, filePath: string, language: string): RawSnippet[] {
|
|
// Plain text / RST
|
|
if (language === 'text') {
|
|
return parsePlainText(content, filePath);
|
|
}
|
|
|
|
// Config / data files
|
|
if (['json', 'yaml', 'toml'].includes(language)) {
|
|
return parseConfigFile(content, filePath, language);
|
|
}
|
|
|
|
// HTML-like files
|
|
if (['html', 'svelte', 'vue'].includes(language)) {
|
|
return parseHtmlLikeFile(content, filePath, language);
|
|
}
|
|
|
|
// Normalise csharp alias
|
|
const normalisedLang = language === 'csharp' ? 'csharp' : language;
|
|
|
|
const pattern = BOUNDARY_PATTERNS[normalisedLang];
|
|
const title = basename(filePath);
|
|
const breadcrumb = filePath;
|
|
|
|
if (!pattern) {
|
|
// Fallback: line-count sliding window
|
|
return slidingWindowChunks(content, filePath, language);
|
|
}
|
|
|
|
const segments = splitAtBoundaries(content, pattern);
|
|
|
|
// If boundary detection produced only one segment covering the whole file,
|
|
// it means no boundaries matched — fall back to sliding window.
|
|
if (segments.length === 1 && !pattern.test(content.split('\n')[0])) {
|
|
return slidingWindowChunks(content, filePath, language);
|
|
}
|
|
|
|
return segments
|
|
.filter((seg) => seg.trim().length >= MIN_CONTENT_LENGTH)
|
|
.flatMap((seg) => {
|
|
const trimmed = seg.trim();
|
|
if (estimateTokens(trimmed) <= MAX_TOKENS) {
|
|
return [
|
|
{
|
|
type: 'code' as const,
|
|
title,
|
|
content: trimmed,
|
|
language,
|
|
breadcrumb,
|
|
tokenCount: estimateTokens(trimmed)
|
|
}
|
|
];
|
|
}
|
|
// Chunk oversized segments with sliding window
|
|
return slidingWindowChunks(trimmed, filePath, language);
|
|
});
|
|
}
|