feat(TRUEREF-0005): implement document parser and chunker

- Markdown parser with heading-based section splitting and code block extraction
- Code file parser with regex boundary detection for 10+ languages
- Sliding window chunker with configurable token limits and overlap
- Language detection from file extensions

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Giancarmine Salucci
2026-03-23 09:06:12 +01:00
parent 1c15d6c474
commit f6be3cfd47
7 changed files with 1350 additions and 0 deletions

View File

@@ -0,0 +1,92 @@
/**
* Text chunking utilities for the document parser (TRUEREF-0005).
*
* Provides sliding-window chunking with overlap and token estimation.
*/
// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------
/** Hard upper bound on the estimated token count of a single chunk. */
export const MAX_TOKENS = 512;
/** Estimated tokens shared between neighbouring chunks. */
export const OVERLAP_TOKENS = 50;
/** Minimum segment length, in characters, worth keeping as a snippet. */
export const MIN_CONTENT_LENGTH = 20;
// ---------------------------------------------------------------------------
// Token estimation
// ---------------------------------------------------------------------------
/**
 * Roughly estimate how many tokens `text` occupies.
 *
 * Character-count heuristic: ~3.5 characters per token on average for mixed
 * prose and source code; always rounds up, so non-empty text is >= 1 token.
 */
export function estimateTokens(text: string): number {
  const CHARS_PER_TOKEN = 3.5;
  return Math.ceil(text.length / CHARS_PER_TOKEN);
}
// ---------------------------------------------------------------------------
// Sliding-window chunker
// ---------------------------------------------------------------------------
/**
 * Split `text` into overlapping word-based chunks that stay within the token
 * budget. Returns at least one chunk when the text fits in a single window.
 *
 * @param text          source text; split on runs of whitespace
 * @param maxTokens     token budget per chunk (converted to a word budget)
 * @param overlapTokens tokens repeated between consecutive chunks
 * @returns one string per window; empty array for whitespace-only input
 */
export function chunkText(
  text: string,
  maxTokens: number = MAX_TOKENS,
  overlapTokens: number = OVERLAP_TOKENS
): string[] {
  const words = text.split(/\s+/).filter((w) => w.length > 0);
  if (words.length === 0) return [];
  // ~0.75 words per token
  const maxWords = Math.max(1, Math.floor(maxTokens * 0.75));
  const overlapWords = Math.max(0, Math.floor(overlapTokens * 0.75));
  if (words.length <= maxWords) {
    return [words.join(' ')];
  }
  const chunks: string[] = [];
  let start = 0;
  while (start < words.length) {
    const end = Math.min(start + maxWords, words.length);
    chunks.push(words.slice(start, end).join(' '));
    if (end === words.length) break;
    // Step back by the overlap but always advance past the previous window
    // start. The old guard (`if (start <= 0) start = end`) only protected the
    // first window, so overlapWords >= maxWords looped forever afterwards
    // (start recomputed to the same value on every later iteration).
    const next = end - overlapWords;
    start = next > start ? next : end;
  }
  return chunks;
}
// ---------------------------------------------------------------------------
// Line-count sliding window (for code files without recognised boundaries)
// ---------------------------------------------------------------------------
/**
 * Split `lines` into groups of at most `maxLines`, with `overlapLines` lines
 * repeated between consecutive groups.
 *
 * @returns one string per window (lines re-joined with '\n'); empty array for
 *          empty input.
 */
export function chunkLines(
  lines: string[],
  maxLines: number = 200,
  overlapLines: number = 20
): string[] {
  if (lines.length === 0) return [];
  if (lines.length <= maxLines) return [lines.join('\n')];
  const chunks: string[] = [];
  let start = 0;
  while (start < lines.length) {
    const end = Math.min(start + maxLines, lines.length);
    chunks.push(lines.slice(start, end).join('\n'));
    if (end === lines.length) break;
    // Step back by the overlap but always advance past the previous window
    // start; the old `start <= 0` guard only protected the first window, so
    // overlapLines >= maxLines looped forever on subsequent iterations.
    const next = end - overlapLines;
    start = next > start ? next : end;
  }
  return chunks;
}

View File

@@ -0,0 +1,404 @@
/**
* Unit tests for the code file parser (TRUEREF-0005).
*/
import { describe, it, expect } from 'vitest';
import { parseCodeFile, BOUNDARY_PATTERNS } from './code.parser.js';
import { estimateTokens, MAX_TOKENS } from './chunker.js';
import { parseFile } from './index.js';
import type { CrawledFile } from '$lib/server/crawler/types.js';
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
/** Build a minimal CrawledFile fixture with a fixed sha and derived size. */
function makeFile(path: string, content: string, language = 'typescript'): CrawledFile {
  const size = content.length;
  return { path, content, size, sha: 'abc123', language };
}
// ---------------------------------------------------------------------------
// TypeScript / JavaScript boundary detection
// ---------------------------------------------------------------------------
// TypeScript boundary detection: each exported declaration should open a new
// snippet, and snippet metadata should reflect the source path.
describe('parseCodeFile — TypeScript', () => {
  it('splits at function boundaries', () => {
    const content = `
export function foo(): string {
return 'foo';
}
export function bar(x: number): number {
return x * 2;
}
`.trim();
    const snippets = parseCodeFile(content, 'utils.ts', 'typescript');
    expect(snippets.length).toBeGreaterThanOrEqual(2);
    expect(snippets.every((s) => s.type === 'code')).toBe(true);
    expect(snippets.some((s) => s.content.includes('function foo'))).toBe(true);
    expect(snippets.some((s) => s.content.includes('function bar'))).toBe(true);
  });
  it('splits at class boundaries', () => {
    const content = `
export class Greeter {
greet(name: string) {
return \`Hello, \${name}\`;
}
}
export class Farewell {
bye(name: string) {
return \`Goodbye, \${name}\`;
}
}
`.trim();
    const snippets = parseCodeFile(content, 'greet.ts', 'typescript');
    expect(snippets.length).toBeGreaterThanOrEqual(2);
    expect(snippets.some((s) => s.content.includes('class Greeter'))).toBe(true);
    expect(snippets.some((s) => s.content.includes('class Farewell'))).toBe(true);
  });
  it('sets correct metadata on snippets', () => {
    const content = `
export function example(): void {
console.log('example function body here');
}
`.trim();
    const snippets = parseCodeFile(content, 'src/utils.ts', 'typescript');
    expect(snippets.length).toBeGreaterThanOrEqual(1);
    const s = snippets[0]!;
    expect(s.type).toBe('code');
    expect(s.language).toBe('typescript');
    // title is the basename; breadcrumb keeps the full relative path
    expect(s.title).toBe('utils.ts');
    expect(s.breadcrumb).toBe('src/utils.ts');
  });
  it('produces at least one snippet from a file with many small declarations', () => {
    // Each block: a multi-line function — boundary detection fires but chunks are
    // large enough to survive the MIN_CONTENT_LENGTH filter.
    const blocks = Array.from(
      { length: 10 },
      (_, i) => `export function helper${i}(x: number): number {\n return x + ${i};\n}`
    );
    const content = blocks.join('\n\n');
    const snippets = parseCodeFile(content, 'generated.ts', 'typescript');
    expect(snippets.length).toBeGreaterThanOrEqual(1);
    expect(snippets.every((s) => s.type === 'code')).toBe(true);
  });
});
// ---------------------------------------------------------------------------
// Python
// ---------------------------------------------------------------------------
// Python: `def`/`class` boundaries, including `async def`.
describe('parseCodeFile — Python', () => {
  it('splits at def and class boundaries', () => {
    const content = `
def greet(name):
return f"Hello, {name}"
class MyClass:
def __init__(self):
self.value = 0
def increment(self):
self.value += 1
async def fetch_data(url):
return await http.get(url)
`.trim();
    const snippets = parseCodeFile(content, 'app.py', 'python');
    expect(snippets.some((s) => s.content.includes('def greet'))).toBe(true);
    expect(snippets.some((s) => s.content.includes('class MyClass'))).toBe(true);
    expect(snippets.some((s) => s.content.includes('async def fetch_data'))).toBe(true);
  });
});
// ---------------------------------------------------------------------------
// Go
// ---------------------------------------------------------------------------
// Go: `func`/`type`/`var`/`const` boundaries.
describe('parseCodeFile — Go', () => {
  it('splits at func boundaries', () => {
    const content = `
package main
import "fmt"
func greet(name string) string {
return fmt.Sprintf("Hello, %s", name)
}
func main() {
fmt.Println(greet("world"))
}
`.trim();
    const snippets = parseCodeFile(content, 'main.go', 'go');
    expect(snippets.some((s) => s.content.includes('func greet'))).toBe(true);
    expect(snippets.some((s) => s.content.includes('func main'))).toBe(true);
  });
});
// ---------------------------------------------------------------------------
// Rust
// ---------------------------------------------------------------------------
// Rust: item boundaries including optional `pub` visibility and `impl` blocks.
describe('parseCodeFile — Rust', () => {
  it('splits at fn and struct boundaries', () => {
    const content = `
pub struct Config {
pub name: String,
pub value: u32,
}
pub fn create_config(name: &str, value: u32) -> Config {
Config { name: name.to_string(), value }
}
impl Config {
pub fn new() -> Self {
Config { name: String::new(), value: 0 }
}
}
`.trim();
    const snippets = parseCodeFile(content, 'config.rs', 'rust');
    expect(snippets.some((s) => s.content.includes('pub struct Config'))).toBe(true);
    expect(snippets.some((s) => s.content.includes('pub fn create_config'))).toBe(true);
  });
});
// ---------------------------------------------------------------------------
// Ruby
// ---------------------------------------------------------------------------
// Ruby: `def`/`class`/`module` boundaries.
describe('parseCodeFile — Ruby', () => {
  it('splits at def and class boundaries', () => {
    const content = `
class Greeter
def initialize(name)
@name = name
end
def greet
"Hello, #{@name}!"
end
end
def standalone_helper
puts "helper"
end
`.trim();
    const snippets = parseCodeFile(content, 'greeter.rb', 'ruby');
    expect(snippets.some((s) => s.content.includes('class Greeter'))).toBe(true);
    expect(snippets.some((s) => s.content.includes('def standalone_helper'))).toBe(true);
  });
});
// ---------------------------------------------------------------------------
// Config / data files
// ---------------------------------------------------------------------------
// Config/data files are chunked by top-level keys; every snippet is code.
describe('parseCodeFile — JSON', () => {
  it('produces at least one code snippet from a JSON object', () => {
    const content = JSON.stringify(
      {
        name: 'my-package',
        version: '1.0.0',
        dependencies: { lodash: '^4.17.21' }
      },
      null,
      2
    );
    const snippets = parseCodeFile(content, 'package.json', 'json');
    expect(snippets.length).toBeGreaterThanOrEqual(1);
    expect(snippets.every((s) => s.type === 'code')).toBe(true);
  });
});
describe('parseCodeFile — YAML', () => {
  it('splits a YAML file at top-level keys', () => {
    const content = `
name: my-project
version: 1.0.0
scripts:
build: tsc
test: vitest
dependencies:
lodash: ^4.17.21
`.trim();
    const snippets = parseCodeFile(content, 'config.yaml', 'yaml');
    expect(snippets.length).toBeGreaterThanOrEqual(1);
  });
});
// ---------------------------------------------------------------------------
// HTML-like files
// ---------------------------------------------------------------------------
// HTML-like files yield both code (script contents) and info (visible text).
describe('parseCodeFile — HTML', () => {
  it('extracts script block and text content', () => {
    const content = `
<!DOCTYPE html>
<html>
<head><title>Test Page</title></head>
<body>
<p>This is the page body content with enough text for an info snippet.</p>
<script>
function init() {
console.log('page loaded and ready for interaction');
}
</script>
</body>
</html>
`.trim();
    const snippets = parseCodeFile(content, 'index.html', 'html');
    expect(snippets.some((s) => s.type === 'code')).toBe(true);
    expect(snippets.some((s) => s.type === 'info')).toBe(true);
  });
});
// ---------------------------------------------------------------------------
// Plain text
// ---------------------------------------------------------------------------
// Plain text is split on paragraph boundaries; every snippet is informational.
describe('parseCodeFile — plain text', () => {
  it('splits on paragraph boundaries', () => {
    const content = `
This is the first paragraph with enough content to pass the minimum length check.
This is the second paragraph that also has enough content to be included here.
`.trim();
    const snippets = parseCodeFile(content, 'notes.txt', 'text');
    expect(snippets.length).toBeGreaterThanOrEqual(2);
    expect(snippets.every((s) => s.type === 'info')).toBe(true);
  });
  it('skips paragraphs shorter than 20 characters', () => {
    const content = 'Short.\n\nThis is a much longer paragraph that definitely passes the minimum length filter.';
    const snippets = parseCodeFile(content, 'notes.txt', 'text');
    expect(snippets.length).toBe(1);
  });
});
// ---------------------------------------------------------------------------
// Unknown language fallback
// ---------------------------------------------------------------------------
// Languages without a BOUNDARY_PATTERNS entry use the line-count window.
describe('parseCodeFile — unknown language', () => {
  it('falls back to sliding window for unrecognised languages', () => {
    const lines = Array.from({ length: 50 }, (_, i) => `line ${i}: some code content here`);
    const content = lines.join('\n');
    const snippets = parseCodeFile(content, 'script.lua', 'lua');
    expect(snippets.length).toBeGreaterThanOrEqual(1);
    expect(snippets.every((s) => s.type === 'code')).toBe(true);
  });
});
// ---------------------------------------------------------------------------
// Min content filter
// ---------------------------------------------------------------------------
// Output snippets should never be shorter than MIN_CONTENT_LENGTH characters.
describe('parseCodeFile — minimum content filter', () => {
  it('skips segments shorter than 20 characters', () => {
    const content = `
export function realFunction(): string {
// A function with enough content to be included in the output snippets.
return 'result value from the function that does the operation here';
}
`.trim();
    const snippets = parseCodeFile(content, 'test.ts', 'typescript');
    expect(snippets.every((s) => s.content.length >= 20)).toBe(true);
  });
});
// ---------------------------------------------------------------------------
// Token count cap
// ---------------------------------------------------------------------------
// Oversized segments are re-chunked, so every snippet stays near the cap.
describe('parseCodeFile — token count', () => {
  it('all snippets have tokenCount within MAX_TOKENS', () => {
    const lines = Array.from({ length: 300 }, (_, i) => `// comment line number ${i} here\nconst x${i} = ${i};`);
    const content = lines.join('\n');
    const snippets = parseCodeFile(content, 'large.ts', 'typescript');
    for (const s of snippets) {
      expect(estimateTokens(s.content)).toBeLessThanOrEqual(MAX_TOKENS + 50); // slight tolerance for boundary chunks
    }
  });
});
// ---------------------------------------------------------------------------
// parseFile integration
// ---------------------------------------------------------------------------
// Integration through the public `parseFile` entry point: language detection,
// parser dispatch, and the identity fields stamped onto every snippet.
describe('parseFile — integration', () => {
  const opts = { repositoryId: 'repo-1', documentId: 'doc-1', versionId: 'v1' };
  it('returns NewSnippet records with all required fields for a .ts file', () => {
    const file = makeFile(
      'src/utils.ts',
      `export function add(a: number, b: number): number {\n return a + b;\n}\n`
    );
    const snippets = parseFile(file, opts);
    expect(snippets.length).toBeGreaterThanOrEqual(1);
    for (const s of snippets) {
      expect(s.id).toBeTruthy();
      expect(s.repositoryId).toBe('repo-1');
      expect(s.documentId).toBe('doc-1');
      expect(s.versionId).toBe('v1');
      expect(s.createdAt).toBeInstanceOf(Date);
      expect(s.content).toBeTruthy();
      expect(s.type).toMatch(/^(code|info)$/);
    }
  });
  it('returns NewSnippet records for a .md file', () => {
    const file = makeFile(
      'README.md',
      `# Hello\n\nThis is a long enough paragraph to pass the minimum content length filter.\n`,
      'markdown'
    );
    const snippets = parseFile(file, opts);
    expect(snippets.length).toBeGreaterThanOrEqual(1);
    // Markdown prose sections become info snippets
    expect(snippets[0]?.type).toBe('info');
  });
  it('uses null for versionId when not provided', () => {
    const file = makeFile('src/index.ts', `export function noop(): void {}\n`);
    const snippets = parseFile(file, { repositoryId: 'r', documentId: 'd' });
    // noop is too short; file may return 0 snippets — just verify no error thrown
    expect(Array.isArray(snippets)).toBe(true);
  });
});
// ---------------------------------------------------------------------------
// BOUNDARY_PATTERNS export
// ---------------------------------------------------------------------------
// Sanity check on the exported pattern table rather than its behaviour.
describe('BOUNDARY_PATTERNS', () => {
  it('contains entries for core languages', () => {
    expect(BOUNDARY_PATTERNS['typescript']).toBeInstanceOf(RegExp);
    expect(BOUNDARY_PATTERNS['python']).toBeInstanceOf(RegExp);
    expect(BOUNDARY_PATTERNS['go']).toBeInstanceOf(RegExp);
    expect(BOUNDARY_PATTERNS['rust']).toBeInstanceOf(RegExp);
    expect(BOUNDARY_PATTERNS['ruby']).toBeInstanceOf(RegExp);
  });
});

View File

@@ -0,0 +1,302 @@
/**
* Code file parser for TRUEREF-0005.
*
* Splits source-code files into function/class-level chunks using
* language-specific regex boundary detection. Falls back to a line-count
* sliding window for unrecognised languages.
*/
import { basename } from 'node:path';
import type { NewSnippet } from '$lib/server/db/schema.js';
import {
estimateTokens,
chunkLines,
chunkText,
MAX_TOKENS,
OVERLAP_TOKENS,
MIN_CONTENT_LENGTH
} from './chunker.js';
// ---------------------------------------------------------------------------
// Boundary patterns per language
// ---------------------------------------------------------------------------
/**
 * Language-specific regexes that mark the START of a top-level declaration
 * line. Each pattern is tested line-by-line by `splitAtBoundaries` (so the
 * multiline flag is not needed); a matching line begins a new segment.
 */
export const BOUNDARY_PATTERNS: Record<string, RegExp> = {
  // Optional export/declare/async prefixes before any declaration keyword.
  typescript: /^(export\s+)?(declare\s+)?(async\s+)?(function|class|interface|type|enum|const|let|var)\s+\w+/,
  javascript: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,
  // `def`/`class`, optionally prefixed by `async`.
  python: /^(async\s+)?(def|class)\s+\w+/,
  go: /^(func|type|var|const)\s+\w+/,
  // Optional `pub` / `pub(crate)` visibility before the item keyword.
  rust: /^(pub(\s*\(crate\))?\s+)?(async\s+)?(fn|impl|struct|enum|trait|type|const|static)\s+\w+/,
  // Java/C#: one or more modifiers, a (possibly generic/array) type, a name,
  // then `(` or `{` — i.e. a method or member signature.
  java: /^(\s*(public|private|protected|static|final|abstract|synchronized)\s+)+[\w<>\[\]]+\s+\w+\s*[({]/,
  csharp: /^(\s*(public|private|protected|internal|static|override|virtual|abstract|sealed)\s+)+[\w<>\[\]]+\s+\w+\s*[({]/,
  // Kotlin/Swift: zero or more modifiers, then a declaration keyword.
  kotlin: /^(\s*(public|private|protected|internal|override|suspend|inline|open|abstract|sealed)\s+)*(fun|class|object|interface|data class|sealed class|enum class)\s+\w+/,
  swift: /^(\s*(public|private|internal|fileprivate|open|override|static|final|class)\s+)*(func|class|struct|enum|protocol|extension)\s+\w+/,
  ruby: /^(def|class|module)\s+\w+/
};
// ---------------------------------------------------------------------------
// Internal types
// ---------------------------------------------------------------------------
/**
 * Parser-produced snippet fields. The identity/ownership columns (`id`,
 * `repositoryId`, `documentId`, `versionId`, `createdAt`) are stamped on
 * later by `parseFile`.
 */
type RawSnippet = Omit<NewSnippet, 'id' | 'repositoryId' | 'documentId' | 'versionId' | 'createdAt'>;
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
/**
 * Split `content` at lines matching `pattern`, returning the segments between
 * boundaries. Each segment includes its opening boundary line, and the first
 * segment always starts at line 0 (so a non-matching preamble is kept).
 */
function splitAtBoundaries(content: string, pattern: RegExp): string[] {
  const lines = content.split('\n');
  // Record the index of every line that starts a new segment. Index 0 always
  // does; afterwards, any line matching `pattern` opens a fresh segment.
  const starts = [0];
  lines.forEach((line, i) => {
    if (i > 0 && pattern.test(line)) starts.push(i);
  });
  // Materialise the segments between consecutive start indices.
  return starts.map((from, s) => lines.slice(from, starts[s + 1] ?? lines.length).join('\n'));
}
// ---------------------------------------------------------------------------
// Sliding-window fallback for code
// ---------------------------------------------------------------------------
/**
 * Fallback chunker: slice the file into 200-line windows with a 20-line
 * overlap, keeping every window that survives the minimum-length filter.
 */
function slidingWindowChunks(content: string, filePath: string, language: string): RawSnippet[] {
  const title = basename(filePath);
  const snippets: RawSnippet[] = [];
  for (const chunk of chunkLines(content.split('\n'), 200, 20)) {
    if (chunk.trim().length < MIN_CONTENT_LENGTH) continue;
    snippets.push({
      type: 'code',
      title,
      content: chunk,
      language,
      breadcrumb: filePath,
      tokenCount: estimateTokens(chunk)
    });
  }
  return snippets;
}
// ---------------------------------------------------------------------------
// Config / data file parser (JSON, YAML, TOML)
// ---------------------------------------------------------------------------
/**
* Chunk config/data files by splitting on top-level keys.
*
* Strategy: find lines that look like top-level keys (zero indentation,
* followed by colon/equals/brace) and treat each as a boundary.
*/
function parseConfigFile(content: string, filePath: string, language: string): RawSnippet[] {
const topLevelKey = /^[\w"'\-]+\s*[:=\[{]/;
const lines = content.split('\n');
const segments: string[] = [];
let current: string[] = [];
for (const line of lines) {
if (topLevelKey.test(line) && current.length > 0) {
segments.push(current.join('\n'));
current = [line];
} else {
current.push(line);
}
}
if (current.length > 0) segments.push(current.join('\n'));
// If we got only one segment (no structure detected), fall back to sliding window
if (segments.length <= 1) {
return slidingWindowChunks(content, filePath, language);
}
return segments
.filter((seg) => seg.trim().length >= MIN_CONTENT_LENGTH)
.flatMap((seg) => {
if (estimateTokens(seg) <= MAX_TOKENS) {
return [
{
type: 'code' as const,
title: basename(filePath),
content: seg.trim(),
language,
breadcrumb: filePath,
tokenCount: estimateTokens(seg.trim())
}
];
}
return slidingWindowChunks(seg, filePath, language);
});
}
// ---------------------------------------------------------------------------
// HTML / Svelte / Vue parser
// ---------------------------------------------------------------------------
/**
 * Extract script blocks (as code snippets) and visible text content (as info
 * snippets) from HTML-like files (HTML, Svelte, Vue).
 */
function parseHtmlLikeFile(content: string, filePath: string, language: string): RawSnippet[] {
  const title = basename(filePath);
  const snippets: RawSnippet[] = [];
  // Code snippets: the inside of every <script> block (incl. <script lang="ts">).
  const scriptPattern = /<script(?:\s[^>]*)?>[\s\S]*?<\/script>/gi;
  for (const found of content.matchAll(scriptPattern)) {
    // Strip the outer tags, keep just the code.
    const inner = found[0].replace(/^<script[^>]*>/, '').replace(/<\/script>$/, '').trim();
    if (inner.length < MIN_CONTENT_LENGTH) continue;
    if (estimateTokens(inner) <= MAX_TOKENS) {
      snippets.push({
        type: 'code',
        title,
        content: inner,
        language,
        breadcrumb: filePath,
        tokenCount: estimateTokens(inner)
      });
    } else {
      snippets.push(...slidingWindowChunks(inner, filePath, language));
    }
  }
  // Info snippets: page text with scripts, styles and all tags stripped.
  const text = content
    .replace(/<script(?:\s[^>]*)?>[\s\S]*?<\/script>/gi, '')
    .replace(/<style(?:\s[^>]*)?>[\s\S]*?<\/style>/gi, '')
    .replace(/<[^>]+>/g, ' ')
    .replace(/\s{2,}/g, ' ')
    .trim();
  if (text.length >= MIN_CONTENT_LENGTH) {
    for (const chunk of chunkText(text, MAX_TOKENS, OVERLAP_TOKENS)) {
      snippets.push({
        type: 'info',
        title,
        content: chunk,
        language: null,
        breadcrumb: filePath,
        tokenCount: estimateTokens(chunk)
      });
    }
  }
  return snippets;
}
// ---------------------------------------------------------------------------
// Plain-text / RST parser
// ---------------------------------------------------------------------------
/**
 * Split plain-text/RST content into info snippets, one or more per paragraph.
 * Paragraphs are runs of text separated by one or more blank lines.
 */
function parsePlainText(content: string, filePath: string): RawSnippet[] {
  const title = basename(filePath);
  return content
    .split(/\n{2,}/)
    .map((paragraph) => paragraph.trim())
    .filter((paragraph) => paragraph.length >= MIN_CONTENT_LENGTH)
    .flatMap((paragraph) =>
      chunkText(paragraph, MAX_TOKENS, OVERLAP_TOKENS).map((chunk) => ({
        type: 'info' as const,
        title,
        content: chunk,
        language: null,
        breadcrumb: filePath,
        tokenCount: estimateTokens(chunk)
      }))
    );
}
// ---------------------------------------------------------------------------
// Public parser
// ---------------------------------------------------------------------------
/**
 * Parse a non-Markdown code or data file into raw snippets.
 *
 * Dispatch order: plain text -> config/data -> HTML-like -> regex boundary
 * detection for known languages -> line-count sliding window fallback.
 */
export function parseCodeFile(content: string, filePath: string, language: string): RawSnippet[] {
  // Plain text / RST: paragraph-based info snippets.
  if (language === 'text') {
    return parsePlainText(content, filePath);
  }
  // Config / data files: split on top-level keys.
  if (['json', 'yaml', 'toml'].includes(language)) {
    return parseConfigFile(content, filePath, language);
  }
  // HTML-like files: script blocks plus stripped text content.
  if (['html', 'svelte', 'vue'].includes(language)) {
    return parseHtmlLikeFile(content, filePath, language);
  }
  // (The former `normalisedLang` ternary mapped 'csharp' to itself — a no-op —
  // so the language is used directly for the pattern lookup.)
  const pattern = BOUNDARY_PATTERNS[language];
  const title = basename(filePath);
  const breadcrumb = filePath;
  if (!pattern) {
    // Fallback: line-count sliding window for unrecognised languages.
    return slidingWindowChunks(content, filePath, language);
  }
  const segments = splitAtBoundaries(content, pattern);
  // A single segment means no boundary matched anywhere — unless the very
  // first line was itself a boundary, fall back to the sliding window.
  const firstLine = content.split('\n', 1)[0] ?? '';
  if (segments.length === 1 && !pattern.test(firstLine)) {
    return slidingWindowChunks(content, filePath, language);
  }
  return segments
    .filter((seg) => seg.trim().length >= MIN_CONTENT_LENGTH)
    .flatMap((seg) => {
      const trimmed = seg.trim();
      const tokenCount = estimateTokens(trimmed);
      if (tokenCount <= MAX_TOKENS) {
        return [
          {
            type: 'code' as const,
            title,
            content: trimmed,
            language,
            breadcrumb,
            tokenCount
          }
        ];
      }
      // Oversized declaration: re-chunk with the sliding window.
      return slidingWindowChunks(trimmed, filePath, language);
    });
}

View File

@@ -0,0 +1,53 @@
/**
* Document parser entry point for TRUEREF-0005.
*
* Exposes `parseFile` which transforms a `CrawledFile` into an array of
* `NewSnippet` records ready for database insertion.
*/
import type { CrawledFile } from '$lib/server/crawler/types.js';
import type { NewSnippet } from '$lib/server/db/schema.js';
import { detectLanguage } from './language.js';
import { parseMarkdown } from './markdown.parser.js';
import { parseCodeFile } from './code.parser.js';
// ---------------------------------------------------------------------------
// Public API
// ---------------------------------------------------------------------------
export interface ParseOptions {
  /** Repository the resulting snippets belong to. */
  repositoryId: string;
  /** Document (crawled file) record the snippets are derived from. */
  documentId: string;
  /** Optional version identifier; persisted as `null` when omitted. */
  versionId?: string;
}
/**
 * Parse a crawled file into an array of `NewSnippet` records.
 *
 * The language is detected from the file extension. Markdown/MDX files are
 * split by heading hierarchy; every other file goes through the code parser
 * (language-specific boundary detection or a sliding-window fallback). Each
 * raw snippet is stamped with a fresh UUID, the ownership ids from `options`,
 * and a creation timestamp.
 */
export function parseFile(file: CrawledFile, options: ParseOptions): NewSnippet[] {
  const language = detectLanguage(file.path);
  let raw;
  if (language === 'markdown') {
    raw = parseMarkdown(file.content, file.path);
  } else {
    raw = parseCodeFile(file.content, file.path, language);
  }
  const snippets: NewSnippet[] = [];
  for (const snippet of raw) {
    snippets.push({
      ...snippet,
      id: crypto.randomUUID(),
      repositoryId: options.repositoryId,
      documentId: options.documentId,
      versionId: options.versionId ?? null,
      createdAt: new Date()
    });
  }
  return snippets;
}
// Re-export helpers for consumers that need them individually
export { detectLanguage } from './language.js';
export { estimateTokens, chunkText, chunkLines, MAX_TOKENS, OVERLAP_TOKENS, MIN_CONTENT_LENGTH } from './chunker.js';
export { parseMarkdown } from './markdown.parser.js';
export { parseCodeFile, BOUNDARY_PATTERNS } from './code.parser.js';

View File

@@ -0,0 +1,56 @@
/**
* Language detection for the document parser (TRUEREF-0005).
*
* Maps file extensions to canonical language names used throughout the parser.
*/
import { extname } from 'node:path';
// ---------------------------------------------------------------------------
// Language map
// ---------------------------------------------------------------------------
/**
 * Canonical extension -> language table. Keys are lower-case and include the
 * leading dot, matching the output of `extname`.
 */
export const LANGUAGE_MAP: Record<string, string> = {
  // Web / scripting
  '.ts': 'typescript',
  '.tsx': 'typescript',
  '.js': 'javascript',
  '.jsx': 'javascript',
  '.py': 'python',
  '.rb': 'ruby',
  // Compiled languages
  '.go': 'go',
  '.rs': 'rust',
  '.java': 'java',
  '.cs': 'csharp',
  '.cpp': 'cpp',
  '.c': 'c',
  '.h': 'c',
  '.swift': 'swift',
  '.kt': 'kotlin',
  '.php': 'php',
  '.scala': 'scala',
  // Shells
  '.sh': 'bash',
  '.bash': 'bash',
  '.zsh': 'bash',
  // Docs and data
  '.md': 'markdown',
  '.mdx': 'markdown',
  '.json': 'json',
  '.yaml': 'yaml',
  '.yml': 'yaml',
  '.toml': 'toml',
  // Markup / styles / components
  '.html': 'html',
  '.css': 'css',
  '.svelte': 'svelte',
  '.vue': 'vue',
  '.sql': 'sql',
  '.txt': 'text',
  '.rst': 'text'
};
/**
 * Detect the canonical language name for `filePath` from its extension
 * (case-insensitive). Unknown or missing extensions map to 'text'.
 */
export function detectLanguage(filePath: string): string {
  const extension = extname(filePath).toLowerCase();
  const mapped = LANGUAGE_MAP[extension];
  return mapped === undefined ? 'text' : mapped;
}

View File

@@ -0,0 +1,272 @@
/**
* Unit tests for the Markdown parser (TRUEREF-0005).
*/
import { describe, it, expect } from 'vitest';
import { parseMarkdown } from './markdown.parser.js';
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
/** Build a backtick-fenced code block without nesting backticks in template literals. */
function fence(lang: string, code: string): string {
  return ['```' + lang, code, '```'].join('\n');
}
/** Build a tilde-fenced code block (the alternative CommonMark fence syntax). */
function tildeFence(lang: string, code: string): string {
  return ['~~~' + lang, code, '~~~'].join('\n');
}
// ---------------------------------------------------------------------------
// Basic section splitting
// ---------------------------------------------------------------------------
// Heading-based section splitting and breadcrumb construction.
describe('parseMarkdown — section splitting', () => {
  it('produces no snippets for empty content', () => {
    expect(parseMarkdown('', 'README.md')).toHaveLength(0);
  });
  it('skips content shorter than 20 characters', () => {
    const result = parseMarkdown('# Title\n\nShort.\n', 'README.md');
    expect(result).toHaveLength(0);
  });
  it('parses a single heading section into an info snippet', () => {
    const source = [
      '# Introduction',
      '',
      'This is a paragraph with enough content to pass the minimum length check.'
    ].join('\n');
    const snippets = parseMarkdown(source, 'README.md');
    expect(snippets.length).toBeGreaterThanOrEqual(1);
    const info = snippets.find((s) => s.type === 'info');
    expect(info).toBeDefined();
    expect(info?.title).toBe('Introduction');
    expect(info?.breadcrumb).toBe('Introduction');
  });
  it('builds correct breadcrumb for nested headings', () => {
    const source = [
      '# Getting Started',
      '',
      'Intro text that is long enough to be included here.',
      '',
      '## Installation',
      '',
      'Install by running the command shown below in your terminal.'
    ].join('\n');
    const snippets = parseMarkdown(source, 'README.md');
    const installation = snippets.find((s) => s.title === 'Installation');
    expect(installation).toBeDefined();
    // breadcrumb chains parent headings with ' > '
    expect(installation?.breadcrumb).toBe('Getting Started > Installation');
  });
  it('resets heading stack correctly when headings ascend', () => {
    const source = [
      '# H1',
      '',
      'Some introductory prose that is longer than twenty characters.',
      '',
      '## H2',
      '',
      'More content here, also long enough to pass the threshold check.',
      '',
      '# Second H1',
      '',
      'Content for second top-level heading, long enough to be included.'
    ].join('\n');
    const snippets = parseMarkdown(source, 'doc.md');
    const secondH1 = snippets.find((s) => s.title === 'Second H1');
    expect(secondH1).toBeDefined();
    // a new H1 clears the stack, so the breadcrumb is just the heading itself
    expect(secondH1?.breadcrumb).toBe('Second H1');
  });
  it('falls back to filename when no heading is present', () => {
    const source = 'This is some standalone prose content that is long enough to pass.';
    const snippets = parseMarkdown(source, 'notes.md');
    expect(snippets.length).toBeGreaterThanOrEqual(1);
    expect(snippets[0]?.title).toBe('notes.md');
  });
});
// ---------------------------------------------------------------------------
// Fenced code block extraction
// ---------------------------------------------------------------------------
// Fenced code blocks become code snippets carrying the fence's language tag.
describe('parseMarkdown — code block extraction', () => {
  it('extracts a fenced code block as a code snippet', () => {
    const codeBlock = fence('typescript', 'function hello(name: string): string {\n return `Hello, ${name}!`;\n}');
    const source = [
      '# Example',
      '',
      'Some prose here that is long enough to pass the minimum check.',
      '',
      codeBlock
    ].join('\n');
    const snippets = parseMarkdown(source, 'README.md');
    const code = snippets.find((s) => s.type === 'code');
    expect(code).toBeDefined();
    expect(code?.language).toBe('typescript');
    expect(code?.content).toContain('function hello');
  });
  it('extracts multiple code blocks from the same section', () => {
    const bashBlock = fence('bash', 'npm install my-library --save-dev');
    const jsBlock = fence('javascript', "const lib = require('my-lib');\nlib.doSomething();");
    const source = [
      '# Usage',
      '',
      'Description of the usage pattern with enough text here.',
      '',
      bashBlock,
      '',
      'More text in between the two code blocks, just enough.',
      '',
      jsBlock
    ].join('\n');
    const snippets = parseMarkdown(source, 'README.md');
    const codeSnippets = snippets.filter((s) => s.type === 'code');
    expect(codeSnippets.length).toBe(2);
    const langs = codeSnippets.map((s) => s.language);
    expect(langs).toContain('bash');
    expect(langs).toContain('javascript');
  });
  it('skips code blocks shorter than 20 characters', () => {
    // 'x = 1' is under the minimum length, so only info snippets remain
    const shortBlock = fence('', 'x = 1');
    const source = [
      '# Example',
      '',
      'Some prose here that is long enough to pass.',
      '',
      shortBlock
    ].join('\n');
    const snippets = parseMarkdown(source, 'README.md');
    expect(snippets.every((s) => s.type === 'info')).toBe(true);
  });
  it('handles tilde-fenced code blocks', () => {
    const pyBlock = tildeFence('python', 'def greet(name):\n return f"Hello, {name}"');
    const source = [
      '# Section',
      '',
      'Long enough prose content for the section to be included here.',
      '',
      pyBlock
    ].join('\n');
    const snippets = parseMarkdown(source, 'README.md');
    const code = snippets.find((s) => s.type === 'code');
    expect(code).toBeDefined();
    expect(code?.language).toBe('python');
  });
  it('preserves breadcrumb on code snippets', () => {
    const codeBlock = fence(
      'typescript',
      'function connect(url: string): Promise<void> {\n return Promise.resolve();\n}'
    );
    const source = [
      '# API Reference',
      '',
      '## Methods',
      '',
      'Overview of the methods available in this library.',
      '',
      codeBlock
    ].join('\n');
    const snippets = parseMarkdown(source, 'API.md');
    const code = snippets.find((s) => s.type === 'code');
    expect(code).toBeDefined();
    // code snippets inherit the enclosing section's heading breadcrumb
    expect(code?.breadcrumb).toBe('API Reference > Methods');
  });
});
// ---------------------------------------------------------------------------
// Token counting
// ---------------------------------------------------------------------------
describe('parseMarkdown — token counting', () => {
  it('attaches a non-zero tokenCount to every snippet', () => {
    const doc = [
      '# Overview',
      '',
      'This section contains enough text to produce an info snippet for the test.'
    ].join('\n');
    const parsed = parseMarkdown(doc, 'README.md');
    // Every emitted snippet carries a positive token estimate.
    expect(parsed.every((snippet) => snippet.tokenCount > 0)).toBe(true);
  });
});
// ---------------------------------------------------------------------------
// Large content chunking
// ---------------------------------------------------------------------------
describe('parseMarkdown — large content chunking', () => {
  it('splits a very large prose section into multiple snippets', () => {
    // ~4 000 characters of repeated words — far larger than a single
    // ~1 800-character chunk window, so chunking must kick in.
    const longParagraph = Array(800).fill('word').join(' ');
    const parsed = parseMarkdown(`# Big Section\n\n${longParagraph}`, 'big.md');
    const infoOnly = parsed.filter((snippet) => snippet.type === 'info');
    expect(infoOnly.length).toBeGreaterThan(1);
  });
});
// ---------------------------------------------------------------------------
// Real-world sample
// ---------------------------------------------------------------------------
describe('parseMarkdown — real-world sample', () => {
  it('correctly parses a realistic README excerpt', () => {
    const installBlock = fence('bash', 'npm install my-library');
    const usageBlock = fence('typescript', "import { doTheThing } from 'my-library';\n\ndoTheThing({ verbose: true });");
    const readme = [
      '# My Library',
      '',
      'A handy library for doing things quickly and efficiently.',
      '',
      '## Installation',
      '',
      'Install via npm using the following command in your project directory:',
      '',
      installBlock,
      '',
      '## Usage',
      '',
      'Import the library and call the main function as shown below:',
      '',
      usageBlock,
      '',
      '## API',
      '',
      '### doTheThing(options)',
      '',
      'Performs the main operation. Options are passed as a plain object.'
    ].join('\n');
    const parsed = parseMarkdown(readme, 'README.md');
    // Both snippet types should be present in a mixed document.
    expect(parsed.some((snippet) => snippet.type === 'info')).toBe(true);
    expect(parsed.some((snippet) => snippet.type === 'code')).toBe(true);
    // Breadcrumbs should reflect the full H1 > H2 > H3 trail.
    const apiEntry = parsed.find((snippet) => snippet.title === 'doTheThing(options)');
    expect(apiEntry).toBeDefined();
    expect(apiEntry?.breadcrumb).toBe('My Library > API > doTheThing(options)');
  });
});

View File

@@ -0,0 +1,171 @@
/**
* Markdown document parser for TRUEREF-0005.
*
* Splits Markdown/MDX files into heading-based sections and extracts fenced
* code blocks as separate code snippets.
*/
import { basename } from 'node:path';
import type { NewSnippet } from '$lib/server/db/schema.js';
import { estimateTokens, chunkText, MAX_TOKENS, OVERLAP_TOKENS, MIN_CONTENT_LENGTH } from './chunker.js';
// ---------------------------------------------------------------------------
// Internal types
// ---------------------------------------------------------------------------
/** A fenced code block extracted from within a Markdown section. */
interface CodeBlock {
  /** Lower-cased language tag from the fence info string ('' when absent). */
  language: string;
  /** Raw code between the fences, excluding the fence lines themselves. */
  code: string;
}

/** One heading-delimited slice of the document. */
interface MarkdownSection {
  /** Heading stack at this point, e.g. ["Getting Started", "Installation"] */
  headings: string[];
  /** Prose text content (code blocks stripped out) */
  content: string;
  /** Fenced code blocks found within this section */
  codeBlocks: CodeBlock[];
}

// ---------------------------------------------------------------------------
// Section splitting
// ---------------------------------------------------------------------------

/**
 * Split the full Markdown source into sections delimited by ATX headings
 * (# … ####). Fenced code blocks are extracted into `codeBlocks` and removed
 * from the prose `content`.
 *
 * Notes on fence/heading handling:
 * - The language tag may be separated from the fence by spaces or tabs
 *   ("``` ruby") and may contain non-word characters ("```c++", "```c#").
 * - ATX closing sequences are stripped from titles ("## Title ##" → "Title").
 * - An unclosed fence at EOF is treated as prose rather than dropped.
 */
function splitIntoSections(source: string): MarkdownSection[] {
  const lines = source.split('\n');
  const sections: MarkdownSection[] = [];

  // Heading stack: index 0 = H1, 1 = H2, … (tracked up to H4).
  const headingStack: string[] = [];

  // Accumulators for the section currently being built.
  let textLines: string[] = [];
  const codeBlocks: CodeBlock[] = [];

  // Fenced-code-block tracking state.
  let inCodeBlock = false;
  let codeFence = '';
  let codeLanguage = '';
  let codeLines: string[] = [];

  /** Emit the accumulated prose and code blocks as one section, then reset. */
  function flushSection() {
    sections.push({
      headings: [...headingStack],
      content: textLines.join('\n'),
      codeBlocks: [...codeBlocks]
    });
    textLines = [];
    codeBlocks.length = 0;
  }

  for (const line of lines) {
    // ---- Fenced code block handling ----
    if (!inCodeBlock) {
      // Opening fence: 3+ backticks or tildes, optionally followed by an
      // info string. `[^\s`]*` (rather than `[\w-]*`) lets languages like
      // "c++" / "c#" through, and the `[ \t]*` allows "``` ruby".
      const fenceMatch = line.match(/^(`{3,}|~{3,})[ \t]*([^\s`]*)/);
      if (fenceMatch) {
        inCodeBlock = true;
        codeFence = fenceMatch[1];
        codeLanguage = fenceMatch[2].toLowerCase();
        codeLines = [];
        continue;
      }
    } else {
      // Closing fence: same character, at least as long as the opener.
      const closingFence = new RegExp(`^${codeFence[0]}{${codeFence.length},}\\s*$`);
      if (closingFence.test(line)) {
        inCodeBlock = false;
        const code = codeLines.join('\n');
        // Skip trivially short blocks (e.g. "x = 1").
        if (code.trim().length >= MIN_CONTENT_LENGTH) {
          codeBlocks.push({ language: codeLanguage, code });
        }
        codeLines = [];
        continue;
      }
      codeLines.push(line);
      continue;
    }

    // ---- Heading detection (ATX only, H1–H4) ----
    const headingMatch = line.match(/^(#{1,4})\s+(.*)/);
    if (headingMatch) {
      // Emit whatever has accumulated before this heading.
      flushSection();
      const level = headingMatch[1].length; // 1–4
      // Strip an optional ATX closing sequence ("## Title ##" → "Title").
      const title = headingMatch[2].trim().replace(/\s+#+$/, '');
      // Trim the stack to the depth above this heading and push the new title.
      headingStack.splice(level - 1, headingStack.length - (level - 1), title);
      continue;
    }

    // ---- Ordinary prose line ----
    textLines.push(line);
  }

  // Flush any trailing content (unclosed fence treated as prose).
  if (inCodeBlock) {
    textLines.push(...codeLines);
  }
  flushSection();

  return sections;
}
// ---------------------------------------------------------------------------
// Public parser
// ---------------------------------------------------------------------------
type RawSnippet = Omit<NewSnippet, 'id' | 'repositoryId' | 'documentId' | 'versionId' | 'createdAt'>;

/**
 * Parse a Markdown/MDX file into raw snippets (before IDs and DB fields are
 * attached).
 *
 * Each heading-delimited section yields zero or more chunked 'info' snippets
 * for its prose, plus one 'code' snippet per fenced block that meets the
 * minimum-length threshold.
 */
export function parseMarkdown(content: string, filePath: string): RawSnippet[] {
  const fallbackTitle = basename(filePath);
  const result: RawSnippet[] = [];

  for (const section of splitIntoSections(content)) {
    // Breadcrumb is the joined heading trail; null when there are no headings.
    const trail = section.headings.join(' > ');
    const crumb = trail.length > 0 ? trail : null;
    // Title is the deepest heading, falling back to the file name.
    const title = section.headings.at(-1) ?? fallbackTitle;

    // ---- Prose → chunked 'info' snippets ----
    const prose = section.content.trim();
    if (prose.length >= MIN_CONTENT_LENGTH) {
      for (const piece of chunkText(prose, MAX_TOKENS, OVERLAP_TOKENS)) {
        result.push({
          type: 'info',
          title,
          content: piece,
          breadcrumb: crumb,
          language: null,
          tokenCount: estimateTokens(piece)
        });
      }
    }

    // ---- Fenced blocks → 'code' snippets ----
    for (const { language, code } of section.codeBlocks) {
      const body = code.trim();
      if (body.length < MIN_CONTENT_LENGTH) continue;
      result.push({
        type: 'code',
        title,
        content: body,
        language: language || null,
        breadcrumb: crumb,
        tokenCount: estimateTokens(body)
      });
    }
  }

  return result;
}