Files
trueref/src/lib/server/parser/markdown.parser.test.ts
Giancarmine Salucci f6be3cfd47 feat(TRUEREF-0005): implement document parser and chunker
- Markdown parser with heading-based section splitting and code block extraction
- Code file parser with regex boundary detection for 10+ languages
- Sliding window chunker with configurable token limits and overlap
- Language detection from file extensions

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-23 09:06:12 +01:00

273 lines
8.6 KiB
TypeScript

/**
* Unit tests for the Markdown parser (TRUEREF-0005).
*/
import { describe, it, expect } from 'vitest';
import { parseMarkdown } from './markdown.parser.js';
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
/**
 * Builds a backtick-fenced Markdown code block.
 *
 * The fence marker is assembled at runtime so that test fixtures never need
 * to embed literal triple backticks inside template literals.
 *
 * @param lang - Info string placed directly after the opening fence (may be empty).
 * @param code - Body of the code block, inserted verbatim.
 * @returns The opening fence with `lang`, the code, and the closing fence, joined by newlines.
 */
function fence(lang: string, code: string): string {
  const marker = '`'.repeat(3);
  return [marker + lang, code, marker].join('\n');
}
/**
 * Builds a tilde-fenced (`~~~`) Markdown code block.
 *
 * @param lang - Info string placed directly after the opening fence (may be empty).
 * @param code - Body of the code block, inserted verbatim.
 * @returns The opening fence with `lang`, the code, and the closing fence, joined by newlines.
 */
function tildeFence(lang: string, code: string): string {
  const marker = '~~~';
  return `${marker}${lang}\n${code}\n${marker}`;
}
// ---------------------------------------------------------------------------
// Basic section splitting
// ---------------------------------------------------------------------------
describe('parseMarkdown — section splitting', () => {
  /** Joins individual lines into one newline-separated Markdown document. */
  const doc = (...lines: string[]): string => lines.join('\n');

  it('produces no snippets for empty content', () => {
    const parsed = parseMarkdown('', 'README.md');
    expect(parsed).toHaveLength(0);
  });

  it('skips content shorter than 20 characters', () => {
    // The only prose under the heading is "Short.", which is expected to be dropped.
    expect(parseMarkdown('# Title\n\nShort.\n', 'README.md')).toHaveLength(0);
  });

  it('parses a single heading section into an info snippet', () => {
    const markdown = doc(
      '# Introduction',
      '',
      'This is a paragraph with enough content to pass the minimum length check.'
    );

    const parsed = parseMarkdown(markdown, 'README.md');
    expect(parsed.length).toBeGreaterThanOrEqual(1);

    const infoSnippet = parsed.find((snippet) => snippet.type === 'info');
    expect(infoSnippet).toBeDefined();
    // A lone top-level heading serves as both the title and the whole breadcrumb.
    expect(infoSnippet?.title).toBe('Introduction');
    expect(infoSnippet?.breadcrumb).toBe('Introduction');
  });

  it('builds correct breadcrumb for nested headings', () => {
    const markdown = doc(
      '# Getting Started',
      '',
      'Intro text that is long enough to be included here.',
      '',
      '## Installation',
      '',
      'Install by running the command shown below in your terminal.'
    );

    const parsed = parseMarkdown(markdown, 'README.md');
    const nested = parsed.find((snippet) => snippet.title === 'Installation');

    expect(nested).toBeDefined();
    expect(nested?.breadcrumb).toBe('Getting Started > Installation');
  });

  it('resets heading stack correctly when headings ascend', () => {
    const markdown = doc(
      '# H1',
      '',
      'Some introductory prose that is longer than twenty characters.',
      '',
      '## H2',
      '',
      'More content here, also long enough to pass the threshold check.',
      '',
      '# Second H1',
      '',
      'Content for second top-level heading, long enough to be included.'
    );

    const parsed = parseMarkdown(markdown, 'doc.md');
    const topLevelAgain = parsed.find((snippet) => snippet.title === 'Second H1');

    expect(topLevelAgain).toBeDefined();
    // The earlier "H1 > H2" trail must not leak into the new top-level section.
    expect(topLevelAgain?.breadcrumb).toBe('Second H1');
  });

  it('falls back to filename when no heading is present', () => {
    const headingless = 'This is some standalone prose content that is long enough to pass.';

    const parsed = parseMarkdown(headingless, 'notes.md');

    expect(parsed.length).toBeGreaterThanOrEqual(1);
    expect(parsed[0]?.title).toBe('notes.md');
  });
});
// ---------------------------------------------------------------------------
// Fenced code block extraction
// ---------------------------------------------------------------------------
describe('parseMarkdown — code block extraction', () => {
  /** Joins individual lines into one newline-separated Markdown document. */
  const doc = (...lines: string[]): string => lines.join('\n');

  it('extracts a fenced code block as a code snippet', () => {
    const tsBlock = fence(
      'typescript',
      'function hello(name: string): string {\n return `Hello, ${name}!`;\n}'
    );
    const markdown = doc(
      '# Example',
      '',
      'Some prose here that is long enough to pass the minimum check.',
      '',
      tsBlock
    );

    const parsed = parseMarkdown(markdown, 'README.md');
    const codeSnippet = parsed.find((snippet) => snippet.type === 'code');

    expect(codeSnippet).toBeDefined();
    expect(codeSnippet?.language).toBe('typescript');
    expect(codeSnippet?.content).toContain('function hello');
  });

  it('extracts multiple code blocks from the same section', () => {
    const shellBlock = fence('bash', 'npm install my-library --save-dev');
    const scriptBlock = fence(
      'javascript',
      "const lib = require('my-lib');\nlib.doSomething();"
    );
    const markdown = doc(
      '# Usage',
      '',
      'Description of the usage pattern with enough text here.',
      '',
      shellBlock,
      '',
      'More text in between the two code blocks, just enough.',
      '',
      scriptBlock
    );

    const parsed = parseMarkdown(markdown, 'README.md');
    const onlyCode = parsed.filter((snippet) => snippet.type === 'code');
    expect(onlyCode).toHaveLength(2);

    // Order is not asserted; both languages just have to be present.
    const detectedLanguages = onlyCode.map((snippet) => snippet.language);
    expect(detectedLanguages).toContain('bash');
    expect(detectedLanguages).toContain('javascript');
  });

  it('skips code blocks shorter than 20 characters', () => {
    const tinyBlock = fence('', 'x = 1');
    const markdown = doc(
      '# Example',
      '',
      'Some prose here that is long enough to pass.',
      '',
      tinyBlock
    );

    const parsed = parseMarkdown(markdown, 'README.md');

    // No snippet of any type other than 'info' may be emitted.
    expect(parsed.some((snippet) => snippet.type !== 'info')).toBe(false);
  });

  it('handles tilde-fenced code blocks', () => {
    const pythonBlock = tildeFence('python', 'def greet(name):\n return f"Hello, {name}"');
    const markdown = doc(
      '# Section',
      '',
      'Long enough prose content for the section to be included here.',
      '',
      pythonBlock
    );

    const parsed = parseMarkdown(markdown, 'README.md');
    const codeSnippet = parsed.find((snippet) => snippet.type === 'code');

    expect(codeSnippet).toBeDefined();
    expect(codeSnippet?.language).toBe('python');
  });

  it('preserves breadcrumb on code snippets', () => {
    const tsBlock = fence(
      'typescript',
      'function connect(url: string): Promise<void> {\n return Promise.resolve();\n}'
    );
    const markdown = doc(
      '# API Reference',
      '',
      '## Methods',
      '',
      'Overview of the methods available in this library.',
      '',
      tsBlock
    );

    const parsed = parseMarkdown(markdown, 'API.md');
    const codeSnippet = parsed.find((snippet) => snippet.type === 'code');

    expect(codeSnippet).toBeDefined();
    // The code block sits under "## Methods", itself nested under "# API Reference".
    expect(codeSnippet?.breadcrumb).toBe('API Reference > Methods');
  });
});
// ---------------------------------------------------------------------------
// Token counting
// ---------------------------------------------------------------------------
describe('parseMarkdown — token counting', () => {
  it('attaches a non-zero tokenCount to every snippet', () => {
    const source = [
      '# Overview',
      '',
      'This section contains enough text to produce an info snippet for the test.'
    ].join('\n');

    const snippets = parseMarkdown(source, 'README.md');

    // Guard against a vacuous pass: with zero snippets the loop below would
    // not execute a single assertion and the test would succeed regardless.
    expect(snippets.length).toBeGreaterThan(0);

    for (const s of snippets) {
      expect(s.tokenCount).toBeGreaterThan(0);
    }
  });
});
// ---------------------------------------------------------------------------
// Large content chunking
// ---------------------------------------------------------------------------
describe('parseMarkdown — large content chunking', () => {
  it('splits a very large prose section into multiple snippets', () => {
    // 800 space-separated words, i.e. ~4 000 characters of prose
    // (well above the ~1 800-char window).
    const oversizedProse = Array.from({ length: 800 }, () => 'word').join(' ');
    const markdown = ['# Big Section', '', oversizedProse].join('\n');

    const parsed = parseMarkdown(markdown, 'big.md');
    const proseChunks = parsed.filter((snippet) => snippet.type === 'info');

    expect(proseChunks.length).toBeGreaterThan(1);
  });
});
// ---------------------------------------------------------------------------
// Real-world sample
// ---------------------------------------------------------------------------
describe('parseMarkdown — real-world sample', () => {
  /** Joins individual lines into one newline-separated Markdown document. */
  const doc = (...lines: string[]): string => lines.join('\n');

  it('correctly parses a realistic README excerpt', () => {
    const installBlock = fence('bash', 'npm install my-library');
    const usageBlock = fence(
      'typescript',
      "import { doTheThing } from 'my-library';\n\ndoTheThing({ verbose: true });"
    );
    const readme = doc(
      '# My Library',
      '',
      'A handy library for doing things quickly and efficiently.',
      '',
      '## Installation',
      '',
      'Install via npm using the following command in your project directory:',
      '',
      installBlock,
      '',
      '## Usage',
      '',
      'Import the library and call the main function as shown below:',
      '',
      usageBlock,
      '',
      '## API',
      '',
      '### doTheThing(options)',
      '',
      'Performs the main operation. Options are passed as a plain object.'
    );

    const parsed = parseMarkdown(readme, 'README.md');

    // Both snippet kinds must be represented in the output.
    const hasInfo = parsed.some((snippet) => snippet.type === 'info');
    const hasCode = parsed.some((snippet) => snippet.type === 'code');
    expect(hasInfo).toBe(true);
    expect(hasCode).toBe(true);

    // A three-level heading chain (h1 > h2 > h3) must be reflected in the breadcrumb.
    const deepest = parsed.find((snippet) => snippet.title === 'doTheThing(options)');
    expect(deepest).toBeDefined();
    expect(deepest?.breadcrumb).toBe('My Library > API > doTheThing(options)');
  });
});