feat(TRUEREF-0005): implement document parser and chunker
- Markdown parser with heading-based section splitting and code block extraction
- Code file parser with regex boundary detection for 10+ languages
- Sliding window chunker with configurable token limits and overlap
- Language detection from file extensions

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
272
src/lib/server/parser/markdown.parser.test.ts
Normal file
272
src/lib/server/parser/markdown.parser.test.ts
Normal file
@@ -0,0 +1,272 @@
|
||||
/**
|
||||
* Unit tests for the Markdown parser (TRUEREF-0005).
|
||||
*/
|
||||
|
||||
import { describe, it, expect } from 'vitest';
|
||||
import { parseMarkdown } from './markdown.parser.js';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Build a backtick-fenced code block string.
 *
 * Plain string concatenation is used (rather than a template literal) so the
 * test sources never have to nest backticks inside template literals.
 *
 * @param lang - Info string placed directly after the opening fence; may be empty.
 * @param code - Body of the code block, inserted verbatim.
 * @returns The opening fence line, the code, and the closing fence, joined by newlines.
 */
function fence(lang: string, code: string): string {
  const marker = '```';
  return [marker + lang, code, marker].join('\n');
}
|
||||
|
||||
/**
 * Build a tilde-fenced (`~~~`) code block string.
 *
 * @param lang - Info string placed directly after the opening fence; may be empty.
 * @param code - Body of the code block, inserted verbatim.
 * @returns The opening fence line, the code, and the closing fence, joined by newlines.
 */
function tildeFence(lang: string, code: string): string {
  const marker = '~~~';
  return [marker + lang, code, marker].join('\n');
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Basic section splitting
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Section-splitting tests: empty and too-short input, heading titles,
 * breadcrumb construction for nested and ascending headings, and the
 * filename fallback when the document has no heading.
 */
describe('parseMarkdown — section splitting', () => {
  it('produces no snippets for empty content', () => {
    expect(parseMarkdown('', 'README.md')).toHaveLength(0);
  });

  it('skips content shorter than 20 characters', () => {
    const result = parseMarkdown('# Title\n\nShort.\n', 'README.md');
    expect(result).toHaveLength(0);
  });

  it('parses a single heading section into an info snippet', () => {
    const source = [
      '# Introduction',
      '',
      'This is a paragraph with enough content to pass the minimum length check.'
    ].join('\n');

    const snippets = parseMarkdown(source, 'README.md');
    expect(snippets.length).toBeGreaterThanOrEqual(1);
    const info = snippets.find((s) => s.type === 'info');
    expect(info).toBeDefined();
    expect(info?.title).toBe('Introduction');
    expect(info?.breadcrumb).toBe('Introduction');
  });

  it('builds correct breadcrumb for nested headings', () => {
    const source = [
      '# Getting Started',
      '',
      'Intro text that is long enough to be included here.',
      '',
      '## Installation',
      '',
      'Install by running the command shown below in your terminal.'
    ].join('\n');

    const snippets = parseMarkdown(source, 'README.md');
    const installation = snippets.find((s) => s.title === 'Installation');
    expect(installation).toBeDefined();
    expect(installation?.breadcrumb).toBe('Getting Started > Installation');
  });

  it('resets heading stack correctly when headings ascend', () => {
    const source = [
      '# H1',
      '',
      'Some introductory prose that is longer than twenty characters.',
      '',
      '## H2',
      '',
      'More content here, also long enough to pass the threshold check.',
      '',
      '# Second H1',
      '',
      'Content for second top-level heading, long enough to be included.'
    ].join('\n');

    const snippets = parseMarkdown(source, 'doc.md');
    const secondH1 = snippets.find((s) => s.title === 'Second H1');
    expect(secondH1).toBeDefined();
    // A new top-level heading must not inherit the earlier "H1 > H2" trail.
    expect(secondH1?.breadcrumb).toBe('Second H1');
  });

  it('falls back to filename when no heading is present', () => {
    const source = 'This is some standalone prose content that is long enough to pass.';
    const snippets = parseMarkdown(source, 'notes.md');
    expect(snippets.length).toBeGreaterThanOrEqual(1);
    expect(snippets[0]?.title).toBe('notes.md');
  });
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Fenced code block extraction
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Fenced-code-block tests: backtick and tilde fences, multiple blocks per
 * section, the minimum-length filter for code, and breadcrumb propagation
 * onto code snippets.
 */
describe('parseMarkdown — code block extraction', () => {
  it('extracts a fenced code block as a code snippet', () => {
    const codeBlock = fence('typescript', 'function hello(name: string): string {\n return `Hello, ${name}!`;\n}');
    const source = [
      '# Example',
      '',
      'Some prose here that is long enough to pass the minimum check.',
      '',
      codeBlock
    ].join('\n');

    const snippets = parseMarkdown(source, 'README.md');
    const code = snippets.find((s) => s.type === 'code');
    expect(code).toBeDefined();
    expect(code?.language).toBe('typescript');
    expect(code?.content).toContain('function hello');
  });

  it('extracts multiple code blocks from the same section', () => {
    const bashBlock = fence('bash', 'npm install my-library --save-dev');
    const jsBlock = fence('javascript', "const lib = require('my-lib');\nlib.doSomething();");
    const source = [
      '# Usage',
      '',
      'Description of the usage pattern with enough text here.',
      '',
      bashBlock,
      '',
      'More text in between the two code blocks, just enough.',
      '',
      jsBlock
    ].join('\n');

    const snippets = parseMarkdown(source, 'README.md');
    const codeSnippets = snippets.filter((s) => s.type === 'code');
    expect(codeSnippets.length).toBe(2);
    const langs = codeSnippets.map((s) => s.language);
    expect(langs).toContain('bash');
    expect(langs).toContain('javascript');
  });

  it('skips code blocks shorter than 20 characters', () => {
    const shortBlock = fence('', 'x = 1');
    const source = [
      '# Example',
      '',
      'Some prose here that is long enough to pass.',
      '',
      shortBlock
    ].join('\n');

    const snippets = parseMarkdown(source, 'README.md');
    // Only prose snippets may remain; the tiny code block must be dropped.
    expect(snippets.every((s) => s.type === 'info')).toBe(true);
  });

  it('handles tilde-fenced code blocks', () => {
    const pyBlock = tildeFence('python', 'def greet(name):\n return f"Hello, {name}"');
    const source = [
      '# Section',
      '',
      'Long enough prose content for the section to be included here.',
      '',
      pyBlock
    ].join('\n');

    const snippets = parseMarkdown(source, 'README.md');
    const code = snippets.find((s) => s.type === 'code');
    expect(code).toBeDefined();
    expect(code?.language).toBe('python');
  });

  it('preserves breadcrumb on code snippets', () => {
    const codeBlock = fence(
      'typescript',
      'function connect(url: string): Promise<void> {\n return Promise.resolve();\n}'
    );
    const source = [
      '# API Reference',
      '',
      '## Methods',
      '',
      'Overview of the methods available in this library.',
      '',
      codeBlock
    ].join('\n');

    const snippets = parseMarkdown(source, 'API.md');
    const code = snippets.find((s) => s.type === 'code');
    expect(code).toBeDefined();
    expect(code?.breadcrumb).toBe('API Reference > Methods');
  });
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Token counting
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Token-count test: every snippet returned for a simple heading-plus-prose
 * document must carry a positive `tokenCount`.
 */
describe('parseMarkdown — token counting', () => {
  it('attaches a non-zero tokenCount to every snippet', () => {
    const source = [
      '# Overview',
      '',
      'This section contains enough text to produce an info snippet for the test.'
    ].join('\n');

    const snippets = parseMarkdown(source, 'README.md');
    // Guard against a vacuous pass: the loop below asserts nothing if the
    // parser returns an empty array.
    expect(snippets.length).toBeGreaterThan(0);
    for (const s of snippets) {
      expect(s.tokenCount).toBeGreaterThan(0);
    }
  });
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Large content chunking
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Chunking test: a prose section far larger than one chunk window must be
 * split into more than one info snippet.
 */
describe('parseMarkdown — large content chunking', () => {
  it('splits a very large prose section into multiple snippets', () => {
    // Generate ~4 000 characters of prose (well above the ~1 800-char window)
    const longParagraph = 'word '.repeat(800).trim();
    const source = `# Big Section\n\n${longParagraph}`;

    const snippets = parseMarkdown(source, 'big.md');
    const infoSnippets = snippets.filter((s) => s.type === 'info');
    expect(infoSnippets.length).toBeGreaterThan(1);
  });
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Real-world sample
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * End-to-end test on a README-shaped document that mixes prose, two fenced
 * code blocks, and headings nested three levels deep.
 */
describe('parseMarkdown — real-world sample', () => {
  it('correctly parses a realistic README excerpt', () => {
    const bashInstall = fence('bash', 'npm install my-library');
    const tsUsage = fence('typescript', "import { doTheThing } from 'my-library';\n\ndoTheThing({ verbose: true });");

    const source = [
      '# My Library',
      '',
      'A handy library for doing things quickly and efficiently.',
      '',
      '## Installation',
      '',
      'Install via npm using the following command in your project directory:',
      '',
      bashInstall,
      '',
      '## Usage',
      '',
      'Import the library and call the main function as shown below:',
      '',
      tsUsage,
      '',
      '## API',
      '',
      '### doTheThing(options)',
      '',
      'Performs the main operation. Options are passed as a plain object.'
    ].join('\n');

    const snippets = parseMarkdown(source, 'README.md');

    // Should have both info and code snippets
    expect(snippets.some((s) => s.type === 'info')).toBe(true);
    expect(snippets.some((s) => s.type === 'code')).toBe(true);

    // Breadcrumb depth check
    const apiSnippet = snippets.find((s) => s.title === 'doTheThing(options)');
    expect(apiSnippet).toBeDefined();
    expect(apiSnippet?.breadcrumb).toBe('My Library > API > doTheThing(options)');
  });
});
|
||||
Reference in New Issue
Block a user