feat(TRUEREF-0005): implement document parser and chunker
- Markdown parser with heading-based section splitting and code block extraction
- Code file parser with regex boundary detection for 10+ languages
- Sliding window chunker with configurable token limits and overlap
- Language detection from file extensions

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
92
src/lib/server/parser/chunker.ts
Normal file
92
src/lib/server/parser/chunker.ts
Normal file
@@ -0,0 +1,92 @@
|
|||||||
|
/**
|
||||||
|
* Text chunking utilities for the document parser (TRUEREF-0005).
|
||||||
|
*
|
||||||
|
* Provides sliding-window chunking with overlap and token estimation.
|
||||||
|
*/
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Constants
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
export const MAX_TOKENS = 512;
|
||||||
|
export const OVERLAP_TOKENS = 50;
|
||||||
|
export const MIN_CONTENT_LENGTH = 20; // characters
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Token estimation
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Estimate the token count for a piece of text using a character-based
|
||||||
|
* approximation (~3.5 chars per token on average for mixed prose/code).
|
||||||
|
*/
|
||||||
|
export function estimateTokens(text: string): number {
|
||||||
|
return Math.ceil(text.length / 3.5);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Sliding-window chunker
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Split `text` into overlapping word-based chunks that stay within the token
|
||||||
|
* budget. Returns at least one chunk even when the text fits in a single
|
||||||
|
* window.
|
||||||
|
*/
|
||||||
|
export function chunkText(
|
||||||
|
text: string,
|
||||||
|
maxTokens: number = MAX_TOKENS,
|
||||||
|
overlapTokens: number = OVERLAP_TOKENS
|
||||||
|
): string[] {
|
||||||
|
const words = text.split(/\s+/).filter((w) => w.length > 0);
|
||||||
|
if (words.length === 0) return [];
|
||||||
|
|
||||||
|
// ~0.75 words per token
|
||||||
|
const maxWords = Math.max(1, Math.floor(maxTokens * 0.75));
|
||||||
|
const overlapWords = Math.max(0, Math.floor(overlapTokens * 0.75));
|
||||||
|
|
||||||
|
if (words.length <= maxWords) {
|
||||||
|
return [words.join(' ')];
|
||||||
|
}
|
||||||
|
|
||||||
|
const chunks: string[] = [];
|
||||||
|
let start = 0;
|
||||||
|
|
||||||
|
while (start < words.length) {
|
||||||
|
const end = Math.min(start + maxWords, words.length);
|
||||||
|
chunks.push(words.slice(start, end).join(' '));
|
||||||
|
if (end === words.length) break;
|
||||||
|
start = end - overlapWords;
|
||||||
|
// Guard against infinite loop when overlapWords >= maxWords
|
||||||
|
if (start <= 0) start = end;
|
||||||
|
}
|
||||||
|
|
||||||
|
return chunks;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Line-count sliding window (for code files without recognised boundaries)
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/** Split `lines` into groups of at most `maxLines` with `overlapLines` overlap. */
|
||||||
|
export function chunkLines(
|
||||||
|
lines: string[],
|
||||||
|
maxLines: number = 200,
|
||||||
|
overlapLines: number = 20
|
||||||
|
): string[] {
|
||||||
|
if (lines.length === 0) return [];
|
||||||
|
if (lines.length <= maxLines) return [lines.join('\n')];
|
||||||
|
|
||||||
|
const chunks: string[] = [];
|
||||||
|
let start = 0;
|
||||||
|
|
||||||
|
while (start < lines.length) {
|
||||||
|
const end = Math.min(start + maxLines, lines.length);
|
||||||
|
chunks.push(lines.slice(start, end).join('\n'));
|
||||||
|
if (end === lines.length) break;
|
||||||
|
start = end - overlapLines;
|
||||||
|
if (start <= 0) start = end;
|
||||||
|
}
|
||||||
|
|
||||||
|
return chunks;
|
||||||
|
}
|
||||||
404
src/lib/server/parser/code.parser.test.ts
Normal file
404
src/lib/server/parser/code.parser.test.ts
Normal file
@@ -0,0 +1,404 @@
|
|||||||
|
/**
|
||||||
|
* Unit tests for the code file parser (TRUEREF-0005).
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { describe, it, expect } from 'vitest';
|
||||||
|
import { parseCodeFile, BOUNDARY_PATTERNS } from './code.parser.js';
|
||||||
|
import { estimateTokens, MAX_TOKENS } from './chunker.js';
|
||||||
|
import { parseFile } from './index.js';
|
||||||
|
import type { CrawledFile } from '$lib/server/crawler/types.js';
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Helpers
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
function makeFile(path: string, content: string, language = 'typescript'): CrawledFile {
|
||||||
|
return { path, content, size: content.length, sha: 'abc123', language };
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// TypeScript / JavaScript boundary detection
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
describe('parseCodeFile — TypeScript', () => {
|
||||||
|
it('splits at function boundaries', () => {
|
||||||
|
const content = `
|
||||||
|
export function foo(): string {
|
||||||
|
return 'foo';
|
||||||
|
}
|
||||||
|
|
||||||
|
export function bar(x: number): number {
|
||||||
|
return x * 2;
|
||||||
|
}
|
||||||
|
`.trim();
|
||||||
|
|
||||||
|
const snippets = parseCodeFile(content, 'utils.ts', 'typescript');
|
||||||
|
expect(snippets.length).toBeGreaterThanOrEqual(2);
|
||||||
|
expect(snippets.every((s) => s.type === 'code')).toBe(true);
|
||||||
|
expect(snippets.some((s) => s.content.includes('function foo'))).toBe(true);
|
||||||
|
expect(snippets.some((s) => s.content.includes('function bar'))).toBe(true);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('splits at class boundaries', () => {
|
||||||
|
const content = `
|
||||||
|
export class Greeter {
|
||||||
|
greet(name: string) {
|
||||||
|
return \`Hello, \${name}\`;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
export class Farewell {
|
||||||
|
bye(name: string) {
|
||||||
|
return \`Goodbye, \${name}\`;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
`.trim();
|
||||||
|
|
||||||
|
const snippets = parseCodeFile(content, 'greet.ts', 'typescript');
|
||||||
|
expect(snippets.length).toBeGreaterThanOrEqual(2);
|
||||||
|
expect(snippets.some((s) => s.content.includes('class Greeter'))).toBe(true);
|
||||||
|
expect(snippets.some((s) => s.content.includes('class Farewell'))).toBe(true);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('sets correct metadata on snippets', () => {
|
||||||
|
const content = `
|
||||||
|
export function example(): void {
|
||||||
|
console.log('example function body here');
|
||||||
|
}
|
||||||
|
`.trim();
|
||||||
|
|
||||||
|
const snippets = parseCodeFile(content, 'src/utils.ts', 'typescript');
|
||||||
|
expect(snippets.length).toBeGreaterThanOrEqual(1);
|
||||||
|
const s = snippets[0]!;
|
||||||
|
expect(s.type).toBe('code');
|
||||||
|
expect(s.language).toBe('typescript');
|
||||||
|
expect(s.title).toBe('utils.ts');
|
||||||
|
expect(s.breadcrumb).toBe('src/utils.ts');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('produces at least one snippet from a file with many small declarations', () => {
|
||||||
|
// Each block: a multi-line function — boundary detection fires but chunks are
|
||||||
|
// large enough to survive the MIN_CONTENT_LENGTH filter.
|
||||||
|
const blocks = Array.from(
|
||||||
|
{ length: 10 },
|
||||||
|
(_, i) => `export function helper${i}(x: number): number {\n return x + ${i};\n}`
|
||||||
|
);
|
||||||
|
const content = blocks.join('\n\n');
|
||||||
|
|
||||||
|
const snippets = parseCodeFile(content, 'generated.ts', 'typescript');
|
||||||
|
expect(snippets.length).toBeGreaterThanOrEqual(1);
|
||||||
|
expect(snippets.every((s) => s.type === 'code')).toBe(true);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Python
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
describe('parseCodeFile — Python', () => {
|
||||||
|
it('splits at def and class boundaries', () => {
|
||||||
|
const content = `
|
||||||
|
def greet(name):
|
||||||
|
return f"Hello, {name}"
|
||||||
|
|
||||||
|
class MyClass:
|
||||||
|
def __init__(self):
|
||||||
|
self.value = 0
|
||||||
|
|
||||||
|
def increment(self):
|
||||||
|
self.value += 1
|
||||||
|
|
||||||
|
async def fetch_data(url):
|
||||||
|
return await http.get(url)
|
||||||
|
`.trim();
|
||||||
|
|
||||||
|
const snippets = parseCodeFile(content, 'app.py', 'python');
|
||||||
|
expect(snippets.some((s) => s.content.includes('def greet'))).toBe(true);
|
||||||
|
expect(snippets.some((s) => s.content.includes('class MyClass'))).toBe(true);
|
||||||
|
expect(snippets.some((s) => s.content.includes('async def fetch_data'))).toBe(true);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Go
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
describe('parseCodeFile — Go', () => {
|
||||||
|
it('splits at func boundaries', () => {
|
||||||
|
const content = `
|
||||||
|
package main
|
||||||
|
|
||||||
|
import "fmt"
|
||||||
|
|
||||||
|
func greet(name string) string {
|
||||||
|
return fmt.Sprintf("Hello, %s", name)
|
||||||
|
}
|
||||||
|
|
||||||
|
func main() {
|
||||||
|
fmt.Println(greet("world"))
|
||||||
|
}
|
||||||
|
`.trim();
|
||||||
|
|
||||||
|
const snippets = parseCodeFile(content, 'main.go', 'go');
|
||||||
|
expect(snippets.some((s) => s.content.includes('func greet'))).toBe(true);
|
||||||
|
expect(snippets.some((s) => s.content.includes('func main'))).toBe(true);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Rust
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
describe('parseCodeFile — Rust', () => {
|
||||||
|
it('splits at fn and struct boundaries', () => {
|
||||||
|
const content = `
|
||||||
|
pub struct Config {
|
||||||
|
pub name: String,
|
||||||
|
pub value: u32,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn create_config(name: &str, value: u32) -> Config {
|
||||||
|
Config { name: name.to_string(), value }
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Config {
|
||||||
|
pub fn new() -> Self {
|
||||||
|
Config { name: String::new(), value: 0 }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
`.trim();
|
||||||
|
|
||||||
|
const snippets = parseCodeFile(content, 'config.rs', 'rust');
|
||||||
|
expect(snippets.some((s) => s.content.includes('pub struct Config'))).toBe(true);
|
||||||
|
expect(snippets.some((s) => s.content.includes('pub fn create_config'))).toBe(true);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Ruby
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
describe('parseCodeFile — Ruby', () => {
|
||||||
|
it('splits at def and class boundaries', () => {
|
||||||
|
const content = `
|
||||||
|
class Greeter
|
||||||
|
def initialize(name)
|
||||||
|
@name = name
|
||||||
|
end
|
||||||
|
|
||||||
|
def greet
|
||||||
|
"Hello, #{@name}!"
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
def standalone_helper
|
||||||
|
puts "helper"
|
||||||
|
end
|
||||||
|
`.trim();
|
||||||
|
|
||||||
|
const snippets = parseCodeFile(content, 'greeter.rb', 'ruby');
|
||||||
|
expect(snippets.some((s) => s.content.includes('class Greeter'))).toBe(true);
|
||||||
|
expect(snippets.some((s) => s.content.includes('def standalone_helper'))).toBe(true);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Config / data files
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
describe('parseCodeFile — JSON', () => {
|
||||||
|
it('produces at least one code snippet from a JSON object', () => {
|
||||||
|
const content = JSON.stringify(
|
||||||
|
{
|
||||||
|
name: 'my-package',
|
||||||
|
version: '1.0.0',
|
||||||
|
dependencies: { lodash: '^4.17.21' }
|
||||||
|
},
|
||||||
|
null,
|
||||||
|
2
|
||||||
|
);
|
||||||
|
|
||||||
|
const snippets = parseCodeFile(content, 'package.json', 'json');
|
||||||
|
expect(snippets.length).toBeGreaterThanOrEqual(1);
|
||||||
|
expect(snippets.every((s) => s.type === 'code')).toBe(true);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('parseCodeFile — YAML', () => {
|
||||||
|
it('splits a YAML file at top-level keys', () => {
|
||||||
|
const content = `
|
||||||
|
name: my-project
|
||||||
|
version: 1.0.0
|
||||||
|
scripts:
|
||||||
|
build: tsc
|
||||||
|
test: vitest
|
||||||
|
dependencies:
|
||||||
|
lodash: ^4.17.21
|
||||||
|
`.trim();
|
||||||
|
|
||||||
|
const snippets = parseCodeFile(content, 'config.yaml', 'yaml');
|
||||||
|
expect(snippets.length).toBeGreaterThanOrEqual(1);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// HTML-like files
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
describe('parseCodeFile — HTML', () => {
|
||||||
|
it('extracts script block and text content', () => {
|
||||||
|
const content = `
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head><title>Test Page</title></head>
|
||||||
|
<body>
|
||||||
|
<p>This is the page body content with enough text for an info snippet.</p>
|
||||||
|
<script>
|
||||||
|
function init() {
|
||||||
|
console.log('page loaded and ready for interaction');
|
||||||
|
}
|
||||||
|
</script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
`.trim();
|
||||||
|
|
||||||
|
const snippets = parseCodeFile(content, 'index.html', 'html');
|
||||||
|
expect(snippets.some((s) => s.type === 'code')).toBe(true);
|
||||||
|
expect(snippets.some((s) => s.type === 'info')).toBe(true);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Plain text
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
describe('parseCodeFile — plain text', () => {
|
||||||
|
it('splits on paragraph boundaries', () => {
|
||||||
|
const content = `
|
||||||
|
This is the first paragraph with enough content to pass the minimum length check.
|
||||||
|
|
||||||
|
This is the second paragraph that also has enough content to be included here.
|
||||||
|
`.trim();
|
||||||
|
|
||||||
|
const snippets = parseCodeFile(content, 'notes.txt', 'text');
|
||||||
|
expect(snippets.length).toBeGreaterThanOrEqual(2);
|
||||||
|
expect(snippets.every((s) => s.type === 'info')).toBe(true);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('skips paragraphs shorter than 20 characters', () => {
|
||||||
|
const content = 'Short.\n\nThis is a much longer paragraph that definitely passes the minimum length filter.';
|
||||||
|
const snippets = parseCodeFile(content, 'notes.txt', 'text');
|
||||||
|
expect(snippets.length).toBe(1);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Unknown language fallback
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
describe('parseCodeFile — unknown language', () => {
|
||||||
|
it('falls back to sliding window for unrecognised languages', () => {
|
||||||
|
const lines = Array.from({ length: 50 }, (_, i) => `line ${i}: some code content here`);
|
||||||
|
const content = lines.join('\n');
|
||||||
|
|
||||||
|
const snippets = parseCodeFile(content, 'script.lua', 'lua');
|
||||||
|
expect(snippets.length).toBeGreaterThanOrEqual(1);
|
||||||
|
expect(snippets.every((s) => s.type === 'code')).toBe(true);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Min content filter
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
describe('parseCodeFile — minimum content filter', () => {
|
||||||
|
it('skips segments shorter than 20 characters', () => {
|
||||||
|
const content = `
|
||||||
|
export function realFunction(): string {
|
||||||
|
// A function with enough content to be included in the output snippets.
|
||||||
|
return 'result value from the function that does the operation here';
|
||||||
|
}
|
||||||
|
`.trim();
|
||||||
|
|
||||||
|
const snippets = parseCodeFile(content, 'test.ts', 'typescript');
|
||||||
|
expect(snippets.every((s) => s.content.length >= 20)).toBe(true);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Token count cap
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
describe('parseCodeFile — token count', () => {
|
||||||
|
it('all snippets have tokenCount within MAX_TOKENS', () => {
|
||||||
|
const lines = Array.from({ length: 300 }, (_, i) => `// comment line number ${i} here\nconst x${i} = ${i};`);
|
||||||
|
const content = lines.join('\n');
|
||||||
|
|
||||||
|
const snippets = parseCodeFile(content, 'large.ts', 'typescript');
|
||||||
|
for (const s of snippets) {
|
||||||
|
expect(estimateTokens(s.content)).toBeLessThanOrEqual(MAX_TOKENS + 50); // slight tolerance for boundary chunks
|
||||||
|
}
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// parseFile integration
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
describe('parseFile — integration', () => {
|
||||||
|
const opts = { repositoryId: 'repo-1', documentId: 'doc-1', versionId: 'v1' };
|
||||||
|
|
||||||
|
it('returns NewSnippet records with all required fields for a .ts file', () => {
|
||||||
|
const file = makeFile(
|
||||||
|
'src/utils.ts',
|
||||||
|
`export function add(a: number, b: number): number {\n return a + b;\n}\n`
|
||||||
|
);
|
||||||
|
|
||||||
|
const snippets = parseFile(file, opts);
|
||||||
|
expect(snippets.length).toBeGreaterThanOrEqual(1);
|
||||||
|
|
||||||
|
for (const s of snippets) {
|
||||||
|
expect(s.id).toBeTruthy();
|
||||||
|
expect(s.repositoryId).toBe('repo-1');
|
||||||
|
expect(s.documentId).toBe('doc-1');
|
||||||
|
expect(s.versionId).toBe('v1');
|
||||||
|
expect(s.createdAt).toBeInstanceOf(Date);
|
||||||
|
expect(s.content).toBeTruthy();
|
||||||
|
expect(s.type).toMatch(/^(code|info)$/);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
it('returns NewSnippet records for a .md file', () => {
|
||||||
|
const file = makeFile(
|
||||||
|
'README.md',
|
||||||
|
`# Hello\n\nThis is a long enough paragraph to pass the minimum content length filter.\n`,
|
||||||
|
'markdown'
|
||||||
|
);
|
||||||
|
|
||||||
|
const snippets = parseFile(file, opts);
|
||||||
|
expect(snippets.length).toBeGreaterThanOrEqual(1);
|
||||||
|
expect(snippets[0]?.type).toBe('info');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('uses null for versionId when not provided', () => {
|
||||||
|
const file = makeFile('src/index.ts', `export function noop(): void {}\n`);
|
||||||
|
const snippets = parseFile(file, { repositoryId: 'r', documentId: 'd' });
|
||||||
|
|
||||||
|
// noop is too short; file may return 0 snippets — just verify no error thrown
|
||||||
|
expect(Array.isArray(snippets)).toBe(true);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// BOUNDARY_PATTERNS export
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
describe('BOUNDARY_PATTERNS', () => {
|
||||||
|
it('contains entries for core languages', () => {
|
||||||
|
expect(BOUNDARY_PATTERNS['typescript']).toBeInstanceOf(RegExp);
|
||||||
|
expect(BOUNDARY_PATTERNS['python']).toBeInstanceOf(RegExp);
|
||||||
|
expect(BOUNDARY_PATTERNS['go']).toBeInstanceOf(RegExp);
|
||||||
|
expect(BOUNDARY_PATTERNS['rust']).toBeInstanceOf(RegExp);
|
||||||
|
expect(BOUNDARY_PATTERNS['ruby']).toBeInstanceOf(RegExp);
|
||||||
|
});
|
||||||
|
});
|
||||||
302
src/lib/server/parser/code.parser.ts
Normal file
302
src/lib/server/parser/code.parser.ts
Normal file
@@ -0,0 +1,302 @@
|
|||||||
|
/**
|
||||||
|
* Code file parser for TRUEREF-0005.
|
||||||
|
*
|
||||||
|
* Splits source-code files into function/class-level chunks using
|
||||||
|
* language-specific regex boundary detection. Falls back to a line-count
|
||||||
|
* sliding window for unrecognised languages.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { basename } from 'node:path';
|
||||||
|
import type { NewSnippet } from '$lib/server/db/schema.js';
|
||||||
|
import {
|
||||||
|
estimateTokens,
|
||||||
|
chunkLines,
|
||||||
|
chunkText,
|
||||||
|
MAX_TOKENS,
|
||||||
|
OVERLAP_TOKENS,
|
||||||
|
MIN_CONTENT_LENGTH
|
||||||
|
} from './chunker.js';
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Boundary patterns per language
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Each pattern must match the START of a top-level declaration line.
|
||||||
|
* The regex is tested line-by-line (multiline flag not needed).
|
||||||
|
*/
|
||||||
|
export const BOUNDARY_PATTERNS: Record<string, RegExp> = {
|
||||||
|
typescript: /^(export\s+)?(declare\s+)?(async\s+)?(function|class|interface|type|enum|const|let|var)\s+\w+/,
|
||||||
|
javascript: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,
|
||||||
|
python: /^(async\s+)?(def|class)\s+\w+/,
|
||||||
|
go: /^(func|type|var|const)\s+\w+/,
|
||||||
|
rust: /^(pub(\s*\(crate\))?\s+)?(async\s+)?(fn|impl|struct|enum|trait|type|const|static)\s+\w+/,
|
||||||
|
java: /^(\s*(public|private|protected|static|final|abstract|synchronized)\s+)+[\w<>\[\]]+\s+\w+\s*[({]/,
|
||||||
|
csharp: /^(\s*(public|private|protected|internal|static|override|virtual|abstract|sealed)\s+)+[\w<>\[\]]+\s+\w+\s*[({]/,
|
||||||
|
kotlin: /^(\s*(public|private|protected|internal|override|suspend|inline|open|abstract|sealed)\s+)*(fun|class|object|interface|data class|sealed class|enum class)\s+\w+/,
|
||||||
|
swift: /^(\s*(public|private|internal|fileprivate|open|override|static|final|class)\s+)*(func|class|struct|enum|protocol|extension)\s+\w+/,
|
||||||
|
ruby: /^(def|class|module)\s+\w+/
|
||||||
|
};
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Internal types
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
type RawSnippet = Omit<NewSnippet, 'id' | 'repositoryId' | 'documentId' | 'versionId' | 'createdAt'>;
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Helpers
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Split `content` at lines that match `pattern`, returning the segments
|
||||||
|
* between boundaries (each segment includes its opening boundary line).
|
||||||
|
*/
|
||||||
|
function splitAtBoundaries(content: string, pattern: RegExp): string[] {
|
||||||
|
const lines = content.split('\n');
|
||||||
|
const segments: string[] = [];
|
||||||
|
let current: string[] = [];
|
||||||
|
|
||||||
|
for (const line of lines) {
|
||||||
|
if (pattern.test(line) && current.length > 0) {
|
||||||
|
// Emit what we have, start a new segment from this boundary line
|
||||||
|
segments.push(current.join('\n'));
|
||||||
|
current = [line];
|
||||||
|
} else {
|
||||||
|
current.push(line);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (current.length > 0) {
|
||||||
|
segments.push(current.join('\n'));
|
||||||
|
}
|
||||||
|
|
||||||
|
return segments;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Sliding-window fallback for code
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
function slidingWindowChunks(content: string, filePath: string, language: string): RawSnippet[] {
|
||||||
|
const lines = content.split('\n');
|
||||||
|
const windowedChunks = chunkLines(lines, 200, 20);
|
||||||
|
return windowedChunks
|
||||||
|
.filter((chunk) => chunk.trim().length >= MIN_CONTENT_LENGTH)
|
||||||
|
.map((chunk) => ({
|
||||||
|
type: 'code' as const,
|
||||||
|
title: basename(filePath),
|
||||||
|
content: chunk,
|
||||||
|
language,
|
||||||
|
breadcrumb: filePath,
|
||||||
|
tokenCount: estimateTokens(chunk)
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Config / data file parser (JSON, YAML, TOML)
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Chunk config/data files by splitting on top-level keys.
|
||||||
|
*
|
||||||
|
* Strategy: find lines that look like top-level keys (zero indentation,
|
||||||
|
* followed by colon/equals/brace) and treat each as a boundary.
|
||||||
|
*/
|
||||||
|
function parseConfigFile(content: string, filePath: string, language: string): RawSnippet[] {
|
||||||
|
const topLevelKey = /^[\w"'\-]+\s*[:=\[{]/;
|
||||||
|
const lines = content.split('\n');
|
||||||
|
const segments: string[] = [];
|
||||||
|
let current: string[] = [];
|
||||||
|
|
||||||
|
for (const line of lines) {
|
||||||
|
if (topLevelKey.test(line) && current.length > 0) {
|
||||||
|
segments.push(current.join('\n'));
|
||||||
|
current = [line];
|
||||||
|
} else {
|
||||||
|
current.push(line);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (current.length > 0) segments.push(current.join('\n'));
|
||||||
|
|
||||||
|
// If we got only one segment (no structure detected), fall back to sliding window
|
||||||
|
if (segments.length <= 1) {
|
||||||
|
return slidingWindowChunks(content, filePath, language);
|
||||||
|
}
|
||||||
|
|
||||||
|
return segments
|
||||||
|
.filter((seg) => seg.trim().length >= MIN_CONTENT_LENGTH)
|
||||||
|
.flatMap((seg) => {
|
||||||
|
if (estimateTokens(seg) <= MAX_TOKENS) {
|
||||||
|
return [
|
||||||
|
{
|
||||||
|
type: 'code' as const,
|
||||||
|
title: basename(filePath),
|
||||||
|
content: seg.trim(),
|
||||||
|
language,
|
||||||
|
breadcrumb: filePath,
|
||||||
|
tokenCount: estimateTokens(seg.trim())
|
||||||
|
}
|
||||||
|
];
|
||||||
|
}
|
||||||
|
return slidingWindowChunks(seg, filePath, language);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// HTML / Svelte / Vue parser
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extract script blocks and text content from HTML-like files.
|
||||||
|
*/
|
||||||
|
function parseHtmlLikeFile(content: string, filePath: string, language: string): RawSnippet[] {
|
||||||
|
const snippets: RawSnippet[] = [];
|
||||||
|
const title = basename(filePath);
|
||||||
|
|
||||||
|
// Extract <script> blocks (including <script lang="ts">)
|
||||||
|
const scriptPattern = /<script(?:\s[^>]*)?>[\s\S]*?<\/script>/gi;
|
||||||
|
let match: RegExpExecArray | null;
|
||||||
|
const scriptBlocks: string[] = [];
|
||||||
|
|
||||||
|
while ((match = scriptPattern.exec(content)) !== null) {
|
||||||
|
// Strip the outer tags, keep just the code
|
||||||
|
const inner = match[0].replace(/^<script[^>]*>/, '').replace(/<\/script>$/, '').trim();
|
||||||
|
if (inner.length >= MIN_CONTENT_LENGTH) {
|
||||||
|
scriptBlocks.push(inner);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (const block of scriptBlocks) {
|
||||||
|
if (estimateTokens(block) <= MAX_TOKENS) {
|
||||||
|
snippets.push({
|
||||||
|
type: 'code',
|
||||||
|
title,
|
||||||
|
content: block,
|
||||||
|
language,
|
||||||
|
breadcrumb: filePath,
|
||||||
|
tokenCount: estimateTokens(block)
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
snippets.push(...slidingWindowChunks(block, filePath, language));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Strip tags and extract text content for info snippets
|
||||||
|
const text = content
|
||||||
|
.replace(/<script(?:\s[^>]*)?>[\s\S]*?<\/script>/gi, '')
|
||||||
|
.replace(/<style(?:\s[^>]*)?>[\s\S]*?<\/style>/gi, '')
|
||||||
|
.replace(/<[^>]+>/g, ' ')
|
||||||
|
.replace(/\s{2,}/g, ' ')
|
||||||
|
.trim();
|
||||||
|
|
||||||
|
if (text.length >= MIN_CONTENT_LENGTH) {
|
||||||
|
const chunks = chunkText(text, MAX_TOKENS, OVERLAP_TOKENS);
|
||||||
|
for (const chunk of chunks) {
|
||||||
|
snippets.push({
|
||||||
|
type: 'info',
|
||||||
|
title,
|
||||||
|
content: chunk,
|
||||||
|
language: null,
|
||||||
|
breadcrumb: filePath,
|
||||||
|
tokenCount: estimateTokens(chunk)
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return snippets;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Plain-text / RST parser
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
function parsePlainText(content: string, filePath: string): RawSnippet[] {
|
||||||
|
// Split on blank lines (paragraph boundaries)
|
||||||
|
const paragraphs = content.split(/\n{2,}/).filter((p) => p.trim().length >= MIN_CONTENT_LENGTH);
|
||||||
|
|
||||||
|
if (paragraphs.length === 0) return [];
|
||||||
|
|
||||||
|
const title = basename(filePath);
|
||||||
|
const snippets: RawSnippet[] = [];
|
||||||
|
|
||||||
|
for (const para of paragraphs) {
|
||||||
|
const chunks = chunkText(para.trim(), MAX_TOKENS, OVERLAP_TOKENS);
|
||||||
|
for (const chunk of chunks) {
|
||||||
|
snippets.push({
|
||||||
|
type: 'info',
|
||||||
|
title,
|
||||||
|
content: chunk,
|
||||||
|
language: null,
|
||||||
|
breadcrumb: filePath,
|
||||||
|
tokenCount: estimateTokens(chunk)
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return snippets;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Public parser
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Parse a non-Markdown code or data file into raw snippets.
|
||||||
|
*/
|
||||||
|
export function parseCodeFile(content: string, filePath: string, language: string): RawSnippet[] {
|
||||||
|
// Plain text / RST
|
||||||
|
if (language === 'text') {
|
||||||
|
return parsePlainText(content, filePath);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Config / data files
|
||||||
|
if (['json', 'yaml', 'toml'].includes(language)) {
|
||||||
|
return parseConfigFile(content, filePath, language);
|
||||||
|
}
|
||||||
|
|
||||||
|
// HTML-like files
|
||||||
|
if (['html', 'svelte', 'vue'].includes(language)) {
|
||||||
|
return parseHtmlLikeFile(content, filePath, language);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Normalise csharp alias
|
||||||
|
const normalisedLang = language === 'csharp' ? 'csharp' : language;
|
||||||
|
|
||||||
|
const pattern = BOUNDARY_PATTERNS[normalisedLang];
|
||||||
|
const title = basename(filePath);
|
||||||
|
const breadcrumb = filePath;
|
||||||
|
|
||||||
|
if (!pattern) {
|
||||||
|
// Fallback: line-count sliding window
|
||||||
|
return slidingWindowChunks(content, filePath, language);
|
||||||
|
}
|
||||||
|
|
||||||
|
const segments = splitAtBoundaries(content, pattern);
|
||||||
|
|
||||||
|
// If boundary detection produced only one segment covering the whole file,
|
||||||
|
// it means no boundaries matched — fall back to sliding window.
|
||||||
|
if (segments.length === 1 && !pattern.test(content.split('\n')[0])) {
|
||||||
|
return slidingWindowChunks(content, filePath, language);
|
||||||
|
}
|
||||||
|
|
||||||
|
return segments
|
||||||
|
.filter((seg) => seg.trim().length >= MIN_CONTENT_LENGTH)
|
||||||
|
.flatMap((seg) => {
|
||||||
|
const trimmed = seg.trim();
|
||||||
|
if (estimateTokens(trimmed) <= MAX_TOKENS) {
|
||||||
|
return [
|
||||||
|
{
|
||||||
|
type: 'code' as const,
|
||||||
|
title,
|
||||||
|
content: trimmed,
|
||||||
|
language,
|
||||||
|
breadcrumb,
|
||||||
|
tokenCount: estimateTokens(trimmed)
|
||||||
|
}
|
||||||
|
];
|
||||||
|
}
|
||||||
|
// Chunk oversized segments with sliding window
|
||||||
|
return slidingWindowChunks(trimmed, filePath, language);
|
||||||
|
});
|
||||||
|
}
|
||||||
53
src/lib/server/parser/index.ts
Normal file
53
src/lib/server/parser/index.ts
Normal file
@@ -0,0 +1,53 @@
|
|||||||
|
/**
|
||||||
|
* Document parser entry point for TRUEREF-0005.
|
||||||
|
*
|
||||||
|
* Exposes `parseFile` which transforms a `CrawledFile` into an array of
|
||||||
|
* `NewSnippet` records ready for database insertion.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import type { CrawledFile } from '$lib/server/crawler/types.js';
|
||||||
|
import type { NewSnippet } from '$lib/server/db/schema.js';
|
||||||
|
import { detectLanguage } from './language.js';
|
||||||
|
import { parseMarkdown } from './markdown.parser.js';
|
||||||
|
import { parseCodeFile } from './code.parser.js';
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Public API
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/** Database identifiers attached to every snippet produced by `parseFile`. */
export interface ParseOptions {
	/** Repository the parsed file belongs to. */
	repositoryId: string;
	/** Document record the snippets are derived from. */
	documentId: string;
	/** Optional version; stored as `null` on the snippet when omitted. */
	versionId?: string;
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Parse a crawled file into an array of `NewSnippet` records.
|
||||||
|
*
|
||||||
|
* The language is detected from the file extension. Markdown/MDX files are
|
||||||
|
* split by heading hierarchy; all other files use language-specific boundary
|
||||||
|
* detection or a sliding-window fallback.
|
||||||
|
*/
|
||||||
|
export function parseFile(file: CrawledFile, options: ParseOptions): NewSnippet[] {
|
||||||
|
const language = detectLanguage(file.path);
|
||||||
|
|
||||||
|
const rawSnippets =
|
||||||
|
language === 'markdown'
|
||||||
|
? parseMarkdown(file.content, file.path)
|
||||||
|
: parseCodeFile(file.content, file.path, language);
|
||||||
|
|
||||||
|
return rawSnippets.map((s) => ({
|
||||||
|
...s,
|
||||||
|
id: crypto.randomUUID(),
|
||||||
|
repositoryId: options.repositoryId,
|
||||||
|
documentId: options.documentId,
|
||||||
|
versionId: options.versionId ?? null,
|
||||||
|
createdAt: new Date()
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Re-export helpers for consumers that need them individually
|
||||||
|
export { detectLanguage } from './language.js';
|
||||||
|
export { estimateTokens, chunkText, chunkLines, MAX_TOKENS, OVERLAP_TOKENS, MIN_CONTENT_LENGTH } from './chunker.js';
|
||||||
|
export { parseMarkdown } from './markdown.parser.js';
|
||||||
|
export { parseCodeFile, BOUNDARY_PATTERNS } from './code.parser.js';
|
||||||
56
src/lib/server/parser/language.ts
Normal file
56
src/lib/server/parser/language.ts
Normal file
@@ -0,0 +1,56 @@
|
|||||||
|
/**
|
||||||
|
* Language detection for the document parser (TRUEREF-0005).
|
||||||
|
*
|
||||||
|
* Maps file extensions to canonical language names used throughout the parser.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { extname } from 'node:path';
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Language map
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
export const LANGUAGE_MAP: Record<string, string> = {
|
||||||
|
'.ts': 'typescript',
|
||||||
|
'.tsx': 'typescript',
|
||||||
|
'.js': 'javascript',
|
||||||
|
'.jsx': 'javascript',
|
||||||
|
'.py': 'python',
|
||||||
|
'.rb': 'ruby',
|
||||||
|
'.go': 'go',
|
||||||
|
'.rs': 'rust',
|
||||||
|
'.java': 'java',
|
||||||
|
'.cs': 'csharp',
|
||||||
|
'.cpp': 'cpp',
|
||||||
|
'.c': 'c',
|
||||||
|
'.h': 'c',
|
||||||
|
'.swift': 'swift',
|
||||||
|
'.kt': 'kotlin',
|
||||||
|
'.php': 'php',
|
||||||
|
'.scala': 'scala',
|
||||||
|
'.sh': 'bash',
|
||||||
|
'.bash': 'bash',
|
||||||
|
'.zsh': 'bash',
|
||||||
|
'.md': 'markdown',
|
||||||
|
'.mdx': 'markdown',
|
||||||
|
'.json': 'json',
|
||||||
|
'.yaml': 'yaml',
|
||||||
|
'.yml': 'yaml',
|
||||||
|
'.toml': 'toml',
|
||||||
|
'.html': 'html',
|
||||||
|
'.css': 'css',
|
||||||
|
'.svelte': 'svelte',
|
||||||
|
'.vue': 'vue',
|
||||||
|
'.sql': 'sql',
|
||||||
|
'.txt': 'text',
|
||||||
|
'.rst': 'text'
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Detect the canonical language name from a file path.
|
||||||
|
* Returns 'text' when the extension is unknown.
|
||||||
|
*/
|
||||||
|
export function detectLanguage(filePath: string): string {
|
||||||
|
const ext = extname(filePath).toLowerCase();
|
||||||
|
return LANGUAGE_MAP[ext] ?? 'text';
|
||||||
|
}
|
||||||
272
src/lib/server/parser/markdown.parser.test.ts
Normal file
272
src/lib/server/parser/markdown.parser.test.ts
Normal file
@@ -0,0 +1,272 @@
|
|||||||
|
/**
|
||||||
|
* Unit tests for the Markdown parser (TRUEREF-0005).
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { describe, it, expect } from 'vitest';
|
||||||
|
import { parseMarkdown } from './markdown.parser.js';
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Helpers
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/** Build a fenced code block string without nesting backticks in template literals. */
|
||||||
|
function fence(lang: string, code: string): string {
|
||||||
|
return '```' + lang + '\n' + code + '\n' + '```';
|
||||||
|
}
|
||||||
|
|
||||||
|
function tildeFence(lang: string, code: string): string {
|
||||||
|
return '~~~' + lang + '\n' + code + '\n' + '~~~';
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Basic section splitting
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// Heading-based section splitting: titles, breadcrumb construction, and the
// MIN_CONTENT_LENGTH (20-char) filter.
describe('parseMarkdown — section splitting', () => {
	it('produces no snippets for empty content', () => {
		expect(parseMarkdown('', 'README.md')).toHaveLength(0);
	});

	it('skips content shorter than 20 characters', () => {
		// 'Short.' is below the 20-character minimum, so the section is dropped.
		const result = parseMarkdown('# Title\n\nShort.\n', 'README.md');
		expect(result).toHaveLength(0);
	});

	it('parses a single heading section into an info snippet', () => {
		const source = [
			'# Introduction',
			'',
			'This is a paragraph with enough content to pass the minimum length check.'
		].join('\n');

		const snippets = parseMarkdown(source, 'README.md');
		expect(snippets.length).toBeGreaterThanOrEqual(1);
		const info = snippets.find((s) => s.type === 'info');
		expect(info).toBeDefined();
		expect(info?.title).toBe('Introduction');
		// A single-level heading yields a one-element breadcrumb.
		expect(info?.breadcrumb).toBe('Introduction');
	});

	it('builds correct breadcrumb for nested headings', () => {
		const source = [
			'# Getting Started',
			'',
			'Intro text that is long enough to be included here.',
			'',
			'## Installation',
			'',
			'Install by running the command shown below in your terminal.'
		].join('\n');

		const snippets = parseMarkdown(source, 'README.md');
		const installation = snippets.find((s) => s.title === 'Installation');
		expect(installation).toBeDefined();
		// Breadcrumb joins the heading stack with ' > '.
		expect(installation?.breadcrumb).toBe('Getting Started > Installation');
	});

	it('resets heading stack correctly when headings ascend', () => {
		// H1 → H2 → H1 again: the second H1 must not inherit the earlier H2.
		const source = [
			'# H1',
			'',
			'Some introductory prose that is longer than twenty characters.',
			'',
			'## H2',
			'',
			'More content here, also long enough to pass the threshold check.',
			'',
			'# Second H1',
			'',
			'Content for second top-level heading, long enough to be included.'
		].join('\n');

		const snippets = parseMarkdown(source, 'doc.md');
		const secondH1 = snippets.find((s) => s.title === 'Second H1');
		expect(secondH1).toBeDefined();
		expect(secondH1?.breadcrumb).toBe('Second H1');
	});

	it('falls back to filename when no heading is present', () => {
		const source = 'This is some standalone prose content that is long enough to pass.';
		const snippets = parseMarkdown(source, 'notes.md');
		expect(snippets.length).toBeGreaterThanOrEqual(1);
		// With an empty heading stack the basename becomes the title.
		expect(snippets[0]?.title).toBe('notes.md');
	});
});
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Fenced code block extraction
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// Fenced-code-block extraction: backtick and tilde fences, language tags,
// the 20-char minimum for code blocks, and breadcrumb propagation.
describe('parseMarkdown — code block extraction', () => {
	it('extracts a fenced code block as a code snippet', () => {
		const codeBlock = fence('typescript', 'function hello(name: string): string {\n return `Hello, ${name}!`;\n}');
		const source = [
			'# Example',
			'',
			'Some prose here that is long enough to pass the minimum check.',
			'',
			codeBlock
		].join('\n');

		const snippets = parseMarkdown(source, 'README.md');
		const code = snippets.find((s) => s.type === 'code');
		expect(code).toBeDefined();
		expect(code?.language).toBe('typescript');
		expect(code?.content).toContain('function hello');
	});

	it('extracts multiple code blocks from the same section', () => {
		const bashBlock = fence('bash', 'npm install my-library --save-dev');
		const jsBlock = fence('javascript', "const lib = require('my-lib');\nlib.doSomething();");
		const source = [
			'# Usage',
			'',
			'Description of the usage pattern with enough text here.',
			'',
			bashBlock,
			'',
			'More text in between the two code blocks, just enough.',
			'',
			jsBlock
		].join('\n');

		const snippets = parseMarkdown(source, 'README.md');
		const codeSnippets = snippets.filter((s) => s.type === 'code');
		expect(codeSnippets.length).toBe(2);
		// Each block keeps its own fence language tag.
		const langs = codeSnippets.map((s) => s.language);
		expect(langs).toContain('bash');
		expect(langs).toContain('javascript');
	});

	it('skips code blocks shorter than 20 characters', () => {
		// 'x = 1' is under the minimum, so only the prose snippet survives.
		const shortBlock = fence('', 'x = 1');
		const source = [
			'# Example',
			'',
			'Some prose here that is long enough to pass.',
			'',
			shortBlock
		].join('\n');

		const snippets = parseMarkdown(source, 'README.md');
		expect(snippets.every((s) => s.type === 'info')).toBe(true);
	});

	it('handles tilde-fenced code blocks', () => {
		const pyBlock = tildeFence('python', 'def greet(name):\n return f"Hello, {name}"');
		const source = [
			'# Section',
			'',
			'Long enough prose content for the section to be included here.',
			'',
			pyBlock
		].join('\n');

		const snippets = parseMarkdown(source, 'README.md');
		const code = snippets.find((s) => s.type === 'code');
		expect(code).toBeDefined();
		expect(code?.language).toBe('python');
	});

	it('preserves breadcrumb on code snippets', () => {
		const codeBlock = fence(
			'typescript',
			'function connect(url: string): Promise<void> {\n return Promise.resolve();\n}'
		);
		const source = [
			'# API Reference',
			'',
			'## Methods',
			'',
			'Overview of the methods available in this library.',
			'',
			codeBlock
		].join('\n');

		const snippets = parseMarkdown(source, 'API.md');
		const code = snippets.find((s) => s.type === 'code');
		expect(code).toBeDefined();
		// Code snippets carry the same heading breadcrumb as prose snippets.
		expect(code?.breadcrumb).toBe('API Reference > Methods');
	});
});
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Token counting
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// Every emitted snippet must carry a positive tokenCount estimate.
describe('parseMarkdown — token counting', () => {
	it('attaches a non-zero tokenCount to every snippet', () => {
		const source = [
			'# Overview',
			'',
			'This section contains enough text to produce an info snippet for the test.'
		].join('\n');

		const snippets = parseMarkdown(source, 'README.md');
		for (const s of snippets) {
			expect(s.tokenCount).toBeGreaterThan(0);
		}
	});
});
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Large content chunking
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// Prose sections that exceed the token window must be split into several
// overlapping chunks rather than emitted as one oversized snippet.
describe('parseMarkdown — large content chunking', () => {
	it('splits a very large prose section into multiple snippets', () => {
		// Generate ~4 000 characters of prose (well above the ~1 800-char window)
		const longParagraph = 'word '.repeat(800).trim();
		const source = `# Big Section\n\n${longParagraph}`;

		const snippets = parseMarkdown(source, 'big.md');
		const infoSnippets = snippets.filter((s) => s.type === 'info');
		expect(infoSnippets.length).toBeGreaterThan(1);
	});
});
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Real-world sample
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// End-to-end smoke test over a realistic README: mixed prose/code output
// and a three-level breadcrumb (H1 > H2 > H3).
describe('parseMarkdown — real-world sample', () => {
	it('correctly parses a realistic README excerpt', () => {
		const bashInstall = fence('bash', 'npm install my-library')
		const tsUsage = fence('typescript', "import { doTheThing } from 'my-library';\n\ndoTheThing({ verbose: true });");

		const source = [
			'# My Library',
			'',
			'A handy library for doing things quickly and efficiently.',
			'',
			'## Installation',
			'',
			'Install via npm using the following command in your project directory:',
			'',
			bashInstall,
			'',
			'## Usage',
			'',
			'Import the library and call the main function as shown below:',
			'',
			tsUsage,
			'',
			'## API',
			'',
			'### doTheThing(options)',
			'',
			'Performs the main operation. Options are passed as a plain object.'
		].join('\n');

		const snippets = parseMarkdown(source, 'README.md');

		// Should have both info and code snippets
		expect(snippets.some((s) => s.type === 'info')).toBe(true);
		expect(snippets.some((s) => s.type === 'code')).toBe(true);

		// Breadcrumb depth check
		const apiSnippet = snippets.find((s) => s.title === 'doTheThing(options)');
		expect(apiSnippet).toBeDefined();
		expect(apiSnippet?.breadcrumb).toBe('My Library > API > doTheThing(options)');
	});
});
|
||||||
171
src/lib/server/parser/markdown.parser.ts
Normal file
171
src/lib/server/parser/markdown.parser.ts
Normal file
@@ -0,0 +1,171 @@
|
|||||||
|
/**
|
||||||
|
* Markdown document parser for TRUEREF-0005.
|
||||||
|
*
|
||||||
|
* Splits Markdown/MDX files into heading-based sections and extracts fenced
|
||||||
|
* code blocks as separate code snippets.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { basename } from 'node:path';
|
||||||
|
import type { NewSnippet } from '$lib/server/db/schema.js';
|
||||||
|
import { estimateTokens, chunkText, MAX_TOKENS, OVERLAP_TOKENS, MIN_CONTENT_LENGTH } from './chunker.js';
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Internal types
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/** A fenced code block extracted from a Markdown section. */
interface CodeBlock {
	/** Language tag from the opening fence, lowercased; may be empty. */
	language: string;
	/** Raw code between the fence lines (fences themselves excluded). */
	code: string;
}
|
||||||
|
|
||||||
|
/** One heading-delimited slice of a Markdown document. */
interface MarkdownSection {
	/** Heading stack at this point, e.g. ["Getting Started", "Installation"] */
	headings: string[];
	/** Prose text content (code blocks stripped out) */
	content: string;
	/** Fenced code blocks found within this section */
	codeBlocks: CodeBlock[];
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Section splitting
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
 * Split the full Markdown source into sections delimited by ATX headings
 * (# … ####). Code blocks inside headings are extracted separately.
 *
 * Single-pass line scanner with two pieces of state: the current heading
 * stack (depth-indexed, H1–H4) and an open-fence tracker. Headings inside a
 * fenced block are ignored; an unclosed fence at EOF is demoted to prose.
 */
function splitIntoSections(source: string): MarkdownSection[] {
	const lines = source.split('\n');
	const sections: MarkdownSection[] = [];

	// Heading stack: index 0 = H1, 1 = H2, … (we track up to H4)
	const headingStack: string[] = [];

	// Accumulator for the current section
	let textLines: string[] = [];
	const codeBlocks: CodeBlock[] = [];

	// Fenced-code-block tracking
	let inCodeBlock = false;
	let codeFence = '';
	let codeLanguage = '';
	let codeLines: string[] = [];

	// Emit the accumulated prose + code blocks under the CURRENT heading
	// stack, then reset the accumulators (the stack itself is untouched).
	function flushSection() {
		sections.push({
			headings: [...headingStack],
			content: textLines.join('\n'),
			codeBlocks: [...codeBlocks]
		});
		textLines = [];
		codeBlocks.length = 0;
	}

	for (const line of lines) {
		// ---- Fenced code block handling ----
		if (!inCodeBlock) {
			// Opening fence: 3+ backticks or tildes, optional language word.
			const fenceMatch = line.match(/^(`{3,}|~{3,})([\w-]*)/);
			if (fenceMatch) {
				inCodeBlock = true;
				codeFence = fenceMatch[1].charAt(0).repeat(fenceMatch[1].length);
				codeLanguage = fenceMatch[2].trim().toLowerCase();
				codeLines = [];
				continue;
			}
		} else {
			// Check for closing fence (must be same char and at least same length)
			const closingFence = new RegExp(`^${codeFence[0]}{${codeFence.length},}\\s*$`);
			if (closingFence.test(line)) {
				inCodeBlock = false;
				const code = codeLines.join('\n');
				// Too-short blocks are dropped rather than emitted.
				if (code.trim().length >= MIN_CONTENT_LENGTH) {
					codeBlocks.push({ language: codeLanguage, code });
				}
				codeLines = [];
				continue;
			}
			codeLines.push(line);
			continue;
		}

		// ---- Heading detection (ATX only, H1–H4) ----
		const headingMatch = line.match(/^(#{1,4})\s+(.*)/);
		if (headingMatch) {
			// Emit whatever has accumulated before this heading
			// (flushed BEFORE the stack is updated, so the accumulated
			// content keeps the headings it was written under).
			flushSection();

			const level = headingMatch[1].length; // 1–4
			const title = headingMatch[2].trim();

			// Trim the stack to the depth above this heading and push the new title
			headingStack.splice(level - 1, headingStack.length - (level - 1), title);
			continue;
		}

		// ---- Ordinary prose line ----
		textLines.push(line);
	}

	// Flush any trailing content (unclosed fence treated as prose)
	if (inCodeBlock) {
		// Treat remaining code lines as prose if the fence was never closed
		textLines.push(...codeLines);
	}
	flushSection();

	return sections;
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Public parser
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/** Snippet fields produced by the parser, before DB identifiers and timestamps are attached. */
type RawSnippet = Omit<NewSnippet, 'id' | 'repositoryId' | 'documentId' | 'versionId' | 'createdAt'>;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Parse a Markdown/MDX file into raw snippets (before IDs and DB fields are
|
||||||
|
* attached).
|
||||||
|
*/
|
||||||
|
export function parseMarkdown(content: string, filePath: string): RawSnippet[] {
|
||||||
|
const sections = splitIntoSections(content);
|
||||||
|
const snippets: RawSnippet[] = [];
|
||||||
|
|
||||||
|
for (const section of sections) {
|
||||||
|
const breadcrumb = section.headings.join(' > ') || undefined;
|
||||||
|
const title = section.headings.at(-1) ?? basename(filePath);
|
||||||
|
|
||||||
|
// ---- Info snippet for prose content ----
|
||||||
|
const prose = section.content.trim();
|
||||||
|
if (prose.length >= MIN_CONTENT_LENGTH) {
|
||||||
|
const chunks = chunkText(prose, MAX_TOKENS, OVERLAP_TOKENS);
|
||||||
|
for (const chunk of chunks) {
|
||||||
|
snippets.push({
|
||||||
|
type: 'info',
|
||||||
|
title,
|
||||||
|
content: chunk,
|
||||||
|
breadcrumb: breadcrumb ?? null,
|
||||||
|
language: null,
|
||||||
|
tokenCount: estimateTokens(chunk)
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Code snippets for each fenced code block ----
|
||||||
|
for (const block of section.codeBlocks) {
|
||||||
|
const code = block.code.trim();
|
||||||
|
if (code.length < MIN_CONTENT_LENGTH) continue;
|
||||||
|
|
||||||
|
snippets.push({
|
||||||
|
type: 'code',
|
||||||
|
title,
|
||||||
|
content: code,
|
||||||
|
language: block.language || null,
|
||||||
|
breadcrumb: breadcrumb ?? null,
|
||||||
|
tokenCount: estimateTokens(code)
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return snippets;
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user