feat(TRUEREF-0005): implement document parser and chunker
- Markdown parser with heading-based section splitting and code block extraction - Code file parser with regex boundary detection for 10+ languages - Sliding window chunker with configurable token limits and overlap - Language detection from file extensions Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
92
src/lib/server/parser/chunker.ts
Normal file
92
src/lib/server/parser/chunker.ts
Normal file
@@ -0,0 +1,92 @@
|
||||
/**
|
||||
* Text chunking utilities for the document parser (TRUEREF-0005).
|
||||
*
|
||||
* Provides sliding-window chunking with overlap and token estimation.
|
||||
*/
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Constants
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
export const MAX_TOKENS = 512; // approximate token budget per emitted chunk
export const OVERLAP_TOKENS = 50; // approximate token overlap carried between consecutive chunks
export const MIN_CONTENT_LENGTH = 20; // characters — shorter segments are dropped as noise
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Token estimation
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Estimate the token count for a piece of text using a character-based
|
||||
* approximation (~3.5 chars per token on average for mixed prose/code).
|
||||
*/
|
||||
export function estimateTokens(text: string): number {
|
||||
return Math.ceil(text.length / 3.5);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Sliding-window chunker
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Split `text` into overlapping word-based chunks that stay within the token
|
||||
* budget. Returns at least one chunk even when the text fits in a single
|
||||
* window.
|
||||
*/
|
||||
export function chunkText(
|
||||
text: string,
|
||||
maxTokens: number = MAX_TOKENS,
|
||||
overlapTokens: number = OVERLAP_TOKENS
|
||||
): string[] {
|
||||
const words = text.split(/\s+/).filter((w) => w.length > 0);
|
||||
if (words.length === 0) return [];
|
||||
|
||||
// ~0.75 words per token
|
||||
const maxWords = Math.max(1, Math.floor(maxTokens * 0.75));
|
||||
const overlapWords = Math.max(0, Math.floor(overlapTokens * 0.75));
|
||||
|
||||
if (words.length <= maxWords) {
|
||||
return [words.join(' ')];
|
||||
}
|
||||
|
||||
const chunks: string[] = [];
|
||||
let start = 0;
|
||||
|
||||
while (start < words.length) {
|
||||
const end = Math.min(start + maxWords, words.length);
|
||||
chunks.push(words.slice(start, end).join(' '));
|
||||
if (end === words.length) break;
|
||||
start = end - overlapWords;
|
||||
// Guard against infinite loop when overlapWords >= maxWords
|
||||
if (start <= 0) start = end;
|
||||
}
|
||||
|
||||
return chunks;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Line-count sliding window (for code files without recognised boundaries)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/** Split `lines` into groups of at most `maxLines` with `overlapLines` overlap. */
|
||||
export function chunkLines(
|
||||
lines: string[],
|
||||
maxLines: number = 200,
|
||||
overlapLines: number = 20
|
||||
): string[] {
|
||||
if (lines.length === 0) return [];
|
||||
if (lines.length <= maxLines) return [lines.join('\n')];
|
||||
|
||||
const chunks: string[] = [];
|
||||
let start = 0;
|
||||
|
||||
while (start < lines.length) {
|
||||
const end = Math.min(start + maxLines, lines.length);
|
||||
chunks.push(lines.slice(start, end).join('\n'));
|
||||
if (end === lines.length) break;
|
||||
start = end - overlapLines;
|
||||
if (start <= 0) start = end;
|
||||
}
|
||||
|
||||
return chunks;
|
||||
}
|
||||
404
src/lib/server/parser/code.parser.test.ts
Normal file
404
src/lib/server/parser/code.parser.test.ts
Normal file
@@ -0,0 +1,404 @@
|
||||
/**
|
||||
* Unit tests for the code file parser (TRUEREF-0005).
|
||||
*/
|
||||
|
||||
import { describe, it, expect } from 'vitest';
|
||||
import { parseCodeFile, BOUNDARY_PATTERNS } from './code.parser.js';
|
||||
import { estimateTokens, MAX_TOKENS } from './chunker.js';
|
||||
import { parseFile } from './index.js';
|
||||
import type { CrawledFile } from '$lib/server/crawler/types.js';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
function makeFile(path: string, content: string, language = 'typescript'): CrawledFile {
|
||||
return { path, content, size: content.length, sha: 'abc123', language };
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// TypeScript / JavaScript boundary detection
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// TypeScript boundary detection: exported function/class declarations become
// snippet boundaries; metadata (type/language/title/breadcrumb) is derived
// from the file path passed to parseCodeFile.
describe('parseCodeFile — TypeScript', () => {
	it('splits at function boundaries', () => {
		const content = `
export function foo(): string {
  return 'foo';
}

export function bar(x: number): number {
  return x * 2;
}
`.trim();

		const snippets = parseCodeFile(content, 'utils.ts', 'typescript');
		expect(snippets.length).toBeGreaterThanOrEqual(2);
		expect(snippets.every((s) => s.type === 'code')).toBe(true);
		expect(snippets.some((s) => s.content.includes('function foo'))).toBe(true);
		expect(snippets.some((s) => s.content.includes('function bar'))).toBe(true);
	});

	it('splits at class boundaries', () => {
		const content = `
export class Greeter {
  greet(name: string) {
    return \`Hello, \${name}\`;
  }
}

export class Farewell {
  bye(name: string) {
    return \`Goodbye, \${name}\`;
  }
}
`.trim();

		const snippets = parseCodeFile(content, 'greet.ts', 'typescript');
		expect(snippets.length).toBeGreaterThanOrEqual(2);
		expect(snippets.some((s) => s.content.includes('class Greeter'))).toBe(true);
		expect(snippets.some((s) => s.content.includes('class Farewell'))).toBe(true);
	});

	it('sets correct metadata on snippets', () => {
		const content = `
export function example(): void {
  console.log('example function body here');
}
`.trim();

		const snippets = parseCodeFile(content, 'src/utils.ts', 'typescript');
		expect(snippets.length).toBeGreaterThanOrEqual(1);
		const s = snippets[0]!;
		expect(s.type).toBe('code');
		expect(s.language).toBe('typescript');
		// title is the basename; breadcrumb keeps the full path.
		expect(s.title).toBe('utils.ts');
		expect(s.breadcrumb).toBe('src/utils.ts');
	});

	it('produces at least one snippet from a file with many small declarations', () => {
		// Each block: a multi-line function — boundary detection fires but chunks are
		// large enough to survive the MIN_CONTENT_LENGTH filter.
		const blocks = Array.from(
			{ length: 10 },
			(_, i) => `export function helper${i}(x: number): number {\n  return x + ${i};\n}`
		);
		const content = blocks.join('\n\n');

		const snippets = parseCodeFile(content, 'generated.ts', 'typescript');
		expect(snippets.length).toBeGreaterThanOrEqual(1);
		expect(snippets.every((s) => s.type === 'code')).toBe(true);
	});
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Python
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Per-language boundary detection: each suite feeds a small, idiomatic source
// sample and asserts the expected declarations end up in separate snippets.
describe('parseCodeFile — Python', () => {
	it('splits at def and class boundaries', () => {
		const content = `
def greet(name):
    return f"Hello, {name}"

class MyClass:
    def __init__(self):
        self.value = 0

    def increment(self):
        self.value += 1

async def fetch_data(url):
    return await http.get(url)
`.trim();

		const snippets = parseCodeFile(content, 'app.py', 'python');
		expect(snippets.some((s) => s.content.includes('def greet'))).toBe(true);
		expect(snippets.some((s) => s.content.includes('class MyClass'))).toBe(true);
		expect(snippets.some((s) => s.content.includes('async def fetch_data'))).toBe(true);
	});
});

describe('parseCodeFile — Go', () => {
	it('splits at func boundaries', () => {
		const content = `
package main

import "fmt"

func greet(name string) string {
  return fmt.Sprintf("Hello, %s", name)
}

func main() {
  fmt.Println(greet("world"))
}
`.trim();

		const snippets = parseCodeFile(content, 'main.go', 'go');
		expect(snippets.some((s) => s.content.includes('func greet'))).toBe(true);
		expect(snippets.some((s) => s.content.includes('func main'))).toBe(true);
	});
});

describe('parseCodeFile — Rust', () => {
	it('splits at fn and struct boundaries', () => {
		const content = `
pub struct Config {
    pub name: String,
    pub value: u32,
}

pub fn create_config(name: &str, value: u32) -> Config {
    Config { name: name.to_string(), value }
}

impl Config {
    pub fn new() -> Self {
        Config { name: String::new(), value: 0 }
    }
}
`.trim();

		const snippets = parseCodeFile(content, 'config.rs', 'rust');
		expect(snippets.some((s) => s.content.includes('pub struct Config'))).toBe(true);
		expect(snippets.some((s) => s.content.includes('pub fn create_config'))).toBe(true);
	});
});

describe('parseCodeFile — Ruby', () => {
	it('splits at def and class boundaries', () => {
		const content = `
class Greeter
  def initialize(name)
    @name = name
  end

  def greet
    "Hello, #{@name}!"
  end
end

def standalone_helper
  puts "helper"
end
`.trim();

		const snippets = parseCodeFile(content, 'greeter.rb', 'ruby');
		expect(snippets.some((s) => s.content.includes('class Greeter'))).toBe(true);
		expect(snippets.some((s) => s.content.includes('def standalone_helper'))).toBe(true);
	});
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Config / data files
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Config/data files (JSON, YAML) go through the top-level-key splitter.
describe('parseCodeFile — JSON', () => {
	it('produces at least one code snippet from a JSON object', () => {
		const content = JSON.stringify(
			{
				name: 'my-package',
				version: '1.0.0',
				dependencies: { lodash: '^4.17.21' }
			},
			null,
			2
		);

		const snippets = parseCodeFile(content, 'package.json', 'json');
		expect(snippets.length).toBeGreaterThanOrEqual(1);
		expect(snippets.every((s) => s.type === 'code')).toBe(true);
	});
});

describe('parseCodeFile — YAML', () => {
	it('splits a YAML file at top-level keys', () => {
		const content = `
name: my-project
version: 1.0.0
scripts:
  build: tsc
  test: vitest
dependencies:
  lodash: ^4.17.21
`.trim();

		const snippets = parseCodeFile(content, 'config.yaml', 'yaml');
		expect(snippets.length).toBeGreaterThanOrEqual(1);
	});
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// HTML-like files
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// HTML-like files yield both kinds of snippets: <script> bodies become
// 'code' snippets, the tag-stripped visible text becomes 'info' snippets.
describe('parseCodeFile — HTML', () => {
	it('extracts script block and text content', () => {
		const content = `
<!DOCTYPE html>
<html>
<head><title>Test Page</title></head>
<body>
  <p>This is the page body content with enough text for an info snippet.</p>
  <script>
    function init() {
      console.log('page loaded and ready for interaction');
    }
  </script>
</body>
</html>
`.trim();

		const snippets = parseCodeFile(content, 'index.html', 'html');
		expect(snippets.some((s) => s.type === 'code')).toBe(true);
		expect(snippets.some((s) => s.type === 'info')).toBe(true);
	});
});

// Plain text is split on blank-line paragraph boundaries into 'info' snippets.
describe('parseCodeFile — plain text', () => {
	it('splits on paragraph boundaries', () => {
		const content = `
This is the first paragraph with enough content to pass the minimum length check.

This is the second paragraph that also has enough content to be included here.
`.trim();

		const snippets = parseCodeFile(content, 'notes.txt', 'text');
		expect(snippets.length).toBeGreaterThanOrEqual(2);
		expect(snippets.every((s) => s.type === 'info')).toBe(true);
	});

	it('skips paragraphs shorter than 20 characters', () => {
		// 'Short.' is under MIN_CONTENT_LENGTH and must be dropped.
		const content = 'Short.\n\nThis is a much longer paragraph that definitely passes the minimum length filter.';
		const snippets = parseCodeFile(content, 'notes.txt', 'text');
		expect(snippets.length).toBe(1);
	});
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Unknown language fallback
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Languages without a BOUNDARY_PATTERNS entry use the line-count sliding window.
describe('parseCodeFile — unknown language', () => {
	it('falls back to sliding window for unrecognised languages', () => {
		const lines = Array.from({ length: 50 }, (_, i) => `line ${i}: some code content here`);
		const content = lines.join('\n');

		const snippets = parseCodeFile(content, 'script.lua', 'lua');
		expect(snippets.length).toBeGreaterThanOrEqual(1);
		expect(snippets.every((s) => s.type === 'code')).toBe(true);
	});
});

describe('parseCodeFile — minimum content filter', () => {
	it('skips segments shorter than 20 characters', () => {
		const content = `
export function realFunction(): string {
  // A function with enough content to be included in the output snippets.
  return 'result value from the function that does the operation here';
}
`.trim();

		const snippets = parseCodeFile(content, 'test.ts', 'typescript');
		expect(snippets.every((s) => s.content.length >= 20)).toBe(true);
	});
});

// Oversized segments must be re-chunked so no snippet blows the token budget.
describe('parseCodeFile — token count', () => {
	it('all snippets have tokenCount within MAX_TOKENS', () => {
		const lines = Array.from({ length: 300 }, (_, i) => `// comment line number ${i} here\nconst x${i} = ${i};`);
		const content = lines.join('\n');

		const snippets = parseCodeFile(content, 'large.ts', 'typescript');
		for (const s of snippets) {
			expect(estimateTokens(s.content)).toBeLessThanOrEqual(MAX_TOKENS + 50); // slight tolerance for boundary chunks
		}
	});
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// parseFile integration
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// End-to-end: parseFile detects language from the path and stamps ownership
// fields (id/repositoryId/documentId/versionId/createdAt) onto every snippet.
describe('parseFile — integration', () => {
	const opts = { repositoryId: 'repo-1', documentId: 'doc-1', versionId: 'v1' };

	it('returns NewSnippet records with all required fields for a .ts file', () => {
		const file = makeFile(
			'src/utils.ts',
			`export function add(a: number, b: number): number {\n  return a + b;\n}\n`
		);

		const snippets = parseFile(file, opts);
		expect(snippets.length).toBeGreaterThanOrEqual(1);

		for (const s of snippets) {
			expect(s.id).toBeTruthy();
			expect(s.repositoryId).toBe('repo-1');
			expect(s.documentId).toBe('doc-1');
			expect(s.versionId).toBe('v1');
			expect(s.createdAt).toBeInstanceOf(Date);
			expect(s.content).toBeTruthy();
			expect(s.type).toMatch(/^(code|info)$/);
		}
	});

	it('returns NewSnippet records for a .md file', () => {
		const file = makeFile(
			'README.md',
			`# Hello\n\nThis is a long enough paragraph to pass the minimum content length filter.\n`,
			'markdown'
		);

		const snippets = parseFile(file, opts);
		expect(snippets.length).toBeGreaterThanOrEqual(1);
		expect(snippets[0]?.type).toBe('info');
	});

	it('uses null for versionId when not provided', () => {
		const file = makeFile('src/index.ts', `export function noop(): void {}\n`);
		const snippets = parseFile(file, { repositoryId: 'r', documentId: 'd' });

		// noop is too short; file may return 0 snippets — just verify no error thrown
		expect(Array.isArray(snippets)).toBe(true);
	});
});

// Sanity check on the exported pattern table used by parseCodeFile.
describe('BOUNDARY_PATTERNS', () => {
	it('contains entries for core languages', () => {
		expect(BOUNDARY_PATTERNS['typescript']).toBeInstanceOf(RegExp);
		expect(BOUNDARY_PATTERNS['python']).toBeInstanceOf(RegExp);
		expect(BOUNDARY_PATTERNS['go']).toBeInstanceOf(RegExp);
		expect(BOUNDARY_PATTERNS['rust']).toBeInstanceOf(RegExp);
		expect(BOUNDARY_PATTERNS['ruby']).toBeInstanceOf(RegExp);
	});
});
|
||||
302
src/lib/server/parser/code.parser.ts
Normal file
302
src/lib/server/parser/code.parser.ts
Normal file
@@ -0,0 +1,302 @@
|
||||
/**
|
||||
* Code file parser for TRUEREF-0005.
|
||||
*
|
||||
* Splits source-code files into function/class-level chunks using
|
||||
* language-specific regex boundary detection. Falls back to a line-count
|
||||
* sliding window for unrecognised languages.
|
||||
*/
|
||||
|
||||
import { basename } from 'node:path';
|
||||
import type { NewSnippet } from '$lib/server/db/schema.js';
|
||||
import {
|
||||
estimateTokens,
|
||||
chunkLines,
|
||||
chunkText,
|
||||
MAX_TOKENS,
|
||||
OVERLAP_TOKENS,
|
||||
MIN_CONTENT_LENGTH
|
||||
} from './chunker.js';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Boundary patterns per language
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Each pattern must match the START of a top-level declaration line.
 * The regex is tested line-by-line (multiline flag not needed).
 *
 * Keys are canonical language names; languages without an entry fall back to
 * the line-count sliding window in `parseCodeFile`.
 */
export const BOUNDARY_PATTERNS: Record<string, RegExp> = {
	// TS/JS: optional export/declare/async modifiers before a named declaration.
	typescript: /^(export\s+)?(declare\s+)?(async\s+)?(function|class|interface|type|enum|const|let|var)\s+\w+/,
	javascript: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/,
	// Python: anchored at column 0, so only module-level def/class lines match.
	python: /^(async\s+)?(def|class)\s+\w+/,
	go: /^(func|type|var|const)\s+\w+/,
	rust: /^(pub(\s*\(crate\))?\s+)?(async\s+)?(fn|impl|struct|enum|trait|type|const|static)\s+\w+/,
	// Java/C#: one or more modifiers, then a return type and a name followed by ( or {.
	java: /^(\s*(public|private|protected|static|final|abstract|synchronized)\s+)+[\w<>\[\]]+\s+\w+\s*[({]/,
	csharp: /^(\s*(public|private|protected|internal|static|override|virtual|abstract|sealed)\s+)+[\w<>\[\]]+\s+\w+\s*[({]/,
	kotlin: /^(\s*(public|private|protected|internal|override|suspend|inline|open|abstract|sealed)\s+)*(fun|class|object|interface|data class|sealed class|enum class)\s+\w+/,
	swift: /^(\s*(public|private|internal|fileprivate|open|override|static|final|class)\s+)*(func|class|struct|enum|protocol|extension)\s+\w+/,
	ruby: /^(def|class|module)\s+\w+/
};
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Internal types
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/** Parser output before ownership fields (id/repository/document/version/createdAt) are stamped on by `parseFile`. */
type RawSnippet = Omit<NewSnippet, 'id' | 'repositoryId' | 'documentId' | 'versionId' | 'createdAt'>;
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Split `content` at lines that match `pattern`, returning the segments
|
||||
* between boundaries (each segment includes its opening boundary line).
|
||||
*/
|
||||
function splitAtBoundaries(content: string, pattern: RegExp): string[] {
|
||||
const lines = content.split('\n');
|
||||
const segments: string[] = [];
|
||||
let current: string[] = [];
|
||||
|
||||
for (const line of lines) {
|
||||
if (pattern.test(line) && current.length > 0) {
|
||||
// Emit what we have, start a new segment from this boundary line
|
||||
segments.push(current.join('\n'));
|
||||
current = [line];
|
||||
} else {
|
||||
current.push(line);
|
||||
}
|
||||
}
|
||||
|
||||
if (current.length > 0) {
|
||||
segments.push(current.join('\n'));
|
||||
}
|
||||
|
||||
return segments;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Sliding-window fallback for code
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
function slidingWindowChunks(content: string, filePath: string, language: string): RawSnippet[] {
|
||||
const lines = content.split('\n');
|
||||
const windowedChunks = chunkLines(lines, 200, 20);
|
||||
return windowedChunks
|
||||
.filter((chunk) => chunk.trim().length >= MIN_CONTENT_LENGTH)
|
||||
.map((chunk) => ({
|
||||
type: 'code' as const,
|
||||
title: basename(filePath),
|
||||
content: chunk,
|
||||
language,
|
||||
breadcrumb: filePath,
|
||||
tokenCount: estimateTokens(chunk)
|
||||
}));
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Config / data file parser (JSON, YAML, TOML)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Chunk config/data files by splitting on top-level keys.
|
||||
*
|
||||
* Strategy: find lines that look like top-level keys (zero indentation,
|
||||
* followed by colon/equals/brace) and treat each as a boundary.
|
||||
*/
|
||||
function parseConfigFile(content: string, filePath: string, language: string): RawSnippet[] {
|
||||
const topLevelKey = /^[\w"'\-]+\s*[:=\[{]/;
|
||||
const lines = content.split('\n');
|
||||
const segments: string[] = [];
|
||||
let current: string[] = [];
|
||||
|
||||
for (const line of lines) {
|
||||
if (topLevelKey.test(line) && current.length > 0) {
|
||||
segments.push(current.join('\n'));
|
||||
current = [line];
|
||||
} else {
|
||||
current.push(line);
|
||||
}
|
||||
}
|
||||
if (current.length > 0) segments.push(current.join('\n'));
|
||||
|
||||
// If we got only one segment (no structure detected), fall back to sliding window
|
||||
if (segments.length <= 1) {
|
||||
return slidingWindowChunks(content, filePath, language);
|
||||
}
|
||||
|
||||
return segments
|
||||
.filter((seg) => seg.trim().length >= MIN_CONTENT_LENGTH)
|
||||
.flatMap((seg) => {
|
||||
if (estimateTokens(seg) <= MAX_TOKENS) {
|
||||
return [
|
||||
{
|
||||
type: 'code' as const,
|
||||
title: basename(filePath),
|
||||
content: seg.trim(),
|
||||
language,
|
||||
breadcrumb: filePath,
|
||||
tokenCount: estimateTokens(seg.trim())
|
||||
}
|
||||
];
|
||||
}
|
||||
return slidingWindowChunks(seg, filePath, language);
|
||||
});
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// HTML / Svelte / Vue parser
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Extract script blocks and text content from HTML-like files.
|
||||
*/
|
||||
function parseHtmlLikeFile(content: string, filePath: string, language: string): RawSnippet[] {
|
||||
const snippets: RawSnippet[] = [];
|
||||
const title = basename(filePath);
|
||||
|
||||
// Extract <script> blocks (including <script lang="ts">)
|
||||
const scriptPattern = /<script(?:\s[^>]*)?>[\s\S]*?<\/script>/gi;
|
||||
let match: RegExpExecArray | null;
|
||||
const scriptBlocks: string[] = [];
|
||||
|
||||
while ((match = scriptPattern.exec(content)) !== null) {
|
||||
// Strip the outer tags, keep just the code
|
||||
const inner = match[0].replace(/^<script[^>]*>/, '').replace(/<\/script>$/, '').trim();
|
||||
if (inner.length >= MIN_CONTENT_LENGTH) {
|
||||
scriptBlocks.push(inner);
|
||||
}
|
||||
}
|
||||
|
||||
for (const block of scriptBlocks) {
|
||||
if (estimateTokens(block) <= MAX_TOKENS) {
|
||||
snippets.push({
|
||||
type: 'code',
|
||||
title,
|
||||
content: block,
|
||||
language,
|
||||
breadcrumb: filePath,
|
||||
tokenCount: estimateTokens(block)
|
||||
});
|
||||
} else {
|
||||
snippets.push(...slidingWindowChunks(block, filePath, language));
|
||||
}
|
||||
}
|
||||
|
||||
// Strip tags and extract text content for info snippets
|
||||
const text = content
|
||||
.replace(/<script(?:\s[^>]*)?>[\s\S]*?<\/script>/gi, '')
|
||||
.replace(/<style(?:\s[^>]*)?>[\s\S]*?<\/style>/gi, '')
|
||||
.replace(/<[^>]+>/g, ' ')
|
||||
.replace(/\s{2,}/g, ' ')
|
||||
.trim();
|
||||
|
||||
if (text.length >= MIN_CONTENT_LENGTH) {
|
||||
const chunks = chunkText(text, MAX_TOKENS, OVERLAP_TOKENS);
|
||||
for (const chunk of chunks) {
|
||||
snippets.push({
|
||||
type: 'info',
|
||||
title,
|
||||
content: chunk,
|
||||
language: null,
|
||||
breadcrumb: filePath,
|
||||
tokenCount: estimateTokens(chunk)
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return snippets;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Plain-text / RST parser
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
function parsePlainText(content: string, filePath: string): RawSnippet[] {
|
||||
// Split on blank lines (paragraph boundaries)
|
||||
const paragraphs = content.split(/\n{2,}/).filter((p) => p.trim().length >= MIN_CONTENT_LENGTH);
|
||||
|
||||
if (paragraphs.length === 0) return [];
|
||||
|
||||
const title = basename(filePath);
|
||||
const snippets: RawSnippet[] = [];
|
||||
|
||||
for (const para of paragraphs) {
|
||||
const chunks = chunkText(para.trim(), MAX_TOKENS, OVERLAP_TOKENS);
|
||||
for (const chunk of chunks) {
|
||||
snippets.push({
|
||||
type: 'info',
|
||||
title,
|
||||
content: chunk,
|
||||
language: null,
|
||||
breadcrumb: filePath,
|
||||
tokenCount: estimateTokens(chunk)
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return snippets;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Public parser
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Parse a non-Markdown code or data file into raw snippets.
|
||||
*/
|
||||
export function parseCodeFile(content: string, filePath: string, language: string): RawSnippet[] {
|
||||
// Plain text / RST
|
||||
if (language === 'text') {
|
||||
return parsePlainText(content, filePath);
|
||||
}
|
||||
|
||||
// Config / data files
|
||||
if (['json', 'yaml', 'toml'].includes(language)) {
|
||||
return parseConfigFile(content, filePath, language);
|
||||
}
|
||||
|
||||
// HTML-like files
|
||||
if (['html', 'svelte', 'vue'].includes(language)) {
|
||||
return parseHtmlLikeFile(content, filePath, language);
|
||||
}
|
||||
|
||||
// Normalise csharp alias
|
||||
const normalisedLang = language === 'csharp' ? 'csharp' : language;
|
||||
|
||||
const pattern = BOUNDARY_PATTERNS[normalisedLang];
|
||||
const title = basename(filePath);
|
||||
const breadcrumb = filePath;
|
||||
|
||||
if (!pattern) {
|
||||
// Fallback: line-count sliding window
|
||||
return slidingWindowChunks(content, filePath, language);
|
||||
}
|
||||
|
||||
const segments = splitAtBoundaries(content, pattern);
|
||||
|
||||
// If boundary detection produced only one segment covering the whole file,
|
||||
// it means no boundaries matched — fall back to sliding window.
|
||||
if (segments.length === 1 && !pattern.test(content.split('\n')[0])) {
|
||||
return slidingWindowChunks(content, filePath, language);
|
||||
}
|
||||
|
||||
return segments
|
||||
.filter((seg) => seg.trim().length >= MIN_CONTENT_LENGTH)
|
||||
.flatMap((seg) => {
|
||||
const trimmed = seg.trim();
|
||||
if (estimateTokens(trimmed) <= MAX_TOKENS) {
|
||||
return [
|
||||
{
|
||||
type: 'code' as const,
|
||||
title,
|
||||
content: trimmed,
|
||||
language,
|
||||
breadcrumb,
|
||||
tokenCount: estimateTokens(trimmed)
|
||||
}
|
||||
];
|
||||
}
|
||||
// Chunk oversized segments with sliding window
|
||||
return slidingWindowChunks(trimmed, filePath, language);
|
||||
});
|
||||
}
|
||||
53
src/lib/server/parser/index.ts
Normal file
53
src/lib/server/parser/index.ts
Normal file
@@ -0,0 +1,53 @@
|
||||
/**
|
||||
* Document parser entry point for TRUEREF-0005.
|
||||
*
|
||||
* Exposes `parseFile` which transforms a `CrawledFile` into an array of
|
||||
* `NewSnippet` records ready for database insertion.
|
||||
*/
|
||||
|
||||
import type { CrawledFile } from '$lib/server/crawler/types.js';
|
||||
import type { NewSnippet } from '$lib/server/db/schema.js';
|
||||
import { detectLanguage } from './language.js';
|
||||
import { parseMarkdown } from './markdown.parser.js';
|
||||
import { parseCodeFile } from './code.parser.js';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Public API
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
export interface ParseOptions {
	/** Repository that owns the resulting snippets. */
	repositoryId: string;
	/** Document the snippets were extracted from. */
	documentId: string;
	/** Optional version identifier; stored as null when omitted. */
	versionId?: string;
}
|
||||
|
||||
/**
|
||||
* Parse a crawled file into an array of `NewSnippet` records.
|
||||
*
|
||||
* The language is detected from the file extension. Markdown/MDX files are
|
||||
* split by heading hierarchy; all other files use language-specific boundary
|
||||
* detection or a sliding-window fallback.
|
||||
*/
|
||||
export function parseFile(file: CrawledFile, options: ParseOptions): NewSnippet[] {
|
||||
const language = detectLanguage(file.path);
|
||||
|
||||
const rawSnippets =
|
||||
language === 'markdown'
|
||||
? parseMarkdown(file.content, file.path)
|
||||
: parseCodeFile(file.content, file.path, language);
|
||||
|
||||
return rawSnippets.map((s) => ({
|
||||
...s,
|
||||
id: crypto.randomUUID(),
|
||||
repositoryId: options.repositoryId,
|
||||
documentId: options.documentId,
|
||||
versionId: options.versionId ?? null,
|
||||
createdAt: new Date()
|
||||
}));
|
||||
}
|
||||
|
||||
// Re-export helpers for consumers that need them individually
|
||||
export { detectLanguage } from './language.js';
|
||||
export { estimateTokens, chunkText, chunkLines, MAX_TOKENS, OVERLAP_TOKENS, MIN_CONTENT_LENGTH } from './chunker.js';
|
||||
export { parseMarkdown } from './markdown.parser.js';
|
||||
export { parseCodeFile, BOUNDARY_PATTERNS } from './code.parser.js';
|
||||
56
src/lib/server/parser/language.ts
Normal file
56
src/lib/server/parser/language.ts
Normal file
@@ -0,0 +1,56 @@
|
||||
/**
|
||||
* Language detection for the document parser (TRUEREF-0005).
|
||||
*
|
||||
* Maps file extensions to canonical language names used throughout the parser.
|
||||
*/
|
||||
|
||||
import { extname } from 'node:path';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Language map
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
export const LANGUAGE_MAP: Record<string, string> = {
|
||||
'.ts': 'typescript',
|
||||
'.tsx': 'typescript',
|
||||
'.js': 'javascript',
|
||||
'.jsx': 'javascript',
|
||||
'.py': 'python',
|
||||
'.rb': 'ruby',
|
||||
'.go': 'go',
|
||||
'.rs': 'rust',
|
||||
'.java': 'java',
|
||||
'.cs': 'csharp',
|
||||
'.cpp': 'cpp',
|
||||
'.c': 'c',
|
||||
'.h': 'c',
|
||||
'.swift': 'swift',
|
||||
'.kt': 'kotlin',
|
||||
'.php': 'php',
|
||||
'.scala': 'scala',
|
||||
'.sh': 'bash',
|
||||
'.bash': 'bash',
|
||||
'.zsh': 'bash',
|
||||
'.md': 'markdown',
|
||||
'.mdx': 'markdown',
|
||||
'.json': 'json',
|
||||
'.yaml': 'yaml',
|
||||
'.yml': 'yaml',
|
||||
'.toml': 'toml',
|
||||
'.html': 'html',
|
||||
'.css': 'css',
|
||||
'.svelte': 'svelte',
|
||||
'.vue': 'vue',
|
||||
'.sql': 'sql',
|
||||
'.txt': 'text',
|
||||
'.rst': 'text'
|
||||
};
|
||||
|
||||
/**
|
||||
* Detect the canonical language name from a file path.
|
||||
* Returns 'text' when the extension is unknown.
|
||||
*/
|
||||
export function detectLanguage(filePath: string): string {
|
||||
const ext = extname(filePath).toLowerCase();
|
||||
return LANGUAGE_MAP[ext] ?? 'text';
|
||||
}
|
||||
272
src/lib/server/parser/markdown.parser.test.ts
Normal file
272
src/lib/server/parser/markdown.parser.test.ts
Normal file
@@ -0,0 +1,272 @@
|
||||
/**
|
||||
* Unit tests for the Markdown parser (TRUEREF-0005).
|
||||
*/
|
||||
|
||||
import { describe, it, expect } from 'vitest';
|
||||
import { parseMarkdown } from './markdown.parser.js';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/** Build a fenced code block string without nesting backticks in template literals. */
|
||||
function fence(lang: string, code: string): string {
|
||||
return '```' + lang + '\n' + code + '\n' + '```';
|
||||
}
|
||||
|
||||
function tildeFence(lang: string, code: string): string {
|
||||
return '~~~' + lang + '\n' + code + '\n' + '~~~';
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Basic section splitting
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Covers heading-based sectioning: empty input, the 20-char minimum-length
// filter, breadcrumb construction for nested headings, heading-stack reset
// when a shallower heading follows a deeper one, and the filename fallback.
describe('parseMarkdown — section splitting', () => {
  it('produces no snippets for empty content', () => {
    expect(parseMarkdown('', 'README.md')).toHaveLength(0);
  });

  it('skips content shorter than 20 characters', () => {
    // 'Short.' is under MIN_CONTENT_LENGTH, so the section is dropped.
    const result = parseMarkdown('# Title\n\nShort.\n', 'README.md');
    expect(result).toHaveLength(0);
  });

  it('parses a single heading section into an info snippet', () => {
    const source = [
      '# Introduction',
      '',
      'This is a paragraph with enough content to pass the minimum length check.'
    ].join('\n');

    const snippets = parseMarkdown(source, 'README.md');
    expect(snippets.length).toBeGreaterThanOrEqual(1);
    const info = snippets.find((s) => s.type === 'info');
    expect(info).toBeDefined();
    expect(info?.title).toBe('Introduction');
    expect(info?.breadcrumb).toBe('Introduction');
  });

  it('builds correct breadcrumb for nested headings', () => {
    const source = [
      '# Getting Started',
      '',
      'Intro text that is long enough to be included here.',
      '',
      '## Installation',
      '',
      'Install by running the command shown below in your terminal.'
    ].join('\n');

    const snippets = parseMarkdown(source, 'README.md');
    const installation = snippets.find((s) => s.title === 'Installation');
    expect(installation).toBeDefined();
    expect(installation?.breadcrumb).toBe('Getting Started > Installation');
  });

  it('resets heading stack correctly when headings ascend', () => {
    // A second H1 must clear the previously pushed H2 from the stack.
    const source = [
      '# H1',
      '',
      'Some introductory prose that is longer than twenty characters.',
      '',
      '## H2',
      '',
      'More content here, also long enough to pass the threshold check.',
      '',
      '# Second H1',
      '',
      'Content for second top-level heading, long enough to be included.'
    ].join('\n');

    const snippets = parseMarkdown(source, 'doc.md');
    const secondH1 = snippets.find((s) => s.title === 'Second H1');
    expect(secondH1).toBeDefined();
    expect(secondH1?.breadcrumb).toBe('Second H1');
  });

  it('falls back to filename when no heading is present', () => {
    const source = 'This is some standalone prose content that is long enough to pass.';
    const snippets = parseMarkdown(source, 'notes.md');
    expect(snippets.length).toBeGreaterThanOrEqual(1);
    expect(snippets[0]?.title).toBe('notes.md');
  });
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Fenced code block extraction
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Covers fenced-code-block extraction: single and multiple blocks, the
// minimum-length filter for tiny blocks, tilde fences, and breadcrumb
// propagation onto code snippets.
describe('parseMarkdown — code block extraction', () => {
  it('extracts a fenced code block as a code snippet', () => {
    const codeBlock = fence('typescript', 'function hello(name: string): string {\n return `Hello, ${name}!`;\n}');
    const source = [
      '# Example',
      '',
      'Some prose here that is long enough to pass the minimum check.',
      '',
      codeBlock
    ].join('\n');

    const snippets = parseMarkdown(source, 'README.md');
    const code = snippets.find((s) => s.type === 'code');
    expect(code).toBeDefined();
    expect(code?.language).toBe('typescript');
    expect(code?.content).toContain('function hello');
  });

  it('extracts multiple code blocks from the same section', () => {
    const bashBlock = fence('bash', 'npm install my-library --save-dev');
    const jsBlock = fence('javascript', "const lib = require('my-lib');\nlib.doSomething();");
    const source = [
      '# Usage',
      '',
      'Description of the usage pattern with enough text here.',
      '',
      bashBlock,
      '',
      'More text in between the two code blocks, just enough.',
      '',
      jsBlock
    ].join('\n');

    const snippets = parseMarkdown(source, 'README.md');
    const codeSnippets = snippets.filter((s) => s.type === 'code');
    expect(codeSnippets.length).toBe(2);
    const langs = codeSnippets.map((s) => s.language);
    expect(langs).toContain('bash');
    expect(langs).toContain('javascript');
  });

  it('skips code blocks shorter than 20 characters', () => {
    // 'x = 1' is under MIN_CONTENT_LENGTH, so only info snippets remain.
    const shortBlock = fence('', 'x = 1');
    const source = [
      '# Example',
      '',
      'Some prose here that is long enough to pass.',
      '',
      shortBlock
    ].join('\n');

    const snippets = parseMarkdown(source, 'README.md');
    expect(snippets.every((s) => s.type === 'info')).toBe(true);
  });

  it('handles tilde-fenced code blocks', () => {
    const pyBlock = tildeFence('python', 'def greet(name):\n return f"Hello, {name}"');
    const source = [
      '# Section',
      '',
      'Long enough prose content for the section to be included here.',
      '',
      pyBlock
    ].join('\n');

    const snippets = parseMarkdown(source, 'README.md');
    const code = snippets.find((s) => s.type === 'code');
    expect(code).toBeDefined();
    expect(code?.language).toBe('python');
  });

  it('preserves breadcrumb on code snippets', () => {
    const codeBlock = fence(
      'typescript',
      'function connect(url: string): Promise<void> {\n return Promise.resolve();\n}'
    );
    const source = [
      '# API Reference',
      '',
      '## Methods',
      '',
      'Overview of the methods available in this library.',
      '',
      codeBlock
    ].join('\n');

    const snippets = parseMarkdown(source, 'API.md');
    const code = snippets.find((s) => s.type === 'code');
    expect(code).toBeDefined();
    expect(code?.breadcrumb).toBe('API Reference > Methods');
  });
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Token counting
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Verifies every emitted snippet carries a positive tokenCount estimate.
describe('parseMarkdown — token counting', () => {
  it('attaches a non-zero tokenCount to every snippet', () => {
    const source = [
      '# Overview',
      '',
      'This section contains enough text to produce an info snippet for the test.'
    ].join('\n');

    const snippets = parseMarkdown(source, 'README.md');
    for (const s of snippets) {
      expect(s.tokenCount).toBeGreaterThan(0);
    }
  });
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Large content chunking
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Verifies oversized prose sections are split by the sliding-window chunker
// rather than emitted as a single over-budget snippet.
describe('parseMarkdown — large content chunking', () => {
  it('splits a very large prose section into multiple snippets', () => {
    // Generate ~4 000 characters of prose (well above the ~1 800-char window)
    const longParagraph = 'word '.repeat(800).trim();
    const source = `# Big Section\n\n${longParagraph}`;

    const snippets = parseMarkdown(source, 'big.md');
    const infoSnippets = snippets.filter((s) => s.type === 'info');
    expect(infoSnippets.length).toBeGreaterThan(1);
  });
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Real-world sample
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// End-to-end check against a realistic README: mixed info/code output and a
// three-level breadcrumb (H1 > H2 > H3).
describe('parseMarkdown — real-world sample', () => {
  it('correctly parses a realistic README excerpt', () => {
    const bashInstall = fence('bash', 'npm install my-library');
    const tsUsage = fence('typescript', "import { doTheThing } from 'my-library';\n\ndoTheThing({ verbose: true });");

    const source = [
      '# My Library',
      '',
      'A handy library for doing things quickly and efficiently.',
      '',
      '## Installation',
      '',
      'Install via npm using the following command in your project directory:',
      '',
      bashInstall,
      '',
      '## Usage',
      '',
      'Import the library and call the main function as shown below:',
      '',
      tsUsage,
      '',
      '## API',
      '',
      '### doTheThing(options)',
      '',
      'Performs the main operation. Options are passed as a plain object.'
    ].join('\n');

    const snippets = parseMarkdown(source, 'README.md');

    // Should have both info and code snippets
    expect(snippets.some((s) => s.type === 'info')).toBe(true);
    expect(snippets.some((s) => s.type === 'code')).toBe(true);

    // Breadcrumb depth check
    const apiSnippet = snippets.find((s) => s.title === 'doTheThing(options)');
    expect(apiSnippet).toBeDefined();
    expect(apiSnippet?.breadcrumb).toBe('My Library > API > doTheThing(options)');
  });
});
|
||||
171
src/lib/server/parser/markdown.parser.ts
Normal file
171
src/lib/server/parser/markdown.parser.ts
Normal file
@@ -0,0 +1,171 @@
|
||||
/**
|
||||
* Markdown document parser for TRUEREF-0005.
|
||||
*
|
||||
* Splits Markdown/MDX files into heading-based sections and extracts fenced
|
||||
* code blocks as separate code snippets.
|
||||
*/
|
||||
|
||||
import { basename } from 'node:path';
|
||||
import type { NewSnippet } from '$lib/server/db/schema.js';
|
||||
import { estimateTokens, chunkText, MAX_TOKENS, OVERLAP_TOKENS, MIN_CONTENT_LENGTH } from './chunker.js';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Internal types
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/** A fenced code block extracted from a Markdown section. */
interface CodeBlock {
  /** Language tag from the opening fence, lowercased ('' when absent). */
  language: string;
  /** Raw code content between the fence lines (fences excluded). */
  code: string;
}

/** Intermediate representation of one heading-delimited Markdown section. */
interface MarkdownSection {
  /** Heading stack at this point, e.g. ["Getting Started", "Installation"] */
  headings: string[];
  /** Prose text content (code blocks stripped out) */
  content: string;
  /** Fenced code blocks found within this section */
  codeBlocks: CodeBlock[];
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Section splitting
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Split the full Markdown source into sections delimited by ATX headings
 * (# … ####). Code blocks inside headings are extracted separately.
 *
 * Single-pass line scanner: while inside a fenced code block, lines are
 * buffered into the block; otherwise each line is classified as a fence
 * opener, a heading (which flushes the accumulated section), or prose.
 * Sections with empty content may be emitted; the caller filters them by
 * length.
 */
function splitIntoSections(source: string): MarkdownSection[] {
  const lines = source.split('\n');
  const sections: MarkdownSection[] = [];

  // Heading stack: index 0 = H1, 1 = H2, … (we track up to H4)
  const headingStack: string[] = [];

  // Accumulator for the current section
  let textLines: string[] = [];
  const codeBlocks: CodeBlock[] = [];

  // Fenced-code-block tracking
  let inCodeBlock = false;
  let codeFence = '';
  let codeLanguage = '';
  let codeLines: string[] = [];

  // Push the accumulated prose/code as one section and reset the buffers.
  // Note: codeBlocks is cleared in place (length = 0) because it is const.
  function flushSection() {
    sections.push({
      headings: [...headingStack],
      content: textLines.join('\n'),
      codeBlocks: [...codeBlocks]
    });
    textLines = [];
    codeBlocks.length = 0;
  }

  for (const line of lines) {
    // ---- Fenced code block handling ----
    if (!inCodeBlock) {
      // Opening fence: three-or-more backticks or tildes, optional info string.
      const fenceMatch = line.match(/^(`{3,}|~{3,})([\w-]*)/);
      if (fenceMatch) {
        inCodeBlock = true;
        // Normalize the fence to a run of its first character, same length.
        codeFence = fenceMatch[1].charAt(0).repeat(fenceMatch[1].length);
        codeLanguage = fenceMatch[2].trim().toLowerCase();
        codeLines = [];
        continue;
      }
    } else {
      // Check for closing fence (must be same char and at least same length)
      const closingFence = new RegExp(`^${codeFence[0]}{${codeFence.length},}\\s*$`);
      if (closingFence.test(line)) {
        inCodeBlock = false;
        const code = codeLines.join('\n');
        // Discard trivially short blocks (under MIN_CONTENT_LENGTH chars).
        if (code.trim().length >= MIN_CONTENT_LENGTH) {
          codeBlocks.push({ language: codeLanguage, code });
        }
        codeLines = [];
        continue;
      }
      codeLines.push(line);
      continue;
    }

    // ---- Heading detection (ATX only, H1–H4) ----
    const headingMatch = line.match(/^(#{1,4})\s+(.*)/);
    if (headingMatch) {
      // Emit whatever has accumulated before this heading
      flushSection();

      const level = headingMatch[1].length; // 1–4
      const title = headingMatch[2].trim();

      // Trim the stack to the depth above this heading and push the new title
      headingStack.splice(level - 1, headingStack.length - (level - 1), title);
      continue;
    }

    // ---- Ordinary prose line ----
    textLines.push(line);
  }

  // Flush any trailing content (unclosed fence treated as prose)
  if (inCodeBlock) {
    // Treat remaining code lines as prose if the fence was never closed
    textLines.push(...codeLines);
  }
  flushSection();

  return sections;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Public parser
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
type RawSnippet = Omit<NewSnippet, 'id' | 'repositoryId' | 'documentId' | 'versionId' | 'createdAt'>;
|
||||
|
||||
/**
|
||||
* Parse a Markdown/MDX file into raw snippets (before IDs and DB fields are
|
||||
* attached).
|
||||
*/
|
||||
export function parseMarkdown(content: string, filePath: string): RawSnippet[] {
|
||||
const sections = splitIntoSections(content);
|
||||
const snippets: RawSnippet[] = [];
|
||||
|
||||
for (const section of sections) {
|
||||
const breadcrumb = section.headings.join(' > ') || undefined;
|
||||
const title = section.headings.at(-1) ?? basename(filePath);
|
||||
|
||||
// ---- Info snippet for prose content ----
|
||||
const prose = section.content.trim();
|
||||
if (prose.length >= MIN_CONTENT_LENGTH) {
|
||||
const chunks = chunkText(prose, MAX_TOKENS, OVERLAP_TOKENS);
|
||||
for (const chunk of chunks) {
|
||||
snippets.push({
|
||||
type: 'info',
|
||||
title,
|
||||
content: chunk,
|
||||
breadcrumb: breadcrumb ?? null,
|
||||
language: null,
|
||||
tokenCount: estimateTokens(chunk)
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// ---- Code snippets for each fenced code block ----
|
||||
for (const block of section.codeBlocks) {
|
||||
const code = block.code.trim();
|
||||
if (code.length < MIN_CONTENT_LENGTH) continue;
|
||||
|
||||
snippets.push({
|
||||
type: 'code',
|
||||
title,
|
||||
content: code,
|
||||
language: block.language || null,
|
||||
breadcrumb: breadcrumb ?? null,
|
||||
tokenCount: estimateTokens(code)
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return snippets;
|
||||
}
|
||||
Reference in New Issue
Block a user