diff --git a/src/lib/server/parser/chunker.ts b/src/lib/server/parser/chunker.ts new file mode 100644 index 0000000..10bcc96 --- /dev/null +++ b/src/lib/server/parser/chunker.ts @@ -0,0 +1,92 @@ +/** + * Text chunking utilities for the document parser (TRUEREF-0005). + * + * Provides sliding-window chunking with overlap and token estimation. + */ + +// --------------------------------------------------------------------------- +// Constants +// --------------------------------------------------------------------------- + +export const MAX_TOKENS = 512; +export const OVERLAP_TOKENS = 50; +export const MIN_CONTENT_LENGTH = 20; // characters + +// --------------------------------------------------------------------------- +// Token estimation +// --------------------------------------------------------------------------- + +/** + * Estimate the token count for a piece of text using a character-based + * approximation (~3.5 chars per token on average for mixed prose/code). + */ +export function estimateTokens(text: string): number { + return Math.ceil(text.length / 3.5); +} + +// --------------------------------------------------------------------------- +// Sliding-window chunker +// --------------------------------------------------------------------------- + +/** + * Split `text` into overlapping word-based chunks that stay within the token + * budget. Returns at least one chunk even when the text fits in a single + * window. 
+ */ +export function chunkText( + text: string, + maxTokens: number = MAX_TOKENS, + overlapTokens: number = OVERLAP_TOKENS +): string[] { + const words = text.split(/\s+/).filter((w) => w.length > 0); + if (words.length === 0) return []; + + // ~0.75 words per token + const maxWords = Math.max(1, Math.floor(maxTokens * 0.75)); + const overlapWords = Math.max(0, Math.floor(overlapTokens * 0.75)); + + if (words.length <= maxWords) { + return [words.join(' ')]; + } + + const chunks: string[] = []; + let start = 0; + + while (start < words.length) { + const end = Math.min(start + maxWords, words.length); + chunks.push(words.slice(start, end).join(' ')); + if (end === words.length) break; + start = end - overlapWords; + // Guard against infinite loop when overlapWords >= maxWords + if (start <= 0) start = end; + } + + return chunks; +} + +// --------------------------------------------------------------------------- +// Line-count sliding window (for code files without recognised boundaries) +// --------------------------------------------------------------------------- + +/** Split `lines` into groups of at most `maxLines` with `overlapLines` overlap. */ +export function chunkLines( + lines: string[], + maxLines: number = 200, + overlapLines: number = 20 +): string[] { + if (lines.length === 0) return []; + if (lines.length <= maxLines) return [lines.join('\n')]; + + const chunks: string[] = []; + let start = 0; + + while (start < lines.length) { + const end = Math.min(start + maxLines, lines.length); + chunks.push(lines.slice(start, end).join('\n')); + if (end === lines.length) break; + start = end - overlapLines; + if (start <= 0) start = end; + } + + return chunks; +} diff --git a/src/lib/server/parser/code.parser.test.ts b/src/lib/server/parser/code.parser.test.ts new file mode 100644 index 0000000..c0103ba --- /dev/null +++ b/src/lib/server/parser/code.parser.test.ts @@ -0,0 +1,404 @@ +/** + * Unit tests for the code file parser (TRUEREF-0005). 
 + * NOTE(review): this suite covers parseCodeFile boundary splitting per language, config/HTML/plain-text handling, the MIN_CONTENT_LENGTH filter, the token cap, and parseFile integration. Several fixtures below appear to have lost angle-bracket content in transit (the HTML test's <script>/<body> markup is missing) — TODO verify against the original commit before relying on them.
 + */ + +import { describe, it, expect } from 'vitest'; +import { parseCodeFile, BOUNDARY_PATTERNS } from './code.parser.js'; +import { estimateTokens, MAX_TOKENS } from './chunker.js'; +import { parseFile } from './index.js'; +import type { CrawledFile } from '$lib/server/crawler/types.js'; + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +function makeFile(path: string, content: string, language = 'typescript'): CrawledFile { + return { path, content, size: content.length, sha: 'abc123', language }; +} + +// --------------------------------------------------------------------------- +// TypeScript / JavaScript boundary detection +// --------------------------------------------------------------------------- + +describe('parseCodeFile — TypeScript', () => { + it('splits at function boundaries', () => { + const content = ` +export function foo(): string { + return 'foo'; +} + +export function bar(x: number): number { + return x * 2; +} +`.trim(); + + const snippets = parseCodeFile(content, 'utils.ts', 'typescript'); + expect(snippets.length).toBeGreaterThanOrEqual(2); + expect(snippets.every((s) => s.type === 'code')).toBe(true); + expect(snippets.some((s) => s.content.includes('function foo'))).toBe(true); + expect(snippets.some((s) => s.content.includes('function bar'))).toBe(true); + }); + + it('splits at class boundaries', () => { + const content = ` +export class Greeter { + greet(name: string) { + return \`Hello, \${name}\`; + } +} + +export class Farewell { + bye(name: string) { + return \`Goodbye, \${name}\`; + } +} +`.trim(); + + const snippets = parseCodeFile(content, 'greet.ts', 'typescript'); + expect(snippets.length).toBeGreaterThanOrEqual(2); + expect(snippets.some((s) => s.content.includes('class Greeter'))).toBe(true); + expect(snippets.some((s) => s.content.includes('class Farewell'))).toBe(true); + }); + + it('sets correct 
metadata on snippets', () => { + const content = ` +export function example(): void { + console.log('example function body here'); +} +`.trim(); + + const snippets = parseCodeFile(content, 'src/utils.ts', 'typescript'); + expect(snippets.length).toBeGreaterThanOrEqual(1); + const s = snippets[0]!; + expect(s.type).toBe('code'); + expect(s.language).toBe('typescript'); + expect(s.title).toBe('utils.ts'); + expect(s.breadcrumb).toBe('src/utils.ts'); + }); + + it('produces at least one snippet from a file with many small declarations', () => { + // Each block: a multi-line function — boundary detection fires but chunks are + // large enough to survive the MIN_CONTENT_LENGTH filter. + const blocks = Array.from( + { length: 10 }, + (_, i) => `export function helper${i}(x: number): number {\n  return x + ${i};\n}` + ); + const content = blocks.join('\n\n'); + + const snippets = parseCodeFile(content, 'generated.ts', 'typescript'); + expect(snippets.length).toBeGreaterThanOrEqual(1); + expect(snippets.every((s) => s.type === 'code')).toBe(true); + }); +}); + +// --------------------------------------------------------------------------- +// Python +// --------------------------------------------------------------------------- + +describe('parseCodeFile — Python', () => { + it('splits at def and class boundaries', () => { + const content = ` +def greet(name): + return f"Hello, {name}" + +class MyClass: + def __init__(self): + self.value = 0 + + def increment(self): + self.value += 1 + +async def fetch_data(url): + return await http.get(url) +`.trim(); + + const snippets = parseCodeFile(content, 'app.py', 'python'); + expect(snippets.some((s) => s.content.includes('def greet'))).toBe(true); + expect(snippets.some((s) => s.content.includes('class MyClass'))).toBe(true); + expect(snippets.some((s) => s.content.includes('async def fetch_data'))).toBe(true); + }); +}); + +// --------------------------------------------------------------------------- +// Go +// 
--------------------------------------------------------------------------- + +describe('parseCodeFile — Go', () => { + it('splits at func boundaries', () => { + const content = ` +package main + +import "fmt" + +func greet(name string) string { + return fmt.Sprintf("Hello, %s", name) +} + +func main() { + fmt.Println(greet("world")) +} +`.trim(); + + const snippets = parseCodeFile(content, 'main.go', 'go'); + expect(snippets.some((s) => s.content.includes('func greet'))).toBe(true); + expect(snippets.some((s) => s.content.includes('func main'))).toBe(true); + }); +}); + +// --------------------------------------------------------------------------- +// Rust +// --------------------------------------------------------------------------- + +describe('parseCodeFile — Rust', () => { + it('splits at fn and struct boundaries', () => { + const content = ` +pub struct Config { + pub name: String, + pub value: u32, +} + +pub fn create_config(name: &str, value: u32) -> Config { + Config { name: name.to_string(), value } +} + +impl Config { + pub fn new() -> Self { + Config { name: String::new(), value: 0 } + } +} +`.trim(); + + const snippets = parseCodeFile(content, 'config.rs', 'rust'); + expect(snippets.some((s) => s.content.includes('pub struct Config'))).toBe(true); + expect(snippets.some((s) => s.content.includes('pub fn create_config'))).toBe(true); + }); +}); + +// --------------------------------------------------------------------------- +// Ruby +// --------------------------------------------------------------------------- + +describe('parseCodeFile — Ruby', () => { + it('splits at def and class boundaries', () => { + const content = ` +class Greeter + def initialize(name) + @name = name + end + + def greet + "Hello, #{@name}!" 
 + end +end + +def standalone_helper + puts "helper" +end +`.trim(); + + const snippets = parseCodeFile(content, 'greeter.rb', 'ruby'); + expect(snippets.some((s) => s.content.includes('class Greeter'))).toBe(true); + expect(snippets.some((s) => s.content.includes('def standalone_helper'))).toBe(true); + }); +}); + +// --------------------------------------------------------------------------- +// Config / data files +// --------------------------------------------------------------------------- + +describe('parseCodeFile — JSON', () => { + it('produces at least one code snippet from a JSON object', () => { + const content = JSON.stringify( + { + name: 'my-package', + version: '1.0.0', + dependencies: { lodash: '^4.17.21' } + }, + null, + 2 + ); + + const snippets = parseCodeFile(content, 'package.json', 'json'); + expect(snippets.length).toBeGreaterThanOrEqual(1); + expect(snippets.every((s) => s.type === 'code')).toBe(true); + }); +}); + +describe('parseCodeFile — YAML', () => { + it('splits a YAML file at top-level keys', () => { + const content = ` +name: my-project +version: 1.0.0 +scripts: + build: tsc + test: vitest +dependencies: + lodash: ^4.17.21 +`.trim(); + + const snippets = parseCodeFile(content, 'config.yaml', 'yaml'); + expect(snippets.length).toBeGreaterThanOrEqual(1); + }); +}); + +// --------------------------------------------------------------------------- +// HTML-like files +// --------------------------------------------------------------------------- + +describe('parseCodeFile — HTML', () => { + it('extracts script block and text content', () => { + const content = ` + + +Test Page + 

This is the page body content with enough text for an info snippet.

 + + + +`.trim(); + + const snippets = parseCodeFile(content, 'index.html', 'html'); + expect(snippets.some((s) => s.type === 'code')).toBe(true); + expect(snippets.some((s) => s.type === 'info')).toBe(true); + }); +}); + +// --------------------------------------------------------------------------- +// Plain text +// --------------------------------------------------------------------------- + +describe('parseCodeFile — plain text', () => { + it('splits on paragraph boundaries', () => { + const content = ` +This is the first paragraph with enough content to pass the minimum length check. + +This is the second paragraph that also has enough content to be included here. +`.trim(); + + const snippets = parseCodeFile(content, 'notes.txt', 'text'); + expect(snippets.length).toBeGreaterThanOrEqual(2); + expect(snippets.every((s) => s.type === 'info')).toBe(true); + }); + + it('skips paragraphs shorter than 20 characters', () => { + const content = 'Short.\n\nThis is a much longer paragraph that definitely passes the minimum length filter.'; + const snippets = parseCodeFile(content, 'notes.txt', 'text'); + expect(snippets.length).toBe(1); + }); +}); + +// --------------------------------------------------------------------------- +// Unknown language fallback +// --------------------------------------------------------------------------- + +describe('parseCodeFile — unknown language', () => { + it('falls back to sliding window for unrecognised languages', () => { + const lines = Array.from({ length: 50 }, (_, i) => `line ${i}: some code content here`); + const content = lines.join('\n'); + + const snippets = parseCodeFile(content, 'script.lua', 'lua'); + expect(snippets.length).toBeGreaterThanOrEqual(1); + expect(snippets.every((s) => s.type === 'code')).toBe(true); + }); +}); + +// --------------------------------------------------------------------------- +// Min content filter +// --------------------------------------------------------------------------- + 
 +describe('parseCodeFile — minimum content filter', () => { + it('skips segments shorter than 20 characters', () => { + const content = ` +export function realFunction(): string { + // A function with enough content to be included in the output snippets. + return 'result value from the function that does the operation here'; +} +`.trim(); + + const snippets = parseCodeFile(content, 'test.ts', 'typescript'); + expect(snippets.every((s) => s.content.length >= 20)).toBe(true); + }); +}); + +// --------------------------------------------------------------------------- +// Token count cap +// --------------------------------------------------------------------------- + +describe('parseCodeFile — token count', () => { + it('all snippets have tokenCount within MAX_TOKENS', () => { + const lines = Array.from({ length: 300 }, (_, i) => `// comment line number ${i} here\nconst x${i} = ${i};`); + const content = lines.join('\n'); + + const snippets = parseCodeFile(content, 'large.ts', 'typescript'); + for (const s of snippets) { + expect(estimateTokens(s.content)).toBeLessThanOrEqual(MAX_TOKENS + 50); // slight tolerance for boundary chunks + } + }); +}); + +// --------------------------------------------------------------------------- +// parseFile integration +// --------------------------------------------------------------------------- + +describe('parseFile — integration', () => { + const opts = { repositoryId: 'repo-1', documentId: 'doc-1', versionId: 'v1' }; + + it('returns NewSnippet records with all required fields for a .ts file', () => { + const file = makeFile( + 'src/utils.ts', + `export function add(a: number, b: number): number {\n  return a + b;\n}\n` + ); + + const snippets = parseFile(file, opts); + expect(snippets.length).toBeGreaterThanOrEqual(1); + + for (const s of snippets) { + expect(s.id).toBeTruthy(); + expect(s.repositoryId).toBe('repo-1'); + expect(s.documentId).toBe('doc-1'); + expect(s.versionId).toBe('v1'); + 
expect(s.createdAt).toBeInstanceOf(Date); + expect(s.content).toBeTruthy(); + expect(s.type).toMatch(/^(code|info)$/); + } + }); + + it('returns NewSnippet records for a .md file', () => { + const file = makeFile( + 'README.md', + `# Hello\n\nThis is a long enough paragraph to pass the minimum content length filter.\n`, + 'markdown' + ); + + const snippets = parseFile(file, opts); + expect(snippets.length).toBeGreaterThanOrEqual(1); + expect(snippets[0]?.type).toBe('info'); + }); + + it('uses null for versionId when not provided', () => { + const file = makeFile('src/index.ts', `export function noop(): void {}\n`); + const snippets = parseFile(file, { repositoryId: 'r', documentId: 'd' }); + + // noop is too short; file may return 0 snippets — just verify no error thrown + expect(Array.isArray(snippets)).toBe(true); + }); +}); + +// --------------------------------------------------------------------------- +// BOUNDARY_PATTERNS export +// --------------------------------------------------------------------------- + +describe('BOUNDARY_PATTERNS', () => { + it('contains entries for core languages', () => { + expect(BOUNDARY_PATTERNS['typescript']).toBeInstanceOf(RegExp); + expect(BOUNDARY_PATTERNS['python']).toBeInstanceOf(RegExp); + expect(BOUNDARY_PATTERNS['go']).toBeInstanceOf(RegExp); + expect(BOUNDARY_PATTERNS['rust']).toBeInstanceOf(RegExp); + expect(BOUNDARY_PATTERNS['ruby']).toBeInstanceOf(RegExp); + }); +}); diff --git a/src/lib/server/parser/code.parser.ts b/src/lib/server/parser/code.parser.ts new file mode 100644 index 0000000..d985cac --- /dev/null +++ b/src/lib/server/parser/code.parser.ts @@ -0,0 +1,302 @@ +/** + * Code file parser for TRUEREF-0005. + * + * Splits source-code files into function/class-level chunks using + * language-specific regex boundary detection. Falls back to a line-count + * sliding window for unrecognised languages. 
+ */ + +import { basename } from 'node:path'; +import type { NewSnippet } from '$lib/server/db/schema.js'; +import { + estimateTokens, + chunkLines, + chunkText, + MAX_TOKENS, + OVERLAP_TOKENS, + MIN_CONTENT_LENGTH +} from './chunker.js'; + +// --------------------------------------------------------------------------- +// Boundary patterns per language +// --------------------------------------------------------------------------- + +/** + * Each pattern must match the START of a top-level declaration line. + * The regex is tested line-by-line (multiline flag not needed). + */ +export const BOUNDARY_PATTERNS: Record = { + typescript: /^(export\s+)?(declare\s+)?(async\s+)?(function|class|interface|type|enum|const|let|var)\s+\w+/, + javascript: /^(export\s+)?(async\s+)?(function|class|const|let|var)\s+\w+/, + python: /^(async\s+)?(def|class)\s+\w+/, + go: /^(func|type|var|const)\s+\w+/, + rust: /^(pub(\s*\(crate\))?\s+)?(async\s+)?(fn|impl|struct|enum|trait|type|const|static)\s+\w+/, + java: /^(\s*(public|private|protected|static|final|abstract|synchronized)\s+)+[\w<>\[\]]+\s+\w+\s*[({]/, + csharp: /^(\s*(public|private|protected|internal|static|override|virtual|abstract|sealed)\s+)+[\w<>\[\]]+\s+\w+\s*[({]/, + kotlin: /^(\s*(public|private|protected|internal|override|suspend|inline|open|abstract|sealed)\s+)*(fun|class|object|interface|data class|sealed class|enum class)\s+\w+/, + swift: /^(\s*(public|private|internal|fileprivate|open|override|static|final|class)\s+)*(func|class|struct|enum|protocol|extension)\s+\w+/, + ruby: /^(def|class|module)\s+\w+/ +}; + +// --------------------------------------------------------------------------- +// Internal types +// --------------------------------------------------------------------------- + +type RawSnippet = Omit; + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +/** + * Split `content` at 
lines that match `pattern`, returning the segments + * between boundaries (each segment includes its opening boundary line). + */ +function splitAtBoundaries(content: string, pattern: RegExp): string[] { + const lines = content.split('\n'); + const segments: string[] = []; + let current: string[] = []; + + for (const line of lines) { + if (pattern.test(line) && current.length > 0) { + // Emit what we have, start a new segment from this boundary line + segments.push(current.join('\n')); + current = [line]; + } else { + current.push(line); + } + } + + if (current.length > 0) { + segments.push(current.join('\n')); + } + + return segments; +} + +// --------------------------------------------------------------------------- +// Sliding-window fallback for code +// --------------------------------------------------------------------------- + +function slidingWindowChunks(content: string, filePath: string, language: string): RawSnippet[] { + const lines = content.split('\n'); + const windowedChunks = chunkLines(lines, 200, 20); + return windowedChunks + .filter((chunk) => chunk.trim().length >= MIN_CONTENT_LENGTH) + .map((chunk) => ({ + type: 'code' as const, + title: basename(filePath), + content: chunk, + language, + breadcrumb: filePath, + tokenCount: estimateTokens(chunk) + })); +} + +// --------------------------------------------------------------------------- +// Config / data file parser (JSON, YAML, TOML) +// --------------------------------------------------------------------------- + +/** + * Chunk config/data files by splitting on top-level keys. + * + * Strategy: find lines that look like top-level keys (zero indentation, + * followed by colon/equals/brace) and treat each as a boundary. 
+ */ +function parseConfigFile(content: string, filePath: string, language: string): RawSnippet[] { + const topLevelKey = /^[\w"'\-]+\s*[:=\[{]/; + const lines = content.split('\n'); + const segments: string[] = []; + let current: string[] = []; + + for (const line of lines) { + if (topLevelKey.test(line) && current.length > 0) { + segments.push(current.join('\n')); + current = [line]; + } else { + current.push(line); + } + } + if (current.length > 0) segments.push(current.join('\n')); + + // If we got only one segment (no structure detected), fall back to sliding window + if (segments.length <= 1) { + return slidingWindowChunks(content, filePath, language); + } + + return segments + .filter((seg) => seg.trim().length >= MIN_CONTENT_LENGTH) + .flatMap((seg) => { + if (estimateTokens(seg) <= MAX_TOKENS) { + return [ + { + type: 'code' as const, + title: basename(filePath), + content: seg.trim(), + language, + breadcrumb: filePath, + tokenCount: estimateTokens(seg.trim()) + } + ]; + } + return slidingWindowChunks(seg, filePath, language); + }); +} + +// --------------------------------------------------------------------------- +// HTML / Svelte / Vue parser +// --------------------------------------------------------------------------- + +/** + * Extract script blocks and text content from HTML-like files. + */ +function parseHtmlLikeFile(content: string, filePath: string, language: string): RawSnippet[] { + const snippets: RawSnippet[] = []; + const title = basename(filePath); + + // Extract