Files
trueref/docs/features/TRUEREF-0005.md
2026-03-22 17:08:15 +01:00

8.2 KiB

TRUEREF-0005 — Document Parser & Chunker

Priority: P0 Status: Pending Depends On: TRUEREF-0001 Blocks: TRUEREF-0006, TRUEREF-0007, TRUEREF-0009


Overview

Implement the document parsing and chunking pipeline that transforms raw file contents (from the crawlers) into structured, searchable Snippet records. This is the core intellectual layer of TrueRef — the quality of the chunks directly determines the quality of documentation retrieval.


Acceptance Criteria

  • Parse Markdown files into heading-based sections (info snippets)
  • Extract fenced code blocks from Markdown as separate code snippets
  • Parse standalone code files into function/class-level chunks
  • Respect token limits per chunk (max 512 tokens, with 50-token overlap)
  • Assign breadcrumb paths based on heading hierarchy (Markdown) or file path (code)
  • Detect programming language from file extension
  • Produce both code and info type snippets
  • Calculate approximate token counts using character-based estimation
  • Skip empty or trivially short content (< 20 chars)
  • Unit tests with representative samples of each file type

Supported File Types

Extension Parser Strategy
.md, .mdx Heading-based section splitting + code block extraction
.txt, .rst Paragraph-based splitting
.ts, .tsx, .js, .jsx AST-free: function/class boundary detection via regex
.py def/class boundary detection
.go func/type boundary detection
.rs fn/impl/struct boundary detection
.java, .cs, .kt, .swift Class/method boundary detection
.rb def/class boundary detection
.json, .yaml, .yml, .toml Structural chunking (top-level keys)
.html, .svelte, .vue Text content extraction + script block splitting
Other code Line-count-based sliding window (200 lines per chunk)

Token Counting

Use a simple character-based approximation (no tokenizer library needed for v1):

/**
 * Rough token-count estimate without pulling in a tokenizer library.
 * English prose averages ~4 chars/token and code ~3 (more symbols);
 * 3.5 splits the difference for mixed content.
 */
function estimateTokens(text: string): number {
  const CHARS_PER_TOKEN = 3.5;
  return Math.ceil(text.length / CHARS_PER_TOKEN);
}

Markdown Parser

The Markdown parser is the most important one, since the vast majority of documentation sources are written in Markdown.

Algorithm

  1. Split the file into lines.
  2. Track current heading stack (H1 > H2 > H3 > H4).
  3. When a new heading is encountered, emit the accumulated content as an info snippet.
  4. Fenced code blocks (```) within sections are extracted as separate code snippets.
  5. The breadcrumb is built from the heading stack: "Getting Started > Installation".
/** One heading-delimited section of a Markdown document. */
interface MarkdownSection {
  headings: string[];    // heading stack at this point (H1..H4, outermost first)
  content: string;       // text content (sans code blocks)
  codeBlocks: { language: string; code: string }[];  // fenced blocks; language is the fence info string, may be ''
}

/**
 * Parses a Markdown document into info and code snippets.
 *
 * Each heading-delimited section yields zero or more 'info' snippets
 * (prose, chunked to MAX_TOKENS with OVERLAP_TOKENS overlap) plus one
 * 'code' snippet per fenced code block. Content shorter than 20
 * characters (trimmed) is skipped.
 *
 * @param content  Raw Markdown text.
 * @param filePath Source path; its basename is the title fallback for a
 *                 section with no headings.
 * @returns Snippets in document order.
 */
function parseMarkdown(content: string, filePath: string): Snippet[] {
  const sections = splitIntoSections(content);
  const snippets: Snippet[] = [];

  for (const section of sections) {
    // Breadcrumb mirrors the heading stack, e.g. "Getting Started > Installation".
    const breadcrumb = section.headings.join(' > ');
    const title = section.headings.at(-1) ?? path.basename(filePath);

    // Emit info snippet(s) for the section's prose, split to the token budget.
    if (section.content.trim().length >= 20) {
      const chunks = chunkText(section.content, MAX_TOKENS, OVERLAP_TOKENS);
      for (const chunk of chunks) {
        snippets.push({
          type: 'info',
          title,
          content: chunk,
          breadcrumb,
          tokenCount: estimateTokens(chunk),
        });
      }
    }

    // Emit one code snippet per fenced block.
    for (const block of section.codeBlocks) {
      if (block.code.trim().length >= 20) {
        snippets.push({
          type: 'code',
          title,
          content: block.code,
          // The fence info string ("```ts") is already a language name, not a
          // file extension, so the old detectLanguage('.' + block.language)
          // fallback only ever ran on an empty string and always produced
          // 'text'. Say so directly.
          language: block.language || 'text',
          breadcrumb,
          tokenCount: estimateTokens(block.code),
        });
      }
    }
  }

  return snippets;
}

Code File Parser

For non-Markdown code files, use regex-based function/class boundary detection.

Algorithm

  1. Detect language-specific top-level declaration patterns.
  2. Split the file at those boundaries.
  3. Each chunk: the declaration line(s) + body up to the next declaration.
  4. If a chunk exceeds MAX_TOKENS, apply sliding window splitting with overlap.
/**
 * Top-level declaration-start patterns, keyed by language id (as produced
 * by detectLanguage). Languages without an entry fall back to the
 * sliding-window chunker. All patterns anchor at line start (multiline).
 */
const BOUNDARY_PATTERNS: Record<string, RegExp> = {
  // Optionally-exported, optionally-async function/class/interface/type/const/let/var.
  typescript: /^(export\s+)?(async\s+)?(function|class|interface|type|const|let|var)\s+\w+/m,
  // def / async def / class at column 0 — top-level declarations only.
  python: /^(async\s+)?(def|class)\s+\w+/m,
  go: /^(func|type|var|const)\s+\w+/m,
  // Optional `pub` visibility prefix.
  rust: /^(pub\s+)?(fn|impl|struct|enum|trait)\s+\w+/m,
  // Modifier-prefixed member/type declarations, confirmed by a trailing '(' or '{'.
  java: /^(public|private|protected|static).*?(class|interface|enum|void|\w+)\s+\w+\s*[({]/m,
};

function parseCodeFile(
  content: string,
  filePath: string,
  language: string
): Snippet[] {
  const pattern = BOUNDARY_PATTERNS[language];
  const breadcrumb = filePath;
  const title = path.basename(filePath);

  if (!pattern) {
    // Fallback: sliding window
    return slidingWindowChunks(content, filePath, language);
  }

  const chunks = splitAtBoundaries(content, pattern);
  return chunks
    .filter(chunk => chunk.trim().length >= 20)
    .flatMap(chunk => {
      if (estimateTokens(chunk) <= MAX_TOKENS) {
        return [{
          type: 'code' as const,
          title,
          content: chunk,
          language,
          breadcrumb,
          tokenCount: estimateTokens(chunk),
        }];
      }
      return slidingWindowChunks(chunk, filePath, language);
    });
}

Chunking Constants

const MAX_TOKENS = 512;        // hard cap (estimated tokens) per emitted chunk
const OVERLAP_TOKENS = 50;     // estimated-token overlap between adjacent chunks
const MIN_CONTENT_LENGTH = 20; // characters

Sliding Window Chunker

/**
 * Splits text into word-based chunks of at most `maxTokens` (estimated),
 * carrying ~`overlapTokens` of overlap into each subsequent chunk.
 *
 * @param text          Input text; whitespace-delimited words are the unit.
 * @param maxTokens     Estimated token budget per chunk.
 * @param overlapTokens Estimated token overlap between consecutive chunks.
 * @returns Chunks in order; empty array for blank input.
 */
function chunkText(
  text: string,
  maxTokens: number,
  overlapTokens: number
): string[] {
  // filter(Boolean) drops the empty strings split() yields on blank input
  // or leading/trailing whitespace, so '' no longer produces a [''] chunk.
  const words = text.split(/\s+/).filter(Boolean);
  const wordsPerToken = 0.75; // ~0.75 words per token
  const maxWords = Math.max(1, Math.floor(maxTokens * wordsPerToken));
  const overlapWords = Math.floor(overlapTokens * wordsPerToken);

  const chunks: string[] = [];
  let start = 0;

  while (start < words.length) {
    const end = Math.min(start + maxWords, words.length);
    chunks.push(words.slice(start, end).join(' '));
    if (end === words.length) break;
    // Always advance by at least one word: with overlapWords >= maxWords the
    // original `start = end - overlapWords` never progressed (infinite loop).
    start = Math.max(end - overlapWords, start + 1);
  }

  return chunks;
}

Language Detection

/**
 * File-extension → language-identifier lookup. Keys are lowercase
 * extensions including the leading dot; values match BOUNDARY_PATTERNS
 * keys where boundary parsing is supported.
 */
const LANGUAGE_MAP: Record<string, string> = {
  // Scripting / application languages
  '.ts': 'typescript', '.tsx': 'typescript',
  '.js': 'javascript', '.jsx': 'javascript',
  '.py': 'python',
  '.rb': 'ruby',
  '.php': 'php',
  '.sh': 'bash', '.bash': 'bash', '.zsh': 'bash',
  // Compiled languages
  '.go': 'go',
  '.rs': 'rust',
  '.java': 'java',
  '.cs': 'csharp',
  '.cpp': 'cpp', '.c': 'c', '.h': 'c',
  '.swift': 'swift',
  '.kt': 'kotlin',
  '.scala': 'scala',
  // Markup, config, and data
  '.md': 'markdown', '.mdx': 'markdown',
  '.json': 'json',
  '.yaml': 'yaml', '.yml': 'yaml',
  '.toml': 'toml',
  '.html': 'html',
  '.css': 'css',
  '.svelte': 'svelte',
  '.vue': 'vue',
  '.sql': 'sql',
};

/**
 * Resolves a file path to its language identifier via the extension,
 * case-insensitively. Unknown or missing extensions resolve to 'text'.
 */
function detectLanguage(filePath: string): string {
  const extension = path.extname(filePath).toLowerCase();
  const language = LANGUAGE_MAP[extension];
  return language ?? 'text';
}

Main Entry Point

/** Ownership metadata stamped onto every snippet produced by parseFile. */
export interface ParseOptions {
  repositoryId: string;  // owning repository record id
  documentId: string;    // source document record id
  versionId?: string;    // optional version id; stored as null when absent
}

/**
 * Converts one crawled file into fully-populated snippet records.
 *
 * Dispatches to the Markdown parser for Markdown files and to the code
 * parser otherwise, then stamps each raw snippet with a fresh id,
 * ownership metadata from `options`, and a creation timestamp.
 */
export function parseFile(
  file: CrawledFile,
  options: ParseOptions
): NewSnippet[] {
  const language = detectLanguage(file.path);

  const parsed: Omit<NewSnippet, 'id' | 'repositoryId' | 'documentId' | 'versionId' | 'createdAt'>[] =
    language === 'markdown'
      ? parseMarkdown(file.content, file.path)
      : parseCodeFile(file.content, file.path, language);

  return parsed.map(snippet => ({
    ...snippet,
    id: crypto.randomUUID(),
    repositoryId: options.repositoryId,
    documentId: options.documentId,
    versionId: options.versionId ?? null,
    createdAt: new Date(),
  }));
}

Files to Create

  • src/lib/server/parser/markdown.parser.ts
  • src/lib/server/parser/code.parser.ts
  • src/lib/server/parser/chunker.ts
  • src/lib/server/parser/language.ts
  • src/lib/server/parser/index.ts — exports parseFile
  • src/lib/server/parser/markdown.parser.test.ts
  • src/lib/server/parser/code.parser.test.ts