chore(FEEDBACK-0001): linting
This commit is contained in:
@@ -30,19 +30,19 @@ Implement the document parsing and chunking pipeline that transforms raw file content …
|
||||
|
||||
## Supported File Types
|
||||
|
||||
| Extension | Parser Strategy |
|
||||
|-----------|----------------|
|
||||
| `.md`, `.mdx` | Heading-based section splitting + code block extraction |
|
||||
| `.txt`, `.rst` | Paragraph-based splitting |
|
||||
| `.ts`, `.tsx`, `.js`, `.jsx` | AST-free: function/class boundary detection via regex |
|
||||
| `.py` | `def`/`class` boundary detection |
|
||||
| `.go` | `func`/`type` boundary detection |
|
||||
| `.rs` | `fn`/`impl`/`struct` boundary detection |
|
||||
| `.java`, `.cs`, `.kt`, `.swift` | Class/method boundary detection |
|
||||
| `.rb` | `def`/`class` boundary detection |
|
||||
| `.json`, `.yaml`, `.yml`, `.toml` | Structural chunking (top-level keys) |
|
||||
| `.html`, `.svelte`, `.vue` | Text content extraction + script block splitting |
|
||||
| Other code | Line-count-based sliding window (200 lines per chunk) |
|
||||
| Extension | Parser Strategy |
|
||||
| --------------------------------- | ------------------------------------------------------- |
|
||||
| `.md`, `.mdx` | Heading-based section splitting + code block extraction |
|
||||
| `.txt`, `.rst` | Paragraph-based splitting |
|
||||
| `.ts`, `.tsx`, `.js`, `.jsx` | AST-free: function/class boundary detection via regex |
|
||||
| `.py` | `def`/`class` boundary detection |
|
||||
| `.go` | `func`/`type` boundary detection |
|
||||
| `.rs` | `fn`/`impl`/`struct` boundary detection |
|
||||
| `.java`, `.cs`, `.kt`, `.swift` | Class/method boundary detection |
|
||||
| `.rb` | `def`/`class` boundary detection |
|
||||
| `.json`, `.yaml`, `.yml`, `.toml` | Structural chunking (top-level keys) |
|
||||
| `.html`, `.svelte`, `.vue` | Text content extraction + script block splitting |
|
||||
| Other code | Line-count-based sliding window (200 lines per chunk) |
|
||||
|
||||
---
|
||||
|
||||
@@ -52,9 +52,9 @@ Use a simple character-based approximation (no tokenizer library needed for v1):
|
||||
|
||||
```typescript
|
||||
function estimateTokens(text: string): number {
|
||||
// Empirically: ~4 chars per token for English prose
|
||||
// ~3 chars per token for code (more symbols)
|
||||
return Math.ceil(text.length / 3.5);
|
||||
// Empirically: ~4 chars per token for English prose
|
||||
// ~3 chars per token for code (more symbols)
|
||||
return Math.ceil(text.length / 3.5);
|
||||
}
|
||||
```
|
||||
|
||||
@@ -74,49 +74,49 @@ The Markdown parser is the most important parser as most documentation is Markdown
|
||||
|
||||
```typescript
|
||||
interface MarkdownSection {
|
||||
headings: string[]; // heading stack at this point
|
||||
content: string; // text content (sans code blocks)
|
||||
codeBlocks: { language: string; code: string }[];
|
||||
headings: string[]; // heading stack at this point
|
||||
content: string; // text content (sans code blocks)
|
||||
codeBlocks: { language: string; code: string }[];
|
||||
}
|
||||
|
||||
function parseMarkdown(content: string, filePath: string): Snippet[] {
|
||||
const sections = splitIntoSections(content);
|
||||
const snippets: Snippet[] = [];
|
||||
const sections = splitIntoSections(content);
|
||||
const snippets: Snippet[] = [];
|
||||
|
||||
for (const section of sections) {
|
||||
const breadcrumb = section.headings.join(' > ');
|
||||
const title = section.headings.at(-1) ?? path.basename(filePath);
|
||||
for (const section of sections) {
|
||||
const breadcrumb = section.headings.join(' > ');
|
||||
const title = section.headings.at(-1) ?? path.basename(filePath);
|
||||
|
||||
// Emit info snippet for text content
|
||||
if (section.content.trim().length >= 20) {
|
||||
const chunks = chunkText(section.content, MAX_TOKENS, OVERLAP_TOKENS);
|
||||
for (const chunk of chunks) {
|
||||
snippets.push({
|
||||
type: 'info',
|
||||
title,
|
||||
content: chunk,
|
||||
breadcrumb,
|
||||
tokenCount: estimateTokens(chunk),
|
||||
});
|
||||
}
|
||||
}
|
||||
// Emit info snippet for text content
|
||||
if (section.content.trim().length >= 20) {
|
||||
const chunks = chunkText(section.content, MAX_TOKENS, OVERLAP_TOKENS);
|
||||
for (const chunk of chunks) {
|
||||
snippets.push({
|
||||
type: 'info',
|
||||
title,
|
||||
content: chunk,
|
||||
breadcrumb,
|
||||
tokenCount: estimateTokens(chunk)
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Emit code snippets for each code block
|
||||
for (const block of section.codeBlocks) {
|
||||
if (block.code.trim().length >= 20) {
|
||||
snippets.push({
|
||||
type: 'code',
|
||||
title,
|
||||
content: block.code,
|
||||
language: block.language || detectLanguage('.' + block.language),
|
||||
breadcrumb,
|
||||
tokenCount: estimateTokens(block.code),
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
// Emit code snippets for each code block
|
||||
for (const block of section.codeBlocks) {
|
||||
if (block.code.trim().length >= 20) {
|
||||
snippets.push({
|
||||
type: 'code',
|
||||
title,
|
||||
content: block.code,
|
||||
language: block.language || detectLanguage('.' + block.language),
|
||||
breadcrumb,
|
||||
tokenCount: estimateTokens(block.code)
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return snippets;
|
||||
return snippets;
|
||||
}
|
||||
```
|
||||
|
||||
@@ -135,43 +135,41 @@ For non-Markdown code files, use regex-based function/class boundary detection.
|
||||
|
||||
```typescript
|
||||
const BOUNDARY_PATTERNS: Record<string, RegExp> = {
|
||||
typescript: /^(export\s+)?(async\s+)?(function|class|interface|type|const|let|var)\s+\w+/m,
|
||||
python: /^(async\s+)?(def|class)\s+\w+/m,
|
||||
go: /^(func|type|var|const)\s+\w+/m,
|
||||
rust: /^(pub\s+)?(fn|impl|struct|enum|trait)\s+\w+/m,
|
||||
java: /^(public|private|protected|static).*?(class|interface|enum|void|\w+)\s+\w+\s*[({]/m,
|
||||
typescript: /^(export\s+)?(async\s+)?(function|class|interface|type|const|let|var)\s+\w+/m,
|
||||
python: /^(async\s+)?(def|class)\s+\w+/m,
|
||||
go: /^(func|type|var|const)\s+\w+/m,
|
||||
rust: /^(pub\s+)?(fn|impl|struct|enum|trait)\s+\w+/m,
|
||||
java: /^(public|private|protected|static).*?(class|interface|enum|void|\w+)\s+\w+\s*[({]/m
|
||||
};
|
||||
|
||||
function parseCodeFile(
|
||||
content: string,
|
||||
filePath: string,
|
||||
language: string
|
||||
): Snippet[] {
|
||||
const pattern = BOUNDARY_PATTERNS[language];
|
||||
const breadcrumb = filePath;
|
||||
const title = path.basename(filePath);
|
||||
function parseCodeFile(content: string, filePath: string, language: string): Snippet[] {
|
||||
const pattern = BOUNDARY_PATTERNS[language];
|
||||
const breadcrumb = filePath;
|
||||
const title = path.basename(filePath);
|
||||
|
||||
if (!pattern) {
|
||||
// Fallback: sliding window
|
||||
return slidingWindowChunks(content, filePath, language);
|
||||
}
|
||||
if (!pattern) {
|
||||
// Fallback: sliding window
|
||||
return slidingWindowChunks(content, filePath, language);
|
||||
}
|
||||
|
||||
const chunks = splitAtBoundaries(content, pattern);
|
||||
return chunks
|
||||
.filter(chunk => chunk.trim().length >= 20)
|
||||
.flatMap(chunk => {
|
||||
if (estimateTokens(chunk) <= MAX_TOKENS) {
|
||||
return [{
|
||||
type: 'code' as const,
|
||||
title,
|
||||
content: chunk,
|
||||
language,
|
||||
breadcrumb,
|
||||
tokenCount: estimateTokens(chunk),
|
||||
}];
|
||||
}
|
||||
return slidingWindowChunks(chunk, filePath, language);
|
||||
});
|
||||
const chunks = splitAtBoundaries(content, pattern);
|
||||
return chunks
|
||||
.filter((chunk) => chunk.trim().length >= 20)
|
||||
.flatMap((chunk) => {
|
||||
if (estimateTokens(chunk) <= MAX_TOKENS) {
|
||||
return [
|
||||
{
|
||||
type: 'code' as const,
|
||||
title,
|
||||
content: chunk,
|
||||
language,
|
||||
breadcrumb,
|
||||
tokenCount: estimateTokens(chunk)
|
||||
}
|
||||
];
|
||||
}
|
||||
return slidingWindowChunks(chunk, filePath, language);
|
||||
});
|
||||
}
|
||||
```
|
||||
|
||||
@@ -188,27 +186,23 @@ const MIN_CONTENT_LENGTH = 20; // characters
|
||||
### Sliding Window Chunker
|
||||
|
||||
```typescript
|
||||
function chunkText(
|
||||
text: string,
|
||||
maxTokens: number,
|
||||
overlapTokens: number
|
||||
): string[] {
|
||||
const words = text.split(/\s+/);
|
||||
const wordsPerToken = 0.75; // ~0.75 words per token
|
||||
const maxWords = Math.floor(maxTokens * wordsPerToken);
|
||||
const overlapWords = Math.floor(overlapTokens * wordsPerToken);
|
||||
function chunkText(text: string, maxTokens: number, overlapTokens: number): string[] {
|
||||
const words = text.split(/\s+/);
|
||||
const wordsPerToken = 0.75; // ~0.75 words per token
|
||||
const maxWords = Math.floor(maxTokens * wordsPerToken);
|
||||
const overlapWords = Math.floor(overlapTokens * wordsPerToken);
|
||||
|
||||
const chunks: string[] = [];
|
||||
let start = 0;
|
||||
const chunks: string[] = [];
|
||||
let start = 0;
|
||||
|
||||
while (start < words.length) {
|
||||
const end = Math.min(start + maxWords, words.length);
|
||||
chunks.push(words.slice(start, end).join(' '));
|
||||
if (end === words.length) break;
|
||||
start = end - overlapWords;
|
||||
}
|
||||
while (start < words.length) {
|
||||
const end = Math.min(start + maxWords, words.length);
|
||||
chunks.push(words.slice(start, end).join(' '));
|
||||
if (end === words.length) break;
|
||||
start = end - overlapWords;
|
||||
}
|
||||
|
||||
return chunks;
|
||||
return chunks;
|
||||
}
|
||||
```
|
||||
|
||||
@@ -218,34 +212,42 @@ function chunkText(
|
||||
|
||||
```typescript
|
||||
const LANGUAGE_MAP: Record<string, string> = {
|
||||
'.ts': 'typescript', '.tsx': 'typescript',
|
||||
'.js': 'javascript', '.jsx': 'javascript',
|
||||
'.py': 'python',
|
||||
'.rb': 'ruby',
|
||||
'.go': 'go',
|
||||
'.rs': 'rust',
|
||||
'.java': 'java',
|
||||
'.cs': 'csharp',
|
||||
'.cpp': 'cpp', '.c': 'c', '.h': 'c',
|
||||
'.swift': 'swift',
|
||||
'.kt': 'kotlin',
|
||||
'.php': 'php',
|
||||
'.scala': 'scala',
|
||||
'.sh': 'bash', '.bash': 'bash', '.zsh': 'bash',
|
||||
'.md': 'markdown', '.mdx': 'markdown',
|
||||
'.json': 'json',
|
||||
'.yaml': 'yaml', '.yml': 'yaml',
|
||||
'.toml': 'toml',
|
||||
'.html': 'html',
|
||||
'.css': 'css',
|
||||
'.svelte': 'svelte',
|
||||
'.vue': 'vue',
|
||||
'.sql': 'sql',
|
||||
'.ts': 'typescript',
|
||||
'.tsx': 'typescript',
|
||||
'.js': 'javascript',
|
||||
'.jsx': 'javascript',
|
||||
'.py': 'python',
|
||||
'.rb': 'ruby',
|
||||
'.go': 'go',
|
||||
'.rs': 'rust',
|
||||
'.java': 'java',
|
||||
'.cs': 'csharp',
|
||||
'.cpp': 'cpp',
|
||||
'.c': 'c',
|
||||
'.h': 'c',
|
||||
'.swift': 'swift',
|
||||
'.kt': 'kotlin',
|
||||
'.php': 'php',
|
||||
'.scala': 'scala',
|
||||
'.sh': 'bash',
|
||||
'.bash': 'bash',
|
||||
'.zsh': 'bash',
|
||||
'.md': 'markdown',
|
||||
'.mdx': 'markdown',
|
||||
'.json': 'json',
|
||||
'.yaml': 'yaml',
|
||||
'.yml': 'yaml',
|
||||
'.toml': 'toml',
|
||||
'.html': 'html',
|
||||
'.css': 'css',
|
||||
'.svelte': 'svelte',
|
||||
'.vue': 'vue',
|
||||
'.sql': 'sql'
|
||||
};
|
||||
|
||||
function detectLanguage(filePath: string): string {
|
||||
const ext = path.extname(filePath).toLowerCase();
|
||||
return LANGUAGE_MAP[ext] ?? 'text';
|
||||
const ext = path.extname(filePath).toLowerCase();
|
||||
return LANGUAGE_MAP[ext] ?? 'text';
|
||||
}
|
||||
```
|
||||
|
||||
@@ -255,32 +257,32 @@ function detectLanguage(filePath: string): string {
|
||||
|
||||
```typescript
|
||||
export interface ParseOptions {
|
||||
repositoryId: string;
|
||||
documentId: string;
|
||||
versionId?: string;
|
||||
repositoryId: string;
|
||||
documentId: string;
|
||||
versionId?: string;
|
||||
}
|
||||
|
||||
export function parseFile(
|
||||
file: CrawledFile,
|
||||
options: ParseOptions
|
||||
): NewSnippet[] {
|
||||
const language = detectLanguage(file.path);
|
||||
let rawSnippets: Omit<NewSnippet, 'id' | 'repositoryId' | 'documentId' | 'versionId' | 'createdAt'>[];
|
||||
export function parseFile(file: CrawledFile, options: ParseOptions): NewSnippet[] {
|
||||
const language = detectLanguage(file.path);
|
||||
let rawSnippets: Omit<
|
||||
NewSnippet,
|
||||
'id' | 'repositoryId' | 'documentId' | 'versionId' | 'createdAt'
|
||||
>[];
|
||||
|
||||
if (language === 'markdown') {
|
||||
rawSnippets = parseMarkdown(file.content, file.path);
|
||||
} else {
|
||||
rawSnippets = parseCodeFile(file.content, file.path, language);
|
||||
}
|
||||
if (language === 'markdown') {
|
||||
rawSnippets = parseMarkdown(file.content, file.path);
|
||||
} else {
|
||||
rawSnippets = parseCodeFile(file.content, file.path, language);
|
||||
}
|
||||
|
||||
return rawSnippets.map(s => ({
|
||||
...s,
|
||||
id: crypto.randomUUID(),
|
||||
repositoryId: options.repositoryId,
|
||||
documentId: options.documentId,
|
||||
versionId: options.versionId ?? null,
|
||||
createdAt: new Date(),
|
||||
}));
|
||||
return rawSnippets.map((s) => ({
|
||||
...s,
|
||||
id: crypto.randomUUID(),
|
||||
repositoryId: options.repositoryId,
|
||||
documentId: options.documentId,
|
||||
versionId: options.versionId ?? null,
|
||||
createdAt: new Date()
|
||||
}));
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
Reference in New Issue
Block a user