chore(FEEDBACK-0001): linting

This commit is contained in:
Giancarmine Salucci
2026-03-27 02:23:01 +01:00
parent 16436bfab2
commit 5a3c27224d
102 changed files with 5108 additions and 4976 deletions

View File

@@ -30,19 +30,19 @@ Implement the document parsing and chunking pipeline that transforms raw file co
## Supported File Types
| Extension                         | Parser Strategy                                         |
| --------------------------------- | ------------------------------------------------------- |
| `.md`, `.mdx`                     | Heading-based section splitting + code block extraction |
| `.txt`, `.rst`                    | Paragraph-based splitting                               |
| `.ts`, `.tsx`, `.js`, `.jsx`      | AST-free: function/class boundary detection via regex   |
| `.py`                             | `def`/`class` boundary detection                        |
| `.go`                             | `func`/`type` boundary detection                        |
| `.rs`                             | `fn`/`impl`/`struct` boundary detection                 |
| `.java`, `.cs`, `.kt`, `.swift`   | Class/method boundary detection                         |
| `.rb`                             | `def`/`class` boundary detection                        |
| `.json`, `.yaml`, `.yml`, `.toml` | Structural chunking (top-level keys)                    |
| `.html`, `.svelte`, `.vue`        | Text content extraction + script block splitting        |
| Other code                        | Line-count-based sliding window (200 lines per chunk)   |
---
@@ -52,9 +52,9 @@ Use a simple character-based approximation (no tokenizer library needed for v1):
```typescript
/**
 * Approximates the token count of `text` without a tokenizer library.
 *
 * Uses 3.5 characters per token as a midpoint between the empirical
 * averages for prose (~4 chars/token) and code (~3 chars/token).
 *
 * @param text - Raw text to estimate.
 * @returns Estimated token count; 0 for the empty string.
 */
function estimateTokens(text: string): number {
  // Empirically: ~4 chars per token for English prose
  // ~3 chars per token for code (more symbols)
  return Math.ceil(text.length / 3.5);
}
```
@@ -74,49 +74,49 @@ The Markdown parser is the most important parser as most documentation is Markdo
```typescript
/**
 * One heading-delimited slice of a Markdown document, with its prose
 * text and fenced code blocks separated.
 */
interface MarkdownSection {
  headings: string[]; // heading stack at this point
  content: string; // text content (sans code blocks)
  codeBlocks: { language: string; code: string }[];
}
/**
 * Splits a Markdown file into `Snippet`s: one `info` snippet per chunk of
 * section prose, and one `code` snippet per fenced code block.
 *
 * Sections whose trimmed content or code is shorter than 20 characters
 * are skipped (matches MIN_CONTENT_LENGTH elsewhere in this spec).
 *
 * @param content - Raw Markdown source.
 * @param filePath - Path of the file; its basename is the fallback title
 *   when a section has no heading.
 * @returns Snippets in document order.
 */
function parseMarkdown(content: string, filePath: string): Snippet[] {
  const sections = splitIntoSections(content);
  const snippets: Snippet[] = [];

  for (const section of sections) {
    const breadcrumb = section.headings.join(' > ');
    const title = section.headings.at(-1) ?? path.basename(filePath);

    // Emit info snippet for text content
    if (section.content.trim().length >= 20) {
      const chunks = chunkText(section.content, MAX_TOKENS, OVERLAP_TOKENS);
      for (const chunk of chunks) {
        snippets.push({
          type: 'info',
          title,
          content: chunk,
          breadcrumb,
          tokenCount: estimateTokens(chunk)
        });
      }
    }

    // Emit code snippets for each code block
    for (const block of section.codeBlocks) {
      if (block.code.trim().length >= 20) {
        snippets.push({
          type: 'code',
          title,
          content: block.code,
          // NOTE(review): when the fence has no language tag, the fallback is
          // detectLanguage('.' + '') which resolves to 'text' — confirm that
          // is the intended default rather than detecting from filePath.
          language: block.language || detectLanguage('.' + block.language),
          breadcrumb,
          tokenCount: estimateTokens(block.code)
        });
      }
    }
  }

  return snippets;
}
```
@@ -135,43 +135,41 @@ For non-Markdown code files, use regex-based function/class boundary detection.
```typescript
/**
 * Per-language regexes that match the START of a top-level definition
 * (function/class/etc.). Multiline flag so `^` anchors at each line.
 * Languages absent from this map fall back to sliding-window chunking.
 */
const BOUNDARY_PATTERNS: Record<string, RegExp> = {
  typescript: /^(export\s+)?(async\s+)?(function|class|interface|type|const|let|var)\s+\w+/m,
  python: /^(async\s+)?(def|class)\s+\w+/m,
  go: /^(func|type|var|const)\s+\w+/m,
  rust: /^(pub\s+)?(fn|impl|struct|enum|trait)\s+\w+/m,
  java: /^(public|private|protected|static).*?(class|interface|enum|void|\w+)\s+\w+\s*[({]/m
};
function parseCodeFile(
content: string,
filePath: string,
language: string
): Snippet[] {
const pattern = BOUNDARY_PATTERNS[language];
const breadcrumb = filePath;
const title = path.basename(filePath);
function parseCodeFile(content: string, filePath: string, language: string): Snippet[] {
const pattern = BOUNDARY_PATTERNS[language];
const breadcrumb = filePath;
const title = path.basename(filePath);
if (!pattern) {
// Fallback: sliding window
return slidingWindowChunks(content, filePath, language);
}
if (!pattern) {
// Fallback: sliding window
return slidingWindowChunks(content, filePath, language);
}
const chunks = splitAtBoundaries(content, pattern);
return chunks
.filter(chunk => chunk.trim().length >= 20)
.flatMap(chunk => {
if (estimateTokens(chunk) <= MAX_TOKENS) {
return [{
type: 'code' as const,
title,
content: chunk,
language,
breadcrumb,
tokenCount: estimateTokens(chunk),
}];
}
return slidingWindowChunks(chunk, filePath, language);
});
const chunks = splitAtBoundaries(content, pattern);
return chunks
.filter((chunk) => chunk.trim().length >= 20)
.flatMap((chunk) => {
if (estimateTokens(chunk) <= MAX_TOKENS) {
return [
{
type: 'code' as const,
title,
content: chunk,
language,
breadcrumb,
tokenCount: estimateTokens(chunk)
}
];
}
return slidingWindowChunks(chunk, filePath, language);
});
}
```
@@ -188,27 +186,23 @@ const MIN_CONTENT_LENGTH = 20; // characters
### Sliding Window Chunker
```typescript
function chunkText(
text: string,
maxTokens: number,
overlapTokens: number
): string[] {
const words = text.split(/\s+/);
const wordsPerToken = 0.75; // ~0.75 words per token
const maxWords = Math.floor(maxTokens * wordsPerToken);
const overlapWords = Math.floor(overlapTokens * wordsPerToken);
function chunkText(text: string, maxTokens: number, overlapTokens: number): string[] {
const words = text.split(/\s+/);
const wordsPerToken = 0.75; // ~0.75 words per token
const maxWords = Math.floor(maxTokens * wordsPerToken);
const overlapWords = Math.floor(overlapTokens * wordsPerToken);
const chunks: string[] = [];
let start = 0;
const chunks: string[] = [];
let start = 0;
while (start < words.length) {
const end = Math.min(start + maxWords, words.length);
chunks.push(words.slice(start, end).join(' '));
if (end === words.length) break;
start = end - overlapWords;
}
while (start < words.length) {
const end = Math.min(start + maxWords, words.length);
chunks.push(words.slice(start, end).join(' '));
if (end === words.length) break;
start = end - overlapWords;
}
return chunks;
return chunks;
}
```
@@ -218,34 +212,42 @@ function chunkText(
```typescript
/**
 * Lowercased file extension (with dot) -> canonical language key used by
 * the parsers and BOUNDARY_PATTERNS. Unlisted extensions map to 'text'.
 */
const LANGUAGE_MAP: Record<string, string> = {
  '.ts': 'typescript',
  '.tsx': 'typescript',
  '.js': 'javascript',
  '.jsx': 'javascript',
  '.py': 'python',
  '.rb': 'ruby',
  '.go': 'go',
  '.rs': 'rust',
  '.java': 'java',
  '.cs': 'csharp',
  '.cpp': 'cpp',
  '.c': 'c',
  '.h': 'c',
  '.swift': 'swift',
  '.kt': 'kotlin',
  '.php': 'php',
  '.scala': 'scala',
  '.sh': 'bash',
  '.bash': 'bash',
  '.zsh': 'bash',
  '.md': 'markdown',
  '.mdx': 'markdown',
  '.json': 'json',
  '.yaml': 'yaml',
  '.yml': 'yaml',
  '.toml': 'toml',
  '.html': 'html',
  '.css': 'css',
  '.svelte': 'svelte',
  '.vue': 'vue',
  '.sql': 'sql'
};

/**
 * Maps a file path to its language key via the extension (case-insensitive).
 *
 * @param filePath - Any path; only the extension is inspected.
 * @returns A LANGUAGE_MAP value, or 'text' for unknown/missing extensions.
 */
function detectLanguage(filePath: string): string {
  const ext = path.extname(filePath).toLowerCase();
  return LANGUAGE_MAP[ext] ?? 'text';
}
```
@@ -255,32 +257,32 @@ function detectLanguage(filePath: string): string {
```typescript
/**
 * Identifiers stamped onto every snippet produced by parseFile.
 * `versionId` is optional; absent means the unversioned/default docs.
 */
export interface ParseOptions {
  repositoryId: string;
  documentId: string;
  versionId?: string;
}
export function parseFile(
file: CrawledFile,
options: ParseOptions
): NewSnippet[] {
const language = detectLanguage(file.path);
let rawSnippets: Omit<NewSnippet, 'id' | 'repositoryId' | 'documentId' | 'versionId' | 'createdAt'>[];
export function parseFile(file: CrawledFile, options: ParseOptions): NewSnippet[] {
const language = detectLanguage(file.path);
let rawSnippets: Omit<
NewSnippet,
'id' | 'repositoryId' | 'documentId' | 'versionId' | 'createdAt'
>[];
if (language === 'markdown') {
rawSnippets = parseMarkdown(file.content, file.path);
} else {
rawSnippets = parseCodeFile(file.content, file.path, language);
}
if (language === 'markdown') {
rawSnippets = parseMarkdown(file.content, file.path);
} else {
rawSnippets = parseCodeFile(file.content, file.path, language);
}
return rawSnippets.map(s => ({
...s,
id: crypto.randomUUID(),
repositoryId: options.repositoryId,
documentId: options.documentId,
versionId: options.versionId ?? null,
createdAt: new Date(),
}));
return rawSnippets.map((s) => ({
...s,
id: crypto.randomUUID(),
repositoryId: options.repositoryId,
documentId: options.documentId,
versionId: options.versionId ?? null,
createdAt: new Date()
}));
}
```