From 59628dd40844154e9556f352ae697b57539b924f Mon Sep 17 00:00:00 2001 From: Giancarmine Salucci Date: Wed, 25 Mar 2026 15:10:44 +0100 Subject: [PATCH] feat(crawler): ignore .gitignore files and folders, fall back to common ignored deps --- src/lib/server/crawler/file-filter.ts | 191 ++++++++++++--- .../server/crawler/gitignore-parser.test.ts | 219 ++++++++++++++++++ src/lib/server/crawler/gitignore-parser.ts | 151 ++++++++++++ src/lib/server/crawler/local.crawler.test.ts | 145 +++++++++++- src/lib/server/crawler/local.crawler.ts | 88 +++++-- 5 files changed, 746 insertions(+), 48 deletions(-) create mode 100644 src/lib/server/crawler/gitignore-parser.test.ts create mode 100644 src/lib/server/crawler/gitignore-parser.ts diff --git a/src/lib/server/crawler/file-filter.ts b/src/lib/server/crawler/file-filter.ts index dd8e048..eb3f4b9 100644 --- a/src/lib/server/crawler/file-filter.ts +++ b/src/lib/server/crawler/file-filter.ts @@ -1,8 +1,8 @@ /** - * File filtering logic for the GitHub crawler (TRUEREF-0003). + * File filtering logic shared by the GitHub and local crawlers. * - * Determines whether a file in the repository tree should be downloaded - * and indexed based on its extension, size, and the trueref.json config. + * Determines whether a file should be indexed based on its extension, size, + * trueref.json config, and membership in known dependency / artifact paths. */ import { extname, basename } from 'node:path'; @@ -56,26 +56,144 @@ export const INDEXABLE_EXTENSIONS = new Set([ '.vue' ]); -/** Maximum file size we are willing to download (500 KB). */ +/** Maximum file size we are willing to index (500 KB). */ export const MAX_FILE_SIZE_BYTES = 500_000; /** - * Default path prefixes that are always excluded regardless of config. - * These directories contain generated or dependency files that should never - * be indexed. + * Directory names that are always excluded, regardless of depth in the tree. + * + * Used in two ways: + * 1. 
By the local crawler to prune entire directories during the walk + * (via shouldPruneDirectory) when no .gitignore is present. + * 2. By shouldIndexFile to drop files whose path passes through one of + * these directories (covers the GitHub crawler, which receives flat + * path lists from the API). + * + * The list covers dependency stores, build outputs, caches, and generated + * artifacts across all major language ecosystems. */ -const DEFAULT_EXCLUDES: string[] = [ - 'node_modules/', - '.git/', - 'dist/', - 'build/', - 'coverage/', - '.next/', - '__pycache__/', - 'vendor/', - 'target/', - '.cache/' -]; +export const IGNORED_DIR_NAMES = new Set([ + // ── Version control ──────────────────────────────────────────────────── + '.git', '.hg', '.svn', + + // ── JavaScript / TypeScript ───────────────────────────────────────────── + 'node_modules', + '.npm', '.yarn', '.pnpm-store', '.pnp', + // Build outputs and framework caches + 'dist', 'build', 'out', + '.next', '.nuxt', '.svelte-kit', '.vite', + '.turbo', '.parcel-cache', '.webpack', + + // ── Python ────────────────────────────────────────────────────────────── + '__pycache__', + '.venv', 'venv', 'env', + 'site-packages', '.eggs', + '.pytest_cache', '.mypy_cache', '.ruff_cache', + '.tox', '.nox', + 'htmlcov', + + // ── Java / Kotlin / Scala ─────────────────────────────────────────────── + 'target', // Maven + sbt + '.gradle', '.mvn', + + // ── Ruby ──────────────────────────────────────────────────────────────── + '.bundle', + + // ── PHP ───────────────────────────────────────────────────────────────── + // 'vendor' below covers PHP Composer + + // ── .NET ──────────────────────────────────────────────────────────────── + 'bin', 'obj', 'packages', + + // ── Haskell ───────────────────────────────────────────────────────────── + '.stack-work', 'dist-newstyle', + + // ── Dart / Flutter ────────────────────────────────────────────────────── + '.dart_tool', + + // ── Swift / iOS 
───────────────────────────────────────────────────────── + 'Pods', 'DerivedData', + + // ── Elixir / Erlang ───────────────────────────────────────────────────── + '_build', 'deps', + + // ── Clojure ───────────────────────────────────────────────────────────── + '.cpcache', + + // ── Multi-ecosystem dependency directory ──────────────────────────────── + // Go, PHP, Ruby, C++ (conan), Rust (workspace) all use vendor/ + 'vendor', + + // ── Generic caches / temp ─────────────────────────────────────────────── + '.cache', '.tmp', 'tmp', 'temp', '.temp', '.sass-cache', + + // ── Test coverage ─────────────────────────────────────────────────────── + 'coverage', '.nyc_output', + + // ── IDE / editor artefacts ────────────────────────────────────────────── + '.idea', '.vs', + + // ── Generated code ────────────────────────────────────────────────────── + 'generated', '__generated__', '_generated', + + // ── Logs ──────────────────────────────────────────────────────────────── + 'logs' +]); + +/** + * Exact basenames that are never useful to index — primarily lock files. + * These often have indexable extensions (.json, .yaml, .toml) so the + * extension allow-list alone is not sufficient to exclude them. + */ +const IGNORED_FILE_NAMES = new Set([ + 'package-lock.json', + 'yarn.lock', + 'pnpm-lock.yaml', + 'bun.lockb', + 'Gemfile.lock', + 'Cargo.lock', + 'poetry.lock', + 'Pipfile.lock', + 'composer.lock', + 'go.sum', + 'go.work.sum', + 'flake.lock' +]); + +// --------------------------------------------------------------------------- +// Directory pruning helpers (used by the local crawler) +// --------------------------------------------------------------------------- + +/** + * Returns true if the given directory should be pruned during a local walk. + * + * Checks only the final path segment (the directory's own name), because the + * walker calls this function as it descends — parent directories are already + * handled by earlier recursive calls. 
+ * + * Used as the fallback when no .gitignore is present. + */ +export function shouldPruneDirectory(relDirPath: string): boolean { + const name = relDirPath.split('/').at(-1)!; + return IGNORED_DIR_NAMES.has(name); +} + +/** + * Returns true if the file's path passes through at least one ignored directory. + * + * Checks every directory component (all segments except the filename) so that + * nested occurrences like "packages/ui/node_modules/lodash/index.js" are + * correctly excluded. Used by shouldIndexFile for the GitHub crawler, which + * receives flat path lists rather than walking a directory tree. + */ +function isInIgnoredDirectory(relFilePath: string): boolean { + const parts = relFilePath.split('/'); + // All parts except the last (filename) are directory names. + for (let i = 0; i < parts.length - 1; i++) { + if (IGNORED_DIR_NAMES.has(parts[i])) return true; + } + return false; +} // --------------------------------------------------------------------------- // Language detection @@ -134,15 +252,17 @@ export function detectLanguage(filePath: string): string { // --------------------------------------------------------------------------- /** - * Decide whether a file from the repository tree should be downloaded. + * Decide whether a file should be indexed. * * Rules (applied in order): * 1. Must have an indexable extension. * 2. Must not exceed the size limit. - * 3. Must not match config.excludeFiles (exact basename match). - * 4. Must not be under a config.excludeFolders path / regex. - * 5. Must be under a config.folders allowlist path / regex (if specified). - * 6. Must not start with a default-excluded prefix. + * 3. Must not be a known lock file or other always-excluded filename. + * 4. Must not be a minified file (*.min.js, *.min.css, *.bundle.js/css). + * 5. Must not match config.excludeFiles (exact basename match). + * 6. Must not be inside an ignored directory (dependency/build/cache). + * 7. 
Must not be under a config.excludeFolders path / regex. + * 8. Must be under a config.folders allowlist path / regex (if specified). */ export function shouldIndexFile( filePath: string, @@ -150,6 +270,7 @@ export function shouldIndexFile( config?: RepoConfig ): boolean { const ext = extname(filePath).toLowerCase(); + const base = basename(filePath); // 1. Extension allow-list if (!INDEXABLE_EXTENSIONS.has(ext)) return false; @@ -157,10 +278,21 @@ export function shouldIndexFile( // 2. Size limit if (fileSize > MAX_FILE_SIZE_BYTES) return false; - // 3. Config excludeFiles — exact basename match - if (config?.excludeFiles?.includes(basename(filePath))) return false; + // 3. Lock files and other always-excluded basenames + if (IGNORED_FILE_NAMES.has(base)) return false; - // 4. Config excludeFolders — prefix or regex match + // 4. Minified / bundled files + if (base.includes('.min.') || base.endsWith('.bundle.js') || base.endsWith('.bundle.css')) + return false; + + // 5. Config excludeFiles — exact basename match + if (config?.excludeFiles?.includes(base)) return false; + + // 6. Ignored directories (dependency stores, build outputs, caches, etc.) + // Covers nested occurrences at any depth, e.g. packages/ui/node_modules/ + if (isInIgnoredDirectory(filePath)) return false; + + // 7. Config excludeFolders — prefix or regex match if ( config?.excludeFolders?.some( (folder) => filePath.startsWith(folder) || new RegExp(folder).test(filePath) @@ -168,7 +300,7 @@ export function shouldIndexFile( ) return false; - // 5. Config folders allowlist — if provided, the file must match at least one + // 8. Config folders allowlist — if provided, the file must match at least one if (config?.folders?.length) { const inAllowedFolder = config.folders.some( (folder) => filePath.startsWith(folder) || new RegExp(folder).test(filePath) @@ -176,8 +308,5 @@ export function shouldIndexFile( if (!inAllowedFolder) return false; } - // 6. 
Default excludes - if (DEFAULT_EXCLUDES.some((ex) => filePath.startsWith(ex))) return false; - return true; } diff --git a/src/lib/server/crawler/gitignore-parser.test.ts b/src/lib/server/crawler/gitignore-parser.test.ts new file mode 100644 index 0000000..617d5e4 --- /dev/null +++ b/src/lib/server/crawler/gitignore-parser.test.ts @@ -0,0 +1,219 @@ +/** + * Unit tests for the .gitignore parser. + */ + +import { describe, expect, it } from 'vitest'; +import { parseGitignore } from './gitignore-parser.js'; + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +function ignored(pattern: string, path: string, isDir = false): boolean { + return parseGitignore(pattern).isIgnored(path, isDir); +} + +// --------------------------------------------------------------------------- +// Blank lines and comments +// --------------------------------------------------------------------------- + +describe('parseGitignore — blank lines and comments', () => { + it('ignores blank lines', () => { + const f = parseGitignore('\n\n \n'); + expect(f.isIgnored('anything.ts', false)).toBe(false); + }); + + it('ignores comment lines', () => { + const f = parseGitignore('# this is a comment\n # indented comment'); + expect(f.isIgnored('anything.ts', false)).toBe(false); + }); +}); + +// --------------------------------------------------------------------------- +// Simple unanchored patterns (match anywhere in the tree) +// --------------------------------------------------------------------------- + +describe('parseGitignore — unanchored patterns', () => { + it('matches a bare directory name at the root', () => { + expect(ignored('node_modules', 'node_modules', true)).toBe(true); + }); + + it('matches a bare directory name at any depth', () => { + expect(ignored('node_modules', 'packages/ui/node_modules', true)).toBe(true); + }); + + it('does not match a substring of a 
directory name', () => { + expect(ignored('node_modules', 'not_node_modules', true)).toBe(false); + }); + + it('matches files by bare name at any depth', () => { + expect(ignored('secret.env', 'config/secret.env', false)).toBe(true); + }); + + it('matches a wildcard extension pattern at the root', () => { + expect(ignored('*.log', 'error.log', false)).toBe(true); + }); + + it('matches a wildcard extension pattern in a subdirectory', () => { + expect(ignored('*.log', 'logs/app/error.log', false)).toBe(true); + }); + + it('does not match a partial extension', () => { + expect(ignored('*.log', 'changelog.md', false)).toBe(false); + }); +}); + +// --------------------------------------------------------------------------- +// Directory-only patterns (trailing /) +// --------------------------------------------------------------------------- + +describe('parseGitignore — directory-only patterns', () => { + it('matches a directory path', () => { + expect(ignored('dist/', 'dist', true)).toBe(true); + }); + + it('does not match a file with the same name', () => { + expect(ignored('dist/', 'dist', false)).toBe(false); + }); + + it('matches the directory at any depth', () => { + expect(ignored('node_modules/', 'packages/ui/node_modules', true)).toBe(true); + }); +}); + +// --------------------------------------------------------------------------- +// Anchored patterns (pattern contains /) +// --------------------------------------------------------------------------- + +describe('parseGitignore — anchored patterns', () => { + it('matches only at the root when pattern starts with /', () => { + expect(ignored('/dist', 'dist', true)).toBe(true); + expect(ignored('/dist', 'src/dist', true)).toBe(false); + }); + + it('matches only at the root when pattern contains / in the middle', () => { + expect(ignored('src/*.ts', 'src/index.ts', false)).toBe(true); + expect(ignored('src/*.ts', 'lib/src/index.ts', false)).toBe(false); + }); + + it('anchors to root for patterns like 
docs/generated', () => { + expect(ignored('docs/generated', 'docs/generated', true)).toBe(true); + expect(ignored('docs/generated', 'other/docs/generated', true)).toBe(false); + }); +}); + +// --------------------------------------------------------------------------- +// Wildcards: * and ? +// --------------------------------------------------------------------------- + +describe('parseGitignore — * and ? wildcards', () => { + it('* does not cross directory boundaries', () => { + expect(ignored('src/*', 'src/index.ts', false)).toBe(true); + expect(ignored('src/*', 'src/nested/index.ts', false)).toBe(false); + }); + + it('? matches exactly one character', () => { + expect(ignored('file?.ts', 'fileA.ts', false)).toBe(true); + expect(ignored('file?.ts', 'fileAB.ts', false)).toBe(false); + }); +}); + +// --------------------------------------------------------------------------- +// Double-star ** +// --------------------------------------------------------------------------- + +describe('parseGitignore — ** patterns', () => { + it('**/ matches zero segments (file at root)', () => { + expect(ignored('**/debug.log', 'debug.log', false)).toBe(true); + }); + + it('**/ matches one or more path segments', () => { + expect(ignored('**/debug.log', 'logs/debug.log', false)).toBe(true); + expect(ignored('**/debug.log', 'a/b/c/debug.log', false)).toBe(true); + }); + + it('dir/** matches everything inside dir', () => { + expect(ignored('build/**', 'build/index.js', false)).toBe(true); + expect(ignored('build/**', 'build/sub/index.js', false)).toBe(true); + expect(ignored('build/**', 'other/index.js', false)).toBe(false); + }); +}); + +// --------------------------------------------------------------------------- +// Negations +// --------------------------------------------------------------------------- + +describe('parseGitignore — negation patterns', () => { + it('! 
un-ignores a previously ignored path (last rule wins)', () => { + const f = parseGitignore('*.log\n!important.log'); + expect(f.isIgnored('error.log', false)).toBe(true); + expect(f.isIgnored('important.log', false)).toBe(false); + }); + + it('a later positive rule re-ignores after a negation', () => { + const f = parseGitignore('*.log\n!important.log\n*.log'); + expect(f.isIgnored('important.log', false)).toBe(true); + }); +}); + +// --------------------------------------------------------------------------- +// Real-world .gitignore snippet +// --------------------------------------------------------------------------- + +describe('parseGitignore — realistic .gitignore', () => { + const gitignore = ` +# Dependencies +node_modules/ +.pnpm-store/ + +# Build outputs +dist/ +build/ +.next/ + +# Environment +.env +.env.* +!.env.example + +# Logs +*.log +logs/ + +# OS +.DS_Store +Thumbs.db +`; + + const f = parseGitignore(gitignore); + + it('ignores node_modules at root', () => { + expect(f.isIgnored('node_modules', true)).toBe(true); + }); + + it('ignores node_modules at depth', () => { + expect(f.isIgnored('packages/ui/node_modules', true)).toBe(true); + }); + + it('ignores dist directory', () => { + expect(f.isIgnored('dist', true)).toBe(true); + }); + + it('ignores .env file', () => { + expect(f.isIgnored('.env', false)).toBe(true); + }); + + it('does not ignore .env.example (negated)', () => { + expect(f.isIgnored('.env.example', false)).toBe(false); + }); + + it('ignores .log files', () => { + expect(f.isIgnored('server.log', false)).toBe(true); + expect(f.isIgnored('logs/app.log', false)).toBe(true); + }); + + it('does not ignore source files', () => { + expect(f.isIgnored('src/index.ts', false)).toBe(false); + expect(f.isIgnored('README.md', false)).toBe(false); + }); +}); diff --git a/src/lib/server/crawler/gitignore-parser.ts b/src/lib/server/crawler/gitignore-parser.ts new file mode 100644 index 0000000..af28e75 --- /dev/null +++ 
b/src/lib/server/crawler/gitignore-parser.ts @@ -0,0 +1,151 @@ +/** + * Minimal but correct .gitignore parser for the local crawler. + * + * Implements the gitignore specification: + * https://git-scm.com/docs/gitignore + * + * Supported: wildcards (* ?), double-star (**), directory markers (/), + * negations (!), anchored patterns, character classes ([…]). + * + * Not supported: .gitignore files nested inside subdirectories (root only). + * + * No external dependencies — Node.js built-ins only. + */ + +// --------------------------------------------------------------------------- +// Internal types +// --------------------------------------------------------------------------- + +interface CompiledPattern { + regex: RegExp; + negated: boolean; + /** True when the pattern ends with / — only applies to directories. */ + dirOnly: boolean; +} + +// --------------------------------------------------------------------------- +// Pattern compiler +// --------------------------------------------------------------------------- + +/** + * Compile a single .gitignore line into a CompiledPattern. + * Returns null for blank lines and comments. + */ +function compileLine(raw: string): CompiledPattern | null { + let line = raw.trim(); + + // Skip blank lines and comments. + if (!line || line.startsWith('#')) return null; + + const negated = line.startsWith('!'); + if (negated) line = line.slice(1).trim(); + if (!line) return null; + + const dirOnly = line.endsWith('/'); + if (dirOnly) line = line.slice(0, -1); + + // A pattern is anchored to the repository root when it contains a slash + // anywhere (the trailing slash was already stripped above). + const anchored = line.includes('/'); + if (line.startsWith('/')) line = line.slice(1); + + // Convert glob characters to a regex fragment. 
+ let regexBody = ''; + let i = 0; + while (i < line.length) { + const ch = line[i]; + + if (ch === '*' && line[i + 1] === '*') { + if (line[i + 2] === '/') { + // "**/" — match zero or more path segments. + regexBody += '(?:.+/)?'; + i += 3; + } else { + // "**" at the end or alone — match anything. + regexBody += '.*'; + i += 2; + } + } else if (ch === '*') { + regexBody += '[^/]*'; + i++; + } else if (ch === '?') { + regexBody += '[^/]'; + i++; + } else if (ch === '[') { + // Pass character classes through verbatim. + const end = line.indexOf(']', i + 1); + if (end === -1) { + regexBody += '\\['; + i++; + } else { + regexBody += line.slice(i, end + 1); + i = end + 1; + } + } else { + // Escape all other regex metacharacters. + regexBody += ch.replace(/[.+^${}()|\\]/g, '\\$&'); + i++; + } + } + + // Anchored patterns are relative to the repo root. + // Unanchored patterns can match at any path depth. + // No trailing (?:/.*)? — the local crawler prunes directories before + // descending, so we only ever test a path against its own entry, never + // against a child path. Adding the suffix would make `src/*` incorrectly + // match `src/nested/index.ts`. + const fullPattern = anchored ? `^${regexBody}$` : `(?:^|/)${regexBody}$`; + + return { + regex: new RegExp(fullPattern), + negated, + dirOnly + }; +} + +// --------------------------------------------------------------------------- +// Public API +// --------------------------------------------------------------------------- + +/** + * A compiled set of .gitignore rules. + * + * Rules are evaluated in declaration order; the last matching rule wins + * (same semantics as git). + */ +export class GitignoreFilter { + private readonly patterns: readonly CompiledPattern[]; + + constructor(patterns: CompiledPattern[]) { + this.patterns = patterns; + } + + /** + * Returns true if the path should be ignored according to the loaded rules. 
+ * + * @param relPath Forward-slash relative path from the repo root, no leading slash. + * @param isDir Whether the path refers to a directory. + */ + isIgnored(relPath: string, isDir: boolean): boolean { + let ignored = false; + for (const { regex, negated, dirOnly } of this.patterns) { + if (dirOnly && !isDir) continue; + if (regex.test(relPath)) { + ignored = !negated; + } + } + return ignored; + } +} + +/** + * Parse the textual content of a .gitignore file and return a GitignoreFilter. + */ +export function parseGitignore(content: string): GitignoreFilter { + const patterns: CompiledPattern[] = []; + for (const line of content.split('\n')) { + const compiled = compileLine(line); + if (compiled) patterns.push(compiled); + } + return new GitignoreFilter(patterns); +} diff --git a/src/lib/server/crawler/local.crawler.test.ts b/src/lib/server/crawler/local.crawler.test.ts index fbae0d0..e83457f 100644 --- a/src/lib/server/crawler/local.crawler.test.ts +++ b/src/lib/server/crawler/local.crawler.test.ts @@ -173,13 +173,154 @@ describe('LocalCrawler.crawl() — default filtering', () => { it('reports skippedFiles = total enumerated – filtered', async () => { const result = await crawlRoot(); - // dist/, node_modules/, .git/, .png = 4 skipped + // dist/, node_modules/, .git/ are pruned at walk time — never counted. + // Only image.png reaches allRelPaths and is skipped (non-indexable extension). 
// src/index.ts + README.md = 2 kept - expect(result.skippedFiles).toBe(4); + expect(result.skippedFiles).toBe(1); expect(result.totalFiles).toBe(2); }); }); +// --------------------------------------------------------------------------- +// .gitignore support +// --------------------------------------------------------------------------- + +describe('LocalCrawler.crawl() — .gitignore support', () => { + afterEach(async () => { + await cleanupTempRepo(root); + }); + + it('excludes files matching a .gitignore pattern', async () => { + root = await makeTempRepo({ + '.gitignore': '*.log\nsecrets.ts', + 'src/index.ts': 'export {};', + 'debug.log': 'log data', + 'secrets.ts': 'const key = "abc";' + }); + const result = await crawlRoot(); + expect(result.files.some((f) => f.path === 'debug.log')).toBe(false); + expect(result.files.some((f) => f.path === 'secrets.ts')).toBe(false); + expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true); + }); + + it('excludes a directory listed in .gitignore', async () => { + root = await makeTempRepo({ + '.gitignore': 'generated/', + 'src/index.ts': 'export {};', + 'generated/api.ts': 'auto-generated' + }); + const result = await crawlRoot(); + expect(result.files.some((f) => f.path.startsWith('generated/'))).toBe(false); + expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true); + }); + + it('respects negation patterns in .gitignore', async () => { + root = await makeTempRepo({ + '.gitignore': '*.env\n!.env.example', + 'src/index.ts': 'export {};', + '.env': 'SECRET=abc', + '.env.example': 'SECRET=changeme' + }); + const result = await crawlRoot(); + // .env files don't have an indexable extension so this tests the gitignore logic + // doesn't incorrectly block .env.example from passing through + expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true); + }); + + it('falls back to IGNORED_DIR_NAMES when no .gitignore is present', async () => { + root = await makeTempRepo({ + 'src/index.ts': 
'export {};', + 'node_modules/lodash/index.js': 'lodash', + '__pycache__/main.cpython-311.pyc': 'bytecode' + }); + const result = await crawlRoot(); + expect(result.files.every((f) => !f.path.startsWith('node_modules/'))).toBe(true); + expect(result.files.every((f) => !f.path.startsWith('__pycache__/'))).toBe(true); + expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true); + }); + + it('excludes nested node_modules via fallback (no .gitignore)', async () => { + root = await makeTempRepo({ + 'src/index.ts': 'export {};', + 'packages/ui/node_modules/react/index.js': 'react' + }); + const result = await crawlRoot(); + expect(result.files.every((f) => !f.path.includes('node_modules'))).toBe(true); + expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true); + }); + + it('still prunes common dependency directories when .gitignore exists', async () => { + root = await makeTempRepo({ + '.gitignore': 'logs/\n*.log', + 'src/index.ts': 'export {};', + 'node_modules/lodash/index.js': 'lodash', + 'packages/ui/node_modules/react/index.js': 'react', + 'logs/debug.log': 'debug' + }); + const result = await crawlRoot(); + expect(result.files.every((f) => !f.path.includes('node_modules'))).toBe(true); + expect(result.files.every((f) => !f.path.startsWith('logs/'))).toBe(true); + expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true); + }); +}); + +// --------------------------------------------------------------------------- +// Lock file and minified file exclusions +// --------------------------------------------------------------------------- + +describe('LocalCrawler.crawl() — lock file and minified file exclusions', () => { + afterEach(async () => { + await cleanupTempRepo(root); + }); + + it('excludes package-lock.json', async () => { + root = await makeTempRepo({ + 'src/index.ts': 'export {};', + 'package-lock.json': '{"lockfileVersion":3}' + }); + const result = await crawlRoot(); + expect(result.files.some((f) => f.path === 
'package-lock.json')).toBe(false); + expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true); + }); + + it('excludes pnpm-lock.yaml', async () => { + root = await makeTempRepo({ + 'src/index.ts': 'export {};', + 'pnpm-lock.yaml': 'lockfileVersion: 9' + }); + const result = await crawlRoot(); + expect(result.files.some((f) => f.path === 'pnpm-lock.yaml')).toBe(false); + }); + + it('excludes minified .js files', async () => { + root = await makeTempRepo({ + 'src/index.ts': 'export {};', + 'dist/vendor.min.js': '!function(e,t){}()' + }); + const result = await crawlRoot(); + // dist/ is pruned by default — test via shouldIndexFile logic only if .gitignore present + // Use a custom path outside ignored dirs: + await fs.rm(root, { recursive: true, force: true }); + root = await makeTempRepo({ + 'src/index.ts': 'export {};', + 'public/vendor.min.js': '!function(){}' + }); + const r2 = await crawlRoot(); + expect(r2.files.some((f) => f.path === 'public/vendor.min.js')).toBe(false); + expect(r2.files.some((f) => f.path === 'src/index.ts')).toBe(true); + }); + + it('excludes .bundle.js files', async () => { + root = await makeTempRepo({ + 'src/index.ts': 'export {};', + 'public/app.bundle.js': 'bundled code' + }); + const result = await crawlRoot(); + expect(result.files.some((f) => f.path === 'public/app.bundle.js')).toBe(false); + expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true); + }); +}); + // --------------------------------------------------------------------------- // Size limit // --------------------------------------------------------------------------- diff --git a/src/lib/server/crawler/local.crawler.ts b/src/lib/server/crawler/local.crawler.ts index ba8d7e6..09cffda 100644 --- a/src/lib/server/crawler/local.crawler.ts +++ b/src/lib/server/crawler/local.crawler.ts @@ -24,7 +24,8 @@ import { tmpdir } from 'node:os'; import { join } from 'node:path'; import { promisify } from 'node:util'; -import { detectLanguage, 
shouldIndexFile } from './file-filter.js'; +import { detectLanguage, shouldIndexFile, shouldPruneDirectory } from './file-filter.js'; +import { parseGitignore, type GitignoreFilter } from './gitignore-parser.js'; import { InvalidRefError, NotAGitRepositoryError } from './types.js'; import type { CrawledFile, CrawlResult, RepoConfig } from './types.js'; @@ -168,11 +169,18 @@ export class LocalCrawler { onProgress: LocalCrawlOptions['onProgress'], branch: string ): Promise { - // Step 1: Walk the directory tree and collect (relPath, size) pairs. - const statCache = new Map(); - const allRelPaths = await this.walkDirectory(rootPath, '', statCache); + // Step 1: Load .gitignore from the repo root (if present). + // When found, the filter drives file exclusion during the walk. + // Built-in dependency / build-artifact pruning still applies so local + // indexing stays focused on repository source, not vendored code. + const gitignoreFilter = await this.loadGitignore(rootPath); - // Step 2: Detect trueref.json / context7.json at the repo root first. + // Step 2: Walk the directory tree and collect (relPath, size) pairs. + // Directories are pruned early — their contents are never enumerated. + const statCache = new Map(); + const allRelPaths = await this.walkDirectory(rootPath, '', statCache, gitignoreFilter); + + // Step 3: Detect trueref.json / context7.json at the repo root first. // Only root-level config files are honoured (no directory prefix). const configRelPath = allRelPaths.find((p) => CONFIG_FILE_NAMES.has(p)); let config = callerConfig; @@ -180,13 +188,13 @@ export class LocalCrawler { config = await parseConfigFile(join(rootPath, configRelPath)); } - // Step 3: Filter files according to extension, size, and config rules. + // Step 4: Filter files according to extension, size, and config rules. const filteredPaths = allRelPaths.filter((relPath) => { const size = statCache.get(relPath) ?? 
0; return shouldIndexFile(relPath, size, config); }); - // Step 4: Read file contents and build CrawledFile records. + // Step 5: Read file contents and build CrawledFile records. const crawledFiles: CrawledFile[] = []; for (const [i, relPath] of filteredPaths.entries()) { @@ -209,7 +217,7 @@ export class LocalCrawler { onProgress?.(i + 1, filteredPaths.length); } - // Step 5: Build a deterministic repo-level fingerprint from file SHAs. + // Step 6: Build a deterministic repo-level fingerprint from file SHAs. const commitSha = computeSHA256(crawledFiles.map((f) => f.sha).join('')); return { @@ -221,20 +229,56 @@ export class LocalCrawler { }; } + // --------------------------------------------------------------------------- + // Private — .gitignore loading + // --------------------------------------------------------------------------- + + /** + * Attempt to read and parse the root .gitignore file. + * Returns null when the file does not exist or cannot be read. + * + * Only the repository root .gitignore is honoured. Nested .gitignore files + * inside subdirectories are not processed (they are rare and their absence + * only leads to over-indexing, never incorrect indexing). + */ + private async loadGitignore(rootPath: string): Promise { + try { + const content = await fs.readFile(join(rootPath, '.gitignore'), 'utf-8'); + return parseGitignore(content); + } catch { + // File absent or unreadable — fall back to IGNORED_DIR_NAMES. + return null; + } + } + + // --------------------------------------------------------------------------- + // Private — directory walk + // --------------------------------------------------------------------------- + /** * Recursively walk a directory and collect relative paths of all regular files. + * + * Directories are pruned before recursion using the built-in ignored-directory + * list plus any matching root .gitignore rule. This avoids + * enumerating the contents of node_modules, dist, .venv, etc. entirely. 
+ * + * Individual files are also tested against the gitignore filter when present, + * so patterns like *.log or /secrets.json are respected. + * * Symlinks and special files (devices, sockets, FIFOs) are silently skipped. * Populates `statCache` with file sizes so the caller can filter without a * second `stat()` call. * - * @param dir - Absolute path of the directory to read. - * @param rel - Relative path prefix accumulated during recursion. - * @param statCache - Mutable map from relative path → byte size. + * @param dir - Absolute path of the directory to read. + * @param rel - Relative path prefix accumulated during recursion. + * @param statCache - Mutable map from relative path → byte size. + * @param filter - Compiled gitignore filter, or null when absent. */ private async walkDirectory( dir: string, rel: string, - statCache: Map + statCache: Map, + filter: GitignoreFilter | null ): Promise { let entries; try { @@ -255,11 +299,25 @@ export class LocalCrawler { const relPath = rel ? `${rel}/${entry.name}` : entry.name; if (entry.isDirectory()) { - const children = await this.walkDirectory(join(dir, entry.name), relPath, statCache); + // Prune ignored directories before recursing — never enumerate + // their contents. Built-in exclusions always apply, even when a + // repo-level .gitignore exists but does not mention them. + const ignored = shouldPruneDirectory(relPath) || filter?.isIgnored(relPath, true); + if (ignored) continue; + + const children = await this.walkDirectory( + join(dir, entry.name), + relPath, + statCache, + filter + ); files.push(...children); } else { - // Capture file size from stat so shouldIndexFile can enforce the limit - // without reading the file. + // Apply gitignore file-level rules when a filter is loaded. + if (filter?.isIgnored(relPath, false)) continue; + + // Capture file size from stat so shouldIndexFile can enforce + // the size limit without reading the file content. 
try { const stat = await fs.stat(join(dir, entry.name)); statCache.set(relPath, stat.size);