feat(crawler): ignore .gitignore files and folders, fall back to common ignored deps

This commit is contained in:
Giancarmine Salucci
2026-03-25 15:10:44 +01:00
parent 53b3d36ca3
commit 59628dd408
5 changed files with 746 additions and 48 deletions

View File

@@ -1,8 +1,8 @@
/**
* File filtering logic for the GitHub crawler (TRUEREF-0003).
* File filtering logic shared by the GitHub and local crawlers.
*
* Determines whether a file in the repository tree should be downloaded
* and indexed based on its extension, size, and the trueref.json config.
* Determines whether a file should be indexed based on its extension, size,
* trueref.json config, and membership in known dependency / artifact paths.
*/
import { extname, basename } from 'node:path';
@@ -56,26 +56,144 @@ export const INDEXABLE_EXTENSIONS = new Set([
'.vue'
]);
/** Maximum file size we are willing to download (500 KB). */
/** Maximum file size we are willing to index (500 KB). */
export const MAX_FILE_SIZE_BYTES = 500_000;
/**
* Default path prefixes that are always excluded regardless of config.
* These directories contain generated or dependency files that should never
* be indexed.
* Directory names that are always excluded, regardless of depth in the tree.
*
* Used in two ways:
* 1. By the local crawler to prune entire directories during the walk
* (via shouldPruneDirectory) when no .gitignore is present.
* 2. By shouldIndexFile to drop files whose path passes through one of
* these directories (covers the GitHub crawler, which receives flat
* path lists from the API).
*
* The list covers dependency stores, build outputs, caches, and generated
* artifacts across all major language ecosystems.
*/
// Root-relative directory prefixes compared against file paths with
// startsWith(). Each entry carries a trailing "/" so that only a complete
// directory name can match.
const DEFAULT_EXCLUDES: string[] = [
  'node_modules',
  '.git',
  'dist',
  'build',
  'coverage',
  '.next',
  '__pycache__',
  'vendor',
  'target',
  '.cache'
].map((dirName) => `${dirName}/`);
// Membership is tested against a single path segment (a directory's own
// name), so an entry here excludes that directory at ANY depth in the tree.
// Only add names that are practically never used for hand-written source.
export const IGNORED_DIR_NAMES = new Set([
  // ── Version control ────────────────────────────────────────────────────
  '.git', '.hg', '.svn',
  // ── JavaScript / TypeScript ─────────────────────────────────────────────
  'node_modules',
  '.npm', '.yarn', '.pnpm-store', '.pnp',
  // Build outputs and framework caches
  'dist', 'build', 'out',
  '.next', '.nuxt', '.svelte-kit', '.vite',
  '.turbo', '.parcel-cache', '.webpack',
  // ── Python ──────────────────────────────────────────────────────────────
  '__pycache__',
  '.venv', 'venv', 'env',
  'site-packages', '.eggs',
  '.pytest_cache', '.mypy_cache', '.ruff_cache',
  '.tox', '.nox',
  'htmlcov',
  // ── Java / Kotlin / Scala ───────────────────────────────────────────────
  'target', // Maven + sbt
  '.gradle', '.mvn',
  // ── Ruby ────────────────────────────────────────────────────────────────
  '.bundle',
  // ── PHP ─────────────────────────────────────────────────────────────────
  // 'vendor' below covers PHP Composer
  // ── .NET ────────────────────────────────────────────────────────────────
  // NOTE: 'packages' (legacy NuGet restore folder) is deliberately NOT listed.
  // It is also the conventional workspace directory of JS/TS monorepos
  // (e.g. packages/ui/src/...), and listing it would drop all of that
  // hand-written source from the index.
  'bin', 'obj',
  // ── Haskell ─────────────────────────────────────────────────────────────
  '.stack-work', 'dist-newstyle',
  // ── Dart / Flutter ──────────────────────────────────────────────────────
  '.dart_tool',
  // ── Swift / iOS ─────────────────────────────────────────────────────────
  'Pods', 'DerivedData',
  // ── Elixir / Erlang ─────────────────────────────────────────────────────
  '_build', 'deps',
  // ── Clojure ─────────────────────────────────────────────────────────────
  '.cpcache',
  // ── Multi-ecosystem dependency directory ────────────────────────────────
  // Go, PHP, Ruby, C++ (conan), Rust (workspace) all use vendor/
  'vendor',
  // ── Generic caches / temp ───────────────────────────────────────────────
  '.cache', '.tmp', 'tmp', 'temp', '.temp', '.sass-cache',
  // ── Test coverage ───────────────────────────────────────────────────────
  'coverage', '.nyc_output',
  // ── IDE / editor artefacts ──────────────────────────────────────────────
  '.idea', '.vs',
  // ── Generated code ──────────────────────────────────────────────────────
  'generated', '__generated__', '_generated',
  // ── Logs ────────────────────────────────────────────────────────────────
  'logs'
]);
/**
 * Exact basenames that are never useful to index — primarily lock files.
 *
 * Lock files are machine-generated dependency snapshots. Several of them use
 * extensions that are otherwise indexable (.json, .yaml, .toml), so the
 * extension allow-list alone is not sufficient to exclude them. Matching is
 * case-sensitive and compares the whole basename.
 */
const IGNORED_FILE_NAMES = new Set([
  // ── JavaScript / TypeScript ─────────────────────────────────────────────
  'package-lock.json',
  'npm-shrinkwrap.json', // publishable variant of package-lock.json
  'yarn.lock',
  'pnpm-lock.yaml',
  'bun.lockb',
  'bun.lock',
  'deno.lock',
  // ── Ruby ────────────────────────────────────────────────────────────────
  'Gemfile.lock',
  // ── Rust ────────────────────────────────────────────────────────────────
  'Cargo.lock',
  // ── Python ──────────────────────────────────────────────────────────────
  'poetry.lock',
  'Pipfile.lock',
  'uv.lock',
  'pdm.lock',
  // ── PHP ─────────────────────────────────────────────────────────────────
  'composer.lock',
  // ── Go ──────────────────────────────────────────────────────────────────
  'go.sum',
  'go.work.sum',
  // ── .NET (NuGet) ────────────────────────────────────────────────────────
  'packages.lock.json',
  // ── Swift / iOS, Dart, Elixir ───────────────────────────────────────────
  'Podfile.lock',
  'pubspec.lock',
  'mix.lock',
  // ── Nix ─────────────────────────────────────────────────────────────────
  'flake.lock'
]);
// ---------------------------------------------------------------------------
// Directory pruning helpers (used by the local crawler)
// ---------------------------------------------------------------------------
/**
 * Returns true if the given directory should be pruned during a local walk.
 *
 * Only the final path segment (the directory's own name) is looked up in
 * IGNORED_DIR_NAMES; ancestor segments are not inspected, because the walker
 * calls this function for each directory as it descends.
 *
 * Trailing slashes are tolerated, so "node_modules/" and "node_modules" give
 * the same answer. An empty path yields an empty name and is never pruned.
 *
 * @param relDirPath Forward-slash separated directory path.
 * @returns Whether the directory's own name is in the built-in ignore set.
 */
export function shouldPruneDirectory(relDirPath: string): boolean {
  // Without this, "a/node_modules/" would split into a trailing "" segment
  // and the directory would silently escape pruning.
  const trimmed = relDirPath.replace(/\/+$/, '');
  // lastIndexOf returns -1 when there is no slash, so slice(0) keeps the
  // whole string for single-segment paths.
  const name = trimmed.slice(trimmed.lastIndexOf('/') + 1);
  return IGNORED_DIR_NAMES.has(name);
}
/**
 * Tells whether any directory component of a file path is in
 * IGNORED_DIR_NAMES.
 *
 * The last segment is treated as the file name and is not checked; every
 * segment before it is. This makes nested cases such as
 * "packages/ui/node_modules/lodash/index.js" match on "node_modules" even
 * though the ignored directory is not at the root.
 *
 * @param relFilePath Forward-slash separated file path.
 */
function isInIgnoredDirectory(relFilePath: string): boolean {
  // Drop the trailing file name; what remains are the enclosing directories.
  const directorySegments = relFilePath.split('/').slice(0, -1);
  return directorySegments.some((segment) => IGNORED_DIR_NAMES.has(segment));
}
// ---------------------------------------------------------------------------
// Language detection
@@ -134,15 +252,17 @@ export function detectLanguage(filePath: string): string {
// ---------------------------------------------------------------------------
/**
* Decide whether a file from the repository tree should be downloaded.
* Decide whether a file should be indexed.
*
* Rules (applied in order):
* 1. Must have an indexable extension.
* 2. Must not exceed the size limit.
* 3. Must not match config.excludeFiles (exact basename match).
* 4. Must not be under a config.excludeFolders path / regex.
* 5. Must be under a config.folders allowlist path / regex (if specified).
* 6. Must not start with a default-excluded prefix.
* 3. Must not be a known lock file or other always-excluded filename.
* 4. Must not be a minified file (*.min.js, *.min.css, *.bundle.js/css).
* 5. Must not match config.excludeFiles (exact basename match).
* 6. Must not be inside an ignored directory (dependency/build/cache).
* 7. Must not be under a config.excludeFolders path / regex.
* 8. Must be under a config.folders allowlist path / regex (if specified).
*/
export function shouldIndexFile(
filePath: string,
@@ -150,6 +270,7 @@ export function shouldIndexFile(
config?: RepoConfig
): boolean {
const ext = extname(filePath).toLowerCase();
const base = basename(filePath);
// 1. Extension allow-list
if (!INDEXABLE_EXTENSIONS.has(ext)) return false;
@@ -157,10 +278,21 @@ export function shouldIndexFile(
// 2. Size limit
if (fileSize > MAX_FILE_SIZE_BYTES) return false;
// 3. Config excludeFiles — exact basename match
if (config?.excludeFiles?.includes(basename(filePath))) return false;
// 3. Lock files and other always-excluded basenames
if (IGNORED_FILE_NAMES.has(base)) return false;
// 4. Config excludeFolders — prefix or regex match
// 4. Minified / bundled files
if (base.includes('.min.') || base.endsWith('.bundle.js') || base.endsWith('.bundle.css'))
return false;
// 5. Config excludeFiles — exact basename match
if (config?.excludeFiles?.includes(base)) return false;
// 6. Ignored directories (dependency stores, build outputs, caches, etc.)
// Covers nested occurrences at any depth, e.g. packages/ui/node_modules/
if (isInIgnoredDirectory(filePath)) return false;
// 7. Config excludeFolders — prefix or regex match
if (
config?.excludeFolders?.some(
(folder) => filePath.startsWith(folder) || new RegExp(folder).test(filePath)
@@ -168,7 +300,7 @@ export function shouldIndexFile(
)
return false;
// 5. Config folders allowlist — if provided, the file must match at least one
// 8. Config folders allowlist — if provided, the file must match at least one
if (config?.folders?.length) {
const inAllowedFolder = config.folders.some(
(folder) => filePath.startsWith(folder) || new RegExp(folder).test(filePath)
@@ -176,8 +308,5 @@ export function shouldIndexFile(
if (!inAllowedFolder) return false;
}
// 6. Default excludes
if (DEFAULT_EXCLUDES.some((ex) => filePath.startsWith(ex))) return false;
return true;
}

View File

@@ -0,0 +1,219 @@
/**
* Unit tests for the .gitignore parser.
*/
import { describe, expect, it } from 'vitest';
import { parseGitignore } from './gitignore-parser.js';
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
// Compiles `pattern` as a complete .gitignore body and evaluates one path against it.
function ignored(pattern: string, path: string, isDir = false): boolean {
  const filter = parseGitignore(pattern);
  return filter.isIgnored(path, isDir);
}
// ---------------------------------------------------------------------------
// Blank lines and comments
// ---------------------------------------------------------------------------
// Lines that carry no pattern must never cause a path to be reported as ignored.
describe('parseGitignore — blank lines and comments', () => {
it('ignores blank lines', () => {
// Input consists only of newlines and whitespace.
const f = parseGitignore('\n\n \n');
expect(f.isIgnored('anything.ts', false)).toBe(false);
});
it('ignores comment lines', () => {
// The second line has whitespace before '#'; it is expected to be skipped as well.
const f = parseGitignore('# this is a comment\n # indented comment');
expect(f.isIgnored('anything.ts', false)).toBe(false);
});
});
// ---------------------------------------------------------------------------
// Simple unanchored patterns (match anywhere in the tree)
// ---------------------------------------------------------------------------
// Patterns without any '/' are expected to match a whole path segment at any depth.
describe('parseGitignore — unanchored patterns', () => {
it('matches a bare directory name at the root', () => {
expect(ignored('node_modules', 'node_modules', true)).toBe(true);
});
it('matches a bare directory name at any depth', () => {
expect(ignored('node_modules', 'packages/ui/node_modules', true)).toBe(true);
});
it('does not match a substring of a directory name', () => {
// The match must start at a segment boundary, not in the middle of a name.
expect(ignored('node_modules', 'not_node_modules', true)).toBe(false);
});
it('matches files by bare name at any depth', () => {
expect(ignored('secret.env', 'config/secret.env', false)).toBe(true);
});
it('matches a wildcard extension pattern at the root', () => {
expect(ignored('*.log', 'error.log', false)).toBe(true);
});
it('matches a wildcard extension pattern in a subdirectory', () => {
expect(ignored('*.log', 'logs/app/error.log', false)).toBe(true);
});
it('does not match a partial extension', () => {
// "changelog.md" contains "log" but does not end in ".log".
expect(ignored('*.log', 'changelog.md', false)).toBe(false);
});
});
// ---------------------------------------------------------------------------
// Directory-only patterns (trailing /)
// ---------------------------------------------------------------------------
// A trailing '/' restricts a pattern to directories; the isDir argument decides the outcome.
describe('parseGitignore — directory-only patterns', () => {
it('matches a directory path', () => {
expect(ignored('dist/', 'dist', true)).toBe(true);
});
it('does not match a file with the same name', () => {
// Same path string as above, but evaluated as a file (isDir = false).
expect(ignored('dist/', 'dist', false)).toBe(false);
});
it('matches the directory at any depth', () => {
expect(ignored('node_modules/', 'packages/ui/node_modules', true)).toBe(true);
});
});
// ---------------------------------------------------------------------------
// Anchored patterns (pattern contains /)
// ---------------------------------------------------------------------------
// A '/' at the start or in the middle of a pattern ties it to the repository root.
describe('parseGitignore — anchored patterns', () => {
it('matches only at the root when pattern starts with /', () => {
expect(ignored('/dist', 'dist', true)).toBe(true);
expect(ignored('/dist', 'src/dist', true)).toBe(false);
});
it('matches only at the root when pattern contains / in the middle', () => {
expect(ignored('src/*.ts', 'src/index.ts', false)).toBe(true);
// Same tail, but nested one level deeper — must not match.
expect(ignored('src/*.ts', 'lib/src/index.ts', false)).toBe(false);
});
it('anchors to root for patterns like docs/generated', () => {
expect(ignored('docs/generated', 'docs/generated', true)).toBe(true);
expect(ignored('docs/generated', 'other/docs/generated', true)).toBe(false);
});
});
// ---------------------------------------------------------------------------
// Wildcards: * and ?
// ---------------------------------------------------------------------------
// Single-segment wildcards: neither '*' nor '?' may consume a '/'.
describe('parseGitignore — * and ? wildcards', () => {
it('* does not cross directory boundaries', () => {
expect(ignored('src/*', 'src/index.ts', false)).toBe(true);
// The path is tested as-is; a file nested under src/nested is not matched by src/*.
expect(ignored('src/*', 'src/nested/index.ts', false)).toBe(false);
});
it('? matches exactly one character', () => {
expect(ignored('file?.ts', 'fileA.ts', false)).toBe(true);
expect(ignored('file?.ts', 'fileAB.ts', false)).toBe(false);
});
});
// ---------------------------------------------------------------------------
// Double-star **
// ---------------------------------------------------------------------------
// Double-star handling: a leading "**/" spans zero or more directories, a trailing "/**" spans everything below.
describe('parseGitignore — ** patterns', () => {
it('**/ matches zero segments (file at root)', () => {
expect(ignored('**/debug.log', 'debug.log', false)).toBe(true);
});
it('**/ matches one or more path segments', () => {
expect(ignored('**/debug.log', 'logs/debug.log', false)).toBe(true);
expect(ignored('**/debug.log', 'a/b/c/debug.log', false)).toBe(true);
});
it('dir/** matches everything inside dir', () => {
expect(ignored('build/**', 'build/index.js', false)).toBe(true);
expect(ignored('build/**', 'build/sub/index.js', false)).toBe(true);
// A sibling directory is unaffected.
expect(ignored('build/**', 'other/index.js', false)).toBe(false);
});
});
// ---------------------------------------------------------------------------
// Negations
// ---------------------------------------------------------------------------
// Rule ordering: the verdict of the last matching line is the one that counts.
describe('parseGitignore — negation patterns', () => {
it('! un-ignores a previously ignored path (last rule wins)', () => {
const f = parseGitignore('*.log\n!important.log');
expect(f.isIgnored('error.log', false)).toBe(true);
expect(f.isIgnored('important.log', false)).toBe(false);
});
it('a later positive rule re-ignores after a negation', () => {
// Third line repeats "*.log" after the negation, so it overrides it again.
const f = parseGitignore('*.log\n!important.log\n*.log');
expect(f.isIgnored('important.log', false)).toBe(true);
});
});
// ---------------------------------------------------------------------------
// Real-world .gitignore snippet
// ---------------------------------------------------------------------------
// End-to-end check against a multi-section file mixing comments, directory
// rules, wildcards and one negation. The filter is compiled once and shared
// by every test in this suite.
describe('parseGitignore — realistic .gitignore', () => {
const gitignore = `
# Dependencies
node_modules/
.pnpm-store/
# Build outputs
dist/
build/
.next/
# Environment
.env
.env.*
!.env.example
# Logs
*.log
logs/
# OS
.DS_Store
Thumbs.db
`;
const f = parseGitignore(gitignore);
it('ignores node_modules at root', () => {
expect(f.isIgnored('node_modules', true)).toBe(true);
});
it('ignores node_modules at depth', () => {
expect(f.isIgnored('packages/ui/node_modules', true)).toBe(true);
});
it('ignores dist directory', () => {
expect(f.isIgnored('dist', true)).toBe(true);
});
it('ignores .env file', () => {
expect(f.isIgnored('.env', false)).toBe(true);
});
it('does not ignore .env.example (negated)', () => {
// ".env.*" matches first, then "!.env.example" reverses the verdict.
expect(f.isIgnored('.env.example', false)).toBe(false);
});
it('ignores .log files', () => {
expect(f.isIgnored('server.log', false)).toBe(true);
expect(f.isIgnored('logs/app.log', false)).toBe(true);
});
it('does not ignore source files', () => {
expect(f.isIgnored('src/index.ts', false)).toBe(false);
expect(f.isIgnored('README.md', false)).toBe(false);
});
});

View File

@@ -0,0 +1,151 @@
/**
* Minimal but correct .gitignore parser for the local crawler.
*
* Implements the gitignore specification:
* https://git-scm.com/docs/gitignore
*
* Supported: wildcards (* ?), double-star (**), directory markers (/),
* negations (!), anchored patterns, character classes ([…]).
*
* Not supported: .gitignore files nested inside subdirectories (root only).
*
* No external dependencies — Node.js built-ins only.
*/
// ---------------------------------------------------------------------------
// Internal types
// ---------------------------------------------------------------------------
interface CompiledPattern {
  /** Regex tested against a forward-slash path relative to the repo root. */
  regex: RegExp;
  /** True when the line began with "!" — a match un-ignores the path. */
  negated: boolean;
  /** True when the pattern ends with / — only applies to directories. */
  dirOnly: boolean;
}

// ---------------------------------------------------------------------------
// Pattern compiler
// ---------------------------------------------------------------------------

/** Escape `text` so that it matches itself literally inside a RegExp source. */
function escapeRegexLiteral(text: string): string {
  return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/**
 * Compile a single .gitignore line into a CompiledPattern.
 *
 * Translation rules:
 *  - surrounding whitespace is trimmed; blank lines and lines starting with
 *    "#" produce no pattern;
 *  - a leading "!" marks the pattern as a negation;
 *  - a trailing "/" marks the pattern as directory-only;
 *  - a "/" anywhere else anchors the pattern to the repository root,
 *    otherwise it may match at any depth;
 *  - "*" and "?" never match "/"; a double star followed by "/" matches zero
 *    or more leading directories; any other double star matches anything;
 *  - "[...]" is a character class, and "[!...]" is its negated form;
 *  - a backslash makes the following character literal (e.g. "\#", "\!", "\*").
 *
 * @param raw One line of a .gitignore file (line terminator optional).
 * @returns The compiled pattern, or null for blank lines, comments, and
 *          patterns that cannot be turned into a valid regular expression.
 */
function compileLine(raw: string): CompiledPattern | null {
  let line = raw.trim();

  // Skip blank lines and comments.
  if (!line || line.startsWith('#')) return null;

  const negated = line.startsWith('!');
  if (negated) line = line.slice(1).trim();
  if (!line) return null;

  const dirOnly = line.endsWith('/');
  if (dirOnly) line = line.slice(0, -1);

  // A pattern is anchored to the repository root when it contains a slash
  // anywhere (the trailing slash was already stripped above).
  const anchored = line.includes('/');
  if (line.startsWith('/')) line = line.slice(1);

  // Convert glob characters to a regex fragment.
  let regexBody = '';
  let i = 0;
  while (i < line.length) {
    const ch = line.charAt(i);
    const next = line.charAt(i + 1);

    if (ch === '\\' && next !== '') {
      // Backslash escape: the next character loses any glob meaning.
      regexBody += escapeRegexLiteral(next);
      i += 2;
    } else if (ch === '*' && next === '*') {
      if (line.charAt(i + 2) === '/') {
        // Double star + slash — match zero or more path segments.
        regexBody += '(?:.+/)?';
        i += 3;
      } else {
        // Double star at the end or alone — match anything.
        regexBody += '.*';
        i += 2;
      }
    } else if (ch === '*') {
      regexBody += '[^/]*';
      i++;
    } else if (ch === '?') {
      regexBody += '[^/]';
      i++;
    } else if (ch === '[') {
      const end = line.indexOf(']', i + 1);
      if (end === -1) {
        // Unterminated class: treat the bracket as a literal character.
        regexBody += '\\[';
        i++;
      } else {
        // Glob negates a class with "!", regex with "^". Passing "[!a]"
        // through verbatim would match "!" or "a" — the opposite of what
        // the pattern means.
        const members = line.slice(i + 1, end);
        regexBody += members.startsWith('!') ? `[^${members.slice(1)}]` : `[${members}]`;
        i = end + 1;
      }
    } else {
      // Escape all other regex metacharacters.
      regexBody += escapeRegexLiteral(ch);
      i++;
    }
  }

  // Anchored patterns are relative to the repo root.
  // Unanchored patterns can match at any path depth.
  // No trailing (?:/.*)? — the local crawler prunes directories before
  // descending, so we only ever test a path against its own entry, never
  // against a child path. Adding the suffix would make `src/*` incorrectly
  // match `src/nested/index.ts`.
  const fullPattern = anchored ? `^${regexBody}$` : `(?:^|/)${regexBody}$`;

  let regex: RegExp;
  try {
    regex = new RegExp(fullPattern);
  } catch {
    // A malformed character class (e.g. "[z-a]") is not a valid regex.
    // Drop just this line instead of throwing, so one bad pattern cannot
    // invalidate every other rule in the file.
    return null;
  }

  return { regex, negated, dirOnly };
}
// ---------------------------------------------------------------------------
// Public API
// ---------------------------------------------------------------------------
/**
 * An ordered collection of compiled .gitignore rules.
 *
 * Every rule is consulted in the order it was declared, and the verdict of
 * the last rule that matches is the one returned — so a later line can
 * override an earlier one in either direction.
 */
export class GitignoreFilter {
  private readonly patterns: readonly CompiledPattern[];

  constructor(patterns: CompiledPattern[]) {
    this.patterns = patterns;
  }

  /**
   * Decide whether a path is ignored by the loaded rules.
   *
   * @param relPath Forward-slash relative path from the repo root, no leading slash.
   * @param isDir   Whether the path refers to a directory.
   * @returns true when the last matching rule is a plain (non-negated) one;
   *          false when it is a negation or when no rule matches at all.
   */
  isIgnored(relPath: string, isDir: boolean): boolean {
    return this.patterns.reduce<boolean>((verdict, rule) => {
      // Directory-only rules never apply to files; keep the running verdict.
      if (rule.dirOnly && !isDir) return verdict;
      return rule.regex.test(relPath) ? !rule.negated : verdict;
    }, false);
  }
}
/**
 * Build a GitignoreFilter from the raw text of a .gitignore file.
 *
 * The text is split on "\n" and each line is compiled independently; lines
 * for which compileLine returns null contribute nothing. Rule order follows
 * line order.
 */
export function parseGitignore(content: string): GitignoreFilter {
  const compiledRules = content
    .split('\n')
    .map((rawLine) => compileLine(rawLine))
    .filter((rule): rule is CompiledPattern => rule !== null);
  return new GitignoreFilter(compiledRules);
}

View File

@@ -173,13 +173,154 @@ describe('LocalCrawler.crawl() — default filtering', () => {
it('reports skippedFiles = total enumerated filtered', async () => {
const result = await crawlRoot();
// dist/, node_modules/, .git/, .png = 4 skipped
// dist/, node_modules/, .git/ are pruned at walk time — never counted.
// Only image.png reaches allRelPaths and is skipped (non-indexable extension).
// src/index.ts + README.md = 2 kept
expect(result.skippedFiles).toBe(4);
expect(result.skippedFiles).toBe(1);
expect(result.totalFiles).toBe(2);
});
});
// ---------------------------------------------------------------------------
// .gitignore support
// ---------------------------------------------------------------------------
// Fixture paths in this suite are chosen so that each assertion can only pass
// through the mechanism under test: gitignore cases use directory names that
// are NOT in the built-in ignore list and file extensions that ARE indexable,
// and pruning cases include a sibling source file that must survive.
describe('LocalCrawler.crawl() — .gitignore support', () => {
  afterEach(async () => {
    await cleanupTempRepo(root);
  });

  it('excludes files matching a .gitignore pattern', async () => {
    root = await makeTempRepo({
      '.gitignore': '*.log\nsecrets.ts',
      'src/index.ts': 'export {};',
      'debug.log': 'log data',
      'secrets.ts': 'const key = "abc";'
    });
    const result = await crawlRoot();
    expect(result.files.some((f) => f.path === 'debug.log')).toBe(false);
    // secrets.ts has an indexable extension, so only the .gitignore rule can drop it.
    expect(result.files.some((f) => f.path === 'secrets.ts')).toBe(false);
    expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
  });

  it('excludes a directory listed in .gitignore', async () => {
    // "fixtures" is not a built-in ignored directory name, so its exclusion
    // is attributable to the .gitignore rule alone.
    root = await makeTempRepo({
      '.gitignore': 'fixtures/',
      'src/index.ts': 'export {};',
      'fixtures/api.ts': 'auto-generated'
    });
    const result = await crawlRoot();
    expect(result.files.some((f) => f.path.startsWith('fixtures/'))).toBe(false);
    expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
  });

  it('respects negation patterns in .gitignore', async () => {
    // Both markdown files are indexable; the wildcard hides them and the
    // negation must bring exactly one of them back.
    root = await makeTempRepo({
      '.gitignore': '*.md\n!README.md',
      'src/index.ts': 'export {};',
      'NOTES.md': '# scratch notes',
      'README.md': '# project'
    });
    const result = await crawlRoot();
    expect(result.files.some((f) => f.path === 'NOTES.md')).toBe(false);
    expect(result.files.some((f) => f.path === 'README.md')).toBe(true);
    expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
  });

  it('falls back to IGNORED_DIR_NAMES when no .gitignore is present', async () => {
    root = await makeTempRepo({
      'src/index.ts': 'export {};',
      'node_modules/lodash/index.js': 'lodash',
      '__pycache__/main.cpython-311.pyc': 'bytecode'
    });
    const result = await crawlRoot();
    expect(result.files.every((f) => !f.path.startsWith('node_modules/'))).toBe(true);
    expect(result.files.every((f) => !f.path.startsWith('__pycache__/'))).toBe(true);
    expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
  });

  it('excludes nested node_modules via fallback (no .gitignore)', async () => {
    // The sibling source file proves the walker descended into apps/web and
    // pruned only the nested node_modules directory.
    root = await makeTempRepo({
      'src/index.ts': 'export {};',
      'apps/web/src/app.ts': 'export const app = 1;',
      'apps/web/node_modules/react/index.js': 'react'
    });
    const result = await crawlRoot();
    expect(result.files.every((f) => !f.path.includes('node_modules'))).toBe(true);
    expect(result.files.some((f) => f.path === 'apps/web/src/app.ts')).toBe(true);
    expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
  });

  it('still prunes common dependency directories when .gitignore exists', async () => {
    // The .gitignore mentions only "fixtures/"; node_modules must still be
    // pruned by the built-in list, at the root and when nested.
    root = await makeTempRepo({
      '.gitignore': 'fixtures/',
      'src/index.ts': 'export {};',
      'node_modules/lodash/index.js': 'lodash',
      'apps/web/src/app.ts': 'export const app = 1;',
      'apps/web/node_modules/react/index.js': 'react',
      'fixtures/sample.ts': 'export const sample = 1;'
    });
    const result = await crawlRoot();
    expect(result.files.every((f) => !f.path.includes('node_modules'))).toBe(true);
    expect(result.files.every((f) => !f.path.startsWith('fixtures/'))).toBe(true);
    expect(result.files.some((f) => f.path === 'apps/web/src/app.ts')).toBe(true);
    expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
  });
});
// ---------------------------------------------------------------------------
// Lock file and minified file exclusions
// ---------------------------------------------------------------------------
// Basename-level exclusions. Every excluded fixture lives outside the
// built-in ignored directories, so the file-name rule is the only thing that
// can remove it.
describe('LocalCrawler.crawl() — lock file and minified file exclusions', () => {
  afterEach(async () => {
    await cleanupTempRepo(root);
  });

  it('excludes package-lock.json', async () => {
    root = await makeTempRepo({
      'src/index.ts': 'export {};',
      'package-lock.json': '{"lockfileVersion":3}'
    });
    const result = await crawlRoot();
    expect(result.files.some((f) => f.path === 'package-lock.json')).toBe(false);
    expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
  });

  it('excludes pnpm-lock.yaml', async () => {
    root = await makeTempRepo({
      'src/index.ts': 'export {};',
      'pnpm-lock.yaml': 'lockfileVersion: 9'
    });
    const result = await crawlRoot();
    expect(result.files.some((f) => f.path === 'pnpm-lock.yaml')).toBe(false);
  });

  it('excludes minified .js files', async () => {
    // The file is placed under public/ rather than dist/: dist/ is pruned as
    // a directory, which would hide whether the ".min." rule works at all.
    root = await makeTempRepo({
      'src/index.ts': 'export {};',
      'public/vendor.min.js': '!function(){}'
    });
    const result = await crawlRoot();
    expect(result.files.some((f) => f.path === 'public/vendor.min.js')).toBe(false);
    expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
  });

  it('excludes .bundle.js files', async () => {
    root = await makeTempRepo({
      'src/index.ts': 'export {};',
      'public/app.bundle.js': 'bundled code'
    });
    const result = await crawlRoot();
    expect(result.files.some((f) => f.path === 'public/app.bundle.js')).toBe(false);
    expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
  });
});
// ---------------------------------------------------------------------------
// Size limit
// ---------------------------------------------------------------------------

View File

@@ -24,7 +24,8 @@ import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { promisify } from 'node:util';
import { detectLanguage, shouldIndexFile } from './file-filter.js';
import { detectLanguage, shouldIndexFile, shouldPruneDirectory } from './file-filter.js';
import { parseGitignore, type GitignoreFilter } from './gitignore-parser.js';
import { InvalidRefError, NotAGitRepositoryError } from './types.js';
import type { CrawledFile, CrawlResult, RepoConfig } from './types.js';
@@ -168,11 +169,18 @@ export class LocalCrawler {
onProgress: LocalCrawlOptions['onProgress'],
branch: string
): Promise<CrawlResult> {
// Step 1: Walk the directory tree and collect (relPath, size) pairs.
const statCache = new Map<string, number>();
const allRelPaths = await this.walkDirectory(rootPath, '', statCache);
// Step 1: Load .gitignore from the repo root (if present).
// When found, the filter drives file exclusion during the walk.
// Built-in dependency / build-artifact pruning still applies so local
// indexing stays focused on repository source, not vendored code.
const gitignoreFilter = await this.loadGitignore(rootPath);
// Step 2: Detect trueref.json / context7.json at the repo root first.
// Step 2: Walk the directory tree and collect (relPath, size) pairs.
// Directories are pruned early — their contents are never enumerated.
const statCache = new Map<string, number>();
const allRelPaths = await this.walkDirectory(rootPath, '', statCache, gitignoreFilter);
// Step 3: Detect trueref.json / context7.json at the repo root first.
// Only root-level config files are honoured (no directory prefix).
const configRelPath = allRelPaths.find((p) => CONFIG_FILE_NAMES.has(p));
let config = callerConfig;
@@ -180,13 +188,13 @@ export class LocalCrawler {
config = await parseConfigFile(join(rootPath, configRelPath));
}
// Step 3: Filter files according to extension, size, and config rules.
// Step 4: Filter files according to extension, size, and config rules.
const filteredPaths = allRelPaths.filter((relPath) => {
const size = statCache.get(relPath) ?? 0;
return shouldIndexFile(relPath, size, config);
});
// Step 4: Read file contents and build CrawledFile records.
// Step 5: Read file contents and build CrawledFile records.
const crawledFiles: CrawledFile[] = [];
for (const [i, relPath] of filteredPaths.entries()) {
@@ -209,7 +217,7 @@ export class LocalCrawler {
onProgress?.(i + 1, filteredPaths.length);
}
// Step 5: Build a deterministic repo-level fingerprint from file SHAs.
// Step 6: Build a deterministic repo-level fingerprint from file SHAs.
const commitSha = computeSHA256(crawledFiles.map((f) => f.sha).join(''));
return {
@@ -221,20 +229,56 @@ export class LocalCrawler {
};
}
// ---------------------------------------------------------------------------
// Private — .gitignore loading
// ---------------------------------------------------------------------------
/**
 * Load and compile the .gitignore located directly in `rootPath`.
 *
 * Any failure while reading or parsing the file is swallowed and reported
 * as null, so a missing or unreadable .gitignore never aborts the crawl.
 * Only this root-level file is consulted; .gitignore files in
 * subdirectories are not read.
 *
 * @param rootPath Directory expected to contain the .gitignore file.
 * @returns The compiled filter, or null when none could be produced.
 */
private async loadGitignore(rootPath: string): Promise<GitignoreFilter | null> {
  let loaded: GitignoreFilter | null = null;
  try {
    const gitignorePath = join(rootPath, '.gitignore');
    const text = await fs.readFile(gitignorePath, 'utf-8');
    loaded = parseGitignore(text);
  } catch {
    // File absent or unreadable — fall back to IGNORED_DIR_NAMES.
  }
  return loaded;
}
// ---------------------------------------------------------------------------
// Private — directory walk
// ---------------------------------------------------------------------------
/**
* Recursively walk a directory and collect relative paths of all regular files.
*
* Directories are pruned before recursion using the built-in ignored-directory
* list plus any matching root .gitignore rule. This avoids
* enumerating the contents of node_modules, dist, .venv, etc. entirely.
*
* Individual files are also tested against the gitignore filter when present,
* so patterns like *.log or /secrets.json are respected.
*
* Symlinks and special files (devices, sockets, FIFOs) are silently skipped.
* Populates `statCache` with file sizes so the caller can filter without a
* second `stat()` call.
*
* @param dir - Absolute path of the directory to read.
* @param rel - Relative path prefix accumulated during recursion.
* @param statCache - Mutable map from relative path → byte size.
* @param dir - Absolute path of the directory to read.
* @param rel - Relative path prefix accumulated during recursion.
* @param statCache - Mutable map from relative path → byte size.
* @param filter - Compiled gitignore filter, or null when absent.
*/
private async walkDirectory(
dir: string,
rel: string,
statCache: Map<string, number>
statCache: Map<string, number>,
filter: GitignoreFilter | null
): Promise<string[]> {
let entries;
try {
@@ -255,11 +299,25 @@ export class LocalCrawler {
const relPath = rel ? `${rel}/${entry.name}` : entry.name;
if (entry.isDirectory()) {
const children = await this.walkDirectory(join(dir, entry.name), relPath, statCache);
// Prune ignored directories before recursing — never enumerate
// their contents. Built-in exclusions always apply, even when a
// repo-level .gitignore exists but does not mention them.
const ignored = shouldPruneDirectory(relPath) || filter?.isIgnored(relPath, true);
if (ignored) continue;
const children = await this.walkDirectory(
join(dir, entry.name),
relPath,
statCache,
filter
);
files.push(...children);
} else {
// Capture file size from stat so shouldIndexFile can enforce the limit
// without reading the file.
// Apply gitignore file-level rules when a filter is loaded.
if (filter?.isIgnored(relPath, false)) continue;
// Capture file size from stat so shouldIndexFile can enforce
// the size limit without reading the file content.
try {
const stat = await fs.stat(join(dir, entry.name));
statCache.set(relPath, stat.size);