feat(crawler): ignore files and folders listed in .gitignore, fall back to common ignored deps

This commit is contained in:
Giancarmine Salucci
2026-03-25 15:10:44 +01:00
parent 53b3d36ca3
commit 59628dd408
5 changed files with 746 additions and 48 deletions

View File

@@ -24,7 +24,8 @@ import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { promisify } from 'node:util';
import { detectLanguage, shouldIndexFile } from './file-filter.js';
import { detectLanguage, shouldIndexFile, shouldPruneDirectory } from './file-filter.js';
import { parseGitignore, type GitignoreFilter } from './gitignore-parser.js';
import { InvalidRefError, NotAGitRepositoryError } from './types.js';
import type { CrawledFile, CrawlResult, RepoConfig } from './types.js';
@@ -168,11 +169,18 @@ export class LocalCrawler {
onProgress: LocalCrawlOptions['onProgress'],
branch: string
): Promise<CrawlResult> {
// Step 1: Walk the directory tree and collect (relPath, size) pairs.
const statCache = new Map<string, number>();
const allRelPaths = await this.walkDirectory(rootPath, '', statCache);
// Step 1: Load .gitignore from the repo root (if present).
// When found, the filter drives file exclusion during the walk.
// Built-in dependency / build-artifact pruning still applies so local
// indexing stays focused on repository source, not vendored code.
const gitignoreFilter = await this.loadGitignore(rootPath);
// Step 2: Detect trueref.json / context7.json at the repo root first.
// Step 2: Walk the directory tree and collect (relPath, size) pairs.
// Directories are pruned early — their contents are never enumerated.
const statCache = new Map<string, number>();
const allRelPaths = await this.walkDirectory(rootPath, '', statCache, gitignoreFilter);
// Step 3: Detect trueref.json / context7.json at the repo root first.
// Only root-level config files are honoured (no directory prefix).
const configRelPath = allRelPaths.find((p) => CONFIG_FILE_NAMES.has(p));
let config = callerConfig;
@@ -180,13 +188,13 @@ export class LocalCrawler {
config = await parseConfigFile(join(rootPath, configRelPath));
}
// Step 3: Filter files according to extension, size, and config rules.
// Step 4: Filter files according to extension, size, and config rules.
const filteredPaths = allRelPaths.filter((relPath) => {
const size = statCache.get(relPath) ?? 0;
return shouldIndexFile(relPath, size, config);
});
// Step 4: Read file contents and build CrawledFile records.
// Step 5: Read file contents and build CrawledFile records.
const crawledFiles: CrawledFile[] = [];
for (const [i, relPath] of filteredPaths.entries()) {
@@ -209,7 +217,7 @@ export class LocalCrawler {
onProgress?.(i + 1, filteredPaths.length);
}
// Step 5: Build a deterministic repo-level fingerprint from file SHAs.
// Step 6: Build a deterministic repo-level fingerprint from file SHAs.
const commitSha = computeSHA256(crawledFiles.map((f) => f.sha).join(''));
return {
@@ -221,20 +229,56 @@ export class LocalCrawler {
};
}
// ---------------------------------------------------------------------------
// Private — .gitignore loading
// ---------------------------------------------------------------------------
/**
 * Attempt to read and parse the root .gitignore file.
 *
 * Returns null when the file does not exist or cannot be read. Note that
 * parseGitignore runs inside the try block, so an error thrown while
 * parsing is also caught and mapped to null — a malformed .gitignore
 * silently disables gitignore-based filtering rather than failing the crawl.
 *
 * Only the repository root .gitignore is honoured. Nested .gitignore files
 * inside subdirectories are not processed (they are rare and their absence
 * only leads to over-indexing, never incorrect indexing).
 *
 * @param rootPath - Absolute path of the repository root.
 * @returns Compiled gitignore filter, or null when no usable .gitignore exists.
 */
private async loadGitignore(rootPath: string): Promise<GitignoreFilter | null> {
try {
const content = await fs.readFile(join(rootPath, '.gitignore'), 'utf-8');
return parseGitignore(content);
} catch {
// Absent, unreadable, or unparsable — callers fall back to the
// built-in IGNORED_DIR_NAMES pruning only.
return null;
}
}
// ---------------------------------------------------------------------------
// Private — directory walk
// ---------------------------------------------------------------------------
/**
* Recursively walk a directory and collect relative paths of all regular files.
*
* Directories are pruned before recursion using the built-in ignored-directory
* list plus any matching root .gitignore rule. This avoids
* enumerating the contents of node_modules, dist, .venv, etc. entirely.
*
* Individual files are also tested against the gitignore filter when present,
* so patterns like *.log or /secrets.json are respected.
*
* Symlinks and special files (devices, sockets, FIFOs) are silently skipped.
* Populates `statCache` with file sizes so the caller can filter without a
* second `stat()` call.
*
* @param dir - Absolute path of the directory to read.
* @param rel - Relative path prefix accumulated during recursion.
* @param statCache - Mutable map from relative path → byte size.
* @param dir - Absolute path of the directory to read.
* @param rel - Relative path prefix accumulated during recursion.
* @param statCache - Mutable map from relative path → byte size.
* @param filter - Compiled gitignore filter, or null when absent.
*/
private async walkDirectory(
dir: string,
rel: string,
statCache: Map<string, number>
statCache: Map<string, number>,
filter: GitignoreFilter | null
): Promise<string[]> {
let entries;
try {
@@ -255,11 +299,25 @@ export class LocalCrawler {
const relPath = rel ? `${rel}/${entry.name}` : entry.name;
if (entry.isDirectory()) {
const children = await this.walkDirectory(join(dir, entry.name), relPath, statCache);
// Prune ignored directories before recursing — never enumerate
// their contents. Built-in exclusions always apply, even when a
// repo-level .gitignore exists but does not mention them.
const ignored = shouldPruneDirectory(relPath) || filter?.isIgnored(relPath, true);
if (ignored) continue;
const children = await this.walkDirectory(
join(dir, entry.name),
relPath,
statCache,
filter
);
files.push(...children);
} else {
// Capture file size from stat so shouldIndexFile can enforce the limit
// without reading the file.
// Apply gitignore file-level rules when a filter is loaded.
if (filter?.isIgnored(relPath, false)) continue;
// Capture file size from stat so shouldIndexFile can enforce
// the size limit without reading the file content.
try {
const stat = await fs.stat(join(dir, entry.name));
statCache.set(relPath, stat.size);