feat(crawler): ignore files and folders matched by .gitignore, fall back to common ignored deps
This commit is contained in:
@@ -24,7 +24,8 @@ import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { promisify } from 'node:util';
|
||||
|
||||
import { detectLanguage, shouldIndexFile } from './file-filter.js';
|
||||
import { detectLanguage, shouldIndexFile, shouldPruneDirectory } from './file-filter.js';
|
||||
import { parseGitignore, type GitignoreFilter } from './gitignore-parser.js';
|
||||
import { InvalidRefError, NotAGitRepositoryError } from './types.js';
|
||||
import type { CrawledFile, CrawlResult, RepoConfig } from './types.js';
|
||||
|
||||
@@ -168,11 +169,18 @@ export class LocalCrawler {
|
||||
onProgress: LocalCrawlOptions['onProgress'],
|
||||
branch: string
|
||||
): Promise<CrawlResult> {
|
||||
// Step 1: Walk the directory tree and collect (relPath, size) pairs.
|
||||
const statCache = new Map<string, number>();
|
||||
const allRelPaths = await this.walkDirectory(rootPath, '', statCache);
|
||||
// Step 1: Load .gitignore from the repo root (if present).
|
||||
// When found, the filter drives file exclusion during the walk.
|
||||
// Built-in dependency / build-artifact pruning still applies so local
|
||||
// indexing stays focused on repository source, not vendored code.
|
||||
const gitignoreFilter = await this.loadGitignore(rootPath);
|
||||
|
||||
// Step 2: Detect trueref.json / context7.json at the repo root first.
|
||||
// Step 2: Walk the directory tree and collect (relPath, size) pairs.
|
||||
// Directories are pruned early — their contents are never enumerated.
|
||||
const statCache = new Map<string, number>();
|
||||
const allRelPaths = await this.walkDirectory(rootPath, '', statCache, gitignoreFilter);
|
||||
|
||||
// Step 3: Detect trueref.json / context7.json at the repo root first.
|
||||
// Only root-level config files are honoured (no directory prefix).
|
||||
const configRelPath = allRelPaths.find((p) => CONFIG_FILE_NAMES.has(p));
|
||||
let config = callerConfig;
|
||||
@@ -180,13 +188,13 @@ export class LocalCrawler {
|
||||
config = await parseConfigFile(join(rootPath, configRelPath));
|
||||
}
|
||||
|
||||
// Step 3: Filter files according to extension, size, and config rules.
|
||||
// Step 4: Filter files according to extension, size, and config rules.
|
||||
const filteredPaths = allRelPaths.filter((relPath) => {
|
||||
const size = statCache.get(relPath) ?? 0;
|
||||
return shouldIndexFile(relPath, size, config);
|
||||
});
|
||||
|
||||
// Step 4: Read file contents and build CrawledFile records.
|
||||
// Step 5: Read file contents and build CrawledFile records.
|
||||
const crawledFiles: CrawledFile[] = [];
|
||||
|
||||
for (const [i, relPath] of filteredPaths.entries()) {
|
||||
@@ -209,7 +217,7 @@ export class LocalCrawler {
|
||||
onProgress?.(i + 1, filteredPaths.length);
|
||||
}
|
||||
|
||||
// Step 5: Build a deterministic repo-level fingerprint from file SHAs.
|
||||
// Step 6: Build a deterministic repo-level fingerprint from file SHAs.
|
||||
const commitSha = computeSHA256(crawledFiles.map((f) => f.sha).join(''));
|
||||
|
||||
return {
|
||||
@@ -221,20 +229,56 @@ export class LocalCrawler {
|
||||
};
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Private — .gitignore loading
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Attempt to read and parse the root .gitignore file.
|
||||
* Returns null when the file does not exist or cannot be read.
|
||||
*
|
||||
* Only the repository root .gitignore is honoured. Nested .gitignore files
|
||||
* inside subdirectories are not processed (they are rare and their absence
|
||||
* only leads to over-indexing, never incorrect indexing).
|
||||
*/
|
||||
private async loadGitignore(rootPath: string): Promise<GitignoreFilter | null> {
|
||||
try {
|
||||
const content = await fs.readFile(join(rootPath, '.gitignore'), 'utf-8');
|
||||
return parseGitignore(content);
|
||||
} catch {
|
||||
// File absent or unreadable — fall back to IGNORED_DIR_NAMES.
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Private — directory walk
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Recursively walk a directory and collect relative paths of all regular files.
|
||||
*
|
||||
* Directories are pruned before recursion using the built-in ignored-directory
|
||||
* list plus any matching root .gitignore rule. This avoids
|
||||
* enumerating the contents of node_modules, dist, .venv, etc. entirely.
|
||||
*
|
||||
* Individual files are also tested against the gitignore filter when present,
|
||||
* so patterns like *.log or /secrets.json are respected.
|
||||
*
|
||||
* Symlinks and special files (devices, sockets, FIFOs) are silently skipped.
|
||||
* Populates `statCache` with file sizes so the caller can filter without a
|
||||
* second `stat()` call.
|
||||
*
|
||||
* @param dir - Absolute path of the directory to read.
|
||||
* @param rel - Relative path prefix accumulated during recursion.
|
||||
* @param statCache - Mutable map from relative path → byte size.
|
||||
* @param dir - Absolute path of the directory to read.
|
||||
* @param rel - Relative path prefix accumulated during recursion.
|
||||
* @param statCache - Mutable map from relative path → byte size.
|
||||
* @param filter - Compiled gitignore filter, or null when absent.
|
||||
*/
|
||||
private async walkDirectory(
|
||||
dir: string,
|
||||
rel: string,
|
||||
statCache: Map<string, number>
|
||||
statCache: Map<string, number>,
|
||||
filter: GitignoreFilter | null
|
||||
): Promise<string[]> {
|
||||
let entries;
|
||||
try {
|
||||
@@ -255,11 +299,25 @@ export class LocalCrawler {
|
||||
const relPath = rel ? `${rel}/${entry.name}` : entry.name;
|
||||
|
||||
if (entry.isDirectory()) {
|
||||
const children = await this.walkDirectory(join(dir, entry.name), relPath, statCache);
|
||||
// Prune ignored directories before recursing — never enumerate
|
||||
// their contents. Built-in exclusions always apply, even when a
|
||||
// repo-level .gitignore exists but does not mention them.
|
||||
const ignored = shouldPruneDirectory(relPath) || filter?.isIgnored(relPath, true);
|
||||
if (ignored) continue;
|
||||
|
||||
const children = await this.walkDirectory(
|
||||
join(dir, entry.name),
|
||||
relPath,
|
||||
statCache,
|
||||
filter
|
||||
);
|
||||
files.push(...children);
|
||||
} else {
|
||||
// Capture file size from stat so shouldIndexFile can enforce the limit
|
||||
// without reading the file.
|
||||
// Apply gitignore file-level rules when a filter is loaded.
|
||||
if (filter?.isIgnored(relPath, false)) continue;
|
||||
|
||||
// Capture file size from stat so shouldIndexFile can enforce
|
||||
// the size limit without reading the file content.
|
||||
try {
|
||||
const stat = await fs.stat(join(dir, entry.name));
|
||||
statCache.set(relPath, stat.size);
|
||||
|
||||
Reference in New Issue
Block a user