feat(crawler): ignore files and folders listed in .gitignore, fall back to common ignored deps

This commit is contained in:
Giancarmine Salucci
2026-03-25 15:10:44 +01:00
parent 53b3d36ca3
commit 59628dd408
5 changed files with 746 additions and 48 deletions

View File

@@ -24,7 +24,8 @@ import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { promisify } from 'node:util';
import { detectLanguage, shouldIndexFile } from './file-filter.js';
import { detectLanguage, shouldIndexFile, shouldPruneDirectory } from './file-filter.js';
import { parseGitignore, type GitignoreFilter } from './gitignore-parser.js';
import { InvalidRefError, NotAGitRepositoryError } from './types.js';
import type { CrawledFile, CrawlResult, RepoConfig } from './types.js';
@@ -168,11 +169,18 @@ export class LocalCrawler {
onProgress: LocalCrawlOptions['onProgress'],
branch: string
): Promise<CrawlResult> {
// Step 1: Walk the directory tree and collect (relPath, size) pairs.
const statCache = new Map<string, number>();
const allRelPaths = await this.walkDirectory(rootPath, '', statCache);
// Step 1: Load .gitignore from the repo root (if present).
// When found, the filter drives file exclusion during the walk.
// Built-in dependency / build-artifact pruning still applies so local
// indexing stays focused on repository source, not vendored code.
const gitignoreFilter = await this.loadGitignore(rootPath);
// Step 2: Detect trueref.json / context7.json at the repo root first.
// Step 2: Walk the directory tree and collect (relPath, size) pairs.
// Directories are pruned early — their contents are never enumerated.
const statCache = new Map<string, number>();
const allRelPaths = await this.walkDirectory(rootPath, '', statCache, gitignoreFilter);
// Step 3: Detect trueref.json / context7.json at the repo root first.
// Only root-level config files are honoured (no directory prefix).
const configRelPath = allRelPaths.find((p) => CONFIG_FILE_NAMES.has(p));
let config = callerConfig;
@@ -180,13 +188,13 @@ export class LocalCrawler {
config = await parseConfigFile(join(rootPath, configRelPath));
}
// Step 3: Filter files according to extension, size, and config rules.
// Step 4: Filter files according to extension, size, and config rules.
const filteredPaths = allRelPaths.filter((relPath) => {
const size = statCache.get(relPath) ?? 0;
return shouldIndexFile(relPath, size, config);
});
// Step 4: Read file contents and build CrawledFile records.
// Step 5: Read file contents and build CrawledFile records.
const crawledFiles: CrawledFile[] = [];
for (const [i, relPath] of filteredPaths.entries()) {
@@ -209,7 +217,7 @@ export class LocalCrawler {
onProgress?.(i + 1, filteredPaths.length);
}
// Step 5: Build a deterministic repo-level fingerprint from file SHAs.
// Step 6: Build a deterministic repo-level fingerprint from file SHAs.
const commitSha = computeSHA256(crawledFiles.map((f) => f.sha).join(''));
return {
@@ -221,20 +229,56 @@ export class LocalCrawler {
};
}
// ---------------------------------------------------------------------------
// Private — .gitignore loading
// ---------------------------------------------------------------------------
/**
 * Attempt to read and parse the root .gitignore file.
 *
 * Returns null when the file does not exist or cannot be read. Note that
 * parseGitignore runs inside the try block, so an error thrown while
 * parsing is also caught and mapped to null — a malformed .gitignore
 * silently disables gitignore-based filtering rather than failing the crawl.
 *
 * Only the repository root .gitignore is honoured. Nested .gitignore files
 * inside subdirectories are not processed (they are rare and their absence
 * only leads to over-indexing, never incorrect indexing).
 *
 * @param rootPath - Absolute path of the repository root.
 * @returns Compiled gitignore filter, or null when no usable .gitignore exists.
 */
private async loadGitignore(rootPath: string): Promise<GitignoreFilter | null> {
try {
const content = await fs.readFile(join(rootPath, '.gitignore'), 'utf-8');
return parseGitignore(content);
} catch {
// Absent, unreadable, or unparsable — callers fall back to the
// built-in IGNORED_DIR_NAMES pruning only.
return null;
}
}
// ---------------------------------------------------------------------------
// Private — directory walk
// ---------------------------------------------------------------------------
/**
* Recursively walk a directory and collect relative paths of all regular files.
*
* Directories are pruned before recursion using the built-in ignored-directory
* list plus any matching root .gitignore rule. This avoids
* enumerating the contents of node_modules, dist, .venv, etc. entirely.
*
* Individual files are also tested against the gitignore filter when present,
* so patterns like *.log or /secrets.json are respected.
*
* Symlinks and special files (devices, sockets, FIFOs) are silently skipped.
* Populates `statCache` with file sizes so the caller can filter without a
* second `stat()` call.
*
* @param dir - Absolute path of the directory to read.
* @param rel - Relative path prefix accumulated during recursion.
* @param statCache - Mutable map from relative path → byte size.
* @param dir - Absolute path of the directory to read.
* @param rel - Relative path prefix accumulated during recursion.
* @param statCache - Mutable map from relative path → byte size.
* @param filter - Compiled gitignore filter, or null when absent.
*/
private async walkDirectory(
dir: string,
rel: string,
statCache: Map<string, number>
statCache: Map<string, number>,
filter: GitignoreFilter | null
): Promise<string[]> {
let entries;
try {
@@ -255,11 +299,25 @@ export class LocalCrawler {
const relPath = rel ? `${rel}/${entry.name}` : entry.name;
if (entry.isDirectory()) {
const children = await this.walkDirectory(join(dir, entry.name), relPath, statCache);
// Prune ignored directories before recursing — never enumerate
// their contents. Built-in exclusions always apply, even when a
// repo-level .gitignore exists but does not mention them.
const ignored = shouldPruneDirectory(relPath) || filter?.isIgnored(relPath, true);
if (ignored) continue;
const children = await this.walkDirectory(
join(dir, entry.name),
relPath,
statCache,
filter
);
files.push(...children);
} else {
// Capture file size from stat so shouldIndexFile can enforce the limit
// without reading the file.
// Apply gitignore file-level rules when a filter is loaded.
if (filter?.isIgnored(relPath, false)) continue;
// Capture file size from stat so shouldIndexFile can enforce
// the size limit without reading the file content.
try {
const stat = await fs.stat(join(dir, entry.name));
statCache.set(relPath, stat.size);