feat(crawler): ignore .gitignore files and folders, fallback to common ignored deps
This commit is contained in:
@@ -1,8 +1,8 @@
|
||||
/**
|
||||
* File filtering logic for the GitHub crawler (TRUEREF-0003).
|
||||
* File filtering logic shared by the GitHub and local crawlers.
|
||||
*
|
||||
* Determines whether a file in the repository tree should be downloaded
|
||||
* and indexed based on its extension, size, and the trueref.json config.
|
||||
* Determines whether a file should be indexed based on its extension, size,
|
||||
* trueref.json config, and membership in known dependency / artifact paths.
|
||||
*/
|
||||
|
||||
import { extname, basename } from 'node:path';
|
||||
@@ -56,26 +56,144 @@ export const INDEXABLE_EXTENSIONS = new Set([
|
||||
'.vue'
|
||||
]);
|
||||
|
||||
/** Maximum file size we are willing to download (500 KB). */
|
||||
/** Maximum file size we are willing to index (500 KB). */
|
||||
export const MAX_FILE_SIZE_BYTES = 500_000;
|
||||
|
||||
/**
|
||||
* Default path prefixes that are always excluded regardless of config.
|
||||
* These directories contain generated or dependency files that should never
|
||||
* be indexed.
|
||||
* Directory names that are always excluded, regardless of depth in the tree.
|
||||
*
|
||||
* Used in two ways:
|
||||
* 1. By the local crawler to prune entire directories during the walk
|
||||
* (via shouldPruneDirectory) when no .gitignore is present.
|
||||
* 2. By shouldIndexFile to drop files whose path passes through one of
|
||||
* these directories (covers the GitHub crawler, which receives flat
|
||||
* path lists from the API).
|
||||
*
|
||||
* The list covers dependency stores, build outputs, caches, and generated
|
||||
* artifacts across all major language ecosystems.
|
||||
*/
|
||||
const DEFAULT_EXCLUDES: string[] = [
|
||||
'node_modules/',
|
||||
'.git/',
|
||||
'dist/',
|
||||
'build/',
|
||||
'coverage/',
|
||||
'.next/',
|
||||
'__pycache__/',
|
||||
'vendor/',
|
||||
'target/',
|
||||
'.cache/'
|
||||
];
|
||||
export const IGNORED_DIR_NAMES = new Set([
|
||||
// ── Version control ────────────────────────────────────────────────────
|
||||
'.git', '.hg', '.svn',
|
||||
|
||||
// ── JavaScript / TypeScript ─────────────────────────────────────────────
|
||||
'node_modules',
|
||||
'.npm', '.yarn', '.pnpm-store', '.pnp',
|
||||
// Build outputs and framework caches
|
||||
'dist', 'build', 'out',
|
||||
'.next', '.nuxt', '.svelte-kit', '.vite',
|
||||
'.turbo', '.parcel-cache', '.webpack',
|
||||
|
||||
// ── Python ──────────────────────────────────────────────────────────────
|
||||
'__pycache__',
|
||||
'.venv', 'venv', 'env',
|
||||
'site-packages', '.eggs',
|
||||
'.pytest_cache', '.mypy_cache', '.ruff_cache',
|
||||
'.tox', '.nox',
|
||||
'htmlcov',
|
||||
|
||||
// ── Java / Kotlin / Scala ───────────────────────────────────────────────
|
||||
'target', // Maven + sbt
|
||||
'.gradle', '.mvn',
|
||||
|
||||
// ── Ruby ────────────────────────────────────────────────────────────────
|
||||
'.bundle',
|
||||
|
||||
// ── PHP ─────────────────────────────────────────────────────────────────
|
||||
// 'vendor' below covers PHP Composer
|
||||
|
||||
// ── .NET ────────────────────────────────────────────────────────────────
|
||||
'bin', 'obj', 'packages',
|
||||
|
||||
// ── Haskell ─────────────────────────────────────────────────────────────
|
||||
'.stack-work', 'dist-newstyle',
|
||||
|
||||
// ── Dart / Flutter ──────────────────────────────────────────────────────
|
||||
'.dart_tool',
|
||||
|
||||
// ── Swift / iOS ─────────────────────────────────────────────────────────
|
||||
'Pods', 'DerivedData',
|
||||
|
||||
// ── Elixir / Erlang ─────────────────────────────────────────────────────
|
||||
'_build', 'deps',
|
||||
|
||||
// ── Clojure ─────────────────────────────────────────────────────────────
|
||||
'.cpcache',
|
||||
|
||||
// ── Multi-ecosystem dependency directory ────────────────────────────────
|
||||
// Go, PHP, Ruby, C++ (conan), Rust (workspace) all use vendor/
|
||||
'vendor',
|
||||
|
||||
// ── Generic caches / temp ───────────────────────────────────────────────
|
||||
'.cache', '.tmp', 'tmp', 'temp', '.temp', '.sass-cache',
|
||||
|
||||
// ── Test coverage ───────────────────────────────────────────────────────
|
||||
'coverage', '.nyc_output',
|
||||
|
||||
// ── IDE / editor artefacts ──────────────────────────────────────────────
|
||||
'.idea', '.vs',
|
||||
|
||||
// ── Generated code ──────────────────────────────────────────────────────
|
||||
'generated', '__generated__', '_generated',
|
||||
|
||||
// ── Logs ────────────────────────────────────────────────────────────────
|
||||
'logs'
|
||||
]);
|
||||
|
||||
/**
|
||||
* Exact basenames that are never useful to index — primarily lock files.
|
||||
* These often have indexable extensions (.json, .yaml, .toml) so the
|
||||
* extension allow-list alone is not sufficient to exclude them.
|
||||
*/
|
||||
const IGNORED_FILE_NAMES = new Set([
|
||||
'package-lock.json',
|
||||
'yarn.lock',
|
||||
'pnpm-lock.yaml',
|
||||
'bun.lockb',
|
||||
'Gemfile.lock',
|
||||
'Cargo.lock',
|
||||
'poetry.lock',
|
||||
'Pipfile.lock',
|
||||
'composer.lock',
|
||||
'go.sum',
|
||||
'go.work.sum',
|
||||
'flake.lock'
|
||||
]);
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Directory pruning helpers (used by the local crawler)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Returns true if the given directory should be pruned during a local walk.
|
||||
*
|
||||
* Checks only the final path segment (the directory's own name), because the
|
||||
* walker calls this function as it descends — parent directories are already
|
||||
* handled by earlier recursive calls.
|
||||
*
|
||||
* Used as the fallback when no .gitignore is present.
|
||||
*/
|
||||
export function shouldPruneDirectory(relDirPath: string): boolean {
|
||||
const name = relDirPath.split('/').at(-1)!;
|
||||
return IGNORED_DIR_NAMES.has(name);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if the file's path passes through at least one ignored directory.
|
||||
*
|
||||
* Checks every directory component (all segments except the filename) so that
|
||||
* nested occurrences like "packages/ui/node_modules/lodash/index.js" are
|
||||
* correctly excluded. Used by shouldIndexFile for the GitHub crawler, which
|
||||
* receives flat path lists rather than walking a directory tree.
|
||||
*/
|
||||
function isInIgnoredDirectory(relFilePath: string): boolean {
|
||||
const parts = relFilePath.split('/');
|
||||
// All parts except the last (filename) are directory names.
|
||||
for (let i = 0; i < parts.length - 1; i++) {
|
||||
if (IGNORED_DIR_NAMES.has(parts[i])) return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Language detection
|
||||
@@ -134,15 +252,17 @@ export function detectLanguage(filePath: string): string {
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Decide whether a file from the repository tree should be downloaded.
|
||||
* Decide whether a file should be indexed.
|
||||
*
|
||||
* Rules (applied in order):
|
||||
* 1. Must have an indexable extension.
|
||||
* 2. Must not exceed the size limit.
|
||||
* 3. Must not match config.excludeFiles (exact basename match).
|
||||
* 4. Must not be under a config.excludeFolders path / regex.
|
||||
* 5. Must be under a config.folders allowlist path / regex (if specified).
|
||||
* 6. Must not start with a default-excluded prefix.
|
||||
* 3. Must not be a known lock file or other always-excluded filename.
|
||||
* 4. Must not be a minified file (*.min.js, *.min.css, *.bundle.js/css).
|
||||
* 5. Must not match config.excludeFiles (exact basename match).
|
||||
* 6. Must not be inside an ignored directory (dependency/build/cache).
|
||||
* 7. Must not be under a config.excludeFolders path / regex.
|
||||
* 8. Must be under a config.folders allowlist path / regex (if specified).
|
||||
*/
|
||||
export function shouldIndexFile(
|
||||
filePath: string,
|
||||
@@ -150,6 +270,7 @@ export function shouldIndexFile(
|
||||
config?: RepoConfig
|
||||
): boolean {
|
||||
const ext = extname(filePath).toLowerCase();
|
||||
const base = basename(filePath);
|
||||
|
||||
// 1. Extension allow-list
|
||||
if (!INDEXABLE_EXTENSIONS.has(ext)) return false;
|
||||
@@ -157,10 +278,21 @@ export function shouldIndexFile(
|
||||
// 2. Size limit
|
||||
if (fileSize > MAX_FILE_SIZE_BYTES) return false;
|
||||
|
||||
// 3. Config excludeFiles — exact basename match
|
||||
if (config?.excludeFiles?.includes(basename(filePath))) return false;
|
||||
// 3. Lock files and other always-excluded basenames
|
||||
if (IGNORED_FILE_NAMES.has(base)) return false;
|
||||
|
||||
// 4. Config excludeFolders — prefix or regex match
|
||||
// 4. Minified / bundled files
|
||||
if (base.includes('.min.') || base.endsWith('.bundle.js') || base.endsWith('.bundle.css'))
|
||||
return false;
|
||||
|
||||
// 5. Config excludeFiles — exact basename match
|
||||
if (config?.excludeFiles?.includes(base)) return false;
|
||||
|
||||
// 6. Ignored directories (dependency stores, build outputs, caches, etc.)
|
||||
// Covers nested occurrences at any depth, e.g. packages/ui/node_modules/
|
||||
if (isInIgnoredDirectory(filePath)) return false;
|
||||
|
||||
// 7. Config excludeFolders — prefix or regex match
|
||||
if (
|
||||
config?.excludeFolders?.some(
|
||||
(folder) => filePath.startsWith(folder) || new RegExp(folder).test(filePath)
|
||||
@@ -168,7 +300,7 @@ export function shouldIndexFile(
|
||||
)
|
||||
return false;
|
||||
|
||||
// 5. Config folders allowlist — if provided, the file must match at least one
|
||||
// 8. Config folders allowlist — if provided, the file must match at least one
|
||||
if (config?.folders?.length) {
|
||||
const inAllowedFolder = config.folders.some(
|
||||
(folder) => filePath.startsWith(folder) || new RegExp(folder).test(filePath)
|
||||
@@ -176,8 +308,5 @@ export function shouldIndexFile(
|
||||
if (!inAllowedFolder) return false;
|
||||
}
|
||||
|
||||
// 6. Default excludes
|
||||
if (DEFAULT_EXCLUDES.some((ex) => filePath.startsWith(ex))) return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
219
src/lib/server/crawler/gitignore-parser.test.ts
Normal file
219
src/lib/server/crawler/gitignore-parser.test.ts
Normal file
@@ -0,0 +1,219 @@
|
||||
/**
|
||||
* Unit tests for the .gitignore parser.
|
||||
*/
|
||||
|
||||
import { describe, expect, it } from 'vitest';
|
||||
import { parseGitignore } from './gitignore-parser.js';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
function ignored(pattern: string, path: string, isDir = false): boolean {
|
||||
return parseGitignore(pattern).isIgnored(path, isDir);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Blank lines and comments
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
describe('parseGitignore — blank lines and comments', () => {
|
||||
it('ignores blank lines', () => {
|
||||
const f = parseGitignore('\n\n \n');
|
||||
expect(f.isIgnored('anything.ts', false)).toBe(false);
|
||||
});
|
||||
|
||||
it('ignores comment lines', () => {
|
||||
const f = parseGitignore('# this is a comment\n # indented comment');
|
||||
expect(f.isIgnored('anything.ts', false)).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Simple unanchored patterns (match anywhere in the tree)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
describe('parseGitignore — unanchored patterns', () => {
|
||||
it('matches a bare directory name at the root', () => {
|
||||
expect(ignored('node_modules', 'node_modules', true)).toBe(true);
|
||||
});
|
||||
|
||||
it('matches a bare directory name at any depth', () => {
|
||||
expect(ignored('node_modules', 'packages/ui/node_modules', true)).toBe(true);
|
||||
});
|
||||
|
||||
it('does not match a substring of a directory name', () => {
|
||||
expect(ignored('node_modules', 'not_node_modules', true)).toBe(false);
|
||||
});
|
||||
|
||||
it('matches files by bare name at any depth', () => {
|
||||
expect(ignored('secret.env', 'config/secret.env', false)).toBe(true);
|
||||
});
|
||||
|
||||
it('matches a wildcard extension pattern at the root', () => {
|
||||
expect(ignored('*.log', 'error.log', false)).toBe(true);
|
||||
});
|
||||
|
||||
it('matches a wildcard extension pattern in a subdirectory', () => {
|
||||
expect(ignored('*.log', 'logs/app/error.log', false)).toBe(true);
|
||||
});
|
||||
|
||||
it('does not match a partial extension', () => {
|
||||
expect(ignored('*.log', 'changelog.md', false)).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Directory-only patterns (trailing /)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
describe('parseGitignore — directory-only patterns', () => {
|
||||
it('matches a directory path', () => {
|
||||
expect(ignored('dist/', 'dist', true)).toBe(true);
|
||||
});
|
||||
|
||||
it('does not match a file with the same name', () => {
|
||||
expect(ignored('dist/', 'dist', false)).toBe(false);
|
||||
});
|
||||
|
||||
it('matches the directory at any depth', () => {
|
||||
expect(ignored('node_modules/', 'packages/ui/node_modules', true)).toBe(true);
|
||||
});
|
||||
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Anchored patterns (pattern contains /)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
describe('parseGitignore — anchored patterns', () => {
|
||||
it('matches only at the root when pattern starts with /', () => {
|
||||
expect(ignored('/dist', 'dist', true)).toBe(true);
|
||||
expect(ignored('/dist', 'src/dist', true)).toBe(false);
|
||||
});
|
||||
|
||||
it('matches only at the root when pattern contains / in the middle', () => {
|
||||
expect(ignored('src/*.ts', 'src/index.ts', false)).toBe(true);
|
||||
expect(ignored('src/*.ts', 'lib/src/index.ts', false)).toBe(false);
|
||||
});
|
||||
|
||||
it('anchors to root for patterns like docs/generated', () => {
|
||||
expect(ignored('docs/generated', 'docs/generated', true)).toBe(true);
|
||||
expect(ignored('docs/generated', 'other/docs/generated', true)).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Wildcards: * and ?
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
describe('parseGitignore — * and ? wildcards', () => {
|
||||
it('* does not cross directory boundaries', () => {
|
||||
expect(ignored('src/*', 'src/index.ts', false)).toBe(true);
|
||||
expect(ignored('src/*', 'src/nested/index.ts', false)).toBe(false);
|
||||
});
|
||||
|
||||
it('? matches exactly one character', () => {
|
||||
expect(ignored('file?.ts', 'fileA.ts', false)).toBe(true);
|
||||
expect(ignored('file?.ts', 'fileAB.ts', false)).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Double-star **
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
describe('parseGitignore — ** patterns', () => {
|
||||
it('**/ matches zero segments (file at root)', () => {
|
||||
expect(ignored('**/debug.log', 'debug.log', false)).toBe(true);
|
||||
});
|
||||
|
||||
it('**/ matches one or more path segments', () => {
|
||||
expect(ignored('**/debug.log', 'logs/debug.log', false)).toBe(true);
|
||||
expect(ignored('**/debug.log', 'a/b/c/debug.log', false)).toBe(true);
|
||||
});
|
||||
|
||||
it('dir/** matches everything inside dir', () => {
|
||||
expect(ignored('build/**', 'build/index.js', false)).toBe(true);
|
||||
expect(ignored('build/**', 'build/sub/index.js', false)).toBe(true);
|
||||
expect(ignored('build/**', 'other/index.js', false)).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Negations
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
describe('parseGitignore — negation patterns', () => {
|
||||
it('! un-ignores a previously ignored path (last rule wins)', () => {
|
||||
const f = parseGitignore('*.log\n!important.log');
|
||||
expect(f.isIgnored('error.log', false)).toBe(true);
|
||||
expect(f.isIgnored('important.log', false)).toBe(false);
|
||||
});
|
||||
|
||||
it('a later positive rule re-ignores after a negation', () => {
|
||||
const f = parseGitignore('*.log\n!important.log\n*.log');
|
||||
expect(f.isIgnored('important.log', false)).toBe(true);
|
||||
});
|
||||
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Real-world .gitignore snippet
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
describe('parseGitignore — realistic .gitignore', () => {
|
||||
const gitignore = `
|
||||
# Dependencies
|
||||
node_modules/
|
||||
.pnpm-store/
|
||||
|
||||
# Build outputs
|
||||
dist/
|
||||
build/
|
||||
.next/
|
||||
|
||||
# Environment
|
||||
.env
|
||||
.env.*
|
||||
!.env.example
|
||||
|
||||
# Logs
|
||||
*.log
|
||||
logs/
|
||||
|
||||
# OS
|
||||
.DS_Store
|
||||
Thumbs.db
|
||||
`;
|
||||
|
||||
const f = parseGitignore(gitignore);
|
||||
|
||||
it('ignores node_modules at root', () => {
|
||||
expect(f.isIgnored('node_modules', true)).toBe(true);
|
||||
});
|
||||
|
||||
it('ignores node_modules at depth', () => {
|
||||
expect(f.isIgnored('packages/ui/node_modules', true)).toBe(true);
|
||||
});
|
||||
|
||||
it('ignores dist directory', () => {
|
||||
expect(f.isIgnored('dist', true)).toBe(true);
|
||||
});
|
||||
|
||||
it('ignores .env file', () => {
|
||||
expect(f.isIgnored('.env', false)).toBe(true);
|
||||
});
|
||||
|
||||
it('does not ignore .env.example (negated)', () => {
|
||||
expect(f.isIgnored('.env.example', false)).toBe(false);
|
||||
});
|
||||
|
||||
it('ignores .log files', () => {
|
||||
expect(f.isIgnored('server.log', false)).toBe(true);
|
||||
expect(f.isIgnored('logs/app.log', false)).toBe(true);
|
||||
});
|
||||
|
||||
it('does not ignore source files', () => {
|
||||
expect(f.isIgnored('src/index.ts', false)).toBe(false);
|
||||
expect(f.isIgnored('README.md', false)).toBe(false);
|
||||
});
|
||||
});
|
||||
151
src/lib/server/crawler/gitignore-parser.ts
Normal file
151
src/lib/server/crawler/gitignore-parser.ts
Normal file
@@ -0,0 +1,151 @@
|
||||
/**
|
||||
* Minimal but correct .gitignore parser for the local crawler.
|
||||
*
|
||||
* Implements the gitignore specification:
|
||||
* https://git-scm.com/docs/gitignore
|
||||
*
|
||||
* Supported: wildcards (* ?), double-star (**), directory markers (/),
|
||||
* negations (!), anchored patterns, character classes ([…]).
|
||||
*
|
||||
* Not supported: .gitignore files nested inside subdirectories (root only).
|
||||
*
|
||||
* No external dependencies — Node.js built-ins only.
|
||||
*/
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Internal types
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
interface CompiledPattern {
|
||||
regex: RegExp;
|
||||
negated: boolean;
|
||||
/** True when the pattern ends with / — only applies to directories. */
|
||||
dirOnly: boolean;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Pattern compiler
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Compile a single .gitignore line into a CompiledPattern.
|
||||
* Returns null for blank lines and comments.
|
||||
*/
|
||||
function compileLine(raw: string): CompiledPattern | null {
|
||||
let line = raw.trim();
|
||||
|
||||
// Skip blank lines and comments.
|
||||
if (!line || line.startsWith('#')) return null;
|
||||
|
||||
const negated = line.startsWith('!');
|
||||
if (negated) line = line.slice(1).trim();
|
||||
if (!line) return null;
|
||||
|
||||
const dirOnly = line.endsWith('/');
|
||||
if (dirOnly) line = line.slice(0, -1);
|
||||
|
||||
// A pattern is anchored to the repository root when it contains a slash
|
||||
// anywhere (the trailing slash was already stripped above).
|
||||
const anchored = line.includes('/');
|
||||
if (line.startsWith('/')) line = line.slice(1);
|
||||
|
||||
// Convert glob characters to a regex fragment.
|
||||
let regexBody = '';
|
||||
let i = 0;
|
||||
while (i < line.length) {
|
||||
const ch = line[i];
|
||||
|
||||
if (ch === '*' && line[i + 1] === '*') {
|
||||
if (line[i + 2] === '/') {
|
||||
// "**/" — match zero or more path segments.
|
||||
regexBody += '(?:.+/)?';
|
||||
i += 3;
|
||||
} else {
|
||||
// "**" at the end or alone — match anything.
|
||||
regexBody += '.*';
|
||||
i += 2;
|
||||
}
|
||||
} else if (ch === '*') {
|
||||
regexBody += '[^/]*';
|
||||
i++;
|
||||
} else if (ch === '?') {
|
||||
regexBody += '[^/]';
|
||||
i++;
|
||||
} else if (ch === '[') {
|
||||
// Pass character classes through verbatim.
|
||||
const end = line.indexOf(']', i + 1);
|
||||
if (end === -1) {
|
||||
regexBody += '\\[';
|
||||
i++;
|
||||
} else {
|
||||
regexBody += line.slice(i, end + 1);
|
||||
i = end + 1;
|
||||
}
|
||||
} else {
|
||||
// Escape all other regex metacharacters.
|
||||
regexBody += ch.replace(/[.+^${}()|\\]/g, '\\$&');
|
||||
i++;
|
||||
}
|
||||
}
|
||||
|
||||
// Anchored patterns are relative to the repo root.
|
||||
// Unanchored patterns can match at any path depth.
|
||||
// No trailing (?:/.*)? — the local crawler prunes directories before
|
||||
// descending, so we only ever test a path against its own entry, never
|
||||
// against a child path. Adding the suffix would make `src/*` incorrectly
|
||||
// match `src/nested/index.ts`.
|
||||
const fullPattern = anchored ? `^${regexBody}$` : `(?:^|/)${regexBody}$`;
|
||||
|
||||
return {
|
||||
regex: new RegExp(fullPattern),
|
||||
negated,
|
||||
dirOnly
|
||||
};
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Public API
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* A compiled set of .gitignore rules.
|
||||
*
|
||||
* Rules are evaluated in declaration order; the last matching rule wins
|
||||
* (same semantics as git).
|
||||
*/
|
||||
export class GitignoreFilter {
|
||||
private readonly patterns: readonly CompiledPattern[];
|
||||
|
||||
constructor(patterns: CompiledPattern[]) {
|
||||
this.patterns = patterns;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if the path should be ignored according to the loaded rules.
|
||||
*
|
||||
* @param relPath Forward-slash relative path from the repo root, no leading slash.
|
||||
* @param isDir Whether the path refers to a directory.
|
||||
*/
|
||||
isIgnored(relPath: string, isDir: boolean): boolean {
|
||||
let ignored = false;
|
||||
for (const { regex, negated, dirOnly } of this.patterns) {
|
||||
if (dirOnly && !isDir) continue;
|
||||
if (regex.test(relPath)) {
|
||||
ignored = !negated;
|
||||
}
|
||||
}
|
||||
return ignored;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse the textual content of a .gitignore file and return a GitignoreFilter.
|
||||
*/
|
||||
export function parseGitignore(content: string): GitignoreFilter {
|
||||
const patterns: CompiledPattern[] = [];
|
||||
for (const line of content.split('\n')) {
|
||||
const compiled = compileLine(line);
|
||||
if (compiled) patterns.push(compiled);
|
||||
}
|
||||
return new GitignoreFilter(patterns);
|
||||
}
|
||||
@@ -173,13 +173,154 @@ describe('LocalCrawler.crawl() — default filtering', () => {
|
||||
|
||||
it('reports skippedFiles = total enumerated – filtered', async () => {
|
||||
const result = await crawlRoot();
|
||||
// dist/, node_modules/, .git/, .png = 4 skipped
|
||||
// dist/, node_modules/, .git/ are pruned at walk time — never counted.
|
||||
// Only image.png reaches allRelPaths and is skipped (non-indexable extension).
|
||||
// src/index.ts + README.md = 2 kept
|
||||
expect(result.skippedFiles).toBe(4);
|
||||
expect(result.skippedFiles).toBe(1);
|
||||
expect(result.totalFiles).toBe(2);
|
||||
});
|
||||
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// .gitignore support
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
describe('LocalCrawler.crawl() — .gitignore support', () => {
|
||||
afterEach(async () => {
|
||||
await cleanupTempRepo(root);
|
||||
});
|
||||
|
||||
it('excludes files matching a .gitignore pattern', async () => {
|
||||
root = await makeTempRepo({
|
||||
'.gitignore': '*.log\nsecrets.ts',
|
||||
'src/index.ts': 'export {};',
|
||||
'debug.log': 'log data',
|
||||
'secrets.ts': 'const key = "abc";'
|
||||
});
|
||||
const result = await crawlRoot();
|
||||
expect(result.files.some((f) => f.path === 'debug.log')).toBe(false);
|
||||
expect(result.files.some((f) => f.path === 'secrets.ts')).toBe(false);
|
||||
expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
|
||||
});
|
||||
|
||||
it('excludes a directory listed in .gitignore', async () => {
|
||||
root = await makeTempRepo({
|
||||
'.gitignore': 'generated/',
|
||||
'src/index.ts': 'export {};',
|
||||
'generated/api.ts': 'auto-generated'
|
||||
});
|
||||
const result = await crawlRoot();
|
||||
expect(result.files.some((f) => f.path.startsWith('generated/'))).toBe(false);
|
||||
expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
|
||||
});
|
||||
|
||||
it('respects negation patterns in .gitignore', async () => {
|
||||
root = await makeTempRepo({
|
||||
'.gitignore': '*.env\n!.env.example',
|
||||
'src/index.ts': 'export {};',
|
||||
'.env': 'SECRET=abc',
|
||||
'.env.example': 'SECRET=changeme'
|
||||
});
|
||||
const result = await crawlRoot();
|
||||
// .env files don't have an indexable extension so this tests the gitignore logic
|
||||
// doesn't incorrectly block .env.example from passing through
|
||||
expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
|
||||
});
|
||||
|
||||
it('falls back to IGNORED_DIR_NAMES when no .gitignore is present', async () => {
|
||||
root = await makeTempRepo({
|
||||
'src/index.ts': 'export {};',
|
||||
'node_modules/lodash/index.js': 'lodash',
|
||||
'__pycache__/main.cpython-311.pyc': 'bytecode'
|
||||
});
|
||||
const result = await crawlRoot();
|
||||
expect(result.files.every((f) => !f.path.startsWith('node_modules/'))).toBe(true);
|
||||
expect(result.files.every((f) => !f.path.startsWith('__pycache__/'))).toBe(true);
|
||||
expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
|
||||
});
|
||||
|
||||
it('excludes nested node_modules via fallback (no .gitignore)', async () => {
|
||||
root = await makeTempRepo({
|
||||
'src/index.ts': 'export {};',
|
||||
'packages/ui/node_modules/react/index.js': 'react'
|
||||
});
|
||||
const result = await crawlRoot();
|
||||
expect(result.files.every((f) => !f.path.includes('node_modules'))).toBe(true);
|
||||
expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
|
||||
});
|
||||
|
||||
it('still prunes common dependency directories when .gitignore exists', async () => {
|
||||
root = await makeTempRepo({
|
||||
'.gitignore': 'logs/\n*.log',
|
||||
'src/index.ts': 'export {};',
|
||||
'node_modules/lodash/index.js': 'lodash',
|
||||
'packages/ui/node_modules/react/index.js': 'react',
|
||||
'logs/debug.log': 'debug'
|
||||
});
|
||||
const result = await crawlRoot();
|
||||
expect(result.files.every((f) => !f.path.includes('node_modules'))).toBe(true);
|
||||
expect(result.files.every((f) => !f.path.startsWith('logs/'))).toBe(true);
|
||||
expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
|
||||
});
|
||||
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Lock file and minified file exclusions
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
describe('LocalCrawler.crawl() — lock file and minified file exclusions', () => {
|
||||
afterEach(async () => {
|
||||
await cleanupTempRepo(root);
|
||||
});
|
||||
|
||||
it('excludes package-lock.json', async () => {
|
||||
root = await makeTempRepo({
|
||||
'src/index.ts': 'export {};',
|
||||
'package-lock.json': '{"lockfileVersion":3}'
|
||||
});
|
||||
const result = await crawlRoot();
|
||||
expect(result.files.some((f) => f.path === 'package-lock.json')).toBe(false);
|
||||
expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
|
||||
});
|
||||
|
||||
it('excludes pnpm-lock.yaml', async () => {
|
||||
root = await makeTempRepo({
|
||||
'src/index.ts': 'export {};',
|
||||
'pnpm-lock.yaml': 'lockfileVersion: 9'
|
||||
});
|
||||
const result = await crawlRoot();
|
||||
expect(result.files.some((f) => f.path === 'pnpm-lock.yaml')).toBe(false);
|
||||
});
|
||||
|
||||
it('excludes minified .js files', async () => {
|
||||
root = await makeTempRepo({
|
||||
'src/index.ts': 'export {};',
|
||||
'dist/vendor.min.js': '!function(e,t){}()'
|
||||
});
|
||||
const result = await crawlRoot();
|
||||
// dist/ is pruned by default — test via shouldIndexFile logic only if .gitignore present
|
||||
// Use a custom path outside ignored dirs:
|
||||
await fs.rm(root, { recursive: true, force: true });
|
||||
root = await makeTempRepo({
|
||||
'src/index.ts': 'export {};',
|
||||
'public/vendor.min.js': '!function(){}'
|
||||
});
|
||||
const r2 = await crawlRoot();
|
||||
expect(r2.files.some((f) => f.path === 'public/vendor.min.js')).toBe(false);
|
||||
expect(r2.files.some((f) => f.path === 'src/index.ts')).toBe(true);
|
||||
});
|
||||
|
||||
it('excludes .bundle.js files', async () => {
|
||||
root = await makeTempRepo({
|
||||
'src/index.ts': 'export {};',
|
||||
'public/app.bundle.js': 'bundled code'
|
||||
});
|
||||
const result = await crawlRoot();
|
||||
expect(result.files.some((f) => f.path === 'public/app.bundle.js')).toBe(false);
|
||||
expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
|
||||
});
|
||||
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Size limit
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
@@ -24,7 +24,8 @@ import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { promisify } from 'node:util';
|
||||
|
||||
import { detectLanguage, shouldIndexFile } from './file-filter.js';
|
||||
import { detectLanguage, shouldIndexFile, shouldPruneDirectory } from './file-filter.js';
|
||||
import { parseGitignore, type GitignoreFilter } from './gitignore-parser.js';
|
||||
import { InvalidRefError, NotAGitRepositoryError } from './types.js';
|
||||
import type { CrawledFile, CrawlResult, RepoConfig } from './types.js';
|
||||
|
||||
@@ -168,11 +169,18 @@ export class LocalCrawler {
|
||||
onProgress: LocalCrawlOptions['onProgress'],
|
||||
branch: string
|
||||
): Promise<CrawlResult> {
|
||||
// Step 1: Walk the directory tree and collect (relPath, size) pairs.
|
||||
const statCache = new Map<string, number>();
|
||||
const allRelPaths = await this.walkDirectory(rootPath, '', statCache);
|
||||
// Step 1: Load .gitignore from the repo root (if present).
|
||||
// When found, the filter drives file exclusion during the walk.
|
||||
// Built-in dependency / build-artifact pruning still applies so local
|
||||
// indexing stays focused on repository source, not vendored code.
|
||||
const gitignoreFilter = await this.loadGitignore(rootPath);
|
||||
|
||||
// Step 2: Detect trueref.json / context7.json at the repo root first.
|
||||
// Step 2: Walk the directory tree and collect (relPath, size) pairs.
|
||||
// Directories are pruned early — their contents are never enumerated.
|
||||
const statCache = new Map<string, number>();
|
||||
const allRelPaths = await this.walkDirectory(rootPath, '', statCache, gitignoreFilter);
|
||||
|
||||
// Step 3: Detect trueref.json / context7.json at the repo root first.
|
||||
// Only root-level config files are honoured (no directory prefix).
|
||||
const configRelPath = allRelPaths.find((p) => CONFIG_FILE_NAMES.has(p));
|
||||
let config = callerConfig;
|
||||
@@ -180,13 +188,13 @@ export class LocalCrawler {
|
||||
config = await parseConfigFile(join(rootPath, configRelPath));
|
||||
}
|
||||
|
||||
// Step 3: Filter files according to extension, size, and config rules.
|
||||
// Step 4: Filter files according to extension, size, and config rules.
|
||||
const filteredPaths = allRelPaths.filter((relPath) => {
|
||||
const size = statCache.get(relPath) ?? 0;
|
||||
return shouldIndexFile(relPath, size, config);
|
||||
});
|
||||
|
||||
// Step 4: Read file contents and build CrawledFile records.
|
||||
// Step 5: Read file contents and build CrawledFile records.
|
||||
const crawledFiles: CrawledFile[] = [];
|
||||
|
||||
for (const [i, relPath] of filteredPaths.entries()) {
|
||||
@@ -209,7 +217,7 @@ export class LocalCrawler {
|
||||
onProgress?.(i + 1, filteredPaths.length);
|
||||
}
|
||||
|
||||
// Step 5: Build a deterministic repo-level fingerprint from file SHAs.
|
||||
// Step 6: Build a deterministic repo-level fingerprint from file SHAs.
|
||||
const commitSha = computeSHA256(crawledFiles.map((f) => f.sha).join(''));
|
||||
|
||||
return {
|
||||
@@ -221,20 +229,56 @@ export class LocalCrawler {
|
||||
};
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Private — .gitignore loading
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Attempt to read and parse the root .gitignore file.
|
||||
* Returns null when the file does not exist or cannot be read.
|
||||
*
|
||||
* Only the repository root .gitignore is honoured. Nested .gitignore files
|
||||
* inside subdirectories are not processed (they are rare and their absence
|
||||
* only leads to over-indexing, never incorrect indexing).
|
||||
*/
|
||||
private async loadGitignore(rootPath: string): Promise<GitignoreFilter | null> {
|
||||
try {
|
||||
const content = await fs.readFile(join(rootPath, '.gitignore'), 'utf-8');
|
||||
return parseGitignore(content);
|
||||
} catch {
|
||||
// File absent or unreadable — fall back to IGNORED_DIR_NAMES.
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Private — directory walk
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Recursively walk a directory and collect relative paths of all regular files.
|
||||
*
|
||||
* Directories are pruned before recursion using the built-in ignored-directory
|
||||
* list plus any matching root .gitignore rule. This avoids
|
||||
* enumerating the contents of node_modules, dist, .venv, etc. entirely.
|
||||
*
|
||||
* Individual files are also tested against the gitignore filter when present,
|
||||
* so patterns like *.log or /secrets.json are respected.
|
||||
*
|
||||
* Symlinks and special files (devices, sockets, FIFOs) are silently skipped.
|
||||
* Populates `statCache` with file sizes so the caller can filter without a
|
||||
* second `stat()` call.
|
||||
*
|
||||
 * @param dir - Absolute path of the directory to read.
 * @param rel - Relative path prefix accumulated during recursion.
 * @param statCache - Mutable map from relative path → byte size.
 * @param filter - Compiled gitignore filter, or null when absent.
|
||||
*/
|
||||
private async walkDirectory(
|
||||
dir: string,
|
||||
rel: string,
|
||||
statCache: Map<string, number>
|
||||
statCache: Map<string, number>,
|
||||
filter: GitignoreFilter | null
|
||||
): Promise<string[]> {
|
||||
let entries;
|
||||
try {
|
||||
@@ -255,11 +299,25 @@ export class LocalCrawler {
|
||||
const relPath = rel ? `${rel}/${entry.name}` : entry.name;
|
||||
|
||||
if (entry.isDirectory()) {
|
||||
const children = await this.walkDirectory(join(dir, entry.name), relPath, statCache);
|
||||
// Prune ignored directories before recursing — never enumerate
|
||||
// their contents. Built-in exclusions always apply, even when a
|
||||
// repo-level .gitignore exists but does not mention them.
|
||||
const ignored = shouldPruneDirectory(relPath) || filter?.isIgnored(relPath, true);
|
||||
if (ignored) continue;
|
||||
|
||||
const children = await this.walkDirectory(
|
||||
join(dir, entry.name),
|
||||
relPath,
|
||||
statCache,
|
||||
filter
|
||||
);
|
||||
files.push(...children);
|
||||
} else {
|
||||
// Capture file size from stat so shouldIndexFile can enforce the limit
|
||||
// without reading the file.
|
||||
// Apply gitignore file-level rules when a filter is loaded.
|
||||
if (filter?.isIgnored(relPath, false)) continue;
|
||||
|
||||
// Capture file size from stat so shouldIndexFile can enforce
|
||||
// the size limit without reading the file content.
|
||||
try {
|
||||
const stat = await fs.stat(join(dir, entry.name));
|
||||
statCache.set(relPath, stat.size);
|
||||
|
||||
Reference in New Issue
Block a user