feat(crawler): ignore .gitignore files and folders, fallback to common ignored deps

This commit is contained in:
Giancarmine Salucci
2026-03-25 15:10:44 +01:00
parent 53b3d36ca3
commit 59628dd408
5 changed files with 746 additions and 48 deletions

View File

@@ -173,13 +173,154 @@ describe('LocalCrawler.crawl() — default filtering', () => {
// Verifies the crawl counters: files inside pruned directories are never
// enumerated, so they contribute to neither skippedFiles nor totalFiles.
it('reports skippedFiles = total enumerated filtered', async () => {
  const result = await crawlRoot();

  // dist/, node_modules/, .git/ are pruned at walk time — never counted.
  // Only image.png reaches allRelPaths and is skipped (non-indexable extension).
  // src/index.ts + README.md = 2 kept
  //
  // The previous expectation of 4 skipped files (which counted the pruned
  // directories) contradicted the assertion below and has been removed:
  // both could never pass against the same result.
  expect(result.skippedFiles).toBe(1);
  expect(result.totalFiles).toBe(2);
});
});
// ---------------------------------------------------------------------------
// .gitignore support
// ---------------------------------------------------------------------------
// Exercises how LocalCrawler.crawl() combines a repository's .gitignore with
// the built-in list of ignored directory names. Each test builds a throwaway
// repo via makeTempRepo(), crawls it, and inspects the returned file paths.
describe('LocalCrawler.crawl() — .gitignore support', () => {
  // Every test assigns the shared `root`; remove it so fixtures never leak
  // between tests.
  afterEach(async () => {
    await cleanupTempRepo(root);
  });

  it('excludes files matching a .gitignore pattern', async () => {
    root = await makeTempRepo({
      '.gitignore': '*.log\nsecrets.ts',
      'src/index.ts': 'export {};',
      'debug.log': 'log data',
      'secrets.ts': 'const key = "abc";'
    });

    const result = await crawlRoot();
    const paths = result.files.map((f) => f.path);

    expect(paths.includes('debug.log')).toBe(false);
    // secrets.ts has an indexable extension, so only .gitignore can exclude it.
    expect(paths.includes('secrets.ts')).toBe(false);
    expect(paths.includes('src/index.ts')).toBe(true);
  });

  it('excludes a directory listed in .gitignore', async () => {
    root = await makeTempRepo({
      '.gitignore': 'generated/',
      'src/index.ts': 'export {};',
      'generated/api.ts': 'auto-generated'
    });

    const result = await crawlRoot();

    expect(result.files.some((f) => f.path.startsWith('generated/'))).toBe(false);
    expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
  });

  it('respects negation patterns in .gitignore', async () => {
    // The earlier fixture used '*.env\n!.env.example'. '*.env' never matches
    // '.env.example', so that negation rule was a no-op, and the test asserted
    // nothing about either env file. Here each negation rule overrides an
    // ignore rule that really does match the re-included path.
    //
    // Markdown files carry the meaningful assertions because README.md is
    // treated as indexable elsewhere in this suite, whereas the indexability
    // of '.env.example' is not visible from here, so it is left unasserted.
    root = await makeTempRepo({
      '.gitignore': '.env*\n!.env.example\n*.md\n!README.md',
      'src/index.ts': 'export {};',
      '.env': 'SECRET=abc',
      '.env.example': 'SECRET=changeme',
      'NOTES.md': '# scratch notes',
      'README.md': '# project'
    });

    const result = await crawlRoot();
    const paths = result.files.map((f) => f.path);

    // Ignored and not re-included.
    expect(paths.includes('.env')).toBe(false);
    expect(paths.includes('NOTES.md')).toBe(false);
    // Matched by '*.md' but re-included by '!README.md'.
    expect(paths.includes('README.md')).toBe(true);
    expect(paths.includes('src/index.ts')).toBe(true);
  });

  it('falls back to IGNORED_DIR_NAMES when no .gitignore is present', async () => {
    root = await makeTempRepo({
      'src/index.ts': 'export {};',
      'node_modules/lodash/index.js': 'lodash',
      '__pycache__/main.cpython-311.pyc': 'bytecode'
    });

    const result = await crawlRoot();

    expect(result.files.every((f) => !f.path.startsWith('node_modules/'))).toBe(true);
    expect(result.files.every((f) => !f.path.startsWith('__pycache__/'))).toBe(true);
    expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
  });

  it('excludes nested node_modules via fallback (no .gitignore)', async () => {
    root = await makeTempRepo({
      'src/index.ts': 'export {};',
      'packages/ui/node_modules/react/index.js': 'react'
    });

    const result = await crawlRoot();

    // `includes` rather than `startsWith`: the directory sits below packages/ui/.
    expect(result.files.every((f) => !f.path.includes('node_modules'))).toBe(true);
    expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
  });

  it('still prunes common dependency directories when .gitignore exists', async () => {
    // The .gitignore deliberately does not mention node_modules, so its
    // exclusion here must come from the built-in directory list.
    root = await makeTempRepo({
      '.gitignore': 'logs/\n*.log',
      'src/index.ts': 'export {};',
      'node_modules/lodash/index.js': 'lodash',
      'packages/ui/node_modules/react/index.js': 'react',
      'logs/debug.log': 'debug'
    });

    const result = await crawlRoot();

    expect(result.files.every((f) => !f.path.includes('node_modules'))).toBe(true);
    expect(result.files.every((f) => !f.path.startsWith('logs/'))).toBe(true);
    expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
  });
});
// ---------------------------------------------------------------------------
// Lock file and minified file exclusions
// ---------------------------------------------------------------------------
// Verifies that lock files and minified/bundled JavaScript are left out of
// the crawl result even though their extensions (.json, .yaml, .js) look like
// ordinary source or config files.
describe('LocalCrawler.crawl() — lock file and minified file exclusions', () => {
  // Every test assigns the shared `root`; remove it so fixtures never leak
  // between tests.
  afterEach(async () => {
    await cleanupTempRepo(root);
  });

  it('excludes package-lock.json', async () => {
    root = await makeTempRepo({
      'src/index.ts': 'export {};',
      'package-lock.json': '{"lockfileVersion":3}'
    });

    const result = await crawlRoot();

    expect(result.files.some((f) => f.path === 'package-lock.json')).toBe(false);
    expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
  });

  it('excludes pnpm-lock.yaml', async () => {
    root = await makeTempRepo({
      'src/index.ts': 'export {};',
      'pnpm-lock.yaml': 'lockfileVersion: 9'
    });

    const result = await crawlRoot();

    expect(result.files.some((f) => f.path === 'pnpm-lock.yaml')).toBe(false);
    // Guard against a vacuous pass where the crawl returns nothing at all.
    expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
  });

  it('excludes minified .js files', async () => {
    // The fixture lives under public/ rather than dist/: dist/ is pruned by
    // default, which would hide whether the minified-file rule itself works.
    // (An earlier version first built and crawled a dist/ fixture, discarded
    // the result, and deleted the directory by hand; that dead pass is gone.)
    root = await makeTempRepo({
      'src/index.ts': 'export {};',
      'public/vendor.min.js': '!function(){}'
    });

    const result = await crawlRoot();

    expect(result.files.some((f) => f.path === 'public/vendor.min.js')).toBe(false);
    expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
  });

  it('excludes .bundle.js files', async () => {
    root = await makeTempRepo({
      'src/index.ts': 'export {};',
      'public/app.bundle.js': 'bundled code'
    });

    const result = await crawlRoot();

    expect(result.files.some((f) => f.path === 'public/app.bundle.js')).toBe(false);
    expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
  });
});
// ---------------------------------------------------------------------------
// Size limit
// ---------------------------------------------------------------------------