When trueref.json specifies a `folders` allowlist (e.g. ["src/"]), shouldIndexFile() excludes trueref.json itself because it lives at the repo root. The indexing pipeline then searches crawlResult.files for the config file, finds nothing, and never writes rules to repository_configs. Fix (Option B): add a `config` field to CrawlResult so LocalCrawler returns the pre-parsed config directly. The indexing pipeline now reads crawlResult.config first instead of scanning files[], which resolves the regression for all repos with a folders allowlist. - Add `config?: RepoConfig` to CrawlResult in crawler/types.ts - Return `config` from LocalCrawler.crawlDirectory() - Update IndexingPipeline.crawl() to propagate CrawlResult.config - Update IndexingPipeline.run() to prefer crawlResult.config over files - Add regression tests covering the folders-allowlist exclusion scenario Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
750 lines
26 KiB
TypeScript
/**
 * Unit tests for the local filesystem crawler (TRUEREF-0004).
 *
 * Each test that needs a filesystem fixture creates a temporary directory via
 * `fs.mkdtemp`, writes the required files, runs the crawler, then cleans up
 * with `fs.rm` regardless of the test outcome.
 */

import { execFile } from 'node:child_process';
|
||
import { createHash } from 'node:crypto';
|
||
import { promises as fs } from 'node:fs';
|
||
import { join } from 'node:path';
|
||
import { tmpdir } from 'node:os';
|
||
import { promisify } from 'node:util';
|
||
|
||
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
|
||
|
||
import { LocalCrawler } from './local.crawler.js';
|
||
import type { LocalCrawlOptions } from './local.crawler.js';
|
||
import { InvalidRefError, NotAGitRepositoryError } from './types.js';
|
||
|
||
const execFileAsync = promisify(execFile);
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// Helpers
|
||
// ---------------------------------------------------------------------------
|
||
|
||
function sha256(content: string): string {
|
||
return createHash('sha256').update(content, 'utf-8').digest('hex');
|
||
}
|
||
|
||
/** Create a temp directory, write a map of relPath → content, return rootPath. */
|
||
async function makeTempRepo(files: Record<string, string>): Promise<string> {
|
||
const root = await fs.mkdtemp(join(tmpdir(), 'trueref-test-'));
|
||
for (const [relPath, content] of Object.entries(files)) {
|
||
const absPath = join(root, relPath);
|
||
await fs.mkdir(join(absPath, '..'), { recursive: true });
|
||
await fs.writeFile(absPath, content, 'utf-8');
|
||
}
|
||
return root;
|
||
}
|
||
|
||
/** Remove a temporary directory tree created by makeTempRepo. */
|
||
async function cleanupTempRepo(root: string): Promise<void> {
|
||
await fs.rm(root, { recursive: true, force: true });
|
||
}
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// Test state
|
||
// ---------------------------------------------------------------------------
|
||
|
||
let root: string = '';
|
||
const crawler = new LocalCrawler();
|
||
|
||
async function crawlRoot(
|
||
opts: Partial<LocalCrawlOptions> = {}
|
||
): Promise<ReturnType<LocalCrawler['crawl']>> {
|
||
return crawler.crawl({ rootPath: root, ...opts });
|
||
}
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// Basic crawl behaviour
|
||
// ---------------------------------------------------------------------------
|
||
|
||
describe('LocalCrawler.crawl() — basic file enumeration', () => {
|
||
beforeEach(async () => {
|
||
root = await makeTempRepo({
|
||
'README.md': '# Hello',
|
||
'src/index.ts': 'export const x = 1;',
|
||
'src/utils.ts': 'export const y = 2;',
|
||
'package.json': '{"name":"test"}'
|
||
});
|
||
});
|
||
|
||
afterEach(async () => {
|
||
await cleanupTempRepo(root);
|
||
});
|
||
|
||
it('returns all indexable files', async () => {
|
||
const result = await crawlRoot();
|
||
const paths = result.files.map((f) => f.path).sort();
|
||
expect(paths).toEqual(['README.md', 'package.json', 'src/index.ts', 'src/utils.ts'].sort());
|
||
});
|
||
|
||
it('populates content as a UTF-8 string', async () => {
|
||
const result = await crawlRoot();
|
||
const readme = result.files.find((f) => f.path === 'README.md');
|
||
expect(readme?.content).toBe('# Hello');
|
||
});
|
||
|
||
it('sets size equal to Buffer.byteLength of content', async () => {
|
||
const result = await crawlRoot();
|
||
for (const file of result.files) {
|
||
expect(file.size).toBe(Buffer.byteLength(file.content, 'utf-8'));
|
||
}
|
||
});
|
||
|
||
it('computes correct SHA-256 per file', async () => {
|
||
const result = await crawlRoot();
|
||
const readme = result.files.find((f) => f.path === 'README.md');
|
||
expect(readme?.sha).toBe(sha256('# Hello'));
|
||
});
|
||
|
||
it('detects language from extension', async () => {
|
||
const result = await crawlRoot();
|
||
const ts = result.files.find((f) => f.path === 'src/index.ts');
|
||
expect(ts?.language).toBe('typescript');
|
||
const md = result.files.find((f) => f.path === 'README.md');
|
||
expect(md?.language).toBe('markdown');
|
||
const json = result.files.find((f) => f.path === 'package.json');
|
||
expect(json?.language).toBe('json');
|
||
});
|
||
|
||
it('sets branch to "local"', async () => {
|
||
const result = await crawlRoot();
|
||
expect(result.branch).toBe('local');
|
||
});
|
||
|
||
it('sets totalFiles to the count of filtered files', async () => {
|
||
const result = await crawlRoot();
|
||
expect(result.totalFiles).toBe(result.files.length);
|
||
});
|
||
|
||
it('sets commitSha to a non-empty hex string', async () => {
|
||
const result = await crawlRoot();
|
||
expect(result.commitSha).toMatch(/^[0-9a-f]{64}$/);
|
||
});
|
||
|
||
it('produces a deterministic commitSha for the same file set', async () => {
|
||
const r1 = await crawlRoot();
|
||
const r2 = await crawlRoot();
|
||
expect(r1.commitSha).toBe(r2.commitSha);
|
||
});
|
||
});
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// Filtering — default excludes and extension allow-list
|
||
// ---------------------------------------------------------------------------
|
||
|
||
describe('LocalCrawler.crawl() — default filtering', () => {
|
||
beforeEach(async () => {
|
||
root = await makeTempRepo({
|
||
'src/index.ts': 'export {};',
|
||
'dist/bundle.js': 'bundled',
|
||
'node_modules/lodash/index.js': 'lodash',
|
||
'.git/config': '[core]',
|
||
'image.png': '\x89PNG',
|
||
'README.md': '# Docs'
|
||
});
|
||
});
|
||
|
||
afterEach(async () => {
|
||
await cleanupTempRepo(root);
|
||
});
|
||
|
||
it('excludes files in dist/', async () => {
|
||
const result = await crawlRoot();
|
||
expect(result.files.every((f) => !f.path.startsWith('dist/'))).toBe(true);
|
||
});
|
||
|
||
it('excludes files in node_modules/', async () => {
|
||
const result = await crawlRoot();
|
||
expect(result.files.every((f) => !f.path.startsWith('node_modules/'))).toBe(true);
|
||
});
|
||
|
||
it('excludes files in .git/', async () => {
|
||
const result = await crawlRoot();
|
||
expect(result.files.every((f) => !f.path.startsWith('.git/'))).toBe(true);
|
||
});
|
||
|
||
it('excludes non-indexable extensions like .png', async () => {
|
||
const result = await crawlRoot();
|
||
expect(result.files.every((f) => !f.path.endsWith('.png'))).toBe(true);
|
||
});
|
||
|
||
it('reports skippedFiles = total enumerated – filtered', async () => {
|
||
const result = await crawlRoot();
|
||
// dist/, node_modules/, .git/ are pruned at walk time — never counted.
|
||
// Only image.png reaches allRelPaths and is skipped (non-indexable extension).
|
||
// src/index.ts + README.md = 2 kept
|
||
expect(result.skippedFiles).toBe(1);
|
||
expect(result.totalFiles).toBe(2);
|
||
});
|
||
});
|
||
|
||
// ---------------------------------------------------------------------------
// .gitignore support
// ---------------------------------------------------------------------------

describe('LocalCrawler.crawl() — .gitignore support', () => {
  // Each test builds its own fixture (the .gitignore contents differ per
  // case), so only the cleanup is shared.
  afterEach(async () => {
    await cleanupTempRepo(root);
  });

  it('excludes files matching a .gitignore pattern', async () => {
    root = await makeTempRepo({
      '.gitignore': '*.log\nsecrets.ts',
      'src/index.ts': 'export {};',
      'debug.log': 'log data',
      'secrets.ts': 'const key = "abc";'
    });
    const result = await crawlRoot();
    expect(result.files.some((f) => f.path === 'debug.log')).toBe(false);
    expect(result.files.some((f) => f.path === 'secrets.ts')).toBe(false);
    expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
  });

  it('excludes a directory listed in .gitignore', async () => {
    root = await makeTempRepo({
      '.gitignore': 'generated/',
      'src/index.ts': 'export {};',
      'generated/api.ts': 'auto-generated'
    });
    const result = await crawlRoot();
    expect(result.files.some((f) => f.path.startsWith('generated/'))).toBe(false);
    expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
  });

  it('respects negation patterns in .gitignore', async () => {
    root = await makeTempRepo({
      '.gitignore': '*.env\n!.env.example',
      'src/index.ts': 'export {};',
      '.env': 'SECRET=abc',
      '.env.example': 'SECRET=changeme'
    });
    const result = await crawlRoot();
    // .env files don't have an indexable extension so this tests the gitignore logic
    // doesn't incorrectly block .env.example from passing through
    expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
  });

  it('falls back to IGNORED_DIR_NAMES when no .gitignore is present', async () => {
    root = await makeTempRepo({
      'src/index.ts': 'export {};',
      'node_modules/lodash/index.js': 'lodash',
      '__pycache__/main.cpython-311.pyc': 'bytecode'
    });
    const result = await crawlRoot();
    expect(result.files.every((f) => !f.path.startsWith('node_modules/'))).toBe(true);
    expect(result.files.every((f) => !f.path.startsWith('__pycache__/'))).toBe(true);
    expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
  });

  it('excludes nested node_modules via fallback (no .gitignore)', async () => {
    root = await makeTempRepo({
      'src/index.ts': 'export {};',
      'packages/ui/node_modules/react/index.js': 'react'
    });
    const result = await crawlRoot();
    // .includes (not startsWith) — the nested node_modules is not at the root.
    expect(result.files.every((f) => !f.path.includes('node_modules'))).toBe(true);
    expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
  });

  it('still prunes common dependency directories when .gitignore exists', async () => {
    // The .gitignore here does NOT mention node_modules, so this asserts the
    // crawler prunes dependency dirs in addition to (not instead of) gitignore.
    root = await makeTempRepo({
      '.gitignore': 'logs/\n*.log',
      'src/index.ts': 'export {};',
      'node_modules/lodash/index.js': 'lodash',
      'packages/ui/node_modules/react/index.js': 'react',
      'logs/debug.log': 'debug'
    });
    const result = await crawlRoot();
    expect(result.files.every((f) => !f.path.includes('node_modules'))).toBe(true);
    expect(result.files.every((f) => !f.path.startsWith('logs/'))).toBe(true);
    expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
  });
});

// ---------------------------------------------------------------------------
|
||
// Lock file and minified file exclusions
|
||
// ---------------------------------------------------------------------------
|
||
|
||
describe('LocalCrawler.crawl() — lock file and minified file exclusions', () => {
|
||
afterEach(async () => {
|
||
await cleanupTempRepo(root);
|
||
});
|
||
|
||
it('excludes package-lock.json', async () => {
|
||
root = await makeTempRepo({
|
||
'src/index.ts': 'export {};',
|
||
'package-lock.json': '{"lockfileVersion":3}'
|
||
});
|
||
const result = await crawlRoot();
|
||
expect(result.files.some((f) => f.path === 'package-lock.json')).toBe(false);
|
||
expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
|
||
});
|
||
|
||
it('excludes pnpm-lock.yaml', async () => {
|
||
root = await makeTempRepo({
|
||
'src/index.ts': 'export {};',
|
||
'pnpm-lock.yaml': 'lockfileVersion: 9'
|
||
});
|
||
const result = await crawlRoot();
|
||
expect(result.files.some((f) => f.path === 'pnpm-lock.yaml')).toBe(false);
|
||
});
|
||
|
||
it('excludes minified .js files', async () => {
|
||
root = await makeTempRepo({
|
||
'src/index.ts': 'export {};',
|
||
'dist/vendor.min.js': '!function(e,t){}()'
|
||
});
|
||
// dist/ is pruned by default — test via shouldIndexFile logic only if .gitignore present
|
||
// Use a custom path outside ignored dirs:
|
||
await fs.rm(root, { recursive: true, force: true });
|
||
root = await makeTempRepo({
|
||
'src/index.ts': 'export {};',
|
||
'public/vendor.min.js': '!function(){}'
|
||
});
|
||
const r2 = await crawlRoot();
|
||
expect(r2.files.some((f) => f.path === 'public/vendor.min.js')).toBe(false);
|
||
expect(r2.files.some((f) => f.path === 'src/index.ts')).toBe(true);
|
||
});
|
||
|
||
it('excludes .bundle.js files', async () => {
|
||
root = await makeTempRepo({
|
||
'src/index.ts': 'export {};',
|
||
'public/app.bundle.js': 'bundled code'
|
||
});
|
||
const result = await crawlRoot();
|
||
expect(result.files.some((f) => f.path === 'public/app.bundle.js')).toBe(false);
|
||
expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
|
||
});
|
||
});
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// Size limit
|
||
// ---------------------------------------------------------------------------
|
||
|
||
describe('LocalCrawler.crawl() — size limit', () => {
|
||
afterEach(async () => {
|
||
await cleanupTempRepo(root);
|
||
});
|
||
|
||
it('excludes files larger than MAX_FILE_SIZE_BYTES (500 KB)', async () => {
|
||
// 500_001 bytes of 'x'
|
||
const bigContent = 'x'.repeat(500_001);
|
||
root = await makeTempRepo({
|
||
'big.ts': bigContent,
|
||
'small.ts': 'export const x = 1;'
|
||
});
|
||
const result = await crawlRoot();
|
||
expect(result.files.some((f) => f.path === 'big.ts')).toBe(false);
|
||
expect(result.files.some((f) => f.path === 'small.ts')).toBe(true);
|
||
});
|
||
|
||
it('includes files exactly at MAX_FILE_SIZE_BYTES (500 KB)', async () => {
|
||
const edgeContent = 'a'.repeat(500_000);
|
||
root = await makeTempRepo({ 'edge.ts': edgeContent });
|
||
const result = await crawlRoot();
|
||
expect(result.files.some((f) => f.path === 'edge.ts')).toBe(true);
|
||
});
|
||
});
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// trueref.json / context7.json config detection
|
||
// ---------------------------------------------------------------------------
|
||
|
||
describe('LocalCrawler.crawl() — config file detection', () => {
|
||
afterEach(async () => {
|
||
await cleanupTempRepo(root);
|
||
});
|
||
|
||
it('auto-detects trueref.json and applies excludeFiles', async () => {
|
||
root = await makeTempRepo({
|
||
'trueref.json': JSON.stringify({ excludeFiles: ['package.json'] }),
|
||
'src/index.ts': 'export {};',
|
||
'package.json': '{"name":"test"}'
|
||
});
|
||
const result = await crawlRoot();
|
||
expect(result.files.some((f) => f.path === 'package.json')).toBe(false);
|
||
expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
|
||
});
|
||
|
||
it('auto-detects context7.json and applies folders allowlist', async () => {
|
||
root = await makeTempRepo({
|
||
'context7.json': JSON.stringify({ folders: ['docs/'] }),
|
||
'src/index.ts': 'export {};',
|
||
'docs/guide.md': '# Guide'
|
||
});
|
||
const result = await crawlRoot();
|
||
expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(false);
|
||
expect(result.files.some((f) => f.path === 'docs/guide.md')).toBe(true);
|
||
});
|
||
|
||
it('caller-supplied config takes precedence over discovered config file', async () => {
|
||
root = await makeTempRepo({
|
||
'trueref.json': JSON.stringify({ excludeFiles: ['package.json'] }),
|
||
'src/index.ts': 'export {};',
|
||
'package.json': '{"name":"test"}'
|
||
});
|
||
// Caller provides a config with no exclusions — package.json should appear.
|
||
const result = await crawlRoot({ config: {} });
|
||
expect(result.files.some((f) => f.path === 'package.json')).toBe(true);
|
||
});
|
||
|
||
it('applies excludeFolders from config', async () => {
|
||
root = await makeTempRepo({
|
||
'trueref.json': JSON.stringify({ excludeFolders: ['internal/'] }),
|
||
'internal/secret.ts': 'secret',
|
||
'src/public.ts': 'public'
|
||
});
|
||
const result = await crawlRoot();
|
||
expect(result.files.some((f) => f.path.startsWith('internal/'))).toBe(false);
|
||
expect(result.files.some((f) => f.path === 'src/public.ts')).toBe(true);
|
||
});
|
||
|
||
it('gracefully handles a malformed config file', async () => {
|
||
root = await makeTempRepo({
|
||
'trueref.json': 'NOT VALID JSON {{{',
|
||
'src/index.ts': 'export {};'
|
||
});
|
||
// Should not throw; falls back to no config.
|
||
const result = await crawlRoot();
|
||
expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
|
||
});
|
||
|
||
it('populates CrawlResult.config with the parsed trueref.json even when folders allowlist excludes the root', async () => {
|
||
// Regression test for MULTIVERSION-0001:
|
||
// When folders: ["src/"] is set, trueref.json at the root is excluded from
|
||
// files[] by shouldIndexFile(). The config must still be returned in
|
||
// CrawlResult.config so the indexing pipeline can persist rules.
|
||
root = await makeTempRepo({
|
||
'trueref.json': JSON.stringify({
|
||
folders: ['src/'],
|
||
rules: ['Always document public APIs.']
|
||
}),
|
||
'src/index.ts': 'export {};',
|
||
'docs/guide.md': '# Guide'
|
||
});
|
||
const result = await crawlRoot();
|
||
|
||
// trueref.json must NOT appear in files (excluded by folders allowlist).
|
||
expect(result.files.some((f) => f.path === 'trueref.json')).toBe(false);
|
||
// docs/guide.md must NOT appear (outside src/).
|
||
expect(result.files.some((f) => f.path === 'docs/guide.md')).toBe(false);
|
||
// src/index.ts must appear (inside src/).
|
||
expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
|
||
// CrawlResult.config must carry the parsed config.
|
||
expect(result.config).toBeDefined();
|
||
expect(result.config?.rules).toEqual(['Always document public APIs.']);
|
||
});
|
||
|
||
it('populates CrawlResult.config with the parsed context7.json', async () => {
|
||
root = await makeTempRepo({
|
||
'context7.json': JSON.stringify({ rules: ['Rule from context7.'] }),
|
||
'src/index.ts': 'export {};'
|
||
});
|
||
const result = await crawlRoot();
|
||
expect(result.config).toBeDefined();
|
||
expect(result.config?.rules).toEqual(['Rule from context7.']);
|
||
});
|
||
|
||
it('CrawlResult.config is undefined when no config file is present', async () => {
|
||
root = await makeTempRepo({ 'src/index.ts': 'export {};' });
|
||
const result = await crawlRoot();
|
||
expect(result.config).toBeUndefined();
|
||
});
|
||
|
||
it('CrawlResult.config is undefined when caller supplies config (caller-provided takes precedence, no auto-detect)', async () => {
|
||
root = await makeTempRepo({
|
||
'trueref.json': JSON.stringify({ rules: ['From file.'] }),
|
||
'src/index.ts': 'export {};'
|
||
});
|
||
// Caller-supplied config prevents auto-detection; CrawlResult.config
|
||
// should carry the caller config (not the file content).
|
||
const result = await crawlRoot({ config: { rules: ['From caller.'] } });
|
||
expect(result.config?.rules).toEqual(['From caller.']);
|
||
});
|
||
});
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// Progress callback
|
||
// ---------------------------------------------------------------------------
|
||
|
||
describe('LocalCrawler.crawl() — progress reporting', () => {
|
||
beforeEach(async () => {
|
||
root = await makeTempRepo({
|
||
'src/a.ts': 'a',
|
||
'src/b.ts': 'b',
|
||
'src/c.ts': 'c'
|
||
});
|
||
});
|
||
|
||
afterEach(async () => {
|
||
await cleanupTempRepo(root);
|
||
});
|
||
|
||
it('calls onProgress once per filtered file', async () => {
|
||
const calls: Array<[number, number]> = [];
|
||
await crawlRoot({ onProgress: (p, t) => calls.push([p, t]) });
|
||
expect(calls).toHaveLength(3);
|
||
});
|
||
|
||
it('increments processed from 1 to totalFiles', async () => {
|
||
const calls: Array<[number, number]> = [];
|
||
await crawlRoot({ onProgress: (p, t) => calls.push([p, t]) });
|
||
const processed = calls.map(([p]) => p);
|
||
expect(processed).toEqual([1, 2, 3]);
|
||
});
|
||
|
||
it('keeps total constant across all callback invocations', async () => {
|
||
const totals: number[] = [];
|
||
await crawlRoot({ onProgress: (_, t) => totals.push(t) });
|
||
expect(totals.every((t) => t === totals[0])).toBe(true);
|
||
});
|
||
|
||
it('does not call onProgress when no files pass the filter', async () => {
|
||
// Overwrite root with only non-indexable files.
|
||
await fs.rm(root, { recursive: true, force: true });
|
||
root = await makeTempRepo({ 'image.png': '\x89PNG' });
|
||
const calls: number[] = [];
|
||
await crawlRoot({ onProgress: () => calls.push(1) });
|
||
expect(calls).toHaveLength(0);
|
||
});
|
||
});
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// Git ref checkout
|
||
// ---------------------------------------------------------------------------
|
||
|
||
/**
|
||
* Create a temp directory that is a valid git repo with one commit per entry
|
||
* in `history`. Each entry is a map of relPath → content committed under the
|
||
* given tag (if provided). Returns the repo root path.
|
||
*
|
||
* Layout of `history`:
|
||
* [{ tag?: string, files: Record<string, string> }, ...]
|
||
*/
|
||
async function makeGitRepo(
|
||
history: Array<{ tag?: string; files: Record<string, string> }>
|
||
): Promise<string> {
|
||
const root = await fs.mkdtemp(join(tmpdir(), 'trueref-git-test-'));
|
||
|
||
async function git(...args: string[]) {
|
||
await execFileAsync('git', ['-C', root, ...args]);
|
||
}
|
||
|
||
await git('init', '--initial-branch=main');
|
||
await git('config', 'user.email', 'test@trueref.local');
|
||
await git('config', 'user.name', 'TrueRef Test');
|
||
|
||
for (const { tag, files } of history) {
|
||
// Write files
|
||
for (const [relPath, content] of Object.entries(files)) {
|
||
const absPath = join(root, relPath);
|
||
await fs.mkdir(join(absPath, '..'), { recursive: true });
|
||
await fs.writeFile(absPath, content, 'utf-8');
|
||
}
|
||
await git('add', '.');
|
||
await git('commit', '--allow-empty', '-m', `commit for ${tag ?? 'HEAD'}`);
|
||
if (tag) {
|
||
await git('tag', tag);
|
||
}
|
||
}
|
||
|
||
return root;
|
||
}
|
||
|
||
describe('LocalCrawler.crawl() — git ref checkout', () => {
  // Suite-local state: shadows the module-level `root`/`crawler` so git
  // fixtures never interact with the plain-directory suites above.
  let root: string = '';
  const crawler = new LocalCrawler();

  afterEach(async () => {
    if (root) await cleanupTempRepo(root);
  });

  it('crawls files at a specific tag, not the HEAD state', async () => {
    root = await makeGitRepo([
      { tag: 'v1.0.0', files: { 'src/index.ts': 'export const version = 1;' } },
      { files: { 'src/index.ts': 'export const version = 2;' } }
    ]);

    const result = await crawler.crawl({ rootPath: root, ref: 'v1.0.0' });
    const indexFile = result.files.find((f) => f.path === 'src/index.ts');
    expect(indexFile?.content).toBe('export const version = 1;');
  });

  it('crawls files at a specific commit SHA', async () => {
    root = await makeGitRepo([
      { tag: 'v1.0.0', files: { 'api.ts': 'v1' } },
      { files: { 'api.ts': 'v2' } }
    ]);

    // Resolve the SHA of v1.0.0
    const { stdout } = await execFileAsync('git', ['-C', root, 'rev-parse', 'v1.0.0'], {
      encoding: 'utf-8'
    });
    const sha = stdout.trim();

    const result = await crawler.crawl({ rootPath: root, ref: sha });
    const api = result.files.find((f) => f.path === 'api.ts');
    expect(api?.content).toBe('v1');
  });

  it('sets branch to the ref string in the result', async () => {
    root = await makeGitRepo([{ tag: 'v2.3.1', files: { 'README.md': '# v2' } }]);

    const result = await crawler.crawl({ rootPath: root, ref: 'v2.3.1' });
    expect(result.branch).toBe('v2.3.1');
  });

  it('sets commitSha to the git-resolved SHA (not file-content hash)', async () => {
    root = await makeGitRepo([{ tag: 'v1.0.0', files: { 'a.ts': 'a' } }]);

    const { stdout } = await execFileAsync('git', ['-C', root, 'rev-parse', 'v1.0.0'], {
      encoding: 'utf-8'
    });
    const expectedSha = stdout.trim();

    const result = await crawler.crawl({ rootPath: root, ref: 'v1.0.0' });
    expect(result.commitSha).toBe(expectedSha);
  });

  it('does not modify the working tree', async () => {
    root = await makeGitRepo([
      { tag: 'v1.0.0', files: { 'src/index.ts': 'v1' } },
      { files: { 'src/index.ts': 'v2' } }
    ]);

    // Working tree is at HEAD (v2)
    const before = await fs.readFile(join(root, 'src/index.ts'), 'utf-8');
    await crawler.crawl({ rootPath: root, ref: 'v1.0.0' });
    const after = await fs.readFile(join(root, 'src/index.ts'), 'utf-8');

    expect(before).toBe('v2');
    expect(after).toBe('v2');
  });

  it('removes the temporary worktree after crawling', async () => {
    root = await makeGitRepo([{ tag: 'v1.0.0', files: { 'f.ts': 'x' } }]);

    await crawler.crawl({ rootPath: root, ref: 'v1.0.0' });

    // List remaining worktrees — only the main one should remain.
    const { stdout } = await execFileAsync('git', ['-C', root, 'worktree', 'list', '--porcelain'], {
      encoding: 'utf-8'
    });
    const worktreeCount = stdout.split('\n').filter((l) => l.startsWith('worktree ')).length;
    expect(worktreeCount).toBe(1);
  });

  it('throws NotAGitRepositoryError for a plain directory', async () => {
    const plainDir = await fs.mkdtemp(join(tmpdir(), 'trueref-plain-'));
    root = plainDir; // cleaned up in afterEach

    await expect(crawler.crawl({ rootPath: plainDir, ref: 'v1.0.0' })).rejects.toThrow(
      NotAGitRepositoryError
    );
  });

  it('throws InvalidRefError for a ref that does not exist', async () => {
    root = await makeGitRepo([{ tag: 'v1.0.0', files: { 'f.ts': 'x' } }]);

    await expect(crawler.crawl({ rootPath: root, ref: 'v99.99.99' })).rejects.toThrow(
      InvalidRefError
    );
  });

  it('applies caller-supplied config at the checked-out ref', async () => {
    root = await makeGitRepo([
      {
        tag: 'v1.0.0',
        files: {
          'src/index.ts': 'export {};',
          'package.json': '{"name":"test"}'
        }
      }
    ]);

    // Exclude package.json via caller config
    const result = await crawler.crawl({
      rootPath: root,
      ref: 'v1.0.0',
      config: { excludeFiles: ['package.json'] }
    });

    expect(result.files.some((f) => f.path === 'package.json')).toBe(false);
    expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
  });

  it('reads trueref.json from the checked-out ref', async () => {
    root = await makeGitRepo([
      {
        tag: 'v1.0.0',
        files: {
          'trueref.json': JSON.stringify({ excludeFiles: ['package.json'] }),
          'src/index.ts': 'export {};',
          'package.json': '{"name":"test"}'
        }
      }
    ]);

    const result = await crawler.crawl({ rootPath: root, ref: 'v1.0.0' });
    expect(result.files.some((f) => f.path === 'package.json')).toBe(false);
    expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
  });
});

// ---------------------------------------------------------------------------
|
||
// Edge cases
|
||
// ---------------------------------------------------------------------------
|
||
|
||
describe('LocalCrawler.crawl() — edge cases', () => {
|
||
afterEach(async () => {
|
||
await cleanupTempRepo(root);
|
||
});
|
||
|
||
it('returns empty result for an empty directory', async () => {
|
||
root = await makeTempRepo({});
|
||
const result = await crawlRoot();
|
||
expect(result.files).toHaveLength(0);
|
||
expect(result.totalFiles).toBe(0);
|
||
expect(result.skippedFiles).toBe(0);
|
||
});
|
||
|
||
it('handles deeply nested directory structures', async () => {
|
||
root = await makeTempRepo({
|
||
'a/b/c/d/deep.ts': 'export const deep = true;'
|
||
});
|
||
const result = await crawlRoot();
|
||
expect(result.files.some((f) => f.path === 'a/b/c/d/deep.ts')).toBe(true);
|
||
});
|
||
|
||
it('handles files with UTF-8 content correctly', async () => {
|
||
const utf8Content = 'const greeting = "héllo wörld — 日本語";';
|
||
root = await makeTempRepo({ 'src/unicode.ts': utf8Content });
|
||
const result = await crawlRoot();
|
||
const file = result.files.find((f) => f.path === 'src/unicode.ts');
|
||
expect(file?.content).toBe(utf8Content);
|
||
expect(file?.sha).toBe(sha256(utf8Content));
|
||
});
|
||
|
||
it('commitSha differs when file content changes', async () => {
|
||
root = await makeTempRepo({ 'src/index.ts': 'version 1' });
|
||
const r1 = await crawlRoot();
|
||
|
||
await fs.writeFile(join(root, 'src/index.ts'), 'version 2', 'utf-8');
|
||
const r2 = await crawlRoot();
|
||
|
||
expect(r1.commitSha).not.toBe(r2.commitSha);
|
||
});
|
||
|
||
it('commitSha is empty-string hash when no files are crawled', async () => {
|
||
root = await makeTempRepo({ 'image.png': '\x89PNG' });
|
||
const result = await crawlRoot();
|
||
// SHA-256 of an empty string
|
||
expect(result.commitSha).toBe(sha256(''));
|
||
});
|
||
});
|