Files
trueref/docs/features/TRUEREF-0004.md
2026-03-22 17:08:15 +01:00

3.7 KiB

TRUEREF-0004 — Local Filesystem Crawler

Priority: P1 · Status: Pending · Depends On: TRUEREF-0001, TRUEREF-0003 (shares types and filter logic) · Blocks: TRUEREF-0009


Overview

Implement a local filesystem crawler that indexes repositories stored on disk. Uses the same file filtering logic as the GitHub crawler but reads from the local filesystem using Node.js fs APIs. Useful for private internal codebases, monorepos on disk, and offline development.


Acceptance Criteria

  • Walk a directory tree and enumerate all files
  • Apply the same extension and size filters as the GitHub crawler
  • Apply trueref.json include/exclude rules
  • Read file contents as UTF-8 strings
  • Compute SHA-256 checksum per file for change detection
  • Detect trueref.json / context7.json at the repo root before filtering other files
  • Report progress via callback
  • Skip symlinks, special files (devices, sockets, etc.)
  • Unit tests with temporary directory fixtures

Data Types

Reuses CrawledFile and CrawlResult from TRUEREF-0003 crawler types:

/**
 * Options for LocalCrawler.crawl().
 */
export interface LocalCrawlOptions {
  rootPath: string;          // absolute path to repository root
  config?: RepoConfig;       // parsed trueref.json; if omitted, crawl() looks for trueref.json / context7.json at rootPath
  onProgress?: (processed: number, total: number) => void; // invoked after each file's contents are read
}

Implementation

export class LocalCrawler {
  /**
   * Crawl a repository on the local filesystem.
   *
   * Walks the directory tree under options.rootPath, detects a repo config
   * (trueref.json / context7.json) before filtering, applies the shared
   * shouldIndexFile() filter, then reads each surviving file as UTF-8 and
   * hashes it with SHA-256 for change detection.
   *
   * @param options - root path, optional pre-parsed config, progress callback
   * @returns a CrawlResult whose commitSha is a synthetic hash of all file hashes
   */
  async crawl(options: LocalCrawlOptions): Promise<CrawlResult> {
    // 1. Enumerate all regular files with their on-disk sizes.
    //    Sizes are gathered during the walk (one stat per file) so the
    //    filter step below needs no separate stat cache — the earlier
    //    draft referenced an undefined `statCache` here.
    const allFiles = await this.walkDirectory(options.rootPath);

    // 2. Detect trueref.json / context7.json at the repo root before
    //    filtering other files (an explicit options.config wins).
    const configEntry = allFiles.find(f =>
      f.path === 'trueref.json' || f.path === 'context7.json'
    );
    let config = options.config;
    if (configEntry && !config) {
      config = await this.parseConfigFile(
        path.join(options.rootPath, configEntry.path)
      );
    }

    // 3. Apply the shared extension/size/include-exclude filter, then sort
    //    by path so the synthetic commitSha below is deterministic —
    //    readdir() ordering is filesystem-dependent.
    const filteredFiles = allFiles
      .filter(f => shouldIndexFile(f.path, f.size, config))
      .sort((a, b) => a.path.localeCompare(b.path));

    // 4. Read contents sequentially so onProgress counts monotonically.
    const crawledFiles: CrawledFile[] = [];
    for (const [i, f] of filteredFiles.entries()) {
      const absPath = path.join(options.rootPath, f.path);
      const content = await fs.readFile(absPath, 'utf-8');
      crawledFiles.push({
        path: f.path,
        content,
        // Byte length of the decoded content, not the raw stat size.
        size: Buffer.byteLength(content, 'utf-8'),
        sha: computeSHA256(content),
        language: detectLanguage(f.path),
      });
      options.onProgress?.(i + 1, filteredFiles.length);
    }

    return {
      files: crawledFiles,
      totalFiles: filteredFiles.length,
      skippedFiles: allFiles.length - filteredFiles.length,
      branch: 'local',
      // Synthetic "commit" id: hash over the sorted per-file hashes, so it
      // changes iff any indexed file's content (or the file set) changes.
      commitSha: computeSHA256(crawledFiles.map(f => f.sha).join('')),
    };
  }

  /**
   * Recursively enumerate regular files under `dir`.
   *
   * Symlinks and special files (devices, sockets, FIFOs) are skipped by the
   * isFile()/isDirectory() check on the Dirent — with withFileTypes, readdir
   * does not follow symlinks, so a symlinked file reports isSymbolicLink().
   *
   * @param dir - absolute directory to walk
   * @param rel - repo-relative prefix accumulated across recursion
   * @returns repo-relative paths paired with on-disk byte sizes
   */
  private async walkDirectory(
    dir: string,
    rel = ''
  ): Promise<Array<{ path: string; size: number }>> {
    const entries = await fs.readdir(dir, { withFileTypes: true });
    const files: Array<{ path: string; size: number }> = [];
    for (const entry of entries) {
      if (!entry.isFile() && !entry.isDirectory()) continue; // skip symlinks, devices, sockets
      const relPath = rel ? `${rel}/${entry.name}` : entry.name;
      if (entry.isDirectory()) {
        files.push(...await this.walkDirectory(
          path.join(dir, entry.name), relPath
        ));
      } else {
        // Capture size now so the filter step needs no second stat pass.
        const stat = await fs.stat(path.join(dir, entry.name));
        files.push({ path: relPath, size: stat.size });
      }
    }
    return files;
  }
}

Checksum Computation

import { createHash } from 'crypto';

/** SHA-256 hex digest of a UTF-8 string; used for per-file change detection. */
function computeSHA256(content: string): string {
  const hash = createHash('sha256');
  hash.update(content, 'utf-8');
  return hash.digest('hex');
}

Files to Create

  • src/lib/server/crawler/local.crawler.ts
  • src/lib/server/crawler/local.crawler.test.ts