# TRUEREF-0004 — Local Filesystem Crawler

**Priority:** P1

**Status:** Pending

**Depends On:** TRUEREF-0001, TRUEREF-0003 (shares types and filter logic)

**Blocks:** TRUEREF-0009

---

## Overview

Implement a local filesystem crawler that indexes repositories stored on disk. It uses the same file filtering logic as the GitHub crawler but reads from the local filesystem using Node.js `fs` APIs. This is useful for private internal codebases, monorepos on disk, and offline development.

---

## Acceptance Criteria

- [ ] Walk a directory tree and enumerate all files
- [ ] Apply the same extension and size filters as the GitHub crawler
- [ ] Apply `trueref.json` include/exclude rules
- [ ] Read file contents as UTF-8 strings
- [ ] Compute a SHA-256 checksum per file for change detection
- [ ] Detect `trueref.json` / `context7.json` at the repo root before filtering other files
- [ ] Report progress via a callback
- [ ] Skip symlinks and special files (devices, sockets, etc.)
- [ ] Unit tests with temporary directory fixtures

---

## Data Types

Reuses `CrawledFile` and `CrawlResult` from the TRUEREF-0003 crawler types:

```typescript
export interface LocalCrawlOptions {
|
|
rootPath: string; // absolute path to repository root
|
|
config?: RepoConfig; // parsed trueref.json
|
|
onProgress?: (processed: number, total: number) => void;
|
|
}
```

---

## Implementation

```typescript
export class LocalCrawler {
|
|
async crawl(options: LocalCrawlOptions): Promise<CrawlResult> {
|
|
// 1. Enumerate all files recursively
|
|
const allFiles = await this.walkDirectory(options.rootPath);
|
|
|
|
// 2. Look for trueref.json / context7.json first
|
|
const configFile = allFiles.find(f =>
|
|
f === 'trueref.json' || f === 'context7.json'
|
|
);
|
|
let config = options.config;
|
|
if (configFile && !config) {
|
|
config = await this.parseConfigFile(
|
|
path.join(options.rootPath, configFile)
|
|
);
|
|
}
|
|
|
|
// 3. Filter files
|
|
const filteredFiles = allFiles.filter(relPath => {
|
|
const stat = statCache.get(relPath);
|
|
return shouldIndexFile(relPath, stat.size, config);
|
|
});
|
|
|
|
// 4. Read and return file contents
|
|
const crawledFiles: CrawledFile[] = [];
|
|
for (const [i, relPath] of filteredFiles.entries()) {
|
|
const absPath = path.join(options.rootPath, relPath);
|
|
const content = await fs.readFile(absPath, 'utf-8');
|
|
const sha = computeSHA256(content);
|
|
crawledFiles.push({
|
|
path: relPath,
|
|
content,
|
|
size: Buffer.byteLength(content, 'utf-8'),
|
|
sha,
|
|
language: detectLanguage(relPath),
|
|
});
|
|
options.onProgress?.(i + 1, filteredFiles.length);
|
|
}
|
|
|
|
return {
|
|
files: crawledFiles,
|
|
totalFiles: filteredFiles.length,
|
|
skippedFiles: allFiles.length - filteredFiles.length,
|
|
branch: 'local',
|
|
commitSha: computeSHA256(crawledFiles.map(f => f.sha).join('')),
|
|
};
|
|
}
|
|
|
|
private async walkDirectory(dir: string, rel = ''): Promise<string[]> {
|
|
const entries = await fs.readdir(dir, { withFileTypes: true });
|
|
const files: string[] = [];
|
|
for (const entry of entries) {
|
|
if (!entry.isFile() && !entry.isDirectory()) continue; // skip symlinks, devices
|
|
const relPath = rel ? `${rel}/${entry.name}` : entry.name;
|
|
if (entry.isDirectory()) {
|
|
files.push(...await this.walkDirectory(
|
|
path.join(dir, entry.name), relPath
|
|
));
|
|
} else {
|
|
files.push(relPath);
|
|
}
|
|
}
|
|
return files;
|
|
}
|
|
}
```

---

## Checksum Computation

```typescript
import { createHash } from 'crypto';

function computeSHA256(content: string): string {
|
|
return createHash('sha256').update(content, 'utf-8').digest('hex');
|
|
}
```

---

## Files to Create

- `src/lib/server/crawler/local.crawler.ts`
- `src/lib/server/crawler/local.crawler.test.ts`