chore: initial project scaffold
This commit is contained in:
130
docs/features/TRUEREF-0004.md
Normal file
130
docs/features/TRUEREF-0004.md
Normal file
@@ -0,0 +1,130 @@
|
||||
# TRUEREF-0004 — Local Filesystem Crawler
|
||||
|
||||
**Priority:** P1
|
||||
**Status:** Pending
|
||||
**Depends On:** TRUEREF-0001, TRUEREF-0003 (shares types and filter logic)
|
||||
**Blocks:** TRUEREF-0009
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
Implement a local filesystem crawler that indexes repositories stored on disk. Uses the same file filtering logic as the GitHub crawler but reads from the local filesystem using Node.js `fs` APIs. Useful for private internal codebases, monorepos on disk, and offline development.
|
||||
|
||||
---
|
||||
|
||||
## Acceptance Criteria
|
||||
|
||||
- [ ] Walk a directory tree and enumerate all files
|
||||
- [ ] Apply the same extension and size filters as the GitHub crawler
|
||||
- [ ] Apply `trueref.json` include/exclude rules
|
||||
- [ ] Read file contents as UTF-8 strings
|
||||
- [ ] Compute SHA-256 checksum per file for change detection
|
||||
- [ ] Detect `trueref.json` / `context7.json` at the repo root before filtering other files
|
||||
- [ ] Report progress via callback
|
||||
- [ ] Skip symlinks and special files (devices, sockets, etc.)
|
||||
- [ ] Unit tests with temporary directory fixtures
|
||||
|
||||
---
|
||||
|
||||
## Data Types
|
||||
|
||||
Reuses `CrawledFile` and `CrawlResult` from the TRUEREF-0003 crawler types:
|
||||
|
||||
```typescript
|
||||
/**
 * Options accepted by {@link LocalCrawler.crawl}.
 *
 * Reuses `CrawledFile` / `CrawlResult` from the TRUEREF-0003 crawler types.
 */
export interface LocalCrawlOptions {
  rootPath: string; // absolute path to repository root
  config?: RepoConfig; // parsed trueref.json; when set, skips root config-file detection
  onProgress?: (processed: number, total: number) => void; // called once per file read
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Implementation
|
||||
|
||||
```typescript
|
||||
export class LocalCrawler {
|
||||
async crawl(options: LocalCrawlOptions): Promise<CrawlResult> {
|
||||
// 1. Enumerate all files recursively
|
||||
const allFiles = await this.walkDirectory(options.rootPath);
|
||||
|
||||
// 2. Look for trueref.json / context7.json first
|
||||
const configFile = allFiles.find(f =>
|
||||
f === 'trueref.json' || f === 'context7.json'
|
||||
);
|
||||
let config = options.config;
|
||||
if (configFile && !config) {
|
||||
config = await this.parseConfigFile(
|
||||
path.join(options.rootPath, configFile)
|
||||
);
|
||||
}
|
||||
|
||||
// 3. Filter files
|
||||
const filteredFiles = allFiles.filter(relPath => {
|
||||
const stat = statCache.get(relPath);
|
||||
return shouldIndexFile(relPath, stat.size, config);
|
||||
});
|
||||
|
||||
// 4. Read and return file contents
|
||||
const crawledFiles: CrawledFile[] = [];
|
||||
for (const [i, relPath] of filteredFiles.entries()) {
|
||||
const absPath = path.join(options.rootPath, relPath);
|
||||
const content = await fs.readFile(absPath, 'utf-8');
|
||||
const sha = computeSHA256(content);
|
||||
crawledFiles.push({
|
||||
path: relPath,
|
||||
content,
|
||||
size: Buffer.byteLength(content, 'utf-8'),
|
||||
sha,
|
||||
language: detectLanguage(relPath),
|
||||
});
|
||||
options.onProgress?.(i + 1, filteredFiles.length);
|
||||
}
|
||||
|
||||
return {
|
||||
files: crawledFiles,
|
||||
totalFiles: filteredFiles.length,
|
||||
skippedFiles: allFiles.length - filteredFiles.length,
|
||||
branch: 'local',
|
||||
commitSha: computeSHA256(crawledFiles.map(f => f.sha).join('')),
|
||||
};
|
||||
}
|
||||
|
||||
private async walkDirectory(dir: string, rel = ''): Promise<string[]> {
|
||||
const entries = await fs.readdir(dir, { withFileTypes: true });
|
||||
const files: string[] = [];
|
||||
for (const entry of entries) {
|
||||
if (!entry.isFile() && !entry.isDirectory()) continue; // skip symlinks, devices
|
||||
const relPath = rel ? `${rel}/${entry.name}` : entry.name;
|
||||
if (entry.isDirectory()) {
|
||||
files.push(...await this.walkDirectory(
|
||||
path.join(dir, entry.name), relPath
|
||||
));
|
||||
} else {
|
||||
files.push(relPath);
|
||||
}
|
||||
}
|
||||
return files;
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Checksum Computation
|
||||
|
||||
```typescript
|
||||
import { createHash } from 'crypto';
|
||||
|
||||
function computeSHA256(content: string): string {
|
||||
return createHash('sha256').update(content, 'utf-8').digest('hex');
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Files to Create
|
||||
|
||||
- `src/lib/server/crawler/local.crawler.ts`
|
||||
- `src/lib/server/crawler/local.crawler.test.ts`
|
||||
Reference in New Issue
Block a user