docs: update docs, add new features
This commit is contained in:
@@ -11,6 +11,8 @@
|
||||
|
||||
Implement a local filesystem crawler that indexes repositories stored on disk. Uses the same file filtering logic as the GitHub crawler but reads from the local filesystem using Node.js `fs` APIs. Useful for private internal codebases, monorepos on disk, and offline development.
|
||||
|
||||
When indexing a local project, the crawler should prefer the repository's root `.gitignore` when present so local indexing follows the same intent developers use in day-to-day work. If no `.gitignore` exists, or if it does not exclude common dependency and artifact paths, the crawler must still avoid indexing those paths by default. The goal is to return relevant library code and documentation, not vendored dependencies, caches, lockfiles, or generated build output.
|
||||
|
||||
---
|
||||
|
||||
## Acceptance Criteria
|
||||
@@ -18,6 +20,9 @@ Implement a local filesystem crawler that indexes repositories stored on disk. U
|
||||
- [ ] Walk a directory tree and enumerate all files
|
||||
- [ ] Apply the same extension and size filters as the GitHub crawler
|
||||
- [ ] Apply `trueref.json` include/exclude rules
|
||||
- [ ] Respect a root `.gitignore` file when present
|
||||
- [ ] Prune common dependency / artifact directories even when `.gitignore` is absent
|
||||
- [ ] Exclude common lockfiles and minified bundle artifacts from indexing
|
||||
- [ ] Read file contents as UTF-8 strings
|
||||
- [ ] Compute SHA-256 checksum per file for change detection
|
||||
- [ ] Detect `trueref.json` / `context7.json` at the repo root before filtering other files
|
||||
@@ -46,10 +51,13 @@ export interface LocalCrawlOptions {
|
||||
```typescript
|
||||
export class LocalCrawler {
|
||||
async crawl(options: LocalCrawlOptions): Promise<CrawlResult> {
|
||||
// 1. Enumerate all files recursively
|
||||
const allFiles = await this.walkDirectory(options.rootPath);
|
||||
// 1. Load root .gitignore if present
|
||||
const gitignore = await this.loadGitignore(options.rootPath);
|
||||
|
||||
// 2. Look for trueref.json / context7.json first
|
||||
// 2. Enumerate files recursively, pruning ignored directories early
|
||||
const allFiles = await this.walkDirectory(options.rootPath, '', gitignore);
|
||||
|
||||
// 3. Look for trueref.json / context7.json first
|
||||
const configFile = allFiles.find(f =>
|
||||
f === 'trueref.json' || f === 'context7.json'
|
||||
);
|
||||
@@ -60,13 +68,13 @@ export class LocalCrawler {
|
||||
);
|
||||
}
|
||||
|
||||
// 3. Filter files
|
||||
// 4. Filter files
|
||||
const filteredFiles = allFiles.filter(relPath => {
|
||||
const stat = statCache.get(relPath);
|
||||
return shouldIndexFile(relPath, stat.size, config);
|
||||
});
|
||||
|
||||
// 4. Read and return file contents
|
||||
// 5. Read and return file contents
|
||||
const crawledFiles: CrawledFile[] = [];
|
||||
for (const [i, relPath] of filteredFiles.entries()) {
|
||||
const absPath = path.join(options.rootPath, relPath);
|
||||
@@ -91,17 +99,21 @@ export class LocalCrawler {
|
||||
};
|
||||
}
|
||||
|
||||
private async walkDirectory(dir: string, rel = ''): Promise<string[]> {
|
||||
private async walkDirectory(dir: string, rel = '', gitignore?: GitignoreFilter): Promise<string[]> {
|
||||
const entries = await fs.readdir(dir, { withFileTypes: true });
|
||||
const files: string[] = [];
|
||||
for (const entry of entries) {
|
||||
if (!entry.isFile() && !entry.isDirectory()) continue; // skip symlinks, devices
|
||||
const relPath = rel ? `${rel}/${entry.name}` : entry.name;
|
||||
if (entry.isDirectory()) {
|
||||
if (shouldPruneDirectory(relPath) || gitignore?.isIgnored(relPath, true)) {
|
||||
continue;
|
||||
}
|
||||
files.push(...await this.walkDirectory(
|
||||
path.join(dir, entry.name), relPath
|
||||
path.join(dir, entry.name), relPath, gitignore
|
||||
));
|
||||
} else {
|
||||
if (gitignore?.isIgnored(relPath, false)) continue;
|
||||
files.push(relPath);
|
||||
}
|
||||
}
|
||||
@@ -112,6 +124,18 @@ export class LocalCrawler {
|
||||
|
||||
---
|
||||
|
||||
## Ignore Handling
|
||||
|
||||
Filtering happens in three layers:
|
||||
|
||||
1. Root `.gitignore` rules, so local indexing skips the same paths the project's developers already exclude in day-to-day work.
|
||||
2. Built-in exclusions for dependency stores and artifacts such as `node_modules`, `dist`, `build`, `.next`, `vendor`, `target`, `.venv`, `__pycache__`, caches, coverage output, and other generated directories.
|
||||
3. Shared file-level exclusions for oversized files, unsupported extensions, known lockfiles such as `package-lock.json` and `pnpm-lock.yaml`, and minified/bundled assets such as `vendor.min.js` or `app.bundle.js`.
|
||||
|
||||
Directory pruning should happen during the walk so large dependency trees are never enumerated in the first place.
|
||||
|
||||
---
|
||||
|
||||
## Checksum Computation
|
||||
|
||||
```typescript
|
||||
|
||||
Reference in New Issue
Block a user