fix(MULTIVERSION-0001): surface pre-parsed config in CrawlResult to fix rules persistence

When trueref.json specifies a `folders` allowlist (e.g. ["src/"]),
shouldIndexFile() excludes trueref.json itself because it lives at the
repo root. The indexing pipeline then searches crawlResult.files for the
config file, finds nothing, and never writes rules to repository_configs.

Fix (Option B): add a `config` field to CrawlResult so LocalCrawler
returns the pre-parsed config directly. The indexing pipeline now reads
crawlResult.config first instead of scanning files[], which resolves the
regression for all repos with a folders allowlist.

- Add `config?: RepoConfig` to CrawlResult in crawler/types.ts
- Return `config` from LocalCrawler.crawlDirectory()
- Update IndexingPipeline.crawl() to propagate CrawlResult.config
- Update IndexingPipeline.run() to prefer crawlResult.config over files
- Add regression tests covering the folders-allowlist exclusion scenario

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Giancarmine Salucci
2026-03-28 17:27:53 +01:00
parent 666ec7d55f
commit cd4ea7112c
5 changed files with 157 additions and 11 deletions

View File

@@ -15,14 +15,14 @@
import { createHash, randomUUID } from 'node:crypto';
import type Database from 'better-sqlite3';
import type { Document, NewDocument, NewSnippet } from '$lib/types';
import type { Document, NewDocument, NewSnippet, TrueRefConfig } from '$lib/types';
import type { crawl as GithubCrawlFn } from '$lib/server/crawler/github.crawler.js';
import type { LocalCrawler } from '$lib/server/crawler/local.crawler.js';
import type { EmbeddingService } from '$lib/server/embeddings/embedding.service.js';
import { RepositoryMapper } from '$lib/server/mappers/repository.mapper.js';
import { IndexingJob } from '$lib/server/models/indexing-job.js';
import { Repository, RepositoryEntity } from '$lib/server/models/repository.js';
import { resolveConfig } from '$lib/server/config/config-parser.js';
import { resolveConfig, type ParsedConfig } from '$lib/server/config/config-parser.js';
import { parseFile } from '$lib/server/parser/index.js';
import { computeTrustScore } from '$lib/server/search/trust-score.js';
import { computeDiff } from './diff.js';
@@ -101,13 +101,25 @@ export class IndexingPipeline {
: undefined;
const crawlResult = await this.crawl(repo, versionTag);
// Parse trueref.json / context7.json if present in the crawl results.
const configFile = crawlResult.files.find(
(f) => f.path === 'trueref.json' || f.path === 'context7.json'
);
const parsedConfig = configFile
? resolveConfig([{ filename: configFile.path, content: configFile.content }])
: null;
// Resolve trueref.json / context7.json configuration.
// Prefer the pre-parsed config carried in the CrawlResult (set by
// LocalCrawler so it is available even when a `folders` allowlist
// excludes the repo root and trueref.json never appears in files[]).
// Fall back to locating the file in crawlResult.files for GitHub crawls
// which do not yet populate CrawlResult.config.
let parsedConfig: ReturnType<typeof resolveConfig> | null = null;
if (crawlResult.config) {
// Config was pre-parsed by the crawler — wrap it in a ParsedConfig
// shell so the rest of the pipeline can use it uniformly.
parsedConfig = { config: crawlResult.config, source: 'trueref.json', warnings: [] } satisfies ParsedConfig;
} else {
const configFile = crawlResult.files.find(
(f) => f.path === 'trueref.json' || f.path === 'context7.json'
);
parsedConfig = configFile
? resolveConfig([{ filename: configFile.path, content: configFile.content }])
: null;
}
const excludeFiles: string[] = parsedConfig?.config.excludeFiles ?? [];
// Filter out excluded files before diff computation.
@@ -304,6 +316,8 @@ export class IndexingPipeline {
private async crawl(repo: Repository, ref?: string): Promise<{
files: Array<{ path: string; content: string; sha: string; size: number; language: string }>;
totalFiles: number;
/** Pre-parsed trueref.json / context7.json, or undefined when absent. */
config?: TrueRefConfig;
}> {
if (repo.source === 'github') {
// Parse owner/repo from the canonical ID: "/owner/repo"
@@ -330,7 +344,7 @@ export class IndexingPipeline {
ref: ref ?? (repo.branch !== 'main' ? (repo.branch ?? undefined) : undefined)
});
return { files: result.files, totalFiles: result.totalFiles };
return { files: result.files, totalFiles: result.totalFiles, config: result.config };
}
}