fix(MULTIVERSION-0001): surface pre-parsed config in CrawlResult to fix rules persistence
When trueref.json specifies a `folders` allowlist (e.g. ["src/"]), shouldIndexFile() excludes trueref.json itself because it lives at the repo root. The indexing pipeline then searches crawlResult.files for the config file, finds nothing, and never writes rules to repository_configs.

Fix (Option B): add a `config` field to CrawlResult so LocalCrawler returns the pre-parsed config directly. The indexing pipeline now reads crawlResult.config first instead of scanning files[], which resolves the regression for all repos with a folders allowlist.

- Add `config?: RepoConfig` to CrawlResult in crawler/types.ts
- Return `config` from LocalCrawler.crawlDirectory()
- Update IndexingPipeline.crawl() to propagate CrawlResult.config
- Update IndexingPipeline.run() to prefer crawlResult.config over files
- Add regression tests covering the folders-allowlist exclusion scenario

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -413,6 +413,59 @@ describe('LocalCrawler.crawl() — config file detection', () => {
|
|||||||
const result = await crawlRoot();
|
const result = await crawlRoot();
|
||||||
expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
|
expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it('populates CrawlResult.config with the parsed trueref.json even when folders allowlist excludes the root', async () => {
|
||||||
|
// Regression test for MULTIVERSION-0001:
|
||||||
|
// When folders: ["src/"] is set, trueref.json at the root is excluded from
|
||||||
|
// files[] by shouldIndexFile(). The config must still be returned in
|
||||||
|
// CrawlResult.config so the indexing pipeline can persist rules.
|
||||||
|
root = await makeTempRepo({
|
||||||
|
'trueref.json': JSON.stringify({
|
||||||
|
folders: ['src/'],
|
||||||
|
rules: ['Always document public APIs.']
|
||||||
|
}),
|
||||||
|
'src/index.ts': 'export {};',
|
||||||
|
'docs/guide.md': '# Guide'
|
||||||
|
});
|
||||||
|
const result = await crawlRoot();
|
||||||
|
|
||||||
|
// trueref.json must NOT appear in files (excluded by folders allowlist).
|
||||||
|
expect(result.files.some((f) => f.path === 'trueref.json')).toBe(false);
|
||||||
|
// docs/guide.md must NOT appear (outside src/).
|
||||||
|
expect(result.files.some((f) => f.path === 'docs/guide.md')).toBe(false);
|
||||||
|
// src/index.ts must appear (inside src/).
|
||||||
|
expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true);
|
||||||
|
// CrawlResult.config must carry the parsed config.
|
||||||
|
expect(result.config).toBeDefined();
|
||||||
|
expect(result.config?.rules).toEqual(['Always document public APIs.']);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('populates CrawlResult.config with the parsed context7.json', async () => {
|
||||||
|
root = await makeTempRepo({
|
||||||
|
'context7.json': JSON.stringify({ rules: ['Rule from context7.'] }),
|
||||||
|
'src/index.ts': 'export {};'
|
||||||
|
});
|
||||||
|
const result = await crawlRoot();
|
||||||
|
expect(result.config).toBeDefined();
|
||||||
|
expect(result.config?.rules).toEqual(['Rule from context7.']);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('CrawlResult.config is undefined when no config file is present', async () => {
|
||||||
|
root = await makeTempRepo({ 'src/index.ts': 'export {};' });
|
||||||
|
const result = await crawlRoot();
|
||||||
|
expect(result.config).toBeUndefined();
|
||||||
|
});
|
||||||
|
|
||||||
|
// Precedence check: a trueref.json exists on disk with its own rules, but the
// caller passes an explicit config to the crawl. The result must carry the
// caller's config rather than the file's content.
it('CrawlResult.config carries the caller-supplied config when one is provided (caller-provided takes precedence, no auto-detect)', async () => {
	root = await makeTempRepo({
		'trueref.json': JSON.stringify({ rules: ['From file.'] }),
		'src/index.ts': 'export {};'
	});

	// Caller-supplied config prevents auto-detection; CrawlResult.config
	// should carry the caller config (not the file content).
	const result = await crawlRoot({ config: { rules: ['From caller.'] } });

	expect(result.config).toBeDefined();
	expect(result.config?.rules).toEqual(['From caller.']);
});
|
||||||
});
|
});
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
|
|||||||
@@ -230,7 +230,11 @@ export class LocalCrawler {
|
|||||||
totalFiles: filteredPaths.length,
|
totalFiles: filteredPaths.length,
|
||||||
skippedFiles: allRelPaths.length - filteredPaths.length,
|
skippedFiles: allRelPaths.length - filteredPaths.length,
|
||||||
branch,
|
branch,
|
||||||
commitSha
|
commitSha,
|
||||||
|
// Surface the pre-parsed config so the indexing pipeline can read rules
|
||||||
|
// without needing to find trueref.json inside crawledFiles (which fails
|
||||||
|
// when a `folders` allowlist excludes the repo root).
|
||||||
|
config: config ?? undefined
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -35,6 +35,13 @@ export interface CrawlResult {
|
|||||||
branch: string;
|
branch: string;
|
||||||
/** HEAD commit SHA */
|
/** HEAD commit SHA */
|
||||||
commitSha: string;
|
commitSha: string;
|
||||||
|
/**
|
||||||
|
* Pre-parsed trueref.json / context7.json configuration found at the repo
|
||||||
|
* root during crawling. Carried here so the indexing pipeline can consume it
|
||||||
|
* directly without having to locate the config file in `files` — which fails
|
||||||
|
* when a `folders` allowlist excludes the repo root.
|
||||||
|
*/
|
||||||
|
config?: RepoConfig;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface CrawlOptions {
|
export interface CrawlOptions {
|
||||||
|
|||||||
@@ -268,6 +268,8 @@ describe('IndexingPipeline', () => {
|
|||||||
crawlResult: {
|
crawlResult: {
|
||||||
files: Array<{ path: string; content: string; sha: string; language: string }>;
|
files: Array<{ path: string; content: string; sha: string; language: string }>;
|
||||||
totalFiles: number;
|
totalFiles: number;
|
||||||
|
/** Optional pre-parsed config — simulates LocalCrawler returning CrawlResult.config. */
|
||||||
|
config?: Record<string, unknown>;
|
||||||
} = { files: [], totalFiles: 0 },
|
} = { files: [], totalFiles: 0 },
|
||||||
embeddingService: EmbeddingService | null = null
|
embeddingService: EmbeddingService | null = null
|
||||||
) {
|
) {
|
||||||
@@ -885,4 +887,70 @@ describe('IndexingPipeline', () => {
|
|||||||
const rules = JSON.parse(versionRow!.rules);
|
const rules = JSON.parse(versionRow!.rules);
|
||||||
expect(rules).toEqual(['This is v2. Use the new Builder API.']);
|
expect(rules).toEqual(['This is v2. Use the new Builder API.']);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it('persists rules from CrawlResult.config even when trueref.json is absent from files (folders allowlist bug)', async () => {
|
||||||
|
// Regression test for MULTIVERSION-0001:
|
||||||
|
// When trueref.json specifies a `folders` allowlist (e.g. ["src/"]),
|
||||||
|
// shouldIndexFile() excludes trueref.json itself because it lives at the
|
||||||
|
// repo root. The LocalCrawler now carries the pre-parsed config in
|
||||||
|
// CrawlResult.config so the pipeline no longer needs to find the file in
|
||||||
|
// crawlResult.files[].
|
||||||
|
const pipeline = makePipeline({
|
||||||
|
// trueref.json is NOT in files — simulates it being excluded by folders allowlist.
|
||||||
|
files: [
|
||||||
|
{
|
||||||
|
path: 'src/index.ts',
|
||||||
|
content: 'export const x = 1;',
|
||||||
|
sha: 'sha-src',
|
||||||
|
language: 'typescript'
|
||||||
|
}
|
||||||
|
],
|
||||||
|
totalFiles: 1,
|
||||||
|
// The pre-parsed config is carried here instead (set by LocalCrawler).
|
||||||
|
config: { rules: ['Use strict TypeScript.', 'Avoid any.'] }
|
||||||
|
});
|
||||||
|
const job = makeJob();
|
||||||
|
|
||||||
|
await pipeline.run(job as never);
|
||||||
|
|
||||||
|
const row = db
|
||||||
|
.prepare(
|
||||||
|
`SELECT rules FROM repository_configs WHERE repository_id = '/test/repo' AND version_id IS NULL`
|
||||||
|
)
|
||||||
|
.get() as { rules: string } | undefined;
|
||||||
|
|
||||||
|
expect(row).toBeDefined();
|
||||||
|
const rules = JSON.parse(row!.rules);
|
||||||
|
expect(rules).toEqual(['Use strict TypeScript.', 'Avoid any.']);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('persists version-specific rules from CrawlResult.config when trueref.json is excluded by folders allowlist', async () => {
|
||||||
|
const versionId = insertVersion(db, { tag: 'v3.0.0', state: 'pending' });
|
||||||
|
|
||||||
|
const pipeline = makePipeline({
|
||||||
|
files: [
|
||||||
|
{
|
||||||
|
path: 'src/index.ts',
|
||||||
|
content: 'export const x = 1;',
|
||||||
|
sha: 'sha-src',
|
||||||
|
language: 'typescript'
|
||||||
|
}
|
||||||
|
],
|
||||||
|
totalFiles: 1,
|
||||||
|
config: { rules: ['v3: use the streaming API.'] }
|
||||||
|
});
|
||||||
|
const job = makeJob('/test/repo', versionId);
|
||||||
|
|
||||||
|
await pipeline.run(job as never);
|
||||||
|
|
||||||
|
const versionRow = db
|
||||||
|
.prepare(
|
||||||
|
`SELECT rules FROM repository_configs WHERE repository_id = '/test/repo' AND version_id = ?`
|
||||||
|
)
|
||||||
|
.get(versionId) as { rules: string } | undefined;
|
||||||
|
|
||||||
|
expect(versionRow).toBeDefined();
|
||||||
|
const rules = JSON.parse(versionRow!.rules);
|
||||||
|
expect(rules).toEqual(['v3: use the streaming API.']);
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -15,14 +15,14 @@
|
|||||||
|
|
||||||
import { createHash, randomUUID } from 'node:crypto';
|
import { createHash, randomUUID } from 'node:crypto';
|
||||||
import type Database from 'better-sqlite3';
|
import type Database from 'better-sqlite3';
|
||||||
import type { Document, NewDocument, NewSnippet } from '$lib/types';
|
import type { Document, NewDocument, NewSnippet, TrueRefConfig } from '$lib/types';
|
||||||
import type { crawl as GithubCrawlFn } from '$lib/server/crawler/github.crawler.js';
|
import type { crawl as GithubCrawlFn } from '$lib/server/crawler/github.crawler.js';
|
||||||
import type { LocalCrawler } from '$lib/server/crawler/local.crawler.js';
|
import type { LocalCrawler } from '$lib/server/crawler/local.crawler.js';
|
||||||
import type { EmbeddingService } from '$lib/server/embeddings/embedding.service.js';
|
import type { EmbeddingService } from '$lib/server/embeddings/embedding.service.js';
|
||||||
import { RepositoryMapper } from '$lib/server/mappers/repository.mapper.js';
|
import { RepositoryMapper } from '$lib/server/mappers/repository.mapper.js';
|
||||||
import { IndexingJob } from '$lib/server/models/indexing-job.js';
|
import { IndexingJob } from '$lib/server/models/indexing-job.js';
|
||||||
import { Repository, RepositoryEntity } from '$lib/server/models/repository.js';
|
import { Repository, RepositoryEntity } from '$lib/server/models/repository.js';
|
||||||
import { resolveConfig } from '$lib/server/config/config-parser.js';
|
import { resolveConfig, type ParsedConfig } from '$lib/server/config/config-parser.js';
|
||||||
import { parseFile } from '$lib/server/parser/index.js';
|
import { parseFile } from '$lib/server/parser/index.js';
|
||||||
import { computeTrustScore } from '$lib/server/search/trust-score.js';
|
import { computeTrustScore } from '$lib/server/search/trust-score.js';
|
||||||
import { computeDiff } from './diff.js';
|
import { computeDiff } from './diff.js';
|
||||||
@@ -101,13 +101,25 @@ export class IndexingPipeline {
|
|||||||
: undefined;
|
: undefined;
|
||||||
const crawlResult = await this.crawl(repo, versionTag);
|
const crawlResult = await this.crawl(repo, versionTag);
|
||||||
|
|
||||||
// Parse trueref.json / context7.json if present in the crawl results.
|
// Resolve trueref.json / context7.json configuration.
|
||||||
|
// Prefer the pre-parsed config carried in the CrawlResult (set by
|
||||||
|
// LocalCrawler so it is available even when a `folders` allowlist
|
||||||
|
// excludes the repo root and trueref.json never appears in files[]).
|
||||||
|
// Fall back to locating the file in crawlResult.files for GitHub crawls
|
||||||
|
// which do not yet populate CrawlResult.config.
|
||||||
|
let parsedConfig: ReturnType<typeof resolveConfig> | null = null;
|
||||||
|
if (crawlResult.config) {
|
||||||
|
// Config was pre-parsed by the crawler — wrap it in a ParsedConfig
|
||||||
|
// shell so the rest of the pipeline can use it uniformly.
|
||||||
|
parsedConfig = { config: crawlResult.config, source: 'trueref.json', warnings: [] } satisfies ParsedConfig;
|
||||||
|
} else {
|
||||||
const configFile = crawlResult.files.find(
|
const configFile = crawlResult.files.find(
|
||||||
(f) => f.path === 'trueref.json' || f.path === 'context7.json'
|
(f) => f.path === 'trueref.json' || f.path === 'context7.json'
|
||||||
);
|
);
|
||||||
const parsedConfig = configFile
|
parsedConfig = configFile
|
||||||
? resolveConfig([{ filename: configFile.path, content: configFile.content }])
|
? resolveConfig([{ filename: configFile.path, content: configFile.content }])
|
||||||
: null;
|
: null;
|
||||||
|
}
|
||||||
const excludeFiles: string[] = parsedConfig?.config.excludeFiles ?? [];
|
const excludeFiles: string[] = parsedConfig?.config.excludeFiles ?? [];
|
||||||
|
|
||||||
// Filter out excluded files before diff computation.
|
// Filter out excluded files before diff computation.
|
||||||
@@ -304,6 +316,8 @@ export class IndexingPipeline {
|
|||||||
private async crawl(repo: Repository, ref?: string): Promise<{
|
private async crawl(repo: Repository, ref?: string): Promise<{
|
||||||
files: Array<{ path: string; content: string; sha: string; size: number; language: string }>;
|
files: Array<{ path: string; content: string; sha: string; size: number; language: string }>;
|
||||||
totalFiles: number;
|
totalFiles: number;
|
||||||
|
/** Pre-parsed trueref.json / context7.json, or undefined when absent. */
|
||||||
|
config?: TrueRefConfig;
|
||||||
}> {
|
}> {
|
||||||
if (repo.source === 'github') {
|
if (repo.source === 'github') {
|
||||||
// Parse owner/repo from the canonical ID: "/owner/repo"
|
// Parse owner/repo from the canonical ID: "/owner/repo"
|
||||||
@@ -330,7 +344,7 @@ export class IndexingPipeline {
|
|||||||
ref: ref ?? (repo.branch !== 'main' ? (repo.branch ?? undefined) : undefined)
|
ref: ref ?? (repo.branch !== 'main' ? (repo.branch ?? undefined) : undefined)
|
||||||
});
|
});
|
||||||
|
|
||||||
return { files: result.files, totalFiles: result.totalFiles };
|
return { files: result.files, totalFiles: result.totalFiles, config: result.config };
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user