diff --git a/src/lib/server/crawler/local.crawler.test.ts b/src/lib/server/crawler/local.crawler.test.ts index fe41bdc..aa08967 100644 --- a/src/lib/server/crawler/local.crawler.test.ts +++ b/src/lib/server/crawler/local.crawler.test.ts @@ -413,6 +413,59 @@ describe('LocalCrawler.crawl() — config file detection', () => { const result = await crawlRoot(); expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true); }); + + it('populates CrawlResult.config with the parsed trueref.json even when folders allowlist excludes the root', async () => { + // Regression test for MULTIVERSION-0001: + // When folders: ["src/"] is set, trueref.json at the root is excluded from + // files[] by shouldIndexFile(). The config must still be returned in + // CrawlResult.config so the indexing pipeline can persist rules. + root = await makeTempRepo({ + 'trueref.json': JSON.stringify({ + folders: ['src/'], + rules: ['Always document public APIs.'] + }), + 'src/index.ts': 'export {};', + 'docs/guide.md': '# Guide' + }); + const result = await crawlRoot(); + + // trueref.json must NOT appear in files (excluded by folders allowlist). + expect(result.files.some((f) => f.path === 'trueref.json')).toBe(false); + // docs/guide.md must NOT appear (outside src/). + expect(result.files.some((f) => f.path === 'docs/guide.md')).toBe(false); + // src/index.ts must appear (inside src/). + expect(result.files.some((f) => f.path === 'src/index.ts')).toBe(true); + // CrawlResult.config must carry the parsed config. 
+ expect(result.config).toBeDefined(); + expect(result.config?.rules).toEqual(['Always document public APIs.']); + }); + + it('populates CrawlResult.config with the parsed context7.json', async () => { + root = await makeTempRepo({ + 'context7.json': JSON.stringify({ rules: ['Rule from context7.'] }), + 'src/index.ts': 'export {};' + }); + const result = await crawlRoot(); + expect(result.config).toBeDefined(); + expect(result.config?.rules).toEqual(['Rule from context7.']); + }); + + it('CrawlResult.config is undefined when no config file is present', async () => { + root = await makeTempRepo({ 'src/index.ts': 'export {};' }); + const result = await crawlRoot(); + expect(result.config).toBeUndefined(); + }); + + it('CrawlResult.config is undefined when caller supplies config (caller-provided takes precedence, no auto-detect)', async () => { + root = await makeTempRepo({ + 'trueref.json': JSON.stringify({ rules: ['From file.'] }), + 'src/index.ts': 'export {};' + }); + // Caller-supplied config prevents auto-detection; CrawlResult.config + // should carry the caller config (not the file content). + const result = await crawlRoot({ config: { rules: ['From caller.'] } }); + expect(result.config?.rules).toEqual(['From caller.']); + }); }); // --------------------------------------------------------------------------- diff --git a/src/lib/server/crawler/local.crawler.ts b/src/lib/server/crawler/local.crawler.ts index 6dfd01a..ad00ca7 100644 --- a/src/lib/server/crawler/local.crawler.ts +++ b/src/lib/server/crawler/local.crawler.ts @@ -230,7 +230,11 @@ export class LocalCrawler { totalFiles: filteredPaths.length, skippedFiles: allRelPaths.length - filteredPaths.length, branch, - commitSha + commitSha, + // Surface the pre-parsed config so the indexing pipeline can read rules + // without needing to find trueref.json inside crawledFiles (which fails + // when a `folders` allowlist excludes the repo root). + config: config ?? 
undefined }; } diff --git a/src/lib/server/crawler/types.ts b/src/lib/server/crawler/types.ts index a40eba6..da386ec 100644 --- a/src/lib/server/crawler/types.ts +++ b/src/lib/server/crawler/types.ts @@ -35,6 +35,13 @@ export interface CrawlResult { branch: string; /** HEAD commit SHA */ commitSha: string; + /** + * Pre-parsed trueref.json / context7.json configuration found at the repo + * root during crawling. Carried here so the indexing pipeline can consume it + * directly without having to locate the config file in `files` — which fails + * when a `folders` allowlist excludes the repo root. + */ + config?: RepoConfig; } export interface CrawlOptions { diff --git a/src/lib/server/pipeline/indexing.pipeline.test.ts b/src/lib/server/pipeline/indexing.pipeline.test.ts index 59c0078..51a1ba7 100644 --- a/src/lib/server/pipeline/indexing.pipeline.test.ts +++ b/src/lib/server/pipeline/indexing.pipeline.test.ts @@ -268,6 +268,8 @@ describe('IndexingPipeline', () => { crawlResult: { files: Array<{ path: string; content: string; sha: string; language: string }>; totalFiles: number; + /** Optional pre-parsed config — simulates LocalCrawler returning CrawlResult.config. */ + config?: Record<string, unknown>; } = { files: [], totalFiles: 0 }, embeddingService: EmbeddingService | null = null ) { @@ -885,4 +887,70 @@ describe('IndexingPipeline', () => { const rules = JSON.parse(versionRow!.rules); expect(rules).toEqual(['This is v2. Use the new Builder API.']); }); + + it('persists rules from CrawlResult.config even when trueref.json is absent from files (folders allowlist bug)', async () => { + // Regression test for MULTIVERSION-0001: + // When trueref.json specifies a `folders` allowlist (e.g. ["src/"]), + // shouldIndexFile() excludes trueref.json itself because it lives at the + // repo root. The LocalCrawler now carries the pre-parsed config in + // CrawlResult.config so the pipeline no longer needs to find the file in + // crawlResult.files[]. 
+ const pipeline = makePipeline({ + // trueref.json is NOT in files — simulates it being excluded by folders allowlist. + files: [ + { + path: 'src/index.ts', + content: 'export const x = 1;', + sha: 'sha-src', + language: 'typescript' + } + ], + totalFiles: 1, + // The pre-parsed config is carried here instead (set by LocalCrawler). + config: { rules: ['Use strict TypeScript.', 'Avoid any.'] } + }); + const job = makeJob(); + + await pipeline.run(job as never); + + const row = db + .prepare( + `SELECT rules FROM repository_configs WHERE repository_id = '/test/repo' AND version_id IS NULL` + ) + .get() as { rules: string } | undefined; + + expect(row).toBeDefined(); + const rules = JSON.parse(row!.rules); + expect(rules).toEqual(['Use strict TypeScript.', 'Avoid any.']); + }); + + it('persists version-specific rules from CrawlResult.config when trueref.json is excluded by folders allowlist', async () => { + const versionId = insertVersion(db, { tag: 'v3.0.0', state: 'pending' }); + + const pipeline = makePipeline({ + files: [ + { + path: 'src/index.ts', + content: 'export const x = 1;', + sha: 'sha-src', + language: 'typescript' + } + ], + totalFiles: 1, + config: { rules: ['v3: use the streaming API.'] } + }); + const job = makeJob('/test/repo', versionId); + + await pipeline.run(job as never); + + const versionRow = db + .prepare( + `SELECT rules FROM repository_configs WHERE repository_id = '/test/repo' AND version_id = ?` + ) + .get(versionId) as { rules: string } | undefined; + + expect(versionRow).toBeDefined(); + const rules = JSON.parse(versionRow!.rules); + expect(rules).toEqual(['v3: use the streaming API.']); + }); }); diff --git a/src/lib/server/pipeline/indexing.pipeline.ts b/src/lib/server/pipeline/indexing.pipeline.ts index ed01062..6056e0e 100644 --- a/src/lib/server/pipeline/indexing.pipeline.ts +++ b/src/lib/server/pipeline/indexing.pipeline.ts @@ -15,14 +15,14 @@ import { createHash, randomUUID } from 'node:crypto'; import type Database from 
'better-sqlite3'; -import type { Document, NewDocument, NewSnippet } from '$lib/types'; +import type { Document, NewDocument, NewSnippet, TrueRefConfig } from '$lib/types'; import type { crawl as GithubCrawlFn } from '$lib/server/crawler/github.crawler.js'; import type { LocalCrawler } from '$lib/server/crawler/local.crawler.js'; import type { EmbeddingService } from '$lib/server/embeddings/embedding.service.js'; import { RepositoryMapper } from '$lib/server/mappers/repository.mapper.js'; import { IndexingJob } from '$lib/server/models/indexing-job.js'; import { Repository, RepositoryEntity } from '$lib/server/models/repository.js'; -import { resolveConfig } from '$lib/server/config/config-parser.js'; +import { resolveConfig, type ParsedConfig } from '$lib/server/config/config-parser.js'; import { parseFile } from '$lib/server/parser/index.js'; import { computeTrustScore } from '$lib/server/search/trust-score.js'; import { computeDiff } from './diff.js'; @@ -101,13 +101,25 @@ export class IndexingPipeline { : undefined; const crawlResult = await this.crawl(repo, versionTag); - // Parse trueref.json / context7.json if present in the crawl results. - const configFile = crawlResult.files.find( - (f) => f.path === 'trueref.json' || f.path === 'context7.json' - ); - const parsedConfig = configFile - ? resolveConfig([{ filename: configFile.path, content: configFile.content }]) - : null; + // Resolve trueref.json / context7.json configuration. + // Prefer the pre-parsed config carried in the CrawlResult (set by + // LocalCrawler so it is available even when a `folders` allowlist + // excludes the repo root and trueref.json never appears in files[]). + // Fall back to locating the file in crawlResult.files for GitHub crawls + // which do not yet populate CrawlResult.config. 
+ let parsedConfig: ReturnType<typeof resolveConfig> | null = null; + if (crawlResult.config) { + // Config was pre-parsed by the crawler — wrap it in a ParsedConfig + // shell so the rest of the pipeline can use it uniformly. + parsedConfig = { config: crawlResult.config, source: 'trueref.json', warnings: [] } satisfies ParsedConfig; + } else { + const configFile = crawlResult.files.find( + (f) => f.path === 'trueref.json' || f.path === 'context7.json' + ); + parsedConfig = configFile + ? resolveConfig([{ filename: configFile.path, content: configFile.content }]) + : null; + } const excludeFiles: string[] = parsedConfig?.config.excludeFiles ?? []; // Filter out excluded files before diff computation. @@ -304,6 +316,8 @@ private async crawl(repo: Repository, ref?: string): Promise<{ files: Array<{ path: string; content: string; sha: string; size: number; language: string }>; totalFiles: number; + /** Pre-parsed trueref.json / context7.json, or undefined when absent. */ + config?: TrueRefConfig; }> { if (repo.source === 'github') { // Parse owner/repo from the canonical ID: "/owner/repo" @@ -330,7 +344,7 @@ ref: ref ?? (repo.branch !== 'main' ? (repo.branch ?? undefined) : undefined) }); - return { files: result.files, totalFiles: result.totalFiles }; + return { files: result.files, totalFiles: result.totalFiles, config: result.config }; } }