/** * Unit tests for IndexingPipeline and JobQueue (TRUEREF-0009). * * Uses an in-memory SQLite database populated with the same migration SQL * as the production database. */ import { describe, it, expect, beforeEach, vi } from 'vitest'; import Database from 'better-sqlite3'; import { readFileSync } from 'node:fs'; import { join } from 'node:path'; import { JobQueue } from './job-queue.js'; import { IndexingPipeline } from './indexing.pipeline.js'; import { recoverStaleJobs } from './startup.js'; import { EmbeddingService } from '$lib/server/embeddings/embedding.service.js'; import * as diffStrategy from './differential-strategy.js'; // --------------------------------------------------------------------------- // Test DB factory // --------------------------------------------------------------------------- function createTestDb(): Database.Database { const client = new Database(':memory:'); client.pragma('foreign_keys = ON'); const migrationsFolder = join(import.meta.dirname, '../db/migrations'); for (const migrationFile of [ '0000_large_master_chief.sql', '0001_quick_nighthawk.sql', '0002_silky_stellaris.sql', '0003_multiversion_config.sql', '0004_complete_sentry.sql' ]) { const migrationSql = readFileSync(join(migrationsFolder, migrationFile), 'utf-8'); const statements = migrationSql .split('--> statement-breakpoint') .map((s) => s.trim()) .filter(Boolean); for (const stmt of statements) { client.exec(stmt); } } return client; } // --------------------------------------------------------------------------- // Fixtures // --------------------------------------------------------------------------- const now = Math.floor(Date.now() / 1000); function insertRepo(db: Database.Database, overrides: Partial> = {}): void { db.prepare( `INSERT INTO repositories (id, title, source, source_url, branch, state, total_snippets, total_tokens, trust_score, benchmark_score, stars, github_token, last_indexed_at, created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 
?, ?, ?)` ).run( overrides.id ?? '/test/repo', overrides.title ?? 'Test Repo', overrides.source ?? 'local', overrides.source_url ?? '/tmp/test-repo', overrides.branch ?? 'main', overrides.state ?? 'pending', 0, 0, 0, 0, null, null, null, now, now ); } function insertVersion( db: Database.Database, overrides: Partial> = {} ): string { const id = crypto.randomUUID(); db.prepare( `INSERT INTO repository_versions (id, repository_id, tag, title, state, total_snippets, indexed_at, created_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?)` ).run( overrides.id ?? id, overrides.repository_id ?? '/test/repo', overrides.tag ?? 'v1.0.0', overrides.title ?? null, overrides.state ?? 'pending', overrides.total_snippets ?? 0, overrides.indexed_at ?? null, overrides.created_at ?? now ); return (overrides.id as string) ?? id; } function insertJob( db: Database.Database, overrides: Partial> = {} ): string { const id = crypto.randomUUID(); db.prepare( `INSERT INTO indexing_jobs (id, repository_id, version_id, status, progress, total_files, processed_files, error, started_at, completed_at, created_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)` ).run( overrides.id ?? id, overrides.repository_id ?? '/test/repo', overrides.version_id ?? null, overrides.status ?? 'queued', overrides.progress ?? 0, overrides.total_files ?? 0, overrides.processed_files ?? 0, overrides.error ?? null, overrides.started_at ?? null, overrides.completed_at ?? null, overrides.created_at ?? now ); return (overrides.id as string) ?? 
  id;
}

// ---------------------------------------------------------------------------
// recoverStaleJobs
// ---------------------------------------------------------------------------

describe('recoverStaleJobs', () => {
  let db: Database.Database;

  beforeEach(() => {
    db = createTestDb();
    insertRepo(db);
  });

  it('marks running jobs as failed', () => {
    insertJob(db, { status: 'running' });
    recoverStaleJobs(db);
    const row = db.prepare(`SELECT status, error FROM indexing_jobs LIMIT 1`).get() as {
      status: string;
      error: string;
    };
    expect(row.status).toBe('failed');
    expect(row.error).toMatch(/restarted/i);
  });

  it('resets repositories in indexing state to error', () => {
    db.prepare(`UPDATE repositories SET state = 'indexing' WHERE id = '/test/repo'`).run();
    recoverStaleJobs(db);
    const row = db.prepare(`SELECT state FROM repositories WHERE id = '/test/repo'`).get() as {
      state: string;
    };
    expect(row.state).toBe('error');
  });

  it('leaves queued and done jobs untouched', () => {
    insertJob(db, { status: 'queued' });
    insertJob(db, { status: 'done' });
    recoverStaleJobs(db);
    const rows = db
      .prepare(`SELECT status FROM indexing_jobs WHERE status IN ('queued', 'done')`)
      .all() as { status: string }[];
    expect(rows).toHaveLength(2);
  });
});

// ---------------------------------------------------------------------------
// JobQueue
// ---------------------------------------------------------------------------

describe('JobQueue', () => {
  let db: Database.Database;
  let queue: JobQueue;

  beforeEach(() => {
    db = createTestDb();
    insertRepo(db);
    queue = new JobQueue(db);
  });

  it('enqueues a new job and returns it', () => {
    const job = queue.enqueue('/test/repo');
    expect(job.status).toBe('queued');
    // Accept either camelCase or snake_case row shape from the queue.
    expect(job.repositoryId ??
      (job as unknown as { repository_id: string }).repository_id).toBe(
      '/test/repo'
    );
  });

  it('deduplicates: returns existing active job instead of creating a new one', () => {
    const job1 = queue.enqueue('/test/repo');
    const job2 = queue.enqueue('/test/repo');
    expect(job1.id).toBe(job2.id);
    const count = (db.prepare(`SELECT COUNT(*) as n FROM indexing_jobs`).get() as { n: number }).n;
    expect(count).toBe(1);
  });

  it('getJob returns null for unknown ID', () => {
    expect(queue.getJob('non-existent')).toBeNull();
  });

  it('getJob returns the job if it exists', () => {
    const jobId = insertJob(db, { status: 'done' });
    const job = queue.getJob(jobId);
    expect(job).not.toBeNull();
    expect(job!.id).toBe(jobId);
  });

  it('listJobs returns all jobs ordered by created_at desc', () => {
    insertJob(db, { created_at: now - 10, status: 'done' });
    insertJob(db, { created_at: now - 5, status: 'done' });
    insertJob(db, { created_at: now, status: 'queued' });
    const jobs = queue.listJobs();
    expect(jobs.length).toBeGreaterThanOrEqual(3);
    // Most recent first.
    expect(jobs[0].status).toBe('queued');
  });

  it('listJobs filters by repositoryId', () => {
    insertRepo(db, { id: '/other/repo', source_url: '/tmp/other' });
    insertJob(db, { repository_id: '/other/repo', status: 'done' });
    insertJob(db, { status: 'queued' });
    const jobs = queue.listJobs({ repositoryId: '/other/repo' });
    expect(jobs).toHaveLength(1);
  });

  it('listJobs filters by status', () => {
    insertJob(db, { status: 'queued' });
    insertJob(db, { status: 'done' });
    insertJob(db, { status: 'failed' });
    const queued = queue.listJobs({ status: 'queued' });
    expect(queued.every((j) => j.status === 'queued')).toBe(true);
  });

  it('countJobs returns correct count', () => {
    insertJob(db, { status: 'done' });
    insertJob(db, { status: 'done' });
    insertJob(db, { status: 'failed' });
    expect(queue.countJobs()).toBe(3);
    expect(queue.countJobs({ status: 'done' })).toBe(2);
    expect(queue.countJobs({ status: 'failed' })).toBe(1);
  });
});

// ---------------------------------------------------------------------------
// IndexingPipeline
// ---------------------------------------------------------------------------

describe('IndexingPipeline', () => {
  let db: Database.Database;

  beforeEach(() => {
    db = createTestDb();
    insertRepo(db, { source: 'local', source_url: '/tmp/test-repo' });
  });

  // Builds a pipeline whose GitHub and local crawlers both resolve with the
  // supplied CrawlResult fields.
  function makePipeline(
    crawlResult: {
      files: Array<{ path: string; content: string; sha: string; language: string }>;
      totalFiles: number;
      /** Optional pre-parsed config — simulates LocalCrawler returning CrawlResult.config.
*/ config?: Record; } = { files: [], totalFiles: 0 }, embeddingService: EmbeddingService | null = null ) { const mockGithubCrawl = vi.fn().mockResolvedValue({ ...crawlResult, skippedFiles: 0, branch: 'main', commitSha: 'abc' }); const mockLocalCrawler = { crawl: vi.fn().mockResolvedValue({ ...crawlResult, skippedFiles: 0, branch: 'main', commitSha: 'abc' }) }; return new IndexingPipeline( db, mockGithubCrawl as never, mockLocalCrawler as never, embeddingService ); } function makeJob(repositoryId = '/test/repo', versionId?: string) { const jobId = insertJob(db, { repository_id: repositoryId, version_id: versionId ?? null, status: 'queued' }); return db.prepare(`SELECT * FROM indexing_jobs WHERE id = ?`).get(jobId) as { id: string; repositoryId?: string; repository_id?: string; status: string; versionId?: string; version_id?: string; }; } it('marks job as done when there are no files to index', async () => { const pipeline = makePipeline({ files: [], totalFiles: 0 }); const job = makeJob(); await pipeline.run(job as never); const updated = db .prepare(`SELECT status, progress FROM indexing_jobs WHERE id = ?`) .get(job.id) as { status: string; progress: number }; expect(updated.status).toBe('done'); expect(updated.progress).toBe(100); }); it('marks job as running then done (final state is done)', async () => { const pipeline = makePipeline({ files: [], totalFiles: 0 }); const job = makeJob(); await pipeline.run(job as never); const updated = db.prepare(`SELECT status FROM indexing_jobs WHERE id = ?`).get(job.id) as { status: string; }; // The job should end in 'done' — the running→done transition is covered // by the pipeline's internal updateJob calls. 
expect(updated.status).toBe('done'); }); it('marks job as failed and repo as error when pipeline throws', async () => { const errorCrawl = vi.fn().mockRejectedValue(new Error('crawl failed')); const pipeline = new IndexingPipeline( db, errorCrawl as never, { crawl: errorCrawl } as never, null ); const job = makeJob(); await expect(pipeline.run(job as never)).rejects.toThrow('crawl failed'); const updatedJob = db .prepare(`SELECT status, error FROM indexing_jobs WHERE id = ?`) .get(job.id) as { status: string; error: string }; expect(updatedJob.status).toBe('failed'); expect(updatedJob.error).toBe('crawl failed'); const updatedRepo = db .prepare(`SELECT state FROM repositories WHERE id = '/test/repo'`) .get() as { state: string }; expect(updatedRepo.state).toBe('error'); }); it('inserts documents and snippets for new files', async () => { const files = [ { path: 'README.md', content: '# Hello\n\nThis is documentation.', sha: 'sha-readme', language: 'markdown' } ]; const pipeline = makePipeline({ files, totalFiles: 1 }); const job = makeJob(); await pipeline.run(job as never); const docs = db.prepare(`SELECT * FROM documents`).all() as unknown[]; expect(docs.length).toBeGreaterThan(0); const snippets = db.prepare(`SELECT * FROM snippets`).all() as unknown[]; expect(snippets.length).toBeGreaterThan(0); const repo = db .prepare(`SELECT state, total_snippets FROM repositories WHERE id = '/test/repo'`) .get() as { state: string; total_snippets: number }; expect(repo.state).toBe('indexed'); expect(repo.total_snippets).toBeGreaterThan(0); }); it('skips unchanged files (checksum match)', async () => { // First indexing run. 
    const files = [
      {
        path: 'README.md',
        content: '# Hello\n\nThis is documentation.',
        sha: 'sha-readme',
        language: 'markdown'
      }
    ];
    const pipeline = makePipeline({ files, totalFiles: 1 });
    const job1 = makeJob();
    await pipeline.run(job1 as never);
    const firstDocCount = (db.prepare(`SELECT COUNT(*) as n FROM documents`).get() as { n: number })
      .n;
    const firstSnippetIds = (db.prepare(`SELECT id FROM snippets`).all() as { id: string }[]).map(
      (r) => r.id
    );
    // Second run with identical files.
    const job2Id = insertJob(db, { repository_id: '/test/repo', status: 'queued' });
    const job2 = db.prepare(`SELECT * FROM indexing_jobs WHERE id = ?`).get(job2Id) as never;
    await pipeline.run(job2);
    const secondDocCount = (
      db.prepare(`SELECT COUNT(*) as n FROM documents`).get() as { n: number }
    ).n;
    const secondSnippetIds = (db.prepare(`SELECT id FROM snippets`).all() as { id: string }[]).map(
      (r) => r.id
    );
    // Document count stays the same and snippet IDs are unchanged.
    expect(secondDocCount).toBe(firstDocCount);
    expect(secondSnippetIds).toEqual(firstSnippetIds);
  });

  it('re-index backfills missing embeddings for unchanged snippets', async () => {
    // Fake provider that returns a fixed 3-dim embedding for every input.
    const provider = {
      name: 'test-provider',
      model: 'test-model',
      dimensions: 3,
      embed: vi.fn(async (texts: string[]) =>
        texts.map(() => ({
          values: new Float32Array([0.1, 0.2, 0.3]),
          dimensions: 3,
          model: 'test-model'
        }))
      ),
      isAvailable: vi.fn(async () => true)
    };
    const embeddingService = new EmbeddingService(db, provider, 'local-default');
    const files = [
      {
        path: 'README.md',
        content: '# Hello\n\nThis is documentation.',
        sha: 'sha-readme',
        language: 'markdown'
      }
    ];
    const pipeline = makePipeline({ files, totalFiles: 1 }, embeddingService);
    const job1 = makeJob();
    await pipeline.run(job1 as never);
    const firstSnippetIds = (db.prepare(`SELECT id FROM snippets ORDER BY id`).all() as {
      id: string;
    }[]).map((row) => row.id);
    expect(firstSnippetIds.length).toBeGreaterThan(0);
    const firstEmbeddingCount = (
      db.prepare(`SELECT COUNT(*) as n FROM
      snippet_embeddings WHERE profile_id = 'local-default'`).get() as {
        n: number;
      }
    ).n;
    expect(firstEmbeddingCount).toBe(firstSnippetIds.length);
    // Simulate lost embeddings while keeping the snippets intact.
    db.prepare(`DELETE FROM snippet_embeddings WHERE profile_id = 'local-default'`).run();
    const job2Id = insertJob(db, { repository_id: '/test/repo', status: 'queued' });
    const job2 = db.prepare(`SELECT * FROM indexing_jobs WHERE id = ?`).get(job2Id) as never;
    await pipeline.run(job2);
    const secondSnippetIds = (db.prepare(`SELECT id FROM snippets ORDER BY id`).all() as {
      id: string;
    }[]).map((row) => row.id);
    const secondEmbeddingCount = (
      db.prepare(`SELECT COUNT(*) as n FROM snippet_embeddings WHERE profile_id = 'local-default'`).get() as {
        n: number;
      }
    ).n;
    expect(secondSnippetIds).toEqual(firstSnippetIds);
    expect(secondEmbeddingCount).toBe(firstSnippetIds.length);
  });

  it('replaces snippets atomically when a file changes', async () => {
    const pipeline1 = makePipeline({
      files: [
        {
          path: 'README.md',
          content:
            '# Original\n\nThis is the original version of the documentation with sufficient content.',
          sha: 'sha-v1',
          language: 'markdown'
        }
      ],
      totalFiles: 1
    });
    const job1 = makeJob();
    await pipeline1.run(job1 as never);
    const originalSnippetCount = (
      db.prepare(`SELECT COUNT(*) as n FROM snippets`).get() as { n: number }
    ).n;
    expect(originalSnippetCount).toBeGreaterThan(0);
    // Second run with changed file content.
    const pipeline2 = makePipeline({
      files: [
        {
          path: 'README.md',
          content:
            '# Updated\n\nThis is a completely different version of the documentation with new content.',
          sha: 'sha-v2',
          language: 'markdown'
        }
      ],
      totalFiles: 1
    });
    const job2Id = insertJob(db, { repository_id: '/test/repo', status: 'queued' });
    const job2 = db.prepare(`SELECT * FROM indexing_jobs WHERE id = ?`).get(job2Id) as never;
    await pipeline2.run(job2);
    const finalDocCount = (db.prepare(`SELECT COUNT(*) as n FROM documents`).get() as { n: number })
      .n;
    // Only one document should exist (the updated one).
    expect(finalDocCount).toBe(1);
    const finalChecksum = (
      db.prepare(`SELECT checksum FROM documents LIMIT 1`).get() as { checksum: string }
    ).checksum;
    expect(finalChecksum).toBe('sha-v2');
  });

  it('updates job progress as files are processed', async () => {
    const files = Array.from({ length: 5 }, (_, i) => ({
      path: `file${i}.md`,
      content: `# File ${i}\n\nContent ${i}.`,
      sha: `sha-${i}`,
      language: 'markdown'
    }));
    const pipeline = makePipeline({ files, totalFiles: 5 });
    const job = makeJob();
    await pipeline.run(job as never);
    const updated = db.prepare(`SELECT progress FROM indexing_jobs WHERE id = ?`).get(job.id) as {
      progress: number;
    };
    expect(updated.progress).toBe(100);
  });

  it('uses the repository source_url when crawling local repositories', async () => {
    const crawl = vi.fn().mockResolvedValue({
      files: [],
      totalFiles: 0,
      skippedFiles: 0,
      branch: 'local',
      commitSha: 'abc'
    });
    const pipeline = new IndexingPipeline(db, vi.fn() as never, { crawl } as never, null);
    const job = makeJob();
    await pipeline.run(job as never);
    expect(crawl).toHaveBeenCalledWith({ rootPath: '/tmp/test-repo', ref: undefined });
  });

  it('integration: handles unchanged, modified, added, and deleted files in one run', async () => {
    // ---- First run: index three files -----------------------------------
    const firstFiles = [
      {
        path: 'unchanged.md',
        content: '# Unchanged\n\nThis file never changes.',
        sha: 'sha-unchanged',
        language: 'markdown'
      },
      {
        path: 'will-change.md',
        content: '# Original\n\nThis will be modified in the next run.',
        sha: 'sha-will-change-v1',
        language: 'markdown'
      },
      {
        path: 'will-delete.md',
        content: '# To Be Deleted\n\nThis file will vanish in the next run.',
        sha: 'sha-will-delete',
        language: 'markdown'
      }
    ];
    const pipeline1 = makePipeline({ files: firstFiles, totalFiles: 3 });
    const job1 = makeJob();
    await pipeline1.run(job1 as never);
    const afterFirstRun = {
      docs: db.prepare(`SELECT file_path, checksum FROM documents ORDER BY file_path`).all() as {
        file_path: string;
        checksum:
          string;
      }[],
      snippetCount: (db.prepare(`SELECT COUNT(*) as n FROM snippets`).get() as { n: number }).n
    };
    expect(afterFirstRun.docs).toHaveLength(3);
    expect(afterFirstRun.snippetCount).toBeGreaterThan(0);
    // ---- Second run: add a new file, modify one, delete one, keep one ---
    const secondFiles = [
      {
        path: 'unchanged.md',
        content: '# Unchanged\n\nThis file never changes.',
        sha: 'sha-unchanged', // same sha → should be skipped
        language: 'markdown'
      },
      {
        path: 'will-change.md',
        content: '# Modified\n\nThis file was modified with completely new content.',
        sha: 'sha-will-change-v2', // different sha → should be re-indexed
        language: 'markdown'
      },
      {
        path: 'brand-new.md',
        content: '# Brand New\n\nThis file was added in the second crawl.',
        sha: 'sha-brand-new', // not in DB → should be added
        language: 'markdown'
      }
      // 'will-delete.md' is intentionally absent → should be deleted
    ];
    const pipeline2 = makePipeline({ files: secondFiles, totalFiles: 3 });
    const job2Id = insertJob(db, { repository_id: '/test/repo', status: 'queued' });
    const job2 = db.prepare(`SELECT * FROM indexing_jobs WHERE id = ?`).get(job2Id) as never;
    await pipeline2.run(job2);
    // ---- Verify final DB state -------------------------------------------
    const finalDocs = db
      .prepare(`SELECT file_path, checksum FROM documents ORDER BY file_path`)
      .all() as { file_path: string; checksum: string }[];
    const filePaths = finalDocs.map((d) => d.file_path);
    // unchanged.md: still present, same checksum
    expect(filePaths).toContain('unchanged.md');
    const unchangedDoc = finalDocs.find((d) => d.file_path === 'unchanged.md');
    expect(unchangedDoc?.checksum).toBe('sha-unchanged');
    // will-change.md: present with updated checksum
    expect(filePaths).toContain('will-change.md');
    const changedDoc = finalDocs.find((d) => d.file_path === 'will-change.md');
    expect(changedDoc?.checksum).toBe('sha-will-change-v2');
    // brand-new.md: present (was added in second run)
    expect(filePaths).toContain('brand-new.md');
    // will-delete.md: NOT present (was absent from second crawl)
    expect(filePaths).not.toContain('will-delete.md');
    // Exactly 3 documents remain
    expect(finalDocs).toHaveLength(3);
    // Job ended successfully with full progress
    const finalJob = db
      .prepare(`SELECT status, progress FROM indexing_jobs WHERE id = ?`)
      .get(job2Id) as { status: string; progress: number };
    expect(finalJob.status).toBe('done');
    expect(finalJob.progress).toBe(100);
  });

  it('updates repository_versions state to indexing then indexed when job has versionId', async () => {
    const versionId = insertVersion(db, { tag: 'v1.0.0', state: 'pending' });
    const files = [
      {
        path: 'README.md',
        content: '# Hello\n\nThis is documentation.',
        sha: 'sha-readme',
        language: 'markdown'
      }
    ];
    const pipeline = makePipeline({ files, totalFiles: 1 });
    const job = makeJob('/test/repo', versionId);
    await pipeline.run(job as never);
    const version = db
      .prepare(`SELECT state, total_snippets, indexed_at FROM repository_versions WHERE id = ?`)
      .get(versionId) as { state: string; total_snippets: number; indexed_at: number | null };
    expect(version.state).toBe('indexed');
    expect(version.total_snippets).toBeGreaterThan(0);
    expect(version.indexed_at).not.toBeNull();
  });

  it('updates repository_versions state to error when pipeline throws and job has versionId', async () => {
    const versionId = insertVersion(db, { tag: 'v1.0.0', state: 'pending' });
    const errorCrawl = vi.fn().mockRejectedValue(new Error('crawl failed'));
    const pipeline = new IndexingPipeline(
      db,
      errorCrawl as never,
      { crawl: errorCrawl } as never,
      null
    );
    const job = makeJob('/test/repo', versionId);
    await expect(pipeline.run(job as never)).rejects.toThrow('crawl failed');
    const version = db
      .prepare(`SELECT state FROM repository_versions WHERE id = ?`)
      .get(versionId) as { state: string };
    expect(version.state).toBe('error');
  });

  it('does not touch repository_versions when job has no versionId', async () => {
    const versionId = insertVersion(db, { tag: 'v1.0.0', state: 'pending' });
    const pipeline = makePipeline({ files: [], totalFiles: 0 });
    const job = makeJob('/test/repo'); // no versionId
    await pipeline.run(job as never);
    const version = db
      .prepare(`SELECT state FROM repository_versions WHERE id = ?`)
      .get(versionId) as { state: string };
    // State should remain 'pending' — pipeline with no versionId must not touch it
    expect(version.state).toBe('pending');
  });

  it('calls LocalCrawler with ref=v1.2.0 when job has a versionId with tag v1.2.0', async () => {
    const versionId = insertVersion(db, { tag: 'v1.2.0', state: 'pending' });
    const crawl = vi.fn().mockResolvedValue({
      files: [],
      totalFiles: 0,
      skippedFiles: 0,
      branch: 'main',
      commitSha: 'abc'
    });
    const pipeline = new IndexingPipeline(db, vi.fn() as never, { crawl } as never, null);
    const job = makeJob('/test/repo', versionId);
    await pipeline.run(job as never);
    expect(crawl).toHaveBeenCalledWith({ rootPath: '/tmp/test-repo', ref: 'v1.2.0' });
  });

  it('calls LocalCrawler with ref=undefined when job has no versionId (main-branch)', async () => {
    const crawl = vi.fn().mockResolvedValue({
      files: [],
      totalFiles: 0,
      skippedFiles: 0,
      branch: 'main',
      commitSha: 'abc'
    });
    const pipeline = new IndexingPipeline(db, vi.fn() as never, { crawl } as never, null);
    const job = makeJob('/test/repo'); // no versionId
    await pipeline.run(job as never);
    expect(crawl).toHaveBeenCalledWith({ rootPath: '/tmp/test-repo', ref: undefined });
  });

  it('excludes files matching excludeFiles patterns from trueref.json', async () => {
    const truerefConfig = JSON.stringify({
      excludeFiles: ['migration-guide.md', 'docs/legacy*']
    });
    const files = [
      { path: 'trueref.json', content: truerefConfig, sha: 'sha-config', language: 'json' },
      {
        path: 'README.md',
        content: '# Hello\n\nThis is documentation.',
        sha: 'sha-readme',
        language: 'markdown'
      },
      {
        path: 'migration-guide.md',
        content: '# Migration Guide\n\nThis should be excluded.',
        sha: 'sha-migration',
        language: 'markdown'
      },
      {
        path: 'docs/legacy-api.md',
        content: '# Legacy API\n\nShould be excluded by glob prefix.',
        sha: 'sha-legacy',
        language: 'markdown'
      }
    ];
    const pipeline = makePipeline({ files, totalFiles: files.length });
    const job = makeJob();
    await pipeline.run(job as never);
    const docs = db
      .prepare(`SELECT file_path FROM documents ORDER BY file_path`)
      .all() as { file_path: string }[];
    const filePaths = docs.map((d) => d.file_path);
    // migration-guide.md and docs/legacy-api.md must be absent.
    expect(filePaths).not.toContain('migration-guide.md');
    expect(filePaths).not.toContain('docs/legacy-api.md');
    // README.md must still be indexed.
    expect(filePaths).toContain('README.md');
  });

  it('persists repo-wide rules from trueref.json to repository_configs after indexing', async () => {
    const truerefConfig = JSON.stringify({
      rules: ['Always use TypeScript strict mode', 'Prefer async/await over callbacks']
    });
    const files = [
      { path: 'trueref.json', content: truerefConfig, sha: 'sha-config', language: 'json' }
    ];
    const pipeline = makePipeline({ files, totalFiles: files.length });
    const job = makeJob();
    await pipeline.run(job as never);
    const row = db
      .prepare(
        `SELECT rules FROM repository_configs WHERE repository_id = '/test/repo' AND version_id IS NULL`
      )
      .get() as { rules: string } | undefined;
    expect(row).toBeDefined();
    const rules = JSON.parse(row!.rules);
    expect(rules).toEqual(['Always use TypeScript strict mode', 'Prefer async/await over callbacks']);
  });

  it('persists version-specific rules under (repositoryId, versionId) when job has versionId', async () => {
    const versionId = insertVersion(db, { tag: 'v2.0.0', state: 'pending' });
    const truerefConfig = JSON.stringify({
      rules: ['This is v2. Use the new Builder API.']
    });
    const files = [
      { path: 'trueref.json', content: truerefConfig, sha: 'sha-config', language: 'json' }
    ];
    const pipeline = makePipeline({ files, totalFiles: files.length });
    const job = makeJob('/test/repo', versionId);
    await pipeline.run(job as never);
    // Repo-wide row (version_id IS NULL) must NOT be written by a version job —
    // writing it here would contaminate the NULL entry with version-specific rules
    // (Bug 5b regression guard).
    const repoRow = db
      .prepare(
        `SELECT rules FROM repository_configs WHERE repository_id = '/test/repo' AND version_id IS NULL`
      )
      .get() as { rules: string } | undefined;
    expect(repoRow).toBeUndefined();
    // Version-specific row must exist with the correct rules.
    const versionRow = db
      .prepare(
        `SELECT rules FROM repository_configs WHERE repository_id = '/test/repo' AND version_id = ?`
      )
      .get(versionId) as { rules: string } | undefined;
    expect(versionRow).toBeDefined();
    const rules = JSON.parse(versionRow!.rules);
    expect(rules).toEqual(['This is v2. Use the new Builder API.']);
  });

  it('regression(Bug5b): version job does not overwrite the repo-wide NULL rules entry', async () => {
    // Arrange: index the main branch first to establish a repo-wide rules entry.
    const mainBranchRules = ['Always use TypeScript strict mode.'];
    const mainPipeline = makePipeline({
      files: [
        {
          path: 'trueref.json',
          content: JSON.stringify({ rules: mainBranchRules }),
          sha: 'sha-main-config',
          language: 'json'
        }
      ],
      totalFiles: 1
    });
    const mainJob = makeJob('/test/repo'); // no versionId → main-branch job
    await mainPipeline.run(mainJob as never);
    // Confirm the repo-wide entry was written.
    const afterMain = db
      .prepare(
        `SELECT rules FROM repository_configs WHERE repository_id = '/test/repo' AND version_id IS NULL`
      )
      .get() as { rules: string } | undefined;
    expect(afterMain).toBeDefined();
    expect(JSON.parse(afterMain!.rules)).toEqual(mainBranchRules);
    // Act: index a version with different rules.
const versionId = insertVersion(db, { tag: 'v3.0.0', state: 'pending' }); const versionRules = ['v3 only: use the streaming API.']; const versionPipeline = makePipeline({ files: [ { path: 'trueref.json', content: JSON.stringify({ rules: versionRules }), sha: 'sha-v3-config', language: 'json' } ], totalFiles: 1 }); const versionJob = makeJob('/test/repo', versionId); await versionPipeline.run(versionJob as never); // Assert: the repo-wide NULL entry must still contain the main-branch rules, // not the version-specific ones. const afterVersion = db .prepare( `SELECT rules FROM repository_configs WHERE repository_id = '/test/repo' AND version_id IS NULL` ) .get() as { rules: string } | undefined; expect(afterVersion).toBeDefined(); expect(JSON.parse(afterVersion!.rules)).toEqual(mainBranchRules); // And the version-specific row must contain the version rules. const versionRow = db .prepare( `SELECT rules FROM repository_configs WHERE repository_id = '/test/repo' AND version_id = ?` ) .get(versionId) as { rules: string } | undefined; expect(versionRow).toBeDefined(); expect(JSON.parse(versionRow!.rules)).toEqual(versionRules); }); it('persists rules from CrawlResult.config even when trueref.json is absent from files (folders allowlist bug)', async () => { // Regression test for MULTIVERSION-0001: // When trueref.json specifies a `folders` allowlist (e.g. ["src/"]), // shouldIndexFile() excludes trueref.json itself because it lives at the // repo root. The LocalCrawler now carries the pre-parsed config in // CrawlResult.config so the pipeline no longer needs to find the file in // crawlResult.files[]. const pipeline = makePipeline({ // trueref.json is NOT in files — simulates it being excluded by folders allowlist. files: [ { path: 'src/index.ts', content: 'export const x = 1;', sha: 'sha-src', language: 'typescript' } ], totalFiles: 1, // The pre-parsed config is carried here instead (set by LocalCrawler). 
config: { rules: ['Use strict TypeScript.', 'Avoid any.'] } });
    const job = makeJob();
    await pipeline.run(job as never);

    // Repo-level config rows are keyed by repository_id with version_id NULL.
    const row = db
      .prepare(
        `SELECT rules FROM repository_configs WHERE repository_id = '/test/repo' AND version_id IS NULL`
      )
      .get() as { rules: string } | undefined;
    expect(row).toBeDefined();
    const rules = JSON.parse(row!.rules);
    expect(rules).toEqual(['Use strict TypeScript.', 'Avoid any.']);
  });

  it('persists version-specific rules from CrawlResult.config when trueref.json is excluded by folders allowlist', async () => {
    const versionId = insertVersion(db, { tag: 'v3.0.0', state: 'pending' });
    const pipeline = makePipeline({
      files: [
        {
          path: 'src/index.ts',
          content: 'export const x = 1;',
          sha: 'sha-src',
          language: 'typescript'
        }
      ],
      totalFiles: 1,
      config: { rules: ['v3: use the streaming API.'] }
    });
    const job = makeJob('/test/repo', versionId);
    await pipeline.run(job as never);

    // Version-scoped config rows carry the version_id instead of NULL.
    const versionRow = db
      .prepare(
        `SELECT rules FROM repository_configs WHERE repository_id = '/test/repo' AND version_id = ?`
      )
      .get(versionId) as { rules: string } | undefined;
    expect(versionRow).toBeDefined();
    const rules = JSON.parse(versionRow!.rules);
    expect(rules).toEqual(['v3: use the streaming API.']);
  });
});

// ---------------------------------------------------------------------------
// differential indexing
// ---------------------------------------------------------------------------

describe('differential indexing', () => {
  let db: Database.Database;

  beforeEach(() => {
    db = createTestDb();
    insertRepo(db, { source: 'local', source_url: '/tmp/test-repo' });
  });

  /**
   * Inserts a `documents` row with sensible defaults; any key in `overrides`
   * wins over the default for that column. Returns the document id used.
   *
   * NOTE(review): the generic argument of Partial was garbled in the source
   * (`Partial>`); restored to `Partial<Record<string, unknown>>`, matching
   * the `overrides.x as string` casts below and `insertRepo` in this file.
   */
  function insertDocument(
    localDb: Database.Database,
    overrides: Partial<Record<string, unknown>> = {}
  ): string {
    const id = crypto.randomUUID();
    localDb
      .prepare(
        `INSERT INTO documents (id, repository_id, version_id, file_path, title, language, token_count, checksum, indexed_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`
      )
      .run(
        (overrides.id as string) ?? id,
        (overrides.repository_id as string) ?? '/test/repo',
        (overrides.version_id as string | null) ?? null,
        (overrides.file_path as string) ?? 'README.md',
        null,
        'markdown',
        100,
        (overrides.checksum as string) ?? 'abc123',
        Math.floor(Date.now() / 1000)
      );
    return (overrides.id as string) ?? id;
  }

  /**
   * Inserts a `snippets` row attached to `documentId` with fixed defaults;
   * `overrides` may replace id / repository_id / version_id. Returns the
   * snippet id used. (Same Partial<Record<string, unknown>> restoration as
   * insertDocument above.)
   */
  function insertSnippet(
    localDb: Database.Database,
    documentId: string,
    overrides: Partial<Record<string, unknown>> = {}
  ): string {
    const id = crypto.randomUUID();
    localDb
      .prepare(
        `INSERT INTO snippets (id, document_id, repository_id, version_id, type, title, content, language, breadcrumb, token_count, created_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
      )
      .run(
        (overrides.id as string) ?? id,
        documentId,
        (overrides.repository_id as string) ?? '/test/repo',
        (overrides.version_id as string | null) ?? null,
        'info',
        null,
        'content',
        'markdown',
        null,
        10,
        Math.floor(Date.now() / 1000)
      );
    return (overrides.id as string) ?? id;
  }

  /**
   * Widens IndexingPipeline so tests can call the private cloneFromAncestor
   * method directly (white-box testing). `unchangedPaths: Set` was garbled
   * in the source; restored to `Set<string>`, matching the string sets the
   * tests below pass in.
   */
  type PipelineInternals = IndexingPipeline & {
    cloneFromAncestor: (
      ancestorVersionId: string,
      targetVersionId: string,
      repositoryId: string,
      unchangedPaths: Set<string>
    ) => void;
  };

  it('cloneFromAncestor inserts documents and snippets into the target version', () => {
    const ancestorVersionId = insertVersion(db, { tag: 'v1.0.0', state: 'indexed' });
    const targetVersionId = insertVersion(db, { tag: 'v1.1.0', state: 'pending' });
    const doc1Id = insertDocument(db, {
      repository_id: '/test/repo',
      version_id: ancestorVersionId,
      file_path: 'README.md',
      checksum: 'sha-readme'
    });
    const doc2Id = insertDocument(db, {
      repository_id: '/test/repo',
      version_id: ancestorVersionId,
      file_path: 'src/index.ts',
      checksum: 'sha-index'
    });
    insertSnippet(db, doc1Id, { repository_id: '/test/repo', version_id: ancestorVersionId });
    insertSnippet(db, doc2Id, { repository_id: '/test/repo', version_id: ancestorVersionId });

    // Embedding service and crawler are irrelevant to cloning; stub them.
    const pipeline = new IndexingPipeline(
      db,
      vi.fn() as never,
      { crawl: vi.fn() } as never,
      null
    );
    (pipeline as unknown as PipelineInternals).cloneFromAncestor(
      ancestorVersionId,
      targetVersionId,
      '/test/repo',
      new Set(['README.md', 'src/index.ts'])
    );

    const targetDocs = db
      .prepare(`SELECT * FROM documents WHERE version_id = ?`)
      .all(targetVersionId) as { id: string; file_path: string }[];
    expect(targetDocs).toHaveLength(2);
    expect(targetDocs.map((d) => d.file_path).sort()).toEqual(
      ['README.md', 'src/index.ts'].sort()
    );
    // New IDs must differ from ancestor doc IDs.
    const targetDocIds = targetDocs.map((d) => d.id);
    expect(targetDocIds).not.toContain(doc1Id);
    expect(targetDocIds).not.toContain(doc2Id);

    const targetSnippets = db
      .prepare(`SELECT * FROM snippets WHERE version_id = ?`)
      .all(targetVersionId) as { id: string }[];
    expect(targetSnippets).toHaveLength(2);
  });

  it('cloneFromAncestor silently skips paths absent from the ancestor', () => {
    const ancestorVersionId = insertVersion(db, { tag: 'v1.0.0', state: 'indexed' });
    const targetVersionId = insertVersion(db, { tag: 'v1.1.0', state: 'pending' });
    insertDocument(db, {
      repository_id: '/test/repo',
      version_id: ancestorVersionId,
      file_path: 'src/main.ts',
      checksum: 'sha-main'
    });
    const pipeline = new IndexingPipeline(
      db,
      vi.fn() as never,
      { crawl: vi.fn() } as never,
      null
    );
    // MISSING.md has no ancestor document; only src/main.ts should be cloned.
    (pipeline as unknown as PipelineInternals).cloneFromAncestor(
      ancestorVersionId,
      targetVersionId,
      '/test/repo',
      new Set(['src/main.ts', 'MISSING.md'])
    );

    const targetDocs = db
      .prepare(`SELECT * FROM documents WHERE version_id = ?`)
      .all(targetVersionId) as { id: string; file_path: string }[];
    expect(targetDocs).toHaveLength(1);
    expect(targetDocs[0].file_path).toBe('src/main.ts');
  });

  it('falls back to full crawl when no indexed ancestor exists', async () => {
    const targetVersionId = insertVersion(db, { tag: 'v1.0.0', state: 'pending' });
    const files = [
      {
        path: 'README.md',
        content: '# Hello\n\nThis is documentation.',
        sha: 'sha-readme',
        language: 'markdown'
      },
      {
        path: 'src/index.ts',
        content: 'export const x = 1;',
        sha: 'sha-index',
        language: 'typescript'
      }
    ];
    const mockLocalCrawl = vi.fn().mockResolvedValue({
      files,
      totalFiles: 2,
      skippedFiles: 0,
      branch: 'main',
      commitSha: 'abc'
    });
    const pipeline = new IndexingPipeline(
      db,
      vi.fn() as never,
      { crawl: mockLocalCrawl } as never,
      null
    );
    const jobId = insertJob(db, {
      repository_id: '/test/repo',
      version_id: targetVersionId,
      status: 'queued'
    });
    const job = db.prepare(`SELECT * FROM indexing_jobs WHERE id = ?`).get(jobId) as never;
    await pipeline.run(job);

    const updatedJob = db
      .prepare(`SELECT status FROM indexing_jobs WHERE id = ?`)
      .get(jobId) as { status: string };
    expect(updatedJob.status).toBe('done');

    const docs = db
      .prepare(`SELECT * FROM documents WHERE version_id = ?`)
      .all(targetVersionId) as { id: string }[];
    expect(docs.length).toBeGreaterThanOrEqual(2);
  });

  it('cloned unchanged documents survive the diff/replace stage', async () => {
    // 1. Set up ancestor and target versions.
    const ancestorVersionId = insertVersion(db, { tag: 'v1.0.0', state: 'indexed' });
    const targetVersionId = insertVersion(db, { tag: 'v1.1.0', state: 'pending' });

    // 2. Insert ancestor doc + snippet for unchanged.md.
    const ancestorDocId = insertDocument(db, {
      repository_id: '/test/repo',
      version_id: ancestorVersionId,
      file_path: 'unchanged.md',
      checksum: 'sha-unchanged'
    });
    insertSnippet(db, ancestorDocId, { repository_id: '/test/repo', version_id: ancestorVersionId });

    // 3. Crawl returns ONLY changed.md (unchanged.md is absent — differential only).
    const mockLocalCrawl = vi.fn().mockResolvedValue({
      files: [
        {
          path: 'changed.md',
          content: '# Changed\n\nThis file was added.',
          sha: 'sha-changed',
          language: 'markdown'
        }
      ],
      totalFiles: 1,
      skippedFiles: 0,
      branch: 'main',
      commitSha: 'abc'
    });

    // 4. Mock buildDifferentialPlan to return a plan with the two paths.
    const mockPlan = {
      ancestorVersionId,
      ancestorTag: 'v1.0.0',
      changedPaths: new Set(['changed.md']),
      // Explicit element type: a bare `new Set()` infers Set<unknown>, which
      // does not satisfy the plan's Set<string> field.
      deletedPaths: new Set<string>(),
      unchangedPaths: new Set(['unchanged.md'])
    };
    const spy = vi
      .spyOn(diffStrategy, 'buildDifferentialPlan')
      .mockResolvedValueOnce(mockPlan);
    const pipeline = new IndexingPipeline(
      db,
      vi.fn() as never,
      { crawl: mockLocalCrawl } as never,
      null
    );

    // 5. Run pipeline for the target version job.
    const jobId = insertJob(db, {
      repository_id: '/test/repo',
      version_id: targetVersionId,
      status: 'queued'
    });
    const job = db.prepare(`SELECT * FROM indexing_jobs WHERE id = ?`).get(jobId) as never;
    await pipeline.run(job);
    spy.mockRestore();

    // 6. Assert job completed and both docs exist under the target version.
    const finalJob = db
      .prepare(`SELECT status FROM indexing_jobs WHERE id = ?`)
      .get(jobId) as { status: string };
    expect(finalJob.status).toBe('done');

    const targetDocs = db
      .prepare(`SELECT file_path FROM documents WHERE version_id = ?`)
      .all(targetVersionId) as { file_path: string }[];
    const filePaths = targetDocs.map((d) => d.file_path);
    // unchanged.md was cloned and must NOT have been deleted by computeDiff.
    expect(filePaths).toContain('unchanged.md');
    // changed.md was crawled and indexed in this run.
    expect(filePaths).toContain('changed.md');
  });
});