/**
 * Unit tests for IndexingPipeline and JobQueue (TRUEREF-0009).
 *
 * Uses an in-memory SQLite database populated with the same migration SQL
 * as the production database.
 */

import { describe, it, expect, beforeEach, vi } from 'vitest';
import Database from 'better-sqlite3';
import { readFileSync } from 'node:fs';
import { join } from 'node:path';
import { JobQueue } from './job-queue.js';
import { IndexingPipeline } from './indexing.pipeline.js';
import { recoverStaleJobs } from './startup.js';

// ---------------------------------------------------------------------------
// Test DB factory
// ---------------------------------------------------------------------------

/**
 * Creates a fresh in-memory SQLite database with foreign keys enabled and
 * the schema from `0000_large_master_chief.sql` applied.
 *
 * The migration file is split on the `--> statement-breakpoint` marker and
 * every non-empty statement is executed in file order.
 *
 * @returns An open better-sqlite3 handle backed by `:memory:`.
 */
function createTestDb(): Database.Database {
	const client = new Database(':memory:');
	client.pragma('foreign_keys = ON');

	const migrationsFolder = join(import.meta.dirname, '../db/migrations');
	const migrationSql = readFileSync(
		join(migrationsFolder, '0000_large_master_chief.sql'),
		'utf-8'
	);

	const statements = migrationSql
		.split('--> statement-breakpoint')
		.map((s) => s.trim())
		.filter(Boolean);

	for (const stmt of statements) {
		client.exec(stmt);
	}

	return client;
}

// ---------------------------------------------------------------------------
// Fixtures
// ---------------------------------------------------------------------------

/** Unix timestamp in seconds, captured once when this module is loaded. */
const now = Math.floor(Date.now() / 1000);

/**
 * Inserts one row into `repositories`.
 *
 * Only `id`, `title`, `source`, `source_url`, `branch` and `state` can be
 * overridden. The counters and scores are written as 0, `stars`,
 * `github_token` and `last_indexed_at` as NULL, and both timestamps as `now`.
 *
 * @param db - Database to insert into.
 * @param overrides - Column values (keyed by column name) replacing the defaults.
 */
function insertRepo(
	db: Database.Database,
	overrides: Partial<Record<string, unknown>> = {}
): void {
	db.prepare(
		`INSERT INTO repositories
			(id, title, source, source_url, branch, state,
			 total_snippets, total_tokens, trust_score, benchmark_score,
			 stars, github_token, last_indexed_at, created_at, updated_at)
		 VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
	).run(
		overrides.id ?? '/test/repo',
		overrides.title ?? 'Test Repo',
		overrides.source ?? 'local',
		overrides.source_url ?? '/tmp/test-repo',
		overrides.branch ?? 'main',
		overrides.state ?? 'pending',
		0,
		0,
		0,
		0,
		null,
		null,
		null,
		now,
		now
	);
}

/**
 * Inserts one row into `indexing_jobs`.
 *
 * Defaults: a random UUID as id, repository `/test/repo`, status `queued`,
 * zeroed progress counters, NULL version/error/started/completed fields and
 * `created_at = now`. Any column can be replaced through `overrides`.
 *
 * @param db - Database to insert into.
 * @param overrides - Column values (keyed by column name) replacing the defaults.
 * @returns The id of the inserted job (the override when one was given).
 */
function insertJob(
	db: Database.Database,
	overrides: Partial<Record<string, unknown>> = {}
): string {
	const id = crypto.randomUUID();
	db.prepare(
		`INSERT INTO indexing_jobs
			(id, repository_id, version_id, status, progress,
			 total_files, processed_files, error,
			 started_at, completed_at, created_at)
		 VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
	).run(
		overrides.id ?? id,
		overrides.repository_id ?? '/test/repo',
		overrides.version_id ?? null,
		overrides.status ?? 'queued',
		overrides.progress ?? 0,
		overrides.total_files ?? 0,
		overrides.processed_files ?? 0,
		overrides.error ?? null,
		overrides.started_at ?? null,
		overrides.completed_at ?? null,
		overrides.created_at ?? now
	);
	return (overrides.id as string) ?? id;
}

// ---------------------------------------------------------------------------
// recoverStaleJobs
// ---------------------------------------------------------------------------

describe('recoverStaleJobs', () => {
	let db: Database.Database;

	// Every test starts from an empty schema holding only the default repository.
	beforeEach(() => {
		db = createTestDb();
		insertRepo(db);
	});

	it('marks running jobs as failed', () => {
		insertJob(db, { status: 'running' });
		recoverStaleJobs(db);

		const row = db
			.prepare(`SELECT status, error FROM indexing_jobs LIMIT 1`)
			.get() as { status: string; error: string };
		expect(row.status).toBe('failed');
		expect(row.error).toMatch(/restarted/i);
	});

	it('resets repositories in indexing state to error', () => {
		db.prepare(`UPDATE repositories SET state = 'indexing' WHERE id = '/test/repo'`).run();
		recoverStaleJobs(db);

		const row = db
			.prepare(`SELECT state FROM repositories WHERE id = '/test/repo'`)
			.get() as { state: string };
		expect(row.state).toBe('error');
	});

	it('leaves queued and done jobs untouched', () => {
		insertJob(db, { status: 'queued' });
		insertJob(db, { status: 'done' });
		recoverStaleJobs(db);

		// Both rows must still carry their original status after recovery.
		const rows = db
			.prepare(`SELECT status FROM indexing_jobs WHERE status IN ('queued', 'done')`)
			.all() as { status: string }[];
		expect(rows).toHaveLength(2);
	});
});

//
// ---------------------------------------------------------------------------
// JobQueue
// ---------------------------------------------------------------------------

describe('JobQueue', () => {
	let db: Database.Database;
	let queue: JobQueue;

	// Fresh database, default repository and a new queue for every test.
	beforeEach(() => {
		db = createTestDb();
		insertRepo(db);
		queue = new JobQueue(db);
	});

	it('enqueues a new job and returns it', () => {
		const job = queue.enqueue('/test/repo');
		expect(job.status).toBe('queued');
		// Accept either the camelCase or the raw snake_case column name.
		expect(job.repositoryId ?? (job as unknown as { repository_id: string }).repository_id).toBe(
			'/test/repo'
		);
	});

	it('deduplicates: returns existing active job instead of creating a new one', () => {
		const job1 = queue.enqueue('/test/repo');
		const job2 = queue.enqueue('/test/repo');
		expect(job1.id).toBe(job2.id);

		const count = (
			db.prepare(`SELECT COUNT(*) as n FROM indexing_jobs`).get() as { n: number }
		).n;
		expect(count).toBe(1);
	});

	it('getJob returns null for unknown ID', () => {
		expect(queue.getJob('non-existent')).toBeNull();
	});

	it('getJob returns the job if it exists', () => {
		const jobId = insertJob(db, { status: 'done' });
		const job = queue.getJob(jobId);
		expect(job).not.toBeNull();
		expect(job?.id).toBe(jobId);
	});

	it('listJobs returns all jobs ordered by created_at desc', () => {
		insertJob(db, { created_at: now - 10, status: 'done' });
		insertJob(db, { created_at: now - 5, status: 'done' });
		insertJob(db, { created_at: now, status: 'queued' });

		const jobs = queue.listJobs();
		expect(jobs.length).toBeGreaterThanOrEqual(3);
		// Most recent first.
		expect(jobs[0].status).toBe('queued');
	});

	it('listJobs filters by repositoryId', () => {
		insertRepo(db, { id: '/other/repo', source_url: '/tmp/other' });
		insertJob(db, { repository_id: '/other/repo', status: 'done' });
		insertJob(db, { status: 'queued' });

		const jobs = queue.listJobs({ repositoryId: '/other/repo' });
		expect(jobs).toHaveLength(1);
	});

	it('listJobs filters by status', () => {
		insertJob(db, { status: 'queued' });
		insertJob(db, { status: 'done' });
		insertJob(db, { status: 'failed' });

		const queued = queue.listJobs({ status: 'queued' });
		// `every` is vacuously true for an empty array, so pin the size as well.
		expect(queued).toHaveLength(1);
		expect(queued.every((j) => j.status === 'queued')).toBe(true);
	});

	it('countJobs returns correct count', () => {
		insertJob(db, { status: 'done' });
		insertJob(db, { status: 'done' });
		insertJob(db, { status: 'failed' });

		expect(queue.countJobs()).toBe(3);
		expect(queue.countJobs({ status: 'done' })).toBe(2);
		expect(queue.countJobs({ status: 'failed' })).toBe(1);
	});
});

// ---------------------------------------------------------------------------
// IndexingPipeline
// ---------------------------------------------------------------------------

describe('IndexingPipeline', () => {
	let db: Database.Database;

	beforeEach(() => {
		db = createTestDb();
		insertRepo(db, { source: 'local', source_url: '/tmp/test-repo' });
	});

	/**
	 * Builds a pipeline whose GitHub and local crawlers are mocks that both
	 * resolve to `crawlResult` plus fixed `skippedFiles`, `branch` and
	 * `commitSha` values. The fourth constructor argument is passed as null.
	 */
	function makePipeline(
		crawlResult: {
			files: Array<{ path: string; content: string; sha: string; language: string }>;
			totalFiles: number;
		} = { files: [], totalFiles: 0 }
	) {
		// Each mock gets its own payload object, as in a hand-written pair.
		const crawlResponse = () => ({
			...crawlResult,
			skippedFiles: 0,
			branch: 'main',
			commitSha: 'abc'
		});

		const mockGithubCrawl = vi.fn().mockResolvedValue(crawlResponse());
		const mockLocalCrawler = {
			crawl: vi.fn().mockResolvedValue(crawlResponse())
		};

		return new IndexingPipeline(
			db,
			mockGithubCrawl as never,
			mockLocalCrawler as never,
			null
		);
	}

	/**
	 * Inserts a queued job for `repositoryId` and returns the raw row read
	 * back from `indexing_jobs`.
	 */
	function makeJob(repositoryId = '/test/repo') {
		const jobId = insertJob(db, { repository_id: repositoryId, status: 'queued' });
		return db
			.prepare(`SELECT * FROM indexing_jobs WHERE id = ?`)
			.get(jobId) as {
				id: string;
				repositoryId?: string;
				repository_id?: string;
				status: string;
				versionId?: string;
				version_id?: string;
			};
	}

	it('marks job as done when there are no files to index', async () => {
		const pipeline = makePipeline({ files: [], totalFiles: 0 });
		const job = makeJob();

		await pipeline.run(job as never);

		const updated = db
			.prepare(`SELECT status, progress FROM indexing_jobs WHERE id = ?`)
			.get(job.id) as { status: string; progress: number };
		expect(updated.status).toBe('done');
		expect(updated.progress).toBe(100);
	});

	it('marks job as running then done (final state is done)', async () => {
		const pipeline = makePipeline({ files: [], totalFiles: 0 });
		const job = makeJob();

		await pipeline.run(job as never);

		const updated = db
			.prepare(`SELECT status FROM indexing_jobs WHERE id = ?`)
			.get(job.id) as { status: string };
		// The job should end in 'done' — the running→done transition is covered
		// by the pipeline's internal updateJob calls.
		expect(updated.status).toBe('done');
	});

	it('marks job as failed and repo as error when pipeline throws', async () => {
		// The same rejecting mock serves as both the GitHub and the local crawler.
		const errorCrawl = vi.fn().mockRejectedValue(new Error('crawl failed'));
		const pipeline = new IndexingPipeline(
			db,
			errorCrawl as never,
			{ crawl: errorCrawl } as never,
			null
		);
		const job = makeJob();

		await expect(pipeline.run(job as never)).rejects.toThrow('crawl failed');

		const updatedJob = db
			.prepare(`SELECT status, error FROM indexing_jobs WHERE id = ?`)
			.get(job.id) as { status: string; error: string };
		expect(updatedJob.status).toBe('failed');
		expect(updatedJob.error).toBe('crawl failed');

		const updatedRepo = db
			.prepare(`SELECT state FROM repositories WHERE id = '/test/repo'`)
			.get() as { state: string };
		expect(updatedRepo.state).toBe('error');
	});

	it('inserts documents and snippets for new files', async () => {
		const files = [
			{
				path: 'README.md',
				content: '# Hello\n\nThis is documentation.',
				sha: 'sha-readme',
				language: 'markdown'
			}
		];
		const pipeline = makePipeline({ files, totalFiles: 1 });
		const job = makeJob();

		await pipeline.run(job as never);

		const docs = db.prepare(`SELECT * FROM documents`).all() as unknown[];
		expect(docs.length).toBeGreaterThan(0);

		const snippets = db.prepare(`SELECT * FROM snippets`).all() as unknown[];
		expect(snippets.length).toBeGreaterThan(0);

		const repo = db
			.prepare(`SELECT state, total_snippets FROM repositories WHERE id = '/test/repo'`)
			.get() as { state: string; total_snippets: number };
		expect(repo.state).toBe('indexed');
		expect(repo.total_snippets).toBeGreaterThan(0);
	});

	it('skips unchanged files (checksum match)', async () => {
		// First indexing run.
		const files = [
			{
				path: 'README.md',
				content: '# Hello\n\nThis is documentation.',
				sha: 'sha-readme',
				language: 'markdown'
			}
		];
		const pipeline = makePipeline({ files, totalFiles: 1 });
		const job1 = makeJob();
		await pipeline.run(job1 as never);

		const firstDocCount = (
			db.prepare(`SELECT COUNT(*) as n FROM documents`).get() as { n: number }
		).n;
		const firstSnippetIds = (
			db.prepare(`SELECT id FROM snippets`).all() as { id: string }[]
		).map((r) => r.id);

		// Second run with identical files.
		const job2 = makeJob();
		await pipeline.run(job2 as never);

		const secondDocCount = (
			db.prepare(`SELECT COUNT(*) as n FROM documents`).get() as { n: number }
		).n;
		const secondSnippetIds = (
			db.prepare(`SELECT id FROM snippets`).all() as { id: string }[]
		).map((r) => r.id);

		// Document count stays the same and snippet IDs are unchanged.
		expect(secondDocCount).toBe(firstDocCount);
		expect(secondSnippetIds).toEqual(firstSnippetIds);
	});

	it('replaces snippets atomically when a file changes', async () => {
		const pipeline1 = makePipeline({
			files: [
				{
					path: 'README.md',
					content:
						'# Original\n\nThis is the original version of the documentation with sufficient content.',
					sha: 'sha-v1',
					language: 'markdown'
				}
			],
			totalFiles: 1
		});
		const job1 = makeJob();
		await pipeline1.run(job1 as never);

		const originalSnippetCount = (
			db.prepare(`SELECT COUNT(*) as n FROM snippets`).get() as { n: number }
		).n;
		expect(originalSnippetCount).toBeGreaterThan(0);

		// Second run with changed file content.
		const pipeline2 = makePipeline({
			files: [
				{
					path: 'README.md',
					content:
						'# Updated\n\nThis is a completely different version of the documentation with new content.',
					sha: 'sha-v2',
					language: 'markdown'
				}
			],
			totalFiles: 1
		});
		const job2 = makeJob();
		await pipeline2.run(job2 as never);

		const finalDocCount = (
			db.prepare(`SELECT COUNT(*) as n FROM documents`).get() as { n: number }
		).n;
		// Only one document should exist (the updated one).
		expect(finalDocCount).toBe(1);

		const finalChecksum = (
			db.prepare(`SELECT checksum FROM documents LIMIT 1`).get() as { checksum: string }
		).checksum;
		expect(finalChecksum).toBe('sha-v2');
	});

	it('updates job progress as files are processed', async () => {
		const files = Array.from({ length: 5 }, (_, i) => ({
			path: `file${i}.md`,
			content: `# File ${i}\n\nContent ${i}.`,
			sha: `sha-${i}`,
			language: 'markdown'
		}));

		const pipeline = makePipeline({ files, totalFiles: 5 });
		const job = makeJob();
		await pipeline.run(job as never);

		const updated = db
			.prepare(`SELECT progress FROM indexing_jobs WHERE id = ?`)
			.get(job.id) as { progress: number };
		expect(updated.progress).toBe(100);
	});

	it('uses the repository source_url when crawling local repositories', async () => {
		const crawl = vi.fn().mockResolvedValue({
			files: [],
			totalFiles: 0,
			skippedFiles: 0,
			branch: 'local',
			commitSha: 'abc'
		});

		const pipeline = new IndexingPipeline(
			db,
			vi.fn() as never,
			{ crawl } as never,
			null
		);

		const job = makeJob();
		await pipeline.run(job as never);

		expect(crawl).toHaveBeenCalledWith({
			rootPath: '/tmp/test-repo',
			ref: undefined
		});
	});

	it('integration: handles unchanged, modified, added, and deleted files in one run', async () => {
		// ---- First run: index three files -----------------------------------
		const firstFiles = [
			{
				path: 'unchanged.md',
				content: '# Unchanged\n\nThis file never changes.',
				sha: 'sha-unchanged',
				language: 'markdown'
			},
			{
				path: 'will-change.md',
				content: '# Original\n\nThis will be modified in the next run.',
				sha: 'sha-will-change-v1',
				language: 'markdown'
			},
			{
				path: 'will-delete.md',
				content: '# To Be Deleted\n\nThis file will vanish in the next run.',
				sha: 'sha-will-delete',
				language: 'markdown'
			}
		];

		const pipeline1 = makePipeline({ files: firstFiles, totalFiles: 3 });
		const job1 = makeJob();
		await pipeline1.run(job1 as never);

		const afterFirstRun = {
			docs: db.prepare(`SELECT file_path, checksum FROM documents ORDER BY file_path`).all() as {
				file_path: string;
				checksum: string;
			}[],
			snippetCount: (db.prepare(`SELECT COUNT(*) as n FROM snippets`).get() as { n: number }).n
		};
		expect(afterFirstRun.docs).toHaveLength(3);
		expect(afterFirstRun.snippetCount).toBeGreaterThan(0);

		// ---- Second run: add a new file, modify one, delete one, keep one ---
		const secondFiles = [
			{
				path: 'unchanged.md',
				content: '# Unchanged\n\nThis file never changes.',
				sha: 'sha-unchanged', // same sha → should be skipped
				language: 'markdown'
			},
			{
				path: 'will-change.md',
				content: '# Modified\n\nThis file was modified with completely new content.',
				sha: 'sha-will-change-v2', // different sha → should be re-indexed
				language: 'markdown'
			},
			{
				path: 'brand-new.md',
				content: '# Brand New\n\nThis file was added in the second crawl.',
				sha: 'sha-brand-new', // not in DB → should be added
				language: 'markdown'
			}
			// 'will-delete.md' is intentionally absent → should be deleted
		];

		const pipeline2 = makePipeline({ files: secondFiles, totalFiles: 3 });
		const job2 = makeJob();
		await pipeline2.run(job2 as never);

		// ---- Verify final DB state -------------------------------------------
		const finalDocs = db
			.prepare(`SELECT file_path, checksum FROM documents ORDER BY file_path`)
			.all() as { file_path: string; checksum: string }[];

		const filePaths = finalDocs.map((d) => d.file_path);

		// unchanged.md: still present, same checksum
		expect(filePaths).toContain('unchanged.md');
		const unchangedDoc = finalDocs.find((d) => d.file_path === 'unchanged.md');
		expect(unchangedDoc?.checksum).toBe('sha-unchanged');

		// will-change.md: present with updated checksum
		expect(filePaths).toContain('will-change.md');
		const changedDoc = finalDocs.find((d) => d.file_path === 'will-change.md');
		expect(changedDoc?.checksum).toBe('sha-will-change-v2');

		// brand-new.md: present (was added in second run)
		expect(filePaths).toContain('brand-new.md');

		// will-delete.md: NOT present (was absent from second crawl)
		expect(filePaths).not.toContain('will-delete.md');

		// Exactly 3 documents remain
		expect(finalDocs).toHaveLength(3);

		// Job ended successfully with full progress
		const finalJob = db
			.prepare(`SELECT status, progress FROM indexing_jobs WHERE id = ?`)
			.get(job2.id) as { status: string; progress: number };
		expect(finalJob.status).toBe('done');
		expect(finalJob.progress).toBe(100);
	});
});