// trueref-legacy/src/lib/server/pipeline/indexing.pipeline.test.ts

/**
 * Unit tests for IndexingPipeline and JobQueue (TRUEREF-0009).
 *
 * Uses an in-memory SQLite database populated with the same migration SQL
 * as the production database.
 */

import { describe, it, expect, beforeEach, vi } from 'vitest';
import Database from 'better-sqlite3';
import { readFileSync } from 'node:fs';
import { join } from 'node:path';
import { JobQueue } from './job-queue.js';
import { IndexingPipeline } from './indexing.pipeline.js';
import { recoverStaleJobs } from './startup.js';

// ---------------------------------------------------------------------------
// Test DB factory
// ---------------------------------------------------------------------------

/**
 * Builds a fresh in-memory SQLite database for a single test.
 *
 * Foreign-key enforcement is switched on, then the statements found in the
 * `0000_large_master_chief.sql` migration file are executed one by one.
 *
 * @returns The open database connection, ready for fixtures to be inserted.
 */
function createTestDb(): Database.Database {
	const db = new Database(':memory:');
	db.pragma('foreign_keys = ON');

	const migrationFile = join(
		import.meta.dirname,
		'../db/migrations',
		'0000_large_master_chief.sql'
	);
	const migrationSql = readFileSync(migrationFile, 'utf-8');

	// The migration file separates statements with a breakpoint marker;
	// run each non-empty piece on its own.
	for (const chunk of migrationSql.split('--> statement-breakpoint')) {
		const statement = chunk.trim();
		if (statement) {
			db.exec(statement);
		}
	}

	return db;
}

// ---------------------------------------------------------------------------
// Fixtures
// ---------------------------------------------------------------------------

// Unix time in whole seconds, captured once when this module is evaluated.
// The fixtures below use it as the default created_at / updated_at value.
const now = Math.floor(Date.now() / 1000);

/**
 * Inserts one row into `repositories` to serve as a test fixture.
 *
 * @param db - Database to insert into.
 * @param overrides - Optional values for `id`, `title`, `source`,
 *   `source_url`, `branch` and `state`. Any key that is missing, `null` or
 *   `undefined` falls back to the default shown in the body. All other
 *   columns are always written with fixed values.
 */
function insertRepo(
	db: Database.Database,
	overrides: Partial<Record<string, unknown>> = {}
): void {
	const insertSql = `INSERT INTO repositories
       (id, title, source, source_url, branch, state,
        total_snippets, total_tokens, trust_score, benchmark_score,
        stars, github_token, last_indexed_at, created_at, updated_at)
     VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`;

	// Positional values, in the same order as the column list above.
	const values: unknown[] = [
		overrides.id ?? '/test/repo',
		overrides.title ?? 'Test Repo',
		overrides.source ?? 'local',
		overrides.source_url ?? '/tmp/test-repo',
		overrides.branch ?? 'main',
		overrides.state ?? 'pending',
		0, // total_snippets
		0, // total_tokens
		0, // trust_score
		0, // benchmark_score
		null, // stars
		null, // github_token
		null, // last_indexed_at
		now, // created_at
		now // updated_at
	];

	db.prepare(insertSql).run(...values);
}

/**
 * Inserts one row into `indexing_jobs` to serve as a test fixture.
 *
 * @param db - Database to insert into.
 * @param overrides - Optional column values keyed by column name. Any key
 *   that is missing, `null` or `undefined` falls back to a default: a random
 *   UUID for `id`, `/test/repo` for `repository_id`, `queued` for `status`,
 *   zero for the counters, `now` for `created_at` and `null` elsewhere.
 * @returns The id of the inserted job (the override if given, else the UUID).
 */
function insertJob(
	db: Database.Database,
	overrides: Partial<Record<string, unknown>> = {}
): string {
	const generatedId = crypto.randomUUID();
	const jobId = overrides.id ?? generatedId;

	const insertSql = `INSERT INTO indexing_jobs
       (id, repository_id, version_id, status, progress,
        total_files, processed_files, error, started_at, completed_at, created_at)
     VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`;

	// Positional values, in the same order as the column list above.
	const values: unknown[] = [
		jobId,
		overrides.repository_id ?? '/test/repo',
		overrides.version_id ?? null,
		overrides.status ?? 'queued',
		overrides.progress ?? 0,
		overrides.total_files ?? 0,
		overrides.processed_files ?? 0,
		overrides.error ?? null,
		overrides.started_at ?? null,
		overrides.completed_at ?? null,
		overrides.created_at ?? now
	];

	db.prepare(insertSql).run(...values);
	return jobId as string;
}

// ---------------------------------------------------------------------------
// recoverStaleJobs
// ---------------------------------------------------------------------------

describe('recoverStaleJobs', () => {
	let db: Database.Database;

	// Each test starts from a freshly migrated database holding the default repository.
	beforeEach(() => {
		db = createTestDb();
		insertRepo(db);
	});

	it('marks running jobs as failed', () => {
		insertJob(db, { status: 'running' });

		recoverStaleJobs(db);

		const selectJob = db.prepare(`SELECT status, error FROM indexing_jobs LIMIT 1`);
		const { status, error } = selectJob.get() as { status: string; error: string };
		expect(status).toBe('failed');
		expect(error).toMatch(/restarted/i);
	});

	it('resets repositories in indexing state to error', () => {
		// Put the default repository into the state that recovery is expected to reset.
		const markIndexing = db.prepare(
			`UPDATE repositories SET state = 'indexing' WHERE id = '/test/repo'`
		);
		markIndexing.run();

		recoverStaleJobs(db);

		const selectRepo = db.prepare(`SELECT state FROM repositories WHERE id = '/test/repo'`);
		const repo = selectRepo.get() as { state: string };
		expect(repo.state).toBe('error');
	});

	it('leaves queued and done jobs untouched', () => {
		for (const status of ['queued', 'done']) {
			insertJob(db, { status });
		}

		recoverStaleJobs(db);

		// Both jobs must still carry one of their original statuses.
		const selectUntouched = db.prepare(
			`SELECT status FROM indexing_jobs WHERE status IN ('queued', 'done')`
		);
		const untouched = selectUntouched.all() as { status: string }[];
		expect(untouched).toHaveLength(2);
	});
});

// ---------------------------------------------------------------------------
// JobQueue
// ---------------------------------------------------------------------------

describe('JobQueue', () => {
	let db: Database.Database;
	let queue: JobQueue;

	// Each test gets its own migrated database, the default repository and a new queue.
	beforeEach(() => {
		db = createTestDb();
		insertRepo(db);
		queue = new JobQueue(db);
	});

	it('enqueues a new job and returns it', () => {
		const job = queue.enqueue('/test/repo');
		expect(job.status).toBe('queued');
		// Accept either key casing for the repository id on the returned job.
		expect(job.repositoryId ?? (job as unknown as { repository_id: string }).repository_id).toBe(
			'/test/repo'
		);
	});

	it('deduplicates: returns existing active job instead of creating a new one', () => {
		const job1 = queue.enqueue('/test/repo');
		const job2 = queue.enqueue('/test/repo');
		expect(job1.id).toBe(job2.id);

		const count = (
			db.prepare(`SELECT COUNT(*) as n FROM indexing_jobs`).get() as { n: number }
		).n;
		expect(count).toBe(1);
	});

	it('getJob returns null for unknown ID', () => {
		expect(queue.getJob('non-existent')).toBeNull();
	});

	it('getJob returns the job if it exists', () => {
		const jobId = insertJob(db, { status: 'done' });
		const job = queue.getJob(jobId);
		expect(job).not.toBeNull();
		expect(job!.id).toBe(jobId);
	});

	it('listJobs returns all jobs ordered by created_at desc', () => {
		const oldestId = insertJob(db, { created_at: now - 10, status: 'done' });
		const middleId = insertJob(db, { created_at: now - 5, status: 'done' });
		const newestId = insertJob(db, { created_at: now, status: 'queued' });

		const jobs = queue.listJobs();

		// Check the complete ordering, not only the first element.
		expect(jobs.map((j) => j.id)).toEqual([newestId, middleId, oldestId]);
		// Most recent first.
		expect(jobs[0].status).toBe('queued');
	});

	it('listJobs filters by repositoryId', () => {
		insertRepo(db, { id: '/other/repo', source_url: '/tmp/other' });
		const otherJobId = insertJob(db, { repository_id: '/other/repo', status: 'done' });
		insertJob(db, { status: 'queued' });

		const jobs = queue.listJobs({ repositoryId: '/other/repo' });

		// Exactly the job of the requested repository must come back.
		expect(jobs.map((j) => j.id)).toEqual([otherJobId]);
	});

	it('listJobs filters by status', () => {
		insertJob(db, { status: 'queued' });
		insertJob(db, { status: 'done' });
		insertJob(db, { status: 'failed' });

		const queued = queue.listJobs({ status: 'queued' });

		// Assert the length first: every() alone would pass on an empty result.
		expect(queued).toHaveLength(1);
		expect(queued.every((j) => j.status === 'queued')).toBe(true);
	});

	it('countJobs returns correct count', () => {
		insertJob(db, { status: 'done' });
		insertJob(db, { status: 'done' });
		insertJob(db, { status: 'failed' });

		expect(queue.countJobs()).toBe(3);
		expect(queue.countJobs({ status: 'done' })).toBe(2);
		expect(queue.countJobs({ status: 'failed' })).toBe(1);
	});
});

// ---------------------------------------------------------------------------
// IndexingPipeline
// ---------------------------------------------------------------------------

describe('IndexingPipeline', () => {
	/** Row read back from `indexing_jobs`; both key casings are declared as optional. */
	type JobRow = {
		id: string;
		repositoryId?: string;
		repository_id?: string;
		status: string;
		versionId?: string;
		version_id?: string;
	};

	/** One file entry as returned by the mocked crawlers. */
	type CrawledFile = { path: string; content: string; sha: string; language: string };

	/** Projection of a `documents` row used by the assertions below. */
	type DocRow = { file_path: string; checksum: string };

	let db: Database.Database;

	// Each test starts from a freshly migrated database holding one local repository.
	beforeEach(() => {
		db = createTestDb();
		insertRepo(db, { source: 'local', source_url: '/tmp/test-repo' });
	});

	/**
	 * Builds a pipeline whose GitHub crawl function and local crawler are both
	 * mocks resolving to `crawlResult` plus fixed `skippedFiles`, `branch` and
	 * `commitSha` values. The fourth constructor argument is passed as `null`.
	 */
	function makePipeline(
		crawlResult: { files: Array<CrawledFile>; totalFiles: number } = { files: [], totalFiles: 0 }
	) {
		const buildCrawlResponse = () => ({
			...crawlResult,
			skippedFiles: 0,
			branch: 'main',
			commitSha: 'abc'
		});

		const mockGithubCrawl = vi.fn().mockResolvedValue(buildCrawlResponse());
		const mockLocalCrawler = { crawl: vi.fn().mockResolvedValue(buildCrawlResponse()) };

		return new IndexingPipeline(
			db,
			mockGithubCrawl as never,
			mockLocalCrawler as never,
			null
		);
	}

	/** Inserts a queued job for `repositoryId` and returns the stored row. */
	function makeJob(repositoryId = '/test/repo'): JobRow {
		const jobId = insertJob(db, { repository_id: repositoryId, status: 'queued' });
		return db.prepare(`SELECT * FROM indexing_jobs WHERE id = ?`).get(jobId) as JobRow;
	}

	/** Returns the number of rows currently stored in `table`. */
	function countRows(table: 'documents' | 'snippets'): number {
		const row = db.prepare(`SELECT COUNT(*) as n FROM ${table}`).get() as { n: number };
		return row.n;
	}

	/**
	 * Returns every snippet id, sorted by id so that two calls can be compared
	 * without depending on SQLite's unspecified row order.
	 */
	function snippetIds(): string[] {
		const rows = db.prepare(`SELECT id FROM snippets ORDER BY id`).all() as { id: string }[];
		return rows.map((r) => r.id);
	}

	/** Returns path and checksum of every document, sorted by path. */
	function listDocs(): DocRow[] {
		return db
			.prepare(`SELECT file_path, checksum FROM documents ORDER BY file_path`)
			.all() as DocRow[];
	}

	it('marks job as done when there are no files to index', async () => {
		const pipeline = makePipeline({ files: [], totalFiles: 0 });
		const job = makeJob();

		await pipeline.run(job as never);

		const updated = db
			.prepare(`SELECT status, progress FROM indexing_jobs WHERE id = ?`)
			.get(job.id) as { status: string; progress: number };
		expect(updated.status).toBe('done');
		expect(updated.progress).toBe(100);
	});

	it('marks job as running then done (final state is done)', async () => {
		const pipeline = makePipeline({ files: [], totalFiles: 0 });
		const job = makeJob();

		await pipeline.run(job as never);

		const updated = db
			.prepare(`SELECT status FROM indexing_jobs WHERE id = ?`)
			.get(job.id) as { status: string };
		// The job should end in 'done' — the running→done transition is covered
		// by the pipeline's internal updateJob calls.
		expect(updated.status).toBe('done');
	});

	it('marks job as failed and repo as error when pipeline throws', async () => {
		// Both crawl entry points reject, so the run fails whichever one is used.
		const errorCrawl = vi.fn().mockRejectedValue(new Error('crawl failed'));
		const pipeline = new IndexingPipeline(
			db,
			errorCrawl as never,
			{ crawl: errorCrawl } as never,
			null
		);

		const job = makeJob();

		await expect(pipeline.run(job as never)).rejects.toThrow('crawl failed');

		const updatedJob = db
			.prepare(`SELECT status, error FROM indexing_jobs WHERE id = ?`)
			.get(job.id) as { status: string; error: string };
		expect(updatedJob.status).toBe('failed');
		expect(updatedJob.error).toBe('crawl failed');

		const updatedRepo = db
			.prepare(`SELECT state FROM repositories WHERE id = '/test/repo'`)
			.get() as { state: string };
		expect(updatedRepo.state).toBe('error');
	});

	it('inserts documents and snippets for new files', async () => {
		const files = [
			{
				path: 'README.md',
				content: '# Hello\n\nThis is documentation.',
				sha: 'sha-readme',
				language: 'markdown'
			}
		];
		const pipeline = makePipeline({ files, totalFiles: 1 });
		const job = makeJob();

		await pipeline.run(job as never);

		expect(countRows('documents')).toBeGreaterThan(0);
		expect(countRows('snippets')).toBeGreaterThan(0);

		const repo = db
			.prepare(`SELECT state, total_snippets FROM repositories WHERE id = '/test/repo'`)
			.get() as { state: string; total_snippets: number };
		expect(repo.state).toBe('indexed');
		expect(repo.total_snippets).toBeGreaterThan(0);
	});

	it('skips unchanged files (checksum match)', async () => {
		// First indexing run.
		const files = [
			{
				path: 'README.md',
				content: '# Hello\n\nThis is documentation.',
				sha: 'sha-readme',
				language: 'markdown'
			}
		];
		const pipeline = makePipeline({ files, totalFiles: 1 });
		const job1 = makeJob();
		await pipeline.run(job1 as never);

		const firstDocCount = countRows('documents');
		const firstSnippetIds = snippetIds();

		// Second run with identical files.
		const job2 = makeJob();
		await pipeline.run(job2 as never);

		const secondDocCount = countRows('documents');
		const secondSnippetIds = snippetIds();

		// Document count stays the same and snippet IDs are unchanged.
		expect(secondDocCount).toBe(firstDocCount);
		expect(secondSnippetIds).toEqual(firstSnippetIds);
	});

	it('replaces snippets atomically when a file changes', async () => {
		const pipeline1 = makePipeline({
			files: [
				{
					path: 'README.md',
					content: '# Original\n\nThis is the original version of the documentation with sufficient content.',
					sha: 'sha-v1',
					language: 'markdown'
				}
			],
			totalFiles: 1
		});
		const job1 = makeJob();
		await pipeline1.run(job1 as never);

		const originalSnippetCount = countRows('snippets');
		expect(originalSnippetCount).toBeGreaterThan(0);

		// Second run with changed file content.
		const pipeline2 = makePipeline({
			files: [
				{
					path: 'README.md',
					content: '# Updated\n\nThis is a completely different version of the documentation with new content.',
					sha: 'sha-v2',
					language: 'markdown'
				}
			],
			totalFiles: 1
		});
		const job2 = makeJob();
		await pipeline2.run(job2 as never);

		// Only one document should exist (the updated one).
		expect(countRows('documents')).toBe(1);

		const finalChecksum = (
			db.prepare(`SELECT checksum FROM documents LIMIT 1`).get() as { checksum: string }
		).checksum;
		expect(finalChecksum).toBe('sha-v2');
	});

	it('updates job progress as files are processed', async () => {
		const files = Array.from({ length: 5 }, (_, i) => ({
			path: `file${i}.md`,
			content: `# File ${i}\n\nContent ${i}.`,
			sha: `sha-${i}`,
			language: 'markdown'
		}));

		const pipeline = makePipeline({ files, totalFiles: 5 });
		const job = makeJob();
		await pipeline.run(job as never);

		const updated = db
			.prepare(`SELECT progress FROM indexing_jobs WHERE id = ?`)
			.get(job.id) as { progress: number };
		expect(updated.progress).toBe(100);
	});

	it('uses the repository source_url when crawling local repositories', async () => {
		const crawl = vi.fn().mockResolvedValue({
			files: [],
			totalFiles: 0,
			skippedFiles: 0,
			branch: 'local',
			commitSha: 'abc'
		});

		const pipeline = new IndexingPipeline(
			db,
			vi.fn() as never,
			{ crawl } as never,
			null
		);

		const job = makeJob();
		await pipeline.run(job as never);

		// rootPath must equal the source_url written by beforeEach.
		expect(crawl).toHaveBeenCalledWith({
			rootPath: '/tmp/test-repo',
			ref: undefined
		});
	});

	it('integration: handles unchanged, modified, added, and deleted files in one run', async () => {
		// ---- First run: index three files -----------------------------------
		const firstFiles = [
			{
				path: 'unchanged.md',
				content: '# Unchanged\n\nThis file never changes.',
				sha: 'sha-unchanged',
				language: 'markdown'
			},
			{
				path: 'will-change.md',
				content: '# Original\n\nThis will be modified in the next run.',
				sha: 'sha-will-change-v1',
				language: 'markdown'
			},
			{
				path: 'will-delete.md',
				content: '# To Be Deleted\n\nThis file will vanish in the next run.',
				sha: 'sha-will-delete',
				language: 'markdown'
			}
		];

		const pipeline1 = makePipeline({ files: firstFiles, totalFiles: 3 });
		const job1 = makeJob();
		await pipeline1.run(job1 as never);

		const afterFirstRun = {
			docs: listDocs(),
			snippetCount: countRows('snippets')
		};
		expect(afterFirstRun.docs).toHaveLength(3);
		expect(afterFirstRun.snippetCount).toBeGreaterThan(0);

		// ---- Second run: add a new file, modify one, delete one, keep one ---
		const secondFiles = [
			{
				path: 'unchanged.md',
				content: '# Unchanged\n\nThis file never changes.',
				sha: 'sha-unchanged', // same sha → should be skipped
				language: 'markdown'
			},
			{
				path: 'will-change.md',
				content: '# Modified\n\nThis file was modified with completely new content.',
				sha: 'sha-will-change-v2', // different sha → should be re-indexed
				language: 'markdown'
			},
			{
				path: 'brand-new.md',
				content: '# Brand New\n\nThis file was added in the second crawl.',
				sha: 'sha-brand-new', // not in DB → should be added
				language: 'markdown'
			}
			// 'will-delete.md' is intentionally absent → should be deleted
		];

		const pipeline2 = makePipeline({ files: secondFiles, totalFiles: 3 });
		const job2 = makeJob();
		await pipeline2.run(job2 as never);

		// ---- Verify final DB state -------------------------------------------
		const finalDocs = listDocs();
		const filePaths = finalDocs.map((d) => d.file_path);

		// unchanged.md: still present, same checksum
		expect(filePaths).toContain('unchanged.md');
		const unchangedDoc = finalDocs.find((d) => d.file_path === 'unchanged.md');
		expect(unchangedDoc?.checksum).toBe('sha-unchanged');

		// will-change.md: present with updated checksum
		expect(filePaths).toContain('will-change.md');
		const changedDoc = finalDocs.find((d) => d.file_path === 'will-change.md');
		expect(changedDoc?.checksum).toBe('sha-will-change-v2');

		// brand-new.md: present (was added in second run)
		expect(filePaths).toContain('brand-new.md');

		// will-delete.md: NOT present (was absent from second crawl)
		expect(filePaths).not.toContain('will-delete.md');

		// Exactly 3 documents remain
		expect(finalDocs).toHaveLength(3);

		// Job ended successfully with full progress
		const finalJob = db
			.prepare(`SELECT status, progress FROM indexing_jobs WHERE id = ?`)
			.get(job2.id) as { status: string; progress: number };
		expect(finalJob.status).toBe('done');
		expect(finalJob.progress).toBe(100);
	});
});