/**
 * Unit tests for IndexingPipeline and JobQueue (TRUEREF-0009).
 *
 * Uses an in-memory SQLite database populated with the same migration SQL
 * as the production database.
 *
 * Related change set:
 * - Move IndexingPipeline.run() into Worker Threads via WorkerPool
 * - Add dedicated embedding worker thread with single model instance
 * - Add stage/stageDetail columns to indexing_jobs schema
 * - Create ProgressBroadcaster for SSE channel management
 * - Add SSE endpoints: GET /api/v1/jobs/:id/stream, GET /api/v1/jobs/stream
 * - Replace UI polling with EventSource on repo detail and admin pages
 * - Add concurrency settings UI and API endpoint
 * - Build worker entries separately via esbuild
 */
import { describe, it, expect, beforeEach, vi } from 'vitest';
|
|
import Database from 'better-sqlite3';
|
|
import { readFileSync } from 'node:fs';
|
|
import { join } from 'node:path';
|
|
import { JobQueue } from './job-queue.js';
|
|
import { IndexingPipeline } from './indexing.pipeline.js';
|
|
import { recoverStaleJobs } from './startup.js';
|
|
import { EmbeddingService } from '$lib/server/embeddings/embedding.service.js';
|
|
import * as diffStrategy from './differential-strategy.js';
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Test DB factory
|
|
// ---------------------------------------------------------------------------
|
|
|
|
function createTestDb(): Database.Database {
|
|
const client = new Database(':memory:');
|
|
client.pragma('foreign_keys = ON');
|
|
|
|
const migrationsFolder = join(import.meta.dirname, '../db/migrations');
|
|
for (const migrationFile of [
|
|
'0000_large_master_chief.sql',
|
|
'0001_quick_nighthawk.sql',
|
|
'0002_silky_stellaris.sql',
|
|
'0003_multiversion_config.sql',
|
|
'0004_complete_sentry.sql'
|
|
]) {
|
|
const migrationSql = readFileSync(join(migrationsFolder, migrationFile), 'utf-8');
|
|
|
|
const statements = migrationSql
|
|
.split('--> statement-breakpoint')
|
|
.map((s) => s.trim())
|
|
.filter(Boolean);
|
|
|
|
for (const stmt of statements) {
|
|
client.exec(stmt);
|
|
}
|
|
}
|
|
|
|
return client;
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Fixtures
|
|
// ---------------------------------------------------------------------------
|
|
|
|
const now = Math.floor(Date.now() / 1000);
|
|
|
|
function insertRepo(db: Database.Database, overrides: Partial<Record<string, unknown>> = {}): void {
|
|
db.prepare(
|
|
`INSERT INTO repositories
|
|
(id, title, source, source_url, branch, state,
|
|
total_snippets, total_tokens, trust_score, benchmark_score,
|
|
stars, github_token, last_indexed_at, created_at, updated_at)
|
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
|
|
).run(
|
|
overrides.id ?? '/test/repo',
|
|
overrides.title ?? 'Test Repo',
|
|
overrides.source ?? 'local',
|
|
overrides.source_url ?? '/tmp/test-repo',
|
|
overrides.branch ?? 'main',
|
|
overrides.state ?? 'pending',
|
|
0,
|
|
0,
|
|
0,
|
|
0,
|
|
null,
|
|
null,
|
|
null,
|
|
now,
|
|
now
|
|
);
|
|
}
|
|
|
|
function insertVersion(
|
|
db: Database.Database,
|
|
overrides: Partial<Record<string, unknown>> = {}
|
|
): string {
|
|
const id = crypto.randomUUID();
|
|
db.prepare(
|
|
`INSERT INTO repository_versions
|
|
(id, repository_id, tag, title, state, total_snippets, indexed_at, created_at)
|
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?)`
|
|
).run(
|
|
overrides.id ?? id,
|
|
overrides.repository_id ?? '/test/repo',
|
|
overrides.tag ?? 'v1.0.0',
|
|
overrides.title ?? null,
|
|
overrides.state ?? 'pending',
|
|
overrides.total_snippets ?? 0,
|
|
overrides.indexed_at ?? null,
|
|
overrides.created_at ?? now
|
|
);
|
|
return (overrides.id as string) ?? id;
|
|
}
|
|
|
|
function insertJob(
|
|
db: Database.Database,
|
|
overrides: Partial<Record<string, unknown>> = {}
|
|
): string {
|
|
const id = crypto.randomUUID();
|
|
db.prepare(
|
|
`INSERT INTO indexing_jobs
|
|
(id, repository_id, version_id, status, progress,
|
|
total_files, processed_files, error, started_at, completed_at, created_at)
|
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
|
|
).run(
|
|
overrides.id ?? id,
|
|
overrides.repository_id ?? '/test/repo',
|
|
overrides.version_id ?? null,
|
|
overrides.status ?? 'queued',
|
|
overrides.progress ?? 0,
|
|
overrides.total_files ?? 0,
|
|
overrides.processed_files ?? 0,
|
|
overrides.error ?? null,
|
|
overrides.started_at ?? null,
|
|
overrides.completed_at ?? null,
|
|
overrides.created_at ?? now
|
|
);
|
|
return (overrides.id as string) ?? id;
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// recoverStaleJobs
|
|
// ---------------------------------------------------------------------------
|
|
|
|
describe('recoverStaleJobs', () => {
|
|
let db: Database.Database;
|
|
|
|
beforeEach(() => {
|
|
db = createTestDb();
|
|
insertRepo(db);
|
|
});
|
|
|
|
it('marks running jobs as failed', () => {
|
|
insertJob(db, { status: 'running' });
|
|
recoverStaleJobs(db);
|
|
|
|
const row = db.prepare(`SELECT status, error FROM indexing_jobs LIMIT 1`).get() as {
|
|
status: string;
|
|
error: string;
|
|
};
|
|
expect(row.status).toBe('failed');
|
|
expect(row.error).toMatch(/restarted/i);
|
|
});
|
|
|
|
it('resets repositories in indexing state to error', () => {
|
|
db.prepare(`UPDATE repositories SET state = 'indexing' WHERE id = '/test/repo'`).run();
|
|
recoverStaleJobs(db);
|
|
|
|
const row = db.prepare(`SELECT state FROM repositories WHERE id = '/test/repo'`).get() as {
|
|
state: string;
|
|
};
|
|
expect(row.state).toBe('error');
|
|
});
|
|
|
|
it('leaves queued and done jobs untouched', () => {
|
|
insertJob(db, { status: 'queued' });
|
|
insertJob(db, { status: 'done' });
|
|
recoverStaleJobs(db);
|
|
|
|
const rows = db
|
|
.prepare(`SELECT status FROM indexing_jobs WHERE status IN ('queued', 'done')`)
|
|
.all() as { status: string }[];
|
|
expect(rows).toHaveLength(2);
|
|
});
|
|
});
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// JobQueue
|
|
// ---------------------------------------------------------------------------
|
|
|
|
describe('JobQueue', () => {
|
|
let db: Database.Database;
|
|
let queue: JobQueue;
|
|
|
|
beforeEach(() => {
|
|
db = createTestDb();
|
|
insertRepo(db);
|
|
queue = new JobQueue(db);
|
|
});
|
|
|
|
it('enqueues a new job and returns it', () => {
|
|
const job = queue.enqueue('/test/repo');
|
|
expect(job.status).toBe('queued');
|
|
expect(job.repositoryId ?? (job as unknown as { repository_id: string }).repository_id).toBe(
|
|
'/test/repo'
|
|
);
|
|
});
|
|
|
|
it('deduplicates: returns existing active job instead of creating a new one', () => {
|
|
const job1 = queue.enqueue('/test/repo');
|
|
const job2 = queue.enqueue('/test/repo');
|
|
expect(job1.id).toBe(job2.id);
|
|
|
|
const count = (db.prepare(`SELECT COUNT(*) as n FROM indexing_jobs`).get() as { n: number }).n;
|
|
expect(count).toBe(1);
|
|
});
|
|
|
|
it('getJob returns null for unknown ID', () => {
|
|
expect(queue.getJob('non-existent')).toBeNull();
|
|
});
|
|
|
|
it('getJob returns the job if it exists', () => {
|
|
const jobId = insertJob(db, { status: 'done' });
|
|
const job = queue.getJob(jobId);
|
|
expect(job).not.toBeNull();
|
|
expect(job!.id).toBe(jobId);
|
|
});
|
|
|
|
it('listJobs returns all jobs ordered by created_at desc', () => {
|
|
insertJob(db, { created_at: now - 10, status: 'done' });
|
|
insertJob(db, { created_at: now - 5, status: 'done' });
|
|
insertJob(db, { created_at: now, status: 'queued' });
|
|
|
|
const jobs = queue.listJobs();
|
|
expect(jobs.length).toBeGreaterThanOrEqual(3);
|
|
// Most recent first.
|
|
expect(jobs[0].status).toBe('queued');
|
|
});
|
|
|
|
it('listJobs filters by repositoryId', () => {
|
|
insertRepo(db, { id: '/other/repo', source_url: '/tmp/other' });
|
|
insertJob(db, { repository_id: '/other/repo', status: 'done' });
|
|
insertJob(db, { status: 'queued' });
|
|
|
|
const jobs = queue.listJobs({ repositoryId: '/other/repo' });
|
|
expect(jobs).toHaveLength(1);
|
|
});
|
|
|
|
it('listJobs filters by status', () => {
|
|
insertJob(db, { status: 'queued' });
|
|
insertJob(db, { status: 'done' });
|
|
insertJob(db, { status: 'failed' });
|
|
|
|
const queued = queue.listJobs({ status: 'queued' });
|
|
expect(queued.every((j) => j.status === 'queued')).toBe(true);
|
|
});
|
|
|
|
it('countJobs returns correct count', () => {
|
|
insertJob(db, { status: 'done' });
|
|
insertJob(db, { status: 'done' });
|
|
insertJob(db, { status: 'failed' });
|
|
|
|
expect(queue.countJobs()).toBe(3);
|
|
expect(queue.countJobs({ status: 'done' })).toBe(2);
|
|
expect(queue.countJobs({ status: 'failed' })).toBe(1);
|
|
});
|
|
});
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// IndexingPipeline
|
|
// ---------------------------------------------------------------------------
|
|
|
|
describe('IndexingPipeline', () => {
|
|
let db: Database.Database;

beforeEach(() => {
  // Fresh schema plus a local repository fixture before every test.
  db = createTestDb();
  insertRepo(db, { source: 'local', source_url: '/tmp/test-repo' });
});
|
|
|
|
function makePipeline(
|
|
crawlResult: {
|
|
files: Array<{ path: string; content: string; sha: string; language: string }>;
|
|
totalFiles: number;
|
|
/** Optional pre-parsed config — simulates LocalCrawler returning CrawlResult.config. */
|
|
config?: Record<string, unknown>;
|
|
} = { files: [], totalFiles: 0 },
|
|
embeddingService: EmbeddingService | null = null
|
|
) {
|
|
const mockGithubCrawl = vi.fn().mockResolvedValue({
|
|
...crawlResult,
|
|
skippedFiles: 0,
|
|
branch: 'main',
|
|
commitSha: 'abc'
|
|
});
|
|
|
|
const mockLocalCrawler = {
|
|
crawl: vi.fn().mockResolvedValue({
|
|
...crawlResult,
|
|
skippedFiles: 0,
|
|
branch: 'main',
|
|
commitSha: 'abc'
|
|
})
|
|
};
|
|
|
|
return new IndexingPipeline(
|
|
db,
|
|
mockGithubCrawl as never,
|
|
mockLocalCrawler as never,
|
|
embeddingService
|
|
);
|
|
}
|
|
|
|
function makeJob(repositoryId = '/test/repo', versionId?: string) {
|
|
const jobId = insertJob(db, {
|
|
repository_id: repositoryId,
|
|
version_id: versionId ?? null,
|
|
status: 'queued'
|
|
});
|
|
return db.prepare(`SELECT * FROM indexing_jobs WHERE id = ?`).get(jobId) as {
|
|
id: string;
|
|
repositoryId?: string;
|
|
repository_id?: string;
|
|
status: string;
|
|
versionId?: string;
|
|
version_id?: string;
|
|
};
|
|
}
|
|
|
|
it('marks job as done when there are no files to index', async () => {
  // An empty crawl must still complete the job at 100% progress.
  const pipeline = makePipeline({ files: [], totalFiles: 0 });
  const job = makeJob();

  await pipeline.run(job as never);

  const updated = db
    .prepare(`SELECT status, progress FROM indexing_jobs WHERE id = ?`)
    .get(job.id) as { status: string; progress: number };
  expect(updated.status).toBe('done');
  expect(updated.progress).toBe(100);
});

it('marks job as running then done (final state is done)', async () => {
  const pipeline = makePipeline({ files: [], totalFiles: 0 });
  const job = makeJob();

  await pipeline.run(job as never);

  const updated = db.prepare(`SELECT status FROM indexing_jobs WHERE id = ?`).get(job.id) as {
    status: string;
  };
  // The job should end in 'done' — the running→done transition is covered
  // by the pipeline's internal updateJob calls.
  expect(updated.status).toBe('done');
});

it('marks job as failed and repo as error when pipeline throws', async () => {
  // Both crawler paths reject, so run() must rethrow, record the failure on
  // the job row, and flip the repository into the 'error' state.
  const errorCrawl = vi.fn().mockRejectedValue(new Error('crawl failed'));
  const pipeline = new IndexingPipeline(
    db,
    errorCrawl as never,
    { crawl: errorCrawl } as never,
    null
  );

  const job = makeJob();

  await expect(pipeline.run(job as never)).rejects.toThrow('crawl failed');

  const updatedJob = db
    .prepare(`SELECT status, error FROM indexing_jobs WHERE id = ?`)
    .get(job.id) as { status: string; error: string };
  expect(updatedJob.status).toBe('failed');
  expect(updatedJob.error).toBe('crawl failed');

  const updatedRepo = db
    .prepare(`SELECT state FROM repositories WHERE id = '/test/repo'`)
    .get() as { state: string };
  expect(updatedRepo.state).toBe('error');
});
|
|
|
|
it('inserts documents and snippets for new files', async () => {
  // A single markdown file should produce at least one document + snippet
  // and leave the repository in the 'indexed' state with counters updated.
  const files = [
    {
      path: 'README.md',
      content: '# Hello\n\nThis is documentation.',
      sha: 'sha-readme',
      language: 'markdown'
    }
  ];
  const pipeline = makePipeline({ files, totalFiles: 1 });
  const job = makeJob();

  await pipeline.run(job as never);

  const docs = db.prepare(`SELECT * FROM documents`).all() as unknown[];
  expect(docs.length).toBeGreaterThan(0);

  const snippets = db.prepare(`SELECT * FROM snippets`).all() as unknown[];
  expect(snippets.length).toBeGreaterThan(0);

  const repo = db
    .prepare(`SELECT state, total_snippets FROM repositories WHERE id = '/test/repo'`)
    .get() as { state: string; total_snippets: number };
  expect(repo.state).toBe('indexed');
  expect(repo.total_snippets).toBeGreaterThan(0);
});

it('skips unchanged files (checksum match)', async () => {
  // First indexing run.
  const files = [
    {
      path: 'README.md',
      content: '# Hello\n\nThis is documentation.',
      sha: 'sha-readme',
      language: 'markdown'
    }
  ];
  const pipeline = makePipeline({ files, totalFiles: 1 });
  const job1 = makeJob();
  await pipeline.run(job1 as never);

  const firstDocCount = (db.prepare(`SELECT COUNT(*) as n FROM documents`).get() as { n: number })
    .n;
  const firstSnippetIds = (db.prepare(`SELECT id FROM snippets`).all() as { id: string }[]).map(
    (r) => r.id
  );

  // Second run with identical files.
  const job2Id = insertJob(db, { repository_id: '/test/repo', status: 'queued' });
  const job2 = db.prepare(`SELECT * FROM indexing_jobs WHERE id = ?`).get(job2Id) as never;

  await pipeline.run(job2);

  const secondDocCount = (
    db.prepare(`SELECT COUNT(*) as n FROM documents`).get() as { n: number }
  ).n;
  const secondSnippetIds = (db.prepare(`SELECT id FROM snippets`).all() as { id: string }[]).map(
    (r) => r.id
  );

  // Document count stays the same and snippet IDs are unchanged.
  expect(secondDocCount).toBe(firstDocCount);
  expect(secondSnippetIds).toEqual(firstSnippetIds);
});
|
|
|
|
it('re-index backfills missing embeddings for unchanged snippets', async () => {
  // Stub provider returns a fixed 3-dim vector for every text.
  const provider = {
    name: 'test-provider',
    model: 'test-model',
    dimensions: 3,
    embed: vi.fn(async (texts: string[]) =>
      texts.map(() => ({
        values: new Float32Array([0.1, 0.2, 0.3]),
        dimensions: 3,
        model: 'test-model'
      }))
    ),
    isAvailable: vi.fn(async () => true)
  };
  const embeddingService = new EmbeddingService(db, provider, 'local-default');
  const files = [
    {
      path: 'README.md',
      content: '# Hello\n\nThis is documentation.',
      sha: 'sha-readme',
      language: 'markdown'
    }
  ];

  const pipeline = makePipeline({ files, totalFiles: 1 }, embeddingService);
  const job1 = makeJob();
  await pipeline.run(job1 as never);

  const firstSnippetIds = (db.prepare(`SELECT id FROM snippets ORDER BY id`).all() as { id: string }[])
    .map((row) => row.id);
  expect(firstSnippetIds.length).toBeGreaterThan(0);

  // After the first run, every snippet has an embedding under the profile.
  const firstEmbeddingCount = (
    db.prepare(`SELECT COUNT(*) as n FROM snippet_embeddings WHERE profile_id = 'local-default'`).get() as {
      n: number;
    }
  ).n;
  expect(firstEmbeddingCount).toBe(firstSnippetIds.length);

  // Simulate embeddings going missing (e.g. profile wiped) without touching snippets.
  db.prepare(`DELETE FROM snippet_embeddings WHERE profile_id = 'local-default'`).run();

  // Re-index with identical files: snippets are unchanged but embeddings
  // must be regenerated for the profile.
  const job2Id = insertJob(db, { repository_id: '/test/repo', status: 'queued' });
  const job2 = db.prepare(`SELECT * FROM indexing_jobs WHERE id = ?`).get(job2Id) as never;
  await pipeline.run(job2);

  const secondSnippetIds = (db.prepare(`SELECT id FROM snippets ORDER BY id`).all() as {
    id: string;
  }[]).map((row) => row.id);
  const secondEmbeddingCount = (
    db.prepare(`SELECT COUNT(*) as n FROM snippet_embeddings WHERE profile_id = 'local-default'`).get() as {
      n: number;
    }
  ).n;

  expect(secondSnippetIds).toEqual(firstSnippetIds);
  expect(secondEmbeddingCount).toBe(firstSnippetIds.length);
});
|
|
|
|
it('replaces snippets atomically when a file changes', async () => {
  // First run indexes the original content.
  const pipeline1 = makePipeline({
    files: [
      {
        path: 'README.md',
        content:
          '# Original\n\nThis is the original version of the documentation with sufficient content.',
        sha: 'sha-v1',
        language: 'markdown'
      }
    ],
    totalFiles: 1
  });
  const job1 = makeJob();
  await pipeline1.run(job1 as never);

  const originalSnippetCount = (
    db.prepare(`SELECT COUNT(*) as n FROM snippets`).get() as { n: number }
  ).n;
  expect(originalSnippetCount).toBeGreaterThan(0);

  // Second run with changed file content.
  const pipeline2 = makePipeline({
    files: [
      {
        path: 'README.md',
        content:
          '# Updated\n\nThis is a completely different version of the documentation with new content.',
        sha: 'sha-v2',
        language: 'markdown'
      }
    ],
    totalFiles: 1
  });
  const job2Id = insertJob(db, { repository_id: '/test/repo', status: 'queued' });
  const job2 = db.prepare(`SELECT * FROM indexing_jobs WHERE id = ?`).get(job2Id) as never;
  await pipeline2.run(job2);

  const finalDocCount = (db.prepare(`SELECT COUNT(*) as n FROM documents`).get() as { n: number })
    .n;
  // Only one document should exist (the updated one).
  expect(finalDocCount).toBe(1);

  // And it must carry the new checksum — no stale sha-v1 row left behind.
  const finalChecksum = (
    db.prepare(`SELECT checksum FROM documents LIMIT 1`).get() as { checksum: string }
  ).checksum;
  expect(finalChecksum).toBe('sha-v2');
});
|
|
|
|
it('updates job progress as files are processed', async () => {
  // Five small files; after a full run the progress column must read 100.
  const files = Array.from({ length: 5 }, (_, i) => ({
    path: `file${i}.md`,
    content: `# File ${i}\n\nContent ${i}.`,
    sha: `sha-${i}`,
    language: 'markdown'
  }));

  const pipeline = makePipeline({ files, totalFiles: 5 });
  const job = makeJob();
  await pipeline.run(job as never);

  const updated = db.prepare(`SELECT progress FROM indexing_jobs WHERE id = ?`).get(job.id) as {
    progress: number;
  };
  expect(updated.progress).toBe(100);
});

it('uses the repository source_url when crawling local repositories', async () => {
  const crawl = vi.fn().mockResolvedValue({
    files: [],
    totalFiles: 0,
    skippedFiles: 0,
    branch: 'local',
    commitSha: 'abc'
  });

  const pipeline = new IndexingPipeline(db, vi.fn() as never, { crawl } as never, null);

  const job = makeJob();
  await pipeline.run(job as never);

  // The local crawler must receive the fixture's source_url as rootPath;
  // no versionId on the job means ref stays undefined.
  expect(crawl).toHaveBeenCalledWith({
    rootPath: '/tmp/test-repo',
    ref: undefined
  });
});
|
|
|
|
it('integration: handles unchanged, modified, added, and deleted files in one run', async () => {
  // ---- First run: index three files -----------------------------------
  const firstFiles = [
    {
      path: 'unchanged.md',
      content: '# Unchanged\n\nThis file never changes.',
      sha: 'sha-unchanged',
      language: 'markdown'
    },
    {
      path: 'will-change.md',
      content: '# Original\n\nThis will be modified in the next run.',
      sha: 'sha-will-change-v1',
      language: 'markdown'
    },
    {
      path: 'will-delete.md',
      content: '# To Be Deleted\n\nThis file will vanish in the next run.',
      sha: 'sha-will-delete',
      language: 'markdown'
    }
  ];

  const pipeline1 = makePipeline({ files: firstFiles, totalFiles: 3 });
  const job1 = makeJob();
  await pipeline1.run(job1 as never);

  const afterFirstRun = {
    docs: db.prepare(`SELECT file_path, checksum FROM documents ORDER BY file_path`).all() as {
      file_path: string;
      checksum: string;
    }[],
    snippetCount: (db.prepare(`SELECT COUNT(*) as n FROM snippets`).get() as { n: number }).n
  };
  expect(afterFirstRun.docs).toHaveLength(3);
  expect(afterFirstRun.snippetCount).toBeGreaterThan(0);

  // ---- Second run: add a new file, modify one, delete one, keep one ---
  const secondFiles = [
    {
      path: 'unchanged.md',
      content: '# Unchanged\n\nThis file never changes.',
      sha: 'sha-unchanged', // same sha → should be skipped
      language: 'markdown'
    },
    {
      path: 'will-change.md',
      content: '# Modified\n\nThis file was modified with completely new content.',
      sha: 'sha-will-change-v2', // different sha → should be re-indexed
      language: 'markdown'
    },
    {
      path: 'brand-new.md',
      content: '# Brand New\n\nThis file was added in the second crawl.',
      sha: 'sha-brand-new', // not in DB → should be added
      language: 'markdown'
    }
    // 'will-delete.md' is intentionally absent → should be deleted
  ];

  const pipeline2 = makePipeline({ files: secondFiles, totalFiles: 3 });
  const job2Id = insertJob(db, { repository_id: '/test/repo', status: 'queued' });
  const job2 = db.prepare(`SELECT * FROM indexing_jobs WHERE id = ?`).get(job2Id) as never;
  await pipeline2.run(job2);

  // ---- Verify final DB state -------------------------------------------
  const finalDocs = db
    .prepare(`SELECT file_path, checksum FROM documents ORDER BY file_path`)
    .all() as { file_path: string; checksum: string }[];

  const filePaths = finalDocs.map((d) => d.file_path);

  // unchanged.md: still present, same checksum
  expect(filePaths).toContain('unchanged.md');
  const unchangedDoc = finalDocs.find((d) => d.file_path === 'unchanged.md');
  expect(unchangedDoc?.checksum).toBe('sha-unchanged');

  // will-change.md: present with updated checksum
  expect(filePaths).toContain('will-change.md');
  const changedDoc = finalDocs.find((d) => d.file_path === 'will-change.md');
  expect(changedDoc?.checksum).toBe('sha-will-change-v2');

  // brand-new.md: present (was added in second run)
  expect(filePaths).toContain('brand-new.md');

  // will-delete.md: NOT present (was absent from second crawl)
  expect(filePaths).not.toContain('will-delete.md');

  // Exactly 3 documents remain
  expect(finalDocs).toHaveLength(3);

  // Job ended successfully with full progress
  const finalJob = db
    .prepare(`SELECT status, progress FROM indexing_jobs WHERE id = ?`)
    .get(job2Id) as { status: string; progress: number };
  expect(finalJob.status).toBe('done');
  expect(finalJob.progress).toBe(100);
});
|
|
|
|
it('updates repository_versions state to indexing then indexed when job has versionId', async () => {
  // A version-scoped job must finish with the version marked 'indexed',
  // with snippet counts and an indexed_at timestamp recorded.
  const versionId = insertVersion(db, { tag: 'v1.0.0', state: 'pending' });
  const files = [
    {
      path: 'README.md',
      content: '# Hello\n\nThis is documentation.',
      sha: 'sha-readme',
      language: 'markdown'
    }
  ];
  const pipeline = makePipeline({ files, totalFiles: 1 });
  const job = makeJob('/test/repo', versionId);

  await pipeline.run(job as never);

  const version = db
    .prepare(`SELECT state, total_snippets, indexed_at FROM repository_versions WHERE id = ?`)
    .get(versionId) as { state: string; total_snippets: number; indexed_at: number | null };

  expect(version.state).toBe('indexed');
  expect(version.total_snippets).toBeGreaterThan(0);
  expect(version.indexed_at).not.toBeNull();
});

it('updates repository_versions state to error when pipeline throws and job has versionId', async () => {
  // A crawl failure on a version job must propagate to the version row.
  const versionId = insertVersion(db, { tag: 'v1.0.0', state: 'pending' });
  const errorCrawl = vi.fn().mockRejectedValue(new Error('crawl failed'));
  const pipeline = new IndexingPipeline(
    db,
    errorCrawl as never,
    { crawl: errorCrawl } as never,
    null
  );
  const job = makeJob('/test/repo', versionId);

  await expect(pipeline.run(job as never)).rejects.toThrow('crawl failed');

  const version = db
    .prepare(`SELECT state FROM repository_versions WHERE id = ?`)
    .get(versionId) as { state: string };

  expect(version.state).toBe('error');
});

it('does not touch repository_versions when job has no versionId', async () => {
  const versionId = insertVersion(db, { tag: 'v1.0.0', state: 'pending' });
  const pipeline = makePipeline({ files: [], totalFiles: 0 });
  const job = makeJob('/test/repo'); // no versionId

  await pipeline.run(job as never);

  const version = db
    .prepare(`SELECT state FROM repository_versions WHERE id = ?`)
    .get(versionId) as { state: string };

  // State should remain 'pending' — pipeline with no versionId must not touch it
  expect(version.state).toBe('pending');
});
|
|
|
|
it('calls LocalCrawler with ref=v1.2.0 when job has a versionId with tag v1.2.0', async () => {
  // The version tag must be passed through to the crawler as the git ref.
  const versionId = insertVersion(db, { tag: 'v1.2.0', state: 'pending' });

  const crawl = vi.fn().mockResolvedValue({
    files: [],
    totalFiles: 0,
    skippedFiles: 0,
    branch: 'main',
    commitSha: 'abc'
  });

  const pipeline = new IndexingPipeline(db, vi.fn() as never, { crawl } as never, null);
  const job = makeJob('/test/repo', versionId);

  await pipeline.run(job as never);

  expect(crawl).toHaveBeenCalledWith({
    rootPath: '/tmp/test-repo',
    ref: 'v1.2.0'
  });
});

it('calls LocalCrawler with ref=undefined when job has no versionId (main-branch)', async () => {
  const crawl = vi.fn().mockResolvedValue({
    files: [],
    totalFiles: 0,
    skippedFiles: 0,
    branch: 'main',
    commitSha: 'abc'
  });

  const pipeline = new IndexingPipeline(db, vi.fn() as never, { crawl } as never, null);
  const job = makeJob('/test/repo'); // no versionId

  await pipeline.run(job as never);

  // Main-branch jobs crawl the working tree: no ref is supplied.
  expect(crawl).toHaveBeenCalledWith({
    rootPath: '/tmp/test-repo',
    ref: undefined
  });
});
|
|
|
|
it('excludes files matching excludeFiles patterns from trueref.json', async () => {
  // trueref.json excludes one file by exact name and one by glob prefix.
  const truerefConfig = JSON.stringify({
    excludeFiles: ['migration-guide.md', 'docs/legacy*']
  });
  const files = [
    {
      path: 'trueref.json',
      content: truerefConfig,
      sha: 'sha-config',
      language: 'json'
    },
    {
      path: 'README.md',
      content: '# Hello\n\nThis is documentation.',
      sha: 'sha-readme',
      language: 'markdown'
    },
    {
      path: 'migration-guide.md',
      content: '# Migration Guide\n\nThis should be excluded.',
      sha: 'sha-migration',
      language: 'markdown'
    },
    {
      path: 'docs/legacy-api.md',
      content: '# Legacy API\n\nShould be excluded by glob prefix.',
      sha: 'sha-legacy',
      language: 'markdown'
    }
  ];
  const pipeline = makePipeline({ files, totalFiles: files.length });
  const job = makeJob();

  await pipeline.run(job as never);

  const docs = db
    .prepare(`SELECT file_path FROM documents ORDER BY file_path`)
    .all() as { file_path: string }[];
  const filePaths = docs.map((d) => d.file_path);

  // migration-guide.md and docs/legacy-api.md must be absent.
  expect(filePaths).not.toContain('migration-guide.md');
  expect(filePaths).not.toContain('docs/legacy-api.md');

  // README.md must still be indexed.
  expect(filePaths).toContain('README.md');
});
|
|
|
|
it('persists repo-wide rules from trueref.json to repository_configs after indexing', async () => {
  // A main-branch job writes rules under the NULL version_id row.
  const truerefConfig = JSON.stringify({
    rules: ['Always use TypeScript strict mode', 'Prefer async/await over callbacks']
  });
  const files = [
    {
      path: 'trueref.json',
      content: truerefConfig,
      sha: 'sha-config',
      language: 'json'
    }
  ];
  const pipeline = makePipeline({ files, totalFiles: files.length });
  const job = makeJob();

  await pipeline.run(job as never);

  const row = db
    .prepare(
      `SELECT rules FROM repository_configs WHERE repository_id = '/test/repo' AND version_id IS NULL`
    )
    .get() as { rules: string } | undefined;

  expect(row).toBeDefined();
  const rules = JSON.parse(row!.rules);
  expect(rules).toEqual(['Always use TypeScript strict mode', 'Prefer async/await over callbacks']);
});
|
|
|
|
it('persists version-specific rules under (repositoryId, versionId) when job has versionId', async () => {
  const versionId = insertVersion(db, { tag: 'v2.0.0', state: 'pending' });
  const truerefConfig = JSON.stringify({
    rules: ['This is v2. Use the new Builder API.']
  });
  const files = [
    {
      path: 'trueref.json',
      content: truerefConfig,
      sha: 'sha-config',
      language: 'json'
    }
  ];
  const pipeline = makePipeline({ files, totalFiles: files.length });
  const job = makeJob('/test/repo', versionId);

  await pipeline.run(job as never);

  // Repo-wide row (version_id IS NULL) must NOT be written by a version job —
  // writing it here would contaminate the NULL entry with version-specific rules
  // (Bug 5b regression guard).
  const repoRow = db
    .prepare(
      `SELECT rules FROM repository_configs WHERE repository_id = '/test/repo' AND version_id IS NULL`
    )
    .get() as { rules: string } | undefined;
  expect(repoRow).toBeUndefined();

  // Version-specific row must exist with the correct rules.
  const versionRow = db
    .prepare(
      `SELECT rules FROM repository_configs WHERE repository_id = '/test/repo' AND version_id = ?`
    )
    .get(versionId) as { rules: string } | undefined;
  expect(versionRow).toBeDefined();
  const rules = JSON.parse(versionRow!.rules);
  expect(rules).toEqual(['This is v2. Use the new Builder API.']);
});
|
|
|
|
it('regression(Bug5b): version job does not overwrite the repo-wide NULL rules entry', async () => {
  // Arrange: index the main branch first to establish a repo-wide rules entry.
  const mainBranchRules = ['Always use TypeScript strict mode.'];
  const mainPipeline = makePipeline({
    files: [
      {
        path: 'trueref.json',
        content: JSON.stringify({ rules: mainBranchRules }),
        sha: 'sha-main-config',
        language: 'json'
      }
    ],
    totalFiles: 1
  });
  const mainJob = makeJob('/test/repo'); // no versionId → main-branch job
  await mainPipeline.run(mainJob as never);

  // Confirm the repo-wide entry was written.
  const afterMain = db
    .prepare(
      `SELECT rules FROM repository_configs WHERE repository_id = '/test/repo' AND version_id IS NULL`
    )
    .get() as { rules: string } | undefined;
  expect(afterMain).toBeDefined();
  expect(JSON.parse(afterMain!.rules)).toEqual(mainBranchRules);

  // Act: index a version with different rules.
  const versionId = insertVersion(db, { tag: 'v3.0.0', state: 'pending' });
  const versionRules = ['v3 only: use the streaming API.'];
  const versionPipeline = makePipeline({
    files: [
      {
        path: 'trueref.json',
        content: JSON.stringify({ rules: versionRules }),
        sha: 'sha-v3-config',
        language: 'json'
      }
    ],
    totalFiles: 1
  });
  const versionJob = makeJob('/test/repo', versionId);
  await versionPipeline.run(versionJob as never);

  // Assert: the repo-wide NULL entry must still contain the main-branch rules,
  // not the version-specific ones.
  const afterVersion = db
    .prepare(
      `SELECT rules FROM repository_configs WHERE repository_id = '/test/repo' AND version_id IS NULL`
    )
    .get() as { rules: string } | undefined;
  expect(afterVersion).toBeDefined();
  expect(JSON.parse(afterVersion!.rules)).toEqual(mainBranchRules);

  // And the version-specific row must contain the version rules.
  const versionRow = db
    .prepare(
      `SELECT rules FROM repository_configs WHERE repository_id = '/test/repo' AND version_id = ?`
    )
    .get(versionId) as { rules: string } | undefined;
  expect(versionRow).toBeDefined();
  expect(JSON.parse(versionRow!.rules)).toEqual(versionRules);
});
|
|
|
|
it('persists rules from CrawlResult.config even when trueref.json is absent from files (folders allowlist bug)', async () => {
|
|
// Regression test for MULTIVERSION-0001:
|
|
// When trueref.json specifies a `folders` allowlist (e.g. ["src/"]),
|
|
// shouldIndexFile() excludes trueref.json itself because it lives at the
|
|
// repo root. The LocalCrawler now carries the pre-parsed config in
|
|
// CrawlResult.config so the pipeline no longer needs to find the file in
|
|
// crawlResult.files[].
|
|
const pipeline = makePipeline({
|
|
// trueref.json is NOT in files — simulates it being excluded by folders allowlist.
|
|
files: [
|
|
{
|
|
path: 'src/index.ts',
|
|
content: 'export const x = 1;',
|
|
sha: 'sha-src',
|
|
language: 'typescript'
|
|
}
|
|
],
|
|
totalFiles: 1,
|
|
// The pre-parsed config is carried here instead (set by LocalCrawler).
|
|
config: { rules: ['Use strict TypeScript.', 'Avoid any.'] }
|
|
});
|
|
const job = makeJob();
|
|
|
|
await pipeline.run(job as never);
|
|
|
|
const row = db
|
|
.prepare(
|
|
`SELECT rules FROM repository_configs WHERE repository_id = '/test/repo' AND version_id IS NULL`
|
|
)
|
|
.get() as { rules: string } | undefined;
|
|
|
|
expect(row).toBeDefined();
|
|
const rules = JSON.parse(row!.rules);
|
|
expect(rules).toEqual(['Use strict TypeScript.', 'Avoid any.']);
|
|
});
|
|
|
|
it('persists version-specific rules from CrawlResult.config when trueref.json is excluded by folders allowlist', async () => {
|
|
const versionId = insertVersion(db, { tag: 'v3.0.0', state: 'pending' });
|
|
|
|
const pipeline = makePipeline({
|
|
files: [
|
|
{
|
|
path: 'src/index.ts',
|
|
content: 'export const x = 1;',
|
|
sha: 'sha-src',
|
|
language: 'typescript'
|
|
}
|
|
],
|
|
totalFiles: 1,
|
|
config: { rules: ['v3: use the streaming API.'] }
|
|
});
|
|
const job = makeJob('/test/repo', versionId);
|
|
|
|
await pipeline.run(job as never);
|
|
|
|
const versionRow = db
|
|
.prepare(
|
|
`SELECT rules FROM repository_configs WHERE repository_id = '/test/repo' AND version_id = ?`
|
|
)
|
|
.get(versionId) as { rules: string } | undefined;
|
|
|
|
expect(versionRow).toBeDefined();
|
|
const rules = JSON.parse(versionRow!.rules);
|
|
expect(rules).toEqual(['v3: use the streaming API.']);
|
|
});
|
|
});
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// differential indexing
|
|
// ---------------------------------------------------------------------------
|
|
|
|
describe('differential indexing', () => {
|
|
// Fresh in-memory database for every test; each gets a single local
// repository row so '/test/repo' documents can reference it.
let db: Database.Database;

beforeEach(() => {
  db = createTestDb();
  insertRepo(db, { source: 'local', source_url: '/tmp/test-repo' });
});
|
|
|
|
function insertDocument(
|
|
localDb: Database.Database,
|
|
overrides: Partial<Record<string, unknown>> = {}
|
|
): string {
|
|
const id = crypto.randomUUID();
|
|
localDb
|
|
.prepare(
|
|
`INSERT INTO documents (id, repository_id, version_id, file_path, title, language, token_count, checksum, indexed_at)
|
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`
|
|
)
|
|
.run(
|
|
(overrides.id as string) ?? id,
|
|
(overrides.repository_id as string) ?? '/test/repo',
|
|
(overrides.version_id as string | null) ?? null,
|
|
(overrides.file_path as string) ?? 'README.md',
|
|
null,
|
|
'markdown',
|
|
100,
|
|
(overrides.checksum as string) ?? 'abc123',
|
|
Math.floor(Date.now() / 1000)
|
|
);
|
|
return (overrides.id as string) ?? id;
|
|
}
|
|
|
|
function insertSnippet(
|
|
localDb: Database.Database,
|
|
documentId: string,
|
|
overrides: Partial<Record<string, unknown>> = {}
|
|
): string {
|
|
const id = crypto.randomUUID();
|
|
localDb
|
|
.prepare(
|
|
`INSERT INTO snippets (id, document_id, repository_id, version_id, type, title, content, language, breadcrumb, token_count, created_at)
|
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
|
|
)
|
|
.run(
|
|
(overrides.id as string) ?? id,
|
|
documentId,
|
|
(overrides.repository_id as string) ?? '/test/repo',
|
|
(overrides.version_id as string | null) ?? null,
|
|
'info',
|
|
null,
|
|
'content',
|
|
'markdown',
|
|
null,
|
|
10,
|
|
Math.floor(Date.now() / 1000)
|
|
);
|
|
return (overrides.id as string) ?? id;
|
|
}
|
|
|
|
// Widened view of IndexingPipeline exposing the private cloneFromAncestor()
// method, so tests can drive differential cloning directly without going
// through run(). Cast with `pipeline as unknown as PipelineInternals`.
type PipelineInternals = IndexingPipeline & {
  cloneFromAncestor: (
    ancestorVersionId: string,
    targetVersionId: string,
    repositoryId: string,
    unchangedPaths: Set<string>
  ) => void;
};
|
|
|
|
it('cloneFromAncestor inserts documents and snippets into the target version', () => {
|
|
const ancestorVersionId = insertVersion(db, { tag: 'v1.0.0', state: 'indexed' });
|
|
const targetVersionId = insertVersion(db, { tag: 'v1.1.0', state: 'pending' });
|
|
|
|
const doc1Id = insertDocument(db, {
|
|
repository_id: '/test/repo',
|
|
version_id: ancestorVersionId,
|
|
file_path: 'README.md',
|
|
checksum: 'sha-readme'
|
|
});
|
|
const doc2Id = insertDocument(db, {
|
|
repository_id: '/test/repo',
|
|
version_id: ancestorVersionId,
|
|
file_path: 'src/index.ts',
|
|
checksum: 'sha-index'
|
|
});
|
|
insertSnippet(db, doc1Id, { repository_id: '/test/repo', version_id: ancestorVersionId });
|
|
insertSnippet(db, doc2Id, { repository_id: '/test/repo', version_id: ancestorVersionId });
|
|
|
|
const pipeline = new IndexingPipeline(
|
|
db,
|
|
vi.fn() as never,
|
|
{ crawl: vi.fn() } as never,
|
|
null
|
|
);
|
|
(pipeline as unknown as PipelineInternals).cloneFromAncestor(
|
|
ancestorVersionId,
|
|
targetVersionId,
|
|
'/test/repo',
|
|
new Set(['README.md', 'src/index.ts'])
|
|
);
|
|
|
|
const targetDocs = db
|
|
.prepare(`SELECT * FROM documents WHERE version_id = ?`)
|
|
.all(targetVersionId) as { id: string; file_path: string }[];
|
|
expect(targetDocs).toHaveLength(2);
|
|
expect(targetDocs.map((d) => d.file_path).sort()).toEqual(
|
|
['README.md', 'src/index.ts'].sort()
|
|
);
|
|
// New IDs must differ from ancestor doc IDs.
|
|
const targetDocIds = targetDocs.map((d) => d.id);
|
|
expect(targetDocIds).not.toContain(doc1Id);
|
|
expect(targetDocIds).not.toContain(doc2Id);
|
|
|
|
const targetSnippets = db
|
|
.prepare(`SELECT * FROM snippets WHERE version_id = ?`)
|
|
.all(targetVersionId) as { id: string }[];
|
|
expect(targetSnippets).toHaveLength(2);
|
|
});
|
|
|
|
it('cloneFromAncestor silently skips paths absent from the ancestor', () => {
|
|
const ancestorVersionId = insertVersion(db, { tag: 'v1.0.0', state: 'indexed' });
|
|
const targetVersionId = insertVersion(db, { tag: 'v1.1.0', state: 'pending' });
|
|
|
|
insertDocument(db, {
|
|
repository_id: '/test/repo',
|
|
version_id: ancestorVersionId,
|
|
file_path: 'src/main.ts',
|
|
checksum: 'sha-main'
|
|
});
|
|
|
|
const pipeline = new IndexingPipeline(
|
|
db,
|
|
vi.fn() as never,
|
|
{ crawl: vi.fn() } as never,
|
|
null
|
|
);
|
|
(pipeline as unknown as PipelineInternals).cloneFromAncestor(
|
|
ancestorVersionId,
|
|
targetVersionId,
|
|
'/test/repo',
|
|
new Set(['src/main.ts', 'MISSING.md'])
|
|
);
|
|
|
|
const targetDocs = db
|
|
.prepare(`SELECT * FROM documents WHERE version_id = ?`)
|
|
.all(targetVersionId) as { id: string; file_path: string }[];
|
|
expect(targetDocs).toHaveLength(1);
|
|
expect(targetDocs[0].file_path).toBe('src/main.ts');
|
|
});
|
|
|
|
it('falls back to full crawl when no indexed ancestor exists', async () => {
|
|
const targetVersionId = insertVersion(db, { tag: 'v1.0.0', state: 'pending' });
|
|
|
|
const files = [
|
|
{
|
|
path: 'README.md',
|
|
content: '# Hello\n\nThis is documentation.',
|
|
sha: 'sha-readme',
|
|
language: 'markdown'
|
|
},
|
|
{
|
|
path: 'src/index.ts',
|
|
content: 'export const x = 1;',
|
|
sha: 'sha-index',
|
|
language: 'typescript'
|
|
}
|
|
];
|
|
|
|
const mockLocalCrawl = vi.fn().mockResolvedValue({
|
|
files,
|
|
totalFiles: 2,
|
|
skippedFiles: 0,
|
|
branch: 'main',
|
|
commitSha: 'abc'
|
|
});
|
|
|
|
const pipeline = new IndexingPipeline(
|
|
db,
|
|
vi.fn() as never,
|
|
{ crawl: mockLocalCrawl } as never,
|
|
null
|
|
);
|
|
|
|
const jobId = insertJob(db, {
|
|
repository_id: '/test/repo',
|
|
version_id: targetVersionId,
|
|
status: 'queued'
|
|
});
|
|
const job = db.prepare(`SELECT * FROM indexing_jobs WHERE id = ?`).get(jobId) as never;
|
|
|
|
await pipeline.run(job);
|
|
|
|
const updatedJob = db
|
|
.prepare(`SELECT status FROM indexing_jobs WHERE id = ?`)
|
|
.get(jobId) as { status: string };
|
|
expect(updatedJob.status).toBe('done');
|
|
|
|
const docs = db
|
|
.prepare(`SELECT * FROM documents WHERE version_id = ?`)
|
|
.all(targetVersionId) as { id: string }[];
|
|
expect(docs.length).toBeGreaterThanOrEqual(2);
|
|
});
|
|
|
|
it('cloned unchanged documents survive the diff/replace stage', async () => {
|
|
// 1. Set up ancestor and target versions.
|
|
const ancestorVersionId = insertVersion(db, { tag: 'v1.0.0', state: 'indexed' });
|
|
const targetVersionId = insertVersion(db, { tag: 'v1.1.0', state: 'pending' });
|
|
|
|
// 2. Insert ancestor doc + snippet for unchanged.md.
|
|
const ancestorDocId = insertDocument(db, {
|
|
repository_id: '/test/repo',
|
|
version_id: ancestorVersionId,
|
|
file_path: 'unchanged.md',
|
|
checksum: 'sha-unchanged'
|
|
});
|
|
insertSnippet(db, ancestorDocId, {
|
|
repository_id: '/test/repo',
|
|
version_id: ancestorVersionId
|
|
});
|
|
|
|
// 3. Crawl returns ONLY changed.md (unchanged.md is absent — differential only).
|
|
const mockLocalCrawl = vi.fn().mockResolvedValue({
|
|
files: [
|
|
{
|
|
path: 'changed.md',
|
|
content: '# Changed\n\nThis file was added.',
|
|
sha: 'sha-changed',
|
|
language: 'markdown'
|
|
}
|
|
],
|
|
totalFiles: 1,
|
|
skippedFiles: 0,
|
|
branch: 'main',
|
|
commitSha: 'abc'
|
|
});
|
|
|
|
// 4. Mock buildDifferentialPlan to return a plan with the two paths.
|
|
const mockPlan = {
|
|
ancestorVersionId,
|
|
ancestorTag: 'v1.0.0',
|
|
changedPaths: new Set(['changed.md']),
|
|
deletedPaths: new Set<string>(),
|
|
unchangedPaths: new Set(['unchanged.md'])
|
|
};
|
|
const spy = vi
|
|
.spyOn(diffStrategy, 'buildDifferentialPlan')
|
|
.mockResolvedValueOnce(mockPlan);
|
|
|
|
const pipeline = new IndexingPipeline(
|
|
db,
|
|
vi.fn() as never,
|
|
{ crawl: mockLocalCrawl } as never,
|
|
null
|
|
);
|
|
|
|
// 5. Run pipeline for the target version job.
|
|
const jobId = insertJob(db, {
|
|
repository_id: '/test/repo',
|
|
version_id: targetVersionId,
|
|
status: 'queued'
|
|
});
|
|
const job = db.prepare(`SELECT * FROM indexing_jobs WHERE id = ?`).get(jobId) as never;
|
|
await pipeline.run(job);
|
|
|
|
spy.mockRestore();
|
|
|
|
// 6. Assert job completed and both docs exist under the target version.
|
|
const finalJob = db
|
|
.prepare(`SELECT status FROM indexing_jobs WHERE id = ?`)
|
|
.get(jobId) as { status: string };
|
|
expect(finalJob.status).toBe('done');
|
|
|
|
const targetDocs = db
|
|
.prepare(`SELECT file_path FROM documents WHERE version_id = ?`)
|
|
.all(targetVersionId) as { file_path: string }[];
|
|
const filePaths = targetDocs.map((d) => d.file_path);
|
|
|
|
// unchanged.md was cloned and must NOT have been deleted by computeDiff.
|
|
expect(filePaths).toContain('unchanged.md');
|
|
// changed.md was crawled and indexed in this run.
|
|
expect(filePaths).toContain('changed.md');
|
|
});
|
|
});
|