feat(TRUEREF-0021): implement differential tag indexing
This commit is contained in:
committed by
Giancarmine Salucci
parent
e63279fcf6
commit
f4fe8c6043
@@ -13,6 +13,7 @@ import { JobQueue } from './job-queue.js';
|
||||
import { IndexingPipeline } from './indexing.pipeline.js';
|
||||
import { recoverStaleJobs } from './startup.js';
|
||||
import { EmbeddingService } from '$lib/server/embeddings/embedding.service.js';
|
||||
import * as diffStrategy from './differential-strategy.js';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Test DB factory
|
||||
@@ -1019,3 +1020,290 @@ describe('IndexingPipeline', () => {
|
||||
expect(rules).toEqual(['v3: use the streaming API.']);
|
||||
});
|
||||
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// differential indexing
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
describe('differential indexing', () => {
|
||||
let db: Database.Database;
|
||||
|
||||
// Before each test: build a database via the test factory and register the
// local repository that the fixtures below attach their rows to.
beforeEach(() => {
	db = createTestDb();
	insertRepo(db, { source: 'local', source_url: '/tmp/test-repo' });
});
|
||||
|
||||
/**
 * Inserts a single row into `documents` and returns the id it was stored under.
 *
 * Only the `id`, `repository_id`, `version_id`, `file_path` and `checksum`
 * keys of `overrides` are read. The remaining columns are fixed: title is
 * NULL, language is 'markdown', token_count is 100 and indexed_at is the
 * current Unix time in seconds.
 *
 * @param localDb   Database handle the row is written to.
 * @param overrides Optional column values replacing the fixture defaults.
 * @returns The overriding id when one was supplied, otherwise a new UUID.
 */
function insertDocument(
	localDb: Database.Database,
	overrides: Partial<Record<string, unknown>> = {}
): string {
	// Resolve the id once so the stored row and the returned value cannot diverge.
	const id = (overrides.id as string) ?? crypto.randomUUID();
	localDb
		.prepare(
			`INSERT INTO documents (id, repository_id, version_id, file_path, title, language, token_count, checksum, indexed_at)
			 VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`
		)
		.run(
			id,
			(overrides.repository_id as string) ?? '/test/repo',
			(overrides.version_id as string | null) ?? null,
			(overrides.file_path as string) ?? 'README.md',
			null, // title
			'markdown', // language
			100, // token_count
			(overrides.checksum as string) ?? 'abc123',
			Math.floor(Date.now() / 1000) // indexed_at, in seconds
		);
	return id;
}
|
||||
|
||||
/**
 * Inserts a single row into `snippets` for the given document and returns
 * the id it was stored under.
 *
 * Only the `id`, `repository_id` and `version_id` keys of `overrides` are
 * read. The remaining columns are fixed: type 'info', NULL title, content
 * 'content', language 'markdown', NULL breadcrumb, token_count 10 and
 * created_at set to the current Unix time in seconds.
 *
 * @param localDb    Database handle the row is written to.
 * @param documentId Value stored in the `document_id` column.
 * @param overrides  Optional column values replacing the fixture defaults.
 * @returns The overriding id when one was supplied, otherwise a new UUID.
 */
function insertSnippet(
	localDb: Database.Database,
	documentId: string,
	overrides: Partial<Record<string, unknown>> = {}
): string {
	// Resolve the id once so the stored row and the returned value cannot diverge.
	const id = (overrides.id as string) ?? crypto.randomUUID();
	localDb
		.prepare(
			`INSERT INTO snippets (id, document_id, repository_id, version_id, type, title, content, language, breadcrumb, token_count, created_at)
			 VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
		)
		.run(
			id,
			documentId,
			(overrides.repository_id as string) ?? '/test/repo',
			(overrides.version_id as string | null) ?? null,
			'info', // type
			null, // title
			'content', // content
			'markdown', // language
			null, // breadcrumb
			10, // token_count
			Math.floor(Date.now() / 1000) // created_at, in seconds
		);
	return id;
}
|
||||
|
||||
/**
 * Test-only view of `IndexingPipeline` that adds a typed `cloneFromAncestor`
 * signature. The specs cast a pipeline instance to this type so they can call
 * the method directly (presumably because it is not part of the public
 * surface — confirm against the class definition).
 */
type PipelineInternals = IndexingPipeline & {
	cloneFromAncestor: (
		ancestorVersionId: string,
		targetVersionId: string,
		repositoryId: string,
		unchangedPaths: Set<string>
	) => void;
};
|
||||
|
||||
// Cloning two ancestor documents (one snippet each) must yield two documents
// and two snippets under the target version, with freshly generated document ids.
it('cloneFromAncestor inserts documents and snippets into the target version', () => {
	const ancestorVersionId = insertVersion(db, { tag: 'v1.0.0', state: 'indexed' });
	const targetVersionId = insertVersion(db, { tag: 'v1.1.0', state: 'pending' });

	// Ancestor fixture: two documents, each with a single snippet.
	const doc1Id = insertDocument(db, {
		repository_id: '/test/repo',
		version_id: ancestorVersionId,
		file_path: 'README.md',
		checksum: 'sha-readme'
	});
	const doc2Id = insertDocument(db, {
		repository_id: '/test/repo',
		version_id: ancestorVersionId,
		file_path: 'src/index.ts',
		checksum: 'sha-index'
	});
	insertSnippet(db, doc1Id, { repository_id: '/test/repo', version_id: ancestorVersionId });
	insertSnippet(db, doc2Id, { repository_id: '/test/repo', version_id: ancestorVersionId });

	// Collaborators are stubs; this test only calls cloneFromAncestor.
	const pipeline = new IndexingPipeline(
		db,
		vi.fn() as never,
		{ crawl: vi.fn() } as never,
		null
	);
	(pipeline as unknown as PipelineInternals).cloneFromAncestor(
		ancestorVersionId,
		targetVersionId,
		'/test/repo',
		new Set(['README.md', 'src/index.ts'])
	);

	const targetDocs = db
		.prepare(`SELECT * FROM documents WHERE version_id = ?`)
		.all(targetVersionId) as { id: string; file_path: string }[];
	expect(targetDocs).toHaveLength(2);
	expect(targetDocs.map((d) => d.file_path).sort()).toEqual(
		['README.md', 'src/index.ts'].sort()
	);
	// New IDs must differ from ancestor doc IDs.
	const targetDocIds = targetDocs.map((d) => d.id);
	expect(targetDocIds).not.toContain(doc1Id);
	expect(targetDocIds).not.toContain(doc2Id);

	const targetSnippets = db
		.prepare(`SELECT * FROM snippets WHERE version_id = ?`)
		.all(targetVersionId) as { id: string }[];
	expect(targetSnippets).toHaveLength(2);
});
|
||||
|
||||
// A path listed as unchanged but with no document under the ancestor version
// must be ignored: only the path that exists is cloned, and nothing throws.
it('cloneFromAncestor silently skips paths absent from the ancestor', () => {
	const ancestorVersionId = insertVersion(db, { tag: 'v1.0.0', state: 'indexed' });
	const targetVersionId = insertVersion(db, { tag: 'v1.1.0', state: 'pending' });

	// The ancestor holds src/main.ts only; MISSING.md is never inserted.
	insertDocument(db, {
		repository_id: '/test/repo',
		version_id: ancestorVersionId,
		file_path: 'src/main.ts',
		checksum: 'sha-main'
	});

	const pipeline = new IndexingPipeline(
		db,
		vi.fn() as never,
		{ crawl: vi.fn() } as never,
		null
	);
	(pipeline as unknown as PipelineInternals).cloneFromAncestor(
		ancestorVersionId,
		targetVersionId,
		'/test/repo',
		new Set(['src/main.ts', 'MISSING.md'])
	);

	const targetDocs = db
		.prepare(`SELECT * FROM documents WHERE version_id = ?`)
		.all(targetVersionId) as { id: string; file_path: string }[];
	expect(targetDocs).toHaveLength(1);
	expect(targetDocs[0].file_path).toBe('src/main.ts');
});
|
||||
|
||||
// The target is the only version inserted here and it is 'pending', so no
// indexed ancestor is available. The job must still finish and store the
// crawled files under the target version.
it('falls back to full crawl when no indexed ancestor exists', async () => {
	const targetVersionId = insertVersion(db, { tag: 'v1.0.0', state: 'pending' });

	const files = [
		{
			path: 'README.md',
			content: '# Hello\n\nThis is documentation.',
			sha: 'sha-readme',
			language: 'markdown'
		},
		{
			path: 'src/index.ts',
			content: 'export const x = 1;',
			sha: 'sha-index',
			language: 'typescript'
		}
	];

	// Stubbed crawler that returns both files.
	const mockLocalCrawl = vi.fn().mockResolvedValue({
		files,
		totalFiles: 2,
		skippedFiles: 0,
		branch: 'main',
		commitSha: 'abc'
	});

	const pipeline = new IndexingPipeline(
		db,
		vi.fn() as never,
		{ crawl: mockLocalCrawl } as never,
		null
	);

	const jobId = insertJob(db, {
		repository_id: '/test/repo',
		version_id: targetVersionId,
		status: 'queued'
	});
	const job = db.prepare(`SELECT * FROM indexing_jobs WHERE id = ?`).get(jobId) as never;

	await pipeline.run(job);

	const updatedJob = db
		.prepare(`SELECT status FROM indexing_jobs WHERE id = ?`)
		.get(jobId) as { status: string };
	expect(updatedJob.status).toBe('done');

	const docs = db
		.prepare(`SELECT * FROM documents WHERE version_id = ?`)
		.all(targetVersionId) as { id: string }[];
	expect(docs.length).toBeGreaterThanOrEqual(2);
});
|
||||
|
||||
// End-to-end differential run: a document cloned from the ancestor must still
// exist after the pipeline has processed the crawled (changed) files.
it('cloned unchanged documents survive the diff/replace stage', async () => {
	// 1. Set up ancestor and target versions.
	const ancestorVersionId = insertVersion(db, { tag: 'v1.0.0', state: 'indexed' });
	const targetVersionId = insertVersion(db, { tag: 'v1.1.0', state: 'pending' });

	// 2. Insert ancestor doc + snippet for unchanged.md.
	const ancestorDocId = insertDocument(db, {
		repository_id: '/test/repo',
		version_id: ancestorVersionId,
		file_path: 'unchanged.md',
		checksum: 'sha-unchanged'
	});
	insertSnippet(db, ancestorDocId, {
		repository_id: '/test/repo',
		version_id: ancestorVersionId
	});

	// 3. Crawl returns ONLY changed.md (unchanged.md is absent — differential only).
	const mockLocalCrawl = vi.fn().mockResolvedValue({
		files: [
			{
				path: 'changed.md',
				content: '# Changed\n\nThis file was added.',
				sha: 'sha-changed',
				language: 'markdown'
			}
		],
		totalFiles: 1,
		skippedFiles: 0,
		branch: 'main',
		commitSha: 'abc'
	});

	// 4. Build the pipeline and queue a job for the target version.
	const pipeline = new IndexingPipeline(
		db,
		vi.fn() as never,
		{ crawl: mockLocalCrawl } as never,
		null
	);
	const jobId = insertJob(db, {
		repository_id: '/test/repo',
		version_id: targetVersionId,
		status: 'queued'
	});
	const job = db.prepare(`SELECT * FROM indexing_jobs WHERE id = ?`).get(jobId) as never;

	// 5. Mock buildDifferentialPlan to return a plan with the two paths, then run.
	const mockPlan = {
		ancestorVersionId,
		ancestorTag: 'v1.0.0',
		changedPaths: new Set(['changed.md']),
		deletedPaths: new Set<string>(),
		unchangedPaths: new Set(['unchanged.md'])
	};
	const spy = vi
		.spyOn(diffStrategy, 'buildDifferentialPlan')
		.mockResolvedValueOnce(mockPlan);
	try {
		await pipeline.run(job);
	} finally {
		// Restore even when run() rejects, so the stub cannot leak into later tests.
		spy.mockRestore();
	}

	// 6. Assert job completed and both docs exist under the target version.
	const finalJob = db
		.prepare(`SELECT status FROM indexing_jobs WHERE id = ?`)
		.get(jobId) as { status: string };
	expect(finalJob.status).toBe('done');

	const targetDocs = db
		.prepare(`SELECT file_path FROM documents WHERE version_id = ?`)
		.all(targetVersionId) as { file_path: string }[];
	const filePaths = targetDocs.map((d) => d.file_path);

	// unchanged.md was cloned and must NOT have been deleted by computeDiff.
	expect(filePaths).toContain('unchanged.md');
	// changed.md was crawled and indexed in this run.
	expect(filePaths).toContain('changed.md');
});
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user