From f4fe8c60432a934504750772bb38cb5d9b1c30dc Mon Sep 17 00:00:00 2001 From: U811073 Date: Mon, 30 Mar 2026 13:12:50 +0200 Subject: [PATCH] feat(TRUEREF-0021): implement differential tag indexing --- docs/features/TRUEREF-0021.md | 113 +++++++ src/lib/server/crawler/github-compare.test.ts | 173 +++++++++++ src/lib/server/crawler/github-compare.ts | 104 +++++++ src/lib/server/crawler/types.ts | 15 + .../server/pipeline/differential-strategy.ts | 122 ++++++++ .../server/pipeline/indexing.pipeline.test.ts | 288 ++++++++++++++++++ src/lib/server/pipeline/indexing.pipeline.ts | 209 ++++++++++++- src/lib/server/utils/git.ts | 55 +++- src/lib/server/utils/tag-order.test.ts | 123 ++++++++ src/lib/server/utils/tag-order.ts | 88 ++++++ 10 files changed, 1281 insertions(+), 9 deletions(-) create mode 100644 docs/features/TRUEREF-0021.md create mode 100644 src/lib/server/crawler/github-compare.test.ts create mode 100644 src/lib/server/crawler/github-compare.ts create mode 100644 src/lib/server/pipeline/differential-strategy.ts create mode 100644 src/lib/server/utils/tag-order.test.ts create mode 100644 src/lib/server/utils/tag-order.ts diff --git a/docs/features/TRUEREF-0021.md b/docs/features/TRUEREF-0021.md new file mode 100644 index 0000000..dc33d77 --- /dev/null +++ b/docs/features/TRUEREF-0021.md @@ -0,0 +1,113 @@ +# TRUEREF-0021 — Differential Tag Indexing + +**Priority:** P1 +**Status:** Implemented +**Depends On:** TRUEREF-0014, TRUEREF-0017, TRUEREF-0019 +**Blocks:** — + +--- + +## Problem Statement + +Repositories with many version tags (e.g. hundreds or thousands, as seen in projects like RWC +UXFramework) make full re-indexing prohibitively expensive. Between consecutive semver tags the +overwhelming majority of files are unchanged — often only dependency manifests (`package.json`, +`*.lock`) differ. Indexing the complete file tree for every tag wastes compute time, GitHub API +quota, and embedding credits. 
+ +--- + +## Solution + +Differential tag indexing detects when an already-indexed ancestor version exists for a given +target tag, determines exactly which files changed, and: + +1. **Clones** unchanged document rows, snippet rows, and embedding rows from the ancestor version + into the target version in a single SQLite transaction (`cloneFromAncestor`). +2. **Crawls** only the changed (added / modified) files, parses and embeds them normally. +3. **Skips** deleted files (not cloned, not crawled). +4. **Falls back** silently to a full crawl when no indexed ancestor can be found or any step fails. + +--- + +## Algorithm + +### Stage 0 — Differential Plan (`buildDifferentialPlan`) + +Executed in `IndexingPipeline.run()` before the crawl, when the job has a `versionId`: + +1. **Ancestor selection** (`findBestAncestorVersion` in `tag-order.ts`): Loads all `indexed` + versions for the repository, parses their tags as semver, and returns the closest predecessor + to the target tag. Falls back to creation-timestamp ordering for non-semver tags. + +2. **Changed-file detection**: For GitHub repositories, calls the GitHub Compare API + (`fetchGitHubChangedFiles` in `github-compare.ts`). For local repositories, uses + `git diff --name-status` via `getChangedFilesBetweenRefs` in `git.ts` (implemented with + `execFileSync` — not `execSync` — to prevent shell-injection attacks on branch/tag names + containing shell metacharacters). + +3. **Path partitioning**: The changed-file list is split into `changedPaths` (added + modified + + renamed-destination) and `deletedPaths`. `unchangedPaths` is derived as + `ancestorFilePaths − changedPaths − deletedPaths`. + +4. **Guard**: Returns `null` when no indexed ancestor exists, when the ancestor has no indexed + documents, or when all files changed (nothing to clone). 
+ +### Stage 0.5 — Clone Unchanged Files (`cloneFromAncestor`) + +When `buildDifferentialPlan` returns a non-null plan with `unchangedPaths.size > 0`: + +- Fetches ancestor `documents` rows for the unchanged paths using a parameterised + `IN (?, ?, …)` query (no string interpolation of path values → no SQL injection). +- Inserts new `documents` rows for each, with new UUIDs and `version_id = targetVersionId`. +- Fetches ancestor `snippets` rows for those document IDs; inserts clones with new IDs. +- Fetches ancestor `snippet_embeddings` rows; inserts clones pointing to the new snippet IDs. +- The entire operation runs inside a single `this.db.transaction(…)()` call for atomicity. + +### Stage 1 — Partial Crawl + +`IndexingPipeline.crawl()` accepts an optional third argument `allowedPaths?: Set`. +When provided (set to `differentialPlan.changedPaths`), the crawl result is filtered so only +matching files are returned. This minimises GitHub API requests and local I/O. + +--- + +## API Surface Changes + +| Symbol | Location | Change | +|---|---|---| +| `buildDifferentialPlan` | `pipeline/differential-strategy.ts` | **New** — async function | +| `DifferentialPlan` | `pipeline/differential-strategy.ts` | **New** — interface | +| `findBestAncestorVersion` | `utils/tag-order.ts` | **New** — pure function | +| `fetchGitHubChangedFiles` | `crawler/github-compare.ts` | **New** — async function | +| `getChangedFilesBetweenRefs` | `utils/git.ts` | **New** — sync function (uses `execFileSync`) | +| `ChangedFile` | `crawler/types.ts` | **New** — interface | +| `CrawlOptions.allowedPaths` | `crawler/types.ts` | **New** — optional field | +| `IndexingPipeline.crawl()` | `pipeline/indexing.pipeline.ts` | **Modified** — added `allowedPaths` param | +| `IndexingPipeline.cloneFromAncestor()` | `pipeline/indexing.pipeline.ts` | **New** — private method | +| `IndexingPipeline.run()` | `pipeline/indexing.pipeline.ts` | **Modified** — Stage 0 added | + +--- + +## Correctness Properties + 
+- **Atomicity**: `cloneFromAncestor` wraps all inserts in one SQLite transaction; a failure + leaves the target version with no partially-cloned data. +- **Idempotency (fallback)**: If the clone or plan step fails for any reason, the pipeline + catches the error, logs a warning, and continues with a full crawl. No data loss occurs. +- **No shell injection**: `getChangedFilesBetweenRefs` uses `execFileSync` with an argument + array rather than `execSync` with a template-literal string. +- **No SQL injection**: Path values are never interpolated into SQL strings; only `?` + placeholders are used. + +--- + +## Fallback Conditions + +The differential plan returns `null` (triggering a full crawl) when: + +- No versions for this repository have `state = 'indexed'`. +- The best ancestor has no indexed documents. +- All files changed between ancestor and target (`unchangedPaths.size === 0`). +- The GitHub Compare API call or `git diff` call throws an error. +- Any unexpected exception inside `buildDifferentialPlan`. diff --git a/src/lib/server/crawler/github-compare.test.ts b/src/lib/server/crawler/github-compare.test.ts new file mode 100644 index 0000000..e77c477 --- /dev/null +++ b/src/lib/server/crawler/github-compare.test.ts @@ -0,0 +1,173 @@ +/** + * Unit tests for GitHub Compare API client (TRUEREF-0021). 
+ */ +import { describe, it, expect, vi, beforeEach } from 'vitest'; +import { fetchGitHubChangedFiles } from './github-compare.js'; +import { GitHubApiError } from './github-tags.js'; + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +function mockFetch(status: number, body: unknown): void { + vi.spyOn(global, 'fetch').mockResolvedValueOnce( + new Response(JSON.stringify(body), { status }) + ); +} + +beforeEach(() => { + vi.restoreAllMocks(); +}); + +// --------------------------------------------------------------------------- +// fetchGitHubChangedFiles +// --------------------------------------------------------------------------- + +describe('fetchGitHubChangedFiles', () => { + it('maps added status correctly', async () => { + mockFetch(200, { + status: 'ahead', + files: [{ filename: 'src/new.ts', status: 'added', sha: 'abc123' }] + }); + const result = await fetchGitHubChangedFiles('owner', 'repo', 'v1.0.0', 'v1.1.0'); + expect(result).toHaveLength(1); + expect(result[0]).toMatchObject({ path: 'src/new.ts', status: 'added', sha: 'abc123' }); + }); + + it('maps modified status correctly', async () => { + mockFetch(200, { + status: 'ahead', + files: [{ filename: 'src/index.ts', status: 'modified', sha: 'def456' }] + }); + const result = await fetchGitHubChangedFiles('owner', 'repo', 'v1.0.0', 'v1.1.0'); + expect(result[0]).toMatchObject({ path: 'src/index.ts', status: 'modified' }); + }); + + it('maps removed status correctly and omits sha', async () => { + mockFetch(200, { + status: 'ahead', + files: [{ filename: 'src/old.ts', status: 'removed', sha: '000000' }] + }); + const result = await fetchGitHubChangedFiles('owner', 'repo', 'v1.0.0', 'v1.1.0'); + expect(result[0]).toMatchObject({ path: 'src/old.ts', status: 'removed' }); + expect(result[0].sha).toBeUndefined(); + }); + + it('maps renamed status and sets previousPath', async () => 
{ + mockFetch(200, { + status: 'ahead', + files: [ + { + filename: 'src/renamed.ts', + status: 'renamed', + sha: 'ghi789', + previous_filename: 'src/original.ts' + } + ] + }); + const result = await fetchGitHubChangedFiles('owner', 'repo', 'v1.0.0', 'v1.1.0'); + expect(result[0]).toMatchObject({ + path: 'src/renamed.ts', + status: 'renamed', + previousPath: 'src/original.ts', + sha: 'ghi789' + }); + }); + + it('returns empty array when compare status is identical', async () => { + mockFetch(200, { status: 'identical', files: [] }); + const result = await fetchGitHubChangedFiles('owner', 'repo', 'v1.0.0', 'v1.0.0'); + expect(result).toEqual([]); + }); + + it('returns empty array when compare status is behind', async () => { + mockFetch(200, { + status: 'behind', + files: [{ filename: 'src/index.ts', status: 'modified', sha: 'abc' }] + }); + const result = await fetchGitHubChangedFiles('owner', 'repo', 'v1.1.0', 'v1.0.0'); + expect(result).toEqual([]); + }); + + it('throws GitHubApiError on 401 unauthorized', async () => { + mockFetch(401, { message: 'Unauthorized' }); + await expect( + fetchGitHubChangedFiles('owner', 'private-repo', 'v1.0.0', 'v1.1.0') + ).rejects.toThrow(GitHubApiError); + }); + + it('throws GitHubApiError on 404 not found', async () => { + mockFetch(404, { message: 'Not Found' }); + await expect( + fetchGitHubChangedFiles('owner', 'missing-repo', 'v1.0.0', 'v1.1.0') + ).rejects.toThrow(GitHubApiError); + }); + + it('throws GitHubApiError on 422 unprocessable entity', async () => { + mockFetch(422, { message: 'Unprocessable Entity' }); + await expect( + fetchGitHubChangedFiles('owner', 'repo', 'bad-ref', 'v1.1.0') + ).rejects.toThrow(GitHubApiError); + }); + + it('returns empty array when files property is missing', async () => { + mockFetch(200, { status: 'ahead' }); + const result = await fetchGitHubChangedFiles('owner', 'repo', 'v1.0.0', 'v1.1.0'); + expect(result).toEqual([]); + }); + + it('returns empty array when files array is empty', async 
() => { + mockFetch(200, { status: 'ahead', files: [] }); + const result = await fetchGitHubChangedFiles('owner', 'repo', 'v1.0.0', 'v1.1.0'); + expect(result).toEqual([]); + }); + + it('maps copied status to modified', async () => { + mockFetch(200, { + status: 'ahead', + files: [{ filename: 'src/copy.ts', status: 'copied', sha: 'jkl012' }] + }); + const result = await fetchGitHubChangedFiles('owner', 'repo', 'v1.0.0', 'v1.1.0'); + expect(result[0]).toMatchObject({ path: 'src/copy.ts', status: 'modified' }); + }); + + it('maps changed status to modified', async () => { + mockFetch(200, { + status: 'ahead', + files: [{ filename: 'src/changed.ts', status: 'changed', sha: 'mno345' }] + }); + const result = await fetchGitHubChangedFiles('owner', 'repo', 'v1.0.0', 'v1.1.0'); + expect(result[0]).toMatchObject({ path: 'src/changed.ts', status: 'modified' }); + }); + + it('sends Authorization header when token is provided', async () => { + const fetchSpy = vi.spyOn(global, 'fetch').mockResolvedValueOnce( + new Response(JSON.stringify({ status: 'ahead', files: [] }), { status: 200 }) + ); + await fetchGitHubChangedFiles('owner', 'repo', 'v1.0.0', 'v1.1.0', 'my-token'); + const callArgs = fetchSpy.mock.calls[0]; + const headers = (callArgs[1] as RequestInit).headers as Record; + expect(headers['Authorization']).toBe('Bearer my-token'); + }); + + it('does not send Authorization header when no token provided', async () => { + const fetchSpy = vi.spyOn(global, 'fetch').mockResolvedValueOnce( + new Response(JSON.stringify({ status: 'ahead', files: [] }), { status: 200 }) + ); + await fetchGitHubChangedFiles('owner', 'repo', 'v1.0.0', 'v1.1.0'); + const callArgs = fetchSpy.mock.calls[0]; + const headers = (callArgs[1] as RequestInit).headers as Record; + expect(headers['Authorization']).toBeUndefined(); + }); + + it('throws GitHubApiError with correct status code', async () => { + mockFetch(403, { message: 'Forbidden' }); + try { + await fetchGitHubChangedFiles('owner', 'repo', 
'v1.0.0', 'v1.1.0'); + expect.fail('should have thrown'); + } catch (e) { + expect(e).toBeInstanceOf(GitHubApiError); + expect((e as GitHubApiError).status).toBe(403); + } + }); +}); diff --git a/src/lib/server/crawler/github-compare.ts b/src/lib/server/crawler/github-compare.ts new file mode 100644 index 0000000..4a828a2 --- /dev/null +++ b/src/lib/server/crawler/github-compare.ts @@ -0,0 +1,104 @@ +/** + * GitHub Compare API client for differential tag indexing (TRUEREF-0021). + * + * Uses GET /repos/{owner}/{repo}/compare/{base}...{head} to determine + * which files changed between two refs without downloading full trees. + */ +import { GitHubApiError } from './github-tags.js'; +import type { ChangedFile } from './types.js'; + +const GITHUB_API = 'https://api.github.com'; + +interface GitHubCompareFile { + filename: string; + status: 'added' | 'modified' | 'removed' | 'renamed' | 'copied' | 'changed' | 'unchanged'; + sha: string; + previous_filename?: string; +} + +interface GitHubCompareResponse { + status: 'diverged' | 'ahead' | 'behind' | 'identical'; + files?: GitHubCompareFile[]; +} + +/** + * Fetch changed files between two GitHub refs using the Compare API. 
+ * + * @param owner GitHub owner/org + * @param repo GitHub repository name + * @param base Base ref (tag, branch, or commit SHA) + * @param head Head ref (tag, branch, or commit SHA) + * @param token Optional PAT for private repos + * @returns Array of ChangedFile objects; empty array when refs are identical or head is behind base + */ +export async function fetchGitHubChangedFiles( + owner: string, + repo: string, + base: string, + head: string, + token?: string +): Promise { + const url = `${GITHUB_API}/repos/${owner}/${repo}/compare/${base}...${head}?per_page=300`; + + const headers: Record = { + Accept: 'application/vnd.github+json', + 'X-GitHub-Api-Version': '2022-11-28', + 'User-Agent': 'TrueRef/1.0' + }; + if (token) headers['Authorization'] = `Bearer ${token}`; + + const response = await fetch(url, { headers }); + + if (!response.ok) { + throw new GitHubApiError(response.status); + } + + const data = (await response.json()) as GitHubCompareResponse; + + // Identical or behind means no relevant changes to index + if (data.status === 'identical' || data.status === 'behind') { + return []; + } + + if (!data.files || data.files.length === 0) { + return []; + } + + return data.files.map((file): ChangedFile => { + let status: ChangedFile['status']; + + switch (file.status) { + case 'added': + status = 'added'; + break; + case 'removed': + status = 'removed'; + break; + case 'renamed': + status = 'renamed'; + break; + case 'modified': + case 'copied': + case 'changed': + case 'unchanged': + default: + status = 'modified'; + break; + } + + const result: ChangedFile = { + path: file.filename, + status + }; + + if (status === 'renamed' && file.previous_filename) { + result.previousPath = file.previous_filename; + } + + if (status !== 'removed') { + result.sha = file.sha; + } + + return result; + }); +} diff --git a/src/lib/server/crawler/types.ts b/src/lib/server/crawler/types.ts index da386ec..830780f 100644 --- a/src/lib/server/crawler/types.ts +++ 
b/src/lib/server/crawler/types.ts @@ -55,6 +55,21 @@ export interface CrawlOptions { config?: RepoConfig; /** Progress callback invoked after each file is processed */ onProgress?: (processed: number, total: number) => void; + /** + * When provided, the crawler must restrict returned files to only these paths. + * Used by the differential indexing pipeline to skip unchanged files. + */ + allowedPaths?: Set; +} + +export interface ChangedFile { + /** Path of the file in the new version (head). For renames, this is the destination path. */ + path: string; + status: 'added' | 'modified' | 'removed' | 'renamed'; + /** Previous path, only set when status === 'renamed' */ + previousPath?: string; + /** Blob SHA of the file content in the head ref (omitted for removed files) */ + sha?: string; } // --------------------------------------------------------------------------- diff --git a/src/lib/server/pipeline/differential-strategy.ts b/src/lib/server/pipeline/differential-strategy.ts new file mode 100644 index 0000000..2102963 --- /dev/null +++ b/src/lib/server/pipeline/differential-strategy.ts @@ -0,0 +1,122 @@ +/** + * Differential indexing strategy coordinator (TRUEREF-0021). + * + * Determines whether differential indexing can be used for a given version tag, + * and if so, builds a plan describing which files to clone from the ancestor + * and which files to crawl fresh. 
+ */ +import type Database from 'better-sqlite3'; +import type { Repository } from '$lib/server/models/repository.js'; +import type { RepositoryVersion } from '$lib/server/models/repository-version.js'; +import { RepositoryVersionMapper } from '$lib/server/mappers/repository-version.mapper.js'; +import type { RepositoryVersionEntity } from '$lib/server/models/repository-version.js'; +import { findBestAncestorVersion } from '$lib/server/utils/tag-order.js'; +import { fetchGitHubChangedFiles } from '$lib/server/crawler/github-compare.js'; +import { getChangedFilesBetweenRefs } from '$lib/server/utils/git.js'; +import type { ChangedFile } from '$lib/server/crawler/types.js'; + +export interface DifferentialPlan { + /** Version ID of the closest already-indexed predecessor tag */ + ancestorVersionId: string; + /** Ancestor tag name (needed for git diff / GitHub compare calls) */ + ancestorTag: string; + /** File paths that changed (added + modified + renamed-destination) */ + changedPaths: Set; + /** File paths that were deleted in the target vs ancestor */ + deletedPaths: Set; + /** File paths present in ancestor that are unchanged in target — must be cloned */ + unchangedPaths: Set; +} + +export async function buildDifferentialPlan(params: { + repo: Repository; + targetTag: string; + db: Database.Database; + /** Override for testing only */ + _fetchGitHubChangedFiles?: typeof fetchGitHubChangedFiles; +}): Promise { + const { repo, targetTag, db } = params; + const fetchFn = params._fetchGitHubChangedFiles ?? fetchGitHubChangedFiles; + + try { + // 1. Load all indexed versions for this repository + const rows = db + .prepare( + `SELECT * FROM repository_versions WHERE repository_id = ? AND state = 'indexed'` + ) + .all(repo.id) as RepositoryVersionEntity[]; + + const indexedVersions: RepositoryVersion[] = rows.map((row) => + RepositoryVersionMapper.fromEntity(row) + ); + + // 2. 
Find the best ancestor version + const ancestor = findBestAncestorVersion(targetTag, indexedVersions); + if (!ancestor) return null; + + // 3. Load ancestor's document file paths + const docRows = db + .prepare(`SELECT DISTINCT file_path FROM documents WHERE version_id = ?`) + .all(ancestor.id) as Array<{ file_path: string }>; + + const ancestorFilePaths = new Set(docRows.map((r) => r.file_path)); + if (ancestorFilePaths.size === 0) return null; + + // 4. Fetch changed files between ancestor and target + let changedFiles: ChangedFile[]; + + if (repo.source === 'github') { + const url = new URL(repo.sourceUrl); + const parts = url.pathname.split('/').filter(Boolean); + const owner = parts[0]; + const repoName = parts[1]; + changedFiles = await fetchFn( + owner, + repoName, + ancestor.tag, + targetTag, + repo.githubToken ?? undefined + ); + } else { + changedFiles = getChangedFilesBetweenRefs({ + repoPath: repo.sourceUrl, + base: ancestor.tag, + head: targetTag + }); + } + + // 5. Partition changed files into changed and deleted sets + const changedPaths = new Set(); + const deletedPaths = new Set(); + + for (const file of changedFiles) { + if (file.status === 'removed') { + deletedPaths.add(file.path); + } else { + changedPaths.add(file.path); + } + } + + // 6. Compute unchanged paths: ancestor paths minus changed minus deleted + const unchangedPaths = new Set(); + for (const p of ancestorFilePaths) { + if (!changedPaths.has(p) && !deletedPaths.has(p)) { + unchangedPaths.add(p); + } + } + + // 7. 
Return null when there's nothing to clone (all files changed) + if (unchangedPaths.size === 0) return null; + + return { + ancestorVersionId: ancestor.id, + ancestorTag: ancestor.tag, + changedPaths, + deletedPaths, + unchangedPaths + }; + } catch { + // Fail-safe: fall back to full crawl on any error + return null; + } +} diff --git a/src/lib/server/pipeline/indexing.pipeline.test.ts b/src/lib/server/pipeline/indexing.pipeline.test.ts index f057d0c..af52082 100644 --- a/src/lib/server/pipeline/indexing.pipeline.test.ts +++ b/src/lib/server/pipeline/indexing.pipeline.test.ts @@ -13,6 +13,7 @@ import { JobQueue } from './job-queue.js'; import { IndexingPipeline } from './indexing.pipeline.js'; import { recoverStaleJobs } from './startup.js'; import { EmbeddingService } from '$lib/server/embeddings/embedding.service.js'; +import * as diffStrategy from './differential-strategy.js'; // --------------------------------------------------------------------------- // Test DB factory @@ -1019,3 +1020,290 @@ describe('IndexingPipeline', () => { expect(rules).toEqual(['v3: use the streaming API.']); }); }); + +// --------------------------------------------------------------------------- +// differential indexing +// --------------------------------------------------------------------------- + +describe('differential indexing', () => { + let db: Database.Database; + + beforeEach(() => { + db = createTestDb(); + insertRepo(db, { source: 'local', source_url: '/tmp/test-repo' }); + }); + + function insertDocument( + localDb: Database.Database, + overrides: Partial> = {} + ): string { + const id = crypto.randomUUID(); + localDb + .prepare( + `INSERT INTO documents (id, repository_id, version_id, file_path, title, language, token_count, checksum, indexed_at) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)` + ) + .run( + (overrides.id as string) ?? id, + (overrides.repository_id as string) ?? '/test/repo', + (overrides.version_id as string | null) ?? null, + (overrides.file_path as string) ?? 
'README.md', + null, + 'markdown', + 100, + (overrides.checksum as string) ?? 'abc123', + Math.floor(Date.now() / 1000) + ); + return (overrides.id as string) ?? id; + } + + function insertSnippet( + localDb: Database.Database, + documentId: string, + overrides: Partial> = {} + ): string { + const id = crypto.randomUUID(); + localDb + .prepare( + `INSERT INTO snippets (id, document_id, repository_id, version_id, type, title, content, language, breadcrumb, token_count, created_at) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)` + ) + .run( + (overrides.id as string) ?? id, + documentId, + (overrides.repository_id as string) ?? '/test/repo', + (overrides.version_id as string | null) ?? null, + 'info', + null, + 'content', + 'markdown', + null, + 10, + Math.floor(Date.now() / 1000) + ); + return (overrides.id as string) ?? id; + } + + type PipelineInternals = IndexingPipeline & { + cloneFromAncestor: ( + ancestorVersionId: string, + targetVersionId: string, + repositoryId: string, + unchangedPaths: Set + ) => void; + }; + + it('cloneFromAncestor inserts documents and snippets into the target version', () => { + const ancestorVersionId = insertVersion(db, { tag: 'v1.0.0', state: 'indexed' }); + const targetVersionId = insertVersion(db, { tag: 'v1.1.0', state: 'pending' }); + + const doc1Id = insertDocument(db, { + repository_id: '/test/repo', + version_id: ancestorVersionId, + file_path: 'README.md', + checksum: 'sha-readme' + }); + const doc2Id = insertDocument(db, { + repository_id: '/test/repo', + version_id: ancestorVersionId, + file_path: 'src/index.ts', + checksum: 'sha-index' + }); + insertSnippet(db, doc1Id, { repository_id: '/test/repo', version_id: ancestorVersionId }); + insertSnippet(db, doc2Id, { repository_id: '/test/repo', version_id: ancestorVersionId }); + + const pipeline = new IndexingPipeline( + db, + vi.fn() as never, + { crawl: vi.fn() } as never, + null + ); + (pipeline as unknown as PipelineInternals).cloneFromAncestor( + ancestorVersionId, + 
targetVersionId, + '/test/repo', + new Set(['README.md', 'src/index.ts']) + ); + + const targetDocs = db + .prepare(`SELECT * FROM documents WHERE version_id = ?`) + .all(targetVersionId) as { id: string; file_path: string }[]; + expect(targetDocs).toHaveLength(2); + expect(targetDocs.map((d) => d.file_path).sort()).toEqual( + ['README.md', 'src/index.ts'].sort() + ); + // New IDs must differ from ancestor doc IDs. + const targetDocIds = targetDocs.map((d) => d.id); + expect(targetDocIds).not.toContain(doc1Id); + expect(targetDocIds).not.toContain(doc2Id); + + const targetSnippets = db + .prepare(`SELECT * FROM snippets WHERE version_id = ?`) + .all(targetVersionId) as { id: string }[]; + expect(targetSnippets).toHaveLength(2); + }); + + it('cloneFromAncestor silently skips paths absent from the ancestor', () => { + const ancestorVersionId = insertVersion(db, { tag: 'v1.0.0', state: 'indexed' }); + const targetVersionId = insertVersion(db, { tag: 'v1.1.0', state: 'pending' }); + + insertDocument(db, { + repository_id: '/test/repo', + version_id: ancestorVersionId, + file_path: 'src/main.ts', + checksum: 'sha-main' + }); + + const pipeline = new IndexingPipeline( + db, + vi.fn() as never, + { crawl: vi.fn() } as never, + null + ); + (pipeline as unknown as PipelineInternals).cloneFromAncestor( + ancestorVersionId, + targetVersionId, + '/test/repo', + new Set(['src/main.ts', 'MISSING.md']) + ); + + const targetDocs = db + .prepare(`SELECT * FROM documents WHERE version_id = ?`) + .all(targetVersionId) as { id: string; file_path: string }[]; + expect(targetDocs).toHaveLength(1); + expect(targetDocs[0].file_path).toBe('src/main.ts'); + }); + + it('falls back to full crawl when no indexed ancestor exists', async () => { + const targetVersionId = insertVersion(db, { tag: 'v1.0.0', state: 'pending' }); + + const files = [ + { + path: 'README.md', + content: '# Hello\n\nThis is documentation.', + sha: 'sha-readme', + language: 'markdown' + }, + { + path: 'src/index.ts', + 
content: 'export const x = 1;', + sha: 'sha-index', + language: 'typescript' + } + ]; + + const mockLocalCrawl = vi.fn().mockResolvedValue({ + files, + totalFiles: 2, + skippedFiles: 0, + branch: 'main', + commitSha: 'abc' + }); + + const pipeline = new IndexingPipeline( + db, + vi.fn() as never, + { crawl: mockLocalCrawl } as never, + null + ); + + const jobId = insertJob(db, { + repository_id: '/test/repo', + version_id: targetVersionId, + status: 'queued' + }); + const job = db.prepare(`SELECT * FROM indexing_jobs WHERE id = ?`).get(jobId) as never; + + await pipeline.run(job); + + const updatedJob = db + .prepare(`SELECT status FROM indexing_jobs WHERE id = ?`) + .get(jobId) as { status: string }; + expect(updatedJob.status).toBe('done'); + + const docs = db + .prepare(`SELECT * FROM documents WHERE version_id = ?`) + .all(targetVersionId) as { id: string }[]; + expect(docs.length).toBeGreaterThanOrEqual(2); + }); + + it('cloned unchanged documents survive the diff/replace stage', async () => { + // 1. Set up ancestor and target versions. + const ancestorVersionId = insertVersion(db, { tag: 'v1.0.0', state: 'indexed' }); + const targetVersionId = insertVersion(db, { tag: 'v1.1.0', state: 'pending' }); + + // 2. Insert ancestor doc + snippet for unchanged.md. + const ancestorDocId = insertDocument(db, { + repository_id: '/test/repo', + version_id: ancestorVersionId, + file_path: 'unchanged.md', + checksum: 'sha-unchanged' + }); + insertSnippet(db, ancestorDocId, { + repository_id: '/test/repo', + version_id: ancestorVersionId + }); + + // 3. Crawl returns ONLY changed.md (unchanged.md is absent — differential only). + const mockLocalCrawl = vi.fn().mockResolvedValue({ + files: [ + { + path: 'changed.md', + content: '# Changed\n\nThis file was added.', + sha: 'sha-changed', + language: 'markdown' + } + ], + totalFiles: 1, + skippedFiles: 0, + branch: 'main', + commitSha: 'abc' + }); + + // 4. Mock buildDifferentialPlan to return a plan with the two paths. 
+ const mockPlan = { + ancestorVersionId, + ancestorTag: 'v1.0.0', + changedPaths: new Set(['changed.md']), + deletedPaths: new Set(), + unchangedPaths: new Set(['unchanged.md']) + }; + const spy = vi + .spyOn(diffStrategy, 'buildDifferentialPlan') + .mockResolvedValueOnce(mockPlan); + + const pipeline = new IndexingPipeline( + db, + vi.fn() as never, + { crawl: mockLocalCrawl } as never, + null + ); + + // 5. Run pipeline for the target version job. + const jobId = insertJob(db, { + repository_id: '/test/repo', + version_id: targetVersionId, + status: 'queued' + }); + const job = db.prepare(`SELECT * FROM indexing_jobs WHERE id = ?`).get(jobId) as never; + await pipeline.run(job); + + spy.mockRestore(); + + // 6. Assert job completed and both docs exist under the target version. + const finalJob = db + .prepare(`SELECT status FROM indexing_jobs WHERE id = ?`) + .get(jobId) as { status: string }; + expect(finalJob.status).toBe('done'); + + const targetDocs = db + .prepare(`SELECT file_path FROM documents WHERE version_id = ?`) + .all(targetVersionId) as { file_path: string }[]; + const filePaths = targetDocs.map((d) => d.file_path); + + // unchanged.md was cloned and must NOT have been deleted by computeDiff. + expect(filePaths).toContain('unchanged.md'); + // changed.md was crawled and indexed in this run. 
+ expect(filePaths).toContain('changed.md'); + }); +}); diff --git a/src/lib/server/pipeline/indexing.pipeline.ts b/src/lib/server/pipeline/indexing.pipeline.ts index 9e8e9ff..ea4bee2 100644 --- a/src/lib/server/pipeline/indexing.pipeline.ts +++ b/src/lib/server/pipeline/indexing.pipeline.ts @@ -26,6 +26,7 @@ import { resolveConfig, type ParsedConfig } from '$lib/server/config/config-pars import { parseFile } from '$lib/server/parser/index.js'; import { computeTrustScore } from '$lib/server/search/trust-score.js'; import { computeDiff } from './diff.js'; +import { buildDifferentialPlan, type DifferentialPlan } from './differential-strategy.js'; // --------------------------------------------------------------------------- // Progress calculation @@ -95,11 +96,44 @@ export class IndexingPipeline { this.updateVersion(normJob.versionId, { state: 'indexing' }); } - // ---- Stage 1: Crawl ------------------------------------------------- const versionTag = normJob.versionId ? this.getVersionTag(normJob.versionId) : undefined; - const crawlResult = await this.crawl(repo, versionTag); + + // ---- Stage 0: Differential strategy (TRUEREF-0021) ---------------------- + // When indexing a tagged version, check if we can inherit unchanged files + // from an already-indexed ancestor version instead of crawling everything. + let differentialPlan: DifferentialPlan | null = null; + if (normJob.versionId && versionTag) { + differentialPlan = await buildDifferentialPlan({ + repo, + targetTag: versionTag, + db: this.db + }).catch((err) => { + console.warn( + `[IndexingPipeline] Differential plan failed, falling back to full crawl: ${err instanceof Error ? err.message : String(err)}` + ); + return null; + }); + } + + // If a differential plan exists, clone unchanged files from ancestor. 
+ if (differentialPlan && differentialPlan.unchangedPaths.size > 0) { + this.cloneFromAncestor( + differentialPlan.ancestorVersionId, + normJob.versionId!, + repo.id, + differentialPlan.unchangedPaths + ); + console.info( + `[IndexingPipeline] Differential indexing: cloned ${differentialPlan.unchangedPaths.size} unchanged files from ${differentialPlan.ancestorTag}` + ); + } + + // ---- Stage 1: Crawl ------------------------------------------------- + // Pass changedPaths as allowlist so crawl only fetches/returns changed files. + const crawlAllowedPaths = differentialPlan ? differentialPlan.changedPaths : undefined; + const crawlResult = await this.crawl(repo, versionTag, crawlAllowedPaths); // Resolve trueref.json / context7.json configuration. // Prefer the pre-parsed config carried in the CrawlResult (set by @@ -137,7 +171,16 @@ export class IndexingPipeline { // Load all existing documents for this repo so computeDiff can // classify every crawled file and detect deletions. const existingDocs = this.getExistingDocuments(repo.id, normJob.versionId); - const diff = computeDiff(filteredFiles, existingDocs); + + // Exclude files that were cloned from the ancestor — they are not candidates + // for deletion or re-processing (computeDiff must not see them in existingDocs). + const clonedPaths = differentialPlan?.unchangedPaths ?? new Set(); + const existingDocsForDiff = + clonedPaths.size > 0 + ? existingDocs.filter((d) => !clonedPaths.has(d.filePath)) + : existingDocs; + + const diff = computeDiff(filteredFiles, existingDocsForDiff); // Accumulate new documents/snippets; skip unchanged files. const newDocuments: NewDocument[] = []; @@ -146,11 +189,11 @@ export class IndexingPipeline { // Schedule stale documents (modified + deleted) for deletion. 
for (const file of diff.modified) { - const existing = existingDocs.find((d) => d.filePath === file.path); + const existing = existingDocsForDiff.find((d) => d.filePath === file.path); if (existing) changedDocIds.push(existing.id); } for (const filePath of diff.deleted) { - const existing = existingDocs.find((d) => d.filePath === filePath); + const existing = existingDocsForDiff.find((d) => d.filePath === filePath); if (existing) changedDocIds.push(existing.id); } @@ -316,7 +359,7 @@ export class IndexingPipeline { // Private — crawl // ------------------------------------------------------------------------- - private async crawl(repo: Repository, ref?: string): Promise<{ + private async crawl(repo: Repository, ref?: string, allowedPaths?: Set): Promise<{ files: Array<{ path: string; content: string; sha: string; size: number; language: string }>; totalFiles: number; /** Pre-parsed trueref.json / context7.json, or undefined when absent. */ @@ -339,7 +382,12 @@ export class IndexingPipeline { token: repo.githubToken ?? undefined }); - return { files: result.files, totalFiles: result.totalFiles }; + // Apply allowedPaths filter for differential indexing. + const githubFinalFiles = + allowedPaths && allowedPaths.size > 0 + ? result.files.filter((f) => allowedPaths.has(f.path)) + : result.files; + return { files: githubFinalFiles, totalFiles: result.totalFiles }; } else { // Local filesystem crawl. const result = await this.localCrawler.crawl({ @@ -347,7 +395,12 @@ export class IndexingPipeline { ref: ref ?? (repo.branch !== 'main' ? (repo.branch ?? undefined) : undefined) }); - return { files: result.files, totalFiles: result.totalFiles, config: result.config }; + // Apply allowedPaths filter for differential indexing. + const localFinalFiles = + allowedPaths && allowedPaths.size > 0 + ? 
result.files.filter((f) => allowedPaths.has(f.path)) + : result.files; + return { files: localFinalFiles, totalFiles: result.totalFiles, config: result.config }; } } @@ -358,6 +411,146 @@ export class IndexingPipeline { return row?.tag; } + // ------------------------------------------------------------------------- + // Private — differential clone (TRUEREF-0021) + // ------------------------------------------------------------------------- + + /** + * Clone documents, snippets, and embeddings from an ancestor version into + * the target version for all unchanged file paths. + * + * Runs in a single SQLite transaction for atomicity. + */ + private cloneFromAncestor( + ancestorVersionId: string, + targetVersionId: string, + repositoryId: string, + unchangedPaths: Set + ): void { + this.db.transaction(() => { + const pathList = [...unchangedPaths]; + const placeholders = pathList.map(() => '?').join(','); + const ancestorDocs = this.db + .prepare( + `SELECT * FROM documents WHERE version_id = ? 
AND file_path IN (${placeholders})` + ) + .all(ancestorVersionId, ...pathList) as Array<{ + id: string; + repository_id: string; + file_path: string; + title: string | null; + language: string | null; + token_count: number; + checksum: string; + indexed_at: number; + }>; + + const docIdMap = new Map(); + const nowEpoch = Math.floor(Date.now() / 1000); + + for (const doc of ancestorDocs) { + const newDocId = randomUUID(); + docIdMap.set(doc.id, newDocId); + this.db + .prepare( + `INSERT INTO documents (id, repository_id, version_id, file_path, title, language, token_count, checksum, indexed_at) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)` + ) + .run( + newDocId, + repositoryId, + targetVersionId, + doc.file_path, + doc.title, + doc.language, + doc.token_count, + doc.checksum, + nowEpoch + ); + } + + if (docIdMap.size === 0) return; + + const oldDocIds = [...docIdMap.keys()]; + const snippetPlaceholders = oldDocIds.map(() => '?').join(','); + const ancestorSnippets = this.db + .prepare( + `SELECT * FROM snippets WHERE document_id IN (${snippetPlaceholders})` + ) + .all(...oldDocIds) as Array<{ + id: string; + document_id: string; + repository_id: string; + version_id: string | null; + type: string; + title: string | null; + content: string; + language: string | null; + breadcrumb: string | null; + token_count: number; + created_at: number; + }>; + + const snippetIdMap = new Map(); + for (const snippet of ancestorSnippets) { + const newSnippetId = randomUUID(); + snippetIdMap.set(snippet.id, newSnippetId); + const newDocId = docIdMap.get(snippet.document_id)!; + this.db + .prepare( + `INSERT INTO snippets (id, document_id, repository_id, version_id, type, title, content, language, breadcrumb, token_count, created_at) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)` + ) + .run( + newSnippetId, + newDocId, + repositoryId, + targetVersionId, + snippet.type, + snippet.title, + snippet.content, + snippet.language, + snippet.breadcrumb, + snippet.token_count, + snippet.created_at + ); 
+ } + + if (snippetIdMap.size > 0) { + const oldSnippetIds = [...snippetIdMap.keys()]; + const embPlaceholders = oldSnippetIds.map(() => '?').join(','); + const ancestorEmbeddings = this.db + .prepare( + `SELECT * FROM snippet_embeddings WHERE snippet_id IN (${embPlaceholders})` + ) + .all(...oldSnippetIds) as Array<{ + snippet_id: string; + profile_id: string; + model: string; + dimensions: number; + embedding: Buffer; + created_at: number; + }>; + for (const emb of ancestorEmbeddings) { + const newSnippetId = snippetIdMap.get(emb.snippet_id)!; + this.db + .prepare( + `INSERT INTO snippet_embeddings (snippet_id, profile_id, model, dimensions, embedding, created_at) + VALUES (?, ?, ?, ?, ?, ?)` + ) + .run( + newSnippetId, + emb.profile_id, + emb.model, + emb.dimensions, + emb.embedding, + emb.created_at + ); + } + } + })(); + } + // ------------------------------------------------------------------------- // Private — atomic snippet replacement // ------------------------------------------------------------------------- diff --git a/src/lib/server/utils/git.ts b/src/lib/server/utils/git.ts index a4cc87a..cee6857 100644 --- a/src/lib/server/utils/git.ts +++ b/src/lib/server/utils/git.ts @@ -7,9 +7,10 @@ * - File extraction via `git archive` to temp directories */ -import { execSync } from 'node:child_process'; +import { execSync, execFileSync } from 'node:child_process'; import { mkdirSync, rmSync } from 'node:fs'; import { join } from 'node:path'; +import type { ChangedFile } from '../crawler/types.js'; export interface ResolveTagOptions { repoPath: string; @@ -158,3 +159,55 @@ export function cleanupTempExtraction(extractPath: string): void { ); } } + +export interface LocalChangedFileOptions { + repoPath: string; + base: string; + head: string; +} + +/** + * Get the list of files that differ between two git refs (tags, branches, commits). 
+ * + * Uses `git diff --name-status` which produces tab-separated lines in formats: + * M\tpath + * A\tpath + * D\tpath + * R85\told-path\tnew-path + * + * @returns Array of ChangedFile objects + * @throws Error when git command fails + */ +export function getChangedFilesBetweenRefs(options: LocalChangedFileOptions): ChangedFile[] { + const { repoPath, base, head } = options; + + try { + const output = execFileSync('git', ['-C', repoPath, 'diff', '--name-status', base, head], { + encoding: 'utf-8', + stdio: ['ignore', 'pipe', 'pipe'] + }).trim(); + + if (!output) return []; + + const results: ChangedFile[] = []; + for (const line of output.split('\n')) { + if (!line) continue; + const parts = line.split('\t'); + const statusCode = parts[0]; + if (statusCode === 'A') { + results.push({ path: parts[1], status: 'added' }); + } else if (statusCode === 'M') { + results.push({ path: parts[1], status: 'modified' }); + } else if (statusCode === 'D') { + results.push({ path: parts[1], status: 'removed' }); + } else if (statusCode.startsWith('R')) { + results.push({ path: parts[2], status: 'renamed', previousPath: parts[1] }); + } + } + return results; + } catch (error) { + throw new Error( + `Failed to get changed files between '${base}' and '${head}' in ${repoPath}: ${error instanceof Error ? error.message : String(error)}` + ); + } +} diff --git a/src/lib/server/utils/tag-order.test.ts b/src/lib/server/utils/tag-order.test.ts new file mode 100644 index 0000000..a424ff6 --- /dev/null +++ b/src/lib/server/utils/tag-order.test.ts @@ -0,0 +1,123 @@ +/** + * Unit tests for tag-order utilities (TRUEREF-0021). 
+ */ +import { describe, it, expect } from 'vitest'; +import { findBestAncestorVersion } from './tag-order.js'; +import { RepositoryVersion } from '$lib/server/models/repository-version.js'; + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +function makeVersion(tag: string, state: RepositoryVersion['state'] = 'indexed'): RepositoryVersion { + return new RepositoryVersion({ + id: `/facebook/react/${tag}`, + repositoryId: '/facebook/react', + tag, + title: null, + commitHash: null, + state, + totalSnippets: 0, + indexedAt: new Date(), + createdAt: new Date() + }); +} + +// --------------------------------------------------------------------------- +// findBestAncestorVersion +// --------------------------------------------------------------------------- + +describe('findBestAncestorVersion', () => { + it('returns null when candidates array is empty', () => { + expect(findBestAncestorVersion('v2.1.0', [])).toBeNull(); + }); + + it('returns null when no candidates have state === indexed', () => { + const candidates = [ + makeVersion('v1.0.0', 'pending'), + makeVersion('v1.1.0', 'indexing'), + makeVersion('v2.0.0', 'error') + ]; + expect(findBestAncestorVersion('v2.1.0', candidates)).toBeNull(); + }); + + it('returns the nearest semver predecessor from a list', () => { + const candidates = [ + makeVersion('v1.0.0'), + makeVersion('v1.1.0'), + makeVersion('v2.0.0') + ]; + const result = findBestAncestorVersion('v2.1.0', candidates); + expect(result?.tag).toBe('v2.0.0'); + }); + + it('handles v-prefix stripping correctly', () => { + const candidates = [ + makeVersion('v1.0.0'), + makeVersion('v1.5.0'), + makeVersion('v2.0.0') + ]; + const result = findBestAncestorVersion('v2.0.1', candidates); + expect(result?.tag).toBe('v2.0.0'); + }); + + it('returns null when all candidates are after current tag', () => { + const candidates = 
[makeVersion('v2.0.0')]; + expect(findBestAncestorVersion('v1.0.0', candidates)).toBeNull(); + }); + + it('returns null when all candidates equal the current tag', () => { + const candidates = [makeVersion('v1.0.0'), makeVersion('v2.0.0')]; + expect(findBestAncestorVersion('v1.0.0', candidates)).toBeNull(); + }); + + it('handles tag lists without semver format using lexicographic fallback', () => { + const candidates = [ + makeVersion('release-alpha'), + makeVersion('release-beta'), + makeVersion('release-gamma') + ]; + const result = findBestAncestorVersion('release-zeta', candidates); + expect(result).not.toBeNull(); + // Lexicographic: all are "less than" release-zeta, so the max is release-gamma + expect(result?.tag).toBe('release-gamma'); + }); + + it('returns single candidate that is older than current tag', () => { + const candidates = [makeVersion('v1.0.0')]; + const result = findBestAncestorVersion('v2.0.0', candidates); + expect(result?.tag).toBe('v1.0.0'); + }); + + it('ignores non-indexed versions even when they are valid predecessors', () => { + const candidates = [ + makeVersion('v1.0.0', 'indexed'), + makeVersion('v1.5.0', 'pending'), + makeVersion('v1.8.0', 'error') + ]; + const result = findBestAncestorVersion('v2.0.0', candidates); + expect(result?.tag).toBe('v1.0.0'); + }); + + it('correctly handles pre-release versions (pre-release < release)', () => { + const candidates = [ + makeVersion('v2.0.0-alpha'), + makeVersion('v2.0.0-beta'), + makeVersion('v1.9.0') + ]; + // v2.0.0 is target; pre-releases are stricter: v2.0.0-alpha < v2.0.0 + const result = findBestAncestorVersion('v2.0.0', candidates); + expect(result?.tag).toBe('v2.0.0-beta'); + }); + + it('selects closest minor version as predecessor', () => { + const candidates = [ + makeVersion('v1.0.0'), + makeVersion('v1.1.0'), + makeVersion('v1.2.0'), + makeVersion('v1.3.0') + ]; + const result = findBestAncestorVersion('v1.4.0', candidates); + expect(result?.tag).toBe('v1.3.0'); + }); +}); 
diff --git a/src/lib/server/utils/tag-order.ts b/src/lib/server/utils/tag-order.ts new file mode 100644 index 0000000..21bfad5 --- /dev/null +++ b/src/lib/server/utils/tag-order.ts @@ -0,0 +1,88 @@ +/** + * Tag ordering and ancestor selection for differential indexing (TRUEREF-0021). + */ +import type { RepositoryVersion } from '$lib/server/models/repository-version.js'; + +interface ParsedVersion { + major: number; + minor: number; + patch: number; + prerelease: string[]; +} + +function parseVersion(tag: string): ParsedVersion | null { + const stripped = tag.startsWith('v') ? tag.slice(1) : tag; + const dashIndex = stripped.indexOf('-'); + const versionPart = dashIndex === -1 ? stripped : stripped.slice(0, dashIndex); + const prereleaseStr = dashIndex === -1 ? '' : stripped.slice(dashIndex + 1); + + const segments = versionPart.split('.'); + if (segments.length < 1 || segments.some((s) => !/^\d+$/.test(s))) return null; + + const [majorStr, minorStr = '0', patchStr = '0'] = segments; + const major = Number(majorStr); + const minor = Number(minorStr); + const patch = Number(patchStr); + + const prerelease = prereleaseStr ? prereleaseStr.split('.') : []; + + return { major, minor, patch, prerelease }; +} + +/** + * Compare two version tags. Returns negative if a < b, positive if a > b, 0 if equal. 
+ */
+function compareTagVersions(tagA: string, tagB: string): number {
+  const a = parseVersion(tagA);
+  const b = parseVersion(tagB);
+
+  if (!a || !b) {
+    // Fall back to lexicographic comparison when semver parsing fails
+    return tagA.localeCompare(tagB);
+  }
+
+  if (a.major !== b.major) return a.major - b.major;
+  if (a.minor !== b.minor) return a.minor - b.minor;
+  if (a.patch !== b.patch) return a.patch - b.patch;
+
+  // Pre-release versions have lower precedence than the release version
+  if (a.prerelease.length === 0 && b.prerelease.length > 0) return 1;
+  if (a.prerelease.length > 0 && b.prerelease.length === 0) return -1;
+
+  // Compare pre-release segments lexicographically
+  const len = Math.max(a.prerelease.length, b.prerelease.length);
+  for (let i = 0; i < len; i++) {
+    const pa = a.prerelease[i] ?? '';
+    const pb = b.prerelease[i] ?? '';
+    if (pa !== pb) return pa.localeCompare(pb);
+  }
+
+  return 0;
+}
+
+/**
+ * Find the best ancestor version for differential indexing.
+ *
+ * Selects the most-recent indexed version whose tag sorts before `currentTag`
+ * using semver comparison. Falls back to lexicographic comparison when semver
+ * parsing fails for either tag (non-semver tags compare lexicographically).
+ *
+ * @param currentTag The tag being indexed
+ * @param candidates All versioned snapshots for this repository
+ * @returns The best indexed ancestor, or null if none qualifies
+ */
+export function findBestAncestorVersion(
+  currentTag: string,
+  candidates: RepositoryVersion[]
+): RepositoryVersion | null {
+  const indexed = candidates.filter((v) => v.state === 'indexed');
+
+  const predecessors = indexed.filter((v) => compareTagVersions(v.tag, currentTag) < 0);
+
+  if (predecessors.length === 0) return null;
+
+  // Return the one with the highest version (closest predecessor)
+  return predecessors.reduce((best, candidate) =>
+    compareTagVersions(candidate.tag, best.tag) > 0 ? candidate : best
+  );
}