/**
 * Differential indexing strategy coordinator (TRUEREF-0021).
 *
 * Determines whether differential indexing can be used for a given version tag,
 * and if so, builds a plan describing which files to clone from the ancestor
 * and which files to crawl fresh.
 */
import type Database from 'better-sqlite3';
import type { Repository } from '$lib/server/models/repository.js';
import type { RepositoryVersion } from '$lib/server/models/repository-version.js';
import { RepositoryVersionMapper } from '$lib/server/mappers/repository-version.mapper.js';
import type { RepositoryVersionEntity } from '$lib/server/models/repository-version.js';
import { findBestAncestorVersion } from '$lib/server/utils/tag-order.js';
import { fetchGitHubChangedFiles } from '$lib/server/crawler/github-compare.js';
import { getChangedFilesBetweenRefs } from '$lib/server/utils/git.js';
import type { ChangedFile } from '$lib/server/crawler/types.js';

export interface DifferentialPlan {
	/** Version ID of the closest already-indexed predecessor tag */
	ancestorVersionId: string;
	/** Ancestor tag name (needed for git diff / GitHub compare calls) */
	ancestorTag: string;
	/** File paths that changed (added + modified + renamed-destination) */
	changedPaths: Set<string>;
	/** File paths that were deleted in the target vs ancestor */
	deletedPaths: Set<string>;
	/** File paths present in ancestor that are unchanged in target — must be cloned */
	unchangedPaths: Set<string>;
}

/**
 * Builds a differential indexing plan for `targetTag`, or returns `null` when
 * the caller should fall back to a full crawl.
 *
 * `null` is returned when:
 * - no ancestor is selected by `findBestAncestorVersion` from the indexed versions,
 * - the selected ancestor has no rows in `documents`,
 * - the GitHub source URL does not contain both an owner and a repository segment,
 * - every ancestor file is reported as changed or removed (nothing to clone), or
 * - any step throws (the error is logged as a warning, never propagated).
 *
 * @param params.repo - Repository being indexed; `source`, `sourceUrl`, `id`
 *   and `githubToken` are read.
 * @param params.targetTag - Tag that is about to be indexed.
 * @param params.db - Database handle used to read `repository_versions` and `documents`.
 * @param params._fetchGitHubChangedFiles - Override for testing only.
 * @returns The plan, or `null` when differential indexing is not applicable.
 */
export async function buildDifferentialPlan(params: {
	repo: Repository;
	targetTag: string;
	db: Database.Database;
	/** Override for testing only */
	_fetchGitHubChangedFiles?: typeof fetchGitHubChangedFiles;
}): Promise<DifferentialPlan | null> {
	const { repo, targetTag, db } = params;
	const fetchFn = params._fetchGitHubChangedFiles ?? fetchGitHubChangedFiles;

	try {
		// 1. Load all indexed versions for this repository
		const rows = db
			.prepare(
				`SELECT * FROM repository_versions WHERE repository_id = ?
				 AND state = 'indexed'`
			)
			.all(repo.id) as RepositoryVersionEntity[];

		const indexedVersions: RepositoryVersion[] = rows.map((row) =>
			RepositoryVersionMapper.fromEntity(row)
		);

		// 2. Find the best ancestor version
		const ancestor = findBestAncestorVersion(targetTag, indexedVersions);
		if (!ancestor) return null;

		// 3. Load ancestor's document file paths
		const docRows = db
			.prepare(`SELECT DISTINCT file_path FROM documents WHERE version_id = ?`)
			.all(ancestor.id) as Array<{ file_path: string }>;

		const ancestorFilePaths = new Set<string>(docRows.map((r) => r.file_path));
		if (ancestorFilePaths.size === 0) return null;

		// 4. Fetch changed files between ancestor and target
		let changedFiles: ChangedFile[];
		if (repo.source === 'github') {
			// The first two non-empty path segments are taken as owner and repository.
			const [owner, repoName] = new URL(repo.sourceUrl).pathname.split('/').filter(Boolean);
			// Without both segments the compare request cannot be addressed;
			// bail out before issuing a call that is bound to fail.
			if (owner === undefined || repoName === undefined) return null;

			changedFiles = await fetchFn(
				owner,
				repoName,
				ancestor.tag,
				targetTag,
				repo.githubToken ?? undefined
			);
		} else {
			changedFiles = getChangedFilesBetweenRefs({
				repoPath: repo.sourceUrl,
				base: ancestor.tag,
				head: targetTag
			});
		}

		// 5. Partition changed files into changed and deleted sets
		const changedPaths = new Set<string>();
		const deletedPaths = new Set<string>();
		for (const file of changedFiles) {
			if (file.status === 'removed') {
				deletedPaths.add(file.path);
			} else {
				changedPaths.add(file.path);
			}
		}

		// 6. Compute unchanged paths: ancestor paths minus changed minus deleted
		const unchangedPaths = new Set<string>();
		for (const p of ancestorFilePaths) {
			if (!changedPaths.has(p) && !deletedPaths.has(p)) {
				unchangedPaths.add(p);
			}
		}

		// 7. Return null when there's nothing to clone (all files changed)
		if (unchangedPaths.size === 0) return null;

		return {
			ancestorVersionId: ancestor.id,
			ancestorTag: ancestor.tag,
			changedPaths,
			deletedPaths,
			unchangedPaths
		};
	} catch (error: unknown) {
		// Fail-safe: fall back to full crawl on any error, but leave a trace so a
		// permanently failing differential path can be diagnosed.
		console.warn(
			`[differential-strategy] Falling back to full crawl for ${repo.id}@${targetTag}:`,
			error
		);
		return null;
	}
}