// Source: trueref/src/lib/server/pipeline/differential-strategy.ts (TypeScript, 123 lines, 4.0 KiB)

/**
* Differential indexing strategy coordinator (TRUEREF-0021).
*
* Determines whether differential indexing can be used for a given version tag,
* and if so, builds a plan describing which files to clone from the ancestor
* and which files to crawl fresh.
*/
import type Database from 'better-sqlite3';
import type { Repository } from '$lib/server/models/repository.js';
import type { RepositoryVersion } from '$lib/server/models/repository-version.js';
import { RepositoryVersionMapper } from '$lib/server/mappers/repository-version.mapper.js';
import type { RepositoryVersionEntity } from '$lib/server/models/repository-version.js';
import { findBestAncestorVersion } from '$lib/server/utils/tag-order.js';
import { fetchGitHubChangedFiles } from '$lib/server/crawler/github-compare.js';
import { getChangedFilesBetweenRefs } from '$lib/server/utils/git.js';
import type { ChangedFile } from '$lib/server/crawler/types.js';
/**
 * Plan produced by differential indexing: identifies the ancestor version to
 * build on and classifies file paths as changed, deleted, or unchanged
 * relative to that ancestor.
 */
export interface DifferentialPlan {
  /** ID of the closest predecessor tag that has already been indexed. */
  ancestorVersionId: string;

  /** Tag name of that ancestor; required for git diff / GitHub compare calls. */
  ancestorTag: string;

  /** Paths that changed: added, modified, or the destination of a rename. */
  changedPaths: Set<string>;

  /** Paths that exist in the ancestor but were deleted in the target. */
  deletedPaths: Set<string>;

  /** Ancestor paths left untouched in the target; these must be cloned. */
  unchangedPaths: Set<string>;
}
/**
 * Builds a differential indexing plan for `targetTag`, or resolves to `null`
 * when differential indexing cannot be used and the caller should fall back
 * to a full crawl.
 *
 * Steps:
 *  1. Load every `repository_versions` row of this repository whose state is
 *     `'indexed'`.
 *  2. Pick an ancestor among them via `findBestAncestorVersion`.
 *  3. Load the distinct document file paths recorded for that ancestor.
 *  4. Obtain the changed files between the ancestor tag and the target tag
 *     (GitHub compare for `github` sources, `getChangedFilesBetweenRefs`
 *     otherwise).
 *  5. Split them into changed / deleted sets and derive the unchanged set.
 *
 * Resolves to `null` when: no ancestor is found, the ancestor has no
 * documents, the GitHub owner/repository cannot be derived from
 * `repo.sourceUrl`, no ancestor path is left unchanged, or any step throws.
 *
 * @param params.repo Repository being indexed.
 * @param params.targetTag Version tag that is about to be indexed.
 * @param params.db Database handle used to read versions and documents.
 * @param params._fetchGitHubChangedFiles Test-only override for the GitHub
 *   compare fetcher.
 * @returns The differential plan, or `null` to request a full crawl.
 */
export async function buildDifferentialPlan(params: {
  repo: Repository;
  targetTag: string;
  db: Database.Database;
  /** Override for testing only */
  _fetchGitHubChangedFiles?: typeof fetchGitHubChangedFiles;
}): Promise<DifferentialPlan | null> {
  const { repo, targetTag, db } = params;
  const fetchFn = params._fetchGitHubChangedFiles ?? fetchGitHubChangedFiles;

  try {
    // 1. Load all indexed versions for this repository
    const rows = db
      .prepare(
        `SELECT * FROM repository_versions WHERE repository_id = ? AND state = 'indexed'`
      )
      .all(repo.id) as RepositoryVersionEntity[];
    const indexedVersions: RepositoryVersion[] = rows.map((row) =>
      RepositoryVersionMapper.fromEntity(row)
    );

    // 2. Find the best ancestor version
    const ancestor = findBestAncestorVersion(targetTag, indexedVersions);
    if (!ancestor) return null;

    // 3. Load ancestor's document file paths
    const docRows = db
      .prepare(`SELECT DISTINCT file_path FROM documents WHERE version_id = ?`)
      .all(ancestor.id) as Array<{ file_path: string }>;
    const ancestorFilePaths = new Set(docRows.map((r) => r.file_path));
    if (ancestorFilePaths.size === 0) return null;

    // 4. Fetch changed files between ancestor and target
    let changedFiles: ChangedFile[];
    if (repo.source === 'github') {
      // Without a valid owner/repo pair the compare call cannot succeed, so
      // skip the request and fall back to a full crawl.
      const slug = parseGitHubRepoSlug(repo.sourceUrl);
      if (!slug) return null;
      changedFiles = await fetchFn(
        slug.owner,
        slug.repoName,
        ancestor.tag,
        targetTag,
        repo.githubToken ?? undefined
      );
    } else {
      changedFiles = getChangedFilesBetweenRefs({
        repoPath: repo.sourceUrl,
        base: ancestor.tag,
        head: targetTag
      });
    }

    // 5. Partition changed files into changed and deleted sets
    const changedPaths = new Set<string>();
    const deletedPaths = new Set<string>();
    for (const file of changedFiles) {
      if (file.status === 'removed') {
        deletedPaths.add(file.path);
      } else {
        changedPaths.add(file.path);
      }
    }

    // 6. Compute unchanged paths: ancestor paths minus changed minus deleted
    const unchangedPaths = new Set<string>();
    for (const p of ancestorFilePaths) {
      if (!changedPaths.has(p) && !deletedPaths.has(p)) {
        unchangedPaths.add(p);
      }
    }

    // 7. Return null when there's nothing to clone (all files changed)
    if (unchangedPaths.size === 0) return null;

    return {
      ancestorVersionId: ancestor.id,
      ancestorTag: ancestor.tag,
      changedPaths,
      deletedPaths,
      unchangedPaths
    };
  } catch {
    // Fail-safe: fall back to full crawl on any error
    return null;
  }
}

/**
 * Extracts the owner and repository name from a GitHub repository URL.
 *
 * Uses the first two non-empty path segments and strips a trailing `.git`
 * from the repository segment (clone-style URLs). Returns `null` when either
 * part is missing or empty. Throws if `sourceUrl` is not a parseable URL.
 */
function parseGitHubRepoSlug(
  sourceUrl: string
): { owner: string; repoName: string } | null {
  const [owner, rawRepoName] = new URL(sourceUrl).pathname.split('/').filter(Boolean);
  if (!owner || !rawRepoName) return null;

  const repoName = rawRepoName.replace(/\.git$/, '');
  if (!repoName) return null;

  return { owner, repoName };
}