feat(TRUEREF-0021): implement differential tag indexing
This commit is contained in:
committed by
Giancarmine Salucci
parent
e63279fcf6
commit
f4fe8c6043
122
src/lib/server/pipeline/differential-strategy.ts
Normal file
122
src/lib/server/pipeline/differential-strategy.ts
Normal file
@@ -0,0 +1,122 @@
|
||||
/**
 * Differential indexing strategy coordinator (TRUEREF-0021).
 *
 * Determines whether differential indexing can be used for a given version tag,
 * and if so, builds a plan describing which files to clone from the ancestor
 * and which files to crawl fresh.
 */
|
||||
import type Database from 'better-sqlite3';
|
||||
import type { Repository } from '$lib/server/models/repository.js';
|
||||
import type { RepositoryVersion } from '$lib/server/models/repository-version.js';
|
||||
import { RepositoryVersionMapper } from '$lib/server/mappers/repository-version.mapper.js';
|
||||
import type { RepositoryVersionEntity } from '$lib/server/models/repository-version.js';
|
||||
import { findBestAncestorVersion } from '$lib/server/utils/tag-order.js';
|
||||
import { fetchGitHubChangedFiles } from '$lib/server/crawler/github-compare.js';
|
||||
import { getChangedFilesBetweenRefs } from '$lib/server/utils/git.js';
|
||||
import type { ChangedFile } from '$lib/server/crawler/types.js';
|
||||
|
||||
/**
 * Result of planning a differential index run for one target tag.
 *
 * The three path sets are built by `buildDifferentialPlan` from the ancestor's
 * stored document paths and the list of files that differ between the
 * ancestor tag and the target tag.
 */
export interface DifferentialPlan {
	/** Version ID of the closest already-indexed predecessor tag. */
	ancestorVersionId: string;

	/** Ancestor tag name (needed for git diff / GitHub compare calls). */
	ancestorTag: string;

	/** File paths that changed (added + modified + renamed-destination). */
	changedPaths: Set<string>;

	/** File paths that were deleted in the target relative to the ancestor. */
	deletedPaths: Set<string>;

	/**
	 * File paths present in the ancestor that appear in neither `changedPaths`
	 * nor `deletedPaths`; these must be cloned from the ancestor.
	 */
	unchangedPaths: Set<string>;
}
|
||||
|
||||
/**
 * Builds a differential indexing plan for `targetTag`, or returns `null` when
 * the caller should fall back to a full crawl.
 *
 * Steps:
 *  1. Load every `repository_versions` row of the repository whose `state` is
 *     `'indexed'` and map it through `RepositoryVersionMapper.fromEntity`.
 *  2. Ask `findBestAncestorVersion` to pick an ancestor among those versions.
 *  3. Load the distinct `documents.file_path` values stored for that ancestor.
 *  4. Fetch the files that differ between the ancestor tag and the target tag:
 *     through the GitHub compare fetcher when `repo.source === 'github'`,
 *     otherwise through `getChangedFilesBetweenRefs` with `repo.sourceUrl` as
 *     the repository path.
 *  5. Split the result into deleted paths (`status === 'removed'`) and changed
 *     paths (every other status).
 *  6. Treat every ancestor path that is neither changed nor deleted as unchanged.
 *
 * `null` is returned when no ancestor is found, the ancestor has no documents,
 * the GitHub owner/repository cannot be derived from `repo.sourceUrl`, no
 * unchanged path remains, or any step throws.
 *
 * @param params.repo - Repository being indexed; `id`, `source`, `sourceUrl`
 *   and `githubToken` are read.
 * @param params.targetTag - Tag that is about to be indexed.
 * @param params.db - Database handle used for the two read-only queries.
 * @param params._fetchGitHubChangedFiles - Override for testing only; replaces
 *   the default GitHub compare fetcher.
 * @returns The plan, or `null` to signal that a full crawl is required.
 */
export async function buildDifferentialPlan(params: {
	repo: Repository;
	targetTag: string;
	db: Database.Database;
	/** Override for testing only */
	_fetchGitHubChangedFiles?: typeof fetchGitHubChangedFiles;
}): Promise<DifferentialPlan | null> {
	const { repo, targetTag, db } = params;
	const fetchFn = params._fetchGitHubChangedFiles ?? fetchGitHubChangedFiles;

	try {
		// 1. Load all indexed versions for this repository
		const rows = db
			.prepare(
				`SELECT * FROM repository_versions WHERE repository_id = ? AND state = 'indexed'`
			)
			.all(repo.id) as RepositoryVersionEntity[];

		const indexedVersions: RepositoryVersion[] = rows.map((row) =>
			RepositoryVersionMapper.fromEntity(row)
		);

		// 2. Find the best ancestor version
		const ancestor = findBestAncestorVersion(targetTag, indexedVersions);
		if (!ancestor) return null;

		// 3. Load ancestor's document file paths
		const docRows = db
			.prepare(`SELECT DISTINCT file_path FROM documents WHERE version_id = ?`)
			.all(ancestor.id) as Array<{ file_path: string }>;

		const ancestorFilePaths = new Set(docRows.map((r) => r.file_path));
		if (ancestorFilePaths.size === 0) return null;

		// 4. Fetch changed files between ancestor and target
		let changedFiles: ChangedFile[];

		if (repo.source === 'github') {
			const coordinates = parseGitHubCoordinates(repo.sourceUrl);
			// Without both owner and repository name the compare call cannot be
			// addressed; fall back to a full crawl instead of passing `undefined`.
			if (!coordinates) return null;

			changedFiles = await fetchFn(
				coordinates.owner,
				coordinates.repoName,
				ancestor.tag,
				targetTag,
				repo.githubToken ?? undefined
			);
		} else {
			changedFiles = getChangedFilesBetweenRefs({
				repoPath: repo.sourceUrl,
				base: ancestor.tag,
				head: targetTag
			});
		}

		// 5. Partition changed files into changed and deleted sets
		// NOTE(review): only `file.path` is recorded here. If `ChangedFile` reports a
		// rename with a separate pre-rename path, that old path would stay in
		// `unchangedPaths` below — confirm how the fetchers report renames.
		const changedPaths = new Set<string>();
		const deletedPaths = new Set<string>();

		for (const file of changedFiles) {
			if (file.status === 'removed') {
				deletedPaths.add(file.path);
			} else {
				changedPaths.add(file.path);
			}
		}

		// 6. Compute unchanged paths: ancestor paths minus changed minus deleted
		const unchangedPaths = new Set<string>();
		for (const p of ancestorFilePaths) {
			if (!changedPaths.has(p) && !deletedPaths.has(p)) {
				unchangedPaths.add(p);
			}
		}

		// 7. Return null when there's nothing to clone (all files changed)
		if (unchangedPaths.size === 0) return null;

		return {
			ancestorVersionId: ancestor.id,
			ancestorTag: ancestor.tag,
			changedPaths,
			deletedPaths,
			unchangedPaths
		};
	} catch (error: unknown) {
		// Fail-safe: fall back to full crawl on any error, but leave a trace so a
		// persistent failure (bad token, unreachable repository, ...) is visible.
		const reason = error instanceof Error ? error.message : String(error);
		console.warn(
			`[differential-strategy] Falling back to full crawl for ${repo.id}@${targetTag}: ${reason}`
		);
		return null;
	}
}

/**
 * Extracts the first two path segments of a repository URL as owner and
 * repository name. Returns `null` when the path has fewer than two segments.
 * Throws (from `new URL`) when `sourceUrl` is not a parseable absolute URL.
 */
function parseGitHubCoordinates(sourceUrl: string): { owner: string; repoName: string } | null {
	const [owner, repoName] = new URL(sourceUrl).pathname.split('/').filter(Boolean);
	if (owner === undefined || repoName === undefined) return null;
	return { owner, repoName };
}
|
||||
Reference in New Issue
Block a user