feat(TRUEREF-0021): implement differential tag indexing

U811073
2026-03-30 13:12:50 +02:00
committed by Giancarmine Salucci
parent e63279fcf6
commit f4fe8c6043
10 changed files with 1281 additions and 9 deletions


@@ -26,6 +26,7 @@ import { resolveConfig, type ParsedConfig } from '$lib/server/config/config-pars
import { parseFile } from '$lib/server/parser/index.js';
import { computeTrustScore } from '$lib/server/search/trust-score.js';
import { computeDiff } from './diff.js';
import { buildDifferentialPlan, type DifferentialPlan } from './differential-strategy.js';
// ---------------------------------------------------------------------------
// Progress calculation
@@ -95,11 +96,44 @@ export class IndexingPipeline {
this.updateVersion(normJob.versionId, { state: 'indexing' });
}
const versionTag = normJob.versionId
? this.getVersionTag(normJob.versionId)
: undefined;
// ---- Stage 0: Differential strategy (TRUEREF-0021) ----------------------
// When indexing a tagged version, check if we can inherit unchanged files
// from an already-indexed ancestor version instead of crawling everything.
let differentialPlan: DifferentialPlan | null = null;
if (normJob.versionId && versionTag) {
differentialPlan = await buildDifferentialPlan({
repo,
targetTag: versionTag,
db: this.db
}).catch((err) => {
console.warn(
`[IndexingPipeline] Differential plan failed, falling back to full crawl: ${err instanceof Error ? err.message : String(err)}`
);
return null;
});
}
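// Shape of the plan consumed below, as inferred from this call site (the
// authoritative type lives in differential-strategy.ts). Paths are assumed
// to be repo-relative and normalized the same way the crawlers emit them:
//
//   interface DifferentialPlan {
//     ancestorVersionId: string;   // already-indexed version to clone from
//     ancestorTag: string;         // its tag, used for logging
//     unchangedPaths: Set<string>; // files identical between the two tags
//     changedPaths: Set<string>;   // files that must be crawled and re-processed
//   }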
// If a differential plan exists, clone unchanged files from ancestor.
if (differentialPlan && differentialPlan.unchangedPaths.size > 0) {
this.cloneFromAncestor(
differentialPlan.ancestorVersionId,
normJob.versionId!,
repo.id,
differentialPlan.unchangedPaths
);
console.info(
`[IndexingPipeline] Differential indexing: cloned ${differentialPlan.unchangedPaths.size} unchanged files from ${differentialPlan.ancestorTag}`
);
}
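// E.g. (hypothetical numbers): for a patch release where 950 of 1000 files are
// unchanged, the clone above covers 950 files and only the remaining 50 flow
// through the crawl/parse/embedding stages below.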
// ---- Stage 1: Crawl -------------------------------------------------
// Pass changedPaths as an allowlist so the crawl only fetches/returns changed files.
const crawlAllowedPaths = differentialPlan ? differentialPlan.changedPaths : undefined;
const crawlResult = await this.crawl(repo, versionTag, crawlAllowedPaths);
// Resolve trueref.json / context7.json configuration.
// Prefer the pre-parsed config carried in the CrawlResult (set by
@@ -137,7 +171,16 @@ export class IndexingPipeline {
// Load all existing documents for this repo so computeDiff can
// classify every crawled file and detect deletions.
const existingDocs = this.getExistingDocuments(repo.id, normJob.versionId);
// Exclude files that were cloned from the ancestor — they are not candidates
// for deletion or re-processing (computeDiff must not see them in existingDocs).
const clonedPaths = differentialPlan?.unchangedPaths ?? new Set<string>();
const existingDocsForDiff =
clonedPaths.size > 0
? existingDocs.filter((d) => !clonedPaths.has(d.filePath))
: existingDocs;
const diff = computeDiff(filteredFiles, existingDocsForDiff);
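// Example (hypothetical tags): indexing v1.2.0 against ancestor v1.1.0 where
// only docs/api.md changed means filteredFiles contains just docs/api.md; a
// cloned path left in existingDocs would never appear in the crawl output, so
// computeDiff would wrongly schedule it for deletion.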
// Accumulate new documents/snippets; skip unchanged files.
const newDocuments: NewDocument[] = [];
@@ -146,11 +189,11 @@ export class IndexingPipeline {
// Schedule stale documents (modified + deleted) for deletion.
for (const file of diff.modified) {
const existing = existingDocsForDiff.find((d) => d.filePath === file.path);
if (existing) changedDocIds.push(existing.id);
}
for (const filePath of diff.deleted) {
const existing = existingDocsForDiff.find((d) => d.filePath === filePath);
if (existing) changedDocIds.push(existing.id);
}
@@ -316,7 +359,7 @@ export class IndexingPipeline {
// Private — crawl
// -------------------------------------------------------------------------
private async crawl(repo: Repository, ref?: string, allowedPaths?: Set<string>): Promise<{
files: Array<{ path: string; content: string; sha: string; size: number; language: string }>;
totalFiles: number;
/** Pre-parsed trueref.json / context7.json, or undefined when absent. */
@@ -339,7 +382,12 @@ export class IndexingPipeline {
token: repo.githubToken ?? undefined
});
// Apply allowedPaths filter for differential indexing.
const githubFinalFiles =
allowedPaths && allowedPaths.size > 0
? result.files.filter((f) => allowedPaths.has(f.path))
: result.files;
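// Note: totalFiles still reports the unfiltered crawl count, not the size of
// the allowlisted subset.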
return { files: githubFinalFiles, totalFiles: result.totalFiles };
} else {
// Local filesystem crawl.
const result = await this.localCrawler.crawl({
@@ -347,7 +395,12 @@ export class IndexingPipeline {
ref: ref ?? (repo.branch !== 'main' ? (repo.branch ?? undefined) : undefined)
});
// Apply allowedPaths filter for differential indexing.
const localFinalFiles =
allowedPaths && allowedPaths.size > 0
? result.files.filter((f) => allowedPaths.has(f.path))
: result.files;
return { files: localFinalFiles, totalFiles: result.totalFiles, config: result.config };
}
}
@@ -358,6 +411,146 @@ export class IndexingPipeline {
return row?.tag;
}
// -------------------------------------------------------------------------
// Private — differential clone (TRUEREF-0021)
// -------------------------------------------------------------------------
/**
* Clone documents, snippets, and embeddings from an ancestor version into
* the target version for all unchanged file paths.
*
* Runs in a single SQLite transaction for atomicity.
*/
private cloneFromAncestor(
ancestorVersionId: string,
targetVersionId: string,
repositoryId: string,
unchangedPaths: Set<string>
): void {
this.db.transaction(() => {
const pathList = [...unchangedPaths];
const placeholders = pathList.map(() => '?').join(',');
const ancestorDocs = this.db
.prepare(
`SELECT * FROM documents WHERE version_id = ? AND file_path IN (${placeholders})`
)
.all(ancestorVersionId, ...pathList) as Array<{
id: string;
repository_id: string;
file_path: string;
title: string | null;
language: string | null;
token_count: number;
checksum: string;
indexed_at: number;
}>;
const docIdMap = new Map<string, string>();
const nowEpoch = Math.floor(Date.now() / 1000);
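// Re-insert each ancestor document under the target version, recording the
// old-to-new id mapping so snippets can be re-keyed below.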
for (const doc of ancestorDocs) {
const newDocId = randomUUID();
docIdMap.set(doc.id, newDocId);
this.db
.prepare(
`INSERT INTO documents (id, repository_id, version_id, file_path, title, language, token_count, checksum, indexed_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`
)
.run(
newDocId,
repositoryId,
targetVersionId,
doc.file_path,
doc.title,
doc.language,
doc.token_count,
doc.checksum,
nowEpoch
);
}
if (docIdMap.size === 0) return;
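// Clone snippets, re-keying document_id to the freshly inserted documents.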
const oldDocIds = [...docIdMap.keys()];
const snippetPlaceholders = oldDocIds.map(() => '?').join(',');
const ancestorSnippets = this.db
.prepare(
`SELECT * FROM snippets WHERE document_id IN (${snippetPlaceholders})`
)
.all(...oldDocIds) as Array<{
id: string;
document_id: string;
repository_id: string;
version_id: string | null;
type: string;
title: string | null;
content: string;
language: string | null;
breadcrumb: string | null;
token_count: number;
created_at: number;
}>;
const snippetIdMap = new Map<string, string>();
for (const snippet of ancestorSnippets) {
const newSnippetId = randomUUID();
snippetIdMap.set(snippet.id, newSnippetId);
const newDocId = docIdMap.get(snippet.document_id)!;
this.db
.prepare(
`INSERT INTO snippets (id, document_id, repository_id, version_id, type, title, content, language, breadcrumb, token_count, created_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
)
.run(
newSnippetId,
newDocId,
repositoryId,
targetVersionId,
snippet.type,
snippet.title,
snippet.content,
snippet.language,
snippet.breadcrumb,
snippet.token_count,
snippet.created_at
);
}
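// Clone embedding rows verbatim (re-keyed to the new snippet ids) so
// unchanged snippets never need re-embedding.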
if (snippetIdMap.size > 0) {
const oldSnippetIds = [...snippetIdMap.keys()];
const embPlaceholders = oldSnippetIds.map(() => '?').join(',');
const ancestorEmbeddings = this.db
.prepare(
`SELECT * FROM snippet_embeddings WHERE snippet_id IN (${embPlaceholders})`
)
.all(...oldSnippetIds) as Array<{
snippet_id: string;
profile_id: string;
model: string;
dimensions: number;
embedding: Buffer;
created_at: number;
}>;
for (const emb of ancestorEmbeddings) {
const newSnippetId = snippetIdMap.get(emb.snippet_id)!;
this.db
.prepare(
`INSERT INTO snippet_embeddings (snippet_id, profile_id, model, dimensions, embedding, created_at)
VALUES (?, ?, ?, ?, ?, ?)`
)
.run(
newSnippetId,
emb.profile_id,
emb.model,
emb.dimensions,
emb.embedding,
emb.created_at
);
}
}
})();
}
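// One caveat: SQLite caps bound parameters per statement (999 in builds before
// 3.32.0, 32766 since), so a very large unchangedPaths set could overflow the
// IN (...) lists above. A minimal chunking sketch under that assumption (the
// helper is hypothetical, not part of this commit):
//
//   function* chunk<T>(items: readonly T[], size = 500): Generator<T[]> {
//     for (let i = 0; i < items.length; i += size) {
//       yield items.slice(i, i + size);
//     }
//   }
//
// Each SELECT/INSERT pass above would then iterate over chunk(pathList),
// chunk(oldDocIds), and chunk(oldSnippetIds) respectively.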
// -------------------------------------------------------------------------
// Private — atomic snippet replacement
// -------------------------------------------------------------------------