feat(TRUEREF-0021): implement differential tag indexing

This commit is contained in:
U811073
2026-03-30 13:12:50 +02:00
committed by Giancarmine Salucci
parent e63279fcf6
commit f4fe8c6043
10 changed files with 1281 additions and 9 deletions

View File

@@ -0,0 +1,122 @@
/**
* Differential indexing strategy coordinator (TRUEREF-0021).
*
* Determines whether differential indexing can be used for a given version tag,
* and if so, builds a plan describing which files to clone from the ancestor
* and which files to crawl fresh.
*/
import type Database from 'better-sqlite3';
import type { Repository } from '$lib/server/models/repository.js';
import type { RepositoryVersion } from '$lib/server/models/repository-version.js';
import { RepositoryVersionMapper } from '$lib/server/mappers/repository-version.mapper.js';
import type { RepositoryVersionEntity } from '$lib/server/models/repository-version.js';
import { findBestAncestorVersion } from '$lib/server/utils/tag-order.js';
import { fetchGitHubChangedFiles } from '$lib/server/crawler/github-compare.js';
import { getChangedFilesBetweenRefs } from '$lib/server/utils/git.js';
import type { ChangedFile } from '$lib/server/crawler/types.js';
/**
 * Result of the differential-indexing decision for one target tag: the
 * ancestor version to inherit from and a three-way partition of file paths
 * (changed → crawl fresh, deleted → drop, unchanged → clone from ancestor).
 */
export interface DifferentialPlan {
/** Version ID of the closest already-indexed predecessor tag */
ancestorVersionId: string;
/** Ancestor tag name (needed for git diff / GitHub compare calls) */
ancestorTag: string;
/** File paths that changed (added + modified + renamed-destination) */
changedPaths: Set<string>;
/** File paths that were deleted in the target vs ancestor */
deletedPaths: Set<string>;
/** File paths present in ancestor that are unchanged in target — must be cloned */
unchangedPaths: Set<string>;
}
/**
 * Build a differential-indexing plan for `targetTag`, or return `null` when
 * differential indexing is not applicable (no indexed ancestor, ancestor has
 * no documents, nothing unchanged to clone, or any error occurred).
 *
 * Never throws: every failure path resolves to `null` so callers can always
 * fall back to a full crawl.
 */
export async function buildDifferentialPlan(params: {
  repo: Repository;
  targetTag: string;
  db: Database.Database;
  /** Override for testing only */
  _fetchGitHubChangedFiles?: typeof fetchGitHubChangedFiles;
}): Promise<DifferentialPlan | null> {
  const { repo, targetTag, db } = params;
  const fetchFn = params._fetchGitHubChangedFiles ?? fetchGitHubChangedFiles;
  try {
    // 1. Load all indexed versions for this repository.
    const rows = db
      .prepare(
        `SELECT * FROM repository_versions WHERE repository_id = ? AND state = 'indexed'`
      )
      .all(repo.id) as RepositoryVersionEntity[];
    const indexedVersions: RepositoryVersion[] = rows.map((row) =>
      RepositoryVersionMapper.fromEntity(row)
    );
    // 2. Find the best ancestor version. Without one, there is nothing to
    //    diff against and the caller must do a full crawl.
    const ancestor = findBestAncestorVersion(targetTag, indexedVersions);
    if (!ancestor) return null;
    // 3. Load the ancestor's document file paths. An ancestor with zero
    //    documents gives us nothing to clone, so bail out early.
    const docRows = db
      .prepare(`SELECT DISTINCT file_path FROM documents WHERE version_id = ?`)
      .all(ancestor.id) as Array<{ file_path: string }>;
    const ancestorFilePaths = new Set(docRows.map((r) => r.file_path));
    if (ancestorFilePaths.size === 0) return null;
    // 4. Fetch changed files between ancestor and target.
    let changedFiles: ChangedFile[];
    if (repo.source === 'github') {
      const url = new URL(repo.sourceUrl);
      const parts = url.pathname.split('/').filter(Boolean);
      const owner = parts[0];
      const repoName = parts[1];
      // Guard against malformed GitHub URLs (missing owner/repo segments)
      // instead of calling the compare API with undefined arguments.
      if (!owner || !repoName) {
        console.warn(
          `[differential-strategy] Cannot parse owner/repo from source URL: ${repo.sourceUrl}`
        );
        return null;
      }
      changedFiles = await fetchFn(
        owner,
        repoName,
        ancestor.tag,
        targetTag,
        repo.githubToken ?? undefined
      );
    } else {
      changedFiles = getChangedFilesBetweenRefs({
        repoPath: repo.sourceUrl,
        base: ancestor.tag,
        head: targetTag
      });
    }
    // 5. Partition changed files into changed and deleted sets.
    const changedPaths = new Set<string>();
    const deletedPaths = new Set<string>();
    for (const file of changedFiles) {
      if (file.status === 'removed') {
        deletedPaths.add(file.path);
      } else {
        changedPaths.add(file.path);
      }
    }
    // 6. Compute unchanged paths: ancestor paths minus changed minus deleted.
    const unchangedPaths = new Set<string>();
    for (const p of ancestorFilePaths) {
      if (!changedPaths.has(p) && !deletedPaths.has(p)) {
        unchangedPaths.add(p);
      }
    }
    // 7. Return null when there's nothing to clone (all files changed) —
    //    a plan with an empty clone set is no better than a full crawl.
    if (unchangedPaths.size === 0) return null;
    return {
      ancestorVersionId: ancestor.id,
      ancestorTag: ancestor.tag,
      changedPaths,
      deletedPaths,
      unchangedPaths
    };
  } catch (err: unknown) {
    // Fail-safe: fall back to full crawl on any error — but leave a trace so
    // a persistently failing differential path is diagnosable, instead of
    // silently degrading every indexing run to a full crawl.
    console.warn(
      `[differential-strategy] Falling back to full crawl: ${err instanceof Error ? err.message : String(err)}`
    );
    return null;
  }
}

View File

@@ -13,6 +13,7 @@ import { JobQueue } from './job-queue.js';
import { IndexingPipeline } from './indexing.pipeline.js';
import { recoverStaleJobs } from './startup.js';
import { EmbeddingService } from '$lib/server/embeddings/embedding.service.js';
import * as diffStrategy from './differential-strategy.js';
// ---------------------------------------------------------------------------
// Test DB factory
@@ -1019,3 +1020,290 @@ describe('IndexingPipeline', () => {
expect(rules).toEqual(['v3: use the streaming API.']);
});
});
// ---------------------------------------------------------------------------
// differential indexing
// ---------------------------------------------------------------------------
// Test suite for TRUEREF-0021 differential indexing: the private
// cloneFromAncestor helper plus end-to-end pipeline runs with and without a
// differential plan. Relies on file-level fixtures (createTestDb, insertRepo,
// insertVersion, insertJob — defined earlier in this file, outside this view).
describe('differential indexing', () => {
let db: Database.Database;
beforeEach(() => {
// Fresh in-memory DB and a local-source repository per test.
db = createTestDb();
insertRepo(db, { source: 'local', source_url: '/tmp/test-repo' });
});
// Fixture: insert a row into `documents`, returning its id. All columns have
// defaults; pass overrides to pin repository_id / version_id / file_path /
// checksum for a specific scenario.
function insertDocument(
localDb: Database.Database,
overrides: Partial<Record<string, unknown>> = {}
): string {
const id = crypto.randomUUID();
localDb
.prepare(
`INSERT INTO documents (id, repository_id, version_id, file_path, title, language, token_count, checksum, indexed_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`
)
.run(
(overrides.id as string) ?? id,
(overrides.repository_id as string) ?? '/test/repo',
(overrides.version_id as string | null) ?? null,
(overrides.file_path as string) ?? 'README.md',
null,
'markdown',
100,
(overrides.checksum as string) ?? 'abc123',
Math.floor(Date.now() / 1000)
);
return (overrides.id as string) ?? id;
}
// Fixture: insert a row into `snippets` attached to `documentId`,
// returning the snippet id.
function insertSnippet(
localDb: Database.Database,
documentId: string,
overrides: Partial<Record<string, unknown>> = {}
): string {
const id = crypto.randomUUID();
localDb
.prepare(
`INSERT INTO snippets (id, document_id, repository_id, version_id, type, title, content, language, breadcrumb, token_count, created_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
)
.run(
(overrides.id as string) ?? id,
documentId,
(overrides.repository_id as string) ?? '/test/repo',
(overrides.version_id as string | null) ?? null,
'info',
null,
'content',
'markdown',
null,
10,
Math.floor(Date.now() / 1000)
);
return (overrides.id as string) ?? id;
}
// Type-level escape hatch: exposes the private cloneFromAncestor method so
// tests can call it directly without widening the class's public API.
type PipelineInternals = IndexingPipeline & {
cloneFromAncestor: (
ancestorVersionId: string,
targetVersionId: string,
repositoryId: string,
unchangedPaths: Set<string>
) => void;
};
it('cloneFromAncestor inserts documents and snippets into the target version', () => {
// Ancestor v1.0.0 is indexed with two docs (each with one snippet);
// target v1.1.0 is pending.
const ancestorVersionId = insertVersion(db, { tag: 'v1.0.0', state: 'indexed' });
const targetVersionId = insertVersion(db, { tag: 'v1.1.0', state: 'pending' });
const doc1Id = insertDocument(db, {
repository_id: '/test/repo',
version_id: ancestorVersionId,
file_path: 'README.md',
checksum: 'sha-readme'
});
const doc2Id = insertDocument(db, {
repository_id: '/test/repo',
version_id: ancestorVersionId,
file_path: 'src/index.ts',
checksum: 'sha-index'
});
insertSnippet(db, doc1Id, { repository_id: '/test/repo', version_id: ancestorVersionId });
insertSnippet(db, doc2Id, { repository_id: '/test/repo', version_id: ancestorVersionId });
// Collaborators are irrelevant here — only the DB is exercised.
const pipeline = new IndexingPipeline(
db,
vi.fn() as never,
{ crawl: vi.fn() } as never,
null
);
(pipeline as unknown as PipelineInternals).cloneFromAncestor(
ancestorVersionId,
targetVersionId,
'/test/repo',
new Set(['README.md', 'src/index.ts'])
);
// Both docs must now also exist under the target version.
const targetDocs = db
.prepare(`SELECT * FROM documents WHERE version_id = ?`)
.all(targetVersionId) as { id: string; file_path: string }[];
expect(targetDocs).toHaveLength(2);
expect(targetDocs.map((d) => d.file_path).sort()).toEqual(
['README.md', 'src/index.ts'].sort()
);
// New IDs must differ from ancestor doc IDs.
const targetDocIds = targetDocs.map((d) => d.id);
expect(targetDocIds).not.toContain(doc1Id);
expect(targetDocIds).not.toContain(doc2Id);
// Snippets were cloned alongside their documents.
const targetSnippets = db
.prepare(`SELECT * FROM snippets WHERE version_id = ?`)
.all(targetVersionId) as { id: string }[];
expect(targetSnippets).toHaveLength(2);
});
it('cloneFromAncestor silently skips paths absent from the ancestor', () => {
const ancestorVersionId = insertVersion(db, { tag: 'v1.0.0', state: 'indexed' });
const targetVersionId = insertVersion(db, { tag: 'v1.1.0', state: 'pending' });
insertDocument(db, {
repository_id: '/test/repo',
version_id: ancestorVersionId,
file_path: 'src/main.ts',
checksum: 'sha-main'
});
const pipeline = new IndexingPipeline(
db,
vi.fn() as never,
{ crawl: vi.fn() } as never,
null
);
// 'MISSING.md' has no ancestor document — it must be ignored, not fail.
(pipeline as unknown as PipelineInternals).cloneFromAncestor(
ancestorVersionId,
targetVersionId,
'/test/repo',
new Set(['src/main.ts', 'MISSING.md'])
);
const targetDocs = db
.prepare(`SELECT * FROM documents WHERE version_id = ?`)
.all(targetVersionId) as { id: string; file_path: string }[];
expect(targetDocs).toHaveLength(1);
expect(targetDocs[0].file_path).toBe('src/main.ts');
});
it('falls back to full crawl when no indexed ancestor exists', async () => {
// Only one (pending) version exists, so buildDifferentialPlan can find no
// indexed ancestor and the pipeline must index everything the crawl returns.
const targetVersionId = insertVersion(db, { tag: 'v1.0.0', state: 'pending' });
const files = [
{
path: 'README.md',
content: '# Hello\n\nThis is documentation.',
sha: 'sha-readme',
language: 'markdown'
},
{
path: 'src/index.ts',
content: 'export const x = 1;',
sha: 'sha-index',
language: 'typescript'
}
];
const mockLocalCrawl = vi.fn().mockResolvedValue({
files,
totalFiles: 2,
skippedFiles: 0,
branch: 'main',
commitSha: 'abc'
});
const pipeline = new IndexingPipeline(
db,
vi.fn() as never,
{ crawl: mockLocalCrawl } as never,
null
);
const jobId = insertJob(db, {
repository_id: '/test/repo',
version_id: targetVersionId,
status: 'queued'
});
const job = db.prepare(`SELECT * FROM indexing_jobs WHERE id = ?`).get(jobId) as never;
await pipeline.run(job);
const updatedJob = db
.prepare(`SELECT status FROM indexing_jobs WHERE id = ?`)
.get(jobId) as { status: string };
expect(updatedJob.status).toBe('done');
// Both crawled files must have been indexed under the target version.
const docs = db
.prepare(`SELECT * FROM documents WHERE version_id = ?`)
.all(targetVersionId) as { id: string }[];
expect(docs.length).toBeGreaterThanOrEqual(2);
});
it('cloned unchanged documents survive the diff/replace stage', async () => {
// Regression test: the pipeline's diff stage must not classify cloned
// (unchanged) documents as deleted just because the differential crawl
// never returned them.
// 1. Set up ancestor and target versions.
const ancestorVersionId = insertVersion(db, { tag: 'v1.0.0', state: 'indexed' });
const targetVersionId = insertVersion(db, { tag: 'v1.1.0', state: 'pending' });
// 2. Insert ancestor doc + snippet for unchanged.md.
const ancestorDocId = insertDocument(db, {
repository_id: '/test/repo',
version_id: ancestorVersionId,
file_path: 'unchanged.md',
checksum: 'sha-unchanged'
});
insertSnippet(db, ancestorDocId, {
repository_id: '/test/repo',
version_id: ancestorVersionId
});
// 3. Crawl returns ONLY changed.md (unchanged.md is absent — differential only).
const mockLocalCrawl = vi.fn().mockResolvedValue({
files: [
{
path: 'changed.md',
content: '# Changed\n\nThis file was added.',
sha: 'sha-changed',
language: 'markdown'
}
],
totalFiles: 1,
skippedFiles: 0,
branch: 'main',
commitSha: 'abc'
});
// 4. Mock buildDifferentialPlan to return a plan with the two paths.
const mockPlan = {
ancestorVersionId,
ancestorTag: 'v1.0.0',
changedPaths: new Set(['changed.md']),
deletedPaths: new Set<string>(),
unchangedPaths: new Set(['unchanged.md'])
};
// NOTE(review): spying on a namespace import works here because the
// pipeline imports from the same module record vitest patches — confirm if
// the build ever switches to a bundler that freezes module namespaces.
const spy = vi
.spyOn(diffStrategy, 'buildDifferentialPlan')
.mockResolvedValueOnce(mockPlan);
const pipeline = new IndexingPipeline(
db,
vi.fn() as never,
{ crawl: mockLocalCrawl } as never,
null
);
// 5. Run pipeline for the target version job.
const jobId = insertJob(db, {
repository_id: '/test/repo',
version_id: targetVersionId,
status: 'queued'
});
const job = db.prepare(`SELECT * FROM indexing_jobs WHERE id = ?`).get(jobId) as never;
await pipeline.run(job);
spy.mockRestore();
// 6. Assert job completed and both docs exist under the target version.
const finalJob = db
.prepare(`SELECT status FROM indexing_jobs WHERE id = ?`)
.get(jobId) as { status: string };
expect(finalJob.status).toBe('done');
const targetDocs = db
.prepare(`SELECT file_path FROM documents WHERE version_id = ?`)
.all(targetVersionId) as { file_path: string }[];
const filePaths = targetDocs.map((d) => d.file_path);
// unchanged.md was cloned and must NOT have been deleted by computeDiff.
expect(filePaths).toContain('unchanged.md');
// changed.md was crawled and indexed in this run.
expect(filePaths).toContain('changed.md');
});
});

View File

@@ -26,6 +26,7 @@ import { resolveConfig, type ParsedConfig } from '$lib/server/config/config-pars
import { parseFile } from '$lib/server/parser/index.js';
import { computeTrustScore } from '$lib/server/search/trust-score.js';
import { computeDiff } from './diff.js';
import { buildDifferentialPlan, type DifferentialPlan } from './differential-strategy.js';
// ---------------------------------------------------------------------------
// Progress calculation
@@ -95,11 +96,44 @@ export class IndexingPipeline {
this.updateVersion(normJob.versionId, { state: 'indexing' });
}
// ---- Stage 1: Crawl -------------------------------------------------
const versionTag = normJob.versionId
? this.getVersionTag(normJob.versionId)
: undefined;
const crawlResult = await this.crawl(repo, versionTag);
// ---- Stage 0: Differential strategy (TRUEREF-0021) ----------------------
// When indexing a tagged version, check if we can inherit unchanged files
// from an already-indexed ancestor version instead of crawling everything.
let differentialPlan: DifferentialPlan | null = null;
if (normJob.versionId && versionTag) {
differentialPlan = await buildDifferentialPlan({
repo,
targetTag: versionTag,
db: this.db
}).catch((err) => {
console.warn(
`[IndexingPipeline] Differential plan failed, falling back to full crawl: ${err instanceof Error ? err.message : String(err)}`
);
return null;
});
}
// If a differential plan exists, clone unchanged files from ancestor.
if (differentialPlan && differentialPlan.unchangedPaths.size > 0) {
this.cloneFromAncestor(
differentialPlan.ancestorVersionId,
normJob.versionId!,
repo.id,
differentialPlan.unchangedPaths
);
console.info(
`[IndexingPipeline] Differential indexing: cloned ${differentialPlan.unchangedPaths.size} unchanged files from ${differentialPlan.ancestorTag}`
);
}
// ---- Stage 1: Crawl -------------------------------------------------
// Pass changedPaths as allowlist so crawl only fetches/returns changed files.
const crawlAllowedPaths = differentialPlan ? differentialPlan.changedPaths : undefined;
const crawlResult = await this.crawl(repo, versionTag, crawlAllowedPaths);
// Resolve trueref.json / context7.json configuration.
// Prefer the pre-parsed config carried in the CrawlResult (set by
@@ -137,7 +171,16 @@ export class IndexingPipeline {
// Load all existing documents for this repo so computeDiff can
// classify every crawled file and detect deletions.
const existingDocs = this.getExistingDocuments(repo.id, normJob.versionId);
const diff = computeDiff(filteredFiles, existingDocs);
// Exclude files that were cloned from the ancestor — they are not candidates
// for deletion or re-processing (computeDiff must not see them in existingDocs).
const clonedPaths = differentialPlan?.unchangedPaths ?? new Set<string>();
const existingDocsForDiff =
clonedPaths.size > 0
? existingDocs.filter((d) => !clonedPaths.has(d.filePath))
: existingDocs;
const diff = computeDiff(filteredFiles, existingDocsForDiff);
// Accumulate new documents/snippets; skip unchanged files.
const newDocuments: NewDocument[] = [];
@@ -146,11 +189,11 @@ export class IndexingPipeline {
// Schedule stale documents (modified + deleted) for deletion.
for (const file of diff.modified) {
const existing = existingDocs.find((d) => d.filePath === file.path);
const existing = existingDocsForDiff.find((d) => d.filePath === file.path);
if (existing) changedDocIds.push(existing.id);
}
for (const filePath of diff.deleted) {
const existing = existingDocs.find((d) => d.filePath === filePath);
const existing = existingDocsForDiff.find((d) => d.filePath === filePath);
if (existing) changedDocIds.push(existing.id);
}
@@ -316,7 +359,7 @@ export class IndexingPipeline {
// Private — crawl
// -------------------------------------------------------------------------
private async crawl(repo: Repository, ref?: string): Promise<{
private async crawl(repo: Repository, ref?: string, allowedPaths?: Set<string>): Promise<{
files: Array<{ path: string; content: string; sha: string; size: number; language: string }>;
totalFiles: number;
/** Pre-parsed trueref.json / context7.json, or undefined when absent. */
@@ -339,7 +382,12 @@ export class IndexingPipeline {
token: repo.githubToken ?? undefined
});
return { files: result.files, totalFiles: result.totalFiles };
// Apply allowedPaths filter for differential indexing.
const githubFinalFiles =
allowedPaths && allowedPaths.size > 0
? result.files.filter((f) => allowedPaths.has(f.path))
: result.files;
return { files: githubFinalFiles, totalFiles: result.totalFiles };
} else {
// Local filesystem crawl.
const result = await this.localCrawler.crawl({
@@ -347,7 +395,12 @@ export class IndexingPipeline {
ref: ref ?? (repo.branch !== 'main' ? (repo.branch ?? undefined) : undefined)
});
return { files: result.files, totalFiles: result.totalFiles, config: result.config };
// Apply allowedPaths filter for differential indexing.
const localFinalFiles =
allowedPaths && allowedPaths.size > 0
? result.files.filter((f) => allowedPaths.has(f.path))
: result.files;
return { files: localFinalFiles, totalFiles: result.totalFiles, config: result.config };
}
}
@@ -358,6 +411,146 @@ export class IndexingPipeline {
return row?.tag;
}
// -------------------------------------------------------------------------
// Private — differential clone (TRUEREF-0021)
// -------------------------------------------------------------------------
/**
* Clone documents, snippets, and embeddings from an ancestor version into
* the target version for all unchanged file paths.
*
* Runs in a single SQLite transaction for atomicity.
*/
private cloneFromAncestor(
  ancestorVersionId: string,
  targetVersionId: string,
  repositoryId: string,
  unchangedPaths: Set<string>
): void {
  // Fast exit: an empty path set would otherwise build an invalid `IN ()`
  // clause below and make prepare() throw inside the transaction.
  if (unchangedPaths.size === 0) return;
  this.db.transaction(() => {
    const pathList = [...unchangedPaths];
    // NOTE(review): a very large unchanged set could exceed SQLite's
    // bound-parameter limit; chunk pathList if that ever becomes real — TODO confirm.
    const placeholders = pathList.map(() => '?').join(',');
    // 1. Read the ancestor's documents for the unchanged paths. Paths with
    //    no matching ancestor document are silently skipped (no row matches).
    const ancestorDocs = this.db
      .prepare(
        `SELECT * FROM documents WHERE version_id = ? AND file_path IN (${placeholders})`
      )
      .all(ancestorVersionId, ...pathList) as Array<{
      id: string;
      repository_id: string;
      file_path: string;
      title: string | null;
      language: string | null;
      token_count: number;
      checksum: string;
      indexed_at: number;
    }>;
    // 2. Clone documents under fresh IDs, re-pointed at the target version.
    const docIdMap = new Map<string, string>(); // old doc id -> new doc id
    const nowEpoch = Math.floor(Date.now() / 1000);
    // Prepare once and reuse: db.prepare() compiles the statement on every
    // call, so hoisting it out of the loop avoids per-row recompilation.
    const insertDocStmt = this.db.prepare(
      `INSERT INTO documents (id, repository_id, version_id, file_path, title, language, token_count, checksum, indexed_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`
    );
    for (const doc of ancestorDocs) {
      const newDocId = randomUUID();
      docIdMap.set(doc.id, newDocId);
      insertDocStmt.run(
        newDocId,
        repositoryId,
        targetVersionId,
        doc.file_path,
        doc.title,
        doc.language,
        doc.token_count,
        doc.checksum,
        nowEpoch
      );
    }
    // Nothing matched — no snippets or embeddings to clone either.
    if (docIdMap.size === 0) return;
    // 3. Clone the snippets belonging to the cloned documents.
    const oldDocIds = [...docIdMap.keys()];
    const snippetPlaceholders = oldDocIds.map(() => '?').join(',');
    const ancestorSnippets = this.db
      .prepare(
        `SELECT * FROM snippets WHERE document_id IN (${snippetPlaceholders})`
      )
      .all(...oldDocIds) as Array<{
      id: string;
      document_id: string;
      repository_id: string;
      version_id: string | null;
      type: string;
      title: string | null;
      content: string;
      language: string | null;
      breadcrumb: string | null;
      token_count: number;
      created_at: number;
    }>;
    const snippetIdMap = new Map<string, string>(); // old snippet id -> new snippet id
    const insertSnippetStmt = this.db.prepare(
      `INSERT INTO snippets (id, document_id, repository_id, version_id, type, title, content, language, breadcrumb, token_count, created_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
    );
    for (const snippet of ancestorSnippets) {
      const newSnippetId = randomUUID();
      snippetIdMap.set(snippet.id, newSnippetId);
      // Non-null by construction: every snippet row was selected via a
      // document_id that is a key of docIdMap.
      const newDocId = docIdMap.get(snippet.document_id)!;
      insertSnippetStmt.run(
        newSnippetId,
        newDocId,
        repositoryId,
        targetVersionId,
        snippet.type,
        snippet.title,
        snippet.content,
        snippet.language,
        snippet.breadcrumb,
        snippet.token_count,
        snippet.created_at
      );
    }
    // 4. Clone embeddings so the cloned snippets need no re-embedding pass.
    if (snippetIdMap.size > 0) {
      const oldSnippetIds = [...snippetIdMap.keys()];
      const embPlaceholders = oldSnippetIds.map(() => '?').join(',');
      const ancestorEmbeddings = this.db
        .prepare(
          `SELECT * FROM snippet_embeddings WHERE snippet_id IN (${embPlaceholders})`
        )
        .all(...oldSnippetIds) as Array<{
        snippet_id: string;
        profile_id: string;
        model: string;
        dimensions: number;
        embedding: Buffer;
        created_at: number;
      }>;
      const insertEmbStmt = this.db.prepare(
        `INSERT INTO snippet_embeddings (snippet_id, profile_id, model, dimensions, embedding, created_at)
VALUES (?, ?, ?, ?, ?, ?)`
      );
      for (const emb of ancestorEmbeddings) {
        // Non-null: emb.snippet_id was selected from snippetIdMap's keys.
        const newSnippetId = snippetIdMap.get(emb.snippet_id)!;
        insertEmbStmt.run(
          newSnippetId,
          emb.profile_id,
          emb.model,
          emb.dimensions,
          emb.embedding,
          emb.created_at
        );
      }
    }
  })();
}
// -------------------------------------------------------------------------
// Private — atomic snippet replacement
// -------------------------------------------------------------------------