feat(TRUEREF-0021): implement differential tag indexing

This commit is contained in:
U811073
2026-03-30 13:12:50 +02:00
committed by Giancarmine Salucci
parent e63279fcf6
commit f4fe8c6043
10 changed files with 1281 additions and 9 deletions

View File

@@ -0,0 +1,122 @@
/**
* Differential indexing strategy coordinator (TRUEREF-0021).
*
* Determines whether differential indexing can be used for a given version tag,
* and if so, builds a plan describing which files to clone from the ancestor
* and which files to crawl fresh.
*/
import type Database from 'better-sqlite3';
import type { Repository } from '$lib/server/models/repository.js';
import type { RepositoryVersion } from '$lib/server/models/repository-version.js';
import { RepositoryVersionMapper } from '$lib/server/mappers/repository-version.mapper.js';
import type { RepositoryVersionEntity } from '$lib/server/models/repository-version.js';
import { findBestAncestorVersion } from '$lib/server/utils/tag-order.js';
import { fetchGitHubChangedFiles } from '$lib/server/crawler/github-compare.js';
import { getChangedFilesBetweenRefs } from '$lib/server/utils/git.js';
import type { ChangedFile } from '$lib/server/crawler/types.js';
/**
 * Result of the differential-indexing decision for one target tag: the
 * ancestor version to inherit from and a three-way partition of file paths
 * (changed → crawl fresh, deleted → drop, unchanged → clone from ancestor).
 */
export interface DifferentialPlan {
/** Version ID of the closest already-indexed predecessor tag */
ancestorVersionId: string;
/** Ancestor tag name (needed for git diff / GitHub compare calls) */
ancestorTag: string;
/** File paths that changed (added + modified + renamed-destination) */
changedPaths: Set<string>;
/** File paths that were deleted in the target vs ancestor */
deletedPaths: Set<string>;
/** File paths present in ancestor that are unchanged in target — must be cloned */
unchangedPaths: Set<string>;
}
/**
 * Build a differential-indexing plan for `targetTag`, or return `null` when
 * differential indexing is not applicable (no indexed ancestor, ancestor has
 * no documents, nothing unchanged to clone, or any error occurred).
 *
 * Never throws: every failure path resolves to `null` so callers can always
 * fall back to a full crawl.
 */
export async function buildDifferentialPlan(params: {
  repo: Repository;
  targetTag: string;
  db: Database.Database;
  /** Override for testing only */
  _fetchGitHubChangedFiles?: typeof fetchGitHubChangedFiles;
}): Promise<DifferentialPlan | null> {
  const { repo, targetTag, db } = params;
  const fetchFn = params._fetchGitHubChangedFiles ?? fetchGitHubChangedFiles;
  try {
    // 1. Load all indexed versions for this repository.
    const rows = db
      .prepare(
        `SELECT * FROM repository_versions WHERE repository_id = ? AND state = 'indexed'`
      )
      .all(repo.id) as RepositoryVersionEntity[];
    const indexedVersions: RepositoryVersion[] = rows.map((row) =>
      RepositoryVersionMapper.fromEntity(row)
    );
    // 2. Find the best ancestor version. Without one, there is nothing to
    //    diff against and the caller must do a full crawl.
    const ancestor = findBestAncestorVersion(targetTag, indexedVersions);
    if (!ancestor) return null;
    // 3. Load the ancestor's document file paths. An ancestor with zero
    //    documents gives us nothing to clone, so bail out early.
    const docRows = db
      .prepare(`SELECT DISTINCT file_path FROM documents WHERE version_id = ?`)
      .all(ancestor.id) as Array<{ file_path: string }>;
    const ancestorFilePaths = new Set(docRows.map((r) => r.file_path));
    if (ancestorFilePaths.size === 0) return null;
    // 4. Fetch changed files between ancestor and target.
    let changedFiles: ChangedFile[];
    if (repo.source === 'github') {
      const url = new URL(repo.sourceUrl);
      const parts = url.pathname.split('/').filter(Boolean);
      const owner = parts[0];
      const repoName = parts[1];
      // Guard against malformed GitHub URLs (missing owner/repo segments)
      // instead of calling the compare API with undefined arguments.
      if (!owner || !repoName) {
        console.warn(
          `[differential-strategy] Cannot parse owner/repo from source URL: ${repo.sourceUrl}`
        );
        return null;
      }
      changedFiles = await fetchFn(
        owner,
        repoName,
        ancestor.tag,
        targetTag,
        repo.githubToken ?? undefined
      );
    } else {
      changedFiles = getChangedFilesBetweenRefs({
        repoPath: repo.sourceUrl,
        base: ancestor.tag,
        head: targetTag
      });
    }
    // 5. Partition changed files into changed and deleted sets.
    const changedPaths = new Set<string>();
    const deletedPaths = new Set<string>();
    for (const file of changedFiles) {
      if (file.status === 'removed') {
        deletedPaths.add(file.path);
      } else {
        changedPaths.add(file.path);
      }
    }
    // 6. Compute unchanged paths: ancestor paths minus changed minus deleted.
    const unchangedPaths = new Set<string>();
    for (const p of ancestorFilePaths) {
      if (!changedPaths.has(p) && !deletedPaths.has(p)) {
        unchangedPaths.add(p);
      }
    }
    // 7. Return null when there's nothing to clone (all files changed) —
    //    a plan with an empty clone set is no better than a full crawl.
    if (unchangedPaths.size === 0) return null;
    return {
      ancestorVersionId: ancestor.id,
      ancestorTag: ancestor.tag,
      changedPaths,
      deletedPaths,
      unchangedPaths
    };
  } catch (err: unknown) {
    // Fail-safe: fall back to full crawl on any error — but leave a trace so
    // a persistently failing differential path is diagnosable, instead of
    // silently degrading every indexing run to a full crawl.
    console.warn(
      `[differential-strategy] Falling back to full crawl: ${err instanceof Error ? err.message : String(err)}`
    );
    return null;
  }
}

View File

@@ -13,6 +13,7 @@ import { JobQueue } from './job-queue.js';
import { IndexingPipeline } from './indexing.pipeline.js';
import { recoverStaleJobs } from './startup.js';
import { EmbeddingService } from '$lib/server/embeddings/embedding.service.js';
import * as diffStrategy from './differential-strategy.js';
// ---------------------------------------------------------------------------
// Test DB factory
@@ -1019,3 +1020,290 @@ describe('IndexingPipeline', () => {
expect(rules).toEqual(['v3: use the streaming API.']);
});
});
// ---------------------------------------------------------------------------
// differential indexing
// ---------------------------------------------------------------------------
// Test suite for TRUEREF-0021 differential indexing: the private
// cloneFromAncestor helper plus end-to-end pipeline runs with and without a
// differential plan. Relies on file-level fixtures (createTestDb, insertRepo,
// insertVersion, insertJob — defined earlier in this file, outside this view).
describe('differential indexing', () => {
let db: Database.Database;
beforeEach(() => {
// Fresh in-memory DB and a local-source repository per test.
db = createTestDb();
insertRepo(db, { source: 'local', source_url: '/tmp/test-repo' });
});
// Fixture: insert a row into `documents`, returning its id. All columns have
// defaults; pass overrides to pin repository_id / version_id / file_path /
// checksum for a specific scenario.
function insertDocument(
localDb: Database.Database,
overrides: Partial<Record<string, unknown>> = {}
): string {
const id = crypto.randomUUID();
localDb
.prepare(
`INSERT INTO documents (id, repository_id, version_id, file_path, title, language, token_count, checksum, indexed_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`
)
.run(
(overrides.id as string) ?? id,
(overrides.repository_id as string) ?? '/test/repo',
(overrides.version_id as string | null) ?? null,
(overrides.file_path as string) ?? 'README.md',
null,
'markdown',
100,
(overrides.checksum as string) ?? 'abc123',
Math.floor(Date.now() / 1000)
);
return (overrides.id as string) ?? id;
}
// Fixture: insert a row into `snippets` attached to `documentId`,
// returning the snippet id.
function insertSnippet(
localDb: Database.Database,
documentId: string,
overrides: Partial<Record<string, unknown>> = {}
): string {
const id = crypto.randomUUID();
localDb
.prepare(
`INSERT INTO snippets (id, document_id, repository_id, version_id, type, title, content, language, breadcrumb, token_count, created_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
)
.run(
(overrides.id as string) ?? id,
documentId,
(overrides.repository_id as string) ?? '/test/repo',
(overrides.version_id as string | null) ?? null,
'info',
null,
'content',
'markdown',
null,
10,
Math.floor(Date.now() / 1000)
);
return (overrides.id as string) ?? id;
}
// Type-level escape hatch: exposes the private cloneFromAncestor method so
// tests can call it directly without widening the class's public API.
type PipelineInternals = IndexingPipeline & {
cloneFromAncestor: (
ancestorVersionId: string,
targetVersionId: string,
repositoryId: string,
unchangedPaths: Set<string>
) => void;
};
it('cloneFromAncestor inserts documents and snippets into the target version', () => {
// Ancestor v1.0.0 is indexed with two docs (each with one snippet);
// target v1.1.0 is pending.
const ancestorVersionId = insertVersion(db, { tag: 'v1.0.0', state: 'indexed' });
const targetVersionId = insertVersion(db, { tag: 'v1.1.0', state: 'pending' });
const doc1Id = insertDocument(db, {
repository_id: '/test/repo',
version_id: ancestorVersionId,
file_path: 'README.md',
checksum: 'sha-readme'
});
const doc2Id = insertDocument(db, {
repository_id: '/test/repo',
version_id: ancestorVersionId,
file_path: 'src/index.ts',
checksum: 'sha-index'
});
insertSnippet(db, doc1Id, { repository_id: '/test/repo', version_id: ancestorVersionId });
insertSnippet(db, doc2Id, { repository_id: '/test/repo', version_id: ancestorVersionId });
// Collaborators are irrelevant here — only the DB is exercised.
const pipeline = new IndexingPipeline(
db,
vi.fn() as never,
{ crawl: vi.fn() } as never,
null
);
(pipeline as unknown as PipelineInternals).cloneFromAncestor(
ancestorVersionId,
targetVersionId,
'/test/repo',
new Set(['README.md', 'src/index.ts'])
);
// Both docs must now also exist under the target version.
const targetDocs = db
.prepare(`SELECT * FROM documents WHERE version_id = ?`)
.all(targetVersionId) as { id: string; file_path: string }[];
expect(targetDocs).toHaveLength(2);
expect(targetDocs.map((d) => d.file_path).sort()).toEqual(
['README.md', 'src/index.ts'].sort()
);
// New IDs must differ from ancestor doc IDs.
const targetDocIds = targetDocs.map((d) => d.id);
expect(targetDocIds).not.toContain(doc1Id);
expect(targetDocIds).not.toContain(doc2Id);
// Snippets were cloned alongside their documents.
const targetSnippets = db
.prepare(`SELECT * FROM snippets WHERE version_id = ?`)
.all(targetVersionId) as { id: string }[];
expect(targetSnippets).toHaveLength(2);
});
it('cloneFromAncestor silently skips paths absent from the ancestor', () => {
const ancestorVersionId = insertVersion(db, { tag: 'v1.0.0', state: 'indexed' });
const targetVersionId = insertVersion(db, { tag: 'v1.1.0', state: 'pending' });
insertDocument(db, {
repository_id: '/test/repo',
version_id: ancestorVersionId,
file_path: 'src/main.ts',
checksum: 'sha-main'
});
const pipeline = new IndexingPipeline(
db,
vi.fn() as never,
{ crawl: vi.fn() } as never,
null
);
// 'MISSING.md' has no ancestor document — it must be ignored, not fail.
(pipeline as unknown as PipelineInternals).cloneFromAncestor(
ancestorVersionId,
targetVersionId,
'/test/repo',
new Set(['src/main.ts', 'MISSING.md'])
);
const targetDocs = db
.prepare(`SELECT * FROM documents WHERE version_id = ?`)
.all(targetVersionId) as { id: string; file_path: string }[];
expect(targetDocs).toHaveLength(1);
expect(targetDocs[0].file_path).toBe('src/main.ts');
});
it('falls back to full crawl when no indexed ancestor exists', async () => {
// Only one (pending) version exists, so buildDifferentialPlan can find no
// indexed ancestor and the pipeline must index everything the crawl returns.
const targetVersionId = insertVersion(db, { tag: 'v1.0.0', state: 'pending' });
const files = [
{
path: 'README.md',
content: '# Hello\n\nThis is documentation.',
sha: 'sha-readme',
language: 'markdown'
},
{
path: 'src/index.ts',
content: 'export const x = 1;',
sha: 'sha-index',
language: 'typescript'
}
];
const mockLocalCrawl = vi.fn().mockResolvedValue({
files,
totalFiles: 2,
skippedFiles: 0,
branch: 'main',
commitSha: 'abc'
});
const pipeline = new IndexingPipeline(
db,
vi.fn() as never,
{ crawl: mockLocalCrawl } as never,
null
);
const jobId = insertJob(db, {
repository_id: '/test/repo',
version_id: targetVersionId,
status: 'queued'
});
const job = db.prepare(`SELECT * FROM indexing_jobs WHERE id = ?`).get(jobId) as never;
await pipeline.run(job);
const updatedJob = db
.prepare(`SELECT status FROM indexing_jobs WHERE id = ?`)
.get(jobId) as { status: string };
expect(updatedJob.status).toBe('done');
// Both crawled files must have been indexed under the target version.
const docs = db
.prepare(`SELECT * FROM documents WHERE version_id = ?`)
.all(targetVersionId) as { id: string }[];
expect(docs.length).toBeGreaterThanOrEqual(2);
});
it('cloned unchanged documents survive the diff/replace stage', async () => {
// Regression test: the pipeline's diff stage must not classify cloned
// (unchanged) documents as deleted just because the differential crawl
// never returned them.
// 1. Set up ancestor and target versions.
const ancestorVersionId = insertVersion(db, { tag: 'v1.0.0', state: 'indexed' });
const targetVersionId = insertVersion(db, { tag: 'v1.1.0', state: 'pending' });
// 2. Insert ancestor doc + snippet for unchanged.md.
const ancestorDocId = insertDocument(db, {
repository_id: '/test/repo',
version_id: ancestorVersionId,
file_path: 'unchanged.md',
checksum: 'sha-unchanged'
});
insertSnippet(db, ancestorDocId, {
repository_id: '/test/repo',
version_id: ancestorVersionId
});
// 3. Crawl returns ONLY changed.md (unchanged.md is absent — differential only).
const mockLocalCrawl = vi.fn().mockResolvedValue({
files: [
{
path: 'changed.md',
content: '# Changed\n\nThis file was added.',
sha: 'sha-changed',
language: 'markdown'
}
],
totalFiles: 1,
skippedFiles: 0,
branch: 'main',
commitSha: 'abc'
});
// 4. Mock buildDifferentialPlan to return a plan with the two paths.
const mockPlan = {
ancestorVersionId,
ancestorTag: 'v1.0.0',
changedPaths: new Set(['changed.md']),
deletedPaths: new Set<string>(),
unchangedPaths: new Set(['unchanged.md'])
};
// NOTE(review): spying on a namespace import works here because the
// pipeline imports from the same module record vitest patches — confirm if
// the build ever switches to a bundler that freezes module namespaces.
const spy = vi
.spyOn(diffStrategy, 'buildDifferentialPlan')
.mockResolvedValueOnce(mockPlan);
const pipeline = new IndexingPipeline(
db,
vi.fn() as never,
{ crawl: mockLocalCrawl } as never,
null
);
// 5. Run pipeline for the target version job.
const jobId = insertJob(db, {
repository_id: '/test/repo',
version_id: targetVersionId,
status: 'queued'
});
const job = db.prepare(`SELECT * FROM indexing_jobs WHERE id = ?`).get(jobId) as never;
await pipeline.run(job);
spy.mockRestore();
// 6. Assert job completed and both docs exist under the target version.
const finalJob = db
.prepare(`SELECT status FROM indexing_jobs WHERE id = ?`)
.get(jobId) as { status: string };
expect(finalJob.status).toBe('done');
const targetDocs = db
.prepare(`SELECT file_path FROM documents WHERE version_id = ?`)
.all(targetVersionId) as { file_path: string }[];
const filePaths = targetDocs.map((d) => d.file_path);
// unchanged.md was cloned and must NOT have been deleted by computeDiff.
expect(filePaths).toContain('unchanged.md');
// changed.md was crawled and indexed in this run.
expect(filePaths).toContain('changed.md');
});
});

View File

@@ -26,6 +26,7 @@ import { resolveConfig, type ParsedConfig } from '$lib/server/config/config-pars
import { parseFile } from '$lib/server/parser/index.js';
import { computeTrustScore } from '$lib/server/search/trust-score.js';
import { computeDiff } from './diff.js';
import { buildDifferentialPlan, type DifferentialPlan } from './differential-strategy.js';
// ---------------------------------------------------------------------------
// Progress calculation
@@ -95,11 +96,44 @@ export class IndexingPipeline {
this.updateVersion(normJob.versionId, { state: 'indexing' });
}
// ---- Stage 1: Crawl -------------------------------------------------
const versionTag = normJob.versionId
? this.getVersionTag(normJob.versionId)
: undefined;
const crawlResult = await this.crawl(repo, versionTag);
// ---- Stage 0: Differential strategy (TRUEREF-0021) ----------------------
// When indexing a tagged version, check if we can inherit unchanged files
// from an already-indexed ancestor version instead of crawling everything.
let differentialPlan: DifferentialPlan | null = null;
if (normJob.versionId && versionTag) {
differentialPlan = await buildDifferentialPlan({
repo,
targetTag: versionTag,
db: this.db
}).catch((err) => {
console.warn(
`[IndexingPipeline] Differential plan failed, falling back to full crawl: ${err instanceof Error ? err.message : String(err)}`
);
return null;
});
}
// If a differential plan exists, clone unchanged files from ancestor.
if (differentialPlan && differentialPlan.unchangedPaths.size > 0) {
this.cloneFromAncestor(
differentialPlan.ancestorVersionId,
normJob.versionId!,
repo.id,
differentialPlan.unchangedPaths
);
console.info(
`[IndexingPipeline] Differential indexing: cloned ${differentialPlan.unchangedPaths.size} unchanged files from ${differentialPlan.ancestorTag}`
);
}
// ---- Stage 1: Crawl -------------------------------------------------
// Pass changedPaths as allowlist so crawl only fetches/returns changed files.
const crawlAllowedPaths = differentialPlan ? differentialPlan.changedPaths : undefined;
const crawlResult = await this.crawl(repo, versionTag, crawlAllowedPaths);
// Resolve trueref.json / context7.json configuration.
// Prefer the pre-parsed config carried in the CrawlResult (set by
@@ -137,7 +171,16 @@ export class IndexingPipeline {
// Load all existing documents for this repo so computeDiff can
// classify every crawled file and detect deletions.
const existingDocs = this.getExistingDocuments(repo.id, normJob.versionId);
const diff = computeDiff(filteredFiles, existingDocs);
// Exclude files that were cloned from the ancestor — they are not candidates
// for deletion or re-processing (computeDiff must not see them in existingDocs).
const clonedPaths = differentialPlan?.unchangedPaths ?? new Set<string>();
const existingDocsForDiff =
clonedPaths.size > 0
? existingDocs.filter((d) => !clonedPaths.has(d.filePath))
: existingDocs;
const diff = computeDiff(filteredFiles, existingDocsForDiff);
// Accumulate new documents/snippets; skip unchanged files.
const newDocuments: NewDocument[] = [];
@@ -146,11 +189,11 @@ export class IndexingPipeline {
// Schedule stale documents (modified + deleted) for deletion.
for (const file of diff.modified) {
const existing = existingDocs.find((d) => d.filePath === file.path);
const existing = existingDocsForDiff.find((d) => d.filePath === file.path);
if (existing) changedDocIds.push(existing.id);
}
for (const filePath of diff.deleted) {
const existing = existingDocs.find((d) => d.filePath === filePath);
const existing = existingDocsForDiff.find((d) => d.filePath === filePath);
if (existing) changedDocIds.push(existing.id);
}
@@ -316,7 +359,7 @@ export class IndexingPipeline {
// Private — crawl
// -------------------------------------------------------------------------
private async crawl(repo: Repository, ref?: string): Promise<{
private async crawl(repo: Repository, ref?: string, allowedPaths?: Set<string>): Promise<{
files: Array<{ path: string; content: string; sha: string; size: number; language: string }>;
totalFiles: number;
/** Pre-parsed trueref.json / context7.json, or undefined when absent. */
@@ -339,7 +382,12 @@ export class IndexingPipeline {
token: repo.githubToken ?? undefined
});
return { files: result.files, totalFiles: result.totalFiles };
// Apply allowedPaths filter for differential indexing.
const githubFinalFiles =
allowedPaths && allowedPaths.size > 0
? result.files.filter((f) => allowedPaths.has(f.path))
: result.files;
return { files: githubFinalFiles, totalFiles: result.totalFiles };
} else {
// Local filesystem crawl.
const result = await this.localCrawler.crawl({
@@ -347,7 +395,12 @@ export class IndexingPipeline {
ref: ref ?? (repo.branch !== 'main' ? (repo.branch ?? undefined) : undefined)
});
return { files: result.files, totalFiles: result.totalFiles, config: result.config };
// Apply allowedPaths filter for differential indexing.
const localFinalFiles =
allowedPaths && allowedPaths.size > 0
? result.files.filter((f) => allowedPaths.has(f.path))
: result.files;
return { files: localFinalFiles, totalFiles: result.totalFiles, config: result.config };
}
}
@@ -358,6 +411,146 @@ export class IndexingPipeline {
return row?.tag;
}
// -------------------------------------------------------------------------
// Private — differential clone (TRUEREF-0021)
// -------------------------------------------------------------------------
/**
* Clone documents, snippets, and embeddings from an ancestor version into
* the target version for all unchanged file paths.
*
* Runs in a single SQLite transaction for atomicity.
*/
private cloneFromAncestor(
  ancestorVersionId: string,
  targetVersionId: string,
  repositoryId: string,
  unchangedPaths: Set<string>
): void {
  // Fast exit: an empty path set would otherwise build an invalid `IN ()`
  // clause below and make prepare() throw inside the transaction.
  if (unchangedPaths.size === 0) return;
  this.db.transaction(() => {
    const pathList = [...unchangedPaths];
    // NOTE(review): a very large unchanged set could exceed SQLite's
    // bound-parameter limit; chunk pathList if that ever becomes real — TODO confirm.
    const placeholders = pathList.map(() => '?').join(',');
    // 1. Read the ancestor's documents for the unchanged paths. Paths with
    //    no matching ancestor document are silently skipped (no row matches).
    const ancestorDocs = this.db
      .prepare(
        `SELECT * FROM documents WHERE version_id = ? AND file_path IN (${placeholders})`
      )
      .all(ancestorVersionId, ...pathList) as Array<{
      id: string;
      repository_id: string;
      file_path: string;
      title: string | null;
      language: string | null;
      token_count: number;
      checksum: string;
      indexed_at: number;
    }>;
    // 2. Clone documents under fresh IDs, re-pointed at the target version.
    const docIdMap = new Map<string, string>(); // old doc id -> new doc id
    const nowEpoch = Math.floor(Date.now() / 1000);
    // Prepare once and reuse: db.prepare() compiles the statement on every
    // call, so hoisting it out of the loop avoids per-row recompilation.
    const insertDocStmt = this.db.prepare(
      `INSERT INTO documents (id, repository_id, version_id, file_path, title, language, token_count, checksum, indexed_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`
    );
    for (const doc of ancestorDocs) {
      const newDocId = randomUUID();
      docIdMap.set(doc.id, newDocId);
      insertDocStmt.run(
        newDocId,
        repositoryId,
        targetVersionId,
        doc.file_path,
        doc.title,
        doc.language,
        doc.token_count,
        doc.checksum,
        nowEpoch
      );
    }
    // Nothing matched — no snippets or embeddings to clone either.
    if (docIdMap.size === 0) return;
    // 3. Clone the snippets belonging to the cloned documents.
    const oldDocIds = [...docIdMap.keys()];
    const snippetPlaceholders = oldDocIds.map(() => '?').join(',');
    const ancestorSnippets = this.db
      .prepare(
        `SELECT * FROM snippets WHERE document_id IN (${snippetPlaceholders})`
      )
      .all(...oldDocIds) as Array<{
      id: string;
      document_id: string;
      repository_id: string;
      version_id: string | null;
      type: string;
      title: string | null;
      content: string;
      language: string | null;
      breadcrumb: string | null;
      token_count: number;
      created_at: number;
    }>;
    const snippetIdMap = new Map<string, string>(); // old snippet id -> new snippet id
    const insertSnippetStmt = this.db.prepare(
      `INSERT INTO snippets (id, document_id, repository_id, version_id, type, title, content, language, breadcrumb, token_count, created_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
    );
    for (const snippet of ancestorSnippets) {
      const newSnippetId = randomUUID();
      snippetIdMap.set(snippet.id, newSnippetId);
      // Non-null by construction: every snippet row was selected via a
      // document_id that is a key of docIdMap.
      const newDocId = docIdMap.get(snippet.document_id)!;
      insertSnippetStmt.run(
        newSnippetId,
        newDocId,
        repositoryId,
        targetVersionId,
        snippet.type,
        snippet.title,
        snippet.content,
        snippet.language,
        snippet.breadcrumb,
        snippet.token_count,
        snippet.created_at
      );
    }
    // 4. Clone embeddings so the cloned snippets need no re-embedding pass.
    if (snippetIdMap.size > 0) {
      const oldSnippetIds = [...snippetIdMap.keys()];
      const embPlaceholders = oldSnippetIds.map(() => '?').join(',');
      const ancestorEmbeddings = this.db
        .prepare(
          `SELECT * FROM snippet_embeddings WHERE snippet_id IN (${embPlaceholders})`
        )
        .all(...oldSnippetIds) as Array<{
        snippet_id: string;
        profile_id: string;
        model: string;
        dimensions: number;
        embedding: Buffer;
        created_at: number;
      }>;
      const insertEmbStmt = this.db.prepare(
        `INSERT INTO snippet_embeddings (snippet_id, profile_id, model, dimensions, embedding, created_at)
VALUES (?, ?, ?, ?, ?, ?)`
      );
      for (const emb of ancestorEmbeddings) {
        // Non-null: emb.snippet_id was selected from snippetIdMap's keys.
        const newSnippetId = snippetIdMap.get(emb.snippet_id)!;
        insertEmbStmt.run(
          newSnippetId,
          emb.profile_id,
          emb.model,
          emb.dimensions,
          emb.embedding,
          emb.created_at
        );
      }
    }
  })();
}
// -------------------------------------------------------------------------
// Private — atomic snippet replacement
// -------------------------------------------------------------------------