feat(TRUEREF-0021): implement differential tag indexing
This commit is contained in:
committed by
Giancarmine Salucci
parent
e63279fcf6
commit
f4fe8c6043
@@ -26,6 +26,7 @@ import { resolveConfig, type ParsedConfig } from '$lib/server/config/config-pars
|
||||
import { parseFile } from '$lib/server/parser/index.js';
|
||||
import { computeTrustScore } from '$lib/server/search/trust-score.js';
|
||||
import { computeDiff } from './diff.js';
|
||||
import { buildDifferentialPlan, type DifferentialPlan } from './differential-strategy.js';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Progress calculation
|
||||
@@ -95,11 +96,44 @@ export class IndexingPipeline {
|
||||
this.updateVersion(normJob.versionId, { state: 'indexing' });
|
||||
}
|
||||
|
||||
// ---- Stage 1: Crawl -------------------------------------------------
|
||||
const versionTag = normJob.versionId
|
||||
? this.getVersionTag(normJob.versionId)
|
||||
: undefined;
|
||||
const crawlResult = await this.crawl(repo, versionTag);
|
||||
|
||||
// ---- Stage 0: Differential strategy (TRUEREF-0021) ----------------------
|
||||
// When indexing a tagged version, check if we can inherit unchanged files
|
||||
// from an already-indexed ancestor version instead of crawling everything.
|
||||
let differentialPlan: DifferentialPlan | null = null;
|
||||
if (normJob.versionId && versionTag) {
|
||||
differentialPlan = await buildDifferentialPlan({
|
||||
repo,
|
||||
targetTag: versionTag,
|
||||
db: this.db
|
||||
}).catch((err) => {
|
||||
console.warn(
|
||||
`[IndexingPipeline] Differential plan failed, falling back to full crawl: ${err instanceof Error ? err.message : String(err)}`
|
||||
);
|
||||
return null;
|
||||
});
|
||||
}
|
||||
|
||||
// If a differential plan exists, clone unchanged files from ancestor.
|
||||
if (differentialPlan && differentialPlan.unchangedPaths.size > 0) {
|
||||
this.cloneFromAncestor(
|
||||
differentialPlan.ancestorVersionId,
|
||||
normJob.versionId!,
|
||||
repo.id,
|
||||
differentialPlan.unchangedPaths
|
||||
);
|
||||
console.info(
|
||||
`[IndexingPipeline] Differential indexing: cloned ${differentialPlan.unchangedPaths.size} unchanged files from ${differentialPlan.ancestorTag}`
|
||||
);
|
||||
}
|
||||
|
||||
// ---- Stage 1: Crawl -------------------------------------------------
|
||||
// Pass changedPaths as an allowlist: crawl still fetches every file and then filters its result down to these paths (an empty set disables the filter).
|
||||
const crawlAllowedPaths = differentialPlan ? differentialPlan.changedPaths : undefined;
|
||||
const crawlResult = await this.crawl(repo, versionTag, crawlAllowedPaths);
|
||||
|
||||
// Resolve trueref.json / context7.json configuration.
|
||||
// Prefer the pre-parsed config carried in the CrawlResult (set by
|
||||
@@ -137,7 +171,16 @@ export class IndexingPipeline {
|
||||
// Load all existing documents for this repo so computeDiff can
|
||||
// classify every crawled file and detect deletions.
|
||||
const existingDocs = this.getExistingDocuments(repo.id, normJob.versionId);
|
||||
const diff = computeDiff(filteredFiles, existingDocs);
|
||||
|
||||
// Exclude files that were cloned from the ancestor — they are not candidates
|
||||
// for deletion or re-processing (computeDiff must not see them in existingDocs).
|
||||
const clonedPaths = differentialPlan?.unchangedPaths ?? new Set<string>();
|
||||
const existingDocsForDiff =
|
||||
clonedPaths.size > 0
|
||||
? existingDocs.filter((d) => !clonedPaths.has(d.filePath))
|
||||
: existingDocs;
|
||||
|
||||
const diff = computeDiff(filteredFiles, existingDocsForDiff);
|
||||
|
||||
// Accumulate new documents/snippets; skip unchanged files.
|
||||
const newDocuments: NewDocument[] = [];
|
||||
@@ -146,11 +189,11 @@ export class IndexingPipeline {
|
||||
|
||||
// Schedule stale documents (modified + deleted) for deletion.
|
||||
for (const file of diff.modified) {
|
||||
const existing = existingDocs.find((d) => d.filePath === file.path);
|
||||
const existing = existingDocsForDiff.find((d) => d.filePath === file.path);
|
||||
if (existing) changedDocIds.push(existing.id);
|
||||
}
|
||||
for (const filePath of diff.deleted) {
|
||||
const existing = existingDocs.find((d) => d.filePath === filePath);
|
||||
const existing = existingDocsForDiff.find((d) => d.filePath === filePath);
|
||||
if (existing) changedDocIds.push(existing.id);
|
||||
}
|
||||
|
||||
@@ -316,7 +359,7 @@ export class IndexingPipeline {
|
||||
// Private — crawl
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
private async crawl(repo: Repository, ref?: string): Promise<{
|
||||
private async crawl(repo: Repository, ref?: string, allowedPaths?: Set<string>): Promise<{
|
||||
files: Array<{ path: string; content: string; sha: string; size: number; language: string }>;
|
||||
totalFiles: number;
|
||||
/** Pre-parsed trueref.json / context7.json, or undefined when absent. */
|
||||
@@ -339,7 +382,12 @@ export class IndexingPipeline {
|
||||
token: repo.githubToken ?? undefined
|
||||
});
|
||||
|
||||
return { files: result.files, totalFiles: result.totalFiles };
|
||||
// Apply allowedPaths filter for differential indexing.
|
||||
const githubFinalFiles =
|
||||
allowedPaths && allowedPaths.size > 0
|
||||
? result.files.filter((f) => allowedPaths.has(f.path))
|
||||
: result.files;
|
||||
return { files: githubFinalFiles, totalFiles: result.totalFiles };
|
||||
} else {
|
||||
// Local filesystem crawl.
|
||||
const result = await this.localCrawler.crawl({
|
||||
@@ -347,7 +395,12 @@ export class IndexingPipeline {
|
||||
ref: ref ?? (repo.branch !== 'main' ? (repo.branch ?? undefined) : undefined)
|
||||
});
|
||||
|
||||
return { files: result.files, totalFiles: result.totalFiles, config: result.config };
|
||||
// Apply allowedPaths filter for differential indexing.
|
||||
const localFinalFiles =
|
||||
allowedPaths && allowedPaths.size > 0
|
||||
? result.files.filter((f) => allowedPaths.has(f.path))
|
||||
: result.files;
|
||||
return { files: localFinalFiles, totalFiles: result.totalFiles, config: result.config };
|
||||
}
|
||||
}
|
||||
|
||||
@@ -358,6 +411,146 @@ export class IndexingPipeline {
|
||||
return row?.tag;
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// Private — differential clone (TRUEREF-0021)
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
/**
 * Clone documents, snippets, and embeddings from an ancestor version into
 * the target version for all unchanged file paths.
 *
 * Every cloned document and snippet receives a freshly generated id; the
 * old-id → new-id maps built along the way are used to re-point snippets at
 * their cloned document and embeddings at their cloned snippet. Cloned rows
 * are written with the `repositoryId` / `targetVersionId` passed in (not the
 * values stored on the ancestor rows). Documents get a new `indexed_at`
 * timestamp, while snippets and embeddings keep their original `created_at`.
 *
 * Runs in a single SQLite transaction for atomicity.
 *
 * @param ancestorVersionId - Version whose rows are used as the copy source.
 * @param targetVersionId - Version the cloned rows are attached to.
 * @param repositoryId - Repository id written onto cloned documents and snippets.
 * @param unchangedPaths - File paths whose documents should be cloned.
 */
private cloneFromAncestor(
	ancestorVersionId: string,
	targetVersionId: string,
	repositoryId: string,
	unchangedPaths: Set<string>
): void {
	// Upper bound on the number of values bound into one `IN (...)` list.
	// Kept well below SQLite's bound-parameter limit so that very large
	// unchanged sets cannot make the lookup queries fail.
	const IN_LIST_CHUNK_SIZE = 500;

	type DocumentRow = {
		id: string;
		repository_id: string;
		file_path: string;
		title: string | null;
		language: string | null;
		token_count: number;
		checksum: string;
		indexed_at: number;
	};

	type SnippetRow = {
		id: string;
		document_id: string;
		repository_id: string;
		version_id: string | null;
		type: string;
		title: string | null;
		content: string;
		language: string | null;
		breadcrumb: string | null;
		token_count: number;
		created_at: number;
	};

	type EmbeddingRow = {
		snippet_id: string;
		profile_id: string;
		model: string;
		dimensions: number;
		embedding: Buffer;
		created_at: number;
	};

	this.db.transaction(() => {
		// Runs `<sqlPrefix> IN (?,?,…)` once per chunk of `keys` and
		// concatenates the rows. `leadingParams` are bound before the chunk
		// values (for placeholders that appear earlier in `sqlPrefix`).
		const selectByKeys = <Row>(
			sqlPrefix: string,
			leadingParams: readonly string[],
			keys: readonly string[]
		): Row[] => {
			const rows: Row[] = [];
			for (let start = 0; start < keys.length; start += IN_LIST_CHUNK_SIZE) {
				const chunk = keys.slice(start, start + IN_LIST_CHUNK_SIZE);
				const placeholders = chunk.map(() => '?').join(',');
				const found = this.db
					.prepare(`${sqlPrefix} IN (${placeholders})`)
					.all(...leadingParams, ...chunk) as Row[];
				for (const row of found) rows.push(row);
			}
			return rows;
		};

		// ---- Documents ------------------------------------------------------
		const ancestorDocs = selectByKeys<DocumentRow>(
			'SELECT * FROM documents WHERE version_id = ? AND file_path',
			[ancestorVersionId],
			[...unchangedPaths]
		);

		// Nothing to clone: none of the unchanged paths exist in the ancestor.
		if (ancestorDocs.length === 0) return;

		// Prepared once and reused for every row instead of per iteration.
		const insertDocument = this.db.prepare(
			`INSERT INTO documents (id, repository_id, version_id, file_path, title, language, token_count, checksum, indexed_at)
				VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`
		);

		// ancestor document id → cloned document id
		const docIdMap = new Map<string, string>();
		const nowEpoch = Math.floor(Date.now() / 1000);

		for (const doc of ancestorDocs) {
			const newDocId = randomUUID();
			docIdMap.set(doc.id, newDocId);
			insertDocument.run(
				newDocId,
				repositoryId,
				targetVersionId,
				doc.file_path,
				doc.title,
				doc.language,
				doc.token_count,
				doc.checksum,
				nowEpoch
			);
		}

		// ---- Snippets -------------------------------------------------------
		const ancestorSnippets = selectByKeys<SnippetRow>(
			'SELECT * FROM snippets WHERE document_id',
			[],
			[...docIdMap.keys()]
		);

		const insertSnippet = this.db.prepare(
			`INSERT INTO snippets (id, document_id, repository_id, version_id, type, title, content, language, breadcrumb, token_count, created_at)
				VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
		);

		// ancestor snippet id → cloned snippet id
		const snippetIdMap = new Map<string, string>();

		for (const snippet of ancestorSnippets) {
			// Always present: the snippets were selected by these document ids.
			const newDocId = docIdMap.get(snippet.document_id);
			if (newDocId === undefined) continue;

			const newSnippetId = randomUUID();
			snippetIdMap.set(snippet.id, newSnippetId);
			insertSnippet.run(
				newSnippetId,
				newDocId,
				repositoryId,
				targetVersionId,
				snippet.type,
				snippet.title,
				snippet.content,
				snippet.language,
				snippet.breadcrumb,
				snippet.token_count,
				snippet.created_at
			);
		}

		if (snippetIdMap.size === 0) return;

		// ---- Embeddings -----------------------------------------------------
		const ancestorEmbeddings = selectByKeys<EmbeddingRow>(
			'SELECT * FROM snippet_embeddings WHERE snippet_id',
			[],
			[...snippetIdMap.keys()]
		);

		const insertEmbedding = this.db.prepare(
			`INSERT INTO snippet_embeddings (snippet_id, profile_id, model, dimensions, embedding, created_at)
				VALUES (?, ?, ?, ?, ?, ?)`
		);

		for (const emb of ancestorEmbeddings) {
			// Always present: the embeddings were selected by these snippet ids.
			const newSnippetId = snippetIdMap.get(emb.snippet_id);
			if (newSnippetId === undefined) continue;

			insertEmbedding.run(
				newSnippetId,
				emb.profile_id,
				emb.model,
				emb.dimensions,
				emb.embedding,
				emb.created_at
			);
		}
	})();
}
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// Private — atomic snippet replacement
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
Reference in New Issue
Block a user