TRUEREF-0023 rewrite indexing pipeline - parallel reads - serialized writes
This commit is contained in:
@@ -28,6 +28,14 @@ import { parseFile } from '$lib/server/parser/index.js';
|
||||
import { computeTrustScore } from '$lib/server/search/trust-score.js';
|
||||
import { computeDiff } from './diff.js';
|
||||
import { buildDifferentialPlan, type DifferentialPlan } from './differential-strategy.js';
|
||||
import {
|
||||
cloneFromAncestor as cloneFromAncestorInDatabase,
|
||||
replaceSnippets as replaceSnippetsInDatabase,
|
||||
updateRepo as updateRepoInDatabase,
|
||||
updateVersion as updateVersionInDatabase,
|
||||
type CloneFromAncestorRequest
|
||||
} from './write-operations.js';
|
||||
import type { SerializedFields } from './worker-types.js';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Progress calculation
|
||||
@@ -70,7 +78,23 @@ export class IndexingPipeline {
|
||||
private readonly db: Database.Database,
|
||||
private readonly githubCrawl: typeof GithubCrawlFn,
|
||||
private readonly localCrawler: LocalCrawler,
|
||||
private readonly embeddingService: EmbeddingService | null
|
||||
private readonly embeddingService: EmbeddingService | null,
|
||||
private readonly writeDelegate?: {
|
||||
persistJobUpdates?: boolean;
|
||||
replaceSnippets?: (
|
||||
changedDocIds: string[],
|
||||
newDocuments: NewDocument[],
|
||||
newSnippets: NewSnippet[]
|
||||
) => Promise<void>;
|
||||
cloneFromAncestor?: (request: CloneFromAncestorRequest) => Promise<void>;
|
||||
updateRepo?: (repositoryId: string, fields: SerializedFields) => Promise<void>;
|
||||
updateVersion?: (versionId: string, fields: SerializedFields) => Promise<void>;
|
||||
upsertRepoConfig?: (
|
||||
repositoryId: string,
|
||||
versionId: string | null,
|
||||
rules: string[]
|
||||
) => Promise<void>;
|
||||
}
|
||||
) {
|
||||
this.sqliteVecStore = new SqliteVecStore(db);
|
||||
}
|
||||
@@ -117,14 +141,12 @@ export class IndexingPipeline {
|
||||
if (!repo) throw new Error(`Repository ${repositoryId} not found`);
|
||||
|
||||
// Mark repo as actively indexing.
|
||||
this.updateRepo(repo.id, { state: 'indexing' });
|
||||
await this.updateRepo(repo.id, { state: 'indexing' });
|
||||
if (normJob.versionId) {
|
||||
this.updateVersion(normJob.versionId, { state: 'indexing' });
|
||||
await this.updateVersion(normJob.versionId, { state: 'indexing' });
|
||||
}
|
||||
|
||||
const versionTag = normJob.versionId
|
||||
? this.getVersionTag(normJob.versionId)
|
||||
: undefined;
|
||||
const versionTag = normJob.versionId ? this.getVersionTag(normJob.versionId) : undefined;
|
||||
|
||||
// ---- Stage 0: Differential strategy (TRUEREF-0021) ----------------------
|
||||
// When indexing a tagged version, check if we can inherit unchanged files
|
||||
@@ -147,12 +169,12 @@ export class IndexingPipeline {
|
||||
// If a differential plan exists, clone unchanged files from ancestor.
|
||||
if (differentialPlan && differentialPlan.unchangedPaths.size > 0) {
|
||||
reportStage('cloning');
|
||||
this.cloneFromAncestor(
|
||||
differentialPlan.ancestorVersionId,
|
||||
normJob.versionId!,
|
||||
repo.id,
|
||||
differentialPlan.unchangedPaths
|
||||
);
|
||||
await this.cloneFromAncestor({
|
||||
ancestorVersionId: differentialPlan.ancestorVersionId,
|
||||
targetVersionId: normJob.versionId!,
|
||||
repositoryId: repo.id,
|
||||
unchangedPaths: [...differentialPlan.unchangedPaths]
|
||||
});
|
||||
console.info(
|
||||
`[IndexingPipeline] Differential indexing: cloned ${differentialPlan.unchangedPaths.size} unchanged files from ${differentialPlan.ancestorTag}`
|
||||
);
|
||||
@@ -174,7 +196,11 @@ export class IndexingPipeline {
|
||||
if (crawlResult.config) {
|
||||
// Config was pre-parsed by the crawler — wrap it in a ParsedConfig
|
||||
// shell so the rest of the pipeline can use it uniformly.
|
||||
parsedConfig = { config: crawlResult.config, source: 'trueref.json', warnings: [] } satisfies ParsedConfig;
|
||||
parsedConfig = {
|
||||
config: crawlResult.config,
|
||||
source: 'trueref.json',
|
||||
warnings: []
|
||||
} satisfies ParsedConfig;
|
||||
} else {
|
||||
const configFile = crawlResult.files.find(
|
||||
(f) => f.path === 'trueref.json' || f.path === 'context7.json'
|
||||
@@ -189,7 +215,10 @@ export class IndexingPipeline {
|
||||
const filteredFiles =
|
||||
excludeFiles.length > 0
|
||||
? crawlResult.files.filter(
|
||||
(f) => !excludeFiles.some((pattern) => IndexingPipeline.matchesExcludePattern(f.path, pattern))
|
||||
(f) =>
|
||||
!excludeFiles.some((pattern) =>
|
||||
IndexingPipeline.matchesExcludePattern(f.path, pattern)
|
||||
)
|
||||
)
|
||||
: crawlResult.files;
|
||||
|
||||
@@ -303,7 +332,13 @@ export class IndexingPipeline {
|
||||
this.embeddingService !== null
|
||||
);
|
||||
this.updateJob(job.id, { processedFiles: totalProcessed, progress });
|
||||
reportStage('parsing', `${totalProcessed} / ${totalFiles} files`, progress, totalProcessed, totalFiles);
|
||||
reportStage(
|
||||
'parsing',
|
||||
`${totalProcessed} / ${totalFiles} files`,
|
||||
progress,
|
||||
totalProcessed,
|
||||
totalFiles
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -312,7 +347,7 @@ export class IndexingPipeline {
|
||||
|
||||
// ---- Stage 3: Atomic replacement ------------------------------------
|
||||
reportStage('storing');
|
||||
this.replaceSnippets(repo.id, changedDocIds, newDocuments, newSnippets);
|
||||
await this.replaceSnippets(repo.id, changedDocIds, newDocuments, newSnippets);
|
||||
|
||||
// ---- Stage 4: Embeddings (if provider is configured) ----------------
|
||||
if (this.embeddingService) {
|
||||
@@ -325,7 +360,7 @@ export class IndexingPipeline {
|
||||
if (snippetIds.length === 0) {
|
||||
// No missing embeddings for the active profile; parsing progress is final.
|
||||
} else {
|
||||
const embeddingsTotal = snippetIds.length;
|
||||
const embeddingsTotal = snippetIds.length;
|
||||
|
||||
await this.embeddingService.embedSnippets(snippetIds, (done) => {
|
||||
const progress = calculateProgress(
|
||||
@@ -350,7 +385,7 @@ export class IndexingPipeline {
|
||||
state: 'indexed'
|
||||
});
|
||||
|
||||
this.updateRepo(repo.id, {
|
||||
await this.updateRepo(repo.id, {
|
||||
state: 'indexed',
|
||||
totalSnippets: stats.totalSnippets,
|
||||
totalTokens: stats.totalTokens,
|
||||
@@ -360,7 +395,7 @@ export class IndexingPipeline {
|
||||
|
||||
if (normJob.versionId) {
|
||||
const versionStats = this.computeVersionStats(normJob.versionId);
|
||||
this.updateVersion(normJob.versionId, {
|
||||
await this.updateVersion(normJob.versionId, {
|
||||
state: 'indexed',
|
||||
totalSnippets: versionStats.totalSnippets,
|
||||
indexedAt: Math.floor(Date.now() / 1000)
|
||||
@@ -371,12 +406,12 @@ export class IndexingPipeline {
|
||||
if (parsedConfig?.config.rules?.length) {
|
||||
if (!normJob.versionId) {
|
||||
// Main-branch job: write the repo-wide entry only.
|
||||
this.upsertRepoConfig(repo.id, null, parsedConfig.config.rules);
|
||||
await this.upsertRepoConfig(repo.id, null, parsedConfig.config.rules);
|
||||
} else {
|
||||
// Version job: write only the version-specific entry.
|
||||
// Writing to the NULL row here would overwrite repo-wide rules
|
||||
// with whatever the last-indexed version happened to carry.
|
||||
this.upsertRepoConfig(repo.id, normJob.versionId, parsedConfig.config.rules);
|
||||
await this.upsertRepoConfig(repo.id, normJob.versionId, parsedConfig.config.rules);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -398,9 +433,9 @@ export class IndexingPipeline {
|
||||
});
|
||||
|
||||
// Restore repo to error state but preserve any existing indexed data.
|
||||
this.updateRepo(repositoryId, { state: 'error' });
|
||||
await this.updateRepo(repositoryId, { state: 'error' });
|
||||
if (normJob.versionId) {
|
||||
this.updateVersion(normJob.versionId, { state: 'error' });
|
||||
await this.updateVersion(normJob.versionId, { state: 'error' });
|
||||
}
|
||||
|
||||
throw error;
|
||||
@@ -411,7 +446,11 @@ export class IndexingPipeline {
|
||||
// Private — crawl
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
private async crawl(repo: Repository, ref?: string, allowedPaths?: Set<string>): Promise<{
|
||||
private async crawl(
|
||||
repo: Repository,
|
||||
ref?: string,
|
||||
allowedPaths?: Set<string>
|
||||
): Promise<{
|
||||
files: Array<{ path: string; content: string; sha: string; size: number; language: string }>;
|
||||
totalFiles: number;
|
||||
/** Pre-parsed trueref.json / context7.json, or undefined when absent. */
|
||||
@@ -473,219 +512,50 @@ export class IndexingPipeline {
|
||||
*
|
||||
* Runs in a single SQLite transaction for atomicity.
|
||||
*/
|
||||
private cloneFromAncestor(
|
||||
ancestorVersionId: string,
|
||||
targetVersionId: string,
|
||||
repositoryId: string,
|
||||
unchangedPaths: Set<string>
|
||||
): void {
|
||||
this.db.transaction(() => {
|
||||
const pathList = [...unchangedPaths];
|
||||
const placeholders = pathList.map(() => '?').join(',');
|
||||
const ancestorDocs = this.db
|
||||
.prepare(
|
||||
`SELECT * FROM documents WHERE version_id = ? AND file_path IN (${placeholders})`
|
||||
)
|
||||
.all(ancestorVersionId, ...pathList) as Array<{
|
||||
id: string;
|
||||
repository_id: string;
|
||||
file_path: string;
|
||||
title: string | null;
|
||||
language: string | null;
|
||||
token_count: number;
|
||||
checksum: string;
|
||||
indexed_at: number;
|
||||
}>;
|
||||
private async cloneFromAncestor(
|
||||
requestOrAncestorVersionId: CloneFromAncestorRequest | string,
|
||||
targetVersionId?: string,
|
||||
repositoryId?: string,
|
||||
unchangedPaths?: Set<string>
|
||||
): Promise<void> {
|
||||
const request: CloneFromAncestorRequest =
|
||||
typeof requestOrAncestorVersionId === 'string'
|
||||
? {
|
||||
ancestorVersionId: requestOrAncestorVersionId,
|
||||
targetVersionId: targetVersionId!,
|
||||
repositoryId: repositoryId!,
|
||||
unchangedPaths: [...(unchangedPaths ?? new Set<string>())]
|
||||
}
|
||||
: requestOrAncestorVersionId;
|
||||
|
||||
const docIdMap = new Map<string, string>();
|
||||
const nowEpoch = Math.floor(Date.now() / 1000);
|
||||
if (request.unchangedPaths.length === 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (const doc of ancestorDocs) {
|
||||
const newDocId = randomUUID();
|
||||
docIdMap.set(doc.id, newDocId);
|
||||
this.db
|
||||
.prepare(
|
||||
`INSERT INTO documents (id, repository_id, version_id, file_path, title, language, token_count, checksum, indexed_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`
|
||||
)
|
||||
.run(
|
||||
newDocId,
|
||||
repositoryId,
|
||||
targetVersionId,
|
||||
doc.file_path,
|
||||
doc.title,
|
||||
doc.language,
|
||||
doc.token_count,
|
||||
doc.checksum,
|
||||
nowEpoch
|
||||
);
|
||||
}
|
||||
if (this.writeDelegate?.cloneFromAncestor) {
|
||||
await this.writeDelegate.cloneFromAncestor(request);
|
||||
return;
|
||||
}
|
||||
|
||||
if (docIdMap.size === 0) return;
|
||||
|
||||
const oldDocIds = [...docIdMap.keys()];
|
||||
const snippetPlaceholders = oldDocIds.map(() => '?').join(',');
|
||||
const ancestorSnippets = this.db
|
||||
.prepare(
|
||||
`SELECT * FROM snippets WHERE document_id IN (${snippetPlaceholders})`
|
||||
)
|
||||
.all(...oldDocIds) as Array<{
|
||||
id: string;
|
||||
document_id: string;
|
||||
repository_id: string;
|
||||
version_id: string | null;
|
||||
type: string;
|
||||
title: string | null;
|
||||
content: string;
|
||||
language: string | null;
|
||||
breadcrumb: string | null;
|
||||
token_count: number;
|
||||
created_at: number;
|
||||
}>;
|
||||
|
||||
const snippetIdMap = new Map<string, string>();
|
||||
for (const snippet of ancestorSnippets) {
|
||||
const newSnippetId = randomUUID();
|
||||
snippetIdMap.set(snippet.id, newSnippetId);
|
||||
const newDocId = docIdMap.get(snippet.document_id)!;
|
||||
this.db
|
||||
.prepare(
|
||||
`INSERT INTO snippets (id, document_id, repository_id, version_id, type, title, content, language, breadcrumb, token_count, created_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
|
||||
)
|
||||
.run(
|
||||
newSnippetId,
|
||||
newDocId,
|
||||
repositoryId,
|
||||
targetVersionId,
|
||||
snippet.type,
|
||||
snippet.title,
|
||||
snippet.content,
|
||||
snippet.language,
|
||||
snippet.breadcrumb,
|
||||
snippet.token_count,
|
||||
snippet.created_at
|
||||
);
|
||||
}
|
||||
|
||||
if (snippetIdMap.size > 0) {
|
||||
const oldSnippetIds = [...snippetIdMap.keys()];
|
||||
const embPlaceholders = oldSnippetIds.map(() => '?').join(',');
|
||||
const ancestorEmbeddings = this.db
|
||||
.prepare(
|
||||
`SELECT * FROM snippet_embeddings WHERE snippet_id IN (${embPlaceholders})`
|
||||
)
|
||||
.all(...oldSnippetIds) as Array<{
|
||||
snippet_id: string;
|
||||
profile_id: string;
|
||||
model: string;
|
||||
dimensions: number;
|
||||
embedding: Buffer;
|
||||
created_at: number;
|
||||
}>;
|
||||
for (const emb of ancestorEmbeddings) {
|
||||
const newSnippetId = snippetIdMap.get(emb.snippet_id)!;
|
||||
this.db
|
||||
.prepare(
|
||||
`INSERT INTO snippet_embeddings (snippet_id, profile_id, model, dimensions, embedding, created_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?)`
|
||||
)
|
||||
.run(
|
||||
newSnippetId,
|
||||
emb.profile_id,
|
||||
emb.model,
|
||||
emb.dimensions,
|
||||
emb.embedding,
|
||||
emb.created_at
|
||||
);
|
||||
this.sqliteVecStore.upsertEmbeddingBuffer(
|
||||
emb.profile_id,
|
||||
newSnippetId,
|
||||
emb.embedding,
|
||||
emb.dimensions
|
||||
);
|
||||
}
|
||||
}
|
||||
})();
|
||||
cloneFromAncestorInDatabase(this.db, request);
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// Private — atomic snippet replacement
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
private replaceSnippets(
|
||||
private async replaceSnippets(
|
||||
_repositoryId: string,
|
||||
changedDocIds: string[],
|
||||
newDocuments: NewDocument[],
|
||||
newSnippets: NewSnippet[]
|
||||
): void {
|
||||
const insertDoc = this.db.prepare(
|
||||
`INSERT INTO documents
|
||||
(id, repository_id, version_id, file_path, title, language,
|
||||
token_count, checksum, indexed_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`
|
||||
);
|
||||
): Promise<void> {
|
||||
if (this.writeDelegate?.replaceSnippets) {
|
||||
await this.writeDelegate.replaceSnippets(changedDocIds, newDocuments, newSnippets);
|
||||
return;
|
||||
}
|
||||
|
||||
const insertSnippet = this.db.prepare(
|
||||
`INSERT INTO snippets
|
||||
(id, document_id, repository_id, version_id, type, title,
|
||||
content, language, breadcrumb, token_count, created_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
|
||||
);
|
||||
|
||||
this.db.transaction(() => {
|
||||
this.sqliteVecStore.deleteEmbeddingsForDocumentIds(changedDocIds);
|
||||
|
||||
// Delete stale documents (cascade deletes their snippets via FK).
|
||||
if (changedDocIds.length > 0) {
|
||||
const placeholders = changedDocIds.map(() => '?').join(',');
|
||||
this.db
|
||||
.prepare(`DELETE FROM documents WHERE id IN (${placeholders})`)
|
||||
.run(...changedDocIds);
|
||||
}
|
||||
|
||||
// Insert new documents.
|
||||
for (const doc of newDocuments) {
|
||||
const indexedAtSeconds =
|
||||
doc.indexedAt instanceof Date
|
||||
? Math.floor(doc.indexedAt.getTime() / 1000)
|
||||
: Math.floor(Date.now() / 1000);
|
||||
|
||||
insertDoc.run(
|
||||
doc.id,
|
||||
doc.repositoryId,
|
||||
doc.versionId ?? null,
|
||||
doc.filePath,
|
||||
doc.title ?? null,
|
||||
doc.language ?? null,
|
||||
doc.tokenCount ?? 0,
|
||||
doc.checksum,
|
||||
indexedAtSeconds
|
||||
);
|
||||
}
|
||||
|
||||
// Insert new snippets.
|
||||
for (const snippet of newSnippets) {
|
||||
const createdAtSeconds =
|
||||
snippet.createdAt instanceof Date
|
||||
? Math.floor(snippet.createdAt.getTime() / 1000)
|
||||
: Math.floor(Date.now() / 1000);
|
||||
|
||||
insertSnippet.run(
|
||||
snippet.id,
|
||||
snippet.documentId,
|
||||
snippet.repositoryId,
|
||||
snippet.versionId ?? null,
|
||||
snippet.type,
|
||||
snippet.title ?? null,
|
||||
snippet.content,
|
||||
snippet.language ?? null,
|
||||
snippet.breadcrumb ?? null,
|
||||
snippet.tokenCount ?? 0,
|
||||
createdAtSeconds
|
||||
);
|
||||
}
|
||||
})();
|
||||
replaceSnippetsInDatabase(this.db, changedDocIds, newDocuments, newSnippets);
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
@@ -709,9 +579,10 @@ export class IndexingPipeline {
|
||||
|
||||
private computeVersionStats(versionId: string): { totalSnippets: number } {
|
||||
const row = this.db
|
||||
.prepare<[string], { total_snippets: number }>(
|
||||
`SELECT COUNT(*) as total_snippets FROM snippets WHERE version_id = ?`
|
||||
)
|
||||
.prepare<
|
||||
[string],
|
||||
{ total_snippets: number }
|
||||
>(`SELECT COUNT(*) as total_snippets FROM snippets WHERE version_id = ?`)
|
||||
.get(versionId);
|
||||
|
||||
return { totalSnippets: row?.total_snippets ?? 0 };
|
||||
@@ -750,6 +621,10 @@ export class IndexingPipeline {
|
||||
}
|
||||
|
||||
private updateJob(id: string, fields: Record<string, unknown>): void {
|
||||
if (this.writeDelegate?.persistJobUpdates === false) {
|
||||
return;
|
||||
}
|
||||
|
||||
const sets = Object.keys(fields)
|
||||
.map((k) => `${toSnake(k)} = ?`)
|
||||
.join(', ');
|
||||
@@ -757,43 +632,44 @@ export class IndexingPipeline {
|
||||
this.db.prepare(`UPDATE indexing_jobs SET ${sets} WHERE id = ?`).run(...values);
|
||||
}
|
||||
|
||||
private updateRepo(id: string, fields: Record<string, unknown>): void {
|
||||
const now = Math.floor(Date.now() / 1000);
|
||||
const allFields = { ...fields, updatedAt: now };
|
||||
const sets = Object.keys(allFields)
|
||||
.map((k) => `${toSnake(k)} = ?`)
|
||||
.join(', ');
|
||||
const values = [...Object.values(allFields), id];
|
||||
this.db.prepare(`UPDATE repositories SET ${sets} WHERE id = ?`).run(...values);
|
||||
private async updateRepo(id: string, fields: SerializedFields): Promise<void> {
|
||||
if (this.writeDelegate?.updateRepo) {
|
||||
await this.writeDelegate.updateRepo(id, fields);
|
||||
return;
|
||||
}
|
||||
|
||||
updateRepoInDatabase(this.db, id, fields);
|
||||
}
|
||||
|
||||
private updateVersion(id: string, fields: Record<string, unknown>): void {
|
||||
const sets = Object.keys(fields)
|
||||
.map((k) => `${toSnake(k)} = ?`)
|
||||
.join(', ');
|
||||
const values = [...Object.values(fields), id];
|
||||
this.db.prepare(`UPDATE repository_versions SET ${sets} WHERE id = ?`).run(...values);
|
||||
private async updateVersion(id: string, fields: SerializedFields): Promise<void> {
|
||||
if (this.writeDelegate?.updateVersion) {
|
||||
await this.writeDelegate.updateVersion(id, fields);
|
||||
return;
|
||||
}
|
||||
|
||||
updateVersionInDatabase(this.db, id, fields);
|
||||
}
|
||||
|
||||
private upsertRepoConfig(
|
||||
private async upsertRepoConfig(
|
||||
repositoryId: string,
|
||||
versionId: string | null,
|
||||
rules: string[]
|
||||
): void {
|
||||
): Promise<void> {
|
||||
if (this.writeDelegate?.upsertRepoConfig) {
|
||||
await this.writeDelegate.upsertRepoConfig(repositoryId, versionId, rules);
|
||||
return;
|
||||
}
|
||||
|
||||
const now = Math.floor(Date.now() / 1000);
|
||||
// Use DELETE + INSERT because ON CONFLICT … DO UPDATE doesn't work reliably
|
||||
// with partial unique indexes in all SQLite versions.
|
||||
if (versionId === null) {
|
||||
this.db
|
||||
.prepare(
|
||||
`DELETE FROM repository_configs WHERE repository_id = ? AND version_id IS NULL`
|
||||
)
|
||||
.prepare(`DELETE FROM repository_configs WHERE repository_id = ? AND version_id IS NULL`)
|
||||
.run(repositoryId);
|
||||
} else {
|
||||
this.db
|
||||
.prepare(
|
||||
`DELETE FROM repository_configs WHERE repository_id = ? AND version_id = ?`
|
||||
)
|
||||
.prepare(`DELETE FROM repository_configs WHERE repository_id = ? AND version_id = ?`)
|
||||
.run(repositoryId, versionId);
|
||||
}
|
||||
this.db
|
||||
|
||||
Reference in New Issue
Block a user