/**
 * IndexingPipeline — orchestrates the full crawl → parse → store → embed
 * flow for a single repository indexing job (TRUEREF-0009).
 *
 * Atomicity guarantee:
 *   Old documents/snippets for changed files are deleted and replaced inside
 *   a single SQLite transaction. If anything fails after that transaction the
 *   already-committed data stays intact and the job is marked failed so
 *   callers can retry.
 *
 * Progress model:
 *   - Without embeddings: crawl+parse = 100 %
 *   - With embeddings   : crawl+parse = 80 %, embeddings = 20 %
 */

import { createHash, randomUUID } from 'node:crypto';
import type Database from 'better-sqlite3';
import type { Document, NewDocument, NewSnippet } from '$lib/types';
import type { crawl as GithubCrawlFn } from '$lib/server/crawler/github.crawler.js';
import type { LocalCrawler } from '$lib/server/crawler/local.crawler.js';
import type { EmbeddingService } from '$lib/server/embeddings/embedding.service.js';
import { RepositoryMapper } from '$lib/server/mappers/repository.mapper.js';
import { IndexingJob } from '$lib/server/models/indexing-job.js';
import { Repository, RepositoryEntity } from '$lib/server/models/repository.js';
import { parseFile } from '$lib/server/parser/index.js';
import { computeTrustScore } from '$lib/server/search/trust-score.js';
import { computeDiff } from './diff.js';

// ---------------------------------------------------------------------------
// Progress calculation
// ---------------------------------------------------------------------------

/** Share of the progress bar given to crawl+parse when embeddings are enabled. */
const PARSE_PROGRESS_SHARE = 80;

/** Share of the progress bar given to the embedding stage when it is enabled. */
const EMBED_PROGRESS_SHARE = 20;

/**
 * Upper bound on ids bound to a single `DELETE ... IN (...)` statement.
 * SQLite caps the number of bound parameters per statement, so stale
 * documents are removed in batches of at most this size.
 */
const DELETE_BATCH_SIZE = 500;

/**
 * Compute the job's progress as a rounded percentage.
 *
 * @param processedFiles  Files handled so far (unchanged files count as handled).
 * @param totalFiles      Total files reported by the crawl.
 * @param embeddingsDone  Snippets embedded so far.
 * @param embeddingsTotal Snippets scheduled for embedding.
 * @param hasEmbeddings   Whether an embedding stage is part of this run.
 * @returns 0 when `totalFiles` is 0; otherwise the file ratio scaled to 100
 *          (no embeddings) or to 80 plus the embedding ratio scaled to 20.
 */
function calculateProgress(
	processedFiles: number,
	totalFiles: number,
	embeddingsDone: number,
	embeddingsTotal: number,
	hasEmbeddings: boolean
): number {
	if (totalFiles === 0) return 0;

	const fileRatio = processedFiles / totalFiles;
	if (!hasEmbeddings) {
		return Math.round(fileRatio * 100);
	}

	const parseProgress = fileRatio * PARSE_PROGRESS_SHARE;
	// Before any snippets are scheduled the embedding share contributes nothing.
	const embedProgress =
		embeddingsTotal > 0 ? (embeddingsDone / embeddingsTotal) * EMBED_PROGRESS_SHARE : 0;

	return Math.round(parseProgress + embedProgress);
}

// ---------------------------------------------------------------------------
// SHA-256 helper
// ---------------------------------------------------------------------------

/** Return the hex-encoded SHA-256 digest of `content`, hashed as UTF-8. */
function sha256(content: string): string {
	return createHash('sha256').update(content, 'utf-8').digest('hex');
}

// ---------------------------------------------------------------------------
// IndexingPipeline
// ---------------------------------------------------------------------------

/**
 * Runs one indexing job end to end: crawl the repository, diff the crawled
 * files against the stored documents, parse new/modified files into snippets,
 * swap the stale rows for the new ones in one transaction, optionally embed
 * the new snippets, and finally refresh the repository's stats.
 */
export class IndexingPipeline {
	/**
	 * @param db               better-sqlite3 handle used for every read and write.
	 * @param githubCrawl      Crawl function used when `repo.source === 'github'`.
	 * @param localCrawler     Crawler used for every other repository source.
	 * @param embeddingService Embedding provider, or `null` to skip the embedding stage.
	 */
	constructor(
		private readonly db: Database.Database,
		private readonly githubCrawl: typeof GithubCrawlFn,
		private readonly localCrawler: LocalCrawler,
		private readonly embeddingService: EmbeddingService | null
	) {}

	// -------------------------------------------------------------------------
	// Public — run a job end to end
	// -------------------------------------------------------------------------

	/**
	 * Execute `job` and record its status, progress and outcome in the database.
	 *
	 * On failure the job is marked `failed`, the repository state is set to
	 * `error`, and the original error is rethrown to the caller.
	 *
	 * @param job The job to run. Both camelCase and snake_case key shapes are accepted.
	 * @throws Whatever error caused the job to fail.
	 */
	async run(job: IndexingJob): Promise<void> {
		// better-sqlite3 raw queries return snake_case keys; Drizzle types use camelCase.
		// Accept both so the pipeline works when called from raw SQL contexts.
		const raw = job as unknown as Record<string, unknown>;
		const repositoryId = (job.repositoryId ?? raw['repository_id']) as string;
		const versionId = (job.versionId ?? raw['version_id'] ?? null) as string | null;

		// Rebuild a normalised job view for the rest of this method.
		const normJob = { ...job, repositoryId, versionId };

		this.updateJob(job.id, { status: 'running', startedAt: Math.floor(Date.now() / 1000) });

		try {
			const repo = this.getRepository(repositoryId);
			if (!repo) throw new Error(`Repository ${repositoryId} not found`);

			// Mark repo as actively indexing.
			this.updateRepo(repo.id, { state: 'indexing' });

			// ---- Stage 1: Crawl -------------------------------------------------
			const crawlResult = await this.crawl(repo, normJob);
			const totalFiles = crawlResult.totalFiles;
			this.updateJob(job.id, { totalFiles });

			// ---- Stage 2: Parse & diff ------------------------------------------
			// Load all existing documents for this repo so computeDiff can
			// classify every crawled file and detect deletions.
			const existingDocs = this.getExistingDocuments(repo.id, normJob.versionId);
			const diff = computeDiff(crawlResult.files, existingDocs);

			// Index stored documents by path once instead of scanning the list for
			// every changed file. The first document per path wins, which matches
			// what a linear first-match search over `existingDocs` would return.
			const existingByPath = new Map<string, Document>();
			for (const doc of existingDocs) {
				if (!existingByPath.has(doc.filePath)) existingByPath.set(doc.filePath, doc);
			}

			// Accumulate new documents/snippets; skip unchanged files.
			const newDocuments: NewDocument[] = [];
			const newSnippets: NewSnippet[] = [];
			const changedDocIds: string[] = [];

			// Schedule stale documents (modified + deleted) for deletion.
			for (const file of diff.modified) {
				const existing = existingByPath.get(file.path);
				if (existing) changedDocIds.push(existing.id);
			}
			for (const filePath of diff.deleted) {
				const existing = existingByPath.get(filePath);
				if (existing) changedDocIds.push(existing.id);
			}

			// Only parse and embed files that are new or have changed.
			const filesToProcess = [...diff.added, ...diff.modified];
			const hasEmbeddings = this.embeddingService !== null;
			let processedFiles = diff.unchanged.length; // unchanged files count as processed

			// Report unchanged files as already processed so the progress bar
			// immediately reflects real work done (especially on incremental re-index
			// where most or all files are unchanged).
			if (processedFiles > 0) {
				const initialProgress = calculateProgress(processedFiles, totalFiles, 0, 0, hasEmbeddings);
				this.updateJob(job.id, { processedFiles, progress: initialProgress });
			}

			for (const [i, file] of filesToProcess.entries()) {
				// Prefer the crawler-provided sha; hash the content when it is empty/missing.
				const checksum = file.sha || sha256(file.content);

				// Create new document record.
				const documentId = randomUUID();
				const now = new Date();

				const newDoc: NewDocument = {
					id: documentId,
					repositoryId: repo.id,
					versionId: normJob.versionId ?? null,
					filePath: file.path,
					title: null,
					language: file.language,
					tokenCount: 0,
					checksum,
					indexedAt: now
				};

				// Parse into snippets.
				const snippets = parseFile(file, {
					repositoryId: repo.id,
					documentId,
					versionId: normJob.versionId ?? undefined
				});

				// Update document token count from snippet totals.
				const tokenCount = snippets.reduce((sum, s) => sum + (s.tokenCount ?? 0), 0);
				newDoc.tokenCount = tokenCount;

				newDocuments.push(newDoc);
				newSnippets.push(...snippets);

				// Count ALL files (including skipped unchanged ones) in progress.
				const totalProcessed = diff.unchanged.length + i + 1;
				const progress = calculateProgress(totalProcessed, totalFiles, 0, 0, hasEmbeddings);
				this.updateJob(job.id, { processedFiles: totalProcessed, progress });
			}

			// After the loop processedFiles should reflect the full count.
			processedFiles = diff.unchanged.length + filesToProcess.length;

			// ---- Stage 3: Atomic replacement ------------------------------------
			this.replaceSnippets(repo.id, changedDocIds, newDocuments, newSnippets);

			// ---- Stage 4: Embeddings (if provider is configured) ----------------
			if (this.embeddingService && newSnippets.length > 0) {
				const snippetIds = newSnippets.map((s) => s.id!);
				const embeddingsTotal = snippetIds.length;

				await this.embeddingService.embedSnippets(snippetIds, (done) => {
					const progress = calculateProgress(
						processedFiles,
						totalFiles,
						done,
						embeddingsTotal,
						true
					);
					this.updateJob(job.id, { progress });
				});
			}

			// ---- Stage 5: Update repository stats --------------------------------
			const stats = this.computeStats(repo.id);
			// Re-read the row so the trust score is computed from current values;
			// fall back to the copy loaded at the start if the row has disappeared.
			const freshRepo = this.getRepository(repo.id) ?? repo;
			const trustScore = computeTrustScore({
				...freshRepo,
				totalSnippets: stats.totalSnippets,
				totalTokens: stats.totalTokens,
				state: 'indexed'
			});

			this.updateRepo(repo.id, {
				state: 'indexed',
				totalSnippets: stats.totalSnippets,
				totalTokens: stats.totalTokens,
				trustScore,
				lastIndexedAt: Math.floor(Date.now() / 1000)
			});

			this.updateJob(job.id, {
				status: 'done',
				progress: 100,
				completedAt: Math.floor(Date.now() / 1000)
			});
		} catch (error) {
			const message = error instanceof Error ? error.message : String(error);
			console.error(`[IndexingPipeline] Job ${job.id} failed: ${message}`);

			this.recordFailure(job.id, repositoryId, message);

			throw error;
		}
	}

	/**
	 * Persist the failed outcome of a job: mark the job `failed` and move the
	 * repository to the `error` state (existing indexed data is left untouched).
	 *
	 * Each write is guarded separately so that a bookkeeping failure neither
	 * prevents the other write nor replaces the error that caused the job to fail.
	 */
	private recordFailure(jobId: string, repositoryId: string, message: string): void {
		try {
			this.updateJob(jobId, {
				status: 'failed',
				error: message,
				completedAt: Math.floor(Date.now() / 1000)
			});
		} catch (bookkeepingError) {
			console.error(
				`[IndexingPipeline] Job ${jobId}: could not record failed status`,
				bookkeepingError
			);
		}

		try {
			// Restore repo to error state but preserve any existing indexed data.
			this.updateRepo(repositoryId, { state: 'error' });
		} catch (bookkeepingError) {
			console.error(
				`[IndexingPipeline] Job ${jobId}: could not set repository error state`,
				bookkeepingError
			);
		}
	}

	// -------------------------------------------------------------------------
	// Private — crawl
	// -------------------------------------------------------------------------

	/**
	 * Fetch the repository's files using the crawler that matches `repo.source`.
	 *
	 * @param repo Repository to crawl.
	 * @param job  The job being run (currently not consulted by either crawl path).
	 * @returns The crawled files and the crawler-reported total file count.
	 * @throws If a GitHub repository id cannot be split into owner and repo name.
	 */
	private async crawl(
		repo: Repository,
		job: IndexingJob
	): Promise<{
		files: Array<{ path: string; content: string; sha: string; size: number; language: string }>;
		totalFiles: number;
	}> {
		if (repo.source === 'github') {
			// Parse owner/repo from the canonical ID: "/owner/repo"
			const parts = repo.id.replace(/^\//, '').split('/');
			const owner = parts[0];
			const repoName = parts[1];

			if (!owner || !repoName) {
				throw new Error(`Cannot parse GitHub owner/repo from id: ${repo.id}`);
			}

			const result = await this.githubCrawl({
				owner,
				repo: repoName,
				ref: repo.branch ?? undefined,
				token: repo.githubToken ?? undefined
			});

			return { files: result.files, totalFiles: result.totalFiles };
		} else {
			// Local filesystem crawl. No ref is passed when the branch is 'main'
			// or unset.
			const result = await this.localCrawler.crawl({
				rootPath: repo.sourceUrl,
				ref: repo.branch !== 'main' ? (repo.branch ?? undefined) : undefined
			});

			return { files: result.files, totalFiles: result.totalFiles };
		}
	}

	// -------------------------------------------------------------------------
	// Private — atomic snippet replacement
	// -------------------------------------------------------------------------

	/**
	 * Delete the stale documents and insert the new documents and snippets in
	 * a single transaction, so the swap is applied as one unit.
	 *
	 * @param _repositoryId Unused; kept for signature compatibility.
	 * @param changedDocIds Ids of documents to delete (modified or removed files).
	 * @param newDocuments  Document rows to insert.
	 * @param newSnippets   Snippet rows to insert.
	 */
	private replaceSnippets(
		_repositoryId: string,
		changedDocIds: string[],
		newDocuments: NewDocument[],
		newSnippets: NewSnippet[]
	): void {
		const insertDoc = this.db.prepare(
			`INSERT INTO documents
			   (id, repository_id, version_id, file_path, title, language,
			    token_count, checksum, indexed_at)
			 VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`
		);

		const insertSnippet = this.db.prepare(
			`INSERT INTO snippets
			   (id, document_id, repository_id, version_id, type, title,
			    content, language, breadcrumb, token_count, created_at)
			 VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
		);

		this.db.transaction(() => {
			// Delete stale documents. Their snippets are expected to go with them
			// via a foreign-key cascade — NOTE(review): confirm the schema declares
			// ON DELETE CASCADE and that foreign keys are enabled on this connection.
			// Ids are bound in batches so a large re-index cannot exceed SQLite's
			// per-statement bound-parameter limit.
			for (let start = 0; start < changedDocIds.length; start += DELETE_BATCH_SIZE) {
				const batch = changedDocIds.slice(start, start + DELETE_BATCH_SIZE);
				const placeholders = batch.map(() => '?').join(',');
				this.db.prepare(`DELETE FROM documents WHERE id IN (${placeholders})`).run(...batch);
			}

			// Insert new documents.
			for (const doc of newDocuments) {
				// Timestamps are stored as whole seconds; fall back to "now" when
				// the record does not carry a Date.
				const indexedAtSeconds =
					doc.indexedAt instanceof Date
						? Math.floor(doc.indexedAt.getTime() / 1000)
						: Math.floor(Date.now() / 1000);

				insertDoc.run(
					doc.id,
					doc.repositoryId,
					doc.versionId ?? null,
					doc.filePath,
					doc.title ?? null,
					doc.language ?? null,
					doc.tokenCount ?? 0,
					doc.checksum,
					indexedAtSeconds
				);
			}

			// Insert new snippets.
			for (const snippet of newSnippets) {
				const createdAtSeconds =
					snippet.createdAt instanceof Date
						? Math.floor(snippet.createdAt.getTime() / 1000)
						: Math.floor(Date.now() / 1000);

				insertSnippet.run(
					snippet.id,
					snippet.documentId,
					snippet.repositoryId,
					snippet.versionId ?? null,
					snippet.type,
					snippet.title ?? null,
					snippet.content,
					snippet.language ?? null,
					snippet.breadcrumb ?? null,
					snippet.tokenCount ?? 0,
					createdAtSeconds
				);
			}
		})();
	}

	// -------------------------------------------------------------------------
	// Private — stats
	// -------------------------------------------------------------------------

	/**
	 * Count the snippets and sum their token counts for a repository.
	 * The query filters on `repository_id` only, so every version is included.
	 */
	private computeStats(repositoryId: string): { totalSnippets: number; totalTokens: number } {
		const row = this.db
			.prepare<[string], { total_snippets: number; total_tokens: number }>(
				`SELECT COUNT(*) as total_snippets,
				        COALESCE(SUM(token_count), 0) as total_tokens
				 FROM snippets WHERE repository_id = ?`
			)
			.get(repositoryId);

		return {
			totalSnippets: row?.total_snippets ?? 0,
			totalTokens: row?.total_tokens ?? 0
		};
	}

	// -------------------------------------------------------------------------
	// Private — DB helpers
	// -------------------------------------------------------------------------

	/**
	 * Load the stored documents for a repository, with columns aliased to
	 * camelCase. With a `versionId` only that version's documents are returned;
	 * without one, only documents whose `version_id` is NULL.
	 */
	private getExistingDocuments(repositoryId: string, versionId: string | null): Document[] {
		const selectDocuments = `SELECT id, repository_id as repositoryId, version_id as versionId,
			        file_path as filePath, title, language,
			        token_count as tokenCount, checksum, indexed_at as indexedAt
			 FROM documents`;

		if (versionId) {
			return this.db
				.prepare<[string, string], Document>(
					`${selectDocuments} WHERE repository_id = ? AND version_id = ?`
				)
				.all(repositoryId, versionId) as Document[];
		}

		return this.db
			.prepare<[string], Document>(
				`${selectDocuments} WHERE repository_id = ? AND version_id IS NULL`
			)
			.all(repositoryId) as Document[];
	}

	/** Load a repository by id and map it to the domain model; `null` if no row matches. */
	private getRepository(id: string): Repository | null {
		const raw = this.db
			.prepare<[string], RepositoryEntity>(`SELECT * FROM repositories WHERE id = ?`)
			.get(id);
		return raw ? RepositoryMapper.fromEntity(new RepositoryEntity(raw)) : null;
	}

	/** Update the given camelCase `fields` on the `indexing_jobs` row with this id. */
	private updateJob(id: string, fields: Record<string, unknown>): void {
		this.updateRow('indexing_jobs', id, fields);
	}

	/**
	 * Update the given camelCase `fields` on the `repositories` row with this id.
	 * `updatedAt` is always set to the current time in seconds.
	 */
	private updateRepo(id: string, fields: Record<string, unknown>): void {
		const now = Math.floor(Date.now() / 1000);
		this.updateRow('repositories', id, { ...fields, updatedAt: now });
	}

	/**
	 * Build and run `UPDATE <table> SET ... WHERE id = ?` from a field map.
	 *
	 * Keys are converted to snake_case column names and values are bound as
	 * parameters. Keys are interpolated into the SQL text, so they must come
	 * from code, never from untrusted input. An empty field map is a no-op,
	 * because it would otherwise produce a statement with an empty SET clause.
	 */
	private updateRow(
		table: 'indexing_jobs' | 'repositories',
		id: string,
		fields: Record<string, unknown>
	): void {
		const keys = Object.keys(fields);
		if (keys.length === 0) return;

		const sets = keys.map((k) => `${toSnake(k)} = ?`).join(', ');
		const values = [...Object.values(fields), id];
		this.db.prepare(`UPDATE ${table} SET ${sets} WHERE id = ?`).run(...values);
	}
}

// ---------------------------------------------------------------------------
// Utility
// ---------------------------------------------------------------------------

/** Convert camelCase to snake_case for DB column mapping. */
function toSnake(key: string): string {
	return key.replace(/[A-Z]/g, (c) => `_${c.toLowerCase()}`);
}