From 666ec7d55f65eeb680135b26c6b34fa0c204525c Mon Sep 17 00:00:00 2001 From: Giancarmine Salucci Date: Sat, 28 Mar 2026 10:44:30 +0100 Subject: [PATCH] feat(MULTIVERSION-0001): wire trueref.json into pipeline + per-version rules - Add migration 0003: recreate repository_configs with nullable version_id column and two partial unique indexes (repo-wide: version_id IS NULL, per-version: (repository_id, version_id) WHERE version_id IS NOT NULL) - Update schema.ts to reflect the new composite structure with uniqueIndex partial constraints via drizzle-orm sql helper - IndexingPipeline: parse trueref.json / context7.json after crawl, apply excludeFiles filter before diff computation, update totalFiles accordingly - IndexingPipeline: persist repo-wide rules (version_id=null) and version-specific rules (when versionId set) via upsertRepoConfig helper - Add matchesExcludePattern static helper supporting plain filename, glob prefix (docs/legacy*), and exact path patterns - context endpoint: split getRules into repo-wide + version-specific lookup with dedup merge; pass versionId at call site - Update test DB loaders to include migration 0003 - Add pipeline tests for excludeFiles, repo-wide rules persistence, and per-version rules persistence - Add integration tests for merged rules, repo-only rules, and dedup logic Co-Authored-By: Claude Sonnet 4.6 --- .../migrations/0003_multiversion_config.sql | 30 +++++ .../server/db/migrations/meta/_journal.json | 7 ++ src/lib/server/db/schema.ts | 53 +++++--- .../server/pipeline/indexing.pipeline.test.ts | 116 +++++++++++++++++- src/lib/server/pipeline/indexing.pipeline.ts | 91 +++++++++++++- .../api/v1/api-contract.integration.test.ts | 99 +++++++++++++++ src/routes/api/v1/context/+server.ts | 54 ++++++-- 7 files changed, 418 insertions(+), 32 deletions(-) create mode 100644 src/lib/server/db/migrations/0003_multiversion_config.sql diff --git a/src/lib/server/db/migrations/0003_multiversion_config.sql 
b/src/lib/server/db/migrations/0003_multiversion_config.sql new file mode 100644 index 0000000..764f2bd --- /dev/null +++ b/src/lib/server/db/migrations/0003_multiversion_config.sql @@ -0,0 +1,30 @@ +PRAGMA foreign_keys=OFF; +--> statement-breakpoint +CREATE TABLE `__new_repository_configs` ( + `repository_id` text NOT NULL, + `version_id` text, + `project_title` text, + `description` text, + `folders` text, + `exclude_folders` text, + `exclude_files` text, + `rules` text, + `previous_versions` text, + `updated_at` integer NOT NULL, + FOREIGN KEY (`repository_id`) REFERENCES `repositories`(`id`) ON UPDATE no action ON DELETE cascade +); +--> statement-breakpoint +INSERT INTO `__new_repository_configs` + (repository_id, version_id, project_title, description, folders, exclude_folders, exclude_files, rules, previous_versions, updated_at) + SELECT repository_id, NULL, project_title, description, folders, exclude_folders, exclude_files, rules, previous_versions, updated_at + FROM `repository_configs`; +--> statement-breakpoint +DROP TABLE `repository_configs`; +--> statement-breakpoint +ALTER TABLE `__new_repository_configs` RENAME TO `repository_configs`; +--> statement-breakpoint +PRAGMA foreign_keys=ON; +--> statement-breakpoint +CREATE UNIQUE INDEX `uniq_repo_config_base` ON `repository_configs` (`repository_id`) WHERE `version_id` IS NULL; +--> statement-breakpoint +CREATE UNIQUE INDEX `uniq_repo_config_version` ON `repository_configs` (`repository_id`, `version_id`) WHERE `version_id` IS NOT NULL; diff --git a/src/lib/server/db/migrations/meta/_journal.json b/src/lib/server/db/migrations/meta/_journal.json index 6545d34..9c1be26 100644 --- a/src/lib/server/db/migrations/meta/_journal.json +++ b/src/lib/server/db/migrations/meta/_journal.json @@ -22,6 +22,13 @@ "when": 1774461897742, "tag": "0002_silky_stellaris", "breakpoints": true + }, + { + "idx": 3, + "version": "6", + "when": 1774691070000, + "tag": "0003_multiversion_config", + "breakpoints": true } ] }
diff --git a/src/lib/server/db/schema.ts b/src/lib/server/db/schema.ts index 889f217..4afa044 100644 --- a/src/lib/server/db/schema.ts +++ b/src/lib/server/db/schema.ts @@ -1,4 +1,13 @@ -import { blob, integer, primaryKey, real, sqliteTable, text } from 'drizzle-orm/sqlite-core'; +import { sql } from 'drizzle-orm'; +import { + blob, + integer, + primaryKey, + real, + sqliteTable, + text, + uniqueIndex +} from 'drizzle-orm/sqlite-core'; // --------------------------------------------------------------------------- // repositories @@ -148,21 +157,33 @@ export const indexingJobs = sqliteTable('indexing_jobs', { // --------------------------------------------------------------------------- // repository_configs // --------------------------------------------------------------------------- -export const repositoryConfigs = sqliteTable('repository_configs', { - repositoryId: text('repository_id') - .primaryKey() - .references(() => repositories.id, { onDelete: 'cascade' }), - projectTitle: text('project_title'), - description: text('description'), - folders: text('folders', { mode: 'json' }).$type(), - excludeFolders: text('exclude_folders', { mode: 'json' }).$type(), - excludeFiles: text('exclude_files', { mode: 'json' }).$type(), - rules: text('rules', { mode: 'json' }).$type(), - previousVersions: text('previous_versions', { mode: 'json' }).$type< - { tag: string; title: string; commitHash?: string }[] - >(), - updatedAt: integer('updated_at', { mode: 'timestamp' }).notNull() -}); +export const repositoryConfigs = sqliteTable( + 'repository_configs', + { + repositoryId: text('repository_id') + .notNull() + .references(() => repositories.id, { onDelete: 'cascade' }), + versionId: text('version_id'), + projectTitle: text('project_title'), + description: text('description'), + folders: text('folders', { mode: 'json' }).$type(), + excludeFolders: text('exclude_folders', { mode: 'json' }).$type(), + excludeFiles: text('exclude_files', { mode: 'json' }).$type(), + rules: 
text('rules', { mode: 'json' }).$type(), + previousVersions: text('previous_versions', { mode: 'json' }).$type< + { tag: string; title: string; commitHash?: string }[] + >(), + updatedAt: integer('updated_at', { mode: 'timestamp' }).notNull() + }, + (table) => [ + uniqueIndex('uniq_repo_config_base') + .on(table.repositoryId) + .where(sql`${table.versionId} IS NULL`), + uniqueIndex('uniq_repo_config_version') + .on(table.repositoryId, table.versionId) + .where(sql`${table.versionId} IS NOT NULL`) + ] +); // --------------------------------------------------------------------------- // settings diff --git a/src/lib/server/pipeline/indexing.pipeline.test.ts b/src/lib/server/pipeline/indexing.pipeline.test.ts index 3a86dbb..59c0078 100644 --- a/src/lib/server/pipeline/indexing.pipeline.test.ts +++ b/src/lib/server/pipeline/indexing.pipeline.test.ts @@ -26,7 +26,8 @@ function createTestDb(): Database.Database { for (const migrationFile of [ '0000_large_master_chief.sql', '0001_quick_nighthawk.sql', - '0002_silky_stellaris.sql' + '0002_silky_stellaris.sql', + '0003_multiversion_config.sql' ]) { const migrationSql = readFileSync(join(migrationsFolder, migrationFile), 'utf-8'); @@ -771,4 +772,117 @@ describe('IndexingPipeline', () => { ref: undefined }); }); + + it('excludes files matching excludeFiles patterns from trueref.json', async () => { + const truerefConfig = JSON.stringify({ + excludeFiles: ['migration-guide.md', 'docs/legacy*'] + }); + const files = [ + { + path: 'trueref.json', + content: truerefConfig, + sha: 'sha-config', + language: 'json' + }, + { + path: 'README.md', + content: '# Hello\n\nThis is documentation.', + sha: 'sha-readme', + language: 'markdown' + }, + { + path: 'migration-guide.md', + content: '# Migration Guide\n\nThis should be excluded.', + sha: 'sha-migration', + language: 'markdown' + }, + { + path: 'docs/legacy-api.md', + content: '# Legacy API\n\nShould be excluded by glob prefix.', + sha: 'sha-legacy', + language: 'markdown' + } + ]; 
+ const pipeline = makePipeline({ files, totalFiles: files.length }); + const job = makeJob(); + + await pipeline.run(job as never); + + const docs = db + .prepare(`SELECT file_path FROM documents ORDER BY file_path`) + .all() as { file_path: string }[]; + const filePaths = docs.map((d) => d.file_path); + + // migration-guide.md and docs/legacy-api.md must be absent. + expect(filePaths).not.toContain('migration-guide.md'); + expect(filePaths).not.toContain('docs/legacy-api.md'); + + // README.md must still be indexed. + expect(filePaths).toContain('README.md'); + }); + + it('persists repo-wide rules from trueref.json to repository_configs after indexing', async () => { + const truerefConfig = JSON.stringify({ + rules: ['Always use TypeScript strict mode', 'Prefer async/await over callbacks'] + }); + const files = [ + { + path: 'trueref.json', + content: truerefConfig, + sha: 'sha-config', + language: 'json' + } + ]; + const pipeline = makePipeline({ files, totalFiles: files.length }); + const job = makeJob(); + + await pipeline.run(job as never); + + const row = db + .prepare( + `SELECT rules FROM repository_configs WHERE repository_id = '/test/repo' AND version_id IS NULL` + ) + .get() as { rules: string } | undefined; + + expect(row).toBeDefined(); + const rules = JSON.parse(row!.rules); + expect(rules).toEqual(['Always use TypeScript strict mode', 'Prefer async/await over callbacks']); + }); + + it('persists version-specific rules under (repositoryId, versionId) when job has versionId', async () => { + const versionId = insertVersion(db, { tag: 'v2.0.0', state: 'pending' }); + const truerefConfig = JSON.stringify({ + rules: ['This is v2. 
Use the new Builder API.'] + }); + const files = [ + { + path: 'trueref.json', + content: truerefConfig, + sha: 'sha-config', + language: 'json' + } + ]; + const pipeline = makePipeline({ files, totalFiles: files.length }); + const job = makeJob('/test/repo', versionId); + + await pipeline.run(job as never); + + // Repo-wide row (version_id IS NULL) must exist. + const repoRow = db + .prepare( + `SELECT rules FROM repository_configs WHERE repository_id = '/test/repo' AND version_id IS NULL` + ) + .get() as { rules: string } | undefined; + expect(repoRow).toBeDefined(); + + // Version-specific row must also exist. + const versionRow = db + .prepare( + `SELECT rules FROM repository_configs WHERE repository_id = '/test/repo' AND version_id = ?` + ) + .get(versionId) as { rules: string } | undefined; + expect(versionRow).toBeDefined(); + const rules = JSON.parse(versionRow!.rules); + expect(rules).toEqual(['This is v2. Use the new Builder API.']); + }); }); diff --git a/src/lib/server/pipeline/indexing.pipeline.ts b/src/lib/server/pipeline/indexing.pipeline.ts index 6a9c752..ed01062 100644 --- a/src/lib/server/pipeline/indexing.pipeline.ts +++ b/src/lib/server/pipeline/indexing.pipeline.ts @@ -22,6 +22,7 @@ import type { EmbeddingService } from '$lib/server/embeddings/embedding.service. import { RepositoryMapper } from '$lib/server/mappers/repository.mapper.js'; import { IndexingJob } from '$lib/server/models/indexing-job.js'; import { Repository, RepositoryEntity } from '$lib/server/models/repository.js'; +import { resolveConfig } from '$lib/server/config/config-parser.js'; import { parseFile } from '$lib/server/parser/index.js'; import { computeTrustScore } from '$lib/server/search/trust-score.js'; import { computeDiff } from './diff.js'; @@ -99,15 +100,32 @@ export class IndexingPipeline { ? 
this.getVersionTag(normJob.versionId) : undefined; const crawlResult = await this.crawl(repo, versionTag); - const totalFiles = crawlResult.totalFiles; + // Parse trueref.json / context7.json if present in the crawl results. + const configFile = crawlResult.files.find( + (f) => f.path === 'trueref.json' || f.path === 'context7.json' + ); + const parsedConfig = configFile + ? resolveConfig([{ filename: configFile.path, content: configFile.content }]) + : null; + const excludeFiles: string[] = parsedConfig?.config.excludeFiles ?? []; + + // Filter out excluded files before diff computation. + const filteredFiles = + excludeFiles.length > 0 + ? crawlResult.files.filter( + (f) => !excludeFiles.some((pattern) => IndexingPipeline.matchesExcludePattern(f.path, pattern)) + ) + : crawlResult.files; + + const totalFiles = filteredFiles.length; this.updateJob(job.id, { totalFiles }); // ---- Stage 2: Parse & diff ------------------------------------------ // Load all existing documents for this repo so computeDiff can // classify every crawled file and detect deletions. const existingDocs = this.getExistingDocuments(repo.id, normJob.versionId); - const diff = computeDiff(crawlResult.files, existingDocs); + const diff = computeDiff(filteredFiles, existingDocs); // Accumulate new documents/snippets; skip unchanged files. const newDocuments: NewDocument[] = []; @@ -244,6 +262,16 @@ export class IndexingPipeline { }); } + // ---- Stage 6: Persist rules from config ---------------------------- + if (parsedConfig?.config.rules?.length) { + // Repo-wide rules (versionId = null). + this.upsertRepoConfig(repo.id, null, parsedConfig.config.rules); + // Version-specific rules stored separately when indexing a version. 
+ if (normJob.versionId) { + this.upsertRepoConfig(repo.id, normJob.versionId, parsedConfig.config.rules); + } + } + this.updateJob(job.id, { status: 'done', progress: 100, @@ -476,6 +504,65 @@ export class IndexingPipeline { const values = [...Object.values(fields), id]; this.db.prepare(`UPDATE repository_versions SET ${sets} WHERE id = ?`).run(...values); } + + private upsertRepoConfig( + repositoryId: string, + versionId: string | null, + rules: string[] + ): void { + const now = Math.floor(Date.now() / 1000); + // Use DELETE + INSERT because ON CONFLICT … DO UPDATE doesn't work reliably + // with partial unique indexes in all SQLite versions. + if (versionId === null) { + this.db + .prepare( + `DELETE FROM repository_configs WHERE repository_id = ? AND version_id IS NULL` + ) + .run(repositoryId); + } else { + this.db + .prepare( + `DELETE FROM repository_configs WHERE repository_id = ? AND version_id = ?` + ) + .run(repositoryId, versionId); + } + this.db + .prepare( + `INSERT INTO repository_configs (repository_id, version_id, rules, updated_at) + VALUES (?, ?, ?, ?)` + ) + .run(repositoryId, versionId, JSON.stringify(rules), now); + } + + // ------------------------------------------------------------------------- + // Private — static helpers + // ------------------------------------------------------------------------- + + /** + * Returns true when `filePath` matches the given exclude `pattern`. + * + * Supported patterns: + * - Plain filename: `migration-guide.md` matches any path ending in `/migration-guide.md` + * or equal to `migration-guide.md`. + * - Glob prefix with wildcard: `docs/migration*` matches paths that start with `docs/migration`. + * - Exact path: `src/legacy/old-api.ts` matches exactly that path. + */ + private static matchesExcludePattern(filePath: string, pattern: string): boolean { + if (pattern.includes('*')) { + // Glob-style: treat everything before the '*' as a required prefix. 
+ const prefix = pattern.slice(0, pattern.indexOf('*')); + return filePath.startsWith(prefix); + } + + // No wildcard — treat as plain name or exact path. + if (!pattern.includes('/')) { + // Plain filename: match basename (path ends with / or equals pattern). + return filePath === pattern || filePath.endsWith('/' + pattern); + } + + // Contains a slash — exact path match. + return filePath === pattern; + } } // --------------------------------------------------------------------------- diff --git a/src/routes/api/v1/api-contract.integration.test.ts b/src/routes/api/v1/api-contract.integration.test.ts index 0131bcc..82ae634 100644 --- a/src/routes/api/v1/api-contract.integration.test.ts +++ b/src/routes/api/v1/api-contract.integration.test.ts @@ -55,6 +55,7 @@ function createTestDb(): Database.Database { const migration0 = readFileSync(join(migrationsFolder, '0000_large_master_chief.sql'), 'utf-8'); const migration1 = readFileSync(join(migrationsFolder, '0001_quick_nighthawk.sql'), 'utf-8'); const migration2 = readFileSync(join(migrationsFolder, '0002_silky_stellaris.sql'), 'utf-8'); + const migration3 = readFileSync(join(migrationsFolder, '0003_multiversion_config.sql'), 'utf-8'); // Apply first migration const statements0 = migration0 @@ -85,6 +86,15 @@ function createTestDb(): Database.Database { client.exec(statement); } + const statements3 = migration3 + .split('--> statement-breakpoint') + .map((statement) => statement.trim()) + .filter(Boolean); + + for (const statement of statements3) { + client.exec(statement); + } + client.exec(readFileSync(ftsFile, 'utf-8')); return client; @@ -487,6 +497,95 @@ describe('API contract integration', () => { }); }); + it('GET /api/v1/context returns merged repo-wide and version-specific rules', async () => { + const repositoryId = seedRepo(db); + const versionId = seedVersion(db, repositoryId, 'v2.0.0'); + const documentId = seedDocument(db, repositoryId, versionId); + + // Insert repo-wide rules (version_id IS NULL). 
+ db.prepare( + `INSERT INTO repository_configs (repository_id, version_id, rules, updated_at) + VALUES (?, NULL, ?, ?)` + ).run(repositoryId, JSON.stringify(['Repo-wide rule']), NOW_S); + + // Insert version-specific rules. + db.prepare( + `INSERT INTO repository_configs (repository_id, version_id, rules, updated_at) + VALUES (?, ?, ?, ?)` + ).run(repositoryId, versionId, JSON.stringify(['Version-specific rule']), NOW_S); + + seedSnippet(db, { + documentId, + repositoryId, + versionId, + content: 'some versioned content' + }); + + const response = await getContext({ + url: new URL( + `http://test/api/v1/context?libraryId=${encodeURIComponent(`${repositoryId}/v2.0.0`)}&query=${encodeURIComponent('versioned content')}` + ) + } as never); + + expect(response.status).toBe(200); + const body = await response.json(); + // Both repo-wide and version-specific rules should appear (deduped). + expect(body.rules).toEqual(['Repo-wide rule', 'Version-specific rule']); + }); + + it('GET /api/v1/context returns only repo-wide rules when no version is requested', async () => { + const repositoryId = seedRepo(db); + const documentId = seedDocument(db, repositoryId); + + // Insert repo-wide rules (version_id IS NULL). 
+ db.prepare( + `INSERT INTO repository_configs (repository_id, version_id, rules, updated_at) + VALUES (?, NULL, ?, ?)` + ).run(repositoryId, JSON.stringify(['Repo-wide rule only']), NOW_S); + + seedSnippet(db, { documentId, repositoryId, content: 'some content' }); + + const response = await getContext({ + url: new URL( + `http://test/api/v1/context?libraryId=${encodeURIComponent(repositoryId)}&query=${encodeURIComponent('some content')}` + ) + } as never); + + expect(response.status).toBe(200); + const body = await response.json(); + expect(body.rules).toEqual(['Repo-wide rule only']); + }); + + it('GET /api/v1/context deduplicates rules that appear in both repo-wide and version config', async () => { + const repositoryId = seedRepo(db); + const versionId = seedVersion(db, repositoryId, 'v3.0.0'); + const documentId = seedDocument(db, repositoryId, versionId); + + const sharedRule = 'Use TypeScript strict mode'; + db.prepare( + `INSERT INTO repository_configs (repository_id, version_id, rules, updated_at) + VALUES (?, NULL, ?, ?)` + ).run(repositoryId, JSON.stringify([sharedRule]), NOW_S); + + db.prepare( + `INSERT INTO repository_configs (repository_id, version_id, rules, updated_at) + VALUES (?, ?, ?, ?)` + ).run(repositoryId, versionId, JSON.stringify([sharedRule, 'Version-only rule']), NOW_S); + + seedSnippet(db, { documentId, repositoryId, versionId, content: 'dedup test content' }); + + const response = await getContext({ + url: new URL( + `http://test/api/v1/context?libraryId=${encodeURIComponent(`${repositoryId}/v3.0.0`)}&query=${encodeURIComponent('dedup test')}` + ) + } as never); + + expect(response.status).toBe(200); + const body = await response.json(); + // sharedRule appears once, version-only rule appended. 
+ expect(body.rules).toEqual([sharedRule, 'Version-only rule']); + }); + it('GET /api/v1/context returns 404 with VERSION_NOT_FOUND when version does not exist', async () => { const repositoryId = seedRepo(db); diff --git a/src/routes/api/v1/context/+server.ts b/src/routes/api/v1/context/+server.ts index c42f8de..ed711dc 100644 --- a/src/routes/api/v1/context/+server.ts +++ b/src/routes/api/v1/context/+server.ts @@ -54,24 +54,52 @@ interface RawRepoConfig { rules: string | null; } -function getRules(db: ReturnType, repositoryId: string): string[] { - const row = db - .prepare< - [string], - RawRepoConfig - >(`SELECT rules FROM repository_configs WHERE repository_id = ?`) - .get(repositoryId); - - if (!row?.rules) return []; - +function parseRulesJson(raw: string | null | undefined): string[] { + if (!raw) return []; try { - const parsed = JSON.parse(row.rules); + const parsed = JSON.parse(raw); return Array.isArray(parsed) ? (parsed as string[]) : []; } catch { return []; } } +function getRules( + db: ReturnType, + repositoryId: string, + versionId?: string +): string[] { + // Repo-wide rules (version_id IS NULL). + const repoRow = db + .prepare< + [string], + RawRepoConfig + >(`SELECT rules FROM repository_configs WHERE repository_id = ? AND version_id IS NULL`) + .get(repositoryId); + + const repoRules = parseRulesJson(repoRow?.rules); + + if (!versionId) return repoRules; + + // Version-specific rules. + const versionRow = db + .prepare< + [string, string], + RawRepoConfig + >(`SELECT rules FROM repository_configs WHERE repository_id = ? AND version_id = ?`) + .get(repositoryId, versionId); + + const versionRules = parseRulesJson(versionRow?.rules); + + // Merge: repo-wide first, then version-specific (deduped by content). 
+ const seen = new Set(repoRules); + const merged = [...repoRules]; + for (const r of versionRules) { + if (!seen.has(r)) merged.push(r); + } + return merged; +} + interface RawRepoState { state: 'pending' | 'indexing' | 'indexed' | 'error'; id: string; @@ -283,8 +311,8 @@ export const GET: RequestHandler = async ({ url }) => { snippetVersions }; - // Load rules from repository_configs. - const rules = getRules(db, parsed.repositoryId); + // Load rules from repository_configs (repo-wide + version-specific merged). + const rules = getRules(db, parsed.repositoryId, versionId); if (responseType === 'txt') { const text = formatContextTxt(selectedResults, rules, metadata);