feat(MULTIVERSION-0001): wire trueref.json into pipeline + per-version rules

- Add migration 0003: recreate repository_configs with nullable version_id
  column and two partial unique indexes (repo-wide: version_id IS NULL,
  per-version: (repository_id, version_id) WHERE version_id IS NOT NULL)
- Update schema.ts to reflect the new composite structure with uniqueIndex
  partial constraints via drizzle-orm sql helper
- IndexingPipeline: parse trueref.json / context7.json after crawl, apply
  excludeFiles filter before diff computation, update totalFiles accordingly
- IndexingPipeline: persist repo-wide rules (version_id=null) and
  version-specific rules (when versionId set) via upsertRepoConfig helper
- Add matchesExcludePattern static helper supporting plain filename,
  glob prefix (docs/legacy*), and exact path patterns
- context endpoint: split getRules into repo-wide + version-specific lookup
  with dedup merge; pass versionId at call site
- Update test DB loaders to include migration 0003
- Add pipeline tests for excludeFiles, repo-wide rules persistence, and
  per-version rules persistence
- Add integration tests for merged rules, repo-only rules, and dedup logic

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Author: Giancarmine Salucci
Date: 2026-03-28 10:44:30 +01:00
Parent: 255838dcc0
Commit: 666ec7d55f
7 changed files with 418 additions and 32 deletions
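For reference, a trueref.json of the shape this commit consumes. This is an illustrative sketch (exampleTrueref is just a local name): only excludeFiles and rules are exercised by the pipeline and tests below, and the values are taken from those tests.

// trueref.json contents as the tests construct them (sketch):
const exampleTrueref = JSON.stringify({
  excludeFiles: ['migration-guide.md', 'docs/legacy*'],
  rules: ['Always use TypeScript strict mode', 'Prefer async/await over callbacks']
});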

View File

@@ -0,0 +1,30 @@
PRAGMA foreign_keys=OFF;
--> statement-breakpoint
CREATE TABLE `__new_repository_configs` (
  `repository_id` text NOT NULL,
  `version_id` text,
  `project_title` text,
  `description` text,
  `folders` text,
  `exclude_folders` text,
  `exclude_files` text,
  `rules` text,
  `previous_versions` text,
  `updated_at` integer NOT NULL,
  FOREIGN KEY (`repository_id`) REFERENCES `repositories`(`id`) ON UPDATE no action ON DELETE cascade
);
--> statement-breakpoint
INSERT INTO `__new_repository_configs`
(repository_id, version_id, project_title, description, folders, exclude_folders, exclude_files, rules, previous_versions, updated_at)
SELECT repository_id, NULL, project_title, description, folders, exclude_folders, exclude_files, rules, previous_versions, updated_at
FROM `repository_configs`;
--> statement-breakpoint
DROP TABLE `repository_configs`;
--> statement-breakpoint
ALTER TABLE `__new_repository_configs` RENAME TO `repository_configs`;
--> statement-breakpoint
PRAGMA foreign_keys=ON;
--> statement-breakpoint
CREATE UNIQUE INDEX `uniq_repo_config_base` ON `repository_configs` (`repository_id`) WHERE `version_id` IS NULL;
--> statement-breakpoint
CREATE UNIQUE INDEX `uniq_repo_config_version` ON `repository_configs` (`repository_id`, `version_id`) WHERE `version_id` IS NOT NULL;
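What the two partial indexes allow, as a minimal sketch (assuming better-sqlite3, as the tests below use, and an existing repositories row for '/test/repo'; 'version-1' is a made-up id):

// One repo-wide row (version_id IS NULL) per repository; a second NULL-version
// row for the same repository violates uniq_repo_config_base.
db.prepare(
  `INSERT INTO repository_configs (repository_id, version_id, rules, updated_at)
   VALUES ('/test/repo', NULL, '["repo rule"]', 0)`
).run();
// One row per (repository_id, version_id) pair is allowed alongside it; repeating
// the same pair violates uniq_repo_config_version.
db.prepare(
  `INSERT INTO repository_configs (repository_id, version_id, rules, updated_at)
   VALUES ('/test/repo', 'version-1', '["v1 rule"]', 0)`
).run();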

View File

@@ -22,6 +22,13 @@
"when": 1774461897742, "when": 1774461897742,
"tag": "0002_silky_stellaris", "tag": "0002_silky_stellaris",
"breakpoints": true "breakpoints": true
},
{
"idx": 3,
"version": "6",
"when": 1743155877000,
"tag": "0003_multiversion_config",
"breakpoints": true
} }
] ]
} }

View File

@@ -1,4 +1,13 @@
import { sql } from 'drizzle-orm';
import {
  blob,
  integer,
  primaryKey,
  real,
  sqliteTable,
  text,
  uniqueIndex
} from 'drizzle-orm/sqlite-core';

// ---------------------------------------------------------------------------
// repositories
@@ -148,21 +157,33 @@ export const indexingJobs = sqliteTable('indexing_jobs', {
// ---------------------------------------------------------------------------
// repository_configs
// ---------------------------------------------------------------------------
export const repositoryConfigs = sqliteTable(
  'repository_configs',
  {
    repositoryId: text('repository_id')
      .notNull()
      .references(() => repositories.id, { onDelete: 'cascade' }),
    versionId: text('version_id'),
    projectTitle: text('project_title'),
    description: text('description'),
    folders: text('folders', { mode: 'json' }).$type<string[]>(),
    excludeFolders: text('exclude_folders', { mode: 'json' }).$type<string[]>(),
    excludeFiles: text('exclude_files', { mode: 'json' }).$type<string[]>(),
    rules: text('rules', { mode: 'json' }).$type<string[]>(),
    previousVersions: text('previous_versions', { mode: 'json' }).$type<
      { tag: string; title: string; commitHash?: string }[]
    >(),
    updatedAt: integer('updated_at', { mode: 'timestamp' }).notNull()
  },
  (table) => [
    uniqueIndex('uniq_repo_config_base')
      .on(table.repositoryId)
      .where(sql`${table.versionId} IS NULL`),
    uniqueIndex('uniq_repo_config_version')
      .on(table.repositoryId, table.versionId)
      .where(sql`${table.versionId} IS NOT NULL`)
  ]
);
// ---------------------------------------------------------------------------
// settings
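The commit itself reads and writes these rows with raw SQL; purely for clarity, here is a hedged sketch of how the two row kinds would be addressed through drizzle-orm (db is assumed to be a drizzle instance over better-sqlite3, and repoId / versionId are assumed to be in scope):

import { and, eq, isNull } from 'drizzle-orm';

// Repo-wide config row (version_id IS NULL).
const repoWide = db
  .select()
  .from(repositoryConfigs)
  .where(and(eq(repositoryConfigs.repositoryId, repoId), isNull(repositoryConfigs.versionId)))
  .all();

// Version-specific config row.
const versioned = db
  .select()
  .from(repositoryConfigs)
  .where(and(eq(repositoryConfigs.repositoryId, repoId), eq(repositoryConfigs.versionId, versionId)))
  .all();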

View File

@@ -26,7 +26,8 @@ function createTestDb(): Database.Database {
for (const migrationFile of [
'0000_large_master_chief.sql',
'0001_quick_nighthawk.sql',
'0002_silky_stellaris.sql',
'0003_multiversion_config.sql'
]) {
const migrationSql = readFileSync(join(migrationsFolder, migrationFile), 'utf-8');
@@ -771,4 +772,117 @@ describe('IndexingPipeline', () => {
ref: undefined
});
});
it('excludes files matching excludeFiles patterns from trueref.json', async () => {
const truerefConfig = JSON.stringify({
excludeFiles: ['migration-guide.md', 'docs/legacy*']
});
const files = [
{
path: 'trueref.json',
content: truerefConfig,
sha: 'sha-config',
language: 'json'
},
{
path: 'README.md',
content: '# Hello\n\nThis is documentation.',
sha: 'sha-readme',
language: 'markdown'
},
{
path: 'migration-guide.md',
content: '# Migration Guide\n\nThis should be excluded.',
sha: 'sha-migration',
language: 'markdown'
},
{
path: 'docs/legacy-api.md',
content: '# Legacy API\n\nShould be excluded by glob prefix.',
sha: 'sha-legacy',
language: 'markdown'
}
];
const pipeline = makePipeline({ files, totalFiles: files.length });
const job = makeJob();
await pipeline.run(job as never);
const docs = db
.prepare(`SELECT file_path FROM documents ORDER BY file_path`)
.all() as { file_path: string }[];
const filePaths = docs.map((d) => d.file_path);
// migration-guide.md and docs/legacy-api.md must be absent.
expect(filePaths).not.toContain('migration-guide.md');
expect(filePaths).not.toContain('docs/legacy-api.md');
// README.md must still be indexed.
expect(filePaths).toContain('README.md');
});
it('persists repo-wide rules from trueref.json to repository_configs after indexing', async () => {
const truerefConfig = JSON.stringify({
rules: ['Always use TypeScript strict mode', 'Prefer async/await over callbacks']
});
const files = [
{
path: 'trueref.json',
content: truerefConfig,
sha: 'sha-config',
language: 'json'
}
];
const pipeline = makePipeline({ files, totalFiles: files.length });
const job = makeJob();
await pipeline.run(job as never);
const row = db
.prepare(
`SELECT rules FROM repository_configs WHERE repository_id = '/test/repo' AND version_id IS NULL`
)
.get() as { rules: string } | undefined;
expect(row).toBeDefined();
const rules = JSON.parse(row!.rules);
expect(rules).toEqual(['Always use TypeScript strict mode', 'Prefer async/await over callbacks']);
});
it('persists version-specific rules under (repositoryId, versionId) when job has versionId', async () => {
const versionId = insertVersion(db, { tag: 'v2.0.0', state: 'pending' });
const truerefConfig = JSON.stringify({
rules: ['This is v2. Use the new Builder API.']
});
const files = [
{
path: 'trueref.json',
content: truerefConfig,
sha: 'sha-config',
language: 'json'
}
];
const pipeline = makePipeline({ files, totalFiles: files.length });
const job = makeJob('/test/repo', versionId);
await pipeline.run(job as never);
// Repo-wide row (version_id IS NULL) must exist.
const repoRow = db
.prepare(
`SELECT rules FROM repository_configs WHERE repository_id = '/test/repo' AND version_id IS NULL`
)
.get() as { rules: string } | undefined;
expect(repoRow).toBeDefined();
// Version-specific row must also exist.
const versionRow = db
.prepare(
`SELECT rules FROM repository_configs WHERE repository_id = '/test/repo' AND version_id = ?`
)
.get(versionId) as { rules: string } | undefined;
expect(versionRow).toBeDefined();
const rules = JSON.parse(versionRow!.rules);
expect(rules).toEqual(['This is v2. Use the new Builder API.']);
});
});

View File

@@ -22,6 +22,7 @@ import type { EmbeddingService } from '$lib/server/embeddings/embedding.service.
import { RepositoryMapper } from '$lib/server/mappers/repository.mapper.js';
import { IndexingJob } from '$lib/server/models/indexing-job.js';
import { Repository, RepositoryEntity } from '$lib/server/models/repository.js';
import { resolveConfig } from '$lib/server/config/config-parser.js';
import { parseFile } from '$lib/server/parser/index.js';
import { computeTrustScore } from '$lib/server/search/trust-score.js';
import { computeDiff } from './diff.js';
@@ -99,15 +100,32 @@ export class IndexingPipeline {
? this.getVersionTag(normJob.versionId)
: undefined;
const crawlResult = await this.crawl(repo, versionTag);
// Parse trueref.json / context7.json if present in the crawl results.
const configFile = crawlResult.files.find(
(f) => f.path === 'trueref.json' || f.path === 'context7.json'
);
const parsedConfig = configFile
? resolveConfig([{ filename: configFile.path, content: configFile.content }])
: null;
const excludeFiles: string[] = parsedConfig?.config.excludeFiles ?? [];
// Filter out excluded files before diff computation.
const filteredFiles =
excludeFiles.length > 0
? crawlResult.files.filter(
(f) => !excludeFiles.some((pattern) => IndexingPipeline.matchesExcludePattern(f.path, pattern))
)
: crawlResult.files;
const totalFiles = filteredFiles.length;
this.updateJob(job.id, { totalFiles });
// ---- Stage 2: Parse & diff ------------------------------------------
// Load all existing documents for this repo so computeDiff can
// classify every crawled file and detect deletions.
const existingDocs = this.getExistingDocuments(repo.id, normJob.versionId);
const diff = computeDiff(filteredFiles, existingDocs);
// Accumulate new documents/snippets; skip unchanged files.
const newDocuments: NewDocument[] = [];
@@ -244,6 +262,16 @@ export class IndexingPipeline {
});
}
// ---- Stage 6: Persist rules from config ----------------------------
if (parsedConfig?.config.rules?.length) {
// Repo-wide rules (versionId = null).
this.upsertRepoConfig(repo.id, null, parsedConfig.config.rules);
// Version-specific rules stored separately when indexing a version.
if (normJob.versionId) {
this.upsertRepoConfig(repo.id, normJob.versionId, parsedConfig.config.rules);
}
}
this.updateJob(job.id, {
status: 'done',
progress: 100,
@@ -476,6 +504,65 @@ export class IndexingPipeline {
const values = [...Object.values(fields), id];
this.db.prepare(`UPDATE repository_versions SET ${sets} WHERE id = ?`).run(...values);
}
private upsertRepoConfig(
repositoryId: string,
versionId: string | null,
rules: string[]
): void {
const now = Math.floor(Date.now() / 1000);
// Use DELETE + INSERT because ON CONFLICT … DO UPDATE doesn't work reliably
// with partial unique indexes in all SQLite versions.
if (versionId === null) {
this.db
.prepare(
`DELETE FROM repository_configs WHERE repository_id = ? AND version_id IS NULL`
)
.run(repositoryId);
} else {
this.db
.prepare(
`DELETE FROM repository_configs WHERE repository_id = ? AND version_id = ?`
)
.run(repositoryId, versionId);
}
this.db
.prepare(
`INSERT INTO repository_configs (repository_id, version_id, rules, updated_at)
VALUES (?, ?, ?, ?)`
)
.run(repositoryId, versionId, JSON.stringify(rules), now);
}
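// Net effect (illustrative): calling upsertRepoConfig twice for the same
// (repositoryId, versionId) pair leaves exactly one row, because the matching
// row is deleted before the fresh insert, so re-indexing replaces config rows
// rather than accumulating them.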
// -------------------------------------------------------------------------
// Private — static helpers
// -------------------------------------------------------------------------
/**
* Returns true when `filePath` matches the given exclude `pattern`.
*
* Supported patterns:
* - Plain filename: `migration-guide.md` matches any path ending in `/migration-guide.md`
* or equal to `migration-guide.md`.
* - Glob prefix with wildcard: `docs/migration*` matches paths that start with `docs/migration`.
* - Exact path: `src/legacy/old-api.ts` matches exactly that path.
*/
private static matchesExcludePattern(filePath: string, pattern: string): boolean {
if (pattern.includes('*')) {
// Glob-style: treat everything before the '*' as a required prefix.
const prefix = pattern.slice(0, pattern.indexOf('*'));
return filePath.startsWith(prefix);
}
// No wildcard — treat as plain name or exact path.
if (!pattern.includes('/')) {
// Plain filename: match basename (path ends with /<pattern> or equals pattern).
return filePath === pattern || filePath.endsWith('/' + pattern);
}
// Contains a slash — exact path match.
return filePath === pattern;
}
}
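Expected matcher behaviour, spelled out as examples (illustrative only; the helper is private, so the pipeline tests exercise it indirectly through run()):

// 'migration-guide.md'        vs 'migration-guide.md'     -> true  (plain filename, exact)
// 'guides/migration-guide.md' vs 'migration-guide.md'     -> true  (plain filename, basename)
// 'docs/legacy-api.md'        vs 'docs/legacy*'           -> true  (glob prefix)
// 'docs/current-api.md'       vs 'docs/legacy*'           -> false
// 'src/legacy/old-api.ts'     vs 'src/legacy/old-api.ts'  -> true  (exact path)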
// ---------------------------------------------------------------------------

View File

@@ -55,6 +55,7 @@ function createTestDb(): Database.Database {
const migration0 = readFileSync(join(migrationsFolder, '0000_large_master_chief.sql'), 'utf-8');
const migration1 = readFileSync(join(migrationsFolder, '0001_quick_nighthawk.sql'), 'utf-8');
const migration2 = readFileSync(join(migrationsFolder, '0002_silky_stellaris.sql'), 'utf-8');
const migration3 = readFileSync(join(migrationsFolder, '0003_multiversion_config.sql'), 'utf-8');
// Apply first migration
const statements0 = migration0
@@ -85,6 +86,15 @@ function createTestDb(): Database.Database {
client.exec(statement);
}
const statements3 = migration3
.split('--> statement-breakpoint')
.map((statement) => statement.trim())
.filter(Boolean);
for (const statement of statements3) {
client.exec(statement);
}
client.exec(readFileSync(ftsFile, 'utf-8'));
return client;
@@ -487,6 +497,95 @@ describe('API contract integration', () => {
});
});
it('GET /api/v1/context returns merged repo-wide and version-specific rules', async () => {
const repositoryId = seedRepo(db);
const versionId = seedVersion(db, repositoryId, 'v2.0.0');
const documentId = seedDocument(db, repositoryId, versionId);
// Insert repo-wide rules (version_id IS NULL).
db.prepare(
`INSERT INTO repository_configs (repository_id, version_id, rules, updated_at)
VALUES (?, NULL, ?, ?)`
).run(repositoryId, JSON.stringify(['Repo-wide rule']), NOW_S);
// Insert version-specific rules.
db.prepare(
`INSERT INTO repository_configs (repository_id, version_id, rules, updated_at)
VALUES (?, ?, ?, ?)`
).run(repositoryId, versionId, JSON.stringify(['Version-specific rule']), NOW_S);
seedSnippet(db, {
documentId,
repositoryId,
versionId,
content: 'some versioned content'
});
const response = await getContext({
url: new URL(
`http://test/api/v1/context?libraryId=${encodeURIComponent(`${repositoryId}/v2.0.0`)}&query=${encodeURIComponent('versioned content')}`
)
} as never);
expect(response.status).toBe(200);
const body = await response.json();
// Both repo-wide and version-specific rules should appear (deduped).
expect(body.rules).toEqual(['Repo-wide rule', 'Version-specific rule']);
});
it('GET /api/v1/context returns only repo-wide rules when no version is requested', async () => {
const repositoryId = seedRepo(db);
const documentId = seedDocument(db, repositoryId);
// Insert repo-wide rules (version_id IS NULL).
db.prepare(
`INSERT INTO repository_configs (repository_id, version_id, rules, updated_at)
VALUES (?, NULL, ?, ?)`
).run(repositoryId, JSON.stringify(['Repo-wide rule only']), NOW_S);
seedSnippet(db, { documentId, repositoryId, content: 'some content' });
const response = await getContext({
url: new URL(
`http://test/api/v1/context?libraryId=${encodeURIComponent(repositoryId)}&query=${encodeURIComponent('some content')}`
)
} as never);
expect(response.status).toBe(200);
const body = await response.json();
expect(body.rules).toEqual(['Repo-wide rule only']);
});
it('GET /api/v1/context deduplicates rules that appear in both repo-wide and version config', async () => {
const repositoryId = seedRepo(db);
const versionId = seedVersion(db, repositoryId, 'v3.0.0');
const documentId = seedDocument(db, repositoryId, versionId);
const sharedRule = 'Use TypeScript strict mode';
db.prepare(
`INSERT INTO repository_configs (repository_id, version_id, rules, updated_at)
VALUES (?, NULL, ?, ?)`
).run(repositoryId, JSON.stringify([sharedRule]), NOW_S);
db.prepare(
`INSERT INTO repository_configs (repository_id, version_id, rules, updated_at)
VALUES (?, ?, ?, ?)`
).run(repositoryId, versionId, JSON.stringify([sharedRule, 'Version-only rule']), NOW_S);
seedSnippet(db, { documentId, repositoryId, versionId, content: 'dedup test content' });
const response = await getContext({
url: new URL(
`http://test/api/v1/context?libraryId=${encodeURIComponent(`${repositoryId}/v3.0.0`)}&query=${encodeURIComponent('dedup test')}`
)
} as never);
expect(response.status).toBe(200);
const body = await response.json();
// sharedRule appears once, version-only rule appended.
expect(body.rules).toEqual([sharedRule, 'Version-only rule']);
});
it('GET /api/v1/context returns 404 with VERSION_NOT_FOUND when version does not exist', async () => {
const repositoryId = seedRepo(db);

View File

@@ -54,24 +54,52 @@ interface RawRepoConfig {
rules: string | null;
}
function parseRulesJson(raw: string | null | undefined): string[] {
if (!raw) return [];
try {
const parsed = JSON.parse(raw);
return Array.isArray(parsed) ? (parsed as string[]) : [];
} catch {
return [];
}
}
function getRules(
db: ReturnType<typeof getClient>,
repositoryId: string,
versionId?: string
): string[] {
// Repo-wide rules (version_id IS NULL).
const repoRow = db
.prepare<
[string],
RawRepoConfig
>(`SELECT rules FROM repository_configs WHERE repository_id = ? AND version_id IS NULL`)
.get(repositoryId);
const repoRules = parseRulesJson(repoRow?.rules);
if (!versionId) return repoRules;
// Version-specific rules.
const versionRow = db
.prepare<
[string, string],
RawRepoConfig
>(`SELECT rules FROM repository_configs WHERE repository_id = ? AND version_id = ?`)
.get(repositoryId, versionId);
const versionRules = parseRulesJson(versionRow?.rules);
// Merge: repo-wide first, then version-specific (deduped by content).
const seen = new Set(repoRules);
const merged = [...repoRules];
for (const r of versionRules) {
if (!seen.has(r)) merged.push(r);
}
return merged;
}
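// Merge behaviour (illustrative): with repo-wide rules ['A', 'B'] and
// version-specific rules ['B', 'C'], getRules returns ['A', 'B', 'C']:
// repo-wide order first, version-only additions appended, duplicates dropped.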
interface RawRepoState {
state: 'pending' | 'indexing' | 'indexed' | 'error';
id: string;
@@ -283,8 +311,8 @@ export const GET: RequestHandler = async ({ url }) => {
snippetVersions
};
// Load rules from repository_configs (repo-wide + version-specific merged).
const rules = getRules(db, parsed.repositoryId, versionId);
if (responseType === 'txt') {
const text = formatContextTxt(selectedResults, rules, metadata);
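For completeness, the request shape the integration tests above exercise (a sketch; only the rules field of the response is relevant to this commit):

// GET /api/v1/context?libraryId=<repositoryId>%2Fv2.0.0&query=versioned%20content
// With both config rows seeded, the JSON response includes:
//   rules: ['Repo-wide rule', 'Version-specific rule']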