feat(TRUEREF-0023): add sqlite-vec search pipeline

This commit is contained in:
Giancarmine Salucci
2026-04-01 14:09:19 +02:00
parent 0752636847
commit 9525c58e9a
45 changed files with 4009 additions and 614 deletions

View File

@@ -13,6 +13,9 @@ import { JobQueue } from './job-queue.js';
import { IndexingPipeline } from './indexing.pipeline.js';
import { recoverStaleJobs } from './startup.js';
import { EmbeddingService } from '$lib/server/embeddings/embedding.service.js';
import { loadSqliteVec } from '$lib/server/db/sqlite-vec.js';
import { SqliteVecStore } from '$lib/server/search/sqlite-vec.store.js';
import { sqliteVecRowidTableName, sqliteVecTableName } from '$lib/server/db/sqlite-vec.js';
import * as diffStrategy from './differential-strategy.js';
// ---------------------------------------------------------------------------
@@ -22,6 +25,7 @@ import * as diffStrategy from './differential-strategy.js';
function createTestDb(): Database.Database {
const client = new Database(':memory:');
client.pragma('foreign_keys = ON');
loadSqliteVec(client);
const migrationsFolder = join(import.meta.dirname, '../db/migrations');
for (const migrationFile of [
@@ -29,7 +33,9 @@ function createTestDb(): Database.Database {
'0001_quick_nighthawk.sql',
'0002_silky_stellaris.sql',
'0003_multiversion_config.sql',
'0004_complete_sentry.sql'
'0004_complete_sentry.sql',
'0005_fix_stage_defaults.sql',
'0006_yielding_centennial.sql'
]) {
const migrationSql = readFileSync(join(migrationsFolder, migrationFile), 'utf-8');
@@ -539,6 +545,52 @@ describe('IndexingPipeline', () => {
expect(finalChecksum).toBe('sha-v2');
});
// Verifies that after the pipeline re-indexes README.md, the sqlite-vec tables
// for the 'local-default' profile contain no rows.
it('removes derived vec rows when changed documents are replaced', async () => {
const docId = crypto.randomUUID();
const snippetId = crypto.randomUUID();
const embedding = Float32Array.from([1, 0, 0]);
const vecStore = new SqliteVecStore(db);
// Seed a pre-existing document (checksum 'stale-doc') for README.md with no version.
db.prepare(
`INSERT INTO documents (id, repository_id, version_id, file_path, checksum, indexed_at)
VALUES (?, '/test/repo', NULL, 'README.md', 'stale-doc', ?)`
).run(docId, now);
// Seed one snippet belonging to that document.
db.prepare(
`INSERT INTO snippets (id, document_id, repository_id, version_id, type, content, created_at)
VALUES (?, ?, '/test/repo', NULL, 'info', 'stale snippet', ?)`
).run(snippetId, docId, now);
// Seed the snippet's embedding row; the vector is stored as the raw bytes of the Float32Array.
db.prepare(
`INSERT INTO snippet_embeddings (snippet_id, profile_id, model, dimensions, embedding, created_at)
VALUES (?, 'local-default', 'test-model', 3, ?, ?)`
).run(snippetId, Buffer.from(embedding.buffer), now);
// Also register the same embedding in the vec store so there is a derived row to clean up.
vecStore.upsertEmbedding('local-default', snippetId, embedding);
// The crawl result supplies README.md again with different content and sha.
const pipeline = makePipeline({
files: [
{
path: 'README.md',
content: '# Updated\n\nFresh content.',
sha: 'sha-fresh',
language: 'markdown'
}
],
totalFiles: 1
});
const job = makeJob();
await pipeline.run(job as never);
// Count rows directly in both vec tables resolved for the 'local-default' profile.
const vecTable = sqliteVecTableName('local-default');
const rowidTable = sqliteVecRowidTableName('local-default');
const vecCount = db.prepare(`SELECT COUNT(*) as n FROM "${vecTable}"`).get() as { n: number };
const rowidCount = db.prepare(`SELECT COUNT(*) as n FROM "${rowidTable}"`).get() as {
n: number;
};
// NOTE(review): expecting zero rows presumably relies on makePipeline not wiring an
// embedding service, so no new vec rows are written for the fresh snippet — confirm against the helper.
expect(vecCount.n).toBe(0);
expect(rowidCount.n).toBe(0);
});
it('updates job progress as files are processed', async () => {
const files = Array.from({ length: 5 }, (_, i) => ({
path: `file${i}.md`,
@@ -700,6 +752,60 @@ describe('IndexingPipeline', () => {
expect(version.indexed_at).not.toBeNull();
});
// Verifies that a differential run for a new version ends with exactly one embedding row
// attached to a snippet of the target version, and that a nearest-neighbour query
// scoped to that version returns that snippet first.
it('clones ancestor embeddings into the derived vec store for differential indexing', async () => {
const ancestorVersionId = insertVersion(db, { tag: 'v1.0.0', state: 'indexed' });
const targetVersionId = insertVersion(db, { tag: 'v1.1.0', state: 'pending' });
const vecStore = new SqliteVecStore(db);
const docId = crypto.randomUUID();
const snippetId = crypto.randomUUID();
const embedding = Float32Array.from([0.2, 0.4, 0.6]);
// Seed the ancestor version (v1.0.0) with one document for README.md.
db.prepare(
`INSERT INTO documents (id, repository_id, version_id, file_path, checksum, indexed_at)
VALUES (?, '/test/repo', ?, 'README.md', 'ancestor-doc', ?)`
).run(docId, ancestorVersionId, now);
// Seed one snippet for that document under the ancestor version.
db.prepare(
`INSERT INTO snippets (id, document_id, repository_id, version_id, type, content, created_at)
VALUES (?, ?, '/test/repo', ?, 'info', 'ancestor snippet', ?)`
).run(snippetId, docId, ancestorVersionId, now);
// Seed the snippet's embedding row; the vector is stored as the raw bytes of the Float32Array.
db.prepare(
`INSERT INTO snippet_embeddings (snippet_id, profile_id, model, dimensions, embedding, created_at)
VALUES (?, 'local-default', 'test-model', 3, ?, ?)`
).run(snippetId, Buffer.from(embedding.buffer), now);
// Register the ancestor embedding in the vec store as well.
vecStore.upsertEmbedding('local-default', snippetId, embedding);
// Stub the differential plan: README.md is reported unchanged relative to the ancestor,
// and no paths are reported as changed.
// NOTE(review): the spy is not restored inside this test — confirm a shared hook restores mocks.
vi.spyOn(diffStrategy, 'buildDifferentialPlan').mockResolvedValue({
ancestorTag: 'v1.0.0',
ancestorVersionId,
changedPaths: new Set<string>(),
unchangedPaths: new Set<string>(['README.md'])
});
// The crawl result is empty, so any rows for the target version cannot come from crawled files.
const pipeline = makePipeline({ files: [], totalFiles: 0 });
const job = makeJob('/test/repo', targetVersionId);
await pipeline.run(job as never);
// Fetch embedding rows whose snippet belongs to the target version.
const targetRows = db
.prepare(
`SELECT se.snippet_id, se.embedding
FROM snippet_embeddings se
INNER JOIN snippets s ON s.id = se.snippet_id
WHERE s.version_id = ?`
)
.all(targetVersionId) as Array<{ snippet_id: string; embedding: Buffer }>;
expect(targetRows).toHaveLength(1);
// Query the vec store with the seeded vector, restricted to the target version and profile.
const matches = vecStore.queryNearestNeighbors(embedding, {
repositoryId: '/test/repo',
versionId: targetVersionId,
profileId: 'local-default',
limit: 5
});
// The top match must be the target version's snippet found by the SQL query above.
expect(matches[0]?.snippetId).toBe(targetRows[0].snippet_id);
});
it('updates repository_versions state to error when pipeline throws and job has versionId', async () => {
const versionId = insertVersion(db, { tag: 'v1.0.0', state: 'pending' });
const errorCrawl = vi.fn().mockRejectedValue(new Error('crawl failed'));