feat(TRUEREF-0023): add sqlite-vec search pipeline
This commit is contained in:
@@ -13,6 +13,9 @@ import { JobQueue } from './job-queue.js';
|
||||
import { IndexingPipeline } from './indexing.pipeline.js';
|
||||
import { recoverStaleJobs } from './startup.js';
|
||||
import { EmbeddingService } from '$lib/server/embeddings/embedding.service.js';
|
||||
import { loadSqliteVec } from '$lib/server/db/sqlite-vec.js';
|
||||
import { SqliteVecStore } from '$lib/server/search/sqlite-vec.store.js';
|
||||
import { sqliteVecRowidTableName, sqliteVecTableName } from '$lib/server/db/sqlite-vec.js';
|
||||
import * as diffStrategy from './differential-strategy.js';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
@@ -22,6 +25,7 @@ import * as diffStrategy from './differential-strategy.js';
|
||||
function createTestDb(): Database.Database {
|
||||
const client = new Database(':memory:');
|
||||
client.pragma('foreign_keys = ON');
|
||||
loadSqliteVec(client);
|
||||
|
||||
const migrationsFolder = join(import.meta.dirname, '../db/migrations');
|
||||
for (const migrationFile of [
|
||||
@@ -29,7 +33,9 @@ function createTestDb(): Database.Database {
|
||||
'0001_quick_nighthawk.sql',
|
||||
'0002_silky_stellaris.sql',
|
||||
'0003_multiversion_config.sql',
|
||||
'0004_complete_sentry.sql'
|
||||
'0004_complete_sentry.sql',
|
||||
'0005_fix_stage_defaults.sql',
|
||||
'0006_yielding_centennial.sql'
|
||||
]) {
|
||||
const migrationSql = readFileSync(join(migrationsFolder, migrationFile), 'utf-8');
|
||||
|
||||
@@ -539,6 +545,52 @@ describe('IndexingPipeline', () => {
|
||||
expect(finalChecksum).toBe('sha-v2');
|
||||
});
|
||||
|
||||
// Verifies that re-indexing a changed file leaves no rows behind in the
// sqlite-vec tables derived for the 'local-default' profile.
// Flow: seed a document, snippet and embedding for README.md, mirror the
// embedding into the vec store, run the pipeline with new README.md content,
// then count the rows left in the vec table and its rowid table.
it('removes derived vec rows when changed documents are replaced', async () => {
|
||||
const docId = crypto.randomUUID();
|
||||
const snippetId = crypto.randomUUID();
|
||||
const embedding = Float32Array.from([1, 0, 0]);
|
||||
const vecStore = new SqliteVecStore(db);
|
||||
|
||||
// Seed the pre-existing document row for README.md (version_id NULL, checksum 'stale-doc').
db.prepare(
|
||||
`INSERT INTO documents (id, repository_id, version_id, file_path, checksum, indexed_at)
|
||||
VALUES (?, '/test/repo', NULL, 'README.md', 'stale-doc', ?)`
|
||||
).run(docId, now);
|
||||
// Seed one snippet that belongs to the document above.
db.prepare(
|
||||
`INSERT INTO snippets (id, document_id, repository_id, version_id, type, content, created_at)
|
||||
VALUES (?, ?, '/test/repo', NULL, 'info', 'stale snippet', ?)`
|
||||
).run(snippetId, docId, now);
|
||||
// Store the 3-dimensional embedding for that snippet as a raw byte buffer.
db.prepare(
|
||||
`INSERT INTO snippet_embeddings (snippet_id, profile_id, model, dimensions, embedding, created_at)
|
||||
VALUES (?, 'local-default', 'test-model', 3, ?, ?)`
|
||||
).run(snippetId, Buffer.from(embedding.buffer), now);
|
||||
// Mirror the same embedding into the vec store so there is a derived row to clean up.
vecStore.upsertEmbedding('local-default', snippetId, embedding);
|
||||
|
||||
// The crawl result supplies README.md again, with different content and sha 'sha-fresh'.
const pipeline = makePipeline({
|
||||
files: [
|
||||
{
|
||||
path: 'README.md',
|
||||
content: '# Updated\n\nFresh content.',
|
||||
sha: 'sha-fresh',
|
||||
language: 'markdown'
|
||||
}
|
||||
],
|
||||
totalFiles: 1
|
||||
});
|
||||
const job = makeJob();
|
||||
|
||||
await pipeline.run(job as never);
|
||||
|
||||
// Resolve the per-profile table names through the helpers rather than hard-coding them.
const vecTable = sqliteVecTableName('local-default');
|
||||
const rowidTable = sqliteVecRowidTableName('local-default');
|
||||
const vecCount = db.prepare(`SELECT COUNT(*) as n FROM "${vecTable}"`).get() as { n: number };
|
||||
const rowidCount = db.prepare(`SELECT COUNT(*) as n FROM "${rowidTable}"`).get() as {
|
||||
n: number;
|
||||
};
|
||||
|
||||
// Both tables must be empty after the run.
// NOTE(review): this presumably relies on makePipeline not wiring an embedding
// service, so the fresh snippet gets no new vector; confirm against makePipeline.
expect(vecCount.n).toBe(0);
|
||||
expect(rowidCount.n).toBe(0);
|
||||
});
|
||||
|
||||
it('updates job progress as files are processed', async () => {
|
||||
const files = Array.from({ length: 5 }, (_, i) => ({
|
||||
path: `file${i}.md`,
|
||||
@@ -700,6 +752,60 @@ describe('IndexingPipeline', () => {
|
||||
expect(version.indexed_at).not.toBeNull();
|
||||
});
|
||||
|
||||
// Verifies differential indexing: when the plan marks README.md as unchanged
// relative to an ancestor version, running the pipeline for the target version
// must yield exactly one snippet embedding under the target version, and that
// snippet must be returned by a nearest-neighbour query scoped to that version.
it('clones ancestor embeddings into the derived vec store for differential indexing', async () => {
|
||||
const ancestorVersionId = insertVersion(db, { tag: 'v1.0.0', state: 'indexed' });
|
||||
const targetVersionId = insertVersion(db, { tag: 'v1.1.0', state: 'pending' });
|
||||
const vecStore = new SqliteVecStore(db);
|
||||
const docId = crypto.randomUUID();
|
||||
const snippetId = crypto.randomUUID();
|
||||
const embedding = Float32Array.from([0.2, 0.4, 0.6]);
|
||||
|
||||
// Seed the ancestor version's document row for README.md.
db.prepare(
|
||||
`INSERT INTO documents (id, repository_id, version_id, file_path, checksum, indexed_at)
|
||||
VALUES (?, '/test/repo', ?, 'README.md', 'ancestor-doc', ?)`
|
||||
).run(docId, ancestorVersionId, now);
|
||||
// Seed one snippet under the ancestor version, attached to that document.
db.prepare(
|
||||
`INSERT INTO snippets (id, document_id, repository_id, version_id, type, content, created_at)
|
||||
VALUES (?, ?, '/test/repo', ?, 'info', 'ancestor snippet', ?)`
|
||||
).run(snippetId, docId, ancestorVersionId, now);
|
||||
// Store the ancestor snippet's 3-dimensional embedding as a raw byte buffer.
db.prepare(
|
||||
`INSERT INTO snippet_embeddings (snippet_id, profile_id, model, dimensions, embedding, created_at)
|
||||
VALUES (?, 'local-default', 'test-model', 3, ?, ?)`
|
||||
).run(snippetId, Buffer.from(embedding.buffer), now);
|
||||
// Mirror the ancestor embedding into the vec store as well.
vecStore.upsertEmbedding('local-default', snippetId, embedding);
|
||||
|
||||
// Stub the differential planner: nothing changed, README.md is unchanged from v1.0.0.
vi.spyOn(diffStrategy, 'buildDifferentialPlan').mockResolvedValue({
|
||||
ancestorTag: 'v1.0.0',
|
||||
ancestorVersionId,
|
||||
changedPaths: new Set<string>(),
|
||||
unchangedPaths: new Set<string>(['README.md'])
|
||||
});
|
||||
|
||||
// The crawl supplies no files, so any target-version rows must come from the ancestor data.
const pipeline = makePipeline({ files: [], totalFiles: 0 });
|
||||
const job = makeJob('/test/repo', targetVersionId);
|
||||
|
||||
await pipeline.run(job as never);
|
||||
|
||||
// Fetch the embeddings whose snippets now belong to the target version.
const targetRows = db
|
||||
.prepare(
|
||||
`SELECT se.snippet_id, se.embedding
|
||||
FROM snippet_embeddings se
|
||||
INNER JOIN snippets s ON s.id = se.snippet_id
|
||||
WHERE s.version_id = ?`
|
||||
)
|
||||
.all(targetVersionId) as Array<{ snippet_id: string; embedding: Buffer }>;
|
||||
|
||||
expect(targetRows).toHaveLength(1);
|
||||
// Query the vec store with the original vector, restricted to the target version.
const matches = vecStore.queryNearestNeighbors(embedding, {
|
||||
repositoryId: '/test/repo',
|
||||
versionId: targetVersionId,
|
||||
profileId: 'local-default',
|
||||
limit: 5
|
||||
});
|
||||
|
||||
// The top match must be the snippet found under the target version by the query above.
expect(matches[0]?.snippetId).toBe(targetRows[0].snippet_id);
|
||||
});
|
||||
|
||||
it('updates repository_versions state to error when pipeline throws and job has versionId', async () => {
|
||||
const versionId = insertVersion(db, { tag: 'v1.0.0', state: 'pending' });
|
||||
const errorCrawl = vi.fn().mockRejectedValue(new Error('crawl failed'));
|
||||
|
||||
Reference in New Issue
Block a user