TRUEREF-0023 rewrite indexing pipeline - parallel reads - serialized writes

This commit is contained in:
Giancarmine Salucci
2026-04-02 09:49:38 +02:00
parent 9525c58e9a
commit f86be4106b
68 changed files with 5042 additions and 3131 deletions

View File

@@ -4,6 +4,7 @@
*/
import Database from 'better-sqlite3';
import { env } from '$env/dynamic/private';
import { applySqlitePragmas } from './connection';
import { loadSqliteVec } from './sqlite-vec';
let _client: Database.Database | null = null;
@@ -12,14 +13,7 @@ export function getClient(): Database.Database {
if (!_client) {
if (!env.DATABASE_URL) throw new Error('DATABASE_URL is not set');
_client = new Database(env.DATABASE_URL);
_client.pragma('journal_mode = WAL');
_client.pragma('foreign_keys = ON');
_client.pragma('busy_timeout = 5000');
_client.pragma('synchronous = NORMAL');
_client.pragma('cache_size = -65536');
_client.pragma('temp_store = MEMORY');
_client.pragma('mmap_size = 268435456');
_client.pragma('wal_autocheckpoint = 1000');
applySqlitePragmas(_client);
loadSqliteVec(_client);
}
return _client;

View File

@@ -0,0 +1,14 @@
import type Database from 'better-sqlite3';
/** Value, in milliseconds, passed to `PRAGMA busy_timeout` by {@link applySqlitePragmas}. */
export const SQLITE_BUSY_TIMEOUT_MS = 30000;

/**
 * Configures a better-sqlite3 connection with the project's shared PRAGMA set.
 *
 * Each statement is sent through `db.pragma(...)` individually, in the order
 * listed; the values returned by `pragma` are ignored.
 *
 * @param db - Connection to configure. Only its `pragma` method is used.
 */
export function applySqlitePragmas(db: Database.Database): void {
  const pragmaStatements = [
    'journal_mode = WAL',
    'foreign_keys = ON',
    `busy_timeout = ${SQLITE_BUSY_TIMEOUT_MS}`,
    'synchronous = NORMAL',
    // Per the SQLite docs a negative cache_size is expressed in KiB.
    'cache_size = -65536',
    'temp_store = MEMORY',
    'mmap_size = 268435456',
    'wal_autocheckpoint = 1000'
  ];

  for (const statement of pragmaStatements) {
    db.pragma(statement);
  }
}

View File

@@ -5,6 +5,7 @@ import { readFileSync } from 'node:fs';
import { fileURLToPath } from 'node:url';
import { join, dirname } from 'node:path';
import * as schema from './schema';
import { applySqlitePragmas } from './connection';
import { loadSqliteVec } from './sqlite-vec';
import { env } from '$env/dynamic/private';
@@ -12,19 +13,7 @@ if (!env.DATABASE_URL) throw new Error('DATABASE_URL is not set');
const client = new Database(env.DATABASE_URL);
// Enable WAL mode for better concurrent read performance.
client.pragma('journal_mode = WAL');
// Enforce foreign key constraints.
client.pragma('foreign_keys = ON');
// Wait up to 5 s when the DB is locked instead of failing immediately.
// Prevents SQLITE_BUSY errors when the indexing pipeline holds the write lock
// and an HTTP request arrives simultaneously.
client.pragma('busy_timeout = 5000');
client.pragma('synchronous = NORMAL');
client.pragma('cache_size = -65536');
client.pragma('temp_store = MEMORY');
client.pragma('mmap_size = 268435456');
client.pragma('wal_autocheckpoint = 1000');
applySqlitePragmas(client);
loadSqliteVec(client);
export const db = drizzle(client, { schema });

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,55 +1,55 @@
{
"version": "7",
"dialect": "sqlite",
"entries": [
{
"idx": 0,
"version": "6",
"when": 1774196053634,
"tag": "0000_large_master_chief",
"breakpoints": true
},
{
"idx": 1,
"version": "6",
"when": 1774448049161,
"tag": "0001_quick_nighthawk",
"breakpoints": true
},
{
"idx": 2,
"version": "6",
"when": 1774461897742,
"tag": "0002_silky_stellaris",
"breakpoints": true
},
{
"idx": 3,
"version": "6",
"when": 1743155877000,
"tag": "0003_multiversion_config",
"breakpoints": true
},
{
"idx": 4,
"version": "6",
"when": 1774880275833,
"tag": "0004_complete_sentry",
"breakpoints": true
},
{
"idx": 5,
"version": "6",
"when": 1774890536284,
"tag": "0005_fix_stage_defaults",
"breakpoints": true
},
{
"idx": 6,
"version": "6",
"when": 1775038799913,
"tag": "0006_yielding_centennial",
"breakpoints": true
}
]
}
"version": "7",
"dialect": "sqlite",
"entries": [
{
"idx": 0,
"version": "6",
"when": 1774196053634,
"tag": "0000_large_master_chief",
"breakpoints": true
},
{
"idx": 1,
"version": "6",
"when": 1774448049161,
"tag": "0001_quick_nighthawk",
"breakpoints": true
},
{
"idx": 2,
"version": "6",
"when": 1774461897742,
"tag": "0002_silky_stellaris",
"breakpoints": true
},
{
"idx": 3,
"version": "6",
"when": 1743155877000,
"tag": "0003_multiversion_config",
"breakpoints": true
},
{
"idx": 4,
"version": "6",
"when": 1774880275833,
"tag": "0004_complete_sentry",
"breakpoints": true
},
{
"idx": 5,
"version": "6",
"when": 1774890536284,
"tag": "0005_fix_stage_defaults",
"breakpoints": true
},
{
"idx": 6,
"version": "6",
"when": 1775038799913,
"tag": "0006_yielding_centennial",
"breakpoints": true
}
]
}

View File

@@ -349,14 +349,14 @@ describe('snippet_embeddings table', () => {
});
it('keeps the relational schema free of vec_embedding and retains the profile index', () => {
const columns = client
.prepare("PRAGMA table_info('snippet_embeddings')")
.all() as Array<{ name: string }>;
const columns = client.prepare("PRAGMA table_info('snippet_embeddings')").all() as Array<{
name: string;
}>;
expect(columns.map((column) => column.name)).not.toContain('vec_embedding');
const indexes = client
.prepare("PRAGMA index_list('snippet_embeddings')")
.all() as Array<{ name: string }>;
const indexes = client.prepare("PRAGMA index_list('snippet_embeddings')").all() as Array<{
name: string;
}>;
expect(indexes.map((index) => index.name)).toContain('idx_embeddings_profile');
});

View File

@@ -13,29 +13,33 @@ import {
// ---------------------------------------------------------------------------
// repositories
// ---------------------------------------------------------------------------
export const repositories = sqliteTable('repositories', {
id: text('id').primaryKey(), // e.g. "/facebook/react" or "/local/my-sdk"
title: text('title').notNull(),
description: text('description'),
source: text('source', { enum: ['github', 'local'] }).notNull(),
sourceUrl: text('source_url').notNull(), // GitHub URL or absolute local path
branch: text('branch').default('main'),
state: text('state', {
enum: ['pending', 'indexing', 'indexed', 'error']
})
.notNull()
.default('pending'),
totalSnippets: integer('total_snippets').default(0),
totalTokens: integer('total_tokens').default(0),
trustScore: real('trust_score').default(0), // 0.0–10.0
benchmarkScore: real('benchmark_score').default(0), // 0.0–100.0; reserved for future quality metrics
stars: integer('stars'),
// TODO: encrypt at rest in production; stored as plaintext for v1
githubToken: text('github_token'),
lastIndexedAt: integer('last_indexed_at', { mode: 'timestamp' }),
createdAt: integer('created_at', { mode: 'timestamp' }).notNull(),
updatedAt: integer('updated_at', { mode: 'timestamp' }).notNull()
}, (t) => [index('idx_repositories_state').on(t.state)]);
/**
 * Drizzle definition of the `repositories` table.
 *
 * Rows are keyed by a path-style text id and carry a `state` column that is
 * indexed by `idx_repositories_state`.
 */
export const repositories = sqliteTable(
'repositories',
{
id: text('id').primaryKey(), // e.g. "/facebook/react" or "/local/my-sdk"
title: text('title').notNull(),
description: text('description'),
source: text('source', { enum: ['github', 'local'] }).notNull(),
sourceUrl: text('source_url').notNull(), // GitHub URL or absolute local path
branch: text('branch').default('main'),
// Required; one of the four enum values below, defaulting to 'pending'.
state: text('state', {
enum: ['pending', 'indexing', 'indexed', 'error']
})
.notNull()
.default('pending'),
// Counters and scores are nullable at the SQL level but default to 0.
totalSnippets: integer('total_snippets').default(0),
totalTokens: integer('total_tokens').default(0),
trustScore: real('trust_score').default(0), // 0.0–10.0
benchmarkScore: real('benchmark_score').default(0), // 0.0–100.0; reserved for future quality metrics
stars: integer('stars'),
// TODO: encrypt at rest in production; stored as plaintext for v1
githubToken: text('github_token'),
// Integer columns declared with Drizzle's `mode: 'timestamp'`.
lastIndexedAt: integer('last_indexed_at', { mode: 'timestamp' }),
createdAt: integer('created_at', { mode: 'timestamp' }).notNull(),
updatedAt: integer('updated_at', { mode: 'timestamp' }).notNull()
},
// Secondary index on the `state` column.
(t) => [index('idx_repositories_state').on(t.state)]
);
// ---------------------------------------------------------------------------
// repository_versions
@@ -61,43 +65,51 @@ export const repositoryVersions = sqliteTable('repository_versions', {
// ---------------------------------------------------------------------------
// documents
// ---------------------------------------------------------------------------
export const documents = sqliteTable('documents', {
id: text('id').primaryKey(), // UUID
repositoryId: text('repository_id')
.notNull()
.references(() => repositories.id, { onDelete: 'cascade' }),
versionId: text('version_id').references(() => repositoryVersions.id, { onDelete: 'cascade' }),
filePath: text('file_path').notNull(), // relative path within repo
title: text('title'),
language: text('language'), // e.g. "typescript", "markdown"
tokenCount: integer('token_count').default(0),
checksum: text('checksum').notNull(), // SHA-256 of file content
indexedAt: integer('indexed_at', { mode: 'timestamp' }).notNull()
}, (t) => [index('idx_documents_repo_version').on(t.repositoryId, t.versionId)]);
/**
 * Drizzle definition of the `documents` table.
 *
 * Every row must reference a repository and may reference a repository
 * version; both foreign keys are declared with `onDelete: 'cascade'`.
 */
export const documents = sqliteTable(
'documents',
{
id: text('id').primaryKey(), // UUID
repositoryId: text('repository_id')
.notNull()
.references(() => repositories.id, { onDelete: 'cascade' }),
// Nullable: no `.notNull()` is applied to the version reference.
versionId: text('version_id').references(() => repositoryVersions.id, { onDelete: 'cascade' }),
filePath: text('file_path').notNull(), // relative path within repo
title: text('title'),
language: text('language'), // e.g. "typescript", "markdown"
tokenCount: integer('token_count').default(0),
checksum: text('checksum').notNull(), // SHA-256 of file content
indexedAt: integer('indexed_at', { mode: 'timestamp' }).notNull()
},
// Composite index over (repository_id, version_id).
(t) => [index('idx_documents_repo_version').on(t.repositoryId, t.versionId)]
);
// ---------------------------------------------------------------------------
// snippets
// ---------------------------------------------------------------------------
export const snippets = sqliteTable('snippets', {
id: text('id').primaryKey(), // UUID
documentId: text('document_id')
.notNull()
.references(() => documents.id, { onDelete: 'cascade' }),
repositoryId: text('repository_id')
.notNull()
.references(() => repositories.id, { onDelete: 'cascade' }),
versionId: text('version_id').references(() => repositoryVersions.id, { onDelete: 'cascade' }),
type: text('type', { enum: ['code', 'info'] }).notNull(),
title: text('title'),
content: text('content').notNull(), // searchable text / code
language: text('language'),
breadcrumb: text('breadcrumb'), // e.g. "Installation > Getting Started"
tokenCount: integer('token_count').default(0),
createdAt: integer('created_at', { mode: 'timestamp' }).notNull()
}, (t) => [
index('idx_snippets_repo_version').on(t.repositoryId, t.versionId),
index('idx_snippets_repo_type').on(t.repositoryId, t.type),
]);
/**
 * Drizzle definition of the `snippets` table.
 *
 * Every row must reference a document and a repository, and may reference a
 * repository version; all three foreign keys are declared with
 * `onDelete: 'cascade'`.
 */
export const snippets = sqliteTable(
'snippets',
{
id: text('id').primaryKey(), // UUID
documentId: text('document_id')
.notNull()
.references(() => documents.id, { onDelete: 'cascade' }),
repositoryId: text('repository_id')
.notNull()
.references(() => repositories.id, { onDelete: 'cascade' }),
// Nullable: no `.notNull()` is applied to the version reference.
versionId: text('version_id').references(() => repositoryVersions.id, { onDelete: 'cascade' }),
type: text('type', { enum: ['code', 'info'] }).notNull(),
title: text('title'),
content: text('content').notNull(), // searchable text / code
language: text('language'),
breadcrumb: text('breadcrumb'), // e.g. "Installation > Getting Started"
tokenCount: integer('token_count').default(0),
createdAt: integer('created_at', { mode: 'timestamp' }).notNull()
},
// Two composite indexes: (repository_id, version_id) and (repository_id, type).
(t) => [
index('idx_snippets_repo_version').on(t.repositoryId, t.versionId),
index('idx_snippets_repo_type').on(t.repositoryId, t.type)
]
);
// ---------------------------------------------------------------------------
// embedding_profiles
@@ -134,34 +146,52 @@ export const snippetEmbeddings = sqliteTable(
},
(table) => [
primaryKey({ columns: [table.snippetId, table.profileId] }),
index('idx_embeddings_profile').on(table.profileId, table.snippetId),
index('idx_embeddings_profile').on(table.profileId, table.snippetId)
]
);
// ---------------------------------------------------------------------------
// indexing_jobs
// ---------------------------------------------------------------------------
export const indexingJobs = sqliteTable('indexing_jobs', {
id: text('id').primaryKey(), // UUID
repositoryId: text('repository_id')
.notNull()
.references(() => repositories.id, { onDelete: 'cascade' }),
versionId: text('version_id'),
status: text('status', {
enum: ['queued', 'running', 'paused', 'cancelled', 'done', 'failed']
})
.notNull()
.default('queued'),
progress: integer('progress').default(0), // 0–100
totalFiles: integer('total_files').default(0),
processedFiles: integer('processed_files').default(0),
stage: text('stage', { enum: ['queued', 'differential', 'crawling', 'cloning', 'parsing', 'storing', 'embedding', 'done', 'failed'] }).notNull().default('queued'),
stageDetail: text('stage_detail'),
error: text('error'),
startedAt: integer('started_at', { mode: 'timestamp' }),
completedAt: integer('completed_at', { mode: 'timestamp' }),
createdAt: integer('created_at', { mode: 'timestamp' }).notNull()
}, (t) => [index('idx_jobs_repo_status').on(t.repositoryId, t.status)]);
/**
 * Drizzle definition of the `indexing_jobs` table.
 *
 * Each row must reference a repository (cascade on delete) and tracks both a
 * `status` and a `stage`, each a required enum column defaulting to 'queued'.
 */
export const indexingJobs = sqliteTable(
'indexing_jobs',
{
id: text('id').primaryKey(), // UUID
repositoryId: text('repository_id')
.notNull()
.references(() => repositories.id, { onDelete: 'cascade' }),
// Plain nullable text column; no `.references()` is declared for it here.
versionId: text('version_id'),
status: text('status', {
enum: ['queued', 'running', 'paused', 'cancelled', 'done', 'failed']
})
.notNull()
.default('queued'),
progress: integer('progress').default(0), // 0–100
totalFiles: integer('total_files').default(0),
processedFiles: integer('processed_files').default(0),
// Separate from `status`, with its own set of allowed values.
stage: text('stage', {
enum: [
'queued',
'differential',
'crawling',
'cloning',
'parsing',
'storing',
'embedding',
'done',
'failed'
]
})
.notNull()
.default('queued'),
stageDetail: text('stage_detail'),
error: text('error'),
// Nullable timestamps; only `createdAt` is required.
startedAt: integer('started_at', { mode: 'timestamp' }),
completedAt: integer('completed_at', { mode: 'timestamp' }),
createdAt: integer('created_at', { mode: 'timestamp' }).notNull()
},
// Composite index over (repository_id, status).
(t) => [index('idx_jobs_repo_status').on(t.repositoryId, t.status)]
);
// ---------------------------------------------------------------------------
// repository_configs

View File

@@ -46,4 +46,4 @@ export function loadSqliteVec(db: Database.Database): void {
sqliteVec.load(db);
loadedConnections.add(db);
}
}