TRUEREF-0023 rewrite indexing pipeline - parallel reads - serialized writes
This commit is contained in:
@@ -4,6 +4,7 @@
|
||||
*/
|
||||
import Database from 'better-sqlite3';
|
||||
import { env } from '$env/dynamic/private';
|
||||
import { applySqlitePragmas } from './connection';
|
||||
import { loadSqliteVec } from './sqlite-vec';
|
||||
|
||||
let _client: Database.Database | null = null;
|
||||
@@ -12,14 +13,7 @@ export function getClient(): Database.Database {
|
||||
if (!_client) {
|
||||
if (!env.DATABASE_URL) throw new Error('DATABASE_URL is not set');
|
||||
_client = new Database(env.DATABASE_URL);
|
||||
_client.pragma('journal_mode = WAL');
|
||||
_client.pragma('foreign_keys = ON');
|
||||
_client.pragma('busy_timeout = 5000');
|
||||
_client.pragma('synchronous = NORMAL');
|
||||
_client.pragma('cache_size = -65536');
|
||||
_client.pragma('temp_store = MEMORY');
|
||||
_client.pragma('mmap_size = 268435456');
|
||||
_client.pragma('wal_autocheckpoint = 1000');
|
||||
applySqlitePragmas(_client);
|
||||
loadSqliteVec(_client);
|
||||
}
|
||||
return _client;
|
||||
|
||||
14
src/lib/server/db/connection.ts
Normal file
14
src/lib/server/db/connection.ts
Normal file
@@ -0,0 +1,14 @@
|
||||
import type Database from 'better-sqlite3';
|
||||
|
||||
/**
 * Busy timeout in milliseconds (30000 ms = 30 s). `applySqlitePragmas`
 * interpolates this value into the `busy_timeout` pragma.
 */
export const SQLITE_BUSY_TIMEOUT_MS = 30000;
|
||||
|
||||
/**
 * Applies the project's SQLite pragma set to a better-sqlite3 connection.
 *
 * Each pragma is issued through `db.pragma(...)` in the order written below;
 * the return values are ignored.
 *
 * @param db - Open better-sqlite3 connection to configure.
 */
export function applySqlitePragmas(db: Database.Database): void {
|
||||
// Enable WAL mode for better concurrent read performance.
db.pragma('journal_mode = WAL');
|
||||
// Enforce foreign key constraints.
db.pragma('foreign_keys = ON');
|
||||
// Wait up to SQLITE_BUSY_TIMEOUT_MS when the DB is locked instead of failing
// immediately. Prevents SQLITE_BUSY errors when the indexing pipeline holds
// the write lock and an HTTP request arrives simultaneously.
db.pragma(`busy_timeout = ${SQLITE_BUSY_TIMEOUT_MS}`);
|
||||
// NOTE(review): per the SQLite docs NORMAL syncs less often than FULL;
// confirm this durability trade-off is intended.
db.pragma('synchronous = NORMAL');
|
||||
// Per the SQLite docs a negative cache_size is a size in KiB, so this is
// roughly a 64 MiB page cache.
db.pragma('cache_size = -65536');
|
||||
// Per the SQLite docs, keeps temporary tables and indices in memory.
db.pragma('temp_store = MEMORY');
|
||||
// 268435456 bytes = 256 MiB upper bound for memory-mapped I/O.
db.pragma('mmap_size = 268435456');
|
||||
// Per the SQLite docs, auto-checkpoint once the WAL reaches 1000 pages.
db.pragma('wal_autocheckpoint = 1000');
|
||||
}
|
||||
@@ -5,6 +5,7 @@ import { readFileSync } from 'node:fs';
|
||||
import { fileURLToPath } from 'node:url';
|
||||
import { join, dirname } from 'node:path';
|
||||
import * as schema from './schema';
|
||||
import { applySqlitePragmas } from './connection';
|
||||
import { loadSqliteVec } from './sqlite-vec';
|
||||
import { env } from '$env/dynamic/private';
|
||||
|
||||
@@ -12,19 +13,7 @@ if (!env.DATABASE_URL) throw new Error('DATABASE_URL is not set');
|
||||
|
||||
const client = new Database(env.DATABASE_URL);
|
||||
|
||||
// Enable WAL mode for better concurrent read performance.
|
||||
client.pragma('journal_mode = WAL');
|
||||
// Enforce foreign key constraints.
|
||||
client.pragma('foreign_keys = ON');
|
||||
// Wait up to 5 s when the DB is locked instead of failing immediately.
|
||||
// Prevents SQLITE_BUSY errors when the indexing pipeline holds the write lock
|
||||
// and an HTTP request arrives simultaneously.
|
||||
client.pragma('busy_timeout = 5000');
|
||||
client.pragma('synchronous = NORMAL');
|
||||
client.pragma('cache_size = -65536');
|
||||
client.pragma('temp_store = MEMORY');
|
||||
client.pragma('mmap_size = 268435456');
|
||||
client.pragma('wal_autocheckpoint = 1000');
|
||||
applySqlitePragmas(client);
|
||||
loadSqliteVec(client);
|
||||
|
||||
export const db = drizzle(client, { schema });
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1,55 +1,55 @@
|
||||
{
|
||||
"version": "7",
|
||||
"dialect": "sqlite",
|
||||
"entries": [
|
||||
{
|
||||
"idx": 0,
|
||||
"version": "6",
|
||||
"when": 1774196053634,
|
||||
"tag": "0000_large_master_chief",
|
||||
"breakpoints": true
|
||||
},
|
||||
{
|
||||
"idx": 1,
|
||||
"version": "6",
|
||||
"when": 1774448049161,
|
||||
"tag": "0001_quick_nighthawk",
|
||||
"breakpoints": true
|
||||
},
|
||||
{
|
||||
"idx": 2,
|
||||
"version": "6",
|
||||
"when": 1774461897742,
|
||||
"tag": "0002_silky_stellaris",
|
||||
"breakpoints": true
|
||||
},
|
||||
{
|
||||
"idx": 3,
|
||||
"version": "6",
|
||||
"when": 1743155877000,
|
||||
"tag": "0003_multiversion_config",
|
||||
"breakpoints": true
|
||||
},
|
||||
{
|
||||
"idx": 4,
|
||||
"version": "6",
|
||||
"when": 1774880275833,
|
||||
"tag": "0004_complete_sentry",
|
||||
"breakpoints": true
|
||||
},
|
||||
{
|
||||
"idx": 5,
|
||||
"version": "6",
|
||||
"when": 1774890536284,
|
||||
"tag": "0005_fix_stage_defaults",
|
||||
"breakpoints": true
|
||||
},
|
||||
{
|
||||
"idx": 6,
|
||||
"version": "6",
|
||||
"when": 1775038799913,
|
||||
"tag": "0006_yielding_centennial",
|
||||
"breakpoints": true
|
||||
}
|
||||
]
|
||||
}
|
||||
"version": "7",
|
||||
"dialect": "sqlite",
|
||||
"entries": [
|
||||
{
|
||||
"idx": 0,
|
||||
"version": "6",
|
||||
"when": 1774196053634,
|
||||
"tag": "0000_large_master_chief",
|
||||
"breakpoints": true
|
||||
},
|
||||
{
|
||||
"idx": 1,
|
||||
"version": "6",
|
||||
"when": 1774448049161,
|
||||
"tag": "0001_quick_nighthawk",
|
||||
"breakpoints": true
|
||||
},
|
||||
{
|
||||
"idx": 2,
|
||||
"version": "6",
|
||||
"when": 1774461897742,
|
||||
"tag": "0002_silky_stellaris",
|
||||
"breakpoints": true
|
||||
},
|
||||
{
|
||||
"idx": 3,
|
||||
"version": "6",
|
||||
"when": 1743155877000,
|
||||
"tag": "0003_multiversion_config",
|
||||
"breakpoints": true
|
||||
},
|
||||
{
|
||||
"idx": 4,
|
||||
"version": "6",
|
||||
"when": 1774880275833,
|
||||
"tag": "0004_complete_sentry",
|
||||
"breakpoints": true
|
||||
},
|
||||
{
|
||||
"idx": 5,
|
||||
"version": "6",
|
||||
"when": 1774890536284,
|
||||
"tag": "0005_fix_stage_defaults",
|
||||
"breakpoints": true
|
||||
},
|
||||
{
|
||||
"idx": 6,
|
||||
"version": "6",
|
||||
"when": 1775038799913,
|
||||
"tag": "0006_yielding_centennial",
|
||||
"breakpoints": true
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
@@ -349,14 +349,14 @@ describe('snippet_embeddings table', () => {
|
||||
});
|
||||
|
||||
it('keeps the relational schema free of vec_embedding and retains the profile index', () => {
|
||||
const columns = client
|
||||
.prepare("PRAGMA table_info('snippet_embeddings')")
|
||||
.all() as Array<{ name: string }>;
|
||||
const columns = client.prepare("PRAGMA table_info('snippet_embeddings')").all() as Array<{
|
||||
name: string;
|
||||
}>;
|
||||
expect(columns.map((column) => column.name)).not.toContain('vec_embedding');
|
||||
|
||||
const indexes = client
|
||||
.prepare("PRAGMA index_list('snippet_embeddings')")
|
||||
.all() as Array<{ name: string }>;
|
||||
const indexes = client.prepare("PRAGMA index_list('snippet_embeddings')").all() as Array<{
|
||||
name: string;
|
||||
}>;
|
||||
expect(indexes.map((index) => index.name)).toContain('idx_embeddings_profile');
|
||||
});
|
||||
|
||||
|
||||
@@ -13,29 +13,33 @@ import {
|
||||
// ---------------------------------------------------------------------------
|
||||
// repositories
|
||||
// ---------------------------------------------------------------------------
|
||||
export const repositories = sqliteTable('repositories', {
|
||||
id: text('id').primaryKey(), // e.g. "/facebook/react" or "/local/my-sdk"
|
||||
title: text('title').notNull(),
|
||||
description: text('description'),
|
||||
source: text('source', { enum: ['github', 'local'] }).notNull(),
|
||||
sourceUrl: text('source_url').notNull(), // GitHub URL or absolute local path
|
||||
branch: text('branch').default('main'),
|
||||
state: text('state', {
|
||||
enum: ['pending', 'indexing', 'indexed', 'error']
|
||||
})
|
||||
.notNull()
|
||||
.default('pending'),
|
||||
totalSnippets: integer('total_snippets').default(0),
|
||||
totalTokens: integer('total_tokens').default(0),
|
||||
trustScore: real('trust_score').default(0), // 0.0–10.0
|
||||
benchmarkScore: real('benchmark_score').default(0), // 0.0–100.0; reserved for future quality metrics
|
||||
stars: integer('stars'),
|
||||
// TODO: encrypt at rest in production; stored as plaintext for v1
|
||||
githubToken: text('github_token'),
|
||||
lastIndexedAt: integer('last_indexed_at', { mode: 'timestamp' }),
|
||||
createdAt: integer('created_at', { mode: 'timestamp' }).notNull(),
|
||||
updatedAt: integer('updated_at', { mode: 'timestamp' }).notNull()
|
||||
}, (t) => [index('idx_repositories_state').on(t.state)]);
|
||||
/**
 * Drizzle table definition for `repositories`, keyed by a path-style text id.
 *
 * `source` is restricted to 'github' | 'local'. `state` is restricted to
 * 'pending' | 'indexing' | 'indexed' | 'error', defaults to 'pending', and is
 * covered by the `idx_repositories_state` index.
 */
export const repositories = sqliteTable(
|
||||
'repositories',
|
||||
{
|
||||
id: text('id').primaryKey(), // e.g. "/facebook/react" or "/local/my-sdk"
|
||||
title: text('title').notNull(),
|
||||
description: text('description'),
|
||||
source: text('source', { enum: ['github', 'local'] }).notNull(),
|
||||
sourceUrl: text('source_url').notNull(), // GitHub URL or absolute local path
|
||||
branch: text('branch').default('main'),
|
||||
state: text('state', {
|
||||
enum: ['pending', 'indexing', 'indexed', 'error']
|
||||
})
|
||||
.notNull()
|
||||
.default('pending'),
|
||||
// Counters and scores below are nullable and default to 0.
totalSnippets: integer('total_snippets').default(0),
|
||||
totalTokens: integer('total_tokens').default(0),
|
||||
trustScore: real('trust_score').default(0), // 0.0–10.0
|
||||
benchmarkScore: real('benchmark_score').default(0), // 0.0–100.0; reserved for future quality metrics
|
||||
stars: integer('stars'),
|
||||
// TODO: encrypt at rest in production; stored as plaintext for v1
|
||||
githubToken: text('github_token'),
|
||||
// Nullable: no `.notNull()`, unlike createdAt / updatedAt.
lastIndexedAt: integer('last_indexed_at', { mode: 'timestamp' }),
|
||||
createdAt: integer('created_at', { mode: 'timestamp' }).notNull(),
|
||||
updatedAt: integer('updated_at', { mode: 'timestamp' }).notNull()
|
||||
},
|
||||
// Single-column index on `state`.
(t) => [index('idx_repositories_state').on(t.state)]
|
||||
);
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// repository_versions
|
||||
@@ -61,43 +65,51 @@ export const repositoryVersions = sqliteTable('repository_versions', {
|
||||
// ---------------------------------------------------------------------------
|
||||
// documents
|
||||
// ---------------------------------------------------------------------------
|
||||
export const documents = sqliteTable('documents', {
|
||||
id: text('id').primaryKey(), // UUID
|
||||
repositoryId: text('repository_id')
|
||||
.notNull()
|
||||
.references(() => repositories.id, { onDelete: 'cascade' }),
|
||||
versionId: text('version_id').references(() => repositoryVersions.id, { onDelete: 'cascade' }),
|
||||
filePath: text('file_path').notNull(), // relative path within repo
|
||||
title: text('title'),
|
||||
language: text('language'), // e.g. "typescript", "markdown"
|
||||
tokenCount: integer('token_count').default(0),
|
||||
checksum: text('checksum').notNull(), // SHA-256 of file content
|
||||
indexedAt: integer('indexed_at', { mode: 'timestamp' }).notNull()
|
||||
}, (t) => [index('idx_documents_repo_version').on(t.repositoryId, t.versionId)]);
|
||||
/**
 * Drizzle table definition for `documents`.
 *
 * Each row references a repository (required) and a repository version
 * (nullable). Both foreign keys use `onDelete: 'cascade'`, so deleting the
 * parent row deletes its documents. The `idx_documents_repo_version` index
 * covers (repository_id, version_id).
 */
export const documents = sqliteTable(
|
||||
'documents',
|
||||
{
|
||||
id: text('id').primaryKey(), // UUID
|
||||
repositoryId: text('repository_id')
|
||||
.notNull()
|
||||
.references(() => repositories.id, { onDelete: 'cascade' }),
|
||||
// Nullable FK. NOTE(review): presumably null means "no specific version" — confirm.
versionId: text('version_id').references(() => repositoryVersions.id, { onDelete: 'cascade' }),
|
||||
filePath: text('file_path').notNull(), // relative path within repo
|
||||
title: text('title'),
|
||||
language: text('language'), // e.g. "typescript", "markdown"
|
||||
tokenCount: integer('token_count').default(0),
|
||||
checksum: text('checksum').notNull(), // SHA-256 of file content
|
||||
indexedAt: integer('indexed_at', { mode: 'timestamp' }).notNull()
|
||||
},
|
||||
// Composite index on (repository_id, version_id).
(t) => [index('idx_documents_repo_version').on(t.repositoryId, t.versionId)]
|
||||
);
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// snippets
|
||||
// ---------------------------------------------------------------------------
|
||||
export const snippets = sqliteTable('snippets', {
|
||||
id: text('id').primaryKey(), // UUID
|
||||
documentId: text('document_id')
|
||||
.notNull()
|
||||
.references(() => documents.id, { onDelete: 'cascade' }),
|
||||
repositoryId: text('repository_id')
|
||||
.notNull()
|
||||
.references(() => repositories.id, { onDelete: 'cascade' }),
|
||||
versionId: text('version_id').references(() => repositoryVersions.id, { onDelete: 'cascade' }),
|
||||
type: text('type', { enum: ['code', 'info'] }).notNull(),
|
||||
title: text('title'),
|
||||
content: text('content').notNull(), // searchable text / code
|
||||
language: text('language'),
|
||||
breadcrumb: text('breadcrumb'), // e.g. "Installation > Getting Started"
|
||||
tokenCount: integer('token_count').default(0),
|
||||
createdAt: integer('created_at', { mode: 'timestamp' }).notNull()
|
||||
}, (t) => [
|
||||
index('idx_snippets_repo_version').on(t.repositoryId, t.versionId),
|
||||
index('idx_snippets_repo_type').on(t.repositoryId, t.type),
|
||||
]);
|
||||
/**
 * Drizzle table definition for `snippets`.
 *
 * Each row references its document and repository (both required) and a
 * repository version (nullable). All three foreign keys use
 * `onDelete: 'cascade'`. `type` is restricted to 'code' | 'info'.
 * Two composite indexes are declared: (repository_id, version_id) and
 * (repository_id, type).
 */
export const snippets = sqliteTable(
|
||||
'snippets',
|
||||
{
|
||||
id: text('id').primaryKey(), // UUID
|
||||
documentId: text('document_id')
|
||||
.notNull()
|
||||
.references(() => documents.id, { onDelete: 'cascade' }),
|
||||
repositoryId: text('repository_id')
|
||||
.notNull()
|
||||
.references(() => repositories.id, { onDelete: 'cascade' }),
|
||||
// Nullable FK to repository_versions.
versionId: text('version_id').references(() => repositoryVersions.id, { onDelete: 'cascade' }),
|
||||
type: text('type', { enum: ['code', 'info'] }).notNull(),
|
||||
title: text('title'),
|
||||
content: text('content').notNull(), // searchable text / code
|
||||
language: text('language'),
|
||||
breadcrumb: text('breadcrumb'), // e.g. "Installation > Getting Started"
|
||||
tokenCount: integer('token_count').default(0),
|
||||
createdAt: integer('created_at', { mode: 'timestamp' }).notNull()
|
||||
},
|
||||
// Both indexes lead with repository_id.
(t) => [
|
||||
index('idx_snippets_repo_version').on(t.repositoryId, t.versionId),
|
||||
index('idx_snippets_repo_type').on(t.repositoryId, t.type)
|
||||
]
|
||||
);
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// embedding_profiles
|
||||
@@ -134,34 +146,52 @@ export const snippetEmbeddings = sqliteTable(
|
||||
},
|
||||
(table) => [
|
||||
primaryKey({ columns: [table.snippetId, table.profileId] }),
|
||||
index('idx_embeddings_profile').on(table.profileId, table.snippetId),
|
||||
index('idx_embeddings_profile').on(table.profileId, table.snippetId)
|
||||
]
|
||||
);
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// indexing_jobs
|
||||
// ---------------------------------------------------------------------------
|
||||
export const indexingJobs = sqliteTable('indexing_jobs', {
|
||||
id: text('id').primaryKey(), // UUID
|
||||
repositoryId: text('repository_id')
|
||||
.notNull()
|
||||
.references(() => repositories.id, { onDelete: 'cascade' }),
|
||||
versionId: text('version_id'),
|
||||
status: text('status', {
|
||||
enum: ['queued', 'running', 'paused', 'cancelled', 'done', 'failed']
|
||||
})
|
||||
.notNull()
|
||||
.default('queued'),
|
||||
progress: integer('progress').default(0), // 0–100
|
||||
totalFiles: integer('total_files').default(0),
|
||||
processedFiles: integer('processed_files').default(0),
|
||||
stage: text('stage', { enum: ['queued', 'differential', 'crawling', 'cloning', 'parsing', 'storing', 'embedding', 'done', 'failed'] }).notNull().default('queued'),
|
||||
stageDetail: text('stage_detail'),
|
||||
error: text('error'),
|
||||
startedAt: integer('started_at', { mode: 'timestamp' }),
|
||||
completedAt: integer('completed_at', { mode: 'timestamp' }),
|
||||
createdAt: integer('created_at', { mode: 'timestamp' }).notNull()
|
||||
}, (t) => [index('idx_jobs_repo_status').on(t.repositoryId, t.status)]);
|
||||
/**
 * Drizzle table definition for `indexing_jobs`.
 *
 * Each row references a repository with `onDelete: 'cascade'`. `status` and
 * `stage` are separate enum-constrained text columns that both default to
 * 'queued'. The `idx_jobs_repo_status` index covers (repository_id, status).
 */
export const indexingJobs = sqliteTable(
|
||||
'indexing_jobs',
|
||||
{
|
||||
id: text('id').primaryKey(), // UUID
|
||||
repositoryId: text('repository_id')
|
||||
.notNull()
|
||||
.references(() => repositories.id, { onDelete: 'cascade' }),
|
||||
// Plain nullable text: unlike documents/snippets, no foreign key to
// repository_versions is declared here.
// NOTE(review): confirm the missing reference is intentional.
versionId: text('version_id'),
|
||||
status: text('status', {
|
||||
enum: ['queued', 'running', 'paused', 'cancelled', 'done', 'failed']
|
||||
})
|
||||
.notNull()
|
||||
.default('queued'),
|
||||
progress: integer('progress').default(0), // 0–100
|
||||
totalFiles: integer('total_files').default(0),
|
||||
processedFiles: integer('processed_files').default(0),
|
||||
// Stage values overlap with `status` only on 'queued', 'done' and 'failed'.
stage: text('stage', {
|
||||
enum: [
|
||||
'queued',
|
||||
'differential',
|
||||
'crawling',
|
||||
'cloning',
|
||||
'parsing',
|
||||
'storing',
|
||||
'embedding',
|
||||
'done',
|
||||
'failed'
|
||||
]
|
||||
})
|
||||
.notNull()
|
||||
.default('queued'),
|
||||
stageDetail: text('stage_detail'),
|
||||
error: text('error'),
|
||||
// startedAt / completedAt are nullable; only createdAt is required.
startedAt: integer('started_at', { mode: 'timestamp' }),
|
||||
completedAt: integer('completed_at', { mode: 'timestamp' }),
|
||||
createdAt: integer('created_at', { mode: 'timestamp' }).notNull()
|
||||
},
|
||||
// Composite index on (repository_id, status).
(t) => [index('idx_jobs_repo_status').on(t.repositoryId, t.status)]
|
||||
);
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// repository_configs
|
||||
|
||||
@@ -46,4 +46,4 @@ export function loadSqliteVec(db: Database.Database): void {
|
||||
|
||||
sqliteVec.load(db);
|
||||
loadedConnections.add(db);
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user