TRUEREF-0023 rewrite indexing pipeline - parallel reads - serialized writes
This commit is contained in:
343
src/lib/server/pipeline/write-operations.ts
Normal file
343
src/lib/server/pipeline/write-operations.ts
Normal file
@@ -0,0 +1,343 @@
|
||||
import { randomUUID } from 'node:crypto';
|
||||
import type Database from 'better-sqlite3';
|
||||
import type { NewDocument, NewSnippet } from '$lib/types';
|
||||
import { SqliteVecStore } from '$lib/server/search/sqlite-vec.store.js';
|
||||
import type {
|
||||
SerializedDocument,
|
||||
SerializedEmbedding,
|
||||
SerializedFields,
|
||||
SerializedSnippet
|
||||
} from './worker-types.js';
|
||||
|
||||
/**
 * Minimal document shape accepted by the write path.
 *
 * Structural subset shared by both `NewDocument` and the worker-serialized
 * `SerializedDocument`, so `replaceSnippetsInternal` can persist either.
 */
type DocumentLike = Pick<
  NewDocument,
  | 'id'
  | 'repositoryId'
  | 'versionId'
  | 'filePath'
  | 'title'
  | 'language'
  | 'tokenCount'
  | 'checksum'
> & {
  // Either a Date or a precomputed epoch-seconds number;
  // normalized via toEpochSeconds() before insertion.
  indexedAt: Date | number;
};
|
||||
|
||||
/**
 * Minimal snippet shape accepted by the write path.
 *
 * Structural subset shared by both `NewSnippet` and the worker-serialized
 * `SerializedSnippet`, so `replaceSnippetsInternal` can persist either.
 */
type SnippetLike = Pick<
  NewSnippet,
  | 'id'
  | 'documentId'
  | 'repositoryId'
  | 'versionId'
  | 'type'
  | 'title'
  | 'content'
  | 'language'
  | 'breadcrumb'
  | 'tokenCount'
> & {
  // Either a Date or a precomputed epoch-seconds number;
  // normalized via toEpochSeconds() before insertion.
  createdAt: Date | number;
};
|
||||
|
||||
/**
 * Request to copy documents, snippets, and embeddings from an already-indexed
 * ancestor version into a new target version, for file paths whose content is
 * unchanged (avoids re-parsing and re-embedding them).
 */
export interface CloneFromAncestorRequest {
  /** Version whose rows are copied. */
  ancestorVersionId: string;
  /** Version the cloned rows are inserted under. */
  targetVersionId: string;
  /** Repository that owns both versions. */
  repositoryId: string;
  /** File paths identical between ancestor and target; only these are cloned. */
  unchangedPaths: string[];
}
|
||||
|
||||
/**
 * One embedding row ready to be persisted for a snippet, keyed by
 * (snippet, embedding profile).
 */
export interface PersistedEmbedding {
  /** Snippet the vector belongs to. */
  snippetId: string;
  /** Embedding profile that produced the vector. */
  profileId: string;
  /** Model identifier recorded alongside the vector. */
  model: string;
  /** Vector dimensionality. */
  dimensions: number;
  /** Raw vector bytes; normalized to a Buffer before insertion. */
  embedding: Buffer | Uint8Array;
}
|
||||
|
||||
function toEpochSeconds(value: Date | number): number {
|
||||
return value instanceof Date ? Math.floor(value.getTime() / 1000) : value;
|
||||
}
|
||||
|
||||
function toSnake(key: string): string {
|
||||
return key.replace(/[A-Z]/g, (char) => `_${char.toLowerCase()}`);
|
||||
}
|
||||
|
||||
/**
 * Atomically replaces changed documents and their snippets.
 *
 * Inside a single transaction: deletes vector-store embeddings for the changed
 * documents, deletes the changed document rows, then inserts the new documents
 * and snippets.
 *
 * NOTE(review): snippet rows for the deleted documents are never deleted
 * explicitly here — this appears to rely on an ON DELETE CASCADE foreign key
 * from snippets to documents; confirm against the schema.
 *
 * @param db - better-sqlite3 connection (writes are serialized by the caller).
 * @param changedDocIds - document ids whose rows (and embeddings) are purged first.
 * @param newDocuments - replacement document rows to insert.
 * @param newSnippets - replacement snippet rows to insert.
 */
function replaceSnippetsInternal(
  db: Database.Database,
  changedDocIds: string[],
  newDocuments: DocumentLike[],
  newSnippets: SnippetLike[]
): void {
  const sqliteVecStore = new SqliteVecStore(db);
  // Prepared once; reused for every inserted row below.
  const insertDoc = db.prepare(
    `INSERT INTO documents
      (id, repository_id, version_id, file_path, title, language,
       token_count, checksum, indexed_at)
      VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`
  );

  const insertSnippet = db.prepare(
    `INSERT INTO snippets
      (id, document_id, repository_id, version_id, type, title,
       content, language, breadcrumb, token_count, created_at)
      VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
  );

  // db.transaction(fn) returns a callable; the trailing () runs it immediately.
  db.transaction(() => {
    // Called unconditionally, even for an empty id list — presumably
    // deleteEmbeddingsForDocumentIds no-ops on []; verify in SqliteVecStore.
    sqliteVecStore.deleteEmbeddingsForDocumentIds(changedDocIds);

    if (changedDocIds.length > 0) {
      // Dynamic placeholder list; only the ids are bound, never interpolated.
      const placeholders = changedDocIds.map(() => '?').join(',');
      db.prepare(`DELETE FROM documents WHERE id IN (${placeholders})`).run(...changedDocIds);
    }

    for (const doc of newDocuments) {
      insertDoc.run(
        doc.id,
        doc.repositoryId,
        doc.versionId ?? null,
        doc.filePath,
        doc.title ?? null,
        doc.language ?? null,
        doc.tokenCount ?? 0,
        doc.checksum,
        toEpochSeconds(doc.indexedAt)
      );
    }

    for (const snippet of newSnippets) {
      insertSnippet.run(
        snippet.id,
        snippet.documentId,
        snippet.repositoryId,
        snippet.versionId ?? null,
        snippet.type,
        snippet.title ?? null,
        snippet.content,
        snippet.language ?? null,
        snippet.breadcrumb ?? null,
        snippet.tokenCount ?? 0,
        toEpochSeconds(snippet.createdAt)
      );
    }
  })();
}
|
||||
|
||||
export function replaceSnippets(
|
||||
db: Database.Database,
|
||||
changedDocIds: string[],
|
||||
newDocuments: NewDocument[],
|
||||
newSnippets: NewSnippet[]
|
||||
): void {
|
||||
replaceSnippetsInternal(db, changedDocIds, newDocuments, newSnippets);
|
||||
}
|
||||
|
||||
export function replaceSerializedSnippets(
|
||||
db: Database.Database,
|
||||
changedDocIds: string[],
|
||||
documents: SerializedDocument[],
|
||||
snippets: SerializedSnippet[]
|
||||
): void {
|
||||
replaceSnippetsInternal(db, changedDocIds, documents, snippets);
|
||||
}
|
||||
|
||||
export function cloneFromAncestor(db: Database.Database, request: CloneFromAncestorRequest): void {
|
||||
const sqliteVecStore = new SqliteVecStore(db);
|
||||
const { ancestorVersionId, targetVersionId, repositoryId, unchangedPaths } = request;
|
||||
|
||||
db.transaction(() => {
|
||||
const pathList = [...unchangedPaths];
|
||||
if (pathList.length === 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
const placeholders = pathList.map(() => '?').join(',');
|
||||
const ancestorDocs = db
|
||||
.prepare(`SELECT * FROM documents WHERE version_id = ? AND file_path IN (${placeholders})`)
|
||||
.all(ancestorVersionId, ...pathList) as Array<{
|
||||
id: string;
|
||||
repository_id: string;
|
||||
file_path: string;
|
||||
title: string | null;
|
||||
language: string | null;
|
||||
token_count: number;
|
||||
checksum: string;
|
||||
indexed_at: number;
|
||||
}>;
|
||||
|
||||
const docIdMap = new Map<string, string>();
|
||||
const nowEpoch = Math.floor(Date.now() / 1000);
|
||||
|
||||
for (const doc of ancestorDocs) {
|
||||
const newDocId = randomUUID();
|
||||
docIdMap.set(doc.id, newDocId);
|
||||
db.prepare(
|
||||
`INSERT INTO documents (id, repository_id, version_id, file_path, title, language, token_count, checksum, indexed_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`
|
||||
).run(
|
||||
newDocId,
|
||||
repositoryId,
|
||||
targetVersionId,
|
||||
doc.file_path,
|
||||
doc.title,
|
||||
doc.language,
|
||||
doc.token_count,
|
||||
doc.checksum,
|
||||
nowEpoch
|
||||
);
|
||||
}
|
||||
|
||||
if (docIdMap.size === 0) return;
|
||||
|
||||
const oldDocIds = [...docIdMap.keys()];
|
||||
const snippetPlaceholders = oldDocIds.map(() => '?').join(',');
|
||||
const ancestorSnippets = db
|
||||
.prepare(`SELECT * FROM snippets WHERE document_id IN (${snippetPlaceholders})`)
|
||||
.all(...oldDocIds) as Array<{
|
||||
id: string;
|
||||
document_id: string;
|
||||
repository_id: string;
|
||||
version_id: string | null;
|
||||
type: string;
|
||||
title: string | null;
|
||||
content: string;
|
||||
language: string | null;
|
||||
breadcrumb: string | null;
|
||||
token_count: number;
|
||||
created_at: number;
|
||||
}>;
|
||||
|
||||
const snippetIdMap = new Map<string, string>();
|
||||
for (const snippet of ancestorSnippets) {
|
||||
const newSnippetId = randomUUID();
|
||||
snippetIdMap.set(snippet.id, newSnippetId);
|
||||
const newDocId = docIdMap.get(snippet.document_id)!;
|
||||
db.prepare(
|
||||
`INSERT INTO snippets (id, document_id, repository_id, version_id, type, title, content, language, breadcrumb, token_count, created_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
|
||||
).run(
|
||||
newSnippetId,
|
||||
newDocId,
|
||||
repositoryId,
|
||||
targetVersionId,
|
||||
snippet.type,
|
||||
snippet.title,
|
||||
snippet.content,
|
||||
snippet.language,
|
||||
snippet.breadcrumb,
|
||||
snippet.token_count,
|
||||
snippet.created_at
|
||||
);
|
||||
}
|
||||
|
||||
if (snippetIdMap.size === 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
const oldSnippetIds = [...snippetIdMap.keys()];
|
||||
const embPlaceholders = oldSnippetIds.map(() => '?').join(',');
|
||||
const ancestorEmbeddings = db
|
||||
.prepare(`SELECT * FROM snippet_embeddings WHERE snippet_id IN (${embPlaceholders})`)
|
||||
.all(...oldSnippetIds) as Array<{
|
||||
snippet_id: string;
|
||||
profile_id: string;
|
||||
model: string;
|
||||
dimensions: number;
|
||||
embedding: Buffer;
|
||||
created_at: number;
|
||||
}>;
|
||||
|
||||
for (const emb of ancestorEmbeddings) {
|
||||
const newSnippetId = snippetIdMap.get(emb.snippet_id)!;
|
||||
db.prepare(
|
||||
`INSERT INTO snippet_embeddings (snippet_id, profile_id, model, dimensions, embedding, created_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?)`
|
||||
).run(newSnippetId, emb.profile_id, emb.model, emb.dimensions, emb.embedding, emb.created_at);
|
||||
sqliteVecStore.upsertEmbeddingBuffer(
|
||||
emb.profile_id,
|
||||
newSnippetId,
|
||||
emb.embedding,
|
||||
emb.dimensions
|
||||
);
|
||||
}
|
||||
})();
|
||||
}
|
||||
|
||||
export function upsertEmbeddings(db: Database.Database, embeddings: PersistedEmbedding[]): void {
|
||||
if (embeddings.length === 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
const sqliteVecStore = new SqliteVecStore(db);
|
||||
const insert = db.prepare<[string, string, string, number, Buffer]>(`
|
||||
INSERT OR REPLACE INTO snippet_embeddings (snippet_id, profile_id, model, dimensions, embedding, created_at)
|
||||
VALUES (?, ?, ?, ?, ?, unixepoch())
|
||||
`);
|
||||
|
||||
db.transaction(() => {
|
||||
for (const item of embeddings) {
|
||||
const embeddingBuffer = Buffer.isBuffer(item.embedding)
|
||||
? item.embedding
|
||||
: Buffer.from(item.embedding);
|
||||
|
||||
insert.run(item.snippetId, item.profileId, item.model, item.dimensions, embeddingBuffer);
|
||||
|
||||
sqliteVecStore.upsertEmbeddingBuffer(
|
||||
item.profileId,
|
||||
item.snippetId,
|
||||
embeddingBuffer,
|
||||
item.dimensions
|
||||
);
|
||||
}
|
||||
})();
|
||||
}
|
||||
|
||||
export function upsertSerializedEmbeddings(
|
||||
db: Database.Database,
|
||||
embeddings: SerializedEmbedding[]
|
||||
): void {
|
||||
upsertEmbeddings(
|
||||
db,
|
||||
embeddings.map((item) => ({
|
||||
snippetId: item.snippetId,
|
||||
profileId: item.profileId,
|
||||
model: item.model,
|
||||
dimensions: item.dimensions,
|
||||
embedding: item.embedding
|
||||
}))
|
||||
);
|
||||
}
|
||||
|
||||
export function updateRepo(
|
||||
db: Database.Database,
|
||||
repositoryId: string,
|
||||
fields: SerializedFields
|
||||
): void {
|
||||
const now = Math.floor(Date.now() / 1000);
|
||||
const allFields = { ...fields, updatedAt: now };
|
||||
const sets = Object.keys(allFields)
|
||||
.map((key) => `${toSnake(key)} = ?`)
|
||||
.join(', ');
|
||||
const values = [...Object.values(allFields), repositoryId];
|
||||
db.prepare(`UPDATE repositories SET ${sets} WHERE id = ?`).run(...values);
|
||||
}
|
||||
|
||||
export function updateJob(db: Database.Database, jobId: string, fields: SerializedFields): void {
|
||||
const sets = Object.keys(fields)
|
||||
.map((key) => `${toSnake(key)} = ?`)
|
||||
.join(', ');
|
||||
const values = [...Object.values(fields), jobId];
|
||||
db.prepare(`UPDATE indexing_jobs SET ${sets} WHERE id = ?`).run(...values);
|
||||
}
|
||||
|
||||
export function updateVersion(
|
||||
db: Database.Database,
|
||||
versionId: string,
|
||||
fields: SerializedFields
|
||||
): void {
|
||||
const sets = Object.keys(fields)
|
||||
.map((key) => `${toSnake(key)} = ?`)
|
||||
.join(', ');
|
||||
const values = [...Object.values(fields), versionId];
|
||||
db.prepare(`UPDATE repository_versions SET ${sets} WHERE id = ?`).run(...values);
|
||||
}
|
||||
Reference in New Issue
Block a user