Files
trueref/src/lib/server/pipeline/write-operations.ts

344 lines
8.9 KiB
TypeScript

import { randomUUID } from 'node:crypto';
import type Database from 'better-sqlite3';
import type { NewDocument, NewSnippet } from '$lib/types';
import { SqliteVecStore } from '$lib/server/search/sqlite-vec.store.js';
import type {
SerializedDocument,
SerializedEmbedding,
SerializedFields,
SerializedSnippet
} from './worker-types.js';
/**
 * Document shape accepted by the write helpers in this module.
 *
 * Identical to NewDocument for the picked columns, but `indexedAt` is widened
 * to `Date | number` so in-process rows (Date) and worker-serialized rows
 * (presumably epoch seconds — numbers pass through toEpochSeconds unchanged)
 * can share one insert path.
 */
type DocumentLike = Pick<
  NewDocument,
  | 'id'
  | 'repositoryId'
  | 'versionId'
  | 'filePath'
  | 'title'
  | 'language'
  | 'tokenCount'
  | 'checksum'
> & {
  // Date or epoch seconds; normalized via toEpochSeconds before binding.
  indexedAt: Date | number;
};
/**
 * Snippet shape accepted by the write helpers in this module.
 *
 * Identical to NewSnippet for the picked columns, but `createdAt` is widened
 * to `Date | number` so in-process rows (Date) and worker-serialized rows
 * (numbers, treated as epoch seconds by toEpochSeconds) can share one
 * insert path.
 */
type SnippetLike = Pick<
  NewSnippet,
  | 'id'
  | 'documentId'
  | 'repositoryId'
  | 'versionId'
  | 'type'
  | 'title'
  | 'content'
  | 'language'
  | 'breadcrumb'
  | 'tokenCount'
> & {
  // Date or epoch seconds; normalized via toEpochSeconds before binding.
  createdAt: Date | number;
};
/** Parameters for cloneFromAncestor. */
export interface CloneFromAncestorRequest {
  /** Version whose documents/snippets/embeddings are copied. */
  ancestorVersionId: string;
  /** Version the cloned rows are attached to. */
  targetVersionId: string;
  /** Repository id stamped onto every cloned row. */
  repositoryId: string;
  /** File paths considered unchanged and therefore safe to clone verbatim. */
  unchangedPaths: string[];
}
/**
 * One embedding ready to be written to snippet_embeddings and mirrored into
 * the sqlite-vec index (see upsertEmbeddings).
 */
export interface PersistedEmbedding {
  snippetId: string;
  profileId: string;
  model: string;
  /** Vector dimensionality; forwarded to the vec store alongside the bytes. */
  dimensions: number;
  /** Raw vector bytes; Uint8Array inputs are converted to Buffer before binding. */
  embedding: Buffer | Uint8Array;
}
/**
 * Normalize a timestamp to whole epoch seconds.
 *
 * Date instances are converted from milliseconds (truncated toward zero via
 * Math.floor); plain numbers are assumed to already be epoch seconds and are
 * returned untouched.
 */
function toEpochSeconds(value: Date | number): number {
  if (value instanceof Date) {
    return Math.floor(value.getTime() / 1000);
  }
  return value;
}
/**
 * Convert a camelCase identifier to snake_case,
 * e.g. "tokenCount" -> "token_count". Non-uppercase characters (including
 * existing underscores) pass through unchanged.
 */
function toSnake(key: string): string {
  let result = '';
  for (const ch of key) {
    result += ch >= 'A' && ch <= 'Z' ? `_${ch.toLowerCase()}` : ch;
  }
  return result;
}
/**
 * Core implementation shared by replaceSnippets / replaceSerializedSnippets.
 *
 * In a single transaction:
 *   1. drops vector-index embeddings for the changed documents,
 *   2. deletes the changed document rows (snippet rows are not deleted here —
 *      presumably removed via ON DELETE CASCADE; TODO confirm against schema),
 *   3. inserts the replacement documents and snippets.
 *
 * Timestamp fields accept Date or epoch-seconds numbers and are normalized
 * with toEpochSeconds before binding.
 */
function replaceSnippetsInternal(
  db: Database.Database,
  changedDocIds: string[],
  newDocuments: DocumentLike[],
  newSnippets: SnippetLike[]
): void {
  const sqliteVecStore = new SqliteVecStore(db);
  const insertDoc = db.prepare(
    `INSERT INTO documents
(id, repository_id, version_id, file_path, title, language,
token_count, checksum, indexed_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`
  );
  const insertSnippet = db.prepare(
    `INSERT INTO snippets
(id, document_id, repository_id, version_id, type, title,
content, language, breadcrumb, token_count, created_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
  );
  // Delete in bounded chunks: one placeholder per id means a very large
  // changeset could otherwise exceed SQLite's bound-parameter limit
  // (SQLITE_MAX_VARIABLE_NUMBER).
  const CHUNK = 500;
  db.transaction(() => {
    sqliteVecStore.deleteEmbeddingsForDocumentIds(changedDocIds);
    for (let i = 0; i < changedDocIds.length; i += CHUNK) {
      const ids = changedDocIds.slice(i, i + CHUNK);
      const placeholders = ids.map(() => '?').join(',');
      db.prepare(`DELETE FROM documents WHERE id IN (${placeholders})`).run(...ids);
    }
    for (const doc of newDocuments) {
      insertDoc.run(
        doc.id,
        doc.repositoryId,
        doc.versionId ?? null,
        doc.filePath,
        doc.title ?? null,
        doc.language ?? null,
        doc.tokenCount ?? 0,
        doc.checksum,
        toEpochSeconds(doc.indexedAt)
      );
    }
    for (const snippet of newSnippets) {
      insertSnippet.run(
        snippet.id,
        snippet.documentId,
        snippet.repositoryId,
        snippet.versionId ?? null,
        snippet.type,
        snippet.title ?? null,
        snippet.content,
        snippet.language ?? null,
        snippet.breadcrumb ?? null,
        snippet.tokenCount ?? 0,
        toEpochSeconds(snippet.createdAt)
      );
    }
  })();
}
/**
 * Replace documents/snippets using richly-typed rows (Date timestamps).
 * Thin wrapper over replaceSnippetsInternal; all writes happen in one
 * transaction.
 */
export function replaceSnippets(
  db: Database.Database,
  changedDocIds: string[],
  newDocuments: NewDocument[],
  newSnippets: NewSnippet[]
): void {
  replaceSnippetsInternal(db, changedDocIds, newDocuments, newSnippets);
}
/**
 * Replace documents/snippets using worker-serialized rows (numeric
 * timestamps). Thin wrapper over replaceSnippetsInternal; all writes happen
 * in one transaction.
 */
export function replaceSerializedSnippets(
  db: Database.Database,
  changedDocIds: string[],
  documents: SerializedDocument[],
  snippets: SerializedSnippet[]
): void {
  replaceSnippetsInternal(db, changedDocIds, documents, snippets);
}
/**
 * Clone documents, snippets, and embeddings for unchanged file paths from an
 * ancestor version into a target version.
 *
 * Every cloned row gets a fresh UUID primary key; foreign keys are remapped
 * through id maps so cloned snippets reference cloned documents and cloned
 * embeddings reference cloned snippets. Documents receive a new indexed_at
 * stamp, while snippet/embedding created_at values are carried over from the
 * ancestor rows. Each cloned embedding is also re-upserted into the
 * sqlite-vec index under its new snippet id. Runs in a single transaction
 * and is a no-op when there is nothing to clone.
 */
export function cloneFromAncestor(db: Database.Database, request: CloneFromAncestorRequest): void {
  const sqliteVecStore = new SqliteVecStore(db);
  const { ancestorVersionId, targetVersionId, repositoryId, unchangedPaths } = request;

  // Bound every IN (...) list so large repositories cannot exceed SQLite's
  // bound-parameter limit (SQLITE_MAX_VARIABLE_NUMBER).
  const CHUNK = 500;
  const chunksOf = <T>(items: readonly T[]): T[][] => {
    const out: T[][] = [];
    for (let i = 0; i < items.length; i += CHUNK) {
      out.push(items.slice(i, i + CHUNK));
    }
    return out;
  };

  // Prepared once and reused; the original prepared inside the row loops,
  // recompiling the same SQL for every cloned row.
  const insertDoc = db.prepare(
    `INSERT INTO documents (id, repository_id, version_id, file_path, title, language, token_count, checksum, indexed_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`
  );
  const insertSnippet = db.prepare(
    `INSERT INTO snippets (id, document_id, repository_id, version_id, type, title, content, language, breadcrumb, token_count, created_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
  );
  const insertEmbedding = db.prepare(
    `INSERT INTO snippet_embeddings (snippet_id, profile_id, model, dimensions, embedding, created_at)
VALUES (?, ?, ?, ?, ?, ?)`
  );

  type DocRow = {
    id: string;
    repository_id: string;
    file_path: string;
    title: string | null;
    language: string | null;
    token_count: number;
    checksum: string;
    indexed_at: number;
  };
  type SnippetRow = {
    id: string;
    document_id: string;
    repository_id: string;
    version_id: string | null;
    type: string;
    title: string | null;
    content: string;
    language: string | null;
    breadcrumb: string | null;
    token_count: number;
    created_at: number;
  };
  type EmbeddingRow = {
    snippet_id: string;
    profile_id: string;
    model: string;
    dimensions: number;
    embedding: Buffer;
    created_at: number;
  };

  db.transaction(() => {
    if (unchangedPaths.length === 0) {
      return;
    }

    // Fetch the ancestor documents for the unchanged paths.
    const ancestorDocs: DocRow[] = [];
    for (const paths of chunksOf(unchangedPaths)) {
      const placeholders = paths.map(() => '?').join(',');
      const rows = db
        .prepare(`SELECT * FROM documents WHERE version_id = ? AND file_path IN (${placeholders})`)
        .all(ancestorVersionId, ...paths) as DocRow[];
      for (const row of rows) ancestorDocs.push(row);
    }

    // Clone documents under fresh ids, recording old -> new mapping.
    const docIdMap = new Map<string, string>();
    const nowEpoch = Math.floor(Date.now() / 1000);
    for (const doc of ancestorDocs) {
      const newDocId = randomUUID();
      docIdMap.set(doc.id, newDocId);
      insertDoc.run(
        newDocId,
        repositoryId,
        targetVersionId,
        doc.file_path,
        doc.title,
        doc.language,
        doc.token_count,
        doc.checksum,
        nowEpoch // cloned documents get a fresh indexed_at
      );
    }
    if (docIdMap.size === 0) return;

    // Fetch the snippets belonging to the cloned documents.
    const oldDocIds = [...docIdMap.keys()];
    const ancestorSnippets: SnippetRow[] = [];
    for (const ids of chunksOf(oldDocIds)) {
      const placeholders = ids.map(() => '?').join(',');
      const rows = db
        .prepare(`SELECT * FROM snippets WHERE document_id IN (${placeholders})`)
        .all(...ids) as SnippetRow[];
      for (const row of rows) ancestorSnippets.push(row);
    }

    // Clone snippets under fresh ids, remapping document_id.
    const snippetIdMap = new Map<string, string>();
    for (const snippet of ancestorSnippets) {
      const newDocId = docIdMap.get(snippet.document_id);
      if (newDocId === undefined) continue; // defensive: rows were selected by these very ids
      const newSnippetId = randomUUID();
      snippetIdMap.set(snippet.id, newSnippetId);
      insertSnippet.run(
        newSnippetId,
        newDocId,
        repositoryId,
        targetVersionId,
        snippet.type,
        snippet.title,
        snippet.content,
        snippet.language,
        snippet.breadcrumb,
        snippet.token_count,
        snippet.created_at // preserved from the ancestor row
      );
    }
    if (snippetIdMap.size === 0) {
      return;
    }

    // Clone embeddings, remapping snippet_id and mirroring into the vec index.
    const oldSnippetIds = [...snippetIdMap.keys()];
    for (const ids of chunksOf(oldSnippetIds)) {
      const placeholders = ids.map(() => '?').join(',');
      const rows = db
        .prepare(`SELECT * FROM snippet_embeddings WHERE snippet_id IN (${placeholders})`)
        .all(...ids) as EmbeddingRow[];
      for (const emb of rows) {
        const newSnippetId = snippetIdMap.get(emb.snippet_id);
        if (newSnippetId === undefined) continue; // defensive: ids came from snippetIdMap
        insertEmbedding.run(
          newSnippetId,
          emb.profile_id,
          emb.model,
          emb.dimensions,
          emb.embedding,
          emb.created_at
        );
        sqliteVecStore.upsertEmbeddingBuffer(
          emb.profile_id,
          newSnippetId,
          emb.embedding,
          emb.dimensions
        );
      }
    }
  })();
}
export function upsertEmbeddings(db: Database.Database, embeddings: PersistedEmbedding[]): void {
if (embeddings.length === 0) {
return;
}
const sqliteVecStore = new SqliteVecStore(db);
const insert = db.prepare<[string, string, string, number, Buffer]>(`
INSERT OR REPLACE INTO snippet_embeddings (snippet_id, profile_id, model, dimensions, embedding, created_at)
VALUES (?, ?, ?, ?, ?, unixepoch())
`);
db.transaction(() => {
for (const item of embeddings) {
const embeddingBuffer = Buffer.isBuffer(item.embedding)
? item.embedding
: Buffer.from(item.embedding);
insert.run(item.snippetId, item.profileId, item.model, item.dimensions, embeddingBuffer);
sqliteVecStore.upsertEmbeddingBuffer(
item.profileId,
item.snippetId,
embeddingBuffer,
item.dimensions
);
}
})();
}
export function upsertSerializedEmbeddings(
db: Database.Database,
embeddings: SerializedEmbedding[]
): void {
upsertEmbeddings(
db,
embeddings.map((item) => ({
snippetId: item.snippetId,
profileId: item.profileId,
model: item.model,
dimensions: item.dimensions,
embedding: item.embedding
}))
);
}
/**
 * Patch columns on a repositories row. camelCase field names are converted to
 * snake_case columns, and updated_at is always stamped with the current epoch
 * seconds (so the SET clause is never empty).
 *
 * NOTE(review): field keys are interpolated into the SQL text — assumes
 * SerializedFields keys are trusted internal identifiers, not user input;
 * confirm at call sites.
 */
export function updateRepo(
  db: Database.Database,
  repositoryId: string,
  fields: SerializedFields
): void {
  const stamped = { ...fields, updatedAt: Math.floor(Date.now() / 1000) };
  const entries = Object.entries(stamped);
  const assignments = entries.map(([key]) => `${toSnake(key)} = ?`).join(', ');
  const params = entries.map(([, value]) => value);
  db.prepare(`UPDATE repositories SET ${assignments} WHERE id = ?`).run(...params, repositoryId);
}
/**
 * Patch columns on an indexing_jobs row; camelCase field names are converted
 * to snake_case columns. No-op when `fields` is empty — the original built
 * `UPDATE indexing_jobs SET  WHERE id = ?`, a SQL syntax error at prepare
 * time.
 *
 * NOTE(review): field keys are interpolated into the SQL text — assumes
 * SerializedFields keys are trusted internal identifiers, not user input;
 * confirm at call sites.
 */
export function updateJob(db: Database.Database, jobId: string, fields: SerializedFields): void {
  const keys = Object.keys(fields);
  if (keys.length === 0) {
    return; // nothing to update; an empty SET clause would not prepare
  }
  const sets = keys.map((key) => `${toSnake(key)} = ?`).join(', ');
  const values = [...Object.values(fields), jobId];
  db.prepare(`UPDATE indexing_jobs SET ${sets} WHERE id = ?`).run(...values);
}
/**
 * Patch columns on a repository_versions row; camelCase field names are
 * converted to snake_case columns. No-op when `fields` is empty — the
 * original built `UPDATE repository_versions SET  WHERE id = ?`, a SQL
 * syntax error at prepare time.
 *
 * NOTE(review): field keys are interpolated into the SQL text — assumes
 * SerializedFields keys are trusted internal identifiers, not user input;
 * confirm at call sites.
 */
export function updateVersion(
  db: Database.Database,
  versionId: string,
  fields: SerializedFields
): void {
  const keys = Object.keys(fields);
  if (keys.length === 0) {
    return; // nothing to update; an empty SET clause would not prepare
  }
  const sets = keys.map((key) => `${toSnake(key)} = ?`).join(', ');
  const values = [...Object.values(fields), versionId];
  db.prepare(`UPDATE repository_versions SET ${sets} WHERE id = ?`).run(...values);
}