import { randomUUID } from 'node:crypto';
|
|
import type Database from 'better-sqlite3';
|
|
import type { NewDocument, NewSnippet } from '$lib/types';
|
|
import { SqliteVecStore } from '$lib/server/search/sqlite-vec.store.js';
|
|
import type {
|
|
SerializedDocument,
|
|
SerializedEmbedding,
|
|
SerializedFields,
|
|
SerializedSnippet
|
|
} from './worker-types.js';
|
|
|
|
/**
 * Minimal document shape accepted by the shared insert helpers.
 *
 * A subset of NewDocument, plus an `indexedAt` that may arrive either as a
 * Date (in-process callers) or as epoch seconds (serialized worker
 * payloads); it is normalized via toEpochSeconds before insertion.
 */
type DocumentLike = Pick<
  NewDocument,
  | 'id'
  | 'repositoryId'
  | 'versionId'
  | 'filePath'
  | 'title'
  | 'language'
  | 'tokenCount'
  | 'checksum'
> & {
  // Date or Unix epoch seconds — see toEpochSeconds.
  indexedAt: Date | number;
};
/**
 * Minimal snippet shape accepted by the shared insert helpers.
 *
 * A subset of NewSnippet, plus a `createdAt` that may arrive either as a
 * Date (in-process callers) or as epoch seconds (serialized worker
 * payloads); it is normalized via toEpochSeconds before insertion.
 */
type SnippetLike = Pick<
  NewSnippet,
  | 'id'
  | 'documentId'
  | 'repositoryId'
  | 'versionId'
  | 'type'
  | 'title'
  | 'content'
  | 'language'
  | 'breadcrumb'
  | 'tokenCount'
> & {
  // Date or Unix epoch seconds — see toEpochSeconds.
  createdAt: Date | number;
};
/**
 * Parameters for cloneFromAncestor: copy index rows from an already-indexed
 * ancestor version to a new target version for files that did not change.
 */
export interface CloneFromAncestorRequest {
  /** Version whose documents/snippets/embeddings are copied from. */
  ancestorVersionId: string;
  /** Version the cloned rows are attached to. */
  targetVersionId: string;
  /** Repository the cloned rows belong to. */
  repositoryId: string;
  /** File paths whose content is identical between the two versions. */
  unchangedPaths: string[];
}
/**
 * One embedding row to persist into `snippet_embeddings` and mirror into
 * the sqlite-vec store (see upsertEmbeddings).
 */
export interface PersistedEmbedding {
  snippetId: string;
  profileId: string;
  model: string;
  /** Vector dimensionality of `embedding`. */
  dimensions: number;
  /** Raw vector bytes; converted to a Buffer before insertion if needed. */
  embedding: Buffer | Uint8Array;
}
function toEpochSeconds(value: Date | number): number {
|
|
return value instanceof Date ? Math.floor(value.getTime() / 1000) : value;
|
|
}
|
|
|
|
function toSnake(key: string): string {
|
|
return key.replace(/[A-Z]/g, (char) => `_${char.toLowerCase()}`);
|
|
}
|
|
|
|
function replaceSnippetsInternal(
|
|
db: Database.Database,
|
|
changedDocIds: string[],
|
|
newDocuments: DocumentLike[],
|
|
newSnippets: SnippetLike[]
|
|
): void {
|
|
const sqliteVecStore = new SqliteVecStore(db);
|
|
const insertDoc = db.prepare(
|
|
`INSERT INTO documents
|
|
(id, repository_id, version_id, file_path, title, language,
|
|
token_count, checksum, indexed_at)
|
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`
|
|
);
|
|
|
|
const insertSnippet = db.prepare(
|
|
`INSERT INTO snippets
|
|
(id, document_id, repository_id, version_id, type, title,
|
|
content, language, breadcrumb, token_count, created_at)
|
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
|
|
);
|
|
|
|
db.transaction(() => {
|
|
sqliteVecStore.deleteEmbeddingsForDocumentIds(changedDocIds);
|
|
|
|
if (changedDocIds.length > 0) {
|
|
const placeholders = changedDocIds.map(() => '?').join(',');
|
|
db.prepare(`DELETE FROM documents WHERE id IN (${placeholders})`).run(...changedDocIds);
|
|
}
|
|
|
|
for (const doc of newDocuments) {
|
|
insertDoc.run(
|
|
doc.id,
|
|
doc.repositoryId,
|
|
doc.versionId ?? null,
|
|
doc.filePath,
|
|
doc.title ?? null,
|
|
doc.language ?? null,
|
|
doc.tokenCount ?? 0,
|
|
doc.checksum,
|
|
toEpochSeconds(doc.indexedAt)
|
|
);
|
|
}
|
|
|
|
for (const snippet of newSnippets) {
|
|
insertSnippet.run(
|
|
snippet.id,
|
|
snippet.documentId,
|
|
snippet.repositoryId,
|
|
snippet.versionId ?? null,
|
|
snippet.type,
|
|
snippet.title ?? null,
|
|
snippet.content,
|
|
snippet.language ?? null,
|
|
snippet.breadcrumb ?? null,
|
|
snippet.tokenCount ?? 0,
|
|
toEpochSeconds(snippet.createdAt)
|
|
);
|
|
}
|
|
})();
|
|
}
|
|
|
|
/**
 * Replaces the changed documents (and their snippets) in one transaction:
 * deletes vector embeddings and document rows for `changedDocIds`, then
 * inserts `newDocuments` and `newSnippets`.
 *
 * Typed entry point over replaceSnippetsInternal for in-process
 * NewDocument / NewSnippet values (Date timestamps).
 */
export function replaceSnippets(
  db: Database.Database,
  changedDocIds: string[],
  newDocuments: NewDocument[],
  newSnippets: NewSnippet[]
): void {
  replaceSnippetsInternal(db, changedDocIds, newDocuments, newSnippets);
}
/**
 * Same as replaceSnippets, but accepts worker-serialized payloads
 * (epoch-second timestamps instead of Date instances).
 */
export function replaceSerializedSnippets(
  db: Database.Database,
  changedDocIds: string[],
  documents: SerializedDocument[],
  snippets: SerializedSnippet[]
): void {
  replaceSnippetsInternal(db, changedDocIds, documents, snippets);
}
export function cloneFromAncestor(db: Database.Database, request: CloneFromAncestorRequest): void {
|
|
const sqliteVecStore = new SqliteVecStore(db);
|
|
const { ancestorVersionId, targetVersionId, repositoryId, unchangedPaths } = request;
|
|
|
|
db.transaction(() => {
|
|
const pathList = [...unchangedPaths];
|
|
if (pathList.length === 0) {
|
|
return;
|
|
}
|
|
|
|
const placeholders = pathList.map(() => '?').join(',');
|
|
const ancestorDocs = db
|
|
.prepare(`SELECT * FROM documents WHERE version_id = ? AND file_path IN (${placeholders})`)
|
|
.all(ancestorVersionId, ...pathList) as Array<{
|
|
id: string;
|
|
repository_id: string;
|
|
file_path: string;
|
|
title: string | null;
|
|
language: string | null;
|
|
token_count: number;
|
|
checksum: string;
|
|
indexed_at: number;
|
|
}>;
|
|
|
|
const docIdMap = new Map<string, string>();
|
|
const nowEpoch = Math.floor(Date.now() / 1000);
|
|
|
|
for (const doc of ancestorDocs) {
|
|
const newDocId = randomUUID();
|
|
docIdMap.set(doc.id, newDocId);
|
|
db.prepare(
|
|
`INSERT INTO documents (id, repository_id, version_id, file_path, title, language, token_count, checksum, indexed_at)
|
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`
|
|
).run(
|
|
newDocId,
|
|
repositoryId,
|
|
targetVersionId,
|
|
doc.file_path,
|
|
doc.title,
|
|
doc.language,
|
|
doc.token_count,
|
|
doc.checksum,
|
|
nowEpoch
|
|
);
|
|
}
|
|
|
|
if (docIdMap.size === 0) return;
|
|
|
|
const oldDocIds = [...docIdMap.keys()];
|
|
const snippetPlaceholders = oldDocIds.map(() => '?').join(',');
|
|
const ancestorSnippets = db
|
|
.prepare(`SELECT * FROM snippets WHERE document_id IN (${snippetPlaceholders})`)
|
|
.all(...oldDocIds) as Array<{
|
|
id: string;
|
|
document_id: string;
|
|
repository_id: string;
|
|
version_id: string | null;
|
|
type: string;
|
|
title: string | null;
|
|
content: string;
|
|
language: string | null;
|
|
breadcrumb: string | null;
|
|
token_count: number;
|
|
created_at: number;
|
|
}>;
|
|
|
|
const snippetIdMap = new Map<string, string>();
|
|
for (const snippet of ancestorSnippets) {
|
|
const newSnippetId = randomUUID();
|
|
snippetIdMap.set(snippet.id, newSnippetId);
|
|
const newDocId = docIdMap.get(snippet.document_id)!;
|
|
db.prepare(
|
|
`INSERT INTO snippets (id, document_id, repository_id, version_id, type, title, content, language, breadcrumb, token_count, created_at)
|
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`
|
|
).run(
|
|
newSnippetId,
|
|
newDocId,
|
|
repositoryId,
|
|
targetVersionId,
|
|
snippet.type,
|
|
snippet.title,
|
|
snippet.content,
|
|
snippet.language,
|
|
snippet.breadcrumb,
|
|
snippet.token_count,
|
|
snippet.created_at
|
|
);
|
|
}
|
|
|
|
if (snippetIdMap.size === 0) {
|
|
return;
|
|
}
|
|
|
|
const oldSnippetIds = [...snippetIdMap.keys()];
|
|
const embPlaceholders = oldSnippetIds.map(() => '?').join(',');
|
|
const ancestorEmbeddings = db
|
|
.prepare(`SELECT * FROM snippet_embeddings WHERE snippet_id IN (${embPlaceholders})`)
|
|
.all(...oldSnippetIds) as Array<{
|
|
snippet_id: string;
|
|
profile_id: string;
|
|
model: string;
|
|
dimensions: number;
|
|
embedding: Buffer;
|
|
created_at: number;
|
|
}>;
|
|
|
|
for (const emb of ancestorEmbeddings) {
|
|
const newSnippetId = snippetIdMap.get(emb.snippet_id)!;
|
|
db.prepare(
|
|
`INSERT INTO snippet_embeddings (snippet_id, profile_id, model, dimensions, embedding, created_at)
|
|
VALUES (?, ?, ?, ?, ?, ?)`
|
|
).run(newSnippetId, emb.profile_id, emb.model, emb.dimensions, emb.embedding, emb.created_at);
|
|
sqliteVecStore.upsertEmbeddingBuffer(
|
|
emb.profile_id,
|
|
newSnippetId,
|
|
emb.embedding,
|
|
emb.dimensions
|
|
);
|
|
}
|
|
})();
|
|
}
|
|
|
|
export function upsertEmbeddings(db: Database.Database, embeddings: PersistedEmbedding[]): void {
|
|
if (embeddings.length === 0) {
|
|
return;
|
|
}
|
|
|
|
const sqliteVecStore = new SqliteVecStore(db);
|
|
const insert = db.prepare<[string, string, string, number, Buffer]>(`
|
|
INSERT OR REPLACE INTO snippet_embeddings (snippet_id, profile_id, model, dimensions, embedding, created_at)
|
|
VALUES (?, ?, ?, ?, ?, unixepoch())
|
|
`);
|
|
|
|
db.transaction(() => {
|
|
for (const item of embeddings) {
|
|
const embeddingBuffer = Buffer.isBuffer(item.embedding)
|
|
? item.embedding
|
|
: Buffer.from(item.embedding);
|
|
|
|
insert.run(item.snippetId, item.profileId, item.model, item.dimensions, embeddingBuffer);
|
|
|
|
sqliteVecStore.upsertEmbeddingBuffer(
|
|
item.profileId,
|
|
item.snippetId,
|
|
embeddingBuffer,
|
|
item.dimensions
|
|
);
|
|
}
|
|
})();
|
|
}
|
|
|
|
export function upsertSerializedEmbeddings(
|
|
db: Database.Database,
|
|
embeddings: SerializedEmbedding[]
|
|
): void {
|
|
upsertEmbeddings(
|
|
db,
|
|
embeddings.map((item) => ({
|
|
snippetId: item.snippetId,
|
|
profileId: item.profileId,
|
|
model: item.model,
|
|
dimensions: item.dimensions,
|
|
embedding: item.embedding
|
|
}))
|
|
);
|
|
}
|
|
|
|
export function updateRepo(
|
|
db: Database.Database,
|
|
repositoryId: string,
|
|
fields: SerializedFields
|
|
): void {
|
|
const now = Math.floor(Date.now() / 1000);
|
|
const allFields = { ...fields, updatedAt: now };
|
|
const sets = Object.keys(allFields)
|
|
.map((key) => `${toSnake(key)} = ?`)
|
|
.join(', ');
|
|
const values = [...Object.values(allFields), repositoryId];
|
|
db.prepare(`UPDATE repositories SET ${sets} WHERE id = ?`).run(...values);
|
|
}
|
|
|
|
export function updateJob(db: Database.Database, jobId: string, fields: SerializedFields): void {
|
|
const sets = Object.keys(fields)
|
|
.map((key) => `${toSnake(key)} = ?`)
|
|
.join(', ');
|
|
const values = [...Object.values(fields), jobId];
|
|
db.prepare(`UPDATE indexing_jobs SET ${sets} WHERE id = ?`).run(...values);
|
|
}
|
|
|
|
export function updateVersion(
|
|
db: Database.Database,
|
|
versionId: string,
|
|
fields: SerializedFields
|
|
): void {
|
|
const sets = Object.keys(fields)
|
|
.map((key) => `${toSnake(key)} = ?`)
|
|
.join(', ');
|
|
const values = [...Object.values(fields), versionId];
|
|
db.prepare(`UPDATE repository_versions SET ${sets} WHERE id = ?`).run(...values);
|
|
}
|