From 929c4824973c6e6ceab56f1c4578f816aaafb24f Mon Sep 17 00:00:00 2001 From: Giancarmine Salucci Date: Tue, 12 May 2026 00:10:32 +0200 Subject: [PATCH] refactor(transcript): drop Tonemark rewrite Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- package-lock.json | 42 +++- package.json | 3 +- src/lib/server/downloader.ts | 100 +++----- src/lib/server/pipeline.ts | 6 +- src/lib/server/postprocess.ts | 235 ------------------ src/routes/api/jobs/[id]/reprocess/+server.ts | 6 +- src/routes/api/webhook/[jobId]/+server.ts | 4 +- src/tests/downloader.test.ts | 80 ++++++ src/tests/postprocess.test.ts | 204 --------------- src/tests/webhook.test.ts | 21 +- 10 files changed, 161 insertions(+), 540 deletions(-) delete mode 100644 src/lib/server/postprocess.ts create mode 100644 src/tests/downloader.test.ts delete mode 100644 src/tests/postprocess.test.ts diff --git a/package-lock.json b/package-lock.json index cf35f81..417b395 100644 --- a/package-lock.json +++ b/package-lock.json @@ -12,7 +12,8 @@ "better-sqlite3": "^12.9.0", "form-data": "^4.0.5", "node-fetch": "^3.3.2", - "web-push": "^3.6.7" + "web-push": "^3.6.7", + "youtube-transcript": "^1.3.1" }, "devDependencies": { "@sveltejs/adapter-auto": "^7.0.1", @@ -89,6 +90,27 @@ "node": ">=18" } }, + "node_modules/@emnapi/core": { + "version": "1.10.0", + "resolved": "https://registry.npmjs.org/@emnapi/core/-/core-1.10.0.tgz", + "integrity": "sha512-yq6OkJ4p82CAfPl0u9mQebQHKPJkY7WrIuk205cTYnYe+k2Z8YBh11FrbRG/H6ihirqcacOgl2BIO8oyMQLeXw==", + "license": "MIT", + "optional": true, + "dependencies": { + "@emnapi/wasi-threads": "1.2.1", + "tslib": "^2.4.0" + } + }, + "node_modules/@emnapi/runtime": { + "version": "1.10.0", + "resolved": "https://registry.npmjs.org/@emnapi/runtime/-/runtime-1.10.0.tgz", + "integrity": "sha512-ewvYlk86xUoGI0zQRNq/mC+16R1QeDlKQy21Ki3oSYXNgLb45GV1P6A0M+/s6nyCuNDqe5VpaY84BzXGwVbwFA==", + "license": "MIT", + "optional": true, + "dependencies": { + "tslib": "^2.4.0" + } + }, "node_modules/@emnapi/wasi-threads": { "version": "1.2.1", "resolved": "https://registry.npmjs.org/@emnapi/wasi-threads/-/wasi-threads-1.2.1.tgz", @@ -896,7 +918,6 @@ "resolved": "https://registry.npmjs.org/@sveltejs/kit/-/kit-2.59.1.tgz", "integrity": "sha512-d8OON70AphLdDesuTIl//M2O6fRTIicX8aYv8vhCiYEhTTI2OboKqey0Hu1A4VFhqwgqtq0vKDmPFGkw8kKmgw==", "license": "MIT", - "peer": true, "dependencies": { "@standard-schema/spec": "^1.0.0", "@sveltejs/acorn-typescript": "^1.0.5", @@ -938,7 +959,6 @@ "resolved": "https://registry.npmjs.org/@sveltejs/vite-plugin-svelte/-/vite-plugin-svelte-7.1.1.tgz", "integrity": "sha512-FOJdbE5pxae68DoTBJ49t1dIA7TSmMHR6CsuJhX90cO/UfrEMHA7KJNUj3WdZuUDJPu4ujqpJ2Tgqd2gTWr6Xg==", "license": "MIT", - "peer": true, "dependencies": { "deepmerge": "^4.3.1", "magic-string": "^0.30.21", @@ -1313,7 +1333,6 @@ "integrity": "sha512-38C0/Ddb7HcRG0Z4/DUem8x57d2p9jYgp18mkaYswEOQBGsI1CG4f/hjm0ZCeaJfWhSZ4k7jgs29V1Zom7Ki9A==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "@bcoe/v8-coverage": "^1.0.2", "@vitest/utils": "4.1.5", @@ -1467,7 +1486,6 @@ "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.16.0.tgz", "integrity": "sha512-UVJyE9MttOsBQIDKw1skb9nAwQuR5wuGD3+82K6JgJlm/Y+KI92oNsMNGZCYdDsVtRHSak0pcV5Dno5+4jh9sw==", "license": "MIT", - "peer": true, "bin": { "acorn": "bin/acorn" }, @@ -3021,7 +3039,6 @@ "resolved": "https://registry.npmjs.org/rollup/-/rollup-4.60.3.tgz", "integrity": "sha512-pAQK9HalE84QSm4Po3EmWIZPd3FnjkShVkiMlz1iligWYkWQ7wHYd1PF/T7QZ5TVSD6uSTon5gBVMSM4JfBV+A==", "license": "MIT", - "peer": true, "dependencies": { "@types/estree": "1.0.8" }, @@ -3255,7 +3272,6 @@ "resolved": "https://registry.npmjs.org/svelte/-/svelte-5.55.5.tgz", "integrity": "sha512-2uCs/LZ9us+AktdzYJM8OcxQ8qnPS1kpaO7syGT/MgO+6Qr1Ybl+TqPq+97u7PHqmmMlye5ZkoyXONy5mjjAbw==", "license": "MIT", - "peer": true, "dependencies": { "@jridgewell/remapping": "^2.3.4", "@jridgewell/sourcemap-codec": "^1.5.0", @@ -3428,7 +3444,6 @@ "integrity": "sha512-y2TvuxSZPDyQakkFRPZHKFm+KKVqIisdg9/CZwm9ftvKXLP8NRWj38/ODjNbr43SsoXqNuAisEf1GdCxqWcdBw==", "devOptional": true, "license": "Apache-2.0", - "peer": true, "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" @@ -3455,7 +3470,6 @@ "resolved": "https://registry.npmjs.org/vite/-/vite-8.0.10.tgz", "integrity": "sha512-rZuUu9j6J5uotLDs+cAA4O5H4K1SfPliUlQwqa6YEwSrWDZzP4rhm00oJR5snMewjxF5V/K3D4kctsUTsIU9Mw==", "license": "MIT", - "peer": true, "dependencies": { "lightningcss": "^1.32.0", "picomatch": "^4.0.4", @@ -3553,7 +3567,6 @@ "integrity": "sha512-9Xx1v3/ih3m9hN+SbfkUyy0JAs72ap3r7joc87XL6jwF0jGg6mFBvQ1SrwaX+h8BlkX6Hz9shdd1uo6AF+ZGpg==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "@vitest/expect": "4.1.5", "@vitest/mocker": "4.1.5", @@ -3689,6 +3702,15 @@ "integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==", "license": "ISC" }, + "node_modules/youtube-transcript": { + "version": "1.3.1", + "resolved": "https://registry.npmjs.org/youtube-transcript/-/youtube-transcript-1.3.1.tgz", + "integrity": "sha512-NDCjwad113TGybbYF51y9Z4tcwzBHUZWQdF9veULNca18L+FdDbHHtTHIr69WVa3bB90l67S8kN0HtL2JO9fhg==", + "license": "MIT", + "engines": { + "node": ">=18.0.0" + } + }, "node_modules/zimmerframe": { "version": "1.1.4", "resolved": "https://registry.npmjs.org/zimmerframe/-/zimmerframe-1.1.4.tgz", diff --git a/package.json b/package.json index 2b19247..e92cda6 100644 --- a/package.json +++ b/package.json @@ -34,6 +34,7 @@ "better-sqlite3": "^12.9.0", "form-data": "^4.0.5", "node-fetch": "^3.3.2", - "web-push": "^3.6.7" + "web-push": "^3.6.7", + "youtube-transcript": "^1.3.1" } } diff --git a/src/lib/server/downloader.ts b/src/lib/server/downloader.ts index 5899caa..a641a59 100644 --- a/src/lib/server/downloader.ts +++ b/src/lib/server/downloader.ts @@ -1,8 +1,9 @@ import { execFile } from 'child_process'; import { promisify } from 'util'; import { existsSync } from 'fs'; -import { mkdir, unlink, writeFile } from 'fs/promises'; +import { mkdir, writeFile } from 'fs/promises'; import { join } from 'path'; +import { fetchTranscript, type TranscriptResponse } from 'youtube-transcript'; const execFileAsync = promisify(execFile); const TMP_DIR = join(process.env.DATA_DIR ?? '/tmp/.whisper-pwa', 'downloads'); @@ -26,43 +27,33 @@ export interface AudioResult { export type DownloadResult = CaptionResult | AudioResult; /** Try to get auto-generated captions from YouTube. Returns null if unavailable. */ -async function tryGetCaptions(url: string, outDir: string): Promise { - const jsonPath = join(outDir, 'info.json'); +async function tryGetCaptions(url: string, _outDir: string): Promise { try { - await execFileAsync('yt-dlp', [ - '--write-auto-subs', - '--sub-langs', 'en.*', - '--skip-download', - '--write-info-json', - '--no-playlist', - '-o', join(outDir, '%(title)s.%(ext)s'), - url - ]); - - // Find the VTT/SRT file - const { readdirSync } = await import('fs'); - const files = readdirSync(outDir); - const vttFile = files.find((f) => f.endsWith('.vtt') || f.endsWith('.srt')); - if (!vttFile) return null; - - let title = 'Untitled'; - if (existsSync(jsonPath)) { - try { - const info = JSON.parse((await import('fs')).readFileSync(jsonPath, 'utf8')); - title = info.title ?? title; - } catch { /* ignore */ } - } - - const content = (await import('fs')).readFileSync(join(outDir, vttFile), 'utf8'); - const segments = parseVtt(content); + const transcript = await fetchTranscript(url, { lang: 'en' }); + const segments = transcriptEntriesToSegments(transcript); if (segments.length === 0) return null; + const title = await getYouTubeTitle(url); return { type: 'captions', segments, title }; } catch { return null; } } +async function getYouTubeTitle(url: string): Promise { + try { + const { stdout } = await execFileAsync('yt-dlp', [ + '--dump-single-json', + '--skip-download', + '--no-playlist', + url + ]); + return JSON.parse(stdout).title ?? 'Untitled'; + } catch { + return 'Untitled'; + } +} + /** Download best audio from YouTube. Returns path to audio file. */ async function downloadAudio(url: string, outDir: string): Promise<{ audioPath: string; title: string }> { await execFileAsync('yt-dlp', [ @@ -124,39 +115,22 @@ export async function cleanupJobTmp(jobId: string) { } catch { /* ignore */ } } -/** Parse a WebVTT string into segments. */ -function parseVtt( - content: string +export function transcriptEntriesToSegments( + entries: TranscriptResponse[] ): Array<{ index: number; start: number; end: number; text: string; words: [] }> { - const segments: Array<{ index: number; start: number; end: number; text: string; words: [] }> = []; - const blocks = content.split(/\n\n+/); - let index = 0; - - for (const block of blocks) { - const lines = block.trim().split('\n'); - const timeLine = lines.find((l) => l.includes('-->')); - if (!timeLine) continue; - - const [startStr, endStr] = timeLine.split('-->').map((s) => s.trim().split(' ')[0]); - const start = vttTimeToSec(startStr); - const end = vttTimeToSec(endStr); - const text = lines - .filter((l) => !l.includes('-->') && !/^\d+$/.test(l.trim()) && l.trim()) - .join(' ') - .replace(/<[^>]+>/g, '') - .trim(); - - if (text) { - segments.push({ index: index++, start, end, text, words: [] }); - } - } - - return segments; -} - -function vttTimeToSec(t: string): number { - const parts = t.split(':').map(Number); - if (parts.length === 3) return parts[0] * 3600 + parts[1] * 60 + parts[2]; - if (parts.length === 2) return parts[0] * 60 + parts[1]; - return parts[0]; + const useMilliseconds = entries.some((entry) => entry.offset > 1000 || entry.duration > 1000); + return entries + .map((entry) => { + const start = useMilliseconds ? entry.offset / 1000 : entry.offset; + const duration = useMilliseconds ? entry.duration / 1000 : entry.duration; + return { + index: 0, + start, + end: start + duration, + text: entry.text.trim(), + words: [] as [] + }; + }) + .filter((entry) => entry.text.length > 0) + .map((entry, index) => ({ ...entry, index })); } diff --git a/src/lib/server/pipeline.ts b/src/lib/server/pipeline.ts index cdd724a..1e8f58f 100644 --- a/src/lib/server/pipeline.ts +++ b/src/lib/server/pipeline.ts @@ -96,15 +96,13 @@ async function runJob( if (captionSegments) { // Caption fast path — skip whisper - const { deduplicateSegments } = await import('./postprocess.js'); const { writeOutputs } = await import('./formatter.js'); - const segments = deduplicateSegments(captionSegments); - const paths = await writeOutputs(segments, title, jobId); + const paths = await writeOutputs(captionSegments, title, jobId); updateJob({ id: jobId, status: 'done', progress: 100, - segmentsJson: JSON.stringify(segments), + segmentsJson: JSON.stringify(captionSegments), outputDir: paths.srt.replace(/\/[^/]+$/, '') }); emitProgress(jobId, { type: 'done' }); diff --git a/src/lib/server/postprocess.ts b/src/lib/server/postprocess.ts deleted file mode 100644 index 49c1cf1..0000000 --- a/src/lib/server/postprocess.ts +++ /dev/null @@ -1,235 +0,0 @@ -import type { Segment } from '$lib/types.js'; - -// ── Collapse consecutive repeated phrases within a segment's text ──────────── - -function collapseRepeats(text: string): string { - let prev = ''; - // Keep applying until stable - while (true) { - const next = collapseOnce(text); - if (next === prev || next === text) return next; - prev = text; - text = next; - } -} - -function collapseOnce(text: string): string { - // Match any repeated phrase (2+ words) appearing consecutively - return text.replace(/\b(.{10,}?)\s+\1\b/gi, '$1'); -} - -// ── Merge consecutive segments with identical (or near-identical) text ─────── - -function normalise(s: string) { - return s.toLowerCase().replace(/[^\w\s]/g, '').replace(/\s+/g, ' ').trim(); -} - -function mergeConsecutive(segments: Segment[]): Segment[] { - const out: Segment[] = []; - for (const seg of segments) { - const last = out[out.length - 1]; - if (last && normalise(last.text) === normalise(seg.text)) { - last.end = seg.end; - } else { - out.push({ ...seg }); - } - } - return out; -} - -// ── Collapse rolling prefix/suffix chains from backend segment hypotheses ────── - -const MAX_CHAIN_GAP_SECS = 0.15; -const MIN_MEANINGFUL_WORDS = 2; -const MIN_MEANINGFUL_CHARS = 8; -const MIN_OVERLAP_WORDS = 1; - -function splitWords(text: string): string[] { - return text.trim().split(/\s+/).filter(Boolean); -} - -function normaliseWords(text: string): string[] { - return splitWords(text) - .map((word) => word.toLowerCase().replace(/[^\w]/g, '')) - .filter(Boolean); -} - -function arraysEqual(a: string[], b: string[]): boolean { - return a.length === b.length && a.every((value, index) => value === b[index]); -} - -function startsWithWords(full: string[], prefix: string[]): boolean { - return prefix.length <= full.length && arraysEqual(full.slice(0, prefix.length), prefix); -} - -function endsWithWords(full: string[], suffix: string[]): boolean { - return suffix.length <= full.length && arraysEqual(full.slice(full.length - suffix.length), suffix); -} - -function suffixPrefixOverlap(left: string[], right: string[]): number { - const max = Math.min(left.length, right.length); - for (let size = max; size >= 1; size--) { - if (arraysEqual(left.slice(left.length - size), right.slice(0, size))) return size; - } - return 0; -} - -function isMeaningfulPhrase(words: string[]): boolean { - return words.length >= MIN_MEANINGFUL_WORDS && words.join(' ').length >= MIN_MEANINGFUL_CHARS; -} - -function isShortCarryover(seg: Segment, words: string[]): boolean { - return seg.end - seg.start <= 0.2 || words.length <= 2 || words.join(' ').length <= 16; -} - -function trimLeadingWords(text: string, count: number): string { - return splitWords(text).slice(count).join(' ').trim(); -} - -function collapseIncrementalSegments(segments: Segment[]): Segment[] { - const out: Segment[] = []; - - for (const seg of segments) { - let current: Segment = { - ...seg, - text: seg.text.trim() - }; - - if (!current.text) continue; - - const last = out[out.length - 1]; - if (!last) { - out.push(current); - continue; - } - - const gap = current.start - last.end; - if (gap > MAX_CHAIN_GAP_SECS) { - out.push(current); - continue; - } - - const lastWords = normaliseWords(last.text); - const currentWords = normaliseWords(current.text); - if (lastWords.length === 0 || currentWords.length === 0) { - out.push(current); - continue; - } - - if ( - currentWords.length > lastWords.length && - startsWithWords(currentWords, lastWords) && - (isMeaningfulPhrase(lastWords) || isShortCarryover(last, lastWords)) - ) { - last.text = current.text; - last.end = current.end; - last.words = current.words; - continue; - } - - if ( - endsWithWords(lastWords, currentWords) && - (isMeaningfulPhrase(currentWords) || isShortCarryover(current, currentWords)) - ) { - last.end = Math.max(last.end, current.end); - continue; - } - - const overlapWords = suffixPrefixOverlap(lastWords, currentWords); - if (overlapWords >= MIN_OVERLAP_WORDS) { - const trimmedText = trimLeadingWords(current.text, overlapWords); - if (!trimmedText) { - last.end = Math.max(last.end, current.end); - continue; - } - - current = { - ...current, - start: Math.max(current.start, last.end), - text: trimmedText, - words: [] - }; - } - - out.push(current); - } - - return out; -} - -// ── N-gram deduplication ───────────────────────────────────────────────────── - -const NGRAM_N = 6; -const LOOKBACK_CHARS = 500; -const SIMILARITY_THRESHOLD = 0.6; - -function ngrams(text: string, n: number): string[] { - const words = text.toLowerCase().split(/\s+/); - const grams: string[] = []; - for (let i = 0; i <= words.length - n; i++) { - grams.push(words.slice(i, i + n).join(' ')); - } - return grams; -} - -function jaccardSimilarity(a: string, b: string): number { - const ga = new Set(ngrams(a, NGRAM_N)); - const gb = new Set(ngrams(b, NGRAM_N)); - // If neither text is long enough to produce n-grams they cannot be compared; - // treat as dissimilar so short segments are never incorrectly discarded. - if (ga.size === 0 && gb.size === 0) return 0; - const intersection = [...ga].filter((g) => gb.has(g)).length; - const union = new Set([...ga, ...gb]).size; - return union === 0 ? 0 : intersection / union; -} - -function ngramDedup(segments: Segment[]): Segment[] { - const out: Segment[] = []; - for (const seg of segments) { - const windowText = out - .slice(-20) - .map((s) => s.text) - .join(' ') - .slice(-LOOKBACK_CHARS); - - if (windowText.length > 0 && jaccardSimilarity(seg.text, windowText) >= SIMILARITY_THRESHOLD) { - continue; // duplicate — skip - } - out.push(seg); - } - return out; -} - -// ── Full deduplication pipeline ────────────────────────────────────────────── - -export function deduplicateSegments(segments: Segment[]): Segment[] { - if (!Array.isArray(segments)) return []; - // 1. Collapse repeats within each segment's text - let result = segments.map((s) => ({ - ...s, - text: collapseRepeats(s.text.trim()) - })); - - // 2. Remove empty segments - result = result.filter((s) => s.text.length > 0); - - // 3. Collapse rolling backend hypotheses before generic dedup - result = collapseIncrementalSegments(result); - - // 4. First merge pass - result = mergeConsecutive(result); - - // 5. N-gram dedup - result = ngramDedup(result); - - // 6. Re-run rolling collapse after removals create new adjacencies - result = collapseIncrementalSegments(result); - - // 7. Second merge pass (catches new adjacencies after dedup) - result = mergeConsecutive(result); - - // 8. Re-index - result.forEach((s, i) => (s.index = i)); - - return result; -} diff --git a/src/routes/api/jobs/[id]/reprocess/+server.ts b/src/routes/api/jobs/[id]/reprocess/+server.ts index 0f74f5b..15f9aa1 100644 --- a/src/routes/api/jobs/[id]/reprocess/+server.ts +++ b/src/routes/api/jobs/[id]/reprocess/+server.ts @@ -1,10 +1,9 @@ import { json, error } from '@sveltejs/kit'; import { getJob, updateJob } from '$lib/server/db.js'; -import { deduplicateSegments } from '$lib/server/postprocess.js'; import { writeOutputs } from '$lib/server/formatter.js'; import type { Segment } from '$lib/types.js'; -/** POST /api/jobs/[id]/reprocess — re-run post-processing and regenerate all output files. */ +/** POST /api/jobs/[id]/reprocess — regenerate output files from stored canonical segments. */ export async function POST({ params }) { const job = getJob(params.id); if (!job) throw error(404, 'Job not found'); @@ -14,8 +13,7 @@ export async function POST({ params }) { } try { - const rawSegments = JSON.parse(job.segmentsJson) as Segment[]; - const segments = deduplicateSegments(rawSegments); + const segments = JSON.parse(job.segmentsJson) as Segment[]; const paths = await writeOutputs(segments, job.title, job.id); const outputDir = paths.srt.replace(/\/[^/]+$/, ''); diff --git a/src/routes/api/webhook/[jobId]/+server.ts b/src/routes/api/webhook/[jobId]/+server.ts index 5d89d0f..e1fdb56 100644 --- a/src/routes/api/webhook/[jobId]/+server.ts +++ b/src/routes/api/webhook/[jobId]/+server.ts @@ -1,6 +1,5 @@ import { json, error } from '@sveltejs/kit'; import { getJob, updateJob, setJobStatus } from '$lib/server/db.js'; -import { deduplicateSegments } from '$lib/server/postprocess.js'; import { writeOutputs } from '$lib/server/formatter.js'; import { sendNotification } from '$lib/server/push.js'; import { cleanupJobTmp } from '$lib/server/downloader.js'; @@ -40,8 +39,7 @@ try { setJobStatus(jobId, 'processing', 90); emitProgress(jobId, { type: 'status', status: 'processing', progress: 90 }); -const rawSegments = (whisperJob.segments ?? []) as Segment[]; -const segments = deduplicateSegments(rawSegments); +const segments = (whisperJob.segments ?? []) as Segment[]; const paths = await writeOutputs(segments, job.title, jobId); const outputDir = paths.srt.replace(/\/[^/]+$/, ''); diff --git a/src/tests/downloader.test.ts b/src/tests/downloader.test.ts new file mode 100644 index 0000000..c04fac2 --- /dev/null +++ b/src/tests/downloader.test.ts @@ -0,0 +1,80 @@ +import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; +import { rm } from 'fs/promises'; +import type { TranscriptResponse } from 'youtube-transcript'; + +const { mockExecFile, mockFetchTranscript } = vi.hoisted(() => ({ + mockExecFile: vi.fn(), + mockFetchTranscript: vi.fn() +})); + +const TEST_DATA_DIR = `/tmp/tonemark-downloader-test-${Date.now()}`; +vi.stubEnv('DATA_DIR', TEST_DATA_DIR); + +vi.mock('child_process', () => ({ + execFile: mockExecFile +})); + +vi.mock('youtube-transcript', () => ({ + fetchTranscript: mockFetchTranscript +})); + +import { downloadYouTube, transcriptEntriesToSegments } from '$lib/server/downloader.js'; + +beforeEach(() => { + vi.clearAllMocks(); + mockExecFile.mockImplementation((...args: unknown[]) => { + const cb = args.at(-1) as (...callbackArgs: unknown[]) => void; + cb(null, JSON.stringify({ title: 'Fetched Title' }), ''); + }); +}); + +afterEach(async () => { + await rm(TEST_DATA_DIR, { recursive: true, force: true }).catch(() => {}); +}); + +describe('transcriptEntriesToSegments', () => { + it('converts millisecond transcript offsets into second-based segments', () => { + const entries: TranscriptResponse[] = [ + { text: 'Hello everyone.', offset: 15240, duration: 4240, lang: 'en' }, + { text: 'Um, welcome to this talk.', offset: 16600, duration: 5080, lang: 'en' } + ]; + + expect(transcriptEntriesToSegments(entries)).toEqual([ + { index: 0, start: 15.24, end: 19.48, text: 'Hello everyone.', words: [] }, + { index: 1, start: 16.6, end: 21.68, text: 'Um, welcome to this talk.', words: [] } + ]); + }); + + it('preserves second-based transcript offsets and drops empty text', () => { + const entries: TranscriptResponse[] = [ + { text: ' ', offset: 0, duration: 1.5, lang: 'en' }, + { text: 'Clean caption cue', offset: 91.08, duration: 3.72, lang: 'en' } + ]; + + expect(transcriptEntriesToSegments(entries)).toEqual([ + { index: 0, start: 91.08, end: 94.8, text: 'Clean caption cue', words: [] } + ]); + }); +}); + +describe('downloadYouTube', () => { + it('uses fetched transcript entries directly for caption jobs', async () => { + mockFetchTranscript.mockResolvedValue([ + { text: 'Hello everyone.', offset: 15240, duration: 4240, lang: 'en' }, + { text: 'Um, welcome to this talk.', offset: 16600, duration: 5080, lang: 'en' } + ] satisfies TranscriptResponse[]); + + const result = await downloadYouTube('https://youtube.com/watch?v=qdh_x-uRs9g', 'job-1'); + + expect(mockFetchTranscript).toHaveBeenCalledWith('https://youtube.com/watch?v=qdh_x-uRs9g', { + lang: 'en' + }); + expect(result).toMatchObject({ + type: 'captions', + segments: [ + { index: 0, start: 15.24, end: 19.48, text: 'Hello everyone.', words: [] }, + { index: 1, start: 16.6, end: 21.68, text: 'Um, welcome to this talk.', words: [] } + ] + }); + }); +}); diff --git a/src/tests/postprocess.test.ts b/src/tests/postprocess.test.ts deleted file mode 100644 index a74dd14..0000000 --- a/src/tests/postprocess.test.ts +++ /dev/null @@ -1,204 +0,0 @@ -import { describe, it, expect } from 'vitest'; -import { - deduplicateSegments -} from '$lib/server/postprocess.js'; -import type { Segment } from '$lib/types.js'; - -// ── helpers ────────────────────────────────────────────────────────────────── - -function seg(index: number, start: number, end: number, text: string): Segment { - return { index, start, end, text, words: [] }; -} - -// ── collapseRepeats (tested indirectly via deduplicateSegments) ─────────────── - -describe('deduplicateSegments — collapseRepeats', () => { - it('leaves text without repetition unchanged', () => { - const input = [seg(0, 0, 5, ' Hello world, this is a sentence.')]; - const [out] = deduplicateSegments(input); - expect(out.text).toBe('Hello world, this is a sentence.'); - }); - - it('collapses a consecutive repeated phrase inside a segment', () => { - const input = [seg(0, 0, 5, ' the quick brown fox the quick brown fox')]; - const [out] = deduplicateSegments(input); - expect(out.text).not.toMatch(/the quick brown fox.*the quick brown fox/i); - }); - - it('handles multiple repetitions recursively', () => { - // "welcome everyone" = 16 chars — qualifies for the ≥10-char collapse regex - const input = [seg(0, 0, 5, ' welcome everyone welcome everyone welcome everyone')]; - const result = deduplicateSegments(input); - const text = result[0]?.text ?? ''; - expect((text.match(/welcome everyone/gi) ?? []).length).toBeLessThan(3); - }); -}); - -// ── mergeConsecutive ────────────────────────────────────────────────────────── - -describe('deduplicateSegments — mergeConsecutive', () => { - it('merges adjacent segments with identical text', () => { - const input = [ - seg(0, 0, 2, ' Hello world.'), - seg(1, 2, 4, ' Hello world.') - ]; - const result = deduplicateSegments(input); - expect(result).toHaveLength(1); - expect(result[0].end).toBe(4); - }); - - it('keeps adjacent segments with different text', () => { - const input = [ - seg(0, 0, 2, ' First sentence.'), - seg(1, 2, 4, ' Second sentence.') - ]; - const result = deduplicateSegments(input); - expect(result).toHaveLength(2); - }); - - it('normalises punctuation and case for merge comparison', () => { - const input = [ - seg(0, 0, 2, ' Hello, World!'), - seg(1, 2, 4, ' hello world') - ]; - const result = deduplicateSegments(input); - expect(result).toHaveLength(1); - }); -}); - -// ── rolling prefix/suffix chain collapse ─────────────────────────────────────── - -describe('deduplicateSegments — rolling backend hypotheses', () => { - it('collapses prefix-growth chains from stored backend segments', () => { - const input = [ - seg(0, 15.24, 16.6, 'Hello everyone.'), - seg(1, 16.6, 19.47, 'Hello everyone. Um, welcome to this talk.'), - seg(2, 19.47, 19.48, 'Um, welcome to this talk.'), - seg(3, 19.48, 21.67, "Um, welcome to this talk. I'll be speaking about small model"), - seg(4, 21.67, 21.68, "I'll be speaking about small model"), - seg(5, 21.68, 24.59, "I'll be speaking about small model inference and a gap that we've") - ]; - - const result = deduplicateSegments(input); - - expect(result).toHaveLength(2); - expect(result[0]).toMatchObject({ - index: 0, - start: 15.24, - end: 19.48, - text: 'Hello everyone. Um, welcome to this talk.' - }); - expect(result[1]).toMatchObject({ - index: 1, - start: 19.48, - end: 24.59, - text: "I'll be speaking about small model inference and a gap that we've" - }); - }); - - it('does not collapse similar phrases when there is a real timing gap', () => { - const input = [ - seg(0, 0, 1, 'Hello everyone.'), - seg(1, 2, 4, 'Hello everyone. Welcome back.') - ]; - - const result = deduplicateSegments(input); - - expect(result).toHaveLength(2); - expect(result[0].text).toBe('Hello everyone.'); - expect(result[1].text).toBe('Hello everyone. Welcome back.'); - }); - - it('collapses tiny one-word carry-over segments from caption-style output', () => { - const input = [ - seg(0, 94.8, 96.4, 'world.'), - seg(1, 96.4, 98.96, 'world. And that aspect that I overlooked was'), - seg(2, 98.96, 100.72, 'inference.'), - seg(3, 100.72, 103.92, 'inference. So, as someone who kind of wants to'), - seg(4, 107.19, 107.2, 'and'), - seg(5, 107.2, 109.56, 'and work to understand the problems and the') - ]; - - const result = deduplicateSegments(input); - - expect(result).toHaveLength(3); - expect(result[0].text).toBe('world. And that aspect that I overlooked was'); - expect(result[1].text).toBe('inference. So, as someone who kind of wants to'); - expect(result[2].text).toBe('and work to understand the problems and the'); - }); - - it('trims single-word suffix-prefix overlap between adjacent segments', () => { - const input = [ - seg(0, 94.8, 96.4, 'world.'), - seg(1, 96.4, 98.96, 'world. And that aspect that I overlooked was'), - seg(2, 120.12, 123.71, 'to find more about inference.'), - seg(3, 123.72, 126.92, "inference. So, I've done a lot of work with VLAM,") - ]; - - const result = deduplicateSegments(input); - - expect(result).toHaveLength(3); - expect(result[0].text).toBe('world. And that aspect that I overlooked was'); - expect(result[2].text).toBe("So, I've done a lot of work with VLAM,"); - }); -}); - -// ── ngramDedup ──────────────────────────────────────────────────────────────── - -describe('deduplicateSegments — ngramDedup', () => { - it('passes through completely unique segments', () => { - const input = [ - seg(0, 0, 5, ' The cat sat on the mat quite happily today.'), - seg(1, 5, 10, ' Later the dog ran across the yard chasing a ball.') - ]; - expect(deduplicateSegments(input)).toHaveLength(2); - }); - - it('removes a segment that is highly similar to recent context', () => { - // Repeat a long sentence verbatim — should be caught as duplicate - const longText = - ' This is a very specific and unique sentence about transcription quality matters greatly.'; - const input = [seg(0, 0, 5, longText), seg(1, 5, 10, longText)]; - // After mergeConsecutive the second one is already merged, so result is 1 - expect(deduplicateSegments(input)).toHaveLength(1); - }); -}); - -// ── deduplicateSegments — full pipeline ────────────────────────────────────── - -describe('deduplicateSegments — full pipeline', () => { - it('returns empty array for empty input', () => { - expect(deduplicateSegments([])).toEqual([]); - }); - - it('removes segments whose text is empty after trimming', () => { - const input = [seg(0, 0, 1, ' '), seg(1, 1, 2, ' Hello.')]; - const result = deduplicateSegments(input); - expect(result).toHaveLength(1); - expect(result[0].text).toBe('Hello.'); - }); - - it('re-indexes output segments starting from 0', () => { - const input = [ - seg(5, 0, 2, ' First unique sentence here.'), - seg(8, 2, 4, ' Second different sentence there.') - ]; - const result = deduplicateSegments(input); - result.forEach((s, i) => expect(s.index).toBe(i)); - }); - - it('runs the full pipeline: trim → remove empty → merge → ngram → merge → reindex', () => { - const input = [ - seg(0, 0, 2, ' Good morning everyone.'), - seg(1, 2, 3, ' '), // empty — removed - seg(2, 3, 5, ' Good morning everyone.'), // duplicate — merged - seg(3, 5, 7, ' Welcome to our presentation today.') - ]; - const result = deduplicateSegments(input); - expect(result).toHaveLength(2); - expect(result[0].text).toBe('Good morning everyone.'); - expect(result[1].text).toBe('Welcome to our presentation today.'); - expect(result[0].index).toBe(0); - expect(result[1].index).toBe(1); - }); -}); diff --git a/src/tests/webhook.test.ts b/src/tests/webhook.test.ts index 105951c..8e7982f 100644 --- a/src/tests/webhook.test.ts +++ b/src/tests/webhook.test.ts @@ -7,7 +7,6 @@ const { mockGetJob, mockUpdateJob, mockSetJobStatus, - mockDeduplicateSegments, mockWriteOutputs, mockSendNotification, mockCleanupJobTmp, @@ -16,7 +15,6 @@ const { mockGetJob: vi.fn(), mockUpdateJob: vi.fn(), mockSetJobStatus: vi.fn(), - mockDeduplicateSegments: vi.fn((segs: Segment[]) => segs), mockWriteOutputs: vi.fn(), mockSendNotification: vi.fn(), mockCleanupJobTmp: vi.fn(), @@ -29,10 +27,6 @@ vi.mock('$lib/server/db.js', () => ({ setJobStatus: mockSetJobStatus })); -vi.mock('$lib/server/postprocess.js', () => ({ - deduplicateSegments: mockDeduplicateSegments -})); - vi.mock('$lib/server/formatter.js', () => ({ writeOutputs: mockWriteOutputs })); @@ -91,7 +85,6 @@ function makeSeg(index: number, text: string): Segment { beforeEach(() => { vi.clearAllMocks(); - mockDeduplicateSegments.mockImplementation((segs: Segment[]) => segs); mockWriteOutputs.mockResolvedValue({ srt: '/out/dir/title.srt', txt: '/out/dir/title.txt', @@ -218,25 +211,21 @@ describe('POST /api/webhook/[jobId] — whisper failure', () => { describe('POST /api/webhook/[jobId] — success with segments', () => { const segments = [makeSeg(0, 'Hello world.'), makeSeg(1, 'This is a test.')]; - it('runs deduplication on received segments', async () => { + it('passes received segments through unchanged', async () => { mockGetJob.mockReturnValue(makeJob('job-3')); await POST(makeEvent('job-3', makeWhisperJob({ segments })) as any); - expect(mockDeduplicateSegments).toHaveBeenCalledWith(segments); + expect(mockWriteOutputs).toHaveBeenCalledWith(segments, 'Test Video', 'job-3'); }); - it('calls writeOutputs with the deduplicated segments and job title', async () => { + it('calls writeOutputs with the received segments and job title', async () => { mockGetJob.mockReturnValue(makeJob('job-4', 'My Lecture')); - const deduped = [makeSeg(0, 'Hello world.')]; - mockDeduplicateSegments.mockReturnValue(deduped); await POST(makeEvent('job-4', makeWhisperJob({ segments })) as any); - expect(mockWriteOutputs).toHaveBeenCalledWith(deduped, 'My Lecture', 'job-4'); + expect(mockWriteOutputs).toHaveBeenCalledWith(segments, 'My Lecture', 'job-4'); }); it('stores serialised segments_json in the database', async () => { mockGetJob.mockReturnValue(makeJob('job-5')); - const deduped = [makeSeg(0, 'Result text.')]; - mockDeduplicateSegments.mockReturnValue(deduped); await POST(makeEvent('job-5', makeWhisperJob({ segments })) as any); @@ -244,7 +233,7 @@ describe('POST /api/webhook/[jobId] — success with segments', () => { expect.objectContaining({ id: 'job-5', status: 'done', - segmentsJson: JSON.stringify(deduped) + segmentsJson: JSON.stringify(segments) }) ); });