refactor(transcript): drop Tonemark rewrite

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-05-12 00:10:32 +02:00
parent df50e74939
commit 929c482497
10 changed files with 161 additions and 540 deletions
--- a/package-lock.json
+++ b/package-lock.json
@@ -12,7 +12,8 @@
        "better-sqlite3": "^12.9.0",
        "form-data": "^4.0.5",
        "node-fetch": "^3.3.2",
-        "web-push": "^3.6.7"
+        "web-push": "^3.6.7",
+        "youtube-transcript": "^1.3.1"
      },
      "devDependencies": {
        "@sveltejs/adapter-auto": "^7.0.1",
@@ -89,6 +90,27 @@
        "node": ">=18"
      }
    },
+    "node_modules/@emnapi/core": {
+      "version": "1.10.0",
+      "resolved": "https://registry.npmjs.org/@emnapi/core/-/core-1.10.0.tgz",
+      "integrity": "sha512-yq6OkJ4p82CAfPl0u9mQebQHKPJkY7WrIuk205cTYnYe+k2Z8YBh11FrbRG/H6ihirqcacOgl2BIO8oyMQLeXw==",
+      "license": "MIT",
+      "optional": true,
+      "dependencies": {
+        "@emnapi/wasi-threads": "1.2.1",
+        "tslib": "^2.4.0"
+      }
+    },
+    "node_modules/@emnapi/runtime": {
+      "version": "1.10.0",
+      "resolved": "https://registry.npmjs.org/@emnapi/runtime/-/runtime-1.10.0.tgz",
+      "integrity": "sha512-ewvYlk86xUoGI0zQRNq/mC+16R1QeDlKQy21Ki3oSYXNgLb45GV1P6A0M+/s6nyCuNDqe5VpaY84BzXGwVbwFA==",
+      "license": "MIT",
+      "optional": true,
+      "dependencies": {
+        "tslib": "^2.4.0"
+      }
+    },
    "node_modules/@emnapi/wasi-threads": {
      "version": "1.2.1",
      "resolved": "https://registry.npmjs.org/@emnapi/wasi-threads/-/wasi-threads-1.2.1.tgz",
@@ -896,7 +918,6 @@
      "resolved": "https://registry.npmjs.org/@sveltejs/kit/-/kit-2.59.1.tgz",
      "integrity": "sha512-d8OON70AphLdDesuTIl//M2O6fRTIicX8aYv8vhCiYEhTTI2OboKqey0Hu1A4VFhqwgqtq0vKDmPFGkw8kKmgw==",
      "license": "MIT",
-      "peer": true,
      "dependencies": {
        "@standard-schema/spec": "^1.0.0",
        "@sveltejs/acorn-typescript": "^1.0.5",
@@ -938,7 +959,6 @@
      "resolved": "https://registry.npmjs.org/@sveltejs/vite-plugin-svelte/-/vite-plugin-svelte-7.1.1.tgz",
      "integrity": "sha512-FOJdbE5pxae68DoTBJ49t1dIA7TSmMHR6CsuJhX90cO/UfrEMHA7KJNUj3WdZuUDJPu4ujqpJ2Tgqd2gTWr6Xg==",
      "license": "MIT",
-      "peer": true,
      "dependencies": {
        "deepmerge": "^4.3.1",
        "magic-string": "^0.30.21",
@@ -1313,7 +1333,6 @@
      "integrity": "sha512-38C0/Ddb7HcRG0Z4/DUem8x57d2p9jYgp18mkaYswEOQBGsI1CG4f/hjm0ZCeaJfWhSZ4k7jgs29V1Zom7Ki9A==",
      "dev": true,
      "license": "MIT",
-      "peer": true,
      "dependencies": {
        "@bcoe/v8-coverage": "^1.0.2",
        "@vitest/utils": "4.1.5",
@@ -1467,7 +1486,6 @@
      "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.16.0.tgz",
      "integrity": "sha512-UVJyE9MttOsBQIDKw1skb9nAwQuR5wuGD3+82K6JgJlm/Y+KI92oNsMNGZCYdDsVtRHSak0pcV5Dno5+4jh9sw==",
      "license": "MIT",
-      "peer": true,
      "bin": {
        "acorn": "bin/acorn"
      },
@@ -3021,7 +3039,6 @@
      "resolved": "https://registry.npmjs.org/rollup/-/rollup-4.60.3.tgz",
      "integrity": "sha512-pAQK9HalE84QSm4Po3EmWIZPd3FnjkShVkiMlz1iligWYkWQ7wHYd1PF/T7QZ5TVSD6uSTon5gBVMSM4JfBV+A==",
      "license": "MIT",
-      "peer": true,
      "dependencies": {
        "@types/estree": "1.0.8"
      },
@@ -3255,7 +3272,6 @@
      "resolved": "https://registry.npmjs.org/svelte/-/svelte-5.55.5.tgz",
      "integrity": "sha512-2uCs/LZ9us+AktdzYJM8OcxQ8qnPS1kpaO7syGT/MgO+6Qr1Ybl+TqPq+97u7PHqmmMlye5ZkoyXONy5mjjAbw==",
      "license": "MIT",
-      "peer": true,
      "dependencies": {
        "@jridgewell/remapping": "^2.3.4",
        "@jridgewell/sourcemap-codec": "^1.5.0",
@@ -3428,7 +3444,6 @@
      "integrity": "sha512-y2TvuxSZPDyQakkFRPZHKFm+KKVqIisdg9/CZwm9ftvKXLP8NRWj38/ODjNbr43SsoXqNuAisEf1GdCxqWcdBw==",
      "devOptional": true,
      "license": "Apache-2.0",
-      "peer": true,
      "bin": {
        "tsc": "bin/tsc",
        "tsserver": "bin/tsserver"
@@ -3455,7 +3470,6 @@
      "resolved": "https://registry.npmjs.org/vite/-/vite-8.0.10.tgz",
      "integrity": "sha512-rZuUu9j6J5uotLDs+cAA4O5H4K1SfPliUlQwqa6YEwSrWDZzP4rhm00oJR5snMewjxF5V/K3D4kctsUTsIU9Mw==",
      "license": "MIT",
-      "peer": true,
      "dependencies": {
        "lightningcss": "^1.32.0",
        "picomatch": "^4.0.4",
@@ -3553,7 +3567,6 @@
      "integrity": "sha512-9Xx1v3/ih3m9hN+SbfkUyy0JAs72ap3r7joc87XL6jwF0jGg6mFBvQ1SrwaX+h8BlkX6Hz9shdd1uo6AF+ZGpg==",
      "dev": true,
      "license": "MIT",
-      "peer": true,
      "dependencies": {
        "@vitest/expect": "4.1.5",
        "@vitest/mocker": "4.1.5",
@@ -3689,6 +3702,15 @@
      "integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==",
      "license": "ISC"
    },
+    "node_modules/youtube-transcript": {
+      "version": "1.3.1",
+      "resolved": "https://registry.npmjs.org/youtube-transcript/-/youtube-transcript-1.3.1.tgz",
+      "integrity": "sha512-NDCjwad113TGybbYF51y9Z4tcwzBHUZWQdF9veULNca18L+FdDbHHtTHIr69WVa3bB90l67S8kN0HtL2JO9fhg==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=18.0.0"
+      }
+    },
    "node_modules/zimmerframe": {
      "version": "1.1.4",
      "resolved": "https://registry.npmjs.org/zimmerframe/-/zimmerframe-1.1.4.tgz",
--- a/package.json
+++ b/package.json
@@ -34,6 +34,7 @@
    "better-sqlite3": "^12.9.0",
    "form-data": "^4.0.5",
    "node-fetch": "^3.3.2",
-    "web-push": "^3.6.7"
+    "web-push": "^3.6.7",
+    "youtube-transcript": "^1.3.1"
  }
 }
--- a/src/lib/server/downloader.ts
+++ b/src/lib/server/downloader.ts
@@ -1,8 +1,9 @@
 import { execFile } from 'child_process';
 import { promisify } from 'util';
 import { existsSync } from 'fs';
-import { mkdir, unlink, writeFile } from 'fs/promises';
+import { mkdir, writeFile } from 'fs/promises';
 import { join } from 'path';
+import { fetchTranscript, type TranscriptResponse } from 'youtube-transcript';

 const execFileAsync = promisify(execFile);
 const TMP_DIR = join(process.env.DATA_DIR ?? '/tmp/.whisper-pwa', 'downloads');
@@ -26,43 +27,33 @@ export interface AudioResult {
 export type DownloadResult = CaptionResult | AudioResult;

 /** Try to get auto-generated captions from YouTube. Returns null if unavailable. */
-async function tryGetCaptions(url: string, outDir: string): Promise<CaptionResult | null> {
-	const jsonPath = join(outDir, 'info.json');
+async function tryGetCaptions(url: string, _outDir: string): Promise<CaptionResult | null> {
 	try {
-		await execFileAsync('yt-dlp', [
-			'--write-auto-subs',
-			'--sub-langs', 'en.*',
-			'--skip-download',
-			'--write-info-json',
-			'--no-playlist',
-			'-o', join(outDir, '%(title)s.%(ext)s'),
-			url
-		]);
-
-		// Find the VTT/SRT file
-		const { readdirSync } = await import('fs');
-		const files = readdirSync(outDir);
-		const vttFile = files.find((f) => f.endsWith('.vtt') || f.endsWith('.srt'));
-		if (!vttFile) return null;
-
-		let title = 'Untitled';
-		if (existsSync(jsonPath)) {
-			try {
-				const info = JSON.parse((await import('fs')).readFileSync(jsonPath, 'utf8'));
-				title = info.title ?? title;
-			} catch { /* ignore */ }
-		}
-
-		const content = (await import('fs')).readFileSync(join(outDir, vttFile), 'utf8');
-		const segments = parseVtt(content);
+		const transcript = await fetchTranscript(url, { lang: 'en' });
+		const segments = transcriptEntriesToSegments(transcript);
 		if (segments.length === 0) return null;

+		const title = await getYouTubeTitle(url);
 		return { type: 'captions', segments, title };
 	} catch {
 		return null;
 	}
 }

+/** Look up the video title through yt-dlp's JSON metadata; 'Untitled' on any failure. */
+async function getYouTubeTitle(url: string): Promise<string> {
+	try {
+		const args = ['--dump-single-json', '--skip-download', '--no-playlist', url];
+		// The metadata dump may exceed execFile's default 1 MiB stdout cap, so raise it.
+		const { stdout } = await execFileAsync('yt-dlp', args, { maxBuffer: 64 * 1024 * 1024 });
+		const title: unknown = JSON.parse(stdout)?.title;
+		// Only a non-blank string is a usable title; anything else gets the placeholder.
+		return typeof title === 'string' && title.trim() !== '' ? title : 'Untitled';
+	} catch {
+		return 'Untitled';
+	}
+}
+
 /** Download best audio from YouTube. Returns path to audio file. */
 async function downloadAudio(url: string, outDir: string): Promise<{ audioPath: string; title: string }> {
 	await execFileAsync('yt-dlp', [
@@ -124,39 +115,42 @@ export async function cleanupJobTmp(jobId: string) {
 	} catch { /* ignore */ }
 }

-/** Parse a WebVTT string into segments. */
-function parseVtt(
-	content: string
+/**
+ * Convert youtube-transcript entries into second-based caption segments.
+ *
+ * Entries may express `offset`/`duration` in either milliseconds or seconds, so the unit
+ * is inferred from the median cue duration: offsets grow with video length (a
+ * second-based offset passes 1000 after roughly 17 minutes), whereas a typical cue
+ * lasting 100+ seconds is implausible. The `offset > 1000` check is kept only as a
+ * fallback for input that has no positive duration at all.
+ *
+ * @param entries Transcript entries as returned by `fetchTranscript`.
+ * @returns Segments with `start`/`end` in seconds and empty `words`; entries whose
+ *   trimmed text is empty are dropped and `index` is renumbered from 0.
+ */
+export function transcriptEntriesToSegments(
+	entries: TranscriptResponse[]
 ): Array<{ index: number; start: number; end: number; text: string; words: [] }> {
-	const segments: Array<{ index: number; start: number; end: number; text: string; words: [] }> = [];
-	const blocks = content.split(/\n\n+/);
-	let index = 0;
-
-	for (const block of blocks) {
-		const lines = block.trim().split('\n');
-		const timeLine = lines.find((l) => l.includes('-->'));
-		if (!timeLine) continue;
-
-		const [startStr, endStr] = timeLine.split('-->').map((s) => s.trim().split(' ')[0]);
-		const start = vttTimeToSec(startStr);
-		const end = vttTimeToSec(endStr);
-		const text = lines
-			.filter((l) => !l.includes('-->') && !/^\d+$/.test(l.trim()) && l.trim())
-			.join(' ')
-			.replace(/<[^>]+>/g, '')
-			.trim();
-
-		if (text) {
-			segments.push({ index: index++, start, end, text, words: [] });
-		}
-	}
-
-	return segments;
-}
-
-function vttTimeToSec(t: string): number {
-	const parts = t.split(':').map(Number);
-	if (parts.length === 3) return parts[0] * 3600 + parts[1] * 60 + parts[2];
-	if (parts.length === 2) return parts[0] * 60 + parts[1];
-	return parts[0];
+	// Only positive, finite durations carry any signal about the unit in use.
+	const durations = entries
+		.map((entry) => entry.duration)
+		.filter((duration) => Number.isFinite(duration) && duration > 0)
+		.sort((a, b) => a - b);
+	const medianDuration = durations[Math.floor(durations.length / 2)];
+	const useMilliseconds =
+		medianDuration !== undefined
+			? medianDuration >= 100
+			: entries.some((entry) => entry.offset > 1000);
+	const scale = useMilliseconds ? 1000 : 1;
+	return entries
+		.map((entry) => ({
+			index: 0,
+			start: entry.offset / scale,
+			// Sum before scaling so the end time is rounded once, not twice.
+			end: (entry.offset + entry.duration) / scale,
+			text: entry.text.trim(),
+			words: [] as []
+		}))
+		.filter((segment) => segment.text.length > 0)
+		.map((segment, index) => ({ ...segment, index }));
 }
--- a/src/lib/server/pipeline.ts
+++ b/src/lib/server/pipeline.ts
@@ -96,15 +96,13 @@ async function runJob(

 		if (captionSegments) {
 			// Caption fast path — skip whisper
-			const { deduplicateSegments } = await import('./postprocess.js');
 			const { writeOutputs } = await import('./formatter.js');
-			const segments = deduplicateSegments(captionSegments);
-			const paths = await writeOutputs(segments, title, jobId);
+			const paths = await writeOutputs(captionSegments, title, jobId);
 			updateJob({
 				id: jobId,
 				status: 'done',
 				progress: 100,
-				segmentsJson: JSON.stringify(segments),
+				segmentsJson: JSON.stringify(captionSegments),
 				outputDir: paths.srt.replace(/\/[^/]+$/, '')
 			});
 			emitProgress(jobId, { type: 'done' });
--- a/src/lib/server/postprocess.ts
+++ b/src/lib/server/postprocess.ts
@@ -1,235 +0,0 @@
-import type { Segment } from '$lib/types.js';
-
-// ── Collapse consecutive repeated phrases within a segment's text ────────────
-
-function collapseRepeats(text: string): string {
-	let prev = '';
-	// Keep applying until stable
-	while (true) {
-		const next = collapseOnce(text);
-		if (next === prev || next === text) return next;
-		prev = text;
-		text = next;
-	}
-}
-
-function collapseOnce(text: string): string {
-	// Match any repeated phrase (2+ words) appearing consecutively
-	return text.replace(/\b(.{10,}?)\s+\1\b/gi, '$1');
-}
-
-// ── Merge consecutive segments with identical (or near-identical) text ───────
-
-function normalise(s: string) {
-	return s.toLowerCase().replace(/[^\w\s]/g, '').replace(/\s+/g, ' ').trim();
-}
-
-function mergeConsecutive(segments: Segment[]): Segment[] {
-	const out: Segment[] = [];
-	for (const seg of segments) {
-		const last = out[out.length - 1];
-		if (last && normalise(last.text) === normalise(seg.text)) {
-			last.end = seg.end;
-		} else {
-			out.push({ ...seg });
-		}
-	}
-	return out;
-}
-
-// ── Collapse rolling prefix/suffix chains from backend segment hypotheses ──────
-
-const MAX_CHAIN_GAP_SECS = 0.15;
-const MIN_MEANINGFUL_WORDS = 2;
-const MIN_MEANINGFUL_CHARS = 8;
-const MIN_OVERLAP_WORDS = 1;
-
-function splitWords(text: string): string[] {
-	return text.trim().split(/\s+/).filter(Boolean);
-}
-
-function normaliseWords(text: string): string[] {
-	return splitWords(text)
-		.map((word) => word.toLowerCase().replace(/[^\w]/g, ''))
-		.filter(Boolean);
-}
-
-function arraysEqual(a: string[], b: string[]): boolean {
-	return a.length === b.length && a.every((value, index) => value === b[index]);
-}
-
-function startsWithWords(full: string[], prefix: string[]): boolean {
-	return prefix.length <= full.length && arraysEqual(full.slice(0, prefix.length), prefix);
-}
-
-function endsWithWords(full: string[], suffix: string[]): boolean {
-	return suffix.length <= full.length && arraysEqual(full.slice(full.length - suffix.length), suffix);
-}
-
-function suffixPrefixOverlap(left: string[], right: string[]): number {
-	const max = Math.min(left.length, right.length);
-	for (let size = max; size >= 1; size--) {
-		if (arraysEqual(left.slice(left.length - size), right.slice(0, size))) return size;
-	}
-	return 0;
-}
-
-function isMeaningfulPhrase(words: string[]): boolean {
-	return words.length >= MIN_MEANINGFUL_WORDS && words.join(' ').length >= MIN_MEANINGFUL_CHARS;
-}
-
-function isShortCarryover(seg: Segment, words: string[]): boolean {
-	return seg.end - seg.start <= 0.2 || words.length <= 2 || words.join(' ').length <= 16;
-}
-
-function trimLeadingWords(text: string, count: number): string {
-	return splitWords(text).slice(count).join(' ').trim();
-}
-
-function collapseIncrementalSegments(segments: Segment[]): Segment[] {
-	const out: Segment[] = [];
-
-	for (const seg of segments) {
-		let current: Segment = {
-			...seg,
-			text: seg.text.trim()
-		};
-
-		if (!current.text) continue;
-
-		const last = out[out.length - 1];
-		if (!last) {
-			out.push(current);
-			continue;
-		}
-
-		const gap = current.start - last.end;
-		if (gap > MAX_CHAIN_GAP_SECS) {
-			out.push(current);
-			continue;
-		}
-
-		const lastWords = normaliseWords(last.text);
-		const currentWords = normaliseWords(current.text);
-		if (lastWords.length === 0 || currentWords.length === 0) {
-			out.push(current);
-			continue;
-		}
-
-		if (
-			currentWords.length > lastWords.length &&
-			startsWithWords(currentWords, lastWords) &&
-			(isMeaningfulPhrase(lastWords) || isShortCarryover(last, lastWords))
-		) {
-			last.text = current.text;
-			last.end = current.end;
-			last.words = current.words;
-			continue;
-		}
-
-		if (
-			endsWithWords(lastWords, currentWords) &&
-			(isMeaningfulPhrase(currentWords) || isShortCarryover(current, currentWords))
-		) {
-			last.end = Math.max(last.end, current.end);
-			continue;
-		}
-
-		const overlapWords = suffixPrefixOverlap(lastWords, currentWords);
-		if (overlapWords >= MIN_OVERLAP_WORDS) {
-			const trimmedText = trimLeadingWords(current.text, overlapWords);
-			if (!trimmedText) {
-				last.end = Math.max(last.end, current.end);
-				continue;
-			}
-
-			current = {
-				...current,
-				start: Math.max(current.start, last.end),
-				text: trimmedText,
-				words: []
-			};
-		}
-
-		out.push(current);
-	}
-
-	return out;
-}
-
-// ── N-gram deduplication ─────────────────────────────────────────────────────
-
-const NGRAM_N = 6;
-const LOOKBACK_CHARS = 500;
-const SIMILARITY_THRESHOLD = 0.6;
-
-function ngrams(text: string, n: number): string[] {
-	const words = text.toLowerCase().split(/\s+/);
-	const grams: string[] = [];
-	for (let i = 0; i <= words.length - n; i++) {
-		grams.push(words.slice(i, i + n).join(' '));
-	}
-	return grams;
-}
-
-function jaccardSimilarity(a: string, b: string): number {
-	const ga = new Set(ngrams(a, NGRAM_N));
-	const gb = new Set(ngrams(b, NGRAM_N));
-	// If neither text is long enough to produce n-grams they cannot be compared;
-	// treat as dissimilar so short segments are never incorrectly discarded.
-	if (ga.size === 0 && gb.size === 0) return 0;
-	const intersection = [...ga].filter((g) => gb.has(g)).length;
-	const union = new Set([...ga, ...gb]).size;
-	return union === 0 ? 0 : intersection / union;
-}
-
-function ngramDedup(segments: Segment[]): Segment[] {
-	const out: Segment[] = [];
-	for (const seg of segments) {
-		const windowText = out
-			.slice(-20)
-			.map((s) => s.text)
-			.join(' ')
-			.slice(-LOOKBACK_CHARS);
-
-		if (windowText.length > 0 && jaccardSimilarity(seg.text, windowText) >= SIMILARITY_THRESHOLD) {
-			continue; // duplicate — skip
-		}
-		out.push(seg);
-	}
-	return out;
-}
-
-// ── Full deduplication pipeline ──────────────────────────────────────────────
-
-export function deduplicateSegments(segments: Segment[]): Segment[] {
-	if (!Array.isArray(segments)) return [];
-	// 1. Collapse repeats within each segment's text
-	let result = segments.map((s) => ({
-		...s,
-		text: collapseRepeats(s.text.trim())
-	}));
-
-	// 2. Remove empty segments
-	result = result.filter((s) => s.text.length > 0);
-
-	// 3. Collapse rolling backend hypotheses before generic dedup
-	result = collapseIncrementalSegments(result);
-
-	// 4. First merge pass
-	result = mergeConsecutive(result);
-
-	// 5. N-gram dedup
-	result = ngramDedup(result);
-
-	// 6. Re-run rolling collapse after removals create new adjacencies
-	result = collapseIncrementalSegments(result);
-
-	// 7. Second merge pass (catches new adjacencies after dedup)
-	result = mergeConsecutive(result);
-
-	// 8. Re-index
-	result.forEach((s, i) => (s.index = i));
-
-	return result;
-}
--- a/src/routes/api/jobs/[id]/reprocess/+server.ts
+++ b/src/routes/api/jobs/[id]/reprocess/+server.ts
@@ -1,10 +1,9 @@
 import { json, error } from '@sveltejs/kit';
 import { getJob, updateJob } from '$lib/server/db.js';
-import { deduplicateSegments } from '$lib/server/postprocess.js';
 import { writeOutputs } from '$lib/server/formatter.js';
 import type { Segment } from '$lib/types.js';

-/** POST /api/jobs/[id]/reprocess — re-run post-processing and regenerate all output files. */
+/** POST /api/jobs/[id]/reprocess — regenerate output files from stored canonical segments. */
 export async function POST({ params }) {
 	const job = getJob(params.id);
 	if (!job) throw error(404, 'Job not found');
@@ -14,8 +13,7 @@ export async function POST({ params }) {
 	}

 	try {
-		const rawSegments = JSON.parse(job.segmentsJson) as Segment[];
-		const segments = deduplicateSegments(rawSegments);
+		const segments = JSON.parse(job.segmentsJson) as Segment[];

 		const paths = await writeOutputs(segments, job.title, job.id);
 		const outputDir = paths.srt.replace(/\/[^/]+$/, '');
--- a/src/routes/api/webhook/[jobId]/+server.ts
+++ b/src/routes/api/webhook/[jobId]/+server.ts
@@ -1,6 +1,5 @@
 import { json, error } from '@sveltejs/kit';
 import { getJob, updateJob, setJobStatus } from '$lib/server/db.js';
-import { deduplicateSegments } from '$lib/server/postprocess.js';
 import { writeOutputs } from '$lib/server/formatter.js';
 import { sendNotification } from '$lib/server/push.js';
 import { cleanupJobTmp } from '$lib/server/downloader.js';
@@ -40,8 +39,7 @@ try {
 setJobStatus(jobId, 'processing', 90);
 emitProgress(jobId, { type: 'status', status: 'processing', progress: 90 });

-const rawSegments = (whisperJob.segments ?? []) as Segment[];
-const segments = deduplicateSegments(rawSegments);
+const segments = (whisperJob.segments ?? []) as Segment[];

 const paths = await writeOutputs(segments, job.title, jobId);
 const outputDir = paths.srt.replace(/\/[^/]+$/, '');
--- a/src/tests/downloader.test.ts
+++ b/src/tests/downloader.test.ts
@@ -0,0 +1,87 @@
+import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
+import { rm } from 'fs/promises';
+import type { TranscriptResponse } from 'youtube-transcript';
+
+// Static imports are evaluated before ordinary top-level statements, and downloader.ts
+// reads DATA_DIR at module load, so the stub has to live inside vi.hoisted to take effect.
+const { mockExecFile, mockFetchTranscript, TEST_DATA_DIR } = vi.hoisted(() => {
+	const dataDir = `/tmp/tonemark-downloader-test-${Date.now()}`;
+	vi.stubEnv('DATA_DIR', dataDir);
+	return {
+		mockExecFile: vi.fn(),
+		mockFetchTranscript: vi.fn(),
+		TEST_DATA_DIR: dataDir
+	};
+});
+
+vi.mock('child_process', () => ({
+	execFile: mockExecFile
+}));
+
+vi.mock('youtube-transcript', () => ({
+	fetchTranscript: mockFetchTranscript
+}));
+
+import { downloadYouTube, transcriptEntriesToSegments } from '$lib/server/downloader.js';
+
+beforeEach(() => {
+	vi.clearAllMocks();
+	mockExecFile.mockImplementation((...args: unknown[]) => {
+		const cb = args.at(-1) as (...callbackArgs: unknown[]) => void;
+		// A bare mock has no util.promisify.custom hook, so promisify() resolves with the
+		// first success argument only; pass the { stdout, stderr } object the caller destructures.
+		cb(null, { stdout: JSON.stringify({ title: 'Fetched Title' }), stderr: '' });
+	});
+});
+
+afterEach(async () => {
+	await rm(TEST_DATA_DIR, { recursive: true, force: true }).catch(() => {});
+});
+
+describe('transcriptEntriesToSegments', () => {
+	it('converts millisecond transcript offsets into second-based segments', () => {
+		const entries: TranscriptResponse[] = [
+			{ text: 'Hello everyone.', offset: 15240, duration: 4240, lang: 'en' },
+			{ text: 'Um, welcome to this talk.', offset: 16600, duration: 5080, lang: 'en' }
+		];
+
+		expect(transcriptEntriesToSegments(entries)).toEqual([
+			{ index: 0, start: 15.24, end: 19.48, text: 'Hello everyone.', words: [] },
+			{ index: 1, start: 16.6, end: 21.68, text: 'Um, welcome to this talk.', words: [] }
+		]);
+	});
+
+	it('preserves second-based transcript offsets and drops empty text', () => {
+		const entries: TranscriptResponse[] = [
+			{ text: '  ', offset: 0, duration: 1.5, lang: 'en' },
+			{ text: 'Clean caption cue', offset: 91.08, duration: 3.72, lang: 'en' }
+		];
+
+		expect(transcriptEntriesToSegments(entries)).toEqual([
+			{ index: 0, start: 91.08, end: 94.8, text: 'Clean caption cue', words: [] }
+		]);
+	});
+});
+
+describe('downloadYouTube', () => {
+	it('uses fetched transcript entries directly for caption jobs', async () => {
+		mockFetchTranscript.mockResolvedValue([
+			{ text: 'Hello everyone.', offset: 15240, duration: 4240, lang: 'en' },
+			{ text: 'Um, welcome to this talk.', offset: 16600, duration: 5080, lang: 'en' }
+		] satisfies TranscriptResponse[]);
+
+		const result = await downloadYouTube('https://youtube.com/watch?v=qdh_x-uRs9g', 'job-1');
+
+		expect(mockFetchTranscript).toHaveBeenCalledWith('https://youtube.com/watch?v=qdh_x-uRs9g', {
+			lang: 'en'
+		});
+		expect(result).toMatchObject({
+			type: 'captions',
+			title: 'Fetched Title',
+			segments: [
+				{ index: 0, start: 15.24, end: 19.48, text: 'Hello everyone.', words: [] },
+				{ index: 1, start: 16.6, end: 21.68, text: 'Um, welcome to this talk.', words: [] }
+			]
+		});
+	});
+});
--- a/src/tests/postprocess.test.ts
+++ b/src/tests/postprocess.test.ts
@@ -1,204 +0,0 @@
-import { describe, it, expect } from 'vitest';
-import {
-	deduplicateSegments
-} from '$lib/server/postprocess.js';
-import type { Segment } from '$lib/types.js';
-
-// ── helpers ──────────────────────────────────────────────────────────────────
-
-function seg(index: number, start: number, end: number, text: string): Segment {
-	return { index, start, end, text, words: [] };
-}
-
-// ── collapseRepeats (tested indirectly via deduplicateSegments) ───────────────
-
-describe('deduplicateSegments — collapseRepeats', () => {
-	it('leaves text without repetition unchanged', () => {
-		const input = [seg(0, 0, 5, ' Hello world, this is a sentence.')];
-		const [out] = deduplicateSegments(input);
-		expect(out.text).toBe('Hello world, this is a sentence.');
-	});
-
-	it('collapses a consecutive repeated phrase inside a segment', () => {
-		const input = [seg(0, 0, 5, ' the quick brown fox the quick brown fox')];
-		const [out] = deduplicateSegments(input);
-		expect(out.text).not.toMatch(/the quick brown fox.*the quick brown fox/i);
-	});
-
-	it('handles multiple repetitions recursively', () => {
-		// "welcome everyone" = 16 chars — qualifies for the ≥10-char collapse regex
-		const input = [seg(0, 0, 5, ' welcome everyone welcome everyone welcome everyone')];
-		const result = deduplicateSegments(input);
-		const text = result[0]?.text ?? '';
-		expect((text.match(/welcome everyone/gi) ?? []).length).toBeLessThan(3);
-	});
-});
-
-// ── mergeConsecutive ──────────────────────────────────────────────────────────
-
-describe('deduplicateSegments — mergeConsecutive', () => {
-	it('merges adjacent segments with identical text', () => {
-		const input = [
-			seg(0, 0, 2, ' Hello world.'),
-			seg(1, 2, 4, ' Hello world.')
-		];
-		const result = deduplicateSegments(input);
-		expect(result).toHaveLength(1);
-		expect(result[0].end).toBe(4);
-	});
-
-	it('keeps adjacent segments with different text', () => {
-		const input = [
-			seg(0, 0, 2, ' First sentence.'),
-			seg(1, 2, 4, ' Second sentence.')
-		];
-		const result = deduplicateSegments(input);
-		expect(result).toHaveLength(2);
-	});
-
-	it('normalises punctuation and case for merge comparison', () => {
-		const input = [
-			seg(0, 0, 2, ' Hello, World!'),
-			seg(1, 2, 4, ' hello world')
-		];
-		const result = deduplicateSegments(input);
-		expect(result).toHaveLength(1);
-	});
-});
-
-// ── rolling prefix/suffix chain collapse ───────────────────────────────────────
-
-describe('deduplicateSegments — rolling backend hypotheses', () => {
-	it('collapses prefix-growth chains from stored backend segments', () => {
-		const input = [
-			seg(0, 15.24, 16.6, 'Hello everyone.'),
-			seg(1, 16.6, 19.47, 'Hello everyone. Um, welcome to this talk.'),
-			seg(2, 19.47, 19.48, 'Um, welcome to this talk.'),
-			seg(3, 19.48, 21.67, "Um, welcome to this talk. I'll be speaking about small model"),
-			seg(4, 21.67, 21.68, "I'll be speaking about small model"),
-			seg(5, 21.68, 24.59, "I'll be speaking about small model inference and a gap that we've")
-		];
-
-		const result = deduplicateSegments(input);
-
-		expect(result).toHaveLength(2);
-		expect(result[0]).toMatchObject({
-			index: 0,
-			start: 15.24,
-			end: 19.48,
-			text: 'Hello everyone. Um, welcome to this talk.'
-		});
-		expect(result[1]).toMatchObject({
-			index: 1,
-			start: 19.48,
-			end: 24.59,
-			text: "I'll be speaking about small model inference and a gap that we've"
-		});
-	});
-
-	it('does not collapse similar phrases when there is a real timing gap', () => {
-		const input = [
-			seg(0, 0, 1, 'Hello everyone.'),
-			seg(1, 2, 4, 'Hello everyone. Welcome back.')
-		];
-
-		const result = deduplicateSegments(input);
-
-		expect(result).toHaveLength(2);
-		expect(result[0].text).toBe('Hello everyone.');
-		expect(result[1].text).toBe('Hello everyone. Welcome back.');
-	});
-
-	it('collapses tiny one-word carry-over segments from caption-style output', () => {
-		const input = [
-			seg(0, 94.8, 96.4, 'world.'),
-			seg(1, 96.4, 98.96, 'world. And that aspect that I overlooked was'),
-			seg(2, 98.96, 100.72, 'inference.'),
-			seg(3, 100.72, 103.92, 'inference. So, as someone who kind of wants to'),
-			seg(4, 107.19, 107.2, 'and'),
-			seg(5, 107.2, 109.56, 'and work to understand the problems and the')
-		];
-
-		const result = deduplicateSegments(input);
-
-		expect(result).toHaveLength(3);
-		expect(result[0].text).toBe('world. And that aspect that I overlooked was');
-		expect(result[1].text).toBe('inference. So, as someone who kind of wants to');
-		expect(result[2].text).toBe('and work to understand the problems and the');
-	});
-
-	it('trims single-word suffix-prefix overlap between adjacent segments', () => {
-		const input = [
-			seg(0, 94.8, 96.4, 'world.'),
-			seg(1, 96.4, 98.96, 'world. And that aspect that I overlooked was'),
-			seg(2, 120.12, 123.71, 'to find more about inference.'),
-			seg(3, 123.72, 126.92, "inference. So, I've done a lot of work with VLAM,")
-		];
-
-		const result = deduplicateSegments(input);
-
-		expect(result).toHaveLength(3);
-		expect(result[0].text).toBe('world. And that aspect that I overlooked was');
-		expect(result[2].text).toBe("So, I've done a lot of work with VLAM,");
-	});
-});
-
-// ── ngramDedup ────────────────────────────────────────────────────────────────
-
-describe('deduplicateSegments — ngramDedup', () => {
-	it('passes through completely unique segments', () => {
-		const input = [
-			seg(0, 0, 5, ' The cat sat on the mat quite happily today.'),
-			seg(1, 5, 10, ' Later the dog ran across the yard chasing a ball.')
-		];
-		expect(deduplicateSegments(input)).toHaveLength(2);
-	});
-
-	it('removes a segment that is highly similar to recent context', () => {
-		// Repeat a long sentence verbatim — should be caught as duplicate
-		const longText =
-			' This is a very specific and unique sentence about transcription quality matters greatly.';
-		const input = [seg(0, 0, 5, longText), seg(1, 5, 10, longText)];
-		// After mergeConsecutive the second one is already merged, so result is 1
-		expect(deduplicateSegments(input)).toHaveLength(1);
-	});
-});
-
-// ── deduplicateSegments — full pipeline ──────────────────────────────────────
-
-describe('deduplicateSegments — full pipeline', () => {
-	it('returns empty array for empty input', () => {
-		expect(deduplicateSegments([])).toEqual([]);
-	});
-
-	it('removes segments whose text is empty after trimming', () => {
-		const input = [seg(0, 0, 1, '   '), seg(1, 1, 2, ' Hello.')];
-		const result = deduplicateSegments(input);
-		expect(result).toHaveLength(1);
-		expect(result[0].text).toBe('Hello.');
-	});
-
-	it('re-indexes output segments starting from 0', () => {
-		const input = [
-			seg(5, 0, 2, ' First unique sentence here.'),
-			seg(8, 2, 4, ' Second different sentence there.')
-		];
-		const result = deduplicateSegments(input);
-		result.forEach((s, i) => expect(s.index).toBe(i));
-	});
-
-	it('runs the full pipeline: trim → remove empty → merge → ngram → merge → reindex', () => {
-		const input = [
-			seg(0, 0, 2, ' Good morning everyone.'),
-			seg(1, 2, 3, '   '), // empty — removed
-			seg(2, 3, 5, ' Good morning everyone.'), // duplicate — merged
-			seg(3, 5, 7, ' Welcome to our presentation today.')
-		];
-		const result = deduplicateSegments(input);
-		expect(result).toHaveLength(2);
-		expect(result[0].text).toBe('Good morning everyone.');
-		expect(result[1].text).toBe('Welcome to our presentation today.');
-		expect(result[0].index).toBe(0);
-		expect(result[1].index).toBe(1);
-	});
-});
--- a/src/tests/webhook.test.ts
+++ b/src/tests/webhook.test.ts
@@ -7,7 +7,6 @@ const {
 	mockGetJob,
 	mockUpdateJob,
 	mockSetJobStatus,
-	mockDeduplicateSegments,
 	mockWriteOutputs,
 	mockSendNotification,
 	mockCleanupJobTmp,
@@ -16,7 +15,6 @@ const {
 	mockGetJob: vi.fn(),
 	mockUpdateJob: vi.fn(),
 	mockSetJobStatus: vi.fn(),
-	mockDeduplicateSegments: vi.fn((segs: Segment[]) => segs),
 	mockWriteOutputs: vi.fn(),
 	mockSendNotification: vi.fn(),
 	mockCleanupJobTmp: vi.fn(),
@@ -29,10 +27,6 @@ vi.mock('$lib/server/db.js', () => ({
 	setJobStatus: mockSetJobStatus
 }));

-vi.mock('$lib/server/postprocess.js', () => ({
-	deduplicateSegments: mockDeduplicateSegments
-}));
-
 vi.mock('$lib/server/formatter.js', () => ({
 	writeOutputs: mockWriteOutputs
 }));
@@ -91,7 +85,6 @@ function makeSeg(index: number, text: string): Segment {

 beforeEach(() => {
 	vi.clearAllMocks();
-	mockDeduplicateSegments.mockImplementation((segs: Segment[]) => segs);
 	mockWriteOutputs.mockResolvedValue({
 		srt: '/out/dir/title.srt',
 		txt: '/out/dir/title.txt',
@@ -218,25 +211,21 @@ describe('POST /api/webhook/[jobId] — whisper failure', () => {
 describe('POST /api/webhook/[jobId] — success with segments', () => {
 	const segments = [makeSeg(0, 'Hello world.'), makeSeg(1, 'This is a test.')];

-	it('runs deduplication on received segments', async () => {
+	it('passes received segments through unchanged', async () => {
 		mockGetJob.mockReturnValue(makeJob('job-3'));
 		await POST(makeEvent('job-3', makeWhisperJob({ segments })) as any);
-		expect(mockDeduplicateSegments).toHaveBeenCalledWith(segments);
+		expect(mockWriteOutputs).toHaveBeenCalledWith(segments, 'Test Video', 'job-3');
 	});

-	it('calls writeOutputs with the deduplicated segments and job title', async () => {
+	it('calls writeOutputs with the received segments and job title', async () => {
 		mockGetJob.mockReturnValue(makeJob('job-4', 'My Lecture'));
-		const deduped = [makeSeg(0, 'Hello world.')];
-		mockDeduplicateSegments.mockReturnValue(deduped);

 		await POST(makeEvent('job-4', makeWhisperJob({ segments })) as any);
-		expect(mockWriteOutputs).toHaveBeenCalledWith(deduped, 'My Lecture', 'job-4');
+		expect(mockWriteOutputs).toHaveBeenCalledWith(segments, 'My Lecture', 'job-4');
 	});

 	it('stores serialised segments_json in the database', async () => {
 		mockGetJob.mockReturnValue(makeJob('job-5'));
-		const deduped = [makeSeg(0, 'Result text.')];
-		mockDeduplicateSegments.mockReturnValue(deduped);

 		await POST(makeEvent('job-5', makeWhisperJob({ segments })) as any);

@@ -244,7 +233,7 @@ describe('POST /api/webhook/[jobId] — success with segments', () => {
 			expect.objectContaining({
 				id: 'job-5',
 				status: 'done',
-				segmentsJson: JSON.stringify(deduped)
+				segmentsJson: JSON.stringify(segments)
 			})
 		);
 	});