refactor(transcript): drop Tonemark rewrite

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-05-12 00:10:32 +02:00
parent df50e74939
commit 929c482497
10 changed files with 161 additions and 540 deletions
--- a/src/tests/downloader.test.ts
+++ b/src/tests/downloader.test.ts
@@ -0,0 +1,80 @@
+import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
+import { rm } from 'fs/promises';
+import type { TranscriptResponse } from 'youtube-transcript';
+
+const { mockExecFile, mockFetchTranscript } = vi.hoisted(() => ({ // built inside vi.hoisted so the vi.mock factories in this file can reference them
+	mockExecFile: vi.fn(), // substituted for child_process.execFile
+	mockFetchTranscript: vi.fn() // substituted for youtube-transcript's fetchTranscript
+}));
+
+const TEST_DATA_DIR = `/tmp/tonemark-downloader-test-${Date.now()}`; // timestamp-suffixed scratch directory; deleted again in afterEach
+vi.stubEnv('DATA_DIR', TEST_DATA_DIR); // NOTE(review): not hoisted, so this runs after the static import of downloader.js is evaluated; if that module reads DATA_DIR at import time the stub lands too late — confirm, or move it into vi.hoisted
+
+vi.mock('child_process', () => ({ // module mock: only execFile is provided, backed by the hoisted mock
+	execFile: mockExecFile
+}));
+
+vi.mock('youtube-transcript', () => ({ // module mock: only fetchTranscript is provided, backed by the hoisted mock
+	fetchTranscript: mockFetchTranscript
+}));
+
+import { downloadYouTube, transcriptEntriesToSegments } from '$lib/server/downloader.js';
+
+beforeEach(() => { // before every test: clear all mocks, then make execFile succeed by default
+	vi.clearAllMocks();
+	mockExecFile.mockImplementation((...args: unknown[]) => {
+		const cb = args.at(-1) as (...callbackArgs: unknown[]) => void; // the callback is taken from the last argument, whatever precedes it
+		cb(null, JSON.stringify({ title: 'Fetched Title' }), ''); // invoked with no error, a JSON string carrying a title, and an empty string (Node's execFile callback order is error, stdout, stderr)
+	});
+});
+
+afterEach(async () => { // after every test: remove the scratch directory
+	await rm(TEST_DATA_DIR, { recursive: true, force: true }).catch(() => {}); // best-effort cleanup: any rejection from rm is deliberately swallowed
+});
+
+describe('transcriptEntriesToSegments', () => { // exercises the entries-to-segments conversion directly, without the mocks
+	it('converts millisecond transcript offsets into second-based segments', () => {
+		const entries: TranscriptResponse[] = [
+			{ text: 'Hello everyone.', offset: 15240, duration: 4240, lang: 'en' }, // offset and duration given in milliseconds
+			{ text: 'Um, welcome to this talk.', offset: 16600, duration: 5080, lang: 'en' }
+		];
+
+		expect(transcriptEntriesToSegments(entries)).toEqual([
+			{ index: 0, start: 15.24, end: 19.48, text: 'Hello everyone.', words: [] }, // expected end = (15240 + 4240) / 1000
+			{ index: 1, start: 16.6, end: 21.68, text: 'Um, welcome to this talk.', words: [] } // expected end = (16600 + 5080) / 1000
+		]);
+	});
+
+	it('preserves second-based transcript offsets and drops empty text', () => {
+		const entries: TranscriptResponse[] = [
+			{ text: '  ', offset: 0, duration: 1.5, lang: 'en' }, // whitespace-only text: expected to be absent from the output
+			{ text: 'Clean caption cue', offset: 91.08, duration: 3.72, lang: 'en' } // values already in seconds
+		];
+
+		expect(transcriptEntriesToSegments(entries)).toEqual([
+			{ index: 0, start: 91.08, end: 94.8, text: 'Clean caption cue', words: [] } // the surviving entry is expected at index 0 with its offset unscaled
+		]);
+	});
+});
+
+describe('downloadYouTube', () => { // exercises downloadYouTube with execFile and fetchTranscript mocked
+	it('uses fetched transcript entries directly for caption jobs', async () => {
+		mockFetchTranscript.mockResolvedValue([ // same entries as the millisecond-offset conversion test in this file
+			{ text: 'Hello everyone.', offset: 15240, duration: 4240, lang: 'en' },
+			{ text: 'Um, welcome to this talk.', offset: 16600, duration: 5080, lang: 'en' }
+		] satisfies TranscriptResponse[]);
+
+		const result = await downloadYouTube('https://youtube.com/watch?v=qdh_x-uRs9g', 'job-1');
+
+		expect(mockFetchTranscript).toHaveBeenCalledWith('https://youtube.com/watch?v=qdh_x-uRs9g', { // the transcript must be requested for the same URL, with lang 'en'
+			lang: 'en'
+		});
+		expect(result).toMatchObject({ // partial match: only type and segments are checked, other result fields are ignored
+			type: 'captions',
+			segments: [
+				{ index: 0, start: 15.24, end: 19.48, text: 'Hello everyone.', words: [] },
+				{ index: 1, start: 16.6, end: 21.68, text: 'Um, welcome to this talk.', words: [] }
+			]
+		});
+	});
+});
--- a/src/tests/postprocess.test.ts
+++ b/src/tests/postprocess.test.ts
@@ -1,204 +0,0 @@
-import { describe, it, expect } from 'vitest';
-import {
-	deduplicateSegments
-} from '$lib/server/postprocess.js';
-import type { Segment } from '$lib/types.js';
-
-// ── helpers ──────────────────────────────────────────────────────────────────
-
-function seg(index: number, start: number, end: number, text: string): Segment {
-	return { index, start, end, text, words: [] };
-}
-
-// ── collapseRepeats (tested indirectly via deduplicateSegments) ───────────────
-
-describe('deduplicateSegments — collapseRepeats', () => {
-	it('leaves text without repetition unchanged', () => {
-		const input = [seg(0, 0, 5, ' Hello world, this is a sentence.')];
-		const [out] = deduplicateSegments(input);
-		expect(out.text).toBe('Hello world, this is a sentence.');
-	});
-
-	it('collapses a consecutive repeated phrase inside a segment', () => {
-		const input = [seg(0, 0, 5, ' the quick brown fox the quick brown fox')];
-		const [out] = deduplicateSegments(input);
-		expect(out.text).not.toMatch(/the quick brown fox.*the quick brown fox/i);
-	});
-
-	it('handles multiple repetitions recursively', () => {
-		// "welcome everyone" = 16 chars — qualifies for the ≥10-char collapse regex
-		const input = [seg(0, 0, 5, ' welcome everyone welcome everyone welcome everyone')];
-		const result = deduplicateSegments(input);
-		const text = result[0]?.text ?? '';
-		expect((text.match(/welcome everyone/gi) ?? []).length).toBeLessThan(3);
-	});
-});
-
-// ── mergeConsecutive ──────────────────────────────────────────────────────────
-
-describe('deduplicateSegments — mergeConsecutive', () => {
-	it('merges adjacent segments with identical text', () => {
-		const input = [
-			seg(0, 0, 2, ' Hello world.'),
-			seg(1, 2, 4, ' Hello world.')
-		];
-		const result = deduplicateSegments(input);
-		expect(result).toHaveLength(1);
-		expect(result[0].end).toBe(4);
-	});
-
-	it('keeps adjacent segments with different text', () => {
-		const input = [
-			seg(0, 0, 2, ' First sentence.'),
-			seg(1, 2, 4, ' Second sentence.')
-		];
-		const result = deduplicateSegments(input);
-		expect(result).toHaveLength(2);
-	});
-
-	it('normalises punctuation and case for merge comparison', () => {
-		const input = [
-			seg(0, 0, 2, ' Hello, World!'),
-			seg(1, 2, 4, ' hello world')
-		];
-		const result = deduplicateSegments(input);
-		expect(result).toHaveLength(1);
-	});
-});
-
-// ── rolling prefix/suffix chain collapse ───────────────────────────────────────
-
-describe('deduplicateSegments — rolling backend hypotheses', () => {
-	it('collapses prefix-growth chains from stored backend segments', () => {
-		const input = [
-			seg(0, 15.24, 16.6, 'Hello everyone.'),
-			seg(1, 16.6, 19.47, 'Hello everyone. Um, welcome to this talk.'),
-			seg(2, 19.47, 19.48, 'Um, welcome to this talk.'),
-			seg(3, 19.48, 21.67, "Um, welcome to this talk. I'll be speaking about small model"),
-			seg(4, 21.67, 21.68, "I'll be speaking about small model"),
-			seg(5, 21.68, 24.59, "I'll be speaking about small model inference and a gap that we've")
-		];
-
-		const result = deduplicateSegments(input);
-
-		expect(result).toHaveLength(2);
-		expect(result[0]).toMatchObject({
-			index: 0,
-			start: 15.24,
-			end: 19.48,
-			text: 'Hello everyone. Um, welcome to this talk.'
-		});
-		expect(result[1]).toMatchObject({
-			index: 1,
-			start: 19.48,
-			end: 24.59,
-			text: "I'll be speaking about small model inference and a gap that we've"
-		});
-	});
-
-	it('does not collapse similar phrases when there is a real timing gap', () => {
-		const input = [
-			seg(0, 0, 1, 'Hello everyone.'),
-			seg(1, 2, 4, 'Hello everyone. Welcome back.')
-		];
-
-		const result = deduplicateSegments(input);
-
-		expect(result).toHaveLength(2);
-		expect(result[0].text).toBe('Hello everyone.');
-		expect(result[1].text).toBe('Hello everyone. Welcome back.');
-	});
-
-	it('collapses tiny one-word carry-over segments from caption-style output', () => {
-		const input = [
-			seg(0, 94.8, 96.4, 'world.'),
-			seg(1, 96.4, 98.96, 'world. And that aspect that I overlooked was'),
-			seg(2, 98.96, 100.72, 'inference.'),
-			seg(3, 100.72, 103.92, 'inference. So, as someone who kind of wants to'),
-			seg(4, 107.19, 107.2, 'and'),
-			seg(5, 107.2, 109.56, 'and work to understand the problems and the')
-		];
-
-		const result = deduplicateSegments(input);
-
-		expect(result).toHaveLength(3);
-		expect(result[0].text).toBe('world. And that aspect that I overlooked was');
-		expect(result[1].text).toBe('inference. So, as someone who kind of wants to');
-		expect(result[2].text).toBe('and work to understand the problems and the');
-	});
-
-	it('trims single-word suffix-prefix overlap between adjacent segments', () => {
-		const input = [
-			seg(0, 94.8, 96.4, 'world.'),
-			seg(1, 96.4, 98.96, 'world. And that aspect that I overlooked was'),
-			seg(2, 120.12, 123.71, 'to find more about inference.'),
-			seg(3, 123.72, 126.92, "inference. So, I've done a lot of work with VLAM,")
-		];
-
-		const result = deduplicateSegments(input);
-
-		expect(result).toHaveLength(3);
-		expect(result[0].text).toBe('world. And that aspect that I overlooked was');
-		expect(result[2].text).toBe("So, I've done a lot of work with VLAM,");
-	});
-});
-
-// ── ngramDedup ────────────────────────────────────────────────────────────────
-
-describe('deduplicateSegments — ngramDedup', () => {
-	it('passes through completely unique segments', () => {
-		const input = [
-			seg(0, 0, 5, ' The cat sat on the mat quite happily today.'),
-			seg(1, 5, 10, ' Later the dog ran across the yard chasing a ball.')
-		];
-		expect(deduplicateSegments(input)).toHaveLength(2);
-	});
-
-	it('removes a segment that is highly similar to recent context', () => {
-		// Repeat a long sentence verbatim — should be caught as duplicate
-		const longText =
-			' This is a very specific and unique sentence about transcription quality matters greatly.';
-		const input = [seg(0, 0, 5, longText), seg(1, 5, 10, longText)];
-		// After mergeConsecutive the second one is already merged, so result is 1
-		expect(deduplicateSegments(input)).toHaveLength(1);
-	});
-});
-
-// ── deduplicateSegments — full pipeline ──────────────────────────────────────
-
-describe('deduplicateSegments — full pipeline', () => {
-	it('returns empty array for empty input', () => {
-		expect(deduplicateSegments([])).toEqual([]);
-	});
-
-	it('removes segments whose text is empty after trimming', () => {
-		const input = [seg(0, 0, 1, '   '), seg(1, 1, 2, ' Hello.')];
-		const result = deduplicateSegments(input);
-		expect(result).toHaveLength(1);
-		expect(result[0].text).toBe('Hello.');
-	});
-
-	it('re-indexes output segments starting from 0', () => {
-		const input = [
-			seg(5, 0, 2, ' First unique sentence here.'),
-			seg(8, 2, 4, ' Second different sentence there.')
-		];
-		const result = deduplicateSegments(input);
-		result.forEach((s, i) => expect(s.index).toBe(i));
-	});
-
-	it('runs the full pipeline: trim → remove empty → merge → ngram → merge → reindex', () => {
-		const input = [
-			seg(0, 0, 2, ' Good morning everyone.'),
-			seg(1, 2, 3, '   '), // empty — removed
-			seg(2, 3, 5, ' Good morning everyone.'), // duplicate — merged
-			seg(3, 5, 7, ' Welcome to our presentation today.')
-		];
-		const result = deduplicateSegments(input);
-		expect(result).toHaveLength(2);
-		expect(result[0].text).toBe('Good morning everyone.');
-		expect(result[1].text).toBe('Welcome to our presentation today.');
-		expect(result[0].index).toBe(0);
-		expect(result[1].index).toBe(1);
-	});
-});
--- a/src/tests/webhook.test.ts
+++ b/src/tests/webhook.test.ts
@@ -7,7 +7,6 @@ const {
 	mockGetJob,
 	mockUpdateJob,
 	mockSetJobStatus,
-	mockDeduplicateSegments,
 	mockWriteOutputs,
 	mockSendNotification,
 	mockCleanupJobTmp,
@@ -16,7 +15,6 @@ const {
 	mockGetJob: vi.fn(),
 	mockUpdateJob: vi.fn(),
 	mockSetJobStatus: vi.fn(),
-	mockDeduplicateSegments: vi.fn((segs: Segment[]) => segs),
 	mockWriteOutputs: vi.fn(),
 	mockSendNotification: vi.fn(),
 	mockCleanupJobTmp: vi.fn(),
@@ -29,10 +27,6 @@ vi.mock('$lib/server/db.js', () => ({
 	setJobStatus: mockSetJobStatus
 }));

-vi.mock('$lib/server/postprocess.js', () => ({
-	deduplicateSegments: mockDeduplicateSegments
-}));
-
 vi.mock('$lib/server/formatter.js', () => ({
 	writeOutputs: mockWriteOutputs
 }));
@@ -91,7 +85,6 @@ function makeSeg(index: number, text: string): Segment {

 beforeEach(() => {
 	vi.clearAllMocks();
-	mockDeduplicateSegments.mockImplementation((segs: Segment[]) => segs);
 	mockWriteOutputs.mockResolvedValue({
 		srt: '/out/dir/title.srt',
 		txt: '/out/dir/title.txt',
@@ -218,25 +211,21 @@ describe('POST /api/webhook/[jobId] — whisper failure', () => {
 describe('POST /api/webhook/[jobId] — success with segments', () => {
 	const segments = [makeSeg(0, 'Hello world.'), makeSeg(1, 'This is a test.')];

-	it('runs deduplication on received segments', async () => {
+	it('passes received segments through unchanged', async () => {
 		mockGetJob.mockReturnValue(makeJob('job-3'));
 		await POST(makeEvent('job-3', makeWhisperJob({ segments })) as any);
-		expect(mockDeduplicateSegments).toHaveBeenCalledWith(segments);
+		expect(mockWriteOutputs).toHaveBeenCalledWith(segments, 'Test Video', 'job-3');
 	});

-	it('calls writeOutputs with the deduplicated segments and job title', async () => {
+	it('calls writeOutputs with the received segments and job title', async () => {
 		mockGetJob.mockReturnValue(makeJob('job-4', 'My Lecture'));
-		const deduped = [makeSeg(0, 'Hello world.')];
-		mockDeduplicateSegments.mockReturnValue(deduped);

 		await POST(makeEvent('job-4', makeWhisperJob({ segments })) as any);
-		expect(mockWriteOutputs).toHaveBeenCalledWith(deduped, 'My Lecture', 'job-4');
+		expect(mockWriteOutputs).toHaveBeenCalledWith(segments, 'My Lecture', 'job-4');
 	});

 	it('stores serialised segments_json in the database', async () => {
 		mockGetJob.mockReturnValue(makeJob('job-5'));
-		const deduped = [makeSeg(0, 'Result text.')];
-		mockDeduplicateSegments.mockReturnValue(deduped);

 		await POST(makeEvent('job-5', makeWhisperJob({ segments })) as any);

@@ -244,7 +233,7 @@ describe('POST /api/webhook/[jobId] — success with segments', () => {
 			expect.objectContaining({
 				id: 'job-5',
 				status: 'done',
-				segmentsJson: JSON.stringify(deduped)
+				segmentsJson: JSON.stringify(segments)
 			})
 		);
 	});