refactor(transcript): drop Tonemark rewrite
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
80
src/tests/downloader.test.ts
Normal file
80
src/tests/downloader.test.ts
Normal file
@@ -0,0 +1,80 @@
|
||||
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
|
||||
import { rm } from 'fs/promises';
|
||||
import type { TranscriptResponse } from 'youtube-transcript';
|
||||
|
||||
// Mock handles are created inside `vi.hoisted` so that they already exist when
// the `vi.mock` factories in this file run (vitest lifts `vi.mock` calls above
// the module's imports).
const { mockExecFile, mockFetchTranscript } = vi.hoisted(() => ({
  mockExecFile: vi.fn(),
  mockFetchTranscript: vi.fn()
}));
|
||||
|
||||
// Scratch directory for this test run; the timestamp suffix keeps separate
// runs from sharing a path.
const TEST_DATA_DIR = `/tmp/tonemark-downloader-test-${Date.now()}`;

// Publish the scratch directory through the DATA_DIR environment variable.
// NOTE(review): presumably read by the downloader module under test — confirm.
vi.stubEnv('DATA_DIR', TEST_DATA_DIR);
|
||||
|
||||
// Replace `child_process` so that `execFile` resolves to the hoisted mock
// instead of spawning a real process.
vi.mock('child_process', () => ({
  execFile: mockExecFile
}));

// Replace `youtube-transcript` so that `fetchTranscript` resolves to the
// hoisted mock instead of performing a network request.
vi.mock('youtube-transcript', () => ({
  fetchTranscript: mockFetchTranscript
}));
|
||||
|
||||
import { downloadYouTube, transcriptEntriesToSegments } from '$lib/server/downloader.js';
|
||||
|
||||
// Reset all mock state before every test, then install a default `execFile`
// stub that reports success.
beforeEach(() => {
  vi.clearAllMocks();
  mockExecFile.mockImplementation((...args: unknown[]) => {
    // The stub treats its final argument as a Node-style callback and invokes
    // it with (error = null, stdout = JSON metadata, stderr = '').
    const cb = args.at(-1) as (...callbackArgs: unknown[]) => void;
    cb(null, JSON.stringify({ title: 'Fetched Title' }), '');
  });
});
|
||||
|
||||
// Remove the scratch directory after every test. Cleanup is deliberately
// best-effort: `force: true` tolerates a missing directory, and any other
// failure is swallowed so that it cannot fail the test itself.
afterEach(async () => {
  await rm(TEST_DATA_DIR, { recursive: true, force: true }).catch(() => {});
});
|
||||
|
||||
// Unit tests for the pure conversion from fetched transcript entries to
// second-based segments.
describe('transcriptEntriesToSegments', () => {
  it('converts millisecond transcript offsets into second-based segments', () => {
    const entries: TranscriptResponse[] = [
      { text: 'Hello everyone.', offset: 15240, duration: 4240, lang: 'en' },
      { text: 'Um, welcome to this talk.', offset: 16600, duration: 5080, lang: 'en' }
    ];

    // Expected start = offset / 1000 and end = (offset + duration) / 1000;
    // `words` is expected to be empty.
    expect(transcriptEntriesToSegments(entries)).toEqual([
      { index: 0, start: 15.24, end: 19.48, text: 'Hello everyone.', words: [] },
      { index: 1, start: 16.6, end: 21.68, text: 'Um, welcome to this talk.', words: [] }
    ]);
  });

  it('preserves second-based transcript offsets and drops empty text', () => {
    const entries: TranscriptResponse[] = [
      { text: ' ', offset: 0, duration: 1.5, lang: 'en' },
      { text: 'Clean caption cue', offset: 91.08, duration: 3.72, lang: 'en' }
    ];

    // The whitespace-only entry is expected to be dropped, and the surviving
    // entry is expected at index 0 with its offset and duration taken as seconds.
    // NOTE(review): how the implementation tells milliseconds from seconds is
    // not visible from this test — confirm against the downloader module.
    expect(transcriptEntriesToSegments(entries)).toEqual([
      { index: 0, start: 91.08, end: 94.8, text: 'Clean caption cue', words: [] }
    ]);
  });
});
|
||||
|
||||
// Integration-style test of `downloadYouTube` with the transcript fetch mocked.
describe('downloadYouTube', () => {
  it('uses fetched transcript entries directly for caption jobs', async () => {
    mockFetchTranscript.mockResolvedValue([
      { text: 'Hello everyone.', offset: 15240, duration: 4240, lang: 'en' },
      { text: 'Um, welcome to this talk.', offset: 16600, duration: 5080, lang: 'en' }
    ] satisfies TranscriptResponse[]);

    const result = await downloadYouTube('https://youtube.com/watch?v=qdh_x-uRs9g', 'job-1');

    // The transcript must be requested for the exact URL, in English.
    expect(mockFetchTranscript).toHaveBeenCalledWith('https://youtube.com/watch?v=qdh_x-uRs9g', {
      lang: 'en'
    });
    // `toMatchObject` checks only the listed fields; the result may carry more.
    expect(result).toMatchObject({
      type: 'captions',
      segments: [
        { index: 0, start: 15.24, end: 19.48, text: 'Hello everyone.', words: [] },
        { index: 1, start: 16.6, end: 21.68, text: 'Um, welcome to this talk.', words: [] }
      ]
    });
  });
});
|
||||
@@ -1,204 +0,0 @@
|
||||
import { describe, it, expect } from 'vitest';
|
||||
import {
|
||||
deduplicateSegments
|
||||
} from '$lib/server/postprocess.js';
|
||||
import type { Segment } from '$lib/types.js';
|
||||
|
||||
// ── helpers ──────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Builds a `Segment` fixture for the tests in this file.
 *
 * @param index - Position value stored on the segment.
 * @param start - Start time stored on the segment.
 * @param end - End time stored on the segment.
 * @param text - Segment text, stored exactly as given (no trimming).
 * @returns A new segment whose `words` list is a fresh empty array.
 */
function seg(index: number, start: number, end: number, text: string): Segment {
  return { index, start, end, text, words: [] };
}
|
||||
|
||||
// ── collapseRepeats (tested indirectly via deduplicateSegments) ───────────────
|
||||
|
||||
// Repetition collapse inside a single segment, exercised through the public
// `deduplicateSegments` entry point.
describe('deduplicateSegments — collapseRepeats', () => {
  it('leaves text without repetition unchanged', () => {
    const input = [seg(0, 0, 5, ' Hello world, this is a sentence.')];
    const [out] = deduplicateSegments(input);
    // Only the leading space is expected to disappear.
    expect(out.text).toBe('Hello world, this is a sentence.');
  });

  it('collapses a consecutive repeated phrase inside a segment', () => {
    const input = [seg(0, 0, 5, ' the quick brown fox the quick brown fox')];
    const [out] = deduplicateSegments(input);
    // Asserts only that the phrase no longer appears twice, not the exact output.
    expect(out.text).not.toMatch(/the quick brown fox.*the quick brown fox/i);
  });

  it('handles multiple repetitions recursively', () => {
    // "welcome everyone" = 16 chars — qualifies for the ≥10-char collapse regex
    const input = [seg(0, 0, 5, ' welcome everyone welcome everyone welcome everyone')];
    const result = deduplicateSegments(input);
    // NOTE(review): an empty result would also satisfy this assertion, because
    // the text then falls back to '' and yields zero matches.
    const text = result[0]?.text ?? '';
    expect((text.match(/welcome everyone/gi) ?? []).length).toBeLessThan(3);
  });
});
|
||||
|
||||
// ── mergeConsecutive ──────────────────────────────────────────────────────────
|
||||
|
||||
// Merging of adjacent segments that carry the same text.
describe('deduplicateSegments — mergeConsecutive', () => {
  it('merges adjacent segments with identical text', () => {
    const input = [
      seg(0, 0, 2, ' Hello world.'),
      seg(1, 2, 4, ' Hello world.')
    ];
    const result = deduplicateSegments(input);
    expect(result).toHaveLength(1);
    // The merged segment is expected to extend to the end of the later one.
    expect(result[0].end).toBe(4);
  });

  it('keeps adjacent segments with different text', () => {
    const input = [
      seg(0, 0, 2, ' First sentence.'),
      seg(1, 2, 4, ' Second sentence.')
    ];
    const result = deduplicateSegments(input);
    expect(result).toHaveLength(2);
  });

  it('normalises punctuation and case for merge comparison', () => {
    // The two texts differ only in punctuation and letter case.
    const input = [
      seg(0, 0, 2, ' Hello, World!'),
      seg(1, 2, 4, ' hello world')
    ];
    const result = deduplicateSegments(input);
    expect(result).toHaveLength(1);
  });
});
|
||||
|
||||
// ── rolling prefix/suffix chain collapse ───────────────────────────────────────
|
||||
|
||||
// Collapse of rolling prefix/suffix chains, where each segment repeats the tail
// of its predecessor and then extends it.
describe('deduplicateSegments — rolling backend hypotheses', () => {
  it('collapses prefix-growth chains from stored backend segments', () => {
    const input = [
      seg(0, 15.24, 16.6, 'Hello everyone.'),
      seg(1, 16.6, 19.47, 'Hello everyone. Um, welcome to this talk.'),
      seg(2, 19.47, 19.48, 'Um, welcome to this talk.'),
      seg(3, 19.48, 21.67, "Um, welcome to this talk. I'll be speaking about small model"),
      seg(4, 21.67, 21.68, "I'll be speaking about small model"),
      seg(5, 21.68, 24.59, "I'll be speaking about small model inference and a gap that we've")
    ];

    const result = deduplicateSegments(input);

    // Six overlapping hypotheses are expected to reduce to two segments.
    expect(result).toHaveLength(2);
    expect(result[0]).toMatchObject({
      index: 0,
      start: 15.24,
      end: 19.48,
      text: 'Hello everyone. Um, welcome to this talk.'
    });
    expect(result[1]).toMatchObject({
      index: 1,
      start: 19.48,
      end: 24.59,
      text: "I'll be speaking about small model inference and a gap that we've"
    });
  });

  it('does not collapse similar phrases when there is a real timing gap', () => {
    // The first segment ends at 1 and the second starts at 2.
    const input = [
      seg(0, 0, 1, 'Hello everyone.'),
      seg(1, 2, 4, 'Hello everyone. Welcome back.')
    ];

    const result = deduplicateSegments(input);

    expect(result).toHaveLength(2);
    expect(result[0].text).toBe('Hello everyone.');
    expect(result[1].text).toBe('Hello everyone. Welcome back.');
  });

  it('collapses tiny one-word carry-over segments from caption-style output', () => {
    // Each even-indexed segment is a single word that the next segment starts with.
    const input = [
      seg(0, 94.8, 96.4, 'world.'),
      seg(1, 96.4, 98.96, 'world. And that aspect that I overlooked was'),
      seg(2, 98.96, 100.72, 'inference.'),
      seg(3, 100.72, 103.92, 'inference. So, as someone who kind of wants to'),
      seg(4, 107.19, 107.2, 'and'),
      seg(5, 107.2, 109.56, 'and work to understand the problems and the')
    ];

    const result = deduplicateSegments(input);

    expect(result).toHaveLength(3);
    expect(result[0].text).toBe('world. And that aspect that I overlooked was');
    expect(result[1].text).toBe('inference. So, as someone who kind of wants to');
    expect(result[2].text).toBe('and work to understand the problems and the');
  });

  it('trims single-word suffix-prefix overlap between adjacent segments', () => {
    const input = [
      seg(0, 94.8, 96.4, 'world.'),
      seg(1, 96.4, 98.96, 'world. And that aspect that I overlooked was'),
      seg(2, 120.12, 123.71, 'to find more about inference.'),
      seg(3, 123.72, 126.92, "inference. So, I've done a lot of work with VLAM,")
    ];

    const result = deduplicateSegments(input);

    expect(result).toHaveLength(3);
    expect(result[0].text).toBe('world. And that aspect that I overlooked was');
    // The last segment is expected to lose its leading "inference.", which
    // repeats the final word of the segment before it.
    expect(result[2].text).toBe("So, I've done a lot of work with VLAM,");
  });
});
|
||||
|
||||
// ── ngramDedup ────────────────────────────────────────────────────────────────
|
||||
|
||||
// Similarity-based removal of segments that repeat recent context.
describe('deduplicateSegments — ngramDedup', () => {
  it('passes through completely unique segments', () => {
    const input = [
      seg(0, 0, 5, ' The cat sat on the mat quite happily today.'),
      seg(1, 5, 10, ' Later the dog ran across the yard chasing a ball.')
    ];
    expect(deduplicateSegments(input)).toHaveLength(2);
  });

  it('removes a segment that is highly similar to recent context', () => {
    // Repeat a long sentence verbatim — should be caught as duplicate
    const longText =
      ' This is a very specific and unique sentence about transcription quality matters greatly.';
    const input = [seg(0, 0, 5, longText), seg(1, 5, 10, longText)];
    // After mergeConsecutive the second one is already merged, so result is 1
    expect(deduplicateSegments(input)).toHaveLength(1);
  });
});
|
||||
|
||||
// ── deduplicateSegments — full pipeline ──────────────────────────────────────
|
||||
|
||||
// End-to-end behaviour of `deduplicateSegments`: empty input, empty-text
// removal, re-indexing, and the combined pipeline.
describe('deduplicateSegments — full pipeline', () => {
  it('returns empty array for empty input', () => {
    expect(deduplicateSegments([])).toEqual([]);
  });

  it('removes segments whose text is empty after trimming', () => {
    const input = [seg(0, 0, 1, ' '), seg(1, 1, 2, ' Hello.')];
    const result = deduplicateSegments(input);
    expect(result).toHaveLength(1);
    expect(result[0].text).toBe('Hello.');
  });

  it('re-indexes output segments starting from 0', () => {
    // Input indices (5 and 8) are deliberately non-contiguous.
    const input = [
      seg(5, 0, 2, ' First unique sentence here.'),
      seg(8, 2, 4, ' Second different sentence there.')
    ];
    const result = deduplicateSegments(input);
    // NOTE(review): an empty result would pass vacuously; the length is not asserted.
    result.forEach((s, i) => expect(s.index).toBe(i));
  });

  it('runs the full pipeline: trim → remove empty → merge → ngram → merge → reindex', () => {
    const input = [
      seg(0, 0, 2, ' Good morning everyone.'),
      seg(1, 2, 3, ' '), // empty — removed
      seg(2, 3, 5, ' Good morning everyone.'), // duplicate — merged
      seg(3, 5, 7, ' Welcome to our presentation today.')
    ];
    const result = deduplicateSegments(input);
    expect(result).toHaveLength(2);
    expect(result[0].text).toBe('Good morning everyone.');
    expect(result[1].text).toBe('Welcome to our presentation today.');
    expect(result[0].index).toBe(0);
    expect(result[1].index).toBe(1);
  });
});
|
||||
@@ -7,7 +7,6 @@ const {
|
||||
mockGetJob,
|
||||
mockUpdateJob,
|
||||
mockSetJobStatus,
|
||||
mockDeduplicateSegments,
|
||||
mockWriteOutputs,
|
||||
mockSendNotification,
|
||||
mockCleanupJobTmp,
|
||||
@@ -16,7 +15,6 @@ const {
|
||||
mockGetJob: vi.fn(),
|
||||
mockUpdateJob: vi.fn(),
|
||||
mockSetJobStatus: vi.fn(),
|
||||
mockDeduplicateSegments: vi.fn((segs: Segment[]) => segs),
|
||||
mockWriteOutputs: vi.fn(),
|
||||
mockSendNotification: vi.fn(),
|
||||
mockCleanupJobTmp: vi.fn(),
|
||||
@@ -29,10 +27,6 @@ vi.mock('$lib/server/db.js', () => ({
|
||||
setJobStatus: mockSetJobStatus
|
||||
}));
|
||||
|
||||
vi.mock('$lib/server/postprocess.js', () => ({
|
||||
deduplicateSegments: mockDeduplicateSegments
|
||||
}));
|
||||
|
||||
vi.mock('$lib/server/formatter.js', () => ({
|
||||
writeOutputs: mockWriteOutputs
|
||||
}));
|
||||
@@ -91,7 +85,6 @@ function makeSeg(index: number, text: string): Segment {
|
||||
|
||||
beforeEach(() => {
|
||||
vi.clearAllMocks();
|
||||
mockDeduplicateSegments.mockImplementation((segs: Segment[]) => segs);
|
||||
mockWriteOutputs.mockResolvedValue({
|
||||
srt: '/out/dir/title.srt',
|
||||
txt: '/out/dir/title.txt',
|
||||
@@ -218,25 +211,21 @@ describe('POST /api/webhook/[jobId] — whisper failure', () => {
|
||||
describe('POST /api/webhook/[jobId] — success with segments', () => {
|
||||
const segments = [makeSeg(0, 'Hello world.'), makeSeg(1, 'This is a test.')];
|
||||
|
||||
it('runs deduplication on received segments', async () => {
|
||||
it('passes received segments through unchanged', async () => {
|
||||
mockGetJob.mockReturnValue(makeJob('job-3'));
|
||||
await POST(makeEvent('job-3', makeWhisperJob({ segments })) as any);
|
||||
expect(mockDeduplicateSegments).toHaveBeenCalledWith(segments);
|
||||
expect(mockWriteOutputs).toHaveBeenCalledWith(segments, 'Test Video', 'job-3');
|
||||
});
|
||||
|
||||
it('calls writeOutputs with the deduplicated segments and job title', async () => {
|
||||
it('calls writeOutputs with the received segments and job title', async () => {
|
||||
mockGetJob.mockReturnValue(makeJob('job-4', 'My Lecture'));
|
||||
const deduped = [makeSeg(0, 'Hello world.')];
|
||||
mockDeduplicateSegments.mockReturnValue(deduped);
|
||||
|
||||
await POST(makeEvent('job-4', makeWhisperJob({ segments })) as any);
|
||||
expect(mockWriteOutputs).toHaveBeenCalledWith(deduped, 'My Lecture', 'job-4');
|
||||
expect(mockWriteOutputs).toHaveBeenCalledWith(segments, 'My Lecture', 'job-4');
|
||||
});
|
||||
|
||||
it('stores serialised segments_json in the database', async () => {
|
||||
mockGetJob.mockReturnValue(makeJob('job-5'));
|
||||
const deduped = [makeSeg(0, 'Result text.')];
|
||||
mockDeduplicateSegments.mockReturnValue(deduped);
|
||||
|
||||
await POST(makeEvent('job-5', makeWhisperJob({ segments })) as any);
|
||||
|
||||
@@ -244,7 +233,7 @@ describe('POST /api/webhook/[jobId] — success with segments', () => {
|
||||
expect.objectContaining({
|
||||
id: 'job-5',
|
||||
status: 'done',
|
||||
segmentsJson: JSON.stringify(deduped)
|
||||
segmentsJson: JSON.stringify(segments)
|
||||
})
|
||||
);
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user