refactor(transcript): drop Tonemark rewrite
All checks were successful
Build & Push Docker Image / test (push) Successful in 10s
Build & Push Docker Image / build-and-push (push) Successful in 50s

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
2026-05-12 00:10:32 +02:00
parent df50e74939
commit 929c482497
10 changed files with 161 additions and 540 deletions

42
package-lock.json generated
View File

@@ -12,7 +12,8 @@
"better-sqlite3": "^12.9.0",
"form-data": "^4.0.5",
"node-fetch": "^3.3.2",
"web-push": "^3.6.7"
"web-push": "^3.6.7",
"youtube-transcript": "^1.3.1"
},
"devDependencies": {
"@sveltejs/adapter-auto": "^7.0.1",
@@ -89,6 +90,27 @@
"node": ">=18"
}
},
"node_modules/@emnapi/core": {
"version": "1.10.0",
"resolved": "https://registry.npmjs.org/@emnapi/core/-/core-1.10.0.tgz",
"integrity": "sha512-yq6OkJ4p82CAfPl0u9mQebQHKPJkY7WrIuk205cTYnYe+k2Z8YBh11FrbRG/H6ihirqcacOgl2BIO8oyMQLeXw==",
"license": "MIT",
"optional": true,
"dependencies": {
"@emnapi/wasi-threads": "1.2.1",
"tslib": "^2.4.0"
}
},
"node_modules/@emnapi/runtime": {
"version": "1.10.0",
"resolved": "https://registry.npmjs.org/@emnapi/runtime/-/runtime-1.10.0.tgz",
"integrity": "sha512-ewvYlk86xUoGI0zQRNq/mC+16R1QeDlKQy21Ki3oSYXNgLb45GV1P6A0M+/s6nyCuNDqe5VpaY84BzXGwVbwFA==",
"license": "MIT",
"optional": true,
"dependencies": {
"tslib": "^2.4.0"
}
},
"node_modules/@emnapi/wasi-threads": {
"version": "1.2.1",
"resolved": "https://registry.npmjs.org/@emnapi/wasi-threads/-/wasi-threads-1.2.1.tgz",
@@ -896,7 +918,6 @@
"resolved": "https://registry.npmjs.org/@sveltejs/kit/-/kit-2.59.1.tgz",
"integrity": "sha512-d8OON70AphLdDesuTIl//M2O6fRTIicX8aYv8vhCiYEhTTI2OboKqey0Hu1A4VFhqwgqtq0vKDmPFGkw8kKmgw==",
"license": "MIT",
"peer": true,
"dependencies": {
"@standard-schema/spec": "^1.0.0",
"@sveltejs/acorn-typescript": "^1.0.5",
@@ -938,7 +959,6 @@
"resolved": "https://registry.npmjs.org/@sveltejs/vite-plugin-svelte/-/vite-plugin-svelte-7.1.1.tgz",
"integrity": "sha512-FOJdbE5pxae68DoTBJ49t1dIA7TSmMHR6CsuJhX90cO/UfrEMHA7KJNUj3WdZuUDJPu4ujqpJ2Tgqd2gTWr6Xg==",
"license": "MIT",
"peer": true,
"dependencies": {
"deepmerge": "^4.3.1",
"magic-string": "^0.30.21",
@@ -1313,7 +1333,6 @@
"integrity": "sha512-38C0/Ddb7HcRG0Z4/DUem8x57d2p9jYgp18mkaYswEOQBGsI1CG4f/hjm0ZCeaJfWhSZ4k7jgs29V1Zom7Ki9A==",
"dev": true,
"license": "MIT",
"peer": true,
"dependencies": {
"@bcoe/v8-coverage": "^1.0.2",
"@vitest/utils": "4.1.5",
@@ -1467,7 +1486,6 @@
"resolved": "https://registry.npmjs.org/acorn/-/acorn-8.16.0.tgz",
"integrity": "sha512-UVJyE9MttOsBQIDKw1skb9nAwQuR5wuGD3+82K6JgJlm/Y+KI92oNsMNGZCYdDsVtRHSak0pcV5Dno5+4jh9sw==",
"license": "MIT",
"peer": true,
"bin": {
"acorn": "bin/acorn"
},
@@ -3021,7 +3039,6 @@
"resolved": "https://registry.npmjs.org/rollup/-/rollup-4.60.3.tgz",
"integrity": "sha512-pAQK9HalE84QSm4Po3EmWIZPd3FnjkShVkiMlz1iligWYkWQ7wHYd1PF/T7QZ5TVSD6uSTon5gBVMSM4JfBV+A==",
"license": "MIT",
"peer": true,
"dependencies": {
"@types/estree": "1.0.8"
},
@@ -3255,7 +3272,6 @@
"resolved": "https://registry.npmjs.org/svelte/-/svelte-5.55.5.tgz",
"integrity": "sha512-2uCs/LZ9us+AktdzYJM8OcxQ8qnPS1kpaO7syGT/MgO+6Qr1Ybl+TqPq+97u7PHqmmMlye5ZkoyXONy5mjjAbw==",
"license": "MIT",
"peer": true,
"dependencies": {
"@jridgewell/remapping": "^2.3.4",
"@jridgewell/sourcemap-codec": "^1.5.0",
@@ -3428,7 +3444,6 @@
"integrity": "sha512-y2TvuxSZPDyQakkFRPZHKFm+KKVqIisdg9/CZwm9ftvKXLP8NRWj38/ODjNbr43SsoXqNuAisEf1GdCxqWcdBw==",
"devOptional": true,
"license": "Apache-2.0",
"peer": true,
"bin": {
"tsc": "bin/tsc",
"tsserver": "bin/tsserver"
@@ -3455,7 +3470,6 @@
"resolved": "https://registry.npmjs.org/vite/-/vite-8.0.10.tgz",
"integrity": "sha512-rZuUu9j6J5uotLDs+cAA4O5H4K1SfPliUlQwqa6YEwSrWDZzP4rhm00oJR5snMewjxF5V/K3D4kctsUTsIU9Mw==",
"license": "MIT",
"peer": true,
"dependencies": {
"lightningcss": "^1.32.0",
"picomatch": "^4.0.4",
@@ -3553,7 +3567,6 @@
"integrity": "sha512-9Xx1v3/ih3m9hN+SbfkUyy0JAs72ap3r7joc87XL6jwF0jGg6mFBvQ1SrwaX+h8BlkX6Hz9shdd1uo6AF+ZGpg==",
"dev": true,
"license": "MIT",
"peer": true,
"dependencies": {
"@vitest/expect": "4.1.5",
"@vitest/mocker": "4.1.5",
@@ -3689,6 +3702,15 @@
"integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==",
"license": "ISC"
},
"node_modules/youtube-transcript": {
"version": "1.3.1",
"resolved": "https://registry.npmjs.org/youtube-transcript/-/youtube-transcript-1.3.1.tgz",
"integrity": "sha512-NDCjwad113TGybbYF51y9Z4tcwzBHUZWQdF9veULNca18L+FdDbHHtTHIr69WVa3bB90l67S8kN0HtL2JO9fhg==",
"license": "MIT",
"engines": {
"node": ">=18.0.0"
}
},
"node_modules/zimmerframe": {
"version": "1.1.4",
"resolved": "https://registry.npmjs.org/zimmerframe/-/zimmerframe-1.1.4.tgz",

View File

@@ -34,6 +34,7 @@
"better-sqlite3": "^12.9.0",
"form-data": "^4.0.5",
"node-fetch": "^3.3.2",
"web-push": "^3.6.7"
"web-push": "^3.6.7",
"youtube-transcript": "^1.3.1"
}
}

View File

@@ -1,8 +1,9 @@
import { execFile } from 'child_process';
import { promisify } from 'util';
import { existsSync } from 'fs';
import { mkdir, unlink, writeFile } from 'fs/promises';
import { mkdir, writeFile } from 'fs/promises';
import { join } from 'path';
import { fetchTranscript, type TranscriptResponse } from 'youtube-transcript';
const execFileAsync = promisify(execFile);
const TMP_DIR = join(process.env.DATA_DIR ?? '/tmp/.whisper-pwa', 'downloads');
@@ -26,43 +27,33 @@ export interface AudioResult {
export type DownloadResult = CaptionResult | AudioResult;
/** Try to get auto-generated captions from YouTube. Returns null if unavailable. */
async function tryGetCaptions(url: string, outDir: string): Promise<CaptionResult | null> {
const jsonPath = join(outDir, 'info.json');
async function tryGetCaptions(url: string, _outDir: string): Promise<CaptionResult | null> {
try {
await execFileAsync('yt-dlp', [
'--write-auto-subs',
'--sub-langs', 'en.*',
'--skip-download',
'--write-info-json',
'--no-playlist',
'-o', join(outDir, '%(title)s.%(ext)s'),
url
]);
// Find the VTT/SRT file
const { readdirSync } = await import('fs');
const files = readdirSync(outDir);
const vttFile = files.find((f) => f.endsWith('.vtt') || f.endsWith('.srt'));
if (!vttFile) return null;
let title = 'Untitled';
if (existsSync(jsonPath)) {
try {
const info = JSON.parse((await import('fs')).readFileSync(jsonPath, 'utf8'));
title = info.title ?? title;
} catch { /* ignore */ }
}
const content = (await import('fs')).readFileSync(join(outDir, vttFile), 'utf8');
const segments = parseVtt(content);
const transcript = await fetchTranscript(url, { lang: 'en' });
const segments = transcriptEntriesToSegments(transcript);
if (segments.length === 0) return null;
const title = await getYouTubeTitle(url);
return { type: 'captions', segments, title };
} catch {
return null;
}
}
/**
 * Look up a video's title by asking yt-dlp for its metadata JSON.
 *
 * Best-effort: any failure (yt-dlp unavailable, non-zero exit, oversized or
 * malformed output, missing/non-string title) yields the fallback 'Untitled'.
 *
 * @param url - Video URL handed to yt-dlp as a positional argument.
 * @returns The `title` field of the metadata, or 'Untitled'.
 */
async function getYouTubeTitle(url: string): Promise<string> {
	try {
		const { stdout } = await execFileAsync(
			'yt-dlp',
			[
				'--dump-single-json',
				'--skip-download',
				'--no-playlist',
				// End of options: a url beginning with '-' must never be parsed as a yt-dlp flag.
				'--',
				url
			],
			// The full metadata dump can be large; raise execFile's default stdout cap
			// so a big payload does not silently degrade the title to 'Untitled'.
			{ maxBuffer: 64 * 1024 * 1024 }
		);
		const info: unknown = JSON.parse(stdout);
		const title =
			typeof info === 'object' && info !== null
				? (info as { title?: unknown }).title
				: undefined;
		// Only trust the field when it really is a string.
		return typeof title === 'string' ? title : 'Untitled';
	} catch {
		return 'Untitled';
	}
}
/** Download best audio from YouTube. Returns path to audio file. */
async function downloadAudio(url: string, outDir: string): Promise<{ audioPath: string; title: string }> {
await execFileAsync('yt-dlp', [
@@ -124,39 +115,22 @@ export async function cleanupJobTmp(jobId: string) {
} catch { /* ignore */ }
}
/** Parse a WebVTT string into segments. */
function parseVtt(
content: string
export function transcriptEntriesToSegments(
entries: TranscriptResponse[]
): Array<{ index: number; start: number; end: number; text: string; words: [] }> {
const segments: Array<{ index: number; start: number; end: number; text: string; words: [] }> = [];
const blocks = content.split(/\n\n+/);
let index = 0;
for (const block of blocks) {
const lines = block.trim().split('\n');
const timeLine = lines.find((l) => l.includes('-->'));
if (!timeLine) continue;
const [startStr, endStr] = timeLine.split('-->').map((s) => s.trim().split(' ')[0]);
const start = vttTimeToSec(startStr);
const end = vttTimeToSec(endStr);
const text = lines
.filter((l) => !l.includes('-->') && !/^\d+$/.test(l.trim()) && l.trim())
.join(' ')
.replace(/<[^>]+>/g, '')
.trim();
if (text) {
segments.push({ index: index++, start, end, text, words: [] });
}
}
return segments;
}
function vttTimeToSec(t: string): number {
const parts = t.split(':').map(Number);
if (parts.length === 3) return parts[0] * 3600 + parts[1] * 60 + parts[2];
if (parts.length === 2) return parts[0] * 60 + parts[1];
return parts[0];
const useMilliseconds = entries.some((entry) => entry.offset > 1000 || entry.duration > 1000);
return entries
.map((entry) => {
const start = useMilliseconds ? entry.offset / 1000 : entry.offset;
const duration = useMilliseconds ? entry.duration / 1000 : entry.duration;
return {
index: 0,
start,
end: start + duration,
text: entry.text.trim(),
words: [] as []
};
})
.filter((entry) => entry.text.length > 0)
.map((entry, index) => ({ ...entry, index }));
}

View File

@@ -96,15 +96,13 @@ async function runJob(
if (captionSegments) {
// Caption fast path — skip whisper
const { deduplicateSegments } = await import('./postprocess.js');
const { writeOutputs } = await import('./formatter.js');
const segments = deduplicateSegments(captionSegments);
const paths = await writeOutputs(segments, title, jobId);
const paths = await writeOutputs(captionSegments, title, jobId);
updateJob({
id: jobId,
status: 'done',
progress: 100,
segmentsJson: JSON.stringify(segments),
segmentsJson: JSON.stringify(captionSegments),
outputDir: paths.srt.replace(/\/[^/]+$/, '')
});
emitProgress(jobId, { type: 'done' });

View File

@@ -1,235 +0,0 @@
import type { Segment } from '$lib/types.js';
// ── Collapse consecutive repeated phrases within a segment's text ────────────
/**
 * Repeatedly applies `collapseOnce` until a pass leaves the text unchanged
 * (or reproduces the text seen before the previous pass), then returns it.
 */
function collapseRepeats(text: string): string {
	let previous = '';
	let current = text;
	for (;;) {
		const collapsed = collapseOnce(current);
		const isStable = collapsed === current;
		const isRevisit = collapsed === previous;
		if (isRevisit || isStable) {
			return collapsed;
		}
		previous = current;
		current = collapsed;
	}
}
/**
 * Runs a single replacement pass that drops the second copy of any phrase of
 * at least 10 characters that is immediately repeated after whitespace
 * (case-insensitive).
 */
function collapseOnce(text: string): string {
	const repeatedPhrase = /\b(.{10,}?)\s+\1\b/gi;
	const collapsed = text.replace(repeatedPhrase, '$1');
	return collapsed;
}
// ── Merge consecutive segments with identical (or near-identical) text ───────
/**
 * Produces a comparison key: lower-cased, with every character that is neither
 * a word character nor whitespace removed, whitespace runs collapsed to a
 * single space, and the ends trimmed.
 */
function normalise(s: string) {
	const lowered = s.toLowerCase();
	const withoutPunctuation = lowered.replace(/[^\w\s]/g, '');
	const singleSpaced = withoutPunctuation.replace(/\s+/g, ' ');
	return singleSpaced.trim();
}
/**
 * Folds runs of adjacent segments whose normalised text is equal into the
 * first segment of the run, extending that segment's `end` to the end of the
 * latest repeat. Segments are shallow-copied before being kept.
 */
function mergeConsecutive(segments: Segment[]): Segment[] {
	const merged: Segment[] = [];
	segments.forEach((segment) => {
		const tail = merged[merged.length - 1];
		if (!tail || normalise(tail.text) !== normalise(segment.text)) {
			merged.push({ ...segment });
			return;
		}
		// Same text as the previously kept segment: only stretch its end time.
		tail.end = segment.end;
	});
	return merged;
}
// ── Collapse rolling prefix/suffix chains from backend segment hypotheses ──────
// Largest gap (current.start - last.end) at which two segments are still
// considered part of the same rolling chain in collapseIncrementalSegments.
const MAX_CHAIN_GAP_SECS = 0.15;
// Minimum normalised word count for isMeaningfulPhrase.
const MIN_MEANINGFUL_WORDS = 2;
// Minimum length of the space-joined normalised words for isMeaningfulPhrase.
const MIN_MEANINGFUL_CHARS = 8;
// Minimum suffix/prefix overlap (in words) that triggers trimming the
// overlapping leading words off the following segment.
const MIN_OVERLAP_WORDS = 1;
/** Splits text on whitespace runs and returns the non-empty tokens. */
function splitWords(text: string): string[] {
	const tokens = text.trim().split(/\s+/);
	return tokens.filter((token) => token.length > 0);
}
/**
 * Splits text into words, lower-cases each one, strips every non-word
 * character from it, and discards words that end up empty.
 */
function normaliseWords(text: string): string[] {
	const cleaned: string[] = [];
	for (const raw of splitWords(text)) {
		const word = raw.toLowerCase().replace(/[^\w]/g, '');
		if (word) {
			cleaned.push(word);
		}
	}
	return cleaned;
}
/** True when both arrays have the same length and identical elements position by position. */
function arraysEqual(a: string[], b: string[]): boolean {
	if (a.length !== b.length) {
		return false;
	}
	for (let position = 0; position < a.length; position++) {
		if (a[position] !== b[position]) {
			return false;
		}
	}
	return true;
}
/** True when `prefix` is no longer than `full` and equals the leading words of `full`. */
function startsWithWords(full: string[], prefix: string[]): boolean {
	if (prefix.length > full.length) {
		return false;
	}
	const head = full.slice(0, prefix.length);
	return arraysEqual(head, prefix);
}
/** True when `suffix` is no longer than `full` and equals the trailing words of `full`. */
function endsWithWords(full: string[], suffix: string[]): boolean {
	if (suffix.length > full.length) {
		return false;
	}
	const tail = full.slice(full.length - suffix.length);
	return arraysEqual(tail, suffix);
}
/**
 * Returns the largest word count `k` such that the last `k` words of `left`
 * equal the first `k` words of `right`, or 0 when there is no such overlap.
 */
function suffixPrefixOverlap(left: string[], right: string[]): number {
	let size = Math.min(left.length, right.length);
	// Try the longest candidate first so the first hit is the maximum overlap.
	while (size >= 1) {
		const leftTail = left.slice(left.length - size);
		const rightHead = right.slice(0, size);
		if (arraysEqual(leftTail, rightHead)) {
			return size;
		}
		size -= 1;
	}
	return 0;
}
/**
 * True when the phrase has at least MIN_MEANINGFUL_WORDS words and its
 * space-joined form is at least MIN_MEANINGFUL_CHARS characters long.
 */
function isMeaningfulPhrase(words: string[]): boolean {
	if (words.length < MIN_MEANINGFUL_WORDS) {
		return false;
	}
	const phrase = words.join(' ');
	return phrase.length >= MIN_MEANINGFUL_CHARS;
}
/**
 * True when the segment lasts at most 0.2 (end - start), or has at most two
 * words, or its space-joined words are at most 16 characters long.
 */
function isShortCarryover(seg: Segment, words: string[]): boolean {
	const duration = seg.end - seg.start;
	if (duration <= 0.2) {
		return true;
	}
	if (words.length <= 2) {
		return true;
	}
	return words.join(' ').length <= 16;
}
/** Drops the first `count` whitespace-separated words of `text` and re-joins the rest with single spaces. */
function trimLeadingWords(text: string, count: number): string {
	const remaining = splitWords(text).slice(count);
	const rejoined = remaining.join(' ');
	return rejoined.trim();
}
/**
 * Collapses chains of near-contiguous segments where one segment's text is a
 * grown, repeated, or overlapping version of the previously kept segment.
 *
 * Each incoming segment is copied with trimmed text and compared against the
 * last kept segment; depending on the relationship it replaces that segment's
 * text, is absorbed into it, has its overlapping leading words trimmed, or is
 * kept as-is. Input segment objects are not mutated (only copies are stored).
 *
 * @param segments - Segments in chronological order.
 * @returns A new array of collapsed segments (indices are not renumbered here).
 */
function collapseIncrementalSegments(segments: Segment[]): Segment[] {
const out: Segment[] = [];
for (const seg of segments) {
// Work on a copy so later edits to `current`/`last` never touch the caller's objects.
let current: Segment = {
...seg,
text: seg.text.trim()
};
// Skip segments that are empty after trimming.
if (!current.text) continue;
const last = out[out.length - 1];
// First kept segment: nothing to compare against.
if (!last) {
out.push(current);
continue;
}
// A gap larger than MAX_CHAIN_GAP_SECS breaks the chain; keep the segment untouched.
const gap = current.start - last.end;
if (gap > MAX_CHAIN_GAP_SECS) {
out.push(current);
continue;
}
const lastWords = normaliseWords(last.text);
const currentWords = normaliseWords(current.text);
// Text that normalises to no words (e.g. punctuation only) cannot be compared.
if (lastWords.length === 0 || currentWords.length === 0) {
out.push(current);
continue;
}
// Case 1: current is a longer text that starts with the previous text —
// replace the previous segment's text/end/words with the longer version.
if (
currentWords.length > lastWords.length &&
startsWithWords(currentWords, lastWords) &&
(isMeaningfulPhrase(lastWords) || isShortCarryover(last, lastWords))
) {
last.text = current.text;
last.end = current.end;
last.words = current.words;
continue;
}
// Case 2: current only repeats the tail of the previous text —
// drop it and extend the previous segment's end time.
if (
endsWithWords(lastWords, currentWords) &&
(isMeaningfulPhrase(currentWords) || isShortCarryover(current, currentWords))
) {
last.end = Math.max(last.end, current.end);
continue;
}
// Case 3: the previous text's ending words reappear at the start of current —
// strip that overlap from current before keeping it.
// NOTE(review): the overlap is counted on normalised words (punctuation-only
// tokens removed) but trimLeadingWords counts raw whitespace-split words;
// the two counts can differ when the text contains punctuation-only tokens.
const overlapWords = suffixPrefixOverlap(lastWords, currentWords);
if (overlapWords >= MIN_OVERLAP_WORDS) {
const trimmedText = trimLeadingWords(current.text, overlapWords);
// Nothing left after trimming: current was entirely overlap, so absorb it.
if (!trimmedText) {
last.end = Math.max(last.end, current.end);
continue;
}
// Word-level data no longer matches the trimmed text, so it is cleared;
// the start is clamped so it does not precede the previous segment's end.
current = {
...current,
start: Math.max(current.start, last.end),
text: trimmedText,
words: []
};
}
out.push(current);
}
return out;
}
// ── N-gram deduplication ─────────────────────────────────────────────────────
// Number of words per n-gram used by jaccardSimilarity.
const NGRAM_N = 6;
// Maximum number of trailing characters of recently kept text that ngramDedup compares against.
const LOOKBACK_CHARS = 500;
// Jaccard similarity at or above which ngramDedup drops a segment as a duplicate.
const SIMILARITY_THRESHOLD = 0.6;
/**
 * Builds the list of word n-grams of `text`.
 *
 * The text is lower-cased and split on whitespace; empty tokens (which a plain
 * split produces for leading or trailing whitespace) are discarded so that no
 * n-gram starts or ends with a phantom empty word.
 *
 * @param text - Source text.
 * @param n - Number of consecutive words per n-gram.
 * @returns Each run of `n` consecutive words joined by a single space, in
 *          order; empty when the text has fewer than `n` words.
 */
function ngrams(text: string, n: number): string[] {
	const words = text.toLowerCase().split(/\s+/).filter(Boolean);
	const grams: string[] = [];
	for (let i = 0; i <= words.length - n; i++) {
		grams.push(words.slice(i, i + n).join(' '));
	}
	return grams;
}
/**
 * Jaccard similarity of the two texts' NGRAM_N-word n-gram sets: shared
 * n-grams divided by distinct n-grams overall. Returns 0 when neither text
 * yields any n-gram, so short texts never count as similar.
 */
function jaccardSimilarity(a: string, b: string): number {
	const left = new Set(ngrams(a, NGRAM_N));
	const right = new Set(ngrams(b, NGRAM_N));
	if (left.size === 0 && right.size === 0) {
		return 0;
	}
	let shared = 0;
	for (const gram of left) {
		if (right.has(gram)) {
			shared += 1;
		}
	}
	// |A ∪ B| = |A| + |B| - |A ∩ B|
	const unionSize = left.size + right.size - shared;
	if (unionSize === 0) {
		return 0;
	}
	return shared / unionSize;
}
/**
 * Keeps a segment unless its text is at least SIMILARITY_THRESHOLD similar to
 * the recent context: the joined text of the last 20 kept segments, limited to
 * its final LOOKBACK_CHARS characters.
 */
function ngramDedup(segments: Segment[]): Segment[] {
	const kept: Segment[] = [];
	segments.forEach((segment) => {
		const recentTexts = kept.slice(-20).map((entry) => entry.text);
		const context = recentTexts.join(' ').slice(-LOOKBACK_CHARS);
		const isDuplicate =
			context.length > 0 &&
			jaccardSimilarity(segment.text, context) >= SIMILARITY_THRESHOLD;
		if (!isDuplicate) {
			kept.push(segment);
		}
	});
	return kept;
}
// ── Full deduplication pipeline ──────────────────────────────────────────────
/**
 * Full clean-up pipeline for transcript segments.
 *
 * Trims each segment and collapses repeated phrases inside it, drops segments
 * left empty, then runs: rolling-chain collapse, consecutive merge, n-gram
 * dedup, rolling-chain collapse again, consecutive merge again. The surviving
 * segments are renumbered from 0.
 *
 * @param segments - Raw segments; a non-array value yields an empty result.
 * @returns The cleaned, re-indexed segments.
 */
export function deduplicateSegments(segments: Segment[]): Segment[] {
	if (!Array.isArray(segments)) {
		return [];
	}

	// Per-segment text clean-up, then discard anything that became empty.
	const cleaned: Segment[] = segments
		.map((segment) => ({ ...segment, text: collapseRepeats(segment.text.trim()) }))
		.filter((segment) => segment.text.length > 0);

	// The second collapse/merge round catches adjacencies created by the n-gram removals.
	const passes: Array<(input: Segment[]) => Segment[]> = [
		collapseIncrementalSegments,
		mergeConsecutive,
		ngramDedup,
		collapseIncrementalSegments,
		mergeConsecutive
	];
	let deduped = cleaned;
	for (const pass of passes) {
		deduped = pass(deduped);
	}

	for (const [position, segment] of deduped.entries()) {
		segment.index = position;
	}
	return deduped;
}

View File

@@ -1,10 +1,9 @@
import { json, error } from '@sveltejs/kit';
import { getJob, updateJob } from '$lib/server/db.js';
import { deduplicateSegments } from '$lib/server/postprocess.js';
import { writeOutputs } from '$lib/server/formatter.js';
import type { Segment } from '$lib/types.js';
/** POST /api/jobs/[id]/reprocess — re-run post-processing and regenerate all output files. */
/** POST /api/jobs/[id]/reprocess — regenerate output files from stored canonical segments. */
export async function POST({ params }) {
const job = getJob(params.id);
if (!job) throw error(404, 'Job not found');
@@ -14,8 +13,7 @@ export async function POST({ params }) {
}
try {
const rawSegments = JSON.parse(job.segmentsJson) as Segment[];
const segments = deduplicateSegments(rawSegments);
const segments = JSON.parse(job.segmentsJson) as Segment[];
const paths = await writeOutputs(segments, job.title, job.id);
const outputDir = paths.srt.replace(/\/[^/]+$/, '');

View File

@@ -1,6 +1,5 @@
import { json, error } from '@sveltejs/kit';
import { getJob, updateJob, setJobStatus } from '$lib/server/db.js';
import { deduplicateSegments } from '$lib/server/postprocess.js';
import { writeOutputs } from '$lib/server/formatter.js';
import { sendNotification } from '$lib/server/push.js';
import { cleanupJobTmp } from '$lib/server/downloader.js';
@@ -40,8 +39,7 @@ try {
setJobStatus(jobId, 'processing', 90);
emitProgress(jobId, { type: 'status', status: 'processing', progress: 90 });
const rawSegments = (whisperJob.segments ?? []) as Segment[];
const segments = deduplicateSegments(rawSegments);
const segments = (whisperJob.segments ?? []) as Segment[];
const paths = await writeOutputs(segments, job.title, jobId);
const outputDir = paths.srt.replace(/\/[^/]+$/, '');

View File

@@ -0,0 +1,80 @@
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
import { rm } from 'fs/promises';
import type { TranscriptResponse } from 'youtube-transcript';
// Mock functions are created via vi.hoisted so the vi.mock factories below can reference them.
const { mockExecFile, mockFetchTranscript } = vi.hoisted(() => ({
mockExecFile: vi.fn(),
mockFetchTranscript: vi.fn()
}));
// Unique per-run data directory; removed again in afterEach.
const TEST_DATA_DIR = `/tmp/tonemark-downloader-test-${Date.now()}`;
// NOTE(review): the downloader module reads DATA_DIR when it is first loaded;
// confirm this stub takes effect before the module under test is evaluated.
vi.stubEnv('DATA_DIR', TEST_DATA_DIR);
// Replace child_process.execFile (used by the downloader to run yt-dlp) with the mock.
vi.mock('child_process', () => ({
execFile: mockExecFile
}));
// Replace the transcript fetcher so no network request is made.
vi.mock('youtube-transcript', () => ({
fetchTranscript: mockFetchTranscript
}));
import { downloadYouTube, transcriptEntriesToSegments } from '$lib/server/downloader.js';
// Reset all mocks and make the mocked execFile succeed with yt-dlp-style metadata JSON.
beforeEach(() => {
	vi.clearAllMocks();
	mockExecFile.mockImplementation((...args: unknown[]) => {
		// The Node-style callback is always the last argument, with or without an options object.
		const cb = args.at(-1) as (...callbackArgs: unknown[]) => void;
		// The downloader wraps execFile with util.promisify and destructures `{ stdout }`.
		// A plain mock has no custom promisify hook, so the promise resolves with the
		// single value passed after the error — it must therefore be the
		// `{ stdout, stderr }` object itself, not stdout and stderr as separate arguments.
		cb(null, { stdout: JSON.stringify({ title: 'Fetched Title' }), stderr: '' });
	});
});
// Best-effort removal of the per-run data directory; failures are ignored.
afterEach(async () => {
	try {
		await rm(TEST_DATA_DIR, { recursive: true, force: true });
	} catch {
		/* cleanup failures must not fail the test */
	}
});
// Unit tests for the pure conversion from transcript entries to segments.
describe('transcriptEntriesToSegments', () => {
it('converts millisecond transcript offsets into second-based segments', () => {
// Offsets/durations well above 1000 are expected to be treated as milliseconds.
const entries: TranscriptResponse[] = [
{ text: 'Hello everyone.', offset: 15240, duration: 4240, lang: 'en' },
{ text: 'Um, welcome to this talk.', offset: 16600, duration: 5080, lang: 'en' }
];
// end = start + duration; overlapping cues are passed through as-is.
expect(transcriptEntriesToSegments(entries)).toEqual([
{ index: 0, start: 15.24, end: 19.48, text: 'Hello everyone.', words: [] },
{ index: 1, start: 16.6, end: 21.68, text: 'Um, welcome to this talk.', words: [] }
]);
});
it('preserves second-based transcript offsets and drops empty text', () => {
// Small values are expected to be kept as seconds; the whitespace-only cue is
// dropped and the remaining cue is indexed from 0.
const entries: TranscriptResponse[] = [
{ text: ' ', offset: 0, duration: 1.5, lang: 'en' },
{ text: 'Clean caption cue', offset: 91.08, duration: 3.72, lang: 'en' }
];
expect(transcriptEntriesToSegments(entries)).toEqual([
{ index: 0, start: 91.08, end: 94.8, text: 'Clean caption cue', words: [] }
]);
});
});
// Integration-style test of the caption path with transcript fetching and yt-dlp mocked.
describe('downloadYouTube', () => {
it('uses fetched transcript entries directly for caption jobs', async () => {
mockFetchTranscript.mockResolvedValue([
{ text: 'Hello everyone.', offset: 15240, duration: 4240, lang: 'en' },
{ text: 'Um, welcome to this talk.', offset: 16600, duration: 5080, lang: 'en' }
] satisfies TranscriptResponse[]);
const result = await downloadYouTube('https://youtube.com/watch?v=qdh_x-uRs9g', 'job-1');
// The transcript is requested for the original URL in English.
expect(mockFetchTranscript).toHaveBeenCalledWith('https://youtube.com/watch?v=qdh_x-uRs9g', {
lang: 'en'
});
// Only type and segments are asserted; the returned title is not checked here.
expect(result).toMatchObject({
type: 'captions',
segments: [
{ index: 0, start: 15.24, end: 19.48, text: 'Hello everyone.', words: [] },
{ index: 1, start: 16.6, end: 21.68, text: 'Um, welcome to this talk.', words: [] }
]
});
});
});

View File

@@ -1,204 +0,0 @@
import { describe, it, expect } from 'vitest';
import {
deduplicateSegments
} from '$lib/server/postprocess.js';
import type { Segment } from '$lib/types.js';
// ── helpers ──────────────────────────────────────────────────────────────────
/** Test helper: builds a segment with the given index, timing and text, and no word-level data. */
function seg(index: number, start: number, end: number, text: string): Segment {
	const segment: Segment = { index, start, end, text, words: [] };
	return segment;
}
// ── collapseRepeats (tested indirectly via deduplicateSegments) ───────────────
// In-segment repeated-phrase collapsing, observed through the public pipeline.
describe('deduplicateSegments — collapseRepeats', () => {
it('leaves text without repetition unchanged', () => {
// The leading space is trimmed by the pipeline; the sentence itself is untouched.
const input = [seg(0, 0, 5, ' Hello world, this is a sentence.')];
const [out] = deduplicateSegments(input);
expect(out.text).toBe('Hello world, this is a sentence.');
});
it('collapses a consecutive repeated phrase inside a segment', () => {
const input = [seg(0, 0, 5, ' the quick brown fox the quick brown fox')];
const [out] = deduplicateSegments(input);
// The phrase must not appear twice any more.
expect(out.text).not.toMatch(/the quick brown fox.*the quick brown fox/i);
});
it('handles multiple repetitions recursively', () => {
// "welcome everyone" = 16 chars — qualifies for the ≥10-char collapse regex
const input = [seg(0, 0, 5, ' welcome everyone welcome everyone welcome everyone')];
const result = deduplicateSegments(input);
const text = result[0]?.text ?? '';
// Only asserts that at least one of the three copies was removed.
expect((text.match(/welcome everyone/gi) ?? []).length).toBeLessThan(3);
});
});
// ── mergeConsecutive ──────────────────────────────────────────────────────────
// Merging of adjacent segments whose normalised text is identical.
describe('deduplicateSegments — mergeConsecutive', () => {
it('merges adjacent segments with identical text', () => {
const input = [
seg(0, 0, 2, ' Hello world.'),
seg(1, 2, 4, ' Hello world.')
];
const result = deduplicateSegments(input);
expect(result).toHaveLength(1);
// The surviving segment spans through the end of the repeat.
expect(result[0].end).toBe(4);
});
it('keeps adjacent segments with different text', () => {
const input = [
seg(0, 0, 2, ' First sentence.'),
seg(1, 2, 4, ' Second sentence.')
];
const result = deduplicateSegments(input);
expect(result).toHaveLength(2);
});
it('normalises punctuation and case for merge comparison', () => {
// Texts differ only in case and punctuation, so they count as the same.
const input = [
seg(0, 0, 2, ' Hello, World!'),
seg(1, 2, 4, ' hello world')
];
const result = deduplicateSegments(input);
expect(result).toHaveLength(1);
});
});
// ── rolling prefix/suffix chain collapse ───────────────────────────────────────
// Collapsing of rolling prefix/suffix chains where successive segments repeat
// or extend the text of the previous one.
describe('deduplicateSegments — rolling backend hypotheses', () => {
it('collapses prefix-growth chains from stored backend segments', () => {
// Each segment either extends the previous text or repeats its tail with near-zero duration.
const input = [
seg(0, 15.24, 16.6, 'Hello everyone.'),
seg(1, 16.6, 19.47, 'Hello everyone. Um, welcome to this talk.'),
seg(2, 19.47, 19.48, 'Um, welcome to this talk.'),
seg(3, 19.48, 21.67, "Um, welcome to this talk. I'll be speaking about small model"),
seg(4, 21.67, 21.68, "I'll be speaking about small model"),
seg(5, 21.68, 24.59, "I'll be speaking about small model inference and a gap that we've")
];
const result = deduplicateSegments(input);
expect(result).toHaveLength(2);
expect(result[0]).toMatchObject({
index: 0,
start: 15.24,
end: 19.48,
text: 'Hello everyone. Um, welcome to this talk.'
});
expect(result[1]).toMatchObject({
index: 1,
start: 19.48,
end: 24.59,
text: "I'll be speaking about small model inference and a gap that we've"
});
});
it('does not collapse similar phrases when there is a real timing gap', () => {
// One second between the segments: both must be kept verbatim.
const input = [
seg(0, 0, 1, 'Hello everyone.'),
seg(1, 2, 4, 'Hello everyone. Welcome back.')
];
const result = deduplicateSegments(input);
expect(result).toHaveLength(2);
expect(result[0].text).toBe('Hello everyone.');
expect(result[1].text).toBe('Hello everyone. Welcome back.');
});
it('collapses tiny one-word carry-over segments from caption-style output', () => {
// Single-word segments are immediately followed by a segment starting with that word.
const input = [
seg(0, 94.8, 96.4, 'world.'),
seg(1, 96.4, 98.96, 'world. And that aspect that I overlooked was'),
seg(2, 98.96, 100.72, 'inference.'),
seg(3, 100.72, 103.92, 'inference. So, as someone who kind of wants to'),
seg(4, 107.19, 107.2, 'and'),
seg(5, 107.2, 109.56, 'and work to understand the problems and the')
];
const result = deduplicateSegments(input);
expect(result).toHaveLength(3);
expect(result[0].text).toBe('world. And that aspect that I overlooked was');
expect(result[1].text).toBe('inference. So, as someone who kind of wants to');
expect(result[2].text).toBe('and work to understand the problems and the');
});
it('trims single-word suffix-prefix overlap between adjacent segments', () => {
// The last word of one segment reappears as the first word of the next.
const input = [
seg(0, 94.8, 96.4, 'world.'),
seg(1, 96.4, 98.96, 'world. And that aspect that I overlooked was'),
seg(2, 120.12, 123.71, 'to find more about inference.'),
seg(3, 123.72, 126.92, "inference. So, I've done a lot of work with VLAM,")
];
const result = deduplicateSegments(input);
expect(result).toHaveLength(3);
expect(result[0].text).toBe('world. And that aspect that I overlooked was');
// The duplicated leading word is removed from the following segment.
expect(result[2].text).toBe("So, I've done a lot of work with VLAM,");
});
});
// ── ngramDedup ────────────────────────────────────────────────────────────────
// N-gram similarity filtering against recently kept text.
describe('deduplicateSegments — ngramDedup', () => {
it('passes through completely unique segments', () => {
const input = [
seg(0, 0, 5, ' The cat sat on the mat quite happily today.'),
seg(1, 5, 10, ' Later the dog ran across the yard chasing a ball.')
];
expect(deduplicateSegments(input)).toHaveLength(2);
});
it('removes a segment that is highly similar to recent context', () => {
// Repeat a long sentence verbatim — should be caught as duplicate
const longText =
' This is a very specific and unique sentence about transcription quality matters greatly.';
const input = [seg(0, 0, 5, longText), seg(1, 5, 10, longText)];
// After mergeConsecutive the second one is already merged, so result is 1
expect(deduplicateSegments(input)).toHaveLength(1);
});
});
// ── deduplicateSegments — full pipeline ──────────────────────────────────────
// End-to-end behaviour of the whole pipeline: empty input, empty segments, re-indexing.
describe('deduplicateSegments — full pipeline', () => {
it('returns empty array for empty input', () => {
expect(deduplicateSegments([])).toEqual([]);
});
it('removes segments whose text is empty after trimming', () => {
const input = [seg(0, 0, 1, ' '), seg(1, 1, 2, ' Hello.')];
const result = deduplicateSegments(input);
expect(result).toHaveLength(1);
expect(result[0].text).toBe('Hello.');
});
it('re-indexes output segments starting from 0', () => {
// Input indices 5 and 8 must come out as 0 and 1.
const input = [
seg(5, 0, 2, ' First unique sentence here.'),
seg(8, 2, 4, ' Second different sentence there.')
];
const result = deduplicateSegments(input);
result.forEach((s, i) => expect(s.index).toBe(i));
});
it('runs the full pipeline: trim → remove empty → merge → ngram → merge → reindex', () => {
const input = [
seg(0, 0, 2, ' Good morning everyone.'),
seg(1, 2, 3, ' '), // empty — removed
seg(2, 3, 5, ' Good morning everyone.'), // duplicate — merged
seg(3, 5, 7, ' Welcome to our presentation today.')
];
const result = deduplicateSegments(input);
expect(result).toHaveLength(2);
expect(result[0].text).toBe('Good morning everyone.');
expect(result[1].text).toBe('Welcome to our presentation today.');
expect(result[0].index).toBe(0);
expect(result[1].index).toBe(1);
});
});

View File

@@ -7,7 +7,6 @@ const {
mockGetJob,
mockUpdateJob,
mockSetJobStatus,
mockDeduplicateSegments,
mockWriteOutputs,
mockSendNotification,
mockCleanupJobTmp,
@@ -16,7 +15,6 @@ const {
mockGetJob: vi.fn(),
mockUpdateJob: vi.fn(),
mockSetJobStatus: vi.fn(),
mockDeduplicateSegments: vi.fn((segs: Segment[]) => segs),
mockWriteOutputs: vi.fn(),
mockSendNotification: vi.fn(),
mockCleanupJobTmp: vi.fn(),
@@ -29,10 +27,6 @@ vi.mock('$lib/server/db.js', () => ({
setJobStatus: mockSetJobStatus
}));
vi.mock('$lib/server/postprocess.js', () => ({
deduplicateSegments: mockDeduplicateSegments
}));
vi.mock('$lib/server/formatter.js', () => ({
writeOutputs: mockWriteOutputs
}));
@@ -91,7 +85,6 @@ function makeSeg(index: number, text: string): Segment {
beforeEach(() => {
vi.clearAllMocks();
mockDeduplicateSegments.mockImplementation((segs: Segment[]) => segs);
mockWriteOutputs.mockResolvedValue({
srt: '/out/dir/title.srt',
txt: '/out/dir/title.txt',
@@ -218,25 +211,21 @@ describe('POST /api/webhook/[jobId] — whisper failure', () => {
describe('POST /api/webhook/[jobId] — success with segments', () => {
const segments = [makeSeg(0, 'Hello world.'), makeSeg(1, 'This is a test.')];
it('runs deduplication on received segments', async () => {
it('passes received segments through unchanged', async () => {
mockGetJob.mockReturnValue(makeJob('job-3'));
await POST(makeEvent('job-3', makeWhisperJob({ segments })) as any);
expect(mockDeduplicateSegments).toHaveBeenCalledWith(segments);
expect(mockWriteOutputs).toHaveBeenCalledWith(segments, 'Test Video', 'job-3');
});
it('calls writeOutputs with the deduplicated segments and job title', async () => {
it('calls writeOutputs with the received segments and job title', async () => {
mockGetJob.mockReturnValue(makeJob('job-4', 'My Lecture'));
const deduped = [makeSeg(0, 'Hello world.')];
mockDeduplicateSegments.mockReturnValue(deduped);
await POST(makeEvent('job-4', makeWhisperJob({ segments })) as any);
expect(mockWriteOutputs).toHaveBeenCalledWith(deduped, 'My Lecture', 'job-4');
expect(mockWriteOutputs).toHaveBeenCalledWith(segments, 'My Lecture', 'job-4');
});
it('stores serialised segments_json in the database', async () => {
mockGetJob.mockReturnValue(makeJob('job-5'));
const deduped = [makeSeg(0, 'Result text.')];
mockDeduplicateSegments.mockReturnValue(deduped);
await POST(makeEvent('job-5', makeWhisperJob({ segments })) as any);
@@ -244,7 +233,7 @@ describe('POST /api/webhook/[jobId] — success with segments', () => {
expect.objectContaining({
id: 'job-5',
status: 'done',
segmentsJson: JSON.stringify(deduped)
segmentsJson: JSON.stringify(segments)
})
);
});