refactor(transcript): drop Tonemark rewrite
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
42
package-lock.json
generated
42
package-lock.json
generated
@@ -12,7 +12,8 @@
|
|||||||
"better-sqlite3": "^12.9.0",
|
"better-sqlite3": "^12.9.0",
|
||||||
"form-data": "^4.0.5",
|
"form-data": "^4.0.5",
|
||||||
"node-fetch": "^3.3.2",
|
"node-fetch": "^3.3.2",
|
||||||
"web-push": "^3.6.7"
|
"web-push": "^3.6.7",
|
||||||
|
"youtube-transcript": "^1.3.1"
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
"@sveltejs/adapter-auto": "^7.0.1",
|
"@sveltejs/adapter-auto": "^7.0.1",
|
||||||
@@ -89,6 +90,27 @@
|
|||||||
"node": ">=18"
|
"node": ">=18"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/@emnapi/core": {
|
||||||
|
"version": "1.10.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/@emnapi/core/-/core-1.10.0.tgz",
|
||||||
|
"integrity": "sha512-yq6OkJ4p82CAfPl0u9mQebQHKPJkY7WrIuk205cTYnYe+k2Z8YBh11FrbRG/H6ihirqcacOgl2BIO8oyMQLeXw==",
|
||||||
|
"license": "MIT",
|
||||||
|
"optional": true,
|
||||||
|
"dependencies": {
|
||||||
|
"@emnapi/wasi-threads": "1.2.1",
|
||||||
|
"tslib": "^2.4.0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/@emnapi/runtime": {
|
||||||
|
"version": "1.10.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/@emnapi/runtime/-/runtime-1.10.0.tgz",
|
||||||
|
"integrity": "sha512-ewvYlk86xUoGI0zQRNq/mC+16R1QeDlKQy21Ki3oSYXNgLb45GV1P6A0M+/s6nyCuNDqe5VpaY84BzXGwVbwFA==",
|
||||||
|
"license": "MIT",
|
||||||
|
"optional": true,
|
||||||
|
"dependencies": {
|
||||||
|
"tslib": "^2.4.0"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/@emnapi/wasi-threads": {
|
"node_modules/@emnapi/wasi-threads": {
|
||||||
"version": "1.2.1",
|
"version": "1.2.1",
|
||||||
"resolved": "https://registry.npmjs.org/@emnapi/wasi-threads/-/wasi-threads-1.2.1.tgz",
|
"resolved": "https://registry.npmjs.org/@emnapi/wasi-threads/-/wasi-threads-1.2.1.tgz",
|
||||||
@@ -896,7 +918,6 @@
|
|||||||
"resolved": "https://registry.npmjs.org/@sveltejs/kit/-/kit-2.59.1.tgz",
|
"resolved": "https://registry.npmjs.org/@sveltejs/kit/-/kit-2.59.1.tgz",
|
||||||
"integrity": "sha512-d8OON70AphLdDesuTIl//M2O6fRTIicX8aYv8vhCiYEhTTI2OboKqey0Hu1A4VFhqwgqtq0vKDmPFGkw8kKmgw==",
|
"integrity": "sha512-d8OON70AphLdDesuTIl//M2O6fRTIicX8aYv8vhCiYEhTTI2OboKqey0Hu1A4VFhqwgqtq0vKDmPFGkw8kKmgw==",
|
||||||
"license": "MIT",
|
"license": "MIT",
|
||||||
"peer": true,
|
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@standard-schema/spec": "^1.0.0",
|
"@standard-schema/spec": "^1.0.0",
|
||||||
"@sveltejs/acorn-typescript": "^1.0.5",
|
"@sveltejs/acorn-typescript": "^1.0.5",
|
||||||
@@ -938,7 +959,6 @@
|
|||||||
"resolved": "https://registry.npmjs.org/@sveltejs/vite-plugin-svelte/-/vite-plugin-svelte-7.1.1.tgz",
|
"resolved": "https://registry.npmjs.org/@sveltejs/vite-plugin-svelte/-/vite-plugin-svelte-7.1.1.tgz",
|
||||||
"integrity": "sha512-FOJdbE5pxae68DoTBJ49t1dIA7TSmMHR6CsuJhX90cO/UfrEMHA7KJNUj3WdZuUDJPu4ujqpJ2Tgqd2gTWr6Xg==",
|
"integrity": "sha512-FOJdbE5pxae68DoTBJ49t1dIA7TSmMHR6CsuJhX90cO/UfrEMHA7KJNUj3WdZuUDJPu4ujqpJ2Tgqd2gTWr6Xg==",
|
||||||
"license": "MIT",
|
"license": "MIT",
|
||||||
"peer": true,
|
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"deepmerge": "^4.3.1",
|
"deepmerge": "^4.3.1",
|
||||||
"magic-string": "^0.30.21",
|
"magic-string": "^0.30.21",
|
||||||
@@ -1313,7 +1333,6 @@
|
|||||||
"integrity": "sha512-38C0/Ddb7HcRG0Z4/DUem8x57d2p9jYgp18mkaYswEOQBGsI1CG4f/hjm0ZCeaJfWhSZ4k7jgs29V1Zom7Ki9A==",
|
"integrity": "sha512-38C0/Ddb7HcRG0Z4/DUem8x57d2p9jYgp18mkaYswEOQBGsI1CG4f/hjm0ZCeaJfWhSZ4k7jgs29V1Zom7Ki9A==",
|
||||||
"dev": true,
|
"dev": true,
|
||||||
"license": "MIT",
|
"license": "MIT",
|
||||||
"peer": true,
|
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@bcoe/v8-coverage": "^1.0.2",
|
"@bcoe/v8-coverage": "^1.0.2",
|
||||||
"@vitest/utils": "4.1.5",
|
"@vitest/utils": "4.1.5",
|
||||||
@@ -1467,7 +1486,6 @@
|
|||||||
"resolved": "https://registry.npmjs.org/acorn/-/acorn-8.16.0.tgz",
|
"resolved": "https://registry.npmjs.org/acorn/-/acorn-8.16.0.tgz",
|
||||||
"integrity": "sha512-UVJyE9MttOsBQIDKw1skb9nAwQuR5wuGD3+82K6JgJlm/Y+KI92oNsMNGZCYdDsVtRHSak0pcV5Dno5+4jh9sw==",
|
"integrity": "sha512-UVJyE9MttOsBQIDKw1skb9nAwQuR5wuGD3+82K6JgJlm/Y+KI92oNsMNGZCYdDsVtRHSak0pcV5Dno5+4jh9sw==",
|
||||||
"license": "MIT",
|
"license": "MIT",
|
||||||
"peer": true,
|
|
||||||
"bin": {
|
"bin": {
|
||||||
"acorn": "bin/acorn"
|
"acorn": "bin/acorn"
|
||||||
},
|
},
|
||||||
@@ -3021,7 +3039,6 @@
|
|||||||
"resolved": "https://registry.npmjs.org/rollup/-/rollup-4.60.3.tgz",
|
"resolved": "https://registry.npmjs.org/rollup/-/rollup-4.60.3.tgz",
|
||||||
"integrity": "sha512-pAQK9HalE84QSm4Po3EmWIZPd3FnjkShVkiMlz1iligWYkWQ7wHYd1PF/T7QZ5TVSD6uSTon5gBVMSM4JfBV+A==",
|
"integrity": "sha512-pAQK9HalE84QSm4Po3EmWIZPd3FnjkShVkiMlz1iligWYkWQ7wHYd1PF/T7QZ5TVSD6uSTon5gBVMSM4JfBV+A==",
|
||||||
"license": "MIT",
|
"license": "MIT",
|
||||||
"peer": true,
|
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@types/estree": "1.0.8"
|
"@types/estree": "1.0.8"
|
||||||
},
|
},
|
||||||
@@ -3255,7 +3272,6 @@
|
|||||||
"resolved": "https://registry.npmjs.org/svelte/-/svelte-5.55.5.tgz",
|
"resolved": "https://registry.npmjs.org/svelte/-/svelte-5.55.5.tgz",
|
||||||
"integrity": "sha512-2uCs/LZ9us+AktdzYJM8OcxQ8qnPS1kpaO7syGT/MgO+6Qr1Ybl+TqPq+97u7PHqmmMlye5ZkoyXONy5mjjAbw==",
|
"integrity": "sha512-2uCs/LZ9us+AktdzYJM8OcxQ8qnPS1kpaO7syGT/MgO+6Qr1Ybl+TqPq+97u7PHqmmMlye5ZkoyXONy5mjjAbw==",
|
||||||
"license": "MIT",
|
"license": "MIT",
|
||||||
"peer": true,
|
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@jridgewell/remapping": "^2.3.4",
|
"@jridgewell/remapping": "^2.3.4",
|
||||||
"@jridgewell/sourcemap-codec": "^1.5.0",
|
"@jridgewell/sourcemap-codec": "^1.5.0",
|
||||||
@@ -3428,7 +3444,6 @@
|
|||||||
"integrity": "sha512-y2TvuxSZPDyQakkFRPZHKFm+KKVqIisdg9/CZwm9ftvKXLP8NRWj38/ODjNbr43SsoXqNuAisEf1GdCxqWcdBw==",
|
"integrity": "sha512-y2TvuxSZPDyQakkFRPZHKFm+KKVqIisdg9/CZwm9ftvKXLP8NRWj38/ODjNbr43SsoXqNuAisEf1GdCxqWcdBw==",
|
||||||
"devOptional": true,
|
"devOptional": true,
|
||||||
"license": "Apache-2.0",
|
"license": "Apache-2.0",
|
||||||
"peer": true,
|
|
||||||
"bin": {
|
"bin": {
|
||||||
"tsc": "bin/tsc",
|
"tsc": "bin/tsc",
|
||||||
"tsserver": "bin/tsserver"
|
"tsserver": "bin/tsserver"
|
||||||
@@ -3455,7 +3470,6 @@
|
|||||||
"resolved": "https://registry.npmjs.org/vite/-/vite-8.0.10.tgz",
|
"resolved": "https://registry.npmjs.org/vite/-/vite-8.0.10.tgz",
|
||||||
"integrity": "sha512-rZuUu9j6J5uotLDs+cAA4O5H4K1SfPliUlQwqa6YEwSrWDZzP4rhm00oJR5snMewjxF5V/K3D4kctsUTsIU9Mw==",
|
"integrity": "sha512-rZuUu9j6J5uotLDs+cAA4O5H4K1SfPliUlQwqa6YEwSrWDZzP4rhm00oJR5snMewjxF5V/K3D4kctsUTsIU9Mw==",
|
||||||
"license": "MIT",
|
"license": "MIT",
|
||||||
"peer": true,
|
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"lightningcss": "^1.32.0",
|
"lightningcss": "^1.32.0",
|
||||||
"picomatch": "^4.0.4",
|
"picomatch": "^4.0.4",
|
||||||
@@ -3553,7 +3567,6 @@
|
|||||||
"integrity": "sha512-9Xx1v3/ih3m9hN+SbfkUyy0JAs72ap3r7joc87XL6jwF0jGg6mFBvQ1SrwaX+h8BlkX6Hz9shdd1uo6AF+ZGpg==",
|
"integrity": "sha512-9Xx1v3/ih3m9hN+SbfkUyy0JAs72ap3r7joc87XL6jwF0jGg6mFBvQ1SrwaX+h8BlkX6Hz9shdd1uo6AF+ZGpg==",
|
||||||
"dev": true,
|
"dev": true,
|
||||||
"license": "MIT",
|
"license": "MIT",
|
||||||
"peer": true,
|
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@vitest/expect": "4.1.5",
|
"@vitest/expect": "4.1.5",
|
||||||
"@vitest/mocker": "4.1.5",
|
"@vitest/mocker": "4.1.5",
|
||||||
@@ -3689,6 +3702,15 @@
|
|||||||
"integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==",
|
"integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==",
|
||||||
"license": "ISC"
|
"license": "ISC"
|
||||||
},
|
},
|
||||||
|
"node_modules/youtube-transcript": {
|
||||||
|
"version": "1.3.1",
|
||||||
|
"resolved": "https://registry.npmjs.org/youtube-transcript/-/youtube-transcript-1.3.1.tgz",
|
||||||
|
"integrity": "sha512-NDCjwad113TGybbYF51y9Z4tcwzBHUZWQdF9veULNca18L+FdDbHHtTHIr69WVa3bB90l67S8kN0HtL2JO9fhg==",
|
||||||
|
"license": "MIT",
|
||||||
|
"engines": {
|
||||||
|
"node": ">=18.0.0"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/zimmerframe": {
|
"node_modules/zimmerframe": {
|
||||||
"version": "1.1.4",
|
"version": "1.1.4",
|
||||||
"resolved": "https://registry.npmjs.org/zimmerframe/-/zimmerframe-1.1.4.tgz",
|
"resolved": "https://registry.npmjs.org/zimmerframe/-/zimmerframe-1.1.4.tgz",
|
||||||
|
|||||||
@@ -34,6 +34,7 @@
|
|||||||
"better-sqlite3": "^12.9.0",
|
"better-sqlite3": "^12.9.0",
|
||||||
"form-data": "^4.0.5",
|
"form-data": "^4.0.5",
|
||||||
"node-fetch": "^3.3.2",
|
"node-fetch": "^3.3.2",
|
||||||
"web-push": "^3.6.7"
|
"web-push": "^3.6.7",
|
||||||
|
"youtube-transcript": "^1.3.1"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,8 +1,9 @@
|
|||||||
import { execFile } from 'child_process';
|
import { execFile } from 'child_process';
|
||||||
import { promisify } from 'util';
|
import { promisify } from 'util';
|
||||||
import { existsSync } from 'fs';
|
import { existsSync } from 'fs';
|
||||||
import { mkdir, unlink, writeFile } from 'fs/promises';
|
import { mkdir, writeFile } from 'fs/promises';
|
||||||
import { join } from 'path';
|
import { join } from 'path';
|
||||||
|
import { fetchTranscript, type TranscriptResponse } from 'youtube-transcript';
|
||||||
|
|
||||||
const execFileAsync = promisify(execFile);
|
const execFileAsync = promisify(execFile);
|
||||||
const TMP_DIR = join(process.env.DATA_DIR ?? '/tmp/.whisper-pwa', 'downloads');
|
const TMP_DIR = join(process.env.DATA_DIR ?? '/tmp/.whisper-pwa', 'downloads');
|
||||||
@@ -26,43 +27,33 @@ export interface AudioResult {
|
|||||||
export type DownloadResult = CaptionResult | AudioResult;
|
export type DownloadResult = CaptionResult | AudioResult;
|
||||||
|
|
||||||
/** Try to get auto-generated captions from YouTube. Returns null if unavailable. */
|
/** Try to get auto-generated captions from YouTube. Returns null if unavailable. */
|
||||||
async function tryGetCaptions(url: string, outDir: string): Promise<CaptionResult | null> {
|
async function tryGetCaptions(url: string, _outDir: string): Promise<CaptionResult | null> {
|
||||||
const jsonPath = join(outDir, 'info.json');
|
|
||||||
try {
|
try {
|
||||||
await execFileAsync('yt-dlp', [
|
const transcript = await fetchTranscript(url, { lang: 'en' });
|
||||||
'--write-auto-subs',
|
const segments = transcriptEntriesToSegments(transcript);
|
||||||
'--sub-langs', 'en.*',
|
|
||||||
'--skip-download',
|
|
||||||
'--write-info-json',
|
|
||||||
'--no-playlist',
|
|
||||||
'-o', join(outDir, '%(title)s.%(ext)s'),
|
|
||||||
url
|
|
||||||
]);
|
|
||||||
|
|
||||||
// Find the VTT/SRT file
|
|
||||||
const { readdirSync } = await import('fs');
|
|
||||||
const files = readdirSync(outDir);
|
|
||||||
const vttFile = files.find((f) => f.endsWith('.vtt') || f.endsWith('.srt'));
|
|
||||||
if (!vttFile) return null;
|
|
||||||
|
|
||||||
let title = 'Untitled';
|
|
||||||
if (existsSync(jsonPath)) {
|
|
||||||
try {
|
|
||||||
const info = JSON.parse((await import('fs')).readFileSync(jsonPath, 'utf8'));
|
|
||||||
title = info.title ?? title;
|
|
||||||
} catch { /* ignore */ }
|
|
||||||
}
|
|
||||||
|
|
||||||
const content = (await import('fs')).readFileSync(join(outDir, vttFile), 'utf8');
|
|
||||||
const segments = parseVtt(content);
|
|
||||||
if (segments.length === 0) return null;
|
if (segments.length === 0) return null;
|
||||||
|
|
||||||
|
const title = await getYouTubeTitle(url);
|
||||||
return { type: 'captions', segments, title };
|
return { type: 'captions', segments, title };
|
||||||
} catch {
|
} catch {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async function getYouTubeTitle(url: string): Promise<string> {
|
||||||
|
try {
|
||||||
|
const { stdout } = await execFileAsync('yt-dlp', [
|
||||||
|
'--dump-single-json',
|
||||||
|
'--skip-download',
|
||||||
|
'--no-playlist',
|
||||||
|
url
|
||||||
|
]);
|
||||||
|
return JSON.parse(stdout).title ?? 'Untitled';
|
||||||
|
} catch {
|
||||||
|
return 'Untitled';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/** Download best audio from YouTube. Returns path to audio file. */
|
/** Download best audio from YouTube. Returns path to audio file. */
|
||||||
async function downloadAudio(url: string, outDir: string): Promise<{ audioPath: string; title: string }> {
|
async function downloadAudio(url: string, outDir: string): Promise<{ audioPath: string; title: string }> {
|
||||||
await execFileAsync('yt-dlp', [
|
await execFileAsync('yt-dlp', [
|
||||||
@@ -124,39 +115,22 @@ export async function cleanupJobTmp(jobId: string) {
|
|||||||
} catch { /* ignore */ }
|
} catch { /* ignore */ }
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Parse a WebVTT string into segments. */
|
export function transcriptEntriesToSegments(
|
||||||
function parseVtt(
|
entries: TranscriptResponse[]
|
||||||
content: string
|
|
||||||
): Array<{ index: number; start: number; end: number; text: string; words: [] }> {
|
): Array<{ index: number; start: number; end: number; text: string; words: [] }> {
|
||||||
const segments: Array<{ index: number; start: number; end: number; text: string; words: [] }> = [];
|
const useMilliseconds = entries.some((entry) => entry.offset > 1000 || entry.duration > 1000);
|
||||||
const blocks = content.split(/\n\n+/);
|
return entries
|
||||||
let index = 0;
|
.map((entry) => {
|
||||||
|
const start = useMilliseconds ? entry.offset / 1000 : entry.offset;
|
||||||
for (const block of blocks) {
|
const duration = useMilliseconds ? entry.duration / 1000 : entry.duration;
|
||||||
const lines = block.trim().split('\n');
|
return {
|
||||||
const timeLine = lines.find((l) => l.includes('-->'));
|
index: 0,
|
||||||
if (!timeLine) continue;
|
start,
|
||||||
|
end: start + duration,
|
||||||
const [startStr, endStr] = timeLine.split('-->').map((s) => s.trim().split(' ')[0]);
|
text: entry.text.trim(),
|
||||||
const start = vttTimeToSec(startStr);
|
words: [] as []
|
||||||
const end = vttTimeToSec(endStr);
|
};
|
||||||
const text = lines
|
})
|
||||||
.filter((l) => !l.includes('-->') && !/^\d+$/.test(l.trim()) && l.trim())
|
.filter((entry) => entry.text.length > 0)
|
||||||
.join(' ')
|
.map((entry, index) => ({ ...entry, index }));
|
||||||
.replace(/<[^>]+>/g, '')
|
|
||||||
.trim();
|
|
||||||
|
|
||||||
if (text) {
|
|
||||||
segments.push({ index: index++, start, end, text, words: [] });
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return segments;
|
|
||||||
}
|
|
||||||
|
|
||||||
function vttTimeToSec(t: string): number {
|
|
||||||
const parts = t.split(':').map(Number);
|
|
||||||
if (parts.length === 3) return parts[0] * 3600 + parts[1] * 60 + parts[2];
|
|
||||||
if (parts.length === 2) return parts[0] * 60 + parts[1];
|
|
||||||
return parts[0];
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -96,15 +96,13 @@ async function runJob(
|
|||||||
|
|
||||||
if (captionSegments) {
|
if (captionSegments) {
|
||||||
// Caption fast path — skip whisper
|
// Caption fast path — skip whisper
|
||||||
const { deduplicateSegments } = await import('./postprocess.js');
|
|
||||||
const { writeOutputs } = await import('./formatter.js');
|
const { writeOutputs } = await import('./formatter.js');
|
||||||
const segments = deduplicateSegments(captionSegments);
|
const paths = await writeOutputs(captionSegments, title, jobId);
|
||||||
const paths = await writeOutputs(segments, title, jobId);
|
|
||||||
updateJob({
|
updateJob({
|
||||||
id: jobId,
|
id: jobId,
|
||||||
status: 'done',
|
status: 'done',
|
||||||
progress: 100,
|
progress: 100,
|
||||||
segmentsJson: JSON.stringify(segments),
|
segmentsJson: JSON.stringify(captionSegments),
|
||||||
outputDir: paths.srt.replace(/\/[^/]+$/, '')
|
outputDir: paths.srt.replace(/\/[^/]+$/, '')
|
||||||
});
|
});
|
||||||
emitProgress(jobId, { type: 'done' });
|
emitProgress(jobId, { type: 'done' });
|
||||||
|
|||||||
@@ -1,235 +0,0 @@
|
|||||||
import type { Segment } from '$lib/types.js';
|
|
||||||
|
|
||||||
// ── Collapse consecutive repeated phrases within a segment's text ────────────
|
|
||||||
|
|
||||||
function collapseRepeats(text: string): string {
|
|
||||||
let prev = '';
|
|
||||||
// Keep applying until stable
|
|
||||||
while (true) {
|
|
||||||
const next = collapseOnce(text);
|
|
||||||
if (next === prev || next === text) return next;
|
|
||||||
prev = text;
|
|
||||||
text = next;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
function collapseOnce(text: string): string {
|
|
||||||
// Match any repeated phrase (2+ words) appearing consecutively
|
|
||||||
return text.replace(/\b(.{10,}?)\s+\1\b/gi, '$1');
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── Merge consecutive segments with identical (or near-identical) text ───────
|
|
||||||
|
|
||||||
function normalise(s: string) {
|
|
||||||
return s.toLowerCase().replace(/[^\w\s]/g, '').replace(/\s+/g, ' ').trim();
|
|
||||||
}
|
|
||||||
|
|
||||||
function mergeConsecutive(segments: Segment[]): Segment[] {
|
|
||||||
const out: Segment[] = [];
|
|
||||||
for (const seg of segments) {
|
|
||||||
const last = out[out.length - 1];
|
|
||||||
if (last && normalise(last.text) === normalise(seg.text)) {
|
|
||||||
last.end = seg.end;
|
|
||||||
} else {
|
|
||||||
out.push({ ...seg });
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return out;
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── Collapse rolling prefix/suffix chains from backend segment hypotheses ──────
|
|
||||||
|
|
||||||
const MAX_CHAIN_GAP_SECS = 0.15;
|
|
||||||
const MIN_MEANINGFUL_WORDS = 2;
|
|
||||||
const MIN_MEANINGFUL_CHARS = 8;
|
|
||||||
const MIN_OVERLAP_WORDS = 1;
|
|
||||||
|
|
||||||
function splitWords(text: string): string[] {
|
|
||||||
return text.trim().split(/\s+/).filter(Boolean);
|
|
||||||
}
|
|
||||||
|
|
||||||
function normaliseWords(text: string): string[] {
|
|
||||||
return splitWords(text)
|
|
||||||
.map((word) => word.toLowerCase().replace(/[^\w]/g, ''))
|
|
||||||
.filter(Boolean);
|
|
||||||
}
|
|
||||||
|
|
||||||
function arraysEqual(a: string[], b: string[]): boolean {
|
|
||||||
return a.length === b.length && a.every((value, index) => value === b[index]);
|
|
||||||
}
|
|
||||||
|
|
||||||
function startsWithWords(full: string[], prefix: string[]): boolean {
|
|
||||||
return prefix.length <= full.length && arraysEqual(full.slice(0, prefix.length), prefix);
|
|
||||||
}
|
|
||||||
|
|
||||||
function endsWithWords(full: string[], suffix: string[]): boolean {
|
|
||||||
return suffix.length <= full.length && arraysEqual(full.slice(full.length - suffix.length), suffix);
|
|
||||||
}
|
|
||||||
|
|
||||||
function suffixPrefixOverlap(left: string[], right: string[]): number {
|
|
||||||
const max = Math.min(left.length, right.length);
|
|
||||||
for (let size = max; size >= 1; size--) {
|
|
||||||
if (arraysEqual(left.slice(left.length - size), right.slice(0, size))) return size;
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
function isMeaningfulPhrase(words: string[]): boolean {
|
|
||||||
return words.length >= MIN_MEANINGFUL_WORDS && words.join(' ').length >= MIN_MEANINGFUL_CHARS;
|
|
||||||
}
|
|
||||||
|
|
||||||
function isShortCarryover(seg: Segment, words: string[]): boolean {
|
|
||||||
return seg.end - seg.start <= 0.2 || words.length <= 2 || words.join(' ').length <= 16;
|
|
||||||
}
|
|
||||||
|
|
||||||
function trimLeadingWords(text: string, count: number): string {
|
|
||||||
return splitWords(text).slice(count).join(' ').trim();
|
|
||||||
}
|
|
||||||
|
|
||||||
function collapseIncrementalSegments(segments: Segment[]): Segment[] {
|
|
||||||
const out: Segment[] = [];
|
|
||||||
|
|
||||||
for (const seg of segments) {
|
|
||||||
let current: Segment = {
|
|
||||||
...seg,
|
|
||||||
text: seg.text.trim()
|
|
||||||
};
|
|
||||||
|
|
||||||
if (!current.text) continue;
|
|
||||||
|
|
||||||
const last = out[out.length - 1];
|
|
||||||
if (!last) {
|
|
||||||
out.push(current);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
const gap = current.start - last.end;
|
|
||||||
if (gap > MAX_CHAIN_GAP_SECS) {
|
|
||||||
out.push(current);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
const lastWords = normaliseWords(last.text);
|
|
||||||
const currentWords = normaliseWords(current.text);
|
|
||||||
if (lastWords.length === 0 || currentWords.length === 0) {
|
|
||||||
out.push(current);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (
|
|
||||||
currentWords.length > lastWords.length &&
|
|
||||||
startsWithWords(currentWords, lastWords) &&
|
|
||||||
(isMeaningfulPhrase(lastWords) || isShortCarryover(last, lastWords))
|
|
||||||
) {
|
|
||||||
last.text = current.text;
|
|
||||||
last.end = current.end;
|
|
||||||
last.words = current.words;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (
|
|
||||||
endsWithWords(lastWords, currentWords) &&
|
|
||||||
(isMeaningfulPhrase(currentWords) || isShortCarryover(current, currentWords))
|
|
||||||
) {
|
|
||||||
last.end = Math.max(last.end, current.end);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
const overlapWords = suffixPrefixOverlap(lastWords, currentWords);
|
|
||||||
if (overlapWords >= MIN_OVERLAP_WORDS) {
|
|
||||||
const trimmedText = trimLeadingWords(current.text, overlapWords);
|
|
||||||
if (!trimmedText) {
|
|
||||||
last.end = Math.max(last.end, current.end);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
current = {
|
|
||||||
...current,
|
|
||||||
start: Math.max(current.start, last.end),
|
|
||||||
text: trimmedText,
|
|
||||||
words: []
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
out.push(current);
|
|
||||||
}
|
|
||||||
|
|
||||||
return out;
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── N-gram deduplication ─────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
const NGRAM_N = 6;
|
|
||||||
const LOOKBACK_CHARS = 500;
|
|
||||||
const SIMILARITY_THRESHOLD = 0.6;
|
|
||||||
|
|
||||||
function ngrams(text: string, n: number): string[] {
|
|
||||||
const words = text.toLowerCase().split(/\s+/);
|
|
||||||
const grams: string[] = [];
|
|
||||||
for (let i = 0; i <= words.length - n; i++) {
|
|
||||||
grams.push(words.slice(i, i + n).join(' '));
|
|
||||||
}
|
|
||||||
return grams;
|
|
||||||
}
|
|
||||||
|
|
||||||
function jaccardSimilarity(a: string, b: string): number {
|
|
||||||
const ga = new Set(ngrams(a, NGRAM_N));
|
|
||||||
const gb = new Set(ngrams(b, NGRAM_N));
|
|
||||||
// If neither text is long enough to produce n-grams they cannot be compared;
|
|
||||||
// treat as dissimilar so short segments are never incorrectly discarded.
|
|
||||||
if (ga.size === 0 && gb.size === 0) return 0;
|
|
||||||
const intersection = [...ga].filter((g) => gb.has(g)).length;
|
|
||||||
const union = new Set([...ga, ...gb]).size;
|
|
||||||
return union === 0 ? 0 : intersection / union;
|
|
||||||
}
|
|
||||||
|
|
||||||
function ngramDedup(segments: Segment[]): Segment[] {
|
|
||||||
const out: Segment[] = [];
|
|
||||||
for (const seg of segments) {
|
|
||||||
const windowText = out
|
|
||||||
.slice(-20)
|
|
||||||
.map((s) => s.text)
|
|
||||||
.join(' ')
|
|
||||||
.slice(-LOOKBACK_CHARS);
|
|
||||||
|
|
||||||
if (windowText.length > 0 && jaccardSimilarity(seg.text, windowText) >= SIMILARITY_THRESHOLD) {
|
|
||||||
continue; // duplicate — skip
|
|
||||||
}
|
|
||||||
out.push(seg);
|
|
||||||
}
|
|
||||||
return out;
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── Full deduplication pipeline ──────────────────────────────────────────────
|
|
||||||
|
|
||||||
export function deduplicateSegments(segments: Segment[]): Segment[] {
|
|
||||||
if (!Array.isArray(segments)) return [];
|
|
||||||
// 1. Collapse repeats within each segment's text
|
|
||||||
let result = segments.map((s) => ({
|
|
||||||
...s,
|
|
||||||
text: collapseRepeats(s.text.trim())
|
|
||||||
}));
|
|
||||||
|
|
||||||
// 2. Remove empty segments
|
|
||||||
result = result.filter((s) => s.text.length > 0);
|
|
||||||
|
|
||||||
// 3. Collapse rolling backend hypotheses before generic dedup
|
|
||||||
result = collapseIncrementalSegments(result);
|
|
||||||
|
|
||||||
// 4. First merge pass
|
|
||||||
result = mergeConsecutive(result);
|
|
||||||
|
|
||||||
// 5. N-gram dedup
|
|
||||||
result = ngramDedup(result);
|
|
||||||
|
|
||||||
// 6. Re-run rolling collapse after removals create new adjacencies
|
|
||||||
result = collapseIncrementalSegments(result);
|
|
||||||
|
|
||||||
// 7. Second merge pass (catches new adjacencies after dedup)
|
|
||||||
result = mergeConsecutive(result);
|
|
||||||
|
|
||||||
// 8. Re-index
|
|
||||||
result.forEach((s, i) => (s.index = i));
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
@@ -1,10 +1,9 @@
|
|||||||
import { json, error } from '@sveltejs/kit';
|
import { json, error } from '@sveltejs/kit';
|
||||||
import { getJob, updateJob } from '$lib/server/db.js';
|
import { getJob, updateJob } from '$lib/server/db.js';
|
||||||
import { deduplicateSegments } from '$lib/server/postprocess.js';
|
|
||||||
import { writeOutputs } from '$lib/server/formatter.js';
|
import { writeOutputs } from '$lib/server/formatter.js';
|
||||||
import type { Segment } from '$lib/types.js';
|
import type { Segment } from '$lib/types.js';
|
||||||
|
|
||||||
/** POST /api/jobs/[id]/reprocess — re-run post-processing and regenerate all output files. */
|
/** POST /api/jobs/[id]/reprocess — regenerate output files from stored canonical segments. */
|
||||||
export async function POST({ params }) {
|
export async function POST({ params }) {
|
||||||
const job = getJob(params.id);
|
const job = getJob(params.id);
|
||||||
if (!job) throw error(404, 'Job not found');
|
if (!job) throw error(404, 'Job not found');
|
||||||
@@ -14,8 +13,7 @@ export async function POST({ params }) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const rawSegments = JSON.parse(job.segmentsJson) as Segment[];
|
const segments = JSON.parse(job.segmentsJson) as Segment[];
|
||||||
const segments = deduplicateSegments(rawSegments);
|
|
||||||
|
|
||||||
const paths = await writeOutputs(segments, job.title, job.id);
|
const paths = await writeOutputs(segments, job.title, job.id);
|
||||||
const outputDir = paths.srt.replace(/\/[^/]+$/, '');
|
const outputDir = paths.srt.replace(/\/[^/]+$/, '');
|
||||||
|
|||||||
@@ -1,6 +1,5 @@
|
|||||||
import { json, error } from '@sveltejs/kit';
|
import { json, error } from '@sveltejs/kit';
|
||||||
import { getJob, updateJob, setJobStatus } from '$lib/server/db.js';
|
import { getJob, updateJob, setJobStatus } from '$lib/server/db.js';
|
||||||
import { deduplicateSegments } from '$lib/server/postprocess.js';
|
|
||||||
import { writeOutputs } from '$lib/server/formatter.js';
|
import { writeOutputs } from '$lib/server/formatter.js';
|
||||||
import { sendNotification } from '$lib/server/push.js';
|
import { sendNotification } from '$lib/server/push.js';
|
||||||
import { cleanupJobTmp } from '$lib/server/downloader.js';
|
import { cleanupJobTmp } from '$lib/server/downloader.js';
|
||||||
@@ -40,8 +39,7 @@ try {
|
|||||||
setJobStatus(jobId, 'processing', 90);
|
setJobStatus(jobId, 'processing', 90);
|
||||||
emitProgress(jobId, { type: 'status', status: 'processing', progress: 90 });
|
emitProgress(jobId, { type: 'status', status: 'processing', progress: 90 });
|
||||||
|
|
||||||
const rawSegments = (whisperJob.segments ?? []) as Segment[];
|
const segments = (whisperJob.segments ?? []) as Segment[];
|
||||||
const segments = deduplicateSegments(rawSegments);
|
|
||||||
|
|
||||||
const paths = await writeOutputs(segments, job.title, jobId);
|
const paths = await writeOutputs(segments, job.title, jobId);
|
||||||
const outputDir = paths.srt.replace(/\/[^/]+$/, '');
|
const outputDir = paths.srt.replace(/\/[^/]+$/, '');
|
||||||
|
|||||||
80
src/tests/downloader.test.ts
Normal file
80
src/tests/downloader.test.ts
Normal file
@@ -0,0 +1,80 @@
|
|||||||
|
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
|
||||||
|
import { rm } from 'fs/promises';
|
||||||
|
import type { TranscriptResponse } from 'youtube-transcript';
|
||||||
|
|
||||||
|
const { mockExecFile, mockFetchTranscript } = vi.hoisted(() => ({
|
||||||
|
mockExecFile: vi.fn(),
|
||||||
|
mockFetchTranscript: vi.fn()
|
||||||
|
}));
|
||||||
|
|
||||||
|
const TEST_DATA_DIR = `/tmp/tonemark-downloader-test-${Date.now()}`;
|
||||||
|
vi.stubEnv('DATA_DIR', TEST_DATA_DIR);
|
||||||
|
|
||||||
|
vi.mock('child_process', () => ({
|
||||||
|
execFile: mockExecFile
|
||||||
|
}));
|
||||||
|
|
||||||
|
vi.mock('youtube-transcript', () => ({
|
||||||
|
fetchTranscript: mockFetchTranscript
|
||||||
|
}));
|
||||||
|
|
||||||
|
import { downloadYouTube, transcriptEntriesToSegments } from '$lib/server/downloader.js';
|
||||||
|
|
||||||
|
beforeEach(() => {
|
||||||
|
vi.clearAllMocks();
|
||||||
|
mockExecFile.mockImplementation((...args: unknown[]) => {
|
||||||
|
const cb = args.at(-1) as (...callbackArgs: unknown[]) => void;
|
||||||
|
cb(null, JSON.stringify({ title: 'Fetched Title' }), '');
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
afterEach(async () => {
|
||||||
|
await rm(TEST_DATA_DIR, { recursive: true, force: true }).catch(() => {});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('transcriptEntriesToSegments', () => {
|
||||||
|
it('converts millisecond transcript offsets into second-based segments', () => {
|
||||||
|
const entries: TranscriptResponse[] = [
|
||||||
|
{ text: 'Hello everyone.', offset: 15240, duration: 4240, lang: 'en' },
|
||||||
|
{ text: 'Um, welcome to this talk.', offset: 16600, duration: 5080, lang: 'en' }
|
||||||
|
];
|
||||||
|
|
||||||
|
expect(transcriptEntriesToSegments(entries)).toEqual([
|
||||||
|
{ index: 0, start: 15.24, end: 19.48, text: 'Hello everyone.', words: [] },
|
||||||
|
{ index: 1, start: 16.6, end: 21.68, text: 'Um, welcome to this talk.', words: [] }
|
||||||
|
]);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('preserves second-based transcript offsets and drops empty text', () => {
|
||||||
|
const entries: TranscriptResponse[] = [
|
||||||
|
{ text: ' ', offset: 0, duration: 1.5, lang: 'en' },
|
||||||
|
{ text: 'Clean caption cue', offset: 91.08, duration: 3.72, lang: 'en' }
|
||||||
|
];
|
||||||
|
|
||||||
|
expect(transcriptEntriesToSegments(entries)).toEqual([
|
||||||
|
{ index: 0, start: 91.08, end: 94.8, text: 'Clean caption cue', words: [] }
|
||||||
|
]);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('downloadYouTube', () => {
|
||||||
|
it('uses fetched transcript entries directly for caption jobs', async () => {
|
||||||
|
mockFetchTranscript.mockResolvedValue([
|
||||||
|
{ text: 'Hello everyone.', offset: 15240, duration: 4240, lang: 'en' },
|
||||||
|
{ text: 'Um, welcome to this talk.', offset: 16600, duration: 5080, lang: 'en' }
|
||||||
|
] satisfies TranscriptResponse[]);
|
||||||
|
|
||||||
|
const result = await downloadYouTube('https://youtube.com/watch?v=qdh_x-uRs9g', 'job-1');
|
||||||
|
|
||||||
|
expect(mockFetchTranscript).toHaveBeenCalledWith('https://youtube.com/watch?v=qdh_x-uRs9g', {
|
||||||
|
lang: 'en'
|
||||||
|
});
|
||||||
|
expect(result).toMatchObject({
|
||||||
|
type: 'captions',
|
||||||
|
segments: [
|
||||||
|
{ index: 0, start: 15.24, end: 19.48, text: 'Hello everyone.', words: [] },
|
||||||
|
{ index: 1, start: 16.6, end: 21.68, text: 'Um, welcome to this talk.', words: [] }
|
||||||
|
]
|
||||||
|
});
|
||||||
|
});
|
||||||
|
});
|
||||||
@@ -1,204 +0,0 @@
|
|||||||
import { describe, it, expect } from 'vitest';
|
|
||||||
import {
|
|
||||||
deduplicateSegments
|
|
||||||
} from '$lib/server/postprocess.js';
|
|
||||||
import type { Segment } from '$lib/types.js';
|
|
||||||
|
|
||||||
// ── helpers ──────────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
function seg(index: number, start: number, end: number, text: string): Segment {
|
|
||||||
return { index, start, end, text, words: [] };
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── collapseRepeats (tested indirectly via deduplicateSegments) ───────────────
|
|
||||||
|
|
||||||
describe('deduplicateSegments — collapseRepeats', () => {
|
|
||||||
it('leaves text without repetition unchanged', () => {
|
|
||||||
const input = [seg(0, 0, 5, ' Hello world, this is a sentence.')];
|
|
||||||
const [out] = deduplicateSegments(input);
|
|
||||||
expect(out.text).toBe('Hello world, this is a sentence.');
|
|
||||||
});
|
|
||||||
|
|
||||||
it('collapses a consecutive repeated phrase inside a segment', () => {
|
|
||||||
const input = [seg(0, 0, 5, ' the quick brown fox the quick brown fox')];
|
|
||||||
const [out] = deduplicateSegments(input);
|
|
||||||
expect(out.text).not.toMatch(/the quick brown fox.*the quick brown fox/i);
|
|
||||||
});
|
|
||||||
|
|
||||||
it('handles multiple repetitions recursively', () => {
|
|
||||||
// "welcome everyone" = 16 chars — qualifies for the ≥10-char collapse regex
|
|
||||||
const input = [seg(0, 0, 5, ' welcome everyone welcome everyone welcome everyone')];
|
|
||||||
const result = deduplicateSegments(input);
|
|
||||||
const text = result[0]?.text ?? '';
|
|
||||||
expect((text.match(/welcome everyone/gi) ?? []).length).toBeLessThan(3);
|
|
||||||
});
|
|
||||||
});
|
|
||||||
|
|
||||||
// ── mergeConsecutive ──────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
describe('deduplicateSegments — mergeConsecutive', () => {
|
|
||||||
it('merges adjacent segments with identical text', () => {
|
|
||||||
const input = [
|
|
||||||
seg(0, 0, 2, ' Hello world.'),
|
|
||||||
seg(1, 2, 4, ' Hello world.')
|
|
||||||
];
|
|
||||||
const result = deduplicateSegments(input);
|
|
||||||
expect(result).toHaveLength(1);
|
|
||||||
expect(result[0].end).toBe(4);
|
|
||||||
});
|
|
||||||
|
|
||||||
it('keeps adjacent segments with different text', () => {
|
|
||||||
const input = [
|
|
||||||
seg(0, 0, 2, ' First sentence.'),
|
|
||||||
seg(1, 2, 4, ' Second sentence.')
|
|
||||||
];
|
|
||||||
const result = deduplicateSegments(input);
|
|
||||||
expect(result).toHaveLength(2);
|
|
||||||
});
|
|
||||||
|
|
||||||
it('normalises punctuation and case for merge comparison', () => {
|
|
||||||
const input = [
|
|
||||||
seg(0, 0, 2, ' Hello, World!'),
|
|
||||||
seg(1, 2, 4, ' hello world')
|
|
||||||
];
|
|
||||||
const result = deduplicateSegments(input);
|
|
||||||
expect(result).toHaveLength(1);
|
|
||||||
});
|
|
||||||
});
|
|
||||||
|
|
||||||
// ── rolling prefix/suffix chain collapse ───────────────────────────────────────
|
|
||||||
|
|
||||||
describe('deduplicateSegments — rolling backend hypotheses', () => {
|
|
||||||
it('collapses prefix-growth chains from stored backend segments', () => {
|
|
||||||
const input = [
|
|
||||||
seg(0, 15.24, 16.6, 'Hello everyone.'),
|
|
||||||
seg(1, 16.6, 19.47, 'Hello everyone. Um, welcome to this talk.'),
|
|
||||||
seg(2, 19.47, 19.48, 'Um, welcome to this talk.'),
|
|
||||||
seg(3, 19.48, 21.67, "Um, welcome to this talk. I'll be speaking about small model"),
|
|
||||||
seg(4, 21.67, 21.68, "I'll be speaking about small model"),
|
|
||||||
seg(5, 21.68, 24.59, "I'll be speaking about small model inference and a gap that we've")
|
|
||||||
];
|
|
||||||
|
|
||||||
const result = deduplicateSegments(input);
|
|
||||||
|
|
||||||
expect(result).toHaveLength(2);
|
|
||||||
expect(result[0]).toMatchObject({
|
|
||||||
index: 0,
|
|
||||||
start: 15.24,
|
|
||||||
end: 19.48,
|
|
||||||
text: 'Hello everyone. Um, welcome to this talk.'
|
|
||||||
});
|
|
||||||
expect(result[1]).toMatchObject({
|
|
||||||
index: 1,
|
|
||||||
start: 19.48,
|
|
||||||
end: 24.59,
|
|
||||||
text: "I'll be speaking about small model inference and a gap that we've"
|
|
||||||
});
|
|
||||||
});
|
|
||||||
|
|
||||||
it('does not collapse similar phrases when there is a real timing gap', () => {
|
|
||||||
const input = [
|
|
||||||
seg(0, 0, 1, 'Hello everyone.'),
|
|
||||||
seg(1, 2, 4, 'Hello everyone. Welcome back.')
|
|
||||||
];
|
|
||||||
|
|
||||||
const result = deduplicateSegments(input);
|
|
||||||
|
|
||||||
expect(result).toHaveLength(2);
|
|
||||||
expect(result[0].text).toBe('Hello everyone.');
|
|
||||||
expect(result[1].text).toBe('Hello everyone. Welcome back.');
|
|
||||||
});
|
|
||||||
|
|
||||||
it('collapses tiny one-word carry-over segments from caption-style output', () => {
|
|
||||||
const input = [
|
|
||||||
seg(0, 94.8, 96.4, 'world.'),
|
|
||||||
seg(1, 96.4, 98.96, 'world. And that aspect that I overlooked was'),
|
|
||||||
seg(2, 98.96, 100.72, 'inference.'),
|
|
||||||
seg(3, 100.72, 103.92, 'inference. So, as someone who kind of wants to'),
|
|
||||||
seg(4, 107.19, 107.2, 'and'),
|
|
||||||
seg(5, 107.2, 109.56, 'and work to understand the problems and the')
|
|
||||||
];
|
|
||||||
|
|
||||||
const result = deduplicateSegments(input);
|
|
||||||
|
|
||||||
expect(result).toHaveLength(3);
|
|
||||||
expect(result[0].text).toBe('world. And that aspect that I overlooked was');
|
|
||||||
expect(result[1].text).toBe('inference. So, as someone who kind of wants to');
|
|
||||||
expect(result[2].text).toBe('and work to understand the problems and the');
|
|
||||||
});
|
|
||||||
|
|
||||||
it('trims single-word suffix-prefix overlap between adjacent segments', () => {
|
|
||||||
const input = [
|
|
||||||
seg(0, 94.8, 96.4, 'world.'),
|
|
||||||
seg(1, 96.4, 98.96, 'world. And that aspect that I overlooked was'),
|
|
||||||
seg(2, 120.12, 123.71, 'to find more about inference.'),
|
|
||||||
seg(3, 123.72, 126.92, "inference. So, I've done a lot of work with VLAM,")
|
|
||||||
];
|
|
||||||
|
|
||||||
const result = deduplicateSegments(input);
|
|
||||||
|
|
||||||
expect(result).toHaveLength(3);
|
|
||||||
expect(result[0].text).toBe('world. And that aspect that I overlooked was');
|
|
||||||
expect(result[2].text).toBe("So, I've done a lot of work with VLAM,");
|
|
||||||
});
|
|
||||||
});
|
|
||||||
|
|
||||||
// ── ngramDedup ────────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
describe('deduplicateSegments — ngramDedup', () => {
|
|
||||||
it('passes through completely unique segments', () => {
|
|
||||||
const input = [
|
|
||||||
seg(0, 0, 5, ' The cat sat on the mat quite happily today.'),
|
|
||||||
seg(1, 5, 10, ' Later the dog ran across the yard chasing a ball.')
|
|
||||||
];
|
|
||||||
expect(deduplicateSegments(input)).toHaveLength(2);
|
|
||||||
});
|
|
||||||
|
|
||||||
it('removes a segment that is highly similar to recent context', () => {
|
|
||||||
// Repeat a long sentence verbatim — should be caught as duplicate
|
|
||||||
const longText =
|
|
||||||
' This is a very specific and unique sentence about transcription quality matters greatly.';
|
|
||||||
const input = [seg(0, 0, 5, longText), seg(1, 5, 10, longText)];
|
|
||||||
// After mergeConsecutive the second one is already merged, so result is 1
|
|
||||||
expect(deduplicateSegments(input)).toHaveLength(1);
|
|
||||||
});
|
|
||||||
});
|
|
||||||
|
|
||||||
// ── deduplicateSegments — full pipeline ──────────────────────────────────────
|
|
||||||
|
|
||||||
describe('deduplicateSegments — full pipeline', () => {
|
|
||||||
it('returns empty array for empty input', () => {
|
|
||||||
expect(deduplicateSegments([])).toEqual([]);
|
|
||||||
});
|
|
||||||
|
|
||||||
it('removes segments whose text is empty after trimming', () => {
|
|
||||||
const input = [seg(0, 0, 1, ' '), seg(1, 1, 2, ' Hello.')];
|
|
||||||
const result = deduplicateSegments(input);
|
|
||||||
expect(result).toHaveLength(1);
|
|
||||||
expect(result[0].text).toBe('Hello.');
|
|
||||||
});
|
|
||||||
|
|
||||||
it('re-indexes output segments starting from 0', () => {
|
|
||||||
const input = [
|
|
||||||
seg(5, 0, 2, ' First unique sentence here.'),
|
|
||||||
seg(8, 2, 4, ' Second different sentence there.')
|
|
||||||
];
|
|
||||||
const result = deduplicateSegments(input);
|
|
||||||
result.forEach((s, i) => expect(s.index).toBe(i));
|
|
||||||
});
|
|
||||||
|
|
||||||
it('runs the full pipeline: trim → remove empty → merge → ngram → merge → reindex', () => {
|
|
||||||
const input = [
|
|
||||||
seg(0, 0, 2, ' Good morning everyone.'),
|
|
||||||
seg(1, 2, 3, ' '), // empty — removed
|
|
||||||
seg(2, 3, 5, ' Good morning everyone.'), // duplicate — merged
|
|
||||||
seg(3, 5, 7, ' Welcome to our presentation today.')
|
|
||||||
];
|
|
||||||
const result = deduplicateSegments(input);
|
|
||||||
expect(result).toHaveLength(2);
|
|
||||||
expect(result[0].text).toBe('Good morning everyone.');
|
|
||||||
expect(result[1].text).toBe('Welcome to our presentation today.');
|
|
||||||
expect(result[0].index).toBe(0);
|
|
||||||
expect(result[1].index).toBe(1);
|
|
||||||
});
|
|
||||||
});
|
|
||||||
@@ -7,7 +7,6 @@ const {
|
|||||||
mockGetJob,
|
mockGetJob,
|
||||||
mockUpdateJob,
|
mockUpdateJob,
|
||||||
mockSetJobStatus,
|
mockSetJobStatus,
|
||||||
mockDeduplicateSegments,
|
|
||||||
mockWriteOutputs,
|
mockWriteOutputs,
|
||||||
mockSendNotification,
|
mockSendNotification,
|
||||||
mockCleanupJobTmp,
|
mockCleanupJobTmp,
|
||||||
@@ -16,7 +15,6 @@ const {
|
|||||||
mockGetJob: vi.fn(),
|
mockGetJob: vi.fn(),
|
||||||
mockUpdateJob: vi.fn(),
|
mockUpdateJob: vi.fn(),
|
||||||
mockSetJobStatus: vi.fn(),
|
mockSetJobStatus: vi.fn(),
|
||||||
mockDeduplicateSegments: vi.fn((segs: Segment[]) => segs),
|
|
||||||
mockWriteOutputs: vi.fn(),
|
mockWriteOutputs: vi.fn(),
|
||||||
mockSendNotification: vi.fn(),
|
mockSendNotification: vi.fn(),
|
||||||
mockCleanupJobTmp: vi.fn(),
|
mockCleanupJobTmp: vi.fn(),
|
||||||
@@ -29,10 +27,6 @@ vi.mock('$lib/server/db.js', () => ({
|
|||||||
setJobStatus: mockSetJobStatus
|
setJobStatus: mockSetJobStatus
|
||||||
}));
|
}));
|
||||||
|
|
||||||
vi.mock('$lib/server/postprocess.js', () => ({
|
|
||||||
deduplicateSegments: mockDeduplicateSegments
|
|
||||||
}));
|
|
||||||
|
|
||||||
vi.mock('$lib/server/formatter.js', () => ({
|
vi.mock('$lib/server/formatter.js', () => ({
|
||||||
writeOutputs: mockWriteOutputs
|
writeOutputs: mockWriteOutputs
|
||||||
}));
|
}));
|
||||||
@@ -91,7 +85,6 @@ function makeSeg(index: number, text: string): Segment {
|
|||||||
|
|
||||||
beforeEach(() => {
|
beforeEach(() => {
|
||||||
vi.clearAllMocks();
|
vi.clearAllMocks();
|
||||||
mockDeduplicateSegments.mockImplementation((segs: Segment[]) => segs);
|
|
||||||
mockWriteOutputs.mockResolvedValue({
|
mockWriteOutputs.mockResolvedValue({
|
||||||
srt: '/out/dir/title.srt',
|
srt: '/out/dir/title.srt',
|
||||||
txt: '/out/dir/title.txt',
|
txt: '/out/dir/title.txt',
|
||||||
@@ -218,25 +211,21 @@ describe('POST /api/webhook/[jobId] — whisper failure', () => {
|
|||||||
describe('POST /api/webhook/[jobId] — success with segments', () => {
|
describe('POST /api/webhook/[jobId] — success with segments', () => {
|
||||||
const segments = [makeSeg(0, 'Hello world.'), makeSeg(1, 'This is a test.')];
|
const segments = [makeSeg(0, 'Hello world.'), makeSeg(1, 'This is a test.')];
|
||||||
|
|
||||||
it('runs deduplication on received segments', async () => {
|
it('passes received segments through unchanged', async () => {
|
||||||
mockGetJob.mockReturnValue(makeJob('job-3'));
|
mockGetJob.mockReturnValue(makeJob('job-3'));
|
||||||
await POST(makeEvent('job-3', makeWhisperJob({ segments })) as any);
|
await POST(makeEvent('job-3', makeWhisperJob({ segments })) as any);
|
||||||
expect(mockDeduplicateSegments).toHaveBeenCalledWith(segments);
|
expect(mockWriteOutputs).toHaveBeenCalledWith(segments, 'Test Video', 'job-3');
|
||||||
});
|
});
|
||||||
|
|
||||||
it('calls writeOutputs with the deduplicated segments and job title', async () => {
|
it('calls writeOutputs with the received segments and job title', async () => {
|
||||||
mockGetJob.mockReturnValue(makeJob('job-4', 'My Lecture'));
|
mockGetJob.mockReturnValue(makeJob('job-4', 'My Lecture'));
|
||||||
const deduped = [makeSeg(0, 'Hello world.')];
|
|
||||||
mockDeduplicateSegments.mockReturnValue(deduped);
|
|
||||||
|
|
||||||
await POST(makeEvent('job-4', makeWhisperJob({ segments })) as any);
|
await POST(makeEvent('job-4', makeWhisperJob({ segments })) as any);
|
||||||
expect(mockWriteOutputs).toHaveBeenCalledWith(deduped, 'My Lecture', 'job-4');
|
expect(mockWriteOutputs).toHaveBeenCalledWith(segments, 'My Lecture', 'job-4');
|
||||||
});
|
});
|
||||||
|
|
||||||
it('stores serialised segments_json in the database', async () => {
|
it('stores serialised segments_json in the database', async () => {
|
||||||
mockGetJob.mockReturnValue(makeJob('job-5'));
|
mockGetJob.mockReturnValue(makeJob('job-5'));
|
||||||
const deduped = [makeSeg(0, 'Result text.')];
|
|
||||||
mockDeduplicateSegments.mockReturnValue(deduped);
|
|
||||||
|
|
||||||
await POST(makeEvent('job-5', makeWhisperJob({ segments })) as any);
|
await POST(makeEvent('job-5', makeWhisperJob({ segments })) as any);
|
||||||
|
|
||||||
@@ -244,7 +233,7 @@ describe('POST /api/webhook/[jobId] — success with segments', () => {
|
|||||||
expect.objectContaining({
|
expect.objectContaining({
|
||||||
id: 'job-5',
|
id: 'job-5',
|
||||||
status: 'done',
|
status: 'done',
|
||||||
segmentsJson: JSON.stringify(deduped)
|
segmentsJson: JSON.stringify(segments)
|
||||||
})
|
})
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|||||||
Reference in New Issue
Block a user