refactor(transcript): drop Tonemark rewrite
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
42
package-lock.json
generated
42
package-lock.json
generated
@@ -12,7 +12,8 @@
|
||||
"better-sqlite3": "^12.9.0",
|
||||
"form-data": "^4.0.5",
|
||||
"node-fetch": "^3.3.2",
|
||||
"web-push": "^3.6.7"
|
||||
"web-push": "^3.6.7",
|
||||
"youtube-transcript": "^1.3.1"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@sveltejs/adapter-auto": "^7.0.1",
|
||||
@@ -89,6 +90,27 @@
|
||||
"node": ">=18"
|
||||
}
|
||||
},
|
||||
"node_modules/@emnapi/core": {
|
||||
"version": "1.10.0",
|
||||
"resolved": "https://registry.npmjs.org/@emnapi/core/-/core-1.10.0.tgz",
|
||||
"integrity": "sha512-yq6OkJ4p82CAfPl0u9mQebQHKPJkY7WrIuk205cTYnYe+k2Z8YBh11FrbRG/H6ihirqcacOgl2BIO8oyMQLeXw==",
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"dependencies": {
|
||||
"@emnapi/wasi-threads": "1.2.1",
|
||||
"tslib": "^2.4.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@emnapi/runtime": {
|
||||
"version": "1.10.0",
|
||||
"resolved": "https://registry.npmjs.org/@emnapi/runtime/-/runtime-1.10.0.tgz",
|
||||
"integrity": "sha512-ewvYlk86xUoGI0zQRNq/mC+16R1QeDlKQy21Ki3oSYXNgLb45GV1P6A0M+/s6nyCuNDqe5VpaY84BzXGwVbwFA==",
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"dependencies": {
|
||||
"tslib": "^2.4.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@emnapi/wasi-threads": {
|
||||
"version": "1.2.1",
|
||||
"resolved": "https://registry.npmjs.org/@emnapi/wasi-threads/-/wasi-threads-1.2.1.tgz",
|
||||
@@ -896,7 +918,6 @@
|
||||
"resolved": "https://registry.npmjs.org/@sveltejs/kit/-/kit-2.59.1.tgz",
|
||||
"integrity": "sha512-d8OON70AphLdDesuTIl//M2O6fRTIicX8aYv8vhCiYEhTTI2OboKqey0Hu1A4VFhqwgqtq0vKDmPFGkw8kKmgw==",
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"@standard-schema/spec": "^1.0.0",
|
||||
"@sveltejs/acorn-typescript": "^1.0.5",
|
||||
@@ -938,7 +959,6 @@
|
||||
"resolved": "https://registry.npmjs.org/@sveltejs/vite-plugin-svelte/-/vite-plugin-svelte-7.1.1.tgz",
|
||||
"integrity": "sha512-FOJdbE5pxae68DoTBJ49t1dIA7TSmMHR6CsuJhX90cO/UfrEMHA7KJNUj3WdZuUDJPu4ujqpJ2Tgqd2gTWr6Xg==",
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"deepmerge": "^4.3.1",
|
||||
"magic-string": "^0.30.21",
|
||||
@@ -1313,7 +1333,6 @@
|
||||
"integrity": "sha512-38C0/Ddb7HcRG0Z4/DUem8x57d2p9jYgp18mkaYswEOQBGsI1CG4f/hjm0ZCeaJfWhSZ4k7jgs29V1Zom7Ki9A==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"@bcoe/v8-coverage": "^1.0.2",
|
||||
"@vitest/utils": "4.1.5",
|
||||
@@ -1467,7 +1486,6 @@
|
||||
"resolved": "https://registry.npmjs.org/acorn/-/acorn-8.16.0.tgz",
|
||||
"integrity": "sha512-UVJyE9MttOsBQIDKw1skb9nAwQuR5wuGD3+82K6JgJlm/Y+KI92oNsMNGZCYdDsVtRHSak0pcV5Dno5+4jh9sw==",
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"bin": {
|
||||
"acorn": "bin/acorn"
|
||||
},
|
||||
@@ -3021,7 +3039,6 @@
|
||||
"resolved": "https://registry.npmjs.org/rollup/-/rollup-4.60.3.tgz",
|
||||
"integrity": "sha512-pAQK9HalE84QSm4Po3EmWIZPd3FnjkShVkiMlz1iligWYkWQ7wHYd1PF/T7QZ5TVSD6uSTon5gBVMSM4JfBV+A==",
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"@types/estree": "1.0.8"
|
||||
},
|
||||
@@ -3255,7 +3272,6 @@
|
||||
"resolved": "https://registry.npmjs.org/svelte/-/svelte-5.55.5.tgz",
|
||||
"integrity": "sha512-2uCs/LZ9us+AktdzYJM8OcxQ8qnPS1kpaO7syGT/MgO+6Qr1Ybl+TqPq+97u7PHqmmMlye5ZkoyXONy5mjjAbw==",
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"@jridgewell/remapping": "^2.3.4",
|
||||
"@jridgewell/sourcemap-codec": "^1.5.0",
|
||||
@@ -3428,7 +3444,6 @@
|
||||
"integrity": "sha512-y2TvuxSZPDyQakkFRPZHKFm+KKVqIisdg9/CZwm9ftvKXLP8NRWj38/ODjNbr43SsoXqNuAisEf1GdCxqWcdBw==",
|
||||
"devOptional": true,
|
||||
"license": "Apache-2.0",
|
||||
"peer": true,
|
||||
"bin": {
|
||||
"tsc": "bin/tsc",
|
||||
"tsserver": "bin/tsserver"
|
||||
@@ -3455,7 +3470,6 @@
|
||||
"resolved": "https://registry.npmjs.org/vite/-/vite-8.0.10.tgz",
|
||||
"integrity": "sha512-rZuUu9j6J5uotLDs+cAA4O5H4K1SfPliUlQwqa6YEwSrWDZzP4rhm00oJR5snMewjxF5V/K3D4kctsUTsIU9Mw==",
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"lightningcss": "^1.32.0",
|
||||
"picomatch": "^4.0.4",
|
||||
@@ -3553,7 +3567,6 @@
|
||||
"integrity": "sha512-9Xx1v3/ih3m9hN+SbfkUyy0JAs72ap3r7joc87XL6jwF0jGg6mFBvQ1SrwaX+h8BlkX6Hz9shdd1uo6AF+ZGpg==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"@vitest/expect": "4.1.5",
|
||||
"@vitest/mocker": "4.1.5",
|
||||
@@ -3689,6 +3702,15 @@
|
||||
"integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==",
|
||||
"license": "ISC"
|
||||
},
|
||||
"node_modules/youtube-transcript": {
|
||||
"version": "1.3.1",
|
||||
"resolved": "https://registry.npmjs.org/youtube-transcript/-/youtube-transcript-1.3.1.tgz",
|
||||
"integrity": "sha512-NDCjwad113TGybbYF51y9Z4tcwzBHUZWQdF9veULNca18L+FdDbHHtTHIr69WVa3bB90l67S8kN0HtL2JO9fhg==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=18.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/zimmerframe": {
|
||||
"version": "1.1.4",
|
||||
"resolved": "https://registry.npmjs.org/zimmerframe/-/zimmerframe-1.1.4.tgz",
|
||||
|
||||
@@ -34,6 +34,7 @@
|
||||
"better-sqlite3": "^12.9.0",
|
||||
"form-data": "^4.0.5",
|
||||
"node-fetch": "^3.3.2",
|
||||
"web-push": "^3.6.7"
|
||||
"web-push": "^3.6.7",
|
||||
"youtube-transcript": "^1.3.1"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
import { execFile } from 'child_process';
|
||||
import { promisify } from 'util';
|
||||
import { existsSync } from 'fs';
|
||||
import { mkdir, unlink, writeFile } from 'fs/promises';
|
||||
import { mkdir, writeFile } from 'fs/promises';
|
||||
import { join } from 'path';
|
||||
import { fetchTranscript, type TranscriptResponse } from 'youtube-transcript';
|
||||
|
||||
const execFileAsync = promisify(execFile);
|
||||
const TMP_DIR = join(process.env.DATA_DIR ?? '/tmp/.whisper-pwa', 'downloads');
|
||||
@@ -26,43 +27,33 @@ export interface AudioResult {
|
||||
export type DownloadResult = CaptionResult | AudioResult;
|
||||
|
||||
/** Try to get auto-generated captions from YouTube. Returns null if unavailable. */
|
||||
async function tryGetCaptions(url: string, outDir: string): Promise<CaptionResult | null> {
|
||||
const jsonPath = join(outDir, 'info.json');
|
||||
async function tryGetCaptions(url: string, _outDir: string): Promise<CaptionResult | null> {
|
||||
try {
|
||||
await execFileAsync('yt-dlp', [
|
||||
'--write-auto-subs',
|
||||
'--sub-langs', 'en.*',
|
||||
'--skip-download',
|
||||
'--write-info-json',
|
||||
'--no-playlist',
|
||||
'-o', join(outDir, '%(title)s.%(ext)s'),
|
||||
url
|
||||
]);
|
||||
|
||||
// Find the VTT/SRT file
|
||||
const { readdirSync } = await import('fs');
|
||||
const files = readdirSync(outDir);
|
||||
const vttFile = files.find((f) => f.endsWith('.vtt') || f.endsWith('.srt'));
|
||||
if (!vttFile) return null;
|
||||
|
||||
let title = 'Untitled';
|
||||
if (existsSync(jsonPath)) {
|
||||
try {
|
||||
const info = JSON.parse((await import('fs')).readFileSync(jsonPath, 'utf8'));
|
||||
title = info.title ?? title;
|
||||
} catch { /* ignore */ }
|
||||
}
|
||||
|
||||
const content = (await import('fs')).readFileSync(join(outDir, vttFile), 'utf8');
|
||||
const segments = parseVtt(content);
|
||||
const transcript = await fetchTranscript(url, { lang: 'en' });
|
||||
const segments = transcriptEntriesToSegments(transcript);
|
||||
if (segments.length === 0) return null;
|
||||
|
||||
const title = await getYouTubeTitle(url);
|
||||
return { type: 'captions', segments, title };
|
||||
} catch {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Look up a video's title by asking yt-dlp for its metadata JSON.
 *
 * Best-effort: any failure (yt-dlp cannot be run, exits with an error, or
 * prints something that is not JSON) yields the placeholder 'Untitled'
 * instead of throwing, so a caption job is never failed by the title lookup.
 *
 * @param url - Video URL, passed to yt-dlp as a single argv entry (no shell).
 * @returns The metadata's `title` when it is a non-empty string, else 'Untitled'.
 */
async function getYouTubeTitle(url: string): Promise<string> {
  try {
    const { stdout } = await execFileAsync(
      'yt-dlp',
      ['--dump-single-json', '--skip-download', '--no-playlist', url],
      // The metadata dump lists every format and caption track and can exceed
      // execFile's default 1 MiB stdout cap; hitting the cap rejects the call,
      // which would silently degrade the title to 'Untitled'.
      { maxBuffer: 64 * 1024 * 1024 }
    );
    const title: unknown = JSON.parse(stdout).title;
    // Guard against a missing, non-string or blank title in the metadata.
    return typeof title === 'string' && title.trim() !== '' ? title : 'Untitled';
  } catch {
    return 'Untitled';
  }
}
|
||||
|
||||
/** Download best audio from YouTube. Returns path to audio file. */
|
||||
async function downloadAudio(url: string, outDir: string): Promise<{ audioPath: string; title: string }> {
|
||||
await execFileAsync('yt-dlp', [
|
||||
@@ -124,39 +115,22 @@ export async function cleanupJobTmp(jobId: string) {
|
||||
} catch { /* ignore */ }
|
||||
}
|
||||
|
||||
/**
 * Decide whether transcript timings are expressed in milliseconds.
 *
 * Cue duration is the reliable signal: a single caption cue lasting more than
 * 1000 seconds is implausible, so any duration above 1000 means milliseconds.
 * Offsets alone are ambiguous, because a second-based transcript of a video
 * longer than ~16m40s also has offsets above 1000. They are only trusted when
 * every value is a whole number; a fractional value indicates seconds.
 */
function looksLikeMilliseconds(entries: TranscriptResponse[]): boolean {
  if (entries.some((entry) => entry.duration > 1000)) return true;
  if (!entries.some((entry) => entry.offset > 1000)) return false;
  return entries.every(
    (entry) => Number.isInteger(entry.offset) && Number.isInteger(entry.duration)
  );
}

/**
 * Convert fetched transcript entries into caption segments timed in seconds.
 *
 * Entries may carry `offset`/`duration` in either milliseconds or seconds; the
 * unit is detected once for the whole list and millisecond values are divided
 * by 1000. Entries whose text is empty after trimming are dropped, and the
 * remaining segments are numbered consecutively from 0.
 *
 * @param entries - Transcript entries as returned by `fetchTranscript`.
 * @returns Segments with `start`/`end` in seconds, trimmed `text`, and an
 *          empty `words` list (captions carry no word-level timing).
 */
export function transcriptEntriesToSegments(
  entries: TranscriptResponse[]
): Array<{ index: number; start: number; end: number; text: string; words: [] }> {
  const useMilliseconds = looksLikeMilliseconds(entries);

  return entries
    .map((entry) => {
      const start = useMilliseconds ? entry.offset / 1000 : entry.offset;
      const duration = useMilliseconds ? entry.duration / 1000 : entry.duration;
      return {
        index: 0, // placeholder; real indices are assigned after empty cues are removed
        start,
        end: start + duration,
        text: entry.text.trim(),
        words: [] as []
      };
    })
    .filter((entry) => entry.text.length > 0)
    .map((entry, index) => ({ ...entry, index }));
}
|
||||
|
||||
@@ -96,15 +96,13 @@ async function runJob(
|
||||
|
||||
if (captionSegments) {
|
||||
// Caption fast path — skip whisper
|
||||
const { deduplicateSegments } = await import('./postprocess.js');
|
||||
const { writeOutputs } = await import('./formatter.js');
|
||||
const segments = deduplicateSegments(captionSegments);
|
||||
const paths = await writeOutputs(segments, title, jobId);
|
||||
const paths = await writeOutputs(captionSegments, title, jobId);
|
||||
updateJob({
|
||||
id: jobId,
|
||||
status: 'done',
|
||||
progress: 100,
|
||||
segmentsJson: JSON.stringify(segments),
|
||||
segmentsJson: JSON.stringify(captionSegments),
|
||||
outputDir: paths.srt.replace(/\/[^/]+$/, '')
|
||||
});
|
||||
emitProgress(jobId, { type: 'done' });
|
||||
|
||||
@@ -1,235 +0,0 @@
|
||||
import type { Segment } from '$lib/types.js';
|
||||
|
||||
// ── Collapse consecutive repeated phrases within a segment's text ────────────
|
||||
|
||||
function collapseRepeats(text: string): string {
|
||||
let prev = '';
|
||||
// Keep applying until stable
|
||||
while (true) {
|
||||
const next = collapseOnce(text);
|
||||
if (next === prev || next === text) return next;
|
||||
prev = text;
|
||||
text = next;
|
||||
}
|
||||
}
|
||||
|
||||
function collapseOnce(text: string): string {
|
||||
// Match any repeated phrase (2+ words) appearing consecutively
|
||||
return text.replace(/\b(.{10,}?)\s+\1\b/gi, '$1');
|
||||
}
|
||||
|
||||
// ── Merge consecutive segments with identical (or near-identical) text ───────
|
||||
|
||||
function normalise(s: string) {
|
||||
return s.toLowerCase().replace(/[^\w\s]/g, '').replace(/\s+/g, ' ').trim();
|
||||
}
|
||||
|
||||
function mergeConsecutive(segments: Segment[]): Segment[] {
|
||||
const out: Segment[] = [];
|
||||
for (const seg of segments) {
|
||||
const last = out[out.length - 1];
|
||||
if (last && normalise(last.text) === normalise(seg.text)) {
|
||||
last.end = seg.end;
|
||||
} else {
|
||||
out.push({ ...seg });
|
||||
}
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
// ── Collapse rolling prefix/suffix chains from backend segment hypotheses ──────
|
||||
|
||||
const MAX_CHAIN_GAP_SECS = 0.15;
|
||||
const MIN_MEANINGFUL_WORDS = 2;
|
||||
const MIN_MEANINGFUL_CHARS = 8;
|
||||
const MIN_OVERLAP_WORDS = 1;
|
||||
|
||||
function splitWords(text: string): string[] {
|
||||
return text.trim().split(/\s+/).filter(Boolean);
|
||||
}
|
||||
|
||||
function normaliseWords(text: string): string[] {
|
||||
return splitWords(text)
|
||||
.map((word) => word.toLowerCase().replace(/[^\w]/g, ''))
|
||||
.filter(Boolean);
|
||||
}
|
||||
|
||||
function arraysEqual(a: string[], b: string[]): boolean {
|
||||
return a.length === b.length && a.every((value, index) => value === b[index]);
|
||||
}
|
||||
|
||||
function startsWithWords(full: string[], prefix: string[]): boolean {
|
||||
return prefix.length <= full.length && arraysEqual(full.slice(0, prefix.length), prefix);
|
||||
}
|
||||
|
||||
function endsWithWords(full: string[], suffix: string[]): boolean {
|
||||
return suffix.length <= full.length && arraysEqual(full.slice(full.length - suffix.length), suffix);
|
||||
}
|
||||
|
||||
function suffixPrefixOverlap(left: string[], right: string[]): number {
|
||||
const max = Math.min(left.length, right.length);
|
||||
for (let size = max; size >= 1; size--) {
|
||||
if (arraysEqual(left.slice(left.length - size), right.slice(0, size))) return size;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
function isMeaningfulPhrase(words: string[]): boolean {
|
||||
return words.length >= MIN_MEANINGFUL_WORDS && words.join(' ').length >= MIN_MEANINGFUL_CHARS;
|
||||
}
|
||||
|
||||
function isShortCarryover(seg: Segment, words: string[]): boolean {
|
||||
return seg.end - seg.start <= 0.2 || words.length <= 2 || words.join(' ').length <= 16;
|
||||
}
|
||||
|
||||
function trimLeadingWords(text: string, count: number): string {
|
||||
return splitWords(text).slice(count).join(' ').trim();
|
||||
}
|
||||
|
||||
function collapseIncrementalSegments(segments: Segment[]): Segment[] {
|
||||
const out: Segment[] = [];
|
||||
|
||||
for (const seg of segments) {
|
||||
let current: Segment = {
|
||||
...seg,
|
||||
text: seg.text.trim()
|
||||
};
|
||||
|
||||
if (!current.text) continue;
|
||||
|
||||
const last = out[out.length - 1];
|
||||
if (!last) {
|
||||
out.push(current);
|
||||
continue;
|
||||
}
|
||||
|
||||
const gap = current.start - last.end;
|
||||
if (gap > MAX_CHAIN_GAP_SECS) {
|
||||
out.push(current);
|
||||
continue;
|
||||
}
|
||||
|
||||
const lastWords = normaliseWords(last.text);
|
||||
const currentWords = normaliseWords(current.text);
|
||||
if (lastWords.length === 0 || currentWords.length === 0) {
|
||||
out.push(current);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (
|
||||
currentWords.length > lastWords.length &&
|
||||
startsWithWords(currentWords, lastWords) &&
|
||||
(isMeaningfulPhrase(lastWords) || isShortCarryover(last, lastWords))
|
||||
) {
|
||||
last.text = current.text;
|
||||
last.end = current.end;
|
||||
last.words = current.words;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (
|
||||
endsWithWords(lastWords, currentWords) &&
|
||||
(isMeaningfulPhrase(currentWords) || isShortCarryover(current, currentWords))
|
||||
) {
|
||||
last.end = Math.max(last.end, current.end);
|
||||
continue;
|
||||
}
|
||||
|
||||
const overlapWords = suffixPrefixOverlap(lastWords, currentWords);
|
||||
if (overlapWords >= MIN_OVERLAP_WORDS) {
|
||||
const trimmedText = trimLeadingWords(current.text, overlapWords);
|
||||
if (!trimmedText) {
|
||||
last.end = Math.max(last.end, current.end);
|
||||
continue;
|
||||
}
|
||||
|
||||
current = {
|
||||
...current,
|
||||
start: Math.max(current.start, last.end),
|
||||
text: trimmedText,
|
||||
words: []
|
||||
};
|
||||
}
|
||||
|
||||
out.push(current);
|
||||
}
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
// ── N-gram deduplication ─────────────────────────────────────────────────────
|
||||
|
||||
const NGRAM_N = 6;
|
||||
const LOOKBACK_CHARS = 500;
|
||||
const SIMILARITY_THRESHOLD = 0.6;
|
||||
|
||||
function ngrams(text: string, n: number): string[] {
|
||||
const words = text.toLowerCase().split(/\s+/);
|
||||
const grams: string[] = [];
|
||||
for (let i = 0; i <= words.length - n; i++) {
|
||||
grams.push(words.slice(i, i + n).join(' '));
|
||||
}
|
||||
return grams;
|
||||
}
|
||||
|
||||
function jaccardSimilarity(a: string, b: string): number {
|
||||
const ga = new Set(ngrams(a, NGRAM_N));
|
||||
const gb = new Set(ngrams(b, NGRAM_N));
|
||||
// If neither text is long enough to produce n-grams they cannot be compared;
|
||||
// treat as dissimilar so short segments are never incorrectly discarded.
|
||||
if (ga.size === 0 && gb.size === 0) return 0;
|
||||
const intersection = [...ga].filter((g) => gb.has(g)).length;
|
||||
const union = new Set([...ga, ...gb]).size;
|
||||
return union === 0 ? 0 : intersection / union;
|
||||
}
|
||||
|
||||
function ngramDedup(segments: Segment[]): Segment[] {
|
||||
const out: Segment[] = [];
|
||||
for (const seg of segments) {
|
||||
const windowText = out
|
||||
.slice(-20)
|
||||
.map((s) => s.text)
|
||||
.join(' ')
|
||||
.slice(-LOOKBACK_CHARS);
|
||||
|
||||
if (windowText.length > 0 && jaccardSimilarity(seg.text, windowText) >= SIMILARITY_THRESHOLD) {
|
||||
continue; // duplicate — skip
|
||||
}
|
||||
out.push(seg);
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
// ── Full deduplication pipeline ──────────────────────────────────────────────
|
||||
|
||||
export function deduplicateSegments(segments: Segment[]): Segment[] {
|
||||
if (!Array.isArray(segments)) return [];
|
||||
// 1. Collapse repeats within each segment's text
|
||||
let result = segments.map((s) => ({
|
||||
...s,
|
||||
text: collapseRepeats(s.text.trim())
|
||||
}));
|
||||
|
||||
// 2. Remove empty segments
|
||||
result = result.filter((s) => s.text.length > 0);
|
||||
|
||||
// 3. Collapse rolling backend hypotheses before generic dedup
|
||||
result = collapseIncrementalSegments(result);
|
||||
|
||||
// 4. First merge pass
|
||||
result = mergeConsecutive(result);
|
||||
|
||||
// 5. N-gram dedup
|
||||
result = ngramDedup(result);
|
||||
|
||||
// 6. Re-run rolling collapse after removals create new adjacencies
|
||||
result = collapseIncrementalSegments(result);
|
||||
|
||||
// 7. Second merge pass (catches new adjacencies after dedup)
|
||||
result = mergeConsecutive(result);
|
||||
|
||||
// 8. Re-index
|
||||
result.forEach((s, i) => (s.index = i));
|
||||
|
||||
return result;
|
||||
}
|
||||
@@ -1,10 +1,9 @@
|
||||
import { json, error } from '@sveltejs/kit';
|
||||
import { getJob, updateJob } from '$lib/server/db.js';
|
||||
import { deduplicateSegments } from '$lib/server/postprocess.js';
|
||||
import { writeOutputs } from '$lib/server/formatter.js';
|
||||
import type { Segment } from '$lib/types.js';
|
||||
|
||||
/** POST /api/jobs/[id]/reprocess — re-run post-processing and regenerate all output files. */
|
||||
/** POST /api/jobs/[id]/reprocess — regenerate output files from stored canonical segments. */
|
||||
export async function POST({ params }) {
|
||||
const job = getJob(params.id);
|
||||
if (!job) throw error(404, 'Job not found');
|
||||
@@ -14,8 +13,7 @@ export async function POST({ params }) {
|
||||
}
|
||||
|
||||
try {
|
||||
const rawSegments = JSON.parse(job.segmentsJson) as Segment[];
|
||||
const segments = deduplicateSegments(rawSegments);
|
||||
const segments = JSON.parse(job.segmentsJson) as Segment[];
|
||||
|
||||
const paths = await writeOutputs(segments, job.title, job.id);
|
||||
const outputDir = paths.srt.replace(/\/[^/]+$/, '');
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
import { json, error } from '@sveltejs/kit';
|
||||
import { getJob, updateJob, setJobStatus } from '$lib/server/db.js';
|
||||
import { deduplicateSegments } from '$lib/server/postprocess.js';
|
||||
import { writeOutputs } from '$lib/server/formatter.js';
|
||||
import { sendNotification } from '$lib/server/push.js';
|
||||
import { cleanupJobTmp } from '$lib/server/downloader.js';
|
||||
@@ -40,8 +39,7 @@ try {
|
||||
setJobStatus(jobId, 'processing', 90);
|
||||
emitProgress(jobId, { type: 'status', status: 'processing', progress: 90 });
|
||||
|
||||
const rawSegments = (whisperJob.segments ?? []) as Segment[];
|
||||
const segments = deduplicateSegments(rawSegments);
|
||||
const segments = (whisperJob.segments ?? []) as Segment[];
|
||||
|
||||
const paths = await writeOutputs(segments, job.title, jobId);
|
||||
const outputDir = paths.srt.replace(/\/[^/]+$/, '');
|
||||
|
||||
80
src/tests/downloader.test.ts
Normal file
80
src/tests/downloader.test.ts
Normal file
@@ -0,0 +1,80 @@
|
||||
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
|
||||
import { rm } from 'fs/promises';
|
||||
import type { TranscriptResponse } from 'youtube-transcript';
|
||||
|
||||
// Everything the module under test needs at import time must be prepared inside
// vi.hoisted(): static imports are evaluated before any ordinary top-level
// statement, and downloader.js reads process.env.DATA_DIR while it loads. Stubbing
// the variable after the import would leave the module on its default directory
// and make the afterEach cleanup of TEST_DATA_DIR a no-op.
const { mockExecFile, mockFetchTranscript, TEST_DATA_DIR } = vi.hoisted(() => {
  const dataDir = `/tmp/tonemark-downloader-test-${Date.now()}`;
  vi.stubEnv('DATA_DIR', dataDir);
  return {
    mockExecFile: vi.fn(),
    mockFetchTranscript: vi.fn(),
    TEST_DATA_DIR: dataDir
  };
});
|
||||
|
||||
vi.mock('child_process', () => ({
|
||||
execFile: mockExecFile
|
||||
}));
|
||||
|
||||
vi.mock('youtube-transcript', () => ({
|
||||
fetchTranscript: mockFetchTranscript
|
||||
}));
|
||||
|
||||
import { downloadYouTube, transcriptEntriesToSegments } from '$lib/server/downloader.js';
|
||||
|
||||
beforeEach(() => {
  vi.clearAllMocks();
  // downloader.js wraps execFile with util.promisify. The real execFile carries a
  // custom promisify hook that resolves to { stdout, stderr }; a plain vi.fn() has
  // no such hook, so promisify resolves with the first value handed to the
  // callback. Hand the result over as one object so that
  // `const { stdout } = await execFileAsync(...)` actually sees the fake yt-dlp
  // output instead of destructuring `undefined` out of a bare string.
  mockExecFile.mockImplementation((...args: unknown[]) => {
    const cb = args.at(-1) as (
      error: Error | null,
      result: { stdout: string; stderr: string }
    ) => void;
    cb(null, { stdout: JSON.stringify({ title: 'Fetched Title' }), stderr: '' });
  });
});
|
||||
|
||||
afterEach(async () => {
|
||||
await rm(TEST_DATA_DIR, { recursive: true, force: true }).catch(() => {});
|
||||
});
|
||||
|
||||
describe('transcriptEntriesToSegments', () => {
|
||||
it('converts millisecond transcript offsets into second-based segments', () => {
|
||||
const entries: TranscriptResponse[] = [
|
||||
{ text: 'Hello everyone.', offset: 15240, duration: 4240, lang: 'en' },
|
||||
{ text: 'Um, welcome to this talk.', offset: 16600, duration: 5080, lang: 'en' }
|
||||
];
|
||||
|
||||
expect(transcriptEntriesToSegments(entries)).toEqual([
|
||||
{ index: 0, start: 15.24, end: 19.48, text: 'Hello everyone.', words: [] },
|
||||
{ index: 1, start: 16.6, end: 21.68, text: 'Um, welcome to this talk.', words: [] }
|
||||
]);
|
||||
});
|
||||
|
||||
it('preserves second-based transcript offsets and drops empty text', () => {
|
||||
const entries: TranscriptResponse[] = [
|
||||
{ text: ' ', offset: 0, duration: 1.5, lang: 'en' },
|
||||
{ text: 'Clean caption cue', offset: 91.08, duration: 3.72, lang: 'en' }
|
||||
];
|
||||
|
||||
expect(transcriptEntriesToSegments(entries)).toEqual([
|
||||
{ index: 0, start: 91.08, end: 94.8, text: 'Clean caption cue', words: [] }
|
||||
]);
|
||||
});
|
||||
});
|
||||
|
||||
describe('downloadYouTube', () => {
|
||||
it('uses fetched transcript entries directly for caption jobs', async () => {
|
||||
mockFetchTranscript.mockResolvedValue([
|
||||
{ text: 'Hello everyone.', offset: 15240, duration: 4240, lang: 'en' },
|
||||
{ text: 'Um, welcome to this talk.', offset: 16600, duration: 5080, lang: 'en' }
|
||||
] satisfies TranscriptResponse[]);
|
||||
|
||||
const result = await downloadYouTube('https://youtube.com/watch?v=qdh_x-uRs9g', 'job-1');
|
||||
|
||||
expect(mockFetchTranscript).toHaveBeenCalledWith('https://youtube.com/watch?v=qdh_x-uRs9g', {
|
||||
lang: 'en'
|
||||
});
|
||||
expect(result).toMatchObject({
|
||||
type: 'captions',
|
||||
segments: [
|
||||
{ index: 0, start: 15.24, end: 19.48, text: 'Hello everyone.', words: [] },
|
||||
{ index: 1, start: 16.6, end: 21.68, text: 'Um, welcome to this talk.', words: [] }
|
||||
]
|
||||
});
|
||||
});
|
||||
});
|
||||
@@ -1,204 +0,0 @@
|
||||
import { describe, it, expect } from 'vitest';
|
||||
import {
|
||||
deduplicateSegments
|
||||
} from '$lib/server/postprocess.js';
|
||||
import type { Segment } from '$lib/types.js';
|
||||
|
||||
// ── helpers ──────────────────────────────────────────────────────────────────
|
||||
|
||||
function seg(index: number, start: number, end: number, text: string): Segment {
|
||||
return { index, start, end, text, words: [] };
|
||||
}
|
||||
|
||||
// ── collapseRepeats (tested indirectly via deduplicateSegments) ───────────────
|
||||
|
||||
describe('deduplicateSegments — collapseRepeats', () => {
|
||||
it('leaves text without repetition unchanged', () => {
|
||||
const input = [seg(0, 0, 5, ' Hello world, this is a sentence.')];
|
||||
const [out] = deduplicateSegments(input);
|
||||
expect(out.text).toBe('Hello world, this is a sentence.');
|
||||
});
|
||||
|
||||
it('collapses a consecutive repeated phrase inside a segment', () => {
|
||||
const input = [seg(0, 0, 5, ' the quick brown fox the quick brown fox')];
|
||||
const [out] = deduplicateSegments(input);
|
||||
expect(out.text).not.toMatch(/the quick brown fox.*the quick brown fox/i);
|
||||
});
|
||||
|
||||
it('handles multiple repetitions recursively', () => {
|
||||
// "welcome everyone" = 16 chars — qualifies for the ≥10-char collapse regex
|
||||
const input = [seg(0, 0, 5, ' welcome everyone welcome everyone welcome everyone')];
|
||||
const result = deduplicateSegments(input);
|
||||
const text = result[0]?.text ?? '';
|
||||
expect((text.match(/welcome everyone/gi) ?? []).length).toBeLessThan(3);
|
||||
});
|
||||
});
|
||||
|
||||
// ── mergeConsecutive ──────────────────────────────────────────────────────────
|
||||
|
||||
describe('deduplicateSegments — mergeConsecutive', () => {
|
||||
it('merges adjacent segments with identical text', () => {
|
||||
const input = [
|
||||
seg(0, 0, 2, ' Hello world.'),
|
||||
seg(1, 2, 4, ' Hello world.')
|
||||
];
|
||||
const result = deduplicateSegments(input);
|
||||
expect(result).toHaveLength(1);
|
||||
expect(result[0].end).toBe(4);
|
||||
});
|
||||
|
||||
it('keeps adjacent segments with different text', () => {
|
||||
const input = [
|
||||
seg(0, 0, 2, ' First sentence.'),
|
||||
seg(1, 2, 4, ' Second sentence.')
|
||||
];
|
||||
const result = deduplicateSegments(input);
|
||||
expect(result).toHaveLength(2);
|
||||
});
|
||||
|
||||
it('normalises punctuation and case for merge comparison', () => {
|
||||
const input = [
|
||||
seg(0, 0, 2, ' Hello, World!'),
|
||||
seg(1, 2, 4, ' hello world')
|
||||
];
|
||||
const result = deduplicateSegments(input);
|
||||
expect(result).toHaveLength(1);
|
||||
});
|
||||
});
|
||||
|
||||
// ── rolling prefix/suffix chain collapse ───────────────────────────────────────
|
||||
|
||||
describe('deduplicateSegments — rolling backend hypotheses', () => {
|
||||
it('collapses prefix-growth chains from stored backend segments', () => {
|
||||
const input = [
|
||||
seg(0, 15.24, 16.6, 'Hello everyone.'),
|
||||
seg(1, 16.6, 19.47, 'Hello everyone. Um, welcome to this talk.'),
|
||||
seg(2, 19.47, 19.48, 'Um, welcome to this talk.'),
|
||||
seg(3, 19.48, 21.67, "Um, welcome to this talk. I'll be speaking about small model"),
|
||||
seg(4, 21.67, 21.68, "I'll be speaking about small model"),
|
||||
seg(5, 21.68, 24.59, "I'll be speaking about small model inference and a gap that we've")
|
||||
];
|
||||
|
||||
const result = deduplicateSegments(input);
|
||||
|
||||
expect(result).toHaveLength(2);
|
||||
expect(result[0]).toMatchObject({
|
||||
index: 0,
|
||||
start: 15.24,
|
||||
end: 19.48,
|
||||
text: 'Hello everyone. Um, welcome to this talk.'
|
||||
});
|
||||
expect(result[1]).toMatchObject({
|
||||
index: 1,
|
||||
start: 19.48,
|
||||
end: 24.59,
|
||||
text: "I'll be speaking about small model inference and a gap that we've"
|
||||
});
|
||||
});
|
||||
|
||||
it('does not collapse similar phrases when there is a real timing gap', () => {
|
||||
const input = [
|
||||
seg(0, 0, 1, 'Hello everyone.'),
|
||||
seg(1, 2, 4, 'Hello everyone. Welcome back.')
|
||||
];
|
||||
|
||||
const result = deduplicateSegments(input);
|
||||
|
||||
expect(result).toHaveLength(2);
|
||||
expect(result[0].text).toBe('Hello everyone.');
|
||||
expect(result[1].text).toBe('Hello everyone. Welcome back.');
|
||||
});
|
||||
|
||||
it('collapses tiny one-word carry-over segments from caption-style output', () => {
|
||||
const input = [
|
||||
seg(0, 94.8, 96.4, 'world.'),
|
||||
seg(1, 96.4, 98.96, 'world. And that aspect that I overlooked was'),
|
||||
seg(2, 98.96, 100.72, 'inference.'),
|
||||
seg(3, 100.72, 103.92, 'inference. So, as someone who kind of wants to'),
|
||||
seg(4, 107.19, 107.2, 'and'),
|
||||
seg(5, 107.2, 109.56, 'and work to understand the problems and the')
|
||||
];
|
||||
|
||||
const result = deduplicateSegments(input);
|
||||
|
||||
expect(result).toHaveLength(3);
|
||||
expect(result[0].text).toBe('world. And that aspect that I overlooked was');
|
||||
expect(result[1].text).toBe('inference. So, as someone who kind of wants to');
|
||||
expect(result[2].text).toBe('and work to understand the problems and the');
|
||||
});
|
||||
|
||||
it('trims single-word suffix-prefix overlap between adjacent segments', () => {
|
||||
const input = [
|
||||
seg(0, 94.8, 96.4, 'world.'),
|
||||
seg(1, 96.4, 98.96, 'world. And that aspect that I overlooked was'),
|
||||
seg(2, 120.12, 123.71, 'to find more about inference.'),
|
||||
seg(3, 123.72, 126.92, "inference. So, I've done a lot of work with VLAM,")
|
||||
];
|
||||
|
||||
const result = deduplicateSegments(input);
|
||||
|
||||
expect(result).toHaveLength(3);
|
||||
expect(result[0].text).toBe('world. And that aspect that I overlooked was');
|
||||
expect(result[2].text).toBe("So, I've done a lot of work with VLAM,");
|
||||
});
|
||||
});
|
||||
|
||||
// ── ngramDedup ────────────────────────────────────────────────────────────────
|
||||
|
||||
describe('deduplicateSegments — ngramDedup', () => {
|
||||
it('passes through completely unique segments', () => {
|
||||
const input = [
|
||||
seg(0, 0, 5, ' The cat sat on the mat quite happily today.'),
|
||||
seg(1, 5, 10, ' Later the dog ran across the yard chasing a ball.')
|
||||
];
|
||||
expect(deduplicateSegments(input)).toHaveLength(2);
|
||||
});
|
||||
|
||||
it('removes a segment that is highly similar to recent context', () => {
|
||||
// Repeat a long sentence verbatim — should be caught as duplicate
|
||||
const longText =
|
||||
' This is a very specific and unique sentence about transcription quality matters greatly.';
|
||||
const input = [seg(0, 0, 5, longText), seg(1, 5, 10, longText)];
|
||||
// After mergeConsecutive the second one is already merged, so result is 1
|
||||
expect(deduplicateSegments(input)).toHaveLength(1);
|
||||
});
|
||||
});
|
||||
|
||||
// ── deduplicateSegments — full pipeline ──────────────────────────────────────
|
||||
|
||||
describe('deduplicateSegments — full pipeline', () => {
|
||||
it('returns empty array for empty input', () => {
|
||||
expect(deduplicateSegments([])).toEqual([]);
|
||||
});
|
||||
|
||||
it('removes segments whose text is empty after trimming', () => {
|
||||
const input = [seg(0, 0, 1, ' '), seg(1, 1, 2, ' Hello.')];
|
||||
const result = deduplicateSegments(input);
|
||||
expect(result).toHaveLength(1);
|
||||
expect(result[0].text).toBe('Hello.');
|
||||
});
|
||||
|
||||
it('re-indexes output segments starting from 0', () => {
|
||||
const input = [
|
||||
seg(5, 0, 2, ' First unique sentence here.'),
|
||||
seg(8, 2, 4, ' Second different sentence there.')
|
||||
];
|
||||
const result = deduplicateSegments(input);
|
||||
result.forEach((s, i) => expect(s.index).toBe(i));
|
||||
});
|
||||
|
||||
it('runs the full pipeline: trim → remove empty → merge → ngram → merge → reindex', () => {
|
||||
const input = [
|
||||
seg(0, 0, 2, ' Good morning everyone.'),
|
||||
seg(1, 2, 3, ' '), // empty — removed
|
||||
seg(2, 3, 5, ' Good morning everyone.'), // duplicate — merged
|
||||
seg(3, 5, 7, ' Welcome to our presentation today.')
|
||||
];
|
||||
const result = deduplicateSegments(input);
|
||||
expect(result).toHaveLength(2);
|
||||
expect(result[0].text).toBe('Good morning everyone.');
|
||||
expect(result[1].text).toBe('Welcome to our presentation today.');
|
||||
expect(result[0].index).toBe(0);
|
||||
expect(result[1].index).toBe(1);
|
||||
});
|
||||
});
|
||||
@@ -7,7 +7,6 @@ const {
|
||||
mockGetJob,
|
||||
mockUpdateJob,
|
||||
mockSetJobStatus,
|
||||
mockDeduplicateSegments,
|
||||
mockWriteOutputs,
|
||||
mockSendNotification,
|
||||
mockCleanupJobTmp,
|
||||
@@ -16,7 +15,6 @@ const {
|
||||
mockGetJob: vi.fn(),
|
||||
mockUpdateJob: vi.fn(),
|
||||
mockSetJobStatus: vi.fn(),
|
||||
mockDeduplicateSegments: vi.fn((segs: Segment[]) => segs),
|
||||
mockWriteOutputs: vi.fn(),
|
||||
mockSendNotification: vi.fn(),
|
||||
mockCleanupJobTmp: vi.fn(),
|
||||
@@ -29,10 +27,6 @@ vi.mock('$lib/server/db.js', () => ({
|
||||
setJobStatus: mockSetJobStatus
|
||||
}));
|
||||
|
||||
vi.mock('$lib/server/postprocess.js', () => ({
|
||||
deduplicateSegments: mockDeduplicateSegments
|
||||
}));
|
||||
|
||||
vi.mock('$lib/server/formatter.js', () => ({
|
||||
writeOutputs: mockWriteOutputs
|
||||
}));
|
||||
@@ -91,7 +85,6 @@ function makeSeg(index: number, text: string): Segment {
|
||||
|
||||
beforeEach(() => {
|
||||
vi.clearAllMocks();
|
||||
mockDeduplicateSegments.mockImplementation((segs: Segment[]) => segs);
|
||||
mockWriteOutputs.mockResolvedValue({
|
||||
srt: '/out/dir/title.srt',
|
||||
txt: '/out/dir/title.txt',
|
||||
@@ -218,25 +211,21 @@ describe('POST /api/webhook/[jobId] — whisper failure', () => {
|
||||
describe('POST /api/webhook/[jobId] — success with segments', () => {
|
||||
const segments = [makeSeg(0, 'Hello world.'), makeSeg(1, 'This is a test.')];
|
||||
|
||||
it('runs deduplication on received segments', async () => {
|
||||
it('passes received segments through unchanged', async () => {
|
||||
mockGetJob.mockReturnValue(makeJob('job-3'));
|
||||
await POST(makeEvent('job-3', makeWhisperJob({ segments })) as any);
|
||||
expect(mockDeduplicateSegments).toHaveBeenCalledWith(segments);
|
||||
expect(mockWriteOutputs).toHaveBeenCalledWith(segments, 'Test Video', 'job-3');
|
||||
});
|
||||
|
||||
it('calls writeOutputs with the deduplicated segments and job title', async () => {
|
||||
it('calls writeOutputs with the received segments and job title', async () => {
|
||||
mockGetJob.mockReturnValue(makeJob('job-4', 'My Lecture'));
|
||||
const deduped = [makeSeg(0, 'Hello world.')];
|
||||
mockDeduplicateSegments.mockReturnValue(deduped);
|
||||
|
||||
await POST(makeEvent('job-4', makeWhisperJob({ segments })) as any);
|
||||
expect(mockWriteOutputs).toHaveBeenCalledWith(deduped, 'My Lecture', 'job-4');
|
||||
expect(mockWriteOutputs).toHaveBeenCalledWith(segments, 'My Lecture', 'job-4');
|
||||
});
|
||||
|
||||
it('stores serialised segments_json in the database', async () => {
|
||||
mockGetJob.mockReturnValue(makeJob('job-5'));
|
||||
const deduped = [makeSeg(0, 'Result text.')];
|
||||
mockDeduplicateSegments.mockReturnValue(deduped);
|
||||
|
||||
await POST(makeEvent('job-5', makeWhisperJob({ segments })) as any);
|
||||
|
||||
@@ -244,7 +233,7 @@ describe('POST /api/webhook/[jobId] — success with segments', () => {
|
||||
expect.objectContaining({
|
||||
id: 'job-5',
|
||||
status: 'done',
|
||||
segmentsJson: JSON.stringify(deduped)
|
||||
segmentsJson: JSON.stringify(segments)
|
||||
})
|
||||
);
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user