Some checks failed
Build & Push Docker Image / build-and-push (push) Failing after 11s
Tonemark is a SvelteKit PWA for transcribing YouTube videos, audio and video files, and microphone recordings using a local Whisper backend. Features: - Dark glassmorphic UI with electric-lime accent (5 switchable themes) - Rail nav (desktop) / tab bar (mobile) layout - Drop zone, YouTube URL input, and live audio recording inputs - Audio mode waveform cards (none / standard / aggressive / auto) - Real-time transcription progress with animated waveform - Job queue with SSE streaming updates - Push notifications on job completion - PWA with native SvelteKit service worker - SRT / TXT / MD / JSON transcript downloads Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
109 lines
3.2 KiB
TypeScript
109 lines
3.2 KiB
TypeScript
import type { Segment } from '$lib/types.js';
|
|
|
|
// ── Collapse consecutive repeated phrases within a segment's text ────────────
|
|
|
|
/**
 * Repeatedly collapses consecutive repeated phrases in `text` until a further
 * pass no longer changes the string.
 *
 * @param text - Text of a single segment.
 * @returns The text once `collapseOnce` has reached a fixed point.
 */
function collapseRepeats(text: string): string {
	let previous = '';
	let current = text;
	for (;;) {
		const collapsed = collapseOnce(current);
		// Stop when the pass changed nothing, or when it reproduced the string
		// seen one step earlier (guards against bouncing between two states).
		if (collapsed === current || collapsed === previous) {
			return collapsed;
		}
		previous = current;
		current = collapsed;
	}
}
|
|
|
|
/**
 * Performs a single pass that removes immediately repeated phrases.
 *
 * A phrase is any run of at least 10 characters (not spanning a line break)
 * that starts on a word boundary and is followed by whitespace and then the
 * same run again, compared case-insensitively. The first occurrence is kept.
 *
 * @param text - Text to scan.
 * @returns `text` with each matched "phrase phrase" pair reduced to "phrase".
 */
function collapseOnce(text: string): string {
	const repeatedPhrase = /\b(.{10,}?)\s+\1\b/gi;
	// Keep only the captured first occurrence of every doubled phrase.
	return text.replace(repeatedPhrase, (_whole, phrase: string) => phrase);
}
|
|
|
|
// ── Merge consecutive segments with identical (or near-identical) text ───────
|
|
|
|
/**
 * Produces a comparison key for a piece of text: lower-cased, punctuation
 * removed, whitespace runs collapsed to a single space, and trimmed.
 *
 * Letters, combining marks and digits of any script are preserved. An
 * ASCII-only `\w` class would strip every non-Latin character, so unrelated
 * non-English texts would all normalise to the empty string and compare equal.
 *
 * @param s - Raw text.
 * @returns The normalised key.
 */
function normalise(s: string): string {
	return s
		.toLowerCase()
		// Drop everything that is not a letter, mark, digit, underscore or whitespace.
		.replace(/[^\p{L}\p{M}\p{N}_\s]/gu, '')
		.replace(/\s+/g, ' ')
		.trim();
}
|
|
|
|
/**
 * Folds runs of adjacent segments whose normalised text is equal into one
 * segment. The first segment of a run is kept (as a shallow copy) and its
 * `end` is moved to the `end` of the last segment in the run.
 *
 * @param segments - Segments in their original order; the objects are not mutated.
 * @returns A new array of copied segments with adjacent duplicates merged.
 */
function mergeConsecutive(segments: Segment[]): Segment[] {
	return segments.reduce<Segment[]>((merged, current) => {
		const tail = merged[merged.length - 1];
		if (tail === undefined || normalise(tail.text) !== normalise(current.text)) {
			// Different text (or nothing collected yet): start a new entry.
			merged.push({ ...current });
		} else {
			// Same text as the previous entry: just stretch its time range.
			tail.end = current.end;
		}
		return merged;
	}, []);
}
|
|
|
|
// ── N-gram deduplication ─────────────────────────────────────────────────────
|
|
|
|
/** Number of words per n-gram when comparing texts in `jaccardSimilarity`. */
const NGRAM_N = 6;

/** Maximum number of trailing characters of already-kept text that `ngramDedup` compares against. */
const LOOKBACK_CHARS = 500;

/** Similarity score at or above which `ngramDedup` drops a segment as a duplicate. */
const SIMILARITY_THRESHOLD = 0.6;
|
|
|
|
/**
 * Splits `text` into lower-cased, whitespace-separated words and returns every
 * run of `n` consecutive words, joined with single spaces.
 *
 * Empty tokens produced by leading/trailing whitespace (or an empty string)
 * are discarded so they never form phantom n-grams.
 *
 * @param text - Text to tokenise.
 * @param n - Words per n-gram; values below 1 yield no n-grams.
 * @returns The n-grams in order of appearance; empty when `text` has fewer than `n` words.
 */
function ngrams(text: string, n: number): string[] {
	if (n < 1) return [];
	// split() yields '' for leading/trailing whitespace and for empty input.
	const words = text
		.toLowerCase()
		.split(/\s+/)
		.filter((word) => word.length > 0);
	const grams: string[] = [];
	for (let start = 0; start + n <= words.length; start++) {
		grams.push(words.slice(start, start + n).join(' '));
	}
	return grams;
}
|
|
|
|
/**
 * Jaccard similarity of the distinct word n-grams (size `NGRAM_N`) of two texts:
 * the number of shared n-grams divided by the number of n-grams in either text.
 *
 * @param a - First text.
 * @param b - Second text.
 * @returns A value from 0 to 1. The result is 0 whenever either text is too
 *   short to produce an n-gram, so short texts never count as similar.
 */
function jaccardSimilarity(a: string, b: string): number {
	const gramsA = new Set(ngrams(a, NGRAM_N));
	const gramsB = new Set(ngrams(b, NGRAM_N));
	// With an empty side there is nothing in common, so the score is 0.
	if (gramsA.size === 0 || gramsB.size === 0) {
		return 0;
	}

	let shared = 0;
	for (const gram of gramsA) {
		if (gramsB.has(gram)) shared += 1;
	}
	// |A ∪ B| = |A| + |B| − |A ∩ B|
	const combined = gramsA.size + gramsB.size - shared;
	return shared / combined;
}
|
|
|
|
/**
 * Drops segments whose text is too similar to the text that was recently kept.
 *
 * Each candidate is compared against a window built from the texts of the last
 * 20 kept segments, joined with spaces and cut to the final `LOOKBACK_CHARS`
 * characters. The candidate is skipped when its similarity to that window is
 * at least `SIMILARITY_THRESHOLD`.
 *
 * NOTE(review): the window can contain many more n-grams than one segment, so
 * a short segment that is fully contained in the window can still score well
 * below the threshold — confirm that plain Jaccard is the intended measure.
 *
 * @param segments - Segments in order.
 * @returns The kept segments (same object references, not copies).
 */
function ngramDedup(segments: Segment[]): Segment[] {
	const kept: Segment[] = [];
	for (const candidate of segments) {
		const recentTexts = kept.slice(-20).map(({ text }) => text);
		const context = recentTexts.join(' ').slice(-LOOKBACK_CHARS);
		const isDuplicate =
			context.length > 0 && jaccardSimilarity(candidate.text, context) >= SIMILARITY_THRESHOLD;
		if (!isDuplicate) {
			kept.push(candidate);
		}
	}
	return kept;
}
|
|
|
|
// ── Full deduplication pipeline ──────────────────────────────────────────────
|
|
|
|
/**
 * Runs the full de-duplication pipeline over a list of segments.
 *
 * Steps, in order: trim each text and collapse repeated phrases inside it;
 * drop segments whose text became empty; merge adjacent segments with equal
 * normalised text; drop segments that closely repeat recently kept text;
 * merge again to catch segments made adjacent by the previous step; finally
 * set each segment's `index` to its position in the result.
 *
 * @param segments - Input segments; the objects passed in are not mutated.
 * @returns A new array of cleaned, re-indexed segment copies.
 */
export function deduplicateSegments(segments: Segment[]): Segment[] {
	// Steps 1–2: per-segment clean-up, then discard anything left empty.
	const cleaned = segments
		.map((segment) => ({ ...segment, text: collapseRepeats(segment.text.trim()) }))
		.filter((segment) => segment.text.length > 0);

	// Steps 3–5: merge, n-gram dedup, merge again.
	const deduped = mergeConsecutive(ngramDedup(mergeConsecutive(cleaned)));

	// Step 6: renumber so indices are contiguous from 0.
	deduped.forEach((segment, position) => {
		segment.index = position;
	});

	return deduped;
}
|