Initial commit: Tonemark PWA

Tonemark is a SvelteKit PWA for transcribing YouTube videos, audio and video files, and microphone recordings using a local Whisper backend.

Features:
- Dark glassmorphic UI with electric-lime accent (5 switchable themes)
- Rail nav (desktop) / tab bar (mobile) layout
- Drop zone, YouTube URL input, and live audio recording inputs
- Audio mode waveform cards (none / standard / aggressive / auto)
- Real-time transcription progress with animated waveform
- Job queue with SSE streaming updates
- Push notifications on job completion
- PWA with native SvelteKit service worker
- SRT / TXT / MD / JSON transcript downloads

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-05-06 16:41:25 +02:00
commit 13a96b6efa
68 changed files with 9712 additions and 0 deletions
--- a/src/lib/server/postprocess.ts
+++ b/src/lib/server/postprocess.ts
@@ -0,0 +1,108 @@
+import type { Segment } from '$lib/types.js';
+
+// ── Collapse consecutive repeated phrases within a segment's text ────────────
+
/**
 * Runs `collapseOnce` over the text again and again until the result settles.
 *
 * The loop ends when a pass returns its own input unchanged, or when it returns
 * the input of the pass before it (which would otherwise alternate forever).
 *
 * @param text - Text to clean up.
 * @returns The result of the final `collapseOnce` pass.
 */
function collapseRepeats(text: string): string {
	let previous = '';
	let current = text;
	for (;;) {
		const collapsed = collapseOnce(current);
		const settled = collapsed === previous || collapsed === current;
		if (settled) {
			return collapsed;
		}
		// Shift the history forward by one pass.
		previous = current;
		current = collapsed;
	}
}
+
/**
 * Removes immediately repeated phrases from `text` in a single pass.
 *
 * A phrase is any run of at least 10 characters (never spanning a line break)
 * that is followed by whitespace and then by the same run again, compared
 * case-insensitively. Each such pair is replaced by its first occurrence.
 * A phrase repeated three or more times may need more than one pass.
 *
 * @param text - Text to scan.
 * @returns The text with every matched repetition reduced to a single copy.
 */
function collapseOnce(text: string): string {
	// The look-arounds stop a phrase from starting or ending in the middle of a
	// word. They replace the former `\b` anchors, which (a) could never match
	// after a phrase ending in punctuation ("... so much. ... so much.") and
	// (b) only recognise ASCII word characters, so repeats in non-Latin scripts
	// were never detected. The `u` flag enables the \p{…} classes.
	return text.replace(/(?<![\p{L}\p{N}_])(.{10,}?)\s+\1(?![\p{L}\p{N}_])/giu, '$1');
}
+
+// ── Merge consecutive segments with identical (or near-identical) text ───────
+
/**
 * Produces a comparison key for a piece of transcript text: lower-cased,
 * stripped of punctuation and symbols, with whitespace runs collapsed to a
 * single space and no leading or trailing space.
 *
 * Letters, combining marks and digits of every script are kept. The previous
 * ASCII-only `\w` class deleted all non-ASCII letters, which reduced text in
 * other scripts to an empty key and made unrelated segments compare as equal.
 *
 * @param s - Raw text.
 * @returns The normalised key; `''` when the text holds no letters, digits or underscores.
 */
function normalise(s: string): string {
	return s
		.toLowerCase()
		.replace(/[^\p{L}\p{M}\p{N}_\s]/gu, '')
		.replace(/\s+/g, ' ')
		.trim();
}
+
/**
 * Folds runs of neighbouring segments whose normalised text is equal into one.
 *
 * The first segment of a run is kept (as a shallow copy) and its `end` is
 * overwritten with the `end` of each following segment in the run. The input
 * array and its segment objects are left untouched.
 *
 * @param segments - Segments in their original order.
 * @returns A new array of copied segments with adjacent repeats merged.
 */
function mergeConsecutive(segments: Segment[]): Segment[] {
	const merged: Segment[] = [];
	for (const segment of segments) {
		const tail = merged[merged.length - 1];
		// Nothing kept yet, or the text differs: start a new entry.
		if (!tail || normalise(tail.text) !== normalise(segment.text)) {
			merged.push({ ...segment });
			continue;
		}
		// Same text as the entry before it: stretch that entry instead.
		tail.end = segment.end;
	}
	return merged;
}
+
+// ── N-gram deduplication ─────────────────────────────────────────────────────
+
/** Number of words per n-gram when `jaccardSimilarity` compares two texts. */
const NGRAM_N = 6;

/** Maximum number of trailing characters of already-kept text that `ngramDedup` compares against. */
const LOOKBACK_CHARS = 500;

/** Similarity score at or above which `ngramDedup` drops a segment as a duplicate. */
const SIMILARITY_THRESHOLD = 0.6;
+
/**
 * Splits `text` into lower-cased, whitespace-separated words and returns every
 * run of `n` consecutive words, each joined with a single space.
 *
 * Leading, trailing and repeated whitespace never produces empty words, so
 * empty or whitespace-only text yields no n-grams.
 *
 * @param text - Text to tokenise.
 * @param n - Words per n-gram; values below 1 (or NaN) yield an empty result.
 * @returns The n-grams in order of appearance; empty when the text has fewer than `n` words.
 */
function ngrams(text: string, n: number): string[] {
	// A window of zero or negative width has no meaningful n-grams.
	if (!(n >= 1)) return [];

	// Matching non-space runs (instead of splitting on whitespace) avoids the
	// empty-string tokens that split() emits at the edges of padded text.
	const words: string[] = text.toLowerCase().match(/\S+/g) ?? [];

	const grams: string[] = [];
	for (let i = 0; i + n <= words.length; i++) {
		grams.push(words.slice(i, i + n).join(' '));
	}
	return grams;
}
+
/**
 * Jaccard similarity of two texts over their sets of `NGRAM_N`-word n-grams:
 * the number of distinct n-grams present in both, divided by the number of
 * distinct n-grams present in either.
 *
 * NOTE(review): because the union is the denominator, the score can never
 * exceed (smaller set size / larger set size). `ngramDedup` compares a single
 * segment with a window of up to `LOOKBACK_CHARS` characters, so a short
 * segment may be unable to reach the threshold even when fully repeated —
 * confirm whether a containment measure was intended.
 *
 * @param a - First text.
 * @param b - Second text.
 * @returns A value from 0 to 1; 0 when neither text is long enough to yield an n-gram.
 */
function jaccardSimilarity(a: string, b: string): number {
	const gramsA = new Set(ngrams(a, NGRAM_N));
	const gramsB = new Set(ngrams(b, NGRAM_N));

	// Two texts without any n-grams cannot be compared; report them as
	// dissimilar so short segments are never discarded by mistake.
	if (gramsA.size === 0 && gramsB.size === 0) {
		return 0;
	}

	let shared = 0;
	for (const gram of gramsA) {
		if (gramsB.has(gram)) shared += 1;
	}

	// |A ∪ B| = |A| + |B| − |A ∩ B|; non-zero here since one set is non-empty.
	const unionSize = gramsA.size + gramsB.size - shared;
	return shared / unionSize;
}
+
/**
 * Drops segments that look like a repeat of recently kept text.
 *
 * For each segment, the texts of up to the last 20 kept segments are joined
 * with spaces and trimmed to their final `LOOKBACK_CHARS` characters. The
 * segment is skipped when that window is non-empty and its similarity to the
 * segment reaches `SIMILARITY_THRESHOLD`. Kept segments are the same objects
 * as in the input (no copies are made).
 *
 * @param segments - Segments in their original order.
 * @returns A new array holding only the segments that were kept.
 */
function ngramDedup(segments: Segment[]): Segment[] {
	const kept: Segment[] = [];
	for (const segment of segments) {
		const recentText = kept
			.slice(-20)
			.map(({ text }) => text)
			.join(' ');
		const lookback = recentText.slice(-LOOKBACK_CHARS);

		// An empty window (nothing kept yet) can never mark a duplicate.
		const isDuplicate =
			lookback.length > 0 &&
			jaccardSimilarity(segment.text, lookback) >= SIMILARITY_THRESHOLD;

		if (!isDuplicate) {
			kept.push(segment);
		}
	}
	return kept;
}
+
+// ── Full deduplication pipeline ──────────────────────────────────────────────
+
/**
 * Runs the full clean-up pipeline over a list of transcript segments.
 *
 * Stages, in order: trim each text and collapse repeated phrases inside it;
 * drop segments whose text is now empty; merge adjacent segments with equal
 * text; remove n-gram duplicates; merge again (removals can make equal
 * segments adjacent); finally renumber `index` from 0.
 *
 * @param segments - Raw segments; neither the array nor its objects are modified.
 * @returns A new array of cleaned, merged and re-indexed segment copies.
 */
export function deduplicateSegments(segments: Segment[]): Segment[] {
	// Stages 1–2: per-segment text clean-up, then discard the empties.
	const cleaned = segments
		.map((segment) => ({ ...segment, text: collapseRepeats(segment.text.trim()) }))
		.filter((segment) => segment.text.length > 0);

	// Stages 3–5: merge, n-gram dedup, merge once more.
	const firstMerge = mergeConsecutive(cleaned);
	const withoutDuplicates = ngramDedup(firstMerge);
	const finalSegments = mergeConsecutive(withoutDuplicates);

	// Stage 6: positions have shifted, so assign fresh consecutive indices.
	for (const [position, segment] of finalSegments.entries()) {
		segment.index = position;
	}

	return finalSegments;
}