Initial commit: Tonemark PWA
Some checks failed
Build & Push Docker Image / build-and-push (push) Failing after 11s
Some checks failed
Build & Push Docker Image / build-and-push (push) Failing after 11s
Tonemark is a SvelteKit PWA for transcribing YouTube videos, audio and video files, and microphone recordings using a local Whisper backend. Features: - Dark glassmorphic UI with electric-lime accent (5 switchable themes) - Rail nav (desktop) / tab bar (mobile) layout - Drop zone, YouTube URL input, and live audio recording inputs - Audio mode waveform cards (none / standard / aggressive / auto) - Real-time transcription progress with animated waveform - Job queue with SSE streaming updates - Push notifications on job completion - PWA with native SvelteKit service worker - SRT / TXT / MD / JSON transcript downloads Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
108
src/lib/server/postprocess.ts
Normal file
108
src/lib/server/postprocess.ts
Normal file
@@ -0,0 +1,108 @@
|
||||
import type { Segment } from '$lib/types.js';
|
||||
|
||||
// ── Collapse consecutive repeated phrases within a segment's text ────────────
|
||||
|
||||
/**
 * Repeatedly collapses consecutive repeated phrases in `text` until a pass
 * no longer changes anything.
 *
 * @param text - Text of a single segment.
 * @returns The text once `collapseOnce` leaves it unchanged (or returns the
 *   value seen one pass earlier).
 */
function collapseRepeats(text: string): string {
  let previous = '';
  let current = text;

  for (;;) {
    const collapsed = collapseOnce(current);

    // Done when the pass was a no-op, or when it reproduced the input of the
    // preceding pass.
    const settled = collapsed === previous || collapsed === current;
    if (settled) {
      return collapsed;
    }

    previous = current;
    current = collapsed;
  }
}
|
||||
|
||||
/**
 * Collapses immediately repeated phrases in `text` in a single pass.
 *
 * A phrase is any run of at least 10 characters (not spanning a line break)
 * that is followed by whitespace and then by the same run again, compared
 * case-insensitively. The first occurrence is kept and the repetition is
 * dropped. A single pass removes one repetition per match, so a phrase that
 * occurs three or more times in a row needs several passes.
 *
 * @param text - Text to scan.
 * @returns The text with each matched "phrase phrase" pair reduced to "phrase".
 */
function collapseOnce(text: string): string {
  // Lookarounds are used instead of `\b` so that a phrase may end in
  // punctuation: `\b` never matches between "." and whitespace or the end of
  // the string, which previously left repeated sentences such as
  // "Thank you. Thank you." untouched. `(?!\w)` still prevents matching a
  // repetition that is only the prefix of a longer word.
  return text.replace(/(?<!\w)(.{10,}?)\s+\1(?!\w)/gi, '$1');
}
|
||||
|
||||
// ── Merge consecutive segments with identical (or near-identical) text ───────
|
||||
|
||||
/**
 * Produces a comparison key for segment text: lower-cased, with punctuation
 * and symbols removed and whitespace runs collapsed to single spaces.
 *
 * Letters, combining marks and digits of every script are kept (plus `_`),
 * so text in non-Latin scripts does not normalise to an empty string and
 * compare equal to unrelated text.
 *
 * @param s - Raw segment text.
 * @returns The normalised key; empty when `s` has no letters, digits or `_`.
 */
function normalise(s: string): string {
  return s
    .toLowerCase()
    // Unicode-aware equivalent of /[^\w\s]/: drop everything that is not a
    // letter, mark, number, underscore or whitespace.
    .replace(/[^\p{L}\p{M}\p{N}_\s]/gu, '')
    .replace(/\s+/g, ' ')
    .trim();
}
|
||||
|
||||
/**
 * Folds runs of consecutive segments whose normalised text is equal into a
 * single segment.
 *
 * The first segment of a run supplies every field except `end`, which is
 * overwritten with the `end` of each later segment in the run. Segments are
 * shallow-copied before being stored, so the input objects are not modified.
 *
 * @param segments - Segments in their original order.
 * @returns A new array with consecutive duplicates merged.
 */
function mergeConsecutive(segments: Segment[]): Segment[] {
  const merged: Segment[] = [];

  for (const current of segments) {
    const tail = merged[merged.length - 1];

    // Nothing stored yet, or the text differs from the stored tail: start a
    // new entry with a copy of this segment.
    if (!tail || normalise(tail.text) !== normalise(current.text)) {
      merged.push({ ...current });
      continue;
    }

    // Same text as the tail: stretch the tail to cover this segment.
    tail.end = current.end;
  }

  return merged;
}
|
||||
|
||||
// ── N-gram deduplication ─────────────────────────────────────────────────────
|
||||
|
||||
// Number of consecutive words that make up one n-gram when comparing texts.
const NGRAM_N = 6;
|
||||
// Maximum number of trailing characters of already-kept text that a new
// segment is compared against.
const LOOKBACK_CHARS = 500;
|
||||
// A segment whose similarity score against the look-back text is at or above
// this value is treated as a duplicate.
const SIMILARITY_THRESHOLD = 0.6;
|
||||
|
||||
/**
 * Splits `text` into lower-cased, whitespace-separated words and returns
 * every run of `n` consecutive words, joined by single spaces, in order of
 * appearance.
 *
 * @param text - Text to tokenise.
 * @param n - Number of words per n-gram.
 * @returns The n-grams; empty when `n` is less than 1 or `text` has fewer
 *   than `n` words.
 */
function ngrams(text: string, n: number): string[] {
  // A non-positive size has no meaningful n-grams.
  if (n < 1) return [];

  // split() yields empty strings for leading/trailing whitespace and for an
  // empty input; drop them so they are not counted as words.
  const words = text
    .toLowerCase()
    .split(/\s+/)
    .filter((word) => word.length > 0);

  const grams: string[] = [];
  for (let start = 0; start + n <= words.length; start++) {
    grams.push(words.slice(start, start + n).join(' '));
  }
  return grams;
}
|
||||
|
||||
/**
 * Jaccard similarity between the sets of NGRAM_N-word n-grams of two texts:
 * the number of n-grams common to both divided by the number of distinct
 * n-grams across both.
 *
 * @param a - First text.
 * @param b - Second text.
 * @returns A value from 0 to 1; 0 when neither text yields any n-gram.
 */
function jaccardSimilarity(a: string, b: string): number {
  const left = new Set(ngrams(a, NGRAM_N));
  const right = new Set(ngrams(b, NGRAM_N));

  // Texts too short to produce n-grams cannot be compared; report them as
  // dissimilar so short segments are never discarded by mistake.
  if (left.size + right.size === 0) {
    return 0;
  }

  let shared = 0;
  for (const gram of left) {
    if (right.has(gram)) {
      shared += 1;
    }
  }

  // |A ∪ B| = |A| + |B| - |A ∩ B|
  const distinct = left.size + right.size - shared;
  if (distinct === 0) {
    return 0;
  }
  return shared / distinct;
}
|
||||
|
||||
/**
 * Drops segments whose text is too similar to recently kept text.
 *
 * Each segment is compared against the last LOOKBACK_CHARS characters of the
 * space-joined text of up to 20 most recently kept segments. It is skipped
 * when the similarity reaches SIMILARITY_THRESHOLD; otherwise the same
 * segment object is appended to the result.
 *
 * NOTE(review): the score is a Jaccard ratio over the whole look-back text,
 * so a short segment repeated inside a much longer window scores low —
 * confirm this sensitivity is intended.
 *
 * @param segments - Segments in their original order.
 * @returns A new array containing the segments that were kept.
 */
function ngramDedup(segments: Segment[]): Segment[] {
  const kept: Segment[] = [];

  for (const candidate of segments) {
    const recentTexts = kept.slice(-20).map((s) => s.text);
    const windowText = recentTexts.join(' ').slice(-LOOKBACK_CHARS);

    // With nothing kept yet there is no text to compare against.
    const isDuplicate =
      windowText.length > 0 &&
      jaccardSimilarity(candidate.text, windowText) >= SIMILARITY_THRESHOLD;

    if (!isDuplicate) {
      kept.push(candidate);
    }
  }

  return kept;
}
|
||||
|
||||
// ── Full deduplication pipeline ──────────────────────────────────────────────
|
||||
|
||||
/**
 * Runs the full deduplication pipeline over a list of transcript segments.
 *
 * Steps, in order: trim each text and collapse repeated phrases inside it,
 * drop segments left with empty text, merge consecutive equal segments,
 * remove n-gram near-duplicates, merge again (the removal can make equal
 * segments adjacent), and finally renumber `index` from 0.
 *
 * The pipeline operates on shallow copies created in the first step, so the
 * objects in `segments` are not modified.
 *
 * @param segments - Segments in their original order.
 * @returns A new, re-indexed array of deduplicated segments.
 */
export function deduplicateSegments(segments: Segment[]): Segment[] {
  // 1 + 2. Clean each segment's text and keep only the non-empty ones.
  const cleaned: Segment[] = [];
  for (const original of segments) {
    const text = collapseRepeats(original.text.trim());
    if (text.length > 0) {
      cleaned.push({ ...original, text });
    }
  }

  // 3. First merge pass
  const mergedOnce = mergeConsecutive(cleaned);

  // 4. N-gram dedup
  const deduped = ngramDedup(mergedOnce);

  // 5. Second merge pass (catches new adjacencies after dedup)
  const finalSegments = mergeConsecutive(deduped);

  // 6. Re-index
  for (const [position, segment] of finalSegments.entries()) {
    segment.index = position;
  }

  return finalSegments;
}
|
||||
Reference in New Issue
Block a user