import type { Segment } from '$lib/types.js';

// ── Collapse consecutive repeated phrases within a segment's text ────────────

/**
 * Repeatedly collapses immediately repeated phrases until the text stops
 * changing.
 *
 * Every successful replacement strictly shortens the text, so the loop is
 * guaranteed to reach a fixed point.
 *
 * @param text - Segment text to clean up.
 * @returns The text with consecutive repeated phrases reduced to one copy.
 */
function collapseRepeats(text: string): string {
	let current = text;
	for (;;) {
		const next = collapseOnce(current);
		if (next === current) return next;
		current = next;
	}
}

/**
 * Performs a single collapsing pass over `text`.
 *
 * A "phrase" here is any run of at least 10 characters (not a word count) that
 * is followed by whitespace and then by the same run again, compared
 * case-insensitively. The first copy is kept.
 *
 * NOTE(review): `\b` is defined in terms of ASCII word characters and `.` does
 * not match line terminators, so repeats in non-ASCII text or repeats that
 * span a newline are not collapsed by this pattern.
 */
function collapseOnce(text: string): string {
	return text.replace(/\b(.{10,}?)\s+\1\b/gi, '$1');
}

// ── Merge consecutive segments with identical (or near-identical) text ───────

/**
 * Builds a comparison key: lower-cased, punctuation removed, whitespace runs
 * collapsed to a single space, and trimmed.
 *
 * Letters, combining marks and digits of every script are kept (Unicode
 * property escapes). An ASCII-only `\w` would erase all non-English text and
 * make unrelated segments compare equal.
 */
function normalise(s: string): string {
	return s
		.toLowerCase()
		.replace(/[^\p{L}\p{M}\p{N}_\s]/gu, '')
		.replace(/\s+/g, ' ')
		.trim();
}

/**
 * Merges each segment into its predecessor when both carry the same normalised
 * text, extending the predecessor's `end` to the later segment's `end`.
 *
 * Segments are shallow-copied, so the input array and its objects are left
 * untouched.
 *
 * @param segments - Segments in playback order.
 * @returns A new array with consecutive duplicates merged.
 */
function mergeConsecutive(segments: Segment[]): Segment[] {
	const out: Segment[] = [];
	for (const seg of segments) {
		const last = out[out.length - 1];
		if (last !== undefined && hasSameText(last.text, seg.text)) {
			last.end = seg.end;
		} else {
			out.push({ ...seg });
		}
	}
	return out;
}

/**
 * Returns true when two texts are equal after normalisation.
 *
 * When both keys are empty (e.g. punctuation-only texts) there is nothing left
 * to compare, so the trimmed raw texts must match exactly instead.
 */
function hasSameText(a: string, b: string): boolean {
	const keyA = normalise(a);
	const keyB = normalise(b);
	if (keyA === '' && keyB === '') return a.trim() === b.trim();
	return keyA === keyB;
}

// ── N-gram deduplication ─────────────────────────────────────────────────────

const NGRAM_N = 6;
const LOOKBACK_CHARS = 500;
const SIMILARITY_THRESHOLD = 0.6;

/**
 * Splits `text` into lower-cased, whitespace-separated words and returns every
 * run of `n` consecutive words joined by single spaces.
 *
 * Empty tokens produced by leading or trailing whitespace are discarded so
 * they cannot create n-grams that do not exist in the text.
 *
 * @returns The n-grams in order of appearance; empty when there are fewer than
 *          `n` words.
 */
function ngrams(text: string, n: number): string[] {
	const words = text
		.toLowerCase()
		.split(/\s+/)
		.filter((w) => w.length > 0);
	const grams: string[] = [];
	for (let i = 0; i <= words.length - n; i++) {
		grams.push(words.slice(i, i + n).join(' '));
	}
	return grams;
}

/**
 * Jaccard similarity (|A ∩ B| / |A ∪ B|) of the word n-gram sets of two texts.
 *
 * @returns A value in [0, 1]. Returns 0 when either text is too short to
 *          produce an n-gram, so short segments are never judged duplicates.
 */
function jaccardSimilarity(a: string, b: string): number {
	const ga = new Set(ngrams(a, NGRAM_N));
	const gb = new Set(ngrams(b, NGRAM_N));
	// An empty side has an empty intersection, so the score is 0 either way;
	// returning early also keeps the division below away from 0 / 0.
	if (ga.size === 0 || gb.size === 0) return 0;

	let intersection = 0;
	for (const g of ga) {
		if (gb.has(g)) intersection++;
	}
	const union = ga.size + gb.size - intersection;
	return intersection / union;
}

/**
 * Drops segments whose n-grams largely overlap with recently kept text.
 *
 * The comparison window is the text of the last 20 kept segments, joined with
 * spaces and truncated to its final `LOOKBACK_CHARS` characters.
 *
 * NOTE(review): Jaccard divides by the union of both n-gram sets, so a short
 * segment fully contained in a much longer window still scores low. Confirm
 * whether a containment measure (intersection / segment n-grams) was intended.
 *
 * @param segments - Segments in playback order.
 * @returns The kept segments (same object references as the input).
 */
function ngramDedup(segments: Segment[]): Segment[] {
	const out: Segment[] = [];
	for (const seg of segments) {
		const windowText = out
			.slice(-20)
			.map((s) => s.text)
			.join(' ')
			.slice(-LOOKBACK_CHARS);
		if (
			windowText.length > 0 &&
			jaccardSimilarity(seg.text, windowText) >= SIMILARITY_THRESHOLD
		) {
			continue; // duplicate — skip
		}
		out.push(seg);
	}
	return out;
}

// ── Full deduplication pipeline ──────────────────────────────────────────────

/**
 * Runs the full deduplication pipeline over a list of segments.
 *
 * The input array and its segment objects are not modified; every returned
 * segment is a copy with a fresh, zero-based `index`.
 *
 * @param segments - Segments in playback order.
 * @returns Deduplicated segments, re-indexed from 0.
 */
export function deduplicateSegments(segments: Segment[]): Segment[] {
	// 1. Collapse repeats within each segment's text
	let result = segments.map((s) => ({ ...s, text: collapseRepeats(s.text.trim()) }));

	// 2. Remove empty segments
	result = result.filter((s) => s.text.length > 0);

	// 3. First merge pass
	result = mergeConsecutive(result);

	// 4. N-gram dedup
	result = ngramDedup(result);

	// 5. Second merge pass (catches new adjacencies after dedup)
	result = mergeConsecutive(result);

	// 6. Re-index
	result.forEach((s, i) => {
		s.index = i;
	});

	return result;
}