// tonemark/src/lib/server/postprocess.ts

import type { Segment } from '$lib/types.js';

// ── Collapse consecutive repeated phrases within a segment's text ────────────

/**
 * Runs `collapseOnce` over the text again and again until a fixed point is
 * reached, so that runs of three or more repetitions are fully collapsed.
 *
 * @param text - Text to clean up.
 * @returns The text once a pass no longer changes it (or a pass yields the
 *   value seen one iteration earlier, which guards against oscillation).
 */
function collapseRepeats(text: string): string {
	let previous = '';
	let current = text;
	for (;;) {
		const collapsed = collapseOnce(current);
		// Stop when the pass was a no-op or bounced back to the prior value.
		if (collapsed === current || collapsed === previous) {
			return collapsed;
		}
		previous = current;
		current = collapsed;
	}
}

/**
 * Performs a single left-to-right pass that replaces every immediately
 * repeated phrase ("X X") with one copy of the phrase ("X").
 *
 * A phrase is any run of at least 10 non-newline characters that starts at a
 * word boundary; the repetition must be separated from it by whitespace only
 * and end at a word boundary. Matching is case-insensitive and the first
 * occurrence's casing is the one that is kept.
 *
 * @param text - Text to scan.
 * @returns The text with each matched pair reduced to its first occurrence.
 */
function collapseOnce(text: string): string {
	const keepFirstOccurrence = (_match: string, phrase: string): string => phrase;
	return text.replace(/\b(.{10,}?)\s+\1\b/gi, keepFirstOccurrence);
}

// ── Merge consecutive segments with identical (or near-identical) text ───────

/**
 * Produces a comparison key for a piece of text: Unicode-normalised (NFC),
 * lower-cased, stripped of punctuation/symbols, with whitespace runs collapsed
 * to a single space and the ends trimmed.
 *
 * Letters, combining marks, digits and `_` from every script are kept. An
 * ASCII-only `\w` class would delete all non-ASCII letters, making e.g. any
 * two CJK-only texts normalise to the same empty string.
 *
 * @param s - Raw text.
 * @returns The normalised key; may be empty if `s` contains no letters/digits.
 */
function normalise(s: string): string {
	return s
		.normalize('NFC') // composed and decomposed accents compare equal
		.toLowerCase()
		.replace(/[^\p{L}\p{M}\p{N}_\s]/gu, '')
		.replace(/\s+/g, ' ')
		.trim();
}

/**
 * Folds runs of adjacent segments whose normalised text is equal into a
 * single segment.
 *
 * The first segment of each run is kept (as a shallow copy) and its `end` is
 * overwritten with the `end` of every later segment in the run; all other
 * fields come from the first segment. Input segments are not modified.
 *
 * @param segments - Segments in their original order.
 * @returns A new array of copied segments with adjacent duplicates merged.
 */
function mergeConsecutive(segments: Segment[]): Segment[] {
	return segments.reduce<Segment[]>((merged, current) => {
		const previous = merged.length > 0 ? merged[merged.length - 1] : undefined;
		if (previous !== undefined && normalise(previous.text) === normalise(current.text)) {
			// Same text as the segment before it: just stretch that one.
			previous.end = current.end;
			return merged;
		}
		merged.push({ ...current });
		return merged;
	}, []);
}

// ── N-gram deduplication ─────────────────────────────────────────────────────

// Number of consecutive words that make up one n-gram.
const NGRAM_N = 6;
// Maximum number of trailing characters of already-kept text that a new
// segment is compared against.
const LOOKBACK_CHARS = 500;
// Similarity score at or above which a segment is treated as a duplicate.
const SIMILARITY_THRESHOLD = 0.6;

/**
 * Splits text into lower-cased, whitespace-delimited words and returns every
 * run of `n` consecutive words joined by single spaces.
 *
 * Leading/trailing whitespace and empty input never produce empty "words",
 * so no n-gram starts or ends with a phantom empty token.
 *
 * @param text - Text to tokenise.
 * @param n - Words per n-gram; values below 1 yield no n-grams.
 * @returns The n-grams in order of appearance (duplicates included); empty
 *   when the text has fewer than `n` words.
 */
function ngrams(text: string, n: number): string[] {
	if (n < 1) return [];
	// match(/\S+/g) skips surrounding whitespace, unlike split(/\s+/).
	const words: string[] = text.toLowerCase().match(/\S+/g) ?? [];
	const grams: string[] = [];
	for (let i = 0; i + n <= words.length; i++) {
		grams.push(words.slice(i, i + n).join(' '));
	}
	return grams;
}

/**
 * Jaccard index of the two texts' distinct word n-gram sets
 * (n-gram size `NGRAM_N`): |A ∩ B| / |A ∪ B|.
 *
 * @param a - First text.
 * @param b - Second text.
 * @returns A value in [0, 1]; 0 when neither text is long enough to yield an
 *   n-gram, so that short texts are never reported as similar.
 */
function jaccardSimilarity(a: string, b: string): number {
	const left = new Set(ngrams(a, NGRAM_N));
	const right = new Set(ngrams(b, NGRAM_N));

	// Nothing to compare on either side: report "dissimilar".
	if (left.size === 0 && right.size === 0) {
		return 0;
	}

	let shared = 0;
	for (const gram of left) {
		if (right.has(gram)) shared += 1;
	}

	// |A ∪ B| by inclusion–exclusion.
	const unionSize = left.size + right.size - shared;
	if (unionSize === 0) {
		return 0;
	}
	return shared / unionSize;
}

/**
 * Fraction of the candidate's distinct n-grams that also occur in the
 * supplied set of context n-grams: |C ∩ W| / |C|.
 *
 * @param candidate - Text of the segment being tested.
 * @param contextGrams - Distinct n-grams of the text it is compared against.
 * @returns A value in [0, 1]; 0 when the candidate is too short to yield an
 *   n-gram, so short segments are never classed as duplicates.
 */
function ngramContainment(candidate: string, contextGrams: ReadonlySet<string>): number {
	const candidateGrams = new Set(ngrams(candidate, NGRAM_N));
	if (candidateGrams.size === 0) return 0;
	let shared = 0;
	for (const gram of candidateGrams) {
		if (contextGrams.has(gram)) shared += 1;
	}
	return shared / candidateGrams.size;
}

/**
 * Drops segments whose text largely repeats recently kept text.
 *
 * Each segment is compared against a lookback window built from the text of
 * the last 20 kept segments, truncated to its final `LOOKBACK_CHARS`
 * characters. The segment is skipped when at least `SIMILARITY_THRESHOLD` of
 * its n-grams already appear in that window.
 *
 * Containment is used instead of the symmetric Jaccard index because the
 * window is normally far longer than one segment: the union of the two
 * n-gram sets is then dominated by the window, which caps a Jaccard score
 * well below the threshold even for an exact repeat. Containment is never
 * smaller than Jaccard for the same pair, so any segment the symmetric score
 * would reject is still rejected.
 *
 * @param segments - Segments in order.
 * @returns A new array holding the kept segment objects (not copies).
 */
function ngramDedup(segments: Segment[]): Segment[] {
	const out: Segment[] = [];
	for (const seg of segments) {
		const windowText = out
			.slice(-20)
			.map((s) => s.text)
			.join(' ')
			.slice(-LOOKBACK_CHARS);

		if (windowText.length > 0) {
			const windowGrams = new Set(ngrams(windowText, NGRAM_N));
			if (ngramContainment(seg.text, windowGrams) >= SIMILARITY_THRESHOLD) {
				continue; // duplicate — skip
			}
		}
		out.push(seg);
	}
	return out;
}

// ── Full deduplication pipeline ──────────────────────────────────────────────

/**
 * Full de-duplication pipeline for a list of segments.
 *
 * Stages, in order:
 *  1. trim each segment's text and collapse repeated phrases inside it;
 *  2. drop segments whose text is now empty;
 *  3. merge adjacent segments with the same normalised text;
 *  4. drop segments that repeat recently kept text (n-gram comparison);
 *  5. merge again, since stage 4 can make new pairs adjacent;
 *  6. renumber `index` as 0, 1, 2, … in output order.
 *
 * Stage 1 works on shallow copies, so the input array and its segment
 * objects are left untouched.
 *
 * @param segments - Segments in order.
 * @returns A new array of cleaned, merged and re-indexed segments.
 */
export function deduplicateSegments(segments: Segment[]): Segment[] {
	// Stages 1–2: per-segment cleanup, then discard anything left blank.
	const cleaned = segments
		.map((segment) => {
			const text = collapseRepeats(segment.text.trim());
			return { ...segment, text };
		})
		.filter((segment) => segment.text.length > 0);

	// Stages 3–5: merge, n-gram dedup, merge again for the new adjacencies.
	const firstMerge = mergeConsecutive(cleaned);
	const deduped = ngramDedup(firstMerge);
	const result = mergeConsecutive(deduped);

	// Stage 6: positions changed, so rewrite the indices.
	for (const [position, segment] of result.entries()) {
		segment.index = position;
	}

	return result;
}