From d8a73e150ab734827240c8d4c5ea3e6bc70d7d03 Mon Sep 17 00:00:00 2001 From: Giancarmine Salucci Date: Tue, 12 May 2026 00:10:32 +0200 Subject: [PATCH] fix(worker): port final segment cleanup Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/worker.rs | 254 +++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 242 insertions(+), 12 deletions(-) diff --git a/src/worker.rs b/src/worker.rs index 65f75f9..df75b02 100644 --- a/src/worker.rs +++ b/src/worker.rs @@ -623,20 +623,77 @@ fn to_chunk_ranges(cuts: &[f32], total_secs: f32) -> Vec<(f32, f32)> { const MAX_CHAIN_GAP_SECS: f32 = 0.15; const MIN_MEANINGFUL_WORDS: usize = 2; const MIN_MEANINGFUL_CHARS: usize = 8; -const MIN_OVERLAP_WORDS: usize = 3; +const MIN_OVERLAP_WORDS: usize = 1; +const SHORT_CARRYOVER_MAX_SECS: f32 = 0.2; +const SHORT_CARRYOVER_MAX_WORDS: usize = 2; +const SHORT_CARRYOVER_MAX_CHARS: usize = 16; +const NGRAM_N: usize = 6; +const LOOKBACK_CHARS: usize = 500; +const SIMILARITY_THRESHOLD: f32 = 0.6; -fn normalised_words(text: &str) -> Vec { +fn split_words(text: &str) -> Vec<&str> { text.split_whitespace() - .map(|word| { - word.chars() - .filter(|ch| ch.is_alphanumeric() || *ch == '_') - .flat_map(|ch| ch.to_lowercase()) - .collect::() - }) .filter(|word| !word.is_empty()) .collect() } +fn normalise_token(word: &str) -> String { + word.chars() + .filter(|ch| ch.is_alphanumeric() || *ch == '_') + .flat_map(|ch| ch.to_lowercase()) + .collect() +} + +fn normalised_words(text: &str) -> Vec { + split_words(text) + .into_iter() + .map(normalise_token) + .filter(|word| !word.is_empty()) + .collect() +} + +fn collapse_repeated_phrase_once(text: &str) -> String { + let raw_words = split_words(text); + if raw_words.len() < 4 { + return text.trim().to_string(); + } + + let normalised: Vec = raw_words.iter().map(|word| normalise_token(word)).collect(); + + for size in (2..=raw_words.len() / 2).rev() { + for start in 0..=raw_words.len().saturating_sub(size * 2) { + let phrase_chars = raw_words[start..start + size] + .iter() + .map(|word| word.len()) + .sum::() + + size.saturating_sub(1); + if phrase_chars < 10 { + continue; + } + + if normalised[start..start + size] == normalised[start + size..start + size * 2] { + let mut collapsed = Vec::with_capacity(raw_words.len() - size); + collapsed.extend_from_slice(&raw_words[..start + size]); + collapsed.extend_from_slice(&raw_words[start + size * 2..]); + return collapsed.join(" ").trim().to_string(); + } + } + } + + text.trim().to_string() +} + +fn collapse_repeats(text: &str) -> String { + let mut current = text.trim().to_string(); + loop { + let next = collapse_repeated_phrase_once(¤t); + if next == current { + return next; + } + current = next; + } +} + fn starts_with_words(full: &[String], prefix: &[String]) -> bool { prefix.len() <= full.len() && full.iter().take(prefix.len()).eq(prefix.iter()) } @@ -664,8 +721,16 @@ fn is_meaningful_phrase(words: &[String]) -> bool { && words.iter().map(|word| word.len()).sum::() >= MIN_MEANINGFUL_CHARS } +fn is_short_carryover(seg: &Segment, words: &[String]) -> bool { + seg.end - seg.start <= SHORT_CARRYOVER_MAX_SECS + || words.len() <= SHORT_CARRYOVER_MAX_WORDS + || words.iter().map(|word| word.len()).sum::() + words.len().saturating_sub(1) + <= SHORT_CARRYOVER_MAX_CHARS +} + fn trim_leading_words(text: &str, count: usize) -> String { - text.split_whitespace() + split_words(text) + .into_iter() .skip(count) .collect::>() .join(" ") @@ -721,7 +786,7 @@ fn collapse_incremental_segments(segments: Vec) -> Vec { if seg_words.len() > last_words.len() && starts_with_words(&seg_words, &last_words) - && is_meaningful_phrase(&last_words) + && (is_meaningful_phrase(&last_words) || is_short_carryover(last, &last_words)) { last.text = seg.text; last.end = seg.end; @@ -729,7 +794,9 @@ fn collapse_incremental_segments(segments: Vec) -> Vec { continue; } - if ends_with_words(&last_words, &seg_words) && is_meaningful_phrase(&seg_words) { + if ends_with_words(&last_words, &seg_words) + && (is_meaningful_phrase(&seg_words) || is_short_carryover(&seg, &seg_words)) + { last.end = last.end.max(seg.end); continue; } @@ -753,9 +820,85 @@ fn collapse_incremental_segments(segments: Vec) -> Vec { out } +fn ngrams(text: &str, n: usize) -> HashSet { + let words = text + .to_lowercase() + .split_whitespace() + .map(str::to_string) + .collect::>(); + + if words.len() < n { + return HashSet::new(); + } + + let mut grams = HashSet::new(); + for idx in 0..=words.len() - n { + grams.insert(words[idx..idx + n].join(" ")); + } + grams +} + +fn jaccard_similarity(left: &str, right: &str) -> f32 { + let left_grams = ngrams(left, NGRAM_N); + let right_grams = ngrams(right, NGRAM_N); + + if left_grams.is_empty() && right_grams.is_empty() { + return 0.0; + } + + let intersection = left_grams.intersection(&right_grams).count(); + let union = left_grams.union(&right_grams).count(); + + if union == 0 { + 0.0 + } else { + intersection as f32 / union as f32 + } +} + +fn tail_chars(text: &str, limit: usize) -> String { + let chars = text.chars().collect::>(); + let start = chars.len().saturating_sub(limit); + chars[start..].iter().collect() +} + +fn ngram_dedup(segments: Vec) -> Vec { + let mut out = Vec::with_capacity(segments.len()); + + for seg in segments { + let window_text = out + .iter() + .skip(out.len().saturating_sub(20)) + .map(|segment: &Segment| segment.text.as_str()) + .collect::>() + .join(" "); + let recent_context = tail_chars(&window_text, LOOKBACK_CHARS); + + if !recent_context.is_empty() + && jaccard_similarity(&seg.text, &recent_context) >= SIMILARITY_THRESHOLD + { + continue; + } + + out.push(seg); + } + + out +} + fn normalise_segments(segments: Vec) -> Vec { - let mut result = collapse_incremental_segments(segments); + let mut result = segments + .into_iter() + .map(|mut seg| { + seg.text = collapse_repeats(seg.text.trim()); + seg + }) + .filter(|seg| !seg.text.is_empty()) + .collect::>(); + + result = collapse_incremental_segments(result); result = merge_identical_segments(result); + result = ngram_dedup(result); result = collapse_incremental_segments(result); merge_identical_segments(result) } @@ -1041,6 +1184,21 @@ mod tests { assert!((result[1].end - 24.59).abs() < 0.01); } + #[test] + fn test_normalise_segments_collapses_repeated_phrase_inside_segment() { + let input = vec![segment( + 0, + 0.0, + 5.0, + "the quick brown fox the quick brown fox jumps over the fence", + )]; + + let result = normalise_segments(input); + + assert_eq!(result.len(), 1); + assert_eq!(result[0].text, "the quick brown fox jumps over the fence"); + } + #[test] fn test_normalise_segments_keeps_real_gap() { let input = vec![ @@ -1054,4 +1212,76 @@ mod tests { assert_eq!(result[0].text, "Hello everyone."); assert_eq!(result[1].text, "Hello everyone. Welcome back."); } + + #[test] + fn test_normalise_segments_collapses_tiny_carry_over_segments() { + let input = vec![ + segment(0, 94.8, 96.4, "world."), + segment( + 1, + 96.4, + 98.96, + "world. And that aspect that I overlooked was", + ), + segment(2, 98.96, 100.72, "inference."), + segment( + 3, + 100.72, + 103.92, + "inference. So, as someone who kind of wants to", + ), + segment(4, 107.19, 107.2, "and"), + segment( + 5, + 107.2, + 109.56, + "and work to understand the problems and the", + ), + ]; + + let result = normalise_segments(input); + + assert_eq!(result.len(), 3); + assert_eq!( + result[0].text, + "world. And that aspect that I overlooked was" + ); + assert_eq!( + result[1].text, + "inference. So, as someone who kind of wants to" + ); + assert_eq!( + result[2].text, + "and work to understand the problems and the" + ); + } + + #[test] + fn test_normalise_segments_trims_single_word_adjacent_overlap() { + let input = vec![ + segment(0, 94.8, 96.4, "world."), + segment( + 1, + 96.4, + 98.96, + "world. And that aspect that I overlooked was", + ), + segment(2, 120.12, 123.71, "to find more about inference."), + segment( + 3, + 123.72, + 126.92, + "inference. So, I've done a lot of work with VLAM,", + ), + ]; + + let result = normalise_segments(input); + + assert_eq!(result.len(), 3); + assert_eq!( + result[0].text, + "world. And that aspect that I overlooked was" + ); + assert_eq!(result[2].text, "So, I've done a lot of work with VLAM,"); + } }