fix(worker): port final segment cleanup
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
254
src/worker.rs
254
src/worker.rs
@@ -623,20 +623,77 @@ fn to_chunk_ranges(cuts: &[f32], total_secs: f32) -> Vec<(f32, f32)> {
|
||||
// Tunables for the segment clean-up passes below.

/// Gap threshold in seconds for chaining segments.
/// NOTE(review): the code that reads this is outside the visible hunks — confirm.
const MAX_CHAIN_GAP_SECS: f32 = 0.15;
/// Minimum word count for a phrase to count as meaningful.
/// NOTE(review): presumably read by `is_meaningful_phrase` — confirm.
const MIN_MEANINGFUL_WORDS: usize = 2;
/// Minimum summed word length for a phrase to count as meaningful
/// (see `is_meaningful_phrase`).
const MIN_MEANINGFUL_CHARS: usize = 8;
/// Minimum number of overlapping words for an overlap to be acted on.
/// The diff carried both the old value (3) and the new value (1); only the
/// new one is kept, since two definitions of the same name do not compile.
/// NOTE(review): the overlap-trimming code is outside the visible hunks.
const MIN_OVERLAP_WORDS: usize = 1;
/// A segment no longer than this many seconds counts as a short carry-over
/// (see `is_short_carryover`).
const SHORT_CARRYOVER_MAX_SECS: f32 = 0.2;
/// A segment with at most this many words counts as a short carry-over.
const SHORT_CARRYOVER_MAX_WORDS: usize = 2;
/// A segment whose space-joined words are at most this long counts as a
/// short carry-over.
const SHORT_CARRYOVER_MAX_CHARS: usize = 16;
/// Number of words per n-gram used by `jaccard_similarity`.
const NGRAM_N: usize = 6;
/// Number of trailing characters of already-kept text that `ngram_dedup`
/// compares each new segment against.
const LOOKBACK_CHARS: usize = 500;
/// Similarity at or above which `ngram_dedup` drops a segment.
const SIMILARITY_THRESHOLD: f32 = 0.6;
|
||||
|
||||
/// Splits `text` on whitespace and returns the non-empty pieces as borrowed
/// slices, with their original casing and punctuation untouched.
///
/// Callers that need comparison-friendly tokens pass each piece through
/// `normalise_token`; callers that rebuild text join the raw pieces.
fn split_words(text: &str) -> Vec<&str> {
    text.split_whitespace().collect()
}
|
||||
|
||||
/// Reduces one word to its comparison form: every character that is neither
/// alphanumeric nor `_` is dropped and the rest is lower-cased.
///
/// A word made only of punctuation yields an empty string.
fn normalise_token(word: &str) -> String {
    word.chars()
        .filter(|ch| ch.is_alphanumeric() || *ch == '_')
        // `to_lowercase` may expand one char into several, hence flat_map.
        .flat_map(char::to_lowercase)
        .collect()
}
|
||||
|
||||
fn normalised_words(text: &str) -> Vec<String> {
|
||||
split_words(text)
|
||||
.into_iter()
|
||||
.map(normalise_token)
|
||||
.filter(|word| !word.is_empty())
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn collapse_repeated_phrase_once(text: &str) -> String {
|
||||
let raw_words = split_words(text);
|
||||
if raw_words.len() < 4 {
|
||||
return text.trim().to_string();
|
||||
}
|
||||
|
||||
let normalised: Vec<String> = raw_words.iter().map(|word| normalise_token(word)).collect();
|
||||
|
||||
for size in (2..=raw_words.len() / 2).rev() {
|
||||
for start in 0..=raw_words.len().saturating_sub(size * 2) {
|
||||
let phrase_chars = raw_words[start..start + size]
|
||||
.iter()
|
||||
.map(|word| word.len())
|
||||
.sum::<usize>()
|
||||
+ size.saturating_sub(1);
|
||||
if phrase_chars < 10 {
|
||||
continue;
|
||||
}
|
||||
|
||||
if normalised[start..start + size] == normalised[start + size..start + size * 2] {
|
||||
let mut collapsed = Vec::with_capacity(raw_words.len() - size);
|
||||
collapsed.extend_from_slice(&raw_words[..start + size]);
|
||||
collapsed.extend_from_slice(&raw_words[start + size * 2..]);
|
||||
return collapsed.join(" ").trim().to_string();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
text.trim().to_string()
|
||||
}
|
||||
|
||||
fn collapse_repeats(text: &str) -> String {
|
||||
let mut current = text.trim().to_string();
|
||||
loop {
|
||||
let next = collapse_repeated_phrase_once(¤t);
|
||||
if next == current {
|
||||
return next;
|
||||
}
|
||||
current = next;
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns `true` when `prefix` equals the first `prefix.len()` words of
/// `full`. An empty `prefix` always matches; a `prefix` longer than `full`
/// never does.
fn starts_with_words(full: &[String], prefix: &[String]) -> bool {
    full.starts_with(prefix)
}
|
||||
@@ -664,8 +721,16 @@ fn is_meaningful_phrase(words: &[String]) -> bool {
|
||||
&& words.iter().map(|word| word.len()).sum::<usize>() >= MIN_MEANINGFUL_CHARS
|
||||
}
|
||||
|
||||
fn is_short_carryover(seg: &Segment, words: &[String]) -> bool {
|
||||
seg.end - seg.start <= SHORT_CARRYOVER_MAX_SECS
|
||||
|| words.len() <= SHORT_CARRYOVER_MAX_WORDS
|
||||
|| words.iter().map(|word| word.len()).sum::<usize>() + words.len().saturating_sub(1)
|
||||
<= SHORT_CARRYOVER_MAX_CHARS
|
||||
}
|
||||
|
||||
fn trim_leading_words(text: &str, count: usize) -> String {
|
||||
text.split_whitespace()
|
||||
split_words(text)
|
||||
.into_iter()
|
||||
.skip(count)
|
||||
.collect::<Vec<_>>()
|
||||
.join(" ")
|
||||
@@ -721,7 +786,7 @@ fn collapse_incremental_segments(segments: Vec<Segment>) -> Vec<Segment> {
|
||||
|
||||
if seg_words.len() > last_words.len()
|
||||
&& starts_with_words(&seg_words, &last_words)
|
||||
&& is_meaningful_phrase(&last_words)
|
||||
&& (is_meaningful_phrase(&last_words) || is_short_carryover(last, &last_words))
|
||||
{
|
||||
last.text = seg.text;
|
||||
last.end = seg.end;
|
||||
@@ -729,7 +794,9 @@ fn collapse_incremental_segments(segments: Vec<Segment>) -> Vec<Segment> {
|
||||
continue;
|
||||
}
|
||||
|
||||
if ends_with_words(&last_words, &seg_words) && is_meaningful_phrase(&seg_words) {
|
||||
if ends_with_words(&last_words, &seg_words)
|
||||
&& (is_meaningful_phrase(&seg_words) || is_short_carryover(&seg, &seg_words))
|
||||
{
|
||||
last.end = last.end.max(seg.end);
|
||||
continue;
|
||||
}
|
||||
@@ -753,9 +820,85 @@ fn collapse_incremental_segments(segments: Vec<Segment>) -> Vec<Segment> {
|
||||
out
|
||||
}
|
||||
|
||||
/// Returns the set of distinct word n-grams of `text`.
///
/// The text is lower-cased and split on whitespace; each n-gram is `n`
/// consecutive words joined by single spaces. Punctuation is kept as part of
/// the words. The set is empty when `text` has fewer than `n` words or when
/// `n` is zero (a zero-length n-gram carries no information and would make
/// every pair of texts look identical).
fn ngrams(text: &str, n: usize) -> HashSet<String> {
    // `windows` panics on a zero size, so reject it up front.
    if n == 0 {
        return HashSet::new();
    }

    let lowered = text.to_lowercase();
    let words: Vec<&str> = lowered.split_whitespace().collect();

    // Yields nothing when there are fewer than `n` words.
    words.windows(n).map(|gram| gram.join(" ")).collect()
}
|
||||
|
||||
fn jaccard_similarity(left: &str, right: &str) -> f32 {
|
||||
let left_grams = ngrams(left, NGRAM_N);
|
||||
let right_grams = ngrams(right, NGRAM_N);
|
||||
|
||||
if left_grams.is_empty() && right_grams.is_empty() {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
let intersection = left_grams.intersection(&right_grams).count();
|
||||
let union = left_grams.union(&right_grams).count();
|
||||
|
||||
if union == 0 {
|
||||
0.0
|
||||
} else {
|
||||
intersection as f32 / union as f32
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the last `limit` characters of `text` (Unicode scalar values, not
/// bytes), or all of `text` when it has fewer than `limit` characters.
fn tail_chars(text: &str, limit: usize) -> String {
    if limit == 0 {
        return String::new();
    }
    // Walk backwards to the `limit`-th character from the end and slice at
    // its byte offset, which is always a valid char boundary.
    match text.char_indices().rev().nth(limit - 1) {
        Some((start, _)) => text[start..].to_string(),
        None => text.to_string(),
    }
}
|
||||
|
||||
fn ngram_dedup(segments: Vec<Segment>) -> Vec<Segment> {
|
||||
let mut out = Vec::with_capacity(segments.len());
|
||||
|
||||
for seg in segments {
|
||||
let window_text = out
|
||||
.iter()
|
||||
.skip(out.len().saturating_sub(20))
|
||||
.map(|segment: &Segment| segment.text.as_str())
|
||||
.collect::<Vec<_>>()
|
||||
.join(" ");
|
||||
let recent_context = tail_chars(&window_text, LOOKBACK_CHARS);
|
||||
|
||||
if !recent_context.is_empty()
|
||||
&& jaccard_similarity(&seg.text, &recent_context) >= SIMILARITY_THRESHOLD
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
out.push(seg);
|
||||
}
|
||||
|
||||
out
|
||||
}
|
||||
|
||||
fn normalise_segments(segments: Vec<Segment>) -> Vec<Segment> {
|
||||
let mut result = collapse_incremental_segments(segments);
|
||||
let mut result = segments
|
||||
.into_iter()
|
||||
.map(|mut seg| {
|
||||
seg.text = collapse_repeats(seg.text.trim());
|
||||
seg
|
||||
})
|
||||
.filter(|seg| !seg.text.is_empty())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
result = collapse_incremental_segments(result);
|
||||
result = merge_identical_segments(result);
|
||||
result = ngram_dedup(result);
|
||||
result = collapse_incremental_segments(result);
|
||||
merge_identical_segments(result)
|
||||
}
|
||||
@@ -1041,6 +1184,21 @@ mod tests {
|
||||
assert!((result[1].end - 24.59).abs() < 0.01);
|
||||
}
|
||||
|
||||
// A phrase repeated back-to-back inside a single segment is reduced to one
// occurrence, and the rest of the segment text is kept.
#[test]
fn test_normalise_segments_collapses_repeated_phrase_inside_segment() {
    let input = vec![segment(
        0,
        0.0,
        5.0,
        "the quick brown fox the quick brown fox jumps over the fence",
    )];

    let result = normalise_segments(input);

    assert_eq!(result.len(), 1);
    assert_eq!(result[0].text, "the quick brown fox jumps over the fence");
}
|
||||
|
||||
#[test]
|
||||
fn test_normalise_segments_keeps_real_gap() {
|
||||
let input = vec![
|
||||
@@ -1054,4 +1212,76 @@ mod tests {
|
||||
assert_eq!(result[0].text, "Hello everyone.");
|
||||
assert_eq!(result[1].text, "Hello everyone. Welcome back.");
|
||||
}
|
||||
|
||||
// A one-word segment that is repeated as the first word of the following
// segment is absorbed into that following segment, even though a single word
// is too short to count as a meaningful phrase.
#[test]
fn test_normalise_segments_collapses_tiny_carry_over_segments() {
    let input = vec![
        segment(0, 94.8, 96.4, "world."),
        segment(
            1,
            96.4,
            98.96,
            "world. And that aspect that I overlooked was",
        ),
        segment(2, 98.96, 100.72, "inference."),
        segment(
            3,
            100.72,
            103.92,
            "inference. So, as someone who kind of wants to",
        ),
        segment(4, 107.19, 107.2, "and"),
        segment(
            5,
            107.2,
            109.56,
            "and work to understand the problems and the",
        ),
    ];

    let result = normalise_segments(input);

    assert_eq!(result.len(), 3);
    assert_eq!(
        result[0].text,
        "world. And that aspect that I overlooked was"
    );
    assert_eq!(
        result[1].text,
        "inference. So, as someone who kind of wants to"
    );
    assert_eq!(
        result[2].text,
        "and work to understand the problems and the"
    );
}
|
||||
|
||||
// When a segment begins with the single word that ended the previous,
// adjacent segment, that leading word is trimmed from the later segment.
#[test]
fn test_normalise_segments_trims_single_word_adjacent_overlap() {
    let input = vec![
        segment(0, 94.8, 96.4, "world."),
        segment(
            1,
            96.4,
            98.96,
            "world. And that aspect that I overlooked was",
        ),
        segment(2, 120.12, 123.71, "to find more about inference."),
        segment(
            3,
            123.72,
            126.92,
            "inference. So, I've done a lot of work with VLAM,",
        ),
    ];

    let result = normalise_segments(input);

    assert_eq!(result.len(), 3);
    assert_eq!(
        result[0].text,
        "world. And that aspect that I overlooked was"
    );
    assert_eq!(result[2].text, "So, I've done a lot of work with VLAM,");
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user