fix(worker): port final segment cleanup
All checks were successful
Build & Push Docker Image / test (push) Successful in 6m2s
Build & Push Docker Image / build-and-push (push) Successful in 6m31s

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
2026-05-12 00:10:32 +02:00
parent cb0b07b2ff
commit d8a73e150a

View File

@@ -623,20 +623,77 @@ fn to_chunk_ranges(cuts: &[f32], total_secs: f32) -> Vec<(f32, f32)> {
// NOTE(review): the uses of MAX_CHAIN_GAP_SECS, MIN_MEANINGFUL_WORDS and
// MIN_OVERLAP_WORDS are outside the visible part of the file; their comments
// are inferred from the names and should be confirmed against the callers.

/// Presumably the largest gap, in seconds, allowed between chained segments.
const MAX_CHAIN_GAP_SECS: f32 = 0.15;
/// Presumably the minimum word count for a phrase to count as meaningful.
const MIN_MEANINGFUL_WORDS: usize = 2;
/// Minimum summed word length checked by `is_meaningful_phrase`.
const MIN_MEANINGFUL_CHARS: usize = 8;
/// Presumably the minimum number of overlapping words required to trim an
/// overlap. Only the current value is kept; the stale `= 3` definition that
/// preceded it made the name defined twice.
const MIN_OVERLAP_WORDS: usize = 1;
/// Longest duration, in seconds, that `is_short_carryover` accepts on its own.
const SHORT_CARRYOVER_MAX_SECS: f32 = 0.2;
/// Largest word count that `is_short_carryover` accepts on its own.
const SHORT_CARRYOVER_MAX_WORDS: usize = 2;
/// Largest text length (word bytes plus single separators) that
/// `is_short_carryover` accepts on its own.
const SHORT_CARRYOVER_MAX_CHARS: usize = 16;
/// Number of words per n-gram used by `jaccard_similarity`.
const NGRAM_N: usize = 6;
/// Number of trailing characters of recent text that `ngram_dedup` compares against.
const LOOKBACK_CHARS: usize = 500;
/// Similarity at or above which `ngram_dedup` drops a segment.
const SIMILARITY_THRESHOLD: f32 = 0.6;
/// Splits `text` on Unicode whitespace and returns the raw tokens.
///
/// The tokens borrow from `text` and keep their original case and
/// punctuation. `split_whitespace` never yields empty tokens, so no further
/// filtering is needed. An empty or all-whitespace input gives an empty vector.
fn split_words(text: &str) -> Vec<&str> {
    text.split_whitespace().collect()
}
/// Reduces a single word to its comparison form.
///
/// Keeps only alphanumeric characters and underscores, lowercasing each kept
/// character. A word made only of punctuation becomes the empty string.
fn normalise_token(word: &str) -> String {
    let mut folded = String::new();
    for ch in word.chars() {
        if ch == '_' || ch.is_alphanumeric() {
            // `to_lowercase` may expand to more than one character.
            folded.extend(ch.to_lowercase());
        }
    }
    folded
}
fn normalised_words(text: &str) -> Vec<String> {
split_words(text)
.into_iter()
.map(normalise_token)
.filter(|word| !word.is_empty())
.collect()
}
/// Removes one immediately repeated phrase from `text`, if there is one.
///
/// A phrase is a run of at least two words whose raw length (word bytes plus
/// one separator between words) is at least 10. Two adjacent runs count as a
/// repeat when their normalised tokens are equal, so case and punctuation
/// differences are ignored. Longer phrases are tried before shorter ones, and
/// earlier positions before later ones. The first occurrence is kept and the
/// second is dropped.
///
/// Returns the trimmed input unchanged when it has fewer than four words or
/// no repeat is found. When a repeat is collapsed, the remaining raw words
/// are rejoined with single spaces.
fn collapse_repeated_phrase_once(text: &str) -> String {
    let tokens = split_words(text);
    let total = tokens.len();
    if total < 4 {
        return text.trim().to_string();
    }

    let keys: Vec<String> = tokens.iter().map(|token| normalise_token(token)).collect();

    // Search for the first (position, span) whose phrase is immediately repeated.
    let repeat = (2..=total / 2).rev().find_map(|span| {
        (0..=total.saturating_sub(span * 2))
            .find(|&at| {
                let width = tokens[at..at + span]
                    .iter()
                    .map(|token| token.len())
                    .sum::<usize>()
                    + span.saturating_sub(1);
                width >= 10 && keys[at..at + span] == keys[at + span..at + span * 2]
            })
            .map(|at| (at, span))
    });

    match repeat {
        Some((at, span)) => {
            // Keep everything up to the end of the first occurrence, then skip the second.
            let kept: Vec<&str> = tokens[..at + span]
                .iter()
                .chain(&tokens[at + span * 2..])
                .copied()
                .collect();
            kept.join(" ").trim().to_string()
        }
        None => text.trim().to_string(),
    }
}
/// Collapses repeated phrases in `text` until nothing more changes.
///
/// Starts from the trimmed input and applies `collapse_repeated_phrase_once`
/// repeatedly, returning as soon as a pass leaves the text unchanged.
fn collapse_repeats(text: &str) -> String {
    let mut settled = text.trim().to_string();
    loop {
        let candidate = collapse_repeated_phrase_once(&settled);
        if candidate == settled {
            break candidate;
        }
        settled = candidate;
    }
}
/// Reports whether `full` begins with every word of `prefix`, in order.
///
/// An empty `prefix` always matches; a `prefix` longer than `full` never does.
fn starts_with_words(full: &[String], prefix: &[String]) -> bool {
    full.starts_with(prefix)
}
@@ -664,8 +721,16 @@ fn is_meaningful_phrase(words: &[String]) -> bool {
&& words.iter().map(|word| word.len()).sum::<usize>() >= MIN_MEANINGFUL_CHARS
}
/// Reports whether a segment is short enough to be treated as a carry-over.
///
/// Any one of these is sufficient:
/// - its duration (`end - start`) is at most `SHORT_CARRYOVER_MAX_SECS`;
/// - it has at most `SHORT_CARRYOVER_MAX_WORDS` words;
/// - its word bytes plus one separator between words total at most
///   `SHORT_CARRYOVER_MAX_CHARS`.
fn is_short_carryover(seg: &Segment, words: &[String]) -> bool {
    let duration = seg.end - seg.start;
    if duration <= SHORT_CARRYOVER_MAX_SECS {
        return true;
    }
    if words.len() <= SHORT_CARRYOVER_MAX_WORDS {
        return true;
    }
    let letters: usize = words.iter().map(String::len).sum();
    let separators = words.len().saturating_sub(1);
    letters + separators <= SHORT_CARRYOVER_MAX_CHARS
}
fn trim_leading_words(text: &str, count: usize) -> String {
text.split_whitespace()
split_words(text)
.into_iter()
.skip(count)
.collect::<Vec<_>>()
.join(" ")
@@ -721,7 +786,7 @@ fn collapse_incremental_segments(segments: Vec<Segment>) -> Vec<Segment> {
if seg_words.len() > last_words.len()
&& starts_with_words(&seg_words, &last_words)
&& is_meaningful_phrase(&last_words)
&& (is_meaningful_phrase(&last_words) || is_short_carryover(last, &last_words))
{
last.text = seg.text;
last.end = seg.end;
@@ -729,7 +794,9 @@ fn collapse_incremental_segments(segments: Vec<Segment>) -> Vec<Segment> {
continue;
}
if ends_with_words(&last_words, &seg_words) && is_meaningful_phrase(&seg_words) {
if ends_with_words(&last_words, &seg_words)
&& (is_meaningful_phrase(&seg_words) || is_short_carryover(&seg, &seg_words))
{
last.end = last.end.max(seg.end);
continue;
}
@@ -753,9 +820,85 @@ fn collapse_incremental_segments(segments: Vec<Segment>) -> Vec<Segment> {
out
}
/// Builds the set of word n-grams of `text`.
///
/// The text is lowercased and split on whitespace. Every run of `n`
/// consecutive words is joined with single spaces and added to the set.
/// Returns an empty set when the text has fewer than `n` words.
fn ngrams(text: &str, n: usize) -> HashSet<String> {
    let lowered = text.to_lowercase();
    let tokens: Vec<&str> = lowered.split_whitespace().collect();
    if tokens.len() < n {
        return HashSet::new();
    }
    (0..=tokens.len() - n)
        .map(|start| tokens[start..start + n].join(" "))
        .collect()
}
/// Jaccard similarity of the `NGRAM_N`-word n-gram sets of two texts.
///
/// Returns the number of n-grams shared by both texts divided by the number
/// of distinct n-grams across both, or `0.0` when neither text yields any.
fn jaccard_similarity(left: &str, right: &str) -> f32 {
    let lhs = ngrams(left, NGRAM_N);
    let rhs = ngrams(right, NGRAM_N);

    let shared = lhs.iter().filter(|gram| rhs.contains(*gram)).count();
    // |A ∪ B| = |A| + |B| - |A ∩ B|
    let combined = lhs.len() + rhs.len() - shared;

    match combined {
        0 => 0.0,
        total => shared as f32 / total as f32,
    }
}
/// Returns the last `limit` characters of `text`.
///
/// Counts `char`s rather than bytes. Returns the whole text when it has
/// `limit` characters or fewer.
fn tail_chars(text: &str, limit: usize) -> String {
    let surplus = text.chars().count().saturating_sub(limit);
    text.chars().skip(surplus).collect()
}
/// Drops segments whose text is too similar to recently kept text.
///
/// For each segment, the text of up to the last 20 kept segments is joined
/// with spaces and cut to its final `LOOKBACK_CHARS` characters. The segment
/// is skipped when that context is non-empty and `jaccard_similarity` against
/// it is at least `SIMILARITY_THRESHOLD`. Otherwise it is kept, in order.
fn ngram_dedup(segments: Vec<Segment>) -> Vec<Segment> {
    let mut kept: Vec<Segment> = Vec::with_capacity(segments.len());
    for candidate in segments {
        let window_start = kept.len().saturating_sub(20);
        let joined = kept[window_start..]
            .iter()
            .map(|previous| previous.text.as_str())
            .collect::<Vec<_>>()
            .join(" ");
        let context = tail_chars(&joined, LOOKBACK_CHARS);

        let is_duplicate = !context.is_empty()
            && jaccard_similarity(&candidate.text, &context) >= SIMILARITY_THRESHOLD;
        if !is_duplicate {
            kept.push(candidate);
        }
    }
    kept
}
/// Runs the segment clean-up pipeline and returns the surviving segments.
///
/// Steps, in order:
/// 1. Trim each segment's text and collapse repeated phrases inside it
///    (`collapse_repeats`), dropping segments whose text ends up empty.
/// 2. `collapse_incremental_segments`, then `merge_identical_segments`.
/// 3. `ngram_dedup` to drop segments too similar to recent text.
/// 4. `collapse_incremental_segments` and `merge_identical_segments` again,
///    presumably because step 3 can make new segments adjacent (confirm
///    against those helpers).
fn normalise_segments(segments: Vec<Segment>) -> Vec<Segment> {
    // `segments` is consumed exactly once here. A stale call that passed it to
    // `collapse_incremental_segments` first would move it before this pass.
    let mut result = segments
        .into_iter()
        .map(|mut seg| {
            seg.text = collapse_repeats(seg.text.trim());
            seg
        })
        .filter(|seg| !seg.text.is_empty())
        .collect::<Vec<_>>();
    result = collapse_incremental_segments(result);
    result = merge_identical_segments(result);
    result = ngram_dedup(result);
    result = collapse_incremental_segments(result);
    merge_identical_segments(result)
}
@@ -1041,6 +1184,21 @@ mod tests {
assert!((result[1].end - 24.59).abs() < 0.01);
}
#[test]
fn test_normalise_segments_collapses_repeated_phrase_inside_segment() {
    // "the quick brown fox" appears twice back-to-back inside one segment;
    // only one copy should remain.
    let repeated = "the quick brown fox the quick brown fox jumps over the fence";
    let normalised = normalise_segments(vec![segment(0, 0.0, 5.0, repeated)]);

    assert_eq!(normalised.len(), 1);
    assert_eq!(normalised[0].text, "the quick brown fox jumps over the fence");
}
#[test]
fn test_normalise_segments_keeps_real_gap() {
let input = vec![
@@ -1054,4 +1212,76 @@ mod tests {
assert_eq!(result[0].text, "Hello everyone.");
assert_eq!(result[1].text, "Hello everyone. Welcome back.");
}
#[test]
fn test_normalise_segments_collapses_tiny_carry_over_segments() {
    // Each one-word segment is immediately followed by a longer segment that
    // starts with the same word; each pair should end up as one segment.
    let segments = vec![
        segment(0, 94.8, 96.4, "world."),
        segment(
            1,
            96.4,
            98.96,
            "world. And that aspect that I overlooked was",
        ),
        segment(2, 98.96, 100.72, "inference."),
        segment(
            3,
            100.72,
            103.92,
            "inference. So, as someone who kind of wants to",
        ),
        segment(4, 107.19, 107.2, "and"),
        segment(
            5,
            107.2,
            109.56,
            "and work to understand the problems and the",
        ),
    ];

    let normalised = normalise_segments(segments);
    assert_eq!(normalised.len(), 3);

    let texts: Vec<&str> = normalised.iter().map(|seg| seg.text.as_str()).collect();
    assert_eq!(texts[0], "world. And that aspect that I overlooked was");
    assert_eq!(texts[1], "inference. So, as someone who kind of wants to");
    assert_eq!(texts[2], "and work to understand the problems and the");
}
#[test]
fn test_normalise_segments_trims_single_word_adjacent_overlap() {
    // The last segment starts 0.01 s after the previous one ends and repeats
    // its final word ("inference."); that leading word should be trimmed.
    let segments = vec![
        segment(0, 94.8, 96.4, "world."),
        segment(
            1,
            96.4,
            98.96,
            "world. And that aspect that I overlooked was",
        ),
        segment(2, 120.12, 123.71, "to find more about inference."),
        segment(
            3,
            123.72,
            126.92,
            "inference. So, I've done a lot of work with VLAM,",
        ),
    ];

    let normalised = normalise_segments(segments);
    assert_eq!(normalised.len(), 3);

    let (first, last) = (&normalised[0], &normalised[2]);
    assert_eq!(first.text, "world. And that aspect that I overlooked was");
    assert_eq!(last.text, "So, I've done a lot of work with VLAM,");
}
}