From ef9c04b070049432e9fde1f776c0bd3ff50986a3 Mon Sep 17 00:00:00 2001 From: mozempk Date: Wed, 6 May 2026 02:13:20 +0200 Subject: [PATCH] fix: trim trailing silence from each chunk before whisper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Whisper hallucinates filler tokens (Bye., Thank you., etc.) into end-of-chunk silence. This is especially visible on the final chunk of long audio where the outro silence triggers a 10× repetition loop. Fix: after slicing each PCM chunk, scan backwards to find the last sample above −35 dB, then keep 0.5 s of padding and truncate. Applied to every chunk, not just the last — any chunk ending in a long silence period gets the same protection. Constants match the silencedetect filter already used for chunking: THRESHOLD = 0.0178 (−35 dB) PADDING = 8000 samples (0.5 s at 16 kHz) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/worker.rs | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/src/worker.rs b/src/worker.rs index 3649859..6942cb7 100644 --- a/src/worker.rs +++ b/src/worker.rs @@ -326,7 +326,8 @@ async fn process_job( for (ci, (chunk_start, chunk_end)) in chunks.iter().enumerate() { let s0 = (*chunk_start * 16_000.0) as usize; let s1 = ((*chunk_end * 16_000.0) as usize).min(pcm.len()); - let chunk_pcm = pcm[s0..s1].to_vec(); + let mut chunk_pcm = pcm[s0..s1].to_vec(); + trim_trailing_silence(&mut chunk_pcm); // Base percent this chunk starts at. let base = (ci * 100 / n) as u8; @@ -403,6 +404,29 @@ async fn process_job( Ok((all_segments, language, total_secs)) } +/// Trim trailing silence from a 16 kHz mono PCM buffer. +/// +/// Scans backwards to find the last sample above −35 dB, then keeps +/// 0.5 s of padding after it. This prevents whisper from hallucinating +/// filler tokens into end-of-chunk silence. 
+fn trim_trailing_silence(pcm: &mut Vec<f32>) { + const THRESHOLD: f32 = 0.017_8; // −35 dB (10^(−35/20)) + const PADDING: usize = 8_000; // 0.5 s at 16 kHz + + if let Some(last_loud) = pcm.iter().rposition(|&s| s.abs() > THRESHOLD) { + let new_len = (last_loud + 1 + PADDING).min(pcm.len()); + if new_len < pcm.len() { + tracing::trace!( + original_samples = pcm.len(), + trimmed_samples = pcm.len() - new_len, + "trimmed trailing silence" + ); + pcm.truncate(new_len); + } + } + // All-silent chunk: keep as-is — whisper will produce zero segments, which is correct. +} + /// Decode any audio file to 16 kHz mono PCM f32 using ffmpeg. async fn decode_audio(path: &std::path::Path) -> crate::Result<Vec<f32>> { use tokio::process::Command;