fix: trim trailing silence from each chunk before whisper
All checks were successful
Build & Push Docker Image / build-and-push (push) Successful in 6m44s

Whisper hallucinates filler tokens (Bye., Thank you., etc.) into
end-of-chunk silence. This is especially visible on the final chunk
of long audio where the outro silence triggers a 10× repetition loop.

Fix: after slicing each PCM chunk, scan backwards to find the last
sample above −35 dB, then keep 0.5 s of padding and truncate.
Applied to every chunk, not just the last — any chunk ending in a long
silence period gets the same protection.

Constants match the silencedetect filter already used for chunking:
  THRESHOLD = 0.0178  (−35 dB)
  PADDING   = 8000 samples (0.5 s at 16 kHz)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
mozempk
2026-05-06 02:13:20 +02:00
parent 35e7ea8d28
commit ef9c04b070

View File

@@ -326,7 +326,8 @@ async fn process_job(
for (ci, (chunk_start, chunk_end)) in chunks.iter().enumerate() {
let s0 = (*chunk_start * 16_000.0) as usize;
let s1 = ((*chunk_end * 16_000.0) as usize).min(pcm.len());
let chunk_pcm = pcm[s0..s1].to_vec();
let mut chunk_pcm = pcm[s0..s1].to_vec();
trim_trailing_silence(&mut chunk_pcm);
// Base percent this chunk starts at.
let base = (ci * 100 / n) as u8;
@@ -403,6 +404,29 @@ async fn process_job(
Ok((all_segments, language, total_secs))
}
/// Cut the silent tail off a 16 kHz mono PCM buffer, in place.
///
/// The buffer is searched from the end for the final sample whose absolute
/// amplitude exceeds −35 dB (relative to an amplitude of 1.0). Everything
/// more than 0.5 s past that sample is discarded. Shortening the tail keeps
/// whisper from hallucinating filler tokens into end-of-chunk silence.
///
/// A buffer with no sample above the threshold is left unchanged, as is a
/// buffer whose tail is already within the padding window.
fn trim_trailing_silence(pcm: &mut Vec<f32>) {
    // Linear amplitude equivalent of −35 dB: 10^(−35/20) ≈ 0.0178.
    const THRESHOLD: f32 = 0.017_8;
    // Number of samples retained after the last loud sample: 0.5 s at 16 kHz.
    const PADDING: usize = 8_000;

    let total = pcm.len();

    let last_loud = match pcm.iter().rposition(|sample| sample.abs() > THRESHOLD) {
        Some(index) => index,
        // All-silent chunk: keep as-is — whisper will produce zero segments, which is correct.
        None => return,
    };

    // Never extend past the existing end of the buffer.
    let keep = total.min(last_loud + 1 + PADDING);
    if keep >= total {
        // The trailing quiet region already fits inside the padding window.
        return;
    }

    tracing::trace!(
        original_samples = total,
        trimmed_samples = total - keep,
        "trimmed trailing silence"
    );
    pcm.truncate(keep);
}
/// Decode any audio file to 16 kHz mono PCM f32 using ffmpeg.
async fn decode_audio(path: &std::path::Path) -> crate::Result<Vec<f32>> {
use tokio::process::Command;