fix: trim trailing silence from each chunk before whisper

Whisper hallucinates filler tokens ("Bye.", "Thank you.", etc.) into end-of-chunk silence. This is especially visible on the final chunk of long audio, where the outro silence triggers a 10× repetition loop. Fix: after slicing each PCM chunk, scan backwards to find the last sample above −35 dB, then keep 0.5 s of padding after it and truncate the rest. This is applied to every chunk, not just the last — any chunk ending in a long period of silence gets the same protection. Constants match the silencedetect filter already used for chunking: THRESHOLD = 0.0178 (−35 dB); PADDING = 8000 samples (0.5 s at 16 kHz). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-05-06 02:13:20 +02:00
parent 35e7ea8d28
commit ef9c04b070
1 changed files with 25 additions and 1 deletions
--- a/src/worker.rs
+++ b/src/worker.rs
@@ -326,7 +326,8 @@ async fn process_job(
    for (ci, (chunk_start, chunk_end)) in chunks.iter().enumerate() {
        let s0 = (*chunk_start * 16_000.0) as usize;
        let s1 = ((*chunk_end   * 16_000.0) as usize).min(pcm.len());
-        let chunk_pcm = pcm[s0..s1].to_vec();
+        let mut chunk_pcm = pcm[s0..s1].to_vec();
        trim_trailing_silence(&mut chunk_pcm);
        // Base percent this chunk starts at.
        let base  = (ci * 100 / n) as u8;
@@ -403,6 +404,29 @@ async fn process_job(
    Ok((all_segments, language, total_secs))
 }
 /// Cut the quiet tail off a 16 kHz mono PCM buffer, in place.
 ///
 /// Searching from the end, the function locates the final sample whose
 /// absolute amplitude exceeds −35 dB and truncates the buffer so that at
 /// most 0.5 s of audio remains after that sample. The intent is to stop
 /// whisper from hallucinating filler tokens into end-of-chunk silence.
 ///
 /// The buffer is left untouched when no sample exceeds the threshold, or
 /// when the tail after the last loud sample is already within the padding.
 fn trim_trailing_silence(pcm: &mut Vec<f32>) {
    const PADDING: usize = 8_000; // 0.5 s worth of samples at 16 kHz
    const THRESHOLD: f32 = 0.017_8; // −35 dB expressed as linear amplitude (10^(−35/20))

    let total = pcm.len();

    // Index of the last sample louder than the threshold, if any.
    let last_loud = match pcm.iter().rposition(|sample| sample.abs() > THRESHOLD) {
        Some(idx) => idx,
        // Entirely quiet chunk: keep it as-is (the expectation is that
        // whisper yields no segments for it).
        None => return,
    };

    // Keep everything up to and including the loud sample, plus the padding,
    // but never more than the buffer actually holds.
    let keep = total.min(last_loud + 1 + PADDING);
    if keep >= total {
        // Tail is already short enough; nothing to cut.
        return;
    }

    tracing::trace!(
        original_samples = total,
        trimmed_samples = total - keep,
        "trimmed trailing silence"
    );
    pcm.truncate(keep);
 }
 /// Decode any audio file to 16 kHz mono PCM f32 using ffmpeg.
 async fn decode_audio(path: &std::path::Path) -> crate::Result<Vec<f32>> {
    use tokio::process::Command;