fix: trim trailing silence from each chunk before whisper
All checks were successful
Build & Push Docker Image / build-and-push (push) Successful in 6m44s
All checks were successful
Build & Push Docker Image / build-and-push (push) Successful in 6m44s
Whisper hallucinates filler tokens ("Bye.", "Thank you.", etc.) into end-of-chunk silence. This is especially visible on the final chunk of long audio, where the outro silence triggers a 10× repetition loop. Fix: after slicing each PCM chunk, scan backwards to find the last sample above −35 dB, keep 0.5 s of padding after it, and truncate the rest. This is applied to every chunk, not just the last, so any chunk ending in a long period of silence gets the same protection. Constants match the silencedetect filter already used for chunking: THRESHOLD = 0.0178 (−35 dB); PADDING = 8000 samples (0.5 s at 16 kHz). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -326,7 +326,8 @@ async fn process_job(
|
||||
for (ci, (chunk_start, chunk_end)) in chunks.iter().enumerate() {
|
||||
let s0 = (*chunk_start * 16_000.0) as usize;
|
||||
let s1 = ((*chunk_end * 16_000.0) as usize).min(pcm.len());
|
||||
let chunk_pcm = pcm[s0..s1].to_vec();
|
||||
let mut chunk_pcm = pcm[s0..s1].to_vec();
|
||||
trim_trailing_silence(&mut chunk_pcm);
|
||||
|
||||
// Base percent this chunk starts at.
|
||||
let base = (ci * 100 / n) as u8;
|
||||
@@ -403,6 +404,29 @@ async fn process_job(
|
||||
Ok((all_segments, language, total_secs))
|
||||
}
|
||||
|
||||
/// Trim trailing silence from a 16 kHz mono PCM buffer.
|
||||
///
|
||||
/// Scans backwards to find the last sample above −35 dB, then keeps
|
||||
/// 0.5 s of padding after it. This prevents whisper from hallucinating
|
||||
/// filler tokens into end-of-chunk silence.
|
||||
fn trim_trailing_silence(pcm: &mut Vec<f32>) {
|
||||
const THRESHOLD: f32 = 0.017_8; // −35 dB (10^(−35/20))
|
||||
const PADDING: usize = 8_000; // 0.5 s at 16 kHz
|
||||
|
||||
if let Some(last_loud) = pcm.iter().rposition(|&s| s.abs() > THRESHOLD) {
|
||||
let new_len = (last_loud + 1 + PADDING).min(pcm.len());
|
||||
if new_len < pcm.len() {
|
||||
tracing::trace!(
|
||||
original_samples = pcm.len(),
|
||||
trimmed_samples = pcm.len() - new_len,
|
||||
"trimmed trailing silence"
|
||||
);
|
||||
pcm.truncate(new_len);
|
||||
}
|
||||
}
|
||||
// All-silent chunk: keep as-is — whisper will produce zero segments, which is correct.
|
||||
}
|
||||
|
||||
/// Decode any audio file to 16 kHz mono PCM f32 using ffmpeg.
|
||||
async fn decode_audio(path: &std::path::Path) -> crate::Result<Vec<f32>> {
|
||||
use tokio::process::Command;
|
||||
|
||||
Reference in New Issue
Block a user