fix: trim trailing silence from each chunk before whisper
All checks were successful
Build & Push Docker Image / build-and-push (push) Successful in 6m44s
All checks were successful
Build & Push Docker Image / build-and-push (push) Successful in 6m44s
Whisper hallucinates filler tokens ("Bye.", "Thank you.", etc.) into end-of-chunk silence. This is especially visible on the final chunk of long audio, where the outro silence triggers a 10× repetition loop.

Fix: after slicing each PCM chunk, scan backwards to find the last sample above −35 dB, keep 0.5 s of padding after it, and truncate the rest. This is applied to every chunk, not just the last — any chunk ending in a long period of silence gets the same protection.

Constants match the silencedetect filter already used for chunking:
- THRESHOLD = 0.0178 (−35 dB)
- PADDING = 8000 samples (0.5 s at 16 kHz)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -326,7 +326,8 @@ async fn process_job(
|
|||||||
for (ci, (chunk_start, chunk_end)) in chunks.iter().enumerate() {
|
for (ci, (chunk_start, chunk_end)) in chunks.iter().enumerate() {
|
||||||
let s0 = (*chunk_start * 16_000.0) as usize;
|
let s0 = (*chunk_start * 16_000.0) as usize;
|
||||||
let s1 = ((*chunk_end * 16_000.0) as usize).min(pcm.len());
|
let s1 = ((*chunk_end * 16_000.0) as usize).min(pcm.len());
|
||||||
let chunk_pcm = pcm[s0..s1].to_vec();
|
let mut chunk_pcm = pcm[s0..s1].to_vec();
|
||||||
|
trim_trailing_silence(&mut chunk_pcm);
|
||||||
|
|
||||||
// Base percent this chunk starts at.
|
// Base percent this chunk starts at.
|
||||||
let base = (ci * 100 / n) as u8;
|
let base = (ci * 100 / n) as u8;
|
||||||
@@ -403,6 +404,29 @@ async fn process_job(
|
|||||||
Ok((all_segments, language, total_secs))
|
Ok((all_segments, language, total_secs))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Trim trailing silence from a 16 kHz mono PCM buffer.
|
||||||
|
///
|
||||||
|
/// Scans backwards to find the last sample above −35 dB, then keeps
|
||||||
|
/// 0.5 s of padding after it. This prevents whisper from hallucinating
|
||||||
|
/// filler tokens into end-of-chunk silence.
|
||||||
|
fn trim_trailing_silence(pcm: &mut Vec<f32>) {
|
||||||
|
const THRESHOLD: f32 = 0.017_8; // −35 dB (10^(−35/20))
|
||||||
|
const PADDING: usize = 8_000; // 0.5 s at 16 kHz
|
||||||
|
|
||||||
|
if let Some(last_loud) = pcm.iter().rposition(|&s| s.abs() > THRESHOLD) {
|
||||||
|
let new_len = (last_loud + 1 + PADDING).min(pcm.len());
|
||||||
|
if new_len < pcm.len() {
|
||||||
|
tracing::trace!(
|
||||||
|
original_samples = pcm.len(),
|
||||||
|
trimmed_samples = pcm.len() - new_len,
|
||||||
|
"trimmed trailing silence"
|
||||||
|
);
|
||||||
|
pcm.truncate(new_len);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// All-silent chunk: keep as-is — whisper will produce zero segments, which is correct.
|
||||||
|
}
|
||||||
|
|
||||||
/// Decode any audio file to 16 kHz mono PCM f32 using ffmpeg.
|
/// Decode any audio file to 16 kHz mono PCM f32 using ffmpeg.
|
||||||
async fn decode_audio(path: &std::path::Path) -> crate::Result<Vec<f32>> {
|
async fn decode_audio(path: &std::path::Path) -> crate::Result<Vec<f32>> {
|
||||||
use tokio::process::Command;
|
use tokio::process::Command;
|
||||||
|
|||||||
Reference in New Issue
Block a user