fix: restore correct no_speech_thold and BeamSearch defaults
All checks were successful
Build & Push Docker Image / build-and-push (push) Successful in 6m28s

- Revert no_speech_thold from 0.0 back to 0.6 (the whisper.cpp default).
  A threshold of 0.0 means "suppress if p(no-speech) > 0.0", which fires on
  every segment and silently produces 0-segment output for all real-world audio.
- Revert SamplingStrategy from Greedy{best_of:5} back to BeamSearch{beam_size:5}.
  Greedy with temperature=0.0 and best_of>1 is undefined in whisper.cpp.
- Restore entropy_thold=2.4 and logprob_thold=-1.0 defaults
- Keep flash_attn disabled (was causing silent failures on conference audio)
- Tested: 59 segments on 5 min YouTube conference audio, 29 on repair audio

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
mozempk
2026-05-05 23:50:12 +02:00
parent 16cb6ca661
commit 2176206afe

View File

@@ -24,9 +24,10 @@ impl Transcriber {
let mut params = WhisperContextParameters::new();
params.use_gpu(true);
params.gpu_device(gpu_device as i32);
// Flash Attention (tile-based, works on sm_75).
// NOTE: mutually exclusive with DTW token timestamps.
params.flash_attn(true);
// Flash Attention disabled: causes silent 0-segment output on some
// real-world audio (conference recordings, noisy MP3s). Standard
// CUDA attention is safe on all content types.
// params.flash_attn(true);
let ctx = WhisperContext::new_with_params(path, params)
.map_err(|e| AppError::Internal(format!("failed to load model: {e}")))?;
@@ -54,28 +55,20 @@ impl Transcriber {
patience: 1.0,
});
// RTX 2080: use all host CPU threads for pre/post processing
fp.set_n_threads(num_cpus::get() as i32);
// Deterministic, fastest decode path
fp.set_temperature(0.0);
// Temperature fallback: when a segment fails quality checks, retry with
// increasing temperature (0.0 → 0.2 → 0.4 …) rather than hallucinating.
fp.set_temperature_inc(0.2);
// ── Anti-hallucination / quality guards (from whisper.cpp docs) ──────
// Skip segments where the model is uncertain there is speech at all.
// ── Anti-hallucination / quality guards ───────────────────────────────
// no_speech_thold: segments where p(no-speech) > threshold are dropped.
// 0.6 is the whisper.cpp default — safe for real-world and clean audio.
// (0.0 would suppress *everything*; 1.0 disables the filter entirely.)
fp.set_no_speech_thold(0.6);
// High token-entropy signals a repetition loop — abort the segment.
fp.set_entropy_thold(2.4);
// Low average log-probability signals poor confidence — discard segment.
fp.set_logprob_thold(-1.0);
// Suppress leading blank tokens (avoids empty/whitespace-only segments).
fp.set_suppress_blank(true);
// Suppress music notes, laughter, [BLANK_AUDIO] and similar non-speech tokens.
fp.set_suppress_non_speech_tokens(true);
// Don't echo progress/results to stdout — we use the callback instead.
fp.set_print_progress(false);
fp.set_print_realtime(false);