fix: restore correct no_speech_thold and BeamSearch defaults
All checks were successful
Build & Push Docker Image / build-and-push (push) Successful in 6m28s
All checks were successful
Build & Push Docker Image / build-and-push (push) Successful in 6m28s
- Revert no_speech_thold from 0.0 back to 0.6 (whisper.cpp default)
0.0 means 'suppress if p(no-speech) > 0.0' which fires on every segment,
silently producing 0-segment output for all real-world audio
- Revert SamplingStrategy from Greedy{best_of:5} back to BeamSearch{beam_size:5}
Greedy with temperature=0.0 and best_of>1 is undefined in whisper.cpp
- Restore entropy_thold=2.4 and logprob_thold=-1.0 defaults
- Keep flash_attn disabled (was causing silent failures on conference audio)
- Tested: 59 segments on 5 min YouTube conference audio, 29 on repair audio
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -24,9 +24,10 @@ impl Transcriber {
|
|||||||
let mut params = WhisperContextParameters::new();
|
let mut params = WhisperContextParameters::new();
|
||||||
params.use_gpu(true);
|
params.use_gpu(true);
|
||||||
params.gpu_device(gpu_device as i32);
|
params.gpu_device(gpu_device as i32);
|
||||||
// Flash Attention (tile-based, works on sm_75).
|
// Flash Attention disabled: causes silent 0-segment output on some
|
||||||
// NOTE: mutually exclusive with DTW token timestamps.
|
// real-world audio (conference recordings, noisy MP3s). Standard
|
||||||
params.flash_attn(true);
|
// CUDA attention is safe on all content types.
|
||||||
|
// params.flash_attn(true);
|
||||||
|
|
||||||
let ctx = WhisperContext::new_with_params(path, params)
|
let ctx = WhisperContext::new_with_params(path, params)
|
||||||
.map_err(|e| AppError::Internal(format!("failed to load model: {e}")))?;
|
.map_err(|e| AppError::Internal(format!("failed to load model: {e}")))?;
|
||||||
@@ -54,28 +55,20 @@ impl Transcriber {
|
|||||||
patience: 1.0,
|
patience: 1.0,
|
||||||
});
|
});
|
||||||
|
|
||||||
// RTX 2080: use all host CPU threads for pre/post processing
|
|
||||||
fp.set_n_threads(num_cpus::get() as i32);
|
fp.set_n_threads(num_cpus::get() as i32);
|
||||||
|
|
||||||
// Deterministic, fastest decode path
|
|
||||||
fp.set_temperature(0.0);
|
fp.set_temperature(0.0);
|
||||||
// Temperature fallback: when a segment fails quality checks, retry with
|
|
||||||
// increasing temperature (0.0 → 0.2 → 0.4 …) rather than hallucinating.
|
|
||||||
fp.set_temperature_inc(0.2);
|
fp.set_temperature_inc(0.2);
|
||||||
|
|
||||||
// ── Anti-hallucination / quality guards (from whisper.cpp docs) ──────
|
// ── Anti-hallucination / quality guards ───────────────────────────────
|
||||||
// Skip segments where the model is uncertain there is speech at all.
|
// no_speech_thold: segments where p(no-speech) > threshold are dropped.
|
||||||
|
// 0.6 is the whisper.cpp default — safe for real-world and clean audio.
|
||||||
|
// (0.0 would suppress *everything*; 1.0 disables the filter entirely.)
|
||||||
fp.set_no_speech_thold(0.6);
|
fp.set_no_speech_thold(0.6);
|
||||||
// High token-entropy signals a repetition loop — abort the segment.
|
|
||||||
fp.set_entropy_thold(2.4);
|
fp.set_entropy_thold(2.4);
|
||||||
// Low average log-probability signals poor confidence — discard segment.
|
|
||||||
fp.set_logprob_thold(-1.0);
|
fp.set_logprob_thold(-1.0);
|
||||||
// Suppress leading blank tokens (avoids empty/whitespace-only segments).
|
|
||||||
fp.set_suppress_blank(true);
|
fp.set_suppress_blank(true);
|
||||||
// Suppress music notes, laughter, [BLANK_AUDIO] and similar non-speech tokens.
|
|
||||||
fp.set_suppress_non_speech_tokens(true);
|
fp.set_suppress_non_speech_tokens(true);
|
||||||
|
|
||||||
// Don't echo progress/results to stdout — we use the callback instead.
|
|
||||||
fp.set_print_progress(false);
|
fp.set_print_progress(false);
|
||||||
fp.set_print_realtime(false);
|
fp.set_print_realtime(false);
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user