fix: restore correct no_speech_thold and BeamSearch defaults

- Revert no_speech_thold from 0.0 back to 0.6 (the whisper.cpp default).
  0.0 means 'suppress if p(no-speech) > 0.0', which fires on every segment,
  silently producing 0-segment output for all real-world audio.
- Revert SamplingStrategy from Greedy{best_of:5} back to BeamSearch{beam_size:5}.
  Greedy with temperature=0.0 and best_of>1 is undefined in whisper.cpp.
- Restore entropy_thold=2.4 and logprob_thold=-1.0 defaults.
- Keep flash_attn disabled (was causing silent failures on conference audio).
- Tested: 59 segments on 5 min YouTube conference audio, 29 on repair audio.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-05-05 23:50:12 +02:00
parent 16cb6ca661
commit 2176206afe
1 changed files with 8 additions and 15 deletions
--- a/src/transcriber.rs
+++ b/src/transcriber.rs
@@ -24,9 +24,10 @@ impl Transcriber {
        let mut params = WhisperContextParameters::new();
        params.use_gpu(true);
        params.gpu_device(gpu_device as i32);
-        // Flash Attention (tile-based, works on sm_75).
+        // Flash Attention disabled: causes silent 0-segment output on some
-        // NOTE: mutually exclusive with DTW token timestamps.
+        // real-world audio (conference recordings, noisy MP3s). Standard
-        params.flash_attn(true);
+        // CUDA attention is safe on all content types.
        // params.flash_attn(true);
        let ctx = WhisperContext::new_with_params(path, params)
            .map_err(|e| AppError::Internal(format!("failed to load model: {e}")))?;
@@ -54,28 +55,20 @@ impl Transcriber {
            patience:  1.0,
        });
        // RTX 2080: use all host CPU threads for pre/post processing
        fp.set_n_threads(num_cpus::get() as i32);
        // Deterministic, fastest decode path
        fp.set_temperature(0.0);
        // Temperature fallback: when a segment fails quality checks, retry with
        // increasing temperature (0.0 → 0.2 → 0.4 …) rather than hallucinating.
        fp.set_temperature_inc(0.2);
-        // ── Anti-hallucination / quality guards (from whisper.cpp docs) ──────
+        // ── Anti-hallucination / quality guards ───────────────────────────────
-        // Skip segments where the model is uncertain there is speech at all.
+        // no_speech_thold: segments where p(no-speech) > threshold are dropped.
        // 0.6 is the whisper.cpp default — safe for real-world and clean audio.
        // (0.0 would suppress *everything*; 1.0 disables the filter entirely.)
        fp.set_no_speech_thold(0.6);
        // High token-entropy signals a repetition loop — abort the segment.
        fp.set_entropy_thold(2.4);
        // Low average log-probability signals poor confidence — discard segment.
        fp.set_logprob_thold(-1.0);
        // Suppress leading blank tokens (avoids empty/whitespace-only segments).
        fp.set_suppress_blank(true);
        // Suppress music notes, laughter, [BLANK_AUDIO] and similar non-speech tokens.
        fp.set_suppress_non_speech_tokens(true);
        // Don't echo progress/results to stdout — we use the callback instead.
        fp.set_print_progress(false);
        fp.set_print_realtime(false);