diff --git a/src/transcriber.rs b/src/transcriber.rs index 7ccaa10..e93d881 100644 --- a/src/transcriber.rs +++ b/src/transcriber.rs @@ -24,9 +24,10 @@ impl Transcriber { let mut params = WhisperContextParameters::new(); params.use_gpu(true); params.gpu_device(gpu_device as i32); - // Flash Attention (tile-based, works on sm_75). - // NOTE: mutually exclusive with DTW token timestamps. - params.flash_attn(true); + // Flash Attention disabled: causes silent 0-segment output on some + // real-world audio (conference recordings, noisy MP3s). Standard + // CUDA attention is safe on all content types. + // params.flash_attn(true); let ctx = WhisperContext::new_with_params(path, params) .map_err(|e| AppError::Internal(format!("failed to load model: {e}")))?; @@ -54,28 +55,20 @@ impl Transcriber { patience: 1.0, }); - // RTX 2080: use all host CPU threads for pre/post processing fp.set_n_threads(num_cpus::get() as i32); - - // Deterministic, fastest decode path fp.set_temperature(0.0); - // Temperature fallback: when a segment fails quality checks, retry with - // increasing temperature (0.0 → 0.2 → 0.4 …) rather than hallucinating. fp.set_temperature_inc(0.2); - // ── Anti-hallucination / quality guards (from whisper.cpp docs) ────── - // Skip segments where the model is uncertain there is speech at all. + // ── Anti-hallucination / quality guards ─────────────────────────────── + // no_speech_thold: segments where p(no-speech) > threshold are dropped. + // 0.6 is the whisper.cpp default — safe for real-world and clean audio. + // (0.0 would suppress *everything*; 1.0 disables the filter entirely.) fp.set_no_speech_thold(0.6); - // High token-entropy signals a repetition loop — abort the segment. fp.set_entropy_thold(2.4); - // Low average log-probability signals poor confidence — discard segment. fp.set_logprob_thold(-1.0); - // Suppress leading blank tokens (avoids empty/whitespace-only segments). fp.set_suppress_blank(true); - // Suppress music notes, laughter, [BLANK_AUDIO] and similar non-speech tokens. fp.set_suppress_non_speech_tokens(true); - // Don't echo progress/results to stdout — we use the callback instead. fp.set_print_progress(false); fp.set_print_realtime(false);