feat: GPU-accelerated Whisper API for RTX 2080 (sm_75)

- Pure Rust: Axum 0.7 + whisper-rs 0.13 (CUDA FFI)
- Async job queue with SSE progress streaming
- Webhook delivery with 5x exponential backoff
- Disk-persisted job state (survives restarts)
- Anti-hallucination params: no_speech_thold, entropy_thold, suppress_blank
- CUDA sm_75 flags: GGML_CUDA_FORCE_MMQ, GGML_CUDA_GRAPHS, GGML_CUDA_FA_ALL_QUANTS
- Configurable via env: CUDA_DEVICE, WHISPER_MODEL_PATH, PORT, DATA_DIR
- Gitea Actions CI: build + push to git.sal.giize.com registry
- Multi-stage Dockerfile with customizable CUDA_VERSION ARG

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-05-05 22:47:24 +02:00
commit 16cb6ca661
18 changed files with 1898 additions and 0 deletions
--- a/src/routes/health.rs
+++ b/src/routes/health.rs
@@ -0,0 +1,56 @@
+use std::sync::atomic::Ordering;
+
+use axum::extract::State;
+use axum::Json;
+
+use crate::{models::HealthResponse, AppState, Result};
+
+/// Return service health, GPU info, and queue depth.
+#[utoipa::path(
+    get,
+    path = "/health",
+    tag  = "system",
+    responses(
+        (status = 200, description = "Service healthy", body = HealthResponse),
+    )
+)]
+pub async fn health(State(state): State<AppState>) -> Result<Json<HealthResponse>> {
+    // Probe the configured CUDA device first; either value is `None` when
+    // the probe cannot provide it.
+    // NOTE(review): `gpu_info` runs `nvidia-smi` synchronously on every
+    // request — consider caching the result or moving the call off the
+    // async worker thread.
+    let (gpu_name, vram_total_mb) = gpu_info(state.gpu_device);
+
+    // Snapshot the counter after the probe so it is as fresh as possible.
+    // `Relaxed` is used because the value is only reported, never used to
+    // synchronise other memory accesses in this handler.
+    let queue_depth = state.queue_depth.load(Ordering::Relaxed);
+    let model = state.model_name.to_string();
+
+    // This handler has no failure path of its own: it always answers `Ok`.
+    let body = HealthResponse {
+        status: "ok".into(),
+        gpu_name,
+        vram_total_mb,
+        model,
+        queue_depth,
+    };
+    Ok(Json(body))
+}
+
+/// Query NVIDIA GPU info via `nvidia-smi` for the given CUDA device index.
+///
+/// Runs `nvidia-smi --id=<device> --query-gpu=name,memory.total
+/// --format=csv,noheader,nounits` and parses the first non-empty output row.
+///
+/// Returns `(name, total_memory)`:
+/// * `name` is the trimmed product name, or `None` if it is empty.
+/// * `total_memory` is the numeric `memory.total` column (unit suffix is
+///   suppressed by `nounits`; `nvidia-smi` documents the value as MiB), or
+///   `None` if the column is missing or not an unsigned integer.
+///
+/// Both values are `None` when `nvidia-smi` cannot be started, exits with a
+/// non-zero status, or prints nothing. This function never panics and never
+/// returns an error: GPU details are treated as best-effort information.
+///
+/// Note: this spawns a child process and blocks the calling thread until it
+/// exits.
+fn gpu_info(device: u32) -> (Option<String>, Option<u64>) {
+    let Ok(out) = std::process::Command::new("nvidia-smi")
+        .arg(format!("--id={device}"))
+        .args([
+            "--query-gpu=name,memory.total",
+            "--format=csv,noheader,nounits",
+        ])
+        .output()
+    else {
+        // Binary missing or not executable: no GPU info available.
+        return (None, None);
+    };
+
+    if !out.status.success() {
+        return (None, None);
+    }
+
+    let stdout = String::from_utf8_lossy(&out.stdout);
+
+    // Only the first non-empty row is relevant; ignoring anything after it
+    // keeps stray extra lines from corrupting the memory column.
+    let Some(row) = stdout.lines().map(str::trim).find(|l| !l.is_empty()) else {
+        return (None, None);
+    };
+
+    // `memory.total` is the last CSV column, so split on the LAST comma: a
+    // product name that itself contains a comma stays intact.
+    let (name, vram) = match row.rsplit_once(',') {
+        Some((name, vram)) => (name.trim(), vram.trim().parse::<u64>().ok()),
+        None => (row, None),
+    };
+
+    // Report an empty name as absent rather than as `Some("")`.
+    ((!name.is_empty()).then(|| name.to_owned()), vram)
+}