feat: GPU-accelerated Whisper API for RTX 2080 (sm_75)
All checks were successful
Build & Push Docker Image / build-and-push (push) Successful in 11m13s
All checks were successful
Build & Push Docker Image / build-and-push (push) Successful in 11m13s
- Pure Rust: Axum 0.7 + whisper-rs 0.13 (CUDA FFI)
- Async job queue with SSE progress streaming
- Webhook delivery with 5x exponential backoff
- Disk-persisted job state (survives restarts)
- Anti-hallucination params: no_speech_thold, entropy_thold, suppress_blank
- CUDA sm_75 flags: GGML_CUDA_FORCE_MMQ, GGML_CUDA_GRAPHS, GGML_CUDA_FA_ALL_QUANTS
- Configurable via env: CUDA_DEVICE, WHISPER_MODEL_PATH, PORT, DATA_DIR
- Gitea Actions CI: build + push to git.sal.giize.com registry
- Multi-stage Dockerfile with customizable CUDA_VERSION ARG

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
56
src/routes/health.rs
Normal file
56
src/routes/health.rs
Normal file
@@ -0,0 +1,56 @@
|
||||
use std::sync::atomic::Ordering;
|
||||
|
||||
use axum::extract::State;
|
||||
use axum::Json;
|
||||
|
||||
use crate::{models::HealthResponse, AppState, Result};
|
||||
|
||||
/// Return service health, GPU info, and queue depth.
|
||||
#[utoipa::path(
|
||||
get,
|
||||
path = "/health",
|
||||
tag = "system",
|
||||
responses(
|
||||
(status = 200, description = "Service healthy", body = HealthResponse),
|
||||
)
|
||||
)]
|
||||
pub async fn health(State(state): State<AppState>) -> Result<Json<HealthResponse>> {
|
||||
let (gpu_name, vram_total_mb) = gpu_info(state.gpu_device);
|
||||
|
||||
Ok(Json(HealthResponse {
|
||||
status: "ok".into(),
|
||||
gpu_name,
|
||||
vram_total_mb,
|
||||
model: state.model_name.to_string(),
|
||||
queue_depth: state.queue_depth.load(Ordering::Relaxed),
|
||||
}))
|
||||
}
|
||||
|
||||
/// Query NVIDIA GPU info via `nvidia-smi` for the given CUDA device index.
///
/// Runs `nvidia-smi --id=<device> --query-gpu=name,memory.total
/// --format=csv,noheader,nounits` and parses the first record it prints.
///
/// Returns `(name, vram)`, where `vram` is the `memory.total` figure exactly as
/// printed by `nvidia-smi` (the caller exposes it as `vram_total_mb`).
/// Both are `None` when the binary cannot be spawned or exits with a non-zero
/// status; each is individually `None` when its field is missing or malformed.
///
/// This waits for the child process to exit before returning.
fn gpu_info(device: u32) -> (Option<String>, Option<u64>) {
    let id_arg = format!("--id={device}");

    let Ok(out) = std::process::Command::new("nvidia-smi")
        .args([
            id_arg.as_str(),
            "--query-gpu=name,memory.total",
            "--format=csv,noheader,nounits",
        ])
        .output()
    else {
        // Binary missing or not executable: treat as "no GPU info available".
        return (None, None);
    };

    if !out.status.success() {
        return (None, None);
    }

    parse_gpu_csv(&String::from_utf8_lossy(&out.stdout))
}

/// Parse the first `name, memory.total` CSV record from `nvidia-smi` output.
fn parse_gpu_csv(output: &str) -> (Option<String>, Option<u64>) {
    // Only the first non-empty line is considered, so stray extra lines cannot
    // leak into the numeric field and break the parse.
    let Some(record) = output.lines().map(str::trim).find(|l| !l.is_empty()) else {
        return (None, None);
    };

    // Split on the LAST comma: the memory figure is the final field, so a comma
    // inside the device name cannot shift it.
    let (name, vram) = match record.rsplit_once(',') {
        Some((name, vram)) => (name.trim(), vram.trim().parse::<u64>().ok()),
        None => (record, None),
    };

    // An empty name carries no information; report it as absent.
    let name = (!name.is_empty()).then(|| name.to_owned());

    (name, vram)
}
|
||||
Reference in New Issue
Block a user