use std::sync::atomic::Ordering; use axum::extract::State; use axum::Json; use crate::{models::HealthResponse, AppState, Result}; /// Return service health, GPU info, and queue depth. #[utoipa::path( get, path = "/health", tag = "system", responses( (status = 200, description = "Service healthy", body = HealthResponse), ) )] pub async fn health(State(state): State) -> Result> { let (gpu_name, vram_total_mb) = gpu_info(state.gpu_device); let model_state_tag = state.model_state.read().await.tag().to_string(); Ok(Json(HealthResponse { status: "ok".into(), gpu_name, vram_total_mb, model: state.model_name.to_string(), queue_depth: state.queue_depth.load(Ordering::Relaxed), model_state: model_state_tag, })) } /// Query NVIDIA GPU info via `nvidia-smi` for the given CUDA device index. fn gpu_info(device: u32) -> (Option, Option) { let Ok(out) = std::process::Command::new("nvidia-smi") .args([ &format!("--id={device}"), "--query-gpu=name,memory.total", "--format=csv,noheader,nounits", ]) .output() else { return (None, None); }; if !out.status.success() { return (None, None); } let line = String::from_utf8_lossy(&out.stdout); let line = line.trim(); let mut parts = line.splitn(2, ','); let name = parts.next().map(|s| s.trim().to_owned()); let vram = parts.next().and_then(|s| s.trim().parse::().ok()); (name, vram) }