Normalize rolling partial-hypothesis chains before final job persistence so downstream clients receive stable transcript segments instead of echoed continuations. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
57 lines
1.6 KiB
Rust
57 lines
1.6 KiB
Rust
use std::sync::atomic::Ordering;
|
|
|
|
use axum::extract::State;
|
|
use axum::Json;
|
|
|
|
use crate::{models::HealthResponse, AppState, Result};
|
|
|
|
/// Return service health, GPU info, and queue depth.
|
|
#[utoipa::path(
|
|
get,
|
|
path = "/health",
|
|
tag = "system",
|
|
responses(
|
|
(status = 200, description = "Service healthy", body = HealthResponse),
|
|
)
|
|
)]
|
|
pub async fn health(State(state): State<AppState>) -> Result<Json<HealthResponse>> {
|
|
let (gpu_name, vram_total_mb) = gpu_info(state.gpu_device);
|
|
let model_state_tag = state.model_state.read().await.tag().to_string();
|
|
|
|
Ok(Json(HealthResponse {
|
|
status: "ok".into(),
|
|
gpu_name,
|
|
vram_total_mb,
|
|
model: state.model_name.to_string(),
|
|
queue_depth: state.queue_depth.load(Ordering::Relaxed),
|
|
model_state: model_state_tag,
|
|
}))
|
|
}
|
|
|
|
/// Query NVIDIA GPU info via `nvidia-smi` for the given CUDA device index.
///
/// Runs `nvidia-smi --id=<device> --query-gpu=name,memory.total
/// --format=csv,noheader,nounits` and returns `(name, total_memory)`.
///
/// This is best-effort: if the command cannot be started or exits with a
/// non-zero status, `(None, None)` is returned. Each element is `None`
/// independently when the corresponding column is missing, blank, or (for the
/// memory figure) not an unsigned integer.
///
/// The call blocks the current thread until `nvidia-smi` exits.
fn gpu_info(device: u32) -> (Option<String>, Option<u64>) {
    let id_arg = format!("--id={device}");

    let Ok(out) = std::process::Command::new("nvidia-smi")
        .args([
            id_arg.as_str(),
            "--query-gpu=name,memory.total",
            "--format=csv,noheader,nounits",
        ])
        .output()
    else {
        return (None, None);
    };

    if !out.status.success() {
        return (None, None);
    }

    parse_gpu_info(&String::from_utf8_lossy(&out.stdout))
}

/// Parse the first non-blank `name, memory.total` CSV record of `nvidia-smi` output.
fn parse_gpu_info(stdout: &str) -> (Option<String>, Option<u64>) {
    // Successful-but-empty output must not be reported as a GPU with an empty name.
    let Some(record) = stdout.lines().map(str::trim).find(|line| !line.is_empty()) else {
        return (None, None);
    };

    // `memory.total` is the last column, so split from the right: a comma inside
    // the product name then stays part of the name instead of shifting columns.
    let (name, vram) = match record.rsplit_once(',') {
        Some((name, vram)) => (name.trim(), vram.trim().parse::<u64>().ok()),
        None => (record, None),
    };

    let name = (!name.is_empty()).then(|| name.to_owned());
    (name, vram)
}
|