feat: dynamic model loading/unloading with GPU polling
All checks were successful
Build & Push Docker Image / build-and-push (push) Successful in 8m41s
All checks were successful
Build & Push Docker Image / build-and-push (push) Successful in 8m41s
- Model starts unloaded (lazy); loads on first job or POST /model/load
- Auto-unloads after IDLE_TIMEOUT_SECS (default 300) of inactivity
- POST /model/unload for immediate manual release
- GPU-busy detection: on VRAM OOM, enters WaitingForGpu and retries
every GPU_POLL_INTERVAL_SECS (default 30) indefinitely
- POST /jobs when unloaded → 503 + Retry-After header, triggers load
- AppError::OutOfMemory and AppError::ModelNotReady variants
- WorkerCmd channel (SyncSender<WorkerCmd>) replaces bare tx_req channel
- Idle timer via recv_timeout(1s) tick inside OS thread (no extra thread)
- Model lifecycle events broadcast via tokio broadcast channel (SSE + webhooks)
- webhook_registry: all clients that ever submitted a webhook_url receive
model_ready and model_unloaded webhooks
- GPU warmup retained on every (re)load
New routes:
GET /model/status — current state + VRAM stats
POST /model/load — trigger load (idempotent)
POST /model/unload — immediate unload
GET /model/events — SSE stream of model lifecycle events
New env vars:
IDLE_TIMEOUT_SECS (default 300)
GPU_POLL_INTERVAL_SECS (default 30)
Tests:
tests/test_model_lifecycle.sh — 18 integration tests (full state machine,
SSE events, webhooks, concurrency, unload-during-load)
tests/test_idle_timeout.sh — 5 tests with short IDLE_TIMEOUT_SECS=5
test_all.sh updated: loads model before job submission, asserts
model_state in /health, adds POST /model/unload at end
Docs:
docs/USAGE.md: model lifecycle section, new env vars, 503 retry pattern,
updated /health response shape
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
141
src/error.rs
141
src/error.rs
@@ -1,6 +1,6 @@
|
||||
use thiserror::Error;
|
||||
use axum::{
|
||||
http::StatusCode,
|
||||
http::{StatusCode, HeaderValue, header},
|
||||
response::{IntoResponse, Response},
|
||||
Json,
|
||||
};
|
||||
@@ -21,19 +21,138 @@ pub enum AppError {
|
||||
|
||||
#[error("internal error: {0}")]
|
||||
Internal(String),
|
||||
|
||||
/// Returned when `whisper_init_state` or `cudaMalloc` fails due to
|
||||
/// insufficient VRAM. The worker uses this to distinguish a recoverable
|
||||
/// VRAM-pressure failure from a hard internal error.
|
||||
#[error("out of GPU memory: {0}")]
|
||||
OutOfMemory(String),
|
||||
|
||||
/// Returned when a job is submitted but the model is not yet loaded.
|
||||
/// Carries the current state tag and recommended Retry-After seconds.
|
||||
#[error("model not ready: {state}")]
|
||||
ModelNotReady { state: String, retry_after_secs: u64 },
|
||||
}
|
||||
|
||||
impl AppError {
|
||||
/// Reports whether `msg` looks like a CUDA out-of-memory failure.
///
/// The check is a case-sensitive substring search: it returns `true` as
/// soon as any of the known whisper.cpp / GGML allocation-failure markers
/// occurs anywhere in `msg`, and `false` otherwise (including for an
/// empty string).
pub fn is_oom(msg: &str) -> bool {
    // Substrings treated as evidence of an allocation failure.
    // "CUDA error: out of memory" is already covered by the shorter
    // "out of memory" marker; it is listed to keep the intent explicit.
    const OOM_MARKERS: [&str; 4] = [
        "cudaMalloc failed",
        "out of memory",
        "CUDA error: out of memory",
        "alloc_buffer",
    ];

    OOM_MARKERS.iter().any(|marker| msg.contains(marker))
}
|
||||
}
|
||||
|
||||
impl IntoResponse for AppError {
|
||||
fn into_response(self) -> Response {
|
||||
let (status, message) = match &self {
|
||||
AppError::NotFound(m) => (StatusCode::NOT_FOUND, m.clone()),
|
||||
AppError::BadRequest(m) => (StatusCode::BAD_REQUEST, m.clone()),
|
||||
AppError::Conflict(m) => (StatusCode::CONFLICT, m.clone()),
|
||||
AppError::Internal(m) => (StatusCode::INTERNAL_SERVER_ERROR, m.clone()),
|
||||
};
|
||||
|
||||
tracing::error!(status = status.as_u16(), error = %message);
|
||||
|
||||
(status, Json(json!({ "error": message }))).into_response()
|
||||
match self {
|
||||
AppError::NotFound(m) => {
|
||||
(StatusCode::NOT_FOUND, Json(json!({ "error": m }))).into_response()
|
||||
}
|
||||
AppError::BadRequest(m) => {
|
||||
(StatusCode::BAD_REQUEST, Json(json!({ "error": m }))).into_response()
|
||||
}
|
||||
AppError::Conflict(m) => {
|
||||
(StatusCode::CONFLICT, Json(json!({ "error": m }))).into_response()
|
||||
}
|
||||
AppError::Internal(m) => {
|
||||
tracing::error!(error = %m, "internal error");
|
||||
(StatusCode::INTERNAL_SERVER_ERROR, Json(json!({ "error": m }))).into_response()
|
||||
}
|
||||
AppError::OutOfMemory(m) => {
|
||||
tracing::warn!(error = %m, "GPU out of memory during model load");
|
||||
(StatusCode::SERVICE_UNAVAILABLE, Json(json!({ "error": m }))).into_response()
|
||||
}
|
||||
AppError::ModelNotReady { state, retry_after_secs } => {
|
||||
let body = Json(json!({
|
||||
"error": "model_not_ready",
|
||||
"state": state,
|
||||
"retry_after_secs": retry_after_secs,
|
||||
}));
|
||||
let mut resp = (StatusCode::SERVICE_UNAVAILABLE, body).into_response();
|
||||
resp.headers_mut().insert(
|
||||
header::RETRY_AFTER,
|
||||
HeaderValue::from_str(&retry_after_secs.to_string())
|
||||
.unwrap_or(HeaderValue::from_static("30")),
|
||||
);
|
||||
resp
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ── Unit tests ───────────────────────────────────────────────────────────────
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use axum::body::to_bytes;

    /// Builds the HTTP response produced by a `ModelNotReady` error with the
    /// given state tag and retry hint.
    fn model_not_ready_response(state: &str, retry_after_secs: u64) -> Response {
        AppError::ModelNotReady {
            state: state.into(),
            retry_after_secs,
        }
        .into_response()
    }

    #[test]
    fn test_is_oom_cuda_malloc() {
        assert!(AppError::is_oom("cudaMalloc failed: out of memory"));
        // The "cudaMalloc failed" marker must be sufficient on its own.
        assert!(AppError::is_oom("cudaMalloc failed"));
    }

    #[test]
    fn test_is_oom_alloc_buffer() {
        // Exact message from ggml_backend_cuda_buffer_type_alloc_buffer
        assert!(AppError::is_oom(
            "ggml_backend_cuda_buffer_type_alloc_buffer: allocating 2951.01 MiB on device 0: cudaMalloc failed: out of memory"
        ));
        // The full message above also matches two other markers, so check a
        // message that contains the `alloc_buffer` marker and nothing else.
        assert!(AppError::is_oom(
            "ggml_backend_cuda_buffer_type_alloc_buffer: allocating 2951.01 MiB on device 0"
        ));
    }

    #[test]
    fn test_is_oom_generic_out_of_memory() {
        assert!(AppError::is_oom("CUDA error: out of memory"));
        // The bare phrase without the "CUDA error:" prefix is matched too.
        assert!(AppError::is_oom("out of memory"));
    }

    #[test]
    fn test_is_oom_other_error() {
        assert!(!AppError::is_oom("failed to open model file"));
        assert!(!AppError::is_oom("invalid model format"));
        assert!(!AppError::is_oom(""));
    }

    // Building the response is synchronous, so the header checks below are
    // plain `#[test]`s; only the body test needs an async runtime.

    #[test]
    fn test_model_not_ready_response_has_retry_after_header() {
        let resp = model_not_ready_response("loading", 10);
        assert_eq!(resp.status(), StatusCode::SERVICE_UNAVAILABLE);
        let retry_after = resp
            .headers()
            .get(header::RETRY_AFTER)
            .expect("Retry-After header missing");
        assert_eq!(retry_after, "10");
    }

    #[tokio::test]
    async fn test_model_not_ready_response_body() {
        let resp = model_not_ready_response("unloaded", 30);
        let bytes = to_bytes(resp.into_body(), usize::MAX).await.unwrap();
        let v: serde_json::Value = serde_json::from_slice(&bytes).unwrap();
        assert_eq!(v["error"], "model_not_ready");
        assert_eq!(v["state"], "unloaded");
        assert_eq!(v["retry_after_secs"], 30);
    }

    #[test]
    fn test_model_not_ready_loading_retry_after_10() {
        let resp = model_not_ready_response("loading", 10);
        assert_eq!(resp.headers().get(header::RETRY_AFTER).unwrap(), "10");
    }

    #[test]
    fn test_model_not_ready_unloaded_retry_after_30() {
        let resp = model_not_ready_response("unloaded", 30);
        assert_eq!(resp.headers().get(header::RETRY_AFTER).unwrap(), "30");
    }
}
|
||||
|
||||
Reference in New Issue
Block a user