feat: dynamic model loading/unloading with GPU polling
All checks were successful
Build & Push Docker Image / build-and-push (push) Successful in 8m41s

- Model starts unloaded (lazy); loads on first job or POST /model/load
- Auto-unloads after IDLE_TIMEOUT_SECS (default 300) of inactivity
- POST /model/unload for immediate manual release
- GPU-busy detection: on VRAM OOM, enters WaitingForGpu and retries
  every GPU_POLL_INTERVAL_SECS (default 30) indefinitely
- POST /jobs when unloaded → 503 + Retry-After header, triggers load
- AppError::OutOfMemory and AppError::ModelNotReady variants
- WorkerCmd channel (SyncSender<WorkerCmd>) replaces bare tx_req channel
- Idle timer via recv_timeout(1s) tick inside OS thread (no extra thread)
- Model lifecycle events broadcast via tokio broadcast channel (SSE + webhooks)
- webhook_registry: all clients that ever submitted a webhook_url receive
  model_ready and model_unloaded webhooks
- GPU warmup retained on every (re)load

New routes:
  GET  /model/status  — current state + VRAM stats
  POST /model/load    — trigger load (idempotent)
  POST /model/unload  — immediate unload
  GET  /model/events  — SSE stream of model lifecycle events

New env vars:
  IDLE_TIMEOUT_SECS       (default 300)
  GPU_POLL_INTERVAL_SECS  (default 30)

Tests:
  tests/test_model_lifecycle.sh — 18 integration tests (full state machine,
    SSE events, webhooks, concurrency, unload-during-load)
  tests/test_idle_timeout.sh    — 5 tests with short IDLE_TIMEOUT_SECS=5
  test_all.sh updated: loads model before job submission, asserts
    model_state in /health, adds POST /model/unload at end

Docs:
  docs/USAGE.md: model lifecycle section, new env vars, 503 retry pattern,
    updated /health response shape

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
mozempk
2026-05-08 17:57:20 +02:00
parent 78c6fab81b
commit b191fbe200
13 changed files with 2053 additions and 148 deletions

View File

@@ -1,6 +1,6 @@
use thiserror::Error;
use axum::{
http::StatusCode,
http::{StatusCode, HeaderValue, header},
response::{IntoResponse, Response},
Json,
};
@@ -21,19 +21,138 @@ pub enum AppError {
#[error("internal error: {0}")]
Internal(String),
/// Returned when `whisper_init_state` or `cudaMalloc` fails due to
/// insufficient VRAM. The worker uses this to distinguish a recoverable
/// VRAM-pressure failure from a hard internal error.
#[error("out of GPU memory: {0}")]
OutOfMemory(String),
/// Returned when a job is submitted but the model is not yet loaded.
/// Carries the current state tag and recommended Retry-After seconds.
#[error("model not ready: {state}")]
ModelNotReady { state: String, retry_after_secs: u64 },
}
impl AppError {
    /// Classifies an error message as a GPU out-of-memory failure.
    ///
    /// The check is a case-sensitive substring search: the message counts as
    /// OOM as soon as it contains any one of the markers listed in the body.
    /// The markers are the fragments whisper.cpp / GGML are expected to print
    /// when a CUDA memory allocation fails.
    ///
    /// Returns `false` for the empty string and for any message that contains
    /// none of the markers.
    pub fn is_oom(msg: &str) -> bool {
        // Kept as a table so adding a new marker is a one-line change.
        const OOM_MARKERS: [&str; 4] = [
            "cudaMalloc failed",
            "out of memory",
            "CUDA error: out of memory",
            "alloc_buffer",
        ];

        OOM_MARKERS.iter().any(|&marker| msg.contains(marker))
    }
}
impl IntoResponse for AppError {
fn into_response(self) -> Response {
let (status, message) = match &self {
AppError::NotFound(m) => (StatusCode::NOT_FOUND, m.clone()),
AppError::BadRequest(m) => (StatusCode::BAD_REQUEST, m.clone()),
AppError::Conflict(m) => (StatusCode::CONFLICT, m.clone()),
AppError::Internal(m) => (StatusCode::INTERNAL_SERVER_ERROR, m.clone()),
};
tracing::error!(status = status.as_u16(), error = %message);
(status, Json(json!({ "error": message }))).into_response()
match self {
AppError::NotFound(m) => {
(StatusCode::NOT_FOUND, Json(json!({ "error": m }))).into_response()
}
AppError::BadRequest(m) => {
(StatusCode::BAD_REQUEST, Json(json!({ "error": m }))).into_response()
}
AppError::Conflict(m) => {
(StatusCode::CONFLICT, Json(json!({ "error": m }))).into_response()
}
AppError::Internal(m) => {
tracing::error!(error = %m, "internal error");
(StatusCode::INTERNAL_SERVER_ERROR, Json(json!({ "error": m }))).into_response()
}
AppError::OutOfMemory(m) => {
tracing::warn!(error = %m, "GPU out of memory during model load");
(StatusCode::SERVICE_UNAVAILABLE, Json(json!({ "error": m }))).into_response()
}
AppError::ModelNotReady { state, retry_after_secs } => {
let body = Json(json!({
"error": "model_not_ready",
"state": state,
"retry_after_secs": retry_after_secs,
}));
let mut resp = (StatusCode::SERVICE_UNAVAILABLE, body).into_response();
resp.headers_mut().insert(
header::RETRY_AFTER,
HeaderValue::from_str(&retry_after_secs.to_string())
.unwrap_or(HeaderValue::from_static("30")),
);
resp
}
}
}
}
// ── Unit tests ───────────────────────────────────────────────────────────────
#[cfg(test)]
mod tests {
    use super::*;
    use axum::body::to_bytes;

    /// Builds the HTTP response produced by a `ModelNotReady` error with the
    /// given state tag and retry hint.
    fn not_ready_response(state: &str, retry_after_secs: u64) -> Response {
        AppError::ModelNotReady {
            state: state.into(),
            retry_after_secs,
        }
        .into_response()
    }

    // ── is_oom classification ───────────────────────────────────────────────

    #[test]
    fn test_is_oom_cuda_malloc() {
        let msg = "cudaMalloc failed: out of memory";
        assert!(AppError::is_oom(msg));
    }

    #[test]
    fn test_is_oom_alloc_buffer() {
        // Exact message from ggml_backend_cuda_buffer_type_alloc_buffer
        let msg = "ggml_backend_cuda_buffer_type_alloc_buffer: allocating 2951.01 MiB on device 0: cudaMalloc failed: out of memory";
        assert!(AppError::is_oom(msg));
    }

    #[test]
    fn test_is_oom_generic_out_of_memory() {
        let msg = "CUDA error: out of memory";
        assert!(AppError::is_oom(msg));
    }

    #[test]
    fn test_is_oom_other_error() {
        // None of these messages contains an OOM marker.
        let benign = ["failed to open model file", "invalid model format", ""];
        assert!(!benign.iter().any(|&msg| AppError::is_oom(msg)));
    }

    // ── ModelNotReady → HTTP response ───────────────────────────────────────

    #[tokio::test]
    async fn test_model_not_ready_response_has_retry_after_header() {
        let resp = not_ready_response("loading", 10);
        assert_eq!(resp.status(), StatusCode::SERVICE_UNAVAILABLE);

        let retry_after = resp
            .headers()
            .get(header::RETRY_AFTER)
            .expect("Retry-After header missing");
        assert_eq!(retry_after, "10");
    }

    #[tokio::test]
    async fn test_model_not_ready_response_body() {
        let resp = not_ready_response("unloaded", 30);

        let raw = to_bytes(resp.into_body(), usize::MAX).await.unwrap();
        let parsed: serde_json::Value = serde_json::from_slice(&raw).unwrap();

        assert_eq!(parsed["error"], "model_not_ready");
        assert_eq!(parsed["state"], "unloaded");
        assert_eq!(parsed["retry_after_secs"], 30);
    }

    #[tokio::test]
    async fn test_model_not_ready_loading_retry_after_10() {
        let resp = not_ready_response("loading", 10);
        let retry_after = resp.headers().get(header::RETRY_AFTER).unwrap();
        assert_eq!(retry_after, "10");
    }

    #[tokio::test]
    async fn test_model_not_ready_unloaded_retry_after_30() {
        let resp = not_ready_response("unloaded", 30);
        let retry_after = resp.headers().get(header::RETRY_AFTER).unwrap();
        assert_eq!(retry_after, "30");
    }
}