feat: dynamic model loading/unloading with GPU polling

- Model starts unloaded (lazy); loads on first job or POST /model/load
- Auto-unloads after IDLE_TIMEOUT_SECS (default 300) of inactivity
- POST /model/unload for immediate manual release
- GPU-busy detection: on VRAM OOM, enters WaitingForGpu and retries every GPU_POLL_INTERVAL_SECS (default 30) indefinitely
- POST /jobs when unloaded → 503 + Retry-After header, triggers load
- AppError::OutOfMemory and AppError::ModelNotReady variants
- WorkerCmd channel (SyncSender<WorkerCmd>) replaces bare tx_req channel
- Idle timer via recv_timeout(1s) tick inside OS thread (no extra thread)
- Model lifecycle events broadcast via tokio broadcast channel (SSE + webhooks)
- webhook_registry: all clients that ever submitted a webhook_url receive model_ready and model_unloaded webhooks
- GPU warmup retained on every (re)load

New routes:
  GET /model/status — current state + VRAM stats
  POST /model/load — trigger load (idempotent)
  POST /model/unload — immediate unload
  GET /model/events — SSE stream of model lifecycle events

New env vars:
  IDLE_TIMEOUT_SECS (default 300)
  GPU_POLL_INTERVAL_SECS (default 30)

Tests:
  tests/test_model_lifecycle.sh — 18 integration tests (full state machine, SSE events, webhooks, concurrency, unload-during-load)
  tests/test_idle_timeout.sh — 5 tests with short IDLE_TIMEOUT_SECS=5
  test_all.sh updated: loads model before job submission, asserts model_state in /health, adds POST /model/unload at end

Docs:
  docs/USAGE.md: model lifecycle section, new env vars, 503 retry pattern, updated /health response shape

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-05-08 17:57:20 +02:00
parent 78c6fab81b
commit b191fbe200
13 changed files with 2053 additions and 148 deletions
--- a/src/error.rs
+++ b/src/error.rs
@@ -1,6 +1,6 @@
 use thiserror::Error;
 use axum::{
-    http::StatusCode,
+    http::{StatusCode, HeaderValue, header},
    response::{IntoResponse, Response},
    Json,
 };
@@ -21,19 +21,138 @@ pub enum AppError {

    #[error("internal error: {0}")]
    Internal(String),
+
+    /// Returned when `whisper_init_state` or `cudaMalloc` fails due to
+    /// insufficient VRAM. The worker uses this to distinguish a recoverable
+    /// VRAM-pressure failure from a hard internal error.
+    #[error("out of GPU memory: {0}")]
+    OutOfMemory(String),
+
+    /// Returned when a job is submitted but the model is not yet loaded.
+    /// Carries the current state tag and recommended Retry-After seconds.
+    #[error("model not ready: {state}")]
+    ModelNotReady { state: String, retry_after_secs: u64 },
+}
+
+impl AppError {
+    /// Reports whether `msg` looks like a GPU memory-allocation failure.
+    ///
+    /// Returns true if the error string contains one of the patterns emitted
+    /// by whisper.cpp / GGML when a CUDA memory allocation fails. Matching is
+    /// a plain, case-sensitive substring search; the message is not parsed.
+    ///
+    /// The longer pattern "CUDA error: out of memory" is intentionally not
+    /// listed: any message containing it also contains "out of memory", so a
+    /// separate check for it could never change the result.
+    pub fn is_oom(msg: &str) -> bool {
+        const OOM_MARKERS: [&str; 3] = ["cudaMalloc failed", "out of memory", "alloc_buffer"];
+        OOM_MARKERS.iter().any(|marker| msg.contains(*marker))
+    }
 }

 impl IntoResponse for AppError {
    fn into_response(self) -> Response {
-        let (status, message) = match &self {
-            AppError::NotFound(m)   => (StatusCode::NOT_FOUND,             m.clone()),
-            AppError::BadRequest(m) => (StatusCode::BAD_REQUEST,           m.clone()),
-            AppError::Conflict(m)   => (StatusCode::CONFLICT,              m.clone()),
-            AppError::Internal(m)   => (StatusCode::INTERNAL_SERVER_ERROR, m.clone()),
-        };
-
-        tracing::error!(status = status.as_u16(), error = %message);
-
-        (status, Json(json!({ "error": message }))).into_response()
+        match self {
+            AppError::NotFound(m) => {
+                (StatusCode::NOT_FOUND, Json(json!({ "error": m }))).into_response()
+            }
+            AppError::BadRequest(m) => {
+                (StatusCode::BAD_REQUEST, Json(json!({ "error": m }))).into_response()
+            }
+            AppError::Conflict(m) => {
+                (StatusCode::CONFLICT, Json(json!({ "error": m }))).into_response()
+            }
+            AppError::Internal(m) => {
+                tracing::error!(error = %m, "internal error");
+                (StatusCode::INTERNAL_SERVER_ERROR, Json(json!({ "error": m }))).into_response()
+            }
+            AppError::OutOfMemory(m) => {
+                tracing::warn!(error = %m, "GPU out of memory during model load");
+                (StatusCode::SERVICE_UNAVAILABLE, Json(json!({ "error": m }))).into_response()
+            }
+            AppError::ModelNotReady { state, retry_after_secs } => {
+                let body = Json(json!({
+                    "error":            "model_not_ready",
+                    "state":            state,
+                    "retry_after_secs": retry_after_secs,
+                }));
+                let mut resp = (StatusCode::SERVICE_UNAVAILABLE, body).into_response();
+                resp.headers_mut().insert(
+                    header::RETRY_AFTER,
+                    HeaderValue::from_str(&retry_after_secs.to_string())
+                        .unwrap_or(HeaderValue::from_static("30")),
+                );
+                resp
+            }
+        }
+    }
+}
+
+// ── Unit tests ───────────────────────────────────────────────────────────────
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use axum::body::to_bytes;
+
+    /// Builds a `ModelNotReady` error and renders it into an HTTP response.
+    fn not_ready_response(state: &str, retry_after_secs: u64) -> Response {
+        AppError::ModelNotReady { state: state.to_owned(), retry_after_secs }.into_response()
+    }
+
+    // ── is_oom: messages that must be classified as out-of-memory ──
+
+    #[test]
+    fn test_is_oom_cuda_malloc() {
+        let msg = "cudaMalloc failed: out of memory";
+        assert!(AppError::is_oom(msg));
+    }
+
+    #[test]
+    fn test_is_oom_alloc_buffer() {
+        // Exact message from ggml_backend_cuda_buffer_type_alloc_buffer
+        let msg = "ggml_backend_cuda_buffer_type_alloc_buffer: allocating 2951.01 MiB on device 0: cudaMalloc failed: out of memory";
+        assert!(AppError::is_oom(msg));
+    }
+
+    #[test]
+    fn test_is_oom_generic_out_of_memory() {
+        let msg = "CUDA error: out of memory";
+        assert!(AppError::is_oom(msg));
+    }
+
+    // ── is_oom: messages that must NOT be classified as out-of-memory ──
+
+    #[test]
+    fn test_is_oom_other_error() {
+        let unrelated = ["failed to open model file", "invalid model format", ""];
+        for msg in unrelated.iter() {
+            assert!(!AppError::is_oom(msg));
+        }
+    }
+
+    // ── ModelNotReady: status code, Retry-After header and JSON body ──
+
+    #[tokio::test]
+    async fn test_model_not_ready_response_has_retry_after_header() {
+        let resp = not_ready_response("loading", 10);
+        assert_eq!(resp.status(), StatusCode::SERVICE_UNAVAILABLE);
+        let retry_after = resp
+            .headers()
+            .get(header::RETRY_AFTER)
+            .expect("Retry-After header missing");
+        assert_eq!(retry_after, "10");
+    }
+
+    #[tokio::test]
+    async fn test_model_not_ready_response_body() {
+        let resp = not_ready_response("unloaded", 30);
+        let raw = to_bytes(resp.into_body(), usize::MAX).await.unwrap();
+        let payload: serde_json::Value = serde_json::from_slice(&raw).unwrap();
+        assert_eq!(payload["error"], "model_not_ready");
+        assert_eq!(payload["state"], "unloaded");
+        assert_eq!(payload["retry_after_secs"], 30);
+    }
+
+    #[tokio::test]
+    async fn test_model_not_ready_loading_retry_after_10() {
+        let resp = not_ready_response("loading", 10);
+        let retry_after = resp.headers().get(header::RETRY_AFTER).unwrap();
+        assert_eq!(retry_after, "10");
+    }
+
+    #[tokio::test]
+    async fn test_model_not_ready_unloaded_retry_after_30() {
+        let resp = not_ready_response("unloaded", 30);
+        let retry_after = resp.headers().get(header::RETRY_AFTER).unwrap();
+        assert_eq!(retry_after, "30");
    }
 }