Normalize rolling partial-hypothesis chains before final job persistence so downstream clients receive stable transcript segments instead of echoed continuations. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
177 lines
5.8 KiB
Rust
177 lines
5.8 KiB
Rust
use axum::{
|
|
http::{header, HeaderValue, StatusCode},
|
|
response::{IntoResponse, Response},
|
|
Json,
|
|
};
|
|
use serde_json::json;
|
|
use thiserror::Error;
|
|
|
|
/// Convenience alias for results whose error type is [`AppError`].
pub type Result<T> = std::result::Result<T, AppError>;
|
|
|
|
#[derive(Debug, Error)]
|
|
pub enum AppError {
|
|
#[error("not found: {0}")]
|
|
NotFound(String),
|
|
|
|
#[error("bad request: {0}")]
|
|
BadRequest(String),
|
|
|
|
#[error("conflict: {0}")]
|
|
Conflict(String),
|
|
|
|
#[error("internal error: {0}")]
|
|
Internal(String),
|
|
|
|
/// Returned when `whisper_init_state` or `cudaMalloc` fails due to
|
|
/// insufficient VRAM. The worker uses this to distinguish a recoverable
|
|
/// VRAM-pressure failure from a hard internal error.
|
|
#[error("out of GPU memory: {0}")]
|
|
OutOfMemory(String),
|
|
|
|
/// Returned when a job is submitted but the model is not yet loaded.
|
|
/// Carries the current state tag and recommended Retry-After seconds.
|
|
#[error("model not ready: {state}")]
|
|
ModelNotReady {
|
|
state: String,
|
|
retry_after_secs: u64,
|
|
},
|
|
}
|
|
|
|
impl AppError {
|
|
/// Returns true if the error string contains patterns emitted by
|
|
/// whisper.cpp / GGML when a CUDA memory allocation fails.
|
|
pub fn is_oom(msg: &str) -> bool {
|
|
msg.contains("cudaMalloc failed")
|
|
|| msg.contains("out of memory")
|
|
|| msg.contains("CUDA error: out of memory")
|
|
|| msg.contains("alloc_buffer")
|
|
}
|
|
}
|
|
|
|
impl IntoResponse for AppError {
|
|
fn into_response(self) -> Response {
|
|
match self {
|
|
AppError::NotFound(m) => {
|
|
(StatusCode::NOT_FOUND, Json(json!({ "error": m }))).into_response()
|
|
}
|
|
AppError::BadRequest(m) => {
|
|
(StatusCode::BAD_REQUEST, Json(json!({ "error": m }))).into_response()
|
|
}
|
|
AppError::Conflict(m) => {
|
|
(StatusCode::CONFLICT, Json(json!({ "error": m }))).into_response()
|
|
}
|
|
AppError::Internal(m) => {
|
|
tracing::error!(error = %m, "internal error");
|
|
(
|
|
StatusCode::INTERNAL_SERVER_ERROR,
|
|
Json(json!({ "error": m })),
|
|
)
|
|
.into_response()
|
|
}
|
|
AppError::OutOfMemory(m) => {
|
|
tracing::warn!(error = %m, "GPU out of memory during model load");
|
|
(StatusCode::SERVICE_UNAVAILABLE, Json(json!({ "error": m }))).into_response()
|
|
}
|
|
AppError::ModelNotReady {
|
|
state,
|
|
retry_after_secs,
|
|
} => {
|
|
let body = Json(json!({
|
|
"error": "model_not_ready",
|
|
"state": state,
|
|
"retry_after_secs": retry_after_secs,
|
|
}));
|
|
let mut resp = (StatusCode::SERVICE_UNAVAILABLE, body).into_response();
|
|
resp.headers_mut().insert(
|
|
header::RETRY_AFTER,
|
|
HeaderValue::from_str(&retry_after_secs.to_string())
|
|
.unwrap_or(HeaderValue::from_static("30")),
|
|
);
|
|
resp
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// ── Unit tests ───────────────────────────────────────────────────────────────
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;
    use axum::body::to_bytes;

    /// Builds the `ModelNotReady` variant used by the response tests below.
    fn model_not_ready(state: &str, retry_after_secs: u64) -> AppError {
        AppError::ModelNotReady {
            state: state.into(),
            retry_after_secs,
        }
    }

    #[test]
    fn test_is_oom_cuda_malloc() {
        assert!(AppError::is_oom("cudaMalloc failed: out of memory"));
    }

    #[test]
    fn test_is_oom_alloc_buffer() {
        // Exact message from ggml_backend_cuda_buffer_type_alloc_buffer
        assert!(AppError::is_oom(
            "ggml_backend_cuda_buffer_type_alloc_buffer: allocating 2951.01 MiB on device 0: cudaMalloc failed: out of memory"
        ));
    }

    #[test]
    fn test_is_oom_generic_out_of_memory() {
        assert!(AppError::is_oom("CUDA error: out of memory"));
    }

    #[test]
    fn test_is_oom_other_error() {
        assert!(!AppError::is_oom("failed to open model file"));
        assert!(!AppError::is_oom("invalid model format"));
        assert!(!AppError::is_oom(""));
    }

    // Building a response is synchronous, so only the test that reads the
    // body needs an async runtime.

    #[test]
    fn test_plain_variants_map_to_expected_status() {
        let cases = [
            (AppError::NotFound("x".into()), StatusCode::NOT_FOUND),
            (AppError::BadRequest("x".into()), StatusCode::BAD_REQUEST),
            (AppError::Conflict("x".into()), StatusCode::CONFLICT),
            (
                AppError::Internal("x".into()),
                StatusCode::INTERNAL_SERVER_ERROR,
            ),
            (
                AppError::OutOfMemory("x".into()),
                StatusCode::SERVICE_UNAVAILABLE,
            ),
        ];
        for (err, expected) in cases {
            assert_eq!(err.into_response().status(), expected);
        }
    }

    #[test]
    fn test_model_not_ready_response_has_retry_after_header() {
        let resp = model_not_ready("loading", 10).into_response();
        assert_eq!(resp.status(), StatusCode::SERVICE_UNAVAILABLE);
        let retry_after = resp
            .headers()
            .get(header::RETRY_AFTER)
            .expect("Retry-After header missing");
        assert_eq!(retry_after, "10");
    }

    #[tokio::test]
    async fn test_model_not_ready_response_body() {
        let resp = model_not_ready("unloaded", 30).into_response();
        let bytes = to_bytes(resp.into_body(), usize::MAX).await.unwrap();
        let v: serde_json::Value = serde_json::from_slice(&bytes).unwrap();
        assert_eq!(v["error"], "model_not_ready");
        assert_eq!(v["state"], "unloaded");
        assert_eq!(v["retry_after_secs"], 30);
    }

    #[test]
    fn test_model_not_ready_loading_retry_after_10() {
        let resp = model_not_ready("loading", 10).into_response();
        assert_eq!(resp.headers().get(header::RETRY_AFTER).unwrap(), "10");
    }

    #[test]
    fn test_model_not_ready_unloaded_retry_after_30() {
        let resp = model_not_ready("unloaded", 30).into_response();
        assert_eq!(resp.headers().get(header::RETRY_AFTER).unwrap(), "30");
    }
}
|