feat: GPU-accelerated Whisper API for RTX 2080 (sm_75)

- Pure Rust: Axum 0.7 + whisper-rs 0.13 (CUDA FFI)
- Async job queue with SSE progress streaming
- Webhook delivery with 5x exponential backoff
- Disk-persisted job state (survives restarts)
- Anti-hallucination params: no_speech_thold, entropy_thold, suppress_blank
- CUDA sm_75 flags: GGML_CUDA_FORCE_MMQ, GGML_CUDA_GRAPHS, GGML_CUDA_FA_ALL_QUANTS
- Configurable via env: CUDA_DEVICE, WHISPER_MODEL_PATH, PORT, DATA_DIR
- Gitea Actions CI: build + push to git.sal.giize.com registry
- Multi-stage Dockerfile with customizable CUDA_VERSION ARG

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-05-05 22:47:24 +02:00
commit 16cb6ca661
18 changed files with 1898 additions and 0 deletions
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -0,0 +1,52 @@
+services:
+  whisper:
+    image: whisper-rtx2080:latest
+    build:
+      context: .
+      dockerfile: Dockerfile
+      args:
+        # Base-image selection; CUDNN_TAG depends on the CUDA major version:
+        #   CUDA 12.x -> CUDNN_TAG "cudnn"
+        #   CUDA 11.x -> CUDNN_TAG "cudnn8"
+        CUDA_VERSION: "12.4.1"
+        CUDNN_TAG: "cudnn"
+        UBUNTU_VERSION: "22.04"
+
+    # Reserve one NVIDIA GPU (host needs the NVIDIA Container Toolkit)
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
+
+    ports:
+      - "8080:8080"
+
+    volumes:
+      # Job-state volume: contents outlive container restarts
+      - whisper-data:/data
+      # Model-cache volume: large-v3 is not re-downloaded on each start
+      - whisper-models:/models
+
+    environment:
+      DATA_DIR: "/data"
+      PORT: "8080"
+      RUST_LOG: "info"
+      WHISPER_MODEL: "large-v3"
+      WHISPER_MODEL_PATH: "/models/ggml-large-v3.bin"
+
+    restart: unless-stopped
+
+    healthcheck:
+      test: ["CMD", "curl", "-sf", "http://localhost:8080/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      # Grace period so the server can load the model on first start
+      start_period: 90s
+
+volumes:
+  whisper-data: {}
+  whisper-models: {}