feat: GPU-accelerated Whisper API for RTX 2080 (sm_75)

- Pure Rust: Axum 0.7 + whisper-rs 0.13 (CUDA FFI)
- Async job queue with SSE progress streaming
- Webhook delivery with 5x exponential backoff
- Disk-persisted job state (survives restarts)
- Anti-hallucination params: no_speech_thold, entropy_thold, suppress_blank
- CUDA sm_75 flags: GGML_CUDA_FORCE_MMQ, GGML_CUDA_GRAPHS, GGML_CUDA_FA_ALL_QUANTS
- Configurable via env: CUDA_DEVICE, WHISPER_MODEL_PATH, PORT, DATA_DIR
- Gitea Actions CI: build + push to git.sal.giize.com registry
- Multi-stage Dockerfile with customizable CUDA_VERSION ARG

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-05-05 22:47:24 +02:00
commit 16cb6ca661
18 changed files with 1898 additions and 0 deletions
--- a/129
+++ b/129
@@ -0,0 +1,129 @@
+# ============================================================
+# whisper-rtx2080 — Multi-stage Dockerfile
+# Optimised for NVIDIA RTX 2080 (Turing, sm_75, 8 GB VRAM)
+# ============================================================
+#
+# Build-arg reference:
+#
+#   CUDA_VERSION      CUDA toolkit version      default: 12.4.1
+#   CUDNN_TAG         cuDNN tag suffix          default: cudnn
+#                     (must match an existing nvidia/cuda image tag, e.g.
+#                      12.4.1 → "cudnn"; 12.1.0 and 11.8.0 → "cudnn8",
+#                      as used in the examples below)
+#   UBUNTU_VERSION    Ubuntu base version       default: 22.04
+#
+# Examples:
+#   docker build -t whisper-rtx2080 .
+#   docker build --build-arg CUDA_VERSION=12.1.0 --build-arg CUDNN_TAG=cudnn8 -t whisper-rtx2080:cu121 .
+#   docker build --build-arg CUDA_VERSION=11.8.0 --build-arg CUDNN_TAG=cudnn8 --build-arg UBUNTU_VERSION=20.04 -t whisper-rtx2080:cu118 .
+
+# Global build arguments. Declared before the first FROM, they are only
+# visible in the FROM lines below, where they select the nvidia/cuda image tag
+# (<CUDA_VERSION>-<CUDNN_TAG>-{devel,runtime}-ubuntu<UBUNTU_VERSION>).
+# Override any of them with `docker build --build-arg NAME=value`.
+ARG CUDA_VERSION=12.4.1
+ARG CUDNN_TAG=cudnn
+ARG UBUNTU_VERSION=22.04
+
+# ╔══════════════════════════════════════════════════════════╗
+# ║  STAGE 1 — builder                                       ║
+# ║  Full CUDA devel image + Rust toolchain                  ║
+# ║  Compiles whisper.cpp (CUDA kernels) + Rust binary       ║
+# ╚══════════════════════════════════════════════════════════╝
+FROM nvidia/cuda:${CUDA_VERSION}-${CUDNN_TAG}-devel-ubuntu${UBUNTU_VERSION} AS builder
+
+# Keep apt from prompting while packages are installed in this stage.
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Re-declared inside the stage so the value is available to later instructions.
+ARG CUDA_VERSION=12.4.1
+
+# ── System build dependencies ────────────────────────────────────────────────
+# Packages are listed alphabetically. libavcodec-dev / libavformat-dev are the
+# ffmpeg headers (not strictly needed at build time, but avoids surprises).
+# The apt package index is removed in the same layer to keep the layer small.
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+        build-essential \
+        ca-certificates \
+        clang \
+        cmake \
+        curl \
+        git \
+        libavcodec-dev \
+        libavformat-dev \
+        libclang-dev \
+        pkg-config \
+    && rm -rf /var/lib/apt/lists/*
+
+# ── Rust toolchain ───────────────────────────────────────────────────────────
+# rustup and cargo are installed under /usr/local, and cargo's bin directory
+# is put at the front of PATH so `cargo` / `rustup` resolve in later steps.
+ENV RUSTUP_HOME=/usr/local/rustup
+ENV CARGO_HOME=/usr/local/cargo
+ENV PATH=/usr/local/cargo/bin:$PATH
+
+# Fetch the rustup installer over HTTPS (TLS 1.2 or newer), install the stable
+# toolchain with the minimal profile, then add the rustfmt component.
+RUN curl --proto '=https' --tlsv1.2 --silent --show-error --fail https://sh.rustup.rs \
+        | sh -s -- -y --profile minimal --default-toolchain stable \
+    && rustup component add rustfmt
+
+# ── whisper.cpp ──────────────────────────────────────────────────────────────
+# No manual clone or build step for whisper.cpp is performed in this file:
+# it is built by whisper-rs's build script as part of `cargo build`.
+# We only need to ensure the CUDA flags are forwarded through env vars.
+
+# ── CUDA architecture flags for RTX 2080 (sm_75) ────────────────────────────
+# Intended to be picked up by whisper-rs's build.rs when it invokes cmake.
+# NOTE(review): confirm that the whisper-rs version pinned in Cargo.toml
+# actually forwards GGML_* environment variables to cmake.
+#
+# LIBCLANG_PATH is deliberately NOT set here. The previous hard-coded value
+# (/usr/lib/llvm-14/lib) only exists on Ubuntu 22.04; when the variable is set,
+# libclang is searched for in that directory only, so a build with another
+# UBUNTU_VERSION (e.g. 20.04) could not find it. Leaving it unset lets the
+# default search locate the libclang installed by the `libclang-dev` package.
+ENV GGML_CUDA=ON \
+    CMAKE_CUDA_ARCHITECTURES=75 \
+    GGML_CUDA_FORCE_MMQ=ON \
+    GGML_CUDA_GRAPHS=ON \
+    GGML_CUDA_FA_ALL_QUANTS=ON \
+    GGML_CUDA_F16=ON \
+    # Tell whisper-rs / cmake where nvcc lives
+    CUDA_PATH=/usr/local/cuda
+
+# ── Copy source and build ────────────────────────────────────────────────────
+WORKDIR /build
+# `Cargo.lock*` is a wildcard on purpose: the lockfile is copied when it exists
+# in the build context (so dependency versions stay the same across builds)
+# and the COPY still succeeds when it does not.
+COPY Cargo.toml Cargo.lock* ./
+COPY src/       ./src/
+
+# Build in release mode — LTO + single codegen unit (see Cargo.toml profile).
+# The cargo registry and target/ directories are BuildKit cache mounts: they
+# speed up rebuilds but exist only for the duration of this RUN, so the
+# finished binary is copied out of target/ within the same instruction.
+RUN --mount=type=cache,target=/usr/local/cargo/registry \
+    --mount=type=cache,target=/build/target \
+    cargo build --release \
+    && cp target/release/whisper-server /usr/local/bin/whisper-server
+
+
+# ╔══════════════════════════════════════════════════════════╗
+# ║  STAGE 2 — runtime                                       ║
+# ║  Minimal CUDA runtime image — no build tools             ║
+# ╚══════════════════════════════════════════════════════════╝
+FROM nvidia/cuda:${CUDA_VERSION}-${CUDNN_TAG}-runtime-ubuntu${UBUNTU_VERSION}
+
+ARG CUDA_VERSION=12.4.1
+
+# Build-time only: keeps apt non-interactive during the RUN below without
+# leaving DEBIAN_FRONTEND set in the environment of the running container.
+ARG DEBIAN_FRONTEND=noninteractive
+
+# ── Runtime dependencies only ────────────────────────────────────────────────
+# curl is required by the HEALTHCHECK defined at the end of this stage; without
+# it the probe command cannot run and the container is reported as unhealthy.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        ca-certificates \
+        curl \
+        ffmpeg \
+    && rm -rf /var/lib/apt/lists/*
+
+# ── NVIDIA container runtime ─────────────────────────────────────────────────
+# Expose every GPU with the compute + utility driver capabilities, and make
+# CUDA enumerate devices in PCI bus order.
+ENV CUDA_DEVICE_ORDER=PCI_BUS_ID \
+    NVIDIA_DRIVER_CAPABILITIES=compute,utility \
+    NVIDIA_VISIBLE_DEVICES=all
+
+# ── GGML CUDA memory setting ─────────────────────────────────────────────────
+# NOTE(review): GGML_CUDA_NO_VMM is known upstream as a ggml build-time CMake
+# option; confirm that anything reads it from the environment at runtime.
+# The value is kept unchanged.
+ENV GGML_CUDA_NO_VMM=0
+
+# ── Application defaults (all overridable at runtime) ────────────────────────
+ENV DATA_DIR=/data \
+    PORT=8080 \
+    RUST_LOG=info \
+    WHISPER_MODEL=large-v3 \
+    WHISPER_MODEL_PATH=/models/ggml-large-v3.bin
+
+# ── Binary ───────────────────────────────────────────────────────────────────
+# --chmod sets the executable mode during the copy itself, which removes the
+# need for a separate `RUN chmod +x` layer.
+COPY --from=builder --chmod=755 /usr/local/bin/whisper-server /app/whisper-server
+
+# ── Volumes & ports ──────────────────────────────────────────────────────────
+# The directories are created before VOLUME is declared so they exist in the
+# image; /data and /models match the DATA_DIR and WHISPER_MODEL_PATH defaults.
+RUN mkdir -p /data /models
+VOLUME ["/data", "/models"]
+# EXPOSE is documentation only and matches the default PORT.
+EXPOSE 8080
+
+# Shell-form CMD so ${PORT} is expanded when the probe runs inside the
+# container. NOTE: the probe needs `curl` to be present in the runtime image.
+HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
+    CMD curl -sf "http://localhost:${PORT}/health" || exit 1
+
+# Exec form: the server runs as PID 1 and receives stop signals directly.
+ENTRYPOINT ["/app/whisper-server"]