# whisper-rtx2080/Dockerfile

# ============================================================
# whisper-rtx2080 — Multi-stage Dockerfile
# Optimised for NVIDIA RTX 2080 (Turing, sm_75, 8 GB VRAM)
# ============================================================
#
# Build-arg reference:
#
#   CUDA_VERSION      CUDA toolkit version      default: 12.4.1
#   CUDNN_TAG         cuDNN tag suffix          default: cudnn
#                     (e.g. CUDA 12.4.x → "cudnn";  CUDA 12.1.x / 11.8.x → "cudnn8" —
#                      check the nvidia/cuda tag list for the exact suffix)
#   UBUNTU_VERSION    Ubuntu base version       default: 22.04
#
# Examples:
#   docker build -t whisper-rtx2080 .
#   docker build --build-arg CUDA_VERSION=12.1.0 --build-arg CUDNN_TAG=cudnn8 -t whisper-rtx2080:cu121 .
#   docker build --build-arg CUDA_VERSION=11.8.0 --build-arg CUDNN_TAG=cudnn8 --build-arg UBUNTU_VERSION=20.04 -t whisper-rtx2080:cu118 .

# Global build arguments. They are declared before the first FROM, so they are
# available for substitution in the FROM lines of both stages.
ARG UBUNTU_VERSION=22.04
ARG CUDNN_TAG=cudnn
ARG CUDA_VERSION=12.4.1

# ╔══════════════════════════════════════════════════════════╗
# ║  STAGE 1 — builder                                       ║
# ║  Full CUDA devel image + Rust toolchain                  ║
# ║  Compiles whisper.cpp (CUDA kernels) + Rust binary       ║
# ╚══════════════════════════════════════════════════════════╝
FROM nvidia/cuda:${CUDA_VERSION}-${CUDNN_TAG}-devel-ubuntu${UBUNTU_VERSION} AS builder

# Re-declared inside the stage so the value is in scope after FROM.
ARG CUDA_VERSION=12.4.1

# Keep apt from prompting while packages are installed in this stage.
ENV DEBIAN_FRONTEND=noninteractive

# ── System build dependencies ────────────────────────────────────────────────
# C/C++ toolchain, cmake, git, curl, pkg-config and clang/libclang, plus the
# ffmpeg development headers (libavcodec / libavformat), which are not strictly
# needed at build time but avoid surprises.
# The apt package lists are removed in the same layer to keep it small.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        clang \
        cmake \
        curl \
        git \
        libavcodec-dev \
        libavformat-dev \
        libclang-dev \
        pkg-config \
    && rm -rf /var/lib/apt/lists/*

# ── Rust toolchain ───────────────────────────────────────────────────────────
# RUST_TOOLCHAIN selects the toolchain rustup installs (default: stable).
# Pass a fixed release (e.g. --build-arg RUST_TOOLCHAIN=1.79.0) for
# reproducible builds.
ARG RUST_TOOLCHAIN=stable

# Install rustup and cargo system-wide and put cargo's bin directory on PATH.
ENV RUSTUP_HOME=/usr/local/rustup \
    CARGO_HOME=/usr/local/cargo \
    PATH=/usr/local/cargo/bin:$PATH

# Run the RUN steps that follow in this stage under bash with pipefail, so a
# failed download aborts the build instead of piping an empty script into sh.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# Minimal profile keeps the toolchain small; rustfmt is added explicitly on top.
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs \
    | sh -s -- -y --default-toolchain "${RUST_TOOLCHAIN}" --profile minimal \
    && rustup component add rustfmt

# ── whisper.cpp source (whisper-rs pins a specific commit via its build.rs) ──
# No explicit clone step is needed here: whisper-rs downloads and builds
# whisper.cpp automatically via its build script.
# We only need to ensure the CUDA flags are forwarded through env vars.

# ── CUDA architecture flags for RTX 2080 (sm_75) ────────────────────────────
# These are meant to be picked up by whisper-rs's build.rs when it invokes
# cmake internally.
#
# LLVM_VERSION selects the /usr/lib/llvm-<N>/lib directory that LIBCLANG_PATH
# points at. The default (14) keeps the previous hard-coded path, which matches
# the default UBUNTU_VERSION=22.04. When building on another Ubuntu release,
# pass the major version of the clang that release installs
# (e.g. --build-arg LLVM_VERSION=10 for Ubuntu 20.04 — verify with
# `ls /usr/lib | grep llvm` inside the base image).
ARG LLVM_VERSION=14

ENV GGML_CUDA=ON \
    CMAKE_CUDA_ARCHITECTURES=75 \
    GGML_CUDA_FORCE_MMQ=ON \
    GGML_CUDA_GRAPHS=ON \
    GGML_CUDA_FA_ALL_QUANTS=ON \
    GGML_CUDA_F16=ON \
    # Tell whisper-rs / cmake where nvcc lives
    CUDA_PATH=/usr/local/cuda \
    LIBCLANG_PATH=/usr/lib/llvm-${LLVM_VERSION}/lib

# ── Copy source and build ────────────────────────────────────────────────────
WORKDIR /build

# Cargo.lock is copied when it exists in the build context (the trailing *
# makes it optional), so the build resolves the dependency versions recorded
# in the lockfile instead of whatever is newest at build time.
COPY Cargo.toml Cargo.lock* ./
COPY src/ ./src/

# Build in release mode — LTO + single codegen unit (see Cargo.toml profile).
# The cargo registry and target/ are BuildKit cache mounts: they persist
# between builds but are not part of the image, so the binary is copied out
# to /usr/local/bin within the same RUN.
RUN --mount=type=cache,target=/usr/local/cargo/registry \
    --mount=type=cache,target=/build/target \
    cargo build --release \
    && cp target/release/whisper-server /usr/local/bin/whisper-server


# ╔══════════════════════════════════════════════════════════╗
# ║  STAGE 2 — runtime                                       ║
# ║  Minimal CUDA runtime image — no build tools             ║
# ╚══════════════════════════════════════════════════════════╝
FROM nvidia/cuda:${CUDA_VERSION}-${CUDNN_TAG}-runtime-ubuntu${UBUNTU_VERSION} AS runtime

ARG CUDA_VERSION=12.4.1

# Build-time only: an ARG keeps apt non-interactive during the build without
# leaving DEBIAN_FRONTEND set in the environment of running containers.
ARG DEBIAN_FRONTEND=noninteractive

# ── Runtime dependencies only ────────────────────────────────────────────────
# curl is required by the HEALTHCHECK of this stage, which probes the server's
# /health endpoint; without it the probe command cannot run and the container
# is reported unhealthy.
RUN apt-get update && apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        ffmpeg \
    && rm -rf /var/lib/apt/lists/*

# ── NVIDIA container runtime ─────────────────────────────────────────────────
# Request all GPUs with the compute + utility driver capabilities, and
# enumerate CUDA devices in PCI bus order.
ENV CUDA_DEVICE_ORDER=PCI_BUS_ID \
    NVIDIA_DRIVER_CAPABILITIES=compute,utility \
    NVIDIA_VISIBLE_DEVICES=all

# ── GGML VRAM setting for RTX 2080 ───────────────────────────────────────────
# Intended as memory tuning for the 8 GB card.
# NOTE(review): GGML_CUDA_NO_VMM looks like a ggml build option rather than a
# runtime switch — confirm that setting it to 0 here has any effect.
ENV GGML_CUDA_NO_VMM=0

# ── Application defaults (all overridable at runtime) ────────────────────────
# e.g. `docker run -e PORT=9000 -e RUST_LOG=debug ...`
ENV DATA_DIR=/data \
    PORT=8080 \
    RUST_LOG=info \
    WHISPER_MODEL=large-v3 \
    WHISPER_MODEL_PATH=/models/ggml-large-v3.bin

# ── Binary ───────────────────────────────────────────────────────────────────
# --chmod sets the executable bit while copying, so no separate `RUN chmod`
# layer is needed. Requires BuildKit, like the cache mounts in the builder stage.
COPY --from=builder --chmod=0755 /usr/local/bin/whisper-server /app/whisper-server

# ── Volumes & ports ──────────────────────────────────────────────────────────
# The directories are created before VOLUME so the mount points exist in the
# image. EXPOSE documents the default PORT (8080); it does not publish it.
RUN mkdir -p /data /models
VOLUME ["/data", "/models"]
EXPOSE 8080

# Liveness probe: GET /health on localhost, using the port from $PORT.
# The command is in shell form, so ${PORT} is expanded by the container's shell
# when the probe runs and a runtime override of PORT is honoured.
# Timing: 60 s grace period after start, then one probe every 30 s with a 10 s
# timeout; three consecutive failures mark the container unhealthy.
HEALTHCHECK --start-period=60s --interval=30s --timeout=10s --retries=3 \
    CMD curl --silent --fail http://localhost:${PORT}/health || exit 1

# Exec form: the server is started directly, without a wrapping shell.
# NOTE(review): no USER instruction is visible in this stage, so the server
# presumably runs as root — consider a dedicated unprivileged user.
ENTRYPOINT ["/app/whisper-server"]