---
# Docker Compose definition for a single service, `whisper`, built from the
# Dockerfile in this directory. It reserves one NVIDIA GPU, publishes port
# 8080, and keeps job state and the model file on two named volumes.

services:
  whisper:
    image: whisper-rtx2080:latest
    build:
      context: .
      dockerfile: Dockerfile
      args:
        # ── CUDA / base image ─────────────────────────────────────
        # CUDA 12.x: CUDNN_TAG = "cudnn"
        # CUDA 11.x: CUDNN_TAG = "cudnn8"
        # Values are quoted so version strings are never re-typed as numbers.
        CUDA_VERSION: "12.4.1"
        CUDNN_TAG: "cudnn"
        UBUNTU_VERSION: "22.04"

    # ── GPU access (requires NVIDIA Container Toolkit on host) ───
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]

    # host:container. The container side must match PORT below and the
    # healthcheck URL; change all three together.
    ports:
      - "8080:8080"

    volumes:
      # Job state — survives container restarts
      - whisper-data:/data
      # Model cache — avoids re-downloading large-v3 on every start
      - whisper-models:/models

    environment:
      # NOTE(review): presumably the port the server listens on inside the
      # container — confirm against the application.
      PORT: "8080"
      RUST_LOG: "info"
      # Same path as the whisper-data volume mount above.
      DATA_DIR: "/data"
      WHISPER_MODEL: "large-v3"
      # Lives under the whisper-models volume mount (/models).
      WHISPER_MODEL_PATH: "/models/ggml-large-v3.bin"

    restart: unless-stopped

    healthcheck:
      # NOTE(review): runs curl inside the container — confirm the image
      # actually ships curl, otherwise the service never becomes healthy.
      test: ["CMD", "curl", "-sf", "http://localhost:8080/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      # Give the server time to load the model on first start
      start_period: 90s

# Named volumes referenced by the whisper service; declared with no options.
volumes:
  whisper-data:
  whisper-models: