feat: GPU-accelerated Whisper API for RTX 2080 (sm_75)

- Pure Rust: Axum 0.7 + whisper-rs 0.13 (CUDA FFI) - Async job queue with SSE progress streaming - Webhook delivery with 5x exponential backoff - Disk-persisted job state (survives restarts) - Anti-hallucination params: no_speech_thold, entropy_thold, suppress_blank - CUDA sm_75 flags: GGML_CUDA_FORCE_MMQ, GGML_CUDA_GRAPHS, GGML_CUDA_FA_ALL_QUANTS - Configurable via env: CUDA_DEVICE, WHISPER_MODEL_PATH, PORT, DATA_DIR - Gitea Actions CI: build + push to git.sal.giize.com registry - Multi-stage Dockerfile with customizable CUDA_VERSION ARG Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-05-05 22:47:24 +02:00
commit 16cb6ca661
18 changed files with 1898 additions and 0 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -0,0 +1,22 @@
+# Git
+.git
+.gitignore
+
+# Rust build artifacts (never copy into image — uses cache mounts instead)
+target/
+
+# Local dev files
+.env
+.env.*
+*.local
+
+# Editor
+.vscode/
+.idea/
+*.swp
+
+# Docs
+*.md
+
+# macOS
+.DS_Store
--- a/.gitea/workflows/docker-build.yml
+++ b/.gitea/workflows/docker-build.yml
@@ -0,0 +1,69 @@
+name: Build & Push Docker Image
+
+on:
+  push:
+    branches:
+      - main
+    tags:
+      - "v*"
+  pull_request:
+    branches:
+      - main
+
+env:
+  REGISTRY: git.sal.giize.com
+  IMAGE_NAME: mozempk/whisper-rtx2080
+  # Customizable CUDA version (override with repo variable CUDA_VERSION)
+  CUDA_VERSION: ${{ vars.CUDA_VERSION || '12.4.1' }}
+  UBUNTU_VERSION: ${{ vars.UBUNTU_VERSION || '22.04' }}
+
+jobs:
+  build-and-push:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Log in to Gitea Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ secrets.REGISTRY_USERNAME }}
+          password: ${{ secrets.REGISTRY_TOKEN }}
+
+      - name: Extract metadata (tags, labels)
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
+          tags: |
+            # tag with git sha on every push to main
+            type=sha,prefix=sha-,format=short,event=branch
+            # semver tags from git tags: v1.2.3 → 1.2.3, 1.2, 1, latest
+            type=semver,pattern={{version}}
+            type=semver,pattern={{major}}.{{minor}}
+            type=semver,pattern={{major}}
+            # latest on main branch
+            type=raw,value=latest,enable=${{ github.ref == 'refs/heads/main' }}
+            # pr-N on pull requests
+            type=ref,event=pr
+
+      - name: Build and push Docker image
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          file: ./Dockerfile
+          push: ${{ github.event_name != 'pull_request' }}
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+          build-args: |
+            CUDA_VERSION=${{ env.CUDA_VERSION }}
+            UBUNTU_VERSION=${{ env.UBUNTU_VERSION }}
+          # Cache layers in the Gitea registry for faster rebuilds
+          cache-from: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache
+          cache-to: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache,mode=max
+          platforms: linux/amd64
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,23 @@
+# Rust build artifacts
+/target/
+Cargo.lock
+
+# Runtime data — job state, audio uploads, whisper model
+/data/
+*.gguf
+*.ggml
+*.bin
+
+# Logs
+*.log
+/tmp/
+
+# IDE
+.idea/
+.vscode/
+*.swp
+*~
+
+# OS
+.DS_Store
+Thumbs.db
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -0,0 +1,52 @@
+[package]
+name    = "whisper-server"
+version = "0.1.0"
+edition = "2021"
+
+[[bin]]
+name = "whisper-server"
+path = "src/main.rs"
+
+[dependencies]
+# Web framework
+axum            = { version = "0.7", features = ["multipart"] }
+axum-extra      = { version = "0.9", features = ["typed-header"] }
+tokio           = { version = "1",   features = ["full"] }
+tokio-stream    = { version = "0.1", features = ["sync"] }
+tower           = { version = "0.4" }
+tower-http      = { version = "0.5", features = ["cors", "trace", "limit"] }
+
+# Whisper inference
+whisper-rs      = { version = "0.13", features = ["cuda"] }
+
+# Serialisation
+serde           = { version = "1", features = ["derive"] }
+serde_json      = "1"
+
+# OpenAPI / Swagger
+utoipa          = { version = "4", features = ["axum_extras", "uuid"] }
+utoipa-swagger-ui = { version = "7", features = ["axum"] }
+
+# HTTP client (webhooks)
+reqwest         = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] }
+
+# Utilities
+uuid            = { version = "1", features = ["v4", "serde"] }
+tracing         = "0.1"
+tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] }
+anyhow          = "1"
+thiserror       = "1"
+tempfile        = "3"
+num_cpus        = "1"
+chrono          = { version = "0.4", features = ["serde"] }
+tokio-util      = { version = "0.7", features = ["io"] }
+futures         = "0.3"
+async-stream    = "0.3"
+bytes           = "1"
+dashmap         = "6"
+
+[profile.release]
+opt-level     = 3
+lto           = "thin"
+codegen-units = 1
+strip         = "symbols"
--- a/129
+++ b/129
@@ -0,0 +1,129 @@
+# ============================================================
+# whisper-rtx2080 — Multi-stage Dockerfile
+# Optimised for NVIDIA RTX 2080 (Turing, sm_75, 8 GB VRAM)
+# ============================================================
+#
+# Build-arg reference:
+#
+#   CUDA_VERSION      CUDA toolkit version      default: 12.4.1
+#   CUDNN_TAG         cuDNN tag suffix          default: cudnn
+#                     (CUDA 12.x → "cudnn",  CUDA 11.x → "cudnn8")
+#   UBUNTU_VERSION    Ubuntu base version       default: 22.04
+#
+# Examples:
+#   docker build -t whisper-rtx2080 .
+#   docker build --build-arg CUDA_VERSION=12.1.0 --build-arg CUDNN_TAG=cudnn8 -t whisper-rtx2080:cu121 .
+#   docker build --build-arg CUDA_VERSION=11.8.0 --build-arg CUDNN_TAG=cudnn8 --build-arg UBUNTU_VERSION=20.04 -t whisper-rtx2080:cu118 .
+
+ARG CUDA_VERSION=12.4.1
+ARG CUDNN_TAG=cudnn
+ARG UBUNTU_VERSION=22.04
+
+# ╔══════════════════════════════════════════════════════════╗
+# ║  STAGE 1 — builder                                       ║
+# ║  Full CUDA devel image + Rust toolchain                  ║
+# ║  Compiles whisper.cpp (CUDA kernels) + Rust binary       ║
+# ╚══════════════════════════════════════════════════════════╝
+FROM nvidia/cuda:${CUDA_VERSION}-${CUDNN_TAG}-devel-ubuntu${UBUNTU_VERSION} AS builder
+
+ARG CUDA_VERSION=12.4.1
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+# ── System build dependencies ────────────────────────────────────────────────
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        cmake \
+        git \
+        curl \
+        pkg-config \
+        libclang-dev \
+        clang \
+        ca-certificates \
+        # ffmpeg headers (not strictly needed at build time, but avoids surprises)
+        libavformat-dev \
+        libavcodec-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+# ── Rust toolchain ───────────────────────────────────────────────────────────
+ENV RUSTUP_HOME=/usr/local/rustup \
+    CARGO_HOME=/usr/local/cargo \
+    PATH=/usr/local/cargo/bin:$PATH
+
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs \
+    | sh -s -- -y --default-toolchain stable --profile minimal \
+    && rustup component add rustfmt
+
+# ── Clone whisper.cpp (whisper-rs pins a specific commit via its build.rs) ──
+# whisper-rs downloads and builds whisper.cpp automatically via its build script.
+# We only need to ensure the CUDA flags are forwarded through env vars.
+
+# ── CUDA architecture flags for RTX 2080 (sm_75) ────────────────────────────
+# These are picked up by whisper-rs's build.rs when it invokes cmake internally.
+ENV GGML_CUDA=ON \
+    CMAKE_CUDA_ARCHITECTURES=75 \
+    GGML_CUDA_FORCE_MMQ=ON \
+    GGML_CUDA_GRAPHS=ON \
+    GGML_CUDA_FA_ALL_QUANTS=ON \
+    GGML_CUDA_F16=ON \
+    # Tell whisper-rs / cmake where nvcc lives
+    CUDA_PATH=/usr/local/cuda \
+    LIBCLANG_PATH=/usr/lib/llvm-14/lib
+
+# ── Copy source and build ────────────────────────────────────────────────────
+WORKDIR /build
+COPY Cargo.toml ./
+COPY src/       ./src/
+
+# Build in release mode — LTO + single codegen unit (see Cargo.toml profile)
+RUN --mount=type=cache,target=/usr/local/cargo/registry \
+    --mount=type=cache,target=/build/target \
+    cargo build --release \
+    && cp target/release/whisper-server /usr/local/bin/whisper-server
+
+
+# ╔══════════════════════════════════════════════════════════╗
+# ║  STAGE 2 — runtime                                       ║
+# ║  Minimal CUDA runtime image — no build tools             ║
+# ╚══════════════════════════════════════════════════════════╝
+FROM nvidia/cuda:${CUDA_VERSION}-${CUDNN_TAG}-runtime-ubuntu${UBUNTU_VERSION}
+
+ARG CUDA_VERSION=12.4.1
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+# ── Runtime dependencies only ────────────────────────────────────────────────
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        ffmpeg \
+        ca-certificates \
+    && rm -rf /var/lib/apt/lists/*
+
+# ── NVIDIA container runtime ─────────────────────────────────────────────────
+ENV NVIDIA_VISIBLE_DEVICES=all \
+    NVIDIA_DRIVER_CAPABILITIES=compute,utility \
+    CUDA_DEVICE_ORDER=PCI_BUS_ID
+
+# ── CTranslate2 / GGML VRAM tuning for RTX 2080 ─────────────────────────────
+# Limit CUDA allocator chunk size to avoid fragmenting the 8 GB pool.
+ENV GGML_CUDA_NO_VMM=0
+
+# ── Application defaults (all overridable at runtime) ────────────────────────
+ENV PORT=8080 \
+    RUST_LOG=info \
+    DATA_DIR=/data \
+    WHISPER_MODEL=large-v3 \
+    WHISPER_MODEL_PATH=/models/ggml-large-v3.bin
+
+# ── Binary ───────────────────────────────────────────────────────────────────
+COPY --from=builder /usr/local/bin/whisper-server /app/whisper-server
+RUN chmod +x /app/whisper-server
+
+# ── Volumes & ports ──────────────────────────────────────────────────────────
+RUN mkdir -p /data /models
+VOLUME ["/data", "/models"]
+EXPOSE 8080
+
+HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
+    CMD curl -sf http://localhost:${PORT}/health || exit 1
+
+ENTRYPOINT ["/app/whisper-server"]
--- a/README.md
+++ b/README.md
@@ -0,0 +1,201 @@
+# whisper-rtx2080
+
+Async REST API for GPU-accelerated speech transcription, built in **Rust** (Axum) on top of
+**whisper.cpp** compiled with CUDA for the **NVIDIA RTX 2080** (Turing, sm\_75, 8 GB VRAM).
+No Python.
+
+---
+
+## Requirements
+
+| Dependency | Notes |
+|---|---|
+| Docker ≥ 20.10 | |
+| [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) | `nvidia-docker2` on the host |
+| Host NVIDIA driver ≥ 525 | Required for CUDA 12.x |
+| GGML model file | Downloaded automatically on first start |
+
+---
+
+## Quick start
+
+```bash
+# Build (CUDA 12.4, sm_75, large-v3 model)
+docker compose build
+
+# Start the server (model downloads on first run — ~3 GB)
+docker compose up -d
+
+# Check it's running
+curl http://localhost:8080/health
+
+# Transcribe a file
+curl -X POST http://localhost:8080/jobs \
+  -F "audio=@/path/to/speech.mp3" | jq .
+# → { "job_id": "550e8400-..." }
+
+# Poll for result
+curl http://localhost:8080/jobs/550e8400-... | jq .
+
+# Or stream progress in real time
+curl -N http://localhost:8080/jobs/550e8400-.../stream
+
+# Browse the interactive API docs
+open http://localhost:8080/docs
+```
+
+---
+
+## API reference
+
+| Method | Path | Description |
+|---|---|---|
+| `POST` | `/jobs` | Submit audio for transcription |
+| `GET` | `/jobs/{id}` | Poll job status + result |
+| `GET` | `/jobs/{id}/stream` | SSE: live progress + completion event |
+| `DELETE` | `/jobs/{id}` | Cancel a queued or running job |
+| `GET` | `/health` | GPU info + queue depth |
+| `GET` | `/docs` | Swagger UI |
+| `GET` | `/openapi.json` | Raw OpenAPI 3.0 spec |
+
+### POST /jobs — multipart fields
+
+| Field | Required | Description |
+|---|---|---|
+| `audio` | ✅ | Audio file — any format ffmpeg understands; no size limit |
+| `language` | ❌ | ISO 639-1 source language (e.g. `en`). Auto-detected when absent. |
+| `task` | ❌ | `transcribe` (default) or `translate` (output always English) |
+| `webhook_url` | ❌ | URL to POST the completed job JSON to on completion |
+
+### Job result JSON
+
+```json
+{
+  "id":            "550e8400-e29b-41d4-a716-446655440000",
+  "status":        "done",
+  "language":      "en",
+  "task":          "transcribe",
+  "duration_secs": 142.3,
+  "progress":      100,
+  "segments": [
+    {
+      "index": 0,
+      "start": 0.0,
+      "end":   2.4,
+      "text":  " Hello, world.",
+      "words": []
+    }
+  ],
+  "error":        null,
+  "created_at":   "2026-05-05T21:00:00Z",
+  "completed_at": "2026-05-05T21:02:13Z"
+}
+```
+
+### SSE events (`GET /jobs/{id}/stream`)
+
+```
+event: progress
+data: {"type":"progress","percent":42}
+
+event: progress
+data: {"type":"progress","percent":91}
+
+event: done
+data: {"type":"done","job":{...full job object...}}
+```
+
+---
+
+## Build arguments
+
+| ARG | Default | Notes |
+|---|---|---|
+| `CUDA_VERSION` | `12.4.1` | Passed to the NVIDIA base image tag |
+| `CUDNN_TAG` | `cudnn` | `cudnn` for CUDA 12.x · `cudnn8` for CUDA 11.x |
+| `UBUNTU_VERSION` | `22.04` | Ubuntu base |
+
+### Custom CUDA version examples
+
+```bash
+# CUDA 12.1
+docker build \
+  --build-arg CUDA_VERSION=12.1.0 \
+  --build-arg CUDNN_TAG=cudnn8 \
+  -t whisper-rtx2080:cu121 .
+
+# CUDA 11.8 (legacy)
+docker build \
+  --build-arg CUDA_VERSION=11.8.0 \
+  --build-arg CUDNN_TAG=cudnn8 \
+  --build-arg UBUNTU_VERSION=20.04 \
+  -t whisper-rtx2080:cu118 .
+```
+
+---
+
+## Runtime environment variables
+
+All can be overridden with `-e` or in `docker-compose.yml`:
+
+| Variable | Default | Description |
+|---|---|---|
+| `PORT` | `8080` | TCP port the server listens on |
+| `RUST_LOG` | `info` | Log level (`trace`, `debug`, `info`, `warn`, `error`) |
+| `DATA_DIR` | `/data` | Directory for persisted job state (mount a volume here) |
+| `WHISPER_MODEL` | `large-v3` | Model name (for /health reporting) |
+| `WHISPER_MODEL_PATH` | `/models/ggml-large-v3.bin` | Absolute path to the GGML model file |
+
+---
+
+## RTX 2080 optimisation notes
+
+| Setting | Value | Reason |
+|---|---|---|
+| `CMAKE_CUDA_ARCHITECTURES` | `75` | Compiles kernels **only for sm\_75** — smaller binary, faster build |
+| `GGML_CUDA_FORCE_MMQ` | `ON` | Quantised matrix-multiply (WMMA Tensor Cores) — best for Q4/Q5/Q8 models on Turing |
+| `GGML_CUDA_GRAPHS` | `ON` | CUDA Graph capture → eliminates CPU→GPU dispatch overhead per call (requires sm\_75+) |
+| `GGML_CUDA_FA_ALL_QUANTS` | `ON` | Flash Attention tile kernels for all quantisation types |
+| `GGML_CUDA_F16` | `ON` | FP16 arithmetic via Turing Tensor Cores |
+| `flash_attn` (runtime) | `true` | Enabled in `WhisperContextParameters` — tile-based, works on sm\_75 |
+| `beam_size` | `5` | Best accuracy/speed balance |
+| `temperature` | `0.0` | Deterministic, fastest decode path |
+| `n_threads` | host CPU count | CPU-side pre/post processing |
+
+> **bfloat16 is intentionally not enabled** — that requires Ampere (sm\_80+).
+>
+> **flash\_attn and DTW token timestamps are mutually exclusive** — the server enables
+> flash\_attn and omits DTW to maximise throughput.
+
+---
+
+## Webhooks
+
+If `webhook_url` is set on a job, the server will `POST` the completed job JSON to that URL:
+- Up to **5 retries** with exponential backoff: 1 s → 2 s → 4 s → 8 s → 16 s
+- After all retries are exhausted the failure is logged and dropped
+
+---
+
+## Troubleshooting
+
+**`CUDA error: no kernel image available for execution on the device`**
+→ The binary was compiled for a different architecture. Rebuild with
+`--build-arg CUDA_VERSION=...` matching your driver. The image is always compiled
+for sm\_75 only.
+
+**`libcuda.so.1: cannot open shared object file`**
+→ NVIDIA Container Toolkit is not installed or `--gpus all` / `deploy.resources` is missing.
+
+**Model not found at `/models/ggml-large-v3.bin`**
+→ On first start the server will fail immediately. Download the model manually:
+```bash
+docker run --rm -v whisper-models:/models curlimages/curl:latest \
+  -L -o /models/ggml-large-v3.bin \
+  https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v3.bin
+```
+Then restart the server.
+
+**Out-of-memory on large-v3**
+→ The large-v3 GGML model at F16 uses ~3.1 GB VRAM; you should have headroom on 8 GB.
+If running other GPU workloads in parallel, switch to `ggml-medium.bin` (~1.5 GB).
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -0,0 +1,52 @@
+services:
+  whisper:
+    image: whisper-rtx2080:latest
+    build:
+      context: .
+      dockerfile: Dockerfile
+      args:
+        # ── CUDA / base image ─────────────────────────────────────
+        # CUDA 12.x: CUDNN_TAG = "cudnn"
+        # CUDA 11.x: CUDNN_TAG = "cudnn8"
+        CUDA_VERSION:  "12.4.1"
+        CUDNN_TAG:     "cudnn"
+        UBUNTU_VERSION: "22.04"
+
+    # ── GPU access (requires NVIDIA Container Toolkit on host) ───
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
+
+    ports:
+      - "8080:8080"
+
+    volumes:
+      # Job state — survives container restarts
+      - whisper-data:/data
+      # Model cache — avoids re-downloading large-v3 on every start
+      - whisper-models:/models
+
+    environment:
+      PORT:                "8080"
+      RUST_LOG:            "info"
+      DATA_DIR:            "/data"
+      WHISPER_MODEL:       "large-v3"
+      WHISPER_MODEL_PATH:  "/models/ggml-large-v3.bin"
+
+    restart: unless-stopped
+
+    healthcheck:
+      test:     ["CMD", "curl", "-sf", "http://localhost:8080/health"]
+      interval: 30s
+      timeout:  10s
+      retries:  3
+      # Give the server time to load the model on first start
+      start_period: 90s
+
+volumes:
+  whisper-data:
+  whisper-models:
--- a/src/error.rs
+++ b/src/error.rs
@@ -0,0 +1,39 @@
+use thiserror::Error;
+use axum::{
+    http::StatusCode,
+    response::{IntoResponse, Response},
+    Json,
+};
+use serde_json::json;
+
+pub type Result<T> = std::result::Result<T, AppError>;
+
+#[derive(Debug, Error)]
+pub enum AppError {
+    #[error("not found: {0}")]
+    NotFound(String),
+
+    #[error("bad request: {0}")]
+    BadRequest(String),
+
+    #[error("conflict: {0}")]
+    Conflict(String),
+
+    #[error("internal error: {0}")]
+    Internal(String),
+}
+
+impl IntoResponse for AppError {
+    fn into_response(self) -> Response {
+        let (status, message) = match &self {
+            AppError::NotFound(m)   => (StatusCode::NOT_FOUND,             m.clone()),
+            AppError::BadRequest(m) => (StatusCode::BAD_REQUEST,           m.clone()),
+            AppError::Conflict(m)   => (StatusCode::CONFLICT,              m.clone()),
+            AppError::Internal(m)   => (StatusCode::INTERNAL_SERVER_ERROR, m.clone()),
+        };
+
+        tracing::error!(status = status.as_u16(), error = %message);
+
+        (status, Json(json!({ "error": message }))).into_response()
+    }
+}
--- a/src/main.rs
+++ b/src/main.rs
@@ -0,0 +1,130 @@
+use std::sync::Arc;
+
+use axum::Router;
+use tokio::sync::mpsc;
+use tower_http::{cors::CorsLayer, trace::TraceLayer};
+use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt, EnvFilter};
+use utoipa::OpenApi;
+use utoipa_swagger_ui::SwaggerUi;
+
+mod error;
+mod models;
+mod routes;
+mod storage;
+mod transcriber;
+mod webhook;
+mod worker;
+
+pub use error::{AppError, Result};
+
+// ── App state shared across all handlers ────────────────────────────────────
+
+#[derive(Clone)]
+pub struct AppState {
+    /// Channel to submit jobs to the single GPU worker.
+    pub job_tx: mpsc::UnboundedSender<models::JobId>,
+    /// Shared handle to the on-disk job store.
+    pub storage: Arc<storage::Storage>,
+    /// SSE broadcast registry: job_id → sender.
+    pub progress: worker::ProgressRegistry,
+    /// Model name reported by /health.
+    pub model_name: Arc<str>,
+    /// Approximate number of jobs waiting in queue.
+    pub queue_depth: Arc<std::sync::atomic::AtomicUsize>,
+    /// CUDA device index used for inference.
+    pub gpu_device: u32,
+}
+
+// ── OpenAPI spec root ────────────────────────────────────────────────────────
+
+#[derive(OpenApi)]
+#[openapi(
+    info(
+        title       = "Whisper RTX 2080 API",
+        version     = "0.1.0",
+        description = "Async speech transcription powered by whisper.cpp + CUDA sm_75"
+    ),
+    paths(
+        routes::jobs::submit_job,
+        routes::jobs::get_job,
+        routes::jobs::stream_job,
+        routes::jobs::delete_job,
+        routes::health::health,
+    ),
+    components(schemas(
+        models::Job,
+        models::JobStatus,
+        models::Segment,
+        models::Word,
+        models::SubmitResponse,
+        models::HealthResponse,
+    )),
+    tags(
+        (name = "jobs",   description = "Transcription job management"),
+        (name = "system", description = "Service health"),
+    )
+)]
+struct ApiDoc;
+
+// ── Entry point ──────────────────────────────────────────────────────────────
+
+#[tokio::main]
+async fn main() -> anyhow::Result<()> {
+    // Structured logging — level controlled by RUST_LOG env var.
+    tracing_subscriber::registry()
+        .with(EnvFilter::try_from_default_env().unwrap_or_else(|_| "info".into()))
+        .with(tracing_subscriber::fmt::layer().json())
+        .init();
+
+    let data_dir   = std::env::var("DATA_DIR").unwrap_or_else(|_| "/data".into());
+    let model_path = std::env::var("WHISPER_MODEL_PATH")
+        .unwrap_or_else(|_| "/models/ggml-large-v3.bin".into());
+    let port       = std::env::var("PORT").unwrap_or_else(|_| "8080".into());
+    let model_name = std::env::var("WHISPER_MODEL").unwrap_or_else(|_| "large-v3".into());
+    let gpu_device: u32 = std::env::var("CUDA_DEVICE")
+        .ok()
+        .and_then(|s| s.parse().ok())
+        .unwrap_or(0);
+
+    let storage = Arc::new(storage::Storage::new(&data_dir).await?);
+
+    // Recover any jobs that were `running` when the process died last time.
+    storage.recover_interrupted_jobs().await?;
+
+    let (job_tx, job_rx) = mpsc::unbounded_channel::<models::JobId>();
+    let queue_depth = Arc::new(std::sync::atomic::AtomicUsize::new(0));
+
+    // Spawn single GPU worker; get back the SSE broadcast registry.
+    let progress = worker::start(
+        job_rx,
+        Arc::clone(&storage),
+        model_path.clone().into(),
+        Arc::clone(&queue_depth),
+        gpu_device,
+    );
+
+    let state = AppState {
+        job_tx,
+        storage: Arc::clone(&storage),
+        progress,
+        model_name:  model_name.as_str().into(),
+        queue_depth: Arc::clone(&queue_depth),
+        gpu_device,
+    };
+
+    let app = Router::new()
+        .merge(SwaggerUi::new("/docs").url("/openapi.json", ApiDoc::openapi()))
+        .merge(routes::jobs_router())
+        .merge(routes::health_router())
+        .with_state(state)
+        .layer(CorsLayer::permissive())
+        .layer(TraceLayer::new_for_http());
+
+    let addr = format!("0.0.0.0:{port}");
+    tracing::info!(addr, model = model_name, "whisper-server starting");
+
+    let listener = tokio::net::TcpListener::bind(&addr).await?;
+    axum::serve(listener, app).await?;
+
+    Ok(())
+}
--- a/src/models.rs
+++ b/src/models.rs
@@ -0,0 +1,143 @@
+use chrono::{DateTime, Utc};
+use serde::{Deserialize, Serialize};
+use utoipa::ToSchema;
+use uuid::Uuid;
+
+pub type JobId = Uuid;
+
+// ── Job status ───────────────────────────────────────────────────────────────
+
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, ToSchema)]
+#[serde(rename_all = "snake_case")]
+pub enum JobStatus {
+    Queued,
+    Running,
+    Done,
+    Failed,
+    Cancelled,
+}
+
+// ── Transcript segment ───────────────────────────────────────────────────────
+
+#[derive(Debug, Clone, Serialize, Deserialize, ToSchema)]
+pub struct Word {
+    /// Word text
+    pub text:        String,
+    /// Start time in seconds
+    pub start:       f32,
+    /// End time in seconds
+    pub end:         f32,
+    /// Model confidence (0–1)
+    pub probability: f32,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, ToSchema)]
+pub struct Segment {
+    /// Segment index
+    pub index: i32,
+    /// Start time in seconds
+    pub start: f32,
+    /// End time in seconds
+    pub end:   f32,
+    /// Transcribed text
+    pub text:  String,
+    /// Token-level word timestamps (empty when flash_attn is enabled)
+    #[serde(default)]
+    pub words: Vec<Word>,
+}
+
+// ── Main job document (persisted to disk) ────────────────────────────────────
+
+#[derive(Debug, Clone, Serialize, Deserialize, ToSchema)]
+pub struct Job {
+    /// Unique job identifier
+    pub id: JobId,
+
+    /// Current status
+    pub status: JobStatus,
+
+    /// Source language detected or specified (ISO 639-1)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub language: Option<String>,
+
+    /// Task: "transcribe" or "translate"
+    pub task: String,
+
+    /// Total audio duration in seconds (set after processing)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub duration_secs: Option<f32>,
+
+    /// Transcription segments (populated when status = done)
+    #[serde(default)]
+    pub segments: Vec<Segment>,
+
+    /// Error message (populated when status = failed)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub error: Option<String>,
+
+    /// Optional webhook URL to call on completion
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub webhook_url: Option<String>,
+
+    /// Transcription progress 0–100 (approximate, updated during processing)
+    pub progress: u8,
+
+    /// ISO 8601 timestamp when the job was created
+    pub created_at: DateTime<Utc>,
+
+    /// ISO 8601 timestamp when the job finished (done/failed/cancelled)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub completed_at: Option<DateTime<Utc>>,
+
+    /// Original filename (for reference only)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub filename: Option<String>,
+}
+
+impl Job {
+    pub fn new(id: JobId, task: String, webhook_url: Option<String>, filename: Option<String>) -> Self {
+        Self {
+            id,
+            status:       JobStatus::Queued,
+            language:     None,
+            task,
+            duration_secs: None,
+            segments:     vec![],
+            error:        None,
+            webhook_url,
+            progress:     0,
+            created_at:   Utc::now(),
+            completed_at: None,
+            filename,
+        }
+    }
+}
+
+// ── Request / response types ─────────────────────────────────────────────────
+
+/// Response to a successful job submission.
+#[derive(Debug, Serialize, ToSchema)]
+pub struct SubmitResponse {
+    /// The new job identifier — use this to poll or stream progress.
+    pub job_id: JobId,
+}
+
+/// Response from GET /health.
+#[derive(Debug, Serialize, ToSchema)]
+pub struct HealthResponse {
+    pub status:        String,
+    pub gpu_name:      Option<String>,
+    pub vram_total_mb: Option<u64>,
+    pub model:         String,
+    pub queue_depth:   usize,
+}
+
+// ── SSE event payload ────────────────────────────────────────────────────────
+
+#[derive(Debug, Serialize)]
+#[serde(tag = "type", rename_all = "snake_case")]
+pub enum SsePayload {
+    Progress { percent: u8 },
+    Done     { job: Box<Job> },
+    Error    { message: String },
+}
--- a/src/routes/health.rs
+++ b/src/routes/health.rs
@@ -0,0 +1,56 @@
+use std::sync::atomic::Ordering;
+
+use axum::extract::State;
+use axum::Json;
+
+use crate::{models::HealthResponse, AppState, Result};
+
+/// Return service health, GPU info, and queue depth.
+#[utoipa::path(
+    get,
+    path = "/health",
+    tag  = "system",
+    responses(
+        (status = 200, description = "Service healthy", body = HealthResponse),
+    )
+)]
+pub async fn health(State(state): State<AppState>) -> Result<Json<HealthResponse>> {
+    let (gpu_name, vram_total_mb) = gpu_info(state.gpu_device);
+
+    Ok(Json(HealthResponse {
+        status:        "ok".into(),
+        gpu_name,
+        vram_total_mb,
+        model:         state.model_name.to_string(),
+        queue_depth:   state.queue_depth.load(Ordering::Relaxed),
+    }))
+}
+
+/// Query NVIDIA GPU info via `nvidia-smi` for the given CUDA device index.
+fn gpu_info(device: u32) -> (Option<String>, Option<u64>) {
+    let Ok(out) = std::process::Command::new("nvidia-smi")
+        .args([
+            &format!("--id={device}"),
+            "--query-gpu=name,memory.total",
+            "--format=csv,noheader,nounits",
+        ])
+        .output()
+    else {
+        return (None, None);
+    };
+
+    if !out.status.success() {
+        return (None, None);
+    }
+
+    let line = String::from_utf8_lossy(&out.stdout);
+    let line = line.trim();
+    let mut parts = line.splitn(2, ',');
+
+    let name = parts.next().map(|s| s.trim().to_owned());
+    let vram = parts
+        .next()
+        .and_then(|s| s.trim().parse::<u64>().ok());
+
+    (name, vram)
+}
--- a/src/routes/jobs.rs
+++ b/src/routes/jobs.rs
@@ -0,0 +1,258 @@
+use std::sync::atomic::Ordering;
+
+use std::pin::Pin;
+
+use axum::{
+    extract::{Multipart, Path, State},
+    http::StatusCode,
+    response::{
+        sse::{Event, KeepAlive, Sse},
+        IntoResponse,
+    },
+    Json,
+};
+use chrono::Utc;
+use futures::stream::{self, Stream, StreamExt};
+use tokio::sync::broadcast;
+use tokio_stream::wrappers::BroadcastStream;
+use uuid::Uuid;
+
+use crate::{
+    models::{Job, JobId, JobStatus, SubmitResponse},
+    worker::{audio_path_for, ProgressEvent},
+    AppError, AppState, Result,
+};
+
+type SseStream = Pin<Box<dyn Stream<Item = std::result::Result<Event, std::convert::Infallible>> + Send>>;
+
+// ── POST /jobs ───────────────────────────────────────────────────────────────
+
+/// Submit an audio file for transcription.
+///
+/// Multipart fields:
+/// - `audio`       (required) – audio file; any format ffmpeg understands; no size limit
+/// - `language`    (optional) – ISO 639-1 code, e.g. "en". Auto-detected when absent.
+/// - `task`        (optional) – "transcribe" (default) or "translate" (→ English)
+/// - `webhook_url` (optional) – URL to POST the completed job JSON to
+#[utoipa::path(
+    post,
+    path = "/jobs",
+    tag  = "jobs",
+    request_body(
+        content      = String,
+        content_type = "multipart/form-data",
+        description  = "Multipart form: audio (file), language (opt), task (opt), webhook_url (opt)"
+    ),
+    responses(
+        (status = 202, description = "Job queued",    body = SubmitResponse),
+        (status = 400, description = "Bad request"),
+        (status = 500, description = "Server error"),
+    )
+)]
+pub async fn submit_job(
+    State(state): State<AppState>,
+    mut multipart: Multipart,
+) -> Result<impl IntoResponse> {
+    let mut language:    Option<String> = None;
+    let mut task:        String         = "transcribe".into();
+    let mut webhook_url: Option<String> = None;
+    let mut filename:    Option<String> = None;
+    let mut audio_saved = false;
+    // Assign ID early so we know where to stream the audio bytes.
+    let id = Uuid::new_v4();
+    let audio_path = audio_path_for(&id);
+
+    while let Some(field) = multipart.next_field().await.map_err(|e| {
+        AppError::BadRequest(format!("multipart error: {e}"))
+    })? {
+        let field_name = field.name().unwrap_or("").to_owned();
+
+        match field_name.as_str() {
+            "audio" => {
+                use tokio::io::AsyncWriteExt;
+                filename = field.file_name().map(str::to_owned);
+                // Stream directly to disk — avoids holding GB in RAM.
+                let mut file = tokio::fs::File::create(&audio_path).await.map_err(|e| {
+                    AppError::Internal(format!("cannot create audio temp file: {e}"))
+                })?;
+                let mut bytes_written: u64 = 0;
+                let mut stream = field;
+                while let Some(chunk) = stream.chunk().await.map_err(|e| {
+                    AppError::BadRequest(format!("failed to read audio field: {e}"))
+                })? {
+                    file.write_all(&chunk).await.map_err(|e| {
+                        AppError::Internal(format!("failed to write audio chunk: {e}"))
+                    })?;
+                    bytes_written += chunk.len() as u64;
+                }
+                if bytes_written == 0 {
+                    return Err(AppError::BadRequest("audio field is empty".into()));
+                }
+                audio_saved = true;
+            }
+            "language"    => language    = Some(field.text().await.map_err(|e| AppError::BadRequest(e.to_string()))?),
+            "task"        => task        = field.text().await.map_err(|e| AppError::BadRequest(e.to_string()))?,
+            "webhook_url" => webhook_url = Some(field.text().await.map_err(|e| AppError::BadRequest(e.to_string()))?),
+            _             => {} // ignore unknown fields
+        }
+    }
+
+    if !audio_saved {
+        return Err(AppError::BadRequest("missing 'audio' field".into()));
+    }
+
+    if !matches!(task.as_str(), "transcribe" | "translate") {
+        return Err(AppError::BadRequest(
+            "task must be 'transcribe' or 'translate'".into(),
+        ));
+    }
+
+    let mut job = Job::new(id, task, webhook_url, filename);
+    job.language = language;
+
+    state.storage.create(&job).await?;
+
+    // Pre-create the broadcast channel so SSE subscribers don't miss events.
+    state.progress.entry(id).or_insert_with(|| broadcast::channel(64).0);
+
+    state.queue_depth.fetch_add(1, Ordering::Relaxed);
+    state.job_tx.send(id).map_err(|_| {
+        AppError::Internal("worker channel closed".into())
+    })?;
+
+    tracing::info!(job_id = %id, "job queued");
+
+    Ok((StatusCode::ACCEPTED, Json(SubmitResponse { job_id: id })))
+}
+
+// ── GET /jobs/{id} ───────────────────────────────────────────────────────────
+
+/// Poll the status and result of a transcription job.
+#[utoipa::path(
+    get,
+    path = "/jobs/:id",
+    tag  = "jobs",
+    params(("id" = Uuid, Path, description = "Job ID")),
+    responses(
+        (status = 200, description = "Job details", body = Job),
+        (status = 404, description = "Not found"),
+    )
+)]
+pub async fn get_job(
+    State(state): State<AppState>,
+    Path(id):     Path<JobId>,
+) -> Result<Json<Job>> {
+    let job = state.storage.get(&id).await?;
+    Ok(Json(job))
+}
+
+// ── GET /jobs/{id}/stream ────────────────────────────────────────────────────
+
+/// Subscribe to real-time transcription progress via Server-Sent Events.
+///
+/// Events:
+/// - `progress` — `{ "type": "progress", "percent": 0..100 }` emitted periodically
+/// - `done`     — `{ "type": "done", "job": {...} }` emitted on completion
+/// - `error`    — `{ "type": "error", "message": "..." }` emitted on failure
+#[utoipa::path(
+    get,
+    path = "/jobs/:id/stream",
+    tag  = "jobs",
+    params(("id" = Uuid, Path, description = "Job ID")),
+    responses(
+        (status = 200, description = "SSE stream"),
+        (status = 404, description = "Not found"),
+    )
+)]
+pub async fn stream_job(
+    State(state): State<AppState>,
+    Path(id):     Path<JobId>,
+) -> Result<Sse<SseStream>> {
+    // If the job is already finished, return a single done event immediately.
+    let job = state.storage.get(&id).await?;
+    match job.status {
+        JobStatus::Done | JobStatus::Failed | JobStatus::Cancelled => {
+            let payload = serde_json::to_string(
+                &crate::models::SsePayload::Done { job: Box::new(job) }
+            ).unwrap_or_default();
+            let s: SseStream = Box::pin(stream::once(async move {
+                Ok(Event::default().event("done").data(payload))
+            }));
+            return Ok(Sse::new(s).keep_alive(KeepAlive::default()));
+        }
+        _ => {}
+    }
+
+    // Subscribe to live broadcast channel.
+    let rx = state
+        .progress
+        .entry(id)
+        .or_insert_with(|| broadcast::channel(64).0)
+        .subscribe();
+
+    let sse_stream: SseStream = Box::pin(BroadcastStream::new(rx).filter_map(|msg| async move {
+        let event = match msg {
+            Ok(ProgressEvent::Progress(p)) => {
+                let payload = serde_json::to_string(
+                    &crate::models::SsePayload::Progress { percent: p }
+                ).ok()?;
+                Event::default().event("progress").data(payload)
+            }
+            Ok(ProgressEvent::Done(job)) => {
+                let payload = serde_json::to_string(
+                    &crate::models::SsePayload::Done { job }
+                ).ok()?;
+                Event::default().event("done").data(payload)
+            }
+            Ok(ProgressEvent::Error(msg)) => {
+                let payload = serde_json::to_string(
+                    &crate::models::SsePayload::Error { message: msg }
+                ).ok()?;
+                Event::default().event("error").data(payload)
+            }
+            Err(_) => return None, // lagged / channel closed
+        };
+        Some(Ok(event))
+    }));
+
+    Ok(Sse::new(sse_stream).keep_alive(KeepAlive::default()))
+}
+
+// ── DELETE /jobs/{id} ────────────────────────────────────────────────────────
+
+/// Cancel a queued or running job.
+/// Running jobs are marked cancelled; the worker discards them after the current
+/// transcription call returns (whisper.cpp does not support mid-inference abort).
+#[utoipa::path(
+    delete,
+    path = "/jobs/:id",
+    tag  = "jobs",
+    params(("id" = Uuid, Path, description = "Job ID")),
+    responses(
+        (status = 200, description = "Job cancelled", body = Job),
+        (status = 404, description = "Not found"),
+        (status = 409, description = "Job already finished"),
+    )
+)]
+pub async fn delete_job(
+    State(state): State<AppState>,
+    Path(id):     Path<JobId>,
+) -> Result<Json<Job>> {
+    let mut job = state.storage.get(&id).await?;
+
+    match job.status {
+        JobStatus::Done | JobStatus::Failed | JobStatus::Cancelled => {
+            return Err(AppError::Conflict(format!(
+                "job {id} is already in terminal state {:?}",
+                job.status
+            )));
+        }
+        _ => {}
+    }
+
+    job.status       = JobStatus::Cancelled;
+    job.completed_at = Some(Utc::now());
+    state.storage.save(&job).await?;
+
+    Ok(Json(job))
+}
--- a/src/routes/mod.rs
+++ b/src/routes/mod.rs
@@ -0,0 +1,19 @@
+pub mod health;
+pub mod jobs;
+
+use axum::{extract::DefaultBodyLimit, routing::{delete, get, post}, Router};
+use crate::AppState;
+
+pub fn jobs_router() -> Router<AppState> {
+    Router::new()
+        // No body limit on the upload route — files can be multiple GB.
+        .route("/jobs", post(jobs::submit_job).layer(DefaultBodyLimit::disable()))
+        .route("/jobs/:id",        get(jobs::get_job))
+        .route("/jobs/:id/stream", get(jobs::stream_job))
+        .route("/jobs/:id",        delete(jobs::delete_job))
+}
+
+pub fn health_router() -> Router<AppState> {
+    Router::new()
+        .route("/health", get(health::health))
+}
--- a/src/storage.rs
+++ b/src/storage.rs
@@ -0,0 +1,100 @@
+use std::path::{Path, PathBuf};
+
+use tokio::fs;
+use uuid::Uuid;
+
+use crate::{
+    models::{Job, JobId, JobStatus},
+    AppError, Result,
+};
+
+/// Simple append-friendly on-disk store.
+/// Each job is a single JSON file: <data_dir>/<job_id>.json
+pub struct Storage {
+    dir: PathBuf,
+}
+
+impl Storage {
+    pub async fn new(dir: impl AsRef<Path>) -> Result<Self> {
+        let dir = dir.as_ref().to_path_buf();
+        fs::create_dir_all(&dir).await.map_err(|e| {
+            AppError::Internal(format!("cannot create data dir {}: {e}", dir.display()))
+        })?;
+        Ok(Self { dir })
+    }
+
+    fn job_path(&self, id: &JobId) -> PathBuf {
+        self.dir.join(format!("{id}.json"))
+    }
+
+    // ── CRUD ─────────────────────────────────────────────────────────────────
+
+    pub async fn create(&self, job: &Job) -> Result<()> {
+        let path    = self.job_path(&job.id);
+        let payload = serde_json::to_vec_pretty(job)
+            .map_err(|e| AppError::Internal(e.to_string()))?;
+        fs::write(&path, payload).await.map_err(|e| {
+            AppError::Internal(format!("failed to write job {}: {e}", job.id))
+        })?;
+        Ok(())
+    }
+
+    pub async fn get(&self, id: &JobId) -> Result<Job> {
+        let path = self.job_path(id);
+        let raw  = fs::read(&path).await.map_err(|_| {
+            AppError::NotFound(format!("job {id} not found"))
+        })?;
+        serde_json::from_slice(&raw).map_err(|e| AppError::Internal(e.to_string()))
+    }
+
+    /// Persist any mutation to a job back to disk.
+    pub async fn save(&self, job: &Job) -> Result<()> {
+        self.create(job).await
+    }
+
+    pub async fn delete(&self, id: &JobId) -> Result<()> {
+        let path = self.job_path(id);
+        fs::remove_file(&path).await.map_err(|_| {
+            AppError::NotFound(format!("job {id} not found"))
+        })?;
+        Ok(())
+    }
+
+    /// List all job IDs present on disk.
+    pub async fn list_ids(&self) -> Result<Vec<JobId>> {
+        let mut entries = fs::read_dir(&self.dir).await.map_err(|e| {
+            AppError::Internal(format!("read_dir failed: {e}"))
+        })?;
+
+        let mut ids = Vec::new();
+        while let Some(entry) = entries.next_entry().await.map_err(|e| {
+            AppError::Internal(e.to_string())
+        })? {
+            let name = entry.file_name();
+            let name = name.to_string_lossy();
+            if let Some(stem) = name.strip_suffix(".json") {
+                if let Ok(id) = Uuid::parse_str(stem) {
+                    ids.push(id);
+                }
+            }
+        }
+        Ok(ids)
+    }
+
+    /// On startup, mark any jobs that were `running` as `failed`
+    /// (they were interrupted by a crash / restart).
+    pub async fn recover_interrupted_jobs(&self) -> Result<()> {
+        for id in self.list_ids().await? {
+            if let Ok(mut job) = self.get(&id).await {
+                if job.status == JobStatus::Running {
+                    tracing::warn!(job_id = %id, "recovering interrupted job → failed");
+                    job.status       = JobStatus::Failed;
+                    job.error        = Some("server restarted while job was running".into());
+                    job.completed_at = Some(chrono::Utc::now());
+                    let _ = self.save(&job).await;
+                }
+            }
+        }
+        Ok(())
+    }
+}
--- a/src/transcriber.rs
+++ b/src/transcriber.rs
@@ -0,0 +1,143 @@
+use std::path::Path;
+
+use whisper_rs::{
+    FullParams, SamplingStrategy, WhisperContext, WhisperContextParameters,
+};
+
+use crate::{
+    models::{Segment, Word},
+    AppError, Result,
+};
+
+/// Wraps a loaded whisper.cpp context.
+/// `WhisperContext` is `Send` but **not** `Sync` — keep it on the worker thread.
+pub struct Transcriber {
+    ctx: WhisperContext,
+}
+
+impl Transcriber {
+    /// Load a GGML model file and configure GPU / Flash Attention for RTX 2080.
+    pub fn load(model_path: impl AsRef<Path>, gpu_device: u32) -> Result<Self> {        let path = model_path.as_ref().to_str().ok_or_else(|| {
+            AppError::Internal("model path is not valid UTF-8".into())
+        })?;
+
+        let mut params = WhisperContextParameters::new();
+        params.use_gpu(true);
+        params.gpu_device(gpu_device as i32);
+        // Flash Attention (tile-based, works on sm_75).
+        // NOTE: mutually exclusive with DTW token timestamps.
+        params.flash_attn(true);
+
+        let ctx = WhisperContext::new_with_params(path, params)
+            .map_err(|e| AppError::Internal(format!("failed to load model: {e}")))?;
+
+        tracing::info!(model = path, "whisper model loaded");
+        Ok(Self { ctx })
+    }
+
+    /// Transcribe audio samples.
+    ///
+    /// `pcm` must be 16 kHz mono f32 samples.
+    /// `on_progress` is called periodically with a 0–100 integer.
+    pub fn transcribe(
+        &self,
+        pcm:         &[f32],
+        language:    Option<&str>,
+        task:        &str,
+        on_progress: impl Fn(u8) + Send + 'static,
+    ) -> Result<(Vec<Segment>, String)> {
+        let mut state = self.ctx.create_state()
+            .map_err(|e| AppError::Internal(format!("create_state: {e}")))?;
+
+        let mut fp = FullParams::new(SamplingStrategy::BeamSearch {
+            beam_size: 5,
+            patience:  1.0,
+        });
+
+        // RTX 2080: use all host CPU threads for pre/post processing
+        fp.set_n_threads(num_cpus::get() as i32);
+
+        // Deterministic, fastest decode path
+        fp.set_temperature(0.0);
+        // Temperature fallback: when a segment fails quality checks, retry with
+        // increasing temperature (0.0 → 0.2 → 0.4 …) rather than hallucinating.
+        fp.set_temperature_inc(0.2);
+
+        // ── Anti-hallucination / quality guards (from whisper.cpp docs) ──────
+        // Skip segments where the model is uncertain there is speech at all.
+        fp.set_no_speech_thold(0.6);
+        // High token-entropy signals a repetition loop — abort the segment.
+        fp.set_entropy_thold(2.4);
+        // Low average log-probability signals poor confidence — discard segment.
+        fp.set_logprob_thold(-1.0);
+        // Suppress leading blank tokens (avoids empty/whitespace-only segments).
+        fp.set_suppress_blank(true);
+        // Suppress music notes, laughter, [BLANK_AUDIO] and similar non-speech tokens.
+        fp.set_suppress_non_speech_tokens(true);
+
+        // Don't echo progress/results to stdout — we use the callback instead.
+        fp.set_print_progress(false);
+        fp.set_print_realtime(false);
+
+        if let Some(lang) = language {
+            fp.set_language(Some(lang));
+        } else {
+            fp.set_detect_language(true);
+        }
+
+        fp.set_translate(task == "translate");
+
+        // Progress callback — whisper.cpp calls this with 0–100
+        fp.set_progress_callback_safe(move |p| on_progress(p as u8));
+
+        state
+            .full(fp, pcm)
+            .map_err(|e| AppError::Internal(format!("transcription failed: {e}")))?;
+
+        let n_segments = state.full_n_segments()
+            .map_err(|e| AppError::Internal(e.to_string()))?;
+
+        let mut segments = Vec::with_capacity(n_segments as usize);
+
+        for i in 0..n_segments {
+            let text  = state.full_get_segment_text(i)
+                .map_err(|e| AppError::Internal(e.to_string()))?;
+            let start = state.full_get_segment_t0(i)
+                .map_err(|e| AppError::Internal(e.to_string()))? as f32 / 100.0;
+            let end   = state.full_get_segment_t1(i)
+                .map_err(|e| AppError::Internal(e.to_string()))? as f32 / 100.0;
+
+            let n_tokens = state.full_n_tokens(i)
+                .map_err(|e| AppError::Internal(e.to_string()))?;
+
+            let mut words = Vec::new();
+            for t in 0..n_tokens {
+                let token_text = state.full_get_token_text(i, t)
+                    .map_err(|e| AppError::Internal(e.to_string()))?;
+                // Skip special tokens (they start with '[')
+                if token_text.starts_with('[') {
+                    continue;
+                }
+                let data = state.full_get_token_data(i, t)
+                    .map_err(|e| AppError::Internal(e.to_string()))?;
+                words.push(Word {
+                    text:        token_text,
+                    start:       data.t0 as f32 / 100.0,
+                    end:         data.t1 as f32 / 100.0,
+                    probability: data.p,
+                });
+            }
+
+            segments.push(Segment { index: i, start, end, text, words });
+        }
+
+        // Detect language used
+        let lang = state
+            .full_lang_id_from_state()
+            .ok()
+            .and_then(|id| whisper_rs::get_lang_str(id as i32).map(str::to_owned))
+            .unwrap_or_else(|| language.unwrap_or("unknown").to_owned());
+
+        Ok((segments, lang))
+    }
+}
--- a/src/webhook.rs
+++ b/src/webhook.rs
@@ -0,0 +1,62 @@
+use std::time::Duration;
+
+use reqwest::Client;
+
+use crate::models::Job;
+
+const MAX_RETRIES: u32 = 5;
+const BASE_DELAY_SECS: u64 = 1;
+
+/// Fire a webhook POST with the completed job payload.
+/// Retries up to MAX_RETRIES times with exponential backoff.
+/// After all retries are exhausted the error is logged and dropped.
+pub async fn fire(client: &Client, url: &str, job: &Job) {
+    let mut attempt = 0u32;
+
+    loop {
+        match client.post(url).json(job).send().await {
+            Ok(resp) if resp.status().is_success() => {
+                tracing::info!(
+                    job_id = %job.id,
+                    url,
+                    status = resp.status().as_u16(),
+                    "webhook delivered"
+                );
+                return;
+            }
+            Ok(resp) => {
+                tracing::warn!(
+                    job_id = %job.id,
+                    url,
+                    status = resp.status().as_u16(),
+                    attempt,
+                    "webhook non-2xx response"
+                );
+            }
+            Err(e) => {
+                tracing::warn!(
+                    job_id = %job.id,
+                    url,
+                    attempt,
+                    error = %e,
+                    "webhook request failed"
+                );
+            }
+        }
+
+        attempt += 1;
+        if attempt >= MAX_RETRIES {
+            tracing::error!(
+                job_id = %job.id,
+                url,
+                "webhook failed after {MAX_RETRIES} retries — giving up"
+            );
+            return;
+        }
+
+        // Exponential backoff: 1s, 2s, 4s, 8s, 16s
+        let delay = BASE_DELAY_SECS * (1 << attempt);
+        tracing::debug!(job_id = %job.id, delay_secs = delay, "webhook retry scheduled");
+        tokio::time::sleep(Duration::from_secs(delay)).await;
+    }
+}
--- a/src/worker.rs
+++ b/src/worker.rs
@@ -0,0 +1,245 @@
+use std::{
+    path::PathBuf,
+    sync::{
+        atomic::{AtomicUsize, Ordering},
+        Arc,
+    },
+};
+
+use chrono::Utc;
+use reqwest::Client;
+use tokio::sync::{broadcast, mpsc, oneshot};
+
+use crate::{
+    models::{Job, JobId, JobStatus, Segment},
+    storage::Storage,
+    transcriber::Transcriber,
+    webhook,
+};
+
+/// Per-job broadcast channel for SSE subscribers.
+pub type ProgressTx = broadcast::Sender<ProgressEvent>;
+
+#[derive(Debug, Clone)]
+pub enum ProgressEvent {
+    Progress(u8),
+    Done(Box<Job>),
+    Error(String),
+}
+
+/// Global registry: job_id → broadcast sender.
+pub type ProgressRegistry = Arc<dashmap::DashMap<JobId, ProgressTx>>;
+
+// ── Transcription request/response types for the blocking thread ─────────────
+
+struct TranscribeRequest {
+    pcm:         Vec<f32>,
+    language:    Option<String>,
+    task:        String,
+    progress_tx: ProgressTx,
+    reply:       oneshot::Sender<crate::Result<(Vec<Segment>, String)>>,
+}
+
+/// Spawn the single GPU worker.
+/// Returns the SSE progress registry.
+pub fn start(
+    job_rx:      mpsc::UnboundedReceiver<JobId>,
+    storage:     Arc<Storage>,
+    model_path:  PathBuf,
+    queue_depth: Arc<AtomicUsize>,
+    gpu_device:  u32,
+) -> ProgressRegistry {
+    let registry: ProgressRegistry = Arc::new(dashmap::DashMap::new());
+    let reg_clone = Arc::clone(&registry);
+
+    // The transcriber lives on a dedicated OS thread because WhisperContext
+    // is !Send (holds raw CUDA pointers) and transcription is a long blocking call.
+    // We bridge async↔sync via an unbounded mpsc channel.
+    let (tx_req, rx_req) = std::sync::mpsc::channel::<TranscribeRequest>();
+
+    std::thread::Builder::new()
+        .name("whisper-gpu".into())
+        .spawn(move || transcriber_thread(rx_req, model_path, gpu_device))
+        .expect("failed to spawn whisper-gpu thread");
+
+    tokio::spawn(run(job_rx, storage, queue_depth, reg_clone, tx_req));
+
+    registry
+}
+
+/// Dedicated OS thread that owns the Transcriber (non-Send) and runs inference.
+fn transcriber_thread(
+    rx: std::sync::mpsc::Receiver<TranscribeRequest>,
+    model_path: PathBuf,
+    gpu_device: u32,
+) {
+    let transcriber = match Transcriber::load(&model_path, gpu_device) {
+        Ok(t)  => t,
+        Err(e) => {
+            tracing::error!(error = %e, "failed to load whisper model — transcriber thread exiting");
+            return;
+        }
+    };
+    tracing::info!(model = %model_path.display(), "GPU worker ready");
+
+    for req in rx {
+        let result = transcriber.transcribe(
+            &req.pcm,
+            req.language.as_deref(),
+            &req.task,
+            move |p| { let _ = req.progress_tx.send(ProgressEvent::Progress(p)); },
+        );
+        let _ = req.reply.send(result);
+    }
+}
+
+pub async fn run(
+    mut job_rx:  mpsc::UnboundedReceiver<JobId>,
+    storage:     Arc<Storage>,
+    queue_depth: Arc<AtomicUsize>,
+    registry:    ProgressRegistry,
+    tx_req:      std::sync::mpsc::Sender<TranscribeRequest>,
+) {
+    let http = Client::builder()
+        .timeout(std::time::Duration::from_secs(30))
+        .build()
+        .expect("failed to build reqwest client");
+
+    while let Some(job_id) = job_rx.recv().await {
+        queue_depth.fetch_sub(1, Ordering::Relaxed);
+
+        let mut job = match storage.get(&job_id).await {
+            Ok(j)  => j,
+            Err(e) => {
+                tracing::warn!(job_id = %job_id, error = %e, "job vanished before processing");
+                registry.remove(&job_id);
+                continue;
+            }
+        };
+
+        if job.status == JobStatus::Cancelled {
+            registry.remove(&job_id);
+            continue;
+        }
+
+        job.status = JobStatus::Running;
+        if let Err(e) = storage.save(&job).await {
+            tracing::error!(job_id = %job_id, error = %e, "failed to persist running status");
+        }
+
+        let progress_tx = registry
+            .entry(job_id)
+            .or_insert_with(|| broadcast::channel(64).0)
+            .clone();
+
+        let audio_path = audio_path_for(&job_id);
+
+        let result = process_job(&job, &audio_path, &progress_tx, &tx_req).await;
+
+        let _ = tokio::fs::remove_file(&audio_path).await;
+
+        match result {
+            Ok((segments, language, duration_secs)) => {
+                job.status        = JobStatus::Done;
+                job.segments      = segments;
+                job.language      = Some(language);
+                job.duration_secs = Some(duration_secs);
+                job.progress      = 100;
+                job.completed_at  = Some(Utc::now());
+                let _ = progress_tx.send(ProgressEvent::Done(Box::new(job.clone())));
+            }
+            Err(e) => {
+                let msg = e.to_string();
+                tracing::error!(job_id = %job_id, error = %msg, "transcription failed");
+                job.status       = JobStatus::Failed;
+                job.error        = Some(msg.clone());
+                job.completed_at = Some(Utc::now());
+                let _ = progress_tx.send(ProgressEvent::Error(msg));
+            }
+        }
+
+        if let Err(e) = storage.save(&job).await {
+            tracing::error!(job_id = %job_id, error = %e, "failed to persist final job state");
+        }
+
+        if let Some(url) = &job.webhook_url.clone() {
+            let http = http.clone();
+            let url  = url.clone();
+            let job  = job.clone();
+            tokio::spawn(async move { webhook::fire(&http, &url, &job).await; });
+        }
+
+        tokio::time::sleep(std::time::Duration::from_secs(30)).await;
+        registry.remove(&job_id);
+    }
+}
+
+async fn process_job(
+    job:         &Job,
+    audio_path:  &std::path::Path,
+    progress_tx: &ProgressTx,
+    tx_req:      &std::sync::mpsc::Sender<TranscribeRequest>,
+) -> crate::Result<(Vec<Segment>, String, f32)> {
+    let pcm = decode_audio(audio_path).await?;
+    let duration_secs = pcm.len() as f32 / 16_000.0;
+
+    let (reply_tx, reply_rx) = oneshot::channel();
+    tx_req.send(TranscribeRequest {
+        pcm,
+        language:    job.language.clone(),
+        task:        job.task.clone(),
+        progress_tx: progress_tx.clone(),
+        reply:       reply_tx,
+    }).map_err(|_| crate::AppError::Internal("transcriber thread gone".into()))?;
+
+    let (segments, language) = reply_rx.await
+        .map_err(|_| crate::AppError::Internal("transcriber thread dropped reply".into()))??;
+
+    Ok((segments, language, duration_secs))
+}
+
+/// Decode any audio file to 16 kHz mono PCM f32 using ffmpeg.
+async fn decode_audio(path: &std::path::Path) -> crate::Result<Vec<f32>> {
+    use tokio::process::Command;
+
+    let output = Command::new("ffmpeg")
+        .args([
+            "-nostdin", "-threads", "0",
+            "-i",       path.to_str().unwrap_or(""),
+            "-f",       "f32le",
+            "-ac",      "1",
+            "-ar",      "16000",
+            "-",        // write to stdout
+        ])
+        .output()
+        .await
+        .map_err(|e| crate::AppError::Internal(format!("ffmpeg spawn failed: {e}")))?;
+
+    if !output.status.success() {
+        let stderr = String::from_utf8_lossy(&output.stderr);
+        return Err(crate::AppError::Internal(format!(
+            "ffmpeg exited with {}: {}",
+            output.status, stderr
+        )));
+    }
+
+    // Reinterpret raw bytes as f32 (little-endian)
+    let bytes = output.stdout;
+    if bytes.len() % 4 != 0 {
+        return Err(crate::AppError::Internal(
+            "ffmpeg output length not a multiple of 4".into(),
+        ));
+    }
+    let samples: Vec<f32> = bytes
+        .chunks_exact(4)
+        .map(|b| f32::from_le_bytes([b[0], b[1], b[2], b[3]]))
+        .collect();
+
+    Ok(samples)
+}
+
+pub fn audio_path_for(id: &JobId) -> PathBuf {
+    // Audio lives alongside job state in DATA_DIR.
+    let data_dir = std::env::var("DATA_DIR").unwrap_or_else(|_| "/data".into());
+    PathBuf::from(data_dir).join(format!("{id}.audio"))
+}
--- a/test_all.sh
+++ b/test_all.sh
@@ -0,0 +1,155 @@
+#!/usr/bin/env bash
+set -euo pipefail
+BASE="http://localhost:8090"
+AUDIO="/home/moze/Sources/youtube-transcriber/docker/tmp/audio-b2167046-a236-4fcd-b739-78177542fd23.wav"
+GREEN='\033[0;32m'; RED='\033[0;31m'; NC='\033[0m'
+ok()  { echo -e "${GREEN}[PASS]${NC} $*"; }
+fail(){ echo -e "${RED}[FAIL]${NC} $*"; exit 1; }
+
+echo "=== 1. GET /health ==="
+HEALTH=$(curl -sf "$BASE/health")
+echo "$HEALTH" | python3 -m json.tool
+echo "$HEALTH" | python3 -c "import sys,json; d=json.load(sys.stdin); assert d['status']=='ok'" && ok "health"
+
+echo ""
+echo "=== 2. GET /docs (Swagger UI reachable) ==="
+curl -sf "$BASE/docs" | grep -q "swagger" && ok "swagger UI"
+
+echo ""
+echo "=== 3. Webhook server (background nc loop) ==="
+# Simple webhook receiver using Python
+python3 - &
+WEBHOOK_PID=$!
+cat > /tmp/webhook_receiver.py << 'PYEOF'
+import http.server, json, sys
+
+class H(http.server.BaseHTTPRequestHandler):
+    def do_POST(self):
+        n = int(self.headers.get('Content-Length', 0))
+        body = self.rfile.read(n)
+        print("\n[WEBHOOK] received:", json.dumps(json.loads(body), indent=2)[:500])
+        self.send_response(200)
+        self.end_headers()
+    def log_message(self, *a): pass
+
+print("[WEBHOOK] listening on :9999")
+http.server.HTTPServer(('', 9999), H).serve_forever()
+PYEOF
+kill $WEBHOOK_PID 2>/dev/null || true
+python3 /tmp/webhook_receiver.py &
+WEBHOOK_PID=$!
+sleep 1
+echo "Webhook receiver started (PID $WEBHOOK_PID)"
+
+echo ""
+echo "=== 4. DELETE a non-existent job → 404 ==="
+STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X DELETE "$BASE/jobs/00000000-0000-0000-0000-000000000000")
+[ "$STATUS" = "404" ] && ok "DELETE 404 for unknown job" || fail "expected 404 got $STATUS"
+
+echo ""
+echo "=== 5. POST /jobs — submit audio ==="
+SUBMIT=$(curl -sf -X POST "$BASE/jobs" \
+  -F "audio=@${AUDIO};type=audio/wav" \
+  -F "language=auto" \
+  -F "task=transcribe" \
+  -F "webhook_url=http://localhost:9999/webhook")
+echo "$SUBMIT"
+JOB_ID=$(echo "$SUBMIT" | python3 -c "import sys,json; print(json.load(sys.stdin)['id'])")
+ok "submitted job $JOB_ID"
+
+echo ""
+echo "=== 6. GET /jobs/{id} immediately after submit ==="
+JOB=$(curl -sf "$BASE/jobs/$JOB_ID")
+echo "$JOB" | python3 -c "import sys,json; d=json.load(sys.stdin); assert d['status'] in ('queued','running')" \
+  && ok "status is queued/running"
+
+echo ""
+echo "=== 7. SSE stream (first 15 events then detach) ==="
+echo "Subscribing to SSE stream for $JOB_ID …"
+curl -sN --max-time 60 "$BASE/jobs/$JOB_ID/stream" | head -30 &
+SSE_PID=$!
+
+echo ""
+echo "=== 8. Poll until done (max 20 min) ==="
+SECONDS=0
+while true; do
+  sleep 15
+  JOB=$(curl -sf "$BASE/jobs/$JOB_ID")
+  STATUS=$(echo "$JOB" | python3 -c "import sys,json; print(json.load(sys.stdin)['status'])")
+  echo "  [${SECONDS}s] status=$STATUS"
+  if [ "$STATUS" = "done" ]; then
+    ok "job finished in ${SECONDS}s"
+    break
+  elif [ "$STATUS" = "failed" ]; then
+    echo "$JOB" | python3 -m json.tool
+    fail "job failed"
+  fi
+  [ $SECONDS -gt 1200 ] && fail "timeout after 20 minutes"
+done
+kill $SSE_PID 2>/dev/null || true
+
+echo ""
+echo "=== 9. Inspect transcription quality ==="
+RESULT=$(curl -sf "$BASE/jobs/$JOB_ID")
+echo "$RESULT" | python3 - << 'PYCHECK'
+import sys, json, re
+
+data = json.loads(sys.stdin.read())
+segments = data.get("segments", [])
+print(f"  Language  : {data.get('language')}")
+print(f"  Duration  : {data.get('duration_secs')}s")
+print(f"  Segments  : {len(segments)}")
+
+issues = []
+
+for i, seg in enumerate(segments):
+    text = seg.get("text", "")
+    # --- repetition loop ---
+    words = text.strip().split()
+    if len(words) >= 6:
+        half = len(words) // 2
+        if words[:half] == words[half:half+half]:
+            issues.append(f"  [seg {i}] REPETITION LOOP: {text[:80]}")
+    # --- long duplicate phrases ---
+    phrases = re.findall(r'(\b\w+ \w+ \w+\b)', text)
+    if len(phrases) != len(set(phrases)) and len(phrases) > 4:
+        issues.append(f"  [seg {i}] DUPLICATE PHRASE: {text[:80]}")
+    # --- blank/empty segment ---
+    if not text.strip():
+        issues.append(f"  [seg {i}] BLANK SEGMENT")
+
+if issues:
+    print("\n  ⚠  Quality issues found:")
+    for iss in issues[:10]:
+        print(iss)
+else:
+    print("\n  ✓ No repetition loops or blank segments detected")
+
+# Print first 5 segments as sample
+print("\n  Sample output:")
+for seg in segments[:5]:
+    print(f"    [{seg['start']:.1f}–{seg['end']:.1f}] {seg['text'][:100]}")
+PYCHECK
+
+echo ""
+echo "=== 10. DELETE completed job ==="
+STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X DELETE "$BASE/jobs/$JOB_ID")
+[ "$STATUS" = "204" ] || [ "$STATUS" = "200" ] && ok "DELETE returned $STATUS"
+
+echo ""
+echo "=== 11. Submit + immediately cancel a job ==="
+JOB2=$(curl -sf -X POST "$BASE/jobs" \
+  -F "audio=@${AUDIO};type=audio/wav" \
+  -F "language=en" \
+  -F "task=transcribe")
+JOB2_ID=$(echo "$JOB2" | python3 -c "import sys,json; print(json.load(sys.stdin)['id'])")
+sleep 1
+DEL_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X DELETE "$BASE/jobs/$JOB2_ID")
+CANCEL_STATUS=$(curl -sf "$BASE/jobs/$JOB2_ID" | python3 -c "import sys,json; print(json.load(sys.stdin)['status'])")
+[ "$CANCEL_STATUS" = "cancelled" ] && ok "cancel works ($DEL_STATUS → cancelled)"
+
+echo ""
+echo "=== 12. Verify webhook was fired ==="
+sleep 3
+kill $WEBHOOK_PID 2>/dev/null || true
+ok "all tests done"