# llama-cpp/compose.yaml

# ==============================================================================
# llama.cpp multi-model server
# Hardware: GTX 1650 Ti (3717 MiB VRAM, CC 7.5) + i7-10750H 6c/12t
#
# MODEL PROFILES (mutually exclusive — GPU can only hold one at a time):
#   qwen35-9b         Qwen3.5-9B Q8_0   TurboQuant (turbo2 KV, FORCE_MMQ) ~4.4 t/s
#   gemma4-e2b        Gemma 4 E2B       Official llama.cpp                 ~65 t/s
#   gemma4-e4b        Gemma 4 E4B       Official llama.cpp (CPU split)     ~30 t/s
#   smollm3-3b        SmolLM3 3B        Official llama.cpp                 ~90 t/s
#   qwen3-4b          Qwen3 4B          Official llama.cpp                 ~75 t/s
#
# BIGCTX PROFILES (-nkvo: KV in RAM, benchmarked v4 2026-05-06, TurboQuant FORCE_MMQ):
#   smollm3-3b-bigctx SmolLM3 3B  ctx=65536   turbo2 | ~53 t/s base | ~15 t/s@50% | +40960 vs GPU
#   gemma4-e2b-bigctx Gemma 4 E2B ctx=393216  q4_0   | ~62 t/s base | ~17 t/s@50% | +368640 vs GPU (MQA!)
#   gemma4-e4b-bigctx Gemma 4 E4B ctx=163840  turbo2 | ~30 t/s base | ~18 t/s@50% | +139264 vs GPU
#   qwen3-4b-bigctx   Qwen3 4B    ctx=24576   q4_0   | ~39 t/s base | ~11 t/s@50% | +8192 vs GPU
#
# OPTIONAL ADD-ON (combine with any model profile):
#   webui        Open WebUI — auto-connects to whichever model is running
#
# BENCHMARK PROFILES (one-shot, run with: docker compose ... run --rm <service>):
#   bench-qwen35-9b / bench-gemma4-e2b / bench-gemma4-e4b
#   bench-smollm3-3b / bench-qwen3-4b
#
# EXAMPLES:
#   docker compose --profile qwen35-9b up -d
#   docker compose --profile gemma4-e2b --profile webui up -d
#   docker compose --profile bench-smollm3-3b run --rm --entrypoint="" bench-smollm3-3b \
#     bash -c '/app/llama-bench -m /models/$MODEL_FILE -ngl 99 -o csv 2>/dev/null'
#
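# FIRST-TIME BUILD (TurboQuant images; the server image takes ~20 min).
# Only llama-qwen35-9b and bench-qwen35-9b have a build: section — every other
# service reuses the image they produce, so build before starting ANY profile:
#   docker compose --profile qwen35-9b build llama-qwen35-9b
#   docker compose --profile bench-qwen35-9b build bench-qwen35-9b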
#
# Per-model params live in envs/.env.<model> — edit there to retune.
# All server services expose the API on host port 8080 and, inside the
# llama-net Docker network, at http://llama-current:8080 via a network alias.
# ==============================================================================

# ── GPU passthrough fragment ──────────────────────────────────────────────────
# Merged (via <<) into the server and benchmark scaffolds. Selects the NVIDIA
# container runtime, makes all GPUs visible, and requests the compute and
# utility driver capabilities.
x-gpu: &gpu
  runtime: "nvidia"
  environment:
    NVIDIA_VISIBLE_DEVICES: "all"
    NVIDIA_DRIVER_CAPABILITIES: "compute,utility"

# ── Healthcheck defaults ──────────────────────────────────────────────────────
# Every server service merges these and supplies its own start_period. The
# probe passes only when /health answers successfully (curl -f) and the
# response body contains "status":"ok".
x-hc: &hc
  test:
    - CMD-SHELL
    - >-
      curl -sf http://localhost:8080/health | grep -q '"status":"ok"'
  interval: 20s
  timeout: 10s
  retries: 10

# ── Common server scaffold ────────────────────────────────────────────────────
# All model services merge this. Per-model differences go in envs/.env.<model>.
# $$VAR uses double-$ to escape compose interpolation — shell expands them at
# runtime from the env_file variables injected into the container.
#
# command MUST be a one-element list. Compose shell-splits a string-form
# command into separate words, so with an `sh -c` entrypoint only the first
# word (`exec`) would be taken as the script and everything else would become
# positional parameters — llama-server would never be launched.
#
# Every service merging this shares container_name llama_server and host port
# 8080, so only one server service can be up at a time.
x-server: &server
  <<: *gpu
  container_name: llama_server
  volumes:
    - ./models:/models:ro
  ports:
    - "8080:8080"
  shm_size: 1g
  # Unlimited locked memory for the container.
  ulimits:
    memlock:
      soft: -1
      hard: -1
  restart: unless-stopped
  entrypoint: ["/bin/sh", "-c"]
  command:
    # Single list item = the whole script handed to `sh -c` as one argument.
    # $$EXTRA_ARGS is intentionally unquoted so it word-splits into flags.
    - |
      exec /app/llama-server \
        --model "/models/$$MODEL_FILE" \
        --host 0.0.0.0 --port 8080 \
        --n-gpu-layers $$N_GPU_LAYERS \
        --ctx-size $$CTX_SIZE \
        --threads $$THREADS --threads-batch $$THREADS_BATCH \
        --batch-size $$BATCH_SIZE --ubatch-size $$UBATCH_SIZE \
        --cache-type-k $$CACHE_TYPE_K --cache-type-v $$CACHE_TYPE_V \
        --cont-batching --parallel $$PARALLEL \
        $$EXTRA_ARGS \
        --log-disable
  networks:
    llama-net:
      # Stable DNS name so clients need not know which model is running.
      aliases: [llama-current]

# ── Benchmark scaffold ────────────────────────────────────────────────────────
# Shared by every bench-* service. Mounts the models read-only, a writable
# results directory and the scripts directory, then runs
# /scripts/benchmark.sh under bash.
# Merge keys are shallow: a service that declares its own `environment:`
# replaces the mapping inherited from the GPU fragment rather than extending it.
x-bench: &bench
  <<: *gpu
  container_name: llama_bench
  entrypoint:
    - /bin/bash
    - /scripts/benchmark.sh
  shm_size: 1g
  ulimits:
    memlock:
      soft: -1
      hard: -1
  volumes:
    - ./models:/models:ro
    - ./benchmark-results:/results
    - ./scripts:/scripts:ro

# ── Networks ──────────────────────────────────────────────────────────────────
# Single bridge network shared by the model servers and Open WebUI; the
# llama-current alias is resolved on this network.
networks:
  llama-net:
    driver: "bridge"

# ── Volumes ───────────────────────────────────────────────────────────────────
# Named volume with default settings; mounted by Open WebUI at
# /app/backend/data.
volumes:
  open-webui-data: {}

# ==============================================================================
services:

  # ── Qwen3.5-9B Q8_0 — TurboQuant fork (turbo2 KV, FORCE_MMQ, SM75) ─────────
  # The only server service with a build: section; the image it produces is the
  # one every other server service references. Build it first:
  #   docker compose --profile qwen35-9b build llama-qwen35-9b
  llama-qwen35-9b:
    <<: *server
    profiles:
      - qwen35-9b
    env_file: envs/.env.qwen35-9b
    image: local/llama-cpp-turboquant:server-cuda-sm75-mmq
    build:
      context: https://github.com/TheTom/llama-cpp-turboquant.git#feature/turboquant-kv-cache
      dockerfile: .devops/cuda.Dockerfile
      target: server
      args:
        # The arch value carries an extra CMake flag after the arch number.
        # NOTE(review): presumably relies on the Dockerfile expanding this arg
        # unquoted into the cmake command line — confirm against the fork.
        CUDA_DOCKER_ARCH: "75 -DGGML_CUDA_FORCE_MMQ=ON"
    healthcheck:
      <<: *hc
      # Overrides the shared default of 10 retries.
      retries: 12
      start_period: 180s  # mlock pins 8.86 GB into RAM — needs time

  # ── Gemma 4 E2B — 2.3B effective (5.1B total/PLE), 128K ctx, audio+video ───
  # Model download: the huggingface-cli command is in envs/.env.gemma4-e2b.
  # Reuses the locally built TurboQuant server image (no build: section here).
  llama-gemma4-e2b:
    <<: *server
    profiles:
      - gemma4-e2b
    env_file: envs/.env.gemma4-e2b
    image: local/llama-cpp-turboquant:server-cuda-sm75-mmq
    healthcheck:
      <<: *hc
      start_period: 60s
  # ── Gemma 4 E4B — 4.5B effective (8B total/PLE), 128K ctx, CPU-split ────────
  # About 28 of 42 layers fit on the GPU; the rest run from CPU RAM.
  # Reuses the locally built TurboQuant server image (no build: section here).
  llama-gemma4-e4b:
    <<: *server
    profiles:
      - gemma4-e4b
    env_file: envs/.env.gemma4-e4b
    image: local/llama-cpp-turboquant:server-cuda-sm75-mmq
    healthcheck:
      <<: *hc
      start_period: 60s

  # ── SmolLM3 3B — thinking mode, tool calling, 64K ctx, Apache 2.0 ──────────
  # Reuses the locally built TurboQuant server image (no build: section here).
  llama-smollm3-3b:
    <<: *server
    profiles:
      - smollm3-3b
    env_file: envs/.env.smollm3-3b
    image: local/llama-cpp-turboquant:server-cuda-sm75-mmq
    healthcheck:
      <<: *hc
      start_period: 60s

  # ── Qwen3 4B — thinking mode, 128K ctx, best ecosystem ────────────────────
  # Reuses the locally built TurboQuant server image (no build: section here).
  llama-qwen3-4b:
    <<: *server
    profiles:
      - qwen3-4b
    env_file: envs/.env.qwen3-4b
    image: local/llama-cpp-turboquant:server-cuda-sm75-mmq
    healthcheck:
      <<: *hc
      start_period: 60s

  # ── BIGCTX VARIANTS (-nkvo: KV in RAM, benchmarked 2026-05-06) ────────────
  # Use when you need more context than the pure-GPU profiles offer.
  # KV cache lives in CPU RAM instead of VRAM → VRAM freed for larger ctx.
  # Speed estimated via PCIe bandwidth model (8 GB/s). E2B/E4B use MQA — tiny KV, far less PCIe pressure.

  # SmolLM3 3B, big-context variant; parameters come from its own env file.
  llama-smollm3-3b-bigctx:
    <<: *server
    profiles:
      - smollm3-3b-bigctx
    env_file: envs/.env.smollm3-3b-bigctx
    image: local/llama-cpp-turboquant:server-cuda-sm75-mmq
    healthcheck:
      <<: *hc
      start_period: 90s

  # Gemma 4 E2B, big-context variant; parameters come from its own env file.
  llama-gemma4-e2b-bigctx:
    <<: *server
    profiles:
      - gemma4-e2b-bigctx
    env_file: envs/.env.gemma4-e2b-bigctx
    image: local/llama-cpp-turboquant:server-cuda-sm75-mmq
    healthcheck:
      <<: *hc
      start_period: 120s  # large ctx (393216) allocation takes extra time

  # Gemma 4 E4B, big-context variant; parameters come from its own env file.
  llama-gemma4-e4b-bigctx:
    <<: *server
    profiles:
      - gemma4-e4b-bigctx
    env_file: envs/.env.gemma4-e4b-bigctx
    image: local/llama-cpp-turboquant:server-cuda-sm75-mmq
    healthcheck:
      <<: *hc
      start_period: 150s  # 5 GiB model + warmup + 163840 ctx takes ~90-120s

  # Qwen3 4B, big-context variant; parameters come from its own env file.
  llama-qwen3-4b-bigctx:
    <<: *server
    profiles:
      - qwen3-4b-bigctx
    env_file: envs/.env.qwen3-4b-bigctx
    image: local/llama-cpp-turboquant:server-cuda-sm75-mmq
    healthcheck:
      <<: *hc
      start_period: 90s

  # ── Open WebUI ─────────────────────────────────────────────────────────────
  # Lives in its own profile so it can be added next to any model profile:
  #   docker compose --profile <model> --profile webui up -d
  # It reaches whichever model server is running through the llama-current
  # DNS alias on llama-net. Open WebUI retries on startup, so no depends_on
  # is declared.
  # NOTE(review): WEBUI_AUTH is disabled while port 3000 is published on the
  # host — confirm this host is not reachable from untrusted networks.
  openwebui:
    profiles:
      - webui
    image: ghcr.io/open-webui/open-webui:main
    container_name: open_webui
    restart: unless-stopped
    networks:
      - llama-net
    ports:
      - "3000:8080"
    volumes:
      - open-webui-data:/app/backend/data
    environment:
      OPENAI_API_BASE_URL: "http://llama-current:8080/v1"
      OPENAI_API_KEY: "sk-no-key-needed"
      WEBUI_AUTH: "false"

  # ── BENCHMARKS ─────────────────────────────────────────────────────────────
  # Run as one-shot: docker compose --profile bench-<model> run --rm bench-<model>
  # Override entrypoint for ad-hoc: ... run --rm --entrypoint="" bench-<model> bash -c '...'

  # Benchmark for Qwen3.5-9B Q8_0. The only bench service with a build:
  # section; it produces the "full" image the other bench services reference.
  bench-qwen35-9b:
    build:
      context: https://github.com/TheTom/llama-cpp-turboquant.git#feature/turboquant-kv-cache
      dockerfile: .devops/cuda.Dockerfile
      target: full
      args:
        CUDA_DOCKER_ARCH: "75 -DGGML_CUDA_FORCE_MMQ=ON"
    image: local/llama-cpp-turboquant:full-cuda-sm75-mmq
    <<: *bench
    profiles: [bench-qwen35-9b]
    # Declaring environment here REPLACES the mapping merged in through
    # *bench -> *gpu (YAML merge keys are shallow), so the NVIDIA_* settings
    # are repeated explicitly to keep the GPU passthrough configuration.
    environment:
      NVIDIA_VISIBLE_DEVICES: all
      NVIDIA_DRIVER_CAPABILITIES: compute,utility
      MODEL_FILE: Qwen3.5-9B.Q8_0.gguf
      OUTPUT_DIR: /results
      VARIANT: qwen35-9b-turboquant
      # Puts /app first on PATH. NOTE(review): presumably so the benchmark
      # script can call the llama.cpp binaries by bare name; the other bench
      # services do not set this — confirm whether they need it too.
      PATH: /app:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin

  # Benchmark for Gemma 4 E2B (Q4_K_M); reuses the locally built full image.
  bench-gemma4-e2b:
    image: local/llama-cpp-turboquant:full-cuda-sm75-mmq
    <<: *bench
    profiles: [bench-gemma4-e2b]
    # Declaring environment here REPLACES the mapping merged in through
    # *bench -> *gpu (YAML merge keys are shallow), so the NVIDIA_* settings
    # are repeated explicitly to keep the GPU passthrough configuration.
    environment:
      NVIDIA_VISIBLE_DEVICES: all
      NVIDIA_DRIVER_CAPABILITIES: compute,utility
      MODEL_FILE: google_gemma-4-E2B-it-Q4_K_M.gguf
      OUTPUT_DIR: /results
      VARIANT: gemma4-e2b

  # Benchmark for Gemma 4 E4B (Q4_K_M); reuses the locally built full image.
  bench-gemma4-e4b:
    image: local/llama-cpp-turboquant:full-cuda-sm75-mmq
    <<: *bench
    profiles: [bench-gemma4-e4b]
    # Declaring environment here REPLACES the mapping merged in through
    # *bench -> *gpu (YAML merge keys are shallow), so the NVIDIA_* settings
    # are repeated explicitly to keep the GPU passthrough configuration.
    environment:
      NVIDIA_VISIBLE_DEVICES: all
      NVIDIA_DRIVER_CAPABILITIES: compute,utility
      MODEL_FILE: google_gemma-4-E4B-it-Q4_K_M.gguf
      OUTPUT_DIR: /results
      VARIANT: gemma4-e4b

  # Benchmark for SmolLM3 3B (Q4_K_M); reuses the locally built full image.
  bench-smollm3-3b:
    image: local/llama-cpp-turboquant:full-cuda-sm75-mmq
    <<: *bench
    profiles: [bench-smollm3-3b]
    # Declaring environment here REPLACES the mapping merged in through
    # *bench -> *gpu (YAML merge keys are shallow), so the NVIDIA_* settings
    # are repeated explicitly to keep the GPU passthrough configuration.
    environment:
      NVIDIA_VISIBLE_DEVICES: all
      NVIDIA_DRIVER_CAPABILITIES: compute,utility
      MODEL_FILE: HuggingFaceTB_SmolLM3-3B-Q4_K_M.gguf
      OUTPUT_DIR: /results
      VARIANT: smollm3-3b

  # Benchmark for Qwen3 4B (Q4_K_M); reuses the locally built full image.
  bench-qwen3-4b:
    image: local/llama-cpp-turboquant:full-cuda-sm75-mmq
    <<: *bench
    profiles: [bench-qwen3-4b]
    # Declaring environment here REPLACES the mapping merged in through
    # *bench -> *gpu (YAML merge keys are shallow), so the NVIDIA_* settings
    # are repeated explicitly to keep the GPU passthrough configuration.
    environment:
      NVIDIA_VISIBLE_DEVICES: all
      NVIDIA_DRIVER_CAPABILITIES: compute,utility
      MODEL_FILE: Qwen3-4B-Q4_K_M.gguf
      OUTPUT_DIR: /results
      VARIANT: qwen3-4b