Files
llama-cpp/compose.yaml
Giancarmine Salucci e7e389c0e1 llama+compose: fix bigctx startup timing
- compose: increase start_period for bigctx services
  - gemma4-e4b-bigctx: 60s -> 150s (5 GiB model + warmup + 163840 ctx takes ~90-120s)
  - gemma4-e2b-bigctx: 60s -> 120s (large ctx 393216 allocation)
  - smollm3/qwen3-4b bigctx: 60s -> 90s
- llama: extend health poll from 30x2s=60s to 75x2s=150s
- llama: require 3 consecutive unhealthy before giving up (avoids
  false positives during Docker start_period window)
2026-05-06 19:03:31 +02:00

291 lines
11 KiB
YAML

# ==============================================================================
# llama.cpp multi-model server
# Hardware: GTX 1650 Ti (3717 MiB VRAM, CC 7.5) + i7-10750H 6c/12t
#
# MODEL PROFILES (mutually exclusive — GPU can only hold one at a time):
# qwen35-9b Qwen3.5-9B Q8_0 TurboQuant (turbo2 KV, FORCE_MMQ) ~4.4 t/s
# gemma4-e2b Gemma 4 E2B Official llama.cpp ~65 t/s
# gemma4-e4b Gemma 4 E4B Official llama.cpp (CPU split) ~30 t/s
# smollm3-3b SmolLM3 3B Official llama.cpp ~90 t/s
# qwen3-4b Qwen3 4B Official llama.cpp ~75 t/s
#
# BIGCTX PROFILES (-nkvo: KV in RAM, benchmarked v4 2026-05-06, TurboQuant FORCE_MMQ):
# smollm3-3b-bigctx SmolLM3 3B ctx=65536 turbo2 | ~53 t/s base | ~15 t/s@50% | +40960 vs GPU
# gemma4-e2b-bigctx Gemma 4 E2B ctx=393216 q4_0 | ~62 t/s base | ~17 t/s@50% | +368640 vs GPU (MQA!)
# gemma4-e4b-bigctx Gemma 4 E4B ctx=163840 turbo2 | ~30 t/s base | ~18 t/s@50% | +139264 vs GPU
# qwen3-4b-bigctx Qwen3 4B ctx=24576 q4_0 | ~39 t/s base | ~11 t/s@50% | +8192 vs GPU
#
# OPTIONAL ADD-ON (combine with any model profile):
# webui Open WebUI — auto-connects to whichever model is running
#
# BENCHMARK PROFILES (one-shot, run with: docker compose ... run --rm <service>):
# bench-qwen35-9b / bench-gemma4-e2b / bench-gemma4-e4b
# bench-smollm3-3b / bench-qwen3-4b
#
# EXAMPLES:
# docker compose --profile qwen35-9b up -d
# docker compose --profile gemma4-e2b --profile webui up -d
# docker compose --profile bench-smollm3-3b run --rm --entrypoint="" bench-smollm3-3b \
# bash -c '/app/llama-bench -m /models/$MODEL_FILE -ngl 99 -o csv 2>/dev/null'
#
# FIRST-TIME BUILD (qwen35-9b TurboQuant image, ~20 min):
# docker compose --profile qwen35-9b build llama-qwen35-9b
#
# Per-model params live in envs/.env.<model> — edit there to retune.
# All server services expose the API on host port 8080 and, inside the Docker
# network, at http://llama-current:8080 via the llama-net network alias.
# ==============================================================================
# ── Shared GPU passthrough ────────────────────────────────────────────────────
x-gpu: &gpu
  # Shared GPU settings, pulled into the server and benchmark scaffolds via
  # `<<: *gpu`. YAML merge keys are shallow: a mapping that declares its own
  # `environment` replaces this one instead of extending it.
  runtime: nvidia
  environment:
    # Which GPUs the NVIDIA runtime makes visible inside the container.
    NVIDIA_VISIBLE_DEVICES: "all"
    # Driver capabilities requested from the NVIDIA runtime.
    NVIDIA_DRIVER_CAPABILITIES: "compute,utility"
# ── Common healthcheck properties (start_period overridden per service) ───────
x-hc: &hc
  # Healthy only when GET /health succeeds (curl -f fails on HTTP errors) and
  # the response body contains "status":"ok".
  test:
    - CMD-SHELL
    - >-
      curl -sf http://localhost:8080/health | grep -q '"status":"ok"'
  # Probe every 20s, allow 10s per probe, and tolerate 10 consecutive failures
  # before the container is marked unhealthy. start_period is set per service.
  interval: 20s
  timeout: 10s
  retries: 10
# ── Common server scaffold ────────────────────────────────────────────────────
# All model services merge this. Per-model differences go in envs/.env.<model>.
# $$VAR uses double-$ to escape compose interpolation — shell expands them at
# runtime from the env_file variables injected into the container.
x-server: &server
  <<: *gpu
  # Every model service reuses this container name and host port, which is
  # what makes the model profiles mutually exclusive.
  container_name: llama_server
  volumes:
    - ./models:/models:ro
  ports:
    - "8080:8080"
  shm_size: 1g
  ulimits:
    # -1 = unlimited locked memory.
    memlock:
      soft: -1
      hard: -1
  restart: unless-stopped
  entrypoint: ["/bin/sh", "-c"]
  # The script must be a ONE-ELEMENT LIST. Compose word-splits a plain string
  # `command`, so `sh -c` would receive only the first word ("exec") as its
  # script and treat the remaining words as positional parameters, and
  # llama-server would never start. As a list item the whole script reaches
  # `sh -c` as a single argument.
  # $$VAR is the Compose escape for a literal $VAR; the container's shell
  # expands it at runtime. $$EXTRA_ARGS is intentionally unquoted so it can
  # expand to several arguments (or to nothing).
  command:
    - |
      exec /app/llama-server \
        --model "/models/$$MODEL_FILE" \
        --host 0.0.0.0 --port 8080 \
        --n-gpu-layers $$N_GPU_LAYERS \
        --ctx-size $$CTX_SIZE \
        --threads $$THREADS --threads-batch $$THREADS_BATCH \
        --batch-size $$BATCH_SIZE --ubatch-size $$UBATCH_SIZE \
        --cache-type-k $$CACHE_TYPE_K --cache-type-v $$CACHE_TYPE_V \
        --cont-batching --parallel $$PARALLEL \
        $$EXTRA_ARGS \
        --log-disable
  networks:
    llama-net:
      # Stable DNS name for whichever model service is currently running.
      aliases: [llama-current]
# ── Common benchmark scaffold ─────────────────────────────────────────────────
x-bench: &bench
  # Scaffold for the one-shot benchmark services.
  # NOTE: `<<` is a shallow merge. A service that declares its own
  # `environment` mapping replaces the one inherited from *gpu rather than
  # extending it.
  <<: *gpu
  container_name: llama_bench
  # Runs the benchmark script from the read-only ./scripts mount.
  entrypoint:
    - /bin/bash
    - /scripts/benchmark.sh
  shm_size: 1g
  ulimits:
    # -1 = unlimited locked memory.
    memlock:
      soft: -1
      hard: -1
  volumes:
    - ./models:/models:ro
    - ./benchmark-results:/results
    - ./scripts:/scripts:ro
# ── Networks ──────────────────────────────────────────────────────────────────
networks:
  # Bridge network shared by the model services and Open WebUI; model services
  # join it under the `llama-current` alias.
  llama-net:
    driver: bridge
# ── Volumes ───────────────────────────────────────────────────────────────────
volumes:
  # Named volume mounted by the openwebui service at /app/backend/data.
  # An explicit empty mapping means default volume settings.
  open-webui-data: {}
# ==============================================================================
services:
  # ── QWEN 3.5-9B Q8_0 — TurboQuant (turbo2 KV, FORCE_MMQ, SM75) ────────────
  # Build image first: docker compose --profile qwen35-9b build llama-qwen35-9b
  llama-qwen35-9b:
    <<: *server
    build:
      context: https://github.com/TheTom/llama-cpp-turboquant.git#feature/turboquant-kv-cache
      dockerfile: .devops/cuda.Dockerfile
      target: server
      args:
        # The extra -D flag rides along inside the arch build arg.
        CUDA_DOCKER_ARCH: "75 -DGGML_CUDA_FORCE_MMQ=ON"
    image: local/llama-cpp-turboquant:server-cuda-sm75-mmq
    profiles: [qwen35-9b]
    env_file: envs/.env.qwen35-9b
    healthcheck:
      <<: *hc
      retries: 12
      start_period: 180s  # mlock pins 8.86 GB into RAM — needs time

  # ── GEMMA 4 E2B — 2.3B effective (5.1B total/PLE), 128K ctx, audio+video ───
  # Download: see envs/.env.gemma4-e2b for huggingface-cli command
  llama-gemma4-e2b:
    <<: *server
    image: local/llama-cpp-turboquant:server-cuda-sm75-mmq
    profiles: [gemma4-e2b]
    env_file: envs/.env.gemma4-e2b
    healthcheck:
      <<: *hc
      start_period: 60s

  # ── GEMMA 4 E4B — 4.5B effective (8B total/PLE), 128K ctx, CPU-split ────────
  # Fits ~28/42 layers on GPU; remaining layers run on CPU RAM
  llama-gemma4-e4b:
    <<: *server
    image: local/llama-cpp-turboquant:server-cuda-sm75-mmq
    profiles: [gemma4-e4b]
    env_file: envs/.env.gemma4-e4b
    healthcheck:
      <<: *hc
      start_period: 60s

  # ── SMOLLM3 3B — thinking mode, tool calling, 64K ctx, Apache 2.0 ──────────
  llama-smollm3-3b:
    <<: *server
    image: local/llama-cpp-turboquant:server-cuda-sm75-mmq
    profiles: [smollm3-3b]
    env_file: envs/.env.smollm3-3b
    healthcheck:
      <<: *hc
      start_period: 60s

  # ── QWEN3 4B — thinking mode, 128K ctx, best ecosystem ────────────────────
  llama-qwen3-4b:
    <<: *server
    image: local/llama-cpp-turboquant:server-cuda-sm75-mmq
    profiles: [qwen3-4b]
    env_file: envs/.env.qwen3-4b
    healthcheck:
      <<: *hc
      start_period: 60s

  # ── BIGCTX VARIANTS (-nkvo: KV in RAM, benchmarked 2026-05-06) ────────────
  # Use when you need more context than the pure-GPU profiles offer.
  # KV cache lives in CPU RAM instead of VRAM → VRAM freed for larger ctx.
  # Speed estimated via PCIe bandwidth model (8 GB/s). E2B/E4B use MQA — tiny KV, far less PCIe pressure.
  llama-smollm3-3b-bigctx:
    <<: *server
    image: local/llama-cpp-turboquant:server-cuda-sm75-mmq
    profiles: [smollm3-3b-bigctx]
    env_file: envs/.env.smollm3-3b-bigctx
    healthcheck:
      <<: *hc
      start_period: 90s

  llama-gemma4-e2b-bigctx:
    <<: *server
    image: local/llama-cpp-turboquant:server-cuda-sm75-mmq
    profiles: [gemma4-e2b-bigctx]
    env_file: envs/.env.gemma4-e2b-bigctx
    healthcheck:
      <<: *hc
      start_period: 120s  # large ctx (393216) allocation takes extra time

  llama-gemma4-e4b-bigctx:
    <<: *server
    image: local/llama-cpp-turboquant:server-cuda-sm75-mmq
    profiles: [gemma4-e4b-bigctx]
    env_file: envs/.env.gemma4-e4b-bigctx
    healthcheck:
      <<: *hc
      start_period: 150s  # 5 GiB model + warmup + 163840 ctx takes ~90-120s

  llama-qwen3-4b-bigctx:
    <<: *server
    image: local/llama-cpp-turboquant:server-cuda-sm75-mmq
    profiles: [qwen3-4b-bigctx]
    env_file: envs/.env.qwen3-4b-bigctx
    healthcheck:
      <<: *hc
      start_period: 90s

  # ── OPEN WEBUI ─────────────────────────────────────────────────────────────
  # Separate profile — add to any running model:
  #   docker compose --profile <model> --profile webui up -d
  # Connects to whichever model is running via the llama-current DNS alias.
  # Open WebUI retries on startup, so no depends_on is needed.
  openwebui:
    image: ghcr.io/open-webui/open-webui:main
    container_name: open_webui
    profiles: [webui]
    environment:
      - OPENAI_API_BASE_URL=http://llama-current:8080/v1
      - OPENAI_API_KEY=sk-no-key-needed
      - WEBUI_AUTH=false
    ports:
      - "3000:8080"
    networks:
      - llama-net
    volumes:
      - open-webui-data:/app/backend/data
    restart: unless-stopped

  # ── BENCHMARKS ─────────────────────────────────────────────────────────────
  # Run as one-shot: docker compose --profile bench-<model> run --rm bench-<model>
  # Override entrypoint for ad-hoc: ... run --rm --entrypoint="" bench-<model> bash -c '...'
  #
  # Each bench service declares its own `environment`, which REPLACES the
  # mapping merged in from x-gpu through x-bench (YAML merge keys are shallow).
  # The two NVIDIA_* variables are therefore repeated in every service below
  # so the GPU settings from x-gpu still apply.
  bench-qwen35-9b:
    <<: *bench
    build:
      context: https://github.com/TheTom/llama-cpp-turboquant.git#feature/turboquant-kv-cache
      dockerfile: .devops/cuda.Dockerfile
      target: full
      args:
        CUDA_DOCKER_ARCH: "75 -DGGML_CUDA_FORCE_MMQ=ON"
    image: local/llama-cpp-turboquant:full-cuda-sm75-mmq
    profiles: [bench-qwen35-9b]
    environment:
      NVIDIA_VISIBLE_DEVICES: all
      NVIDIA_DRIVER_CAPABILITIES: compute,utility
      MODEL_FILE: Qwen3.5-9B.Q8_0.gguf
      OUTPUT_DIR: /results
      VARIANT: qwen35-9b-turboquant
      # NOTE(review): only this bench service puts /app on PATH; the other
      # bench services use the same image without it. Confirm this is intended.
      PATH: /app:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin

  bench-gemma4-e2b:
    <<: *bench
    image: local/llama-cpp-turboquant:full-cuda-sm75-mmq
    profiles: [bench-gemma4-e2b]
    environment:
      NVIDIA_VISIBLE_DEVICES: all
      NVIDIA_DRIVER_CAPABILITIES: compute,utility
      MODEL_FILE: google_gemma-4-E2B-it-Q4_K_M.gguf
      OUTPUT_DIR: /results
      VARIANT: gemma4-e2b

  bench-gemma4-e4b:
    <<: *bench
    image: local/llama-cpp-turboquant:full-cuda-sm75-mmq
    profiles: [bench-gemma4-e4b]
    environment:
      NVIDIA_VISIBLE_DEVICES: all
      NVIDIA_DRIVER_CAPABILITIES: compute,utility
      MODEL_FILE: google_gemma-4-E4B-it-Q4_K_M.gguf
      OUTPUT_DIR: /results
      VARIANT: gemma4-e4b

  bench-smollm3-3b:
    <<: *bench
    image: local/llama-cpp-turboquant:full-cuda-sm75-mmq
    profiles: [bench-smollm3-3b]
    environment:
      NVIDIA_VISIBLE_DEVICES: all
      NVIDIA_DRIVER_CAPABILITIES: compute,utility
      MODEL_FILE: HuggingFaceTB_SmolLM3-3B-Q4_K_M.gguf
      OUTPUT_DIR: /results
      VARIANT: smollm3-3b

  bench-qwen3-4b:
    <<: *bench
    image: local/llama-cpp-turboquant:full-cuda-sm75-mmq
    profiles: [bench-qwen3-4b]
    environment:
      NVIDIA_VISIBLE_DEVICES: all
      NVIDIA_DRIVER_CAPABILITIES: compute,utility
      MODEL_FILE: Qwen3-4B-Q4_K_M.gguf
      OUTPUT_DIR: /results
      VARIANT: qwen3-4b