# Change summary:
#   - 5 models: SmolLM3-3B, Gemma4-E2B/E4B, Qwen3-4B, Qwen3.5-9B
#   - TurboQuant image (FORCE_MMQ): +6-11% free speed on Turing GPUs
#   - Bigctx profiles (-nkvo KV in RAM): 2-16x context gain
#   - turbo2 KV: 2x smaller, benchmarked against PPL quality gate
#   - Per-model env files with justified parameters
#   - kv_quant_test.sh + cpu_ctx_test.sh benchmark scripts
#   - docs/FINDINGS.md: surprises, pitfalls, recommendations
#   - docs/ARCHITECTURE.md: compose + test script design
# File view metadata: 291 lines, 11 KiB, YAML
# ==============================================================================
# llama.cpp multi-model server
# Hardware: GTX 1650 Ti (3717 MiB VRAM, CC 7.5) + i7-10750H 6c/12t
#
# MODEL PROFILES (mutually exclusive — GPU can only hold one at a time):
#   qwen35-9b    Qwen3.5-9B Q8_0   TurboQuant (turbo2 KV, FORCE_MMQ)   ~4.4 t/s
#   gemma4-e2b   Gemma 4 E2B       Official llama.cpp                  ~65 t/s
#   gemma4-e4b   Gemma 4 E4B       Official llama.cpp (CPU split)      ~30 t/s
#   smollm3-3b   SmolLM3 3B        Official llama.cpp                  ~90 t/s
#   qwen3-4b     Qwen3 4B          Official llama.cpp                  ~75 t/s
#   NOTE(review): every server service in this file runs the local TurboQuant
#   image, so the "Official llama.cpp" label may be stale — confirm.
#
# BIGCTX PROFILES (-nkvo: KV in RAM, benchmarked v4 2026-05-06, TurboQuant FORCE_MMQ):
#   smollm3-3b-bigctx   SmolLM3 3B    ctx=65536    turbo2 | ~53 t/s base | ~15 t/s@50% | +40960 vs GPU
#   gemma4-e2b-bigctx   Gemma 4 E2B   ctx=393216   q4_0   | ~62 t/s base | ~17 t/s@50% | +368640 vs GPU (MQA!)
#   gemma4-e4b-bigctx   Gemma 4 E4B   ctx=163840   turbo2 | ~30 t/s base | ~18 t/s@50% | +139264 vs GPU
#   qwen3-4b-bigctx     Qwen3 4B      ctx=24576    q4_0   | ~39 t/s base | ~11 t/s@50% | +8192 vs GPU
#
# OPTIONAL ADD-ON (combine with any model profile):
#   webui        Open WebUI — auto-connects to whichever model is running
#
# BENCHMARK PROFILES (one-shot, run with: docker compose ... run --rm <service>):
#   bench-qwen35-9b / bench-gemma4-e2b / bench-gemma4-e4b
#   bench-smollm3-3b / bench-qwen3-4b
#
# EXAMPLES:
#   docker compose --profile qwen35-9b up -d
#   docker compose --profile gemma4-e2b --profile webui up -d
#   docker compose --profile bench-smollm3-3b run --rm --entrypoint="" bench-smollm3-3b \
#     bash -c '/app/llama-bench -m /models/$MODEL_FILE -ngl 99 -o csv 2>/dev/null'
#
# FIRST-TIME BUILD (qwen35-9b TurboQuant image, ~20 min):
#   docker compose --profile qwen35-9b build llama-qwen35-9b
#   The other server profiles reuse the image tag produced by this build, so
#   run it once before starting any of them.
#
# Per-model params live in envs/.env.<model> — edit there to retune.
# All server services expose the API on host port 8080 and, inside the Docker
# network, at http://llama-current:8080 via the llama-net network alias.
# ==============================================================================

# ── Shared GPU passthrough ────────────────────────────────────────────────────
# Merged into the server and benchmark scaffolds via `<<: *gpu`.
# YAML merge keys are shallow: a mapping that declares its own `environment:`
# replaces the one below wholesale instead of extending it.
x-gpu: &gpu
  runtime: nvidia
  environment:
    NVIDIA_VISIBLE_DEVICES: all
    NVIDIA_DRIVER_CAPABILITIES: compute,utility

# ── Common healthcheck properties (start_period overridden per service) ───────
# Polls the server's /health endpoint from inside the container and passes only
# when the response body contains "status":"ok".
x-hc: &hc
  test: ["CMD-SHELL", "curl -sf http://localhost:8080/health | grep -q '\"status\":\"ok\"'"]
  interval: 20s
  timeout: 10s
  retries: 10

# ── Common server scaffold ────────────────────────────────────────────────────
# All model services merge this. Per-model differences go in envs/.env.<model>.
# $$VAR uses double-$ to escape compose interpolation — shell expands them at
# runtime from the env_file variables injected into the container.
x-server: &server
  <<: *gpu
  # Every model service shares this name and host port 8080.
  container_name: llama_server
  volumes:
    - ./models:/models:ro
  ports:
    - "8080:8080"
  shm_size: 1g
  ulimits:
    memlock:
      soft: -1
      hard: -1
  restart: unless-stopped
  entrypoint: ["/bin/sh", "-c"]
  # List form on purpose: Compose word-splits a string-valued `command`, and
  # `sh -c` treats only its first argument as the script. A single list item
  # hands the whole script to the shell unchanged.
  command:
    - |
      exec /app/llama-server \
        --model "/models/$$MODEL_FILE" \
        --host 0.0.0.0 --port 8080 \
        --n-gpu-layers $$N_GPU_LAYERS \
        --ctx-size $$CTX_SIZE \
        --threads $$THREADS --threads-batch $$THREADS_BATCH \
        --batch-size $$BATCH_SIZE --ubatch-size $$UBATCH_SIZE \
        --cache-type-k $$CACHE_TYPE_K --cache-type-v $$CACHE_TYPE_V \
        --cont-batching --parallel $$PARALLEL \
        $$EXTRA_ARGS \
        --log-disable
  networks:
    llama-net:
      # Stable DNS name for whichever model container is currently running.
      aliases: [llama-current]

# ── Common benchmark scaffold ─────────────────────────────────────────────────
# One-shot containers: no restart policy, no published ports. The entrypoint
# runs the benchmark script mounted read-only from ./scripts; results are
# expected under the /results mount (./benchmark-results on the host).
x-bench: &bench
  <<: *gpu
  container_name: llama_bench
  volumes:
    - ./models:/models:ro
    - ./benchmark-results:/results
    - ./scripts:/scripts:ro
  shm_size: 1g
  ulimits:
    memlock:
      soft: -1
      hard: -1
  entrypoint: ["/bin/bash", "/scripts/benchmark.sh"]

# ── Networks ──────────────────────────────────────────────────────────────────
# Single bridge network joined by the model servers and Open WebUI.
networks:
  llama-net:
    driver: bridge

# ── Volumes ───────────────────────────────────────────────────────────────────
# Named volume with default settings; mounted by the openwebui service.
volumes:
  open-webui-data: {}

# ==============================================================================
services:

# ── QWEN 3.5-9B Q8_0 — TurboQuant (turbo2 KV, FORCE_MMQ, SM75) ────────────
# Build image first: docker compose --profile qwen35-9b build llama-qwen35-9b
# This is the only server service with a `build:` section; the image tag it
# produces is the one every other server service references.
  llama-qwen35-9b:
    build:
      context: https://github.com/TheTom/llama-cpp-turboquant.git#feature/turboquant-kv-cache
      dockerfile: .devops/cuda.Dockerfile
      target: server
      args:
        # The extra -D flag rides along in the arch build arg.
        CUDA_DOCKER_ARCH: "75 -DGGML_CUDA_FORCE_MMQ=ON"
    image: local/llama-cpp-turboquant:server-cuda-sm75-mmq
    <<: *server
    profiles: [qwen35-9b]
    env_file: envs/.env.qwen35-9b
    healthcheck:
      <<: *hc
      retries: 12
      start_period: 180s  # mlock pins 8.86 GB into RAM — needs time

# ── GEMMA 4 E2B — 2.3B effective (5.1B total/PLE), 128K ctx, audio+video ───
# Download: see envs/.env.gemma4-e2b for huggingface-cli command
  llama-gemma4-e2b:
    image: local/llama-cpp-turboquant:server-cuda-sm75-mmq
    <<: *server
    profiles: [gemma4-e2b]
    env_file: envs/.env.gemma4-e2b
    healthcheck:
      <<: *hc
      start_period: 60s

# ── GEMMA 4 E4B — 4.5B effective (8B total/PLE), 128K ctx, CPU-split ────────
# Fits ~28/42 layers on GPU; remaining layers run on CPU RAM
  llama-gemma4-e4b:
    image: local/llama-cpp-turboquant:server-cuda-sm75-mmq
    <<: *server
    profiles: [gemma4-e4b]
    env_file: envs/.env.gemma4-e4b
    healthcheck:
      <<: *hc
      start_period: 60s

# ── SMOLLM3 3B — thinking mode, tool calling, 64K ctx, Apache 2.0 ──────────
  llama-smollm3-3b:
    image: local/llama-cpp-turboquant:server-cuda-sm75-mmq
    <<: *server
    profiles: [smollm3-3b]
    env_file: envs/.env.smollm3-3b
    healthcheck:
      <<: *hc
      start_period: 60s

# ── QWEN3 4B — thinking mode, 128K ctx, best ecosystem ────────────────────
  llama-qwen3-4b:
    image: local/llama-cpp-turboquant:server-cuda-sm75-mmq
    <<: *server
    profiles: [qwen3-4b]
    env_file: envs/.env.qwen3-4b
    healthcheck:
      <<: *hc
      start_period: 60s

# ── BIGCTX VARIANTS (-nkvo: KV in RAM, benchmarked 2026-05-06) ────────────
# Use when you need more context than the pure-GPU profiles offer.
# KV cache lives in CPU RAM instead of VRAM → VRAM freed for larger ctx.
# Speed estimated via PCIe bandwidth model (8 GB/s). E2B/E4B use MQA — tiny KV, far less PCIe pressure.
# Each variant differs from its base service only in profile name and env file.

  llama-smollm3-3b-bigctx:
    image: local/llama-cpp-turboquant:server-cuda-sm75-mmq
    <<: *server
    profiles: [smollm3-3b-bigctx]
    env_file: envs/.env.smollm3-3b-bigctx
    healthcheck:
      <<: *hc
      start_period: 60s

  llama-gemma4-e2b-bigctx:
    image: local/llama-cpp-turboquant:server-cuda-sm75-mmq
    <<: *server
    profiles: [gemma4-e2b-bigctx]
    env_file: envs/.env.gemma4-e2b-bigctx
    healthcheck:
      <<: *hc
      start_period: 60s

  llama-gemma4-e4b-bigctx:
    image: local/llama-cpp-turboquant:server-cuda-sm75-mmq
    <<: *server
    profiles: [gemma4-e4b-bigctx]
    env_file: envs/.env.gemma4-e4b-bigctx
    healthcheck:
      <<: *hc
      start_period: 60s

  llama-qwen3-4b-bigctx:
    image: local/llama-cpp-turboquant:server-cuda-sm75-mmq
    <<: *server
    profiles: [qwen3-4b-bigctx]
    env_file: envs/.env.qwen3-4b-bigctx
    healthcheck:
      <<: *hc
      start_period: 60s

# ── OPEN WEBUI ─────────────────────────────────────────────────────────────
# Separate profile — add to any running model:
#   docker compose --profile <model> --profile webui up -d
# Connects to whichever model is running via the llama-current DNS alias.
# Open WebUI retries on startup so no depends_on needed.
# NOTE(review): `:main` is a floating tag, and WEBUI_AUTH=false combined with a
# published port leaves the UI open to anything that can reach host port 3000
# — confirm both are intended.
  openwebui:
    image: ghcr.io/open-webui/open-webui:main
    container_name: open_webui
    profiles: [webui]
    environment:
      - OPENAI_API_BASE_URL=http://llama-current:8080/v1
      - OPENAI_API_KEY=sk-no-key-needed
      - WEBUI_AUTH=false
    ports:
      - "3000:8080"
    networks:
      - llama-net
    volumes:
      - open-webui-data:/app/backend/data
    restart: unless-stopped

# ── BENCHMARKS ─────────────────────────────────────────────────────────────
# Run as one-shot: docker compose --profile bench-<model> run --rm bench-<model>
# Override entrypoint for ad-hoc: ... run --rm --entrypoint="" bench-<model> bash -c '...'
#
# bench-qwen35-9b is the only bench service with a `build:` section; the other
# bench services reference the image tag it produces.
#
# Each service repeats the NVIDIA_* variables: YAML merge keys are shallow, so
# a service-level `environment:` replaces the mapping inherited through *bench
# rather than extending it.

  bench-qwen35-9b:
    build:
      context: https://github.com/TheTom/llama-cpp-turboquant.git#feature/turboquant-kv-cache
      dockerfile: .devops/cuda.Dockerfile
      target: full
      args:
        CUDA_DOCKER_ARCH: "75 -DGGML_CUDA_FORCE_MMQ=ON"
    image: local/llama-cpp-turboquant:full-cuda-sm75-mmq
    <<: *bench
    profiles: [bench-qwen35-9b]
    environment:
      NVIDIA_VISIBLE_DEVICES: all
      NVIDIA_DRIVER_CAPABILITIES: compute,utility
      MODEL_FILE: Qwen3.5-9B.Q8_0.gguf
      OUTPUT_DIR: /results
      VARIANT: qwen35-9b-turboquant
      # NOTE(review): only this bench service overrides PATH to include /app,
      # although all bench services share one image and entrypoint — confirm
      # whether the others need it too.
      PATH: /app:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin

  bench-gemma4-e2b:
    image: local/llama-cpp-turboquant:full-cuda-sm75-mmq
    <<: *bench
    profiles: [bench-gemma4-e2b]
    environment:
      NVIDIA_VISIBLE_DEVICES: all
      NVIDIA_DRIVER_CAPABILITIES: compute,utility
      MODEL_FILE: google_gemma-4-E2B-it-Q4_K_M.gguf
      OUTPUT_DIR: /results
      VARIANT: gemma4-e2b

  bench-gemma4-e4b:
    image: local/llama-cpp-turboquant:full-cuda-sm75-mmq
    <<: *bench
    profiles: [bench-gemma4-e4b]
    environment:
      NVIDIA_VISIBLE_DEVICES: all
      NVIDIA_DRIVER_CAPABILITIES: compute,utility
      MODEL_FILE: google_gemma-4-E4B-it-Q4_K_M.gguf
      OUTPUT_DIR: /results
      VARIANT: gemma4-e4b

  bench-smollm3-3b:
    image: local/llama-cpp-turboquant:full-cuda-sm75-mmq
    <<: *bench
    profiles: [bench-smollm3-3b]
    environment:
      NVIDIA_VISIBLE_DEVICES: all
      NVIDIA_DRIVER_CAPABILITIES: compute,utility
      MODEL_FILE: HuggingFaceTB_SmolLM3-3B-Q4_K_M.gguf
      OUTPUT_DIR: /results
      VARIANT: smollm3-3b

  bench-qwen3-4b:
    image: local/llama-cpp-turboquant:full-cuda-sm75-mmq
    <<: *bench
    profiles: [bench-qwen3-4b]
    environment:
      NVIDIA_VISIBLE_DEVICES: all
      NVIDIA_DRIVER_CAPABILITIES: compute,utility
      MODEL_FILE: Qwen3-4B-Q4_K_M.gguf
      OUTPUT_DIR: /results
      VARIANT: qwen3-4b