# ==============================================================================
# llama.cpp multi-model server
# Hardware: GTX 1650 Ti (3717 MiB VRAM, CC 7.5) + i7-10750H 6c/12t
#
# MODEL PROFILES (mutually exclusive — the GPU can only hold one at a time):
#   qwen35-9b    Qwen3.5-9B Q8_0   TurboQuant (turbo2 KV, FORCE_MMQ)   ~4.4 t/s
#   gemma4-e2b   Gemma 4 E2B       Official llama.cpp                  ~65 t/s
#   gemma4-e4b   Gemma 4 E4B       Official llama.cpp (CPU split)      ~30 t/s
#   smollm3-3b   SmolLM3 3B        Official llama.cpp                  ~90 t/s
#   qwen3-4b     Qwen3 4B          Official llama.cpp                  ~75 t/s
#   NOTE(review): the "Official llama.cpp" rows are currently served by the
#   local/llama-cpp-turboquant image in the service definitions — confirm
#   which build is intended.
#
# BIGCTX PROFILES (-nkvo: KV in RAM, benchmarked v4 2026-05-06, TurboQuant FORCE_MMQ):
#   smollm3-3b-bigctx   SmolLM3 3B    ctx=65536   turbo2 | ~53 t/s base | ~15 t/s@50% | +40960 vs GPU
#   gemma4-e2b-bigctx   Gemma 4 E2B   ctx=393216  q4_0   | ~62 t/s base | ~17 t/s@50% | +368640 vs GPU (MQA!)
#   gemma4-e4b-bigctx   Gemma 4 E4B   ctx=163840  turbo2 | ~30 t/s base | ~18 t/s@50% | +139264 vs GPU
#   qwen3-4b-bigctx     Qwen3 4B      ctx=24576   q4_0   | ~39 t/s base | ~11 t/s@50% | +8192 vs GPU
#
# OPTIONAL ADD-ON (combine with any model profile):
#   webui   Open WebUI — auto-connects to whichever model is running
#
# BENCHMARK PROFILES (one-shot, run with: docker compose ... run --rm <service>):
#   bench-qwen35-9b / bench-gemma4-e2b / bench-gemma4-e4b
#   bench-smollm3-3b / bench-qwen3-4b
#
# EXAMPLES:
#   docker compose --profile qwen35-9b up -d
#   docker compose --profile gemma4-e2b --profile webui up -d
#   docker compose --profile bench-smollm3-3b run --rm --entrypoint="" bench-smollm3-3b \
#     bash -c '/app/llama-bench -m /models/$MODEL_FILE -ngl 99 -o csv 2>/dev/null'
#
# FIRST-TIME BUILD (qwen35-9b TurboQuant image, ~20 min):
#   docker compose --profile qwen35-9b build llama-qwen35-9b
#   Only the qwen35-9b services carry a build section; every other profile
#   reuses the resulting local image, so build it once before first use.
#
# Per-model params live in envs/.env.<profile> — edit there to retune.
# All server services expose the API on host port 8080 and are reachable on the
# llama-net Docker network as http://llama-current:8080 (network alias).
# ==============================================================================

# ── Shared GPU passthrough ────────────────────────────────────────────────────
# Selects the NVIDIA runtime and sets the NVIDIA_* variables for the container.
# YAML merge keys are shallow: a service that declares its own `environment:`
# replaces this mapping wholesale, so such services must merge *gpu-env
# explicitly (the bench services below do).
x-gpu: &gpu
  runtime: nvidia
  environment: &gpu-env
    NVIDIA_VISIBLE_DEVICES: all
    NVIDIA_DRIVER_CAPABILITIES: compute,utility

# ── Common healthcheck properties (start_period overridden per service) ───────
# Passes when GET /health on the in-container port returns a body containing
# "status":"ok".
x-hc: &hc
  test:
    - CMD-SHELL
    - "curl -sf http://localhost:8080/health | grep -q '\"status\":\"ok\"'"
  interval: 20s
  timeout: 10s
  retries: 10

# ── Common server scaffold ────────────────────────────────────────────────────
# All model services merge this. Per-model differences go in
# envs/.env.<profile>.
# $$VAR uses double-$ to escape compose interpolation — the shell expands them
# at runtime from the env_file variables injected into the container.
#
# `command` is deliberately a ONE-ELEMENT LIST. Compose word-splits a string
# command, which would hand `sh -c` only the word `exec` as its script and turn
# the rest into positional parameters, so llama-server would never start.
# $$EXTRA_ARGS is intentionally unquoted so it word-splits into separate flags.
x-server: &server
  <<: *gpu
  container_name: llama_server
  volumes:
    - ./models:/models:ro
  ports:
    - "8080:8080"
  shm_size: 1g
  ulimits:
    memlock:
      soft: -1
      hard: -1
  restart: unless-stopped
  entrypoint: ["/bin/sh", "-c"]
  command:
    - |
      exec /app/llama-server \
        --model "/models/$$MODEL_FILE" \
        --host 0.0.0.0 --port 8080 \
        --n-gpu-layers $$N_GPU_LAYERS \
        --ctx-size $$CTX_SIZE \
        --threads $$THREADS --threads-batch $$THREADS_BATCH \
        --batch-size $$BATCH_SIZE --ubatch-size $$UBATCH_SIZE \
        --cache-type-k $$CACHE_TYPE_K --cache-type-v $$CACHE_TYPE_V \
        --cont-batching --parallel $$PARALLEL \
        $$EXTRA_ARGS \
        --log-disable
  networks:
    llama-net:
      aliases: [llama-current]

# ── Common benchmark scaffold ─────────────────────────────────────────────────
# One-shot containers that run /scripts/benchmark.sh; results are written to
# the ./benchmark-results bind mount.
x-bench: &bench
  <<: *gpu
  container_name: llama_bench
  volumes:
    - ./models:/models:ro
    - ./benchmark-results:/results
    - ./scripts:/scripts:ro
  shm_size: 1g
  ulimits:
    memlock:
      soft: -1
      hard: -1
  entrypoint: ["/bin/bash", "/scripts/benchmark.sh"]

# ── Networks ──────────────────────────────────────────────────────────────────
networks:
  llama-net:
    driver: bridge

# ── Volumes ───────────────────────────────────────────────────────────────────
volumes:
  open-webui-data: {}

# ==============================================================================
services:

  # ── QWEN 3.5-9B Q8_0 — TurboQuant (turbo2 KV, FORCE_MMQ, SM75) ────────────
  # Build image first: docker compose --profile qwen35-9b build llama-qwen35-9b
  # This is the only server service with a build section; the other server
  # services reuse the image tag it produces.
  llama-qwen35-9b:
    <<: *server
    build:
      context: https://github.com/TheTom/llama-cpp-turboquant.git#feature/turboquant-kv-cache
      dockerfile: .devops/cuda.Dockerfile
      target: server
      args:
        # The extra -D flag rides along inside the arch build arg.
        CUDA_DOCKER_ARCH: "75 -DGGML_CUDA_FORCE_MMQ=ON"
    image: local/llama-cpp-turboquant:server-cuda-sm75-mmq
    profiles: [qwen35-9b]
    env_file: envs/.env.qwen35-9b
    healthcheck:
      <<: *hc
      retries: 12
      start_period: 180s  # mlock pins 8.86 GB into RAM — needs time

  # ── GEMMA 4 E2B — 2.3B effective (5.1B total/PLE), 128K ctx, audio+video ───
  # Download: see envs/.env.gemma4-e2b for huggingface-cli command
  llama-gemma4-e2b:
    <<: *server
    image: local/llama-cpp-turboquant:server-cuda-sm75-mmq
    profiles: [gemma4-e2b]
    env_file: envs/.env.gemma4-e2b
    healthcheck:
      <<: *hc
      start_period: 60s

  # ── GEMMA 4 E4B — 4.5B effective (8B total/PLE), 128K ctx, CPU-split ────────
  # Fits ~28/42 layers on GPU; remaining layers run on CPU RAM
  llama-gemma4-e4b:
    <<: *server
    image: local/llama-cpp-turboquant:server-cuda-sm75-mmq
    profiles: [gemma4-e4b]
    env_file: envs/.env.gemma4-e4b
    healthcheck:
      <<: *hc
      start_period: 60s

  # ── SMOLLM3 3B — thinking mode, tool calling, 64K ctx, Apache 2.0 ──────────
  llama-smollm3-3b:
    <<: *server
    image: local/llama-cpp-turboquant:server-cuda-sm75-mmq
    profiles: [smollm3-3b]
    env_file: envs/.env.smollm3-3b
    healthcheck:
      <<: *hc
      start_period: 60s

  # ── QWEN3 4B — thinking mode, 128K ctx, best ecosystem ────────────────────
  llama-qwen3-4b:
    <<: *server
    image: local/llama-cpp-turboquant:server-cuda-sm75-mmq
    profiles: [qwen3-4b]
    env_file: envs/.env.qwen3-4b
    healthcheck:
      <<: *hc
      start_period: 60s

  # ── BIGCTX VARIANTS (-nkvo: KV in RAM, benchmarked 2026-05-06) ────────────
  # Use when you need more context than the pure-GPU profiles offer.
  # KV cache lives in CPU RAM instead of VRAM → VRAM freed for larger ctx.
  # Speed estimated via PCIe bandwidth model (8 GB/s). E2B/E4B use MQA — tiny
  # KV, far less PCIe pressure.
  llama-smollm3-3b-bigctx:
    <<: *server
    image: local/llama-cpp-turboquant:server-cuda-sm75-mmq
    profiles: [smollm3-3b-bigctx]
    env_file: envs/.env.smollm3-3b-bigctx
    healthcheck:
      <<: *hc
      start_period: 90s

  llama-gemma4-e2b-bigctx:
    <<: *server
    image: local/llama-cpp-turboquant:server-cuda-sm75-mmq
    profiles: [gemma4-e2b-bigctx]
    env_file: envs/.env.gemma4-e2b-bigctx
    healthcheck:
      <<: *hc
      start_period: 120s  # large ctx (393216) allocation takes extra time

  llama-gemma4-e4b-bigctx:
    <<: *server
    image: local/llama-cpp-turboquant:server-cuda-sm75-mmq
    profiles: [gemma4-e4b-bigctx]
    env_file: envs/.env.gemma4-e4b-bigctx
    healthcheck:
      <<: *hc
      start_period: 150s  # 5 GiB model + warmup + 163840 ctx takes ~90-120s

  llama-qwen3-4b-bigctx:
    <<: *server
    image: local/llama-cpp-turboquant:server-cuda-sm75-mmq
    profiles: [qwen3-4b-bigctx]
    env_file: envs/.env.qwen3-4b-bigctx
    healthcheck:
      <<: *hc
      start_period: 90s

  # ── OPEN WEBUI ─────────────────────────────────────────────────────────────
  # Separate profile — add to any running model:
  #   docker compose --profile <model> --profile webui up -d
  # Connects to whichever model is running via the llama-current DNS alias.
  # Open WebUI retries on startup so no depends_on needed.
  openwebui:
    image: ghcr.io/open-webui/open-webui:main
    container_name: open_webui
    profiles: [webui]
    environment:
      OPENAI_API_BASE_URL: http://llama-current:8080/v1
      OPENAI_API_KEY: sk-no-key-needed
      # Quoted so the value stays the string "false" rather than a YAML boolean.
      WEBUI_AUTH: "false"
    ports:
      - "3000:8080"
    networks:
      - llama-net
    volumes:
      - open-webui-data:/app/backend/data
    restart: unless-stopped

  # ── BENCHMARKS ─────────────────────────────────────────────────────────────
  # Run as one-shot:
  #   docker compose --profile bench-<model> run --rm bench-<model>
  # Override entrypoint for ad-hoc:
  #   ... run --rm --entrypoint="" bench-<model> bash -c '...'
  #
  # Each service sets its own `environment:`, which would replace the mapping
  # merged in from x-gpu (merge keys are shallow), so *gpu-env is merged back
  # in explicitly to keep the NVIDIA_* variables.
  bench-qwen35-9b:
    <<: *bench
    build:
      context: https://github.com/TheTom/llama-cpp-turboquant.git#feature/turboquant-kv-cache
      dockerfile: .devops/cuda.Dockerfile
      target: full
      args:
        CUDA_DOCKER_ARCH: "75 -DGGML_CUDA_FORCE_MMQ=ON"
    image: local/llama-cpp-turboquant:full-cuda-sm75-mmq
    profiles: [bench-qwen35-9b]
    environment:
      <<: *gpu-env
      MODEL_FILE: Qwen3.5-9B.Q8_0.gguf
      OUTPUT_DIR: /results
      VARIANT: qwen35-9b-turboquant
      # NOTE(review): only this bench service prepends /app to PATH — confirm
      # whether benchmark.sh needs the same for the other variants.
      PATH: /app:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin

  bench-gemma4-e2b:
    <<: *bench
    image: local/llama-cpp-turboquant:full-cuda-sm75-mmq
    profiles: [bench-gemma4-e2b]
    environment:
      <<: *gpu-env
      MODEL_FILE: google_gemma-4-E2B-it-Q4_K_M.gguf
      OUTPUT_DIR: /results
      VARIANT: gemma4-e2b

  bench-gemma4-e4b:
    <<: *bench
    image: local/llama-cpp-turboquant:full-cuda-sm75-mmq
    profiles: [bench-gemma4-e4b]
    environment:
      <<: *gpu-env
      MODEL_FILE: google_gemma-4-E4B-it-Q4_K_M.gguf
      OUTPUT_DIR: /results
      VARIANT: gemma4-e4b

  bench-smollm3-3b:
    <<: *bench
    image: local/llama-cpp-turboquant:full-cuda-sm75-mmq
    profiles: [bench-smollm3-3b]
    environment:
      <<: *gpu-env
      MODEL_FILE: HuggingFaceTB_SmolLM3-3B-Q4_K_M.gguf
      OUTPUT_DIR: /results
      VARIANT: smollm3-3b

  bench-qwen3-4b:
    <<: *bench
    image: local/llama-cpp-turboquant:full-cuda-sm75-mmq
    profiles: [bench-qwen3-4b]
    environment:
      <<: *gpu-env
      MODEL_FILE: Qwen3-4B-Q4_K_M.gguf
      OUTPUT_DIR: /results
      VARIANT: qwen3-4b