# llama-cpp/envs/.env.qwen35-9b

# ==============================================================================
# Qwen3.5-9B-Claude-4.6-Opus-Reasoning-Distilled-v2 Q8_0 — TurboQuant SM75
# Architecture: 32 layers (8 full-attn + 24 linear-attn), GQA 4 KV heads
# Model size: 8.86 GB  |  VRAM usage: ~3.4 GB (11 layers on GPU)
# RAM usage: ~5.5 GB (21 layers pinned via mlock)
#
# Benchmark results (turbo2 KV, ngl=11, fa=1; threads → tokens/s):
#   t=1→0.86  t=2→1.62  t=3→2.25  t=4→2.94  t=5→3.56  t=6→4.38 ← best
#   t=8→4.22  t=12→3.61  (hyperthreading hurts above 6)
# Theoretical ceiling: ~5.1 t/s (45 GB/s RAM BW ÷ 8.86 GB model)
# Achieved: 4.38 t/s = 86% efficiency
#
# Download:
#   huggingface-cli download Jackrong/Qwen3.5-9B-Claude-4.6-Opus-Reasoning-Distilled-v2-GGUF \
#     Qwen3.5-9B.Q8_0.gguf --local-dir ./models/
# ==============================================================================

# GGUF model file name; same file name as in the download command in the header.
# Presumably resolved relative to the models directory — confirm in the launcher.
MODEL_FILE=Qwen3.5-9B.Q8_0.gguf

# GPU offload (ngl): 11 layers fit in 3.7 GB VRAM. ngl=12 causes OOM at ctx>2048.
# NOTE(review): the header reports ~3.4 GB VRAM usage for these 11 layers;
# 3.7 GB here is presumably the usable VRAM budget — confirm.
N_GPU_LAYERS=11

# 32K context fits with turbo2 KV (~104 MiB overhead vs ~3.3 GB for f16)
# NOTE(review): ~3.3 GB vs ~104 MiB is roughly a 32× gap, which does not match
# the 6.4× ratio quoted for turbo2 further down — confirm which figures were measured.
CTX_SIZE=32768

# t=6 is optimal for i7-10750H (6 physical cores). t>6 uses HT which hurts
# (see the benchmark table in the header: throughput peaks at t=6).
# THREADS_BATCH is set to the same value; presumably it maps to llama.cpp's
# batch/prompt-processing thread count — confirm in the launcher.
THREADS=6
THREADS_BATCH=6

# Batch sizes. Presumably forwarded as llama.cpp's logical batch size and
# physical micro-batch size respectively — confirm in the launcher.
BATCH_SIZE=512
UBATCH_SIZE=128

# turbo2: 2-bit KV cache, 6.4× smaller than f16. Requires TurboQuant image.
# K and V caches are both set to the same type.
CACHE_TYPE_K=turbo2
CACHE_TYPE_V=turbo2

# Presumably the number of parallel server slots/sequences (1 = single request
# at a time) — confirm how the launcher consumes this.
PARALLEL=1

# Extra command-line flags (presumably appended verbatim to the server command
# by the launcher — confirm).
# --no-mmap --mlock: loads the model into RAM and pins it (prevents paging,
#   avoids cold reads). The header lists 21 layers pinned this way with ngl=11.
# --flash-attn on: required with turbo2 KV (fa=0 + turbo2 has no speed benefit)
# NOTE(review): the value contains spaces and is unquoted. This relies on the
# consumer reading the value to end of line; sourcing this file from a shell
# would not set EXTRA_ARGS as intended — confirm the consumer before quoting.
EXTRA_ARGS=--flash-attn on --no-mmap --mlock