# ==============================================================================
# Qwen3.5-9B-Claude-4.6-Opus-Reasoning-Distilled-v2 Q8_0 — TurboQuant SM75
# Architecture: 32 layers (8 full-attn + 24 linear-attn), GQA 4 KV heads
# Model size: 8.86 GB | VRAM usage: ~3.4 GB (11 layers on GPU)
# RAM usage: ~5.5 GB (21 layers pinned via mlock)
#
# Benchmark results in tokens/s (turbo2 KV, ngl=11, fa=1; t = thread count):
#   t=1→0.86  t=2→1.62  t=3→2.25  t=4→2.94  t=5→3.56  t=6→4.38 ← best
#   t=8→4.22  t=12→3.61  (hyperthreading hurts above 6)
# Theoretical ceiling: ~5.1 t/s (45 GB/s RAM BW ÷ 8.86 GB model)
# Achieved: 4.38 t/s = 86% efficiency
#
# Download:
#   huggingface-cli download Jackrong/Qwen3.5-9B-Claude-4.6-Opus-Reasoning-Distilled-v2-GGUF \
#     Qwen3.5-9B.Q8_0.gguf --local-dir ./models/
#
# Format: one KEY=VALUE per line, full-line `#` comments only. Do not append
# trailing comments to a value line; depending on the parser they may become
# part of the value.
# ==============================================================================

# GGUF file name as downloaded by the command above (into ./models/).
MODEL_FILE=Qwen3.5-9B.Q8_0.gguf

# GPU: 11 layers fit in 3.7 GB VRAM. ngl=12 causes OOM at ctx>2048.
# NOTE(review): the header reports ~3.4 GB VRAM usage; 3.7 GB is presumably the
# usable VRAM budget rather than the measured footprint — confirm.
N_GPU_LAYERS=11

# 32K context fits with turbo2 KV (~104 MiB overhead vs ~3.3 GB for f16)
CTX_SIZE=32768

# t=6 is optimal for i7-10750H (6 physical cores). t>6 uses HT which hurts.
THREADS=6
THREADS_BATCH=6

# NOTE(review): presumably the logical and physical (micro) batch sizes passed
# to the server — confirm against the launcher that reads this file.
BATCH_SIZE=512
UBATCH_SIZE=128

# turbo2: 2-bit KV cache, 6.4× smaller than f16. Requires TurboQuant image.
CACHE_TYPE_K=turbo2
CACHE_TYPE_V=turbo2

# NOTE(review): presumably the number of parallel sequences/slots; the
# benchmark figures above were taken with this configuration — confirm.
PARALLEL=1

# --no-mmap --mlock: pins entire model in RAM (prevents paging, avoids cold reads)
# --flash-attn on: required with turbo2 KV (fa=0 + turbo2 has no speed benefit)
# The value is intentionally unquoted and contains spaces: it is meant to be
# consumed as a raw string by an env-file reader. NOTE(review): if this file is
# ever `source`d by a shell, this line needs quoting — confirm the consumer.
EXTRA_ARGS=--flash-attn on --no-mmap --mlock