# ==============================================================================
# Qwen3-4B Q4_K_M — bigctx variant (KV in RAM via -nkvo)
# Benchmarked 2026-05-06: -nkvo max ctx=24576 (+8K vs pure-GPU 16384)
# Baseline TG: ~39 t/s (empty KV).
# Use this profile when you need >16K context; otherwise use qwen3-4b.
# ==============================================================================
#
# Format: one KEY=VALUE assignment per line, no spaces around '='.
# Values containing whitespace must be quoted so the file can be sourced by a
# shell without the remaining words being executed as a command.

# Model weights file name.
MODEL_FILE=Qwen3-4B-Q4_K_M.gguf

# Layer count handed to the GPU.
# NOTE(review): 99 presumably means "offload every layer" — confirm against
# the launcher that consumes this profile.
N_GPU_LAYERS=99

# Context window; equals the benchmarked -nkvo maximum noted in the header.
CTX_SIZE=24576

# Thread counts for generation and for batch processing.
THREADS=6
THREADS_BATCH=6

# Logical and physical (micro) batch sizes.
BATCH_SIZE=512
UBATCH_SIZE=256

# KV cache element types for keys and values.
CACHE_TYPE_K=q4_0
CACHE_TYPE_V=q4_0

# Number of parallel sequences/slots.
PARALLEL=1

# Extra command-line flags, kept as one string.
# Quoted on purpose: unquoted, a shell would assign only "--flash-attn" as a
# temporary environment variable and then try to run `on` as a command.
# --no-kv-offload is presumably the long form of the -nkvo flag this profile
# is named for — verify against the server's --help output.
# NOTE(review): assumes whatever loads this profile strips surrounding quotes
# (as shell sourcing does) — confirm if it is parsed by a non-shell reader.
EXTRA_ARGS="--flash-attn on --mmap --no-kv-offload"