# ==============================================================================
# Qwen3-4B-Instruct Q4_K_M — Alibaba (May 2025)
# Architecture: Decoder-only transformer, GQA
#   - 4B params, 36 layers
#   - 32K native context (128K with YaRN)
# Model size: ~2.4 GB Q4_K_M | Full GPU fit (ngl=99)
# Features: thinking mode (/think /no_think), tool calling, 119 languages,
#   Apache 2.0. Strong code + reasoning. Best ecosystem (most fine-tunes).
#
# Download:
#   huggingface-cli download bartowski/Qwen3-4B-GGUF \
#     Qwen3-4B-Q4_K_M.gguf --local-dir ./models/
#
# NOTE: Verify exact filename after download:
#   ls models/Qwen3-4B*
#
# File format: one KEY=VALUE assignment per line, full-line '#' comments only.
# ==============================================================================
MODEL_FILE=Qwen3-4B-Q4_K_M.gguf

# All layers fit — ~2.4 GB leaves ~1.3 GB free for KV + compute
N_GPU_LAYERS=99

# Benchmarked 2026-05-05 on GTX 1650 Ti (3717 MiB):
#   Max ctx=8192 (12K OOM). Full attention — all KV must fit at full ctx.
#   GGUF native limit=40960, but VRAM walls at ~8K.
#   Baseline: 181 pp / 41.6 tg t/s. At 8K ctx fa=1: 191 pp / 44.3 tg t/s (+6%).
# CTX_SIZE is pinned to the benchmarked maximum above; anything at or beyond
# 12K was recorded as out-of-memory on this GPU.
# NOTE(review): if a larger context has since been validated with the q4_0 KV
# cache below, raise this value AND update the benchmark notes and the
# PARALLEL comment so they agree.
CTX_SIZE=8192
THREADS=6
THREADS_BATCH=6
BATCH_SIZE=512
UBATCH_SIZE=256

# KV cache stored quantized as q4_0 for both K and V.
# NOTE(review): presumably chosen to shrink KV VRAM use; the benchmark notes
# above do not say which cache type they were measured with — confirm.
CACHE_TYPE_K=q4_0
CACHE_TYPE_V=q4_0

# 1 parallel slot — limited VRAM at 8K ctx with 2.4GB model
PARALLEL=1

# fa=1 gives +6% tg speed on full-attention Qwen3
# NOTE(review): the value is unquoted and contains spaces. That is fine for a
# line-based env loader, but would break if this file is sourced by a POSIX
# shell — confirm how the launcher reads it before adding quotes.
EXTRA_ARGS=--flash-attn on --mmap