# llama-cpp/envs/.env.qwen3-4b

# ==============================================================================
# Qwen3-4B-Instruct Q4_K_M — Alibaba (May 2025)
# Architecture: Decoder-only transformer, GQA
#   - 4B params, 36 layers
#   - 32K native context (128K with YaRN)
# Model size: ~2.4 GB Q4_K_M  |  Full GPU fit (ngl=99)
# Features: thinking mode (/think /no_think), tool calling, 119 languages,
#           Apache 2.0. Strong code + reasoning. Best ecosystem (most fine-tunes).
#
# Download:
#   huggingface-cli download bartowski/Qwen3-4B-GGUF \
#     Qwen3-4B-Q4_K_M.gguf --local-dir ./models/
#
# NOTE: Verify exact filename after download:
#       ls models/Qwen3-4B*
# ==============================================================================

# File name of the quantized GGUF model (Q4_K_M build named in the header).
# NOTE(review): presumably resolved relative to the ./models/ download
# directory used in the header's download command — confirm against the launcher.
MODEL_FILE=Qwen3-4B-Q4_K_M.gguf

# All layers fit — ~2.4 GB leaves ~1.3 GB free for KV + compute
# 99 is larger than the model's layer count, i.e. "offload every layer" (ngl=99).
N_GPU_LAYERS=99

# Benchmarked 2026-05-05 on GTX 1650 Ti (3717 MiB):
# Max ctx=8192 (12K OOM). Full attention — all KV must fit at full ctx.
# GGUF native limit=40960, but VRAM walls at ~8K.
# Baseline: 181 pp / 41.6 tg t/s. At 8K ctx fa=1: 191 pp / 44.3 tg t/s (+6%).
# Pinned to the benchmarked maximum: 16384 is above the 12K point that already
# ran out of memory on this GPU. Re-benchmark (and update the numbers above)
# before raising this value.
CTX_SIZE=8192

# CPU thread counts: THREADS for generation, THREADS_BATCH for prompt/batch
# processing. NOTE(review): presumably sized to the host's physical core
# count — confirm for the target machine.
THREADS=6
THREADS_BATCH=6

# Logical batch size and physical micro-batch size, in tokens.
# NOTE(review): presumably forwarded as llama.cpp --batch-size / --ubatch-size —
# confirm against the launcher that reads this file.
BATCH_SIZE=512
UBATCH_SIZE=256

# KV cache stored as 4-bit (q4_0) for both keys and values to save VRAM.
# NOTE(review): llama.cpp needs flash attention enabled for a quantized V cache,
# so keep "--flash-attn on" in EXTRA_ARGS while CACHE_TYPE_V is not f16 — confirm
# for the llama.cpp build in use.
CACHE_TYPE_K=q4_0
CACHE_TYPE_V=q4_0

# 1 parallel slot — limited VRAM at 8K ctx with 2.4GB model
PARALLEL=1

# fa=1 gives +6% tg speed on full-attention Qwen3
# Additional command-line flags; presumably appended verbatim to the server
# command by the launcher — confirm.
# NOTE(review): the value contains unquoted spaces. That is fine for env-file
# parsers that take the rest of the line as the value, but it breaks if this
# file is sourced by a POSIX shell — verify how the file is loaded.
EXTRA_ARGS=--flash-attn on --mmap