# ==============================================================================
# SmolLM3 3B-it Q4_K_M — HuggingFace (2025)
# Architecture: Decoder-only transformer, GQA + NoPE (3:1 ratio)
#   - 3B params, 11.2T training tokens
#   - 64K native context (128K with YaRN)
# Model size: ~1.9 GB Q4_K_M | Full GPU fit (ngl=99)
# Features: thinking mode (/think /no_think), tool calling, 6 languages,
#           Apache 2.0. AIME 2025: 36.7% in think mode.
#
# Download:
#   huggingface-cli download bartowski/HuggingFaceTB_SmolLM3-3B-GGUF \
#     HuggingFaceTB_SmolLM3-3B-Q4_K_M.gguf --local-dir ./models/
#
# NOTE: Verify exact filename after download:
#   ls models/SmolLM3* models/HuggingFaceTB_SmolLM3*
#
# Format: one KEY=value assignment per line, '#' starts a comment.
# Values containing whitespace must be quoted so the file can be sourced
# by a shell.
# ==============================================================================

# GGUF file name of the model (see the download command above).
MODEL_FILE=HuggingFaceTB_SmolLM3-3B-Q4_K_M.gguf

# All layers fit comfortably — ~1.9 GB leaves ~1.8 GB free for KV + compute
N_GPU_LAYERS=99

# Benchmarked 2026-05-05 on GTX 1650 Ti (3717 MiB):
#   Max ctx=24576 (32K OOM). Baseline: 249 pp / 56.8 tg t/s.
#   At 24K ctx with fa=1: 260 pp / 58.3 tg t/s (+2%).
#   Model context limit = 65536, VRAM is the constraint here.
CTX_SIZE=24576

# CPU thread counts (generation / batch processing).
THREADS=6
THREADS_BATCH=6

# Logical and physical (micro) batch sizes.
BATCH_SIZE=512
UBATCH_SIZE=256

# KV cache quantization types for keys and values.
CACHE_TYPE_K=q8_0
CACHE_TYPE_V=q8_0

# 2 parallel slots — less headroom at 24K ctx vs original 16K estimate
PARALLEL=2

# fa=1 gives a small but consistent improvement
# (56.8 -> 58.3 tg t/s in the benchmark above).
# The value is quoted because it contains spaces: unquoted, a shell sourcing
# this file would try to run `on --mmap` as a command and leave EXTRA_ARGS
# unset. NOTE(review): presumably the launcher word-splits this value into
# separate arguments — confirm against the consuming script.
EXTRA_ARGS="--flash-attn on --mmap"