# ==============================================================================
# Gemma 4 E4B-it Q4_K_M — Google DeepMind (April 2025)
# NOTE(review): the release date above says 2025 while the benchmark notes
#               further down are dated 2026-05-05 — confirm the release date.
# Architecture: Dense transformer + Per-Layer Embeddings (PLE)
#   - 4.5B effective params (8B total with PLE embedding tables)
#   - 42 layers, hybrid local (512-token window) + global attention
#   - 128K context window
# Model size: ~4.7 GB Q4_K_M (file is larger than the 3.7 GB of VRAM, yet the
#             benchmark below shows all 42 layers fit on the GPU when nothing
#             else is holding VRAM; a CPU split is only needed otherwise)
# Modalities: text + image + audio (ASR/translation) + video frames
#
# Download:
#   huggingface-cli download bartowski/google_gemma-4-E4B-it-GGUF \
#     google_gemma-4-E4B-it-Q4_K_M.gguf --local-dir ./models/
#
# NOTE: Verify the exact filename after download — bartowski naming may vary.
#       Check: ls models/google_gemma*
# ==============================================================================

# GGUF filename; must match the file fetched by the download command above.
MODEL_FILE=google_gemma-4-E4B-it-Q4_K_M.gguf

# Benchmarked 2026-05-05 on GTX 1650 Ti (3717 MiB):
#   ALL 42 layers fit on GPU when no other containers hold VRAM!
#   ngl sweep: ngl=42 → 133 pp / 32.0 tg t/s (ngl=28 was only 59/16.5)
#   Max ctx=24576 (hybrid attention, 32K OOM). fa=1 works (+3% vs fa=0).
#   Thread sweep: t=4-6 optimal (GPU-only now, CPU largely idle for tg)

# Number of layers placed on the GPU (the "ngl" value from the sweep above).
# Lower this if other containers are holding VRAM.
N_GPU_LAYERS=42

# Context length in tokens.
# 24K max — hybrid sliding-window keeps most layers' KV tiny
# 32K OOM due to global-attn layers hitting VRAM wall
CTX_SIZE=24576

# Thread counts; the sweep above found t=4-6 optimal.
THREADS=6
THREADS_BATCH=6

# Batch sizing — presumably logical / physical (micro) batch sizes in tokens;
# TODO confirm against the launcher that reads this file.
BATCH_SIZE=512
UBATCH_SIZE=128

# Presumably the quantization types for the K and V halves of the KV cache;
# TODO confirm against the launcher that reads this file.
CACHE_TYPE_K=q4_0
CACHE_TYPE_V=q4_0

PARALLEL=1

# fa=1 confirmed working on hybrid Gemma4 attention
# NOTE(review): the value contains spaces and is intentionally left unquoted —
#               confirm how the consumer splits it before adding quotes.
EXTRA_ARGS=--flash-attn on --mmap