# ==============================================================================
# Gemma 4 E2B-it Q4_K_M — Google DeepMind (April 2025)
# Architecture: Dense transformer + Per-Layer Embeddings (PLE)
#   - 2.3B effective params (5.1B total with PLE embedding tables)
#   - 35 layers, hybrid local (512-token window) + global attention
#   - 128K context window
# Model size: ~2.9 GB Q4_K_M | Full GPU fit (ngl=99, VRAM ~3.4 GB total)
# Modalities: text + image + audio (ASR/translation) + video frames
#
# Download:
#   huggingface-cli download bartowski/google_gemma-4-E2B-it-GGUF \
#     google_gemma-4-E2B-it-Q4_K_M.gguf --local-dir ./models/
#
# NOTE: Verify the exact filename after download — bartowski naming may vary.
#   Check: ls models/google_gemma*
#
# File format: one KEY=value assignment per line; comments are full lines
# starting with '#'. Do not append comments after a value.
# ==============================================================================

# GGUF file name of the model (see the download command above).
MODEL_FILE=google_gemma-4-E2B-it-Q4_K_M.gguf

# All 35 layers fit in VRAM. PLE layers are small compute, large embedding lookup.
# 99 is simply larger than the layer count, i.e. "offload everything".
N_GPU_LAYERS=99

# Benchmarked 2026-05-05 on GTX 1650 Ti (3717 MiB):
#   - Hybrid sliding-window attention (512-token) keeps the KV cache tiny,
#     so a 32K context fits.
#   - 65K and 131K run out of memory (the full global-attention layers eat
#     VRAM at large context sizes).
#   - Baseline: 350 pp / 64.6 tg t/s | At 32K ctx: 365 pp / 66.8 tg t/s (fa=1)
# NOTE(review): the value below is 24576 (24K), not the 32K that was benchmarked
# as fitting — presumably deliberate headroom for the parallel slots; confirm.
CTX_SIZE=24576

# CPU thread counts: generation and batch/prompt processing respectively.
THREADS=6
THREADS_BATCH=6

# Logical and physical (micro) batch sizes, in tokens.
BATCH_SIZE=512
UBATCH_SIZE=256

# f16 KV cache — the model is small, so KV overhead is negligible even at 32K.
CACHE_TYPE_K=f16
CACHE_TYPE_V=f16

# 2 parallel slots — fast model (66 tg t/s), VRAM headroom available.
PARALLEL=2

# Flash attention (fa=1) confirmed working on hybrid Gemma4 attention
# (+5% vs fa=0).
# NOTE(review): this value contains spaces and is intentionally left unquoted.
# That is fine for readers that take the rest of the line literally, but it
# breaks if this file is ever `source`d by a shell — confirm how the launcher
# reads it before adding quotes.
EXTRA_ARGS=--flash-attn on --mmap