# llama-cpp/envs/.env.gemma4-e4b

# ==============================================================================
# Gemma 4 E4B-it Q4_K_M — Google DeepMind (April 2025)
# Architecture: Dense transformer + Per-Layer Embeddings (PLE)
#   - 4.5B effective params (8B total with PLE embedding tables)
#   - 42 layers, hybrid local (512-token window) + global attention
#   - 128K context window
# Model size: ~4.7 GB Q4_K_M  |  file is larger than the 3.7 GB of VRAM, but all
#   42 layers still offload to the GPU (see the N_GPU_LAYERS benchmark notes)
# Modalities: text + image + audio (ASR/translation) + video frames
#
# Download:
#   huggingface-cli download bartowski/google_gemma-4-E4B-it-GGUF \
#     google_gemma-4-E4B-it-Q4_K_M.gguf --local-dir ./models/
#
# NOTE: Verify the exact filename after download — bartowski naming may vary.
#       Check: ls models/google_gemma*
# ==============================================================================

# GGUF file to load. Must match the name of the file fetched by the download
# command in the header (presumably resolved relative to the models directory
# by the launcher — TODO confirm).
MODEL_FILE=google_gemma-4-E4B-it-Q4_K_M.gguf

# Number of model layers offloaded to the GPU.
# Benchmarked 2026-05-05 on GTX 1650 Ti (3717 MiB):
# ALL 42 layers fit on GPU when no other containers hold VRAM!
# ngl sweep: ngl=42 → 133 pp / 32.0 tg t/s  (ngl=28 was only 59/16.5)
#   (pp = prompt processing, tg = token generation; both in tokens/s)
# Max ctx=24576 (hybrid attention, 32K OOM).  fa=1 works (+3% vs fa=0).
# Thread sweep: t=4-6 optimal (GPU-only now, CPU largely idle for tg)
# NOTE(review): if something else is holding VRAM, a lower value is likely
# needed; ngl=28 is the other measured data point above.
N_GPU_LAYERS=42

# Context window in tokens (24576 = 24 * 1024).
# 24K max — hybrid sliding-window keeps most layers' KV tiny
# 32K OOM due to global-attn layers hitting VRAM wall
CTX_SIZE=24576

# CPU thread counts: THREADS for generation, THREADS_BATCH for prompt/batch
# processing (presumably mapped to llama.cpp --threads / --threads-batch by
# the launcher — TODO confirm). 6 is the upper end of the 4-6 range that the
# thread sweep found optimal.
THREADS=6
THREADS_BATCH=6

# Logical batch size (BATCH_SIZE) and physical micro-batch size (UBATCH_SIZE)
# in tokens (presumably llama.cpp --batch-size / --ubatch-size — TODO confirm).
# NOTE(review): the small micro-batch looks chosen to keep compute buffers
# within the limited VRAM — confirm before raising it.
BATCH_SIZE=512
UBATCH_SIZE=128

# KV-cache element types: both keys and values are stored as q4_0.
# NOTE(review): llama.cpp has required flash attention for a quantized V
# cache, so this presumably depends on "--flash-attn on" in EXTRA_ARGS —
# confirm on your build before disabling flash attention.
CACHE_TYPE_K=q4_0
CACHE_TYPE_V=q4_0

# Number of parallel server slots (presumably llama.cpp --parallel — TODO
# confirm). 1 = a single request is served at a time.
PARALLEL=1

# Additional flags appended verbatim to the llama.cpp command line.
# fa=1 confirmed working on hybrid Gemma4 attention
# NOTE(review): the value is unquoted and contains spaces. That is fine for
# Docker / Compose env files, but breaks if this file is ever `source`d by a
# shell — confirm how the launcher reads it.
# NOTE(review): "--flash-attn on" (with a value) and an explicit "--mmap" are
# only accepted by recent llama.cpp builds; mmap is already the default there.
EXTRA_ARGS=--flash-attn on --mmap