#!/bin/bash
# Benchmark all 4 new models on GTX 1650 Ti (3717 MiB VRAM)
# Priority: max context size > tg speed
# Runs inside ghcr.io/ggml-org/llama.cpp:full-cuda (build b9014, no -c flag)
#
# Architecture context limits (from GGUF metadata):
#   SmolLM3-3B : 65536  (full attention, KV-limited to ~28K in practice)
#   Gemma4-E2B : 131072 (hybrid: sliding_window=512 → huge ctx possible)
#   Gemma4-E4B : 131072 (hybrid: sliding_window=512)
#   Qwen3-4B   : 40960  (full attention, KV-limited to ~9K in practice)
#
# NOTE: llama-bench b9014 has NO -c flag. Context is set by -p (prompt tokens).
#       -p N -n G allocates KV for N+G tokens.
#       A run counts as failed when its exit status is non-zero or an error
#       marker appears in its output. Exit status 124 (timeout(1) expiry) is
#       reported as TIMEOUT; every other failure is reported as OOM.
#
# Optional environment overrides:
#   LLAMA_BENCH    path to the llama-bench binary   (default /app/llama-bench)
#   BENCH_TIMEOUT  per-run limit passed to timeout  (default 250)

# No -e on purpose: failing benchmark runs are expected and handled explicitly.
set -uo pipefail

LLAMA_BENCH=${LLAMA_BENCH:-/app/llama-bench}
BENCH_TIMEOUT=${BENCH_TIMEOUT:-250}

readonly M_SMOL="/models/HuggingFaceTB_SmolLM3-3B-Q4_K_M.gguf"
readonly M_E2B="/models/google_gemma-4-E2B-it-Q4_K_M.gguf"
readonly M_E4B="/models/google_gemma-4-E4B-it-Q4_K_M.gguf"
readonly M_Q3="/models/Qwen3-4B-Q4_K_M.gguf"

readonly HR="======================================================================"

# 1-based CSV column indexes; 0 means "not found". Written by detect_cols.
TS_COL=0
NG_COL=0
NP_COL=0

#######################################
# Locate the avg_ts, n_gen and n_prompt columns in llama-bench CSV output.
# The header is the first line that starts with build_commit once double
# quotes are removed; lines before it (e.g. merged stderr) are ignored.
# Globals:   TS_COL, NG_COL, NP_COL (written; 0 when a column is missing)
# Arguments: $1 - captured llama-bench output
#######################################
detect_cols() {
  local cols
  cols=$(awk -F',' '
    { gsub(/"/, "") }
    /^build_commit/ {
      ts = 0; ng = 0; np = 0
      for (i = 1; i <= NF; i++) {
        if (!ts && $i == "avg_ts") ts = i
        else if (!ng && $i == "n_gen") ng = i
        else if (!np && $i == "n_prompt") np = i
      }
      print ts, ng, np
      exit
    }' <<< "$1")
  read -r TS_COL NG_COL NP_COL <<< "${cols:-0 0 0}"
}

#######################################
# Print avg_ts of the first matching CSV row, or nothing if none matches.
# Expects detect_cols to have been run on the same output.
# Globals:   TS_COL, NG_COL, NP_COL (read)
# Arguments: $1 - row kind: pp     (n_prompt>0 and n_gen==0)
#                           tg     (n_gen>0 and n_prompt==0)
#                           prompt (n_prompt>0, n_gen ignored)
#            $2 - awk printf format for the value
#            $3 - captured llama-bench output
#######################################
_pick_speed() {
  local kind=$1 fmt=$2 out=$3
  # Index 0 would make awk read the whole line ($0) instead of a column.
  (( TS_COL > 0 && NP_COL > 0 )) || return 0
  [[ "$kind" == prompt ]] || (( NG_COL > 0 )) || return 0
  awk -F',' -v tc="$TS_COL" -v np="$NP_COL" -v ng="$NG_COL" \
    -v kind="$kind" -v fmt="$fmt" '
    { gsub(/"/, "") }
    NR == 1 { next }
    {
      p = $np + 0
      g = $ng + 0
      hit = (kind == "pp" && p > 0 && g == 0) ||
            (kind == "tg" && g > 0 && p == 0) ||
            (kind == "prompt" && p > 0)
      if (hit) { printf fmt, $tc + 0; exit }
    }' <<< "$out"
}

#######################################
# Format prompt-processing and token-generation speed from CSV output.
# Columns are re-detected on every call, so the result never depends on a
# previously parsed output.
# Arguments: $1 - captured llama-bench output
# Outputs:   "<pp> pp / <tg> tg t/s" (no trailing newline); "-" replaces a
#            value that could not be found
#######################################
parse_speeds() {
  local out=$1 pp tg
  detect_cols "$out"
  pp=$(_pick_speed pp '%.0f' "$out")
  tg=$(_pick_speed tg '%.1f' "$out")
  printf '%s pp / %s tg t/s' "${pp:--}" "${tg:--}"
}

#######################################
# Decide whether a llama-bench run failed.
# Arguments: $1 - captured output, $2 - exit status of the run
# Returns:   0 if the status is non-zero or the output contains a known
#            error marker, 1 otherwise
#######################################
is_oom() {
  local out=$1 ec=$2
  if [ "$ec" -ne 0 ]; then
    return 0
  fi
  # Here-string instead of `printf | grep -q`: under pipefail a writer killed
  # by SIGPIPE after grep's early exit would turn a match into a non-match.
  if grep -qiE 'failed to create context|out of memory|GGML_ASSERT|error:' \
    <<< "$out"; then
    return 0
  fi
  return 1
}

#######################################
# bench MODEL NGL [llama-bench extra args...]
# Run llama-bench with -b 512 -ub 128 -o csv plus the extra arguments.
# Globals:   LLAMA_BENCH, BENCH_TIMEOUT (read)
# Outputs:   parse_speeds text on success, "TIMEOUT" when timeout(1) expired
#            (status 124), "OOM" on any other failure
#######################################
bench() {
  local model=$1 ngl=$2
  shift 2
  local out ec
  out=$(timeout "$BENCH_TIMEOUT" "$LLAMA_BENCH" -m "$model" -ngl "$ngl" \
    -b 512 -ub 128 -o csv "$@" 2>&1)
  ec=$?
  if [ "$ec" -eq 124 ]; then
    echo "TIMEOUT"
    return
  fi
  if is_oom "$out" "$ec"; then
    echo "OOM"
    return
  fi
  parse_speeds "$out"
}

#######################################
# bench_ctx MODEL NGL CTX
# Context-capacity test: runs llama-bench with -p CTX -n 1 -r 1 --no-warmup.
# Tries -fa 1 first and falls back to -fa 0 if that run fails.
# Globals:   LLAMA_BENCH, BENCH_TIMEOUT (read)
# Outputs:   "OK (<pp> pp t/s fa=<N>)" on success; otherwise the label of
#            the last failed attempt: "TIMEOUT" (status 124) or "OOM"
#######################################
bench_ctx() {
  local model=$1 ngl=$2 ctx=$3
  local out ec fa fa_used='' fail=OOM pp
  for fa in 1 0; do
    out=$(timeout "$BENCH_TIMEOUT" "$LLAMA_BENCH" -m "$model" -ngl "$ngl" \
      -p "$ctx" -n 1 -r 1 --no-warmup \
      -b 512 -ub 128 -fa "$fa" -t 6 -o csv 2>&1)
    ec=$?
    if [ "$ec" -eq 124 ]; then
      fail=TIMEOUT
      continue
    fi
    if is_oom "$out" "$ec"; then
      fail=OOM
      continue
    fi
    fa_used=$fa
    break
  done
  if [[ -z "$fa_used" ]]; then
    echo "$fail"
    return
  fi
  detect_cols "$out"
  pp=$(_pick_speed prompt '%.0f' "$out")
  printf 'OK (%s pp t/s fa=%s)' "${pp:--}" "$fa_used"
}

#######################################
# Run all six benchmark phases and print a report to stdout.
#######################################
main() {
  local entry key lbl mdl ngl ctx t ts ts0 ts1

  echo "$HR"
  echo "LLAMA.CPP BENCHMARK — ALL MODELS — $(date)"
  echo "GPU: $(nvidia-smi --query-gpu=name,memory.total --format=csv,noheader 2>/dev/null || echo unknown)"
  echo "$HR"
  echo ""

  # ── Phase 1: Baseline (small context) ──────────────────────────────────────
  echo "=== Phase 1: Baseline (ngl=99, p=512 n=128 r=2, t=6, fa=0) ==="
  for entry in "SmolLM3-3B:$M_SMOL" "Gemma4-E2B:$M_E2B" \
    "Gemma4-E4B:$M_E4B" "Qwen3-4B:$M_Q3"; do
    lbl=${entry%%:*}
    mdl=${entry#*:}
    printf " %-14s %s\n" "$lbl" "$(bench "$mdl" 99 -p 512 -n 128 -r 2 -t 6 -fa 0)"
  done
  echo ""

  # ── Phase 2: Gemma4-E4B ngl sweep ──────────────────────────────────────────
  echo "=== Phase 2: Gemma4-E4B ngl sweep (p=16 n=64 r=1 t=6 fa=0) ==="
  echo " 5.1GB model on 3.7GB VRAM — finding highest ngl before OOM"
  local best_e4b_ngl=0
  for ngl in 0 4 8 12 16 20 24 28 32 36 42; do
    ts=$(bench "$M_E4B" "$ngl" -p 16 -n 64 -r 1 -t 6 -fa 0)
    printf " ngl=%-3s %s\n" "$ngl" "$ts"
    # Stop at the first failing layer count; keep the last one that worked.
    if [[ "$ts" == OOM || "$ts" == TIMEOUT ]]; then
      break
    fi
    best_e4b_ngl=$ngl
  done
  echo " → best_e4b_ngl=$best_e4b_ngl"
  echo ""

  # key:label:model:ngl — shared by phases 3, 4 and 6. Built after phase 2
  # because the E4B entry uses the layer count found there.
  local -a sweep=(
    "smollm3:SmolLM3-3B:$M_SMOL:99"
    "e2b:Gemma4-E2B:$M_E2B:99"
    "e4b:Gemma4-E4B:$M_E4B:$best_e4b_ngl"
    "q3:Qwen3-4B:$M_Q3:99"
  )

  # ── Phase 3: Max context sweep ─────────────────────────────────────────────
  echo "=== Phase 3: Max context (p=ctx n=1 r=1 no-warmup fa=1) ==="
  echo " Gemma4 hybrid attention (sliding_window=512) enables large ctx cheaply."
  local -A best_ctx=([smollm3]=512 [e2b]=512 [e4b]=512 [q3]=512)
  for entry in "${sweep[@]}"; do
    IFS=':' read -r key lbl mdl ngl <<< "$entry"
    echo " -- $lbl (ngl=$ngl) --"
    for ctx in 512 1024 2048 4096 8192 12288 16384 24576 32768 49152 65536 98304 131072; do
      ts=$(bench_ctx "$mdl" "$ngl" "$ctx")
      printf " ctx=%-7s %s\n" "$ctx" "$ts"
      if [[ "$ts" == OOM || "$ts" == TIMEOUT ]]; then
        break
      fi
      best_ctx[$key]=$ctx
    done
    echo " → MAX ctx=${best_ctx[$key]}"
  done
  echo ""

  # ── Phase 4: TG speed at max context ───────────────────────────────────────
  # NOTE(review): the run itself uses p=512 n=128; the max context found in
  # phase 3 is only printed next to the result, not passed to llama-bench.
  echo "=== Phase 4: TG speed at max context (p=512 n=128 r=2 fa=1 t=6) ==="
  for entry in "${sweep[@]}"; do
    IFS=':' read -r key lbl mdl ngl <<< "$entry"
    ts=$(bench "$mdl" "$ngl" -p 512 -n 128 -r 2 -fa 1 -t 6)
    printf " %-14s max_ctx=%-7s %s\n" "$lbl" "${best_ctx[$key]}" "$ts"
  done
  echo ""

  # ── Phase 5: E4B thread sweep (CPU split model — threads matter) ───────────
  echo "=== Phase 5: Gemma4-E4B thread sweep (p=512 n=128 r=2 fa=0 ngl=$best_e4b_ngl) ==="
  for t in 1 2 3 4 5 6 8 10 12; do
    ts=$(bench "$M_E4B" "$best_e4b_ngl" -p 512 -n 128 -r 2 -fa 0 -t "$t")
    printf " t=%-3s %s\n" "$t" "$ts"
  done
  echo ""

  # ── Phase 6: Flash attention comparison ────────────────────────────────────
  echo "=== Phase 6: Flash attention fa=0 vs fa=1 (p=512 n=128 r=2 t=6) ==="
  echo " Gemma4 hybrid attention may not support FA — testing both."
  for entry in "${sweep[@]}"; do
    IFS=':' read -r key lbl mdl ngl <<< "$entry"
    ts0=$(bench "$mdl" "$ngl" -p 512 -n 128 -r 2 -fa 0 -t 6)
    ts1=$(bench "$mdl" "$ngl" -p 512 -n 128 -r 2 -fa 1 -t 6)
    printf " %-14s fa=0: %-30s fa=1: %s\n" "$lbl" "$ts0" "$ts1"
  done
  echo ""

  echo "$HR"
  echo "BENCHMARK COMPLETE: $(date)"
  echo "$HR"
}

main "$@"