Initial commit: tuned multi-model llama.cpp stack
- 5 models: SmolLM3-3B, Gemma4-E2B/E4B, Qwen3-4B, Qwen3.5-9B
- TurboQuant image (FORCE_MMQ): +6-11% free speed on Turing GPUs
- Bigctx profiles (-nkvo KV in RAM): 2-16x context gain
- turbo2 KV: 2x smaller, benchmarked against PPL quality gate
- Per-model env files with justified parameters
- kv_quant_test.sh + cpu_ctx_test.sh benchmark scripts
- docs/FINDINGS.md: surprises, pitfalls, recommendations
- docs/ARCHITECTURE.md: compose + test script design
This commit is contained in:
175
scripts/benchmark_models.sh
Normal file
175
scripts/benchmark_models.sh
Normal file
@@ -0,0 +1,175 @@
|
||||
#!/bin/bash
# Benchmark all 4 new models on GTX 1650 Ti (3717 MiB VRAM)
# Priority: max context size > tg speed
# Runs inside ghcr.io/ggml-org/llama.cpp:full-cuda (build b9014, no -c flag)
#
# Architecture context limits (from GGUF metadata):
#   SmolLM3-3B : 65536  (full attention, KV-limited to ~28K in practice)
#   Gemma4-E2B : 131072 (hybrid: sliding_window=512 → huge ctx possible)
#   Gemma4-E4B : 131072 (hybrid: sliding_window=512)
#   Qwen3-4B   : 40960  (full attention, KV-limited to ~9K in practice)
#
# NOTE: llama-bench b9014 has NO -c flag. Context is set by -p (prompt tokens).
#       -p N -n G allocates KV for N+G tokens. OOM = exit!=0 or error in stdout.
# NOTE(review): llama-bench usually runs the -p and -n tests separately, each
#       with its own context, so the N+G figure may be an over-estimate —
#       confirm against this build.
#
# Environment:
#   MODEL_DIR   directory holding the GGUF files (default: /models)

# errexit (-e) is deliberately not enabled: a failing llama-bench run is an
# expected outcome that the benchmark helpers inspect through $?.
set -uo pipefail

MODEL_DIR=${MODEL_DIR:-/models}

M_SMOL="$MODEL_DIR/HuggingFaceTB_SmolLM3-3B-Q4_K_M.gguf"
M_E2B="$MODEL_DIR/google_gemma-4-E2B-it-Q4_K_M.gguf"
M_E4B="$MODEL_DIR/google_gemma-4-E4B-it-Q4_K_M.gguf"
M_Q3="$MODEL_DIR/Qwen3-4B-Q4_K_M.gguf"

# -- CSV column detection --
# 1-based column indices of avg_ts / n_gen / n_prompt in llama-bench CSV output;
# 0 means "not detected yet". Helpers invoked through $(...) run in a subshell,
# so a value detected there does not persist and is simply detected again.
TS_COL=0; NG_COL=0; NP_COL=0

#######################################
# Locate the llama-bench CSV header (the first line that starts with
# "build_commit" once double quotes are stripped) and record the 1-based
# positions of the avg_ts, n_gen and n_prompt columns.
# Globals:   TS_COL, NG_COL, NP_COL (written; 0 when header/column is absent)
# Arguments: $1 - captured llama-bench output (CSV, possibly mixed with logs)
#######################################
detect_cols() {
  local found
  # One awk pass: strip a trailing CR and all quotes (which re-splits the
  # fields), then scan the header once for the three column names.
  found=$(awk -F',' '
    { sub(/\r$/, ""); gsub(/"/, "") }
    /^build_commit/ {
      for (i = 1; i <= NF; i++) {
        if ($i == "avg_ts" && !ts) ts = i
        else if ($i == "n_gen" && !ng) ng = i
        else if ($i == "n_prompt" && !np) np = i
      }
      print ts + 0, ng + 0, np + 0
      exit
    }' <<< "$1")
  # No header at all -> awk printed nothing -> every index falls back to 0.
  IFS=' ' read -r TS_COL NG_COL NP_COL <<< "${found:-0 0 0}"
}

#######################################
# Extract prompt-processing (pp) and token-generation (tg) throughput from
# llama-bench CSV output.
# Globals:   TS_COL, NP_COL, NG_COL (read; detect_cols is called if TS_COL is 0)
# Arguments: $1 - captured llama-bench output
# Outputs:   "pp_speed pp / tg_speed tg t/s" on stdout (no trailing newline);
#            a speed that cannot be determined is printed as "-"
#######################################
parse_speeds() {
  local out=$1 speeds pp tg
  [[ "${TS_COL:-0}" == 0 ]] && detect_cols "$out"

  # A column index of 0 would make awk read $0 (the whole line) instead of a
  # field, so report "unknown" rather than a bogus number.
  if [[ "${TS_COL:-0}" == 0 || "${NP_COL:-0}" == 0 || "${NG_COL:-0}" == 0 ]]; then
    printf '%s pp / %s tg t/s' "-" "-"
    return
  fi

  # pp row: n_prompt > 0 and n_gen == 0; tg row: n_gen > 0 and n_prompt == 0.
  # Only the first row of each kind is used; the first input line is skipped.
  speeds=$(awk -F',' -v tc="$TS_COL" -v np="$NP_COL" -v ng="$NG_COL" '
    { gsub(/"/, "") }
    NR == 1 { next }
    pp == "" && $np + 0 > 0 && $ng + 0 == 0 { pp = sprintf("%.0f", $tc + 0) }
    tg == "" && $ng + 0 > 0 && $np + 0 == 0 { tg = sprintf("%.1f", $tc + 0) }
    END { print (pp == "" ? "-" : pp), (tg == "" ? "-" : tg) }' <<< "$out")
  IFS=' ' read -r pp tg <<< "$speeds"
  printf '%s pp / %s tg t/s' "${pp:--}" "${tg:--}"
}

#######################################
# Decide whether a llama-bench run failed.
# Arguments: $1 - captured output (stdout+stderr) of the run
#            $2 - exit status of the run
# Returns:   0 if the run failed (non-zero, missing or non-numeric status, or
#            a known error marker in the output), 1 otherwise
#######################################
is_oom() {
  local out=$1 ec=${2:-}

  # A missing or garbled status cannot be trusted: count it as a failure.
  [[ "$ec" =~ ^[0-9]+$ ]] || return 0
  (( 10#$ec != 0 )) && return 0

  # Here-string instead of `printf | grep -q`: under `set -o pipefail` an early
  # grep exit can kill the writer with SIGPIPE, making the pipeline look like
  # "no match" even though the marker was found.
  if grep -qiE 'failed to create context|out of memory|GGML_ASSERT|error:' <<< "$out"; then
    return 0
  fi
  return 1
}

#######################################
# bench MODEL NGL [llama-bench extra args...]
# Standard speed benchmark (callers pass e.g. -p 512 -n 128: small context).
# Globals:   BENCH_TIMEOUT (read, optional) - seconds before the run is
#            killed; defaults to 250
# Arguments: $1 - model path, $2 - value for -ngl, rest - passed to llama-bench
# Outputs:   "OOM" if the run failed, otherwise the parse_speeds summary;
#            a timeout additionally produces a warning on stderr
#######################################
bench() {
  local model=$1 ngl=$2
  shift 2
  local limit=${BENCH_TIMEOUT:-250} out ec

  out=$(timeout "$limit" /app/llama-bench -m "$model" -ngl "$ngl" \
    -b 512 -ub 128 -o csv "$@" 2>&1)
  ec=$?

  # timeout(1) exits with 124 when it had to kill the command. Callers match on
  # the literal "OOM", so keep that on stdout but say what really happened.
  if (( ec == 124 )); then
    printf 'bench: llama-bench timed out after %ss (reported as OOM)\n' "$limit" >&2
  fi

  if is_oom "$out" "$ec"; then
    echo "OOM"
    return
  fi

  # parse_speeds performs column detection itself when it is still pending.
  parse_speeds "$out"
}

#######################################
# bench_ctx MODEL NGL CTX
# Context-capacity test: runs llama-bench with -p CTX -n 1 (one repetition, no
# warmup). Tries -fa 1 first and falls back to -fa 0.
# Globals:   TS_COL, NP_COL (read; detect_cols is called if TS_COL is 0)
#            BENCH_TIMEOUT (read, optional) - seconds per attempt, default 250
# Arguments: $1 - model path, $2 - value for -ngl, $3 - prompt/context size
# Outputs:   "OK (N pp t/s fa=N)" or "OOM" on stdout (no trailing newline for
#            the OK form); a timeout additionally warns on stderr
#######################################
bench_ctx() {
  local model=$1 ngl=$2 ctx=$3
  local limit=${BENCH_TIMEOUT:-250} out ec fa fa_used=

  for fa in 1 0; do
    out=$(timeout "$limit" /app/llama-bench -m "$model" -ngl "$ngl" \
      -p "$ctx" -n 1 -r 1 --no-warmup \
      -b 512 -ub 128 -fa "$fa" -t 6 -o csv 2>&1)
    ec=$?

    # timeout(1) exits with 124 when it killed the run; callers only know
    # "OOM", so make the real cause visible on stderr.
    if (( ec == 124 )); then
      printf 'bench_ctx: ctx=%s fa=%s timed out after %ss (reported as OOM)\n' \
        "$ctx" "$fa" "$limit" >&2
    fi

    if ! is_oom "$out" "$ec"; then
      fa_used=$fa
      break
    fi
  done

  # Both attempts failed.
  if [[ -z "$fa_used" ]]; then
    echo "OOM"
    return
  fi

  [[ "${TS_COL:-0}" == 0 ]] && detect_cols "$out"

  local pp=
  # With an undetected column awk would read $0 instead of a field; leave the
  # speed unknown in that case.
  if [[ "${TS_COL:-0}" != 0 && "${NP_COL:-0}" != 0 ]]; then
    pp=$(awk -F',' -v tc="$TS_COL" -v np="$NP_COL" '
      { gsub(/"/, "") }
      NR > 1 && $np + 0 > 0 { printf "%.0f", $tc + 0; exit }' <<< "$out")
  fi
  printf "OK (%s pp t/s fa=%s)" "${pp:--}" "${fa_used:-?}"
}

# Report banner: horizontal rule, start timestamp and GPU identification.
HR="======================================================================"
echo "$HR"
echo "LLAMA.CPP BENCHMARK — ALL MODELS — $(date)"
# Falls back to "unknown" when nvidia-smi is unavailable or fails.
echo "GPU: $(nvidia-smi --query-gpu=name,memory.total --format=csv,noheader 2>/dev/null || echo unknown)"
echo "$HR"
echo ""

# ── Phase 1: Baseline (small context) ────────────────────────────────────────
# Every model fully offloaded (ngl=99); one summary line per model.
echo "=== Phase 1: Baseline (ngl=99, p=512 n=128 r=2, t=6, fa=0) ==="
for entry in "SmolLM3-3B:$M_SMOL" "Gemma4-E2B:$M_E2B" "Gemma4-E4B:$M_E4B" "Qwen3-4B:$M_Q3"; do
  # entry is "label:path"; split on the first colon only.
  lbl="${entry%%:*}"; mdl="${entry#*:}"
  printf " %-14s %s\n" "$lbl" "$(bench "$mdl" 99 -p 512 -n 128 -r 2 -t 6 -fa 0)"
done
echo ""

# ── Phase 2: Gemma4-E4B ngl sweep ────────────────────────────────────────────
# Raises -ngl step by step and stops at the first failing value; the last
# value that worked is kept in best_e4b_ngl (0 if none worked).
echo "=== Phase 2: Gemma4-E4B ngl sweep (p=16 n=64 r=1 t=6 fa=0) ==="
echo " 5.1GB model on 3.7GB VRAM — finding highest ngl before OOM"
best_e4b_ngl=0
for ngl in 0 4 8 12 16 20 24 28 32 36 42; do
  ts=$(bench "$M_E4B" "$ngl" -p 16 -n 64 -r 1 -t 6 -fa 0)
  printf " ngl=%-3s %s\n" "$ngl" "$ts"
  [[ "$ts" == OOM ]] && break
  best_e4b_ngl=$ngl
done
echo " → best_e4b_ngl=$best_e4b_ngl"
echo ""

# ── Phase 3: Max context sweep ────────────────────────────────────────────────
# For each model, grow the context until bench_ctx reports OOM and remember the
# largest size that worked in BEST_CTX[key].
echo "=== Phase 3: Max context (p=ctx n=1 r=1 no-warmup fa=1) ==="
echo " Gemma4 hybrid attention (sliding_window=512) enables large ctx cheaply."
declare -A BEST_CTX
# Start at 0 so a model that fails even the smallest context is not reported
# as having reached 512.
BEST_CTX[smollm3]=0; BEST_CTX[e2b]=0; BEST_CTX[e4b]=0; BEST_CTX[q3]=0

# entry format: key:label:model_path:ngl
for entry in "smollm3:SmolLM3-3B:$M_SMOL:99" \
             "e2b:Gemma4-E2B:$M_E2B:99" \
             "e4b:Gemma4-E4B:$M_E4B:$best_e4b_ngl" \
             "q3:Qwen3-4B:$M_Q3:99"; do
  IFS=':' read -r key lbl mdl ngl <<< "$entry"
  echo " -- $lbl (ngl=$ngl) --"
  for ctx in 512 1024 2048 4096 8192 12288 16384 24576 32768 49152 65536 98304 131072; do
    ts=$(bench_ctx "$mdl" "$ngl" "$ctx")
    printf " ctx=%-7s %s\n" "$ctx" "$ts"
    [[ "$ts" == OOM ]] && break
    BEST_CTX[$key]=$ctx
  done
  echo " → MAX ctx=${BEST_CTX[$key]}"
done
echo ""

# ── Phase 4: TG speed at max context ─────────────────────────────────────────
# NOTE(review): despite the title, the run below uses -p 512 -n 128; the max
# context found earlier is only printed as a label next to the result. Confirm
# whether a measurement at that depth was intended.
echo "=== Phase 4: TG speed at max context (p=512 n=128 r=2 fa=1 t=6) ==="
# entry format: key:label:model_path:ngl
for entry in "smollm3:SmolLM3-3B:$M_SMOL:99" \
             "e2b:Gemma4-E2B:$M_E2B:99" \
             "e4b:Gemma4-E4B:$M_E4B:$best_e4b_ngl" \
             "q3:Qwen3-4B:$M_Q3:99"; do
  IFS=':' read -r key lbl mdl ngl <<< "$entry"
  ts=$(bench "$mdl" "$ngl" -p 512 -n 128 -r 2 -fa 1 -t 6)
  # ":-?" keeps `set -u` from aborting the script if a key was never recorded.
  printf " %-14s max_ctx=%-7s %s\n" "$lbl" "${BEST_CTX[$key]:-?}" "$ts"
done
echo ""

# ── Phase 5: E4B thread sweep (CPU split model — threads matter) ──────────────
# Same benchmark repeated with different -t values at the ngl chosen earlier.
echo "=== Phase 5: Gemma4-E4B thread sweep (p=512 n=128 r=2 fa=0 ngl=$best_e4b_ngl) ==="
for t in 1 2 3 4 5 6 8 10 12; do
  ts=$(bench "$M_E4B" "$best_e4b_ngl" -p 512 -n 128 -r 2 -fa 0 -t "$t")
  printf " t=%-3s %s\n" "$t" "$ts"
done
echo ""

# ── Phase 6: Flash attention comparison ──────────────────────────────────────
# Runs the same benchmark with -fa 0 and -fa 1 and prints both side by side.
echo "=== Phase 6: Flash attention fa=0 vs fa=1 (p=512 n=128 r=2 t=6) ==="
echo " Gemma4 hybrid attention may not support FA — testing both."
# entry format: key:label:model_path:ngl (the key is not needed in this phase)
for entry in "smollm3:SmolLM3-3B:$M_SMOL:99" \
             "e2b:Gemma4-E2B:$M_E2B:99" \
             "e4b:Gemma4-E4B:$M_E4B:$best_e4b_ngl" \
             "q3:Qwen3-4B:$M_Q3:99"; do
  IFS=':' read -r _ lbl mdl ngl <<< "$entry"
  ts0=$(bench "$mdl" "$ngl" -p 512 -n 128 -r 2 -fa 0 -t 6)
  ts1=$(bench "$mdl" "$ngl" -p 512 -n 128 -r 2 -fa 1 -t 6)
  printf " %-14s fa=0: %-30s fa=1: %s\n" "$lbl" "$ts0" "$ts1"
done
echo ""

# Closing banner with the completion timestamp.
echo "$HR"
echo "BENCHMARK COMPLETE: $(date)"
echo "$HR"
Reference in New Issue
Block a user