Initial commit: tuned multi-model llama.cpp stack

- 5 models: SmolLM3-3B, Gemma4-E2B/E4B, Qwen3-4B, Qwen3.5-9B
- TurboQuant image (FORCE_MMQ): +6-11% free speed on Turing GPUs
- Bigctx profiles (-nkvo KV in RAM): 2-16x context gain
- turbo2 KV: 2x smaller, benchmarked against PPL quality gate
- Per-model env files with justified parameters
- kv_quant_test.sh + cpu_ctx_test.sh benchmark scripts
- docs/FINDINGS.md: surprises, pitfalls, recommendations
- docs/ARCHITECTURE.md: compose + test script design
This commit is contained in:
2026-05-06 15:56:40 +02:00
commit 4ad296608b
22 changed files with 2530 additions and 0 deletions

175
scripts/benchmark_models.sh Normal file
View File

@@ -0,0 +1,175 @@
#!/bin/bash
# Benchmark all 4 new models on GTX 1650 Ti (3717 MiB VRAM)
# Priority: max context size > tg speed
# Runs inside ghcr.io/ggml-org/llama.cpp:full-cuda (build b9014, no -c flag)
#
# Architecture context limits (from GGUF metadata):
# SmolLM3-3B : 65536 (full attention, KV-limited to ~28K in practice)
# Gemma4-E2B : 131072 (hybrid: sliding_window=512 → huge ctx possible)
# Gemma4-E4B : 131072 (hybrid: sliding_window=512)
# Qwen3-4B : 40960 (full attention, KV-limited to ~9K in practice)
#
# NOTE: llama-bench b9014 has NO -c flag. Context is set by -p (prompt tokens).
# -p N -n G allocates KV for N+G tokens. OOM = exit!=0 or error in stdout.
#
# Optional environment overrides (defaults match the container layout):
#   MODEL_DIR                    directory holding the GGUF files (default: /models)
#   M_SMOL, M_E2B, M_E4B, M_Q3   full path of an individual model file
#
# No `set -e` on purpose: failing llama-bench runs are an expected outcome of
# the sweeps and are classified from the captured exit code and output.
set -uo pipefail
MODEL_DIR="${MODEL_DIR:-/models}"
M_SMOL="${M_SMOL:-${MODEL_DIR}/HuggingFaceTB_SmolLM3-3B-Q4_K_M.gguf}"
M_E2B="${M_E2B:-${MODEL_DIR}/google_gemma-4-E2B-it-Q4_K_M.gguf}"
M_E4B="${M_E4B:-${MODEL_DIR}/google_gemma-4-E4B-it-Q4_K_M.gguf}"
M_Q3="${M_Q3:-${MODEL_DIR}/Qwen3-4B-Q4_K_M.gguf}"
# -- CSV column detection --
# 1-based positions of the columns of interest in llama-bench CSV output.
# 0 means "not detected". Callers re-run detection whenever TS_COL is 0.
TS_COL=0
NG_COL=0
NP_COL=0

# detect_cols OUTPUT
# Finds the CSV header (first line starting with build_commit once double
# quotes are stripped) in OUTPUT and stores the 1-based positions of the
# avg_ts, n_gen and n_prompt columns in TS_COL, NG_COL and NP_COL.
# Any column that is missing (or a missing header) leaves its variable at 0.
detect_cols() {
  local header_line
  header_line=$(sed 's/"//g' <<< "$1" | grep '^build_commit' | head -n 1)

  local -a col_names=()
  IFS=',' read -r -a col_names <<< "$header_line"

  TS_COL=0
  NG_COL=0
  NP_COL=0

  local pos
  for (( pos = 0; pos < ${#col_names[@]}; pos++ )); do
    # Only the first occurrence of each name counts.
    case "${col_names[pos]}" in
      avg_ts)   [ "$TS_COL" -eq 0 ] && TS_COL=$(( pos + 1 )) ;;
      n_gen)    [ "$NG_COL" -eq 0 ] && NG_COL=$(( pos + 1 )) ;;
      n_prompt) [ "$NP_COL" -eq 0 ] && NP_COL=$(( pos + 1 )) ;;
    esac
  done
  return 0
}
# parse_speeds OUTPUT
# Prints "<pp> pp / <tg> tg t/s" (no trailing newline) for one llama-bench
# CSV OUTPUT:
#   pp = avg_ts of the first row with n_prompt > 0 and n_gen == 0, as %.0f
#   tg = avg_ts of the first row with n_gen > 0 and n_prompt == 0, as %.1f
# A value that cannot be found is printed as "-". The first line of OUTPUT is
# always skipped. Column positions come from TS_COL/NP_COL/NG_COL and are
# detected from OUTPUT first when TS_COL is still 0.
parse_speeds() {
  local raw=$1
  if [ "${TS_COL:-0}" = "0" ]; then
    detect_cols "$raw"
  fi

  local prompt_rate="" gen_rate=""
  # One awk pass collects both numbers and prints them space-separated.
  read -r prompt_rate gen_rate < <(
    sed 's/"//g' <<< "$raw" |
      awk -F',' -v tc="$TS_COL" -v np="$NP_COL" -v ng="$NG_COL" '
        NR == 1 { next }
        pp == "" && $np + 0 > 0 && $ng + 0 == 0 { pp = sprintf("%.0f", $tc + 0) }
        tg == "" && $ng + 0 > 0 && $np + 0 == 0 { tg = sprintf("%.1f", $tc + 0) }
        END {
          if (pp == "") pp = "-"
          if (tg == "") tg = "-"
          print pp, tg
        }'
  )
  printf "%s pp / %s tg t/s" "${prompt_rate:--}" "${gen_rate:--}"
}
# is_oom OUTPUT EXIT_CODE
# Decides whether one llama-bench run must be treated as failed ("OOM").
# Returns 0 (failed) when EXIT_CODE is non-zero, or when OUTPUT contains one of
# the known failure markers (case-insensitive): "failed to create context",
# "out of memory", "GGML_ASSERT", "error:". Returns 1 otherwise.
# Note that every non-zero exit code counts, whatever its cause.
is_oom() {
  local out=$1 ec=$2
  if [ "$ec" -ne 0 ]; then
    return 0
  fi
  # Feed grep through a here-string rather than `printf ... | grep -q`:
  # grep -q exits on the first match, and under `set -o pipefail` a writer
  # killed by SIGPIPE would turn a successful match into a failed pipeline,
  # i.e. a real error would be reported as "not OOM".
  if grep -qiE 'failed to create context|out of memory|GGML_ASSERT|error:' <<< "$out"; then
    return 0
  fi
  return 1
}
# bench MODEL NGL [llama-bench extra args...]
# Runs one llama-bench measurement (batch 512, ubatch 128, CSV output) with a
# 250 s timeout and prints either "OOM" or the parse_speeds summary
# ("<pp> pp / <tg> tg t/s"). Callers pass -p/-n/-r/-t/-fa as extra args.
#
# stdout is the result consumed by callers. When the run is classified as
# failed because of a timeout or because the binary could not be started, a
# one-line diagnostic goes to stderr, so that such a failure is not mistaken
# for a genuine out-of-memory condition.
bench() {
  local model=$1 ngl=$2
  shift 2
  local out ec
  out=$(timeout 250 /app/llama-bench -m "$model" -ngl "$ngl" \
    -b 512 -ub 128 -o csv "$@" 2>&1)
  ec=$?
  if is_oom "$out" "$ec"; then
    case "$ec" in
      124)
        echo "bench: timed out after 250s, reported as OOM (model=$model ngl=$ngl)" >&2
        ;;
      126 | 127)
        echo "bench: /app/llama-bench could not be executed (exit $ec), reported as OOM" >&2
        ;;
    esac
    echo "OOM"
    return
  fi
  # parse_speeds detects the CSV columns itself when they are still unknown.
  parse_speeds "$out"
}
# bench_ctx MODEL NGL CTX
# Context-capacity test: runs llama-bench with -p CTX -n 1 -r 1 --no-warmup
# (per the file header, -p is how this build sizes the KV cache).
# Tries flash attention on (-fa 1) first and falls back to -fa 0.
# Prints "OK (<pp> pp t/s fa=<0|1>)" for the first attempt that is not
# classified as failed by is_oom, or "OOM" when both attempts fail.
# <pp> is "-" when the prompt speed cannot be read from the CSV output.
# A timeout (exit 124) is reported on stderr, since it also counts as failure.
bench_ctx() {
  local model=$1 ngl=$2 ctx=$3
  local out ec fa fa_used=""
  for fa in 1 0; do
    out=$(timeout 250 /app/llama-bench -m "$model" -ngl "$ngl" \
      -p "$ctx" -n 1 -r 1 --no-warmup \
      -b 512 -ub 128 -fa "$fa" -t 6 -o csv 2>&1)
    ec=$?
    if ! is_oom "$out" "$ec"; then
      fa_used=$fa
      break
    fi
    if [ "$ec" -eq 124 ]; then
      echo "bench_ctx: timed out after 250s, treated as OOM (ctx=$ctx fa=$fa)" >&2
    fi
  done
  if [ -z "$fa_used" ]; then
    echo "OOM"
    return
  fi

  [ "${TS_COL:-0}" = "0" ] && detect_cols "$out"

  local pp=""
  # With an undetected column the awk field reference would degrade to $0
  # (the whole line), so only parse when both positions are known.
  if [ "${TS_COL:-0}" != "0" ] && [ "${NP_COL:-0}" != "0" ]; then
    pp=$(printf '%s\n' "$out" | sed 's/"//g' |
      awk -F',' -v tc="$TS_COL" -v np="$NP_COL" \
        'NR>1 && $np+0>0 {printf "%.0f", $tc+0; exit}')
  fi
  printf "OK (%s pp t/s fa=%s)" "${pp:--}" "$fa_used"
}
# Report banner: rule, title with start time, GPU name/memory as reported by
# nvidia-smi ("unknown" when the query fails), rule, blank line.
HR="======================================================================"
printf '%s\n' \
  "$HR" \
  "LLAMA.CPP BENCHMARK — ALL MODELS — $(date)" \
  "GPU: $(nvidia-smi --query-gpu=name,memory.total --format=csv,noheader 2>/dev/null || echo unknown)" \
  "$HR" \
  ""
# ── Phase 1: Baseline (small context) ────────────────────────────────────────
# One small-context speed run per model, all layers requested on the GPU
# (ngl=99), flash attention off.
printf '%s\n' "=== Phase 1: Baseline (ngl=99, p=512 n=128 r=2, t=6, fa=0) ==="
phase1_labels=("SmolLM3-3B" "Gemma4-E2B" "Gemma4-E4B" "Qwen3-4B")
phase1_models=("$M_SMOL" "$M_E2B" "$M_E4B" "$M_Q3")
for idx in "${!phase1_labels[@]}"; do
  ts=$(bench "${phase1_models[idx]}" 99 -p 512 -n 128 -r 2 -t 6 -fa 0)
  printf " %-14s %s\n" "${phase1_labels[idx]}" "$ts"
done
printf '\n'
# ── Phase 2: Gemma4-E4B ngl sweep ────────────────────────────────────────────
# Raises the number of offloaded layers step by step and stops at the first
# run reported as OOM. best_e4b_ngl keeps the last value that still worked
# (0 when even the first step failed) and is reused by the later phases.
printf '%s\n' \
  "=== Phase 2: Gemma4-E4B ngl sweep (p=16 n=64 r=1 t=6 fa=0) ===" \
  " 5.1GB model on 3.7GB VRAM — finding highest ngl before OOM"
best_e4b_ngl=0
for ngl in 0 4 8 12 16 20 24 28 32 36 42; do
  ts=$(bench "$M_E4B" "$ngl" -p 16 -n 64 -r 1 -t 6 -fa 0)
  printf " ngl=%-3s %s\n" "$ngl" "$ts"
  case "$ts" in
    OOM) break ;;
  esac
  best_e4b_ngl=$ngl
done
printf '%s\n' " → best_e4b_ngl=$best_e4b_ngl" ""
# ── Phase 3: Max context sweep ────────────────────────────────────────────────
# For every model, grows the context step by step until bench_ctx reports OOM.
# BEST_CTX[key] holds the largest context that still succeeded. It starts at 0
# so that a model failing even the smallest step (ctx=512) is reported as
# "MAX ctx=0" instead of being credited with a context it never reached.
echo "=== Phase 3: Max context (p=ctx n=1 r=1 no-warmup fa=1) ==="
echo " Gemma4 hybrid attention (sliding_window=512) enables large ctx cheaply."
declare -A BEST_CTX
BEST_CTX[smollm3]=0; BEST_CTX[e2b]=0; BEST_CTX[e4b]=0; BEST_CTX[q3]=0
# Entry format: key:label:model_path:ngl
for entry in "smollm3:SmolLM3-3B:$M_SMOL:99" \
             "e2b:Gemma4-E2B:$M_E2B:99" \
             "e4b:Gemma4-E4B:$M_E4B:$best_e4b_ngl" \
             "q3:Qwen3-4B:$M_Q3:99"; do
  IFS=':' read -r key lbl mdl ngl <<< "$entry"
  echo " -- $lbl (ngl=$ngl) --"
  for ctx in 512 1024 2048 4096 8192 12288 16384 24576 32768 49152 65536 98304 131072; do
    ts=$(bench_ctx "$mdl" "$ngl" "$ctx")
    printf " ctx=%-7s %s\n" "$ctx" "$ts"
    # Stop at the first failure; larger contexts are not attempted.
    [[ "$ts" == OOM ]] && break
    BEST_CTX[$key]=$ctx
  done
  echo " → MAX ctx=${BEST_CTX[$key]}"
done
echo ""
# ── Phase 4: TG speed at max context ─────────────────────────────────────────
# Speed run per model with flash attention on, printed next to the maximum
# context found in Phase 3.
# NOTE(review): despite the title, the run itself uses -p 512 -n 128; the
# Phase 3 context is only printed alongside and is not passed to llama-bench.
printf '%s\n' "=== Phase 4: TG speed at max context (p=512 n=128 r=2 fa=1 t=6) ==="
# Entry format: key:label:model_path:ngl
phase4_entries=(
  "smollm3:SmolLM3-3B:$M_SMOL:99"
  "e2b:Gemma4-E2B:$M_E2B:99"
  "e4b:Gemma4-E4B:$M_E4B:$best_e4b_ngl"
  "q3:Qwen3-4B:$M_Q3:99"
)
for entry in "${phase4_entries[@]}"; do
  IFS=':' read -r key lbl mdl ngl <<< "$entry"
  ts=$(bench "$mdl" "$ngl" -p 512 -n 128 -r 2 -fa 1 -t 6)
  printf " %-14s max_ctx=%-7s %s\n" "$lbl" "${BEST_CTX[$key]}" "$ts"
done
printf '\n'
# ── Phase 5: E4B thread sweep (CPU split model — threads matter) ──────────────
# Repeats the same Gemma4-E4B speed run with different -t values, using the
# ngl found in Phase 2.
printf '%s\n' "=== Phase 5: Gemma4-E4B thread sweep (p=512 n=128 r=2 fa=0 ngl=$best_e4b_ngl) ==="
thread_counts=(1 2 3 4 5 6 8 10 12)
for t in "${thread_counts[@]}"; do
  ts=$(bench "$M_E4B" "$best_e4b_ngl" -p 512 -n 128 -r 2 -fa 0 -t "$t")
  printf " t=%-3s %s\n" "$t" "$ts"
done
printf '\n'
# ── Phase 6: Flash attention comparison ──────────────────────────────────────
# Same speed run twice per model, first with -fa 0 and then with -fa 1, and
# both results printed on one line.
printf '%s\n' \
  "=== Phase 6: Flash attention fa=0 vs fa=1 (p=512 n=128 r=2 t=6) ===" \
  " Gemma4 hybrid attention may not support FA — testing both."
# Entry format: key:label:model_path:ngl
phase6_entries=(
  "smollm3:SmolLM3-3B:$M_SMOL:99"
  "e2b:Gemma4-E2B:$M_E2B:99"
  "e4b:Gemma4-E4B:$M_E4B:$best_e4b_ngl"
  "q3:Qwen3-4B:$M_Q3:99"
)
for entry in "${phase6_entries[@]}"; do
  IFS=':' read -r key lbl mdl ngl <<< "$entry"
  fa_results=()
  for fa_mode in 0 1; do
    fa_results[fa_mode]=$(bench "$mdl" "$ngl" -p 512 -n 128 -r 2 -fa "$fa_mode" -t 6)
  done
  printf " %-14s fa=0: %-30s fa=1: %s\n" "$lbl" "${fa_results[0]}" "${fa_results[1]}"
done
printf '\n'
# Closing banner with the completion time between two rules.
printf '%s\n' "$HR" "BENCHMARK COMPLETE: $(date)" "$HR"