Initial commit: tuned multi-model llama.cpp stack

- 5 models: SmolLM3-3B, Gemma4-E2B/E4B, Qwen3-4B, Qwen3.5-9B
- TurboQuant image (FORCE_MMQ): +6-11% free speed on Turing GPUs
- Bigctx profiles (-nkvo KV in RAM): 2-16x context gain
- turbo2 KV: 2x smaller, benchmarked against PPL quality gate
- Per-model env files with justified parameters
- kv_quant_test.sh + cpu_ctx_test.sh benchmark scripts
- docs/FINDINGS.md: surprises, pitfalls, recommendations
- docs/ARCHITECTURE.md: compose + test script design
2026-05-06 15:56:40 +02:00
commit 4ad296608b
22 changed files with 2530 additions and 0 deletions
--- a/scripts/benchmark_models.sh
+++ b/scripts/benchmark_models.sh
@@ -0,0 +1,175 @@
+#!/bin/bash
+# Benchmark all 4 new models on GTX 1650 Ti (3717 MiB VRAM)
+# Priority: max context size > tg speed
+# Runs inside ghcr.io/ggml-org/llama.cpp:full-cuda (build b9014, no -c flag)
+#
+# Architecture context limits (from GGUF metadata):
+#   SmolLM3-3B   : 65536  (full attention, KV-limited to ~28K in practice)
+#   Gemma4-E2B   : 131072 (hybrid: sliding_window=512 → huge ctx possible)
+#   Gemma4-E4B   : 131072 (hybrid: sliding_window=512)
+#   Qwen3-4B     : 40960  (full attention, KV-limited to ~9K in practice)
+#
+# NOTE: llama-bench b9014 has NO -c flag. Context is set by -p (prompt tokens).
+#   -p N -n G allocates KV for N+G tokens. OOM = exit!=0 or error in stdout.
+
+# Strict-ish mode: unset variables are errors and a pipeline fails if any stage
+# fails. `-e` is not enabled; failed llama-bench runs are detected explicitly
+# from their exit status and output instead of aborting the script.
+set -u
+set -o pipefail
+
+# GGUF model files benchmarked by this script.
+M_SMOL='/models/HuggingFaceTB_SmolLM3-3B-Q4_K_M.gguf'
+M_E2B='/models/google_gemma-4-E2B-it-Q4_K_M.gguf'
+M_E4B='/models/google_gemma-4-E4B-it-Q4_K_M.gguf'
+M_Q3='/models/Qwen3-4B-Q4_K_M.gguf'
+
+# -- CSV column positions (1-based) of n_prompt / n_gen / avg_ts --
+# 0 means "not detected yet"; detect_cols fills them in from a CSV header.
+NP_COL=0
+NG_COL=0
+TS_COL=0
+
+# detect_cols OUTPUT
+# Finds the CSV header in OUTPUT (the first line that starts with
+# "build_commit" once double quotes are removed) and stores the 1-based
+# positions of the avg_ts, n_gen and n_prompt columns in the globals TS_COL,
+# NG_COL and NP_COL. A column that is not found is stored as 0.
+detect_cols() {
+    local header spec idx
+    header=$(printf '%s\n' "$1" | tr -d '"' | awk '/^build_commit/ {print; exit}')
+    # Each spec is GLOBAL_NAME:csv_column_name.
+    for spec in TS_COL:avg_ts NG_COL:n_gen NP_COL:n_prompt; do
+        idx=$(printf '%s\n' "$header" |
+            awk -F',' -v want="${spec#*:}" '{for(i=1;i<=NF;i++) if($i==want){print i;exit}}')
+        # printf -v assigns to the variable named before the colon; none of
+        # these names is local here, so the globals are updated.
+        printf -v "${spec%%:*}" '%s' "${idx:-0}"
+    done
+}
+
+# parse_speeds OUTPUT
+# Prints "<pp> pp / <tg> tg t/s" (no trailing newline) for llama-bench CSV
+# OUTPUT. pp is avg_ts of the first row with n_prompt > 0 and n_gen == 0,
+# rounded to an integer; tg is avg_ts of the first row with n_gen > 0 and
+# n_prompt == 0, with one decimal. A missing value is shown as "-".
+# Runs detect_cols first if the column positions are still unknown (TS_COL=0).
+parse_speeds() {
+    local raw="$1"
+    if [ "${TS_COL:-0}" = "0" ]; then
+        detect_cols "$raw"
+    fi
+    # One pass over the unquoted CSV; the first line is never treated as data.
+    printf '%s\n' "$raw" | tr -d '"' |
+        awk -F',' -v tc="$TS_COL" -v np="$NP_COL" -v ng="$NG_COL" '
+            NR > 1 && !have_pp && $np+0 > 0 && $ng+0 == 0 { pp = sprintf("%.0f", $tc+0); have_pp = 1 }
+            NR > 1 && !have_tg && $ng+0 > 0 && $np+0 == 0 { tg = sprintf("%.1f", $tc+0); have_tg = 1 }
+            END { printf "%s pp / %s tg t/s", (have_pp ? pp : "-"), (have_tg ? tg : "-") }'
+}
+
+# is_oom OUTPUT EXIT_CODE
+# Returns 0 (true) when a run must be treated as failed: either EXIT_CODE is
+# non-zero, or OUTPUT contains one of the known failure markers
+# (case-insensitive). Returns 1 otherwise.
+is_oom() {
+    local out="$1" ec="$2"
+    [ "$ec" -ne 0 ] && return 0
+    # Feed grep through a here-string rather than `printf | grep -q`: grep -q
+    # exits at the first match, and with `set -o pipefail` a writer killed by
+    # SIGPIPE on a large OUTPUT would make the pipeline report failure even
+    # though a marker was found.
+    grep -qiE "failed to create context|out of memory|GGML_ASSERT|error:" <<< "$out" && return 0
+    return 1
+}
+
+# bench MODEL NGL [llama-bench extra args...]
+# Speed benchmark: runs llama-bench (250 s timeout, -b 512 -ub 128, CSV
+# output, stderr merged into stdout) with the given extra arguments.
+# Prints "OOM" when is_oom flags the run, otherwise the parse_speeds summary.
+# Any non-zero exit status is passed to is_oom as-is, including the one
+# `timeout` produces when the run is cut off.
+bench() {
+    local gguf=$1 layers=$2
+    shift 2
+    local -a cmd=(timeout 250 /app/llama-bench -m "$gguf" -ngl "$layers"
+        -b 512 -ub 128 -o csv "$@")
+    local report status
+    report=$("${cmd[@]}" 2>&1)
+    status=$?
+    if is_oom "$report" "$status"; then
+        echo "OOM"
+        return
+    fi
+    if [ "${TS_COL:-0}" = "0" ]; then
+        detect_cols "$report"
+    fi
+    parse_speeds "$report"
+}
+
+# bench_ctx MODEL NGL CTX
+# Context-capacity test: runs llama-bench with -p CTX -n 1 (one repetition,
+# no warmup, 250 s timeout) so that a prompt of CTX tokens has to fit.
+# Tries flash attention on (fa=1) first and falls back to fa=0.
+# Prints "OK (<pp> pp t/s fa=<0|1>)" for the first setting that is not flagged
+# by is_oom, or "OOM" when both settings fail.
+bench_ctx() {
+    local model=$1 ngl=$2 ctx=$3
+    # `fa` is declared local so the loop variable does not leak to the caller.
+    local out ec fa fa_used
+    for fa in 1 0; do
+        out=$(timeout 250 /app/llama-bench -m "$model" -ngl "$ngl" \
+            -p "$ctx" -n 1 -r 1 --no-warmup \
+            -b 512 -ub 128 -fa "$fa" -t 6 -o csv 2>&1)
+        ec=$?
+        # First setting that works wins.
+        is_oom "$out" "$ec" || { fa_used=$fa; break; }
+        # fa=0 was the last resort; nothing left to try.
+        [ "$fa" = "0" ] && { echo "OOM"; return; }
+    done
+    [ "${TS_COL:-0}" = "0" ] && detect_cols "$out"
+    local pp
+    # avg_ts of the first data row with n_prompt > 0, rounded to an integer.
+    pp=$(printf '%s\n' "$out" | sed 's/"//g' | \
+        awk -F',' -v tc="$TS_COL" -v np="$NP_COL" \
+        'NR>1 && $np+0>0 {printf "%.0f", $tc+0; exit}')
+    printf "OK (%s pp t/s fa=%s)" "${pp:--}" "${fa_used:-?}"
+}
+
+# Report banner: separator, title with the start time, and the GPU name and
+# total memory reported by nvidia-smi ("unknown" if that command fails).
+HR="======================================================================"
+printf '%s\n' "$HR"
+printf '%s\n' "LLAMA.CPP BENCHMARK — ALL MODELS — $(date)"
+printf 'GPU: %s\n' "$(nvidia-smi --query-gpu=name,memory.total --format=csv,noheader 2>/dev/null || echo unknown)"
+printf '%s\n' "$HR"
+printf '\n'
+
+# ── Phase 1: Baseline (small context) ──
+# One bench run per model with ngl=99; each entry is "label:model_path".
+printf '%s\n' "=== Phase 1: Baseline (ngl=99, p=512 n=128 r=2, t=6, fa=0) ==="
+for entry in \
+    "SmolLM3-3B:$M_SMOL" \
+    "Gemma4-E2B:$M_E2B" \
+    "Gemma4-E4B:$M_E4B" \
+    "Qwen3-4B:$M_Q3"; do
+    # Split at the first colon: label before it, model path after it.
+    IFS=':' read -r lbl mdl <<< "$entry"
+    printf "  %-14s  %s\n" "$lbl" "$(bench "$mdl" 99 -p 512 -n 128 -r 2 -t 6 -fa 0)"
+done
+printf '\n'
+
+# ── Phase 2: Gemma4-E4B ngl sweep ──
+# Raises the number of offloaded layers step by step and keeps the last value
+# for which bench did not print "OOM" in best_e4b_ngl (0 if none succeeded).
+# The sweep stops at the first "OOM"; later phases reuse best_e4b_ngl.
+echo "=== Phase 2: Gemma4-E4B ngl sweep (p=16 n=64 r=1 t=6 fa=0) ==="
+echo "    5.1GB model on 3.7GB VRAM — finding highest ngl before OOM"
+best_e4b_ngl=0
+for ngl in 0 4 8 12 16 20 24 28 32 36 42; do
+    # "$ngl" is quoted like every other bench call in this script.
+    ts=$(bench "$M_E4B" "$ngl" -p 16 -n 64 -r 1 -t 6 -fa 0)
+    printf "  ngl=%-3s  %s\n" "$ngl" "$ts"
+    if [[ "$ts" == OOM ]]; then
+        break
+    fi
+    best_e4b_ngl=$ngl
+done
+echo "  → best_e4b_ngl=$best_e4b_ngl"
+echo ""
+
+# ── Phase 3: Max context sweep ──
+# For each model, walks an increasing list of context sizes with bench_ctx and
+# records in BEST_CTX[key] the last size that did not print "OOM"; the sweep
+# for a model stops at its first "OOM".
+# BEST_CTX starts at 0 (not 512) so that a model whose smallest test already
+# fails is not reported as supporting a 512-token context.
+# Entries are "key:label:model_path:ngl".
+echo "=== Phase 3: Max context (p=ctx n=1 r=1 no-warmup fa=1) ==="
+echo "    Gemma4 hybrid attention (sliding_window=512) enables large ctx cheaply."
+declare -A BEST_CTX
+BEST_CTX[smollm3]=0; BEST_CTX[e2b]=0; BEST_CTX[e4b]=0; BEST_CTX[q3]=0
+
+for entry in "smollm3:SmolLM3-3B:$M_SMOL:99" \
+             "e2b:Gemma4-E2B:$M_E2B:99" \
+             "e4b:Gemma4-E4B:$M_E4B:$best_e4b_ngl" \
+             "q3:Qwen3-4B:$M_Q3:99"; do
+    IFS=':' read -r key lbl mdl ngl <<< "$entry"
+    echo "  -- $lbl (ngl=$ngl) --"
+    for ctx in 512 1024 2048 4096 8192 12288 16384 24576 32768 49152 65536 98304 131072; do
+        ts=$(bench_ctx "$mdl" "$ngl" "$ctx")
+        printf "    ctx=%-7s  %s\n" "$ctx" "$ts"
+        if [[ "$ts" == OOM ]]; then
+            break
+        fi
+        BEST_CTX[$key]=$ctx
+    done
+    echo "    → MAX ctx=${BEST_CTX[$key]}"
+done
+echo ""
+
+# ── Phase 4: TG speed at max context ──
+# Runs bench with fa=1 for every model and prints the result next to the
+# BEST_CTX value for that model. Entries are "key:label:model_path:ngl".
+# NOTE(review): the run itself uses p=512 n=128; BEST_CTX is only printed and
+# is not passed to llama-bench, so the heading overstates what is measured.
+printf '%s\n' "=== Phase 4: TG speed at max context (p=512 n=128 r=2 fa=1 t=6) ==="
+for entry in \
+    "smollm3:SmolLM3-3B:$M_SMOL:99" \
+    "e2b:Gemma4-E2B:$M_E2B:99" \
+    "e4b:Gemma4-E4B:$M_E4B:$best_e4b_ngl" \
+    "q3:Qwen3-4B:$M_Q3:99"; do
+    IFS=':' read -r key lbl mdl ngl <<< "$entry"
+    ts=$(bench "$mdl" "$ngl" -p 512 -n 128 -r 2 -fa 1 -t 6)
+    printf "  %-14s  max_ctx=%-7s  %s\n" "$lbl" "${BEST_CTX[$key]}" "$ts"
+done
+printf '\n'
+
+# ── Phase 5: E4B thread sweep ──
+# Repeats the same Gemma4-E4B benchmark (ngl=best_e4b_ngl, fa=0) while varying
+# only the -t thread count.
+printf '%s\n' "=== Phase 5: Gemma4-E4B thread sweep (p=512 n=128 r=2 fa=0 ngl=$best_e4b_ngl) ==="
+for t in 1 2 3 4 5 6 8 10 12; do
+    ts=$(bench "$M_E4B" "$best_e4b_ngl" -p 512 -n 128 -r 2 -fa 0 -t "$t")
+    printf "  t=%-3s  %s\n" "$t" "$ts"
+done
+printf '\n'
+
+# ── Phase 6: Flash attention comparison ──
+# Runs the same benchmark twice per model, with -fa 0 and then -fa 1, and
+# prints both results on one line. Entries are "key:label:model_path:ngl".
+printf '%s\n' "=== Phase 6: Flash attention fa=0 vs fa=1 (p=512 n=128 r=2 t=6) ==="
+printf '%s\n' "    Gemma4 hybrid attention may not support FA — testing both."
+for entry in \
+    "smollm3:SmolLM3-3B:$M_SMOL:99" \
+    "e2b:Gemma4-E2B:$M_E2B:99" \
+    "e4b:Gemma4-E4B:$M_E4B:$best_e4b_ngl" \
+    "q3:Qwen3-4B:$M_Q3:99"; do
+    IFS=':' read -r key lbl mdl ngl <<< "$entry"
+    ts0=$(bench "$mdl" "$ngl" -p 512 -n 128 -r 2 -fa 0 -t 6)
+    ts1=$(bench "$mdl" "$ngl" -p 512 -n 128 -r 2 -fa 1 -t 6)
+    printf "  %-14s  fa=0: %-30s  fa=1: %s\n" "$lbl" "$ts0" "$ts1"
+done
+printf '\n'
+
+# Closing banner with the finish time, framed by the HR separator.
+printf '%s\n' "$HR"
+printf '%s\n' "BENCHMARK COMPLETE: $(date)"
+printf '%s\n' "$HR"