Initial commit: tuned multi-model llama.cpp stack

- 5 models: SmolLM3-3B, Gemma4-E2B/E4B, Qwen3-4B, Qwen3.5-9B
- TurboQuant image (FORCE_MMQ): +6-11% free speed on Turing GPUs
- Bigctx profiles (-nkvo KV in RAM): 2-16x context gain
- turbo2 KV: 2x smaller, benchmarked against PPL quality gate
- Per-model env files with justified parameters
- kv_quant_test.sh + cpu_ctx_test.sh benchmark scripts
- docs/FINDINGS.md: surprises, pitfalls, recommendations
- docs/ARCHITECTURE.md: compose + test script design
This commit is contained in:
2026-05-06 15:56:40 +02:00
commit 4ad296608b
22 changed files with 2530 additions and 0 deletions

335
scripts/benchmark.sh Executable file
View File

@@ -0,0 +1,335 @@
#!/usr/bin/env bash
# =============================================================================
# llama.cpp Automated Benchmark — Qwen3.5-9B on GTX 1650 Ti (4 GB VRAM)
#
# Runs for BOTH official llama.cpp and TurboQuant fork.
# VARIANT env var selects which KV type set to sweep:
# VARIANT=official → f16 q8_0 q5_0 q4_0 iq4_nl
# VARIANT=turboquant → f16 q8_0 iq4_nl turbo4 turbo3 turbo2
#
# Output: CSV + recommended .env per variant, plus a final comparison table.
#
# Run:
# docker compose --profile benchmark run --rm benchmark (official)
# docker compose --profile benchmark run --rm benchmark-turbo (turboquant)
# =============================================================================
set -euo pipefail

# Put /app and /usr/local/bin ahead of the inherited PATH so llama-bench can be
# found in either image layout.
PATH="/app:/usr/local/bin:/usr/bin:/bin:${PATH:-}"
export PATH

# Inputs: environment variable first, then positional argument, then default.
: "${MODEL:=${1:-/models/Qwen3.5-9B.Q8_0.gguf}}"
: "${OUTPUT_DIR:=${2:-/results}}"
: "${VARIANT:=official}"   # official | turboquant

# Every artefact of one run shares the same timestamp.
TIMESTAMP="$(date '+%Y%m%d_%H%M%S')"
RESULTS_CSV="$OUTPUT_DIR/${VARIANT}_results_${TIMESTAMP}.csv"
LOG="$OUTPUT_DIR/${VARIANT}_benchmark_${TIMESTAMP}.log"

# ── Baseline config ────────────────────────────────────────────────────────
THREADS=6
THREADS_BATCH=12
BATCH_SIZE=2048
UBATCH_SIZE=512
PROMPT_TOKENS=512
GEN_TOKENS=32
REPETITIONS=1

# ── KV type sets per variant ───────────────────────────────────────────────
# turbo2=2-bit (6.4× vs f16), turbo3=3-bit, turbo4=4-bit — TurboQuant only.
# Any other VARIANT value gets the official llama.cpp set; iq4_nl is the
# non-linear 4-bit i-quant.
case "$VARIANT" in
  turboquant)
    KV_TYPES=(f16 q8_0 iq4_nl turbo4 turbo3 turbo2)
    ;;
  *)
    KV_TYPES=(f16 q8_0 q5_0 q4_0 iq4_nl)
    ;;
esac

# ── Sweep grids ────────────────────────────────────────────────────────────
# GPU layers (Q8_0 ~297 MB/layer, 3717 MiB VRAM → max ~12 layers)
NGL_VALUES=(6 9 12 13 14 99)
# Prompt sizes passed via -p to stress the KV cache
CTX_VALUES=(128 512 1024 2048 4096 8192)
# Logical batch sizes
BATCH_VALUES=(512 1024 2048 4096)
mkdir -p "$OUTPUT_DIR"

# log MESSAGE... — print MESSAGE on stdout and append it to $LOG.
log() {
  echo "$*" | tee -a "$LOG"
}

# sep — log a 70-character horizontal rule.
sep() {
  local rule
  rule=$(printf '─%.0s' {1..70})
  log "$rule"
}

# hdr TITLE... — log TITLE framed by two rules.
hdr() {
  sep
  log " $*"
  sep
}

# Run banner: variant, model, GPU (or a CPU-only note) and the KV types to sweep.
log "llama.cpp Benchmark [${VARIANT}] — $(date)"
log "Model: $MODEL"
log "GPU: $(nvidia-smi --query-gpu=name,memory.total --format=csv,noheader 2>/dev/null || echo 'CPU only')"
log "KV set: ${KV_TYPES[*]}"
sep

# Start a fresh results file containing only the column header.
printf '%s\n' \
  "variant,phase,ngl,ctx,kv_type_k,kv_type_v,flash_attn,batch_size,ubatch_size,threads,pp_tokens_per_sec,tg_tokens_per_sec,status" \
  > "$RESULTS_CSV"
# ── Helper: run llama-bench ────────────────────────────────────────────────
# Throughput (tokens/sec) of the most recent successful run_bench call.
# 0 means the corresponding row could not be parsed.
LAST_PP=0
LAST_TG=0

#######################################
# Run one llama-bench configuration and append an "ok" row to the results CSV.
# Globals:
#   MODEL, GEN_TOKENS, THREADS, REPETITIONS, VARIANT, RESULTS_CSV (read)
#   LAST_PP, LAST_TG (written on success only)
# Arguments:
#   $1 ngl      $2 prompt size (-p)   $3 KV type, used for both K and V
#   $4 flash-attn flag   $5 batch   $6 ubatch   $7 phase label (default "test")
# Returns:
#   0 on success; 1 if llama-bench exits non-zero or hits the 300 s timeout
#   (no row is written in that case — callers use fail_row).
#######################################
run_bench() {
  local ngl=$1 ctx=$2 kv=$3 fa=$4 batch=$5 ubatch=$6 phase="${7:-test}"
  local raw parsed

  # llama-bench is resolved through PATH (which the script prepends with /app
  # and /usr/local/bin) instead of a hard-coded /app path, so both image
  # layouts work. New llama-bench API (b9014+): -c and -tb removed; -p sets
  # the prompt/ctx size.
  raw=$(timeout 300 llama-bench \
    -m "$MODEL" \
    -ngl "$ngl" \
    -p "$ctx" \
    -n "$GEN_TOKENS" \
    -t "$THREADS" \
    -b "$batch" \
    -ub "$ubatch" \
    -ctk "$kv" \
    -ctv "$kv" \
    -fa "$fa" \
    -r "$REPETITIONS" \
    -o csv 2>&1) || return 1

  # Strip CSV quotes, then locate n_prompt / n_gen / avg_ts from the header row
  # (first field "build_commit"). If no header is seen, fall back to the b9014
  # positions 34 / 35 / 40. pp row: n_gen==0; tg row: n_prompt==0.
  parsed=$(printf '%s\n' "$raw" | sed 's/"//g' | awk -F',' '
    BEGIN { np = 34; ng = 35; ts = 40 }
    $1 == "build_commit" {
      for (i = 1; i <= NF; i++) {
        if ($i == "n_prompt") np = i
        else if ($i == "n_gen") ng = i
        else if ($i == "avg_ts") ts = i
      }
      next
    }
    pp == "" && $ng == "0" && $ts + 0 > 0 { pp = $ts + 0 }
    tg == "" && $np == "0" && $ng + 0 > 0 && $ts + 0 > 0 { tg = $ts + 0 }
    END {
      if (pp == "") pp = 0
      if (tg == "") tg = 0
      print pp, tg
    }
  ')
  read -r LAST_PP LAST_TG <<< "$parsed"
  LAST_PP="${LAST_PP:-0}"
  LAST_TG="${LAST_TG:-0}"

  echo "${VARIANT},${phase},${ngl},${ctx},${kv},${kv},${fa},${batch},${ubatch},${THREADS},${LAST_PP},${LAST_TG},ok" \
    >> "$RESULTS_CSV"
  return 0
}
# fail_row PHASE NGL CTX KV FA BATCH UBATCH
# Append a "failed" row (pp=0, tg=0) for a configuration that did not run.
# KV is written to both the kv_type_k and kv_type_v columns.
# Globals: VARIANT, THREADS, RESULTS_CSV (read).
fail_row() {
  local -a cols=("$VARIANT" "$1" "$2" "$3" "$4" "$4" "$5" "$6" "$7" "$THREADS" 0 0 failed)
  local IFS=','
  printf '%s\n' "${cols[*]}" >> "$RESULTS_CSV"
}
# ── Phase 1: GPU layer sweep ───────────────────────────────────────────────
hdr "PHASE 1 — GPU layer sweep (prompt=128 kv=f16 fa=0)"
# f16 KV keeps this phase independent of quantized-KV kernel support (the
# prebuilt official image lacks SM75 CUDA kernels for quantized KV). Only ngl
# varies here; KV type is swept in Phase 3.
# The sweep stops at the first failing ngl; the last passing value is kept.
MAX_STABLE_NGL=0
for ngl in "${NGL_VALUES[@]}"; do
  printf " ngl=%-3s " "$ngl" | tee -a "$LOG"
  if ! run_bench "$ngl" 128 f16 0 "$BATCH_SIZE" "$UBATCH_SIZE" "ph1_ngl"; then
    log "FAILED (OOM/timeout)"
    fail_row ph1_ngl "$ngl" 128 f16 0 "$BATCH_SIZE" "$UBATCH_SIZE"
    break
  fi
  log "OK pp=${LAST_PP} t/s tg=${LAST_TG} t/s"
  MAX_STABLE_NGL="$ngl"
done
log " → Best ngl: ${MAX_STABLE_NGL}"
# ── Phase 2: Context sweep ─────────────────────────────────────────────────
# Grow the prompt size at the Phase 1 ngl until a run fails; the last passing
# size becomes MAX_STABLE_CTX (128 if even the first size fails).
hdr "PHASE 2 — Context/prompt sweep (ngl=${MAX_STABLE_NGL} kv=f16 fa=0)"
MAX_STABLE_CTX=128
for ctx in "${CTX_VALUES[@]}"; do
  printf " ctx=%-6s " "$ctx" | tee -a "$LOG"
  if ! run_bench "$MAX_STABLE_NGL" "$ctx" f16 0 "$BATCH_SIZE" "$UBATCH_SIZE" "ph2_ctx"; then
    log "FAILED (OOM/timeout)"
    fail_row ph2_ctx "$MAX_STABLE_NGL" "$ctx" f16 0 "$BATCH_SIZE" "$UBATCH_SIZE"
    break
  fi
  log "OK pp=${LAST_PP} t/s tg=${LAST_TG} t/s"
  MAX_STABLE_CTX="$ctx"
done
log " → Best ctx: ${MAX_STABLE_CTX}"
# ── Phase 3: KV cache type sweep ───────────────────────────────────────────
# Runs every KV type of the variant at the Phase 1/2 settings and keeps the
# one with the highest generation speed. Unlike phases 1-2, a failing type does
# not stop the sweep.
# The phase header announces fa=1, so the runs and the recorded rows now use
# that value too (previously 0 was passed, contradicting the header).
# NOTE(review): llama.cpp is understood to require flash attention for a
# quantized V cache, which fa=0 would violate — confirm against the build used.
PH3_FA=1
hdr "PHASE 3 — KV type sweep (ngl=${MAX_STABLE_NGL} ctx=${MAX_STABLE_CTX} fa=${PH3_FA})"
log " [${VARIANT}] KV types: ${KV_TYPES[*]}"
log " Note: Qwen3.5-9B has only 8/32 full-attention layers + GQA (4 KV heads)"
log " Linear-attention layers need no KV cache at all → quant errors minimal"
if [[ "$VARIANT" == "turboquant" ]]; then
  log " turbo2=2-bit (6.4× compression), turbo3=3-bit, turbo4=4-bit"
fi
BEST_KV="q8_0"
BEST_TG_KV=0
for kv in "${KV_TYPES[@]}"; do
  printf " kv=%-8s " "$kv" | tee -a "$LOG"
  if run_bench "$MAX_STABLE_NGL" "$MAX_STABLE_CTX" "$kv" "$PH3_FA" "$BATCH_SIZE" "$UBATCH_SIZE" "ph3_kv"; then
    log "OK pp=${LAST_PP} t/s tg=${LAST_TG} t/s"
    # Float comparison in awk; values are passed with -v rather than being
    # spliced into the awk program text.
    if awk -v cur="$LAST_TG" -v best="$BEST_TG_KV" 'BEGIN { exit !(cur + 0 > best + 0) }'; then
      BEST_TG_KV="$LAST_TG"
      BEST_KV="$kv"
    fi
  else
    log "FAILED"
    fail_row ph3_kv "$MAX_STABLE_NGL" "$MAX_STABLE_CTX" "$kv" "$PH3_FA" "$BATCH_SIZE" "$UBATCH_SIZE"
  fi
done
log " → Best KV: ${BEST_KV} (tg=${BEST_TG_KV} t/s)"
# ── Phase 4: Flash attention ───────────────────────────────────────────────
# Compares fa=1 against fa=0 with the Phase 3 KV type and keeps the setting
# with the higher generation speed. Failed runs are now written to the CSV
# as "failed" rows, as the other phases do.
hdr "PHASE 4 — Flash attention (ngl=${MAX_STABLE_NGL} ctx=${MAX_STABLE_CTX} kv=${BEST_KV})"
log " GTX 1650 Ti = CC 7.5 (Turing) — FA2 requires SM80+ but FA1 works on CC>=7.5"
BEST_FA=1
BEST_TG_FA=0
for fa in 1 0; do
  if [[ "$fa" -eq 1 ]]; then
    fa_label="on "
  else
    fa_label="off"
  fi
  printf " fa=%-3s " "$fa_label" | tee -a "$LOG"
  if run_bench "$MAX_STABLE_NGL" "$MAX_STABLE_CTX" "$BEST_KV" "$fa" "$BATCH_SIZE" "$UBATCH_SIZE" "ph4_fa"; then
    log "OK pp=${LAST_PP} t/s tg=${LAST_TG} t/s"
    # Float comparison in awk, values passed with -v.
    if awk -v cur="$LAST_TG" -v best="$BEST_TG_FA" 'BEGIN { exit !(cur + 0 > best + 0) }'; then
      BEST_TG_FA="$LAST_TG"
      BEST_FA="$fa"
    fi
  else
    log "FAILED"
    fail_row ph4_fa "$MAX_STABLE_NGL" "$MAX_STABLE_CTX" "$BEST_KV" "$fa" "$BATCH_SIZE" "$UBATCH_SIZE"
  fi
done
log " → Best FA: ${BEST_FA} (tg=${BEST_TG_FA} t/s)"
# ── Phase 5: Batch sweep ───────────────────────────────────────────────────
# Use a small fixed prompt (64) to isolate batch-buffer allocation overhead
# from prompt size. Larger batch = larger CUDA activation buffers; this tests
# whether they fit in the remaining VRAM.
# The header now reports the prompt size actually used (it previously showed
# ctx=MAX_STABLE_CTX, which this phase does not use), and failed runs are
# recorded in the CSV.
FIXED_P=64
hdr "PHASE 5 — Batch sweep (ngl=${MAX_STABLE_NGL} prompt=${FIXED_P} kv=${BEST_KV} fa=${BEST_FA})"
BEST_BATCH="$BATCH_SIZE"
BEST_PP_BATCH=0
for batch in "${BATCH_VALUES[@]}"; do
  # ubatch is a quarter of the batch, but never below 64.
  ubatch=$(( batch / 4 < 64 ? 64 : batch / 4 ))
  printf " batch=%-5s ubatch=%-4s " "$batch" "$ubatch" | tee -a "$LOG"
  if run_bench "$MAX_STABLE_NGL" "$FIXED_P" "$BEST_KV" "$BEST_FA" "$batch" "$ubatch" "ph5_batch"; then
    log "OK pp=${LAST_PP} t/s tg=${LAST_TG} t/s"
    # Batch size is ranked by prompt-processing speed.
    if awk -v cur="$LAST_PP" -v best="$BEST_PP_BATCH" 'BEGIN { exit !(cur + 0 > best + 0) }'; then
      BEST_PP_BATCH="$LAST_PP"
      BEST_BATCH="$batch"
    fi
  else
    log "FAILED"
    fail_row ph5_batch "$MAX_STABLE_NGL" "$FIXED_P" "$BEST_KV" "$BEST_FA" "$batch" "$ubatch"
  fi
done
BEST_UBATCH=$(( BEST_BATCH / 4 < 64 ? 64 : BEST_BATCH / 4 ))
log " → Best batch: ${BEST_BATCH} ubatch: ${BEST_UBATCH} (pp=${BEST_PP_BATCH} t/s)"
# ── Phase 6 (TurboQuant only): max context with turbo2 KV ──────────────────
# Grows the context with turbo2 KV until a run fails. The recommendation is
# switched to turbo2 only when it was actually measured to reach a LARGER
# context than Phase 2; otherwise the Phase 2/3 results are kept. (Before, the
# override was unconditional, so a total turbo2 failure produced an untested
# ctx=128 + turbo2 recommendation.)
if [[ "$VARIANT" == "turboquant" ]]; then
  hdr "PHASE 6 — TurboQuant: extended context with turbo2 KV (ngl=${MAX_STABLE_NGL} fa=${BEST_FA})"
  log " turbo2 = 2-bit KV (6.4× smaller than f16) → enables much larger ctx in same VRAM"
  TURBO_CTX_VALUES=(512 1024 2048 4096 8192 16384 32768)
  MAX_TURBO_CTX=0   # 0 = no turbo2 run has succeeded yet
  TURBO_KV="turbo2"
  for ctx in "${TURBO_CTX_VALUES[@]}"; do
    printf " ctx=%-7s " "$ctx" | tee -a "$LOG"
    if run_bench "$MAX_STABLE_NGL" "$ctx" "$TURBO_KV" "$BEST_FA" "$BEST_BATCH" "$BEST_UBATCH" "ph6_turbo_ctx"; then
      log "OK pp=${LAST_PP} t/s tg=${LAST_TG} t/s"
      MAX_TURBO_CTX="$ctx"
    else
      log "FAILED (OOM/timeout)"
      fail_row ph6_turbo_ctx "$MAX_STABLE_NGL" "$ctx" "$TURBO_KV" "$BEST_FA" "$BEST_BATCH" "$BEST_UBATCH"
      break
    fi
  done
  if (( MAX_TURBO_CTX == 0 )); then
    log " → turbo2 failed at every tested context; keeping ctx=${MAX_STABLE_CTX} kv=${BEST_KV}"
  else
    log " → Max context with turbo2: ${MAX_TURBO_CTX}"
    if (( MAX_TURBO_CTX > MAX_STABLE_CTX )); then
      # Use the larger turbo ctx for the recommended .env
      MAX_STABLE_CTX="$MAX_TURBO_CTX"
      BEST_KV="$TURBO_KV"
    else
      log " turbo2 gives no context gain over ctx=${MAX_STABLE_CTX}; keeping kv=${BEST_KV}"
    fi
  fi
fi
# ── Summary ────────────────────────────────────────────────────────────────
# Log the chosen parameters and write them to a per-variant .env file.
sep
log "BENCHMARK COMPLETE [${VARIANT}] — $(date)"
sep
log ""
# Name the model file that was actually benchmarked. The previous text
# hard-coded "Qwen3.5-9B Q4_K_M" although the default model is a Q8_0 file and
# MODEL can be overridden.
log " Optimal params for GTX 1650 Ti + ${MODEL##*/} [${VARIANT}]:"
log ""
log " ngl : ${MAX_STABLE_NGL}"
log " ctx_size : ${MAX_STABLE_CTX}"
log " kv_type : ${BEST_KV}"
log " flash_attn : ${BEST_FA}"
log " batch_size : ${BEST_BATCH}"
log " ubatch : ${BEST_UBATCH}"
log ""
log " Full CSV: ${RESULTS_CSV}"
log ""
# Write recommended .env (fixed name per variant, overwritten on every run).
# NOTE(review): the flash-attention result is logged above but not exported
# here — confirm whether the compose stack expects a variable for it.
ENV_OUT="${OUTPUT_DIR}/${VARIANT}_recommended.env"
cat > "$ENV_OUT" <<EOF
# Generated by benchmark.sh [${VARIANT}] on $(date)
LLAMA_N_GPU_LAYERS=${MAX_STABLE_NGL}
LLAMA_CTX_SIZE=${MAX_STABLE_CTX}
LLAMA_CACHE_TYPE_K=${BEST_KV}
LLAMA_CACHE_TYPE_V=${BEST_KV}
LLAMA_BATCH_SIZE=${BEST_BATCH}
LLAMA_UBATCH_SIZE=${BEST_UBATCH}
LLAMA_THREADS=${THREADS}
LLAMA_THREADS_BATCH=${THREADS_BATCH}
LLAMA_PARALLEL=1
EOF
log " Recommended .env → ${ENV_OUT}"
# ── Cross-variant comparison (if both results exist) ──────────────────────

# latest_results_csv VARIANT
# Print the path of the newest "<VARIANT>_results_<timestamp>.csv" in
# OUTPUT_DIR, or nothing if there is none. Uses a glob instead of parsing `ls`;
# glob results are sorted, and the timestamp suffix sorts chronologically.
latest_results_csv() {
  local candidate newest=""
  for candidate in "${OUTPUT_DIR}/${1}"_results_*.csv; do
    # An unmatched glob stays literal; skip it.
    if [[ -e "$candidate" ]]; then
      newest="$candidate"
    fi
  done
  printf '%s' "$newest"
}

OFFICIAL_CSV=$(latest_results_csv official)
TURBO_CSV=$(latest_results_csv turboquant)
if [[ -n "$OFFICIAL_CSV" && -n "$TURBO_CSV" ]]; then
  COMPARE_OUT="${OUTPUT_DIR}/comparison_$(date +%Y%m%d_%H%M%S).txt"
  {
    echo "======================================================================"
    echo " OFFICIAL vs TURBOQUANT COMPARISON"
    echo "======================================================================"
    echo ""
    echo "Official CSV: $OFFICIAL_CSV"
    echo "TurboQuant CSV: $TURBO_CSV"
    echo ""
    echo "KV type benchmark results (phase ph3_kv):"
    echo ""
    printf "%-12s %-10s %-10s %-12s %-12s\n" "variant" "kv_type" "ctx" "pp (t/s)" "tg (t/s)"
    echo "----------------------------------------------------------------------"
    # One line per ph3_kv row (failed rows included, shown with 0 speeds).
    for csv in "$OFFICIAL_CSV" "$TURBO_CSV"; do
      awk -F',' '
        NR>1 && $2 == "ph3_kv" {
          printf "%-12s %-10s %-10s %-12s %-12s\n", $1, $5, $4, $11, $12
        }
      ' "$csv"
    done
    echo ""
    echo "Winner by tg (generation speed):"
    # Best tg per (variant, kv type) over successful rows, then the overall
    # maximum. Reports explicitly when no ph3_kv run succeeded instead of
    # printing an empty winner line.
    awk -F',' '
      FNR>1 && $2 == "ph3_kv" && $13 == "ok" {
        key = $1 "," $5
        if ($12 + 0 > best[key]) best[key] = $12 + 0
      }
      END {
        best_tg = 0; best_key = ""
        for (k in best) { if (best[k] > best_tg) { best_tg = best[k]; best_key = k } }
        if (best_key == "") {
          print " (no successful ph3_kv runs)"
          exit
        }
        split(best_key, a, ",")
        printf " %s with kv=%s → %.1f t/s\n", a[1], a[2], best_tg
      }
    ' "$OFFICIAL_CSV" "$TURBO_CSV"
    echo "======================================================================"
  } | tee "$COMPARE_OUT" | tee -a "$LOG"
  echo ""
  echo "Comparison report: $COMPARE_OUT"
fi
sep
echo ""
echo "=== RECOMMENDED .env [${VARIANT}] ==="
cat "$ENV_OUT"

175
scripts/benchmark_models.sh Normal file
View File

@@ -0,0 +1,175 @@
#!/bin/bash
# Benchmark all 4 new models on GTX 1650 Ti (3717 MiB VRAM)
# Priority: max context size > tg speed
# Runs inside ghcr.io/ggml-org/llama.cpp:full-cuda (build b9014, no -c flag)
#
# Architecture context limits (from GGUF metadata):
# SmolLM3-3B : 65536 (full attention, KV-limited to ~28K in practice)
# Gemma4-E2B : 131072 (hybrid: sliding_window=512 → huge ctx possible)
# Gemma4-E4B : 131072 (hybrid: sliding_window=512)
# Qwen3-4B : 40960 (full attention, KV-limited to ~9K in practice)
#
# NOTE: llama-bench b9014 has NO -c flag. Context is set by -p (prompt tokens).
# -p N -n G allocates KV for N+G tokens. OOM = exit!=0 or error in stdout.
# No `-e` here: the bench helpers below capture llama-bench's exit status
# themselves and turn failures into an "OOM" result.
set -uo pipefail
# GGUF model files to benchmark (paths under /models).
M_SMOL="/models/HuggingFaceTB_SmolLM3-3B-Q4_K_M.gguf"
M_E2B="/models/google_gemma-4-E2B-it-Q4_K_M.gguf"
M_E4B="/models/google_gemma-4-E4B-it-Q4_K_M.gguf"
M_Q3="/models/Qwen3-4B-Q4_K_M.gguf"
# -- CSV column detection (called once on first successful output) --
# 1-based positions of avg_ts / n_gen / n_prompt in llama-bench CSV output.
# 0 means "not detected".
TS_COL=0
NG_COL=0
NP_COL=0

# detect_cols OUTPUT
# Find the CSV header (first line starting with build_commit once quotes are
# removed) in OUTPUT and store the column positions in TS_COL, NG_COL, NP_COL.
# A column missing from the header, or a missing header, yields 0.
detect_cols() {
  local header_line
  header_line=$(printf '%s\n' "$1" | sed 's/"//g' | grep '^build_commit' | head -1)

  local -a names=()
  IFS=',' read -r -a names <<< "$header_line"

  local i ts="" ng="" np=""
  for (( i = 0; i < ${#names[@]}; i++ )); do
    # := keeps the first occurrence of each name.
    case "${names[i]}" in
      avg_ts)   : "${ts:=$(( i + 1 ))}" ;;
      n_gen)    : "${ng:=$(( i + 1 ))}" ;;
      n_prompt) : "${np:=$(( i + 1 ))}" ;;
    esac
  done

  TS_COL=${ts:-0}
  NG_COL=${ng:-0}
  NP_COL=${np:-0}
}
# parse_speeds OUTPUT
# Print "<pp> pp / <tg> tg t/s" for llama-bench CSV OUTPUT.
#   pp: avg_ts of the first row with n_prompt>0 and n_gen==0, no decimals
#   tg: avg_ts of the first row with n_gen>0 and n_prompt==0, one decimal
# A value that cannot be found is printed as "-". The first line of OUTPUT is
# always skipped. Column positions come from TS_COL/NP_COL/NG_COL and are
# detected first if TS_COL is still 0.
parse_speeds() {
  local out="$1"
  if [ "${TS_COL:-0}" = "0" ]; then
    detect_cols "$out"
  fi
  printf '%s\n' "$out" | sed 's/"//g' |
    awk -F',' -v tc="$TS_COL" -v np="$NP_COL" -v ng="$NG_COL" '
      NR == 1 { next }
      pp == "" && $np + 0 > 0 && $ng + 0 == 0 { pp = sprintf("%.0f", $tc + 0) }
      tg == "" && $ng + 0 > 0 && $np + 0 == 0 { tg = sprintf("%.1f", $tc + 0) }
      END {
        if (pp == "") pp = "-"
        if (tg == "") tg = "-"
        printf "%s pp / %s tg t/s", pp, tg
      }'
}
# is_oom OUTPUT EXIT_CODE
# Succeeds (returns 0) when a llama-bench run should be counted as failed:
# either EXIT_CODE is non-zero, or OUTPUT contains one of the known error
# markers (case-insensitive). Returns 1 otherwise.
#
# OUTPUT is fed to grep through a here-string, not `printf | grep -q`: with
# `set -o pipefail`, grep -q exiting on an early match can kill the still
# writing printf with SIGPIPE, the pipeline then reports 141, and the match
# would be treated as "no error".
is_oom() {
  local out="$1" ec="$2"
  if [ "$ec" -ne 0 ]; then
    return 0
  fi
  grep -qiE "failed to create context|out of memory|GGML_ASSERT|error:" <<< "$out"
}
# bench MODEL NGL [llama-bench extra args...]
# Run llama-bench with -b 512 -ub 128 and CSV output, passing any extra args
# through (callers supply -p/-n/-r/-t/-fa). Stderr is merged into the captured
# output. Prints "OOM" if is_oom flags the run (non-zero exit, including the
# 250 s timeout, or an error marker), otherwise the parse_speeds summary line.
bench() {
  local model_path=$1 gpu_layers=$2
  shift 2

  local csv status
  csv=$(timeout 250 /app/llama-bench -m "$model_path" -ngl "$gpu_layers" \
    -b 512 -ub 128 -o csv "$@" 2>&1)
  status=$?

  if is_oom "$csv" "$status"; then
    echo "OOM"
    return
  fi

  if [ "${TS_COL:-0}" = "0" ]; then
    detect_cols "$csv"
  fi
  parse_speeds "$csv"
}
# bench_ctx MODEL NGL CTX
# Context-capacity probe: runs llama-bench with -p CTX -n 1 -r 1 --no-warmup.
# Flash attention is tried first (fa=1); if is_oom flags that run, the same
# configuration is retried with fa=0.
# Prints "OK (<pp> pp t/s fa=<N>)" for the first attempt that works, where <pp>
# is avg_ts of the first data row with n_prompt>0 ("-" if none), or "OOM" when
# both attempts fail.
bench_ctx() {
  local model=$1 ngl=$2 ctx=$3
  local csv status fa_used

  for fa in 1 0; do
    csv=$(timeout 250 /app/llama-bench -m "$model" -ngl "$ngl" \
      -p "$ctx" -n 1 -r 1 --no-warmup \
      -b 512 -ub 128 -fa "$fa" -t 6 -o csv 2>&1)
    status=$?
    if ! is_oom "$csv" "$status"; then
      fa_used=$fa
      break
    fi
  done

  # fa_used stays unset only when both attempts were flagged.
  if [ -z "${fa_used:-}" ]; then
    echo "OOM"
    return
  fi

  if [ "${TS_COL:-0}" = "0" ]; then
    detect_cols "$csv"
  fi

  local pp_speed
  pp_speed=$(printf '%s\n' "$csv" | sed 's/"//g' |
    awk -F',' -v tc="$TS_COL" -v np="$NP_COL" \
      'NR>1 && $np+0>0 {printf "%.0f", $tc+0; exit}')
  printf "OK (%s pp t/s fa=%s)" "${pp_speed:--}" "${fa_used:-?}"
}
HR="======================================================================"

# Report banner.
echo "$HR"
echo "LLAMA.CPP BENCHMARK — ALL MODELS — $(date)"
echo "GPU: $(nvidia-smi --query-gpu=name,memory.total --format=csv,noheader 2>/dev/null || echo unknown)"
echo "$HR"
echo ""

# ── Phase 1: Baseline (small context) ────────────────────────────────────────
# One standard run per model with all layers requested on the GPU (ngl=99).
echo "=== Phase 1: Baseline (ngl=99, p=512 n=128 r=2, t=6, fa=0) ==="
p1_labels=(SmolLM3-3B Gemma4-E2B Gemma4-E4B Qwen3-4B)
p1_models=("$M_SMOL" "$M_E2B" "$M_E4B" "$M_Q3")
for p1_idx in "${!p1_labels[@]}"; do
  p1_speeds=$(bench "${p1_models[p1_idx]}" 99 -p 512 -n 128 -r 2 -t 6 -fa 0)
  printf " %-14s %s\n" "${p1_labels[p1_idx]}" "$p1_speeds"
done
echo ""
# ── Phase 2: Gemma4-E4B ngl sweep ────────────────────────────────────────────
# Raise ngl step by step and stop at the first OOM; best_e4b_ngl keeps the last
# value that ran (0 if even the first step fails). Later phases reuse it.
echo "=== Phase 2: Gemma4-E4B ngl sweep (p=16 n=64 r=1 t=6 fa=0) ==="
echo " 5.1GB model on 3.7GB VRAM — finding highest ngl before OOM"
best_e4b_ngl=0
e4b_ngl_steps=(0 4 8 12 16 20 24 28 32 36 42)
for ngl in "${e4b_ngl_steps[@]}"; do
  ts=$(bench "$M_E4B" "$ngl" -p 16 -n 64 -r 1 -t 6 -fa 0)
  printf " ngl=%-3s %s\n" "$ngl" "$ts"
  if [[ "$ts" == OOM ]]; then
    break
  fi
  best_e4b_ngl=$ngl
done
echo " → best_e4b_ngl=$best_e4b_ngl"
echo ""
# ── Phase 3: Max context sweep ────────────────────────────────────────────────
# For each model, grow the context until bench_ctx reports OOM and remember the
# last size that ran in BEST_CTX (512 if nothing larger is confirmed).
echo "=== Phase 3: Max context (p=ctx n=1 r=1 no-warmup fa=1) ==="
echo " Gemma4 hybrid attention (sliding_window=512) enables large ctx cheaply."
declare -A BEST_CTX
for key in smollm3 e2b e4b q3; do
  BEST_CTX[$key]=512
done
p3_ctx_steps=(512 1024 2048 4096 8192 12288 16384 24576 32768 49152 65536 98304 131072)
# key:label:model path:ngl
p3_entries=(
  "smollm3:SmolLM3-3B:$M_SMOL:99"
  "e2b:Gemma4-E2B:$M_E2B:99"
  "e4b:Gemma4-E4B:$M_E4B:$best_e4b_ngl"
  "q3:Qwen3-4B:$M_Q3:99"
)
for entry in "${p3_entries[@]}"; do
  IFS=':' read -r key lbl mdl ngl <<< "$entry"
  echo " -- $lbl (ngl=$ngl) --"
  for ctx in "${p3_ctx_steps[@]}"; do
    ts=$(bench_ctx "$mdl" "$ngl" "$ctx")
    printf " ctx=%-7s %s\n" "$ctx" "$ts"
    if [[ "$ts" == OOM ]]; then
      break
    fi
    BEST_CTX[$key]=$ctx
  done
  echo " → MAX ctx=${BEST_CTX[$key]}"
done
echo ""
# ── Phase 4: TG speed at max context ─────────────────────────────────────────
# Prints each model's Phase 3 max context next to a fa=1 speed run.
# NOTE(review): the run itself uses p=512 n=128; the max context is only
# echoed from Phase 3 and is not passed to llama-bench — confirm this is the
# intended measurement.
echo "=== Phase 4: TG speed at max context (p=512 n=128 r=2 fa=1 t=6) ==="
# key:label:model path:ngl
p4_entries=(
  "smollm3:SmolLM3-3B:$M_SMOL:99"
  "e2b:Gemma4-E2B:$M_E2B:99"
  "e4b:Gemma4-E4B:$M_E4B:$best_e4b_ngl"
  "q3:Qwen3-4B:$M_Q3:99"
)
for entry in "${p4_entries[@]}"; do
  IFS=':' read -r key lbl mdl ngl <<< "$entry"
  ts=$(bench "$mdl" "$ngl" -p 512 -n 128 -r 2 -fa 1 -t 6)
  printf " %-14s max_ctx=%-7s %s\n" "$lbl" "${BEST_CTX[$key]}" "$ts"
done
echo ""
# ── Phase 5: E4B thread sweep (CPU split model — threads matter) ──────────────
# Same run repeated with different -t values at the ngl found in Phase 2.
echo "=== Phase 5: Gemma4-E4B thread sweep (p=512 n=128 r=2 fa=0 ngl=$best_e4b_ngl) ==="
e4b_thread_counts=(1 2 3 4 5 6 8 10 12)
for t in "${e4b_thread_counts[@]}"; do
  ts=$(bench "$M_E4B" "$best_e4b_ngl" -p 512 -n 128 -r 2 -fa 0 -t "$t")
  printf " t=%-3s %s\n" "$t" "$ts"
done
echo ""
# ── Phase 6: Flash attention comparison ──────────────────────────────────────
# Runs every model twice with identical settings except -fa (0, then 1) and
# prints both results on one line.
echo "=== Phase 6: Flash attention fa=0 vs fa=1 (p=512 n=128 r=2 t=6) ==="
echo " Gemma4 hybrid attention may not support FA — testing both."
# key:label:model path:ngl
p6_entries=(
  "smollm3:SmolLM3-3B:$M_SMOL:99"
  "e2b:Gemma4-E2B:$M_E2B:99"
  "e4b:Gemma4-E4B:$M_E4B:$best_e4b_ngl"
  "q3:Qwen3-4B:$M_Q3:99"
)
for entry in "${p6_entries[@]}"; do
  IFS=':' read -r key lbl mdl ngl <<< "$entry"
  ts0=$(bench "$mdl" "$ngl" -p 512 -n 128 -r 2 -fa 0 -t 6)
  ts1=$(bench "$mdl" "$ngl" -p 512 -n 128 -r 2 -fa 1 -t 6)
  printf " %-14s fa=0: %-30s fa=1: %s\n" "$lbl" "$ts0" "$ts1"
done
echo ""

# Closing banner.
echo "$HR"
echo "BENCHMARK COMPLETE: $(date)"
echo "$HR"

251
scripts/cpu_ctx_test.sh Normal file
View File

@@ -0,0 +1,251 @@
#!/bin/bash
# cpu_ctx_test.sh v4 — -nkvo bigctx with TurboQuant image (FORCE_MMQ)
# Image: local/llama-cpp-turboquant:full-cuda-sm75-mmq
#
# Tests KV in RAM (-nkvo) with BOTH q4_0 and turbo2 KV types.
# turbo2 = 2-bit KV (2x smaller than q4_0) → ~2x more context at same RAM budget.
#
# Speed model per token:
# GPU-compute models (smollm3/e2b/e4b/q3): bottleneck = PCIe KV reads
# t/s = 1000 / (gpu_ms + ctx * kv_bytes_per_token / PCIE_BPS * 1000)
# Qwen3.5-9B: bottleneck = RAM reads (21/32 layers on CPU, 8.86 GB model)
# t/s = 1000 / (1000/baseline + ctx * kv_bytes_per_token / RAM_BPS * 1000)
#
# Usage: bash /scripts/cpu_ctx_test.sh [smollm3|e2b|e4b|q3|qwen35q|all]
# No `-e`: failing probes are expected and are handled by the helpers below.
set -uo pipefail
# Model key to test, or "all" (default).
TARGET="${1:-all}"
# Minimum estimated t/s (at 50% context fill) for a ctx to count as recommended.
TARGET_TPS=15
# Thread count and generated-token count used for the baseline llama-bench run.
CPU_THREADS=6
BENCH_GEN=32
PCIE_BW_GBPS=8.0 # PCIe x4 3.0 practical read BW (conservative)
RAM_BW_GBPS=45.0 # RAM practical read BW (i7-10750H DDR4-2933)
# GGUF model files (paths under /models).
M_SMOL="/models/HuggingFaceTB_SmolLM3-3B-Q4_K_M.gguf"
M_E2B="/models/google_gemma-4-E2B-it-Q4_K_M.gguf"
M_E4B="/models/google_gemma-4-E4B-it-Q4_K_M.gguf"
M_Q3="/models/Qwen3-4B-Q4_K_M.gguf"
M_Q35="/models/Qwen3.5-9B.Q8_0.gguf"
# -ngl value per model key.
declare -A NGL_GPU=([smollm3]=99 [e2b]=99 [e4b]=42 [q3]=99 [qwen35q]=11)
# BW source: pcie for GPU-compute models, ram for qwen35-9b (CPU-compute bound)
declare -A BW_GBPS=([smollm3]=$PCIE_BW_GBPS [e2b]=$PCIE_BW_GBPS [e4b]=$PCIE_BW_GBPS [q3]=$PCIE_BW_GBPS [qwen35q]=$RAM_BW_GBPS)
# Label printed next to the bandwidth figure in each model's section header.
declare -A BW_LABEL=([smollm3]="PCIe" [e2b]="PCIe" [e4b]="PCIe" [q3]="PCIe" [qwen35q]="RAM")
# CTX candidates: larger now thanks to turbo2 (2x smaller KV vs q4_0)
# Note: turbo2 is SKIPPED for Qwen3-4B (PPL explodes at ctx>=8192: +0.52 → +13 → +437)
# turbo2 is SKIPPED for Qwen3.5-9B (hybrid linear-attn incompatible with llama-perplexity;
# server works fine at 32K — this is a test-tool limitation, not a real issue)
SMOL_CTXS=(32768 49152 65536 98304 131072 163840)
E2B_CTXS=(32768 49152 65536 98304 131072 163840 196608 262144 393216)
E4B_CTXS=(32768 49152 65536 98304 131072 163840)
Q3_CTXS=(24576 32768 49152 65536 98304 131072)
Q35_CTXS=(16384 32768 49152 65536 98304 131072)
# Maps a model key to the NAME of its candidate array above; the main loop
# resolves the name indirectly.
declare -A CTX_CANDIDATES=(
[smollm3]="SMOL_CTXS" [e2b]="E2B_CTXS" [e4b]="E4B_CTXS"
[q3]="Q3_CTXS" [qwen35q]="Q35_CTXS")
# Pure-GPU ctx for gain comparison
declare -A PURE_GPU_CTX=([smollm3]=24576 [e2b]=24576 [e4b]=24576 [q3]=16384 [qwen35q]=32768)
# ANSI colour escapes (expanded by printf when used inside a format string).
GREEN='\033[0;32m'; RED='\033[0;31m'; YELLOW='\033[1;33m'; BLUE='\033[0;34m'; CYAN='\033[0;36m'; NC='\033[0m'
HR="======================================================================"
# Tiny alloc file — enough for 1 chunk, minimal compute time.
# 64 lines drawn from five fixed sentences with a fixed random seed, so the
# file content is the same on every run.
ALLOC_FILE="/tmp/kv_alloc_tiny.txt"
python3 - > "$ALLOC_FILE" <<'PY'
import random

sentences = [
    'The transformer architecture uses self-attention mechanisms to process sequences.',
    'Large language models require significant computational resources for training.',
    'Quantization reduces memory usage by storing weights in lower precision formats.',
    'Flash attention enables memory-efficient computation for long context windows.',
    'The key-value cache stores intermediate attention states during generation.',
]
random.seed(1)
picked = [random.choice(sentences) for _ in range(64)]
print("\n".join(picked))
PY
# check_alloc MODEL NGL KV CTX [EXTRA...]
# Probe whether llama-perplexity can set up a context of CTX tokens with KV
# type KV and the KV cache kept off the GPU (-nkvo), flash attention on.
# Only stderr is inspected; the exit status of llama-perplexity is ignored.
# Output (stdout):
#   "OOM"     an allocation-failure message was found in stderr (returns 1)
#   "<MiB>"   the Host context size parsed from the memory breakdown line
#   "?"       no OOM message, but the Host line could not be parsed
check_alloc() {
  local model=$1 ngl=$2 kv=$3 ctx=$4
  shift 4

  local err_file stderr_text
  err_file=$(mktemp)
  timeout 90 /app/llama-perplexity \
    -m "$model" -ngl "$ngl" \
    -fa on -nkvo \
    -c "$ctx" -ctk "$kv" -ctv "$kv" \
    -f "$ALLOC_FILE" --chunks 1 \
    "$@" \
    > /dev/null 2> "$err_file"
  stderr_text=$(< "$err_file")
  rm -f "$err_file"

  if grep -qiE "out of memory|failed to allocate|cudaMalloc failed|CUDA_ERROR_OUT_OF_MEMORY|ggml_cuda_malloc|cannot allocate memory|cannot create buffer" <<< "$stderr_text"; then
    echo "OOM"
    return 1
  fi

  # Parse Host context MiB: "| Host | total = model + context + compute |"
  # \K drops the "= <model> + " prefix so only the context figure is printed.
  local host_ctx_mib
  host_ctx_mib=$(grep "Host" <<< "$stderr_text" |
    grep -oP "=\s*\d+\s*\+\s*\K\d+(?=\s*\+)" | head -1 || true)
  printf '%s\n' "${host_ctx_mib:-?}"
}
# measure_baseline_tps MODEL NGL [EXTRA...]
# Measure generation speed with an (almost) empty KV cache: llama-bench with a
# 1-token prompt, BENCH_GEN generated tokens, q4_0 KV, -nkvo 1, -fa 1, one
# repetition. Prints avg_ts of the first generation row (n_prompt==0, n_gen>0)
# or nothing when the run fails or no such row exists.
# Globals: CPU_THREADS, BENCH_GEN (read).
#
# The n_prompt / n_gen / avg_ts columns are located from the CSV header (first
# field "build_commit") so a change in llama-bench's column layout does not
# silently yield an empty result; the former fixed positions 34 / 35 / 40 are
# kept as fallback when no header is present.
measure_baseline_tps() {
  local model=$1 ngl=$2
  shift 2
  local raw
  # A failed run leaves raw empty; that is reported as "no output" below.
  raw=$(timeout 120 /app/llama-bench \
    -m "$model" -ngl "$ngl" -t "$CPU_THREADS" \
    -p 1 -n "$BENCH_GEN" \
    -ctk q4_0 -ctv q4_0 -nkvo 1 -fa 1 -r 1 -o csv \
    "$@" 2>/dev/null) || true
  printf '%s\n' "$raw" | sed 's/"//g' |
    awk -F',' '
      BEGIN { np = 34; ng = 35; ts = 40 }
      $1 == "build_commit" {
        for (i = 1; i <= NF; i++) {
          if ($i == "n_prompt") np = i
          else if ($i == "n_gen") ng = i
          else if ($i == "avg_ts") ts = i
        }
        next
      }
      $np == "0" && $ng + 0 > 0 && $ts + 0 > 0 { print $ts + 0; exit }'
}
# estimate_tps BASELINE_TPS KV_PER_TOKEN_MIB CTX BW_GBPS
# Estimate tokens/sec when CTX tokens of KV cache must be read per generated
# token over a link of BW_GBPS (10^9 bytes/s):
#   ms_per_token = 1000/BASELINE_TPS + CTX * KV_PER_TOKEN_MIB * 2^20 / (BW_GBPS * 1e9) * 1000
# Prints the estimate with one decimal, or "?" if any argument is not a valid
# number (e.g. KV_PER_TOKEN_MIB is "?") or BASELINE_TPS is 0.
#
# Arguments reach Python through argv instead of being pasted into the program
# text, so a malformed value can only yield "?" and never alter the code.
estimate_tps() {
  local baseline_tps=$1 kv_per_token_mib=$2 ctx=$3 bw_gbps=$4
  python3 -c '
import sys
baseline = float(sys.argv[1])
kv_tok_bytes = float(sys.argv[2]) * 1024 * 1024
ctx = int(sys.argv[3])
bps = float(sys.argv[4]) * 1e9
base_ms = 1000.0 / baseline
kv_ms = ctx * kv_tok_bytes / bps * 1000
print(f"{1000.0 / (base_ms + kv_ms):.1f}")
' "$baseline_tps" "$kv_per_token_mib" "$ctx" "$bw_gbps" 2>/dev/null || echo "?"
}
# ---------------------------------------------------------------------------
# Report banner. The GPU line now falls back to "unknown" when nvidia-smi is
# unavailable instead of printing an empty value.
echo "$HR"
echo "CPU-RAM KV CONTEXT TEST v4 (-nkvo, TurboQuant FORCE_MMQ) -- $(date)"
echo "GPU: $(nvidia-smi --query-gpu=name,memory.total --format=csv,noheader 2>/dev/null || echo unknown)"
echo "KV types tested: q4_0 (4-bit) and turbo2 (2-bit, 2x smaller → 2x more ctx)"
printf "PCIe assumption: %.1f GB/s | RAM assumption: %.1f GB/s\n" "$PCIE_BW_GBPS" "$RAM_BW_GBPS"
echo "$HR"
echo ""
# Per-model sweep. For every selected model: measure a baseline t/s, then for
# each candidate ctx and KV type probe the allocation, derive KV MiB per token,
# and estimate t/s at 25/50/100% context fill. One row per model is collected
# in SUMMARY as
#   label|baseline|q4_0 max|q4_0 rec|turbo2 max|turbo2 rec|q4_0 gain|turbo2 gain
#
# Changes against the previous version:
#   * ctx candidates are resolved with indirect expansion instead of eval;
#   * the baseline-failure row carries all 8 fields the summary table reads;
#   * a KV type that is deliberately not tested is reported as "n/a" rather
#     than "OOM"/"FAIL".
declare -a SUMMARY=()
for entry in \
  "smollm3:SmolLM3-3B:$M_SMOL" \
  "e2b:Gemma4-E2B:$M_E2B" \
  "e4b:Gemma4-E4B:$M_E4B" \
  "q3:Qwen3-4B:$M_Q3" \
  "qwen35q:Qwen3.5-9B:$M_Q35"
do
  IFS=':' read -r key lbl model <<< "$entry"
  [[ "$TARGET" != "all" && "$TARGET" != "$key" ]] && continue

  # CTX_CANDIDATES holds the NAME of the candidate array for this key.
  ctxs_ref="${CTX_CANDIDATES[$key]}[@]"
  ctxs=("${!ctxs_ref}")
  ngl="${NGL_GPU[$key]}"
  bw_gbps="${BW_GBPS[$key]}"
  bw_label="${BW_LABEL[$key]}"

  # turbo2 incompatible with Qwen3-4B (quality fails at ctx>=8192)
  # turbo2 alloc works for Qwen3.5-9B but quality measurement unreliable — test q4_0 only
  if [[ "$key" == "q3" || "$key" == "qwen35q" ]]; then
    kv_types_to_test=(q4_0)
    t2_max_default="n/a"
    t2_rec_default="n/a"
  else
    kv_types_to_test=(q4_0 turbo2)
    t2_max_default="OOM"
    t2_rec_default="FAIL"
  fi
  extra_args=()

  printf "${BLUE}=== %s (ngl=%s, BW model: %s %.0f GB/s) ===${NC}\n" \
    "$lbl" "$ngl" "$bw_label" "$bw_gbps"

  # Baseline t/s (empty KV, with q4_0 -nkvo — upper bound)
  printf " Measuring baseline t/s (empty KV, p=1)... "
  baseline_tps=$(measure_baseline_tps "$model" "$ngl" ${extra_args[@]+"${extra_args[@]}"})
  if [[ -z "$baseline_tps" ]]; then
    printf "${RED}FAIL${NC}\n\n"
    SUMMARY+=("$lbl|FAIL|FAIL|FAIL|FAIL|FAIL|?|?")
    continue
  fi
  printf "${GREEN}%s t/s${NC}\n\n" "$baseline_tps"

  # Header
  printf " %-10s %-12s %-12s %-12s %-12s %-12s %-12s\n" \
    "ctx" "KV type" "KV in RAM" "kv/tok" "t/s@25%" "t/s@50%" "t/s@100%"
  printf " %-10s %-12s %-12s %-12s %-12s %-12s %-12s\n" \
    "---" "-------" "---------" "------" "-------" "-------" "--------"

  max_ctx_q4=""
  max_ctx_t2=""
  rec_q4=""
  rec_t2=""
  # Last successfully measured MiB/token per KV type; reused when a later
  # probe succeeds but its memory line cannot be parsed.
  declare -A kv_ref_mib=()

  for ctx in "${ctxs[@]}"; do
    for kv_type in "${kv_types_to_test[@]}"; do
      result=$(check_alloc "$model" "$ngl" "$kv_type" "$ctx" ${extra_args[@]+"${extra_args[@]}"})
      if [[ "$result" == "OOM" ]]; then
        printf " ${RED}%-10s %-12s OOM${NC}\n" "$ctx" "$kv_type"
        # Keep going: a larger ctx may still fit with the other KV type.
        continue
      fi
      host_kv_mib="${result}"
      if [[ "$kv_type" == "q4_0" ]]; then
        max_ctx_q4=$ctx
      else
        max_ctx_t2=$ctx
      fi

      # KV per token
      if [[ "$host_kv_mib" =~ ^[0-9]+$ ]]; then
        kv_per_token_mib=$(python3 -c "print(f'{$host_kv_mib / $ctx:.6f}')")
        kv_ref_mib[$kv_type]=$kv_per_token_mib
      else
        kv_per_token_mib="${kv_ref_mib[$kv_type]:-?}"
      fi

      tps25=$(estimate_tps "$baseline_tps" "$kv_per_token_mib" "$(( ctx / 4 ))" "$bw_gbps")
      tps50=$(estimate_tps "$baseline_tps" "$kv_per_token_mib" "$(( ctx / 2 ))" "$bw_gbps")
      tps100=$(estimate_tps "$baseline_tps" "$kv_per_token_mib" "$ctx" "$bw_gbps")

      # A ctx is "recommended" when the estimate at 50% fill reaches TARGET_TPS.
      meets=$(python3 -c "print(1 if '$tps50' != '?' and float('$tps50') >= $TARGET_TPS else 0)" 2>/dev/null || echo 0)
      [[ "$kv_type" == "q4_0" && "$meets" == "1" ]] && rec_q4=$ctx
      [[ "$kv_type" == "turbo2" && "$meets" == "1" ]] && rec_t2=$ctx

      if [[ "$meets" == "1" ]]; then
        color="$GREEN"
      else
        color="$YELLOW"
      fi
      printf " ${color}%-10s${NC} %-12s %-12s %-12s %-12s ${color}%-12s${NC} %-12s\n" \
        "$ctx" "$kv_type" "${host_kv_mib}MiB" "${kv_per_token_mib}MiB" \
        "$tps25" "$tps50" "$tps100"
    done
  done

  # If no ctx met the target, fall back to the largest ctx that allocated.
  rec_q4="${rec_q4:-$max_ctx_q4}"
  rec_t2="${rec_t2:-$max_ctx_t2}"
  pg="${PURE_GPU_CTX[$key]}"
  printf "\n Recommended ctx (>=%s t/s@50%%): q4_0=%s turbo2=%s (pure-GPU was %s)\n\n" \
    "$TARGET_TPS" "${rec_q4:-FAIL}" "${rec_t2:-$t2_rec_default}" "$pg"

  # Gain = recommended ctx minus the pure-GPU ctx ("?" when nothing to compare).
  if [[ -n "${rec_q4:-}" && "${rec_q4:-}" != "FAIL" ]]; then
    gain_q4=$(( rec_q4 - pg ))
  else
    gain_q4="?"
  fi
  if [[ -n "${rec_t2:-}" && "${rec_t2:-}" != "FAIL" ]]; then
    gain_t2=$(( rec_t2 - pg ))
  else
    gain_t2="?"
  fi

  SUMMARY+=("$lbl|$baseline_tps|${max_ctx_q4:-OOM}|${rec_q4:-FAIL}|${max_ctx_t2:-$t2_max_default}|${rec_t2:-$t2_rec_default}|$gain_q4|$gain_t2")
  unset kv_ref_mib max_ctx_q4 max_ctx_t2 rec_q4 rec_t2
done
# Final table: one line per SUMMARY row
# (label|baseline|q4_0 max|q4_0 rec|turbo2 max|turbo2 rec|q4_0 gain|turbo2 gain).
summary_fmt="%-16s %-12s %-14s %-14s %-14s %-14s"

echo "$HR"
echo "SUMMARY — -nkvo (KV in RAM): q4_0 vs turbo2"
echo "$HR"
printf "${summary_fmt}\n" \
  "Model" "Baseline t/s" "q4_0 max" "q4_0 rec" "turbo2 max" "turbo2 rec"
printf "${summary_fmt}\n" \
  "-----" "------------" "--------" "--------" "----------" "----------"
for row in "${SUMMARY[@]}"; do
  IFS='|' read -r lbl btps max_q4 rec_q4 max_t2 rec_t2 g_q4 g_t2 <<< "$row"
  # The colour escapes are part of the format string so printf expands them.
  printf "${GREEN}${summary_fmt} [q4+%s / t2+%s vs pure-GPU]${NC}\n" \
    "$lbl" "$btps" "$max_q4" "$rec_q4" "$max_t2" "$rec_t2" "$g_q4" "$g_t2"
done
echo "$HR"
echo "Note: Qwen3.5-9B baseline already <15 t/s (RAM-bound, 8.86 GB model). BW model uses RAM not PCIe."
echo "$HR"

116
scripts/download_models.sh Executable file
View File

@@ -0,0 +1,116 @@
#!/usr/bin/env bash
# download_models.sh — Download GGUF model files to ./models/
#
# Usage:
# bash scripts/download_models.sh # all models
# bash scripts/download_models.sh smollm3 # single model
# bash scripts/download_models.sh gemma4-e2b gemma4-e4b # multiple
#
# Requires: huggingface-cli (pip install huggingface_hub)
# Models land in: ./models/
#
# Available keys: smollm3 | gemma4-e2b | gemma4-e4b | qwen3-4b | qwen35-9b | all
# Abort on the first failing command, unset variable or failing pipeline stage.
set -euo pipefail
# models/ directory next to the directory containing this script, as an
# absolute path; created if missing.
MODELS_DIR="$(cd "$(dirname "$0")/.." && pwd)/models"
mkdir -p "$MODELS_DIR"
# ANSI colour escapes, expanded by `echo -e` below.
GREEN='\033[0;32m'; YELLOW='\033[1;33m'; RED='\033[0;31m'; NC='\033[0m'
# check_hf_cli — return normally if huggingface-cli is available; otherwise
# print an install hint and terminate the script with status 1.
check_hf_cli() {
  command -v huggingface-cli &>/dev/null && return 0
  echo -e "${RED}Error: huggingface-cli not found.${NC}"
  echo "Install with: pip install huggingface_hub"
  exit 1
}
# download KEY REPO FILENAME SIZE_HINT
# Fetch FILENAME from the Hugging Face repo REPO into MODELS_DIR unless a file
# of that name is already there. KEY and SIZE_HINT are only used in messages.
download() {
  local key="$1" repo="$2" filename="$3" size_hint="$4"
  local dest="$MODELS_DIR/$filename"

  if [[ ! -f "$dest" ]]; then
    echo -e "${GREEN}[$key]${NC} Downloading $filename (~$size_hint) from $repo ..."
    huggingface-cli download "$repo" "$filename" --local-dir "$MODELS_DIR"
    echo -e "${GREEN}[$key]${NC} Done: $MODELS_DIR/$filename"
    return
  fi

  echo -e "${YELLOW}[$key]${NC} Already exists: $filename — skipping"
}
# One wrapper per model key. Each passes key, Hugging Face repo, GGUF file name
# and an approximate size (shown in the progress message) to download().

download_smollm3() {
  local repo="bartowski/HuggingFaceTB_SmolLM3-3B-GGUF"
  local file="HuggingFaceTB_SmolLM3-3B-Q4_K_M.gguf"
  download "smollm3" "$repo" "$file" "1.9 GB"
}

download_gemma4_e2b() {
  local repo="bartowski/google_gemma-4-E2B-it-GGUF"
  local file="google_gemma-4-E2B-it-Q4_K_M.gguf"
  download "gemma4-e2b" "$repo" "$file" "2.9 GB"
}

download_gemma4_e4b() {
  local repo="bartowski/google_gemma-4-E4B-it-GGUF"
  local file="google_gemma-4-E4B-it-Q4_K_M.gguf"
  download "gemma4-e4b" "$repo" "$file" "4.7 GB"
}

download_qwen3_4b() {
  local repo="bartowski/Qwen3-4B-GGUF"
  local file="Qwen3-4B-Q4_K_M.gguf"
  download "qwen3-4b" "$repo" "$file" "2.4 GB"
}

download_qwen35_9b() {
  local repo="Jackrong/Qwen3.5-9B-Claude-4.6-Opus-Reasoning-Distilled-v2-GGUF"
  local file="Qwen3.5-9B.Q8_0.gguf"
  download "qwen35-9b" "$repo" "$file" "8.9 GB"
}
# main [KEY...]
# Download the models named by KEY (see "Available keys" in the file header).
# No arguments, or "all" as the first argument, selects every model; "all" in a
# later position downloads every model at that point. An unknown key prints
# the valid keys and exits with status 1. Ends with a listing of the .gguf
# files in MODELS_DIR.
main() {
  check_hf_cli

  local -a every_model=(smollm3 gemma4-e2b gemma4-e4b qwen3-4b qwen35-9b)
  local -a targets=("$@")
  if (( $# == 0 )) || [[ "$1" == "all" ]]; then
    targets=("${every_model[@]}")
  fi

  local model_key
  for target in "${targets[@]}"; do
    case "$target" in
      smollm3 | gemma4-e2b | gemma4-e4b | qwen3-4b | qwen35-9b)
        # Key → function name: dashes become underscores.
        "download_${target//-/_}"
        ;;
      all)
        for model_key in "${every_model[@]}"; do
          "download_${model_key//-/_}"
        done
        ;;
      *)
        echo -e "${RED}Unknown model: $target${NC}"
        echo "Valid keys: smollm3 | gemma4-e2b | gemma4-e4b | qwen3-4b | qwen35-9b | all"
        exit 1
        ;;
    esac
  done

  echo ""
  echo "Models directory:"
  ls -lh "$MODELS_DIR"/*.gguf 2>/dev/null || echo "(no .gguf files found)"
}

main "$@"

246
scripts/kv_quant_test.sh Normal file
View File

@@ -0,0 +1,246 @@
#!/bin/bash
# KV cache quantization test using llama-perplexity.
# Image: local/llama-cpp-turboquant:full-cuda-sm75-mmq (FORCE_MMQ, turbo2/3/4 support)
#
# Tests KV types: f16 (baseline) + q8_0/q4_0/turbo2 for Q4_K_M models
# f16 (baseline) + turbo2/3/4 for Qwen3.5-9B Q8_0
# Quality gate: PPL delta vs f16 < 0.5 (lossless for practical use)
#
# Usage: bash /scripts/kv_quant_test.sh [MODEL_KEY]
# MODEL_KEY: smollm3 | e2b | e4b | q3 | qwen35q | all (default: all)
#
# `-e` is not set: run_ppl returns non-zero on failure and the main loop
# inspects `$?` itself, so a failing run must not abort the script.
set -uo pipefail
# Model key to test, or "all"; taken from the first positional argument.
TARGET="${1:-all}"
# Paths of the GGUF files, one per model key.
M_SMOL="/models/HuggingFaceTB_SmolLM3-3B-Q4_K_M.gguf"
M_E2B="/models/google_gemma-4-E2B-it-Q4_K_M.gguf"
M_E4B="/models/google_gemma-4-E4B-it-Q4_K_M.gguf"
M_Q3="/models/Qwen3-4B-Q4_K_M.gguf"
M_Q35="/models/Qwen3.5-9B.Q8_0.gguf"
# Per-model settings, indexed by model key:
#   NGL         - value passed to llama-perplexity as -ngl
#   BASE_CTX    - ctx printed in the per-model banner; also the starting value
#                 of every KV type's "best ctx" before any run succeeds
#   PPL_TIMEOUT - duration handed to timeout(1) for a single perplexity run
declare -A NGL=([smollm3]=99 [e2b]=99 [e4b]=42 [q3]=99 [qwen35q]=11)
declare -A BASE_CTX=([smollm3]=18432 [e2b]=32768 [e4b]=20480 [q3]=8192 [qwen35q]=8192)
declare -A PPL_TIMEOUT=([smollm3]=300 [e2b]=300 [e4b]=300 [q3]=300 [qwen35q]=600)
# Per-model KV types to test (f16 is always the baseline)
# Standard Q4_K_M models: q8_0/q4_0 + turbo2 (all supported by TurboQuant image)
# Qwen3.5-9B: designed for turbo KV — test turbo2/3/4 only (q4_0 would also work but less relevant)
# Each value is a space-separated list that the main loop splits into an array.
declare -A MODEL_KV_TYPES=(
[smollm3]="q8_0 q4_0 turbo2"
[e2b]="q8_0 q4_0 turbo2"
[e4b]="q8_0 q4_0 turbo2"
[q3]="q8_0 q4_0 turbo2"
[qwen35q]="turbo2 turbo3 turbo4"
)
# ctx candidates per model, listed in ascending order
SMOL_CTXS=(8192 12288 16384 18432 20480 24576 32768 40960 49152)
E2B_CTXS=(8192 16384 24576 32768 40960 49152 65536)
E4B_CTXS=(8192 12288 16384 20480 24576 32768 40960)
Q3_CTXS=(4096 6144 8192 10240 12288 16384 24576 32768)
Q35_CTXS=(4096 8192 16384 24576 32768 40960 49152)
# Maps a model key to the NAME of its ctx array above (not to the values);
# the main loop dereferences that name to obtain the candidate list.
declare -A CTX_CANDIDATES=(
[smollm3]="SMOL_CTXS" [e2b]="E2B_CTXS" [e4b]="E4B_CTXS"
[q3]="Q3_CTXS" [qwen35q]="Q35_CTXS")
# ANSI colour escapes, interpolated into printf format strings below.
GREEN='\033[0;32m'; RED='\033[0;31m'; YELLOW='\033[1;33m'; BLUE='\033[0;34m'; NC='\033[0m'
# Horizontal rule used to frame report sections.
HR="======================================================================"
# Synthetic PPL file — 4000 lines, deterministic, no network needed
# run_ppl feeds this path to llama-perplexity via -f.
PPL_FILE="/tmp/kv_ppl_input.txt"
# Create the synthetic perplexity input at $PPL_FILE if it does not exist yet.
#
# The text is 4000 lines drawn from a fixed sentence list with a fixed random
# seed, so every run of the script evaluates the same input.
#
# Globals:   PPL_FILE (read) - destination path
# Outputs:   an error message on stderr when generation fails
# Returns:   0 if the file already existed or was created, 1 otherwise
ensure_ppl_file() {
  [[ -f "$PPL_FILE" ]] && return 0

  # Generate into a sibling temp file and rename on success, so an interrupted
  # run can never leave a truncated file that later runs would accept.
  local tmp="${PPL_FILE}.tmp.$$"

  # The destination is passed as argv[1] so the generator always writes where
  # PPL_FILE points instead of duplicating the path inside the Python code.
  if ! python3 - "$tmp" << 'PY'
import random
import sys

random.seed(42)
sentences = [
    "The transformer architecture uses self-attention mechanisms to process sequences.",
    "Large language models require significant computational resources for training.",
    "Quantization reduces memory usage by storing weights in lower precision formats.",
    "Flash attention enables memory-efficient computation for long context windows.",
    "The key-value cache stores intermediate attention states during generation.",
    "Context length determines how many tokens the model can attend to simultaneously.",
    "Perplexity measures how well a probability model predicts a sample of text.",
    "Lower perplexity values indicate better language modeling performance overall.",
    "GPU memory bandwidth is the primary bottleneck for autoregressive token generation.",
    "Grouped query attention reduces KV cache size by sharing keys across head groups.",
    "Rotary position embeddings encode relative position information in attention queries.",
    "Mixture of experts models route tokens through specialized feed-forward networks.",
    "Continuous batching allows servers to process multiple requests simultaneously.",
    "KV cache quantization trades a small quality loss for significantly larger contexts.",
]
lines = [random.choice(sentences) for _ in range(4000)]
with open(sys.argv[1], 'w') as fh:
    fh.write('\n'.join(lines) + '\n')
PY
  then
    rm -f -- "$tmp"
    echo "ensure_ppl_file: failed to generate $PPL_FILE" >&2
    return 1
  fi

  if ! mv -f -- "$tmp" "$PPL_FILE"; then
    rm -f -- "$tmp"
    echo "ensure_ppl_file: failed to move generated text to $PPL_FILE" >&2
    return 1
  fi
}
# run_ppl MODEL NGL KV CTX TIMEOUT [EXTRA_ARGS...]
#
# Runs one llama-perplexity pass (flash attention on, a single chunk, the same
# cache type for K and V) under timeout(1).
#
# Globals: PPL_FILE (read) - text file passed via -f
# Outputs: the last "[N]value" perplexity figure found on the tool's stdout
# Returns: 0 on success; 1 when the command exits non-zero (including a
#          timeout), when its stderr contains an out-of-memory marker, or
#          when no perplexity figure could be parsed.
run_ppl() {
  local gguf=$1 gpu_layers=$2 cache_type=$3 n_ctx=$4 limit_s=$5
  shift 5 # "$@" now holds only the optional extra flags

  local err_file out_file
  err_file=$(mktemp)
  out_file=$(mktemp)

  local -a cmd=(
    /app/llama-perplexity
    -m "$gguf"
    -ngl "$gpu_layers"
    -fa on
    -c "$n_ctx"
    -ctk "$cache_type" -ctv "$cache_type"
    -f "$PPL_FILE"
    --chunks 1
    "$@"
  )

  local status=0
  timeout "$limit_s" "${cmd[@]}" > "$out_file" 2> "$err_file" || status=$?

  local stderr_text
  stderr_text=$(cat "$err_file")
  rm -f "$err_file"

  if [[ "$status" != "0" ]]; then
    rm -f "$out_file"
    return 1
  fi

  # A zero exit status is not trusted on its own: allocation failures that
  # were only logged to stderr also count as a failed run.
  local oom_markers="out of memory\|failed to allocate\|cudaMalloc failed\|CUDA_ERROR_OUT_OF_MEMORY\|ggml_cuda_malloc\|cannot allocate memory"
  if grep -qi "$oom_markers" <<< "$stderr_text"; then
    rm -f "$out_file"
    return 1
  fi

  local value
  value=$(grep -oP '\[\d+\]\K[0-9.]+' "$out_file" | tail -1)
  rm -f "$out_file"

  if [[ -z "$value" ]]; then
    return 1
  fi
  echo "$value"
}
# ---------------------------------------------------------------------------
# Report header: make sure the perplexity input exists, print the run banner,
# then start with an empty result list for the final summary table.
ensure_ppl_file

printf '%s\n' \
  "$HR" \
  "KV CACHE QUANT TEST (llama-perplexity) — TurboQuant image (FORCE_MMQ SM75)" \
  "$(date)" \
  "GPU: $(nvidia-smi --query-gpu=name,memory.total --format=csv,noheader 2>/dev/null)" \
  "$HR" \
  "Standard models: f16 baseline | q8_0 | q4_0 | turbo2 (2-bit, 8x smaller KV vs f16)" \
  "Qwen3.5-9B: f16 baseline | turbo2 | turbo3 | turbo4 (TurboQuant KV types)" \
  "Quality gate: PPL delta vs f16 < 0.5" \
  ""

# Rows of "label|kv|ctx", appended by the per-model loop.
declare -a SUMMARY=()
# Per-model sweep.
#
# For every selected model, walk its ctx candidates in order and run
# llama-perplexity once with an f16 KV cache (the reference) and once per
# quantized KV type. Each table cell shows the PPL and, where an f16 reference
# exists at that ctx, the delta against it (green = within the 0.5 gate,
# yellow = outside it). Once a KV type fails at some ctx it is reported as
# "OOM" for all larger ctx values without being run again. run_ppl reports
# timeouts and crashes the same way as real out-of-memory failures, so "OOM"
# in the table covers all of them.
#
# best_ctx_per_kv[kv] ends up as the largest ctx at which that KV type ran
# AND did not fail the quality gate; it starts at BASE_CTX and keeps that
# value if no run qualifies. Where f16 itself no longer fits there is no
# reference to compare against, so a successful quantized run counts.
for entry in \
  "smollm3:SmolLM3-3B:$M_SMOL" \
  "e2b:Gemma4-E2B:$M_E2B" \
  "e4b:Gemma4-E4B:$M_E4B" \
  "q3:Qwen3-4B:$M_Q3" \
  "qwen35q:Qwen3.5-9B:$M_Q35"
do
  IFS=':' read -r key lbl model <<< "$entry"
  [[ "$TARGET" != "all" && "$TARGET" != "$key" ]] && continue

  # CTX_CANDIDATES holds the NAME of the ctx array; expand it through
  # indirect expansion rather than eval.
  ctx_ref="${CTX_CANDIDATES[$key]}[@]"
  ctxs=("${!ctx_ref}")
  ngl="${NGL[$key]}"
  timeout_s="${PPL_TIMEOUT[$key]}"
  IFS=' ' read -ra kv_types <<< "${MODEL_KV_TYPES[$key]}"
  # Extra args for qwen35-9b (flash attn already set; no mlock needed for PPL correctness)
  extra_args=()

  printf "${BLUE}=== %s (base ctx=%s, ngl=%s) ===${NC}\n" \
    "$lbl" "${BASE_CTX[$key]}" "$ngl"

  # Dynamic header based on KV types for this model
  printf " %-10s %-18s" "ctx" "f16 (PPL)"
  for kv in "${kv_types[@]}"; do
    printf " %-20s" "$kv (PPL/delta)"
  done
  printf "\n"
  printf " %-10s %-18s" "---" "---------"
  for kv in "${kv_types[@]}"; do
    printf " %-20s" "--------------------"
  done
  printf "\n"

  declare -A best_ctx_per_kv=([f16]="${BASE_CTX[$key]}")
  for kv in "${kv_types[@]}"; do best_ctx_per_kv[$kv]="${BASE_CTX[$key]}"; done
  # oom_kv[kv]=1 marks a KV type that already failed at a smaller ctx.
  declare -A oom_kv=([f16]=0)
  for kv in "${kv_types[@]}"; do oom_kv[$kv]=0; done

  for ctx in "${ctxs[@]}"; do
    printf " %-10s" "$ctx"

    # f16 baseline
    f16_ppl=""
    if [[ "${oom_kv[f16]}" == "1" ]]; then
      printf " ${RED}%-18s${NC}" "OOM"
    elif f16_ppl=$(run_ppl "$model" "$ngl" "f16" "$ctx" "$timeout_s" "${extra_args[@]}"); then
      printf " ${GREEN}%-18s${NC}" "$f16_ppl"
      best_ctx_per_kv[f16]=$ctx
    else
      f16_ppl=""
      printf " ${RED}%-18s${NC}" "OOM"
      oom_kv[f16]=1
    fi

    # KV type columns
    for kv in "${kv_types[@]}"; do
      if [[ "${oom_kv[$kv]}" == "1" ]]; then
        printf " ${RED}%-20s${NC}" "OOM"
        continue
      fi
      if ! ppl=$(run_ppl "$model" "$ngl" "$kv" "$ctx" "$timeout_s" "${extra_args[@]}"); then
        printf " ${RED}%-20s${NC}" "OOM"
        oom_kv[$kv]=1
        continue
      fi

      if [[ -n "$f16_ppl" ]]; then
        delta=$(python3 -c "print(f'{float(\"$ppl\")-float(\"$f16_ppl\"):+.2f}')" 2>/dev/null || echo "?")
        ok=$(python3 -c "exit(0 if abs(float('$ppl')-float('$f16_ppl'))<0.5 else 1)" 2>/dev/null && echo ok || echo bad)
        if [[ "$ok" == "ok" ]]; then
          # Only a run that passes the gate may raise the recommended ctx.
          best_ctx_per_kv[$kv]=$ctx
          printf " ${GREEN}%-20s${NC}" "${ppl}(${delta})"
        else
          printf " ${YELLOW}%-20s${NC}" "${ppl}(${delta})"
        fi
      else
        # No f16 reference at this ctx: the gate cannot be evaluated, so the
        # run counts on the strength of fitting in memory.
        best_ctx_per_kv[$kv]=$ctx
        printf " ${GREEN}%-20s${NC}" "$ppl"
      fi
    done
    echo ""
  done
  echo ""

  # Best recommendation: the quantized KV type with the largest qualifying
  # ctx, provided it beats the model's base ctx; otherwise f16 at base ctx.
  overall_best_ctx="${BASE_CTX[$key]}"
  overall_best_kv="f16"
  for kv in "${kv_types[@]}"; do
    bctx="${best_ctx_per_kv[$kv]}"
    SUMMARY+=("$lbl|$kv|$bctx")
    if [[ "$bctx" -gt "$overall_best_ctx" ]]; then
      overall_best_ctx=$bctx; overall_best_kv=$kv
    fi
  done
  SUMMARY+=("$lbl|f16|${best_ctx_per_kv[f16]}")
  printf " ${GREEN}Best: %s → max ctx %s${NC}\n\n" "$overall_best_kv" "$overall_best_ctx"

  # The associative arrays are re-declared for the next model.
  unset best_ctx_per_kv oom_kv
done
# Final summary table: one row per (model, KV type) collected in SUMMARY,
# followed by a reminder to copy the results into the per-model env files.
printf '%s\n' "$HR" "SUMMARY" "$HR"
# printf re-applies the format to each further group of three arguments, so
# this prints the header row and the underline row.
printf "%-16s %-8s %s\n" \
  "Model" "KV" "Max Ctx (no OOM + PPL delta<0.5)" \
  "-----" "--" "---------------------------------"
for row in "${SUMMARY[@]}"; do
  # Rows are stored as "label|kv|ctx".
  IFS='|' read -r lbl kv ctx <<< "$row"
  printf "${GREEN}%-16s %-8s %s${NC}\n" "$lbl" "$kv" "$ctx"
done
printf '%s\n' \
  "$HR" \
  "Reminder: update envs/.env.<model>: CACHE_TYPE_K/V=<best_kv> CTX_SIZE=<max_ctx>" \
  "$HR"

215
scripts/quality_test.sh Normal file
View File

@@ -0,0 +1,215 @@
#!/bin/bash
# Quality tests for all 4 models — runs inside full-cuda container.
# Tests: coding tasks + needle-in-haystack at 1K/8K ctx.
#
# Inference parameters sourced from official HF model cards:
# SmolLM3: /no_think in SYSTEM prompt (-sys); temp=0.6 top_p=0.95
# Qwen3: /no_think in SYSTEM prompt (-sys); temp=0.7 top_p=0.8 top_k=20
# DO NOT use greedy (temp=0) — causes endless repetition per Qwen3 docs
# Gemma4: No thinking mode; temp=0.7 top_p=0.95
#
# `-e` is not set, so a failing model run is recorded as a test result
# instead of aborting the script.
set -uo pipefail
# Paths of the GGUF files, one per model key.
M_SMOL="/models/HuggingFaceTB_SmolLM3-3B-Q4_K_M.gguf"
M_E2B="/models/google_gemma-4-E2B-it-Q4_K_M.gguf"
M_E4B="/models/google_gemma-4-E4B-it-Q4_K_M.gguf"
M_Q3="/models/Qwen3-4B-Q4_K_M.gguf"
# Per-model settings, indexed by model key:
#   NGL     - value passed to llama-cli as -ngl
#   MAX_CTX - a needle test runs only when its ctx is strictly below this
declare -A NGL=([smollm3]=99 [e2b]=99 [e4b]=42 [q3]=99)
declare -A MAX_CTX=([smollm3]=24576 [e2b]=32768 [e4b]=24576 [q3]=8192)
# Per-model sampling params (HF model card sources)
# A TOPK value of "0" means run() does not pass --top-k at all.
declare -A TEMP=([smollm3]="0.6" [e2b]="0.7" [e4b]="0.7" [q3]="0.7")
declare -A TOPP=([smollm3]="0.95" [e2b]="0.95" [e4b]="0.95" [q3]="0.8")
declare -A TOPK=([smollm3]="0" [e2b]="0" [e4b]="0" [q3]="20")
# /no_think in system prompt disables thinking for SmolLM3 and Qwen3
# An empty value means no -sys argument is passed.
declare -A SYSP=([smollm3]="/no_think" [e2b]="" [e4b]="" [q3]="/no_think")
# Running counters, updated by check() and by the needle loop.
PASS=0; FAIL=0; TOTAL=0
# ANSI colour escapes, interpolated into printf format strings below.
GREEN='\033[0;32m'; RED='\033[0;31m'; YELLOW='\033[1;33m'; NC='\033[0m'
# sed script to strip llama-cli interactive UI banner from stdout.
# ▄ (U+2584) and █ (U+2588) appear in the llama.cpp ASCII logo, sometimes
# with leading spaces — match anywhere on the line to be safe.
# Every rule deletes matching lines; empty and whitespace-only lines are
# dropped as well, so blank lines in model output do not survive.
# NOTE(review): the `[ Prompt:` rule is listed twice and `/^$/d` is already
# covered by the `[[:space:]]*` rule — harmless, but one of each could go.
STRIP_BANNER='/^$/d
/^Loading model/d
/^[[:space:]]*$/d
/[▄█]/d
/^build /d
/^model /d
/^modalities/d
/^available commands/d
/^ \//d
/^\[ Prompt:/d
/^\[ Prompt:/d
/^Exiting/d
/^> /d
'
# check LABEL OUTPUT PATTERN...
#
# Records one test result: the test passes when every PATTERN (extended
# regex, case-insensitive) matches somewhere in OUTPUT. With no patterns the
# test passes. On failure the last three non-empty lines of OUTPUT are shown,
# prefixed with " | ", to help diagnose what the model actually said.
#
# Globals:   TOTAL, PASS, FAIL (updated); GREEN, RED, NC (read)
# Arguments: $1 - label printed next to PASS/FAIL
#            $2 - text to search
#            $3... - patterns that must all match
check() {
  local lbl="$1" out="$2"
  shift 2

  local pat ok=1
  for pat in "$@"; do
    # Here-string instead of `printf | grep -q`: with `pipefail` set, grep -q
    # exiting early can make the writer die of SIGPIPE and turn a match into
    # a failure. `-e` keeps a pattern starting with "-" from being read as a
    # grep option.
    if ! grep -qiE -e "$pat" <<< "$out"; then
      ok=0
      break
    fi
  done

  TOTAL=$((TOTAL+1))
  if [[ "$ok" == "1" ]]; then
    PASS=$((PASS+1)); printf " ${GREEN}PASS${NC} %s\n" "$lbl"
  else
    FAIL=$((FAIL+1)); printf " ${RED}FAIL${NC} %s\n" "$lbl"
    grep -v '^$' <<< "$out" | tail -3 | sed 's/^/ | /'
  fi
}
# Filter: remove completed "thinking" blocks from model output on stdin.
#
# Two block syntaxes are recognised, applied in this order:
#   [Start thinking] ... [End thinking]   (Gemma4)
#   <think> ... </think>                  (Qwen3 / SmolLM3)
# Only blocks that have both delimiters are removed. An unterminated block
# (for example when generation hit the token limit mid-thought) is left in
# place so the check patterns can still match reasoning inside it.
# The remaining text is printed with surrounding whitespace trimmed.
# If python3 cannot be run, the input is passed through unchanged by cat.
strip_think() {
  python3 -c '
import re
import sys

text = sys.stdin.read()
delimiters = (
    (r"\[Start thinking\]", r"\[End thinking\]"),
    (r"<think>", r"</think>"),
)
# Sequential passes, non-greedy, with "." spanning newlines.
for opener, closer in delimiters:
    text = re.sub(opener + r".*?" + closer, "", text, flags=re.DOTALL)
print(text.strip())
' 2>/dev/null || cat
}
# run KEY MODEL PROMPT MAX_TOKENS [SYS_OVERRIDE]
#
# Runs a single-turn llama-cli generation (5-minute timeout) and prints the
# reply with the UI banner and completed thinking blocks removed.
#
# SYS_OVERRIDE, when given, replaces the model's default system prompt
# (SYSP[KEY]). Passing "" explicitly means "no system prompt", which leaves
# thinking switched ON for SmolLM3 and Qwen3; for those two models that case
# samples with temp 0.6 / top-p 0.95. In every other case the per-model
# TEMP/TOPP values apply. --top-k is passed only when TOPK[KEY] is not "0".
#
# Globals: NGL, TEMP, TOPP, TOPK, SYSP, STRIP_BANNER (read)
run() {
  local key=$1 model=$2 prompt=$3 max_tok=$4

  # A 5th argument, even an empty one, takes precedence over SYSP[$key].
  local use_sys
  if (( $# >= 5 )); then
    use_sys="$5"
  else
    use_sys="${SYSP[$key]}"
  fi

  # Start from the per-model sampling values, then apply the thinking-mode
  # override where it is relevant.
  local temp="${TEMP[$key]}" topp="${TOPP[$key]}" topk="${TOPK[$key]}"
  if [[ -z "$use_sys" ]]; then
    case "$key" in
      smollm3 | q3) temp="0.6"; topp="0.95" ;;
    esac
  fi

  local -a cli_args=(
    -m "$model" -ngl "${NGL[$key]}"
    -n "$max_tok" --temp "$temp" --top-p "$topp"
  )
  if [[ "$topk" != "0" ]]; then
    cli_args+=(--top-k "$topk")
  fi
  cli_args+=(--repeat-penalty 1.1 -fa on --mmap --single-turn)
  if [[ -n "$use_sys" ]]; then
    cli_args+=(-sys "$use_sys")
  fi
  cli_args+=(-p "$prompt")

  timeout 300 /app/llama-cli "${cli_args[@]}" 2>/dev/null \
    | sed "$STRIP_BANNER" \
    | strip_think
}
# needle_test KEY MODEL NEEDLE CTX
#
# Needle-in-a-haystack recall test. Builds a prompt of roughly CTX tokens of
# filler text with "SECRET_VALUE=<NEEDLE>" planted in the middle, asks the
# model for the value, and prints "FOUND" if NEEDLE occurs in the reply,
# otherwise "MISSED (<first 80 characters of the reply>)".
#
# The model runs with its default system prompt (SYSP[KEY]) and its per-model
# sampling values, including --top-k where TOPK[KEY] is set, matching run().
#
# Globals: NGL, TEMP, TOPP, TOPK, SYSP, STRIP_BANNER (read)
needle_test() {
  local key=$1 model=$2 needle=$3 ctx=$4
  local ngl="${NGL[$key]}"
  local temp="${TEMP[$key]}" topp="${TOPP[$key]}" sys="${SYSP[$key]}"
  local topk="${TOPK[$key]:-0}"

  local sys_arg=()
  [[ -n "$sys" ]] && sys_arg=(-sys "$sys")
  local topk_arg=()
  [[ "$topk" != "0" ]] && topk_arg=(--top-k "$topk")

  # filler: ctx/2 tokens each side, 1 token ~4 chars
  local half_chars=$(( ctx * 2 ))
  # The filler sentence is 45 characters; +2 guarantees enough text to cut.
  local reps=$(( half_chars / 45 + 2 ))
  local filler
  filler=$(python3 -c "print('The quick brown fox jumps over the lazy dog. ' * $reps)" 2>/dev/null \
    | head -c "$half_chars")

  local prompt
  printf -v prompt \
    '%s\nSECRET_VALUE=%s\n%s\nWhat is SECRET_VALUE? Reply with only the value, nothing else.' \
    "$filler" "$needle" "$filler"

  # Headroom on top of the filler for the question and the generated reply.
  local ctx_size=$(( ctx + 512 ))
  local out
  out=$(timeout 180 /app/llama-cli -m "$model" -ngl "$ngl" \
    -n 512 --temp "$temp" --top-p "$topp" "${topk_arg[@]}" \
    -fa on --mmap --single-turn \
    -c "$ctx_size" "${sys_arg[@]}" -p "$prompt" 2>/dev/null \
    | sed "$STRIP_BANNER" \
    | strip_think)

  # Remove newlines before matching so a needle that the model wrapped across
  # two lines is still recognised. The match is a literal substring test done
  # in the shell, which avoids a `printf | grep -q` pipeline (a SIGPIPE race
  # under pipefail) and any option parsing of NEEDLE.
  local joined="${out//$'\n'/}"
  if [[ "$joined" == *"$needle"* ]]; then
    echo "FOUND"
  else
    # For the diagnostic snippet keep line breaks visible as spaces.
    local flat="${out//$'\n'/ }"
    local snip="${flat:0:80}"
    echo "MISSED (${snip:-<empty>})"
  fi
}
# Horizontal rule used to frame the report header and the final result line.
HR="======================================================================"
# Report header: timestamp, GPU name (empty if nvidia-smi is unavailable),
# and a one-line recap of the sampling settings used per model family.
echo "$HR"
echo "QUALITY TESTS — ALL MODELS — $(date)"
echo "GPU: $(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null)"
echo "$HR"
printf "Temps: SmolLM3=0.6/0.95 | Qwen3=0.7/0.8/k20 | Gemma4=0.7/0.95\n"
printf "/no_think via -sys for needle tests | thinking ON for coding bug test\n\n"
# Prompt for coding test 1; the reply is checked for "def ", "Fizz" and "Buzz".
CODING_FIZZBUZZ='Write ONLY the Python function fizzbuzz(n). It returns a list where multiples of 3 are "Fizz", multiples of 5 are "Buzz", multiples of both are "FizzBuzz", others are the number as string. Output code only, no prose.'
# Prompt for coding test 2; the text below is sent to the model verbatim.
# hi is correctly len(arr)-1 to have ONE unambiguous bug: lo=mid (infinite loop)
# NOTE(review): with an inclusive hi, `while lo < hi` also exits without
# examining the last remaining element, so a model may legitimately name that
# instead — confirm the accepted patterns cover the answers you want to pass.
CODING_BUG='Find the bug in this Python function and explain it in one sentence:
def binary_search(arr, target):
lo, hi = 0, len(arr) - 1
while lo < hi:
mid = (lo + hi) // 2
if arr[mid] == target:
return mid
elif arr[mid] < target:
lo = mid
else:
hi = mid
return -1'
# Main test loop: for each model run the two coding checks and the
# needle-in-a-haystack probes, then print the overall tally.
for entry in \
  "smollm3:SmolLM3-3B:$M_SMOL" \
  "e2b:Gemma4-E2B:$M_E2B" \
  "e4b:Gemma4-E4B:$M_E4B" \
  "q3:Qwen3-4B:$M_Q3"
do
  IFS=':' read -r key lbl model <<< "$entry"
  echo "=== $lbl ==="

  # Coding test 1: FizzBuzz — the reply must contain def, Fizz and Buzz.
  out=$(run "$key" "$model" "$CODING_FIZZBUZZ" 512)
  check "FizzBuzz: def + Fizz + Buzz in output" "$out" "def " "Fizz" "Buzz"

  # Coding test 2: bug hunt. The explicit "" drops the /no_think system
  # prompt, so every model reasons with thinking enabled (Gemma4 does so by
  # default); hence the larger token budget.
  out=$(run "$key" "$model" "$CODING_BUG" 3000 "")
  check "Bug: identify lo=mid / infinite loop" "$out" \
    "lo.*=.*mid.*\+.*1|lo\+1|infinite loop|never.*advance|never.*progress|stuck|lo should be|lo\b.*never.*incr"

  # Needle-in-haystack
  NEEDLE="QX7-ALPHA-9"
  for ctx in 1024 8192; do
    # A probe runs only when ctx is strictly below the model's max: at
    # ctx == max the prompt would fill the context and leave no room for
    # output.
    if [[ "$ctx" -ge "${MAX_CTX[$key]}" ]]; then
      printf " ${YELLOW}SKIP${NC} Needle @ %s tok (exceeds model max %s)\n" "$ctx" "${MAX_CTX[$key]}"
      continue
    fi

    result=$(needle_test "$key" "$model" "$NEEDLE" "$ctx")
    TOTAL=$((TOTAL+1))
    case "$result" in
      FOUND)
        PASS=$((PASS+1)); printf " ${GREEN}PASS${NC} Needle @ %s tok: %s\n" "$ctx" "$result"
        ;;
      *)
        FAIL=$((FAIL+1)); printf " ${RED}FAIL${NC} Needle @ %s tok: %s\n" "$ctx" "$result"
        ;;
    esac
  done
  echo ""
done

echo "$HR"
printf "RESULTS: ${GREEN}%s PASSED${NC} / ${RED}%s FAILED${NC} / %s TOTAL\n" "$PASS" "$FAIL" "$TOTAL"
echo "$HR"