Files
llama-cpp/scripts/benchmark.sh
Giancarmine Salucci 4ad296608b Initial commit: tuned multi-model llama.cpp stack
- 5 models: SmolLM3-3B, Gemma4-E2B/E4B, Qwen3-4B, Qwen3.5-9B
- TurboQuant image (FORCE_MMQ): +6-11% free speed on Turing GPUs
- Bigctx profiles (-nkvo KV in RAM): 2-16x context gain
- turbo2 KV: 2x smaller, benchmarked against PPL quality gate
- Per-model env files with justified parameters
- kv_quant_test.sh + cpu_ctx_test.sh benchmark scripts
- docs/FINDINGS.md: surprises, pitfalls, recommendations
- docs/ARCHITECTURE.md: compose + test script design
2026-05-06 15:56:40 +02:00

336 lines
13 KiB
Bash
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env bash
# =============================================================================
# llama.cpp Automated Benchmark — Qwen3.5-9B on GTX 1650 Ti (4 GB VRAM)
#
# Runs for BOTH official llama.cpp and TurboQuant fork.
# VARIANT env var selects which KV type set to sweep:
# VARIANT=official → f16 q8_0 q5_0 q4_0 iq4_nl
# VARIANT=turboquant → f16 q8_0 iq4_nl turbo4 turbo3 turbo2
#
# Output: CSV + recommended .env per variant, plus a final comparison table.
#
# Run:
# docker compose --profile benchmark run --rm benchmark (official)
# docker compose --profile benchmark run --rm benchmark-turbo (turboquant)
# =============================================================================
set -o errexit -o nounset -o pipefail

# Put /app (TurboQuant image) and /usr/local/bin (official image) at the front
# of the search path so llama-bench resolves in either container.
PATH="/app:/usr/local/bin:/usr/bin:/bin:${PATH:-}"
export PATH

# Inputs. Precedence: environment variable, then positional argument, then the
# built-in default.
if [[ -z "${MODEL:-}" ]]; then
  MODEL="${1:-/models/Qwen3.5-9B.Q8_0.gguf}"
fi
if [[ -z "${OUTPUT_DIR:-}" ]]; then
  OUTPUT_DIR="${2:-/results}"
fi
: "${VARIANT:=official}" # official | turboquant

# Every artefact of one run shares the same timestamp.
TIMESTAMP="$(date '+%Y%m%d_%H%M%S')"
printf -v RESULTS_CSV '%s/%s_results_%s.csv' "$OUTPUT_DIR" "$VARIANT" "$TIMESTAMP"
printf -v LOG '%s/%s_benchmark_%s.log' "$OUTPUT_DIR" "$VARIANT" "$TIMESTAMP"

# ── Baseline config ────────────────────────────────────────────────────────
THREADS=6 THREADS_BATCH=12
BATCH_SIZE=2048 UBATCH_SIZE=512
PROMPT_TOKENS=512 GEN_TOKENS=32
REPETITIONS=1

# ── KV type sets per variant ───────────────────────────────────────────────
# turbo2=2-bit (6.4× vs f16), turbo3=3-bit, turbo4=4-bit — TurboQuant only.
# Any other VARIANT value gets the official llama.cpp set; iq4_nl is the
# non-linear 4-bit i-quant (non-uniform scale).
case "$VARIANT" in
  turboquant)
    KV_TYPES=(f16 q8_0 iq4_nl turbo4 turbo3 turbo2)
    ;;
  *)
    KV_TYPES=(f16 q8_0 q5_0 q4_0 iq4_nl)
    ;;
esac

# ── Sweep tables ───────────────────────────────────────────────────────────
# GPU layers (Q8_0 ~297 MB/layer, 3717 MiB VRAM → max ~12 layers).
NGL_VALUES=(6 9 12 13 14 99)
# Context: passed as -p so the KV cache is stressed at the given size.
CTX_VALUES=(128 512 1024 2048 4096 8192)
# Logical batch sizes.
BATCH_VALUES=(512 1024 2048 4096)
mkdir -p -- "$OUTPUT_DIR"

# log MESSAGE...
#   Print the arguments (joined by spaces) on stdout and append the same line
#   to $LOG. printf is used instead of echo so a message that looks like an
#   echo option (for example "-n") is printed rather than swallowed.
log() { printf '%s\n' "$*" | tee -a "$LOG"; }

# sep
#   Log a 70-character horizontal rule.
sep() { log "$(printf '─%.0s' {1..70})"; }

# hdr TITLE...
#   Log TITLE framed by two rules.
hdr() {
  sep
  log " $*"
  sep
}
# Run banner: variant, model, detected GPU (or a CPU-only marker when
# nvidia-smi is missing or fails) and the KV types about to be swept.
log "llama.cpp Benchmark [${VARIANT}] — $(date)"
log "Model: $MODEL"
gpu_desc="$(nvidia-smi --query-gpu=name,memory.total --format=csv,noheader 2>/dev/null || echo 'CPU only')"
log "GPU: ${gpu_desc}"
log "KV set: ${KV_TYPES[*]}"
sep

# Start a fresh results file that contains only the column header.
printf '%s\n' \
  "variant,phase,ngl,ctx,kv_type_k,kv_type_v,flash_attn,batch_size,ubatch_size,threads,pp_tokens_per_sec,tg_tokens_per_sec,status" \
  > "$RESULTS_CSV"
# ── Helper: run llama-bench ────────────────────────────────────────────────
# Throughput (tokens/second) parsed from the most recent successful run_bench
# call; "0" when the corresponding row could not be found.
LAST_PP=0
LAST_TG=0

#######################################
# Print avg_ts of the first matching result row of llama-bench CSV output.
# Columns are located by header name (n_prompt, n_gen, avg_ts). If no header
# line is present the positions documented for b9014+ (34, 35, 40) are used
# and the first line is skipped, which is what the previous fixed-index
# parser did.
# Arguments: $1 - "pp" (row with n_gen == 0) or "tg" (row with n_prompt == 0
#                 and n_gen > 0)
#            $2 - raw llama-bench output (stdout and stderr mixed)
# Outputs:   the value on stdout, or nothing when no row matches
#######################################
_bench_avg_ts() {
  local kind=$1 csv=$2
  awk -F',' -v kind="$kind" '
    BEGIN { cp = 34; cg = 35; ct = 40; seen = 0 }
    { gsub(/"/, "") }                       # values are quoted; re-splits $0
    !seen {
      p = 0; g = 0; t = 0
      for (i = 1; i <= NF; i++) {
        if ($i == "n_prompt")      p = i
        else if ($i == "n_gen")    g = i
        else if ($i == "avg_ts")   t = i
      }
      if (p && g && t) { cp = p; cg = g; ct = t; seen = 1; next }
    }
    NR == 1 { next }
    ($ct + 0) <= 0 { next }
    kind == "pp" && $cg == "0" { print $ct + 0; exit }
    kind == "tg" && $cp == "0" && ($cg + 0) > 0 { print $ct + 0; exit }
  ' <<<"$csv"
}

#######################################
# Run one llama-bench configuration and append a result row to $RESULTS_CSV.
# Globals:   MODEL, GEN_TOKENS, THREADS, REPETITIONS, VARIANT, RESULTS_CSV
#            (read); LAST_PP, LAST_TG (written on success)
# Arguments: $1 ngl  $2 prompt size (-p)  $3 KV type (used for both K and V)
#            $4 flash-attention flag  $5 batch  $6 ubatch
#            $7 phase label for the CSV (default "test")
# Returns:   0 after writing an "ok" row; 1 when llama-bench exits non-zero
#            or exceeds the 300 s timeout (no row is written in that case —
#            the caller records the failure).
#######################################
run_bench() {
  local ngl=$1 ctx=$2 kv=$3 fa=$4 batch=$5 ubatch=$6 phase="${7:-test}"
  local bench_bin raw

  # Resolve through PATH so the binary is found wherever the image installs
  # it; fall back to the historical hard-coded location.
  bench_bin="$(command -v llama-bench)" || bench_bin="/app/llama-bench"

  # New llama-bench API (b9014+): -c and -tb removed; -p sets prompt/ctx size.
  # The KV type is applied to both the K (-ctk) and V (-ctv) cache.
  raw=$(timeout 300 "$bench_bin" \
    -m "$MODEL" \
    -ngl "$ngl" \
    -p "$ctx" \
    -n "$GEN_TOKENS" \
    -t "$THREADS" \
    -b "$batch" \
    -ub "$ubatch" \
    -ctk "$kv" \
    -ctv "$kv" \
    -fa "$fa" \
    -r "$REPETITIONS" \
    -o csv 2>&1) || return 1

  LAST_PP="$(_bench_avg_ts pp "$raw")"
  LAST_TG="$(_bench_avg_ts tg "$raw")"
  LAST_PP="${LAST_PP:-0}"
  LAST_TG="${LAST_TG:-0}"

  printf '%s\n' \
    "${VARIANT},${phase},${ngl},${ctx},${kv},${kv},${fa},${batch},${ubatch},${THREADS},${LAST_PP},${LAST_TG},ok" \
    >> "$RESULTS_CSV"
  return 0
}
# fail_row PHASE NGL CTX KV FA BATCH UBATCH
#   Append a "failed" row for the given configuration to $RESULTS_CSV. The KV
#   type fills both the K and V columns and both throughput columns are 0.
fail_row() {
  local -a cols=(
    "$VARIANT" "$1" "$2" "$3" "$4" "$4" "$5" "$6" "$7" "$THREADS" 0 0 failed
  )
  local IFS=,
  printf '%s\n' "${cols[*]}" >> "$RESULTS_CSV"
}
# ── Phase 1: GPU layer sweep ───────────────────────────────────────────────
hdr "PHASE 1 — GPU layer sweep (prompt=128 kv=f16 fa=0)"
# Only ngl varies in this phase. KV stays f16 because the prebuilt official
# image lacks SM75 CUDA kernels for quantized KV; KV types are swept in
# Phase 3.
ph1_fixed=(128 f16 0 "$BATCH_SIZE" "$UBATCH_SIZE") # ctx kv fa batch ubatch
MAX_STABLE_NGL=0
for ngl in "${NGL_VALUES[@]}"; do
  printf " ngl=%-3s " "$ngl" | tee -a "$LOG"
  if ! run_bench "$ngl" "${ph1_fixed[@]}" "ph1_ngl"; then
    # The first failure ends the sweep; the last value that worked is kept.
    log "FAILED (OOM/timeout)"
    fail_row ph1_ngl "$ngl" "${ph1_fixed[@]}"
    break
  fi
  log "OK pp=${LAST_PP} t/s tg=${LAST_TG} t/s"
  MAX_STABLE_NGL="$ngl"
done
log " → Best ngl: ${MAX_STABLE_NGL}"
# ── Phase 2: Context sweep ─────────────────────────────────────────────────
hdr "PHASE 2 — Context/prompt sweep (ngl=${MAX_STABLE_NGL} kv=f16 fa=0)"
# Grow the prompt size at the ngl found in Phase 1 until a run fails.
ph2_tail=(f16 0 "$BATCH_SIZE" "$UBATCH_SIZE") # kv fa batch ubatch
MAX_STABLE_CTX=128
for ctx in "${CTX_VALUES[@]}"; do
  printf " ctx=%-6s " "$ctx" | tee -a "$LOG"
  if ! run_bench "$MAX_STABLE_NGL" "$ctx" "${ph2_tail[@]}" "ph2_ctx"; then
    # The first failure ends the sweep; the last size that worked is kept.
    log "FAILED (OOM/timeout)"
    fail_row ph2_ctx "$MAX_STABLE_NGL" "$ctx" "${ph2_tail[@]}"
    break
  fi
  log "OK pp=${LAST_PP} t/s tg=${LAST_TG} t/s"
  MAX_STABLE_CTX="$ctx"
done
log " → Best ctx: ${MAX_STABLE_CTX}"
# ── Phase 3: KV cache type sweep ───────────────────────────────────────────
# Flash attention is ON for this sweep, as the phase header announces.
# Previously the header said fa=1 while the runs (and the failure rows) used
# fa=0; llama.cpp refuses a quantized V cache without flash attention, so the
# quantized types could not be measured. One variable now feeds the header,
# the runs and the failure rows so they cannot drift apart again.
PH3_FA=1
hdr "PHASE 3 — KV type sweep (ngl=${MAX_STABLE_NGL} ctx=${MAX_STABLE_CTX} fa=${PH3_FA})"
log " [${VARIANT}] KV types: ${KV_TYPES[*]}"
log " Note: Qwen3.5-9B has only 8/32 full-attention layers + GQA (4 KV heads)"
log " Linear-attention layers need no KV cache at all → quant errors minimal"
if [[ "$VARIANT" == "turboquant" ]]; then
  log " turbo2=2-bit (6.4× compression), turbo3=3-bit, turbo4=4-bit"
fi
# BEST_KV keeps its default when no run reports a generation speed above 0.
BEST_KV="q8_0"
BEST_TG_KV=0
for kv in "${KV_TYPES[@]}"; do
  printf " kv=%-8s " "$kv" | tee -a "$LOG"
  if run_bench "$MAX_STABLE_NGL" "$MAX_STABLE_CTX" "$kv" "$PH3_FA" "$BATCH_SIZE" "$UBATCH_SIZE" "ph3_kv"; then
    log "OK pp=${LAST_PP} t/s tg=${LAST_TG} t/s"
    # LAST_TG is already numeric; compare as floats in awk, passing the
    # values with -v rather than splicing them into the program text.
    if awk -v cur="$LAST_TG" -v best="$BEST_TG_KV" 'BEGIN { exit !(cur + 0 > best + 0) }'; then
      BEST_TG_KV="$LAST_TG"
      BEST_KV="$kv"
    fi
  else
    # A failing type does not end the sweep: the remaining types may work.
    log "FAILED"
    fail_row ph3_kv "$MAX_STABLE_NGL" "$MAX_STABLE_CTX" "$kv" "$PH3_FA" "$BATCH_SIZE" "$UBATCH_SIZE"
  fi
done
log " → Best KV: ${BEST_KV} (tg=${BEST_TG_KV} t/s)"
# ── Phase 4: Flash attention ───────────────────────────────────────────────
hdr "PHASE 4 — Flash attention (ngl=${MAX_STABLE_NGL} ctx=${MAX_STABLE_CTX} kv=${BEST_KV})"
log " GTX 1650 Ti = CC 7.5 (Turing) — FA2 requires SM80+ but FA1 works on CC>=7.5"
# Compare generation speed with flash attention on and off for the KV type
# chosen in Phase 3. BEST_FA stays 1 unless fa=0 is strictly faster.
BEST_FA=1
BEST_TG_FA=0
for fa in 1 0; do
  if (( fa == 1 )); then
    fa_label="on "
  else
    fa_label="off"
  fi
  printf " fa=%-3s " "$fa_label" | tee -a "$LOG"
  if run_bench "$MAX_STABLE_NGL" "$MAX_STABLE_CTX" "$BEST_KV" "$fa" "$BATCH_SIZE" "$UBATCH_SIZE" "ph4_fa"; then
    log "OK pp=${LAST_PP} t/s tg=${LAST_TG} t/s"
    # Float comparison in awk; values are passed with -v, not interpolated.
    if awk -v cur="$LAST_TG" -v best="$BEST_TG_FA" 'BEGIN { exit !(cur + 0 > best + 0) }'; then
      BEST_TG_FA="$LAST_TG"
      BEST_FA="$fa"
    fi
  else
    log "FAILED"
    # Record the failure in the CSV, as the other phases do, so the results
    # file shows that this setting was attempted.
    fail_row ph4_fa "$MAX_STABLE_NGL" "$MAX_STABLE_CTX" "$BEST_KV" "$fa" "$BATCH_SIZE" "$UBATCH_SIZE"
  fi
done
log " → Best FA: ${BEST_FA} (tg=${BEST_TG_FA} t/s)"
# ── Phase 5: Batch sweep ───────────────────────────────────────────────────
hdr "PHASE 5 — Batch sweep (ngl=${MAX_STABLE_NGL} ctx=${MAX_STABLE_CTX} kv=${BEST_KV} fa=${BEST_FA})"
# Use small fixed prompt (64) to isolate batch-buffer allocation overhead from prompt size.
# Larger batch = larger CUDA activation buffers; tests whether they fit in remaining VRAM.
# The winner is the batch size with the highest prompt-processing speed.
BEST_BATCH="$BATCH_SIZE"
BEST_PP_BATCH=0
FIXED_P=64
for batch in "${BATCH_VALUES[@]}"; do
  # ubatch is a quarter of the batch, never below 64.
  ubatch=$(( batch / 4 < 64 ? 64 : batch / 4 ))
  printf " batch=%-5s ubatch=%-4s " "$batch" "$ubatch" | tee -a "$LOG"
  if run_bench "$MAX_STABLE_NGL" "$FIXED_P" "$BEST_KV" "$BEST_FA" "$batch" "$ubatch" "ph5_batch"; then
    log "OK pp=${LAST_PP} t/s tg=${LAST_TG} t/s"
    # Float comparison in awk; values are passed with -v, not interpolated.
    if awk -v cur="$LAST_PP" -v best="$BEST_PP_BATCH" 'BEGIN { exit !(cur + 0 > best + 0) }'; then
      BEST_PP_BATCH="$LAST_PP"
      BEST_BATCH="$batch"
    fi
  else
    log "FAILED"
    # Record the failure in the CSV, as the other phases do, so the results
    # file shows that this batch size was attempted.
    fail_row ph5_batch "$MAX_STABLE_NGL" "$FIXED_P" "$BEST_KV" "$BEST_FA" "$batch" "$ubatch"
  fi
done
# Same quarter-with-floor rule as inside the loop, applied to the winner.
BEST_UBATCH=$(( BEST_BATCH / 4 < 64 ? 64 : BEST_BATCH / 4 ))
log " → Best batch: ${BEST_BATCH} ubatch: ${BEST_UBATCH} (pp=${BEST_PP_BATCH} t/s)"
# ── Phase 6 (TurboQuant only): max context with turbo2 KV ──────────────────
if [[ "$VARIANT" == "turboquant" ]]; then
  hdr "PHASE 6 — TurboQuant: extended context with turbo2 KV (ngl=${MAX_STABLE_NGL} fa=${BEST_FA})"
  log " turbo2 = 2-bit KV (6.4× smaller than f16) → enables much larger ctx in same VRAM"
  TURBO_CTX_VALUES=(512 1024 2048 4096 8192 16384 32768)
  MAX_TURBO_CTX="128"
  TURBO_KV="turbo2"
  # Set to 1 once any turbo2 run succeeds. Without it a sweep that fails at
  # the very first size would still overwrite the recommendation with
  # turbo2 and a context size that was never measured with it.
  TURBO_CTX_OK=0
  for ctx in "${TURBO_CTX_VALUES[@]}"; do
    printf " ctx=%-7s " "$ctx" | tee -a "$LOG"
    if run_bench "$MAX_STABLE_NGL" "$ctx" "$TURBO_KV" "$BEST_FA" "$BEST_BATCH" "$BEST_UBATCH" "ph6_turbo_ctx"; then
      log "OK pp=${LAST_PP} t/s tg=${LAST_TG} t/s"
      MAX_TURBO_CTX="$ctx"
      TURBO_CTX_OK=1
    else
      log "FAILED (OOM/timeout)"
      fail_row ph6_turbo_ctx "$MAX_STABLE_NGL" "$ctx" "$TURBO_KV" "$BEST_FA" "$BEST_BATCH" "$BEST_UBATCH"
      break
    fi
  done
  if (( TURBO_CTX_OK )); then
    log " → Max context with turbo2: ${MAX_TURBO_CTX}"
    # Use the larger turbo ctx for the recommended .env
    MAX_STABLE_CTX="$MAX_TURBO_CTX"
    BEST_KV="$TURBO_KV"
  else
    log " → turbo2 failed at every context size; keeping ctx=${MAX_STABLE_CTX} kv=${BEST_KV}"
  fi
fi
# ── Summary ────────────────────────────────────────────────────────────────
sep
log "BENCHMARK COMPLETE [${VARIANT}] — $(date)"
sep
log ""
# Name the model file that was actually benchmarked. The previous label
# hard-coded "Qwen3.5-9B Q4_K_M", which disagrees with the default Q8_0 model
# and with any model supplied through MODEL or the first argument.
log " Optimal params for GTX 1650 Ti + ${MODEL##*/} [${VARIANT}]:"
log ""
log " ngl : ${MAX_STABLE_NGL}"
log " ctx_size : ${MAX_STABLE_CTX}"
log " kv_type : ${BEST_KV}"
log " flash_attn : ${BEST_FA}"
log " batch_size : ${BEST_BATCH}"
log " ubatch : ${BEST_UBATCH}"
log ""
log " Full CSV: ${RESULTS_CSV}"
log ""
# Write recommended .env. The flash-attention result is included as a comment
# only: which variable the consuming compose file expects for it is not
# visible from this script.
ENV_OUT="${OUTPUT_DIR}/${VARIANT}_recommended.env"
cat > "$ENV_OUT" <<EOF
# Generated by benchmark.sh [${VARIANT}] on $(date)
# Model: ${MODEL##*/}
# flash_attn=${BEST_FA} (benchmarked; not exported as a variable)
LLAMA_N_GPU_LAYERS=${MAX_STABLE_NGL}
LLAMA_CTX_SIZE=${MAX_STABLE_CTX}
LLAMA_CACHE_TYPE_K=${BEST_KV}
LLAMA_CACHE_TYPE_V=${BEST_KV}
LLAMA_BATCH_SIZE=${BEST_BATCH}
LLAMA_UBATCH_SIZE=${BEST_UBATCH}
LLAMA_THREADS=${THREADS}
LLAMA_THREADS_BATCH=${THREADS_BATCH}
LLAMA_PARALLEL=1
EOF
log " Recommended .env → ${ENV_OUT}"
# ── Cross-variant comparison (if both results exist) ──────────────────────

# latest_results_csv VARIANT_PREFIX
#   Print the newest "<prefix>_results_<timestamp>.csv" in $OUTPUT_DIR, or
#   nothing when none exists. Glob results are sorted and the timestamp is
#   zero-padded, so the last match is the most recent one; this replaces
#   parsing the output of ls.
latest_results_csv() {
  local prefix=$1 candidate newest=""
  for candidate in "${OUTPUT_DIR}/${prefix}"_results_*.csv; do
    # An unmatched glob stays literal; skip anything that is not a file.
    if [[ -e "$candidate" ]]; then
      newest="$candidate"
    fi
  done
  printf '%s' "$newest"
}

OFFICIAL_CSV="$(latest_results_csv official)"
TURBO_CSV="$(latest_results_csv turboquant)"
if [[ -n "$OFFICIAL_CSV" && -n "$TURBO_CSV" ]]; then
  COMPARE_OUT="${OUTPUT_DIR}/comparison_$(date +%Y%m%d_%H%M%S).txt"
  {
    echo "======================================================================"
    echo " OFFICIAL vs TURBOQUANT COMPARISON"
    echo "======================================================================"
    echo ""
    echo "Official CSV: $OFFICIAL_CSV"
    echo "TurboQuant CSV: $TURBO_CSV"
    echo ""
    echo "KV type benchmark results (phase ph3_kv):"
    echo ""
    printf "%-12s %-10s %-10s %-12s %-12s\n" "variant" "kv_type" "ctx" "pp (t/s)" "tg (t/s)"
    echo "----------------------------------------------------------------------"
    # Every ph3_kv row, failed ones included, official file first.
    # CSV columns: 1=variant 4=ctx 5=kv_type_k 11=pp 12=tg.
    awk -F',' '
      FNR > 1 && $2 == "ph3_kv" {
        printf "%-12s %-10s %-10s %-12s %-12s\n", $1, $5, $4, $11, $12
      }
    ' "$OFFICIAL_CSV" "$TURBO_CSV"
    echo ""
    echo "Winner by tg (generation speed):"
    # Highest tg among successful ph3_kv rows of both files. When there is
    # no such row with a positive speed, say so instead of printing an empty
    # variant and KV type.
    awk -F',' '
      FNR > 1 && $2 == "ph3_kv" && $13 == "ok" {
        tg = $12 + 0
        if (tg > best_tg) { best_tg = tg; best_variant = $1; best_kv = $5 }
      }
      END {
        if (best_variant == "") {
          print " (no successful ph3_kv runs to compare)"
        } else {
          printf " %s with kv=%s → %.1f t/s\n", best_variant, best_kv, best_tg
        }
      }
    ' "$OFFICIAL_CSV" "$TURBO_CSV"
    echo "======================================================================"
  } | tee "$COMPARE_OUT" | tee -a "$LOG"
  echo ""
  echo "Comparison report: $COMPARE_OUT"
fi
sep
echo ""
echo "=== RECOMMENDED .env [${VARIANT}] ==="
cat "$ENV_OUT"