#!/usr/bin/env bash
# =============================================================================
# llama.cpp Automated Benchmark — Qwen3.5-9B on GTX 1650 Ti (4 GB VRAM)
#
# Runs for BOTH official llama.cpp and TurboQuant fork.
# VARIANT env var selects which KV type set to sweep:
#   VARIANT=official   → f16 q8_0 q5_0 q4_0 iq4_nl
#   VARIANT=turboquant → f16 q8_0 iq4_nl turbo4 turbo3 turbo2
#
# Inputs (environment variable, else positional argument, else default):
#   MODEL / $1        path to the GGUF model      (default /models/Qwen3.5-9B.Q8_0.gguf)
#   OUTPUT_DIR / $2   directory for CSV/log/.env  (default /results)
#   LLAMA_BENCH       llama-bench executable      (default: llama-bench found on PATH)
#   BENCH_TIMEOUT     per-run timeout in seconds  (default 300)
#
# Output: CSV + recommended .env per variant, plus a final comparison table.
#
# Run:
#   docker compose --profile benchmark run --rm benchmark        (official)
#   docker compose --profile benchmark run --rm benchmark-turbo  (turboquant)
# =============================================================================
set -euo pipefail

# Ensure llama-bench is findable in both official (/usr/local/bin) and TurboQuant (/app) images
export PATH="/app:/usr/local/bin:/usr/bin:/bin:${PATH:-}"

MODEL="${MODEL:-${1:-/models/Qwen3.5-9B.Q8_0.gguf}}"
OUTPUT_DIR="${OUTPUT_DIR:-${2:-/results}}"
VARIANT="${VARIANT:-official}"   # official | turboquant
# Resolved through PATH (with /app first) instead of a hard-coded /app path,
# so the PATH setup above is what actually selects the binary.
LLAMA_BENCH="${LLAMA_BENCH:-llama-bench}"
BENCH_TIMEOUT="${BENCH_TIMEOUT:-300}"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
RESULTS_CSV="${OUTPUT_DIR}/${VARIANT}_results_${TIMESTAMP}.csv"
LOG="${OUTPUT_DIR}/${VARIANT}_benchmark_${TIMESTAMP}.log"
# Label used in the summary; derived from MODEL so it cannot drift from the file benchmarked.
MODEL_LABEL="${MODEL##*/}"
MODEL_LABEL="${MODEL_LABEL%.gguf}"

# ── Baseline config ────────────────────────────────────────────────────────
THREADS=6
THREADS_BATCH=12    # not passed to llama-bench (-tb was removed); kept for reference
BATCH_SIZE=2048
UBATCH_SIZE=512
PROMPT_TOKENS=512   # not used by any phase (each phase sets its own -p); kept for reference
GEN_TOKENS=32
REPETITIONS=1

# ── KV type sets per variant ───────────────────────────────────────────────
# turbo2=2-bit (6.4× vs f16), turbo3=3-bit, turbo4=4-bit — TurboQuant only
if [[ "$VARIANT" == "turboquant" ]]; then
  KV_TYPES=(f16 q8_0 iq4_nl turbo4 turbo3 turbo2)
else
  # Official llama.cpp: all standard quant types
  # iq4_nl = i-quant non-linear: best quality at 4-bit (non-uniform scale)
  KV_TYPES=(f16 q8_0 q5_0 q4_0 iq4_nl)
fi

# ── GPU layer sweep (Q8_0 ~297 MB/layer, 3717 MiB VRAM → max ~12 layers) ──
NGL_VALUES=(6 9 12 13 14 99)

# ── Context sweep: use -p to stress KV cache at given size ─────────────────
CTX_VALUES=(128 512 1024 2048 4096 8192)

# ── Batch sweep ────────────────────────────────────────────────────────────
BATCH_VALUES=(512 1024 2048 4096)

mkdir -p "$OUTPUT_DIR"

# ── Logging helpers ────────────────────────────────────────────────────────
# log: print one line to stdout and append it to $LOG.
log() { printf '%s\n' "$*" | tee -a "$LOG"; }
# sep: log a 70-character horizontal rule.
sep() { log "$(printf '─%.0s' {1..70})"; }
# hdr: log a phase title framed by two rules.
hdr() { sep; log " $*"; sep; }
# die: log an error and abort the whole benchmark.
die() { log "ERROR: $*" >&2; exit 1; }

# is_greater A B: succeed when A > B numerically (floats allowed).
# Values are passed with -v so they are never interpreted as awk program text.
is_greater() {
  awk -v a="${1:-0}" -v b="${2:-0}" 'BEGIN { exit !(a + 0 > b + 0) }'
}

# ubatch_for BATCH: micro-batch used for a given batch size (batch/4, floor 64).
ubatch_for() {
  local batch=$1
  printf '%s\n' "$(( batch / 4 < 64 ? 64 : batch / 4 ))"
}

# Fail fast with a clear message instead of recording every run as a failure.
command -v "$LLAMA_BENCH" >/dev/null 2>&1 || die "llama-bench not found: ${LLAMA_BENCH}"
[[ -f "$MODEL" ]] || die "model file not found: ${MODEL}"

log "llama.cpp Benchmark [${VARIANT}] — $(date)"
log "Model: $MODEL"
log "GPU: $(nvidia-smi --query-gpu=name,memory.total --format=csv,noheader 2>/dev/null || echo 'CPU only')"
log "KV set: ${KV_TYPES[*]}"
sep

echo "variant,phase,ngl,ctx,kv_type_k,kv_type_v,flash_attn,batch_size,ubatch_size,threads,pp_tokens_per_sec,tg_tokens_per_sec,status" \
  > "$RESULTS_CSV"

# ── Helper: run llama-bench ────────────────────────────────────────────────
LAST_PP=0    # prompt-processing t/s of the most recent successful run
LAST_TG=0    # token-generation t/s of the most recent successful run
LAST_RC=0    # exit status of the most recent llama-bench invocation
LAST_RAW=""  # combined stdout+stderr of the most recent invocation

#######################################
# Extract avg_ts for the pp or tg row from llama-bench CSV output.
# Columns are located by header name (n_prompt, n_gen, avg_ts); if no header
# row is seen, the fixed positions 34/35/40 are used.
# pp row: n_gen == 0; tg row: n_prompt == 0 and n_gen > 0.
# Quotes are stripped before splitting, so a quoted field that itself contains
# a comma would shift the columns.
# Arguments: $1 - "pp" or "tg"
# Inputs:    raw llama-bench output on stdin
# Outputs:   the first matching positive avg_ts on stdout (nothing if none)
#######################################
parse_avg_ts() {
  awk -F',' -v kind="$1" '
    BEGIN { cp = 34; cg = 35; ct = 40 }
    { gsub(/"/, "") }
    /avg_ts/ {
      p = 0; g = 0; t = 0
      for (i = 1; i <= NF; i++) {
        if ($i == "n_prompt") p = i
        else if ($i == "n_gen") g = i
        else if ($i == "avg_ts") t = i
      }
      if (p && g && t) { cp = p; cg = g; ct = t; next }
    }
    kind == "pp" && $cg == "0" && $ct + 0 > 0 { print $ct + 0; exit }
    kind == "tg" && $cp == "0" && $cg + 0 > 0 && $ct + 0 > 0 { print $ct + 0; exit }
  '
}

#######################################
# Run one llama-bench configuration and append its result row to the CSV.
# Globals:   reads MODEL, GEN_TOKENS, THREADS, REPETITIONS, LLAMA_BENCH,
#            BENCH_TIMEOUT, VARIANT, RESULTS_CSV;
#            writes LAST_PP, LAST_TG, LAST_RC, LAST_RAW
# Arguments: $1 ngl  $2 ctx (passed as -p)  $3 kv type (used for K and V)
#            $4 flash-attn flag  $5 batch  $6 ubatch  $7 phase label (default "test")
# Returns:   0 when llama-bench exits 0; 1 on any failure (no CSV row is
#            written then — callers record the failure).
#######################################
run_bench() {
  local ngl=$1 ctx=$2 kv=$3 fa=$4 batch=$5 ubatch=$6 phase="${7:-test}"
  local status="ok"

  # New llama-bench API (b9014+): -c and -tb removed; -p sets prompt/ctx size
  LAST_RC=0
  LAST_RAW=$(timeout "$BENCH_TIMEOUT" "$LLAMA_BENCH" \
    -m "$MODEL" \
    -ngl "$ngl" \
    -p "$ctx" \
    -n "$GEN_TOKENS" \
    -t "$THREADS" \
    -b "$batch" \
    -ub "$ubatch" \
    -ctk "$kv" \
    -ctv "$kv" \
    -fa "$fa" \
    -r "$REPETITIONS" \
    -o csv 2>&1) || LAST_RC=$?
  if (( LAST_RC != 0 )); then
    return 1
  fi

  LAST_PP=$(parse_avg_ts pp <<<"$LAST_RAW")
  LAST_TG=$(parse_avg_ts tg <<<"$LAST_RAW")
  LAST_PP="${LAST_PP:-0}"
  LAST_TG="${LAST_TG:-0}"
  # Exit status 0 but no throughput rows found: flag it instead of claiming "ok".
  if [[ "$LAST_PP" == "0" && "$LAST_TG" == "0" ]]; then
    status="unparsed"
  fi

  echo "${VARIANT},${phase},${ngl},${ctx},${kv},${kv},${fa},${batch},${ubatch},${THREADS},${LAST_PP},${LAST_TG},${status}" \
    >> "$RESULTS_CSV"
  return 0
}

#######################################
# Append a "failed" row (0 t/s) to the CSV.
# Arguments: $1 phase  $2 ngl  $3 ctx  $4 kv  $5 fa  $6 batch  $7 ubatch
#######################################
fail_row() {
  local phase=$1 ngl=$2 ctx=$3 kv=$4 fa=$5 batch=$6 ubatch=$7
  echo "${VARIANT},${phase},${ngl},${ctx},${kv},${kv},${fa},${batch},${ubatch},${THREADS},0,0,failed" \
    >> "$RESULTS_CSV"
}

#######################################
# Report a failed run: log the message, keep the tail of llama-bench's output
# in the log file for diagnosis, and write the CSV failure row.
# Arguments: $1 message; remaining args are forwarded to fail_row
#######################################
report_failure() {
  local msg=$1
  shift
  log "$msg"
  {
    printf '      llama-bench exit status: %s (124 = timeout)\n' "$LAST_RC"
    printf '%s\n' "$LAST_RAW" | tail -n 20 | sed 's/^/      | /'
  } >> "$LOG"
  fail_row "$@"
}

# ── Phase 1: GPU layer sweep ───────────────────────────────────────────────
hdr "PHASE 1 — GPU layer sweep (prompt=128 kv=f16 fa=0)"
# Use f16 KV: prebuilt official image lacks SM75 CUDA kernels for quantized KV.
# We isolate the NGL variable here; KV type is swept in Phase 3.
# The sweep stops at the first failure; the last successful value is kept.
MAX_STABLE_NGL=0
for ngl in "${NGL_VALUES[@]}"; do
  printf " ngl=%-3s " "$ngl" | tee -a "$LOG"
  if run_bench "$ngl" 128 f16 0 "$BATCH_SIZE" "$UBATCH_SIZE" "ph1_ngl"; then
    log "OK pp=${LAST_PP} t/s tg=${LAST_TG} t/s"
    MAX_STABLE_NGL="$ngl"
  else
    report_failure "FAILED (OOM/timeout)" ph1_ngl "$ngl" 128 f16 0 "$BATCH_SIZE" "$UBATCH_SIZE"
    break
  fi
done
log " → Best ngl: ${MAX_STABLE_NGL}"

# ── Phase 2: Context sweep ─────────────────────────────────────────────────
hdr "PHASE 2 — Context/prompt sweep (ngl=${MAX_STABLE_NGL} kv=f16 fa=0)"
MAX_STABLE_CTX=128
for ctx in "${CTX_VALUES[@]}"; do
  printf " ctx=%-6s " "$ctx" | tee -a "$LOG"
  if run_bench "$MAX_STABLE_NGL" "$ctx" f16 0 "$BATCH_SIZE" "$UBATCH_SIZE" "ph2_ctx"; then
    log "OK pp=${LAST_PP} t/s tg=${LAST_TG} t/s"
    MAX_STABLE_CTX="$ctx"
  else
    report_failure "FAILED (OOM/timeout)" ph2_ctx "$MAX_STABLE_NGL" "$ctx" f16 0 "$BATCH_SIZE" "$UBATCH_SIZE"
    break
  fi
done
log " → Best ctx: ${MAX_STABLE_CTX}"

# ── Phase 3: KV cache type sweep ───────────────────────────────────────────
# One variable drives the header, the run and the failure row so they cannot
# disagree (the header previously announced fa=1 while the runs used fa=0).
# NOTE(review): llama.cpp is understood to reject a quantized V cache without
# flash attention, which is why fa=1 is the value kept — confirm on target image.
PH3_FA=1
hdr "PHASE 3 — KV type sweep (ngl=${MAX_STABLE_NGL} ctx=${MAX_STABLE_CTX} fa=${PH3_FA})"
log " [${VARIANT}] KV types: ${KV_TYPES[*]}"
log " Note: Qwen3.5-9B has only 8/32 full-attention layers + GQA (4 KV heads)"
log " Linear-attention layers need no KV cache at all → quant errors minimal"
if [[ "$VARIANT" == "turboquant" ]]; then
  log " turbo2=2-bit (6.4× compression), turbo3=3-bit, turbo4=4-bit"
fi
# Fallback is f16 — the only type Phases 1–2 have actually exercised — so a
# fully failed sweep never recommends a type that was not shown to run.
BEST_KV="f16"
BEST_TG_KV=0
for kv in "${KV_TYPES[@]}"; do
  printf " kv=%-8s " "$kv" | tee -a "$LOG"
  if run_bench "$MAX_STABLE_NGL" "$MAX_STABLE_CTX" "$kv" "$PH3_FA" "$BATCH_SIZE" "$UBATCH_SIZE" "ph3_kv"; then
    log "OK pp=${LAST_PP} t/s tg=${LAST_TG} t/s"
    if is_greater "$LAST_TG" "$BEST_TG_KV"; then
      BEST_TG_KV="$LAST_TG"
      BEST_KV="$kv"
    fi
  else
    report_failure "FAILED" ph3_kv "$MAX_STABLE_NGL" "$MAX_STABLE_CTX" "$kv" "$PH3_FA" "$BATCH_SIZE" "$UBATCH_SIZE"
  fi
done
log " → Best KV: ${BEST_KV} (tg=${BEST_TG_KV} t/s)"

# ── Phase 4: Flash attention ───────────────────────────────────────────────
hdr "PHASE 4 — Flash attention (ngl=${MAX_STABLE_NGL} ctx=${MAX_STABLE_CTX} kv=${BEST_KV})"
log " GTX 1650 Ti = CC 7.5 (Turing) — FA2 requires SM80+ but FA1 works on CC>=7.5"
BEST_FA=1
BEST_TG_FA=0
for fa in 1 0; do
  if [[ "$fa" -eq 1 ]]; then
    fa_label="on "
  else
    fa_label="off"
  fi
  printf " fa=%-3s " "$fa_label" | tee -a "$LOG"
  if run_bench "$MAX_STABLE_NGL" "$MAX_STABLE_CTX" "$BEST_KV" "$fa" "$BATCH_SIZE" "$UBATCH_SIZE" "ph4_fa"; then
    log "OK pp=${LAST_PP} t/s tg=${LAST_TG} t/s"
    if is_greater "$LAST_TG" "$BEST_TG_FA"; then
      BEST_TG_FA="$LAST_TG"
      BEST_FA="$fa"
    fi
  else
    report_failure "FAILED" ph4_fa "$MAX_STABLE_NGL" "$MAX_STABLE_CTX" "$BEST_KV" "$fa" "$BATCH_SIZE" "$UBATCH_SIZE"
  fi
done
log " → Best FA: ${BEST_FA} (tg=${BEST_TG_FA} t/s)"

# ── Phase 5: Batch sweep ───────────────────────────────────────────────────
hdr "PHASE 5 — Batch sweep (ngl=${MAX_STABLE_NGL} ctx=${MAX_STABLE_CTX} kv=${BEST_KV} fa=${BEST_FA})"
# Use small fixed prompt (64) to isolate batch-buffer allocation overhead from prompt size.
# Larger batch = larger CUDA activation buffers; tests whether they fit in remaining VRAM.
BEST_BATCH="$BATCH_SIZE"
BEST_PP_BATCH=0
FIXED_P=64
for batch in "${BATCH_VALUES[@]}"; do
  ubatch=$(ubatch_for "$batch")
  printf " batch=%-5s ubatch=%-4s " "$batch" "$ubatch" | tee -a "$LOG"
  if run_bench "$MAX_STABLE_NGL" "$FIXED_P" "$BEST_KV" "$BEST_FA" "$batch" "$ubatch" "ph5_batch"; then
    log "OK pp=${LAST_PP} t/s tg=${LAST_TG} t/s"
    if is_greater "$LAST_PP" "$BEST_PP_BATCH"; then
      BEST_PP_BATCH="$LAST_PP"
      BEST_BATCH="$batch"
    fi
  else
    report_failure "FAILED" ph5_batch "$MAX_STABLE_NGL" "$FIXED_P" "$BEST_KV" "$BEST_FA" "$batch" "$ubatch"
  fi
done
BEST_UBATCH=$(ubatch_for "$BEST_BATCH")
log " → Best batch: ${BEST_BATCH} ubatch: ${BEST_UBATCH} (pp=${BEST_PP_BATCH} t/s)"

# ── Phase 6 (TurboQuant only): max context with turbo2 KV ──────────────────
if [[ "$VARIANT" == "turboquant" ]]; then
  hdr "PHASE 6 — TurboQuant: extended context with turbo2 KV (ngl=${MAX_STABLE_NGL} fa=${BEST_FA})"
  log " turbo2 = 2-bit KV (6.4× smaller than f16) → enables much larger ctx in same VRAM"
  TURBO_CTX_VALUES=(512 1024 2048 4096 8192 16384 32768)
  MAX_TURBO_CTX=0   # 0 = no turbo2 run has succeeded yet
  TURBO_KV="turbo2"
  for ctx in "${TURBO_CTX_VALUES[@]}"; do
    printf " ctx=%-7s " "$ctx" | tee -a "$LOG"
    if run_bench "$MAX_STABLE_NGL" "$ctx" "$TURBO_KV" "$BEST_FA" "$BEST_BATCH" "$BEST_UBATCH" "ph6_turbo_ctx"; then
      log "OK pp=${LAST_PP} t/s tg=${LAST_TG} t/s"
      MAX_TURBO_CTX="$ctx"
    else
      report_failure "FAILED (OOM/timeout)" ph6_turbo_ctx "$MAX_STABLE_NGL" "$ctx" "$TURBO_KV" "$BEST_FA" "$BEST_BATCH" "$BEST_UBATCH"
      break
    fi
  done
  if (( MAX_TURBO_CTX > 0 )); then
    log " → Max context with turbo2: ${MAX_TURBO_CTX}"
  else
    log " → Max context with turbo2: none (no turbo2 run succeeded)"
  fi
  # Use the larger turbo ctx for the recommended .env — but only when turbo2
  # really delivered a larger context; otherwise keep the Phase 2/3 results
  # rather than recommending a configuration that never ran.
  if (( MAX_TURBO_CTX > MAX_STABLE_CTX )); then
    MAX_STABLE_CTX="$MAX_TURBO_CTX"
    BEST_KV="$TURBO_KV"
  fi
fi

# ── Summary ────────────────────────────────────────────────────────────────
sep
log "BENCHMARK COMPLETE [${VARIANT}] — $(date)"
sep
log ""
log " Optimal params for GTX 1650 Ti + ${MODEL_LABEL} [${VARIANT}]:"
log ""
log " ngl : ${MAX_STABLE_NGL}"
log " ctx_size : ${MAX_STABLE_CTX}"
log " kv_type : ${BEST_KV}"
log " flash_attn : ${BEST_FA}"
log " batch_size : ${BEST_BATCH}"
log " ubatch : ${BEST_UBATCH}"
log ""
log " Full CSV: ${RESULTS_CSV}"
log ""

# Write recommended .env
# NOTE(review): the original heredoc body was not recoverable from the source
# under review. The keys below are llama-server's LLAMA_ARG_* environment
# variables; confirm them against whatever consumes this file (e.g. the
# docker-compose service definitions) before relying on it.
ENV_OUT="${OUTPUT_DIR}/${VARIANT}_recommended.env"
cat > "$ENV_OUT" <<EOF
# Recommended llama.cpp settings [${VARIANT}] — generated $(date)
# Model: ${MODEL}
LLAMA_ARG_N_GPU_LAYERS=${MAX_STABLE_NGL}
LLAMA_ARG_CTX_SIZE=${MAX_STABLE_CTX}
LLAMA_ARG_CACHE_TYPE_K=${BEST_KV}
LLAMA_ARG_CACHE_TYPE_V=${BEST_KV}
LLAMA_ARG_FLASH_ATTN=${BEST_FA}
LLAMA_ARG_BATCH=${BEST_BATCH}
LLAMA_ARG_UBATCH=${BEST_UBATCH}
LLAMA_ARG_THREADS=${THREADS}
EOF

# ── Comparison (only when results for both variants exist) ─────────────────
#######################################
# Print the newest results CSV for a variant. File names embed a
# YYYYmmdd_HHMMSS timestamp, so the last glob match (globs expand sorted)
# is the most recent.
# Arguments: $1 - variant name (official | turboquant)
# Outputs:   path on stdout, or nothing when no CSV exists
#######################################
latest_csv() {
  local -a matches=()
  local f
  for f in "${OUTPUT_DIR}/${1}"_results_*.csv; do
    # An unmatched glob stays literal; skip it.
    if [[ -e "$f" ]]; then
      matches+=("$f")
    fi
  done
  if (( ${#matches[@]} > 0 )); then
    printf '%s\n' "${matches[${#matches[@]} - 1]}"
  fi
}

OFFICIAL_CSV=$(latest_csv official)
TURBO_CSV=$(latest_csv turboquant)

if [[ -n "$OFFICIAL_CSV" && -n "$TURBO_CSV" ]]; then
  COMPARE_OUT="${OUTPUT_DIR}/comparison_$(date +%Y%m%d_%H%M%S).txt"
  {
    echo "======================================================================"
    echo " OFFICIAL vs TURBOQUANT COMPARISON"
    echo "======================================================================"
    echo ""
    echo "Official CSV: $OFFICIAL_CSV"
    echo "TurboQuant CSV: $TURBO_CSV"
    echo ""
    echo "KV type benchmark results (phase ph3_kv):"
    echo ""
    printf "%-12s %-10s %-10s %-12s %-12s\n" "variant" "kv_type" "ctx" "pp (t/s)" "tg (t/s)"
    echo "----------------------------------------------------------------------"
    for csv in "$OFFICIAL_CSV" "$TURBO_CSV"; do
      awk -F',' '
        NR>1 && $2 == "ph3_kv" {
          printf "%-12s %-10s %-10s %-12s %-12s\n", $1, $5, $4, $11, $12
        }
      ' "$csv"
    done
    echo ""
    echo "Winner by tg (generation speed):"
    # FNR (not NR) so the header of the second file is skipped as well.
    awk -F',' '
      FNR > 1 && $2 == "ph3_kv" && $13 == "ok" && $12 + 0 > best_tg {
        best_tg = $12 + 0
        best_variant = $1
        best_kv = $5
      }
      END {
        if (best_variant == "") {
          print " (no successful ph3_kv runs to compare)"
        } else {
          printf " %s with kv=%s → %.1f t/s\n", best_variant, best_kv, best_tg
        }
      }
    ' "$OFFICIAL_CSV" "$TURBO_CSV"
    echo "======================================================================"
  } | tee "$COMPARE_OUT" | tee -a "$LOG"
  echo ""
  echo "Comparison report: $COMPARE_OUT"
fi

sep
echo ""
echo "=== RECOMMENDED .env [${VARIANT}] ==="
cat "$ENV_OUT"