Initial commit: tuned multi-model llama.cpp stack

- 5 models: SmolLM3-3B, Gemma4-E2B/E4B, Qwen3-4B, Qwen3.5-9B
- TurboQuant image (FORCE_MMQ): +6-11% free speed on Turing GPUs
- Bigctx profiles (-nkvo KV in RAM): 2-16x context gain
- turbo2 KV: 2x smaller, benchmarked against PPL quality gate
- Per-model env files with justified parameters
- kv_quant_test.sh + cpu_ctx_test.sh benchmark scripts
- docs/FINDINGS.md: surprises, pitfalls, recommendations
- docs/ARCHITECTURE.md: compose + test script design
2026-05-06 15:56:40 +02:00
commit 4ad296608b
22 changed files with 2530 additions and 0 deletions
--- a/scripts/benchmark.sh
+++ b/scripts/benchmark.sh
@@ -0,0 +1,335 @@
+#!/usr/bin/env bash
+# =============================================================================
+# llama.cpp Automated Benchmark — Qwen3.5-9B on GTX 1650 Ti (4 GB VRAM)
+#
+# Runs for BOTH official llama.cpp and TurboQuant fork.
+# VARIANT env var selects which KV type set to sweep:
+#   VARIANT=official    → f16 q8_0 q5_0 q4_0 iq4_nl
+#   VARIANT=turboquant  → f16 q8_0 iq4_nl turbo4 turbo3 turbo2
+#
+# Output: CSV + recommended .env per variant, plus a final comparison table.
+#
+# Run:
+#   docker compose --profile benchmark run --rm benchmark          (official)
+#   docker compose --profile benchmark run --rm benchmark-turbo    (turboquant)
+# =============================================================================
+
+# Strict mode: stop on command failure, on use of an unset variable, and on a
+# failure in any stage of a pipeline.
+set -o errexit -o nounset -o pipefail
+
+# Put /app and /usr/local/bin ahead of the inherited PATH so llama-bench can be
+# found in both the official (/usr/local/bin) and TurboQuant (/app) images.
+PATH="/app:/usr/local/bin:/usr/bin:/bin:${PATH:-}"
+export PATH
+
+# Inputs: an environment variable wins, then the positional argument, then the
+# built-in default.
+: "${MODEL:=${1:-/models/Qwen3.5-9B.Q8_0.gguf}}"
+: "${OUTPUT_DIR:=${2:-/results}}"
+: "${VARIANT:=official}"   # official | turboquant
+
+# One timestamp is shared by the CSV and the log so the two files pair up.
+TIMESTAMP="$(date +%Y%m%d_%H%M%S)"
+LOG="${OUTPUT_DIR}/${VARIANT}_benchmark_${TIMESTAMP}.log"
+RESULTS_CSV="${OUTPUT_DIR}/${VARIANT}_results_${TIMESTAMP}.csv"
+
+# ── Baseline config ────────────────────────────────────────────────────────
+THREADS=6            # passed to llama-bench as -t and written to the .env
+THREADS_BATCH=12     # only written to the recommended .env
+BATCH_SIZE=2048      # default -b for phases that do not sweep the batch size
+UBATCH_SIZE=512      # default -ub for phases that do not sweep the batch size
+PROMPT_TOKENS=512    # not referenced anywhere else in this script
+GEN_TOKENS=32        # passed to llama-bench as -n
+REPETITIONS=1        # passed to llama-bench as -r
+
+# ── KV type sets per variant ───────────────────────────────────────────────
+# turbo2=2-bit (6.4× vs f16), turbo3=3-bit, turbo4=4-bit — TurboQuant only
+case "$VARIANT" in
+  turboquant)
+    KV_TYPES=(f16 q8_0 iq4_nl turbo4 turbo3 turbo2)
+    ;;
+  *)
+    # Any other value is treated as official llama.cpp: all standard quant types.
+    # iq4_nl = i-quant non-linear: best quality at 4-bit (non-uniform scale)
+    KV_TYPES=(f16 q8_0 q5_0 q4_0 iq4_nl)
+    ;;
+esac
+
+# ── GPU layer sweep (Q8_0 ~297 MB/layer, 3717 MiB VRAM → max ~12 layers) ──
+NGL_VALUES=(6 9 12 13 14 99)
+
+# ── Context sweep: use -p to stress KV cache at given size ─────────────────
+CTX_VALUES=(128 512 1024 2048 4096 8192)
+
+# ── Batch sweep ────────────────────────────────────────────────────────────
+BATCH_VALUES=(512 1024 2048 4096)
+
+# The CSV, log and .env files are all created inside this directory.
+mkdir -p "$OUTPUT_DIR"
+
+log()  { echo "$*" | tee -a "$LOG"; }
+sep()  { log "$(printf '─%.0s' {1..70})"; }
+hdr()  { sep; log "  $*"; sep; }
+
+log "llama.cpp Benchmark [${VARIANT}] — $(date)"
+log "Model:   $MODEL"
+log "GPU:     $(nvidia-smi --query-gpu=name,memory.total --format=csv,noheader 2>/dev/null || echo 'CPU only')"
+log "KV set:  ${KV_TYPES[*]}"
+sep
+
+echo "variant,phase,ngl,ctx,kv_type_k,kv_type_v,flash_attn,batch_size,ubatch_size,threads,pp_tokens_per_sec,tg_tokens_per_sec,status" \
+  > "$RESULTS_CSV"
+
+# ── Helper: run llama-bench ────────────────────────────────────────────────
+LAST_PP=0
+LAST_TG=0
+
+run_bench() {
+  local ngl=$1 ctx=$2 kv=$3 fa=$4 batch=$5 ubatch=$6 phase="${7:-test}"
+  local raw
+
+  # New llama-bench API (b9014+): -c and -tb removed; -p sets prompt/ctx size
+  # CSV columns: ...n_prompt(34),n_gen(35),n_depth(36),test_time(37),
+  #              avg_ns(38),stddev_ns(39),avg_ts(40),stddev_ts(41)
+  # pp row: n_gen==0; tg row: n_prompt==0
+  raw=$(timeout 300 /app/llama-bench \
+    -m "$MODEL" \
+    -ngl "$ngl" \
+    -p "$ctx" \
+    -n "$GEN_TOKENS" \
+    -t "$THREADS" \
+    -b "$batch" \
+    -ub "$ubatch" \
+    -ctk "$kv" \
+    -ctv "$kv" \
+    -fa "$fa" \
+    -r "$REPETITIONS" \
+    -o csv 2>&1) || return 1
+
+  # Strip quotes from CSV, then extract avg_ts (col 40) by pp/tg row type
+  LAST_PP=$(printf '%s\n' "$raw" | sed 's/"//g' | awk -F',' 'NR>1 && $35=="0"    && $40+0>0 {print $40+0; exit}')
+  LAST_TG=$(printf '%s\n' "$raw" | sed 's/"//g' | awk -F',' 'NR>1 && $34=="0" && $35+0>0 && $40+0>0 {print $40+0; exit}')
+  LAST_PP="${LAST_PP:-0}"
+  LAST_TG="${LAST_TG:-0}"
+
+  echo "${VARIANT},${phase},${ngl},${ctx},${kv},${kv},${fa},${batch},${ubatch},${THREADS},${LAST_PP},${LAST_TG},ok" \
+    >> "$RESULTS_CSV"
+  return 0
+}
+
+fail_row() {
+  local phase=$1 ngl=$2 ctx=$3 kv=$4 fa=$5 batch=$6 ubatch=$7
+  echo "${VARIANT},${phase},${ngl},${ctx},${kv},${kv},${fa},${batch},${ubatch},${THREADS},0,0,failed" \
+    >> "$RESULTS_CSV"
+}
+
+# ── Phase 1: GPU layer sweep ───────────────────────────────────────────────
+hdr "PHASE 1 — GPU layer sweep (prompt=128  kv=f16  fa=0)"
+# Use f16 KV: prebuilt official image lacks SM75 CUDA kernels for quantized KV.
+# We isolate the NGL variable here; KV type is swept in Phase 3.
+MAX_STABLE_NGL=0
+for ngl in "${NGL_VALUES[@]}"; do
+  printf "  ngl=%-3s  " "$ngl" | tee -a "$LOG"
+  if run_bench "$ngl" 128 f16 0 "$BATCH_SIZE" "$UBATCH_SIZE" "ph1_ngl"; then
+    log "OK  pp=${LAST_PP} t/s  tg=${LAST_TG} t/s"
+    MAX_STABLE_NGL="$ngl"
+  else
+    log "FAILED (OOM/timeout)"
+    fail_row ph1_ngl "$ngl" 128 f16 0 "$BATCH_SIZE" "$UBATCH_SIZE"
+    break
+  fi
+done
+log "  → Best ngl: ${MAX_STABLE_NGL}"
+
+# ── Phase 2: Context sweep ─────────────────────────────────────────────────
+hdr "PHASE 2 — Context/prompt sweep (ngl=${MAX_STABLE_NGL}  kv=f16  fa=0)"
+
+# Prompt sizes are tried in the order listed in CTX_VALUES with the layer
+# count chosen in Phase 1; the sweep stops at the first failure. 128 is the
+# starting value and stays in place if the very first run fails.
+MAX_STABLE_CTX=128
+for ctx in "${CTX_VALUES[@]}"; do
+  printf '  ctx=%-6s  ' "$ctx" | tee -a "$LOG"
+  if ! run_bench "$MAX_STABLE_NGL" "$ctx" f16 0 "$BATCH_SIZE" "$UBATCH_SIZE" "ph2_ctx"; then
+    log "FAILED (OOM/timeout)"
+    fail_row ph2_ctx "$MAX_STABLE_NGL" "$ctx" f16 0 "$BATCH_SIZE" "$UBATCH_SIZE"
+    break
+  fi
+  log "OK  pp=${LAST_PP} t/s  tg=${LAST_TG} t/s"
+  MAX_STABLE_CTX="$ctx"
+done
+log "  → Best ctx: ${MAX_STABLE_CTX}"
+
+# ── Phase 3: KV cache type sweep ───────────────────────────────────────────
+hdr "PHASE 3 — KV type sweep (ngl=${MAX_STABLE_NGL}  ctx=${MAX_STABLE_CTX}  fa=1)"
+log "  [${VARIANT}] KV types: ${KV_TYPES[*]}"
+log "  Note: Qwen3.5-9B has only 8/32 full-attention layers + GQA (4 KV heads)"
+log "        Linear-attention layers need no KV cache at all → quant errors minimal"
+if [[ "$VARIANT" == "turboquant" ]]; then
+  log "  turbo2=2-bit (6.4× compression), turbo3=3-bit, turbo4=4-bit"
+fi
+
+BEST_KV="q8_0"
+BEST_TG_KV=0
+
+for kv in "${KV_TYPES[@]}"; do
+  printf "  kv=%-8s  " "$kv" | tee -a "$LOG"
+  if run_bench "$MAX_STABLE_NGL" "$MAX_STABLE_CTX" "$kv" 0 "$BATCH_SIZE" "$UBATCH_SIZE" "ph3_kv"; then
+    log "OK  pp=${LAST_PP} t/s  tg=${LAST_TG} t/s"
+    tg_n=$(printf '%s' "$LAST_TG" | grep -oP '[0-9]+\.?[0-9]*' | head -1)
+    if awk "BEGIN{exit !(${tg_n:-0} > ${BEST_TG_KV:-0})}"; then
+      BEST_TG_KV="${tg_n:-0}"
+      BEST_KV="$kv"
+    fi
+  else
+    log "FAILED"
+    fail_row ph3_kv "$MAX_STABLE_NGL" "$MAX_STABLE_CTX" "$kv" 0 "$BATCH_SIZE" "$UBATCH_SIZE"
+  fi
+done
+log "  → Best KV: ${BEST_KV}  (tg=${BEST_TG_KV} t/s)"
+
+# ── Phase 4: Flash attention ───────────────────────────────────────────────
+hdr "PHASE 4 — Flash attention (ngl=${MAX_STABLE_NGL}  ctx=${MAX_STABLE_CTX}  kv=${BEST_KV})"
+log "  GTX 1650 Ti = CC 7.5 (Turing) — FA2 requires SM80+ but FA1 works on CC>=7.5"
+
+BEST_FA=1
+BEST_TG_FA=0
+
+for fa in 1 0; do
+  fa_label=$([ "$fa" -eq 1 ] && echo "on " || echo "off")
+  printf "  fa=%-3s  " "$fa_label" | tee -a "$LOG"
+  if run_bench "$MAX_STABLE_NGL" "$MAX_STABLE_CTX" "$BEST_KV" "$fa" "$BATCH_SIZE" "$UBATCH_SIZE" "ph4_fa"; then
+    log "OK  pp=${LAST_PP} t/s  tg=${LAST_TG} t/s"
+    tg_n=$(printf '%s' "$LAST_TG" | grep -oP '[0-9]+\.?[0-9]*' | head -1)
+    if awk "BEGIN{exit !(${tg_n:-0} > ${BEST_TG_FA:-0})}"; then
+      BEST_TG_FA="${tg_n:-0}"
+      BEST_FA="$fa"
+    fi
+  else
+    log "FAILED"
+  fi
+done
+log "  → Best FA: ${BEST_FA}  (tg=${BEST_TG_FA} t/s)"
+
+# ── Phase 5: Batch sweep ───────────────────────────────────────────────────
+hdr "PHASE 5 — Batch sweep (ngl=${MAX_STABLE_NGL}  ctx=${MAX_STABLE_CTX}  kv=${BEST_KV}  fa=${BEST_FA})"
+# Use small fixed prompt (64) to isolate batch-buffer allocation overhead from prompt size.
+# Larger batch = larger CUDA activation buffers; tests whether they fit in remaining VRAM.
+# The winner is the batch size with the highest prompt-processing throughput;
+# BATCH_SIZE is kept when no run yields a positive value.
+BEST_BATCH="$BATCH_SIZE"
+BEST_PP_BATCH=0
+FIXED_P=64
+
+for batch in "${BATCH_VALUES[@]}"; do
+  # ubatch is a quarter of the batch size, but never below 64.
+  ubatch=$(( batch / 4 < 64 ? 64 : batch / 4 ))
+  printf "  batch=%-5s ubatch=%-4s  " "$batch" "$ubatch" | tee -a "$LOG"
+  if run_bench "$MAX_STABLE_NGL" "$FIXED_P" "$BEST_KV" "$BEST_FA" "$batch" "$ubatch" "ph5_batch"; then
+    log "OK  pp=${LAST_PP} t/s  tg=${LAST_TG} t/s"
+    # LAST_PP is already numeric; compare as floats, passing values with -v.
+    if awk -v cur="$LAST_PP" -v best="$BEST_PP_BATCH" 'BEGIN { exit !(cur + 0 > best + 0) }'; then
+      BEST_PP_BATCH="$LAST_PP"
+      BEST_BATCH="$batch"
+    fi
+  else
+    log "FAILED"
+    # Record the failure in the CSV as the other phases do.
+    fail_row ph5_batch "$MAX_STABLE_NGL" "$FIXED_P" "$BEST_KV" "$BEST_FA" "$batch" "$ubatch"
+  fi
+done
+# Same quarter-of-batch rule as inside the loop, applied to the winner.
+BEST_UBATCH=$(( BEST_BATCH / 4 < 64 ? 64 : BEST_BATCH / 4 ))
+log "  → Best batch: ${BEST_BATCH}  ubatch: ${BEST_UBATCH}  (pp=${BEST_PP_BATCH} t/s)"
+
+# ── Phase 6 (TurboQuant only): max context with turbo2 KV ──────────────────
+if [[ "$VARIANT" == "turboquant" ]]; then
+  hdr "PHASE 6 — TurboQuant: extended context with turbo2 KV (ngl=${MAX_STABLE_NGL}  fa=${BEST_FA})"
+  log "  turbo2 = 2-bit KV (6.4× smaller than f16) → enables much larger ctx in same VRAM"
+
+  TURBO_CTX_VALUES=(512 1024 2048 4096 8192 16384 32768)
+  # 0 means "no turbo2 run has succeeded yet".
+  MAX_TURBO_CTX=0
+  TURBO_KV="turbo2"
+
+  # Sizes are tried in ascending order; the sweep stops at the first failure.
+  for ctx in "${TURBO_CTX_VALUES[@]}"; do
+    printf "  ctx=%-7s  " "$ctx" | tee -a "$LOG"
+    if run_bench "$MAX_STABLE_NGL" "$ctx" "$TURBO_KV" "$BEST_FA" "$BEST_BATCH" "$BEST_UBATCH" "ph6_turbo_ctx"; then
+      log "OK  pp=${LAST_PP} t/s  tg=${LAST_TG} t/s"
+      MAX_TURBO_CTX="$ctx"
+    else
+      log "FAILED (OOM/timeout)"
+      fail_row ph6_turbo_ctx "$MAX_STABLE_NGL" "$ctx" "$TURBO_KV" "$BEST_FA" "$BEST_BATCH" "$BEST_UBATCH"
+      break
+    fi
+  done
+
+  if [[ "$MAX_TURBO_CTX" != 0 ]]; then
+    log "  → Max context with turbo2: ${MAX_TURBO_CTX}"
+    # Use the larger turbo ctx for the recommended .env
+    MAX_STABLE_CTX="$MAX_TURBO_CTX"
+    BEST_KV="$TURBO_KV"
+  else
+    # Every turbo2 run failed: keep the ctx/KV pair measured by the earlier
+    # phases instead of recommending a turbo2 setup that never ran.
+    log "  → turbo2 failed at every tested context; keeping ctx=${MAX_STABLE_CTX}  kv=${BEST_KV}"
+  fi
+fi
+
+# ── Summary ────────────────────────────────────────────────────────────────
+# Final values chosen by the phases above, written to stdout and to the log.
+sep
+log "BENCHMARK COMPLETE [${VARIANT}] — $(date)"
+sep
+log ""
+# Report the file name of the model that was actually benchmarked; a fixed
+# "Q4_K_M" label here did not match the default Q8_0 model or a MODEL override.
+log "  Optimal params for GTX 1650 Ti + ${MODEL##*/} [${VARIANT}]:"
+log ""
+log "    ngl        : ${MAX_STABLE_NGL}"
+log "    ctx_size   : ${MAX_STABLE_CTX}"
+log "    kv_type    : ${BEST_KV}"
+log "    flash_attn : ${BEST_FA}"
+log "    batch_size : ${BEST_BATCH}"
+log "    ubatch     : ${BEST_UBATCH}"
+log ""
+log "  Full CSV: ${RESULTS_CSV}"
+log ""
+
+# Write recommended .env
+ENV_OUT="${OUTPUT_DIR}/${VARIANT}_recommended.env"
+cat > "$ENV_OUT" <<EOF
+# Generated by benchmark.sh [${VARIANT}] on $(date)
+LLAMA_N_GPU_LAYERS=${MAX_STABLE_NGL}
+LLAMA_CTX_SIZE=${MAX_STABLE_CTX}
+LLAMA_CACHE_TYPE_K=${BEST_KV}
+LLAMA_CACHE_TYPE_V=${BEST_KV}
+LLAMA_BATCH_SIZE=${BEST_BATCH}
+LLAMA_UBATCH_SIZE=${BEST_UBATCH}
+LLAMA_THREADS=${THREADS}
+LLAMA_THREADS_BATCH=${THREADS_BATCH}
+LLAMA_PARALLEL=1
+EOF
+log "  Recommended .env → ${ENV_OUT}"
+
+# ── Cross-variant comparison (if both results exist) ──────────────────────
+# latest_results_csv PREFIX
+# Print the last "${OUTPUT_DIR}/PREFIX_results_*.csv" in glob (sorted) order,
+# or nothing when no such file exists. File names embed a YYYYmmdd_HHMMSS
+# timestamp, so the last name belongs to the most recent run. A glob is used
+# instead of parsing ls output.
+latest_results_csv() {
+  local candidate newest=""
+  for candidate in "${OUTPUT_DIR}/$1"_results_*.csv; do
+    # An unmatched glob stays literal; -e filters that case out.
+    if [[ -e "$candidate" ]]; then
+      newest="$candidate"
+    fi
+  done
+  printf '%s' "$newest"
+}
+
+OFFICIAL_CSV=$(latest_results_csv official)
+TURBO_CSV=$(latest_results_csv turboquant)
+
+# The report is written only when a results CSV exists for both variants.
+if [[ -n "$OFFICIAL_CSV" && -n "$TURBO_CSV" ]]; then
+  COMPARE_OUT="${OUTPUT_DIR}/comparison_$(date +%Y%m%d_%H%M%S).txt"
+  {
+    echo "======================================================================"
+    echo " OFFICIAL vs TURBOQUANT COMPARISON"
+    echo "======================================================================"
+    echo ""
+    echo "Official  CSV: $OFFICIAL_CSV"
+    echo "TurboQuant CSV: $TURBO_CSV"
+    echo ""
+    echo "KV type benchmark results (phase ph3_kv):"
+    echo ""
+    printf "%-12s  %-10s  %-10s  %-12s  %-12s\n" "variant" "kv_type" "ctx" "pp (t/s)" "tg (t/s)"
+    echo "----------------------------------------------------------------------"
+    # One table line per ph3_kv row (failed rows included, shown with 0 t/s).
+    # CSV columns used: 1=variant 4=ctx 5=kv_type_k 11=pp 12=tg.
+    for csv in "$OFFICIAL_CSV" "$TURBO_CSV"; do
+      awk -F',' '
+        NR>1 && $2 == "ph3_kv" {
+          printf "%-12s  %-10s  %-10s  %-12s  %-12s\n", $1, $5, $4, $11, $12
+        }
+      ' "$csv"
+    done
+    echo ""
+    echo "Winner by tg (generation speed):"
+    # Highest tg among successful ph3_kv rows of both files, keyed by
+    # "variant,kv_type". Prints a placeholder when no row qualifies, instead
+    # of a winner line with empty fields.
+    awk -F',' '
+      NR>1 && $2 == "ph3_kv" && $13 == "ok" {
+        key = $1 "," $5
+        val = $12+0
+        if (val > best[key]) { best[key] = val }
+      }
+      END {
+        best_tg = 0; best_key = ""
+        for (k in best) { if (best[k] > best_tg) { best_tg = best[k]; best_key = k } }
+        if (best_key == "") {
+          print "  (no successful ph3_kv run with tg > 0 in either file)"
+          exit
+        }
+        split(best_key, a, ",")
+        printf "  %s with kv=%s → %.1f t/s\n", a[1], a[2], best_tg
+      }
+    ' "$OFFICIAL_CSV" "$TURBO_CSV"
+    echo "======================================================================"
+  } | tee "$COMPARE_OUT" | tee -a "$LOG"
+  echo ""
+  echo "Comparison report: $COMPARE_OUT"
+fi
+
+sep
+echo ""
+echo "=== RECOMMENDED .env [${VARIANT}] ==="
+cat "$ENV_OUT"
+