Initial commit: tuned multi-model llama.cpp stack

- 5 models: SmolLM3-3B, Gemma4-E2B/E4B, Qwen3-4B, Qwen3.5-9B
- TurboQuant image (FORCE_MMQ): +6-11% free speed on Turing GPUs
- Bigctx profiles (-nkvo, KV in RAM): 2-16x context gain
- turbo2 KV: 2x smaller, benchmarked against PPL quality gate
- Per-model env files with justified parameters
- kv_quant_test.sh + cpu_ctx_test.sh benchmark scripts
- docs/FINDINGS.md: surprises, pitfalls, recommendations
- docs/ARCHITECTURE.md: compose + test script design
2026-05-06 15:56:40 +02:00
commit 4ad296608b
22 changed files with 2530 additions and 0 deletions
--- a/scripts/benchmark.sh
+++ b/scripts/benchmark.sh
@@ -0,0 +1,335 @@
+#!/usr/bin/env bash
+# =============================================================================
+# llama.cpp Automated Benchmark — Qwen3.5-9B on GTX 1650 Ti (4 GB VRAM)
+#
+# Runs for BOTH official llama.cpp and TurboQuant fork.
+# VARIANT env var selects which KV type set to sweep:
+#   VARIANT=official    → f16 q8_0 q5_0 q4_0 iq4_nl
+#   VARIANT=turboquant  → f16 q8_0 iq4_nl turbo4 turbo3 turbo2
+#
+# Output: CSV + recommended .env per variant, plus a final comparison table.
+#
+# Run:
+#   docker compose --profile benchmark run --rm benchmark          (official)
+#   docker compose --profile benchmark run --rm benchmark-turbo    (turboquant)
+# =============================================================================
+
+set -euo pipefail
+
+# Ensure llama-bench is findable in both official (/usr/local/bin) and TurboQuant (/app) images
+export PATH="/app:/usr/local/bin:/usr/bin:/bin:${PATH:-}"
+
+MODEL="${MODEL:-${1:-/models/Qwen3.5-9B.Q8_0.gguf}}"   # env MODEL, else $1, else this default
+OUTPUT_DIR="${OUTPUT_DIR:-${2:-/results}}"   # env OUTPUT_DIR, else $2, else /results
+VARIANT="${VARIANT:-official}"   # official | turboquant
+TIMESTAMP=$(date +%Y%m%d_%H%M%S)
+RESULTS_CSV="${OUTPUT_DIR}/${VARIANT}_results_${TIMESTAMP}.csv"
+LOG="${OUTPUT_DIR}/${VARIANT}_benchmark_${TIMESTAMP}.log"
+
+# ── Baseline config ────────────────────────────────────────────────────────
+THREADS=6   # passed to llama-bench as -t
+THREADS_BATCH=12   # only written to the recommended .env; llama-bench is not given it
+BATCH_SIZE=2048   # default -b for every phase except the batch sweep
+UBATCH_SIZE=512   # default -ub for every phase except the batch sweep
+PROMPT_TOKENS=512   # NOTE(review): not referenced anywhere else in this script
+GEN_TOKENS=32   # passed to llama-bench as -n
+REPETITIONS=1   # passed to llama-bench as -r
+
+# ── KV type sets per variant ───────────────────────────────────────────────
+# turbo2=2-bit (6.4× vs f16), turbo3=3-bit, turbo4=4-bit — TurboQuant only
+if [[ "$VARIANT" == "turboquant" ]]; then
+  KV_TYPES=(f16 q8_0 iq4_nl turbo4 turbo3 turbo2)
+else
+  # Official llama.cpp: all standard quant types
+  # iq4_nl = i-quant non-linear: best quality at 4-bit (non-uniform scale)
+  KV_TYPES=(f16 q8_0 q5_0 q4_0 iq4_nl)
+fi
+
+# ── GPU layer sweep (Q8_0 ~297 MB/layer, 3717 MiB VRAM → max ~12 layers) ──
+NGL_VALUES=(6 9 12 13 14 99)   # swept in order; phase 1 stops at the first failure
+
+# ── Context sweep: use -p to stress KV cache at given size ─────────────────
+CTX_VALUES=(128 512 1024 2048 4096 8192)   # swept in order; phase 2 stops at the first failure
+
+# ── Batch sweep ────────────────────────────────────────────────────────────
+BATCH_VALUES=(512 1024 2048 4096)   # phase 5 pairs each with ubatch = max(64, batch/4)
+
+mkdir -p "$OUTPUT_DIR"
+
+# Logging helpers: every message goes to stdout and is appended to $LOG.
+# printf replaces echo so that a message starting with -n / -e is printed
+# verbatim instead of being consumed as an echo option.
+log()  { printf '%s\n' "$*" | tee -a "$LOG"; }
+sep()  { log "$(printf '─%.0s' {1..70})"; }   # 70-character horizontal rule
+hdr()  { sep; log "  $*"; sep; }              # title framed by two rules
+
+log "llama.cpp Benchmark [${VARIANT}] — $(date)"
+log "Model:   $MODEL"
+log "GPU:     $(nvidia-smi --query-gpu=name,memory.total --format=csv,noheader 2>/dev/null || echo 'CPU only')"
+log "KV set:  ${KV_TYPES[*]}"
+sep
+
+# Header of the results CSV; run_bench and fail_row append rows in this column order.
+echo "variant,phase,ngl,ctx,kv_type_k,kv_type_v,flash_attn,batch_size,ubatch_size,threads,pp_tokens_per_sec,tg_tokens_per_sec,status" \
+  > "$RESULTS_CSV"
+
+# ── Helper: run llama-bench ────────────────────────────────────────────────
+LAST_PP=0
+LAST_TG=0
+
+run_bench() {
+  local ngl=$1 ctx=$2 kv=$3 fa=$4 batch=$5 ubatch=$6 phase="${7:-test}"
+  local raw
+
+  # New llama-bench API (b9014+): -c and -tb removed; -p sets prompt/ctx size
+  # CSV columns: ...n_prompt(34),n_gen(35),n_depth(36),test_time(37),
+  #              avg_ns(38),stddev_ns(39),avg_ts(40),stddev_ts(41)
+  # pp row: n_gen==0; tg row: n_prompt==0
+  raw=$(timeout 300 /app/llama-bench \
+    -m "$MODEL" \
+    -ngl "$ngl" \
+    -p "$ctx" \
+    -n "$GEN_TOKENS" \
+    -t "$THREADS" \
+    -b "$batch" \
+    -ub "$ubatch" \
+    -ctk "$kv" \
+    -ctv "$kv" \
+    -fa "$fa" \
+    -r "$REPETITIONS" \
+    -o csv 2>&1) || return 1
+
+  # Strip quotes from CSV, then extract avg_ts (col 40) by pp/tg row type
+  LAST_PP=$(printf '%s\n' "$raw" | sed 's/"//g' | awk -F',' 'NR>1 && $35=="0"    && $40+0>0 {print $40+0; exit}')
+  LAST_TG=$(printf '%s\n' "$raw" | sed 's/"//g' | awk -F',' 'NR>1 && $34=="0" && $35+0>0 && $40+0>0 {print $40+0; exit}')
+  LAST_PP="${LAST_PP:-0}"
+  LAST_TG="${LAST_TG:-0}"
+
+  echo "${VARIANT},${phase},${ngl},${ctx},${kv},${kv},${fa},${batch},${ubatch},${THREADS},${LAST_PP},${LAST_TG},ok" \
+    >> "$RESULTS_CSV"
+  return 0
+}
+
+fail_row() {
+  local phase=$1 ngl=$2 ctx=$3 kv=$4 fa=$5 batch=$6 ubatch=$7
+  echo "${VARIANT},${phase},${ngl},${ctx},${kv},${kv},${fa},${batch},${ubatch},${THREADS},0,0,failed" \
+    >> "$RESULTS_CSV"
+}
+
+# ── Phase 1: GPU layer sweep ───────────────────────────────────────────────
+hdr "PHASE 1 — GPU layer sweep (prompt=128  kv=f16  fa=0)"
+# Use f16 KV: prebuilt official image lacks SM75 CUDA kernels for quantized KV.
+# We isolate the NGL variable here; KV type is swept in Phase 3.
+MAX_STABLE_NGL=0
+for ngl in "${NGL_VALUES[@]}"; do
+  printf "  ngl=%-3s  " "$ngl" | tee -a "$LOG"
+  if run_bench "$ngl" 128 f16 0 "$BATCH_SIZE" "$UBATCH_SIZE" "ph1_ngl"; then
+    log "OK  pp=${LAST_PP} t/s  tg=${LAST_TG} t/s"
+    MAX_STABLE_NGL="$ngl"
+  else
+    log "FAILED (OOM/timeout)"
+    fail_row ph1_ngl "$ngl" 128 f16 0 "$BATCH_SIZE" "$UBATCH_SIZE"
+    break
+  fi
+done
+log "  → Best ngl: ${MAX_STABLE_NGL}"
+
+# ── Phase 2: Context sweep ─────────────────────────────────────────────────
+hdr "PHASE 2 — Context/prompt sweep (ngl=${MAX_STABLE_NGL}  kv=f16  fa=0)"
+
+MAX_STABLE_CTX=128
+for ctx in "${CTX_VALUES[@]}"; do
+  printf "  ctx=%-6s  " "$ctx" | tee -a "$LOG"
+  if run_bench "$MAX_STABLE_NGL" "$ctx" f16 0 "$BATCH_SIZE" "$UBATCH_SIZE" "ph2_ctx"; then
+    log "OK  pp=${LAST_PP} t/s  tg=${LAST_TG} t/s"
+    MAX_STABLE_CTX="$ctx"
+  else
+    log "FAILED (OOM/timeout)"
+    fail_row ph2_ctx "$MAX_STABLE_NGL" "$ctx" f16 0 "$BATCH_SIZE" "$UBATCH_SIZE"
+    break
+  fi
+done
+log "  → Best ctx: ${MAX_STABLE_CTX}"
+
+# ── Phase 3: KV cache type sweep ───────────────────────────────────────────
+hdr "PHASE 3 — KV type sweep (ngl=${MAX_STABLE_NGL}  ctx=${MAX_STABLE_CTX}  fa=1)"
+log "  [${VARIANT}] KV types: ${KV_TYPES[*]}"
+log "  Note: Qwen3.5-9B has only 8/32 full-attention layers + GQA (4 KV heads)"
+log "        Linear-attention layers need no KV cache at all → quant errors minimal"
+if [[ "$VARIANT" == "turboquant" ]]; then
+  log "  turbo2=2-bit (6.4× compression), turbo3=3-bit, turbo4=4-bit"
+fi
+
+# The sweep runs with flash attention ON, as the header above announces.
+# The previous code passed fa=0 here; llama.cpp rejects a quantized V cache
+# without flash attention, so every non-f16 type would have been reported as
+# FAILED regardless of the hardware.
+PH3_FA=1
+
+# Fallback when no type produces a positive tg: f16 is the only KV type that
+# phases 1-2 have already exercised.
+BEST_KV="f16"
+BEST_TG_KV=0
+
+# Unlike phases 1-2 this loop does not stop on failure: each type is independent.
+# The winner is the type with the highest generation speed (tg).
+for kv in "${KV_TYPES[@]}"; do
+  printf "  kv=%-8s  " "$kv" | tee -a "$LOG"
+  if run_bench "$MAX_STABLE_NGL" "$MAX_STABLE_CTX" "$kv" "$PH3_FA" "$BATCH_SIZE" "$UBATCH_SIZE" "ph3_kv"; then
+    log "OK  pp=${LAST_PP} t/s  tg=${LAST_TG} t/s"
+    tg_n=$(printf '%s' "$LAST_TG" | grep -oP '[0-9]+\.?[0-9]*' | head -1)
+    if awk "BEGIN{exit !(${tg_n:-0} > ${BEST_TG_KV:-0})}"; then
+      BEST_TG_KV="${tg_n:-0}"
+      BEST_KV="$kv"
+    fi
+  else
+    log "FAILED"
+    fail_row ph3_kv "$MAX_STABLE_NGL" "$MAX_STABLE_CTX" "$kv" "$PH3_FA" "$BATCH_SIZE" "$UBATCH_SIZE"
+  fi
+done
+log "  → Best KV: ${BEST_KV}  (tg=${BEST_TG_KV} t/s)"
+
+# ── Phase 4: Flash attention ───────────────────────────────────────────────
+hdr "PHASE 4 — Flash attention (ngl=${MAX_STABLE_NGL}  ctx=${MAX_STABLE_CTX}  kv=${BEST_KV})"
+log "  GTX 1650 Ti = CC 7.5 (Turing) — FA2 requires SM80+ but FA1 works on CC>=7.5"
+
+# Compare fa=1 against fa=0 with the KV type chosen in phase 3; the setting
+# with the higher generation speed wins. BEST_FA stays 1 if neither run
+# yields a positive tg.
+BEST_FA=1
+BEST_TG_FA=0
+
+for fa in 1 0; do
+  fa_label=$([ "$fa" -eq 1 ] && echo "on " || echo "off")
+  printf "  fa=%-3s  " "$fa_label" | tee -a "$LOG"
+  if run_bench "$MAX_STABLE_NGL" "$MAX_STABLE_CTX" "$BEST_KV" "$fa" "$BATCH_SIZE" "$UBATCH_SIZE" "ph4_fa"; then
+    log "OK  pp=${LAST_PP} t/s  tg=${LAST_TG} t/s"
+    tg_n=$(printf '%s' "$LAST_TG" | grep -oP '[0-9]+\.?[0-9]*' | head -1)
+    if awk "BEGIN{exit !(${tg_n:-0} > ${BEST_TG_FA:-0})}"; then
+      BEST_TG_FA="${tg_n:-0}"
+      BEST_FA="$fa"
+    fi
+  else
+    log "FAILED"
+    # Record the failure in the CSV as the other phases do, so the results
+    # file shows that this configuration was attempted.
+    fail_row ph4_fa "$MAX_STABLE_NGL" "$MAX_STABLE_CTX" "$BEST_KV" "$fa" "$BATCH_SIZE" "$UBATCH_SIZE"
+  fi
+done
+log "  → Best FA: ${BEST_FA}  (tg=${BEST_TG_FA} t/s)"
+
+# ── Phase 5: Batch sweep ───────────────────────────────────────────────────
+hdr "PHASE 5 — Batch sweep (ngl=${MAX_STABLE_NGL}  ctx=${MAX_STABLE_CTX}  kv=${BEST_KV}  fa=${BEST_FA})"
+# Use small fixed prompt (64) to isolate batch-buffer allocation overhead from prompt size.
+# Larger batch = larger CUDA activation buffers; tests whether they fit in remaining VRAM.
+# The winner is the batch size with the highest prompt-processing speed (pp);
+# BEST_BATCH stays at BATCH_SIZE if no run yields a positive pp.
+BEST_BATCH="$BATCH_SIZE"
+BEST_PP_BATCH=0
+FIXED_P=64
+
+for batch in "${BATCH_VALUES[@]}"; do
+  # ubatch is a quarter of the batch, but never below 64.
+  ubatch=$(( batch / 4 < 64 ? 64 : batch / 4 ))
+  printf "  batch=%-5s ubatch=%-4s  " "$batch" "$ubatch" | tee -a "$LOG"
+  if run_bench "$MAX_STABLE_NGL" "$FIXED_P" "$BEST_KV" "$BEST_FA" "$batch" "$ubatch" "ph5_batch"; then
+    log "OK  pp=${LAST_PP} t/s  tg=${LAST_TG} t/s"
+    pp_n=$(printf '%s' "$LAST_PP" | grep -oP '[0-9]+\.?[0-9]*' | head -1)
+    if awk "BEGIN{exit !(${pp_n:-0} > ${BEST_PP_BATCH:-0})}"; then
+      BEST_PP_BATCH="${pp_n:-0}"
+      BEST_BATCH="$batch"
+    fi
+  else
+    log "FAILED"
+    # Record the failure in the CSV as the other phases do.
+    fail_row ph5_batch "$MAX_STABLE_NGL" "$FIXED_P" "$BEST_KV" "$BEST_FA" "$batch" "$ubatch"
+  fi
+done
+BEST_UBATCH=$(( BEST_BATCH / 4 < 64 ? 64 : BEST_BATCH / 4 ))
+log "  → Best batch: ${BEST_BATCH}  ubatch: ${BEST_UBATCH}  (pp=${BEST_PP_BATCH} t/s)"
+
+# ── Phase 6 (TurboQuant only): max context with turbo2 KV ──────────────────
+if [[ "$VARIANT" == "turboquant" ]]; then
+  hdr "PHASE 6 — TurboQuant: extended context with turbo2 KV (ngl=${MAX_STABLE_NGL}  fa=${BEST_FA})"
+  log "  turbo2 = 2-bit KV (6.4× smaller than f16) → enables much larger ctx in same VRAM"
+
+  TURBO_CTX_VALUES=(512 1024 2048 4096 8192 16384 32768)
+  # Empty until a turbo2 run succeeds. The previous default of 128 was never
+  # benchmarked, yet it was written to the recommended .env when every
+  # turbo2 run failed.
+  MAX_TURBO_CTX=""
+  TURBO_KV="turbo2"
+
+  # Grow the context with turbo2 KV; stop at the first size that fails.
+  for ctx in "${TURBO_CTX_VALUES[@]}"; do
+    printf "  ctx=%-7s  " "$ctx" | tee -a "$LOG"
+    if run_bench "$MAX_STABLE_NGL" "$ctx" "$TURBO_KV" "$BEST_FA" "$BEST_BATCH" "$BEST_UBATCH" "ph6_turbo_ctx"; then
+      log "OK  pp=${LAST_PP} t/s  tg=${LAST_TG} t/s"
+      MAX_TURBO_CTX="$ctx"
+    else
+      log "FAILED (OOM/timeout)"
+      fail_row ph6_turbo_ctx "$MAX_STABLE_NGL" "$ctx" "$TURBO_KV" "$BEST_FA" "$BEST_BATCH" "$BEST_UBATCH"
+      break
+    fi
+  done
+
+  # Adopt turbo2 for the recommended .env only when it actually ran and does
+  # not shrink the context already validated in phase 2.
+  if [[ -n "$MAX_TURBO_CTX" ]] && (( MAX_TURBO_CTX >= MAX_STABLE_CTX )); then
+    log "  → Max context with turbo2: ${MAX_TURBO_CTX}"
+    MAX_STABLE_CTX="$MAX_TURBO_CTX"
+    BEST_KV="$TURBO_KV"
+  else
+    log "  → turbo2 gave no larger context (max=${MAX_TURBO_CTX:-none}); keeping kv=${BEST_KV} ctx=${MAX_STABLE_CTX}"
+  fi
+fi
+
+# ── Summary ────────────────────────────────────────────────────────────────
+sep
+log "BENCHMARK COMPLETE [${VARIANT}] — $(date)"
+sep
+log ""
+log "  Optimal params for GTX 1650 Ti + Qwen3.5-9B Q4_K_M [${VARIANT}]:"
+log ""
+log "    ngl        : ${MAX_STABLE_NGL}"
+log "    ctx_size   : ${MAX_STABLE_CTX}"
+log "    kv_type    : ${BEST_KV}"
+log "    flash_attn : ${BEST_FA}"
+log "    batch_size : ${BEST_BATCH}"
+log "    ubatch     : ${BEST_UBATCH}"
+log ""
+log "  Full CSV: ${RESULTS_CSV}"
+log ""
+
+# Write recommended .env
+ENV_OUT="${OUTPUT_DIR}/${VARIANT}_recommended.env"
+cat > "$ENV_OUT" <<EOF
+# Generated by benchmark.sh [${VARIANT}] on $(date)
+LLAMA_N_GPU_LAYERS=${MAX_STABLE_NGL}
+LLAMA_CTX_SIZE=${MAX_STABLE_CTX}
+LLAMA_CACHE_TYPE_K=${BEST_KV}
+LLAMA_CACHE_TYPE_V=${BEST_KV}
+LLAMA_BATCH_SIZE=${BEST_BATCH}
+LLAMA_UBATCH_SIZE=${BEST_UBATCH}
+LLAMA_THREADS=${THREADS}
+LLAMA_THREADS_BATCH=${THREADS_BATCH}
+LLAMA_PARALLEL=1
+EOF
+log "  Recommended .env → ${ENV_OUT}"
+
+# ── Cross-variant comparison (if both results exist) ──────────────────────
+# Print the newest "<prefix>_results_<timestamp>.csv" in OUTPUT_DIR, or nothing
+# when there is none. Pathname expansion is sorted and the timestamp format
+# sorts chronologically, so the last match is the newest; no ls parsing needed.
+latest_results_csv() {
+  local prefix=$1 candidate newest=""
+  for candidate in "${OUTPUT_DIR}/${prefix}"_results_*.csv; do
+    # An unmatched glob stays literal; -f filters that case out.
+    if [[ -f "$candidate" ]]; then
+      newest="$candidate"
+    fi
+  done
+  printf '%s' "$newest"
+}
+
+OFFICIAL_CSV=$(latest_results_csv official)
+TURBO_CSV=$(latest_results_csv turboquant)
+
+if [[ -n "$OFFICIAL_CSV" && -n "$TURBO_CSV" ]]; then
+  COMPARE_OUT="${OUTPUT_DIR}/comparison_$(date +%Y%m%d_%H%M%S).txt"
+  {
+    echo "======================================================================"
+    echo " OFFICIAL vs TURBOQUANT COMPARISON"
+    echo "======================================================================"
+    echo ""
+    echo "Official  CSV: $OFFICIAL_CSV"
+    echo "TurboQuant CSV: $TURBO_CSV"
+    echo ""
+    echo "KV type benchmark results (phase ph3_kv):"
+    echo ""
+    printf "%-12s  %-10s  %-10s  %-12s  %-12s\n" "variant" "kv_type" "ctx" "pp (t/s)" "tg (t/s)"
+    echo "----------------------------------------------------------------------"
+    # One table row per ph3_kv CSV row (failed rows appear with 0 t/s).
+    for csv in "$OFFICIAL_CSV" "$TURBO_CSV"; do
+      awk -F',' '
+        NR>1 && $2 == "ph3_kv" {
+          printf "%-12s  %-10s  %-10s  %-12s  %-12s\n", $1, $5, $4, $11, $12
+        }
+      ' "$csv"
+    done
+    echo ""
+    echo "Winner by tg (generation speed):"
+    # Highest tg among successful ph3_kv rows of both files. FNR skips each
+    # file header; an explicit message replaces the empty "with kv=" line
+    # when neither file has an ok row.
+    awk -F',' '
+      FNR > 1 && $2 == "ph3_kv" && $13 == "ok" {
+        tg = $12 + 0
+        if (tg > best_tg) { best_tg = tg; best_variant = $1; best_kv = $5 }
+      }
+      END {
+        if (best_variant == "") print "  (no successful ph3_kv rows)"
+        else printf "  %s with kv=%s → %.1f t/s\n", best_variant, best_kv, best_tg
+      }
+    ' "$OFFICIAL_CSV" "$TURBO_CSV"
+    echo "======================================================================"
+  } | tee "$COMPARE_OUT" | tee -a "$LOG"
+  echo ""
+  echo "Comparison report: $COMPARE_OUT"
+fi
+
+sep
+echo ""
+echo "=== RECOMMENDED .env [${VARIANT}] ==="
+cat "$ENV_OUT"
+
--- a/scripts/benchmark_models.sh
+++ b/scripts/benchmark_models.sh
@@ -0,0 +1,175 @@
+#!/bin/bash
+# Benchmark all 4 new models on GTX 1650 Ti (3717 MiB VRAM)
+# Priority: max context size > tg speed
+# Runs inside ghcr.io/ggml-org/llama.cpp:full-cuda (build b9014, no -c flag)
+#
+# Architecture context limits (from GGUF metadata):
+#   SmolLM3-3B   : 65536  (full attention, KV-limited to ~28K in practice)
+#   Gemma4-E2B   : 131072 (hybrid: sliding_window=512 → huge ctx possible)
+#   Gemma4-E4B   : 131072 (hybrid: sliding_window=512)
+#   Qwen3-4B     : 40960  (full attention, KV-limited to ~9K in practice)
+#
+# NOTE: llama-bench b9014 has NO -c flag. Context is set by -p (prompt tokens).
+#   -p N -n G allocates KV for N+G tokens. OOM = exit!=0 or error in stdout.
+
+set -uo pipefail
+
+M_SMOL="/models/HuggingFaceTB_SmolLM3-3B-Q4_K_M.gguf"
+M_E2B="/models/google_gemma-4-E2B-it-Q4_K_M.gguf"
+M_E4B="/models/google_gemma-4-E4B-it-Q4_K_M.gguf"
+M_Q3="/models/Qwen3-4B-Q4_K_M.gguf"
+
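+# -- CSV column detection: 1-based indexes of avg_ts / n_gen / n_prompt, 0 = not yet detected.
+#    bench and bench_ctx are always invoked inside $(...), so the indexes they detect
+#    do not persist in the parent shell and detection is repeated on every call. --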
+TS_COL=0; NG_COL=0; NP_COL=0
+
+# Locate the avg_ts, n_gen and n_prompt columns in llama-bench CSV output.
+# $1 is the raw output; the header is the first line that starts with
+# build_commit once double quotes are removed. Sets the globals TS_COL,
+# NG_COL and NP_COL to 1-based indexes, or to 0 for anything not found.
+detect_cols() {
+    local found
+    found=$(printf '%s\n' "$1" | sed 's/"//g' | awk -F',' '
+        /^build_commit/ {
+            for (i = 1; i <= NF; i++) {
+                if ($i == "avg_ts"   && !ts) ts = i
+                if ($i == "n_gen"    && !ng) ng = i
+                if ($i == "n_prompt" && !np) np = i
+            }
+            exit
+        }
+        END { print ts + 0, ng + 0, np + 0 }')
+    read -r TS_COL NG_COL NP_COL <<< "$found"
+    TS_COL=${TS_COL:-0}; NG_COL=${NG_COL:-0}; NP_COL=${NP_COL:-0}
+}
+
+# Returns "pp_speed pp / tg_speed tg t/s"
+parse_speeds() {
+    local out="$1"
+    [ "${TS_COL:-0}" = "0" ] && detect_cols "$out"
+    local s pp tg
+    s=$(printf '%s\n' "$out" | sed 's/"//g')
+    pp=$(printf '%s\n' "$s" | awk -F',' -v tc="$TS_COL" -v np="$NP_COL" -v ng="$NG_COL" \
+        'NR>1 && $np+0>0 && $ng+0==0 {printf "%.0f", $tc+0; exit}')
+    tg=$(printf '%s\n' "$s" | awk -F',' -v tc="$TS_COL" -v np="$NP_COL" -v ng="$NG_COL" \
+        'NR>1 && $ng+0>0 && $np+0==0 {printf "%.1f", $tc+0; exit}')
+    printf "%s pp / %s tg t/s" "${pp:--}" "${tg:--}"
+}
+
+is_oom() {
+    local out="$1" ec="$2"
+    [ "$ec" -ne 0 ] && return 0
+    printf '%s\n' "$out" | grep -qiE "failed to create context|out of memory|GGML_ASSERT|error:" && return 0
+    return 1
+}
+
+# bench MODEL NGL [llama-bench extra args...]
+# Standard speed benchmark: -p 512 -n 128 small context
+bench() {
+    local model=$1 ngl=$2; shift 2
+    local out ec
+    out=$(timeout 250 /app/llama-bench -m "$model" -ngl "$ngl" \
+        -b 512 -ub 128 -o csv "$@" 2>&1)
+    ec=$?
+    if is_oom "$out" "$ec"; then echo "OOM"; return; fi
+    [ "${TS_COL:-0}" = "0" ] && detect_cols "$out"
+    parse_speeds "$out"
+}
+
+# bench_ctx MODEL NGL CTX
+# Context-capacity test: allocates KV for CTX tokens via -p CTX -n 1
+# Tries fa=1 first, falls back to fa=0. Returns "OK (N pp t/s [fa=N])" or "OOM"
+bench_ctx() {
+    local model=$1 ngl=$2 ctx=$3
+    local out ec fa_used
+    for fa in 1 0; do
+        out=$(timeout 250 /app/llama-bench -m "$model" -ngl "$ngl" \
+            -p "$ctx" -n 1 -r 1 --no-warmup \
+            -b 512 -ub 128 -fa "$fa" -t 6 -o csv 2>&1)
+        ec=$?
+        is_oom "$out" "$ec" || { fa_used=$fa; break; }
+        [ "$fa" = "0" ] && { echo "OOM"; return; }
+    done
+    [ "${TS_COL:-0}" = "0" ] && detect_cols "$out"
+    local pp
+    pp=$(printf '%s\n' "$out" | sed 's/"//g' | \
+        awk -F',' -v tc="$TS_COL" -v np="$NP_COL" \
+        'NR>1 && $np+0>0 {printf "%.0f", $tc+0; exit}')
+    printf "OK (%s pp t/s fa=%s)" "${pp:--}" "${fa_used:-?}"
+}
+
+HR="======================================================================"
+printf '%s\n' \
+    "$HR" \
+    "LLAMA.CPP BENCHMARK — ALL MODELS — $(date)" \
+    "GPU: $(nvidia-smi --query-gpu=name,memory.total --format=csv,noheader 2>/dev/null || echo unknown)" \
+    "$HR" \
+    ""
+
+# ── Phase 1: Baseline (small context) ────────────────────────────────────────
+# Every model fully offloaded (ngl=99); one summary line per model.
+echo "=== Phase 1: Baseline (ngl=99, p=512 n=128 r=2, t=6, fa=0) ==="
+baseline_labels=(SmolLM3-3B Gemma4-E2B Gemma4-E4B Qwen3-4B)
+baseline_models=("$M_SMOL" "$M_E2B" "$M_E4B" "$M_Q3")
+for idx in "${!baseline_labels[@]}"; do
+    printf "  %-14s  %s\n" "${baseline_labels[$idx]}" \
+        "$(bench "${baseline_models[$idx]}" 99 -p 512 -n 128 -r 2 -t 6 -fa 0)"
+done
+echo ""
+
+# ── Phase 2: Gemma4-E4B ngl sweep ────────────────────────────────────────────
+# Raise ngl until bench reports OOM; best_e4b_ngl keeps the last value that ran.
+echo "=== Phase 2: Gemma4-E4B ngl sweep (p=16 n=64 r=1 t=6 fa=0) ==="
+echo "    5.1GB model on 3.7GB VRAM — finding highest ngl before OOM"
+best_e4b_ngl=0
+for ngl in 0 4 8 12 16 20 24 28 32 36 42; do
+    ts=$(bench "$M_E4B" "$ngl" -p 16 -n 64 -r 1 -t 6 -fa 0)
+    printf "  ngl=%-3s  %s\n" "$ngl" "$ts"
+    if [[ "$ts" == "OOM" ]]; then
+        break
+    fi
+    best_e4b_ngl=$ngl
+done
+echo "  → best_e4b_ngl=$best_e4b_ngl"
+echo ""
+
+# ── Phase 3: Max context sweep ────────────────────────────────────────────────
+echo "=== Phase 3: Max context (p=ctx n=1 r=1 no-warmup fa=1) ==="
+echo "    Gemma4 hybrid attention (sliding_window=512) enables large ctx cheaply."
+declare -A BEST_CTX
+BEST_CTX[smollm3]=512; BEST_CTX[e2b]=512; BEST_CTX[e4b]=512; BEST_CTX[q3]=512
+
+for entry in "smollm3:SmolLM3-3B:$M_SMOL:99" \
+             "e2b:Gemma4-E2B:$M_E2B:99" \
+             "e4b:Gemma4-E4B:$M_E4B:$best_e4b_ngl" \
+             "q3:Qwen3-4B:$M_Q3:99"; do
+    IFS=':' read -r key lbl mdl ngl <<< "$entry"
+    echo "  -- $lbl (ngl=$ngl) --"
+    for ctx in 512 1024 2048 4096 8192 12288 16384 24576 32768 49152 65536 98304 131072; do
+        ts=$(bench_ctx "$mdl" "$ngl" "$ctx")
+        printf "    ctx=%-7s  %s\n" "$ctx" "$ts"
+        [[ "$ts" == OOM ]] && break
+        BEST_CTX[$key]=$ctx
+    done
+    echo "    → MAX ctx=${BEST_CTX[$key]}"
+done
+echo ""
+
+# ── Phase 4: TG speed at max context ─────────────────────────────────────────
+echo "=== Phase 4: TG speed at max context (p=512 n=128 r=2 fa=1 t=6) ==="
+for entry in "smollm3:SmolLM3-3B:$M_SMOL:99" \
+             "e2b:Gemma4-E2B:$M_E2B:99" \
+             "e4b:Gemma4-E4B:$M_E4B:$best_e4b_ngl" \
+             "q3:Qwen3-4B:$M_Q3:99"; do
+    IFS=':' read -r key lbl mdl ngl <<< "$entry"
+    ts=$(bench "$mdl" "$ngl" -p 512 -n 128 -r 2 -fa 1 -t 6)
+    printf "  %-14s  max_ctx=%-7s  %s\n" "$lbl" "${BEST_CTX[$key]}" "$ts"
+done
+echo ""
+
+# ── Phase 5: E4B thread sweep (CPU split model — threads matter) ──────────────
+echo "=== Phase 5: Gemma4-E4B thread sweep (p=512 n=128 r=2 fa=0 ngl=$best_e4b_ngl) ==="
+for t in 1 2 3 4 5 6 8 10 12; do
+    ts=$(bench "$M_E4B" "$best_e4b_ngl" -p 512 -n 128 -r 2 -fa 0 -t "$t")
+    printf "  t=%-3s  %s\n" "$t" "$ts"
+done
+echo ""
+
+# ── Phase 6: Flash attention comparison ──────────────────────────────────────
+echo "=== Phase 6: Flash attention fa=0 vs fa=1 (p=512 n=128 r=2 t=6) ==="
+echo "    Gemma4 hybrid attention may not support FA — testing both."
+for entry in "smollm3:SmolLM3-3B:$M_SMOL:99" \
+             "e2b:Gemma4-E2B:$M_E2B:99" \
+             "e4b:Gemma4-E4B:$M_E4B:$best_e4b_ngl" \
+             "q3:Qwen3-4B:$M_Q3:99"; do
+    IFS=':' read -r key lbl mdl ngl <<< "$entry"
+    ts0=$(bench "$mdl" "$ngl" -p 512 -n 128 -r 2 -fa 0 -t 6)
+    ts1=$(bench "$mdl" "$ngl" -p 512 -n 128 -r 2 -fa 1 -t 6)
+    printf "  %-14s  fa=0: %-30s  fa=1: %s\n" "$lbl" "$ts0" "$ts1"
+done
+echo ""
+
+echo "$HR"
+echo "BENCHMARK COMPLETE: $(date)"
+echo "$HR"
--- a/scripts/cpu_ctx_test.sh
+++ b/scripts/cpu_ctx_test.sh
@@ -0,0 +1,251 @@
+#!/bin/bash
+# cpu_ctx_test.sh v4 — -nkvo bigctx with TurboQuant image (FORCE_MMQ)
+# Image: local/llama-cpp-turboquant:full-cuda-sm75-mmq
+#
+# Tests KV in RAM (-nkvo) with BOTH q4_0 and turbo2 KV types.
+# turbo2 = 2-bit KV (2x smaller than q4_0) → ~2x more context at same RAM budget.
+#
+# Speed model per token:
+#   GPU-compute models (smollm3/e2b/e4b/q3): bottleneck = PCIe KV reads
+#     t/s = 1000 / (gpu_ms + ctx * kv_bytes_per_token / PCIE_BPS * 1000)
+#   Qwen3.5-9B: bottleneck = RAM reads (21/32 layers on CPU, 8.86 GB model)
+#     t/s = 1000 / (1000/baseline + ctx * kv_bytes_per_token / RAM_BPS * 1000)
+#
+# Usage: bash /scripts/cpu_ctx_test.sh [smollm3|e2b|e4b|q3|qwen35q|all]
+
+set -uo pipefail   # -e is not enabled in this script
+
+TARGET="${1:-all}"   # model key to test, or "all"
+TARGET_TPS=15   # minimum estimated t/s at 50% fill for a ctx to count as recommended
+CPU_THREADS=6   # passed to llama-bench as -t for the baseline measurement
+BENCH_GEN=32   # passed to llama-bench as -n for the baseline measurement
+PCIE_BW_GBPS=8.0   # assumed PCIe read BW, GB/s. NOTE(review): was labelled "PCIe x4 3.0", whose peak is ~3.9 GB/s — confirm link width
+RAM_BW_GBPS=45.0   # RAM practical read BW (i7-10750H DDR4-2933)
+
+M_SMOL="/models/HuggingFaceTB_SmolLM3-3B-Q4_K_M.gguf"
+M_E2B="/models/google_gemma-4-E2B-it-Q4_K_M.gguf"
+M_E4B="/models/google_gemma-4-E4B-it-Q4_K_M.gguf"
+M_Q3="/models/Qwen3-4B-Q4_K_M.gguf"
+M_Q35="/models/Qwen3.5-9B.Q8_0.gguf"
+
+declare -A NGL_GPU=([smollm3]=99 [e2b]=99 [e4b]=42 [q3]=99 [qwen35q]=11)   # -ngl per model key
+# BW source: pcie for GPU-compute models, ram for qwen35-9b (CPU-compute bound)
+declare -A BW_GBPS=([smollm3]=$PCIE_BW_GBPS [e2b]=$PCIE_BW_GBPS [e4b]=$PCIE_BW_GBPS [q3]=$PCIE_BW_GBPS [qwen35q]=$RAM_BW_GBPS)
+declare -A BW_LABEL=([smollm3]="PCIe" [e2b]="PCIe" [e4b]="PCIe" [q3]="PCIe" [qwen35q]="RAM")
+
+# CTX candidates: larger now thanks to turbo2 (2x smaller KV vs q4_0)
+# Note: turbo2 is SKIPPED for Qwen3-4B (PPL explodes at ctx>=8192: +0.52 → +13 → +437)
+#       turbo2 is SKIPPED for Qwen3.5-9B (hybrid linear-attn incompatible with llama-perplexity;
+#       server works fine at 32K — this is a test-tool limitation, not a real issue)
+SMOL_CTXS=(32768 49152 65536 98304 131072 163840)
+E2B_CTXS=(32768 49152 65536 98304 131072 163840 196608 262144 393216)
+E4B_CTXS=(32768 49152 65536 98304 131072 163840)
+Q3_CTXS=(24576 32768 49152 65536 98304 131072)
+Q35_CTXS=(16384 32768 49152 65536 98304 131072)
+declare -A CTX_CANDIDATES=(
+    [smollm3]="SMOL_CTXS" [e2b]="E2B_CTXS" [e4b]="E4B_CTXS"
+    [q3]="Q3_CTXS" [qwen35q]="Q35_CTXS")   # values are the NAMES of the arrays above
+
+# Pure-GPU ctx for gain comparison
+declare -A PURE_GPU_CTX=([smollm3]=24576 [e2b]=24576 [e4b]=24576 [q3]=16384 [qwen35q]=32768)   # subtracted from the recommended ctx to report the gain
+
+GREEN='\033[0;32m'; RED='\033[0;31m'; YELLOW='\033[1;33m'; BLUE='\033[0;34m'; CYAN='\033[0;36m'; NC='\033[0m'
+HR="======================================================================"
+# Tiny alloc file — enough for 1 chunk, minimal compute time.
+# It is created with mktemp instead of a fixed, predictable name in /tmp and
+# removed when the script exits. check_alloc reads it through $ALLOC_FILE.
+ALLOC_FILE=$(mktemp /tmp/kv_alloc_tiny.XXXXXX) || { echo "mktemp failed" >&2; exit 1; }
+trap 'rm -f -- "$ALLOC_FILE"' EXIT
+# 64 lines drawn from five fixed sentences; the seed makes the text reproducible.
+python3 -c "
+sentences = [
+    'The transformer architecture uses self-attention mechanisms to process sequences.',
+    'Large language models require significant computational resources for training.',
+    'Quantization reduces memory usage by storing weights in lower precision formats.',
+    'Flash attention enables memory-efficient computation for long context windows.',
+    'The key-value cache stores intermediate attention states during generation.',
+]
+import random; random.seed(1)
+print(chr(10).join([random.choice(sentences) for _ in range(64)]))
+" > "$ALLOC_FILE"
+
+# check_alloc MODEL NGL KV CTX [EXTRA...]
+# Probes whether a context of CTX tokens with KV type KV can be allocated with
+# the KV cache kept in host RAM (-nkvo), by starting llama-perplexity on the
+# tiny $ALLOC_FILE.
+# Prints the host "context" size in MiB parsed from stderr ("?" when that line
+# is absent) and returns 0; prints "OOM" and returns 1 when stderr contains an
+# allocation error or the probe hits the 90 s timeout.
+check_alloc() {
+    local model=$1 ngl=$2 kv=$3 ctx=$4
+    shift 4
+    local tmp_err
+    tmp_err=$(mktemp) || { echo "OOM"; return 1; }
+
+    # "$@" now holds only the extra llama-perplexity arguments.
+    timeout 90 /app/llama-perplexity \
+        -m "$model" -ngl "$ngl" \
+        -fa on -nkvo \
+        -c "$ctx" -ctk "$kv" -ctv "$kv" \
+        -f "$ALLOC_FILE" --chunks 1 \
+        "$@" \
+        > /dev/null 2>"$tmp_err"
+    local rc=$?
+
+    local err
+    err=$(<"$tmp_err")
+    rm -f -- "$tmp_err"
+
+    # timeout(1) exits with 124 when it had to kill the command. A probe that
+    # never finished proves nothing, so it is reported as a failure instead of
+    # falling through as a success with an unknown size. Other non-zero exit
+    # codes are deliberately not treated as failures here.
+    if (( rc == 124 )); then
+        echo "OOM"; return 1
+    fi
+
+    if grep -qiE "out of memory|failed to allocate|cudaMalloc failed|CUDA_ERROR_OUT_OF_MEMORY|ggml_cuda_malloc|cannot allocate memory|cannot create buffer" <<< "$err"; then
+        echo "OOM"; return 1
+    fi
+
+    # Parse Host context MiB: "| Host | total = model + context + compute |"
+    local host_ctx_mib
+    host_ctx_mib=$(grep "Host" <<< "$err" | \
+        grep -oP "=\s*\d+\s*\+\s*\K\d+(?=\s*\+)" | head -1 || true)
+    echo "${host_ctx_mib:-?}"
+}
+
+# measure_baseline_tps MODEL NGL [EXTRA...]
+measure_baseline_tps() {
+    local model=$1 ngl=$2
+    shift 2
+    local extra_args=("$@")
+    local raw
+    raw=$(timeout 120 /app/llama-bench \
+        -m "$model" -ngl "$ngl" -t "$CPU_THREADS" \
+        -p 1 -n "$BENCH_GEN" \
+        -ctk q4_0 -ctv q4_0 -nkvo 1 -fa 1 -r 1 -o csv \
+        "${extra_args[@]}" 2>/dev/null) || true
+    printf '%s\n' "$raw" | sed 's/"//g' | \
+        awk -F',' 'NR>1 && $34=="0" && $35+0>0 && $40+0>0 {print $40+0; exit}'
+}
+
+# estimate_tps BASELINE_TPS KV_PER_TOKEN_MIB CTX BW_GBPS
+estimate_tps() {
+    local baseline_tps=$1 kv_per_token_mib=$2 ctx=$3 bw_gbps=$4
+    python3 -c "
+baseline = float('$baseline_tps')
+kv_tok_bytes = float('$kv_per_token_mib') * 1024 * 1024
+bps = float('$bw_gbps') * 1e9
+ctx = int('$ctx')
+base_ms = 1000.0 / baseline
+kv_ms = ctx * kv_tok_bytes / bps * 1000
+print(f'{1000.0 / (base_ms + kv_ms):.1f}')
+" 2>/dev/null || echo "?"
+}
+
+# ---------------------------------------------------------------------------
+# Main loop: for every selected model, measure a baseline t/s, probe each
+# candidate context size with each KV type, estimate t/s at 25/50/100% fill,
+# and collect one summary row per model.
+echo "$HR"
+echo "CPU-RAM KV CONTEXT TEST v4 (-nkvo, TurboQuant FORCE_MMQ) -- $(date)"
+echo "GPU: $(nvidia-smi --query-gpu=name,memory.total --format=csv,noheader 2>/dev/null)"
+echo "KV types tested: q4_0 (4-bit) and turbo2 (2-bit, 2x smaller → 2x more ctx)"
+printf "PCIe assumption: %.1f GB/s  |  RAM assumption: %.1f GB/s\n" "$PCIE_BW_GBPS" "$RAM_BW_GBPS"
+echo "$HR"
+echo ""
+
+# Each row: label|baseline|q4_0 max|q4_0 rec|turbo2 max|turbo2 rec|q4_0 gain|turbo2 gain
+declare -a SUMMARY=()
+
+for entry in \
+    "smollm3:SmolLM3-3B:$M_SMOL" \
+    "e2b:Gemma4-E2B:$M_E2B" \
+    "e4b:Gemma4-E4B:$M_E4B" \
+    "q3:Qwen3-4B:$M_Q3" \
+    "qwen35q:Qwen3.5-9B:$M_Q35"
+do
+    IFS=':' read -r key lbl model <<< "$entry"
+    [[ "$TARGET" != "all" && "$TARGET" != "$key" ]] && continue
+
+    # CTX_CANDIDATES maps the key to the NAME of an array; expand it through
+    # indirect expansion instead of eval.
+    ctx_ref="${CTX_CANDIDATES[$key]}[@]"
+    ctxs=("${!ctx_ref}")
+    ngl="${NGL_GPU[$key]}"
+    bw_gbps="${BW_GBPS[$key]}"
+    bw_label="${BW_LABEL[$key]}"
+
+    # turbo2 incompatible with Qwen3-4B (quality fails at ctx>=8192)
+    # turbo2 alloc works for Qwen3.5-9B but quality measurement unreliable — test q4_0 only
+    if [[ "$key" == "q3" || "$key" == "qwen35q" ]]; then
+        kv_types_to_test=(q4_0)
+    else
+        kv_types_to_test=(q4_0 turbo2)
+    fi
+
+    extra_args=()
+
+    printf "${BLUE}=== %s (ngl=%s, BW model: %s %.0f GB/s) ===${NC}\n" \
+        "$lbl" "$ngl" "$bw_label" "$bw_gbps"
+
+    # Baseline t/s (empty KV, with q4_0 -nkvo — upper bound)
+    printf "  Measuring baseline t/s (empty KV, p=1)... "
+    baseline_tps=$(measure_baseline_tps "$model" "$ngl" "${extra_args[@]}")
+    if [[ -z "$baseline_tps" ]]; then
+        printf "${RED}FAIL${NC}\n\n"
+        # Eight fields, like a normal row, so the summary table prints "?"
+        # for the two gain columns instead of empty strings.
+        SUMMARY+=("$lbl|FAIL|FAIL|FAIL|FAIL|FAIL|?|?")
+        continue
+    fi
+    printf "${GREEN}%s t/s${NC}\n\n" "$baseline_tps"
+
+    # Header
+    printf "  %-10s  %-12s  %-12s  %-12s  %-12s  %-12s  %-12s\n" \
+        "ctx" "KV type" "KV in RAM" "kv/tok" "t/s@25%" "t/s@50%" "t/s@100%"
+    printf "  %-10s  %-12s  %-12s  %-12s  %-12s  %-12s  %-12s\n" \
+        "---" "-------" "---------" "------" "-------" "-------" "--------"
+
+    max_ctx_q4=""
+    max_ctx_t2=""
+    rec_q4=""
+    rec_t2=""
+    declare -A kv_ref_mib=()
+
+    for ctx in "${ctxs[@]}"; do
+        for kv_type in "${kv_types_to_test[@]}"; do
+            result=$(check_alloc "$model" "$ngl" "$kv_type" "$ctx" "${extra_args[@]}")
+            if [[ "$result" == "OOM" ]]; then
+                printf "  ${RED}%-10s  %-12s  OOM${NC}\n" "$ctx" "$kv_type"
+                continue
+            fi
+
+            host_kv_mib="${result}"
+            # Largest context that allocated, tracked per KV type.
+            if [[ "$kv_type" == "q4_0" ]]; then
+                max_ctx_q4=$ctx
+            else
+                max_ctx_t2=$ctx
+            fi
+
+            # KV per token: measured size / ctx; when the size could not be
+            # parsed, reuse the last per-token value seen for this KV type.
+            if [[ "$host_kv_mib" =~ ^[0-9]+$ ]]; then
+                kv_per_token_mib=$(python3 -c "print(f'{$host_kv_mib / $ctx:.6f}')")
+                kv_ref_mib[$kv_type]=$kv_per_token_mib
+            else
+                kv_per_token_mib="${kv_ref_mib[$kv_type]:-?}"
+            fi
+
+            tps25=$(estimate_tps "$baseline_tps" "$kv_per_token_mib" "$(( ctx / 4 ))" "$bw_gbps")
+            tps50=$(estimate_tps "$baseline_tps" "$kv_per_token_mib" "$(( ctx / 2 ))" "$bw_gbps")
+            tps100=$(estimate_tps "$baseline_tps" "$kv_per_token_mib" "$ctx" "$bw_gbps")
+
+            # A context is "recommended" when the estimate at 50% fill reaches TARGET_TPS.
+            meets=$(python3 -c "print(1 if '$tps50' != '?' and float('$tps50') >= $TARGET_TPS else 0)" 2>/dev/null || echo 0)
+            [[ "$kv_type" == "q4_0" && "$meets" == "1" ]] && rec_q4=$ctx
+            [[ "$kv_type" == "turbo2" && "$meets" == "1" ]] && rec_t2=$ctx
+
+            color=$([[ "$meets" == "1" ]] && echo "$GREEN" || echo "$YELLOW")
+            printf "  ${color}%-10s${NC}  %-12s  %-12s  %-12s  %-12s  ${color}%-12s${NC}  %-12s\n" \
+                "$ctx" "$kv_type" "${host_kv_mib}MiB" "${kv_per_token_mib}MiB" \
+                "$tps25" "$tps50" "$tps100"
+        done
+    done
+
+    # When no context met the target, fall back to the largest one that allocated.
+    rec_q4="${rec_q4:-$max_ctx_q4}"
+    rec_t2="${rec_t2:-$max_ctx_t2}"
+    pg="${PURE_GPU_CTX[$key]}"
+
+    printf "\n  Recommended ctx (>=%s t/s@50%%): q4_0=%s  turbo2=%s  (pure-GPU was %s)\n\n" \
+        "$TARGET_TPS" "${rec_q4:-FAIL}" "${rec_t2:-FAIL}" "$pg"
+
+    # Gain = recommended ctx minus the pure-GPU ctx, in tokens ("?" when unknown).
+    gain_q4=$([[ -n "${rec_q4:-}" && "${rec_q4:-}" != "FAIL" ]] && echo "$((rec_q4 - pg))" || echo "?")
+    gain_t2=$([[ -n "${rec_t2:-}" && "${rec_t2:-}" != "FAIL" ]] && echo "$((rec_t2 - pg))" || echo "?")
+    SUMMARY+=("$lbl|$baseline_tps|${max_ctx_q4:-OOM}|${rec_q4:-FAIL}|${max_ctx_t2:-OOM}|${rec_t2:-FAIL}|$gain_q4|$gain_t2")
+
+    unset kv_ref_mib max_ctx_q4 max_ctx_t2 rec_q4 rec_t2
+done
+
+# Final table: one line per SUMMARY row, followed by the gain versus the
+# pure-GPU context for q4_0 and turbo2.
+summary_cols="%-16s  %-12s  %-14s  %-14s  %-14s  %-14s"
+printf '%s\n' "$HR" "SUMMARY — -nkvo (KV in RAM): q4_0 vs turbo2" "$HR"
+printf "${summary_cols}\n" \
+    "Model" "Baseline t/s" "q4_0 max" "q4_0 rec" "turbo2 max" "turbo2 rec"
+printf "${summary_cols}\n" \
+    "-----" "------------" "--------" "--------" "----------" "----------"
+for row in "${SUMMARY[@]}"; do
+    # Fields are separated by "|" in the order they were appended.
+    IFS='|' read -r lbl btps max_q4 rec_q4 max_t2 rec_t2 g_q4 g_t2 <<< "$row"
+    printf "${GREEN}${summary_cols}  [q4+%s / t2+%s vs pure-GPU]${NC}\n" \
+        "$lbl" "$btps" "$max_q4" "$rec_q4" "$max_t2" "$rec_t2" "$g_q4" "$g_t2"
+done
+printf '%s\n' \
+    "$HR" \
+    "Note: Qwen3.5-9B baseline already <15 t/s (RAM-bound, 8.86 GB model). BW model uses RAM not PCIe." \
+    "$HR"
--- a/scripts/download_models.sh
+++ b/scripts/download_models.sh
@@ -0,0 +1,116 @@
+#!/usr/bin/env bash
+# download_models.sh — Download GGUF model files to ./models/
+#
+# Usage:
+#   bash scripts/download_models.sh           # all models
+#   bash scripts/download_models.sh smollm3   # single model
+#   bash scripts/download_models.sh gemma4-e2b gemma4-e4b  # multiple
+#
+# Requires: huggingface-cli (pip install huggingface_hub)
+# Models land in: ./models/
+#
+# Available keys: smollm3 | gemma4-e2b | gemma4-e4b | qwen3-4b | qwen35-9b | all
+
+set -euo pipefail
+
+# Models live in <repo root>/models, where the repo root is the parent of the
+# directory containing this script.
+MODELS_DIR="$(cd "$(dirname "$0")/.." && pwd)"
+MODELS_DIR="${MODELS_DIR}/models"
+mkdir -p "$MODELS_DIR"
+
+# ANSI colour escapes, interpreted by `echo -e` in the functions below.
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+RED='\033[0;31m'
+NC='\033[0m'
+
+# check_hf_cli
+# Verify that the huggingface-cli executable is on PATH.
+# Returns: 0 when it is found. Otherwise prints an install hint to stderr and
+#          exits the whole script with status 1.
+check_hf_cli() {
+    if command -v huggingface-cli &>/dev/null; then
+        return 0
+    fi
+    # Diagnostics go to stderr so they are never mixed into captured stdout.
+    echo -e "${RED}Error: huggingface-cli not found.${NC}" >&2
+    echo "Install with: pip install huggingface_hub" >&2
+    exit 1
+}
+
+# download KEY REPO FILENAME SIZE_HINT
+# Fetch FILENAME from the Hugging Face repo REPO into $MODELS_DIR, unless a
+# regular file with that name is already present there.
+# KEY and SIZE_HINT only appear in the progress messages.
+download() {
+    local key="$1" repo="$2" filename="$3" size_hint="$4"
+    local tag="[$key]"
+
+    if [[ ! -f "$MODELS_DIR/$filename" ]]; then
+        echo -e "${GREEN}${tag}${NC} Downloading $filename (~$size_hint) from $repo ..."
+        huggingface-cli download "$repo" "$filename" --local-dir "$MODELS_DIR"
+        echo -e "${GREEN}${tag}${NC} Done: $MODELS_DIR/$filename"
+    else
+        echo -e "${YELLOW}${tag}${NC} Already exists: $filename — skipping"
+    fi
+}
+
+download_smollm3() {
+    download "smollm3" \
+        "bartowski/HuggingFaceTB_SmolLM3-3B-GGUF" \
+        "HuggingFaceTB_SmolLM3-3B-Q4_K_M.gguf" \
+        "1.9 GB"
+}
+
+download_gemma4_e2b() {
+    download "gemma4-e2b" \
+        "bartowski/google_gemma-4-E2B-it-GGUF" \
+        "google_gemma-4-E2B-it-Q4_K_M.gguf" \
+        "2.9 GB"
+}
+
+download_gemma4_e4b() {
+    download "gemma4-e4b" \
+        "bartowski/google_gemma-4-E4B-it-GGUF" \
+        "google_gemma-4-E4B-it-Q4_K_M.gguf" \
+        "4.7 GB"
+}
+
+download_qwen3_4b() {
+    download "qwen3-4b" \
+        "bartowski/Qwen3-4B-GGUF" \
+        "Qwen3-4B-Q4_K_M.gguf" \
+        "2.4 GB"
+}
+
+download_qwen35_9b() {
+    download "qwen35-9b" \
+        "Jackrong/Qwen3.5-9B-Claude-4.6-Opus-Reasoning-Distilled-v2-GGUF" \
+        "Qwen3.5-9B.Q8_0.gguf" \
+        "8.9 GB"
+}
+
+# main [KEY...]
+# Download the requested models into $MODELS_DIR and list the resulting files.
+#   - No arguments, or "all" as the first argument: every model.
+#   - "all" in a later position expands to every model at that point.
+# All keys are validated before the first download starts, so a typo aborts
+# immediately instead of after earlier multi-GB downloads have finished.
+# Exits with status 1 (message on stderr) on an unknown key.
+main() {
+    check_hf_cli
+
+    # Single source of truth for the model keys; also what "all" expands to.
+    local -a all_keys=(smollm3 gemma4-e2b gemma4-e4b qwen3-4b qwen35-9b)
+    local -a targets=()
+    local arg target
+
+    if [[ $# -eq 0 || "$1" == "all" ]]; then
+        targets=("${all_keys[@]}")
+    else
+        for arg in "$@"; do
+            case "$arg" in
+                all)
+                    targets+=("${all_keys[@]}")
+                    ;;
+                smollm3 | gemma4-e2b | gemma4-e4b | qwen3-4b | qwen35-9b)
+                    targets+=("$arg")
+                    ;;
+                *)
+                    echo -e "${RED}Unknown model: $arg${NC}" >&2
+                    echo "Valid keys: smollm3 | gemma4-e2b | gemma4-e4b | qwen3-4b | qwen35-9b | all" >&2
+                    exit 1
+                    ;;
+            esac
+        done
+    fi
+
+    for target in "${targets[@]}"; do
+        # Key -> wrapper name, e.g. "gemma4-e2b" -> download_gemma4_e2b.
+        # Safe to build dynamically: every key was validated above.
+        "download_${target//-/_}"
+    done
+
+    echo ""
+    echo "Models directory:"
+    ls -lh "$MODELS_DIR"/*.gguf 2>/dev/null || echo "(no .gguf files found)"
+}
+
+main "$@"
--- a/scripts/kv_quant_test.sh
+++ b/scripts/kv_quant_test.sh
@@ -0,0 +1,246 @@
+#!/bin/bash
+# KV cache quantization test using llama-perplexity.
+# Image: local/llama-cpp-turboquant:full-cuda-sm75-mmq (FORCE_MMQ, turbo2/3/4 support)
+#
+# Tests KV types: f16 (baseline) + q8_0/q4_0/turbo2 for Q4_K_M models
+#                 f16 (baseline) + turbo2/3/4 for Qwen3.5-9B Q8_0
+# Quality gate: PPL delta vs f16 < 0.5 (lossless for practical use)
+#
+# Usage: bash /scripts/kv_quant_test.sh [MODEL_KEY]
+#   MODEL_KEY: smollm3 | e2b | e4b | q3 | qwen35q | all (default: all)
+
+set -uo pipefail
+
+# Model selector taken from the first CLI argument; "all" runs every model.
+TARGET="${1:-all}"
+
+# Model files (absolute paths under /models)
+M_SMOL="/models/HuggingFaceTB_SmolLM3-3B-Q4_K_M.gguf"
+M_E2B="/models/google_gemma-4-E2B-it-Q4_K_M.gguf"
+M_E4B="/models/google_gemma-4-E4B-it-Q4_K_M.gguf"
+M_Q3="/models/Qwen3-4B-Q4_K_M.gguf"
+M_Q35="/models/Qwen3.5-9B.Q8_0.gguf"
+
+# Value passed to -ngl for each model
+declare -A NGL=(
+    [smollm3]=99
+    [e2b]=99
+    [e4b]=42
+    [q3]=99
+    [qwen35q]=11
+)
+# Starting value for each model's "best ctx" bookkeeping
+declare -A BASE_CTX=(
+    [smollm3]=18432
+    [e2b]=32768
+    [e4b]=20480
+    [q3]=8192
+    [qwen35q]=8192
+)
+# Per-run timeout in seconds handed to timeout(1)
+declare -A PPL_TIMEOUT=(
+    [smollm3]=300
+    [e2b]=300
+    [e4b]=300
+    [q3]=300
+    [qwen35q]=600
+)
+
+# Per-model KV types to test (f16 is always the baseline)
+# Standard Q4_K_M models: q8_0/q4_0 + turbo2 (all supported by TurboQuant image)
+# Qwen3.5-9B: designed for turbo KV — test turbo2/3/4 only (q4_0 would also work but less relevant)
+declare -A MODEL_KV_TYPES=(
+    [smollm3]="q8_0 q4_0 turbo2"
+    [e2b]="q8_0 q4_0 turbo2"
+    [e4b]="q8_0 q4_0 turbo2"
+    [q3]="q8_0 q4_0 turbo2"
+    [qwen35q]="turbo2 turbo3 turbo4"
+)
+
+# ctx candidates per model
+SMOL_CTXS=(8192 12288 16384 18432 20480 24576 32768 40960 49152)
+E2B_CTXS=(8192 16384 24576 32768 40960 49152 65536)
+E4B_CTXS=(8192 12288 16384 20480 24576 32768 40960)
+Q3_CTXS=(4096 6144 8192 10240 12288 16384 24576 32768)
+Q35_CTXS=(4096 8192 16384 24576 32768 40960 49152)
+# Model key -> NAME of the array above (resolved indirectly by the main loop)
+declare -A CTX_CANDIDATES=(
+    [smollm3]="SMOL_CTXS"
+    [e2b]="E2B_CTXS"
+    [e4b]="E4B_CTXS"
+    [q3]="Q3_CTXS"
+    [qwen35q]="Q35_CTXS"
+)
+
+# ANSI colour escapes (expanded inside printf format strings)
+GREEN='\033[0;32m'
+RED='\033[0;31m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m'
+HR="======================================================================"
+
+# Synthetic PPL file — 4000 lines, deterministic, no network needed
+PPL_FILE="/tmp/kv_ppl_input.txt"
+
+# ensure_ppl_file
+# Create $PPL_FILE if it does not exist yet: 4000 lines drawn with a fixed
+# random seed from a small pool of sentences, so every run sees the same text.
+# Returns: 0 when the file already exists, otherwise python3's exit status.
+ensure_ppl_file() {
+    if [[ -f "$PPL_FILE" ]]; then
+        return 0
+    fi
+    # The destination is passed as argv[1] so the generator always writes to
+    # $PPL_FILE; a second hard-coded copy of the path would silently diverge
+    # if PPL_FILE were ever changed.
+    python3 - "$PPL_FILE" << 'PY'
+import random, sys
+random.seed(42)
+sentences = [
+    "The transformer architecture uses self-attention mechanisms to process sequences.",
+    "Large language models require significant computational resources for training.",
+    "Quantization reduces memory usage by storing weights in lower precision formats.",
+    "Flash attention enables memory-efficient computation for long context windows.",
+    "The key-value cache stores intermediate attention states during generation.",
+    "Context length determines how many tokens the model can attend to simultaneously.",
+    "Perplexity measures how well a probability model predicts a sample of text.",
+    "Lower perplexity values indicate better language modeling performance overall.",
+    "GPU memory bandwidth is the primary bottleneck for autoregressive token generation.",
+    "Grouped query attention reduces KV cache size by sharing keys across head groups.",
+    "Rotary position embeddings encode relative position information in attention queries.",
+    "Mixture of experts models route tokens through specialized feed-forward networks.",
+    "Continuous batching allows servers to process multiple requests simultaneously.",
+    "KV cache quantization trades a small quality loss for significantly larger contexts.",
+]
+lines = [random.choice(sentences) for _ in range(4000)]
+# Context manager so the file is flushed and closed deterministically.
+with open(sys.argv[1], 'w') as out:
+    print('\n'.join(lines), file=out)
+PY
+}
+
+# run_ppl MODEL NGL KV CTX TIMEOUT [EXTRA_ARGS...]
+# Run /app/llama-perplexity once (one chunk of $PPL_FILE) with KV type KV used
+# for both K and V caches, under a TIMEOUT-second limit.
+# Stdout:  the last "[N]<value>" figure found in the tool's stdout.
+# Returns: 0 on success; 1 when the tool exits non-zero (including timeout),
+#          its stderr contains an allocation-failure message, or no value
+#          could be parsed.
+run_ppl() {
+    local model=$1 ngl=$2 kv=$3 ctx=$4 timeout_s=$5
+    shift 5
+    local extra_args=("$@")
+
+    local err_file out_file
+    err_file=$(mktemp)
+    out_file=$(mktemp)
+
+    local -a cmd=(
+        /app/llama-perplexity
+        -m "$model"
+        -ngl "$ngl"
+        -fa on
+        -c "$ctx"
+        -ctk "$kv" -ctv "$kv"
+        -f "$PPL_FILE"
+        --chunks 1
+        "${extra_args[@]}"
+    )
+    timeout "$timeout_s" "${cmd[@]}" > "$out_file" 2> "$err_file"
+    local rc=$?
+
+    local stderr_text
+    stderr_text=$(< "$err_file")
+    rm -f "$err_file"
+
+    # Failure = bad exit status, or a known allocation error on stderr.
+    local failed=0
+    if [[ "$rc" != "0" ]]; then
+        failed=1
+    elif grep -qi "out of memory\|failed to allocate\|cudaMalloc failed\|CUDA_ERROR_OUT_OF_MEMORY\|ggml_cuda_malloc\|cannot allocate memory" <<< "$stderr_text"; then
+        failed=1
+    fi
+    if [[ "$failed" == "1" ]]; then
+        rm -f "$out_file"
+        return 1
+    fi
+
+    local value
+    value=$(grep -oP '\[\d+\]\K[0-9.]+' "$out_file" | tail -1)
+    rm -f "$out_file"
+    if [[ -z "$value" ]]; then
+        return 1
+    fi
+    echo "$value"
+}
+
+# ---------------------------------------------------------------------------
+# Prepare the input text, then print the run banner.
+ensure_ppl_file
+
+printf '%s\n' "$HR"
+printf '%s\n' "KV CACHE QUANT TEST (llama-perplexity) — TurboQuant image (FORCE_MMQ SM75)"
+date
+printf '%s\n' "GPU: $(nvidia-smi --query-gpu=name,memory.total --format=csv,noheader 2>/dev/null)"
+printf '%s\n' "$HR"
+printf '%s\n' \
+    "Standard models: f16 baseline | q8_0 | q4_0 | turbo2 (2-bit, 8x smaller KV vs f16)" \
+    "Qwen3.5-9B:      f16 baseline | turbo2 | turbo3 | turbo4 (TurboQuant KV types)" \
+    "Quality gate: PPL delta vs f16 < 0.5" \
+    ""
+
+# SUMMARY rows have the form "label|kv_type|best_ctx"; printed after the loop.
+declare -a SUMMARY=()
+
+# Sweep every selected model over its ctx candidates. For each ctx the f16
+# baseline is measured first, then every KV type configured for the model.
+# Once a KV type fails at some ctx it is not tried again at larger ones.
+# NOTE: any run_ppl failure (timeout, crash, unparsable output) is shown as
+# "OOM", because run_ppl only reports success or failure.
+for entry in \
+    "smollm3:SmolLM3-3B:$M_SMOL" \
+    "e2b:Gemma4-E2B:$M_E2B" \
+    "e4b:Gemma4-E4B:$M_E4B" \
+    "q3:Qwen3-4B:$M_Q3" \
+    "qwen35q:Qwen3.5-9B:$M_Q35"
+do
+    IFS=':' read -r key lbl model <<< "$entry"
+    [[ "$TARGET" != "all" && "$TARGET" != "$key" ]] && continue
+
+    # A missing model file would make every run fail and be mislabelled as
+    # OOM, and the summary would then report BASE_CTX as a "result".
+    if [[ ! -f "$model" ]]; then
+        printf "${YELLOW}=== %s: model file not found (%s) — skipping ===${NC}\n\n" \
+            "$lbl" "$model" >&2
+        continue
+    fi
+
+    # CTX_CANDIDATES maps the key to an array NAME; the keys are the fixed
+    # literals listed above, so this eval never sees external input.
+    eval "ctxs=(\"\${${CTX_CANDIDATES[$key]}[@]}\")"
+    ngl="${NGL[$key]}"
+    timeout_s="${PPL_TIMEOUT[$key]}"
+    IFS=' ' read -ra kv_types <<< "${MODEL_KV_TYPES[$key]}"
+
+    # Extra args for qwen35-9b (flash attn already set; no mlock needed for PPL correctness)
+    extra_args=()
+
+    printf "${BLUE}=== %s (base ctx=%s, ngl=%s) ===${NC}\n" \
+        "$lbl" "${BASE_CTX[$key]}" "$ngl"
+
+    # Dynamic header based on KV types for this model
+    printf "  %-10s  %-18s" "ctx" "f16 (PPL)"
+    for kv in "${kv_types[@]}"; do
+        printf "  %-20s" "$kv (PPL/delta)"
+    done
+    printf "\n"
+    printf "  %-10s  %-18s" "---" "---------"
+    for kv in "${kv_types[@]}"; do
+        printf "  %-20s" "--------------------"
+    done
+    printf "\n"
+
+    # best_ctx_per_kv starts at BASE_CTX and is only raised by accepted runs.
+    declare -A best_ctx_per_kv=([f16]="${BASE_CTX[$key]}")
+    for kv in "${kv_types[@]}"; do best_ctx_per_kv[$kv]="${BASE_CTX[$key]}"; done
+    # oom_kv[type]=1 once that type has failed; larger ctx values are skipped.
+    declare -A oom_kv=([f16]=0)
+    for kv in "${kv_types[@]}"; do oom_kv[$kv]=0; done
+
+    for ctx in "${ctxs[@]}"; do
+        printf "  %-10s" "$ctx"
+
+        # f16 baseline
+        f16_ppl=""
+        if [[ "${oom_kv[f16]}" == "1" ]]; then
+            printf "  ${RED}%-18s${NC}" "OOM"
+        else
+            f16_ppl=$(run_ppl "$model" "$ngl" "f16" "$ctx" "$timeout_s" "${extra_args[@]}")
+            if [[ $? -ne 0 ]]; then
+                printf "  ${RED}%-18s${NC}" "OOM"
+                oom_kv[f16]=1
+            else
+                printf "  ${GREEN}%-18s${NC}" "$f16_ppl"
+                best_ctx_per_kv[f16]=$ctx
+            fi
+        fi
+
+        # KV type columns
+        for kv in "${kv_types[@]}"; do
+            if [[ "${oom_kv[$kv]}" == "1" ]]; then
+                printf "  ${RED}%-20s${NC}" "OOM"
+                continue
+            fi
+            ppl=$(run_ppl "$model" "$ngl" "$kv" "$ctx" "$timeout_s" "${extra_args[@]}")
+            if [[ $? -ne 0 ]]; then
+                printf "  ${RED}%-20s${NC}" "OOM"
+                oom_kv[$kv]=1
+                continue
+            fi
+
+            if [[ -n "$f16_ppl" ]]; then
+                delta=$(python3 -c "print(f'{float(\"$ppl\")-float(\"$f16_ppl\"):+.2f}')" 2>/dev/null || echo "?")
+                ok=$(python3 -c "exit(0 if abs(float('$ppl')-float('$f16_ppl'))<0.5 else 1)" 2>/dev/null && echo ok || echo bad)
+                if [[ "$ok" == "ok" ]]; then
+                    # Only a run that passes the quality gate may raise the
+                    # recommended ctx; the summary header promises exactly this.
+                    best_ctx_per_kv[$kv]=$ctx
+                    printf "  ${GREEN}%-20s${NC}" "${ppl}(${delta})"
+                else
+                    printf "  ${YELLOW}%-20s${NC}" "${ppl}(${delta})"
+                fi
+            else
+                # No f16 figure at this ctx (f16 itself did not run), so the
+                # gate cannot be evaluated: accept on "ran without error" alone.
+                best_ctx_per_kv[$kv]=$ctx
+                printf "  ${GREEN}%-20s${NC}" "$ppl"
+            fi
+        done
+        echo ""
+    done
+
+    echo ""
+
+    # Best recommendation: the non-f16 KV type with the highest accepted ctx
+    # (f16 at BASE_CTX when no type got past it).
+    overall_best_ctx="${BASE_CTX[$key]}"
+    overall_best_kv="f16"
+    for kv in "${kv_types[@]}"; do
+        bctx="${best_ctx_per_kv[$kv]}"
+        SUMMARY+=("$lbl|$kv|$bctx")
+        if [[ "$bctx" -gt "$overall_best_ctx" ]]; then
+            overall_best_ctx=$bctx; overall_best_kv=$kv
+        fi
+    done
+    SUMMARY+=("$lbl|f16|${best_ctx_per_kv[f16]}")
+    printf "  ${GREEN}Best: %s → max ctx %s${NC}\n\n" "$overall_best_kv" "$overall_best_ctx"
+
+    unset best_ctx_per_kv oom_kv
+done
+
+echo "$HR"
+echo "SUMMARY"
+echo "$HR"
+printf "%-16s  %-8s  %s\n" "Model" "KV" "Max Ctx (no OOM + PPL delta<0.5)"
+printf "%-16s  %-8s  %s\n" "-----" "--" "---------------------------------"
+for row in "${SUMMARY[@]}"; do
+    IFS='|' read -r lbl kv ctx <<< "$row"
+    printf "${GREEN}%-16s  %-8s  %s${NC}\n" "$lbl" "$kv" "$ctx"
+done
+echo "$HR"
+echo "Reminder: update envs/.env.<model>: CACHE_TYPE_K/V=<best_kv>  CTX_SIZE=<max_ctx>"
+echo "$HR"
--- a/scripts/quality_test.sh
+++ b/scripts/quality_test.sh
@@ -0,0 +1,215 @@
+#!/bin/bash
+# Quality tests for all 4 models — runs inside full-cuda container.
+# Tests: coding tasks + needle-in-haystack at 1K/8K ctx.
+#
+# Inference parameters sourced from official HF model cards:
+#   SmolLM3:  /no_think in SYSTEM prompt (-sys); temp=0.6 top_p=0.95
+#   Qwen3:    /no_think in SYSTEM prompt (-sys); temp=0.7 top_p=0.8 top_k=20
+#             DO NOT use greedy (temp=0) — causes endless repetition per Qwen3 docs
+#   Gemma4:   No thinking mode; temp=0.7 top_p=0.95
+
+# -u: unset variables are errors; pipefail: a pipeline fails if any stage
+# fails. No -e, so a failing model run does not abort the remaining tests.
+set -uo pipefail
+
+# Model files, one per key used in the tables below.
+M_SMOL="/models/HuggingFaceTB_SmolLM3-3B-Q4_K_M.gguf"
+M_E2B="/models/google_gemma-4-E2B-it-Q4_K_M.gguf"
+M_E4B="/models/google_gemma-4-E4B-it-Q4_K_M.gguf"
+M_Q3="/models/Qwen3-4B-Q4_K_M.gguf"
+
+# NGL: value passed to llama-cli -ngl.
+# MAX_CTX: needle tests only run for sizes strictly below this value.
+declare -A NGL=([smollm3]=99 [e2b]=99 [e4b]=42 [q3]=99)
+declare -A MAX_CTX=([smollm3]=24576 [e2b]=32768 [e4b]=24576 [q3]=8192)
+
+# Per-model sampling params (HF model card sources)
+# TOPK "0" means: do not pass --top-k at all (see run()).
+declare -A TEMP=([smollm3]="0.6"  [e2b]="0.7"  [e4b]="0.7"  [q3]="0.7")
+declare -A TOPP=([smollm3]="0.95" [e2b]="0.95" [e4b]="0.95" [q3]="0.8")
+declare -A TOPK=([smollm3]="0"    [e2b]="0"    [e4b]="0"    [q3]="20")
+# /no_think in system prompt disables thinking for SmolLM3 and Qwen3
+# An empty entry means no -sys argument is passed for that model.
+declare -A SYSP=([smollm3]="/no_think" [e2b]="" [e4b]="" [q3]="/no_think")
+
+# Global counters updated by check() and by the needle loop.
+PASS=0; FAIL=0; TOTAL=0
+
+# ANSI colour escapes (expanded inside printf format strings)
+GREEN='\033[0;32m'; RED='\033[0;31m'; YELLOW='\033[1;33m'; NC='\033[0m'
+
+# sed script to strip llama-cli interactive UI banner from stdout.
+# ▄ (U+2584) and █ (U+2588) appear in the llama.cpp ASCII logo, sometimes
+# with leading spaces — match anywhere on the line to be safe.
+# Every rule is a delete: empty/blank lines, status and banner lines, the
+# command list, the "[ Prompt:" lines, and lines starting with "> ".
+STRIP_BANNER='/^$/d
+/^Loading model/d
+/^[[:space:]]*$/d
+/[▄█]/d
+/^build /d
+/^model /d
+/^modalities/d
+/^available commands/d
+/^  \//d
+/^\[ Prompt:/d
+/^\[  Prompt:/d
+/^Exiting/d
+/^> /d
+'
+
+check() {
+    local lbl="$1" out="$2"
+    shift 2
+    local patterns=("$@")
+    local ok=1
+    for pat in "${patterns[@]}"; do
+        printf '%s\n' "$out" | grep -qiE "$pat" || { ok=0; break; }
+    done
+    TOTAL=$((TOTAL+1))
+    if [ "$ok" = "1" ]; then
+        PASS=$((PASS+1)); printf "  ${GREEN}PASS${NC} %s\n" "$lbl"
+    else
+        FAIL=$((FAIL+1)); printf "  ${RED}FAIL${NC} %s\n" "$lbl"
+        printf '%s\n' "$out" | grep -v '^$' | tail -3 | sed 's/^/       | /'
+    fi
+}
+
+# Strip thinking blocks from output (filter: reads stdin, writes stdout).
+# Gemma4 uses [Start thinking]...[End thinking].
+# Qwen3/SmolLM3 use <think>...</think>.
+# Only complete blocks (opening AND closing marker present) are removed; an
+# unterminated block is left in place, and the result is whitespace-trimmed.
+# If python3 exits non-zero, `|| cat` passes along whatever is still unread
+# on stdin.
+strip_think() {
+    python3 -c "
+import sys, re
+t = sys.stdin.read()
+# Only strip COMPLETE blocks. If thinking hit token limit, leave as-is so
+# check patterns can still match reasoning content inside the block.
+t = re.sub(r'\[Start thinking\].*?\[End thinking\]', '', t, flags=re.DOTALL)
+t = re.sub(r'<think>.*?</think>', '', t, flags=re.DOTALL)
+print(t.strip())
+" 2>/dev/null || cat
+}
+
+# run KEY MODEL PROMPT MAX_TOKENS [SYS_OVERRIDE]
+# Generate one single-turn completion with llama-cli (300 s limit) and print
+# it with the UI banner and complete thinking blocks removed.
+#   SYS_OVERRIDE  system prompt to use; when the argument is absent the
+#                 per-model SYSP[KEY] applies. An explicit "" sends no system
+#                 prompt, which leaves thinking ON for Qwen3/SmolLM3.
+# Sampling: SmolLM3/Qwen3 without a system prompt use temp 0.6 / top-p 0.95;
+# every other case uses TEMP/TOPP for the key. --top-k is only passed when
+# TOPK[KEY] is not "0".
+run() {
+    local key=$1 model=$2 prompt=$3 max_tok=$4
+    local ngl="${NGL[$key]}"
+
+    # An explicitly passed 5th argument (even an empty one) wins over SYSP.
+    local use_sys
+    if [ "$#" -ge 5 ]; then
+        use_sys="$5"
+    else
+        use_sys="${SYSP[$key]}"
+    fi
+
+    # Start from the per-model values, then apply the thinking-mode override.
+    local temp="${TEMP[$key]}" topp="${TOPP[$key]}" topk="${TOPK[$key]}"
+    if [ -z "$use_sys" ]; then
+        case "$key" in
+            smollm3 | q3)
+                temp="0.6"
+                topp="0.95"
+                ;;
+        esac
+    fi
+
+    local sampling=(--temp "$temp" --top-p "$topp")
+    if [ "$topk" != "0" ]; then
+        sampling+=(--top-k "$topk")
+    fi
+
+    local sys_arg=()
+    if [ -n "$use_sys" ]; then
+        sys_arg=(-sys "$use_sys")
+    fi
+
+    timeout 300 /app/llama-cli -m "$model" -ngl "$ngl" \
+        -n "$max_tok" "${sampling[@]}" \
+        --repeat-penalty 1.1 -fa on --mmap --single-turn \
+        "${sys_arg[@]}" -p "$prompt" 2>/dev/null \
+    | sed "$STRIP_BANNER" \
+    | strip_think
+}
+
+# needle_test KEY MODEL NEEDLE CTX
+# Generates ~CTX tokens of filler, plants needle in middle, asks to recall it.
+# Uses the per-model sampling parameters (TEMP/TOPP, plus --top-k when
+# TOPK[KEY] is not "0") and system prompt SYSP[KEY], with a 180 s limit.
+# Stdout: "FOUND" when NEEDLE occurs verbatim in the cleaned model output,
+#         otherwise "MISSED (<first 80 chars of the output>)".
+needle_test() {
+    local key=$1 model=$2 needle=$3 ctx=$4
+    local ngl="${NGL[$key]}"
+    local temp="${TEMP[$key]}" topp="${TOPP[$key]}" topk="${TOPK[$key]}" sys="${SYSP[$key]}"
+    local sys_arg=()
+    [ -n "$sys" ] && sys_arg=(-sys "$sys")
+    # Same top-k handling as run(), so needle tests sample with the per-model
+    # settings announced in the banner (e.g. Qwen3 top_k=20).
+    local topk_arg=()
+    [ "$topk" != "0" ] && topk_arg=(--top-k "$topk")
+
+    # filler: ctx/2 tokens each side, 1 token ~4 chars
+    local half_chars=$(( ctx * 2 ))
+    # The filler sentence is 45 characters; +2 guarantees head -c has enough.
+    local reps=$(( half_chars / 45 + 2 ))
+    local filler
+    filler=$(python3 -c "print('The quick brown fox jumps over the lazy dog. ' * $reps)" 2>/dev/null \
+        | head -c "$half_chars")
+
+    local prompt
+    printf -v prompt \
+        '%s\nSECRET_VALUE=%s\n%s\nWhat is SECRET_VALUE? Reply with only the value, nothing else.' \
+        "$filler" "$needle" "$filler"
+
+    # Room for the prompt plus the 512 generated tokens requested via -n.
+    local ctx_size=$(( ctx + 512 ))
+    local out
+    out=$(timeout 180 /app/llama-cli -m "$model" -ngl "$ngl" \
+        -n 512 --temp "$temp" --top-p "$topp" "${topk_arg[@]}" \
+        -fa on --mmap --single-turn \
+        -c "$ctx_size" "${sys_arg[@]}" -p "$prompt" 2>/dev/null \
+    | sed "$STRIP_BANNER" \
+    | strip_think)
+
+    # join lines before grep in case model breaks needle across newlines
+    local flat
+    flat=$(printf '%s' "$out" | tr '\n' ' ')
+    # Here-string rather than `printf | grep -q`: under pipefail an early
+    # grep -q exit could SIGPIPE the writer and report a hit as a miss.
+    if grep -qF -- "$needle" <<< "$flat"; then
+        echo "FOUND"
+    else
+        local snip
+        snip=$(printf '%s' "$flat" | cut -c1-80)
+        echo "MISSED (${snip:-<empty>})"
+    fi
+}
+
+# Horizontal rule used around the banner and the final result line.
+HR="======================================================================"
+echo "$HR"
+echo "QUALITY TESTS — ALL MODELS — $(date)"
+# nvidia-smi errors are silenced, so the GPU field is simply empty when the
+# tool is unavailable.
+echo "GPU: $(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null)"
+echo "$HR"
+# Reminder of the sampling settings and thinking modes used by the tests.
+printf "Temps: SmolLM3=0.6/0.95 | Qwen3=0.7/0.8/k20 | Gemma4=0.7/0.95\n"
+printf "/no_think via -sys for needle tests | thinking ON for coding bug test\n\n"
+
+# Prompt for coding test 1; the test loop checks the answer for "def ",
+# "Fizz" and "Buzz".
+CODING_FIZZBUZZ='Write ONLY the Python function fizzbuzz(n). It returns a list where multiples of 3 are "Fizz", multiples of 5 are "Buzz", multiples of both are "FizzBuzz", others are the number as string. Output code only, no prose.'
+
+# Prompt for coding test 2: a binary search with a deliberately planted bug.
+# hi is correctly len(arr)-1 to have ONE unambiguous bug: lo=mid (infinite loop)
+CODING_BUG='Find the bug in this Python function and explain it in one sentence:
+def binary_search(arr, target):
+    lo, hi = 0, len(arr) - 1
+    while lo < hi:
+        mid = (lo + hi) // 2
+        if arr[mid] == target:
+            return mid
+        elif arr[mid] < target:
+            lo = mid
+        else:
+            hi = mid
+    return -1'
+
+for entry in "smollm3:SmolLM3-3B:$M_SMOL" "e2b:Gemma4-E2B:$M_E2B" "e4b:Gemma4-E4B:$M_E4B" "q3:Qwen3-4B:$M_Q3"; do
+    IFS=':' read -r key lbl model <<< "$entry"
+    echo "=== $lbl ==="
+
+    # Coding test 1: FizzBuzz — expect def + Fizz + Buzz
+    out=$(run "$key" "$model" "$CODING_FIZZBUZZ" 512)
+    check "FizzBuzz: def + Fizz + Buzz in output" "$out" \
+        "def " "Fizz" "Buzz"
+
+    # Coding test 2: Bug — thinking ON for all models (more reliable reasoning).
+    # Pass "" to disable /no_think override. Gemma4 already thinks by default.
+    out=$(run "$key" "$model" "$CODING_BUG" 3000 "")
+    check "Bug: identify lo=mid / infinite loop" "$out" \
+        "lo.*=.*mid.*\+.*1|lo\+1|infinite loop|never.*advance|never.*progress|stuck|lo should be|lo\b.*never.*incr"
+
+    # Needle-in-haystack
+    NEEDLE="QX7-ALPHA-9"
+    # strict < so we skip when ctx == max_ctx (prompt fills entire context, no room for output)
+    for ctx in 1024 8192; do
+        if [ "$ctx" -lt "${MAX_CTX[$key]}" ]; then
+            result=$(needle_test "$key" "$model" "$NEEDLE" "$ctx")
+            TOTAL=$((TOTAL+1))
+            if [[ "$result" == FOUND ]]; then
+                PASS=$((PASS+1)); printf "  ${GREEN}PASS${NC} Needle @ %s tok: %s\n" "$ctx" "$result"
+            else
+                FAIL=$((FAIL+1)); printf "  ${RED}FAIL${NC} Needle @ %s tok: %s\n" "$ctx" "$result"
+            fi
+        else
+            printf "  ${YELLOW}SKIP${NC} Needle @ %s tok (exceeds model max %s)\n" "$ctx" "${MAX_CTX[$key]}"
+        fi
+    done
+
+    echo ""
+done
+
+echo "$HR"
+printf "RESULTS: ${GREEN}%s PASSED${NC} / ${RED}%s FAILED${NC} / %s TOTAL\n" "$PASS" "$FAIL" "$TOTAL"
+echo "$HR"