#!/bin/bash
# KV cache quantization test using llama-perplexity.
# Image: local/llama-cpp-turboquant:full-cuda-sm75-mmq (FORCE_MMQ, turbo2/3/4 support)
#
# Tests KV types: f16 (baseline) + q8_0/q4_0/turbo2 for Q4_K_M models
#                 f16 (baseline) + turbo2/3/4 for Qwen3.5-9B Q8_0
# Quality gate: PPL delta vs f16 < 0.5 (lossless for practical use)
#
# Usage: bash /scripts/kv_quant_test.sh [MODEL_KEY]
#   MODEL_KEY: smollm3 | e2b | e4b | q3 | qwen35q | all  (default: all)
#
# Requires: /app/llama-perplexity, python3, GNU grep (-P), coreutils timeout.

# No `-e` on purpose: run_ppl failures are expected while probing for the
# largest context that fits, and every call site checks its status explicitly.
set -uo pipefail

TARGET="${1:-all}"

M_SMOL="/models/HuggingFaceTB_SmolLM3-3B-Q4_K_M.gguf"
M_E2B="/models/google_gemma-4-E2B-it-Q4_K_M.gguf"
M_E4B="/models/google_gemma-4-E4B-it-Q4_K_M.gguf"
M_Q3="/models/Qwen3-4B-Q4_K_M.gguf"
M_Q35="/models/Qwen3.5-9B.Q8_0.gguf"

# Models in the order they are tested and reported.
MODEL_KEYS=(smollm3 e2b e4b q3 qwen35q)
declare -A MODEL_LABEL=(
  [smollm3]="SmolLM3-3B" [e2b]="Gemma4-E2B" [e4b]="Gemma4-E4B"
  [q3]="Qwen3-4B" [qwen35q]="Qwen3.5-9B"
)
declare -A MODEL_PATH=(
  [smollm3]="$M_SMOL" [e2b]="$M_E2B" [e4b]="$M_E4B"
  [q3]="$M_Q3" [qwen35q]="$M_Q35"
)

declare -A NGL=([smollm3]=99 [e2b]=99 [e4b]=42 [q3]=99 [qwen35q]=11)
declare -A BASE_CTX=([smollm3]=18432 [e2b]=32768 [e4b]=20480 [q3]=8192 [qwen35q]=8192)
declare -A PPL_TIMEOUT=([smollm3]=300 [e2b]=300 [e4b]=300 [q3]=300 [qwen35q]=600)

# Per-model KV types to test (f16 is always the baseline)
# Standard Q4_K_M models: q8_0/q4_0 + turbo2 (all supported by TurboQuant image)
# Qwen3.5-9B: designed for turbo KV — test turbo2/3/4 only (q4_0 would also work but less relevant)
declare -A MODEL_KV_TYPES=(
  [smollm3]="q8_0 q4_0 turbo2"
  [e2b]="q8_0 q4_0 turbo2"
  [e4b]="q8_0 q4_0 turbo2"
  [q3]="q8_0 q4_0 turbo2"
  [qwen35q]="turbo2 turbo3 turbo4"
)

# ctx candidates per model (ascending; a KV type stops being probed after its
# first failure, so larger contexts are never attempted once one fails)
SMOL_CTXS=(8192 12288 16384 18432 20480 24576 32768 40960 49152)
E2B_CTXS=(8192 16384 24576 32768 40960 49152 65536)
E4B_CTXS=(8192 12288 16384 20480 24576 32768 40960)
Q3_CTXS=(4096 6144 8192 10240 12288 16384 24576 32768)
Q35_CTXS=(4096 8192 16384 24576 32768 40960 49152)
# Maps a model key to the NAME of its candidate array (resolved via ${!ref}).
declare -A CTX_CANDIDATES=(
  [smollm3]="SMOL_CTXS" [e2b]="E2B_CTXS" [e4b]="E4B_CTXS"
  [q3]="Q3_CTXS" [qwen35q]="Q35_CTXS"
)

# Maximum |PPL(kv) - PPL(f16)| for a KV type to count as passing at a context.
PPL_GATE="0.5"

# ANSI colours hold real escape bytes so they can be passed as %s arguments.
GREEN=$'\033[0;32m'; RED=$'\033[0;31m'; YELLOW=$'\033[1;33m'; BLUE=$'\033[0;34m'; NC=$'\033[0m'
HR="======================================================================"

# Synthetic PPL file — 4000 lines, deterministic, no network needed
PPL_FILE="/tmp/kv_ppl_input.txt"

# Accumulates "label|kv|max_ctx" rows for the final summary table.
declare -a SUMMARY=()

# die MESSAGE... — print MESSAGE to stderr and exit with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# color_cell COLOR WIDTH TEXT
# Prints one table cell: a leading space, then TEXT left-aligned and padded to
# WIDTH, wrapped in COLOR ... NC. No trailing newline.
color_cell() {
  printf ' %s%-*s%s' "$1" "$2" "$3" "$NC"
}

# is_known_key KEY — succeed when KEY is one of MODEL_KEYS.
is_known_key() {
  local k
  for k in "${MODEL_KEYS[@]}"; do
    [[ "$k" == "$1" ]] && return 0
  done
  return 1
}

# ensure_ppl_file
# Creates $PPL_FILE if it does not already exist with content. The text is
# 4000 sentences drawn with a fixed seed, so every run scores the same input.
# Written to a temp file first and renamed, so an interrupted run never leaves
# a truncated input behind. Returns non-zero if generation fails.
ensure_ppl_file() {
  [[ -s "$PPL_FILE" ]] && return 0

  local tmp
  tmp=$(mktemp "${PPL_FILE}.XXXXXX") || return 1

  if python3 - "$tmp" << 'PY'
import random, sys
random.seed(42)
sentences = [
    "The transformer architecture uses self-attention mechanisms to process sequences.",
    "Large language models require significant computational resources for training.",
    "Quantization reduces memory usage by storing weights in lower precision formats.",
    "Flash attention enables memory-efficient computation for long context windows.",
    "The key-value cache stores intermediate attention states during generation.",
    "Context length determines how many tokens the model can attend to simultaneously.",
    "Perplexity measures how well a probability model predicts a sample of text.",
    "Lower perplexity values indicate better language modeling performance overall.",
    "GPU memory bandwidth is the primary bottleneck for autoregressive token generation.",
    "Grouped query attention reduces KV cache size by sharing keys across head groups.",
    "Rotary position embeddings encode relative position information in attention queries.",
    "Mixture of experts models route tokens through specialized feed-forward networks.",
    "Continuous batching allows servers to process multiple requests simultaneously.",
    "KV cache quantization trades a small quality loss for significantly larger contexts.",
]
lines = [random.choice(sentences) for _ in range(4000)]
with open(sys.argv[1], 'w') as fh:
    fh.write('\n'.join(lines) + '\n')
PY
  then
    mv -f -- "$tmp" "$PPL_FILE"
  else
    rm -f -- "$tmp"
    return 1
  fi
}

# run_ppl MODEL NGL KV CTX TIMEOUT [EXTRA_ARGS...]
# Runs llama-perplexity for one (model, KV type, context) combination, using KV
# for both the K and V cache and evaluating a single chunk of $PPL_FILE.
# Echoes the PPL value on stdout and returns 0 on success.
# Returns 1 on any failure: non-zero exit (including timeout), an allocation
# error message on stderr, or no parsable PPL value in the output.
run_ppl() {
  local model=$1 ngl=$2 kv=$3 ctx=$4 timeout_s=$5
  shift 5 # whatever remains in "$@" is forwarded verbatim as extra arguments

  local out_file err_file rc ppl_val
  out_file=$(mktemp) || return 1
  err_file=$(mktemp) || { rm -f -- "$out_file"; return 1; }

  timeout "$timeout_s" /app/llama-perplexity \
    -m "$model" \
    -ngl "$ngl" \
    -fa on \
    -c "$ctx" \
    -ctk "$kv" -ctv "$kv" \
    -f "$PPL_FILE" \
    --chunks 1 \
    "$@" \
    > "$out_file" 2> "$err_file"
  rc=$?

  # An allocation failure is treated as fatal even if the process exited 0.
  if (( rc != 0 )) ||
    grep -qiE 'out of memory|failed to allocate|cudaMalloc failed|CUDA_ERROR_OUT_OF_MEMORY|ggml_cuda_malloc|cannot allocate memory' "$err_file"; then
    rm -f -- "$out_file" "$err_file"
    return 1
  fi

  # Output carries running estimates as "[N]value,"; keep the last one.
  ppl_val=$(grep -oP '\[\d+\]\K[0-9.]+' "$out_file" | tail -n 1)
  rm -f -- "$out_file" "$err_file"

  [[ -n "$ppl_val" ]] || return 1
  printf '%s\n' "$ppl_val"
}

# ppl_delta PPL BASE
# Prints PPL - BASE as a signed number with two decimals (e.g. "+0.25"),
# or "?" when either value cannot be parsed. Values are passed as argv, never
# spliced into the Python source.
ppl_delta() {
  python3 -c 'import sys; print(f"{float(sys.argv[1]) - float(sys.argv[2]):+.2f}")' \
    "$1" "$2" 2>/dev/null || echo "?"
}

# ppl_within_gate PPL BASE
# Succeeds when |PPL - BASE| < $PPL_GATE; fails otherwise or on parse errors.
ppl_within_gate() {
  python3 -c 'import sys; sys.exit(0 if abs(float(sys.argv[1]) - float(sys.argv[2])) < float(sys.argv[3]) else 1)' \
    "$1" "$2" "$PPL_GATE" 2>/dev/null
}

# test_model KEY
# Prints the PPL table for one model (one row per candidate context, one column
# per KV type) and appends "label|kv|max_ctx" rows to SUMMARY.
# max_ctx starts at BASE_CTX[KEY] and advances to each context where the KV
# type ran successfully AND passed the quality gate. When f16 itself could not
# run at a context there is no baseline, so the gate cannot be evaluated and
# the context is accepted on the "did not fail" criterion alone.
test_model() {
  local key=$1
  local lbl="${MODEL_LABEL[$key]}" model="${MODEL_PATH[$key]}"
  local ngl="${NGL[$key]}" timeout_s="${PPL_TIMEOUT[$key]}" base_ctx="${BASE_CTX[$key]}"

  # Resolve the candidate array by name without eval.
  local ctx_ref="${CTX_CANDIDATES[$key]}[@]"
  local -a ctxs=("${!ctx_ref}")
  local -a kv_types
  IFS=' ' read -ra kv_types <<< "${MODEL_KV_TYPES[$key]}"

  # Extra args for qwen35-9b (flash attn already set; no mlock needed for PPL correctness)
  local -a extra_args=()

  printf '%s=== %s (base ctx=%s, ngl=%s) ===%s\n' "$BLUE" "$lbl" "$base_ctx" "$ngl" "$NC"

  # Without this check every cell would be reported as "OOM".
  if [[ ! -f "$model" ]]; then
    printf ' %smodel file not found: %s — skipped%s\n\n' "$YELLOW" "$model" "$NC"
    return 1
  fi

  # Dynamic header based on KV types for this model
  local kv
  printf ' %-10s %-18s' "ctx" "f16 (PPL)"
  for kv in "${kv_types[@]}"; do
    printf ' %-20s' "$kv (PPL/delta)"
  done
  printf '\n'
  printf ' %-10s %-18s' "---" "---------"
  for kv in "${kv_types[@]}"; do
    printf ' %-20s' "--------------------"
  done
  printf '\n'

  local -A best_ctx_per_kv=([f16]="$base_ctx")
  local -A oom_kv=([f16]=0)
  for kv in "${kv_types[@]}"; do
    best_ctx_per_kv[$kv]=$base_ctx
    oom_kv[$kv]=0
  done

  local ctx f16_ppl ppl delta
  for ctx in "${ctxs[@]}"; do
    printf ' %-10s' "$ctx"

    # f16 baseline (skipped once it has failed at a smaller context)
    f16_ppl=""
    if [[ "${oom_kv[f16]}" == "1" ]]; then
      color_cell "$RED" 18 "OOM"
    elif f16_ppl=$(run_ppl "$model" "$ngl" "f16" "$ctx" "$timeout_s" \
      ${extra_args[@]+"${extra_args[@]}"}); then
      color_cell "$GREEN" 18 "$f16_ppl"
      best_ctx_per_kv[f16]=$ctx
    else
      f16_ppl=""
      color_cell "$RED" 18 "OOM"
      oom_kv[f16]=1
    fi

    # KV type columns
    for kv in "${kv_types[@]}"; do
      if [[ "${oom_kv[$kv]}" == "1" ]]; then
        color_cell "$RED" 20 "OOM"
        continue
      fi

      if ! ppl=$(run_ppl "$model" "$ngl" "$kv" "$ctx" "$timeout_s" \
        ${extra_args[@]+"${extra_args[@]}"}); then
        color_cell "$RED" 20 "OOM"
        oom_kv[$kv]=1
        continue
      fi

      if [[ -n "$f16_ppl" ]]; then
        delta=$(ppl_delta "$ppl" "$f16_ppl")
        if ppl_within_gate "$ppl" "$f16_ppl"; then
          color_cell "$GREEN" 20 "${ppl}(${delta})"
          best_ctx_per_kv[$kv]=$ctx
        else
          # Ran, but quality is outside the gate: shown, not counted.
          color_cell "$YELLOW" 20 "${ppl}(${delta})"
        fi
      else
        # No f16 baseline at this ctx, so the gate cannot be evaluated.
        color_cell "$GREEN" 20 "$ppl"
        best_ctx_per_kv[$kv]=$ctx
      fi
    done
    echo ""
  done
  echo ""

  # Recommendation: the non-f16 KV type with the largest accepted context
  # (first listed wins ties); falls back to f16 at the base context.
  local overall_best_ctx=$base_ctx overall_best_kv="f16" bctx
  for kv in "${kv_types[@]}"; do
    bctx="${best_ctx_per_kv[$kv]}"
    SUMMARY+=("$lbl|$kv|$bctx")
    if (( bctx > overall_best_ctx )); then
      overall_best_ctx=$bctx
      overall_best_kv=$kv
    fi
  done
  SUMMARY+=("$lbl|f16|${best_ctx_per_kv[f16]}")

  printf ' %sBest: %s → max ctx %s%s\n\n' "$GREEN" "$overall_best_kv" "$overall_best_ctx" "$NC"
}

# main — validate the target, prepare input, run the selected models, print
# the summary table.
main() {
  if [[ "$TARGET" != "all" ]] && ! is_known_key "$TARGET"; then
    printf 'Unknown MODEL_KEY: %s (expected: %s | all)\n' "$TARGET" "${MODEL_KEYS[*]}" >&2
    exit 2
  fi

  # ---------------------------------------------------------------------------
  ensure_ppl_file || die "Failed to create PPL input file: $PPL_FILE (is python3 available?)"

  echo "$HR"
  echo "KV CACHE QUANT TEST (llama-perplexity) — TurboQuant image (FORCE_MMQ SM75)"
  date
  echo "GPU: $(nvidia-smi --query-gpu=name,memory.total --format=csv,noheader 2>/dev/null)"
  echo "$HR"
  echo "Standard models: f16 baseline | q8_0 | q4_0 | turbo2 (2-bit, 8x smaller KV vs f16)"
  echo "Qwen3.5-9B: f16 baseline | turbo2 | turbo3 | turbo4 (TurboQuant KV types)"
  echo "Quality gate: PPL delta vs f16 < ${PPL_GATE}"
  echo ""

  local key
  for key in "${MODEL_KEYS[@]}"; do
    [[ "$TARGET" != "all" && "$TARGET" != "$key" ]] && continue
    test_model "$key"
  done

  echo "$HR"
  echo "SUMMARY"
  echo "$HR"
  printf '%-16s %-8s %s\n' "Model" "KV" "Max Ctx (no OOM + PPL delta<${PPL_GATE})"
  printf '%-16s %-8s %s\n' "-----" "--" "---------------------------------"
  local row lbl kv ctx
  # Guarded expansion: SUMMARY may be empty (all models skipped), which would
  # trip `set -u` on bash < 4.4.
  for row in ${SUMMARY[@]+"${SUMMARY[@]}"}; do
    IFS='|' read -r lbl kv ctx <<< "$row"
    printf '%s%-16s %-8s %s%s\n' "$GREEN" "$lbl" "$kv" "$ctx" "$NC"
  done
  echo "$HR"
  echo "Reminder: update envs/.env.: CACHE_TYPE_K/V= CTX_SIZE="
  echo "$HR"
}

main