#!/bin/bash
# KV cache quantization test using llama-perplexity.
# Image: local/llama-cpp-turboquant:full-cuda-sm75-mmq (FORCE_MMQ, turbo2/3/4 support)
#
# Tests KV types: f16 (baseline) + q8_0/q4_0/turbo2 for Q4_K_M models
#                 f16 (baseline) + turbo2/3/4 for Qwen3.5-9B Q8_0
# Quality gate:   PPL delta vs f16 < 0.5 (lossless for practical use)
#
# Usage: bash /scripts/kv_quant_test.sh [MODEL_KEY]
#   MODEL_KEY: smollm3 | e2b | e4b | q3 | qwen35q | all (default: all)
#
# Related notes (carried over from the text that preceded this header):
#   - 5 models: SmolLM3-3B, Gemma4-E2B/E4B, Qwen3-4B, Qwen3.5-9B
#   - TurboQuant image (FORCE_MMQ): +6-11% free speed on Turing GPUs
#   - Bigctx profiles (-nkvo KV in RAM): 2-16x context gain
#   - turbo2 KV: 2x smaller, benchmarked against PPL quality gate
#   - Per-model env files with justified parameters
#   - kv_quant_test.sh + cpu_ctx_test.sh benchmark scripts
#   - docs/FINDINGS.md: surprises, pitfalls, recommendations
#   - docs/ARCHITECTURE.md: compose + test script design

# Deliberately no `-e`: a failing perplexity run is an expected outcome of this
# benchmark and is handled explicitly where run_ppl is called.
set -uo pipefail

# Model selector from the first CLI argument ("all" when omitted or empty).
readonly TARGET="${1:-all}"

# GGUF model files.
readonly M_SMOL="/models/HuggingFaceTB_SmolLM3-3B-Q4_K_M.gguf"
readonly M_E2B="/models/google_gemma-4-E2B-it-Q4_K_M.gguf"
readonly M_E4B="/models/google_gemma-4-E4B-it-Q4_K_M.gguf"
readonly M_Q3="/models/Qwen3-4B-Q4_K_M.gguf"
readonly M_Q35="/models/Qwen3.5-9B.Q8_0.gguf"

# Per-model settings, keyed by MODEL_KEY:
#   NGL          value passed to llama-perplexity as -ngl
#   BASE_CTX     context size printed as "base ctx"; also the initial value of
#                the per-KV-type best context
#   PPL_TIMEOUT  duration handed to `timeout` for a single perplexity run
declare -rA NGL=([smollm3]=99 [e2b]=99 [e4b]=42 [q3]=99 [qwen35q]=11)
declare -rA BASE_CTX=([smollm3]=18432 [e2b]=32768 [e4b]=20480 [q3]=8192 [qwen35q]=8192)
declare -rA PPL_TIMEOUT=([smollm3]=300 [e2b]=300 [e4b]=300 [q3]=300 [qwen35q]=600)

# Per-model KV types to test (f16 is always the baseline)
# Standard Q4_K_M models: q8_0/q4_0 + turbo2 (all supported by TurboQuant image)
# Qwen3.5-9B: designed for turbo KV — test turbo2/3/4 only (q4_0 would also work but less relevant)
declare -rA MODEL_KV_TYPES=(
  [smollm3]="q8_0 q4_0 turbo2"
  [e2b]="q8_0 q4_0 turbo2"
  [e4b]="q8_0 q4_0 turbo2"
  [q3]="q8_0 q4_0 turbo2"
  [qwen35q]="turbo2 turbo3 turbo4"
)

# ctx candidates per model (ascending)
declare -ra SMOL_CTXS=(8192 12288 16384 18432 20480 24576 32768 40960 49152)
declare -ra E2B_CTXS=(8192 16384 24576 32768 40960 49152 65536)
declare -ra E4B_CTXS=(8192 12288 16384 20480 24576 32768 40960)
declare -ra Q3_CTXS=(4096 6144 8192 10240 12288 16384 24576 32768)
declare -ra Q35_CTXS=(4096 8192 16384 24576 32768 40960 49152)

# MODEL_KEY -> NAME of the array above that holds its ctx candidates.
declare -rA CTX_CANDIDATES=(
  [smollm3]="SMOL_CTXS" [e2b]="E2B_CTXS" [e4b]="E4B_CTXS"
  [q3]="Q3_CTXS" [qwen35q]="Q35_CTXS"
)

# ANSI colour escapes (interpreted by printf) and the horizontal rule.
readonly GREEN='\033[0;32m' RED='\033[0;31m' YELLOW='\033[1;33m' BLUE='\033[0;34m' NC='\033[0m'
readonly HR="======================================================================"

# Synthetic PPL file — 4000 lines, deterministic, no network needed
readonly PPL_FILE="/tmp/kv_ppl_input.txt"

#######################################
# Create the synthetic perplexity corpus if it is not already present.
# The corpus is 4000 lines drawn from a fixed sentence list with a fixed
# random seed, so no network access is needed.
# Globals:   PPL_FILE (read) - destination path of the corpus
# Arguments: none
# Returns:   0 if the file already exists (non-empty) or was written,
#            otherwise the exit status of python3
#######################################
ensure_ppl_file() {
  # An empty leftover file is regenerated instead of being trusted.
  [[ -f "$PPL_FILE" && -s "$PPL_FILE" ]] && return 0

  # The destination is passed as argv[1] so the Python side always writes to
  # the same path the shell side checks.
  python3 - "$PPL_FILE" << 'PY'
import os
import random
import sys

target = sys.argv[1]
random.seed(42)
sentences = [
    "The transformer architecture uses self-attention mechanisms to process sequences.",
    "Large language models require significant computational resources for training.",
    "Quantization reduces memory usage by storing weights in lower precision formats.",
    "Flash attention enables memory-efficient computation for long context windows.",
    "The key-value cache stores intermediate attention states during generation.",
    "Context length determines how many tokens the model can attend to simultaneously.",
    "Perplexity measures how well a probability model predicts a sample of text.",
    "Lower perplexity values indicate better language modeling performance overall.",
    "GPU memory bandwidth is the primary bottleneck for autoregressive token generation.",
    "Grouped query attention reduces KV cache size by sharing keys across head groups.",
    "Rotary position embeddings encode relative position information in attention queries.",
    "Mixture of experts models route tokens through specialized feed-forward networks.",
    "Continuous batching allows servers to process multiple requests simultaneously.",
    "KV cache quantization trades a small quality loss for significantly larger contexts.",
]
lines = [random.choice(sentences) for _ in range(4000)]

# Write to a sibling temp file and rename, so an interrupted run never leaves
# a truncated corpus behind at the final path.
tmp_path = "%s.%d.tmp" % (target, os.getpid())
with open(tmp_path, "w") as fh:
    fh.write("\n".join(lines) + "\n")
os.replace(tmp_path, target)
PY
}

#######################################
# run_ppl MODEL NGL KV CTX TIMEOUT [EXTRA_ARGS...]
# Run a single llama-perplexity pass (flash attention on, one chunk) with the
# same cache type for K and V, and print the last "[n]<value>" estimate found
# on the tool's stdout.
# Globals:   PPL_FILE (read) - text file passed via -f
#            LLAMA_PERPLEXITY_BIN (read, optional) - binary to run;
#              defaults to /app/llama-perplexity
# Arguments: $1 model path, $2 -ngl value, $3 KV cache type, $4 context size,
#            $5 duration for `timeout`, remaining args appended verbatim
# Outputs:   the PPL value on stdout
# Returns:   0 on success; 1 if the run exits non-zero (this includes a
#            timeout), if its stderr contains an out-of-memory message, or if
#            no PPL value could be parsed
#######################################
run_ppl() {
  local model=$1 ngl=$2 kv=$3 ctx=$4 timeout_s=$5
  shift 5
  local -a extra_args=("$@")
  local bin="${LLAMA_PERPLEXITY_BIN:-/app/llama-perplexity}"

  local tmp_err ppl_out
  tmp_err=$(mktemp) || return 1
  ppl_out=$(mktemp) || { rm -f -- "$tmp_err"; return 1; }

  # ${arr[@]+...} keeps an empty extra_args from tripping `set -u` on
  # bash releases older than 4.4.
  timeout "$timeout_s" "$bin" \
    -m "$model" \
    -ngl "$ngl" \
    -fa on \
    -c "$ctx" \
    -ctk "$kv" -ctv "$kv" \
    -f "$PPL_FILE" \
    --chunks 1 \
    ${extra_args[@]+"${extra_args[@]}"} \
    > "$ppl_out" 2> "$tmp_err"
  local ppl_rc=$?

  # A zero exit status alone is not trusted: allocation failures reported on
  # stderr also count as a failed run.
  local ppl_val=""
  if [[ "$ppl_rc" -eq 0 ]] \
    && ! grep -qiE 'out of memory|failed to allocate|cudaMalloc failed|CUDA_ERROR_OUT_OF_MEMORY|ggml_cuda_malloc|cannot allocate memory' "$tmp_err"; then
    ppl_val=$(grep -oP '\[\d+\]\K[0-9.]+' "$ppl_out" | tail -n 1)
  fi
  rm -f -- "$tmp_err" "$ppl_out"

  [[ -n "$ppl_val" ]] || return 1
  printf '%s\n' "$ppl_val"
}

# ---------------------------------------------------------------------------
# Prepare the input corpus, then print the run banner.
#
# Every perplexity run reads $PPL_FILE. If it cannot be created, each run
# would fail and be shown as "OOM" in the table, so stop here instead.
if ! ensure_ppl_file || [[ ! -s "$PPL_FILE" ]]; then
  printf 'error: could not create PPL input file %s (is python3 available?)\n' \
    "$PPL_FILE" >&2
  exit 1
fi

echo "$HR"
echo "KV CACHE QUANT TEST (llama-perplexity) — TurboQuant image (FORCE_MMQ SM75)"
date
# nvidia-smi errors are silenced on purpose: the GPU line is informational.
echo "GPU: $(nvidia-smi --query-gpu=name,memory.total --format=csv,noheader 2>/dev/null)"
echo "$HR"
echo "Standard models: f16 baseline | q8_0 | q4_0 | turbo2 (2-bit, 8x smaller KV vs f16)"
echo "Qwen3.5-9B: f16 baseline | turbo2 | turbo3 | turbo4 (TurboQuant KV types)"
echo "Quality gate: PPL delta vs f16 < 0.5"
echo ""

# Result rows, one per (model, KV type), stored as "label|kv|ctx".
declare -a SUMMARY=()

#######################################
# Compare a candidate PPL against the f16 baseline PPL.
# Arguments: $1 - candidate PPL, $2 - f16 baseline PPL
# Outputs:   "<signed delta, two decimals> <ok|bad>" on stdout;
#            "ok" means |delta| < 0.5 (the quality gate)
#######################################
_ppl_gate() {
  # Values go in through -v, never through string interpolation into code.
  awk -v cand="$1" -v base="$2" 'BEGIN {
    d = cand - base
    a = (d < 0) ? -d : d
    printf "%+.2f %s\n", d, (a < 0.5 ? "ok" : "bad")
  }'
}

# Reject an unknown MODEL_KEY up front; otherwise every model is skipped and
# the script ends with an empty summary.
target_known=0
if [[ "$TARGET" == "all" ]]; then
  target_known=1
else
  for known_key in "${!NGL[@]}"; do
    if [[ "$known_key" == "$TARGET" ]]; then
      target_known=1
    fi
  done
fi
if [[ "$target_known" != "1" ]]; then
  printf 'error: unknown MODEL_KEY "%s" (expected: smollm3 | e2b | e4b | q3 | qwen35q | all)\n' \
    "$TARGET" >&2
  exit 2
fi

# One table per model: rows are ctx candidates, columns are f16 followed by
# the model's KV types. Each entry is "key:label:model path".
for entry in \
  "smollm3:SmolLM3-3B:$M_SMOL" \
  "e2b:Gemma4-E2B:$M_E2B" \
  "e4b:Gemma4-E4B:$M_E4B" \
  "q3:Qwen3-4B:$M_Q3" \
  "qwen35q:Qwen3.5-9B:$M_Q35"
do
  IFS=':' read -r key lbl model <<< "$entry"
  [[ "$TARGET" != "all" && "$TARGET" != "$key" ]] && continue

  # CTX_CANDIDATES holds the NAME of the candidate array; resolve it with
  # indirect expansion rather than eval.
  ctx_ref="${CTX_CANDIDATES[$key]}[@]"
  ctxs=("${!ctx_ref}")
  ngl="${NGL[$key]}"
  timeout_s="${PPL_TIMEOUT[$key]}"
  IFS=' ' read -ra kv_types <<< "${MODEL_KV_TYPES[$key]}"

  # Extra args for qwen35-9b (flash attn already set; no mlock needed for PPL correctness)
  extra_args=()

  printf "${BLUE}=== %s (base ctx=%s, ngl=%s) ===${NC}\n" \
    "$lbl" "${BASE_CTX[$key]}" "$ngl"

  # Dynamic header based on KV types for this model
  printf " %-10s %-18s" "ctx" "f16 (PPL)"
  for kv in "${kv_types[@]}"; do
    printf " %-20s" "$kv (PPL/delta)"
  done
  printf "\n"
  printf " %-10s %-18s" "---" "---------"
  for kv in "${kv_types[@]}"; do
    printf " %-20s" "--------------------"
  done
  printf "\n"

  # best_ctx_per_kv: last ctx that ran AND passed the quality gate
  #                  (starts at the model's base ctx).
  # oom_kv:          1 once a KV type has failed; larger ctx values are then
  #                  skipped for that type.
  declare -A best_ctx_per_kv=([f16]="${BASE_CTX[$key]}")
  declare -A oom_kv=([f16]=0)
  for kv in "${kv_types[@]}"; do
    best_ctx_per_kv[$kv]="${BASE_CTX[$key]}"
    oom_kv[$kv]=0
  done

  for ctx in "${ctxs[@]}"; do
    printf " %-10s" "$ctx"

    # f16 baseline
    f16_ppl=""
    if [[ "${oom_kv[f16]}" == "1" ]]; then
      printf " ${RED}%-18s${NC}" "OOM"
    elif f16_ppl=$(run_ppl "$model" "$ngl" "f16" "$ctx" "$timeout_s" \
      ${extra_args[@]+"${extra_args[@]}"}); then
      printf " ${GREEN}%-18s${NC}" "$f16_ppl"
      best_ctx_per_kv[f16]=$ctx
    else
      # run_ppl reports every failure the same way (OOM, crash, timeout).
      printf " ${RED}%-18s${NC}" "OOM"
      oom_kv[f16]=1
      f16_ppl=""
    fi

    # KV type columns
    for kv in "${kv_types[@]}"; do
      if [[ "${oom_kv[$kv]}" == "1" ]]; then
        printf " ${RED}%-20s${NC}" "OOM"
        continue
      fi
      if ! ppl=$(run_ppl "$model" "$ngl" "$kv" "$ctx" "$timeout_s" \
        ${extra_args[@]+"${extra_args[@]}"}); then
        printf " ${RED}%-20s${NC}" "OOM"
        oom_kv[$kv]=1
        continue
      fi

      if [[ -z "$f16_ppl" ]]; then
        # No f16 baseline at this ctx, so the gate cannot be evaluated.
        # The run is still counted, otherwise a quantized KV type could never
        # be credited with a ctx that f16 itself cannot reach.
        printf " ${GREEN}%-20s${NC}" "$ppl"
        best_ctx_per_kv[$kv]=$ctx
        continue
      fi

      gate=$(_ppl_gate "$ppl" "$f16_ppl" 2>/dev/null) || gate=""
      [[ -n "$gate" ]] || gate="? bad"
      delta=${gate%% *}
      verdict=${gate##* }

      if [[ "$verdict" == "ok" ]]; then
        printf " ${GREEN}%-20s${NC}" "${ppl}(${delta})"
        # Only a gate-passing run may raise the recommended ctx, matching the
        # summary column "no OOM + PPL delta<0.5".
        best_ctx_per_kv[$kv]=$ctx
      else
        printf " ${YELLOW}%-20s${NC}" "${ppl}(${delta})"
      fi
    done
    echo ""
  done

  echo ""

  # Recommendation: the KV type whose best ctx exceeds the base ctx by the
  # most; f16 at base ctx when none does.
  overall_best_ctx="${BASE_CTX[$key]}"
  overall_best_kv="f16"
  for kv in "${kv_types[@]}"; do
    bctx="${best_ctx_per_kv[$kv]}"
    SUMMARY+=("$lbl|$kv|$bctx")
    if (( bctx > overall_best_ctx )); then
      overall_best_ctx=$bctx
      overall_best_kv=$kv
    fi
  done
  SUMMARY+=("$lbl|f16|${best_ctx_per_kv[f16]}")
  printf " ${GREEN}Best: %s → max ctx %s${NC}\n\n" "$overall_best_kv" "$overall_best_ctx"

  # Reset per-model state before the next model re-declares it.
  unset best_ctx_per_kv oom_kv
done

# Final report: print every collected SUMMARY row ("label|kv|ctx") as a
# three-column table, followed by a reminder to update the env files.
printf '%s\n' "$HR" "SUMMARY" "$HR"
printf "%-16s %-8s %s\n" \
  "Model" "KV" "Max Ctx (no OOM + PPL delta<0.5)" \
  "-----" "--" "---------------------------------"
for summary_row in "${SUMMARY[@]}"; do
  # Split on the first two '|' separators; the remainder is the ctx column.
  row_model=${summary_row%%|*}
  row_tail=${summary_row#*|}
  row_kv=${row_tail%%|*}
  row_ctx=${row_tail#*|}
  printf "${GREEN}%-16s %-8s %s${NC}\n" "$row_model" "$row_kv" "$row_ctx"
done
printf '%s\n' \
  "$HR" \
  "Reminder: update envs/.env.<model>: CACHE_TYPE_K/V=<best_kv> CTX_SIZE=<max_ctx>" \
  "$HR"