Initial commit: tuned multi-model llama.cpp stack
- 5 models: SmolLM3-3B, Gemma4-E2B/E4B, Qwen3-4B, Qwen3.5-9B
- TurboQuant image (FORCE_MMQ): +6-11% free speed on Turing GPUs
- Bigctx profiles (-nkvo, KV in RAM): 2-16x context gain
- turbo2 KV: 2x smaller, benchmarked against PPL quality gate
- Per-model env files with justified parameters
- kv_quant_test.sh + cpu_ctx_test.sh benchmark scripts
- docs/FINDINGS.md: surprises, pitfalls, recommendations
- docs/ARCHITECTURE.md: compose + test script design
This commit is contained in:
246
scripts/kv_quant_test.sh
Normal file
246
scripts/kv_quant_test.sh
Normal file
@@ -0,0 +1,246 @@
|
||||
#!/bin/bash
|
||||
# KV cache quantization test using llama-perplexity.
|
||||
# Image: local/llama-cpp-turboquant:full-cuda-sm75-mmq (FORCE_MMQ, turbo2/3/4 support)
|
||||
#
|
||||
# Tests KV types: f16 (baseline) + q8_0/q4_0/turbo2 for Q4_K_M models
|
||||
# f16 (baseline) + turbo2/3/4 for Qwen3.5-9B Q8_0
|
||||
# Quality gate: PPL delta vs f16 < 0.5 (lossless for practical use)
|
||||
#
|
||||
# Usage: bash /scripts/kv_quant_test.sh [MODEL_KEY]
|
||||
# MODEL_KEY: smollm3 | e2b | e4b | q3 | qwen35q | all (default: all)
|
||||
|
||||
# NOTE: -e is not enabled; the main loop inspects run_ppl's exit status itself
# to tell a usable measurement from a failed run.
set -uo pipefail

# Model selector from the command line (see usage in the file header).
TARGET="${1:-all}"

# Absolute paths of the GGUF model files.
M_SMOL="/models/HuggingFaceTB_SmolLM3-3B-Q4_K_M.gguf"
M_E2B="/models/google_gemma-4-E2B-it-Q4_K_M.gguf"
M_E4B="/models/google_gemma-4-E4B-it-Q4_K_M.gguf"
M_Q3="/models/Qwen3-4B-Q4_K_M.gguf"
M_Q35="/models/Qwen3.5-9B.Q8_0.gguf"

# Per-model settings, keyed by MODEL_KEY:
#   NGL          value passed to llama-perplexity as -ngl
#   BASE_CTX     starting "best ctx" for every KV type of that model
#   PPL_TIMEOUT  wall-clock limit (seconds) for a single perplexity run
declare -A NGL=([smollm3]=99 [e2b]=99 [e4b]=42 [q3]=99 [qwen35q]=11)
declare -A BASE_CTX=([smollm3]=18432 [e2b]=32768 [e4b]=20480 [q3]=8192 [qwen35q]=8192)
declare -A PPL_TIMEOUT=([smollm3]=300 [e2b]=300 [e4b]=300 [q3]=300 [qwen35q]=600)

# Reject unknown model keys up front; otherwise the main loop would skip every
# model and print an empty report with exit status 0.
target_known=0
[[ "$TARGET" == "all" ]] && target_known=1
for known_key in "${!NGL[@]}"; do
  [[ "$TARGET" == "$known_key" ]] && target_known=1
done
if (( target_known == 0 )); then
  printf 'Unknown MODEL_KEY: %s (expected: smollm3 | e2b | e4b | q3 | qwen35q | all)\n' \
    "$TARGET" >&2
  exit 2
fi

# Per-model KV types to test (f16 is always the baseline)
# Standard Q4_K_M models: q8_0/q4_0 + turbo2 (all supported by TurboQuant image)
# Qwen3.5-9B: designed for turbo KV — test turbo2/3/4 only (q4_0 would also work but less relevant)
declare -A MODEL_KV_TYPES=(
  [smollm3]="q8_0 q4_0 turbo2"
  [e2b]="q8_0 q4_0 turbo2"
  [e4b]="q8_0 q4_0 turbo2"
  [q3]="q8_0 q4_0 turbo2"
  [qwen35q]="turbo2 turbo3 turbo4"
)

# ctx candidates per model (ascending; the main loop walks them in order)
SMOL_CTXS=(8192 12288 16384 18432 20480 24576 32768 40960 49152)
E2B_CTXS=(8192 16384 24576 32768 40960 49152 65536)
E4B_CTXS=(8192 12288 16384 20480 24576 32768 40960)
Q3_CTXS=(4096 6144 8192 10240 12288 16384 24576 32768)
Q35_CTXS=(4096 8192 16384 24576 32768 40960 49152)

# MODEL_KEY -> *name* of the array holding that model's ctx candidates.
declare -A CTX_CANDIDATES=(
  [smollm3]="SMOL_CTXS" [e2b]="E2B_CTXS" [e4b]="E4B_CTXS"
  [q3]="Q3_CTXS" [qwen35q]="Q35_CTXS")

# ANSI colour escapes; they are embedded in printf *format* strings so that
# printf interprets the \033 sequences.
GREEN='\033[0;32m'; RED='\033[0;31m'; YELLOW='\033[1;33m'; BLUE='\033[0;34m'; NC='\033[0m'
HR="======================================================================"

# Synthetic PPL file — 4000 lines, deterministic, no network needed
PPL_FILE="/tmp/kv_ppl_input.txt"
|
||||
#######################################
# Create the synthetic perplexity corpus unless a non-empty one already exists.
# The text is 4000 lines drawn from a fixed sentence list with a fixed RNG
# seed, so repeated generations produce the same content.
# Globals:   PPL_FILE (read) - destination path of the corpus
# Arguments: none
# Returns:   0 if the corpus exists afterwards, non-zero if it could not be
#            created (mktemp, python3 or mv failed)
#######################################
ensure_ppl_file() {
  # -s rather than -f: an empty leftover file is regenerated, not trusted.
  [[ -s "$PPL_FILE" ]] && return 0

  # Generate next to the destination and rename afterwards, so an interrupted
  # or failed run never leaves a truncated corpus behind.
  local tmp_file
  tmp_file=$(mktemp "${PPL_FILE}.XXXXXX") || return 1

  # The output path is passed as argv[1] so the Python side always writes to
  # the location configured in PPL_FILE.
  if python3 - "$tmp_file" << 'PY'
import random
import sys

random.seed(42)
sentences = [
    "The transformer architecture uses self-attention mechanisms to process sequences.",
    "Large language models require significant computational resources for training.",
    "Quantization reduces memory usage by storing weights in lower precision formats.",
    "Flash attention enables memory-efficient computation for long context windows.",
    "The key-value cache stores intermediate attention states during generation.",
    "Context length determines how many tokens the model can attend to simultaneously.",
    "Perplexity measures how well a probability model predicts a sample of text.",
    "Lower perplexity values indicate better language modeling performance overall.",
    "GPU memory bandwidth is the primary bottleneck for autoregressive token generation.",
    "Grouped query attention reduces KV cache size by sharing keys across head groups.",
    "Rotary position embeddings encode relative position information in attention queries.",
    "Mixture of experts models route tokens through specialized feed-forward networks.",
    "Continuous batching allows servers to process multiple requests simultaneously.",
    "KV cache quantization trades a small quality loss for significantly larger contexts.",
]
lines = [random.choice(sentences) for _ in range(4000)]
with open(sys.argv[1], 'w') as out:
    out.write('\n'.join(lines) + '\n')
PY
  then
    mv -f -- "$tmp_file" "$PPL_FILE"
  else
    rm -f -- "$tmp_file"
    return 1
  fi
}
|
||||
|
||||
#######################################
# run_ppl MODEL NGL KV CTX TIMEOUT [EXTRA_ARGS...]
# Run one llama-perplexity pass (flash attention on, a single chunk) with the
# same cache type for K and V, and report the resulting perplexity.
# Globals:   PPL_FILE (read) - input text passed via -f
#            LLAMA_PERPLEXITY_BIN (read, optional) - binary to execute;
#              defaults to /app/llama-perplexity
# Arguments: $1 model path, $2 -ngl value, $3 KV cache type, $4 context size,
#            $5 timeout in seconds, remaining args are passed through verbatim
# Outputs:   the PPL value on stdout (nothing on failure)
# Returns:   0 on success; 1 if the run exited non-zero (this includes a
#            timeout), if its stderr contains an out-of-memory message, or if
#            no PPL value could be parsed from its stdout
#######################################
run_ppl() {
  local model=$1 ngl=$2 kv=$3 ctx=$4 timeout_s=$5
  shift 5   # "$@" now holds only the pass-through arguments

  local bin=${LLAMA_PERPLEXITY_BIN:-/app/llama-perplexity}

  local tmp_err ppl_out
  tmp_err=$(mktemp) || return 1
  ppl_out=$(mktemp) || { rm -f -- "$tmp_err"; return 1; }

  # "$@" is used instead of copying into an array: expanding an empty array
  # under `set -u` is an error on bash < 4.4, expanding an empty "$@" is not.
  local ppl_rc=0
  timeout "$timeout_s" "$bin" \
    -m "$model" \
    -ngl "$ngl" \
    -fa on \
    -c "$ctx" \
    -ctk "$kv" -ctv "$kv" \
    -f "$PPL_FILE" \
    --chunks 1 \
    "$@" \
    > "$ppl_out" 2> "$tmp_err" || ppl_rc=$?

  local status=0 ppl_val=""
  if (( ppl_rc != 0 )) \
    || grep -qiE 'out of memory|failed to allocate|cudaMalloc failed|CUDA_ERROR_OUT_OF_MEMORY|ggml_cuda_malloc|cannot allocate memory' "$tmp_err"; then
    # An allocation failure counts as a failed run even if the exit code is 0.
    status=1
  else
    # Per-chunk results look like "[1]5.1234,"; keep the number after the
    # last "[N]" marker.
    ppl_val=$(grep -oP '\[\d+\]\K[0-9.]+' "$ppl_out" | tail -n 1)
    [[ -n "$ppl_val" ]] || status=1
  fi

  rm -f -- "$tmp_err" "$ppl_out"
  (( status == 0 )) || return 1
  printf '%s\n' "$ppl_val"
}
|
||||
|
||||
# ---------------------------------------------------------------------------
# Prepare the corpus first. Without it every run_ppl call exits non-zero and
# the results table would show "OOM" in every cell, hiding the real cause.
if ! ensure_ppl_file || [[ ! -s "$PPL_FILE" ]]; then
  printf 'ERROR: could not create PPL input file %s (is python3 available?)\n' \
    "$PPL_FILE" >&2
  exit 1
fi

# GPU description is informational only; fall back to a marker instead of an
# empty "GPU:" line when nvidia-smi is missing or fails.
gpu_info=$(nvidia-smi --query-gpu=name,memory.total --format=csv,noheader 2>/dev/null) \
  || gpu_info="unavailable"

echo "$HR"
echo "KV CACHE QUANT TEST (llama-perplexity) — TurboQuant image (FORCE_MMQ SM75)"
date
echo "GPU: $gpu_info"
echo "$HR"
echo "Standard models: f16 baseline | q8_0 | q4_0 | turbo2 (2-bit, 8x smaller KV vs f16)"
echo "Qwen3.5-9B: f16 baseline | turbo2 | turbo3 | turbo4 (TurboQuant KV types)"
echo "Quality gate: PPL delta vs f16 < 0.5"
echo ""
|
||||
|
||||
# Rows of the final report, one "label|kv|ctx" string per (model, KV type).
declare -a SUMMARY=()

# For each selected model: walk its ctx candidates in order and, per ctx,
# measure PPL with f16 KV (baseline) and then with each quantized KV type.
# Once a KV type fails at some ctx it is not retried at later ctx values and
# its cells are printed as "OOM".
for entry in \
  "smollm3:SmolLM3-3B:$M_SMOL" \
  "e2b:Gemma4-E2B:$M_E2B" \
  "e4b:Gemma4-E4B:$M_E4B" \
  "q3:Qwen3-4B:$M_Q3" \
  "qwen35q:Qwen3.5-9B:$M_Q35"
do
  IFS=':' read -r key lbl model <<< "$entry"
  [[ "$TARGET" != "all" && "$TARGET" != "$key" ]] && continue

  # CTX_CANDIDATES maps the key to the *name* of an array; indirect expansion
  # copies that array without resorting to eval.
  ctx_list_ref="${CTX_CANDIDATES[$key]}[@]"
  ctxs=("${!ctx_list_ref}")
  ngl="${NGL[$key]}"
  timeout_s="${PPL_TIMEOUT[$key]}"
  IFS=' ' read -ra kv_types <<< "${MODEL_KV_TYPES[$key]}"

  # Extra args for qwen35-9b (flash attn already set; no mlock needed for PPL correctness)
  extra_args=()

  printf "${BLUE}=== %s (base ctx=%s, ngl=%s) ===${NC}\n" \
    "$lbl" "${BASE_CTX[$key]}" "$ngl"

  # Dynamic header based on KV types for this model
  printf " %-10s %-18s" "ctx" "f16 (PPL)"
  for kv in "${kv_types[@]}"; do
    printf " %-20s" "$kv (PPL/delta)"
  done
  printf "\n"
  printf " %-10s %-18s" "---" "---------"
  for kv in "${kv_types[@]}"; do
    printf " %-20s" "--------------------"
  done
  printf "\n"

  # best_ctx_per_kv: largest accepted ctx per KV type, seeded with BASE_CTX.
  # oom_kv: 1 once a KV type has failed.
  declare -A best_ctx_per_kv=([f16]="${BASE_CTX[$key]}")
  declare -A oom_kv=([f16]=0)
  for kv in "${kv_types[@]}"; do
    best_ctx_per_kv[$kv]="${BASE_CTX[$key]}"
    oom_kv[$kv]=0
  done

  for ctx in "${ctxs[@]}"; do
    printf " %-10s" "$ctx"

    # f16 baseline. The ${arr[@]+...} form keeps the empty extra_args array
    # safe under `set -u` on bash < 4.4.
    f16_ppl=""
    if [[ "${oom_kv[f16]}" == "1" ]]; then
      printf " ${RED}%-18s${NC}" "OOM"
    elif f16_ppl=$(run_ppl "$model" "$ngl" "f16" "$ctx" "$timeout_s" \
        ${extra_args[@]+"${extra_args[@]}"}); then
      printf " ${GREEN}%-18s${NC}" "$f16_ppl"
      best_ctx_per_kv[f16]=$ctx
    else
      f16_ppl=""
      printf " ${RED}%-18s${NC}" "OOM"
      oom_kv[f16]=1
    fi

    # KV type columns
    for kv in "${kv_types[@]}"; do
      if [[ "${oom_kv[$kv]}" == "1" ]]; then
        printf " ${RED}%-20s${NC}" "OOM"
        continue
      fi
      if ! ppl=$(run_ppl "$model" "$ngl" "$kv" "$ctx" "$timeout_s" \
          ${extra_args[@]+"${extra_args[@]}"}); then
        printf " ${RED}%-20s${NC}" "OOM"
        oom_kv[$kv]=1
        continue
      fi

      if [[ -z "$f16_ppl" ]]; then
        # No f16 baseline at this ctx, so the quality gate cannot be
        # evaluated; the run is accepted on the strength of not failing.
        best_ctx_per_kv[$kv]=$ctx
        printf " ${GREEN}%-20s${NC}" "$ppl"
        continue
      fi

      # One awk call yields both the signed delta and the gate verdict
      # (|delta| < 0.5); values are passed with -v, not spliced into code.
      delta="" verdict=""
      read -r delta verdict < <(
        awk -v a="$ppl" -v b="$f16_ppl" 'BEGIN {
          d = a - b
          ad = (d < 0) ? -d : d
          printf "%+.2f %s\n", d, ((ad < 0.5) ? "ok" : "bad")
        }' 2>/dev/null
      )
      delta=${delta:-?}

      if [[ "$verdict" == "ok" ]]; then
        # Only a run that passes the gate may raise the recommended ctx.
        best_ctx_per_kv[$kv]=$ctx
        printf " ${GREEN}%-20s${NC}" "${ppl}(${delta})"
      else
        printf " ${YELLOW}%-20s${NC}" "${ppl}(${delta})"
      fi
    done
    echo ""
  done

  echo ""

  # Best recommendation: the KV type with the highest accepted ctx. f16 is
  # the starting candidate with its own best ctx, and a quantized type must
  # strictly exceed the current best to replace it.
  overall_best_ctx="${best_ctx_per_kv[f16]}"
  overall_best_kv="f16"
  for kv in "${kv_types[@]}"; do
    bctx="${best_ctx_per_kv[$kv]}"
    SUMMARY+=("$lbl|$kv|$bctx")
    if (( bctx > overall_best_ctx )); then
      overall_best_ctx=$bctx
      overall_best_kv=$kv
    fi
  done
  SUMMARY+=("$lbl|f16|${best_ctx_per_kv[f16]}")
  printf " ${GREEN}Best: %s → max ctx %s${NC}\n\n" "$overall_best_kv" "$overall_best_ctx"

  # The maps are re-declared for the next model; drop this model's state.
  unset best_ctx_per_kv oom_kv
done
|
||||
|
||||
# Final report: one line per "label|kv|ctx" row collected in SUMMARY,
# followed by a reminder to copy the results into the per-model env files.
summary_cols="%-16s %-8s %s"

printf '%s\n' "$HR" "SUMMARY" "$HR"

# printf reuses the format for each group of three arguments, which emits the
# column titles and the underline row in one call.
printf "${summary_cols}\n" \
  "Model" "KV" "Max Ctx (no OOM + PPL delta<0.5)" \
  "-----" "--" "---------------------------------"

for summary_row in "${SUMMARY[@]}"; do
  IFS='|' read -r row_label row_kv row_ctx <<< "$summary_row"
  printf "${GREEN}${summary_cols}${NC}\n" "$row_label" "$row_kv" "$row_ctx"
done

printf '%s\n' \
  "$HR" \
  "Reminder: update envs/.env.<model>: CACHE_TYPE_K/V=<best_kv> CTX_SIZE=<max_ctx>" \
  "$HR"
|
||||
Reference in New Issue
Block a user