Initial commit: tuned multi-model llama.cpp stack

- 5 models: SmolLM3-3B, Gemma4-E2B/E4B, Qwen3-4B, Qwen3.5-9B
- TurboQuant image (FORCE_MMQ): +6-11% free speed on Turing GPUs
- Bigctx profiles (-nkvo KV in RAM): 2-16x context gain
- turbo2 KV: 2x smaller, benchmarked against PPL quality gate
- Per-model env files with justified parameters
- kv_quant_test.sh + cpu_ctx_test.sh benchmark scripts
- docs/FINDINGS.md: surprises, pitfalls, recommendations
- docs/ARCHITECTURE.md: compose + test script design
This commit is contained in:
2026-05-06 15:56:40 +02:00
commit 4ad296608b
22 changed files with 2530 additions and 0 deletions

246
scripts/kv_quant_test.sh Normal file
View File

@@ -0,0 +1,246 @@
#!/bin/bash
# kv_quant_test.sh — KV-cache quantization benchmark driven by llama-perplexity.
#
# Intended image: local/llama-cpp-turboquant:full-cuda-sm75-mmq
#                 (FORCE_MMQ build with turbo2/3/4 KV types)
#
# KV types exercised (f16 is always measured as the baseline):
#   Q4_K_M models    : q8_0, q4_0, turbo2
#   Qwen3.5-9B Q8_0  : turbo2, turbo3, turbo4
# Quality gate: |PPL - PPL_f16| < 0.5 (treated as lossless for practical use)
#
# Usage: bash /scripts/kv_quant_test.sh [MODEL_KEY]
#   MODEL_KEY: smollm3 | e2b | e4b | q3 | qwen35q | all   (default: all)

# No `-e`: failures of individual perplexity runs are inspected by hand.
set -uo pipefail

# Model selector taken from the first CLI argument.
TARGET="${1:-all}"

# --- GGUF model files -------------------------------------------------------
M_SMOL="/models/HuggingFaceTB_SmolLM3-3B-Q4_K_M.gguf"
M_E2B="/models/google_gemma-4-E2B-it-Q4_K_M.gguf"
M_E4B="/models/google_gemma-4-E4B-it-Q4_K_M.gguf"
M_Q3="/models/Qwen3-4B-Q4_K_M.gguf"
M_Q35="/models/Qwen3.5-9B.Q8_0.gguf"

# --- Per-model settings, keyed by MODEL_KEY ---------------------------------
# Value handed to llama-perplexity via -ngl.
declare -A NGL=(
  [smollm3]=99
  [e2b]=99
  [e4b]=42
  [q3]=99
  [qwen35q]=11
)

# Base context size shown in each model's report header.
declare -A BASE_CTX=(
  [smollm3]=18432
  [e2b]=32768
  [e4b]=20480
  [q3]=8192
  [qwen35q]=8192
)

# Wall-clock limit (seconds) for a single perplexity run.
declare -A PPL_TIMEOUT=(
  [smollm3]=300
  [e2b]=300
  [e4b]=300
  [q3]=300
  [qwen35q]=600
)

# Space-separated KV types to compare against the f16 baseline.
# The Q4_K_M models get q8_0/q4_0 plus turbo2; Qwen3.5-9B is limited to the
# turbo family (q4_0 would run too, it is just less interesting there).
declare -A MODEL_KV_TYPES=(
  [smollm3]="q8_0 q4_0 turbo2"
  [e2b]="q8_0 q4_0 turbo2"
  [e4b]="q8_0 q4_0 turbo2"
  [q3]="q8_0 q4_0 turbo2"
  [qwen35q]="turbo2 turbo3 turbo4"
)

# --- Context sizes to sweep, smallest first ---------------------------------
SMOL_CTXS=(8192 12288 16384 18432 20480 24576 32768 40960 49152)
E2B_CTXS=(8192 16384 24576 32768 40960 49152 65536)
E4B_CTXS=(8192 12288 16384 20480 24576 32768 40960)
Q3_CTXS=(4096 6144 8192 10240 12288 16384 24576 32768)
Q35_CTXS=(4096 8192 16384 24576 32768 40960 49152)

# MODEL_KEY -> NAME of the array above holding that model's sweep.
declare -A CTX_CANDIDATES=(
  [smollm3]="SMOL_CTXS"
  [e2b]="E2B_CTXS"
  [e4b]="E4B_CTXS"
  [q3]="Q3_CTXS"
  [qwen35q]="Q35_CTXS"
)

# --- Terminal cosmetics -----------------------------------------------------
# Stored as literal backslash sequences; printf expands them at print time.
GREEN='\033[0;32m'
RED='\033[0;31m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'
HR="======================================================================"

# Location of the synthetic perplexity corpus (4000 lines, deterministic,
# generated locally so no network access is required).
PPL_FILE="/tmp/kv_ppl_input.txt"
# ensure_ppl_file
# Creates the synthetic perplexity corpus at $PPL_FILE unless it already exists.
# Globals:  PPL_FILE (read) - destination path of the corpus
# Outputs:  writes 4000 newline-terminated lines to $PPL_FILE
# Returns:  0 if the file already exists, otherwise python3's exit status
ensure_ppl_file() {
  [[ -f "$PPL_FILE" ]] && return
  # The destination is passed as argv[1] so the generator always writes the
  # file that the guard above (and llama-perplexity's -f) actually look at.
  python3 - "$PPL_FILE" << 'PY'
import os
import random
import sys

out_path = sys.argv[1]

# Fixed seed: every run produces the same corpus, so PPL numbers are comparable.
random.seed(42)
sentences = [
    "The transformer architecture uses self-attention mechanisms to process sequences.",
    "Large language models require significant computational resources for training.",
    "Quantization reduces memory usage by storing weights in lower precision formats.",
    "Flash attention enables memory-efficient computation for long context windows.",
    "The key-value cache stores intermediate attention states during generation.",
    "Context length determines how many tokens the model can attend to simultaneously.",
    "Perplexity measures how well a probability model predicts a sample of text.",
    "Lower perplexity values indicate better language modeling performance overall.",
    "GPU memory bandwidth is the primary bottleneck for autoregressive token generation.",
    "Grouped query attention reduces KV cache size by sharing keys across head groups.",
    "Rotary position embeddings encode relative position information in attention queries.",
    "Mixture of experts models route tokens through specialized feed-forward networks.",
    "Continuous batching allows servers to process multiple requests simultaneously.",
    "KV cache quantization trades a small quality loss for significantly larger contexts.",
]
lines = [random.choice(sentences) for _ in range(4000)]

# Write to a sibling temp file and rename it into place: an interrupted run
# must not leave a truncated corpus that the existence check would then accept.
tmp_path = f"{out_path}.{os.getpid()}.tmp"
with open(tmp_path, "w") as fh:
    fh.write("\n".join(lines) + "\n")
os.replace(tmp_path, out_path)
PY
}
# run_ppl MODEL NGL KV CTX TIMEOUT [EXTRA_ARGS...]
# Runs one llama-perplexity pass (flash attention on, a single chunk) with the
# same cache type for K and V, under a wall-clock limit of TIMEOUT seconds.
# Globals:  PPL_FILE (read) - text corpus handed to -f
# Outputs:  on success, the last "[N]value" figure from the tool's stdout
# Returns:  0 on success; 1 if the tool exits non-zero (crash, timeout, ...),
#           its stderr mentions an allocation failure, or no figure is found
run_ppl() {
  local model_path=$1 gpu_layers=$2 cache_type=$3 ctx_size=$4 limit_s=$5
  shift 5

  local stderr_log stdout_log
  stderr_log=$(mktemp)
  stdout_log=$(mktemp)

  # Remaining positional parameters are forwarded verbatim after --chunks.
  local -a cmd=(
    timeout "$limit_s" /app/llama-perplexity
    -m "$model_path"
    -ngl "$gpu_layers"
    -fa on
    -c "$ctx_size"
    -ctk "$cache_type" -ctv "$cache_type"
    -f "$PPL_FILE"
    --chunks 1
    "$@"
  )
  "${cmd[@]}" > "$stdout_log" 2> "$stderr_log"
  local status=$?

  local captured_err
  captured_err=$(< "$stderr_log")
  rm -f "$stderr_log"

  # A zero exit status alone is not trusted: allocation failures reported on
  # stderr also count as a failed run.
  local failed=0
  if [[ "$status" != "0" ]]; then
    failed=1
  elif grep -qi "out of memory\|failed to allocate\|cudaMalloc failed\|CUDA_ERROR_OUT_OF_MEMORY\|ggml_cuda_malloc\|cannot allocate memory" <<< "$captured_err"; then
    failed=1
  fi
  if (( failed )); then
    rm -f "$stdout_log"
    return 1
  fi

  # Keep only the final per-chunk figure printed as "[N]value".
  local last_ppl
  last_ppl=$(grep -oP '\[\d+\]\K[0-9.]+' "$stdout_log" | tail -1)
  rm -f "$stdout_log"

  if [[ -n "$last_ppl" ]]; then
    printf '%s\n' "$last_ppl"
    return 0
  fi
  return 1
}
# ---------------------------------------------------------------------------
# Script body: make sure the corpus exists, then print the report banner.
ensure_ppl_file

printf '%s\n' \
  "$HR" \
  "KV CACHE QUANT TEST (llama-perplexity) — TurboQuant image (FORCE_MMQ SM75)"
date
# nvidia-smi errors are silenced; the GPU line is then printed with no value.
printf 'GPU: %s\n' \
  "$(nvidia-smi --query-gpu=name,memory.total --format=csv,noheader 2>/dev/null)"
printf '%s\n' \
  "$HR" \
  "Standard models: f16 baseline | q8_0 | q4_0 | turbo2 (2-bit, 8x smaller KV vs f16)" \
  "Qwen3.5-9B: f16 baseline | turbo2 | turbo3 | turbo4 (TurboQuant KV types)" \
  "Quality gate: PPL delta vs f16 < 0.5" \
  ""

# Rows of "label|kv|ctx" accumulated per model and printed at the very end.
declare -a SUMMARY=()
# Per-model sweep. For every selected model, walk its ctx candidates in order
# and, at each ctx, measure PPL with f16 KV (baseline) and with every KV type
# listed in MODEL_KV_TYPES, printing one table row per ctx.
#
# A KV type's "best ctx" is the largest ctx at which its run succeeded AND its
# PPL stayed within the quality gate of the f16 PPL measured at that same ctx.
# When f16 itself could not be measured at a ctx there is no baseline to
# compare with, so the ctx is counted as reached without a gate check.
#
# run_ppl returns 1 for any failure (OOM, crash, timeout); the table labels
# all of them "OOM", and a type that failed once is not retried at larger ctx.
ppl_gate="0.5"
matched_models=0
for entry in \
  "smollm3:SmolLM3-3B:$M_SMOL" \
  "e2b:Gemma4-E2B:$M_E2B" \
  "e4b:Gemma4-E4B:$M_E4B" \
  "q3:Qwen3-4B:$M_Q3" \
  "qwen35q:Qwen3.5-9B:$M_Q35"
do
  IFS=':' read -r key lbl model <<< "$entry"
  [[ "$TARGET" != "all" && "$TARGET" != "$key" ]] && continue
  matched_models=$((matched_models + 1))

  # CTX_CANDIDATES holds the NAME of the array to sweep; expand it indirectly
  # instead of going through eval.
  ctx_ref="${CTX_CANDIDATES[$key]}[@]"
  ctxs=("${!ctx_ref}")
  ngl="${NGL[$key]}"
  timeout_s="${PPL_TIMEOUT[$key]}"
  IFS=' ' read -ra kv_types <<< "${MODEL_KV_TYPES[$key]}"
  # Hook for model-specific llama-perplexity flags; currently none are needed
  # (flash attention is already enabled inside run_ppl).
  extra_args=()

  printf "${BLUE}=== %s (base ctx=%s, ngl=%s) ===${NC}\n" \
    "$lbl" "${BASE_CTX[$key]}" "$ngl"

  # Table header: one column per KV type configured for this model.
  printf " %-10s %-18s" "ctx" "f16 (PPL)"
  for kv in "${kv_types[@]}"; do
    printf " %-20s" "$kv (PPL/delta)"
  done
  printf "\n"
  printf " %-10s %-18s" "---" "---------"
  for kv in "${kv_types[@]}"; do
    printf " %-20s" "--------------------"
  done
  printf "\n"

  # 0 means "no ctx qualified yet"; it is rendered as "none" in the summary so
  # a type is never credited with a context it did not actually run at.
  declare -A best_ctx_per_kv=([f16]=0)
  declare -A oom_kv=([f16]=0)
  for kv in "${kv_types[@]}"; do
    best_ctx_per_kv[$kv]=0
    oom_kv[$kv]=0
  done

  for ctx in "${ctxs[@]}"; do
    printf " %-10s" "$ctx"

    # f16 baseline. The ${arr[@]+...} form keeps an empty extra_args from
    # tripping `set -u` on bash releases older than 4.4.
    f16_ppl=""
    if [[ "${oom_kv[f16]}" == "1" ]]; then
      printf " ${RED}%-18s${NC}" "OOM"
    elif f16_ppl=$(run_ppl "$model" "$ngl" "f16" "$ctx" "$timeout_s" \
        ${extra_args[@]+"${extra_args[@]}"}); then
      printf " ${GREEN}%-18s${NC}" "$f16_ppl"
      best_ctx_per_kv[f16]=$ctx
    else
      f16_ppl=""
      printf " ${RED}%-18s${NC}" "OOM"
      oom_kv[f16]=1
    fi

    # KV type columns
    for kv in "${kv_types[@]}"; do
      if [[ "${oom_kv[$kv]}" == "1" ]]; then
        printf " ${RED}%-20s${NC}" "OOM"
        continue
      fi
      if ! ppl=$(run_ppl "$model" "$ngl" "$kv" "$ctx" "$timeout_s" \
          ${extra_args[@]+"${extra_args[@]}"}); then
        printf " ${RED}%-20s${NC}" "OOM"
        oom_kv[$kv]=1
        continue
      fi

      if [[ -z "$f16_ppl" ]]; then
        # No baseline at this ctx: nothing to compare against.
        printf " ${GREEN}%-20s${NC}" "$ppl"
        best_ctx_per_kv[$kv]=$ctx
        continue
      fi

      # One interpreter call yields both the signed delta (stdout) and the
      # gate verdict (exit status). Values travel as argv, never as code.
      delta=$(python3 -c 'import sys
a, b, gate = (float(v) for v in sys.argv[1:4])
print(f"{a - b:+.2f}")
sys.exit(0 if abs(a - b) < gate else 1)' "$ppl" "$f16_ppl" "$ppl_gate" 2>/dev/null)
      gate_rc=$?
      [[ -n "$delta" ]] || delta="?"

      if (( gate_rc == 0 )); then
        printf " ${GREEN}%-20s${NC}" "${ppl}(${delta})"
        best_ctx_per_kv[$kv]=$ctx
      else
        # Ran fine but degraded quality too much: shown, not counted.
        printf " ${YELLOW}%-20s${NC}" "${ppl}(${delta})"
      fi
    done
    echo ""
  done
  echo ""

  # Recommendation: the KV type whose qualifying ctx most exceeds the model's
  # base ctx; if none does, stay with f16 at the base ctx.
  overall_best_ctx="${BASE_CTX[$key]}"
  overall_best_kv="f16"
  for kv in "${kv_types[@]}"; do
    bctx="${best_ctx_per_kv[$kv]}"
    if (( bctx > 0 )); then
      SUMMARY+=("$lbl|$kv|$bctx")
    else
      SUMMARY+=("$lbl|$kv|none")
    fi
    if (( bctx > overall_best_ctx )); then
      overall_best_ctx=$bctx
      overall_best_kv=$kv
    fi
  done
  if (( best_ctx_per_kv[f16] > 0 )); then
    SUMMARY+=("$lbl|f16|${best_ctx_per_kv[f16]}")
  else
    SUMMARY+=("$lbl|f16|none")
  fi
  printf " ${GREEN}Best: %s → max ctx %s${NC}\n\n" "$overall_best_kv" "$overall_best_ctx"

  # Drop the per-model maps so the next iteration re-declares them cleanly.
  unset best_ctx_per_kv oom_kv
done

# A mistyped MODEL_KEY would otherwise yield a silent, empty report.
if (( matched_models == 0 )); then
  printf 'warning: MODEL_KEY "%s" matched no model (expected: smollm3 | e2b | e4b | q3 | qwen35q | all)\n' \
    "$TARGET" >&2
fi
# Final report: one line per (model, KV type) pair collected in SUMMARY.
printf '%s\n' "$HR" "SUMMARY" "$HR"

# The format string is reused for the title row and its underline.
printf "%-16s %-8s %s\n" \
  "Model" "KV" "Max Ctx (no OOM + PPL delta<0.5)" \
  "-----" "--" "---------------------------------"

for ((row_idx = 0; row_idx < ${#SUMMARY[@]}; row_idx++)); do
  # Each row is "label|kv|ctx"; peel the fields off with parameter expansion.
  summary_row="${SUMMARY[row_idx]}"
  row_lbl="${summary_row%%|*}"
  row_rest="${summary_row#*|}"
  row_kv="${row_rest%%|*}"
  row_ctx="${row_rest#*|}"
  printf '%b%-16s %-8s %s%b\n' "$GREEN" "$row_lbl" "$row_kv" "$row_ctx" "$NC"
done

printf '%s\n' \
  "$HR" \
  "Reminder: update envs/.env.<model>: CACHE_TYPE_K/V=<best_kv> CTX_SIZE=<max_ctx>" \
  "$HR"