#!/bin/bash
# llama-cpp/scripts/kv_quant_test.sh
#
# KV cache quantization test using llama-perplexity.
# Image: local/llama-cpp-turboquant:full-cuda-sm75-mmq (FORCE_MMQ, turbo2/3/4 support)
#
# Tests KV types: f16 (baseline) + q8_0/q4_0/turbo2 for Q4_K_M models
#                 f16 (baseline) + turbo2/3/4 for Qwen3.5-9B Q8_0
# Quality gate: PPL delta vs f16 < 0.5 (lossless for practical use)
#
# Usage: bash /scripts/kv_quant_test.sh [MODEL_KEY]
#   MODEL_KEY: smollm3 | e2b | e4b | q3 | qwen35q | all (default: all)

# Shell options: unset variables are errors, and a pipeline fails when any of
# its stages fails. `-e` is not enabled; failures are checked explicitly.
set -u
set -o pipefail

# Model selector from the first CLI argument; "all" when omitted or empty.
TARGET=${1:-all}

# GGUF model paths.
M_SMOL=/models/HuggingFaceTB_SmolLM3-3B-Q4_K_M.gguf
M_E2B=/models/google_gemma-4-E2B-it-Q4_K_M.gguf
M_E4B=/models/google_gemma-4-E4B-it-Q4_K_M.gguf
M_Q3=/models/Qwen3-4B-Q4_K_M.gguf
M_Q35=/models/Qwen3.5-9B.Q8_0.gguf

# Per-model value passed to -ngl.
declare -A NGL=(
    [smollm3]=99
    [e2b]=99
    [e4b]=42
    [q3]=99
    [qwen35q]=11
)

# Per-model base context size; used as the starting "best ctx" for every KV type.
declare -A BASE_CTX=(
    [smollm3]=18432
    [e2b]=32768
    [e4b]=20480
    [q3]=8192
    [qwen35q]=8192
)

# Per-model timeout (seconds) for a single llama-perplexity run.
declare -A PPL_TIMEOUT=(
    [smollm3]=300
    [e2b]=300
    [e4b]=300
    [q3]=300
    [qwen35q]=600
)

# KV cache types tested per model, space-separated; f16 is always run in
# addition as the baseline.
#   Q4_K_M models: q8_0, q4_0 and turbo2.
#   Qwen3.5-9B:    turbo2/3/4 only (per the original note, q4_0 would also
#                  work but is less relevant).
declare -A MODEL_KV_TYPES=(
    [smollm3]="q8_0 q4_0 turbo2"
    [e2b]="q8_0 q4_0 turbo2"
    [e4b]="q8_0 q4_0 turbo2"
    [q3]="q8_0 q4_0 turbo2"
    [qwen35q]="turbo2 turbo3 turbo4"
)

# Context sizes swept for each model, in ascending order.
SMOL_CTXS=(8192 12288 16384 18432 20480 24576 32768 40960 49152)
E2B_CTXS=(8192 16384 24576 32768 40960 49152 65536)
E4B_CTXS=(8192 12288 16384 20480 24576 32768 40960)
Q3_CTXS=(4096 6144 8192 10240 12288 16384 24576 32768)
Q35_CTXS=(4096 8192 16384 24576 32768 40960 49152)

# Model key -> NAME of the array above holding its context candidates.
declare -A CTX_CANDIDATES=(
    [smollm3]="SMOL_CTXS"
    [e2b]="E2B_CTXS"
    [e4b]="E4B_CTXS"
    [q3]="Q3_CTXS"
    [qwen35q]="Q35_CTXS"
)

# ANSI colour sequences, stored with a literal backslash so they are only
# interpreted when used inside a printf format string.
GREEN='\033[0;32m'
RED='\033[0;31m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'

# Horizontal rule used between report sections.
HR="======================================================================"

# Synthetic PPL file — 4000 lines, deterministic, no network needed
PPL_FILE="/tmp/kv_ppl_input.txt"

# ensure_ppl_file
# Creates the synthetic perplexity corpus at $PPL_FILE unless a regular file
# already exists there. The corpus is 4000 lines, each picked from a fixed
# list of sentences using an RNG seeded with 42.
# Globals:  PPL_FILE (read) — destination path of the corpus.
# Returns:  0 if the file already exists or was created, 1 if generation failed.
ensure_ppl_file() {
    [[ -f "$PPL_FILE" ]] && return 0

    # Write to a sibling staging file and rename at the end, so a failed or
    # interrupted generation never leaves a truncated corpus behind that a
    # later invocation would accept as valid.
    local staging="${PPL_FILE}.tmp.$$"
    if ! python3 - "$staging" << 'PY'
import random, sys
random.seed(42)
sentences = [
    "The transformer architecture uses self-attention mechanisms to process sequences.",
    "Large language models require significant computational resources for training.",
    "Quantization reduces memory usage by storing weights in lower precision formats.",
    "Flash attention enables memory-efficient computation for long context windows.",
    "The key-value cache stores intermediate attention states during generation.",
    "Context length determines how many tokens the model can attend to simultaneously.",
    "Perplexity measures how well a probability model predicts a sample of text.",
    "Lower perplexity values indicate better language modeling performance overall.",
    "GPU memory bandwidth is the primary bottleneck for autoregressive token generation.",
    "Grouped query attention reduces KV cache size by sharing keys across head groups.",
    "Rotary position embeddings encode relative position information in attention queries.",
    "Mixture of experts models route tokens through specialized feed-forward networks.",
    "Continuous batching allows servers to process multiple requests simultaneously.",
    "KV cache quantization trades a small quality loss for significantly larger contexts.",
]
lines = [random.choice(sentences) for _ in range(4000)]
# The destination is taken from argv so it always matches the shell's
# $PPL_FILE instead of a second, hard-coded copy of the path.
with open(sys.argv[1], 'w') as out:
    print('\n'.join(lines), file=out)
PY
    then
        rm -f -- "$staging"
        return 1
    fi
    mv -f -- "$staging" "$PPL_FILE"
}

# run_ppl MODEL NGL KV CTX TIMEOUT [EXTRA_ARGS...]
# Runs llama-perplexity once (flash attention on, a single chunk) over
# $PPL_FILE, using KV as both the K and V cache type.
# Arguments:
#   MODEL       path passed to -m
#   NGL         value passed to -ngl
#   KV          cache type passed to both -ctk and -ctv
#   CTX         context size passed to -c
#   TIMEOUT     limit for the run, handed to timeout(1)
#   EXTRA_ARGS  optional additional arguments appended to the command line
# Globals:
#   PPL_FILE (read)             input text passed to -f
#   LLAMA_PERPLEXITY_BIN (read) optional override of the binary path;
#                               defaults to /app/llama-perplexity
# Outputs: the last "[N]<value>" perplexity figure found on the tool's stdout.
# Returns: 0 on success; 1 if the tool exits non-zero (this includes a
#          timeout), if its stderr contains an out-of-memory/allocation
#          message, or if no perplexity value could be parsed.
run_ppl() {
    local model=$1 ngl=$2 kv=$3 ctx=$4 timeout_s=$5
    shift 5

    local bin="${LLAMA_PERPLEXITY_BIN:-/app/llama-perplexity}"

    local out_file err_file
    out_file=$(mktemp) || return 1
    err_file=$(mktemp) || { rm -f -- "$out_file"; return 1; }

    # The remaining positional parameters are the extra arguments. "$@" is
    # safe under `set -u` even when empty, unlike an empty array expansion on
    # bash releases older than 4.4.
    local rc=0
    timeout "$timeout_s" "$bin" \
        -m "$model" \
        -ngl "$ngl" \
        -fa on \
        -c "$ctx" \
        -ctk "$kv" -ctv "$kv" \
        -f "$PPL_FILE" \
        --chunks 1 \
        "$@" \
        > "$out_file" 2> "$err_file" || rc=$?

    # Only trust the output when the process succeeded AND stderr carries no
    # allocation-failure message.
    local ppl_val=""
    if (( rc == 0 )) && \
       ! grep -qi "out of memory\|failed to allocate\|cudaMalloc failed\|CUDA_ERROR_OUT_OF_MEMORY\|ggml_cuda_malloc\|cannot allocate memory" "$err_file"; then
        # Running estimates are printed as "[chunk]value,"; keep the last one.
        ppl_val=$(grep -oP '\[\d+\]\K[0-9.]+' "$out_file" | tail -1)
    fi
    rm -f -- "$out_file" "$err_file"

    [[ -n "$ppl_val" ]] || return 1
    printf '%s\n' "$ppl_val"
}

# ---------------------------------------------------------------------------
# Main driver.
# For every selected model: sweep its context candidates, measure perplexity
# with the f16 KV cache (baseline) and with each configured KV type, print a
# per-model table, then print a summary of the largest usable context per
# KV type.
# ---------------------------------------------------------------------------

# ppl_delta PPL BASELINE
# Prints PPL - BASELINE with an explicit sign and two decimals.
# Returns 0 when the absolute delta is below the 0.5 quality gate, non-zero
# otherwise (including when a value cannot be parsed, in which case nothing
# is printed). Values are passed via argv rather than spliced into the
# Python source.
ppl_delta() {
    python3 -c '
import sys
d = float(sys.argv[1]) - float(sys.argv[2])
print(f"{d:+.2f}")
sys.exit(0 if abs(d) < 0.5 else 1)
' "$1" "$2" 2>/dev/null
}

# Reject an unknown MODEL_KEY up front instead of silently printing an empty
# report. NGL has exactly one entry per known model key.
if [[ "$TARGET" != "all" && -z "${NGL[$TARGET]+set}" ]]; then
    echo "Unknown MODEL_KEY '$TARGET' (expected: smollm3 | e2b | e4b | q3 | qwen35q | all)" >&2
    exit 2
fi

# Without the input corpus every perplexity run would fail and be shown as
# "OOM", so stop early.
if ! ensure_ppl_file; then
    echo "Could not create PPL input file: $PPL_FILE" >&2
    exit 1
fi

echo "$HR"
echo "KV CACHE QUANT TEST (llama-perplexity) — TurboQuant image (FORCE_MMQ SM75)"
date
echo "GPU: $(nvidia-smi --query-gpu=name,memory.total --format=csv,noheader 2>/dev/null)"
echo "$HR"
echo "Standard models: f16 baseline | q8_0 | q4_0 | turbo2 (2-bit, 8x smaller KV vs f16)"
echo "Qwen3.5-9B:      f16 baseline | turbo2 | turbo3 | turbo4 (TurboQuant KV types)"
echo "Quality gate: PPL delta vs f16 < 0.5"
echo ""

declare -a SUMMARY=()

# Each entry is "key:label:model_path".
for entry in \
    "smollm3:SmolLM3-3B:$M_SMOL" \
    "e2b:Gemma4-E2B:$M_E2B" \
    "e4b:Gemma4-E4B:$M_E4B" \
    "q3:Qwen3-4B:$M_Q3" \
    "qwen35q:Qwen3.5-9B:$M_Q35"
do
    IFS=':' read -r key lbl model <<< "$entry"
    [[ "$TARGET" != "all" && "$TARGET" != "$key" ]] && continue

    # A missing model would make every run fail and be mislabelled as OOM.
    if [[ ! -f "$model" ]]; then
        printf "${YELLOW}=== %s: model file not found, skipping: %s ===${NC}\n\n" \
            "$lbl" "$model" >&2
        continue
    fi

    # CTX_CANDIDATES maps the key to the NAME of an array; resolve it with
    # indirect expansion rather than eval.
    ctx_ref="${CTX_CANDIDATES[$key]}[@]"
    ctxs=("${!ctx_ref}")
    ngl="${NGL[$key]}"
    timeout_s="${PPL_TIMEOUT[$key]}"
    IFS=' ' read -ra kv_types <<< "${MODEL_KV_TYPES[$key]}"

    # Hook for additional llama-perplexity arguments; currently empty for all
    # models. Expanded below in a form that is safe under `set -u` when empty.
    extra_args=()

    printf "${BLUE}=== %s (base ctx=%s, ngl=%s) ===${NC}\n" \
        "$lbl" "${BASE_CTX[$key]}" "$ngl"

    # Table header: one column per KV type configured for this model.
    printf "  %-10s  %-18s" "ctx" "f16 (PPL)"
    for kv in "${kv_types[@]}"; do
        printf "  %-20s" "$kv (PPL/delta)"
    done
    printf "\n"
    printf "  %-10s  %-18s" "---" "---------"
    for kv in "${kv_types[@]}"; do
        printf "  %-20s" "--------------------"
    done
    printf "\n"

    # best_ctx_per_kv: largest accepted ctx per KV type, seeded with the base ctx.
    # oom_kv:          1 once a KV type has failed; it is not retried afterwards.
    declare -A best_ctx_per_kv=([f16]="${BASE_CTX[$key]}")
    declare -A oom_kv=([f16]=0)
    for kv in "${kv_types[@]}"; do
        best_ctx_per_kv[$kv]="${BASE_CTX[$key]}"
        oom_kv[$kv]=0
    done

    for ctx in "${ctxs[@]}"; do
        printf "  %-10s" "$ctx"

        # f16 baseline for this ctx; stays empty when f16 is unavailable.
        f16_ppl=""
        if [[ "${oom_kv[f16]}" == "1" ]]; then
            printf "  ${RED}%-18s${NC}" "OOM"
        elif f16_ppl=$(run_ppl "$model" "$ngl" "f16" "$ctx" "$timeout_s" \
                ${extra_args[@]+"${extra_args[@]}"}); then
            printf "  ${GREEN}%-18s${NC}" "$f16_ppl"
            best_ctx_per_kv[f16]=$ctx
        else
            f16_ppl=""
            printf "  ${RED}%-18s${NC}" "OOM"
            oom_kv[f16]=1
        fi

        # KV type columns
        for kv in "${kv_types[@]}"; do
            if [[ "${oom_kv[$kv]}" == "1" ]]; then
                printf "  ${RED}%-20s${NC}" "OOM"
                continue
            fi
            if ! ppl=$(run_ppl "$model" "$ngl" "$kv" "$ctx" "$timeout_s" \
                    ${extra_args[@]+"${extra_args[@]}"}); then
                printf "  ${RED}%-20s${NC}" "OOM"
                oom_kv[$kv]=1
                continue
            fi

            if [[ -z "$f16_ppl" ]]; then
                # No f16 baseline at this ctx, so the gate cannot be
                # evaluated; the run itself succeeded, so the ctx is counted.
                best_ctx_per_kv[$kv]=$ctx
                printf "  ${GREEN}%-20s${NC}" "$ppl"
                continue
            fi

            if delta=$(ppl_delta "$ppl" "$f16_ppl"); then
                # Only a ctx that passes the quality gate may become the
                # recommended maximum for this KV type.
                best_ctx_per_kv[$kv]=$ctx
                printf "  ${GREEN}%-20s${NC}" "${ppl}(${delta})"
            else
                [[ -n "$delta" ]] || delta="?"
                printf "  ${YELLOW}%-20s${NC}" "${ppl}(${delta})"
            fi
        done
        echo ""
    done

    echo ""

    # Recommendation: the KV type with the largest accepted ctx; f16 at the
    # base ctx unless some other type strictly exceeds it.
    overall_best_ctx="${BASE_CTX[$key]}"
    overall_best_kv="f16"
    for kv in "${kv_types[@]}"; do
        bctx="${best_ctx_per_kv[$kv]}"
        SUMMARY+=("$lbl|$kv|$bctx")
        if (( bctx > overall_best_ctx )); then
            overall_best_ctx=$bctx
            overall_best_kv=$kv
        fi
    done
    SUMMARY+=("$lbl|f16|${best_ctx_per_kv[f16]}")
    printf "  ${GREEN}Best: %s → max ctx %s${NC}\n\n" "$overall_best_kv" "$overall_best_ctx"

    # Drop the per-model tables so the next iteration starts clean.
    unset best_ctx_per_kv oom_kv
done

echo "$HR"
echo "SUMMARY"
echo "$HR"
printf "%-16s  %-8s  %s\n" "Model" "KV" "Max Ctx (no OOM + PPL delta<0.5)"
printf "%-16s  %-8s  %s\n" "-----" "--" "---------------------------------"
for row in ${SUMMARY[@]+"${SUMMARY[@]}"}; do
    IFS='|' read -r lbl kv ctx <<< "$row"
    printf "${GREEN}%-16s  %-8s  %s${NC}\n" "$lbl" "$kv" "$ctx"
done
echo "$HR"
echo "Reminder: update envs/.env.<model>: CACHE_TYPE_K/V=<best_kv>  CTX_SIZE=<max_ctx>"
echo "$HR"