Initial commit: tuned multi-model llama.cpp stack

- 5 models: SmolLM3-3B, Gemma4-E2B/E4B, Qwen3-4B, Qwen3.5-9B
- TurboQuant image (FORCE_MMQ): +6-11% free speed on Turing GPUs
- Bigctx profiles (-nkvo KV in RAM): 2-16x context gain
- turbo2 KV: 2x smaller, benchmarked against PPL quality gate
- Per-model env files with justified parameters
- kv_quant_test.sh + cpu_ctx_test.sh benchmark scripts
- docs/FINDINGS.md: surprises, pitfalls, recommendations
- docs/ARCHITECTURE.md: compose + test script design
2026-05-06 15:56:40 +02:00
commit 4ad296608b
22 changed files with 2530 additions and 0 deletions
--- a/scripts/kv_quant_test.sh
+++ b/scripts/kv_quant_test.sh
@@ -0,0 +1,246 @@
+#!/bin/bash
+# KV cache quantization test using llama-perplexity.
+# Image: local/llama-cpp-turboquant:full-cuda-sm75-mmq (FORCE_MMQ, turbo2/3/4 support)
+#
+# Tests KV types: f16 (baseline) + q8_0/q4_0/turbo2 for Q4_K_M models
+#                 f16 (baseline) + turbo2/3/4 for Qwen3.5-9B Q8_0
+# Quality gate: PPL delta vs f16 < 0.5 (lossless for practical use)
+#
+# Usage: bash /scripts/kv_quant_test.sh [MODEL_KEY]
+#   MODEL_KEY: smollm3 | e2b | e4b | q3 | qwen35q | all (default: all)
+#   Exits with status 2 when MODEL_KEY is not one of the values above.
+
+# No `-e` on purpose: failed perplexity runs are an expected outcome and are
+# handled by inspecting the status returned by run_ppl.
+set -uo pipefail
+
+TARGET="${1:-all}"
+
+# Reject unknown keys up front; otherwise the model loop would skip every
+# entry and the script would print an empty summary with exit status 0.
+case "$TARGET" in
+    all|smollm3|e2b|e4b|q3|qwen35q) ;;
+    *)
+        printf 'Unknown MODEL_KEY: %s\n' "$TARGET" >&2
+        printf 'Usage: bash %s [smollm3|e2b|e4b|q3|qwen35q|all]\n' "$0" >&2
+        exit 2
+        ;;
+esac
+
+# GGUF model paths, one per model key.
+M_SMOL="/models/HuggingFaceTB_SmolLM3-3B-Q4_K_M.gguf"
+M_E2B="/models/google_gemma-4-E2B-it-Q4_K_M.gguf"
+M_E4B="/models/google_gemma-4-E4B-it-Q4_K_M.gguf"
+M_Q3="/models/Qwen3-4B-Q4_K_M.gguf"
+M_Q35="/models/Qwen3.5-9B.Q8_0.gguf"
+
+# Per-model settings, indexed by model key:
+#   NGL         value passed to llama-perplexity as -ngl
+#   BASE_CTX    context size shown in the per-model report header
+#   PPL_TIMEOUT limit handed to timeout(1) for a single perplexity run
+declare -A NGL=([smollm3]=99 [e2b]=99 [e4b]=42 [q3]=99 [qwen35q]=11)
+declare -A BASE_CTX=([smollm3]=18432 [e2b]=32768 [e4b]=20480 [q3]=8192 [qwen35q]=8192)
+declare -A PPL_TIMEOUT=([smollm3]=300 [e2b]=300 [e4b]=300 [q3]=300 [qwen35q]=600)
+
+# Per-model KV types to test (f16 is always the baseline)
+# Standard Q4_K_M models: q8_0/q4_0 + turbo2 (all supported by TurboQuant image)
+# Qwen3.5-9B: designed for turbo KV — test turbo2/3/4 only (q4_0 would also work but less relevant)
+declare -A MODEL_KV_TYPES=(
+    [smollm3]="q8_0 q4_0 turbo2"
+    [e2b]="q8_0 q4_0 turbo2"
+    [e4b]="q8_0 q4_0 turbo2"
+    [q3]="q8_0 q4_0 turbo2"
+    [qwen35q]="turbo2 turbo3 turbo4"
+)
+
+# ctx candidates per model, in ascending order
+SMOL_CTXS=(8192 12288 16384 18432 20480 24576 32768 40960 49152)
+E2B_CTXS=(8192 16384 24576 32768 40960 49152 65536)
+E4B_CTXS=(8192 12288 16384 20480 24576 32768 40960)
+Q3_CTXS=(4096 6144 8192 10240 12288 16384 24576 32768)
+Q35_CTXS=(4096 8192 16384 24576 32768 40960 49152)
+# Maps each model key to the NAME of its candidate array above.
+declare -A CTX_CANDIDATES=(
+    [smollm3]="SMOL_CTXS" [e2b]="E2B_CTXS" [e4b]="E4B_CTXS"
+    [q3]="Q3_CTXS" [qwen35q]="Q35_CTXS")
+
+# ANSI colour escapes (expanded by printf when used inside a format string)
+# and the horizontal rule used by the report.
+GREEN='\033[0;32m'; RED='\033[0;31m'; YELLOW='\033[1;33m'; BLUE='\033[0;34m'; NC='\033[0m'
+HR="======================================================================"
+
+# Synthetic PPL file — 4000 lines, deterministic, no network needed
+PPL_FILE="/tmp/kv_ppl_input.txt"
+
+# ensure_ppl_file
+# Creates $PPL_FILE unless it already exists with non-zero size.
+# The text is 4000 sentences drawn from a fixed list with a seeded RNG.
+# Returns 0 if the file is present afterwards, otherwise python3's status.
+ensure_ppl_file() {
+    # -s (not -f): an empty leftover from an interrupted run must be rebuilt.
+    [[ -s "$PPL_FILE" ]] && return 0
+    # The destination is passed as argv[1] so the path is defined only once.
+    python3 - "$PPL_FILE" << 'PY'
+import os
+import random
+import sys
+
+out_path = sys.argv[1]
+random.seed(42)
+sentences = [
+    "The transformer architecture uses self-attention mechanisms to process sequences.",
+    "Large language models require significant computational resources for training.",
+    "Quantization reduces memory usage by storing weights in lower precision formats.",
+    "Flash attention enables memory-efficient computation for long context windows.",
+    "The key-value cache stores intermediate attention states during generation.",
+    "Context length determines how many tokens the model can attend to simultaneously.",
+    "Perplexity measures how well a probability model predicts a sample of text.",
+    "Lower perplexity values indicate better language modeling performance overall.",
+    "GPU memory bandwidth is the primary bottleneck for autoregressive token generation.",
+    "Grouped query attention reduces KV cache size by sharing keys across head groups.",
+    "Rotary position embeddings encode relative position information in attention queries.",
+    "Mixture of experts models route tokens through specialized feed-forward networks.",
+    "Continuous batching allows servers to process multiple requests simultaneously.",
+    "KV cache quantization trades a small quality loss for significantly larger contexts.",
+]
+lines = [random.choice(sentences) for _ in range(4000)]
+
+# Write to a sibling temp file and rename it into place, so a failed or
+# interrupted run never leaves a truncated input file behind.
+tmp_path = out_path + ".tmp"
+with open(tmp_path, "w") as fh:
+    fh.write("\n".join(lines) + "\n")
+os.replace(tmp_path, out_path)
+PY
+}
+
+# run_ppl MODEL NGL KV CTX TIMEOUT [EXTRA_ARGS...]
+# Runs /app/llama-perplexity once on $PPL_FILE (one chunk, flash attention on)
+# using KV for both the K and V cache types, under a timeout(1) limit.
+#   MODEL       path to the GGUF model
+#   NGL         value for -ngl
+#   KV          cache type passed to both -ctk and -ctv
+#   CTX         context size passed to -c
+#   TIMEOUT     limit handed to timeout(1)
+#   EXTRA_ARGS  optional extra arguments appended to the command line
+# Echoes PPL value on stdout, returns 0 on success.
+# Returns 1 on any failure: non-zero exit (crash, timeout), an allocation
+# error reported on stderr, or no PPL value found in the output.
+run_ppl() {
+    local model=$1 ngl=$2 kv=$3 ctx=$4 timeout_s=$5
+    shift 5
+
+    # Build the command as an array and append "$@" directly. Expanding a
+    # separate, possibly empty array would abort under `set -u` on bash
+    # releases older than 4.4.
+    local -a cmd=(
+        timeout "$timeout_s" /app/llama-perplexity
+        -m "$model"
+        -ngl "$ngl"
+        -fa on
+        -c "$ctx"
+        -ctk "$kv" -ctv "$kv"
+        -f "$PPL_FILE"
+        --chunks 1
+        "$@"
+    )
+
+    local err_file out_file
+    err_file=$(mktemp) || return 1
+    out_file=$(mktemp) || { rm -f -- "$err_file"; return 1; }
+
+    "${cmd[@]}" > "$out_file" 2> "$err_file"
+    local rc=$?
+
+    local ppl_val=""
+    # A zero exit status is not trusted on its own: allocation failures
+    # reported on stderr also count as a failed run.
+    if [[ $rc -eq 0 ]] && \
+       ! grep -qiE 'out of memory|failed to allocate|cudaMalloc failed|CUDA_ERROR_OUT_OF_MEMORY|ggml_cuda_malloc|cannot allocate memory' "$err_file"; then
+        # Take the number following the last "[N]" marker on stdout.
+        ppl_val=$(grep -oP '\[\d+\]\K[0-9.]+' "$out_file" | tail -n 1)
+    fi
+    rm -f -- "$err_file" "$out_file"
+
+    [[ -n "$ppl_val" ]] || return 1
+    printf '%s\n' "$ppl_val"
+}
+
+# ---------------------------------------------------------------------------
+# Make sure the perplexity input exists, then print the report banner.
+ensure_ppl_file
+
+# Empty when nvidia-smi is missing or fails; its stderr is discarded.
+gpu_info=$(nvidia-smi --query-gpu=name,memory.total --format=csv,noheader 2>/dev/null)
+
+printf '%s\n' \
+    "$HR" \
+    "KV CACHE QUANT TEST (llama-perplexity) — TurboQuant image (FORCE_MMQ SM75)" \
+    "$(date)" \
+    "GPU: $gpu_info" \
+    "$HR" \
+    "Standard models: f16 baseline | q8_0 | q4_0 | turbo2 (2-bit, 8x smaller KV vs f16)" \
+    "Qwen3.5-9B:      f16 baseline | turbo2 | turbo3 | turbo4 (TurboQuant KV types)" \
+    "Quality gate: PPL delta vs f16 < 0.5" \
+    ""
+
+# Rows of the final table, each stored as "label|kv|max_ctx".
+declare -a SUMMARY=()
+
+# ppl_delta_verdict PPL BASE_PPL
+# Prints "<delta> <verdict>": delta is PPL - BASE_PPL as a signed two-decimal
+# number, verdict is "ok" when |delta| < 0.5 and "bad" otherwise.
+# Prints "? bad" when either value cannot be parsed as a float.
+# Values are passed as arguments rather than spliced into the Python source.
+ppl_delta_verdict() {
+    python3 -c 'import sys; d = float(sys.argv[1]) - float(sys.argv[2]); print(f"{d:+.2f}", "ok" if abs(d) < 0.5 else "bad")' \
+        "$1" "$2" 2>/dev/null || echo "? bad"
+}
+
+# Main sweep: for every selected model, walk its ctx candidates in order and
+# measure PPL for the f16 baseline plus each configured KV type. Once a KV
+# type fails at some ctx, it is not retried at the following candidates.
+for entry in \
+    "smollm3:SmolLM3-3B:$M_SMOL" \
+    "e2b:Gemma4-E2B:$M_E2B" \
+    "e4b:Gemma4-E4B:$M_E4B" \
+    "q3:Qwen3-4B:$M_Q3" \
+    "qwen35q:Qwen3.5-9B:$M_Q35"
+do
+    IFS=':' read -r key lbl model <<< "$entry"
+    [[ "$TARGET" != "all" && "$TARGET" != "$key" ]] && continue
+
+    # CTX_CANDIDATES holds the NAME of the candidate array; indirect
+    # expansion copies its elements without resorting to eval.
+    ctx_ref="${CTX_CANDIDATES[$key]}[@]"
+    ctxs=("${!ctx_ref}")
+    ngl="${NGL[$key]}"
+    timeout_s="${PPL_TIMEOUT[$key]}"
+    IFS=' ' read -ra kv_types <<< "${MODEL_KV_TYPES[$key]}"
+
+    # Extra args for qwen35-9b (flash attn already set; no mlock needed for PPL correctness)
+    extra_args=()
+
+    printf "${BLUE}=== %s (base ctx=%s, ngl=%s) ===${NC}\n" \
+        "$lbl" "${BASE_CTX[$key]}" "$ngl"
+
+    # Dynamic header based on KV types for this model
+    printf "  %-10s  %-18s" "ctx" "f16 (PPL)"
+    for kv in "${kv_types[@]}"; do
+        printf "  %-20s" "$kv (PPL/delta)"
+    done
+    printf "\n"
+    printf "  %-10s  %-18s" "---" "---------"
+    for kv in "${kv_types[@]}"; do
+        printf "  %-20s" "--------------------"
+    done
+    printf "\n"
+
+    # best_ctx_per_kv: largest ctx at which the KV type ran and was accepted
+    #                  (0 = never); oom_kv: 1 once the type has failed a run.
+    declare -A best_ctx_per_kv=([f16]=0)
+    declare -A oom_kv=([f16]=0)
+    for kv in "${kv_types[@]}"; do
+        best_ctx_per_kv[$kv]=0
+        oom_kv[$kv]=0
+    done
+
+    for ctx in "${ctxs[@]}"; do
+        printf "  %-10s" "$ctx"
+
+        # f16 baseline. The ${arr[@]+...} form keeps the empty extra_args
+        # array from tripping `set -u` on bash releases older than 4.4.
+        f16_ppl=""
+        if [[ "${oom_kv[f16]}" == "1" ]]; then
+            printf "  ${RED}%-18s${NC}" "OOM"
+        elif f16_ppl=$(run_ppl "$model" "$ngl" "f16" "$ctx" "$timeout_s" ${extra_args[@]+"${extra_args[@]}"}); then
+            printf "  ${GREEN}%-18s${NC}" "$f16_ppl"
+            best_ctx_per_kv[f16]=$ctx
+        else
+            f16_ppl=""
+            printf "  ${RED}%-18s${NC}" "OOM"
+            oom_kv[f16]=1
+        fi
+
+        # KV type columns
+        for kv in "${kv_types[@]}"; do
+            if [[ "${oom_kv[$kv]}" == "1" ]]; then
+                printf "  ${RED}%-20s${NC}" "OOM"
+                continue
+            fi
+            if ! ppl=$(run_ppl "$model" "$ngl" "$kv" "$ctx" "$timeout_s" ${extra_args[@]+"${extra_args[@]}"}); then
+                printf "  ${RED}%-20s${NC}" "OOM"
+                oom_kv[$kv]=1
+                continue
+            fi
+
+            if [[ -z "$f16_ppl" ]]; then
+                # No f16 baseline at this ctx, so the delta cannot be
+                # computed; the run is accepted on "it fits" alone.
+                printf "  ${GREEN}%-20s${NC}" "$ppl"
+                best_ctx_per_kv[$kv]=$ctx
+                continue
+            fi
+
+            read -r delta ok <<< "$(ppl_delta_verdict "$ppl" "$f16_ppl")"
+            if [[ "$ok" == "ok" ]]; then
+                printf "  ${GREEN}%-20s${NC}" "${ppl}(${delta})"
+                # Only gate-passing runs may raise the recommended ctx.
+                best_ctx_per_kv[$kv]=$ctx
+            else
+                printf "  ${YELLOW}%-20s${NC}" "${ppl}(${delta})"
+            fi
+        done
+        echo ""
+    done
+
+    echo ""
+
+    # Best recommendation: the KV type with the largest accepted ctx.
+    # f16 is the starting point, so a quantized type must strictly beat it.
+    overall_best_kv="f16"
+    overall_best_ctx="${best_ctx_per_kv[f16]}"
+    for kv in "${kv_types[@]}"; do
+        bctx="${best_ctx_per_kv[$kv]}"
+        SUMMARY+=("$lbl|$kv|$bctx")
+        if (( bctx > overall_best_ctx )); then
+            overall_best_ctx=$bctx; overall_best_kv=$kv
+        fi
+    done
+    SUMMARY+=("$lbl|f16|${best_ctx_per_kv[f16]}")
+    if (( overall_best_ctx > 0 )); then
+        printf "  ${GREEN}Best: %s → max ctx %s${NC}\n\n" "$overall_best_kv" "$overall_best_ctx"
+    else
+        printf "  ${RED}Best: none (no KV type completed a run)${NC}\n\n"
+    fi
+
+    unset best_ctx_per_kv oom_kv
+done
+
+echo "$HR"
+echo "SUMMARY"
+echo "$HR"
+printf "%-16s  %-8s  %s\n" "Model" "KV" "Max Ctx (no OOM + PPL delta<0.5)"
+printf "%-16s  %-8s  %s\n" "-----" "--" "---------------------------------"
+for row in "${SUMMARY[@]}"; do
+    IFS='|' read -r lbl kv ctx <<< "$row"
+    printf "${GREEN}%-16s  %-8s  %s${NC}\n" "$lbl" "$kv" "$ctx"
+done
+echo "$HR"
+echo "Reminder: update envs/.env.<model>: CACHE_TYPE_K/V=<best_kv>  CTX_SIZE=<max_ctx>"
+echo "$HR"