Initial commit: tuned multi-model llama.cpp stack
- 5 models: SmolLM3-3B, Gemma4-E2B/E4B, Qwen3-4B, Qwen3.5-9B
- TurboQuant image (FORCE_MMQ): +6-11% free speed on Turing GPUs
- Bigctx profiles (-nkvo KV in RAM): 2-16x context gain
- turbo2 KV: 2x smaller, benchmarked against PPL quality gate
- Per-model env files with justified parameters
- kv_quant_test.sh + cpu_ctx_test.sh benchmark scripts
- docs/FINDINGS.md: surprises, pitfalls, recommendations
- docs/ARCHITECTURE.md: compose + test script design
This commit is contained in:
251
scripts/cpu_ctx_test.sh
Normal file
251
scripts/cpu_ctx_test.sh
Normal file
@@ -0,0 +1,251 @@
|
||||
#!/bin/bash
# cpu_ctx_test.sh v4 — -nkvo bigctx with TurboQuant image (FORCE_MMQ)
# Image: local/llama-cpp-turboquant:full-cuda-sm75-mmq
#
# Tests KV in RAM (-nkvo) with BOTH q4_0 and turbo2 KV types.
# turbo2 = 2-bit KV (2x smaller than q4_0) → ~2x more context at same RAM budget.
#
# Speed model per token:
#   GPU-compute models (smollm3/e2b/e4b/q3): bottleneck = PCIe KV reads
#     t/s = 1000 / (gpu_ms + ctx * kv_bytes_per_token / PCIE_BPS * 1000)
#   Qwen3.5-9B: bottleneck = RAM reads (21/32 layers on CPU, 8.86 GB model)
#     t/s = 1000 / (1000/baseline + ctx * kv_bytes_per_token / RAM_BPS * 1000)
#
# Usage: bash /scripts/cpu_ctx_test.sh [smollm3|e2b|e4b|q3|qwen35q|all]
# Requires: bash >= 4 (associative arrays), python3, /app/llama-perplexity,
#           /app/llama-bench.

# NOTE: -e is not set. Probe helpers report failure through a non-zero status
# (e.g. check_alloc returns 1 on OOM) and the script must keep going.
set -uo pipefail

TARGET="${1:-all}"
case "$TARGET" in
  all|smollm3|e2b|e4b|q3|qwen35q) ;;
  *)
    # An unknown key would otherwise skip every model and print an empty report.
    printf 'Unknown target: %s\n' "$TARGET" >&2
    printf 'Usage: bash %s [smollm3|e2b|e4b|q3|qwen35q|all]\n' "$0" >&2
    exit 2
    ;;
esac

TARGET_TPS=15        # minimum acceptable t/s at 50% context fill
CPU_THREADS=6        # -t passed to llama-bench
BENCH_GEN=32         # tokens generated for the baseline measurement
PCIE_BW_GBPS=8.0     # PCIe x4 3.0 practical read BW (conservative)
# NOTE(review): PCIe 3.0 x4 peaks at roughly 3.9 GB/s; 8.0 GB/s is closer to an
# x8 link — confirm the actual link width before trusting the PCIe estimates.
RAM_BW_GBPS=45.0     # RAM practical read BW (i7-10750H DDR4-2933)

M_SMOL="/models/HuggingFaceTB_SmolLM3-3B-Q4_K_M.gguf"
M_E2B="/models/google_gemma-4-E2B-it-Q4_K_M.gguf"
M_E4B="/models/google_gemma-4-E4B-it-Q4_K_M.gguf"
M_Q3="/models/Qwen3-4B-Q4_K_M.gguf"
M_Q35="/models/Qwen3.5-9B.Q8_0.gguf"

# -ngl value used for each model key.
declare -A NGL_GPU=([smollm3]=99 [e2b]=99 [e4b]=42 [q3]=99 [qwen35q]=11)

# BW source: pcie for GPU-compute models, ram for qwen35-9b (CPU-compute bound)
declare -A BW_GBPS=(
  [smollm3]=$PCIE_BW_GBPS [e2b]=$PCIE_BW_GBPS [e4b]=$PCIE_BW_GBPS
  [q3]=$PCIE_BW_GBPS [qwen35q]=$RAM_BW_GBPS)
declare -A BW_LABEL=([smollm3]="PCIe" [e2b]="PCIe" [e4b]="PCIe" [q3]="PCIe" [qwen35q]="RAM")

# CTX candidates: larger now thanks to turbo2 (2x smaller KV vs q4_0)
# Note: turbo2 is SKIPPED for Qwen3-4B (PPL explodes at ctx>=8192: +0.52 → +13 → +437)
#       turbo2 is SKIPPED for Qwen3.5-9B (hybrid linear-attn incompatible with
#       llama-perplexity; server works fine at 32K — this is a test-tool
#       limitation, not a real issue)
SMOL_CTXS=(32768 49152 65536 98304 131072 163840)
E2B_CTXS=(32768 49152 65536 98304 131072 163840 196608 262144 393216)
E4B_CTXS=(32768 49152 65536 98304 131072 163840)
Q3_CTXS=(24576 32768 49152 65536 98304 131072)
Q35_CTXS=(16384 32768 49152 65536 98304 131072)

# Maps a model key to the NAME of its candidate array (resolved indirectly).
declare -A CTX_CANDIDATES=(
  [smollm3]="SMOL_CTXS" [e2b]="E2B_CTXS" [e4b]="E4B_CTXS"
  [q3]="Q3_CTXS" [qwen35q]="Q35_CTXS")

# Pure-GPU ctx for gain comparison
declare -A PURE_GPU_CTX=([smollm3]=24576 [e2b]=24576 [e4b]=24576 [q3]=16384 [qwen35q]=32768)

GREEN='\033[0;32m'; RED='\033[0;31m'; YELLOW='\033[1;33m'
BLUE='\033[0;34m'; CYAN='\033[0;36m'; NC='\033[0m'
HR="======================================================================"

# Tiny alloc file — enough for 1 chunk, minimal compute time.
# Created with mktemp (no predictable /tmp name) and removed on exit.
ALLOC_FILE=$(mktemp "${TMPDIR:-/tmp}/kv_alloc_tiny.XXXXXX") || {
  echo "cpu_ctx_test: mktemp failed" >&2
  exit 1
}
trap 'rm -f -- "$ALLOC_FILE"' EXIT

# 64 lines drawn from a fixed sentence pool with a fixed seed (deterministic).
if ! python3 - > "$ALLOC_FILE" <<'PY'
import random

sentences = [
    'The transformer architecture uses self-attention mechanisms to process sequences.',
    'Large language models require significant computational resources for training.',
    'Quantization reduces memory usage by storing weights in lower precision formats.',
    'Flash attention enables memory-efficient computation for long context windows.',
    'The key-value cache stores intermediate attention states during generation.',
]
random.seed(1)
print(chr(10).join([random.choice(sentences) for _ in range(64)]))
PY
then
  echo "cpu_ctx_test: failed to generate alloc file $ALLOC_FILE (python3 missing?)" >&2
  exit 1
fi
|
||||
|
||||
# check_alloc MODEL NGL KV CTX [EXTRA...]
#
# Probes whether a context of CTX tokens with KV cache type KV can be
# allocated with the KV cache kept in host RAM (-nkvo).
#
# Globals:   ALLOC_FILE (read) — text file fed to llama-perplexity.
# Arguments: MODEL  path to the .gguf model
#            NGL    value for -ngl
#            KV     cache type used for both -ctk and -ctv
#            CTX    context size for -c
#            EXTRA  optional extra arguments forwarded verbatim
# Outputs:   stdout: "<host_kv_mib>" on success ("?" when the value could not
#            be parsed from the tool's diagnostics), "OOM" on failure.
#            stderr: a warning when the probe hit the 90 s timeout.
# Returns:   1 when an out-of-memory message was detected, 0 otherwise.
#
# The exit status of llama-perplexity is NOT used to decide success; only its
# stderr is inspected (presumably because the tiny input file can make the
# tool exit non-zero after the allocation already happened — TODO confirm).
check_alloc() {
  local model=$1 ngl=$2 kv=$3 ctx=$4
  shift 4

  # Capture stderr directly (stdout is discarded) — no temp file to leak.
  local err rc
  err=$(timeout 90 /app/llama-perplexity \
    -m "$model" -ngl "$ngl" \
    -fa on -nkvo \
    -c "$ctx" -ctk "$kv" -ctv "$kv" \
    -f "$ALLOC_FILE" --chunks 1 \
    "$@" \
    2>&1 >/dev/null)
  rc=$?

  local oom_re='out of memory|failed to allocate|cudaMalloc failed'
  oom_re+='|CUDA_ERROR_OUT_OF_MEMORY|ggml_cuda_malloc'
  oom_re+='|cannot allocate memory|cannot create buffer'
  if grep -qiE -- "$oom_re" <<< "$err"; then
    echo "OOM"
    return 1
  fi

  # timeout(1) exits 124 when it had to kill the command: the probe never
  # finished, so whatever is reported below is only a best effort.
  if (( rc == 124 )); then
    printf 'check_alloc: probe timed out after 90s (ctx=%s kv=%s); result is unreliable\n' \
      "$ctx" "$kv" >&2
  fi

  # Parse Host context MiB: "| Host | total = model + context + compute |"
  # -> take the second addend of the first "Host" line that has that shape.
  local sum_re='=[[:space:]]*[0-9]+[[:space:]]*\+[[:space:]]*([0-9]+)[[:space:]]*\+'
  local line host_ctx_mib=""
  while IFS= read -r line; do
    [[ $line == *Host* ]] || continue
    if [[ $line =~ $sum_re ]]; then
      host_ctx_mib=${BASH_REMATCH[1]}
      break
    fi
  done <<< "$err"

  echo "${host_ctx_mib:-?}"
}
|
||||
|
||||
# measure_baseline_tps MODEL NGL [EXTRA...]
#
# Runs llama-bench once (-p 1, -n BENCH_GEN, q4_0 KV, -nkvo, flash-attn) and
# prints the average tokens/s of the generation row.
#
# Globals:   CPU_THREADS (read), BENCH_GEN (read)
# Arguments: MODEL  path to the .gguf model
#            NGL    value for -ngl
#            EXTRA  optional extra arguments forwarded verbatim
# Outputs:   the t/s value on stdout; nothing when the benchmark failed or no
#            matching row was found (callers treat empty output as failure).
#
# The CSV columns are located by header name (n_prompt, n_gen, avg_ts). If the
# header does not carry those names, the historical fixed positions 34/35/40
# are used. Fields are split with quote awareness so a quoted value containing
# a comma does not shift the columns.
measure_baseline_tps() {
  local model=$1 ngl=$2
  shift 2

  local raw
  # Best effort: a failing/timeout run simply yields empty output.
  raw=$(timeout 120 /app/llama-bench \
    -m "$model" -ngl "$ngl" -t "$CPU_THREADS" \
    -p 1 -n "$BENCH_GEN" \
    -ctk q4_0 -ctv q4_0 -nkvo 1 -fa 1 -r 1 -o csv \
    "$@" 2>/dev/null) || true

  printf '%s\n' "$raw" | awk '
    # Split one CSV record into out[1..n]; double quotes are dropped and
    # commas inside a quoted section do not separate fields.
    function split_csv(line, out,    n, i, ch, quoted, cur) {
      split("", out)
      n = 0; cur = ""; quoted = 0
      for (i = 1; i <= length(line); i++) {
        ch = substr(line, i, 1)
        if (ch == "\"") { quoted = !quoted; continue }
        if (ch == "," && !quoted) { out[++n] = cur; cur = ""; continue }
        cur = cur ch
      }
      out[++n] = cur
      return n
    }

    NR == 1 {
      n = split_csv($0, hdr)
      for (i = 1; i <= n; i++) {
        if (hdr[i] == "n_prompt") c_prompt = i
        else if (hdr[i] == "n_gen") c_gen = i
        else if (hdr[i] == "avg_ts") c_ts = i
      }
      if (!(c_prompt && c_gen && c_ts)) { c_prompt = 34; c_gen = 35; c_ts = 40 }
      next
    }

    {
      split_csv($0, f)
      # Generation row: no prompt tokens, some generated tokens, positive rate.
      if (f[c_prompt] == "0" && f[c_gen] + 0 > 0 && f[c_ts] + 0 > 0) {
        print f[c_ts] + 0
        exit
      }
    }'
}
|
||||
|
||||
# estimate_tps BASELINE_TPS KV_PER_TOKEN_MIB CTX BW_GBPS
#
# Estimates tokens/s when CTX tokens of KV cache must be read per generated
# token over a link of BW_GBPS:
#   base_ms = 1000 / BASELINE_TPS
#   kv_ms   = CTX * KV_PER_TOKEN_MIB * 1024 * 1024 / (BW_GBPS * 1e9) * 1000
#   t/s     = 1000 / (base_ms + kv_ms)
#
# Arguments: BASELINE_TPS      measured t/s with an empty KV cache
#            KV_PER_TOKEN_MIB  KV cache size per token, in MiB
#            CTX               number of tokens in the cache (integer)
#            BW_GBPS           assumed read bandwidth in GB/s
# Outputs:   the estimate with one decimal on stdout, or "?" when an input is
#            not numeric (e.g. "?"), or baseline/bandwidth is not positive.
# Returns:   always 0.
#
# Inputs are validated and handed to awk as data (-v) instead of being pasted
# into interpreter source code.
estimate_tps() {
  local baseline_tps=$1 kv_per_token_mib=$2 ctx=$3 bw_gbps=$4
  local num_re='^[+-]?([0-9]+\.?[0-9]*|\.[0-9]+)([eE][+-]?[0-9]+)?$'
  local int_re='^[0-9]+$'

  if ! [[ $baseline_tps =~ $num_re && $kv_per_token_mib =~ $num_re \
       && $bw_gbps =~ $num_re && $ctx =~ $int_re ]]; then
    echo "?"
    return 0
  fi

  # LC_ALL=C keeps "." as the decimal separator regardless of the locale.
  LC_ALL=C awk \
    -v baseline="$baseline_tps" -v kv_mib="$kv_per_token_mib" \
    -v ctx="$ctx" -v bw="$bw_gbps" '
    BEGIN {
      baseline += 0; kv_mib += 0; ctx += 0; bw += 0
      kv_tok_bytes = kv_mib * 1024 * 1024
      bps = bw * 1e9
      if (baseline <= 0 || bps <= 0) { print "?"; exit }
      base_ms = 1000.0 / baseline
      kv_ms = ctx * kv_tok_bytes / bps * 1000
      total_ms = base_ms + kv_ms
      if (total_ms <= 0) { print "?"; exit }
      printf "%.1f\n", 1000.0 / total_ms
    }' 2>/dev/null || echo "?"
}
|
||||
|
||||
# ---------------------------------------------------------------------------
# Main driver.
#
# For every selected model:
#   1. measure the empty-KV baseline t/s (measure_baseline_tps);
#   2. for each candidate ctx and KV type, probe allocation (check_alloc) and
#      estimate t/s at 25% / 50% / 100% context fill (estimate_tps);
#   3. record the largest ctx that allocated ("max") and the largest ctx whose
#      50%-fill estimate reaches TARGET_TPS ("rec"). When no ctx reaches the
#      target, "rec" falls back to "max".
# A summary table over all tested models is printed at the end.
# ---------------------------------------------------------------------------
echo "$HR"
echo "CPU-RAM KV CONTEXT TEST v4 (-nkvo, TurboQuant FORCE_MMQ) -- $(date)"
echo "GPU: $(nvidia-smi --query-gpu=name,memory.total --format=csv,noheader 2>/dev/null)"
echo "KV types tested: q4_0 (4-bit) and turbo2 (2-bit, 2x smaller → 2x more ctx)"
printf "PCIe assumption: %.1f GB/s | RAM assumption: %.1f GB/s\n" "$PCIE_BW_GBPS" "$RAM_BW_GBPS"
echo "$HR"
echo ""

# One row per model:
#   label|baseline|q4_0 max|q4_0 rec|turbo2 max|turbo2 rec|q4_0 gain|turbo2 gain
declare -a SUMMARY=()

for entry in \
  "smollm3:SmolLM3-3B:$M_SMOL" \
  "e2b:Gemma4-E2B:$M_E2B" \
  "e4b:Gemma4-E4B:$M_E4B" \
  "q3:Qwen3-4B:$M_Q3" \
  "qwen35q:Qwen3.5-9B:$M_Q35"
do
  IFS=':' read -r key lbl model <<< "$entry"
  if [[ "$TARGET" != "all" && "$TARGET" != "$key" ]]; then
    continue
  fi

  # CTX_CANDIDATES holds the NAME of the array; resolve it without eval.
  ctx_list_ref="${CTX_CANDIDATES[$key]}[@]"
  ctxs=("${!ctx_list_ref}")
  ngl="${NGL_GPU[$key]}"
  bw_gbps="${BW_GBPS[$key]}"
  bw_label="${BW_LABEL[$key]}"

  # turbo2 incompatible with Qwen3-4B (quality fails at ctx>=8192)
  # turbo2 alloc works for Qwen3.5-9B but quality measurement unreliable — test q4_0 only
  if [[ "$key" == "q3" || "$key" == "qwen35q" ]]; then
    kv_types_to_test=(q4_0)
    # turbo2 was never attempted: do not report it as OOM/FAIL.
    t2_none_max="skipped"
    t2_none_rec="skipped"
  else
    kv_types_to_test=(q4_0 turbo2)
    t2_none_max="OOM"
    t2_none_rec="FAIL"
  fi

  # Hook for per-model extra CLI arguments (currently none).
  extra_args=()

  printf "${BLUE}=== %s (ngl=%s, BW model: %s %.0f GB/s) ===${NC}\n" \
    "$lbl" "$ngl" "$bw_label" "$bw_gbps"

  # Baseline t/s (empty KV, with q4_0 -nkvo — upper bound)
  printf "  Measuring baseline t/s (empty KV, p=1)... "
  baseline_tps=$(measure_baseline_tps "$model" "$ngl" ${extra_args[@]+"${extra_args[@]}"})
  if [[ -z "$baseline_tps" ]]; then
    printf "${RED}FAIL${NC}\n\n"
    # 8 fields, matching what the summary loop reads.
    SUMMARY+=("$lbl|FAIL|FAIL|FAIL|FAIL|FAIL|?|?")
    continue
  fi
  printf "${GREEN}%s t/s${NC}\n\n" "$baseline_tps"

  # Header
  printf "  %-10s %-12s %-12s %-12s %-12s %-12s %-12s\n" \
    "ctx" "KV type" "KV in RAM" "kv/tok" "t/s@25%" "t/s@50%" "t/s@100%"
  printf "  %-10s %-12s %-12s %-12s %-12s %-12s %-12s\n" \
    "---" "-------" "---------" "------" "-------" "-------" "--------"

  max_ctx_q4=""
  max_ctx_t2=""
  rec_q4=""
  rec_t2=""
  # Last successfully parsed MiB-per-token for each KV type; reused when a
  # later probe allocates but its host KV size cannot be parsed.
  declare -A kv_ref_mib=()

  for ctx in "${ctxs[@]}"; do
    for kv_type in "${kv_types_to_test[@]}"; do
      result=$(check_alloc "$model" "$ngl" "$kv_type" "$ctx" ${extra_args[@]+"${extra_args[@]}"})
      if [[ "$result" == "OOM" ]]; then
        printf "  ${RED}%-10s %-12s OOM${NC}\n" "$ctx" "$kv_type"
        continue
      fi

      host_kv_mib="${result}"
      if [[ "$kv_type" == "q4_0" ]]; then
        max_ctx_q4=$ctx
      else
        max_ctx_t2=$ctx
      fi

      # KV per token
      if [[ "$host_kv_mib" =~ ^[0-9]+$ ]]; then
        kv_per_token_mib=$(python3 -c "print(f'{$host_kv_mib / $ctx:.6f}')")
        kv_ref_mib[$kv_type]=$kv_per_token_mib
      else
        kv_per_token_mib="${kv_ref_mib[$kv_type]:-?}"
      fi

      tps25=$(estimate_tps "$baseline_tps" "$kv_per_token_mib" "$(( ctx / 4 ))" "$bw_gbps")
      tps50=$(estimate_tps "$baseline_tps" "$kv_per_token_mib" "$(( ctx / 2 ))" "$bw_gbps")
      tps100=$(estimate_tps "$baseline_tps" "$kv_per_token_mib" "$ctx" "$bw_gbps")

      # 1 when the 50%-fill estimate is known and reaches the target.
      meets=$(python3 -c "print(1 if '$tps50' != '?' and float('$tps50') >= $TARGET_TPS else 0)" 2>/dev/null || echo 0)
      if [[ "$meets" == "1" ]]; then
        # Candidates are ascending, so the last hit is the largest one.
        case "$kv_type" in
          q4_0)   rec_q4=$ctx ;;
          turbo2) rec_t2=$ctx ;;
        esac
        color=$GREEN
      else
        color=$YELLOW
      fi

      printf "  ${color}%-10s${NC} %-12s %-12s %-12s %-12s ${color}%-12s${NC} %-12s\n" \
        "$ctx" "$kv_type" "${host_kv_mib}MiB" "${kv_per_token_mib}MiB" \
        "$tps25" "$tps50" "$tps100"
    done
  done

  # No ctx met the target: fall back to the largest ctx that allocated.
  rec_q4="${rec_q4:-$max_ctx_q4}"
  rec_t2="${rec_t2:-$max_ctx_t2}"
  pg="${PURE_GPU_CTX[$key]}"

  printf "\n  Recommended ctx (>=%s t/s@50%%): q4_0=%s  turbo2=%s  (pure-GPU was %s)\n\n" \
    "$TARGET_TPS" "${rec_q4:-FAIL}" "${rec_t2:-$t2_none_rec}" "$pg"

  # Gain = recommended ctx minus the pure-GPU ctx, in tokens ("?" if unknown).
  if [[ -n "$rec_q4" ]]; then gain_q4=$(( rec_q4 - pg )); else gain_q4="?"; fi
  if [[ -n "$rec_t2" ]]; then gain_t2=$(( rec_t2 - pg )); else gain_t2="?"; fi
  SUMMARY+=("$lbl|$baseline_tps|${max_ctx_q4:-OOM}|${rec_q4:-FAIL}|${max_ctx_t2:-$t2_none_max}|${rec_t2:-$t2_none_rec}|$gain_q4|$gain_t2")

  unset kv_ref_mib max_ctx_q4 max_ctx_t2 rec_q4 rec_t2
done

echo "$HR"
echo "SUMMARY — -nkvo (KV in RAM): q4_0 vs turbo2"
echo "$HR"
printf "%-16s %-12s %-14s %-14s %-14s %-14s\n" \
  "Model" "Baseline t/s" "q4_0 max" "q4_0 rec" "turbo2 max" "turbo2 rec"
printf "%-16s %-12s %-14s %-14s %-14s %-14s\n" \
  "-----" "------------" "--------" "--------" "----------" "----------"
for row in ${SUMMARY[@]+"${SUMMARY[@]}"}; do
  IFS='|' read -r lbl btps max_q4 rec_q4 max_t2 rec_t2 g_q4 g_t2 <<< "$row"
  printf "${GREEN}%-16s %-12s %-14s %-14s %-14s %-14s [q4+%s / t2+%s vs pure-GPU]${NC}\n" \
    "$lbl" "$btps" "$max_q4" "$rec_q4" "$max_t2" "$rec_t2" "$g_q4" "$g_t2"
done
echo "$HR"
echo "Note: Qwen3.5-9B baseline already <15 t/s (RAM-bound, 8.86 GB model). BW model uses RAM not PCIe."
echo "$HR"
|
||||
Reference in New Issue
Block a user