Initial commit: tuned multi-model llama.cpp stack
- 5 models: SmolLM3-3B, Gemma4-E2B/E4B, Qwen3-4B, Qwen3.5-9B
- TurboQuant image (FORCE_MMQ): +6-11% free speed on Turing GPUs
- Bigctx profiles (-nkvo KV in RAM): 2-16x context gain
- turbo2 KV: 2x smaller, benchmarked against PPL quality gate
- Per-model env files with justified parameters
- kv_quant_test.sh + cpu_ctx_test.sh benchmark scripts
- docs/FINDINGS.md: surprises, pitfalls, recommendations
- docs/ARCHITECTURE.md: compose + test script design
This commit is contained in:
215
scripts/quality_test.sh
Normal file
215
scripts/quality_test.sh
Normal file
@@ -0,0 +1,215 @@
|
||||
#!/bin/bash
# Quality tests for all 4 models — runs inside full-cuda container.
# Tests: coding tasks + needle-in-haystack at 1K/8K ctx.
#
# Inference parameters sourced from official HF model cards:
#   SmolLM3: /no_think in SYSTEM prompt (-sys); temp=0.6 top_p=0.95
#   Qwen3:   /no_think in SYSTEM prompt (-sys); temp=0.7 top_p=0.8 top_k=20
#            DO NOT use greedy (temp=0) — causes endless repetition per Qwen3 docs
#   Gemma4:  no /no_think switch is sent (its system prompt is left empty);
#            temp=0.7 top_p=0.95
#
# NOTE(review): errexit (-e) is not enabled. Presumably this is deliberate so a
# failed or timed-out model run is reported as a FAIL instead of aborting the
# whole suite — confirm before adding -e.

set -uo pipefail
|
||||
|
||||
# Directory that holds the GGUF model files. Defaults to the path used inside
# the container; override MODELS_DIR to run against models stored elsewhere.
MODELS_DIR="${MODELS_DIR:-/models}"

M_SMOL="${MODELS_DIR}/HuggingFaceTB_SmolLM3-3B-Q4_K_M.gguf"
M_E2B="${MODELS_DIR}/google_gemma-4-E2B-it-Q4_K_M.gguf"
M_E4B="${MODELS_DIR}/google_gemma-4-E4B-it-Q4_K_M.gguf"
M_Q3="${MODELS_DIR}/Qwen3-4B-Q4_K_M.gguf"

# Per-model tables, all indexed by the same short key (smollm3, e2b, e4b, q3).
# NGL     - value handed to llama-cli's -ngl option.
# MAX_CTX - context ceiling; needle tests at or above it are skipped.
declare -A NGL=([smollm3]=99 [e2b]=99 [e4b]=42 [q3]=99)
declare -A MAX_CTX=([smollm3]=24576 [e2b]=32768 [e4b]=24576 [q3]=8192)

# Per-model sampling params (HF model card sources).
# A TOPK value of "0" means "do not pass --top-k at all".
declare -A TEMP=([smollm3]="0.6" [e2b]="0.7" [e4b]="0.7" [q3]="0.7")
declare -A TOPP=([smollm3]="0.95" [e2b]="0.95" [e4b]="0.95" [q3]="0.8")
declare -A TOPK=([smollm3]="0" [e2b]="0" [e4b]="0" [q3]="20")
# /no_think in system prompt disables thinking for SmolLM3 and Qwen3.
# An empty entry means no -sys argument is sent for that model.
declare -A SYSP=([smollm3]="/no_think" [e2b]="" [e4b]="" [q3]="/no_think")

# Running test counters, updated by check() and the main loop.
PASS=0
FAIL=0
TOTAL=0

# ANSI colour escapes; stored with literal backslashes and expanded by printf.
GREEN='\033[0;32m'
RED='\033[0;31m'
YELLOW='\033[1;33m'
NC='\033[0m'
|
||||
|
||||
# sed script to strip llama-cli interactive UI banner from stdout.
# ▄ (U+2584) and █ (U+2588) appear in the llama.cpp ASCII logo, sometimes
# with leading spaces — match anywhere on the line to be safe.
# The two glyphs are matched as separate literal patterns rather than as a
# bracket expression: in a C/POSIX locale a bracket would match their
# individual bytes and also delete lines that merely contain other UTF-8
# punctuation (em dashes, curly quotes) sharing the 0xE2 lead byte.
STRIP_BANNER='/^[[:space:]]*$/d
/^Loading model/d
/▄/d
/█/d
/^build /d
/^model /d
/^modalities/d
/^available commands/d
/^ \//d
/^\[ Prompt:/d
/^Exiting/d
/^> /d
'
|
||||
|
||||
# check LABEL OUTPUT PATTERN...
# Records one test result: PASS when every PATTERN (extended regex,
# case-insensitive) matches somewhere in OUTPUT, FAIL otherwise.
# Globals:   TOTAL, PASS, FAIL (incremented); GREEN, RED, NC (read)
# Outputs:   one PASS/FAIL line; on FAIL also the last three non-empty
#            lines of OUTPUT, each prefixed with " | ".
check() {
  local lbl="$1" out="$2"
  shift 2
  local pat ok=1
  for pat in "$@"; do
    # Here-string instead of `printf | grep -q`: with pipefail set, grep -q
    # exiting early can kill the writer with SIGPIPE and turn a real match
    # into a failure. `--` keeps patterns starting with "-" from being
    # parsed as grep options.
    if ! grep -qiE -- "$pat" <<<"$out"; then
      ok=0
      break
    fi
  done
  TOTAL=$((TOTAL + 1))
  if [ "$ok" = "1" ]; then
    PASS=$((PASS + 1))
    printf ' %bPASS%b %s\n' "$GREEN" "$NC" "$lbl"
  else
    FAIL=$((FAIL + 1))
    printf ' %bFAIL%b %s\n' "$RED" "$NC" "$lbl"
    grep -v '^$' <<<"$out" | tail -3 | sed 's/^/ | /'
  fi
}
|
||||
|
||||
# Strip thinking blocks from output (reads stdin, writes stdout).
# Gemma4 uses [Start thinking]...[End thinking].
# Qwen3/SmolLM3 use <think>...</think>.
# Only COMPLETE blocks are removed. A block that was cut off (for example by
# the token limit) is left in place so check patterns can still match the
# reasoning text inside it.
# The input is buffered first: if python3 is missing or fails, the original
# text is printed instead of being lost. Bytes that are not valid UTF-8 are
# replaced rather than aborting the filter.
strip_think() {
  local raw
  raw=$(cat)
  python3 -c '
import re
import sys

text = sys.stdin.buffer.read().decode("utf-8", errors="replace")
text = re.sub(r"\[Start thinking\].*?\[End thinking\]", "", text, flags=re.DOTALL)
text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
sys.stdout.buffer.write(text.strip().encode("utf-8") + b"\n")
' <<<"$raw" 2>/dev/null || printf '%s\n' "$raw"
}
|
||||
|
||||
# run KEY MODEL PROMPT MAX_TOKENS [SYS_OVERRIDE]
# Runs one single-turn llama-cli generation and prints the cleaned reply
# (banner lines removed via STRIP_BANNER, complete thinking blocks removed
# via strip_think).
# SYS_OVERRIDE defaults to SYSP[$key] if omitted.
# Pass "" explicitly to disable system prompt (thinking ON for Qwen3/SmolLM3).
# Thinking params: SmolLM3/Qwen3 thinking=temp0.6/top_p0.95, nothink=model defaults.
# Globals:   NGL, SYSP, TEMP, TOPP, TOPK, STRIP_BANNER (read);
#            LLAMA_CLI (optional; binary to run, default /app/llama-cli)
# Returns:   status of the pipeline (pipefail is set by the script).
run() {
  local key=$1 model=$2 prompt=$3 max_tok=$4
  local ngl="${NGL[$key]}"

  # ${5+x} distinguishes "fifth argument omitted" from "given but empty".
  local use_sys
  if [ "${5+x}" = "x" ]; then
    use_sys="$5"
  else
    use_sys="${SYSP[$key]}"
  fi

  # Thinking mode (no system prompt on a model that supports /no_think) uses
  # 0.6/0.95; everything else uses the per-model table values.
  local temp topp topk="${TOPK[$key]}"
  if [ -z "$use_sys" ] && [[ "$key" == "smollm3" || "$key" == "q3" ]]; then
    temp="0.6"
    topp="0.95"
  else
    temp="${TEMP[$key]}"
    topp="${TOPP[$key]}"
  fi

  # Build the command as one array that is never empty: expanding an empty
  # array under `set -u` is an "unbound variable" error on bash < 4.4.
  local -a cmd=(
    timeout 300 "${LLAMA_CLI:-/app/llama-cli}" -m "$model" -ngl "$ngl"
    -n "$max_tok" --temp "$temp" --top-p "$topp"
  )
  if [ "$topk" != "0" ]; then
    cmd+=(--top-k "$topk")
  fi
  cmd+=(--repeat-penalty 1.1 -fa on --mmap --single-turn)
  if [ -n "$use_sys" ]; then
    cmd+=(-sys "$use_sys")
  fi
  cmd+=(-p "$prompt")

  "${cmd[@]}" 2>/dev/null \
    | sed "$STRIP_BANNER" \
    | strip_think
}
|
||||
|
||||
# needle_test KEY MODEL NEEDLE CTX
# Generates ~CTX tokens of filler, plants needle in middle, asks to recall it.
# Prints "FOUND" when the cleaned model reply contains NEEDLE, otherwise
# "MISSED (<first 80 characters of the reply>)".
# Sampling uses the per-model TEMP/TOPP/TOPK values, matching run(); a TOPK
# of "0" means no --top-k flag is passed.
# Globals:   NGL, TEMP, TOPP, TOPK, SYSP, STRIP_BANNER (read);
#            LLAMA_CLI (optional; binary to run, default /app/llama-cli)
needle_test() {
  local key=$1 model=$2 needle=$3 ctx=$4
  local ngl="${NGL[$key]}"
  local temp="${TEMP[$key]}" topp="${TOPP[$key]}" topk="${TOPK[$key]}"
  local sys="${SYSP[$key]}"

  # filler: ctx/2 tokens each side, 1 token ~4 chars.
  # Built in pure Bash so a missing helper interpreter cannot silently yield
  # an empty haystack (which would make the test pass trivially).
  local half_chars=$((ctx * 2))
  local sentence='The quick brown fox jumps over the lazy dog. '
  local filler=''
  while ((${#filler} < half_chars)); do
    filler+=$sentence
  done
  filler=${filler:0:half_chars}

  local prompt
  printf -v prompt \
    '%s\nSECRET_VALUE=%s\n%s\nWhat is SECRET_VALUE? Reply with only the value, nothing else.' \
    "$filler" "$needle" "$filler"

  # Context window: the estimated prompt size plus 512 tokens of headroom.
  local ctx_size=$((ctx + 512))

  # One always-non-empty argv array; optional flags are appended in place.
  local -a cmd=(
    timeout 180 "${LLAMA_CLI:-/app/llama-cli}" -m "$model" -ngl "$ngl"
    -n 512 --temp "$temp" --top-p "$topp"
  )
  if [ "$topk" != "0" ]; then
    cmd+=(--top-k "$topk")
  fi
  cmd+=(-fa on --mmap --single-turn -c "$ctx_size")
  if [ -n "$sys" ]; then
    cmd+=(-sys "$sys")
  fi
  cmd+=(-p "$prompt")

  local out
  out=$("${cmd[@]}" 2>/dev/null \
    | sed "$STRIP_BANNER" \
    | strip_think)

  # join lines before matching in case model breaks needle across newlines
  local flat=${out//$'\n'/ }
  if [[ "$flat" == *"$needle"* ]]; then
    echo "FOUND"
  else
    local snip=${flat:0:80}
    echo "MISSED (${snip:-<empty>})"
  fi
}
|
||||
|
||||
# Horizontal rule used around the report header and the final summary.
HR="======================================================================"

# GPU name for the report header. nvidia-smi may be absent or fail (for
# example when run without GPU access), so fall back to an explicit marker
# instead of printing an empty "GPU:" line.
gpu_name=$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null) || gpu_name=""

echo "$HR"
echo "QUALITY TESTS — ALL MODELS — $(date)"
echo "GPU: ${gpu_name:-unknown (nvidia-smi unavailable)}"
echo "$HR"
printf '%s\n' "Temps: SmolLM3=0.6/0.95 | Qwen3=0.7/0.8/k20 | Gemma4=0.7/0.95"
printf '%s\n\n' "/no_think via -sys for needle tests | thinking ON for coding bug test"
|
||||
|
||||
# Prompt for coding test 1: the reply is expected to contain a function
# definition plus the words Fizz and Buzz.
CODING_FIZZBUZZ='Write ONLY the Python function fizzbuzz(n). It returns a list where multiples of 3 are "Fizz", multiples of 5 are "Buzz", multiples of both are "FizzBuzz", others are the number as string. Output code only, no prose.'

# Prompt for coding test 2.
# hi is correctly len(arr)-1 to have ONE unambiguous bug: lo=mid (infinite loop)
# NOTE(review): with hi = len(arr) - 1 the `while lo < hi` condition also never
# examines the last remaining candidate (a one-element list always returns -1),
# so a model may reasonably report that instead of lo=mid — confirm whether the
# check patterns should accept that answer too.
# The snippet is indented with 4 spaces so the model receives valid Python.
CODING_BUG='Find the bug in this Python function and explain it in one sentence:
def binary_search(arr, target):
    lo, hi = 0, len(arr) - 1
    while lo < hi:
        mid = (lo + hi) // 2
        if arr[mid] == target:
            return mid
        elif arr[mid] < target:
            lo = mid
        else:
            hi = mid
    return -1'
|
||||
|
||||
# Value planted in the haystack; identical for every model and context size.
NEEDLE="QX7-ALPHA-9"

# Each entry is KEY:LABEL:MODEL_PATH. KEY indexes the per-model tables.
for entry in "smollm3:SmolLM3-3B:$M_SMOL" "e2b:Gemma4-E2B:$M_E2B" "e4b:Gemma4-E4B:$M_E4B" "q3:Qwen3-4B:$M_Q3"; do
  IFS=':' read -r key lbl model <<<"$entry"
  echo "=== $lbl ==="

  # Coding test 1: FizzBuzz — expect def + Fizz + Buzz
  out=$(run "$key" "$model" "$CODING_FIZZBUZZ" 512)
  check "FizzBuzz: def + Fizz + Buzz in output" "$out" \
    "def " "Fizz" "Buzz"

  # Coding test 2: Bug — thinking ON for all models (more reliable reasoning).
  # Pass "" to disable /no_think override. Gemma4 already thinks by default.
  out=$(run "$key" "$model" "$CODING_BUG" 3000 "")
  check "Bug: identify lo=mid / infinite loop" "$out" \
    "lo.*=.*mid.*\+.*1|lo\+1|infinite loop|never.*advance|never.*progress|stuck|lo should be|lo\b.*never.*incr"

  # Needle-in-haystack
  # strict < so we skip when ctx == max_ctx (prompt fills entire context, no room for output)
  for ctx in 1024 8192; do
    if [ "$ctx" -lt "${MAX_CTX[$key]}" ]; then
      result=$(needle_test "$key" "$model" "$NEEDLE" "$ctx")
      TOTAL=$((TOTAL + 1))
      if [[ "$result" == FOUND ]]; then
        PASS=$((PASS + 1))
        printf ' %bPASS%b Needle @ %s tok: %s\n' "$GREEN" "$NC" "$ctx" "$result"
      else
        FAIL=$((FAIL + 1))
        printf ' %bFAIL%b Needle @ %s tok: %s\n' "$RED" "$NC" "$ctx" "$result"
      fi
    else
      printf ' %bSKIP%b Needle @ %s tok (exceeds model max %s)\n' "$YELLOW" "$NC" "$ctx" "${MAX_CTX[$key]}"
    fi
  done

  echo ""
done
|
||||
|
||||
# Final tally. Colour escapes are passed through %b so the format string
# itself stays a constant.
echo "$HR"
printf 'RESULTS: %b%s PASSED%b / %b%s FAILED%b / %s TOTAL\n' \
  "$GREEN" "$PASS" "$NC" "$RED" "$FAIL" "$NC" "$TOTAL"
echo "$HR"
|
||||
Reference in New Issue
Block a user