- 5 models: SmolLM3-3B, Gemma4-E2B/E4B, Qwen3-4B, Qwen3.5-9B - TurboQuant image (FORCE_MMQ): +6-11% free speed on Turing GPUs - Bigctx profiles (-nkvo KV in RAM): 2-16x context gain - turbo2 KV: 2x smaller, benchmarked against PPL quality gate - Per-model env files with justified parameters - kv_quant_test.sh + cpu_ctx_test.sh benchmark scripts - docs/FINDINGS.md: surprises, pitfalls, recommendations - docs/ARCHITECTURE.md: compose + test script design
216 lines
8.0 KiB
Bash
216 lines
8.0 KiB
Bash
#!/bin/bash
#
# Model quality checks, meant to run inside the full-cuda container.
# Covers four models with two coding prompts plus needle-in-haystack recall
# at 1K and 8K tokens of context.
#
# Sampling settings follow the official Hugging Face model cards:
#   SmolLM3: /no_think goes in the SYSTEM prompt (-sys); temp=0.6 top_p=0.95
#   Qwen3:   /no_think goes in the SYSTEM prompt (-sys); temp=0.7 top_p=0.8 top_k=20
#            Never sample greedily (temp=0): the Qwen3 docs report endless
#            repetition.
#   Gemma4:  no /no_think system prompt is configured; temp=0.7 top_p=0.95

# No -e on purpose is assumed here: a failing test must not abort the run.
set -uo pipefail

# GGUF model files, addressed by absolute path.
M_SMOL="/models/HuggingFaceTB_SmolLM3-3B-Q4_K_M.gguf"
M_E2B="/models/google_gemma-4-E2B-it-Q4_K_M.gguf"
M_E4B="/models/google_gemma-4-E4B-it-Q4_K_M.gguf"
M_Q3="/models/Qwen3-4B-Q4_K_M.gguf"

# Value passed to llama-cli -ngl for each model key.
declare -A NGL=(
  [smollm3]=99
  [e2b]=99
  [e4b]=42
  [q3]=99
)

# Context ceiling per model key; needle sizes are compared against it.
declare -A MAX_CTX=(
  [smollm3]=24576
  [e2b]=32768
  [e4b]=24576
  [q3]=8192
)

# Per-model sampling parameters (source: HF model cards).
declare -A TEMP=(
  [smollm3]=0.6
  [e2b]=0.7
  [e4b]=0.7
  [q3]=0.7
)
declare -A TOPP=(
  [smollm3]=0.95
  [e2b]=0.95
  [e4b]=0.95
  [q3]=0.8
)
# 0 means "do not pass --top-k at all".
declare -A TOPK=(
  [smollm3]=0
  [e2b]=0
  [e4b]=0
  [q3]=20
)

# Default system prompt per model key; /no_think turns thinking off for
# SmolLM3 and Qwen3, an empty value means no system prompt.
declare -A SYSP=(
  [smollm3]="/no_think"
  [e2b]=""
  [e4b]=""
  [q3]="/no_think"
)

# Running tallies updated as tests complete.
PASS=0 FAIL=0 TOTAL=0

# ANSI colour escapes, stored unexpanded and interpreted by printf formats.
GREEN='\033[0;32m'
RED='\033[0;31m'
YELLOW='\033[1;33m'
NC='\033[0m'

# sed script that removes the llama-cli interactive UI chrome from stdout.
# The llama.cpp ASCII logo is drawn with ▄ (U+2584) and █ (U+2588), sometimes
# after leading spaces, so both glyphs are matched anywhere on the line.
# Each glyph has its own rule instead of one [▄█] bracket expression: in a
# non-UTF-8 locale (LANG unset / C) a bracket expression is a set of single
# BYTES, so it would also delete any output line containing an unrelated
# character that shares a byte with the glyphs (em dash, curly quotes, ...).
# A literal multi-byte sequence matches the same way in every locale.
STRIP_BANNER='/^[[:space:]]*$/d
/^Loading model/d
/▄/d
/█/d
/^build /d
/^model /d
/^modalities/d
/^available commands/d
/^ \//d
/^\[ Prompt:/d
/^Exiting/d
/^> /d
'

#######################################
# Record one test verdict: PASS only if every pattern matches the output.
# Globals:   TOTAL, PASS, FAIL (incremented); GREEN, RED, NC (read)
# Arguments: $1   - label printed next to the verdict
#            $2   - captured model output to inspect
#            $3.. - extended regexes, matched case-insensitively; ALL must
#                   match (no patterns at all counts as a pass)
# Outputs:   verdict line on stdout; on FAIL also the last three non-empty
#            lines of the output, to show what the model actually said
#######################################
check() {
  local lbl="$1" out="$2"
  shift 2
  local pat ok=1
  for pat in "$@"; do
    # Here-string instead of `printf | grep -q`: grep -q exits at the first
    # match, and under `set -o pipefail` a writer killed by SIGPIPE would turn
    # a genuine match into a failure. `--` stops a pattern that starts with
    # '-' from being parsed as a grep option.
    grep -qiE -- "$pat" <<<"$out" || { ok=0; break; }
  done
  TOTAL=$((TOTAL+1))
  # The colour variables hold backslash escapes, so they are deliberately
  # part of the printf format string; the label goes through %s.
  if [ "$ok" = "1" ]; then
    PASS=$((PASS+1)); printf " ${GREEN}PASS${NC} %s\n" "$lbl"
  else
    FAIL=$((FAIL+1)); printf " ${RED}FAIL${NC} %s\n" "$lbl"
    printf '%s\n' "$out" | grep -v '^$' | tail -3 | sed 's/^/ | /'
  fi
}

#######################################
# Remove completed thinking blocks from model output.
#   Gemma4 wraps them as [Start thinking]...[End thinking];
#   Qwen3/SmolLM3 wrap them as <think>...</think>.
# An unterminated block (generation stopped before the closing marker) is
# left untouched on purpose, so check patterns can still match the reasoning
# text inside it.
# Inputs:  model output on stdin
# Outputs: filtered text on stdout with outer whitespace trimmed and one
#          trailing newline; if python3 cannot be started, stdin is passed
#          through unchanged by cat
#######################################
strip_think() {
  # Input is read as bytes and decoded/encoded with surrogateescape, so invalid
  # UTF-8 (for example a multi-byte character cut off mid-sequence) round-trips
  # instead of raising. A decode error would be fatal here: stdin is already
  # consumed by then, so the `|| cat` fallback would have nothing left to emit.
  python3 -c '
import re
import sys

text = sys.stdin.buffer.read().decode("utf-8", "surrogateescape")
# Non-greedy + DOTALL: only COMPLETE blocks are removed, even across lines.
text = re.sub(r"\[Start thinking\].*?\[End thinking\]", "", text, flags=re.DOTALL)
text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
sys.stdout.buffer.write((text.strip() + "\n").encode("utf-8", "surrogateescape"))
' 2>/dev/null || cat
}

#######################################
# run KEY MODEL PROMPT MAX_TOKENS [SYS_OVERRIDE]
# Single-turn llama-cli generation with banner and thinking blocks removed.
# Globals:   NGL, SYSP, TEMP, TOPP, TOPK, STRIP_BANNER (read)
# Arguments: $1 - model key            $2 - GGUF path
#            $3 - user prompt          $4 - max tokens to generate (-n)
#            $5 - optional system prompt. Omitted: SYSP[$key] is used.
#                 Passed as "": no system prompt at all, which leaves
#                 thinking ON for Qwen3/SmolLM3.
# Outputs:   cleaned model output on stdout (llama-cli stderr is discarded)
#######################################
run() {
  local key=$1 model=$2 prompt=$3 max_tok=$4
  # ${5-...} falls back only when the 5th argument is absent, so an explicit
  # empty string still counts as an override.
  local system_prompt=${5-${SYSP[$key]}}

  # Start from the model-card defaults; thinking mode (no system prompt) on
  # SmolLM3/Qwen3 switches temperature and top-p to 0.6/0.95.
  local temperature=${TEMP[$key]} nucleus=${TOPP[$key]} top_k=${TOPK[$key]}
  if [[ -z $system_prompt ]]; then
    case $key in
      smollm3 | q3)
        temperature="0.6"
        nucleus="0.95"
        ;;
    esac
  fi

  # Assemble the argv in one array; optional flags are appended only when set.
  local -a cli=(
    /app/llama-cli -m "$model" -ngl "${NGL[$key]}"
    -n "$max_tok" --temp "$temperature" --top-p "$nucleus"
  )
  if [[ $top_k != 0 ]]; then
    cli+=(--top-k "$top_k")
  fi
  cli+=(--repeat-penalty 1.1 -fa on --mmap --single-turn)
  if [[ -n $system_prompt ]]; then
    cli+=(-sys "$system_prompt")
  fi
  cli+=(-p "$prompt")

  timeout 300 "${cli[@]}" 2>/dev/null \
    | sed "$STRIP_BANNER" \
    | strip_think
}

#######################################
# needle_test KEY MODEL NEEDLE CTX
# Needle-in-a-haystack recall probe: builds roughly CTX tokens of filler,
# plants SECRET_VALUE=<needle> in the middle and asks the model to repeat it.
# Globals:   NGL, TEMP, TOPP, TOPK, SYSP, STRIP_BANNER (read)
# Arguments: $1 - model key   $2 - GGUF path   $3 - needle string
#            $4 - target prompt size in tokens
# Outputs:   "FOUND", or "MISSED (<first 80 chars of the reply>)"
#######################################
needle_test() {
  local key=$1 model=$2 needle=$3 ctx=$4
  local ngl="${NGL[$key]}"
  local temp="${TEMP[$key]}" topp="${TOPP[$key]}" sys="${SYSP[$key]}"
  # Same top-k rule as run(): 0 means "do not pass --top-k", so Qwen3 is
  # sampled with its model-card top_k here too.
  local topk="${TOPK[$key]:-0}"

  local sys_arg=()
  if [ -n "$sys" ]; then
    sys_arg=(-sys "$sys")
  fi
  local topk_arg=()
  if [ "$topk" != "0" ]; then
    topk_arg=(--top-k "$topk")
  fi

  # Filler: ctx/2 tokens on each side of the needle at ~4 chars per token,
  # i.e. ctx*2 characters per side. Built with shell builtins so a missing
  # helper program can never silently produce an empty haystack.
  local half_chars=$(( ctx * 2 ))
  local sentence='The quick brown fox jumps over the lazy dog. '
  local filler=''
  while (( ${#filler} < half_chars )); do
    filler+=$sentence
  done
  filler=${filler:0:half_chars}

  local prompt
  printf -v prompt \
    '%s\nSECRET_VALUE=%s\n%s\nWhat is SECRET_VALUE? Reply with only the value, nothing else.' \
    "$filler" "$needle" "$filler"

  # Context window = estimated prompt size plus room for the 512-token reply.
  local ctx_size=$(( ctx + 512 ))
  local out
  out=$(timeout 180 /app/llama-cli -m "$model" -ngl "$ngl" \
    -n 512 --temp "$temp" --top-p "$topp" "${topk_arg[@]}" \
    -fa on --mmap --single-turn \
    -c "$ctx_size" "${sys_arg[@]}" -p "$prompt" 2>/dev/null \
    | sed "$STRIP_BANNER" \
    | strip_think)

  # Two views of the reply: newlines turned into spaces (also used for the
  # snippet) and newlines removed outright. Only the second can rejoin a
  # needle the model broke across lines; a space would still split it.
  local flat=${out//$'\n'/ }
  local joined=${out//$'\n'/}
  if [[ "$flat" == *"$needle"* || "$joined" == *"$needle"* ]]; then
    echo "FOUND"
  else
    local snip=${flat:0:80}
    echo "MISSED (${snip:-<empty>})"
  fi
}

# Horizontal rule reused by the report header and the final summary.
HR="======================================================================"

# Report header: rule, title with timestamp, GPU name (left blank when
# nvidia-smi is unavailable, its stderr being discarded), rule.
printf '%s\n' \
  "$HR" \
  "QUALITY TESTS — ALL MODELS — $(date)" \
  "GPU: $(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null)" \
  "$HR"

# Recap of the sampling setup, followed by one blank line.
printf '%s\n' \
  "Temps: SmolLM3=0.6/0.95 | Qwen3=0.7/0.8/k20 | Gemma4=0.7/0.95" \
  "/no_think via -sys for needle tests | thinking ON for coding bug test" \
  ""

# Prompt for coding test 1: the model must emit a Python fizzbuzz(n) function.
CODING_FIZZBUZZ='Write ONLY the Python function fizzbuzz(n). It returns a list where multiples of 3 are "Fizz", multiples of 5 are "Buzz", multiples of both are "FizzBuzz", others are the number as string. Output code only, no prose.'

# Prompt for coding test 2. hi is deliberately the correct len(arr) - 1 so
# that the intended bug is lo = mid (the loop can stop making progress and
# spin forever).
# The snippet must keep its indentation: Python is whitespace-sensitive, and
# a flattened body is invalid code rather than code with one planted bug.
# NOTE(review): `while lo < hi` with an inclusive hi also never inspects the
# lo == hi element (e.g. a one-element list) — confirm that is acceptable for
# a prompt that is meant to contain a single bug.
CODING_BUG='Find the bug in this Python function and explain it in one sentence:
def binary_search(arr, target):
    lo, hi = 0, len(arr) - 1
    while lo < hi:
        mid = (lo + hi) // 2
        if arr[mid] == target:
            return mid
        elif arr[mid] < target:
            lo = mid
        else:
            hi = mid
    return -1'

# Secret planted in the haystack prompts; the same value is used for every model.
NEEDLE="QX7-ALPHA-9"

# One pass per model; each entry is a key:label:gguf-path triple.
for entry in \
  "smollm3:SmolLM3-3B:$M_SMOL" \
  "e2b:Gemma4-E2B:$M_E2B" \
  "e4b:Gemma4-E4B:$M_E4B" \
  "q3:Qwen3-4B:$M_Q3"; do
  IFS=':' read -r key lbl model <<<"$entry"
  printf '=== %s ===\n' "$lbl"

  # Coding test 1: FizzBuzz. The reply must contain def, Fizz and Buzz.
  out=$(run "$key" "$model" "$CODING_FIZZBUZZ" 512)
  check "FizzBuzz: def + Fizz + Buzz in output" "$out" \
    "def " "Fizz" "Buzz"

  # Coding test 2: bug hunt with thinking ON for every model (more reliable
  # reasoning). The explicit "" suppresses the /no_think system prompt;
  # Gemma4 has no system prompt configured, so nothing changes for it.
  out=$(run "$key" "$model" "$CODING_BUG" 3000 "")
  check "Bug: identify lo=mid / infinite loop" "$out" \
    "lo.*=.*mid.*\+.*1|lo\+1|infinite loop|never.*advance|never.*progress|stuck|lo should be|lo\b.*never.*incr"

  # Needle-in-haystack at each probe size.
  for ctx in 1024 8192; do
    # Only sizes strictly below the model maximum are run: at ctx == max the
    # prompt would fill the whole context and leave no room for the reply.
    if [[ $ctx -ge ${MAX_CTX[$key]} ]]; then
      printf " ${YELLOW}SKIP${NC} Needle @ %s tok (exceeds model max %s)\n" "$ctx" "${MAX_CTX[$key]}"
      continue
    fi

    result=$(needle_test "$key" "$model" "$NEEDLE" "$ctx")
    TOTAL=$((TOTAL+1))
    case $result in
      FOUND)
        PASS=$((PASS+1))
        printf " ${GREEN}PASS${NC} Needle @ %s tok: %s\n" "$ctx" "$result"
        ;;
      *)
        FAIL=$((FAIL+1))
        printf " ${RED}FAIL${NC} Needle @ %s tok: %s\n" "$ctx" "$result"
        ;;
    esac
  done

  printf '\n'
done

#######################################
# Print the closing tally between two horizontal rules.
# Globals: HR, PASS, FAIL, TOTAL, GREEN, RED, NC (read)
# Outputs: the summary block on stdout
# Returns: 0 when no test failed, 1 otherwise
#######################################
print_summary() {
  echo "$HR"
  printf "RESULTS: ${GREEN}%s PASSED${NC} / ${RED}%s FAILED${NC} / %s TOTAL\n" "$PASS" "$FAIL" "$TOTAL"
  echo "$HR"
  (( ${FAIL:-0} == 0 ))
}

# Propagate failures through the exit status so CI jobs and wrapper scripts
# can gate on this script instead of parsing its output.
print_summary || exit 1