#!/bin/bash
# Quality tests for all 4 models — runs inside full-cuda container.
# Tests: coding tasks + needle-in-haystack at 1K/8K ctx.
#
# Inference parameters sourced from official HF model cards:
# SmolLM3: /no_think in SYSTEM prompt (-sys); temp=0.6 top_p=0.95
# Qwen3: /no_think in SYSTEM prompt (-sys); temp=0.7 top_p=0.8 top_k=20
# DO NOT use greedy (temp=0) — causes endless repetition per Qwen3 docs
# Gemma4: No thinking mode; temp=0.7 top_p=0.95
# nounset + pipefail are enabled; errexit is not, so a failing command does
# not abort the run and later tests still execute.
set -uo pipefail

# Model files, keyed below by short name (smollm3 / e2b / e4b / q3).
M_SMOL="/models/HuggingFaceTB_SmolLM3-3B-Q4_K_M.gguf"
M_E2B="/models/google_gemma-4-E2B-it-Q4_K_M.gguf"
M_E4B="/models/google_gemma-4-E4B-it-Q4_K_M.gguf"
M_Q3="/models/Qwen3-4B-Q4_K_M.gguf"

# NGL:     value passed to llama-cli via -ngl for each model.
# MAX_CTX: per-model ceiling; needle sizes that are not strictly below it
#          are skipped by the driver loop.
declare -A NGL=([smollm3]=99 [e2b]=99 [e4b]=42 [q3]=99)
declare -A MAX_CTX=([smollm3]=24576 [e2b]=32768 [e4b]=24576 [q3]=8192)

# Per-model sampling params (HF model card sources).
# A TOPK of "0" means "do not pass --top-k at all".
declare -A TEMP=([smollm3]="0.6" [e2b]="0.7" [e4b]="0.7" [q3]="0.7")
declare -A TOPP=([smollm3]="0.95" [e2b]="0.95" [e4b]="0.95" [q3]="0.8")
declare -A TOPK=([smollm3]="0" [e2b]="0" [e4b]="0" [q3]="20")

# /no_think in system prompt disables thinking for SmolLM3 and Qwen3.
# An empty entry means no system prompt is sent.
declare -A SYSP=([smollm3]="/no_think" [e2b]="" [e4b]="" [q3]="/no_think")

# Result counters, updated by check() and by the needle loop.
PASS=0; FAIL=0; TOTAL=0
GREEN='\033[0;32m'; RED='\033[0;31m'; YELLOW='\033[1;33m'; NC='\033[0m'

# sed script to strip llama-cli interactive UI banner from stdout.
# ▄ (U+2584) and █ (U+2588) appear in the llama.cpp ASCII logo, sometimes
# with leading spaces — match anywhere on the line to be safe.
# They are written as two literal-match rules instead of one bracket
# expression: in a C/POSIX locale a bracket containing multibyte characters
# is treated as a set of single bytes and would also delete answer lines that
# merely contain an em dash or curly quotes. A literal match compares the
# exact byte sequence in every locale.
STRIP_BANNER='/^[[:space:]]*$/d
/^Loading model/d
/▄/d
/█/d
/^build /d
/^model /d
/^modalities/d
/^available commands/d
/^ \//d
/^\[ Prompt:/d
/^Exiting/d
/^> /d
'
# check LABEL OUTPUT PATTERN...
# Records one test result. The test passes when OUTPUT matches every PATTERN
# (case-insensitive extended regex, each tested independently); with no
# PATTERN at all it passes trivially.
# Globals:  TOTAL, PASS, FAIL (incremented); GREEN, RED, NC (read).
# Outputs:  one PASS/FAIL line; on FAIL also the last 3 non-empty lines of
#           OUTPUT, each prefixed with " | ".
check() {
  local lbl="$1" out="$2"
  shift 2
  local pat ok=1
  for pat in "$@"; do
    # Feed grep from a here-string, not `printf ... | grep -q`: with pipefail
    # on, grep -q exiting at the first match can SIGPIPE the writer and make
    # the whole pipeline look like "no match" on large outputs.
    # -e keeps a pattern that begins with "-" from being read as an option.
    if ! grep -qiE -e "$pat" <<<"$out"; then
      ok=0
      break
    fi
  done
  TOTAL=$((TOTAL+1))
  if [ "$ok" = "1" ]; then
    PASS=$((PASS+1)); printf " ${GREEN}PASS${NC} %s\n" "$lbl"
  else
    FAIL=$((FAIL+1)); printf " ${RED}FAIL${NC} %s\n" "$lbl"
    # Show the tail of the output so a failure can be diagnosed from the log.
    grep -v '^$' <<<"$out" | tail -3 | sed 's/^/ | /'
  fi
}
# Strip thinking blocks from output (filter: stdin -> stdout).
# Gemma4 uses [Start thinking]...[End thinking].
# Qwen3/SmolLM3 use <think>...</think>.
# Only COMPLETE blocks are removed; an unterminated block (e.g. generation hit
# the token limit mid-thought) is left in place so check patterns can still
# match reasoning content inside it. The result is whitespace-trimmed and
# terminated by a single newline.
# If python3 cannot be started the input is passed through unchanged via cat.
strip_think() {
  python3 -c "
import sys, re
# Read raw bytes and decode leniently: a reply cut off in the middle of a
# multibyte character must not raise after stdin has already been consumed,
# because the cat fallback would then have nothing left to print.
t = sys.stdin.buffer.read().decode('utf-8', 'replace')
t = re.sub(r'\[Start thinking\].*?\[End thinking\]', '', t, flags=re.DOTALL)
t = re.sub(r'<think>.*?</think>', '', t, flags=re.DOTALL)
sys.stdout.buffer.write(t.strip().encode('utf-8') + b'\n')
" 2>/dev/null || cat
}
# run KEY MODEL PROMPT MAX_TOKENS [SYS_OVERRIDE]
# Runs one single-turn llama-cli generation and prints the cleaned reply
# (banner lines removed by STRIP_BANNER, complete thinking blocks removed by
# strip_think).
#   KEY           index into NGL / TEMP / TOPP / TOPK / SYSP
#   MODEL         path to the .gguf file
#   PROMPT        user prompt (-p)
#   MAX_TOKENS    generation limit (-n)
#   SYS_OVERRIDE  system prompt; defaults to SYSP[$key] if omitted.
#                 Pass "" explicitly to disable system prompt (thinking ON for
#                 Qwen3/SmolLM3).
# Thinking params: SmolLM3/Qwen3 thinking=temp0.6/top_p0.95, nothink=model defaults.
# Env: LLAMA_CLI overrides the binary path (default /app/llama-cli).
# The run is limited to 300 s by timeout and llama-cli stderr is discarded.
run() {
  local key=$1 model=$2 prompt=$3 max_tok=$4
  local cli="${LLAMA_CLI:-/app/llama-cli}"
  local ngl="${NGL[$key]}"

  # ${5+x} distinguishes "5th arg omitted" from "5th arg is the empty string".
  local use_sys
  if [ "${5+x}" = "x" ]; then use_sys="$5"; else use_sys="${SYSP[$key]}"; fi

  # choose sampling params: thinking mode uses 0.6/0.95, non-think uses model defaults
  local temp topp topk
  if [ -z "$use_sys" ] && [[ "$key" == "smollm3" || "$key" == "q3" ]]; then
    temp="0.6"; topp="0.95"; topk="${TOPK[$key]}"
  else
    temp="${TEMP[$key]}"; topp="${TOPP[$key]}"; topk="${TOPK[$key]}"
  fi

  # Build the argv in one array that is never empty. Expanding a separate,
  # possibly empty, optional-args array aborts with "unbound variable" under
  # `set -u` on bash older than 4.4.
  local -a cmd=("$cli" -m "$model" -ngl "$ngl"
    -n "$max_tok" --temp "$temp" --top-p "$topp")
  if [[ "$topk" != "0" ]]; then
    cmd+=(--top-k "$topk")
  fi
  cmd+=(--repeat-penalty 1.1 -fa on --mmap --single-turn)
  if [[ -n "$use_sys" ]]; then
    cmd+=(-sys "$use_sys")
  fi
  cmd+=(-p "$prompt")

  timeout 300 "${cmd[@]}" 2>/dev/null \
    | sed "$STRIP_BANNER" \
    | strip_think
}
# needle_test KEY MODEL NEEDLE CTX
# Generates ~CTX tokens of filler, plants needle in middle, asks to recall it.
# Prints "FOUND" when NEEDLE occurs literally anywhere in the cleaned reply,
# otherwise "MISSED (<first 80 chars of the reply>)".
#   KEY     index into NGL / TEMP / TOPP / TOPK / SYSP
#   MODEL   path to the .gguf file
#   NEEDLE  secret value to plant and look for
#   CTX     approximate prompt size in tokens; llama-cli gets -c CTX+512
# Env: LLAMA_CLI overrides the binary path (default /app/llama-cli).
# The run is limited to 180 s by timeout and llama-cli stderr is discarded.
# NOTE(review): if llama-cli echoes the multi-line prompt on stdout, the
# SECRET_VALUE line would survive STRIP_BANNER and yield a false FOUND —
# confirm against real llama-cli output.
needle_test() {
  local key=$1 model=$2 needle=$3 ctx=$4
  local cli="${LLAMA_CLI:-/app/llama-cli}"
  local ngl="${NGL[$key]}"
  local temp="${TEMP[$key]}" topp="${TOPP[$key]}" topk="${TOPK[$key]}"
  local sys="${SYSP[$key]}"

  # filler: ctx/2 tokens each side, 1 token ~4 chars
  local half_chars=$(( ctx * 2 ))
  # Built in pure bash so the haystack can never silently end up empty
  # (which would make the test pass without actually testing recall).
  local sentence='The quick brown fox jumps over the lazy dog. '
  local filler=''
  while (( ${#filler} < half_chars )); do
    filler+=$sentence
  done
  filler=${filler:0:half_chars}

  local prompt
  printf -v prompt \
    '%s\nSECRET_VALUE=%s\n%s\nWhat is SECRET_VALUE? Reply with only the value, nothing else.' \
    "$filler" "$needle" "$filler"

  # Single never-empty argv array: safe under `set -u` on every bash 4.x.
  local -a cmd=("$cli" -m "$model" -ngl "$ngl"
    -n 512 --temp "$temp" --top-p "$topp")
  # Apply the per-model top-k here too, matching what run() does.
  if [[ "$topk" != "0" ]]; then
    cmd+=(--top-k "$topk")
  fi
  # Context window: the prompt estimate plus 512 tokens of headroom.
  cmd+=(-fa on --mmap --single-turn -c "$(( ctx + 512 ))")
  if [[ -n "$sys" ]]; then
    cmd+=(-sys "$sys")
  fi
  cmd+=(-p "$prompt")

  local out
  out=$(timeout 180 "${cmd[@]}" 2>/dev/null \
    | sed "$STRIP_BANNER" \
    | strip_think)

  # join lines before matching in case model breaks needle across newlines
  local flat=${out//$'\n'/ }
  # Quoted RHS inside [[ ]] is a literal substring match; no pipeline is
  # involved, so pipefail cannot turn a hit into a miss.
  if [[ "$flat" == *"$needle"* ]]; then
    echo "FOUND"
  else
    echo "MISSED (${flat:0:80})"
  fi
}
# ---------------------------------------------------------------------------
# Driver: for every model run two coding checks and the needle-in-haystack
# checks, then print the PASS/FAIL/TOTAL summary.
# ---------------------------------------------------------------------------
HR="======================================================================"
echo "$HR"
echo "QUALITY TESTS — ALL MODELS — $(date)"
# nvidia-smi errors are hidden; the GPU name is simply blank if it fails.
echo "GPU: $(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null)"
echo "$HR"
printf "Temps: SmolLM3=0.6/0.95 | Qwen3=0.7/0.8/k20 | Gemma4=0.7/0.95\n"
printf "/no_think via -sys for needle tests | thinking ON for coding bug test\n\n"

CODING_FIZZBUZZ='Write ONLY the Python function fizzbuzz(n). It returns a list where multiples of 3 are "Fizz", multiples of 5 are "Buzz", multiples of both are "FizzBuzz", others are the number as string. Output code only, no prose.'

# hi is correctly len(arr)-1 to have ONE unambiguous bug: lo=mid (infinite loop)
# The snippet must keep its Python indentation: without it the code is not
# valid Python and "bad indentation" becomes a second, unintended answer.
CODING_BUG='Find the bug in this Python function and explain it in one sentence:
def binary_search(arr, target):
    lo, hi = 0, len(arr) - 1
    while lo < hi:
        mid = (lo + hi) // 2
        if arr[mid] == target:
            return mid
        elif arr[mid] < target:
            lo = mid
        else:
            hi = mid
    return -1'

# Value planted in the haystack; identical for every model and context size.
NEEDLE="QX7-ALPHA-9"

for entry in "smollm3:SmolLM3-3B:$M_SMOL" "e2b:Gemma4-E2B:$M_E2B" "e4b:Gemma4-E4B:$M_E4B" "q3:Qwen3-4B:$M_Q3"; do
  # entry is "key:label:model-path"
  IFS=':' read -r key lbl model <<< "$entry"
  echo "=== $lbl ==="

  # Coding test 1: FizzBuzz — expect def + Fizz + Buzz
  out=$(run "$key" "$model" "$CODING_FIZZBUZZ" 512)
  check "FizzBuzz: def + Fizz + Buzz in output" "$out" \
    "def " "Fizz" "Buzz"

  # Coding test 2: Bug — thinking ON for all models (more reliable reasoning).
  # Pass "" to disable the /no_think system prompt. For the Gemma keys SYSP is
  # already empty, so the override changes nothing there.
  out=$(run "$key" "$model" "$CODING_BUG" 3000 "")
  check "Bug: identify lo=mid / infinite loop" "$out" \
    "lo.*=.*mid.*\+.*1|lo\+1|infinite loop|never.*advance|never.*progress|stuck|lo should be|lo\b.*never.*incr"

  # Needle-in-haystack
  # strict < so we skip when ctx == max_ctx (prompt fills entire context, no room for output)
  for ctx in 1024 8192; do
    if [ "$ctx" -lt "${MAX_CTX[$key]}" ]; then
      result=$(needle_test "$key" "$model" "$NEEDLE" "$ctx")
      TOTAL=$((TOTAL+1))
      if [[ "$result" == FOUND ]]; then
        PASS=$((PASS+1)); printf " ${GREEN}PASS${NC} Needle @ %s tok: %s\n" "$ctx" "$result"
      else
        FAIL=$((FAIL+1)); printf " ${RED}FAIL${NC} Needle @ %s tok: %s\n" "$ctx" "$result"
      fi
    else
      printf " ${YELLOW}SKIP${NC} Needle @ %s tok (exceeds model max %s)\n" "$ctx" "${MAX_CTX[$key]}"
    fi
  done
  echo ""
done

echo "$HR"
printf "RESULTS: ${GREEN}%s PASSED${NC} / ${RED}%s FAILED${NC} / %s TOTAL\n" "$PASS" "$FAIL" "$TOTAL"
echo "$HR"