#!/bin/bash
# Quality tests for all 4 models — runs inside full-cuda container.
# Tests: coding tasks + needle-in-haystack at 1K/8K ctx.
#
# Inference parameters sourced from official HF model cards:
#   SmolLM3: /no_think in SYSTEM prompt (-sys); temp=0.6 top_p=0.95
#   Qwen3:   /no_think in SYSTEM prompt (-sys); temp=0.7 top_p=0.8 top_k=20
#            DO NOT use greedy (temp=0) — causes endless repetition per Qwen3 docs
#   Gemma4:  no /no_think system prompt is sent (SYSP is empty); temp=0.7 top_p=0.95

# NOTE: -e is deliberately not set here; the suite keeps going after a
# command fails and records the outcome in the PASS/FAIL counters instead.
set -uo pipefail

# GGUF model files, as mounted inside the container.
readonly M_SMOL="/models/HuggingFaceTB_SmolLM3-3B-Q4_K_M.gguf"
readonly M_E2B="/models/google_gemma-4-E2B-it-Q4_K_M.gguf"
readonly M_E4B="/models/google_gemma-4-E4B-it-Q4_K_M.gguf"
readonly M_Q3="/models/Qwen3-4B-Q4_K_M.gguf"

# Per-model lookup tables, keyed by the short model key used by the driver.
# NGL is passed to llama-cli as -ngl; MAX_CTX gates the needle tests.
declare -rA NGL=([smollm3]=99 [e2b]=99 [e4b]=42 [q3]=99)
declare -rA MAX_CTX=([smollm3]=24576 [e2b]=32768 [e4b]=24576 [q3]=8192)

# Per-model sampling params (HF model card sources).
# A TOPK of "0" means "do not pass --top-k at all".
declare -rA TEMP=([smollm3]="0.6" [e2b]="0.7" [e4b]="0.7" [q3]="0.7")
declare -rA TOPP=([smollm3]="0.95" [e2b]="0.95" [e4b]="0.95" [q3]="0.8")
declare -rA TOPK=([smollm3]="0" [e2b]="0" [e4b]="0" [q3]="20")

# /no_think in system prompt disables thinking for SmolLM3 and Qwen3
declare -rA SYSP=([smollm3]="/no_think" [e2b]="" [e4b]="" [q3]="/no_think")

# Global result counters, updated by check() and the needle loop.
PASS=0
FAIL=0
TOTAL=0

# ANSI colour escapes. They hold a literal backslash sequence, so they must
# be expanded by printf (in the format string or through %b), not by %s.
readonly GREEN='\033[0;32m'
readonly RED='\033[0;31m'
readonly YELLOW='\033[1;33m'
readonly NC='\033[0m'

# sed script to strip llama-cli interactive UI banner from stdout.
# ▄ (U+2584) and █ (U+2588) appear in the llama.cpp ASCII logo, sometimes
# with leading spaces — match anywhere on the line to be safe.
# One sed command per line (sed needs newline- or ';'-separated commands).
# The two logo glyphs get separate plain-text rules instead of a bracket
# expression: in a C/POSIX locale a bracket expression would be treated as a
# set of single bytes and would also delete lines that merely contain other
# multi-byte characters (em dashes, curly quotes, ...).
STRIP_BANNER='/^[[:space:]]*$/d
/^Loading model/d
/▄/d
/█/d
/^build /d
/^model /d
/^modalities/d
/^available commands/d
/^  *\//d
/^\[ Prompt:/d
/^Exiting/d
/^> /d
'

#######################################
# Record one PASS/FAIL result: the check passes only if EVERY pattern
# matches somewhere in the output (case-insensitive extended regex).
# Globals:   PASS, FAIL, TOTAL (written); GREEN, RED, NC (read)
# Arguments: $1 - label printed next to the verdict
#            $2 - model output to inspect
#            $3.. - patterns; with no patterns the check passes vacuously
# Outputs:   verdict line on stdout; on failure also the last three
#            non-empty lines of the output, prefixed with " | "
#######################################
check() {
  local lbl=$1 out=$2
  shift 2
  local pat ok=1
  for pat in "$@"; do
    # Here-string instead of `printf | grep -q`: under `set -o pipefail`
    # grep -q exiting early can SIGPIPE the writer and turn a real match
    # into a failed pipeline. `--` protects patterns starting with '-'.
    if ! grep -qiE -- "$pat" <<<"$out"; then
      ok=0
      break
    fi
  done
  TOTAL=$((TOTAL + 1))
  if [[ "$ok" == "1" ]]; then
    PASS=$((PASS + 1))
    # %b expands the backslash escapes stored in the colour variables.
    printf ' %bPASS%b %s\n' "$GREEN" "$NC" "$lbl"
  else
    FAIL=$((FAIL + 1))
    printf ' %bFAIL%b %s\n' "$RED" "$NC" "$lbl"
    grep -v '^$' <<<"$out" | tail -n 3 | sed 's/^/ | /'
  fi
}

#######################################
# Strip thinking blocks from model output (stdin -> stdout).
# Gemma4 uses [Start thinking]...[End thinking].
# Qwen3/SmolLM3 wrap reasoning in an angle-bracketed think tag pair.
# Only COMPLETE blocks are removed. If thinking hit the token limit and the
# closing marker is missing, the text is left as-is so check patterns can
# still match reasoning content inside the block.
# Falls back to passing stdin through unchanged when python3 cannot run.
#######################################
strip_think() {
  python3 -c '
import re
import sys

text = sys.stdin.read()
# The think tag is assembled from pieces so that tooling which strips
# HTML-like tags cannot silently remove it from this script; without the
# tags the pattern degenerates to ".*?" and re.sub deletes ALL output.
tag = "think"
patterns = (
    r"\[Start thinking\].*?\[End thinking\]",
    "<" + tag + ">.*?</" + tag + ">",
)
for pattern in patterns:
    text = re.sub(pattern, "", text, flags=re.DOTALL)
print(text.strip())
' 2>/dev/null || cat
}

# run KEY MODEL PROMPT MAX_TOKENS [SYS_OVERRIDE]
# SYS_OVERRIDE defaults to SYSP[$key] if omitted.
# Pass "" explicitly to disable system prompt (thinking ON for Qwen3/SmolLM3).
# Thinking params: SmolLM3/Qwen3 thinking=temp0.6/top_p0.95, nothink=model defaults.
# Path of the llama.cpp CLI binary. Defaults to the container location;
# override through the environment to run against another build.
LLAMA_CLI="${LLAMA_CLI:-/app/llama-cli}"

# Generation budget of a needle test. needle_test() sizes its context as
# ctx + this value, and the driver uses the same figure for its skip check.
NEEDLE_GEN_TOKENS=512

#######################################
# Run one single-turn generation and print the cleaned model output.
# Globals:   NGL, TEMP, TOPP, TOPK, SYSP, STRIP_BANNER, LLAMA_CLI (read)
# Arguments: $1 - model key   $2 - model path   $3 - prompt
#            $4 - max tokens to generate
#            $5 - optional system prompt override; when omitted SYSP[$key]
#                 is used, when passed as "" no system prompt is sent
# Outputs:   model output on stdout with the UI banner and complete
#            thinking blocks removed (llama-cli stderr is discarded)
#######################################
run() {
  local key=$1 model=$2 prompt=$3 max_tok=$4
  # ${5+x} distinguishes "argument omitted" from "argument is empty".
  local use_sys
  if [[ "${5+x}" == "x" ]]; then
    use_sys=$5
  else
    use_sys=${SYSP[$key]}
  fi

  # Thinking mode (no system prompt) on SmolLM3/Qwen3 uses 0.6/0.95;
  # everything else uses the per-model defaults.
  local temp=${TEMP[$key]} topp=${TOPP[$key]} topk=${TOPK[$key]}
  if [[ -z "$use_sys" && ( "$key" == "smollm3" || "$key" == "q3" ) ]]; then
    temp="0.6"
    topp="0.95"
  fi

  # One always-non-empty argv array: avoids expanding empty optional arrays,
  # which is an "unbound variable" error under `set -u` before bash 4.4.
  local args=(-m "$model" -ngl "${NGL[$key]}" -n "$max_tok"
    --temp "$temp" --top-p "$topp")
  if [[ "$topk" != "0" ]]; then
    args+=(--top-k "$topk")
  fi
  args+=(--repeat-penalty 1.1 -fa on --mmap --single-turn)
  if [[ -n "$use_sys" ]]; then
    args+=(-sys "$use_sys")
  fi
  args+=(-p "$prompt")

  timeout 300 "$LLAMA_CLI" "${args[@]}" 2>/dev/null \
    | sed "$STRIP_BANNER" \
    | strip_think
}

#######################################
# Build the needle-in-haystack prompt: filler, the planted secret, filler
# again, then the recall question.
# Arguments: $1 - needle value   $2 - target context size in tokens
# Outputs:   the prompt on stdout (no trailing newline)
#######################################
_needle_prompt() {
  local needle=$1 ctx=$2
  # filler: ctx/2 tokens each side, 1 token ~4 chars
  local half_chars=$((ctx * 2))
  local sentence='The quick brown fox jumps over the lazy dog. '
  local filler=''
  while ((${#filler} < half_chars)); do
    filler+=$sentence
  done
  filler=${filler:0:half_chars}
  printf '%s\nSECRET_VALUE=%s\n%s\nWhat is SECRET_VALUE? Reply with only the value, nothing else.' \
    "$filler" "$needle" "$filler"
}

#######################################
# needle_test KEY MODEL NEEDLE CTX
# Generates ~CTX tokens of filler, plants needle in middle, asks to recall it.
# Globals:   NGL, TEMP, TOPP, TOPK, SYSP, STRIP_BANNER, LLAMA_CLI,
#            NEEDLE_GEN_TOKENS (read)
# Outputs:   "FOUND", or "MISSED (<first 80 chars of the flattened reply>)"
# NOTE: the prompt travels as one argv entry of about 4*CTX characters, so
# very large CTX values can hit the OS per-argument size limit.
#######################################
needle_test() {
  local key=$1 model=$2 needle=$3 ctx=$4
  local sys=${SYSP[$key]} topk=${TOPK[$key]}
  local prompt
  prompt=$(_needle_prompt "$needle" "$ctx")

  local args=(-m "$model" -ngl "${NGL[$key]}" -n "$NEEDLE_GEN_TOKENS"
    --temp "${TEMP[$key]}" --top-p "${TOPP[$key]}")
  # Same top-k rule as run(), so Qwen3 keeps its documented top_k=20 here too.
  if [[ "$topk" != "0" ]]; then
    args+=(--top-k "$topk")
  fi
  args+=(-fa on --mmap --single-turn -c "$((ctx + NEEDLE_GEN_TOKENS))")
  if [[ -n "$sys" ]]; then
    args+=(-sys "$sys")
  fi
  args+=(-p "$prompt")

  local out
  out=$(timeout 180 "$LLAMA_CLI" "${args[@]}" 2>/dev/null \
    | sed "$STRIP_BANNER" \
    | strip_think)

  # join lines before matching in case model breaks needle across newlines
  local flat=${out//$'\n'/ }
  if [[ "$flat" == *"$needle"* ]]; then
    echo "FOUND"
  else
    echo "MISSED (${flat:0:80})"
  fi
}

#######################################
# Driver: for every model run both coding checks and the needle tests,
# then print the aggregate result line.
# Globals:   M_SMOL, M_E2B, M_E4B, M_Q3, MAX_CTX, colour variables (read);
#            PASS, FAIL, TOTAL (written, also via check())
#######################################
main() {
  local hr="======================================================================"
  echo "$hr"
  echo "QUALITY TESTS — ALL MODELS — $(date)"
  echo "GPU: $(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null)"
  echo "$hr"
  printf 'Temps: SmolLM3=0.6/0.95 | Qwen3=0.7/0.8/k20 | Gemma4=0.7/0.95\n'
  printf '/no_think via -sys for needle tests | thinking ON for coding bug test\n\n'

  local coding_fizzbuzz='Write ONLY the Python function fizzbuzz(n). It returns a list where multiples of 3 are "Fizz", multiples of 5 are "Buzz", multiples of both are "FizzBuzz", others are the number as string. Output code only, no prose.'

  # hi is len(arr)-1 so the intended bug is lo=mid (infinite loop).
  # NOTE(review): with `while lo < hi` the loop also never inspects the last
  # remaining element (a one-element array returns -1), so an answer about
  # the loop condition is defensible yet fails the check below — confirm
  # whether the prompt should be tightened.
  local coding_bug='Find the bug in this Python function and explain it in one sentence:

def binary_search(arr, target):
    lo, hi = 0, len(arr) - 1
    while lo < hi:
        mid = (lo + hi) // 2
        if arr[mid] == target:
            return mid
        elif arr[mid] < target:
            lo = mid
        else:
            hi = mid
    return -1'

  local needle="QX7-ALPHA-9"
  local entry key lbl model out ctx max_ctx result
  for entry in "smollm3:SmolLM3-3B:$M_SMOL" "e2b:Gemma4-E2B:$M_E2B" \
    "e4b:Gemma4-E4B:$M_E4B" "q3:Qwen3-4B:$M_Q3"; do
    IFS=':' read -r key lbl model <<<"$entry"
    echo "=== $lbl ==="

    # Coding test 1: FizzBuzz — expect def + Fizz + Buzz
    out=$(run "$key" "$model" "$coding_fizzbuzz" 512)
    check "FizzBuzz: def + Fizz + Buzz in output" "$out" \
      "def " "Fizz" "Buzz"

    # Coding test 2: Bug — thinking ON for all models (more reliable reasoning).
    # Pass "" to disable /no_think override. Gemma4 already thinks by default.
    out=$(run "$key" "$model" "$coding_bug" 3000 "")
    check "Bug: identify lo=mid / infinite loop" "$out" \
      "lo.*=.*mid.*\+.*1|lo\+1|infinite loop|never.*advance|never.*progress|stuck|lo should be|lo\b.*never.*incr"

    # Needle-in-haystack. needle_test allocates ctx + NEEDLE_GEN_TOKENS of
    # context, so only run sizes whose full allocation fits the model.
    max_ctx=${MAX_CTX[$key]}
    for ctx in 1024 8192; do
      if ((ctx + NEEDLE_GEN_TOKENS <= max_ctx)); then
        result=$(needle_test "$key" "$model" "$needle" "$ctx")
        TOTAL=$((TOTAL + 1))
        if [[ "$result" == FOUND ]]; then
          PASS=$((PASS + 1))
          printf ' %bPASS%b Needle @ %s tok: %s\n' "$GREEN" "$NC" "$ctx" "$result"
        else
          FAIL=$((FAIL + 1))
          printf ' %bFAIL%b Needle @ %s tok: %s\n' "$RED" "$NC" "$ctx" "$result"
        fi
      else
        printf ' %bSKIP%b Needle @ %s tok (exceeds model max %s)\n' \
          "$YELLOW" "$NC" "$ctx" "$max_ctx"
      fi
    done
    echo ""
  done

  echo "$hr"
  printf 'RESULTS: %b%s PASSED%b / %b%s FAILED%b / %s TOTAL\n' \
    "$GREEN" "$PASS" "$NC" "$RED" "$FAIL" "$NC" "$TOTAL"
  echo "$hr"
}

main "$@"