llama-cpp/scripts/quality_test.sh

#!/bin/bash
# Quality tests for all 4 models — runs inside full-cuda container.
# Tests: coding tasks + needle-in-haystack at 1K/8K ctx.
#
# Inference parameters sourced from official HF model cards:
#   SmolLM3:  /no_think in SYSTEM prompt (-sys); temp=0.6 top_p=0.95
#   Qwen3:    /no_think in SYSTEM prompt (-sys); temp=0.7 top_p=0.8 top_k=20
#             DO NOT use greedy (temp=0) — causes endless repetition per Qwen3 docs
#   Gemma4:   no /no_think switch is passed (any thinking block is removed by strip_think); temp=0.7 top_p=0.95

# Unset variables and failing pipeline stages are treated as errors.
# Note that -e is not enabled.
set -u
set -o pipefail

# GGUF model files under /models.
M_SMOL="/models/HuggingFaceTB_SmolLM3-3B-Q4_K_M.gguf"
M_E2B="/models/google_gemma-4-E2B-it-Q4_K_M.gguf"
M_E4B="/models/google_gemma-4-E4B-it-Q4_K_M.gguf"
M_Q3="/models/Qwen3-4B-Q4_K_M.gguf"

# Value handed to llama-cli -ngl for each model key.
declare -A NGL=(
    [smollm3]=99
    [e2b]=99
    [e4b]=42
    [q3]=99
)

# Per-model context ceiling; needle sizes at or above it are skipped.
declare -A MAX_CTX=(
    [smollm3]=24576
    [e2b]=32768
    [e4b]=24576
    [q3]=8192
)

# Per-model sampling params (HF model card sources).
declare -A TEMP=(
    [smollm3]="0.6"
    [e2b]="0.7"
    [e4b]="0.7"
    [q3]="0.7"
)
declare -A TOPP=(
    [smollm3]="0.95"
    [e2b]="0.95"
    [e4b]="0.95"
    [q3]="0.8"
)
# "0" means: do not pass --top-k at all.
declare -A TOPK=(
    [smollm3]="0"
    [e2b]="0"
    [e4b]="0"
    [q3]="20"
)

# Default system prompt per model.
# /no_think in system prompt disables thinking for SmolLM3 and Qwen3.
declare -A SYSP=(
    [smollm3]="/no_think"
    [e2b]=""
    [e4b]=""
    [q3]="/no_think"
)

# Result counters, updated as tests run.
PASS=0
FAIL=0
TOTAL=0

# ANSI colour escapes; they are expanded by printf when used in a format string.
GREEN='\033[0;32m'
RED='\033[0;31m'
YELLOW='\033[1;33m'
NC='\033[0m'

# sed script to strip llama-cli interactive UI banner from stdout.
# ▄ (U+2584) and █ (U+2588) appear in the llama.cpp ASCII logo, sometimes
# with leading spaces — match anywhere on the line to be safe.
# Each glyph has its own rule rather than sharing one bracket expression:
# in a single-byte locale (e.g. LC_ALL=C) a bracket expression holding
# multibyte characters is a set of individual BYTES, so it would also delete
# every line containing some other character that shares one of those bytes
# (em dash, curly quotes, arrows, ...), i.e. ordinary model output.
STRIP_BANNER='/^$/d
/^Loading model/d
/^[[:space:]]*$/d
/▄/d
/█/d
/^build /d
/^model /d
/^modalities/d
/^available commands/d
/^  \//d
/^\[ Prompt:/d
/^\[  Prompt:/d
/^Exiting/d
/^> /d
'

# check LABEL OUTPUT PATTERN...
# Records one test result. The test passes when OUTPUT matches every PATTERN
# (case-insensitive extended regex, matched per line); with no PATTERN it
# passes trivially.
# Globals:  TOTAL, PASS, FAIL (incremented); GREEN, RED, NC (read)
# Outputs:  one "PASS LABEL" / "FAIL LABEL" line on stdout; on FAIL also the
#           last 3 non-empty lines of OUTPUT, each prefixed with "       | ".
check() {
    local lbl="$1" out="$2"
    shift 2
    local pat ok=1
    for pat in "$@"; do
        # Here-string instead of `printf | grep -q`: grep -q exits at the
        # first match, and under `set -o pipefail` a writer killed by SIGPIPE
        # would turn that successful match into a pipeline failure.
        # `--` keeps a pattern that starts with "-" from being read as an option.
        if ! grep -qiE -- "$pat" <<< "$out"; then
            ok=0
            break
        fi
    done
    TOTAL=$((TOTAL+1))
    if [ "$ok" = "1" ]; then
        PASS=$((PASS+1)); printf "  ${GREEN}PASS${NC} %s\n" "$lbl"
    else
        FAIL=$((FAIL+1)); printf "  ${RED}FAIL${NC} %s\n" "$lbl"
        # Show the tail of the output so a failure can be diagnosed from the log.
        printf '%s\n' "$out" | grep -v '^$' | tail -3 | sed 's/^/       | /'
    fi
}

# Strip thinking blocks from model output (reads stdin, writes stdout).
# Gemma4 uses [Start thinking]...[End thinking].
# Qwen3/SmolLM3 use <think>...</think>.
# Only COMPLETE blocks are removed; an unterminated block is left in place.
# Surrounding whitespace is trimmed from the result.
# If python3 is missing or exits non-zero, the input is passed through
# unchanged. The input is buffered first so that this fallback still has the
# text even when python3 consumed stdin before failing.
strip_think() {
    local raw cleaned
    raw=$(cat)
    if cleaned=$(python3 -c "
import sys, re
t = sys.stdin.read()
# Only strip COMPLETE blocks. If thinking hit token limit, leave as-is so
# check patterns can still match reasoning content inside the block.
t = re.sub(r'\[Start thinking\].*?\[End thinking\]', '', t, flags=re.DOTALL)
t = re.sub(r'<think>.*?</think>', '', t, flags=re.DOTALL)
print(t.strip())
" <<< "$raw" 2>/dev/null); then
        printf '%s\n' "$cleaned"
    else
        printf '%s\n' "$raw"
    fi
}

# run KEY MODEL PROMPT MAX_TOKENS [SYS_OVERRIDE]
# Runs one single-turn llama-cli generation (300 s timeout, stderr discarded)
# and prints its stdout after banner stripping and think-block stripping.
#   KEY           index into NGL / TEMP / TOPP / TOPK / SYSP
#   MODEL         path handed to -m
#   PROMPT        text handed to -p
#   MAX_TOKENS    value handed to -n
#   SYS_OVERRIDE  system prompt; SYSP[KEY] is used when the argument is
#                 omitted. An explicit "" sends no system prompt at all
#                 (thinking ON for Qwen3/SmolLM3).
# Sampling: smollm3/q3 with an empty system prompt run at temp 0.6 /
# top_p 0.95; every other combination uses TEMP[KEY] / TOPP[KEY].
# --top-k is passed only when TOPK[KEY] is not "0".
run() {
    local key=$1 model=$2 prompt=$3 max_tok=$4

    # A fifth argument, even an empty one, wins over the per-model default.
    local sys_prompt
    if (( $# >= 5 )); then
        sys_prompt=$5
    else
        sys_prompt=${SYSP[$key]}
    fi

    # Start from the per-model values, then switch to the thinking-mode
    # values for the two models that think when no system prompt is given.
    local temp=${TEMP[$key]} topp=${TOPP[$key]} topk=${TOPK[$key]}
    if [[ -z "$sys_prompt" ]]; then
        case "$key" in
            smollm3|q3) temp="0.6"; topp="0.95" ;;
        esac
    fi

    # Assemble the command line in one array, optional flags in place.
    local -a cmd=(
        /app/llama-cli -m "$model" -ngl "${NGL[$key]}"
        -n "$max_tok" --temp "$temp" --top-p "$topp"
    )
    if [[ "$topk" != "0" ]]; then
        cmd+=(--top-k "$topk")
    fi
    cmd+=(--repeat-penalty 1.1 -fa on --mmap --single-turn)
    if [[ -n "$sys_prompt" ]]; then
        cmd+=(-sys "$sys_prompt")
    fi
    cmd+=(-p "$prompt")

    timeout 300 "${cmd[@]}" 2>/dev/null \
        | sed "$STRIP_BANNER" \
        | strip_think
}

# needle_test KEY MODEL NEEDLE CTX
# Needle-in-haystack probe: builds a prompt of roughly CTX tokens of filler
# with the line "SECRET_VALUE=NEEDLE" planted in the middle, asks the model to
# recall the value, and prints the verdict on stdout:
#   FOUND                 NEEDLE occurs in the (banner/think-stripped) output
#   MISSED (<snippet>)    otherwise; snippet = first 80 chars of the output,
#                         or "<empty>" when there was none
#   KEY     index into NGL / TEMP / TOPP / TOPK / SYSP
#   MODEL   path handed to -m
#   CTX     nominal prompt size in tokens; llama-cli gets -c CTX+512
# NOTE(review): if llama-cli echoes a multi-line prompt, only its first line
# is removed by the '^> ' rule in STRIP_BANNER, and the echoed SECRET_VALUE
# line would then count as a hit — confirm against real llama-cli output.
needle_test() {
    local key=$1 model=$2 needle=$3 ctx=$4
    local ngl="${NGL[$key]}"
    local temp="${TEMP[$key]}" topp="${TOPP[$key]}" topk="${TOPK[$key]}"
    local sys="${SYSP[$key]}"
    local sys_arg=() topk_arg=()
    [ -n "$sys" ] && sys_arg=(-sys "$sys")
    # Same rule as run(): honour the per-model top-k so the needle test
    # samples with the parameters the script advertises (e.g. Qwen3 k=20).
    [ "$topk" != "0" ] && topk_arg=(--top-k "$topk")

    # filler: ctx/2 tokens each side, 1 token ~4 chars.
    # Built in pure bash so a missing python3 cannot silently produce an
    # empty haystack (which would make the test trivially easy).
    local unit='The quick brown fox jumps over the lazy dog. '
    local half_chars=$(( ctx * 2 ))
    local reps=$(( half_chars / ${#unit} + 2 ))
    local filler
    printf -v filler '%*s' "$reps" ''   # reps spaces ...
    filler=${filler// /$unit}           # ... each replaced by one sentence
    filler=${filler:0:half_chars}

    local prompt
    printf -v prompt \
        '%s\nSECRET_VALUE=%s\n%s\nWhat is SECRET_VALUE? Reply with only the value, nothing else.' \
        "$filler" "$needle" "$filler"

    # Leave room for the 512 generated tokens on top of the prompt.
    local ctx_size=$(( ctx + 512 ))
    local out
    out=$(timeout 180 /app/llama-cli -m "$model" -ngl "$ngl" \
        -n 512 --temp "$temp" --top-p "$topp" "${topk_arg[@]}" \
        -fa on --mmap --single-turn \
        -c "$ctx_size" "${sys_arg[@]}" -p "$prompt" 2>/dev/null \
    | sed "$STRIP_BANNER" \
    | strip_think)

    # Two single-line views of the output: newlines turned into spaces (also
    # used for the snippet) and newlines removed. The second one is what
    # actually catches a needle the model broke across lines.
    local flat=${out//$'\n'/ }
    local glued=${out//$'\n'/}
    if [[ "$flat" == *"$needle"* || "$glued" == *"$needle"* ]]; then
        echo "FOUND"
    else
        local snip=${flat:0:80}
        echo "MISSED (${snip:-<empty>})"
    fi
}

# Run header: rule, title with timestamp, GPU name, rule, then a summary of
# the sampling settings. HR is reused for the closing summary.
# nvidia-smi errors are discarded, so the GPU line is just "GPU: " without it.
HR="======================================================================"
printf '%s\n' \
    "$HR" \
    "QUALITY TESTS — ALL MODELS — $(date)" \
    "GPU: $(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null)" \
    "$HR"
printf "Temps: SmolLM3=0.6/0.95 | Qwen3=0.7/0.8/k20 | Gemma4=0.7/0.95\n"
printf "/no_think via -sys for needle tests | thinking ON for coding bug test\n\n"

# Prompt for coding test 1: asks for a bare fizzbuzz(n) implementation.
CODING_FIZZBUZZ='Write ONLY the Python function fizzbuzz(n). It returns a list where multiples of 3 are "Fizz", multiples of 5 are "Buzz", multiples of both are "FizzBuzz", others are the number as string. Output code only, no prose.'

# Prompt for coding test 2: a binary search with ONE unambiguous bug.
# The function is the closed-interval form (hi = len(arr) - 1,
# `while lo <= hi`, hi = mid - 1), which is correct except for `lo = mid`:
# lo never advances past mid, so the loop can spin forever.
# The loop must be `<=` with `hi = mid - 1`: with `while lo < hi` and
# `hi = mid` the last element is never examined either, which is a second,
# equally valid bug for a model to report.
CODING_BUG='Find the bug in this Python function and explain it in one sentence:
def binary_search(arr, target):
    lo, hi = 0, len(arr) - 1
    while lo <= hi:
        mid = (lo + hi) // 2
        if arr[mid] == target:
            return mid
        elif arr[mid] < target:
            lo = mid
        else:
            hi = mid - 1
    return -1'

# Main loop: one "key:label:model-path" entry per model. For each model run
# the two coding checks, then the needle test at every size that fits.
for entry in "smollm3:SmolLM3-3B:$M_SMOL" "e2b:Gemma4-E2B:$M_E2B" "e4b:Gemma4-E4B:$M_E4B" "q3:Qwen3-4B:$M_Q3"; do
    IFS=':' read -r key lbl model <<< "$entry"
    echo "=== $lbl ==="

    # Coding test 1: FizzBuzz — the answer must contain "def ", "Fizz" and
    # "Buzz". Uses the model's default system prompt.
    out=$(run "$key" "$model" "$CODING_FIZZBUZZ" 512)
    check "FizzBuzz: def + Fizz + Buzz in output" "$out" \
        "def " "Fizz" "Buzz"

    # Coding test 2: bug hunt. The explicit "" fifth argument makes run()
    # send no system prompt, i.e. no /no_think, so SmolLM3/Qwen3 reason
    # before answering. Any one of the alternatives below counts as a hit.
    out=$(run "$key" "$model" "$CODING_BUG" 3000 "")
    check "Bug: identify lo=mid / infinite loop" "$out" \
        "lo.*=.*mid.*\+.*1|lo\+1|infinite loop|never.*advance|never.*progress|stuck|lo should be|lo\b.*never.*incr"

    # Needle-in-haystack
    NEEDLE="QX7-ALPHA-9"
    for ctx in 1024 8192; do
        # Skip sizes that reach the model limit: at ctx == max_ctx the prompt
        # fills the entire context and leaves no room for output.
        if [[ "$ctx" -ge "${MAX_CTX[$key]}" ]]; then
            printf "  ${YELLOW}SKIP${NC} Needle @ %s tok (exceeds model max %s)\n" "$ctx" "${MAX_CTX[$key]}"
            continue
        fi

        result=$(needle_test "$key" "$model" "$NEEDLE" "$ctx")
        TOTAL=$((TOTAL+1))
        case "$result" in
            FOUND)
                PASS=$((PASS+1))
                printf "  ${GREEN}PASS${NC} Needle @ %s tok: %s\n" "$ctx" "$result"
                ;;
            *)
                FAIL=$((FAIL+1))
                printf "  ${RED}FAIL${NC} Needle @ %s tok: %s\n" "$ctx" "$result"
                ;;
        esac
    done

    echo ""
done

# Closing summary: pass / fail / total counters between two rules.
# NOTE(review): nothing here turns FAIL into an exit status; if this is the
# end of the script it exits with the status of the last print — confirm
# whether callers expect a non-zero exit on failures.
printf '%s\n' "$HR"
printf "RESULTS: ${GREEN}%s PASSED${NC} / ${RED}%s FAILED${NC} / %s TOTAL\n" \
    "$PASS" \
    "$FAIL" \
    "$TOTAL"
printf '%s\n' "$HR"