Initial commit: tuned multi-model llama.cpp stack

- 5 models: SmolLM3-3B, Gemma4-E2B/E4B, Qwen3-4B, Qwen3.5-9B
- TurboQuant image (FORCE_MMQ): +6-11% free speed on Turing GPUs
- Bigctx profiles (-nkvo KV in RAM): 2-16x context gain
- turbo2 KV: 2x smaller, benchmarked against PPL quality gate
- Per-model env files with justified parameters
- kv_quant_test.sh + cpu_ctx_test.sh benchmark scripts
- docs/FINDINGS.md: surprises, pitfalls, recommendations
- docs/ARCHITECTURE.md: compose + test script design
2026-05-06 15:56:40 +02:00
commit 4ad296608b
22 changed files with 2530 additions and 0 deletions
--- a/scripts/quality_test.sh
+++ b/scripts/quality_test.sh
@@ -0,0 +1,215 @@
+#!/bin/bash
+# Quality tests for all 4 models — runs inside full-cuda container.
+# Tests: coding tasks + needle-in-haystack at 1K/8K ctx.
+#
+# Inference parameters sourced from official HF model cards:
+#   SmolLM3:  /no_think in SYSTEM prompt (-sys); temp=0.6 top_p=0.95
+#   Qwen3:    /no_think in SYSTEM prompt (-sys); temp=0.7 top_p=0.8 top_k=20
+#             DO NOT use greedy (temp=0) — causes endless repetition per Qwen3 docs
+#   Gemma4:   no /no_think switch is passed (thinking left at the model default); temp=0.7 top_p=0.95
+
+# Unset variables are errors and a pipeline fails if any stage fails.
+# `-e` is not enabled, so a failing command does not abort the script.
+set -u -o pipefail
+
+# GGUF model files, one variable per tested model.
+M_SMOL="/models/HuggingFaceTB_SmolLM3-3B-Q4_K_M.gguf"
+M_E2B="/models/google_gemma-4-E2B-it-Q4_K_M.gguf"
+M_E4B="/models/google_gemma-4-E4B-it-Q4_K_M.gguf"
+M_Q3="/models/Qwen3-4B-Q4_K_M.gguf"
+
+# Per-model tables, all indexed by the same short key (smollm3, e2b, e4b, q3).
+# NGL: value handed to llama-cli's -ngl option.
+declare -A NGL=(
+    [smollm3]=99
+    [e2b]=99
+    [e4b]=42
+    [q3]=99
+)
+# MAX_CTX: context limit used to decide which needle sizes are run or skipped.
+declare -A MAX_CTX=(
+    [smollm3]=24576
+    [e2b]=32768
+    [e4b]=24576
+    [q3]=8192
+)
+
+# Sampling parameters per model (taken from the HF model cards).
+declare -A TEMP=(
+    [smollm3]="0.6"
+    [e2b]="0.7"
+    [e4b]="0.7"
+    [q3]="0.7"
+)
+declare -A TOPP=(
+    [smollm3]="0.95"
+    [e2b]="0.95"
+    [e4b]="0.95"
+    [q3]="0.8"
+)
+# A TOPK of "0" means "do not pass --top-k at all".
+declare -A TOPK=(
+    [smollm3]="0"
+    [e2b]="0"
+    [e4b]="0"
+    [q3]="20"
+)
+# System prompt per model; /no_think switches thinking off for SmolLM3 and
+# Qwen3, an empty value means no system prompt is sent.
+declare -A SYSP=(
+    [smollm3]="/no_think"
+    [e2b]=""
+    [e4b]=""
+    [q3]="/no_think"
+)
+
+# Global result counters, updated by check() and the needle loop.
+PASS=0
+FAIL=0
+TOTAL=0
+
+# ANSI colour escapes; they are expanded inside printf format strings.
+GREEN='\033[0;32m'
+RED='\033[0;31m'
+YELLOW='\033[1;33m'
+NC='\033[0m'
+
+# sed program that removes the llama-cli interactive banner/UI lines from
+# stdout. The block glyphs ▄ (U+2584) and █ (U+2588) belong to the llama.cpp
+# ASCII logo and may be preceded by spaces, so they are matched anywhere on
+# the line.
+STRIP_BANNER='/^$/d
+/^Loading model/d
+/^[[:space:]]*$/d
+/[▄█]/d
+/^build /d
+/^model /d
+/^modalities/d
+/^available commands/d
+/^  \//d
+/^\[ Prompt:/d
+/^\[  Prompt:/d
+/^Exiting/d
+/^> /d
+'
+
+# check LABEL OUTPUT PATTERN...
+# Records one test result. The test passes only if OUTPUT matches every
+# PATTERN (extended regex, case-insensitive).
+# Globals:  PASS, FAIL, TOTAL (updated); GREEN, RED, NC (read)
+# Outputs:  a coloured PASS/FAIL line; on FAIL also the last three non-empty
+#           lines of OUTPUT, prefixed with "| ", for diagnosis.
+check() {
+    local lbl="$1" out="$2"
+    shift 2
+    # pat is declared local so the loop variable does not leak into the script.
+    local pat ok=1
+    for pat in "$@"; do
+        # Here-string instead of `printf | grep -q`: with pipefail set, grep -q
+        # exiting on its first match can kill the writer with SIGPIPE when the
+        # output exceeds the pipe buffer, turning a real match into a failure.
+        # -e keeps a pattern that starts with '-' from being read as an option.
+        grep -qiE -e "$pat" <<< "$out" || { ok=0; break; }
+    done
+    TOTAL=$((TOTAL+1))
+    if [ "$ok" = "1" ]; then
+        PASS=$((PASS+1)); printf "  ${GREEN}PASS${NC} %s\n" "$lbl"
+    else
+        FAIL=$((FAIL+1)); printf "  ${RED}FAIL${NC} %s\n" "$lbl"
+        grep -v '^$' <<< "$out" | tail -3 | sed 's/^/       | /'
+    fi
+}
+
+# Strip thinking blocks from output.
+# Gemma4 uses [Start thinking]...[End thinking].
+# Qwen3/SmolLM3 use <think>...</think>.
+# Match to end-of-string as fallback for truncated/incomplete blocks.
+strip_think() {
+    python3 -c "
+import sys, re
+t = sys.stdin.read()
+# Only strip COMPLETE blocks. If thinking hit token limit, leave as-is so
+# check patterns can still match reasoning content inside the block.
+t = re.sub(r'\[Start thinking\].*?\[End thinking\]', '', t, flags=re.DOTALL)
+t = re.sub(r'<think>.*?</think>', '', t, flags=re.DOTALL)
+print(t.strip())
+" 2>/dev/null || cat
+}
+
+# run KEY MODEL PROMPT MAX_TOKENS [SYS_OVERRIDE]
+# SYS_OVERRIDE defaults to SYSP[$key] if omitted.
+# Pass "" explicitly to disable system prompt (thinking ON for Qwen3/SmolLM3).
+# Thinking params: SmolLM3/Qwen3 thinking=temp0.6/top_p0.95, nothink=model defaults.
+run() {
+    local key=$1 model=$2 prompt=$3 max_tok=$4
+    local ngl="${NGL[$key]}"
+    # 5th arg overrides sys; if not provided, use SYSP[$key]
+    local use_sys
+    if [ "${5+x}" = "x" ]; then use_sys="$5"; else use_sys="${SYSP[$key]}"; fi
+    # choose sampling params: thinking mode uses 0.6/0.95, non-think uses model defaults
+    local temp topp topk
+    if [ -z "$use_sys" ] && [[ "$key" == "smollm3" || "$key" == "q3" ]]; then
+        temp="0.6"; topp="0.95"; topk="${TOPK[$key]}"
+    else
+        temp="${TEMP[$key]}"; topp="${TOPP[$key]}"; topk="${TOPK[$key]}"
+    fi
+    local sys_arg=()
+    [ -n "$use_sys" ] && sys_arg=(-sys "$use_sys")
+    local topk_arg=()
+    [ "$topk" != "0" ] && topk_arg=(--top-k "$topk")
+    timeout 300 /app/llama-cli -m "$model" -ngl "$ngl" \
+        -n "$max_tok" --temp "$temp" --top-p "$topp" "${topk_arg[@]}" \
+        --repeat-penalty 1.1 -fa on --mmap --single-turn \
+        "${sys_arg[@]}" -p "$prompt" 2>/dev/null \
+    | sed "$STRIP_BANNER" \
+    | strip_think
+}
+
+# needle_test KEY MODEL NEEDLE CTX
+# Builds a prompt of roughly CTX tokens of filler with SECRET_VALUE=NEEDLE
+# planted in the middle, asks the model to recall the value, and prints
+# "FOUND" or "MISSED (<first 80 chars of the reply>)".
+#   KEY     index into the per-model tables (NGL, TEMP, TOPP, TOPK, SYSP)
+#   MODEL   path of the GGUF file
+#   NEEDLE  literal value that must appear in the reply
+#   CTX     approximate prompt size in tokens; llama-cli gets -c CTX+512
+needle_test() {
+    local key=$1 model=$2 needle=$3 ctx=$4
+    local ngl="${NGL[$key]}"
+    local temp="${TEMP[$key]}" topp="${TOPP[$key]}" topk="${TOPK[$key]}"
+    local sys="${SYSP[$key]}"
+    local sys_arg=()
+    [ -n "$sys" ] && sys_arg=(-sys "$sys")
+    # Same convention as run(): a TOPK of 0 means "do not pass --top-k".
+    # Without this the needle runs would sample with llama-cli's default top-k
+    # instead of the per-model value.
+    local topk_arg=()
+    [ "$topk" != "0" ] && topk_arg=(--top-k "$topk")
+
+    # filler: ctx/2 tokens each side, 1 token ~4 chars.
+    # Built in pure bash so a missing helper program cannot silently yield an
+    # empty haystack (which would make the test trivially pass).
+    local half_chars=$(( ctx * 2 ))
+    local sentence='The quick brown fox jumps over the lazy dog. '
+    local filler=''
+    while (( ${#filler} < half_chars )); do
+        filler+=$sentence
+    done
+    filler=${filler:0:half_chars}
+
+    local prompt
+    printf -v prompt \
+        '%s\nSECRET_VALUE=%s\n%s\nWhat is SECRET_VALUE? Reply with only the value, nothing else.' \
+        "$filler" "$needle" "$filler"
+
+    # 512 extra tokens of context leave room for the reply (-n 512).
+    local ctx_size=$(( ctx + 512 ))
+    local out
+    out=$(timeout 180 /app/llama-cli -m "$model" -ngl "$ngl" \
+        -n 512 --temp "$temp" --top-p "$topp" "${topk_arg[@]}" \
+        -fa on --mmap --single-turn \
+        -c "$ctx_size" "${sys_arg[@]}" -p "$prompt" 2>/dev/null \
+    | sed "$STRIP_BANNER" \
+    | strip_think)
+
+    # The model may break the needle across lines. Remove CR/LF before the
+    # literal match so a wrapped needle is rejoined; replacing the newline
+    # with a space would leave it split.
+    local joined=${out//[$'\r\n']/}
+    if [[ "$joined" == *"$needle"* ]]; then
+        echo "FOUND"
+    else
+        # One-line preview of the reply for the failure message.
+        local flat=${out//$'\n'/ }
+        local snip=${flat:0:80}
+        echo "MISSED (${snip:-<empty>})"
+    fi
+}
+
+# Report header: separator rule, timestamp, GPU name and the sampling summary.
+HR="======================================================================"
+printf '%s\n' "$HR"
+printf 'QUALITY TESTS — ALL MODELS — %s\n' "$(date)"
+# nvidia-smi errors are hidden; the GPU field is simply empty if it fails.
+printf 'GPU: %s\n' "$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null)"
+printf '%s\n' "$HR"
+printf '%s\n' "Temps: SmolLM3=0.6/0.95 | Qwen3=0.7/0.8/k20 | Gemma4=0.7/0.95"
+printf '%s\n\n' "/no_think via -sys for needle tests | thinking ON for coding bug test"
+
+# Prompt for coding test 1: asks for a bare Python fizzbuzz(n) implementation.
+# The per-model loop passes the reply to check() with the patterns
+# "def ", "Fizz" and "Buzz".
+CODING_FIZZBUZZ='Write ONLY the Python function fizzbuzz(n). It returns a list where multiples of 3 are "Fizz", multiples of 5 are "Buzz", multiples of both are "FizzBuzz", others are the number as string. Output code only, no prose.'
+
+# Prompt for coding test 2: a binary search with a planted defect.
+# hi is correctly len(arr)-1 so the intended bug is lo=mid (infinite loop).
+# NOTE(review): with an inclusive hi, `while lo < hi` also exits before the
+# last candidate (lo == hi) is examined, so a model may legitimately report
+# the loop condition instead; confirm whether the check patterns should
+# accept that answer too.
+CODING_BUG='Find the bug in this Python function and explain it in one sentence:
+def binary_search(arr, target):
+    lo, hi = 0, len(arr) - 1
+    while lo < hi:
+        mid = (lo + hi) // 2
+        if arr[mid] == target:
+            return mid
+        elif arr[mid] < target:
+            lo = mid
+        else:
+            hi = mid
+    return -1'
+
+# Per-model suite. Each entry is "key:label:model-path"; key indexes the
+# per-model tables, label is only used for display.
+for entry in "smollm3:SmolLM3-3B:$M_SMOL" "e2b:Gemma4-E2B:$M_E2B" "e4b:Gemma4-E4B:$M_E4B" "q3:Qwen3-4B:$M_Q3"; do
+    IFS=':' read -r key lbl model <<< "$entry"
+    printf '=== %s ===\n' "$lbl"
+
+    # Coding test 1: FizzBuzz. The reply must contain def, Fizz and Buzz.
+    out=$(run "$key" "$model" "$CODING_FIZZBUZZ" 512)
+    check "FizzBuzz: def + Fizz + Buzz in output" "$out" "def " "Fizz" "Buzz"
+
+    # Coding test 2: bug hunt with thinking ON for every model (more reliable
+    # reasoning). The explicit "" suppresses the /no_think system prompt;
+    # Gemma4 already thinks by default.
+    out=$(run "$key" "$model" "$CODING_BUG" 3000 "")
+    check "Bug: identify lo=mid / infinite loop" "$out" \
+        "lo.*=.*mid.*\+.*1|lo\+1|infinite loop|never.*advance|never.*progress|stuck|lo should be|lo\b.*never.*incr"
+
+    # Needle-in-haystack at each context size the model can hold.
+    NEEDLE="QX7-ALPHA-9"
+    for ctx in 1024 8192; do
+        # Sizes at or above the model limit are skipped: at ctx == max_ctx the
+        # prompt would fill the whole context and leave no room for output.
+        if (( ctx >= ${MAX_CTX[$key]} )); then
+            printf "  ${YELLOW}SKIP${NC} Needle @ %s tok (exceeds model max %s)\n" "$ctx" "${MAX_CTX[$key]}"
+            continue
+        fi
+
+        result=$(needle_test "$key" "$model" "$NEEDLE" "$ctx")
+        TOTAL=$((TOTAL+1))
+        case "$result" in
+            FOUND)
+                PASS=$((PASS+1))
+                printf "  ${GREEN}PASS${NC} Needle @ %s tok: %s\n" "$ctx" "$result"
+                ;;
+            *)
+                FAIL=$((FAIL+1))
+                printf "  ${RED}FAIL${NC} Needle @ %s tok: %s\n" "$ctx" "$result"
+                ;;
+        esac
+    done
+
+    printf '\n'
+done
+
+# Final tally across all models.
+printf '%s\n' "$HR"
+printf "RESULTS: ${GREEN}%s PASSED${NC} / ${RED}%s FAILED${NC} / %s TOTAL\n" "$PASS" "$FAIL" "$TOTAL"
+printf '%s\n' "$HR"