#!/bin/bash
# Benchmark all 4 new models on GTX 1650 Ti (3717 MiB VRAM)
# Priority: max context size > tg speed
# Runs inside ghcr.io/ggml-org/llama.cpp:full-cuda (build b9014, no -c flag)
#
# Architecture context limits (from GGUF metadata):
#   SmolLM3-3B   : 65536  (full attention, KV-limited to ~28K in practice)
#   Gemma4-E2B   : 131072 (hybrid: sliding_window=512 → huge ctx possible)
#   Gemma4-E4B   : 131072 (hybrid: sliding_window=512)
#   Qwen3-4B     : 40960  (full attention, KV-limited to ~9K in practice)
#
# NOTE: llama-bench b9014 has NO -c flag. Context is set by -p (prompt tokens).
#   -p N -n G allocates KV for N+G tokens. OOM = exit!=0 or an error pattern in the captured output (stdout+stderr).

# errexit (-e) is not enabled: failed benchmark runs are detected from the
# captured exit status instead of aborting the whole script.
set -u
set -o pipefail

# GGUF files of the four models under test.
M_SMOL=/models/HuggingFaceTB_SmolLM3-3B-Q4_K_M.gguf
M_E2B=/models/google_gemma-4-E2B-it-Q4_K_M.gguf
M_E4B=/models/google_gemma-4-E4B-it-Q4_K_M.gguf
M_Q3=/models/Qwen3-4B-Q4_K_M.gguf

# -- CSV column detection --
# 1-based positions of the avg_ts / n_gen / n_prompt columns in the llama-bench
# CSV output; 0 means "not detected". bench and bench_ctx are invoked through
# $(...), so values stored here by detect_cols do not outlive that subshell.
TS_COL=0
NG_COL=0
NP_COL=0

# detect_cols OUTPUT
# Finds the CSV header (the first line that starts with build_commit once all
# double quotes are removed) and records the column positions in TS_COL,
# NG_COL and NP_COL. A missing header or missing column leaves that value at 0.
detect_cols() {
    local line header="" i
    local -a cols=()

    # Scan for the header line; log lines may precede it in the captured output.
    while IFS= read -r line; do
        line=${line//\"/}
        if [[ "$line" == build_commit* ]]; then
            header=$line
            break
        fi
    done <<< "$1"

    IFS=',' read -r -a cols <<< "$header"

    TS_COL=0
    NG_COL=0
    NP_COL=0
    for ((i = 0; i < ${#cols[@]}; i++)); do
        # Only the first occurrence of each column name counts.
        case "${cols[i]}" in
            avg_ts)
                if [[ "$TS_COL" == 0 ]]; then TS_COL=$((i + 1)); fi
                ;;
            n_gen)
                if [[ "$NG_COL" == 0 ]]; then NG_COL=$((i + 1)); fi
                ;;
            n_prompt)
                if [[ "$NP_COL" == 0 ]]; then NP_COL=$((i + 1)); fi
                ;;
        esac
    done
    return 0
}

# parse_speeds OUTPUT
# Prints "<pp> pp / <tg> tg t/s" (no trailing newline) from llama-bench CSV.
#   pp: avg_ts of the first row with n_prompt > 0 and n_gen == 0, as %.0f
#   tg: avg_ts of the first row with n_gen > 0 and n_prompt == 0, as %.1f
# A value that cannot be found is printed as "-".
# Reads TS_COL / NP_COL / NG_COL and runs detect_cols first when TS_COL is 0.
parse_speeds() {
    local raw=$1
    if [[ "${TS_COL:-0}" == 0 ]]; then
        detect_cols "$raw"
    fi

    # Single pass over the quote-stripped output; the first input line is
    # never treated as a data row.
    awk -F',' -v tc="$TS_COL" -v np="$NP_COL" -v ng="$NG_COL" '
        NR == 1 { next }
        pp == "" && $np + 0 > 0 && $ng + 0 == 0 { pp = sprintf("%.0f", $tc + 0) }
        tg == "" && $ng + 0 > 0 && $np + 0 == 0 { tg = sprintf("%.1f", $tc + 0) }
        END {
            if (pp == "") pp = "-"
            if (tg == "") tg = "-"
            printf "%s pp / %s tg t/s", pp, tg
        }
    ' <<< "${raw//\"/}"
}

# is_oom OUTPUT EXIT_CODE
# Decides whether a llama-bench run must be treated as failed.
# Returns 0 (failed) when EXIT_CODE is non-zero or OUTPUT contains one of the
# known failure markers (case-insensitive); returns 1 otherwise.
is_oom() {
    local out="$1" ec="$2"
    if [ "$ec" -ne 0 ]; then
        return 0
    fi
    # Feed grep through a here-string instead of `printf | grep -q`: with
    # `set -o pipefail`, grep -q quitting on an early match can kill the writer
    # with SIGPIPE on large output, turning a real match into pipeline status
    # 141 and therefore a false "no error".
    if grep -qiE "failed to create context|out of memory|GGML_ASSERT|error:" <<< "$out"; then
        return 0
    fi
    return 1
}

# bench MODEL NGL [llama-bench extra args...]
# Runs one llama-bench invocation (batch 512, ubatch 128, CSV output, 250 s
# timeout) with stdout and stderr captured together.
# Prints "OOM" when is_oom flags the run, otherwise the parse_speeds summary.
bench() {
    local gguf=$1 layers=$2
    shift 2

    local -a cmd=(timeout 250 /app/llama-bench -m "$gguf" -ngl "$layers"
                  -b 512 -ub 128 -o csv "$@")
    local log status
    log=$("${cmd[@]}" 2>&1)
    status=$?

    if is_oom "$log" "$status"; then
        echo "OOM"
        return
    fi
    if [[ "${TS_COL:-0}" == 0 ]]; then
        detect_cols "$log"
    fi
    parse_speeds "$log"
}

# bench_ctx MODEL NGL CTX
# Context-capacity probe: runs llama-bench with -p CTX -n 1 -r 1 --no-warmup.
# Tries -fa 1 first and falls back to -fa 0 when is_oom flags the run.
# Prints "OK (<pp> pp t/s fa=<0|1>)" for the first attempt that succeeds, or
# "OOM" when both attempts fail. <pp> is "-" when no prompt row (or no CSV
# column layout) could be found in the output.
bench_ctx() {
    local model=$1 ngl=$2 ctx=$3
    # `fa` is declared local so the loop variable does not leak to the caller.
    local out ec fa fa_used=""
    for fa in 1 0; do
        out=$(timeout 250 /app/llama-bench -m "$model" -ngl "$ngl" \
            -p "$ctx" -n 1 -r 1 --no-warmup \
            -b 512 -ub 128 -fa "$fa" -t 6 -o csv 2>&1)
        ec=$?
        if ! is_oom "$out" "$ec"; then
            fa_used=$fa
            break
        fi
    done
    if [[ -z "$fa_used" ]]; then
        echo "OOM"
        return
    fi

    if [[ "${TS_COL:-0}" == 0 ]]; then
        detect_cols "$out"
    fi

    local pp=""
    # With an undetected column (index 0) awk would read $0 instead of a field
    # and could print an unrelated number, so parsing is skipped in that case.
    if [[ "${TS_COL:-0}" != 0 && "${NP_COL:-0}" != 0 ]]; then
        pp=$(awk -F',' -v tc="$TS_COL" -v np="$NP_COL" \
            'NR>1 && $np+0>0 {printf "%.0f", $tc+0; exit}' <<< "${out//\"/}")
    fi
    printf "OK (%s pp t/s fa=%s)" "${pp:--}" "$fa_used"
}

# Report banner: rule, title with start time, GPU name/memory (or "unknown"
# when the nvidia-smi query fails), rule, blank line. HR is reused at the end.
HR="======================================================================"
banner_date=$(date)
banner_gpu=$(nvidia-smi --query-gpu=name,memory.total --format=csv,noheader 2>/dev/null || echo unknown)
printf '%s\n' \
    "$HR" \
    "LLAMA.CPP BENCHMARK — ALL MODELS — $banner_date" \
    "GPU: $banner_gpu" \
    "$HR" \
    ""

# ── Phase 1: Baseline (small context) ────────────────────────────────────────
# One short speed run per model at ngl=99; each entry is "label:model-path".
echo "=== Phase 1: Baseline (ngl=99, p=512 n=128 r=2, t=6, fa=0) ==="
baseline_entries=(
    "SmolLM3-3B:$M_SMOL"
    "Gemma4-E2B:$M_E2B"
    "Gemma4-E4B:$M_E4B"
    "Qwen3-4B:$M_Q3"
)
for entry in "${baseline_entries[@]}"; do
    IFS=':' read -r lbl mdl <<< "$entry"
    baseline_result=$(bench "$mdl" 99 -p 512 -n 128 -r 2 -t 6 -fa 0)
    printf "  %-14s  %s\n" "$lbl" "$baseline_result"
done
echo ""

# ── Phase 2: Gemma4-E4B ngl sweep ────────────────────────────────────────────
# Walks ngl upward and stops at the first failing value; best_e4b_ngl keeps
# the last value that ran and is reused by the later phases.
echo "=== Phase 2: Gemma4-E4B ngl sweep (p=16 n=64 r=1 t=6 fa=0) ==="
echo "    5.1GB model on 3.7GB VRAM — finding highest ngl before OOM"
best_e4b_ngl=0
ngl_steps=(0 4 8 12 16 20 24 28 32 36 42)
for ngl in "${ngl_steps[@]}"; do
    ts=$(bench "$M_E4B" "$ngl" -p 16 -n 64 -r 1 -t 6 -fa 0)
    printf "  ngl=%-3s  %s\n" "$ngl" "$ts"
    if [[ "$ts" == "OOM" ]]; then
        break
    fi
    best_e4b_ngl=$ngl
done
echo "  → best_e4b_ngl=$best_e4b_ngl"
echo ""

# ── Phase 3: Max context sweep ────────────────────────────────────────────────
# For each model, probes increasing context sizes with bench_ctx and stops at
# the first size reported as OOM. BEST_CTX[key] holds the largest size that
# succeeded and is read again in Phase 4.
echo "=== Phase 3: Max context (p=ctx n=1 r=1 no-warmup fa=1) ==="
echo "    Gemma4 hybrid attention (sliding_window=512) enables large ctx cheaply."
# Start from 0 ("nothing succeeded") rather than 512, so a model that fails
# even the smallest probe is not reported as having a working 512 context.
declare -A BEST_CTX
BEST_CTX[smollm3]=0; BEST_CTX[e2b]=0; BEST_CTX[e4b]=0; BEST_CTX[q3]=0

# Entry format: key:label:model-path:ngl
for entry in "smollm3:SmolLM3-3B:$M_SMOL:99" \
             "e2b:Gemma4-E2B:$M_E2B:99" \
             "e4b:Gemma4-E4B:$M_E4B:$best_e4b_ngl" \
             "q3:Qwen3-4B:$M_Q3:99"; do
    IFS=':' read -r key lbl mdl ngl <<< "$entry"
    echo "  -- $lbl (ngl=$ngl) --"
    for ctx in 512 1024 2048 4096 8192 12288 16384 24576 32768 49152 65536 98304 131072; do
        ts=$(bench_ctx "$mdl" "$ngl" "$ctx")
        printf "    ctx=%-7s  %s\n" "$ctx" "$ts"
        if [[ "$ts" == "OOM" ]]; then
            break
        fi
        BEST_CTX[$key]=$ctx
    done
    echo "    → MAX ctx=${BEST_CTX[$key]}"
done
echo ""

# ── Phase 4: TG speed at max context ─────────────────────────────────────────
# Runs the p=512/n=128 speed test with fa=1 for each model and prints it next
# to the context size recorded in BEST_CTX.
echo "=== Phase 4: TG speed at max context (p=512 n=128 r=2 fa=1 t=6) ==="
phase4_entries=(
    "smollm3:SmolLM3-3B:$M_SMOL:99"
    "e2b:Gemma4-E2B:$M_E2B:99"
    "e4b:Gemma4-E4B:$M_E4B:$best_e4b_ngl"
    "q3:Qwen3-4B:$M_Q3:99"
)
for entry in "${phase4_entries[@]}"; do
    # Peel the colon-separated fields off in order: key, label, path, ngl.
    key=${entry%%:*}
    rest=${entry#*:}
    lbl=${rest%%:*}
    rest=${rest#*:}
    mdl=${rest%%:*}
    ngl=${rest#*:}
    ts=$(bench "$mdl" "$ngl" -p 512 -n 128 -r 2 -fa 1 -t 6)
    printf "  %-14s  max_ctx=%-7s  %s\n" "$lbl" "${BEST_CTX[$key]}" "$ts"
done
echo ""

# ── Phase 5: E4B thread sweep (CPU split model — threads matter) ──────────────
# Repeats the p=512/n=128 speed test for Gemma4-E4B at ngl=best_e4b_ngl while
# varying only the -t value.
echo "=== Phase 5: Gemma4-E4B thread sweep (p=512 n=128 r=2 fa=0 ngl=$best_e4b_ngl) ==="
thread_counts=(1 2 3 4 5 6 8 10 12)
for t in "${thread_counts[@]}"; do
    printf "  t=%-3s  %s\n" "$t" \
        "$(bench "$M_E4B" "$best_e4b_ngl" -p 512 -n 128 -r 2 -fa 0 -t "$t")"
done
echo ""

# ── Phase 6: Flash attention comparison ──────────────────────────────────────
# Runs the same speed test twice per model (fa=0, then fa=1) and prints both
# results on one line.
echo "=== Phase 6: Flash attention fa=0 vs fa=1 (p=512 n=128 r=2 t=6) ==="
echo "    Gemma4 hybrid attention may not support FA — testing both."
phase6_entries=(
    "smollm3:SmolLM3-3B:$M_SMOL:99"
    "e2b:Gemma4-E2B:$M_E2B:99"
    "e4b:Gemma4-E4B:$M_E4B:$best_e4b_ngl"
    "q3:Qwen3-4B:$M_Q3:99"
)
for entry in "${phase6_entries[@]}"; do
    # Entry format is key:label:path:ngl; the key is not needed in this phase.
    rest=${entry#*:}
    lbl=${rest%%:*}
    rest=${rest#*:}
    mdl=${rest%%:*}
    ngl=${rest#*:}
    fa_results=()
    for fa_mode in 0 1; do
        fa_results[fa_mode]=$(bench "$mdl" "$ngl" -p 512 -n 128 -r 2 -fa "$fa_mode" -t 6)
    done
    printf "  %-14s  fa=0: %-30s  fa=1: %s\n" "$lbl" "${fa_results[0]}" "${fa_results[1]}"
done
echo ""

# Closing banner with the completion time, framed by the HR rule.
printf '%s\n' "$HR" "BENCHMARK COMPLETE: $(date)" "$HR"