llama-cpp/llama

#!/usr/bin/env bash
# llama — utility script for the llama.cpp Docker Compose stack
#
# Usage:
#   ./llama                          interactive menu
#   ./llama start <model> [--bigctx] [--webui]
#   ./llama stop
#   ./llama status
#   ./llama logs [--follow]
#   ./llama build
#   ./llama bench <model>
#
# Models: smollm3 | gemma4-e2b | gemma4-e4b | qwen3-4b | qwen35-9b

# Strict mode: abort on unhandled errors, on unset variables, and on a failure
# in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Work from the script's own directory so the relative paths used below
# (./models, scripts/download_models.sh) resolve no matter where the caller is.
cd "$(dirname "$0")"

# ── Colors ────────────────────────────────────────────────────────────────────
# Colour codes default to empty strings (plain output) and are only filled in
# when stdout is attached to a terminal. The values are literal backslash
# sequences; they are turned into real escapes by `echo -e` at print time.
G='' Y='' R='' B='' C='' BOLD='' NC=''
if [[ -t 1 ]]; then
    G='\033[0;32m'
    Y='\033[1;33m'
    R='\033[0;31m'
    B='\033[0;34m'
    C='\033[0;36m'
    BOLD='\033[1m'
    NC='\033[0m'
fi

# Directory (relative to the script) where the GGUF model files are looked up.
MODELS_DIR='./models'

# ── Model metadata ────────────────────────────────────────────────────────────
# Both tables are keyed by the model name accepted on the command line.
declare -A MODEL_FILE MODEL_LABEL

# GGUF file name expected inside $MODELS_DIR for each model.
MODEL_FILE[smollm3]='HuggingFaceTB_SmolLM3-3B-Q4_K_M.gguf'
MODEL_FILE[gemma4-e2b]='google_gemma-4-E2B-it-Q4_K_M.gguf'
MODEL_FILE[gemma4-e4b]='google_gemma-4-E4B-it-Q4_K_M.gguf'
MODEL_FILE[qwen3-4b]='Qwen3-4B-Q4_K_M.gguf'
MODEL_FILE[qwen35-9b]='Qwen3.5-9B.Q8_0.gguf'

# Human-readable description shown in the menu and when starting a model.
MODEL_LABEL[smollm3]='SmolLM3-3B    (~53 t/s, ctx 24K/65K bigctx, thinking+tools)'
MODEL_LABEL[gemma4-e2b]='Gemma4-E2B    (~62 t/s, ctx 24K/393K bigctx, multimodal)'
MODEL_LABEL[gemma4-e4b]='Gemma4-E4B    (~30 t/s, ctx 24K/164K bigctx, multimodal)'
MODEL_LABEL[qwen3-4b]='Qwen3-4B      (~39 t/s, ctx 16K/24K bigctx, thinking+tools)'
MODEL_LABEL[qwen35-9b]='Qwen3.5-9B    (~4.4 t/s, ctx 32K, reasoning distill)'

# Models that support bigctx
BIGCTX_MODELS=(
    smollm3
    gemma4-e2b
    gemma4-e4b
    qwen3-4b
)

# ── Helpers ───────────────────────────────────────────────────────────────────
# Print an error in red on stderr and terminate the script with status 1.
die() {
    printf '%b\n' "${R}Error: $*${NC}" >&2
    exit 1
}

# Progress message (blue) on stdout.
info() {
    printf '%b\n' "${B}▸ $*${NC}"
}

# Success message (green) on stdout.
ok() {
    printf '%b\n' "${G}✓ $*${NC}"
}

# Warning message (yellow) on stdout.
warn() {
    printf '%b\n' "${Y}⚠ $*${NC}"
}

# Ask a yes/no question on stdout and read the answer from stdin.
# Arguments: $1 - prompt text (" [y/N] " is appended)
# Returns:   0 when the answer is "y" or "yes" (any letter case), 1 otherwise;
#            an empty answer or end-of-input counts as "no".
confirm() {
    local prompt="$1"
    local answer=""
    echo -en "${Y}${prompt} [y/N] ${NC}"
    # `read` fails at end-of-input; swallow that so a caller running under
    # `set -e` outside a conditional is not aborted. A final line without a
    # trailing newline is still stored in $answer and evaluated below.
    read -r answer || true
    [[ "${answer,,}" =~ ^(y|yes)$ ]]
}

# Print the model served by the running llama_server container, or nothing.
#
# The model is taken from the COMPOSE_PROFILES entry in the container's
# environment: the first profile that is not "webui" (or a bare "bigctx"),
# with a trailing "-bigctx" removed, so "smollm3-bigctx,webui" yields "smollm3".
#
# Outputs: model name on stdout; empty when no container is running or the
#          container carries no COMPOSE_PROFILES variable.
# Returns: always 0, so `current=$(running_model)` is safe under `set -e` /
#          `pipefail` even when nothing matches.
running_model() {
    local name line value profile
    local nl=$'\n'

    name=$(docker ps --filter "name=llama_server" --format "{{.Names}}" 2>/dev/null | head -1) || true
    [[ -z "$name" ]] && return 0

    # Scan the container environment in-process instead of a grep pipeline:
    # a pipeline stage that matches nothing would fail the whole function.
    while IFS= read -r line; do
        [[ "$line" == COMPOSE_PROFILES=* ]] || continue
        value="${line#COMPOSE_PROFILES=}"
        while IFS= read -r profile; do
            case "$profile" in
                ""|bigctx|webui) continue ;;
            esac
            # bigctx runs use the profile "<model>-bigctx"; report the model
            printf '%s\n' "${profile%-bigctx}"
            return 0
        done <<< "${value//,/$nl}"
        return 0
    done < <(docker inspect "$name" --format '{{range .Config.Env}}{{println .}}{{end}}' 2>/dev/null)

    return 0
}

# Print the raw COMPOSE_PROFILES value (comma-separated profile list) found in
# the environment of the running llama_server container.
#
# Outputs: the value on stdout; nothing when no container is running or the
#          variable is not present.
# Returns: always 0, so a missing variable cannot abort a caller running
#          under `set -e` / `pipefail`.
running_profiles() {
    local name line

    name=$(docker ps --filter "name=llama_server" --format "{{.Names}}" 2>/dev/null | head -1) || true
    [[ -z "$name" ]] && return 0

    while IFS= read -r line; do
        if [[ "$line" == COMPOSE_PROFILES=* ]]; then
            printf '%s\n' "${line#COMPOSE_PROFILES=}"
            return 0
        fi
    done < <(docker inspect "$name" --format '{{range .Config.Env}}{{println .}}{{end}}' 2>/dev/null)

    return 0
}

# Make sure the GGUF file for a model is present, offering to download it.
# Arguments: $1 - model key (index into MODEL_FILE)
# Behaviour: returns 0 straight away when the key has no file entry or the
#            file already exists under $MODELS_DIR. Otherwise warns and asks;
#            on "yes" runs scripts/download_models.sh for that model, on "no"
#            exits through die with the manual download command.
check_model_file() {
    local key="$1"
    local gguf="${MODEL_FILE[$key]:-}"

    if [[ -z "$gguf" || -f "$MODELS_DIR/$gguf" ]]; then
        return 0
    fi

    warn "Model file not found: $MODELS_DIR/$gguf"
    if ! confirm "Download it now?"; then
        die "Model file missing. Run: bash scripts/download_models.sh $key"
    fi
    bash scripts/download_models.sh "$key"
}

# Clear the way for a new `docker compose up`.
#
# If running_model reports a model, ask for confirmation (answering "no" prints
# "Aborted." and exits the script with status 0) and take the compose stack
# down. Afterwards force-remove any container still holding one of the reserved
# names (left over from previous runs or failed starts).
stop_running() {
    local current reserved

    # A non-zero status from running_model must not abort the script under
    # `set -e`; treat it as "nothing detected".
    current=$(running_model) || true
    if [[ -n "$current" ]]; then
        warn "Currently running: $current"
        confirm "Stop it and start new model?" || { echo "Aborted."; exit 0; }
        info "Stopping running containers..."
        docker compose down --remove-orphans 2>/dev/null || true
    fi

    # `docker ps -a` lists containers in every state, so `rm -f` below removes
    # whatever still owns the name, whether it is stopped or running.
    for reserved in llama_server llama_bench open_webui; do
        if docker ps -a --filter "name=^/${reserved}$" --format "{{.Names}}" 2>/dev/null | grep -q "^${reserved}$"; then
            docker rm -f "$reserved" &>/dev/null || true
        fi
    done
}

# Print the `--profile …` options for `docker compose`.
# Arguments: $1 - model key
#            $2 - "1" to select the big-context variant
#            $3 - "1" to add the webui profile
# Outputs:   one line of the form "--profile <name> [--profile webui] "
#            (each option is followed by a single space).
build_profiles_flag() {
    local model="$1" bigctx="$2" webui="$3"
    local -a selected

    # The big-context profile replaces the base profile; it is not added to it.
    case "$bigctx" in
        1) selected=("${model}-bigctx") ;;
        *) selected=("$model") ;;
    esac

    if [[ "$webui" == "1" ]]; then
        selected+=("webui")
    fi

    # printf repeats the format once per array element
    printf -- '--profile %s ' "${selected[@]}"
    echo
}

# ── Commands ──────────────────────────────────────────────────────────────────

# start <model> [--bigctx] [--webui]
#
# Validates the arguments, makes sure the model file exists, clears any running
# stack, brings the selected compose profiles up and then waits for the
# llama_server health check.
# Returns: 0 once the server is healthy, and also when the wait times out with
#          the check still pending; 1 after three consecutive "unhealthy"
#          reports. Invalid arguments exit through die.
cmd_start() {
    local model="" bigctx=0 webui=0
    local arg

    # Parse args
    for arg in "$@"; do
        case "$arg" in
            --bigctx) bigctx=1 ;;
            --webui)  webui=1 ;;
            -*)       die "Unknown flag: $arg" ;;
            *)
                if [[ -n "$model" ]]; then
                    die "Unexpected argument: $arg"
                fi
                model="$arg"
                ;;
        esac
    done

    if [[ -z "$model" ]]; then
        die "Model required. Usage: ./llama start <model> [--bigctx] [--webui]"
    fi
    if [[ -z "${MODEL_FILE[$model]+_}" ]]; then
        die "Unknown model: $model. Valid: ${!MODEL_FILE[*]}"
    fi

    if (( bigctx )); then
        local supported=0 candidate
        for candidate in "${BIGCTX_MODELS[@]}"; do
            if [[ "$candidate" == "$model" ]]; then
                supported=1
            fi
        done
        if (( ! supported )); then
            die "$model does not have a bigctx profile"
        fi
    fi

    check_model_file "$model"
    stop_running

    local profile_flags
    profile_flags=$(build_profiles_flag "$model" "$bigctx" "$webui")

    local desc="${MODEL_LABEL[$model]}"
    if (( bigctx )); then desc+=" [bigctx]"; fi
    if (( webui ));  then desc+=" [+webui]"; fi
    info "Starting: $desc"

    # The flags string is split into words on purpose.
    # shellcheck disable=SC2086
    docker compose $profile_flags up -d

    echo ""
    info "Waiting for health check..."

    # bigctx / heavy models can take >2 min to load: 75 polls x 2s = 150s
    local -r max_polls=75
    local attempt status unhealthy_streak=0
    for (( attempt = 0; attempt < max_polls; attempt++ )); do
        # An inspect failure is treated the same as "starting"
        status=$(docker inspect llama_server --format '{{.State.Health.Status}}' 2>/dev/null || echo "starting")
        case "$status" in
            healthy)
                echo ""
                ok "Server is healthy → http://localhost:8080"
                if (( webui )); then
                    ok "Open WebUI → http://localhost:3000"
                fi
                return 0
                ;;
            unhealthy)
                unhealthy_streak=$(( unhealthy_streak + 1 ))
                # Give up only after 3 unhealthy reports in a row, to avoid
                # false positives early in the start period.
                if (( unhealthy_streak >= 3 )); then
                    echo ""
                    warn "Server reported unhealthy. Check logs: ./llama logs"
                    return 1
                fi
                ;;
            *)
                unhealthy_streak=0
                ;;
        esac
        echo -n "."
        sleep 2
    done

    echo ""
    warn "Still starting (health check pending). Try: ./llama status"
}

# stop
#
# Takes the compose stack down when running_model reports a model, then
# force-removes every container that still holds one of the reserved names and
# reports how many were removed.
cmd_stop() {
    local current reserved cleaned=0

    # A non-zero status from running_model must not abort the script under
    # `set -e` before the cleanup below has run.
    current=$(running_model) || true
    if [[ -z "$current" ]]; then
        info "No llama containers running."
    else
        info "Stopping $current..."
        docker compose down --remove-orphans
    fi

    # `docker ps -a` matches containers in any state; `rm -f` removes whatever
    # still owns the reserved name.
    for reserved in llama_server llama_bench open_webui; do
        if docker ps -a --filter "name=^/${reserved}$" --format "{{.Names}}" 2>/dev/null | grep -q "^${reserved}$"; then
            if docker rm -f "$reserved" &>/dev/null; then
                cleaned=$(( cleaned + 1 ))
            fi
        fi
    done

    if (( cleaned > 0 )); then
        info "Removed $cleaned stopped container(s)."
    fi
    ok "Clean."
}

# status
#
# Prints a summary for the llama_server container (health, uptime and the
# MODEL_FILE / CTX_SIZE / N_GPU_LAYERS environment entries, when present) and
# notes whether open_webui is running.
cmd_status() {
    echo ""
    echo -e "${BOLD}── llama.cpp server status ──────────────────────────────${NC}"

    local name
    name=$(docker ps --filter "name=llama_server" --format "{{.Names}}" 2>/dev/null | head -1)

    if [[ -z "$name" ]]; then
        echo -e "  ${R}●${NC} No server running"
    else
        local health started uptime model_env line
        health=$(docker inspect "$name" --format '{{.State.Health.Status}}' 2>/dev/null || echo "unknown")

        # Docker reports StartedAt with nanosecond precision, which
        # datetime.fromisoformat rejects before Python 3.11. Cut the value down
        # to whole seconds and parse it with strptime so any python3 works.
        # Falls back to "?" when python3 is missing or the value is unusable.
        started=$(docker inspect "$name" --format '{{.State.StartedAt}}' 2>/dev/null || true)
        uptime=$(python3 -c "import sys, datetime; \
            raw = sys.stdin.read().strip(); \
            stamp = raw.split('.')[0].rstrip('Z'); \
            start = datetime.datetime.strptime(stamp, '%Y-%m-%dT%H:%M:%S'); \
            now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None); \
            h, r = divmod(int((now - start).total_seconds()), 3600); m = r // 60; \
            print(f'{h}h {m}m' if h else f'{m}m')" <<< "$started" 2>/dev/null || echo "?")

        # grep exits 1 when none of the variables is set; with pipefail that
        # would abort the whole script under `set -e`, hence the `|| true`.
        model_env=$(docker inspect "$name" --format '{{range .Config.Env}}{{println .}}{{end}}' 2>/dev/null \
            | grep "^MODEL_FILE=\|^CTX_SIZE=\|^N_GPU_LAYERS=" | sort || true)

        local color="${G}"
        [[ "$health" == "unhealthy" ]] && color="${R}"
        [[ "$health" == "starting"  ]] && color="${Y}"

        echo -e "  ${color}●${NC} ${BOLD}${name}${NC} — ${health} (up ${uptime})"
        # Skip the block entirely when there is nothing to show, instead of
        # printing one blank coloured line.
        if [[ -n "$model_env" ]]; then
            while IFS= read -r line; do
                echo -e "    ${C}${line}${NC}"
            done <<< "$model_env"
        fi
        echo -e "    ${C}API → http://localhost:8080${NC}"
    fi

    # WebUI
    local wname
    wname=$(docker ps --filter "name=open_webui" --format "{{.Names}}" 2>/dev/null | head -1)
    if [[ -n "$wname" ]]; then
        echo -e "  ${G}●${NC} open_webui running → http://localhost:3000"
    fi

    echo ""
}

# logs [--follow|-f]
#
# Shows the last 80 log lines of the llama_server container, or follows the
# log starting from the last 50 lines when --follow / -f is given. Any other
# first argument is treated like no argument. Exits through die when no
# container is found.
cmd_logs() {
    local mode="${1:-}"
    local container

    container=$(docker ps --filter "name=llama_server" --format "{{.Names}}" 2>/dev/null | head -1)
    if [[ -z "$container" ]]; then
        die "No server running."
    fi

    case "$mode" in
        --follow|-f) docker logs -f --tail 50 "$container" ;;
        *)           docker logs --tail 80 "$container" ;;
    esac
}

# build
#
# Builds the llama-qwen35-9b compose service (under its qwen35-9b profile) and
# prints the image tags listed below.
cmd_build() {
    local -r profile="qwen35-9b"
    local -r service="llama-${profile}"

    info "Building TurboQuant image (full + server targets)..."
    info "This takes ~20 minutes on first build."

    docker compose --profile "$profile" build "$service"

    ok "Images built: local/llama-cpp-turboquant:server-cuda-sm75-mmq"
    ok "             local/llama-cpp-turboquant:full-cuda-sm75-mmq"
}

# bench <model>
#
# Runs the "bench-<model>" compose service (under the profile of the same
# name) after making sure the model file exists. When running_model reports a
# server, the user is asked first; declining prints "Aborted." and exits 0.
cmd_bench() {
    local model="${1:-}"
    [[ -z "$model" ]] && die "Model required. Usage: ./llama bench <model>"
    [[ -z "${MODEL_FILE[$model]+_}" ]] && die "Unknown model: $model"

    check_model_file "$model"

    local current
    # A non-zero status from running_model must not abort the script under
    # `set -e`; treat it as "no server detected".
    current=$(running_model) || true
    if [[ -n "$current" ]]; then
        warn "Server is running ($current). Bench will compete for GPU."
        confirm "Continue anyway?" || { echo "Aborted."; exit 0; }
        # NOTE(review): the prompt suggests the benchmark will run next to the
        # server, but confirming takes the stack down first. Confirm which of
        # the two is intended and align the message or the action.
        docker compose down --remove-orphans 2>/dev/null || true
    fi

    info "Running benchmark for $model..."
    docker compose --profile "bench-${model}" run --rm "bench-${model}"
    ok "Results written to benchmark-results/"
}

# ── Interactive menu ──────────────────────────────────────────────────────────

# Interactive launcher: prints the banner and current status, lists the models
# (flagging files that are not downloaded), then reads a choice from stdin.
#   1-5  ask about bigctx (only for models in BIGCTX_MODELS) and Open WebUI,
#        then hand over to cmd_start
#   s/S  cmd_stop
#   q/Q  exit 0
#   else warn and show the menu again
menu() {
    local -a models=(smollm3 gemma4-e2b gemma4-e4b qwen3-4b qwen35-9b)
    local slot number entry marker choice reply

    echo ""
    echo -e "${BOLD}╔══════════════════════════════════════════════════════╗${NC}"
    echo -e "${BOLD}║           llama.cpp model launcher                  ║${NC}"
    echo -e "${BOLD}╚══════════════════════════════════════════════════════╝${NC}"
    echo ""

    cmd_status

    echo -e "${BOLD}Select a model:${NC}"
    for slot in "${!models[@]}"; do
        number=$(( slot + 1 ))
        entry="${models[slot]}"
        if [[ -f "$MODELS_DIR/${MODEL_FILE[$entry]}" ]]; then
            marker=""
        else
            marker=" ${R}[not downloaded]${NC}"
        fi
        echo -e "  ${C}${number})${NC} ${MODEL_LABEL[$entry]}${marker}"
    done
    echo ""
    echo -e "  ${C}s)${NC} stop all"
    echo -e "  ${C}q)${NC} quit"
    echo ""
    echo -n "Choice: "
    read -r choice

    case "$choice" in
        q|Q)
            exit 0
            ;;
        s|S)
            cmd_stop
            ;;
        [1-5])
            local picked="${models[choice - 1]}"
            # Argument list for cmd_start: the model first, optional flags after
            local -a start_args=("$picked")

            # Offer bigctx only when the chosen model has such a profile
            local offers_bigctx=0 candidate
            for candidate in "${BIGCTX_MODELS[@]}"; do
                if [[ "$candidate" == "$picked" ]]; then
                    offers_bigctx=1
                fi
            done

            if (( offers_bigctx )); then
                echo -n "Use bigctx profile (larger context, slower)? [y/N] "
                read -r reply
                if [[ "${reply,,}" == "y" ]]; then
                    start_args+=(--bigctx)
                fi
            fi

            echo -n "Include Open WebUI? [y/N] "
            read -r reply
            if [[ "${reply,,}" == "y" ]]; then
                start_args+=(--webui)
            fi

            echo ""
            cmd_start "${start_args[@]}"
            ;;
        *)
            warn "Invalid choice."
            menu
            ;;
    esac
}

# ── Entrypoint ────────────────────────────────────────────────────────────────

# Print the command overview and the list of model names to stdout.
usage() {
    local -a synopsis=(
        "  ./llama                                  interactive menu"
        "  ./llama start <model> [--bigctx] [--webui]"
        "  ./llama stop"
        "  ./llama status"
        "  ./llama logs [--follow]"
        "  ./llama build"
        "  ./llama bench <model>"
    )

    echo ""
    echo -e "${BOLD}Usage:${NC}"
    # One synopsis entry per output line, printed verbatim
    printf '%s\n' "${synopsis[@]}"
    echo ""
    echo -e "${BOLD}Models:${NC} smollm3 | gemma4-e2b | gemma4-e4b | qwen3-4b | qwen35-9b"
    echo ""
}

# Dispatch on the first command-line word; the remaining words are passed on
# to the sub-commands that accept arguments. No word at all opens the menu.
main() {
    local subcommand="${1:-}"
    if [[ $# -gt 0 ]]; then
        shift
    fi

    case "$subcommand" in
        "")             menu ;;
        start)          cmd_start "$@" ;;
        stop)           cmd_stop ;;
        status)         cmd_status ;;
        logs)           cmd_logs "$@" ;;
        build)          cmd_build ;;
        bench)          cmd_bench "$@" ;;
        help|--help|-h) usage ;;
        *)              die "Unknown command: $subcommand. Run ./llama help" ;;
    esac
}

main "$@"