#!/usr/bin/env bash
# llama — utility script for the llama.cpp Docker Compose stack
#
# Usage:
#   ./llama                                   interactive menu
#   ./llama start MODEL [--bigctx] [--webui]
#   ./llama stop
#   ./llama status
#   ./llama logs [--follow]
#   ./llama build
#   ./llama bench MODEL
#
# Models: smollm3 | gemma4-e2b | gemma4-e4b | qwen3-4b | qwen35-9b
#
# Requires bash 4+ (associative arrays, ${var,,}), the docker CLI with the
# compose plugin, and optionally python3 (only for the uptime in `status`).

set -euo pipefail

if (( BASH_VERSINFO[0] < 4 )); then
  echo "Error: bash 4 or newer is required (found ${BASH_VERSION})." >&2
  exit 1
fi

# All relative paths (./models, scripts/, compose file) are resolved from the
# directory that contains this script.
cd -- "$(dirname -- "$0")"

# ── Colors ────────────────────────────────────────────────────────────────────
# Real escape characters (via $'…') so they can be printed with printf '%s'.
if [[ -t 1 ]]; then
  G=$'\033[0;32m'; Y=$'\033[1;33m'; R=$'\033[0;31m'; B=$'\033[0;34m'
  C=$'\033[0;36m'; BOLD=$'\033[1m'; NC=$'\033[0m'
else
  G=''; Y=''; R=''; B=''; C=''; BOLD=''; NC=''
fi

MODELS_DIR="./models"

# ── Model metadata ────────────────────────────────────────────────────────────
declare -A MODEL_FILE=(
  [smollm3]="HuggingFaceTB_SmolLM3-3B-Q4_K_M.gguf"
  [gemma4-e2b]="google_gemma-4-E2B-it-Q4_K_M.gguf"
  [gemma4-e4b]="google_gemma-4-E4B-it-Q4_K_M.gguf"
  [qwen3-4b]="Qwen3-4B-Q4_K_M.gguf"
  [qwen35-9b]="Qwen3.5-9B.Q8_0.gguf"
)

declare -A MODEL_LABEL=(
  [smollm3]="SmolLM3-3B (~53 t/s, ctx 24K/65K bigctx, thinking+tools)"
  [gemma4-e2b]="Gemma4-E2B (~62 t/s, ctx 24K/393K bigctx, multimodal)"
  [gemma4-e4b]="Gemma4-E4B (~30 t/s, ctx 24K/164K bigctx, multimodal)"
  [qwen3-4b]="Qwen3-4B (~39 t/s, ctx 16K/24K bigctx, thinking+tools)"
  [qwen35-9b]="Qwen3.5-9B (~4.4 t/s, ctx 32K, reasoning distill)"
)

# Models that support bigctx
BIGCTX_MODELS=(smollm3 gemma4-e2b gemma4-e4b qwen3-4b)

# ── Helpers ───────────────────────────────────────────────────────────────────

# Print an error to stderr and terminate the script with status 1.
die() { printf '%sError: %s%s\n' "$R" "$*" "$NC" >&2; exit 1; }
# Status line helpers (all write to stdout).
info() { printf '%s▸ %s%s\n' "$B" "$*" "$NC"; }
ok() { printf '%s✓ %s%s\n' "$G" "$*" "$NC"; }
warn() { printf '%s⚠ %s%s\n' "$Y" "$*" "$NC"; }

# Ask a yes/no question.
# Arguments: $1 - prompt text
# Returns:   0 if the answer is "y" or "yes" (any case), 1 otherwise.
confirm() {
  local prompt="$1"
  local answer=""
  printf '%s%s [y/N] %s' "$Y" "$prompt" "$NC"
  # EOF on stdin is treated as "no" rather than letting set -e kill the script.
  read -r answer || true
  [[ "${answer,,}" =~ ^y(es)?$ ]]
}

# Print the name of the first running container matching a docker name filter.
# Arguments: $1 - value for `docker ps --filter name=`
# Outputs:   container name, or nothing. Never fails: a missing docker binary
#            or an unreachable daemon is reported as "no container".
_running_container() {
  docker ps --filter "name=$1" --format '{{.Names}}' 2>/dev/null | head -n 1 || true
}

# Print the environment of a container, one VAR=value per line.
# Arguments: $1 - container name. Never fails.
_container_env() {
  docker inspect "$1" --format '{{range .Config.Env}}{{println .}}{{end}}' 2>/dev/null || true
}

# Print the value of COMPOSE_PROFILES from a container's environment, if any.
# Arguments: $1 - container name
_container_profiles() {
  local line
  while IFS= read -r line; do
    if [[ "$line" == COMPOSE_PROFILES=* ]]; then
      printf '%s\n' "${line#COMPOSE_PROFILES=}"
      return 0
    fi
  done < <(_container_env "$1")
  return 0
}

# Succeeds when the given model has a "<model>-bigctx" profile.
# Arguments: $1 - model key
_supports_bigctx() {
  local candidate
  for candidate in "${BIGCTX_MODELS[@]}"; do
    if [[ "$candidate" == "$1" ]]; then
      return 0
    fi
  done
  return 1
}

# Print the model of the currently running llama_server, or nothing when no
# server is running.
# The model is derived from the container's COMPOSE_PROFILES variable: the
# "webui" entry is skipped and a "-bigctx" suffix is stripped, because bigctx
# runs use the profile "<model>-bigctx" (see build_profiles_flag). If the
# variable is absent the container name is printed instead, so callers still
# see that something is running.
running_model() {
  local name profiles entry
  local -a entries=()
  name=$(_running_container llama_server)
  [[ -n "$name" ]] || return 0

  profiles=$(_container_profiles "$name")
  if [[ -n "$profiles" ]]; then
    IFS=',' read -r -a entries <<< "$profiles"
    for entry in "${entries[@]}"; do
      case "$entry" in
        "" | webui | bigctx) continue ;;
      esac
      printf '%s\n' "${entry%-bigctx}"
      return 0
    done
  fi
  printf '%s\n' "$name"
}

# Print the raw COMPOSE_PROFILES value of the running llama_server (may be
# empty), or nothing when no server is running.
running_profiles() {
  local name
  name=$(_running_container llama_server)
  [[ -n "$name" ]] || return 0
  _container_profiles "$name"
}

# Make sure the GGUF file for a model is present, offering to download it.
# Arguments: $1 - model key (unknown keys are ignored)
# Exits via die when the file is missing and the user declines, or when the
# file is still missing after the download script ran.
check_model_file() {
  local model="$1"
  local file="${MODEL_FILE[$model]:-}"
  [[ -n "$file" ]] || return 0
  if [[ -f "$MODELS_DIR/$file" ]]; then
    return 0
  fi

  warn "Model file not found: $MODELS_DIR/$file"
  if confirm "Download it now?"; then
    bash scripts/download_models.sh "$model"
    [[ -f "$MODELS_DIR/$file" ]] \
      || die "Download finished but $MODELS_DIR/$file is still missing."
  else
    die "Model file missing. Run: bash scripts/download_models.sh $model"
  fi
}

# Force-remove containers that hold one of the reserved names (left over from
# previous runs or failed starts).
# Outputs: the number of containers removed.
_remove_reserved_containers() {
  local reserved found removed=0
  for reserved in llama_server llama_bench open_webui; do
    found=$(docker ps -a --filter "name=^/${reserved}$" --format '{{.Names}}' 2>/dev/null || true)
    if grep -qxF -- "$reserved" <<< "$found"; then
      if docker rm -f "$reserved" &>/dev/null; then
        removed=$((removed + 1))
      fi
    fi
  done
  printf '%s\n' "$removed"
}

# Stop a running server (after confirmation) and clear reserved container
# names so a new model can start. Exits 0 if the user declines.
stop_running() {
  local current
  current=$(running_model)
  if [[ -n "$current" ]]; then
    warn "Currently running: $current"
    confirm "Stop it and start new model?" || { echo "Aborted."; exit 0; }
    info "Stopping running containers..."
    docker compose down --remove-orphans 2>/dev/null || true
  fi
  _remove_reserved_containers > /dev/null
}

# Print the `--profile …` arguments for docker compose as one string.
# Arguments: $1 - model key, $2 - "1" for bigctx, $3 - "1" to add webui
# Outputs:   e.g. "--profile smollm3-bigctx --profile webui " (trailing space)
build_profiles_flag() {
  local model="$1" bigctx="$2" webui="$3"
  local -a profiles=()
  local p flags=""

  # bigctx is a replacement for the base profile, not an addition
  if [[ "$bigctx" == "1" ]]; then
    profiles+=("${model}-bigctx")
  else
    profiles+=("$model")
  fi
  if [[ "$webui" == "1" ]]; then
    profiles+=("webui")
  fi

  for p in "${profiles[@]}"; do
    flags+="--profile $p "
  done
  echo "$flags"
}

# Render the time elapsed since a container start timestamp as "Xh Ym"/"Ym".
# Arguments: $1 - Docker .State.StartedAt value (UTC, RFC 3339)
# Outputs:   the formatted duration, or "?" if it cannot be computed.
_format_uptime() {
  local started="$1"
  # Docker emits up to 9 fractional digits; the fraction is dropped because
  # only minute resolution is shown and older Pythons cannot parse it.
  python3 - "$started" << 'PY' 2> /dev/null || echo "?"
import sys, datetime
stamp = sys.argv[1].strip().split(".")[0].rstrip("Z")
start = datetime.datetime.strptime(stamp, "%Y-%m-%dT%H:%M:%S")
now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
hours, rest = divmod(int((now - start).total_seconds()), 3600)
minutes = rest // 60
print(f"{hours}h {minutes}m" if hours else f"{minutes}m")
PY
}

# ── Commands ──────────────────────────────────────────────────────────────────

# Start a model server and wait for its health check.
# Arguments: MODEL [--bigctx] [--webui] (any order)
# Returns:   0 when healthy or still starting after the poll window,
#            1 after three consecutive "unhealthy" reports.
cmd_start() {
  local model="" bigctx=0 webui=0

  while [[ $# -gt 0 ]]; do
    case "$1" in
      --bigctx) bigctx=1 ;;
      --webui) webui=1 ;;
      -*) die "Unknown flag: $1" ;;
      *)
        if [[ -z "$model" ]]; then
          model="$1"
        else
          die "Unexpected argument: $1"
        fi
        ;;
    esac
    shift
  done

  [[ -n "$model" ]] || die "Model required. Usage: ./llama start MODEL [--bigctx] [--webui]"
  [[ -n "${MODEL_FILE[$model]+_}" ]] || die "Unknown model: $model. Valid: ${!MODEL_FILE[*]}"
  if [[ "$bigctx" == "1" ]] && ! _supports_bigctx "$model"; then
    die "$model does not have a bigctx profile"
  fi

  check_model_file "$model"
  stop_running

  # Profile names come from the validated model key, so splitting the flag
  # string on whitespace is safe.
  local -a profile_args=()
  read -r -a profile_args <<< "$(build_profiles_flag "$model" "$bigctx" "$webui")"

  local desc="${MODEL_LABEL[$model]}"
  if [[ "$bigctx" == "1" ]]; then desc+=" [bigctx]"; fi
  if [[ "$webui" == "1" ]]; then desc+=" [+webui]"; fi

  info "Starting: $desc"
  docker compose "${profile_args[@]}" up -d

  echo ""
  info "Waiting for health check..."
  # bigctx / heavy models can take >2 min to load: poll up to 150s (75 × 2s)
  local max_polls=75 poll=0 unhealthy_streak=0 status
  while (( poll < max_polls )); do
    status=$(docker inspect llama_server --format '{{.State.Health.Status}}' 2>/dev/null || echo "starting")
    case "$status" in
      healthy)
        echo ""
        ok "Server is healthy → http://localhost:8080"
        if [[ "$webui" == "1" ]]; then
          ok "Open WebUI → http://localhost:3000"
        fi
        return 0
        ;;
      unhealthy)
        unhealthy_streak=$((unhealthy_streak + 1))
        # Only give up after 3 consecutive unhealthy — avoids false positives
        # during start_period when Docker hasn't run healthchecks yet
        if (( unhealthy_streak >= 3 )); then
          echo ""
          warn "Server reported unhealthy. Check logs: ./llama logs"
          return 1
        fi
        ;;
      *)
        unhealthy_streak=0
        ;;
    esac
    echo -n "."
    sleep 2
    poll=$((poll + 1))
  done
  echo ""
  warn "Still starting (health check pending). Try: ./llama status"
}

# Stop the compose stack and remove leftover reserved containers.
cmd_stop() {
  local current cleaned
  current=$(running_model)
  if [[ -z "$current" ]]; then
    info "No llama containers running."
  else
    info "Stopping $current..."
    docker compose down --remove-orphans
  fi

  cleaned=$(_remove_reserved_containers)
  if (( cleaned > 0 )); then
    info "Removed $cleaned stopped container(s)."
  fi
  ok "Clean."
}

# Show health, uptime and key settings of the server, plus the WebUI state.
cmd_status() {
  local name wname
  echo ""
  printf '%s── llama.cpp server status ──────────────────────────────%s\n' "$BOLD" "$NC"

  name=$(_running_container llama_server)
  if [[ -z "$name" ]]; then
    printf ' %s●%s No server running\n' "$R" "$NC"
  else
    local health started uptime model_env line color
    health=$(docker inspect "$name" --format '{{.State.Health.Status}}' 2>/dev/null || echo "unknown")
    started=$(docker inspect "$name" --format '{{.State.StartedAt}}' 2>/dev/null || true)
    uptime=$(_format_uptime "$started")
    # grep exits non-zero when none of the variables are set; that is not an error.
    model_env=$(_container_env "$name" \
      | grep -E '^(MODEL_FILE|CTX_SIZE|N_GPU_LAYERS)=' \
      | sort || true)

    color="$G"
    case "$health" in
      unhealthy) color="$R" ;;
      starting) color="$Y" ;;
    esac

    printf ' %s●%s %s%s%s — %s (up %s)\n' "$color" "$NC" "$BOLD" "$name" "$NC" "$health" "$uptime"
    if [[ -n "$model_env" ]]; then
      while IFS= read -r line; do
        printf ' %s%s%s\n' "$C" "$line" "$NC"
      done <<< "$model_env"
    fi
    printf ' %sAPI → http://localhost:8080%s\n' "$C" "$NC"
  fi

  # WebUI
  wname=$(_running_container open_webui)
  if [[ -n "$wname" ]]; then
    printf ' %s●%s open_webui running → http://localhost:3000\n' "$G" "$NC"
  fi
  echo ""
}

# Show server logs.
# Arguments: [--follow|-f] - stream the logs instead of printing the tail
cmd_logs() {
  local follow=0 name
  case "${1:-}" in
    --follow | -f) follow=1 ;;
  esac

  name=$(_running_container llama_server)
  [[ -n "$name" ]] || die "No server running."

  if [[ "$follow" == "1" ]]; then
    docker logs -f --tail 50 "$name"
  else
    docker logs --tail 80 "$name"
  fi
}

# Build the images through the qwen35-9b compose service.
cmd_build() {
  info "Building TurboQuant image (full + server targets)..."
  info "This takes ~20 minutes on first build."
  docker compose --profile qwen35-9b build llama-qwen35-9b
  ok "Images built: local/llama-cpp-turboquant:server-cuda-sm75-mmq"
  ok " local/llama-cpp-turboquant:full-cuda-sm75-mmq"
}

# Run the benchmark service for a model.
# Arguments: $1 - model key
cmd_bench() {
  local model="${1:-}" current
  [[ -n "$model" ]] || die "Model required. Usage: ./llama bench MODEL"
  [[ -n "${MODEL_FILE[$model]+_}" ]] || die "Unknown model: $model"

  check_model_file "$model"

  current=$(running_model)
  if [[ -n "$current" ]]; then
    warn "Server is running ($current). Bench will compete for GPU."
    confirm "Continue anyway?" || { echo "Aborted."; exit 0; }
    # NOTE(review): confirming brings the whole stack down before the bench
    # runs, so the running server is stopped rather than left to compete.
    docker compose down --remove-orphans 2>/dev/null || true
  fi

  info "Running benchmark for $model..."
  docker compose --profile "bench-${model}" run --rm "bench-${model}"
  ok "Results written to benchmark-results/"
}

# ── Interactive menu ──────────────────────────────────────────────────────────

# Show the status, list the models and act on the user's choice.
# A closed stdin is treated like "q".
menu() {
  local -a models=(smollm3 gemma4-e2b gemma4-e4b qwen3-4b qwen35-9b)
  local idx m missing choice="" ans=""

  echo ""
  printf '%s╔══════════════════════════════════════════════════════╗%s\n' "$BOLD" "$NC"
  printf '%s║ llama.cpp model launcher ║%s\n' "$BOLD" "$NC"
  printf '%s╚══════════════════════════════════════════════════════╝%s\n' "$BOLD" "$NC"
  echo ""

  cmd_status

  printf '%sSelect a model:%s\n' "$BOLD" "$NC"
  for idx in "${!models[@]}"; do
    m="${models[$idx]}"
    missing=""
    if [[ ! -f "$MODELS_DIR/${MODEL_FILE[$m]}" ]]; then
      missing=" ${R}[not downloaded]${NC}"
    fi
    printf ' %s%d)%s %s%s\n' "$C" "$((idx + 1))" "$NC" "${MODEL_LABEL[$m]}" "$missing"
  done
  echo ""
  printf ' %ss)%s stop all\n' "$C" "$NC"
  printf ' %sq)%s quit\n' "$C" "$NC"
  echo ""
  echo -n "Choice: "

  if ! read -r choice && [[ -z "$choice" ]]; then
    echo ""
    return 0
  fi

  case "$choice" in
    s | S)
      cmd_stop
      ;;
    q | Q)
      return 0
      ;;
    *)
      if [[ "$choice" =~ ^[1-9][0-9]*$ ]] && (( choice <= ${#models[@]} )); then
        local model="${models[$((choice - 1))]}"
        local -a start_args=("$model")

        if _supports_bigctx "$model"; then
          echo -n "Use bigctx profile (larger context, slower)? [y/N] "
          ans=""
          read -r ans || true
          if [[ "${ans,,}" == "y" ]]; then start_args+=(--bigctx); fi
        fi

        echo -n "Include Open WebUI? [y/N] "
        ans=""
        read -r ans || true
        if [[ "${ans,,}" == "y" ]]; then start_args+=(--webui); fi

        echo ""
        cmd_start "${start_args[@]}"
      else
        warn "Invalid choice."
        menu
      fi
      ;;
  esac
}

# ── Entrypoint ────────────────────────────────────────────────────────────────

# Print the command summary.
usage() {
  echo ""
  printf '%sUsage:%s\n' "$BOLD" "$NC"
  echo " ./llama interactive menu"
  echo " ./llama start MODEL [--bigctx] [--webui]"
  echo " ./llama stop"
  echo " ./llama status"
  echo " ./llama logs [--follow]"
  echo " ./llama build"
  echo " ./llama bench MODEL"
  echo ""
  printf '%sModels:%s smollm3 | gemma4-e2b | gemma4-e4b | qwen3-4b | qwen35-9b\n' "$BOLD" "$NC"
  echo ""
}

# Dispatch the sub-command given on the command line.
main() {
  case "${1:-}" in
    "") menu ;;
    start) shift; cmd_start "$@" ;;
    stop) cmd_stop ;;
    status) cmd_status ;;
    logs) shift; cmd_logs "$@" ;;
    build) cmd_build ;;
    bench) shift; cmd_bench "$@" ;;
    help | --help | -h) usage ;;
    *) die "Unknown command: $1. Run ./llama help" ;;
  esac
}

main "$@"