Files
llama-cpp/llama
Giancarmine Salucci e7e389c0e1 llama+compose: fix bigctx startup timing
- compose: increase start_period for bigctx services
  - gemma4-e4b-bigctx: 60s -> 150s (5 GiB model + warmup + 163840 ctx takes ~90-120s)
  - gemma4-e2b-bigctx: 60s -> 120s (large ctx 393216 allocation)
  - smollm3/qwen3-4b bigctx: 60s -> 90s
- llama: extend health poll from 30x2s=60s to 75x2s=150s
- llama: require 3 consecutive unhealthy before giving up (avoids
  false positives during Docker start_period window)
2026-05-06 19:03:31 +02:00

391 lines
14 KiB
Bash
Executable File

#!/usr/bin/env bash
# llama — utility script for the llama.cpp Docker Compose stack
#
# Usage:
#   ./llama                                      interactive menu
#   ./llama start <model> [--bigctx] [--webui]
#   ./llama stop
#   ./llama status
#   ./llama logs [--follow]
#   ./llama build
#   ./llama bench <model>
#
# Models: smollm3 | gemma4-e2b | gemma4-e4b | qwen3-4b | qwen35-9b

# Strict mode: abort on unhandled errors, on unset variables, and when any
# stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Work from the directory containing this script so the relative paths used
# below (./models, scripts/...) resolve regardless of the caller's cwd.
cd "$(dirname "$0")"
# ── Colors ────────────────────────────────────────────────────────────────────
# Escape sequences are stored as literal text and interpreted later by
# `echo -e`. They stay empty unless stdout is a terminal, so redirected or
# piped output carries no control codes.
G=''
Y=''
R=''
B=''
C=''
BOLD=''
NC=''
if [[ -t 1 ]]; then
  G='\033[0;32m'     # green
  Y='\033[1;33m'     # bold yellow
  R='\033[0;31m'     # red
  B='\033[0;34m'     # blue
  C='\033[0;36m'     # cyan
  BOLD='\033[1m'
  NC='\033[0m'       # reset
fi
# Directory (relative to the script) that holds the GGUF model files.
MODELS_DIR="./models"

# ── Model metadata ────────────────────────────────────────────────────────────
# Both tables are keyed by the model name accepted on the command line.
declare -A MODEL_FILE MODEL_LABEL

# Model name → GGUF file name inside $MODELS_DIR.
MODEL_FILE[smollm3]="HuggingFaceTB_SmolLM3-3B-Q4_K_M.gguf"
MODEL_FILE[gemma4-e2b]="google_gemma-4-E2B-it-Q4_K_M.gguf"
MODEL_FILE[gemma4-e4b]="google_gemma-4-E4B-it-Q4_K_M.gguf"
MODEL_FILE[qwen3-4b]="Qwen3-4B-Q4_K_M.gguf"
MODEL_FILE[qwen35-9b]="Qwen3.5-9B.Q8_0.gguf"

# Model name → one-line description shown in the menu and on start.
MODEL_LABEL[smollm3]="SmolLM3-3B (~53 t/s, ctx 24K/65K bigctx, thinking+tools)"
MODEL_LABEL[gemma4-e2b]="Gemma4-E2B (~62 t/s, ctx 24K/393K bigctx, multimodal)"
MODEL_LABEL[gemma4-e4b]="Gemma4-E4B (~30 t/s, ctx 24K/164K bigctx, multimodal)"
MODEL_LABEL[qwen3-4b]="Qwen3-4B (~39 t/s, ctx 16K/24K bigctx, thinking+tools)"
MODEL_LABEL[qwen35-9b]="Qwen3.5-9B (~4.4 t/s, ctx 32K, reasoning distill)"

# Models for which `--bigctx` is accepted.
BIGCTX_MODELS=(
  smollm3
  gemma4-e2b
  gemma4-e4b
  qwen3-4b
)
# ── Helpers ───────────────────────────────────────────────────────────────────
# Print "$2" wrapped in the colour prefix "$1" and the reset code.
_say() {
  echo -e "${1}${2}${NC}"
}

# Print a red "Error: ..." line on stderr and terminate with status 1.
die() {
  _say "${R}" "Error: $*" >&2
  exit 1
}

# Blue informational line on stdout.
info() {
  _say "${B}" "$*"
}

# Green success line on stdout.
ok() {
  _say "${G}" "$*"
}

# Yellow warning line on stdout.
warn() {
  _say "${Y}" "$*"
}
# Ask a yes/no question on stdout and read one line from stdin.
# Arguments: $1 - question text (" [y/N] " is appended)
# Returns:   0 only when the reply is exactly "y" or "Y"; 1 otherwise.
confirm() {
  local question="$1" reply
  echo -en "${Y}${question} [y/N] ${NC}"
  read -r reply
  case "${reply,,}" in
    y) return 0 ;;
    *) return 1 ;;
  esac
}
# Print the model profile of the running llama_server container, or nothing.
#
# The profile is taken from the COMPOSE_PROFILES entry in the container's
# environment: the first comma-separated element that does not contain
# "bigctx" or "webui".
# NOTE(review): a profile named "<model>-bigctx" is also dropped by that
# filter — confirm the compose file exposes the base model name as well.
#
# Outputs: profile name on stdout (may be empty)
# Returns: always 0. "No container", "no COMPOSE_PROFILES entry" and "every
#          profile filtered out" are normal results; under `set -e` with
#          `pipefail` a non-zero status here would abort callers that do
#          `current=$(running_model)`.
running_model() {
  local name
  name=$(docker ps --filter "name=llama_server" --format "{{.Names}}" 2>/dev/null | head -1) || true
  [[ -z "$name" ]] && return 0

  # A single awk pass replaces grep | cut | tr | grep -v | head, so a missing
  # match cannot fail the pipeline. The trailing `|| true` covers a failing
  # `docker inspect` and the SIGPIPE caused by awk exiting early.
  docker inspect "$name" --format '{{range .Config.Env}}{{println .}}{{end}}' 2>/dev/null \
    | awk -F= '
        /^COMPOSE_PROFILES=/ {
          n = split($2, profiles, ",")
          for (i = 1; i <= n; i++) {
            if (profiles[i] !~ /bigctx|webui/) {
              print profiles[i]
              exit
            }
          }
        }' || true
}
# Print the raw COMPOSE_PROFILES value (comma-separated) found in the
# environment of the running llama_server container, or nothing.
#
# Outputs: the value after "COMPOSE_PROFILES=" on stdout (may be empty)
# Returns: always 0. Without the trailing `|| true`, `pipefail` would turn
#          "grep matched nothing" into a failure and `set -e` would abort
#          any caller using `$(running_profiles)`.
running_profiles() {
  local name
  name=$(docker ps --filter "name=llama_server" --format "{{.Names}}" 2>/dev/null | head -1) || true
  [[ -z "$name" ]] && return 0

  docker inspect "$name" --format '{{range .Config.Env}}{{println .}}{{end}}' 2>/dev/null \
    | grep "^COMPOSE_PROFILES=" \
    | cut -d= -f2 || true
}
# Make sure the GGUF file for a model is present, offering to download it.
# Arguments: $1 - model name (key of MODEL_FILE)
# Returns:   0 when the model has no file entry or the file exists; otherwise
#            the status of the download script. Calls die when the file is
#            missing and the user declines the download.
check_model_file() {
  local model="$1"
  local filename="${MODEL_FILE[$model]:-}"

  # Names without a file entry are not checked here.
  if [[ -z "$filename" ]]; then
    return 0
  fi

  local path="$MODELS_DIR/$filename"
  if [[ -f "$path" ]]; then
    return 0
  fi

  warn "Model file not found: $path"
  if ! confirm "Download it now?"; then
    die "Model file missing. Run: bash scripts/download_models.sh $model"
  fi
  bash scripts/download_models.sh "$model"
}
# Prepare for starting a new model: if a model is reported as running, ask
# before tearing the compose stack down, then free the fixed container names.
# Exits the script with status 0 when the user declines.
stop_running() {
  local current
  current=$(running_model)

  if [[ -n "$current" ]]; then
    warn "Currently running: $current"
    if ! confirm "Stop it and start new model?"; then
      echo "Aborted."
      exit 0
    fi
    info "Stopping running containers..."
    # Best effort: a failing `down` must not block the start that follows.
    docker compose down --remove-orphans 2>/dev/null || true
  fi

  # Containers left over from previous runs or failed starts still hold the
  # reserved names; `ps -a` lists them in any state and `rm -f` removes them.
  for reserved in llama_server llama_bench open_webui; do
    docker ps -a --filter "name=^/${reserved}$" --format "{{.Names}}" 2>/dev/null \
      | grep -q "^${reserved}$" || continue
    docker rm -f "$reserved" &>/dev/null || true
  done
}
# Build the `--profile ...` arguments for `docker compose`.
# Arguments: $1 - model name
#            $2 - "1" to select the "<model>-bigctx" profile instead of "<model>"
#            $3 - "1" to add the "webui" profile
# Outputs:   one line of the form "--profile a --profile b " (trailing space)
build_profiles_flag() {
  local model="$1" bigctx="$2" webui="$3"

  # The bigctx profile replaces the base profile; it is not added to it.
  local primary="$model"
  if [[ "$bigctx" == "1" ]]; then
    primary="${model}-bigctx"
  fi

  local out="--profile ${primary} "
  if [[ "$webui" == "1" ]]; then
    out+="--profile webui "
  fi
  echo "$out"
}
# ── Commands ──────────────────────────────────────────────────────────────────
# start <model> [--bigctx] [--webui]
# Validate the request, make sure the model file exists, stop whatever is
# running, bring the compose profile(s) up and wait for the health check.
# Returns: 0 once the server is healthy, and also when polling times out
#          while it is still starting; 1 after three consecutive
#          "unhealthy" polls.
cmd_start() {
  local model="" bigctx=0 webui=0

  # Flags may appear before or after the single positional model name.
  local arg
  for arg in "$@"; do
    case "$arg" in
      --bigctx) bigctx=1 ;;
      --webui) webui=1 ;;
      -*) die "Unknown flag: $arg" ;;
      *)
        if [[ -z "$model" ]]; then
          model="$arg"
        else
          die "Unexpected argument: $arg"
        fi
        ;;
    esac
  done

  if [[ -z "$model" ]]; then
    die "Model required. Usage: ./llama start <model> [--bigctx] [--webui]"
  fi
  if [[ -z "${MODEL_FILE[$model]+_}" ]]; then
    die "Unknown model: $model. Valid: ${!MODEL_FILE[*]}"
  fi

  if [[ "$bigctx" == "1" ]]; then
    local supported=0 candidate
    for candidate in "${BIGCTX_MODELS[@]}"; do
      if [[ "$candidate" == "$model" ]]; then
        supported=1
      fi
    done
    if [[ "$supported" == "0" ]]; then
      die "$model does not have a bigctx profile"
    fi
  fi

  check_model_file "$model"
  stop_running

  local profile_flags
  profile_flags=$(build_profiles_flag "$model" "$bigctx" "$webui")

  local desc="${MODEL_LABEL[$model]}"
  if [[ "$bigctx" == "1" ]]; then desc+=" [bigctx]"; fi
  if [[ "$webui" == "1" ]]; then desc+=" [+webui]"; fi
  info "Starting: $desc"

  # Word splitting of $profile_flags is intentional: it holds several
  # "--profile <name>" arguments.
  # shellcheck disable=SC2086
  docker compose $profile_flags up -d
  echo ""
  info "Waiting for health check..."

  # bigctx / heavy models can take >2 min to load: 75 polls, 2s apart.
  local max_polls=75
  local attempt status
  local consecutive_unhealthy=0
  for (( attempt = 0; attempt < max_polls; attempt++ )); do
    # A failing inspect is treated the same as a container still starting.
    status=$(docker inspect llama_server --format '{{.State.Health.Status}}' 2>/dev/null || echo "starting")
    case "$status" in
      healthy)
        echo ""
        ok "Server is healthy → http://localhost:8080"
        if [[ "$webui" == "1" ]]; then
          ok "Open WebUI → http://localhost:3000"
        fi
        return 0
        ;;
      unhealthy)
        # Give up only after 3 consecutive unhealthy polls, so a transient
        # report does not abort the wait.
        consecutive_unhealthy=$(( consecutive_unhealthy + 1 ))
        if (( consecutive_unhealthy >= 3 )); then
          echo ""
          warn "Server reported unhealthy. Check logs: ./llama logs"
          return 1
        fi
        ;;
      *)
        consecutive_unhealthy=0
        ;;
    esac
    echo -n "."
    sleep 2
  done

  echo ""
  warn "Still starting (health check pending). Try: ./llama status"
}
# stop
# Tear the compose stack down when a model is reported as running, then
# remove any container still holding one of the reserved names.
cmd_stop() {
  local current
  current=$(running_model)

  if [[ -n "$current" ]]; then
    info "Stopping $current..."
    docker compose down --remove-orphans
  else
    info "No llama containers running."
  fi

  # Count only removals that actually succeeded.
  local cleaned=0
  for reserved in llama_server llama_bench open_webui; do
    docker ps -a --filter "name=^/${reserved}$" --format "{{.Names}}" 2>/dev/null \
      | grep -q "^${reserved}$" || continue
    if docker rm -f "$reserved" &>/dev/null; then
      cleaned=$(( cleaned + 1 ))
    fi
  done

  if [[ $cleaned -gt 0 ]]; then
    info "Removed $cleaned stopped container(s)."
  fi
  ok "Clean."
}
# status
# Print a summary of the llama_server container (health, uptime, selected
# environment settings, API URL) and note whether open_webui is running.
#
# Changes relative to the previous version:
#   * the env-var lookup no longer aborts the script (set -e + pipefail)
#     when the container defines none of the listed variables;
#   * uptime parsing drops the fractional seconds that Docker emits with
#     nanosecond precision, which datetime.fromisoformat rejects before
#     Python 3.11, and uses timezone-aware datetimes, not utcnow();
#   * each status line carries a visible coloured marker, and the container
#     name is separated from its health state.
cmd_status() {
  echo ""
  echo -e "${BOLD}── llama.cpp server status ──────────────────────────────${NC}"

  local name
  name=$(docker ps --filter "name=llama_server" --format "{{.Names}}" 2>/dev/null | head -1)
  if [[ -z "$name" ]]; then
    echo -e " ${R}●${NC} No server running"
  else
    local health started uptime model_env line
    health=$(docker inspect "$name" --format '{{.State.Health.Status}}' 2>/dev/null || echo "unknown")

    # The timestamp is passed as an argument; any failure (no python3,
    # unparsable value) degrades to "?" without printing an error.
    started=$(docker inspect "$name" --format '{{.State.StartedAt}}' 2>/dev/null || true)
    uptime=$(python3 -c "import sys, datetime as dt; \
      stamp = sys.argv[1].strip().split('.')[0].rstrip('Z'); \
      start = dt.datetime.strptime(stamp, '%Y-%m-%dT%H:%M:%S').replace(tzinfo=dt.timezone.utc); \
      secs = int((dt.datetime.now(dt.timezone.utc) - start).total_seconds()); \
      h, r = divmod(secs, 3600); m = r // 60; \
      print(f'{h}h {m}m' if h else f'{m}m')" "$started" 2>/dev/null) || uptime="?"

    # grep exits 1 when nothing matches; that is a normal outcome here.
    model_env=$(docker inspect "$name" --format '{{range .Config.Env}}{{println .}}{{end}}' 2>/dev/null \
      | grep "^MODEL_FILE=\|^CTX_SIZE=\|^N_GPU_LAYERS=" | sort) || true

    local color="${G}"
    [[ "$health" == "unhealthy" ]] && color="${R}"
    [[ "$health" == "starting" ]] && color="${Y}"

    echo -e " ${color}●${NC} ${BOLD}${name}${NC}  ${health} (up ${uptime})"
    # Skip the loop when empty: a here-string always yields at least one line.
    if [[ -n "$model_env" ]]; then
      while IFS= read -r line; do
        echo -e " ${C}${line}${NC}"
      done <<< "$model_env"
    fi
    echo -e " ${C}API → http://localhost:8080${NC}"
  fi

  # WebUI
  local wname
  wname=$(docker ps --filter "name=open_webui" --format "{{.Names}}" 2>/dev/null | head -1)
  if [[ -n "$wname" ]]; then
    echo -e " ${G}●${NC} open_webui running → http://localhost:3000"
  fi
  echo ""
}
# logs [--follow|-f]
# Show the last 80 log lines of the running llama_server container, or
# follow the log starting from the last 50 lines. Dies when no server is
# running.
cmd_logs() {
  local log_args=(--tail 80)
  case "${1:-}" in
    --follow | -f) log_args=(-f --tail 50) ;;
  esac

  local name
  name=$(docker ps --filter "name=llama_server" --format "{{.Names}}" 2>/dev/null | head -1)
  if [[ -z "$name" ]]; then
    die "No server running."
  fi

  docker logs "${log_args[@]}" "$name"
}
# build
# Build the images by building the llama-qwen35-9b compose service under the
# qwen35-9b profile, then report the resulting image tags.
cmd_build() {
  local repo="local/llama-cpp-turboquant"

  info "Building TurboQuant image (full + server targets)..."
  info "This takes ~20 minutes on first build."

  docker compose --profile qwen35-9b build llama-qwen35-9b

  ok "Images built: ${repo}:server-cuda-sm75-mmq"
  ok " ${repo}:full-cuda-sm75-mmq"
}
# bench <model>
# Run the "bench-<model>" compose service once and remove its container
# afterwards.
# NOTE(review): the prompt warns that the bench will compete with the running
# server for the GPU, yet confirming it tears the stack down before the
# benchmark — confirm which of the two is intended.
cmd_bench() {
  local model="${1:-}"
  if [[ -z "$model" ]]; then
    die "Model required. Usage: ./llama bench <model>"
  fi
  if [[ -z "${MODEL_FILE[$model]+_}" ]]; then
    die "Unknown model: $model"
  fi
  check_model_file "$model"

  local active
  active=$(running_model)
  if [[ -n "$active" ]]; then
    warn "Server is running ($active). Bench will compete for GPU."
    if ! confirm "Continue anyway?"; then
      echo "Aborted."
      exit 0
    fi
    docker compose down --remove-orphans 2>/dev/null || true
  fi

  # Profile and service share the same name.
  local target="bench-${model}"
  info "Running benchmark for $model..."
  docker compose --profile "$target" run --rm "$target"
  ok "Results written to benchmark-results/"
}
# ── Interactive menu ──────────────────────────────────────────────────────────
# Interactive launcher: show the current status, list the models, then start
# the chosen one (optionally with bigctx / Open WebUI), stop everything, or
# quit. An invalid choice shows the menu again.
#
# Changes relative to the previous version:
#   * the accepted numeric range follows the size of the `models` list;
#   * loop and prompt variables are local;
#   * the counter no longer relies on a bare `(( i++ ))` under `set -e`;
#   * start options are collected in an array, not an unquoted string;
#   * the title row is padded to the width of the box border.
menu() {
  local box_width=54
  local bar title_row
  printf -v bar '%*s' "$box_width" ''
  bar=${bar// /═}
  printf -v title_row '%-*s' "$box_width" "   llama.cpp model launcher"

  echo ""
  echo -e "${BOLD}╔${bar}╗${NC}"
  echo -e "${BOLD}║${title_row}║${NC}"
  echo -e "${BOLD}╚${bar}╝${NC}"
  echo ""
  cmd_status

  echo -e "${BOLD}Select a model:${NC}"
  local models=(smollm3 gemma4-e2b gemma4-e4b qwen3-4b qwen35-9b)
  local idx num m missing
  for idx in "${!models[@]}"; do
    m="${models[idx]}"
    num=$(( idx + 1 ))
    missing=""
    if [[ ! -f "$MODELS_DIR/${MODEL_FILE[$m]}" ]]; then
      missing=" ${R}[not downloaded]${NC}"
    fi
    echo -e " ${C}${num})${NC} ${MODEL_LABEL[$m]}${missing}"
  done
  echo ""
  echo -e " ${C}s)${NC} stop all"
  echo -e " ${C}q)${NC} quit"
  echo ""
  echo -n "Choice: "

  local choice
  read -r choice
  case "$choice" in
    s|S)
      cmd_stop
      ;;
    q|Q)
      exit 0
      ;;
    *)
      # Accept 1..N for the N listed models; no leading zeros, so the value
      # is never read as octal.
      if [[ "$choice" =~ ^[1-9][0-9]{0,2}$ ]] && (( choice <= ${#models[@]} )); then
        local model="${models[$(( choice - 1 ))]}"
        local start_args=("$model")
        local ans has_bigctx=0

        # Offer bigctx only for models that have such a profile.
        for m in "${BIGCTX_MODELS[@]}"; do
          if [[ "$m" == "$model" ]]; then
            has_bigctx=1
          fi
        done
        if [[ "$has_bigctx" == "1" ]]; then
          echo -n "Use bigctx profile (larger context, slower)? [y/N] "
          read -r ans
          if [[ "${ans,,}" == "y" ]]; then
            start_args+=(--bigctx)
          fi
        fi

        echo -n "Include Open WebUI? [y/N] "
        read -r ans
        if [[ "${ans,,}" == "y" ]]; then
          start_args+=(--webui)
        fi
        echo ""

        cmd_start "${start_args[@]}"
      else
        warn "Invalid choice."
        menu
      fi
      ;;
  esac
}
# ── Entrypoint ────────────────────────────────────────────────────────────────
# Print the command overview and the list of model names on stdout.
usage() {
  echo ""
  echo -e "${BOLD}Usage:${NC}"
  # One argument per output line; printf repeats the format for each.
  printf '%s\n' \
    " ./llama interactive menu" \
    " ./llama start <model> [--bigctx] [--webui]" \
    " ./llama stop" \
    " ./llama status" \
    " ./llama logs [--follow]" \
    " ./llama build" \
    " ./llama bench <model>"
  echo ""
  echo -e "${BOLD}Models:${NC} smollm3 | gemma4-e2b | gemma4-e4b | qwen3-4b | qwen35-9b"
  echo ""
}
# Dispatch on the first command-line word; with no arguments, open the
# interactive menu. Only start, logs and bench receive the remaining
# arguments.
main() {
  local command="${1:-}"
  if [[ $# -gt 0 ]]; then
    shift
  fi

  case "$command" in
    "") menu ;;
    start) cmd_start "$@" ;;
    stop) cmd_stop ;;
    status) cmd_status ;;
    logs) cmd_logs "$@" ;;
    build) cmd_build ;;
    bench) cmd_bench "$@" ;;
    help | --help | -h) usage ;;
    *) die "Unknown command: $command. Run ./llama help" ;;
  esac
}

main "$@"