#!/usr/bin/env bash
# llama — utility script for the llama.cpp Docker Compose stack
#
# Usage:
#   ./llama                                      interactive menu
#   ./llama start <model> [--bigctx] [--webui]
#   ./llama stop
#   ./llama status
#   ./llama logs [--follow]
#   ./llama build
#   ./llama bench <model>
#
# Models: smollm3 | gemma4-e2b | gemma4-e4b | qwen3-4b | qwen35-9b
|
|
|
|
# Strict mode: abort on unhandled errors, unset variables and failed pipelines.
set -euo pipefail

# Work from the directory that contains this script so the relative paths
# used below (e.g. ./models) resolve no matter where it is invoked from.
cd "$(dirname "$0")"

# ── Colors ────────────────────────────────────────────────────────────────────
# The escape sequences are kept as literal backslash strings; they are expanded
# at print time by `echo -e`. They stay empty unless stdout is a terminal, so
# piped or redirected output carries no control codes.
G='' Y='' R='' B='' C='' BOLD='' NC=''
if [[ -t 1 ]]; then
  G='\033[0;32m'
  Y='\033[1;33m'
  R='\033[0;31m'
  B='\033[0;34m'
  C='\033[0;36m'
  BOLD='\033[1m'
  NC='\033[0m'
fi

# Directory (relative to the script) where model files are looked up.
MODELS_DIR="./models"
|
|
|
|
# ── Model metadata ────────────────────────────────────────────────────────────
|
|
# Model key → GGUF file name expected inside $MODELS_DIR.
declare -A MODEL_FILE=()
MODEL_FILE[smollm3]="HuggingFaceTB_SmolLM3-3B-Q4_K_M.gguf"
MODEL_FILE[gemma4-e2b]="google_gemma-4-E2B-it-Q4_K_M.gguf"
MODEL_FILE[gemma4-e4b]="google_gemma-4-E4B-it-Q4_K_M.gguf"
MODEL_FILE[qwen3-4b]="Qwen3-4B-Q4_K_M.gguf"
MODEL_FILE[qwen35-9b]="Qwen3.5-9B.Q8_0.gguf"

# Model key → human-readable description shown in the menu and on start.
declare -A MODEL_LABEL=()
MODEL_LABEL[smollm3]="SmolLM3-3B (~53 t/s, ctx 24K/65K bigctx, thinking+tools)"
MODEL_LABEL[gemma4-e2b]="Gemma4-E2B (~62 t/s, ctx 24K/393K bigctx, multimodal)"
MODEL_LABEL[gemma4-e4b]="Gemma4-E4B (~30 t/s, ctx 24K/164K bigctx, multimodal)"
MODEL_LABEL[qwen3-4b]="Qwen3-4B (~39 t/s, ctx 16K/24K bigctx, thinking+tools)"
MODEL_LABEL[qwen35-9b]="Qwen3.5-9B (~4.4 t/s, ctx 32K, reasoning distill)"

# Model keys for which --bigctx is accepted.
BIGCTX_MODELS=(
  smollm3
  gemma4-e2b
  gemma4-e4b
  qwen3-4b
)
|
|
|
|
# ── Helpers ───────────────────────────────────────────────────────────────────
|
|
# Print an error message to stderr (red when colors are enabled) and exit 1.
die() {
  printf '%b\n' "${R}Error: $*${NC}" >&2
  exit 1
}

# Print an informational progress line to stdout.
info() {
  printf '%b\n' "${B}▸ $*${NC}"
}

# Print a success line to stdout.
ok() {
  printf '%b\n' "${G}✓ $*${NC}"
}

# Print a warning line to stdout.
warn() {
  printf '%b\n' "${Y}⚠ $*${NC}"
}
|
|
|
|
# Ask a yes/no question on stdout and read one line from stdin.
# Arguments: $1 - question text (a " [y/N] " suffix is appended)
# Returns:   0 only when the reply is exactly "y" or "Y"; 1 for anything else,
#            including an empty reply.
confirm() {
  local question="$1" reply
  printf '%b' "${Y}${question} [y/N] ${NC}"
  read -r reply
  case "${reply,,}" in
    y) return 0 ;;
    *) return 1 ;;
  esac
}
|
|
|
|
# Print the model name of the running llama_server container.
#
# The name is taken from the COMPOSE_PROFILES variable in the container's
# environment: the first entry that is not the auxiliary `webui` / `bigctx`
# profile, with a trailing `-bigctx` removed (start uses `<model>-bigctx` as
# the profile name for big-context runs).
#
# Outputs: the model name on stdout, or nothing when no container is running
#          or no profile can be determined.
# Returns: always 0. Callers use `current=$(running_model)` under
#          `set -euo pipefail`; a non-zero status here would abort the script.
running_model() {
  local name env_line entry profiles=""

  # A failing docker call is treated as "nothing running".
  name=$(docker ps --filter "name=llama_server" --format "{{.Names}}" 2>/dev/null | head -1) || true
  [[ -n "$name" ]] || return 0

  # Scan the container environment for COMPOSE_PROFILES without relying on
  # grep's exit status (no match is a normal outcome, not an error).
  while IFS= read -r env_line; do
    if [[ "$env_line" == COMPOSE_PROFILES=* ]]; then
      profiles="${env_line#COMPOSE_PROFILES=}"
      break
    fi
  done < <(docker inspect "$name" --format '{{range .Config.Env}}{{println .}}{{end}}' 2>/dev/null)

  [[ -n "$profiles" ]] || return 0

  # Walk the comma-separated list; the appended comma terminates the last entry.
  while IFS= read -r -d ',' entry; do
    case "$entry" in
      '' | webui | bigctx) continue ;;
    esac
    printf '%s\n' "${entry%-bigctx}"
    return 0
  done < <(printf '%s,' "$profiles")

  return 0
}
|
|
|
|
# Print the raw COMPOSE_PROFILES value of the running llama_server container.
#
# Outputs: the comma-separated profile list on stdout, or nothing when no
#          container is running or the variable is not in its environment.
# Returns: always 0, so `$(running_profiles)` is safe under `set -euo pipefail`
#          (a no-match grep would otherwise fail the whole pipeline).
running_profiles() {
  local name

  # A failing docker call is treated as "nothing running".
  name=$(docker ps --filter "name=llama_server" --format "{{.Names}}" 2>/dev/null | head -1) || true
  [[ -n "$name" ]] || return 0

  # -f2- keeps the complete value even if it contains '='.
  docker inspect "$name" --format '{{range .Config.Env}}{{println .}}{{end}}' 2>/dev/null \
    | grep "^COMPOSE_PROFILES=" \
    | cut -d= -f2- || true
}
|
|
|
|
# Make sure the GGUF file for a model is present, offering to download it.
# Globals:   MODEL_FILE (read), MODELS_DIR (read)
# Arguments: $1 - model key
# Returns:   0 when the model has no registered file or the file exists;
#            otherwise the exit status of scripts/download_models.sh. Exits
#            via die when the user declines the download.
check_model_file() {
  local model="$1"
  local gguf="${MODEL_FILE[$model]:-}"

  # Nothing to verify: unknown key, or the file is already on disk.
  if [[ -z "$gguf" || -f "$MODELS_DIR/$gguf" ]]; then
    return 0
  fi

  warn "Model file not found: $MODELS_DIR/$gguf"
  confirm "Download it now?" \
    || die "Model file missing. Run: bash scripts/download_models.sh $model"
  bash scripts/download_models.sh "$model"
}
|
|
|
|
# Stop whatever is currently running (after confirmation) and clear leftover
# containers so a new model can be started.
#
# If a running model is detected the user is asked to confirm; declining prints
# "Aborted." and exits the script with status 0. Afterwards any container that
# still holds one of the reserved names is force-removed.
# Returns: 0.
stop_running() {
  local current reserved

  # A non-zero status from running_model must not abort the script under
  # `set -e`; treat it as "no running model identified" and still clean up.
  current=$(running_model) || true

  if [[ -n "$current" ]]; then
    warn "Currently running: $current"
    confirm "Stop it and start new model?" || { echo "Aborted."; exit 0; }
    info "Stopping running containers..."
    # Best effort: the reserved-name cleanup below handles whatever remains.
    docker compose down --remove-orphans 2>/dev/null || true
  fi

  # Remove containers holding reserved names (from previous runs or failed starts).
  for reserved in llama_server llama_bench open_webui; do
    if docker ps -a --filter "name=^/${reserved}$" --format "{{.Names}}" 2>/dev/null \
      | grep -qxF -- "$reserved"; then
      docker rm -f "$reserved" &>/dev/null || true
    fi
  done
}
|
|
|
|
# Build the `--profile` arguments for docker compose.
# Arguments: $1 - model key
#            $2 - "1" to use the big-context profile
#            $3 - "1" to add the webui profile
# Outputs:   one line of the form "--profile <name> [--profile webui] "
#            (each flag is followed by a space); callers word-split it.
build_profiles_flag() {
  local model="$1" bigctx="$2" webui="$3"

  # bigctx replaces the base profile rather than being added next to it.
  local primary="$model"
  if [[ "$bigctx" == "1" ]]; then
    primary="${model}-bigctx"
  fi

  local rendered="--profile ${primary} "
  if [[ "$webui" == "1" ]]; then
    rendered+="--profile webui "
  fi

  printf '%s\n' "$rendered"
}
|
|
|
|
# ── Commands ──────────────────────────────────────────────────────────────────
|
|
|
|
# start <model> [--bigctx] [--webui]
#
# Validates the arguments, makes sure the model file exists, stops anything
# already running, brings the selected compose profiles up and then polls the
# llama_server health status (30 polls, 2 s apart).
#
# Returns: 0 once the server reports healthy, and also when polling times out
#          while the server is still starting; 1 when it reports unhealthy.
#          Invalid arguments exit through die.
cmd_start() {
  local model="" bigctx=0 webui=0

  # Parse args: two optional flags plus exactly one positional model key.
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --bigctx) bigctx=1 ;;
      --webui) webui=1 ;;
      -*) die "Unknown flag: $1" ;;
      *)
        if [[ -z "$model" ]]; then
          model="$1"
        else
          die "Unexpected argument: $1"
        fi
        ;;
    esac
    shift
  done

  [[ -n "$model" ]] || die "Model required. Usage: ./llama start <model> [--bigctx] [--webui]"
  [[ -n "${MODEL_FILE[$model]+_}" ]] || die "Unknown model: $model. Valid: ${!MODEL_FILE[*]}"

  if [[ "$bigctx" == "1" ]]; then
    local candidate supported=0
    for candidate in "${BIGCTX_MODELS[@]}"; do
      if [[ "$candidate" == "$model" ]]; then
        supported=1
        break
      fi
    done
    [[ "$supported" == "1" ]] || die "$model does not have a bigctx profile"
  fi

  check_model_file "$model"
  stop_running

  # build_profiles_flag prints a space-separated flag string; split it into an
  # array once instead of relying on unquoted expansion at the call site.
  local profile_flags
  local -a profile_args
  profile_flags=$(build_profiles_flag "$model" "$bigctx" "$webui")
  read -r -a profile_args <<< "$profile_flags"

  local desc="${MODEL_LABEL[$model]}"
  if [[ "$bigctx" == "1" ]]; then desc+=" [bigctx]"; fi
  if [[ "$webui" == "1" ]]; then desc+=" [+webui]"; fi
  info "Starting: $desc"

  docker compose "${profile_args[@]}" up -d

  echo ""
  info "Waiting for health check..."

  # The counter lives in the for-header on purpose: a bare `(( i++ ))`
  # statement returns status 1 when i is 0, which `set -e` treats as a failure
  # and would end the script after the very first poll.
  local attempt status
  for (( attempt = 0; attempt < 30; attempt++ )); do
    # An inspect error (e.g. container not created yet) counts as "starting".
    status=$(docker inspect llama_server --format '{{.State.Health.Status}}' 2>/dev/null || echo "starting")
    case "$status" in
      healthy)
        # Finish the line of progress dots before the result message.
        if (( attempt > 0 )); then echo ""; fi
        ok "Server is healthy → http://localhost:8080"
        if [[ "$webui" == "1" ]]; then
          ok "Open WebUI → http://localhost:3000"
        fi
        return 0
        ;;
      unhealthy)
        if (( attempt > 0 )); then echo ""; fi
        warn "Server reported unhealthy. Check logs: ./llama logs"
        return 1
        ;;
    esac
    echo -n "."
    sleep 2
  done

  echo ""
  warn "Still starting (health check pending). Try: ./llama status"
}
|
|
|
|
# stop
#
# Brings the compose stack down when a running model is detected, then
# force-removes any container still holding one of the reserved names and
# reports how many were removed.
# Returns: 0 (a failing `docker compose down` still aborts via `set -e`).
cmd_stop() {
  local current reserved
  local cleaned=0

  # A non-zero status from running_model must not abort the script under
  # `set -e`: the reserved-name cleanup below has to run in any case.
  current=$(running_model) || true

  if [[ -z "$current" ]]; then
    info "No llama containers running."
  else
    info "Stopping $current..."
    docker compose down --remove-orphans
  fi

  # Remove any containers holding reserved names.
  for reserved in llama_server llama_bench open_webui; do
    if docker ps -a --filter "name=^/${reserved}$" --format "{{.Names}}" 2>/dev/null \
      | grep -qxF -- "$reserved"; then
      if docker rm -f "$reserved" &>/dev/null; then
        cleaned=$((cleaned + 1))
      fi
    fi
  done

  if (( cleaned > 0 )); then
    info "Removed $cleaned stopped container(s)."
  fi
  ok "Clean."
}
|
|
|
|
# status
#
# Prints a short report: whether a llama_server container is running, its
# health, uptime and selected environment settings (MODEL_FILE, CTX_SIZE,
# N_GPU_LAYERS), plus a line for Open WebUI when that container is up.
# Every lookup is best effort: a failed docker or python3 call degrades to a
# placeholder instead of aborting the script under `set -euo pipefail`.
cmd_status() {
  echo ""
  echo -e "${BOLD}── llama.cpp server status ──────────────────────────────${NC}"

  local name
  name=$(docker ps --filter "name=llama_server" --format "{{.Names}}" 2>/dev/null | head -1) || true

  if [[ -z "$name" ]]; then
    echo -e " ${R}●${NC} No server running"
  else
    local health started uptime model_env line
    health=$(docker inspect "$name" --format '{{.State.Health.Status}}' 2>/dev/null || echo "unknown")
    started=$(docker inspect "$name" --format '{{.State.StartedAt}}' 2>/dev/null || true)

    # The timestamp is cut down to whole seconds before parsing because it can
    # carry more fractional digits than datetime.fromisoformat() accepts on
    # older Python 3 releases. Any failure falls back to "?".
    uptime=$(python3 -c '
import sys, datetime
stamp = sys.argv[1].strip().rstrip("Z").split(".")[0]
start = datetime.datetime.strptime(stamp, "%Y-%m-%dT%H:%M:%S")
now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
hours, rest = divmod(int((now - start).total_seconds()), 3600)
minutes = rest // 60
print(f"{hours}h {minutes}m" if hours else f"{minutes}m")
' "$started" 2>/dev/null || echo "?")

    # grep exits 1 when none of the variables are set; with pipefail that
    # would fail the assignment and abort, so it is explicitly tolerated.
    model_env=$(docker inspect "$name" --format '{{range .Config.Env}}{{println .}}{{end}}' 2>/dev/null \
      | grep -E '^(MODEL_FILE|CTX_SIZE|N_GPU_LAYERS)=' | sort) || true

    local color="${G}"
    case "$health" in
      unhealthy) color="${R}" ;;
      starting) color="${Y}" ;;
    esac

    echo -e " ${color}●${NC} ${BOLD}${name}${NC} — ${health} (up ${uptime})"
    if [[ -n "$model_env" ]]; then
      while IFS= read -r line; do
        # %s prints the value verbatim (no backslash interpretation).
        printf ' %b%s%b\n' "${C}" "$line" "${NC}"
      done <<< "$model_env"
    fi
    echo -e " ${C}API → http://localhost:8080${NC}"
  fi

  # WebUI
  local wname
  wname=$(docker ps --filter "name=open_webui" --format "{{.Names}}" 2>/dev/null | head -1) || true
  if [[ -n "$wname" ]]; then
    echo -e " ${G}●${NC} open_webui running → http://localhost:3000"
  fi

  echo ""
}
|
|
|
|
# logs [--follow|-f]
#
# Shows the last 80 log lines of the running llama_server container, or
# follows the log (starting from the last 50 lines) when --follow / -f is the
# first argument. Exits through die when no server container is running.
cmd_logs() {
  local container
  container=$(docker ps --filter "name=llama_server" --format "{{.Names}}" 2>/dev/null | head -1)
  if [[ -z "$container" ]]; then
    die "No server running."
  fi

  case "${1:-}" in
    --follow | -f)
      docker logs -f --tail 50 "$container"
      ;;
    *)
      docker logs --tail 80 "$container"
      ;;
  esac
}
|
|
|
|
# build
#
# Builds the image through the llama-qwen35-9b compose service (profile
# qwen35-9b) and then prints the two image tags reported as built.
cmd_build() {
  local -r image_repo="local/llama-cpp-turboquant"

  info "Building TurboQuant image (full + server targets)..."
  info "This takes ~20 minutes on first build."

  docker compose --profile qwen35-9b build llama-qwen35-9b

  ok "Images built: ${image_repo}:server-cuda-sm75-mmq"
  ok " ${image_repo}:full-cuda-sm75-mmq"
}
|
|
|
|
# bench <model>
#
# Runs the `bench-<model>` compose service. When a server is detected the
# compose stack is taken down first (after confirmation); declining prints
# "Aborted." and exits the script with status 0.
# Arguments: $1 - model key (must be a key of MODEL_FILE)
cmd_bench() {
  local model="${1:-}"
  [[ -n "$model" ]] || die "Model required. Usage: ./llama bench <model>"
  [[ -n "${MODEL_FILE[$model]+_}" ]] || die "Unknown model: $model. Valid: ${!MODEL_FILE[*]}"

  check_model_file "$model"

  # A non-zero status from running_model must not abort the script under
  # `set -e`; treat it as "no running server identified".
  local current
  current=$(running_model) || true

  if [[ -n "$current" ]]; then
    # The prompt states what actually happens next: the stack is stopped.
    warn "Server is running ($current). Bench would compete for GPU, so the server will be stopped first."
    confirm "Stop the server and run the benchmark?" || { echo "Aborted."; exit 0; }
    docker compose down --remove-orphans 2>/dev/null || true
  fi

  info "Running benchmark for $model..."
  docker compose --profile "bench-${model}" run --rm "bench-${model}"
  ok "Results written to benchmark-results/"
}
|
|
|
|
# ── Interactive menu ──────────────────────────────────────────────────────────
|
|
|
|
# Interactive launcher: shows the current status and the model list, reads a
# choice from stdin and dispatches to cmd_start / cmd_stop.
# Choices: 1-5 start the corresponding model (asking about bigctx when the
# model supports it, and about Open WebUI), s/S stops everything, q/Q exits
# with status 0; anything else prints a warning and shows the menu again.
menu() {
  local -a catalog=(smollm3 gemma4-e2b gemma4-e4b qwen3-4b qwen35-9b)
  local idx num key tag selection reply

  echo ""
  echo -e "${BOLD}╔══════════════════════════════════════════════════════╗${NC}"
  echo -e "${BOLD}║ llama.cpp model launcher ║${NC}"
  echo -e "${BOLD}╚══════════════════════════════════════════════════════╝${NC}"
  echo ""

  cmd_status

  echo -e "${BOLD}Select a model:${NC}"
  for idx in "${!catalog[@]}"; do
    key="${catalog[$idx]}"
    num=$((idx + 1))
    tag=""
    # Flag models whose file is not present in $MODELS_DIR.
    if [[ ! -f "$MODELS_DIR/${MODEL_FILE[$key]}" ]]; then
      tag=" ${R}[not downloaded]${NC}"
    fi
    echo -e " ${C}${num})${NC} ${MODEL_LABEL[$key]}${tag}"
  done
  echo ""
  echo -e " ${C}s)${NC} stop all"
  echo -e " ${C}q)${NC} quit"
  echo ""
  echo -n "Choice: "
  read -r selection

  case "$selection" in
    [1-5])
      local picked="${catalog[$((selection - 1))]}"
      # Arguments for cmd_start: the model first, then the optional flags.
      local -a start_args=("$picked")

      # Only offer bigctx for models that have such a profile.
      local offers_bigctx=0
      for key in "${BIGCTX_MODELS[@]}"; do
        if [[ "$key" == "$picked" ]]; then
          offers_bigctx=1
        fi
      done

      if [[ "$offers_bigctx" == "1" ]]; then
        echo -n "Use bigctx profile (larger context, slower)? [y/N] "
        read -r reply
        if [[ "${reply,,}" == "y" ]]; then
          start_args+=(--bigctx)
        fi
      fi

      echo -n "Include Open WebUI? [y/N] "
      read -r reply
      if [[ "${reply,,}" == "y" ]]; then
        start_args+=(--webui)
      fi

      echo ""
      cmd_start "${start_args[@]}"
      ;;
    s | S)
      cmd_stop
      ;;
    q | Q)
      exit 0
      ;;
    *)
      warn "Invalid choice."
      menu
      ;;
  esac
}
|
|
|
|
# ── Entrypoint ────────────────────────────────────────────────────────────────
|
|
|
|
# Print the command overview and the list of model keys to stdout.
usage() {
  printf '\n%b\n' "${BOLD}Usage:${NC}"
  printf '%s\n' \
    " ./llama interactive menu" \
    " ./llama start <model> [--bigctx] [--webui]" \
    " ./llama stop" \
    " ./llama status" \
    " ./llama logs [--follow]" \
    " ./llama build" \
    " ./llama bench <model>"
  printf '\n%b\n\n' "${BOLD}Models:${NC} smollm3 | gemma4-e2b | gemma4-e4b | qwen3-4b | qwen35-9b"
}
|
|
|
|
# Dispatch the first command-line argument to its handler; with no argument
# (or an empty one) the interactive menu is shown. The remaining arguments are
# forwarded only to the sub-commands that accept them (start, logs, bench).
main() {
  local command="${1:-}"
  if [[ $# -gt 0 ]]; then
    shift
  fi

  case "$command" in
    "") menu ;;
    start) cmd_start "$@" ;;
    stop) cmd_stop ;;
    status) cmd_status ;;
    logs) cmd_logs "$@" ;;
    build) cmd_build ;;
    bench) cmd_bench "$@" ;;
    help | --help | -h) usage ;;
    *) die "Unknown command: $command. Run ./llama help" ;;
  esac
}

main "$@"
|