Add llama launcher script
- ./llama (interactive menu) or ./llama <cmd> [args]
- start <model> [--bigctx] [--webui]: verify model file, warn before stopping running server, health-wait after start
- stop: stop all llama containers
- status: running model + health + env vars
- logs [--follow]: tail server logs
- build: build TurboQuant images
- bench <model>: run llama-bench via bench profile
This commit is contained in:
360
llama
Executable file
360
llama
Executable file
@@ -0,0 +1,360 @@
|
||||
#!/usr/bin/env bash
# llama — helper script for driving the llama.cpp Docker Compose stack.
#
# Usage:
#   ./llama                                      interactive menu
#   ./llama start <model> [--bigctx] [--webui]
#   ./llama stop
#   ./llama status
#   ./llama logs [--follow]
#   ./llama build
#   ./llama bench <model>
#
# Models: smollm3 | gemma4-e2b | gemma4-e4b | qwen3-4b | qwen35-9b

# Strict mode: stop on command failure, on use of an unset variable, and on a
# failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Work from the directory that contains this script.
cd "$(dirname "$0")"
|
||||
|
||||
# ── Colors ────────────────────────────────────────────────────────────────────
|
||||
# Colour escape sequences are only populated when stdout is a terminal;
# otherwise every colour variable stays empty so redirected output is plain.
G='' Y='' R='' B='' C='' BOLD='' NC=''
if [[ -t 1 ]]; then
  G='\033[0;32m'
  Y='\033[1;33m'
  R='\033[0;31m'
  B='\033[0;34m'
  C='\033[0;36m'
  BOLD='\033[1m'
  NC='\033[0m'
fi
|
||||
|
||||
# Directory that is checked for model files.
MODELS_DIR="./models"

# ── Model metadata ────────────────────────────────────────────────────────────
# Model key → GGUF file name expected inside MODELS_DIR.
declare -A MODEL_FILE
MODEL_FILE[smollm3]="HuggingFaceTB_SmolLM3-3B-Q4_K_M.gguf"
MODEL_FILE[gemma4-e2b]="google_gemma-4-E2B-it-Q4_K_M.gguf"
MODEL_FILE[gemma4-e4b]="google_gemma-4-E4B-it-Q4_K_M.gguf"
MODEL_FILE[qwen3-4b]="Qwen3-4B-Q4_K_M.gguf"
MODEL_FILE[qwen35-9b]="Qwen3.5-9B.Q8_0.gguf"

# Model key → human-readable description shown in the menu and on start.
declare -A MODEL_LABEL
MODEL_LABEL[smollm3]="SmolLM3-3B (~53 t/s, ctx 24K/65K bigctx, thinking+tools)"
MODEL_LABEL[gemma4-e2b]="Gemma4-E2B (~62 t/s, ctx 24K/393K bigctx, multimodal)"
MODEL_LABEL[gemma4-e4b]="Gemma4-E4B (~30 t/s, ctx 24K/164K bigctx, multimodal)"
MODEL_LABEL[qwen3-4b]="Qwen3-4B (~39 t/s, ctx 16K/24K bigctx, thinking+tools)"
MODEL_LABEL[qwen35-9b]="Qwen3.5-9B (~4.4 t/s, ctx 32K, reasoning distill)"

# Model keys for which --bigctx is accepted.
BIGCTX_MODELS=(smollm3 gemma4-e2b gemma4-e4b qwen3-4b)
|
||||
|
||||
# ── Helpers ───────────────────────────────────────────────────────────────────
|
||||
# Print an error message (red) on stderr and exit the script with status 1.
die() {
  printf '%b\n' "${R}Error: $*${NC}" >&2
  exit 1
}

# Print a progress message (blue) on stdout.
info() {
  printf '%b\n' "${B}▸ $*${NC}"
}

# Print a success message (green) on stdout.
ok() {
  printf '%b\n' "${G}✓ $*${NC}"
}

# Print a warning message (yellow) on stdout.
warn() {
  printf '%b\n' "${Y}⚠ $*${NC}"
}
|
||||
|
||||
# Ask a yes/no question on stdout and read one line from stdin.
# Arguments: $1 - question text (a " [y/N] " suffix is appended)
# Returns:   0 only when the reply is exactly "y" or "Y"; 1 otherwise
confirm() {
  local question="$1"
  local reply
  printf '%b' "${Y}${question} [y/N] ${NC}"
  read -r reply
  case "$reply" in
    y | Y) return 0 ;;
    *) return 1 ;;
  esac
}
|
||||
|
||||
# Print the model profile of the running llama_server container, if any.
#
# The value is taken from the COMPOSE_PROFILES environment variable recorded in
# the container's config: the first comma-separated entry that contains neither
# "bigctx" nor "webui".
#
# Outputs: the profile name on stdout, or nothing when no container is running
#          or no such entry exists
# Returns: always 0, so `x=$(running_model)` never aborts a caller running
#          under `set -e`
running_model() {
  local name
  name=$(docker ps --filter "name=llama_server" --format "{{.Names}}" 2>/dev/null | head -1) || true
  [[ -n "$name" ]] || return 0

  # With `pipefail`, a grep that matches nothing makes the whole pipeline fail;
  # "nothing found" is a normal outcome here, so the status is neutralised.
  docker inspect "$name" --format '{{range .Config.Env}}{{println .}}{{end}}' 2>/dev/null \
    | grep "^COMPOSE_PROFILES=" | cut -d= -f2- | tr ',' '\n' \
    | grep -v "bigctx\|webui" | head -1 \
    || true
}
|
||||
|
||||
# Print the raw COMPOSE_PROFILES value of the running llama_server container.
#
# Outputs: the comma-separated value on stdout, or nothing when no container is
#          running or the variable is absent from the container's environment
# Returns: always 0, so command substitutions of this function never abort a
#          caller running under `set -e`
running_profiles() {
  local name
  name=$(docker ps --filter "name=llama_server" --format "{{.Names}}" 2>/dev/null | head -1) || true
  [[ -n "$name" ]] || return 0

  # A non-matching grep is a normal outcome; do not let `pipefail` turn it
  # into a failing return status.
  docker inspect "$name" --format '{{range .Config.Env}}{{println .}}{{end}}' 2>/dev/null \
    | grep "^COMPOSE_PROFILES=" | cut -d= -f2- \
    || true
}
|
||||
|
||||
# Make sure the GGUF file for a model is present, offering to download it.
# Globals:   MODEL_FILE, MODELS_DIR (read)
# Arguments: $1 - model key
# Returns:   0 when the key has no file entry or the file exists; otherwise the
#            status of the download script. Exits via die when the user
#            declines the download.
check_model_file() {
  local model="$1"
  local gguf="${MODEL_FILE[$model]:-}"

  # Keys without a file entry are not checked.
  if [[ -z "$gguf" ]]; then
    return 0
  fi

  local path="$MODELS_DIR/$gguf"
  if [[ -f "$path" ]]; then
    return 0
  fi

  warn "Model file not found: $path"
  if ! confirm "Download it now?"; then
    die "Model file missing. Run: bash scripts/download_models.sh $model"
  fi
  bash scripts/download_models.sh "$model"
}
|
||||
|
||||
# If a server is already running, ask before taking the stack down.
# Does nothing when running_model reports no model. Exits the script with
# status 0 when the user declines.
stop_running() {
  local active
  active=$(running_model)
  [[ -n "$active" ]] || return 0

  warn "Currently running: $active"
  if ! confirm "Stop it and start new model?"; then
    echo "Aborted."
    exit 0
  fi

  info "Stopping running containers..."
  # Best effort: a failing `down` is ignored.
  docker compose down --remove-orphans 2>/dev/null || true
  ok "Stopped."
}
|
||||
|
||||
# Build the `--profile …` arguments for docker compose as one string.
# Arguments: $1 - model key
#            $2 - "1" to add the "<model>-bigctx" profile
#            $3 - "1" to add the "webui" profile
# Outputs:   space-separated "--profile NAME " entries (each followed by a
#            space) on stdout; callers word-split the result
build_profiles_flag() {
  local model="$1" bigctx="$2" webui="$3"
  local out="--profile $model "

  if [[ "$bigctx" == "1" ]]; then
    out+="--profile ${model}-bigctx "
  fi
  if [[ "$webui" == "1" ]]; then
    out+="--profile webui "
  fi

  echo "$out"
}
|
||||
|
||||
# ── Commands ──────────────────────────────────────────────────────────────────
|
||||
|
||||
# Start the server for one model and wait for its health check.
#
# Arguments: <model> [--bigctx] [--webui], in any order
# Globals:   MODEL_FILE, MODEL_LABEL, BIGCTX_MODELS (read)
# Returns:   0 once the container reports "healthy" or when polling gives up
#            after 30 attempts; 1 when the container reports "unhealthy".
#            Exits via die on invalid arguments.
cmd_start() {
  local model="" bigctx=0 webui=0

  # Parse args
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --bigctx) bigctx=1 ;;
      --webui) webui=1 ;;
      -*) die "Unknown flag: $1" ;;
      *)
        if [[ -n "$model" ]]; then
          die "Unexpected argument: $1"
        fi
        model="$1"
        ;;
    esac
    shift
  done

  [[ -n "$model" ]] || die "Model required. Usage: ./llama start <model> [--bigctx] [--webui]"
  [[ -n "${MODEL_FILE[$model]+_}" ]] || die "Unknown model: $model. Valid: ${!MODEL_FILE[*]}"

  if [[ "$bigctx" == "1" ]]; then
    local candidate has_bigctx=0
    for candidate in "${BIGCTX_MODELS[@]}"; do
      if [[ "$candidate" == "$model" ]]; then
        has_bigctx=1
      fi
    done
    [[ "$has_bigctx" == "1" ]] || die "$model does not have a bigctx profile"
  fi

  check_model_file "$model"
  stop_running

  local profile_flags
  profile_flags=$(build_profiles_flag "$model" "$bigctx" "$webui")

  local desc="${MODEL_LABEL[$model]}"
  if [[ "$bigctx" == "1" ]]; then desc+=" [bigctx]"; fi
  if [[ "$webui" == "1" ]]; then desc+=" [+webui]"; fi
  info "Starting: $desc"

  # profile_flags is a space-separated "--profile X" list and must be
  # word-split into separate arguments.
  # shellcheck disable=SC2086
  docker compose $profile_flags up -d

  echo ""
  info "Waiting for health check..."
  local attempt=0 status
  while [[ $attempt -lt 30 ]]; do
    # A failing inspect is treated as "still starting".
    status=$(docker inspect llama_server --format '{{.State.Health.Status}}' 2>/dev/null || echo "starting")
    if [[ "$status" == "healthy" ]]; then
      ok "Server is healthy → http://localhost:8080"
      if [[ "$webui" == "1" ]]; then
        ok "Open WebUI → http://localhost:3000"
      fi
      return 0
    elif [[ "$status" == "unhealthy" ]]; then
      warn "Server reported unhealthy. Check logs: ./llama logs"
      return 1
    fi
    echo -n "."
    sleep 2
    # Plain assignment on purpose: `(( attempt++ ))` yields exit status 1 when
    # the old value is 0, which terminates the script under `set -e`.
    attempt=$((attempt + 1))
  done
  echo ""
  warn "Still starting (health check pending). Try: ./llama status"
}
|
||||
|
||||
# Take the compose stack down if running_model reports a running model;
# otherwise just say that nothing is running.
cmd_stop() {
  local active
  active=$(running_model)

  if [[ -n "$active" ]]; then
    info "Stopping $active..."
    docker compose down --remove-orphans
    ok "Stopped."
  else
    info "No llama containers running."
  fi
}
|
||||
|
||||
# Print a status summary: server container name, health, uptime, selected
# environment variables (MODEL_FILE, CTX_SIZE, N_GPU_LAYERS) and whether an
# open_webui container is running.
#
# Every lookup is best-effort: a failing docker call or a missing value
# degrades the output ("No server running", "unknown", "?") instead of
# aborting the script under `set -e` / `pipefail`.
cmd_status() {
  echo ""
  echo -e "${BOLD}── llama.cpp server status ──────────────────────────────${NC}"

  local name
  name=$(docker ps --filter "name=llama_server" --format "{{.Names}}" 2>/dev/null | head -1) || true

  if [[ -z "$name" ]]; then
    echo -e " ${R}●${NC} No server running"
  else
    local health uptime model_env line
    health=$(docker inspect "$name" --format '{{.State.Health.Status}}' 2>/dev/null || echo "unknown")

    # The fractional-second part of StartedAt is dropped before parsing:
    # only whole minutes are shown, and strptime then accepts the timestamp
    # regardless of how many fraction digits it carries.
    uptime=$(docker inspect "$name" --format '{{.State.StartedAt}}' 2>/dev/null \
      | python3 -c "import sys, datetime; \
          stamp = sys.stdin.read().strip().split('.')[0].rstrip('Z'); \
          start = datetime.datetime.strptime(stamp, '%Y-%m-%dT%H:%M:%S'); \
          now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None); \
          h, r = divmod(int((now - start).total_seconds()), 3600); \
          m = r // 60; \
          print(f'{h}h {m}m' if h else f'{m}m')" 2>/dev/null || echo "?")

    # grep exits non-zero when none of the variables is set; that is not an
    # error here.
    model_env=$(docker inspect "$name" --format '{{range .Config.Env}}{{println .}}{{end}}' 2>/dev/null \
      | grep "^MODEL_FILE=\|^CTX_SIZE=\|^N_GPU_LAYERS=" | sort) || true

    local color="${G}"
    if [[ "$health" == "unhealthy" ]]; then color="${R}"; fi
    if [[ "$health" == "starting" ]]; then color="${Y}"; fi

    echo -e " ${color}●${NC} ${BOLD}${name}${NC} — ${health} (up ${uptime})"
    # Skip the loop when there is nothing to show; a here-string of an empty
    # value would still produce one blank line.
    if [[ -n "$model_env" ]]; then
      while IFS= read -r line; do
        echo -e " ${C}${line}${NC}"
      done <<< "$model_env"
    fi
    echo -e " ${C}API → http://localhost:8080${NC}"
  fi

  # WebUI
  local wname
  wname=$(docker ps --filter "name=open_webui" --format "{{.Names}}" 2>/dev/null | head -1) || true
  if [[ -n "$wname" ]]; then
    echo -e " ${G}●${NC} open_webui running → http://localhost:3000"
  fi

  echo ""
}
|
||||
|
||||
# Show logs of the running llama_server container.
# Arguments: $1 - optional "--follow" or "-f" to stream (starting from the last
#                 50 lines); anything else prints the last 80 lines and returns
# Exits via die when no server container is running.
cmd_logs() {
  local mode="${1:-}"
  local container
  container=$(docker ps --filter "name=llama_server" --format "{{.Names}}" 2>/dev/null | head -1)

  if [[ -z "$container" ]]; then
    die "No server running."
  fi

  case "$mode" in
    --follow | -f) docker logs -f --tail 50 "$container" ;;
    *) docker logs --tail 80 "$container" ;;
  esac
}
|
||||
|
||||
# Build the images by running `docker compose build` for the llama-qwen35-9b
# service, then print the two image tags.
cmd_build() {
  local repo="local/llama-cpp-turboquant"

  info "Building TurboQuant image (full + server targets)..."
  info "This takes ~20 minutes on first build."

  docker compose --profile qwen35-9b build llama-qwen35-9b

  ok "Images built: ${repo}:server-cuda-sm75-mmq"
  ok " ${repo}:full-cuda-sm75-mmq"
}
|
||||
|
||||
# Run the benchmark profile for one model.
#
# Arguments: $1 - model key
# Globals:   MODEL_FILE (read)
# If a server is running, the user is asked for permission to stop it first;
# declining exits the script with status 0. Exits via die on a missing or
# unknown model.
cmd_bench() {
  local model="${1:-}"
  [[ -n "$model" ]] || die "Model required. Usage: ./llama bench <model>"
  [[ -n "${MODEL_FILE[$model]+_}" ]] || die "Unknown model: $model. Valid: ${!MODEL_FILE[*]}"

  check_model_file "$model"

  local current
  current=$(running_model)
  if [[ -n "$current" ]]; then
    # The stack is taken down below, so the prompt says so explicitly rather
    # than implying the server keeps running next to the benchmark.
    warn "Server is running ($current). It will be stopped so the benchmark does not compete for GPU."
    confirm "Stop it and run the benchmark?" || { echo "Aborted."; exit 0; }
    info "Stopping running containers..."
    # Best effort: a failing `down` is ignored.
    docker compose down --remove-orphans 2>/dev/null || true
  fi

  info "Running benchmark for $model..."
  docker compose --profile "bench-${model}" run --rm "bench-${model}"
  ok "Results written to benchmark-results/"
}
|
||||
|
||||
# ── Interactive menu ──────────────────────────────────────────────────────────
|
||||
|
||||
# Interactive launcher: show the current status, list the models, read a
# choice from stdin and dispatch to cmd_start / cmd_stop.
#
# Globals: MODELS_DIR, MODEL_FILE, MODEL_LABEL, BIGCTX_MODELS (read)
# An invalid choice prints a warning and shows the menu again.
menu() {
  # Explicit list: fixes the display order of the model keys.
  local models=(smollm3 gemma4-e2b gemma4-e4b qwen3-4b qwen35-9b)
  local idx m missing choice ans candidate

  echo ""
  echo -e "${BOLD}╔══════════════════════════════════════════════════════╗${NC}"
  echo -e "${BOLD}║ llama.cpp model launcher ║${NC}"
  echo -e "${BOLD}╚══════════════════════════════════════════════════════╝${NC}"
  echo ""

  cmd_status

  echo -e "${BOLD}Select a model:${NC}"
  for idx in "${!models[@]}"; do
    m="${models[idx]}"
    missing=""
    if [[ ! -f "$MODELS_DIR/${MODEL_FILE[$m]}" ]]; then
      missing=" ${R}[not downloaded]${NC}"
    fi
    echo -e " ${C}$((idx + 1)))${NC} ${MODEL_LABEL[$m]}${missing}"
  done
  echo ""
  echo -e " ${C}s)${NC} stop all"
  echo -e " ${C}q)${NC} quit"
  echo ""
  echo -n "Choice: "
  read -r choice

  case "$choice" in
    s | S)
      cmd_stop
      return
      ;;
    q | Q) exit 0 ;;
  esac

  # Accept 1..N where N is the number of listed models. The pattern rejects
  # leading zeros (no octal surprises) and caps the length so the arithmetic
  # comparison cannot overflow.
  local model=""
  if [[ "$choice" =~ ^[1-9][0-9]{0,2}$ ]] && ((choice <= ${#models[@]})); then
    model="${models[choice - 1]}"
  fi
  if [[ -z "$model" ]]; then
    warn "Invalid choice."
    menu
    return
  fi

  # Arguments for cmd_start are collected in an array so nothing relies on
  # word-splitting.
  local start_args=("$model")

  # bigctx option
  local has_bigctx=0
  for candidate in "${BIGCTX_MODELS[@]}"; do
    if [[ "$candidate" == "$model" ]]; then
      has_bigctx=1
    fi
  done
  if [[ "$has_bigctx" == "1" ]]; then
    echo -n "Use bigctx profile (larger context, slower)? [y/N] "
    read -r ans
    if [[ "${ans,,}" == "y" ]]; then
      start_args+=(--bigctx)
    fi
  fi

  echo -n "Include Open WebUI? [y/N] "
  read -r ans
  if [[ "${ans,,}" == "y" ]]; then
    start_args+=(--webui)
  fi

  echo ""
  cmd_start "${start_args[@]}"
}
|
||||
|
||||
# ── Entrypoint ────────────────────────────────────────────────────────────────
|
||||
|
||||
# Print the command summary and the list of model keys on stdout.
usage() {
  printf '\n%b\n' "${BOLD}Usage:${NC}"
  printf '%s\n' \
    " ./llama interactive menu" \
    " ./llama start <model> [--bigctx] [--webui]" \
    " ./llama stop" \
    " ./llama status" \
    " ./llama logs [--follow]" \
    " ./llama build" \
    " ./llama bench <model>" \
    ""
  printf '%b\n\n' "${BOLD}Models:${NC} smollm3 | gemma4-e2b | gemma4-e4b | qwen3-4b | qwen35-9b"
}
|
||||
|
||||
# Dispatch on the first command-line argument; with no argument (or an empty
# one) the interactive menu is shown. Remaining arguments are forwarded to
# the start, logs and bench commands.
main() {
  local command="${1:-}"
  if [[ $# -gt 0 ]]; then
    shift
  fi

  case "$command" in
    "") menu ;;
    start) cmd_start "$@" ;;
    stop) cmd_stop ;;
    status) cmd_status ;;
    logs) cmd_logs "$@" ;;
    build) cmd_build ;;
    bench) cmd_bench "$@" ;;
    help | --help | -h) usage ;;
    *) die "Unknown command: $command. Run ./llama help" ;;
  esac
}

main "$@"
|
||||
Reference in New Issue
Block a user