#!/usr/bin/env bash
# llama — utility script for the llama.cpp Docker Compose stack
#
# Usage:
#   ./llama                                      interactive menu
#   ./llama start <model> [--bigctx] [--webui]
#   ./llama stop
#   ./llama status
#   ./llama logs [--follow]
#   ./llama build
#   ./llama bench <model>
#
# Models: smollm3 | gemma4-e2b | gemma4-e4b | qwen3-4b | qwen35-9b

set -euo pipefail
# All relative paths below (models dir, scripts/, compose file) are resolved
# from the directory that contains this script.
cd -- "$(dirname -- "$0")"

# ── Colors ────────────────────────────────────────────────────────────────────
# Only emit escape sequences when stdout is a terminal.
if [[ -t 1 ]]; then
  G='\033[0;32m'; Y='\033[1;33m'; R='\033[0;31m'; B='\033[0;34m'
  C='\033[0;36m'; BOLD='\033[1m'; NC='\033[0m'
else
  G=''; Y=''; R=''; B=''; C=''; BOLD=''; NC=''
fi

MODELS_DIR="./models"

# ── Model metadata ────────────────────────────────────────────────────────────
# Profile name -> GGUF file name expected inside $MODELS_DIR.
declare -A MODEL_FILE=(
  [smollm3]="HuggingFaceTB_SmolLM3-3B-Q4_K_M.gguf"
  [gemma4-e2b]="google_gemma-4-E2B-it-Q4_K_M.gguf"
  [gemma4-e4b]="google_gemma-4-E4B-it-Q4_K_M.gguf"
  [qwen3-4b]="Qwen3-4B-Q4_K_M.gguf"
  [qwen35-9b]="Qwen3.5-9B.Q8_0.gguf"
)

# Profile name -> human readable label shown in the menu and on start.
declare -A MODEL_LABEL=(
  [smollm3]="SmolLM3-3B (~53 t/s, ctx 24K/65K bigctx, thinking+tools)"
  [gemma4-e2b]="Gemma4-E2B (~62 t/s, ctx 24K/393K bigctx, multimodal)"
  [gemma4-e4b]="Gemma4-E4B (~30 t/s, ctx 24K/164K bigctx, multimodal)"
  [qwen3-4b]="Qwen3-4B (~39 t/s, ctx 16K/24K bigctx, thinking+tools)"
  [qwen35-9b]="Qwen3.5-9B (~4.4 t/s, ctx 32K, reasoning distill)"
)

# Models that support bigctx
BIGCTX_MODELS=(smollm3 gemma4-e2b gemma4-e4b qwen3-4b)

# ── Helpers ───────────────────────────────────────────────────────────────────
die()  { echo -e "${R}Error: $*${NC}" >&2; exit 1; }
info() { echo -e "${B}▸ $*${NC}"; }
ok()   { echo -e "${G}✓ $*${NC}"; }
warn() { echo -e "${Y}⚠ $*${NC}"; }

# Ask a yes/no question. Returns 0 only when the answer is "y"/"Y";
# anything else (including EOF) counts as "no".
#   $1 - prompt text (" [y/N] " is appended)
confirm() {
  local prompt="$1"
  local answer=""
  echo -en "${Y}${prompt} [y/N] ${NC}"
  read -r answer || answer=""
  [[ "${answer,,}" == "y" ]]
}

# Returns 0 when the model given as $1 is listed in BIGCTX_MODELS.
supports_bigctx() {
  local m
  for m in "${BIGCTX_MODELS[@]}"; do
    if [[ "$m" == "$1" ]]; then
      return 0
    fi
  done
  return 1
}

# Print the name of the first running container whose name matches $1.
# Prints nothing (and still returns 0) when there is no match or when docker
# itself fails, so callers can use it in plain assignments under `set -e`.
container_by_name() {
  local names
  names=$(docker ps --filter "name=$1" --format "{{.Names}}" 2>/dev/null) || return 0
  if [[ -n "$names" ]]; then
    printf '%s\n' "${names%%$'\n'*}"
  fi
}

# Name of the running llama_server container, or empty.
server_container() {
  container_by_name "llama_server"
}

# Print the environment of container $1, one VAR=value per line.
# Best effort: an inspect failure yields empty output rather than an error.
container_env() {
  docker inspect "$1" --format '{{range .Config.Env}}{{println .}}{{end}}' 2>/dev/null || true
}

# Print the raw COMPOSE_PROFILES value (comma separated) found in the
# environment of the running llama_server container, or nothing.
running_profiles() {
  local name
  name=$(server_container)
  [[ -n "$name" ]] || return 0
  # sed exits 0 even without a match, unlike grep, which would abort the
  # script under `set -e -o pipefail`.
  container_env "$name" | sed -n 's/^COMPOSE_PROFILES=//p'
}

# Returns the profile name of the currently running llama_server, or empty.
# Profiles containing "bigctx" or "webui" are modifiers and are skipped.
running_model() {
  local profiles p
  local -a list=()
  profiles=$(running_profiles)
  [[ -n "$profiles" ]] || return 0
  IFS=',' read -r -a list <<< "$profiles"
  for p in "${list[@]}"; do
    case "$p" in
      "" | *bigctx* | *webui*) continue ;;
    esac
    printf '%s\n' "$p"
    return 0
  done
  return 0
}

# Make sure the GGUF file for model $1 exists, offering to download it.
# Exits the script when the file is missing and the user declines.
check_model_file() {
  local model="$1"
  local file="${MODEL_FILE[$model]:-}"
  [[ -n "$file" ]] || return 0
  if [[ -f "$MODELS_DIR/$file" ]]; then
    return 0
  fi
  warn "Model file not found: $MODELS_DIR/$file"
  if confirm "Download it now?"; then
    bash scripts/download_models.sh "$model" || die "Download failed for $model"
  else
    die "Model file missing. Run: bash scripts/download_models.sh $model"
  fi
}

# If a server is running, ask for confirmation and take the stack down.
# Exits 0 when the user declines.
stop_running() {
  local name current
  name=$(server_container)
  [[ -n "$name" ]] || return 0
  # The profile is only used for display; fall back to the container name
  # when it cannot be determined.
  current=$(running_model)
  warn "Currently running: ${current:-$name}"
  confirm "Stop it and start new model?" || { echo "Aborted."; exit 0; }
  info "Stopping running containers..."
  docker compose down --remove-orphans 2>/dev/null || true
  ok "Stopped."
}

# Print the `--profile X` flags for docker compose as one line of
# space separated words.
#   $1 - model profile, $2 - "1" to add <model>-bigctx, $3 - "1" to add webui
build_profiles_flag() {
  local model="$1" bigctx="$2" webui="$3"
  local profiles=("$model")
  if [[ "$bigctx" == "1" ]]; then
    profiles+=("${model}-bigctx")
  fi
  if [[ "$webui" == "1" ]]; then
    profiles+=("webui")
  fi
  # docker compose --profile a --profile b
  local flags="" p
  for p in "${profiles[@]}"; do
    flags+="--profile $p "
  done
  echo "$flags"
}

# ── Commands ──────────────────────────────────────────────────────────────────

# start <model> [--bigctx] [--webui]
# Starts the selected profile(s) and polls the container health for up to
# 30 attempts, 2 seconds apart. Returns 1 when the server reports unhealthy.
cmd_start() {
  local model="" bigctx=0 webui=0

  # Parse args
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --bigctx) bigctx=1 ;;
      --webui) webui=1 ;;
      -*) die "Unknown flag: $1" ;;
      *)
        if [[ -z "$model" ]]; then
          model="$1"
        else
          die "Unexpected argument: $1"
        fi
        ;;
    esac
    shift
  done

  [[ -n "$model" ]] || die "Model required. Usage: ./llama start <model> [--bigctx] [--webui]"
  [[ -n "${MODEL_FILE[$model]+_}" ]] || die "Unknown model: $model. Valid: ${!MODEL_FILE[*]}"

  if [[ "$bigctx" == "1" ]] && ! supports_bigctx "$model"; then
    die "$model does not have a bigctx profile"
  fi

  check_model_file "$model"
  stop_running

  # Split the flag string into an array so it is passed without relying on
  # unquoted expansion.
  local -a profile_flags=()
  read -r -a profile_flags <<< "$(build_profiles_flag "$model" "$bigctx" "$webui")"

  local desc="${MODEL_LABEL[$model]}"
  if [[ "$bigctx" == "1" ]]; then desc+=" [bigctx]"; fi
  if [[ "$webui" == "1" ]]; then desc+=" [+webui]"; fi
  info "Starting: $desc"

  docker compose "${profile_flags[@]}" up -d

  echo ""
  info "Waiting for health check..."
  local attempt=0 status
  while [[ $attempt -lt 30 ]]; do
    status=$(docker inspect llama_server --format '{{.State.Health.Status}}' 2>/dev/null || echo "starting")
    if [[ "$status" == "healthy" ]]; then
      ok "Server is healthy → http://localhost:8080"
      if [[ "$webui" == "1" ]]; then
        ok "Open WebUI → http://localhost:3000"
      fi
      return 0
    elif [[ "$status" == "unhealthy" ]]; then
      warn "Server reported unhealthy. Check logs: ./llama logs"
      return 1
    fi
    echo -n "."
    sleep 2
    # Not `(( attempt++ ))`: that returns status 1 when the old value is 0,
    # which would abort the script under `set -e` on the first iteration.
    attempt=$(( attempt + 1 ))
  done
  echo ""
  warn "Still starting (health check pending). Try: ./llama status"
}

# stop — take the compose stack down if a llama_server container is running.
cmd_stop() {
  local name current
  name=$(server_container)
  if [[ -z "$name" ]]; then
    info "No llama containers running."
    return 0
  fi
  current=$(running_model)
  info "Stopping ${current:-$name}..."
  docker compose down --remove-orphans
  ok "Stopped."
}

# status — show health, uptime and selected env settings of the server,
# plus whether an open_webui container is running.
cmd_status() {
  echo ""
  echo -e "${BOLD}── llama.cpp server status ──────────────────────────────${NC}"

  local name
  name=$(server_container)

  if [[ -z "$name" ]]; then
    echo -e " ${R}●${NC} No server running"
  else
    local health uptime model_env
    health=$(docker inspect "$name" --format '{{.State.Health.Status}}' 2>/dev/null || echo "unknown")
    uptime=$(docker inspect "$name" --format '{{.State.StartedAt}}' 2>/dev/null \
      | python3 -c '
import sys, datetime
# Docker prints nanosecond precision; keep whole seconds only so that
# datetime.fromisoformat() accepts the value on every Python 3 version.
stamp = sys.stdin.read().strip().rstrip("Z").split(".")[0]
start = datetime.datetime.fromisoformat(stamp)
now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
hours, rest = divmod(int((now - start).total_seconds()), 3600)
minutes = rest // 60
print(f"{hours}h {minutes}m" if hours else f"{minutes}m")
' 2>/dev/null) || uptime="?"
    [[ -n "$uptime" ]] || uptime="?"
    # `|| true`: grep exits 1 when none of the variables is set, which must
    # not abort the status display.
    model_env=$(container_env "$name" \
      | grep "^MODEL_FILE=\|^CTX_SIZE=\|^N_GPU_LAYERS=" | sort) || true

    local color="${G}"
    if [[ "$health" == "unhealthy" ]]; then color="${R}"; fi
    if [[ "$health" == "starting" ]]; then color="${Y}"; fi

    echo -e " ${color}●${NC} ${BOLD}${name}${NC} — ${health} (up ${uptime})"
    if [[ -n "$model_env" ]]; then
      local line
      while IFS= read -r line; do
        echo -e " ${C}${line}${NC}"
      done <<< "$model_env"
    fi
    echo -e " ${C}API → http://localhost:8080${NC}"
  fi

  # WebUI
  local wname
  wname=$(container_by_name "open_webui")
  if [[ -n "$wname" ]]; then
    echo -e " ${G}●${NC} open_webui running → http://localhost:3000"
  fi

  echo ""
}

# logs [--follow|-f] — print recent server logs, optionally following them.
cmd_logs() {
  local follow=0
  if [[ "${1:-}" == "--follow" || "${1:-}" == "-f" ]]; then
    follow=1
  fi

  local name
  name=$(server_container)
  [[ -n "$name" ]] || die "No server running."

  if [[ "$follow" == "1" ]]; then
    docker logs -f --tail 50 "$name"
  else
    docker logs --tail 80 "$name"
  fi
}

# build — build the images through the qwen35-9b service definition.
cmd_build() {
  info "Building TurboQuant image (full + server targets)..."
  info "This takes ~20 minutes on first build."
  docker compose --profile qwen35-9b build llama-qwen35-9b
  ok "Images built: local/llama-cpp-turboquant:server-cuda-sm75-mmq"
  ok " local/llama-cpp-turboquant:full-cuda-sm75-mmq"
}

# bench <model> — run the bench-<model> compose service once.
cmd_bench() {
  local model="${1:-}"
  [[ -n "$model" ]] || die "Model required. Usage: ./llama bench <model>"
  [[ -n "${MODEL_FILE[$model]+_}" ]] || die "Unknown model: $model"

  check_model_file "$model"

  local name current
  name=$(server_container)
  if [[ -n "$name" ]]; then
    current=$(running_model)
    # NOTE(review): the warning talks about competing for the GPU, but after
    # confirmation the running stack is stopped — confirm which is intended.
    warn "Server is running (${current:-$name}). Bench will compete for GPU."
    confirm "Continue anyway?" || { echo "Aborted."; exit 0; }
    docker compose down --remove-orphans 2>/dev/null || true
  fi

  info "Running benchmark for $model..."
  docker compose --profile "bench-${model}" run --rm "bench-${model}"
  ok "Results written to benchmark-results/"
}

# ── Interactive menu ──────────────────────────────────────────────────────────

# Show the current status and a numbered model list, then start the chosen
# model (asking about bigctx / WebUI), stop everything, or quit.
menu() {
  echo ""
  echo -e "${BOLD}╔══════════════════════════════════════════════════════╗${NC}"
  echo -e "${BOLD}║ llama.cpp model launcher ║${NC}"
  echo -e "${BOLD}╚══════════════════════════════════════════════════════╝${NC}"
  echo ""

  cmd_status

  echo -e "${BOLD}Select a model:${NC}"
  local models=(smollm3 gemma4-e2b gemma4-e4b qwen3-4b qwen35-9b)
  local i=1 m missing
  for m in "${models[@]}"; do
    missing=""
    if [[ ! -f "$MODELS_DIR/${MODEL_FILE[$m]}" ]]; then
      missing=" ${R}[not downloaded]${NC}"
    fi
    echo -e " ${C}${i})${NC} ${MODEL_LABEL[$m]}${missing}"
    i=$(( i + 1 ))
  done
  echo ""
  echo -e " ${C}s)${NC} stop all"
  echo -e " ${C}q)${NC} quit"
  echo ""
  echo -n "Choice: "
  local choice=""
  # EOF (e.g. Ctrl-D) is treated like "quit" instead of tripping `set -e`.
  read -r choice || { echo ""; return 0; }

  # Numeric choice: validated against the actual number of listed models.
  if [[ "$choice" =~ ^[1-9][0-9]*$ ]] && (( choice <= ${#models[@]} )); then
    local model="${models[choice - 1]}"
    local -a start_args=("$model")

    # bigctx option
    if supports_bigctx "$model" \
      && confirm "Use bigctx profile (larger context, slower)?"; then
      start_args+=(--bigctx)
    fi

    if confirm "Include Open WebUI?"; then
      start_args+=(--webui)
    fi

    echo ""
    cmd_start "${start_args[@]}"
    return
  fi

  case "$choice" in
    s|S) cmd_stop ;;
    q|Q) exit 0 ;;
    *) warn "Invalid choice."; menu ;;
  esac
}

# ── Entrypoint ────────────────────────────────────────────────────────────────

usage() {
  echo ""
  echo -e "${BOLD}Usage:${NC}"
  echo "  ./llama                                      interactive menu"
  echo "  ./llama start <model> [--bigctx] [--webui]"
  echo "  ./llama stop"
  echo "  ./llama status"
  echo "  ./llama logs [--follow]"
  echo "  ./llama build"
  echo "  ./llama bench <model>"
  echo ""
  echo -e "${BOLD}Models:${NC} smollm3 | gemma4-e2b | gemma4-e4b | qwen3-4b | qwen35-9b"
  echo ""
}

# Dispatch on the first command line argument; no argument opens the menu.
main() {
  case "${1:-}" in
    "") menu ;;
    start) shift; cmd_start "$@" ;;
    stop) cmd_stop ;;
    status) cmd_status ;;
    logs) shift; cmd_logs "$@" ;;
    build) cmd_build ;;
    bench) shift; cmd_bench "$@" ;;
    help|--help|-h) usage ;;
    *) die "Unknown command: $1. Run ./llama help" ;;
  esac
}

main "$@"