llama+compose: fix bigctx startup timing

- compose: increase start_period for bigctx services
  - gemma4-e4b-bigctx: 60s -> 150s (5 GiB model + warmup + 163840 ctx takes ~90-120s)
  - gemma4-e2b-bigctx: 60s -> 120s (large ctx 393216 allocation)
  - smollm3-3b-bigctx / qwen3-4b-bigctx: 60s -> 90s
- llama: extend health-check polling from 30 x 2s = 60s to 75 x 2s = 150s (matches the longest start_period)
- llama: require 3 consecutive unhealthy polls before giving up (avoids
  false positives during the Docker start_period window)
This commit is contained in:
2026-05-06 19:03:31 +02:00
parent 0618078937
commit e7e389c0e1
2 changed files with 20 additions and 8 deletions

View File

@@ -183,7 +183,7 @@ services:
env_file: envs/.env.smollm3-3b-bigctx env_file: envs/.env.smollm3-3b-bigctx
healthcheck: healthcheck:
<<: *hc <<: *hc
start_period: 60s start_period: 90s
llama-gemma4-e2b-bigctx: llama-gemma4-e2b-bigctx:
image: local/llama-cpp-turboquant:server-cuda-sm75-mmq image: local/llama-cpp-turboquant:server-cuda-sm75-mmq
@@ -192,7 +192,7 @@ services:
env_file: envs/.env.gemma4-e2b-bigctx env_file: envs/.env.gemma4-e2b-bigctx
healthcheck: healthcheck:
<<: *hc <<: *hc
start_period: 60s start_period: 120s # large ctx (393216) allocation takes extra time
llama-gemma4-e4b-bigctx: llama-gemma4-e4b-bigctx:
image: local/llama-cpp-turboquant:server-cuda-sm75-mmq image: local/llama-cpp-turboquant:server-cuda-sm75-mmq
@@ -201,7 +201,7 @@ services:
env_file: envs/.env.gemma4-e4b-bigctx env_file: envs/.env.gemma4-e4b-bigctx
healthcheck: healthcheck:
<<: *hc <<: *hc
start_period: 60s start_period: 150s # 5 GiB model + warmup + 163840 ctx takes ~90-120s
llama-qwen3-4b-bigctx: llama-qwen3-4b-bigctx:
image: local/llama-cpp-turboquant:server-cuda-sm75-mmq image: local/llama-cpp-turboquant:server-cuda-sm75-mmq
@@ -210,7 +210,7 @@ services:
env_file: envs/.env.qwen3-4b-bigctx env_file: envs/.env.qwen3-4b-bigctx
healthcheck: healthcheck:
<<: *hc <<: *hc
start_period: 60s start_period: 90s
# ── OPEN WEBUI ───────────────────────────────────────────────────────────── # ── OPEN WEBUI ─────────────────────────────────────────────────────────────
# Separate profile — add to any running model: # Separate profile — add to any running model:

20
llama
View File

@@ -165,21 +165,33 @@ cmd_start() {
echo "" echo ""
info "Waiting for health check..." info "Waiting for health check..."
# bigctx / heavy models can take >2 min to load: poll up to 150s
local max_polls=75
local i=0 local i=0
while [[ $i -lt 30 ]]; do local consecutive_unhealthy=0
while [[ $i -lt $max_polls ]]; do
local status local status
status=$(docker inspect llama_server --format '{{.State.Health.Status}}' 2>/dev/null || echo "starting") status=$(docker inspect llama_server --format '{{.State.Health.Status}}' 2>/dev/null || echo "starting")
if [[ "$status" == "healthy" ]]; then if [[ "$status" == "healthy" ]]; then
echo ""
ok "Server is healthy → http://localhost:8080" ok "Server is healthy → http://localhost:8080"
[[ "$webui" == "1" ]] && ok "Open WebUI → http://localhost:3000" [[ "$webui" == "1" ]] && ok "Open WebUI → http://localhost:3000"
return 0 return 0
elif [[ "$status" == "unhealthy" ]]; then elif [[ "$status" == "unhealthy" ]]; then
warn "Server reported unhealthy. Check logs: ./llama logs" (( consecutive_unhealthy++ )) || true
return 1 # Only give up after 3 consecutive unhealthy — avoids false positives
# during start_period when Docker hasn't run healthchecks yet
if [[ $consecutive_unhealthy -ge 3 ]]; then
echo ""
warn "Server reported unhealthy. Check logs: ./llama logs"
return 1
fi
else
consecutive_unhealthy=0
fi fi
echo -n "." echo -n "."
sleep 2 sleep 2
(( i++ )) (( i++ )) || true
done done
echo "" echo ""
warn "Still starting (health check pending). Try: ./llama status" warn "Still starting (health check pending). Try: ./llama status"