From e7e389c0e187fc269ca4c743ae561eb751b77a05 Mon Sep 17 00:00:00 2001 From: Giancarmine Salucci Date: Wed, 6 May 2026 19:03:31 +0200 Subject: [PATCH] llama+compose: fix bigctx startup timing - compose: increase start_period for bigctx services - gemma4-e4b-bigctx: 60s -> 150s (5 GiB model + warmup + 163840 ctx takes ~90-120s) - gemma4-e2b-bigctx: 60s -> 120s (large ctx 393216 allocation) - smollm3/qwen3-4b bigctx: 60s -> 90s - llama: extend health poll from 30x2s=60s to 75x2s=150s - llama: require 3 consecutive unhealthy before giving up (avoids false positives during Docker start_period window) --- compose.yaml | 8 ++++---- llama | 20 ++++++++++++++++---- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/compose.yaml b/compose.yaml index 8e53afe..8178f88 100644 --- a/compose.yaml +++ b/compose.yaml @@ -183,7 +183,7 @@ services: env_file: envs/.env.smollm3-3b-bigctx healthcheck: <<: *hc - start_period: 60s + start_period: 90s llama-gemma4-e2b-bigctx: image: local/llama-cpp-turboquant:server-cuda-sm75-mmq @@ -192,7 +192,7 @@ services: env_file: envs/.env.gemma4-e2b-bigctx healthcheck: <<: *hc - start_period: 60s + start_period: 120s # large ctx (393216) allocation takes extra time llama-gemma4-e4b-bigctx: image: local/llama-cpp-turboquant:server-cuda-sm75-mmq @@ -201,7 +201,7 @@ services: env_file: envs/.env.gemma4-e4b-bigctx healthcheck: <<: *hc - start_period: 60s + start_period: 150s # 5 GiB model + warmup + 163840 ctx takes ~90-120s llama-qwen3-4b-bigctx: image: local/llama-cpp-turboquant:server-cuda-sm75-mmq @@ -210,7 +210,7 @@ services: env_file: envs/.env.qwen3-4b-bigctx healthcheck: <<: *hc - start_period: 60s + start_period: 90s # ── OPEN WEBUI ───────────────────────────────────────────────────────────── # Separate profile — add to any running model: diff --git a/llama b/llama index f1764c8..e4075ca 100755 --- a/llama +++ b/llama @@ -165,21 +165,33 @@ cmd_start() { echo "" info "Waiting for health check..." 
+ # bigctx / heavy models can take >2 min to load: poll up to 150s + local max_polls=75 local i=0 - while [[ $i -lt 30 ]]; do + local consecutive_unhealthy=0 + while [[ $i -lt $max_polls ]]; do local status status=$(docker inspect llama_server --format '{{.State.Health.Status}}' 2>/dev/null || echo "starting") if [[ "$status" == "healthy" ]]; then + echo "" ok "Server is healthy → http://localhost:8080" [[ "$webui" == "1" ]] && ok "Open WebUI → http://localhost:3000" return 0 elif [[ "$status" == "unhealthy" ]]; then - warn "Server reported unhealthy. Check logs: ./llama logs" - return 1 + (( consecutive_unhealthy++ )) || true + # Only give up after 3 consecutive unhealthy polls — tolerates a brief unhealthy + # report (Docker normally reports "starting", not "unhealthy", during start_period) + if [[ $consecutive_unhealthy -ge 3 ]]; then + echo "" + warn "Server reported unhealthy. Check logs: ./llama logs" + return 1 + fi + else + consecutive_unhealthy=0 fi echo -n "." sleep 2 - (( i++ )) + (( i++ )) || true done echo "" warn "Still starting (health check pending). Try: ./llama status"