From e7e389c0e187fc269ca4c743ae561eb751b77a05 Mon Sep 17 00:00:00 2001
From: Giancarmine Salucci <mozempk@gmail.com>
Date: Wed, 6 May 2026 19:03:31 +0200
Subject: [PATCH] llama+compose: fix bigctx startup timing

- compose: increase start_period for bigctx services
  - gemma4-e4b-bigctx: 60s -> 150s (5 GiB model + warmup + 163840 ctx takes ~90-120s)
  - gemma4-e2b-bigctx: 60s -> 120s (large ctx 393216 allocation)
  - smollm3/qwen3-4b bigctx: 60s -> 90s
- llama: extend health poll from 30x2s=60s to 75x2s=150s
- llama: require 3 consecutive unhealthy polls before giving up (avoids
  false positives from a transient unhealthy status during startup)
---
 compose.yaml |  8 ++++----
 llama        | 20 ++++++++++++++++----
 2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/compose.yaml b/compose.yaml
index 8e53afe..8178f88 100644
--- a/compose.yaml
+++ b/compose.yaml
@@ -183,7 +183,7 @@ services:
     env_file: envs/.env.smollm3-3b-bigctx
     healthcheck:
       <<: *hc
-      start_period: 60s
+      start_period: 90s
 
   llama-gemma4-e2b-bigctx:
     image: local/llama-cpp-turboquant:server-cuda-sm75-mmq
@@ -192,7 +192,7 @@ services:
     env_file: envs/.env.gemma4-e2b-bigctx
     healthcheck:
       <<: *hc
-      start_period: 60s
+      start_period: 120s  # large ctx (393216) allocation takes extra time
 
   llama-gemma4-e4b-bigctx:
     image: local/llama-cpp-turboquant:server-cuda-sm75-mmq
@@ -201,7 +201,7 @@ services:
     env_file: envs/.env.gemma4-e4b-bigctx
     healthcheck:
       <<: *hc
-      start_period: 60s
+      start_period: 150s  # 5 GiB model + warmup + 163840 ctx takes ~90-120s
 
   llama-qwen3-4b-bigctx:
     image: local/llama-cpp-turboquant:server-cuda-sm75-mmq
@@ -210,7 +210,7 @@ services:
     env_file: envs/.env.qwen3-4b-bigctx
     healthcheck:
       <<: *hc
-      start_period: 60s
+      start_period: 90s
 
   # ── OPEN WEBUI ─────────────────────────────────────────────────────────────
   # Separate profile — add to any running model:
diff --git a/llama b/llama
index f1764c8..e4075ca 100755
--- a/llama
+++ b/llama
@@ -165,21 +165,33 @@ cmd_start() {
 
     echo ""
     info "Waiting for health check..."
+    # bigctx / heavy models can take >2 min to load: poll up to 150s
+    local max_polls=75
     local i=0
-    while [[ $i -lt 30 ]]; do
+    local consecutive_unhealthy=0
+    while [[ $i -lt $max_polls ]]; do
         local status
         status=$(docker inspect llama_server --format '{{.State.Health.Status}}' 2>/dev/null || echo "starting")
         if [[ "$status" == "healthy" ]]; then
+            echo ""
             ok "Server is healthy → http://localhost:8080"
             [[ "$webui" == "1" ]] && ok "Open WebUI → http://localhost:3000"
             return 0
         elif [[ "$status" == "unhealthy" ]]; then
-            warn "Server reported unhealthy. Check logs: ./llama logs"
-            return 1
+            (( consecutive_unhealthy++ )) || true
+            # Only give up after 3 consecutive unhealthy polls — tolerates a transient
+            # unhealthy status during startup; any other status resets the counter
+            if [[ $consecutive_unhealthy -ge 3 ]]; then
+                echo ""
+                warn "Server reported unhealthy. Check logs: ./llama logs"
+                return 1
+            fi
+        else
+            consecutive_unhealthy=0
         fi
         echo -n "."
         sleep 2
-        (( i++ ))
+        (( i++ )) || true
     done
     echo ""
     warn "Still starting (health check pending). Try: ./llama status"