feat: GPU model lazy-load/unload lifecycle management

- Domain: add ModelState, ModelStateEvent, ModelNotReady, ManageModelLifecycle (in-port),
  ModelLoader and ModelStateEventBus (out-ports)
- Application: InMemoryModelStateEventBus; ModelLifecycleService — state machine (ReentrantLock),
  lazy load on first request, idle-timeout auto-unload (configurable via
  trueref.embedding.idle-timeout-seconds, default 300 s), job-guard (skips unload while ingestion
  running), platform-thread CUDA executor
- Adapters: OnnxModelLoader wires embedder + reranker start/stop; remove
  @PostConstruct/@PreDestroy from OnnxEmbeddingService and OnnxRerankerService; requireStarted()
  now throws ModelNotReady instead of IllegalStateException
- REST: GET /api/model/status, POST /api/model/unload (409 when jobs running, force=true to
  override), GET /api/model/status/stream (SSE)
- GlobalExceptionHandler: ModelNotReady -> 503 + Retry-After header
- HybridSearchService: calls lifecycle.ensureReady() before every search so both REST and MCP
  paths get ModelNotReady (-> 503 / MCP error) when unloaded
- TrueRefMcpTools: catches ModelNotReady, returns retry hint in MCP error text
- Tests: InMemoryModelStateEventBusTest, ModelLifecycleServiceTest (10 cases),
  OnnxModelLoaderTest, GlobalExceptionHandlerTest — all 41 tests green

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-05-09 15:44:33 +02:00
parent 943a38fd36
commit 5c6085df99
24 changed files with 1144 additions and 17 deletions
--- a/trueref-domain/src/main/java/com/trueref/domain/error/ModelNotReady.java
+++ b/trueref-domain/src/main/java/com/trueref/domain/error/ModelNotReady.java
@@ -0,0 +1,23 @@
+package com.trueref.domain.error;
+
+/**
+ * Thrown when an inference request arrives while the GPU models are not yet loaded (or are being
+ * unloaded). Callers should surface this as HTTP 503 with a {@code Retry-After} header, or as an
+ * MCP error inviting retry; {@link #retryAfterSeconds()} carries the suggested delay.
+ */
+public final class ModelNotReady extends TrueRefException {
+
+    private final int retryAfterSeconds;
+
+    public ModelNotReady(int retryAfterSeconds) {
+        super("model_not_ready",
+                "Model is not ready, retry in ~" + retryAfterSeconds + " seconds",
+                null);
+        this.retryAfterSeconds = retryAfterSeconds;
+    }
+
+    /** Suggested wait in seconds before retrying; the same value is embedded in the message. */
+    public int retryAfterSeconds() {
+        return retryAfterSeconds;
+    }
+}
--- a/trueref-domain/src/main/java/com/trueref/domain/error/TrueRefException.java
+++ b/trueref-domain/src/main/java/com/trueref/domain/error/TrueRefException.java
@@ -10,7 +10,8 @@ public abstract sealed class TrueRefException extends RuntimeException
                VersionNotIndexed,
                TagNotFound,
                IngestionFailed,
-                InvalidSearchRequest {
+                InvalidSearchRequest,
+                ModelNotReady {

    private final String code;

--- a/trueref-domain/src/main/java/com/trueref/domain/model/ModelState.java
+++ b/trueref-domain/src/main/java/com/trueref/domain/model/ModelState.java
@@ -0,0 +1,13 @@
+package com.trueref.domain.model;
+
+/** Life-cycle state of the GPU inference models (embedder + reranker). */
+public enum ModelState {
+    /** Models are not loaded; no VRAM is consumed. */
+    UNLOADED,
+    /** Models are being loaded into GPU memory; inference requests are not served yet. */
+    LOADING,
+    /** Models are loaded and ready to serve inference requests. */
+    LOADED,
+    /** Models are being unloaded from GPU memory; inference requests are no longer served. */
+    UNLOADING
+}
--- a/trueref-domain/src/main/java/com/trueref/domain/model/ModelStateEvent.java
+++ b/trueref-domain/src/main/java/com/trueref/domain/model/ModelStateEvent.java
@@ -0,0 +1,18 @@
+package com.trueref.domain.model;
+
+import java.time.Instant;
+import org.jspecify.annotations.Nullable;
+
+/** Immutable event emitted whenever the GPU model lifecycle enters a new {@link ModelState}. */
+public record ModelStateEvent(ModelState state, Instant ts, @Nullable String message) {
+
+    /** Creates an event for {@code state} timestamped with the current time and no message. */
+    public static ModelStateEvent of(ModelState state) {
+        return new ModelStateEvent(state, Instant.now(), null);
+    }
+
+    /** Like {@link #of(ModelState)}, but attaches {@code message} to the event. */
+    public static ModelStateEvent of(ModelState state, String message) {
+        return new ModelStateEvent(state, Instant.now(), message);
+    }
+}
--- a/trueref-domain/src/main/java/com/trueref/domain/port/in/ManageModelLifecycle.java
+++ b/trueref-domain/src/main/java/com/trueref/domain/port/in/ManageModelLifecycle.java
@@ -0,0 +1,47 @@
+package com.trueref.domain.port.in;
+
+import com.trueref.domain.model.ModelState;
+import com.trueref.domain.model.ModelStateEvent;
+import java.time.Instant;
+import java.util.function.Consumer;
+import org.jspecify.annotations.Nullable;
+
+/** Use-case port: manage the lifecycle of the GPU inference models. */
+public interface ManageModelLifecycle {
+
+    /**
+     * Ensures the model is loaded and ready. If the model is already LOADED, returns immediately.
+     *
+     * <p>Also records the current time as the last-activity timestamp used by the idle timer.
+     *
+     * @throws com.trueref.domain.error.ModelNotReady if the model is UNLOADED (after triggering an
+     *     async load), or if it is LOADING or UNLOADING
+     */
+    void ensureReady();
+
+    /**
+     * Forces an unload of the GPU models. Blocked if ingestion jobs are running, unless
+     * {@code force=true}.
+     *
+     * @param force when {@code true}, unloads even while jobs are running
+     * @return {@code true} if unload was initiated; {@code false} if blocked by running jobs
+     */
+    boolean forceUnload(boolean force);
+
+    /** Returns a snapshot of the current model lifecycle status. */
+    Status getStatus();
+
+    /**
+     * Registers a subscriber to receive model state-change events.
+     *
+     * @return an {@link AutoCloseable} that removes the subscription when closed
+     */
+    AutoCloseable subscribeState(Consumer<ModelStateEvent> subscriber);
+
+    /** Snapshot of the current model lifecycle status. */
+    record Status(
+            ModelState state,
+            @Nullable Instant loadedAt,
+            @Nullable Instant lastActivityAt,
+            long idleTimeoutSeconds) {}
+}
--- a/trueref-domain/src/main/java/com/trueref/domain/port/out/ModelLoader.java
+++ b/trueref-domain/src/main/java/com/trueref/domain/port/out/ModelLoader.java
@@ -0,0 +1,18 @@
+package com.trueref.domain.port.out;
+
+/**
+ * Loads and unloads the GPU inference models (embedder + reranker) as a shared lifecycle unit.
+ * Implementations are expected to run load/unload on a platform OS thread to satisfy CUDA
+ * context affinity constraints. NOTE(review): confirm whether the caller must supply that thread.
+ */
+public interface ModelLoader {
+
+    /** Loads both models into GPU memory. Blocks until ready or throws on error. */
+    void load();
+
+    /** Releases both models from GPU memory. Idempotent. */
+    void unload();
+
+    /** Returns {@code true} if both models are currently loaded and accepting inference. */
+    boolean isLoaded();
+}
--- a/trueref-domain/src/main/java/com/trueref/domain/port/out/ModelStateEventBus.java
+++ b/trueref-domain/src/main/java/com/trueref/domain/port/out/ModelStateEventBus.java
@@ -0,0 +1,18 @@
+package com.trueref.domain.port.out;
+
+import com.trueref.domain.model.ModelStateEvent;
+import java.util.function.Consumer;
+
+/** Event bus for broadcasting {@link ModelStateEvent}s to subscribers (e.g. SSE connections). */
+public interface ModelStateEventBus {
+
+    /** Publishes {@code event} to all subscribers currently registered via {@link #subscribe}. */
+    void publish(ModelStateEvent event);
+
+    /**
+     * Registers a subscriber to receive future events.
+     *
+     * @return an {@link AutoCloseable} that removes the subscription when closed
+     */
+    AutoCloseable subscribe(Consumer<ModelStateEvent> subscriber);
+}