feat: GPU model lazy-load/unload lifecycle management
All checks were successful
Build and publish Docker image / Build and push CPU image (push) Successful in 2m33s
Build and publish Docker image / Build and push GPU image (push) Successful in 3m15s

- Domain: add ModelState, ModelStateEvent, ModelNotReady, ManageModelLifecycle
  (in-port), ModelLoader and ModelStateEventBus (out-ports)
- Application: InMemoryModelStateEventBus; ModelLifecycleService — state
  machine (ReentrantLock), lazy load on first request, idle-timeout auto-unload
  (configurable via trueref.embedding.idle-timeout-seconds, default 300 s),
  job-guard (skips unload while ingestion running), platform-thread CUDA executor
- Adapters: OnnxModelLoader wires embedder + reranker start/stop; remove
  @PostConstruct/@PreDestroy from OnnxEmbeddingService and OnnxRerankerService;
  requireStarted() now throws ModelNotReady instead of IllegalStateException
- REST: GET /api/model/status, POST /api/model/unload (409 when jobs running,
  force=true to override), GET /api/model/status/stream (SSE)
- GlobalExceptionHandler: ModelNotReady -> 503 + Retry-After header
- HybridSearchService: calls lifecycle.ensureReady() before every search so
  both REST and MCP paths get ModelNotReady (-> 503 / MCP error) when unloaded
- TrueRefMcpTools: catches ModelNotReady, returns retry hint in MCP error text
- Tests: InMemoryModelStateEventBusTest, ModelLifecycleServiceTest (10 cases),
  OnnxModelLoaderTest, GlobalExceptionHandlerTest — all 41 tests green

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
moze
2026-05-09 15:44:33 +02:00
parent 943a38fd36
commit 5c6085df99
24 changed files with 1144 additions and 17 deletions

View File

@@ -0,0 +1,23 @@
package com.trueref.domain.error;
/**
 * Signals that an inference request arrived while the GPU models were not loaded yet (or were in
 * the middle of being unloaded). Callers should surface this as HTTP 503 with a
 * {@code Retry-After} header, or as an MCP error inviting a retry.
 *
 * <p>The retry hint is clamped to zero or more seconds: a {@code Retry-After} delay must be a
 * non-negative integer, and a negative hint would also yield a nonsensical message.
 */
public final class ModelNotReady extends TrueRefException {

  private final int retryAfterSeconds;

  /**
   * Creates the exception with {@code model_not_ready} as its code and a message that embeds the
   * retry hint.
   *
   * @param retryAfterSeconds suggested wait before retrying, in seconds; negative values are
   *     treated as {@code 0}
   */
  public ModelNotReady(int retryAfterSeconds) {
    // The clamp is repeated inline because no statement may precede the super(...) call.
    // NOTE(review): the trailing null is kept from the original; presumably "no cause" — confirm
    // against the TrueRefException constructor.
    super(
        "model_not_ready",
        "Model is not ready, retry in ~" + Math.max(0, retryAfterSeconds) + " seconds",
        null);
    this.retryAfterSeconds = Math.max(0, retryAfterSeconds);
  }

  /**
   * Returns the suggested number of seconds the caller should wait before retrying.
   *
   * @return the retry hint in seconds; never negative
   */
  public int retryAfterSeconds() {
    return retryAfterSeconds;
  }
}

View File

@@ -10,7 +10,8 @@ public abstract sealed class TrueRefException extends RuntimeException
VersionNotIndexed,
TagNotFound,
IngestionFailed,
InvalidSearchRequest {
InvalidSearchRequest,
ModelNotReady {
private final String code;

View File

@@ -0,0 +1,13 @@
package com.trueref.domain.model;
/**
 * Phases that the GPU inference models (embedder + reranker) pass through during their life
 * cycle.
 */
public enum ModelState {

  /** Nothing is loaded, so the models consume no VRAM. */
  UNLOADED,

  /** A load into GPU memory is currently in progress. */
  LOADING,

  /** The models are resident and ready to serve inference requests. */
  LOADED,

  /** The models are currently being released from GPU memory. */
  UNLOADING
}

View File

@@ -0,0 +1,18 @@
package com.trueref.domain.model;
import java.time.Instant;
import org.jspecify.annotations.Nullable;
/**
 * Immutable notification emitted each time the GPU model lifecycle moves into a new
 * {@link ModelState}.
 *
 * @param state the state that was entered
 * @param ts timestamp attached to the event (the static factories use the current time)
 * @param message optional free-text detail; may be {@code null}
 */
public record ModelStateEvent(ModelState state, Instant ts, @Nullable String message) {

  /**
   * Creates an event for {@code state} stamped with the current time and carrying no message.
   *
   * @param state the state that was entered
   * @return a new event whose {@code message} is {@code null}
   */
  public static ModelStateEvent of(ModelState state) {
    Instant now = Instant.now();
    return new ModelStateEvent(state, now, null);
  }

  /**
   * Creates an event for {@code state} stamped with the current time and carrying
   * {@code message}.
   *
   * @param state the state that was entered
   * @param message detail text to attach to the event
   * @return a new event
   */
  public static ModelStateEvent of(ModelState state, String message) {
    Instant now = Instant.now();
    return new ModelStateEvent(state, now, message);
  }
}

View File

@@ -0,0 +1,47 @@
package com.trueref.domain.port.in;
import com.trueref.domain.model.ModelState;
import com.trueref.domain.model.ModelStateEvent;
import java.time.Instant;
import java.util.function.Consumer;
import org.jspecify.annotations.Nullable;
/** Inbound use-case port for controlling the lifecycle of the GPU inference models. */
public interface ManageModelLifecycle {

  /**
   * Verifies that the models can serve a request right now.
   *
   * <ul>
   *   <li>LOADED: returns immediately.
   *   <li>UNLOADED: triggers an asynchronous load, then throws
   *       {@link com.trueref.domain.error.ModelNotReady}.
   *   <li>LOADING or UNLOADING: throws {@link com.trueref.domain.error.ModelNotReady}.
   * </ul>
   *
   * <p>The call also records the current time as the last-activity timestamp that the idle timer
   * uses.
   *
   * @throws com.trueref.domain.error.ModelNotReady if the models are not in the LOADED state
   */
  void ensureReady();

  /**
   * Requests an unload of the GPU models. While ingestion jobs are running the request is
   * refused, unless {@code force} is {@code true}.
   *
   * @param force {@code true} to unload even though jobs are running
   * @return {@code true} if an unload was initiated; {@code false} if running jobs blocked it
   */
  boolean forceUnload(boolean force);

  /**
   * Reports the current lifecycle status.
   *
   * @return a snapshot of the model lifecycle status
   */
  Status getStatus();

  /**
   * Subscribes {@code subscriber} to model state-change events.
   *
   * @param subscriber callback that receives the events
   * @return an {@link AutoCloseable} whose {@code close()} removes the subscription
   */
  AutoCloseable subscribeState(Consumer<ModelStateEvent> subscriber);

  /**
   * Point-in-time view of the model lifecycle.
   *
   * @param state current lifecycle state
   * @param loadedAt load timestamp; may be {@code null}
   * @param lastActivityAt last-activity timestamp; may be {@code null}
   * @param idleTimeoutSeconds idle timeout, in seconds
   */
  record Status(
      ModelState state,
      @Nullable Instant loadedAt,
      @Nullable Instant lastActivityAt,
      long idleTimeoutSeconds) {}
}

View File

@@ -0,0 +1,18 @@
package com.trueref.domain.port.out;
/**
 * Outbound port that loads and unloads the GPU inference models (embedder + reranker) together,
 * as one shared lifecycle unit.
 *
 * <p>Implementations are expected to perform load/unload on a platform OS thread so that CUDA
 * context affinity constraints are satisfied.
 */
public interface ModelLoader {

  /** Brings both models into GPU memory; blocks until they are ready, or throws on error. */
  void load();

  /** Frees both models from GPU memory. Idempotent. */
  void unload();

  /**
   * Reports whether both models are currently loaded and accepting inference.
   *
   * @return {@code true} if both models are loaded and accepting inference
   */
  boolean isLoaded();
}

View File

@@ -0,0 +1,18 @@
package com.trueref.domain.port.out;
import com.trueref.domain.model.ModelStateEvent;
import java.util.function.Consumer;
/**
 * Outbound port that broadcasts {@link ModelStateEvent}s to registered subscribers (e.g. SSE
 * connections).
 */
public interface ModelStateEventBus {

  /**
   * Delivers a state-change event to every current subscriber.
   *
   * @param event the event to broadcast
   */
  void publish(ModelStateEvent event);

  /**
   * Adds a subscriber that will receive future events.
   *
   * @param subscriber callback that receives the events
   * @return an {@link AutoCloseable} whose {@code close()} removes the subscription
   */
  AutoCloseable subscribe(Consumer<ModelStateEvent> subscriber);
}