feat: GPU model lazy-load/unload lifecycle management
- Domain: add ModelState, ModelStateEvent, ModelNotReady, ManageModelLifecycle (in-port), ModelLoader and ModelStateEventBus (out-ports)
- Application: InMemoryModelStateEventBus; ModelLifecycleService — state machine (ReentrantLock), lazy load on first request, idle-timeout auto-unload (configurable via trueref.embedding.idle-timeout-seconds, default 300 s), job-guard (skips unload while ingestion is running), platform-thread CUDA executor
- Adapters: OnnxModelLoader wires embedder + reranker start/stop; remove @PostConstruct/@PreDestroy from OnnxEmbeddingService and OnnxRerankerService; requireStarted() now throws ModelNotReady instead of IllegalStateException
- REST: GET /api/model/status, POST /api/model/unload (409 when jobs running, force=true to override), GET /api/model/status/stream (SSE)
- GlobalExceptionHandler: ModelNotReady -> 503 + Retry-After header
- HybridSearchService: calls lifecycle.ensureReady() before every search so both REST and MCP paths get ModelNotReady (-> 503 / MCP error) when unloaded
- TrueRefMcpTools: catches ModelNotReady, returns retry hint in MCP error text
- Tests: InMemoryModelStateEventBusTest, ModelLifecycleServiceTest (10 cases), OnnxModelLoaderTest, GlobalExceptionHandlerTest — all 41 tests green

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -0,0 +1,23 @@
|
||||
package com.trueref.domain.error;
|
||||
|
||||
/**
|
||||
* Thrown when an inference request arrives while the GPU models are not yet loaded (or are being
|
||||
* unloaded). Callers should surface this as HTTP 503 with a {@code Retry-After} header, or as an
|
||||
* MCP error inviting retry.
|
||||
*/
|
||||
public final class ModelNotReady extends TrueRefException {
|
||||
|
||||
private final int retryAfterSeconds;
|
||||
|
||||
public ModelNotReady(int retryAfterSeconds) {
|
||||
super("model_not_ready",
|
||||
"Model is not ready, retry in ~" + retryAfterSeconds + " seconds",
|
||||
null);
|
||||
this.retryAfterSeconds = retryAfterSeconds;
|
||||
}
|
||||
|
||||
/** Suggested number of seconds the caller should wait before retrying. */
|
||||
public int retryAfterSeconds() {
|
||||
return retryAfterSeconds;
|
||||
}
|
||||
}
|
||||
@@ -10,7 +10,8 @@ public abstract sealed class TrueRefException extends RuntimeException
|
||||
VersionNotIndexed,
|
||||
TagNotFound,
|
||||
IngestionFailed,
|
||||
InvalidSearchRequest {
|
||||
InvalidSearchRequest,
|
||||
ModelNotReady {
|
||||
|
||||
private final String code;
|
||||
|
||||
|
||||
@@ -0,0 +1,13 @@
|
||||
package com.trueref.domain.model;
|
||||
|
||||
/**
 * The phases the GPU inference models (embedder + reranker) move through over their life cycle.
 */
public enum ModelState {

    /** Nothing is loaded, so the models occupy no VRAM. */
    UNLOADED,

    /** A load into GPU memory has started and has not finished yet. */
    LOADING,

    /** Loading is complete; inference requests can be served. */
    LOADED,

    /** The models are currently being released from GPU memory. */
    UNLOADING
}
|
||||
@@ -0,0 +1,18 @@
|
||||
package com.trueref.domain.model;
|
||||
|
||||
import java.time.Instant;
|
||||
import org.jspecify.annotations.Nullable;
|
||||
|
||||
/**
|
||||
* Immutable event emitted whenever the GPU model lifecycle transitions to a new {@link ModelState}.
|
||||
*/
|
||||
public record ModelStateEvent(ModelState state, Instant ts, @Nullable String message) {
|
||||
|
||||
public static ModelStateEvent of(ModelState state) {
|
||||
return new ModelStateEvent(state, Instant.now(), null);
|
||||
}
|
||||
|
||||
public static ModelStateEvent of(ModelState state, String message) {
|
||||
return new ModelStateEvent(state, Instant.now(), message);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,47 @@
|
||||
package com.trueref.domain.port.in;
|
||||
|
||||
import com.trueref.domain.model.ModelState;
|
||||
import com.trueref.domain.model.ModelStateEvent;
|
||||
import java.time.Instant;
|
||||
import java.util.function.Consumer;
|
||||
import org.jspecify.annotations.Nullable;
|
||||
|
||||
/** Use-case port: manage the lifecycle of the GPU inference models. */
|
||||
public interface ManageModelLifecycle {
|
||||
|
||||
/**
|
||||
* Ensures the model is loaded and ready. If the model is already LOADED, returns immediately.
|
||||
* If the model is UNLOADED, triggers an async load and throws
|
||||
* {@link com.trueref.domain.error.ModelNotReady}. If the model is LOADING or UNLOADING,
|
||||
* throws {@link com.trueref.domain.error.ModelNotReady}.
|
||||
*
|
||||
* <p>Also records the current time as the last-activity timestamp used by the idle timer.
|
||||
*/
|
||||
void ensureReady();
|
||||
|
||||
/**
|
||||
* Forces an unload of the GPU models. Blocked if ingestion jobs are running, unless
|
||||
* {@code force=true}.
|
||||
*
|
||||
* @param force when {@code true}, unloads even while jobs are running
|
||||
* @return {@code true} if unload was initiated; {@code false} if blocked by running jobs
|
||||
*/
|
||||
boolean forceUnload(boolean force);
|
||||
|
||||
/** Returns a snapshot of the current model lifecycle status. */
|
||||
Status getStatus();
|
||||
|
||||
/**
|
||||
* Registers a subscriber to receive model state-change events.
|
||||
*
|
||||
* @return an {@link AutoCloseable} that removes the subscription when closed
|
||||
*/
|
||||
AutoCloseable subscribeState(Consumer<ModelStateEvent> subscriber);
|
||||
|
||||
/** Snapshot of the current model lifecycle status. */
|
||||
record Status(
|
||||
ModelState state,
|
||||
@Nullable Instant loadedAt,
|
||||
@Nullable Instant lastActivityAt,
|
||||
long idleTimeoutSeconds) {}
|
||||
}
|
||||
@@ -0,0 +1,18 @@
|
||||
package com.trueref.domain.port.out;
|
||||
|
||||
/**
 * Outbound port that loads and unloads the GPU inference models (embedder + reranker) together,
 * as one lifecycle unit.
 *
 * <p>Implementations are expected to perform load and unload on a platform OS thread so that
 * CUDA context affinity constraints are satisfied.
 */
public interface ModelLoader {

    /**
     * Brings both models into GPU memory. The call blocks until the models are ready, or throws
     * if loading fails.
     */
    void load();

    /** Frees both models from GPU memory. Idempotent. */
    void unload();

    /**
     * Reports whether inference can currently be served.
     *
     * @return {@code true} if both models are loaded and accepting inference
     */
    boolean isLoaded();
}
|
||||
@@ -0,0 +1,18 @@
|
||||
package com.trueref.domain.port.out;
|
||||
|
||||
import com.trueref.domain.model.ModelStateEvent;
|
||||
import java.util.function.Consumer;
|
||||
|
||||
/** Event bus for broadcasting {@link ModelStateEvent}s to subscribers (e.g. SSE connections). */
|
||||
public interface ModelStateEventBus {
|
||||
|
||||
/** Publishes a state-change event to all current subscribers. */
|
||||
void publish(ModelStateEvent event);
|
||||
|
||||
/**
|
||||
* Registers a subscriber to receive future events.
|
||||
*
|
||||
* @return an {@link AutoCloseable} that removes the subscription when closed
|
||||
*/
|
||||
AutoCloseable subscribe(Consumer<ModelStateEvent> subscriber);
|
||||
}
|
||||
Reference in New Issue
Block a user