Some checks failed
Build and publish Docker image / Build and push (push) Failing after 1m27s
Java 21 / Spring Boot 3.5.3 multi-module Maven project. Hybrid BM25+HNSW search with RRF, cross-encoder reranker, ONNX Runtime 1.22.0 (CPU + CUDA 12 GPU variants).
116 lines
5.1 KiB
Bash
Executable File
116 lines
5.1 KiB
Bash
Executable File
#!/usr/bin/env bash
# trueref launcher (workspace root)
#
# Wraps the fat JAR with:
#   - --enable-native-access=ALL-UNNAMED (silences FFM Linker warning from DJL tokenizers)
#   - --add-modules=jdk.incubator.vector (enables Lucene 10 SIMD codepath)
#   - cuDNN 9 (cu12 build) on LD_LIBRARY_PATH so ONNX Runtime CUDA EP loads
#   - CUDA_VISIBLE_DEVICES isolation so ORT BFC arena doesn't trip over the second card
#   - optional per-session GPU memory cap so embedder + reranker fit on one card
#
# Defaults are tuned for this machine (LMDE 7, CUDA 12.4, RTX 2080 SUPER + RTX 3060).
# Override anything via env vars or by appending Spring properties to the command line.
#
# Usage:
#   ./trueref                              # run with defaults (port 18080)
#   ./trueref --server.port=8080           # forward Spring properties
#   TRUEREF_GPU=0 ./trueref                # use the 2080 SUPER instead
#   TRUEREF_GPU=cpu ./trueref              # disable CUDA, run on CPU
#   TRUEREF_HOME=/data/trueref ./trueref   # custom data dir
#
# Env vars:
#   TRUEREF_GPU        GPU index (matches `nvidia-smi -L`) or "cpu". Default: 1
#   TRUEREF_HOME       Data directory. Default: ./data
#   TRUEREF_PORT       HTTP port. Default: 18080
#   TRUEREF_MEM_LIMIT  Per-session GPU mem cap in bytes. Default: 0 (unbounded).
#                      With session-count=1 there is no multi-session contention, so the BFC
#                      arena can grow freely — capping it risks running out of budget during
#                      model-weight loading (~1.5-2 GB) before inference even starts.
#                      Set to e.g. 8589934592 (8 GiB) only if you run multiple pools on one card.
#   TRUEREF_CUDNN_LIB  Directory containing libcudnn.so.9. Default: ./runtime/cudnn/nvidia/cudnn/lib
#   TRUEREF_JAR        Path to the fat JAR. Default: ./trueref-bootstrap/target/trueref.jar
#   JAVA               java binary. Default: $JAVA_HOME/bin/java or `java` on PATH
#   JAVA_OPTS          Extra JVM flags (e.g. -Xmx16g)

# Strict mode: abort on unhandled command failure, on use of an unset variable,
# and on failure of any stage of a pipeline.
set -euo pipefail

# Absolute path of the directory holding this script; every default below is
# resolved relative to it so the launcher works from any working directory.
# `--` keeps a directory name that starts with "-" from being parsed as an option.
ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"

# Effective settings: the TRUEREF_* environment variable if set and non-empty,
# otherwise the built-in default.
JAR="${TRUEREF_JAR:-$ROOT/trueref-bootstrap/target/trueref.jar}"
GPU="${TRUEREF_GPU:-1}"
HOME_DIR="${TRUEREF_HOME:-$ROOT/data}"
PORT="${TRUEREF_PORT:-18080}"
MEM_LIMIT="${TRUEREF_MEM_LIMIT:-0}"
CUDNN_LIB="${TRUEREF_CUDNN_LIB:-$ROOT/runtime/cudnn/nvidia/cudnn/lib}"

# Refuse to start without the fat JAR and tell the user how to build it.
if [[ ! -f "$JAR" ]]; then
  echo "trueref: jar not found at $JAR" >&2
  echo "trueref: build it first with: mvn -DskipTests -pl trueref-bootstrap -am package" >&2
  exit 1
fi

# Resolve the java binary. Precedence: an explicit $JAVA, then
# $JAVA_HOME/bin/java if it is executable, then `java` found on PATH.
if [[ -n "${JAVA:-}" ]]; then
  # An explicit JAVA may be a bare command name (e.g. JAVA=java) rather than a
  # path; in that case look it up on PATH so the executable test below passes.
  if [[ ! -x "$JAVA" ]]; then
    JAVA="$(command -v "$JAVA" || true)"
  fi
elif [[ -n "${JAVA_HOME:-}" && -x "${JAVA_HOME}/bin/java" ]]; then
  JAVA="${JAVA_HOME}/bin/java"
else
  # `|| true` keeps `set -e` from aborting here; the check below reports the failure.
  JAVA="$(command -v java || true)"
fi

if [[ -z "${JAVA:-}" || ! -x "${JAVA}" ]]; then
  echo "trueref: java not found; install JDK 21+ or set JAVA_HOME" >&2
  exit 1
fi

# Make sure the data directory exists before the application starts.
# `--` protects against a directory name that begins with "-".
mkdir -p -- "$HOME_DIR"

# Spring properties that are always passed; later sections append to this array.
# An array keeps each property a single argument even if a value contains spaces.
SPRING_ARGS=(
  "--server.port=$PORT"
  "--trueref.home=$HOME_DIR"
)

# CUDA setup. TRUEREF_GPU=cpu (any letter case) disables CUDA entirely and selects
# the CPU provider. Any other value is treated as a GPU selector: the chosen card is
# isolated via CUDA_VISIBLE_DEVICES and ORT is always told to use device 0.
case "$GPU" in
  [cC][pP][uU])
    echo "trueref: GPU disabled (TRUEREF_GPU=cpu) — embedder/reranker will run on CPU"
    SPRING_ARGS+=(
      "--trueref.embedding.onnx-providers=cpu"
    )
    ;;
  *)
    if [[ -d "$CUDNN_LIB" ]]; then
      # Prepend cuDNN to the loader path; keep any existing entries after it.
      export LD_LIBRARY_PATH="${CUDNN_LIB}${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
    else
      # Not fatal: warn and print the download recipe, then continue.
      echo "trueref: TRUEREF_CUDNN_LIB=$CUDNN_LIB not found — CUDA EP will fall back to CPU" >&2
      echo "trueref: download cu12 cuDNN with:" >&2
      echo " mkdir -p runtime/cudnn && cd runtime/cudnn && \\" >&2
      echo " pip download --no-deps --only-binary=:all: --python-version 3.12 \\" >&2
      echo " --platform manylinux2014_x86_64 'nvidia-cudnn-cu12<10' -d . && \\" >&2
      echo " unzip -q -o nvidia_cudnn_cu12-*.whl 'nvidia/cudnn/lib/*' && rm *.whl" >&2
    fi

    # CUDA runtime respects CUDA_VISIBLE_DEVICES for all allocations (cudaMalloc, BFC arena,
    # etc.). By restricting CUDA's view to exactly the target GPU, we prevent the runtime from
    # creating a default context on device 0 before ORT's cudaSetDevice() takes effect.
    # We always pass gpu-device-id=0 to ORT because CUDA_VISIBLE_DEVICES makes the target
    # card the ONLY visible device (index 0 in the runtime's view).
    #
    # CUDA_DEVICE_ORDER=PCI_BUS_ID ensures CUDA runtime numbering matches nvidia-smi numbering.
    # Without it, the default FASTEST_FIRST ordering can rank GPUs differently from nvidia-smi,
    # so CUDA_VISIBLE_DEVICES=N would expose a different physical card than nvidia-smi GPU N.
    export CUDA_DEVICE_ORDER="PCI_BUS_ID"
    export CUDA_VISIBLE_DEVICES="$GPU"
    SPRING_ARGS+=(
      "--trueref.embedding.gpu-device-id=0"
      "--trueref.embedding.gpu-mem-limit-bytes=$MEM_LIMIT"
    )
    ;;
esac

# JAVA_OPTS is a whitespace-separated list of JVM flags. Split it into an array
# here instead of expanding it unquoted on the command line, so a flag containing
# a glob character (e.g. "-cp lib/*") reaches the JVM verbatim rather than being
# expanded by the shell. `read -d ''` hits end-of-input and returns non-zero by
# design, hence `|| true` under `set -e`.
jvm_opts=()
IFS=$' \t\n' read -r -d '' -a jvm_opts <<<"${JAVA_OPTS:-}" || true

# Replace this shell with the JVM. Arguments given to the launcher come last so
# they can override the Spring properties assembled above.
# The ${arr[@]+...} form keeps an empty array from tripping `set -u` on older bash.
exec "$JAVA" \
  --enable-native-access=ALL-UNNAMED \
  --add-modules=jdk.incubator.vector \
  ${jvm_opts[@]+"${jvm_opts[@]}"} \
  -jar "$JAR" \
  "${SPRING_ARGS[@]}" \
  "$@"