"...dynamographdeploymentrequest_controller_test.go" did not exist on "6a84ffd347c01ddd279e42f69bef1087da545ed7"
Unverified Commit 0b7cdf55 authored by Tanmay Verma's avatar Tanmay Verma Committed by GitHub
Browse files

feat: Using NIXL for KV cache transfer when using disaggregated serving in TRTLLM (#1591)


Signed-off-by: default avatarTanmay Verma <tanmay2592@gmail.com>
Co-authored-by: default avatarRyan McCormick <rmccormick@nvidia.com>
parent 0c9ae4dd
......@@ -324,16 +324,29 @@ RUN pip install dist/ai_dynamo_runtime*cp312*.whl && \
ENV DYNAMO_HOME=/workspace
ARG ARCH_ALT
ENV LD_LIBRARY_PATH=/usr/local/nixl/lib/${ARCH_ALT}-linux-gnu:$LD_LIBRARY_PATH
# Use UCX for TRTLLM KV Cache Transfer
ENV TRTLLM_USE_UCX_KVCACHE=1
# Build-time switch selecting the KV cache transfer backend for TensorRT-LLM.
# "1" selects NIXL; any other value (including unset) selects UCX.
ARG TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL
# Generate /usr/local/bin/set_trtllm_env.sh at build time. The script exports
# exactly one of TRTLLM_USE_NIXL_KVCACHE=1 or TRTLLM_USE_UCX_KVCACHE=1 depending
# on the build arg above, and is made executable.
RUN echo '#!/bin/bash' > /usr/local/bin/set_trtllm_env.sh && \
if [ "$TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL" = "1" ]; then \
echo 'export TRTLLM_USE_NIXL_KVCACHE=1' >> /usr/local/bin/set_trtllm_env.sh; \
else \
echo 'export TRTLLM_USE_UCX_KVCACHE=1' >> /usr/local/bin/set_trtllm_env.sh; \
fi && \
chmod +x /usr/local/bin/set_trtllm_env.sh
# Source the generated script from root's bashrc so shells that read it get the
# selected variable.
# NOTE(review): ~/.bashrc is normally read only by interactive bash shells;
# confirm that non-interactive entrypoints also receive these variables.
RUN echo 'source /usr/local/bin/set_trtllm_env.sh' >> /root/.bashrc
# Copy launch banner
RUN --mount=type=bind,source=./container/launch_message.txt,target=/workspace/launch_message.txt \
sed '/^#\s/d' /workspace/launch_message.txt > ~/.launch_screen && \
echo "cat ~/.launch_screen" >> ~/.bashrc
# FIXME: May want a modification with dynamo banner on entry
ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
CMD []
......@@ -90,6 +90,7 @@ TENSORRTLLM_PIP_WHEEL_DIR="/tmp/trtllm_wheel/"
# variables to learn how to run a pipeline with a specific commit.
DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT="137fe35539ea182f1495f5021bfda97c729e50c3"
TRTLLM_COMMIT=""
TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL="0"
# TensorRT-LLM PyPI index URL
TENSORRTLLM_INDEX_URL="https://pypi.python.org/simple"
......@@ -166,6 +167,13 @@ get_options() {
fi
USE_DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT=true
;;
# Boolean flag: request the experimental NIXL KV cache backend for TensorRT-LLM.
# It takes no value, so a following word that does not start with "--" is
# treated as a stray argument and aborts option parsing with exit status 1.
--trtllm-use-nixl-kvcache-experimental)
if [ -n "$2" ] && [[ "$2" != --* ]]; then
echo "ERROR: --trtllm-use-nixl-kvcache-experimental does not take any argument"
exit 1
fi
# Record the selection as "1"; it is later forwarded to the image build.
TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL="1"
;;
--tensorrtllm-pip-wheel)
if [ "$2" ]; then
TENSORRTLLM_PIP_WHEEL=$2
......@@ -364,6 +372,7 @@ show_help() {
echo " [--build-context name=path to add build context]"
echo " [--release-build perform a release build]"
echo " [--make-efa Enables EFA support for NIXL]"
echo " [--trtllm-use-nixl-kvcache-experimental Enables NIXL KVCACHE experimental support for TensorRT-LLM]"
exit 0
}
......@@ -492,6 +501,10 @@ if [[ $FRAMEWORK == "TENSORRTLLM" ]]; then
TRTLLM_COMMIT="$DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT"
fi
# Forward the NIXL KV cache selection to the image build as a --build-arg of the
# same name.
# NOTE(review): the variable appears to be initialised to "0" near the top of
# this script, so this guard is normally true and the build arg is always passed
# (as "0" or "1") — confirm that is intended.
if [ -n "${TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL}" ]; then
BUILD_ARGS+=" --build-arg TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL=${TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL} "
fi
# If user didn't set both wheel and commit, use default tensorrt_llm pip wheel
if [ -z "$TENSORRTLLM_PIP_WHEEL" ] && [ -z "$TRTLLM_COMMIT" ]; then
TENSORRTLLM_PIP_WHEEL="$DEFAULT_TENSORRTLLM_PIP_WHEEL"
......@@ -507,7 +520,7 @@ if [[ $FRAMEWORK == "TENSORRTLLM" ]]; then
echo "Checking for TensorRT-LLM wheel in ${TENSORRTLLM_PIP_WHEEL_DIR}"
if ! check_wheel_file "${TENSORRTLLM_PIP_WHEEL_DIR}" "${ARCH}_${TRTLLM_COMMIT}"; then
echo "WARN: Valid trtllm wheel file not found in ${TENSORRTLLM_PIP_WHEEL_DIR}, attempting to build from source"
if ! env -i ${SOURCE_DIR}/build_trtllm_wheel.sh -o ${TENSORRTLLM_PIP_WHEEL_DIR} -c ${TRTLLM_COMMIT} -a ${ARCH}; then
if ! env -i ${SOURCE_DIR}/build_trtllm_wheel.sh -o ${TENSORRTLLM_PIP_WHEEL_DIR} -c ${TRTLLM_COMMIT} -a ${ARCH} -n ${NIXL_COMMIT}; then
error "ERROR: Failed to build TensorRT-LLM wheel"
fi
fi
......
......@@ -18,15 +18,17 @@
# This script builds the TRT-LLM base image for Dynamo with TensorRT-LLM.
while getopts "c:o:a:" opt; do
while getopts "c:o:a:n:" opt; do
case ${opt} in
c) TRTLLM_COMMIT=$OPTARG ;;
o) OUTPUT_DIR=$OPTARG ;;
a) ARCH=$OPTARG ;;
*) echo "Usage: $(basename $0) [-c commit] [-o output_dir] [-a arch]"
n) NIXL_COMMIT=$OPTARG ;;
*) echo "Usage: $(basename $0) [-c commit] [-o output_dir] [-a arch] [-n nixl_commit]"
echo " -c: TensorRT-LLM commit to build"
echo " -o: Output directory for wheel files"
echo " -a: Architecture (amd64 or arm64)"
echo " -n: NIXL commit"
exit 1 ;;
esac
done
......@@ -36,6 +38,8 @@ if [ -z "$OUTPUT_DIR" ]; then
OUTPUT_DIR="/tmp/trtllm_wheel"
fi
# Store directory where script is being launched from
MAIN_DIR=$(dirname "$(readlink -f "$0")")
(cd /tmp && \
# Clone the TensorRT-LLM repository.
......@@ -79,8 +83,16 @@ sed -i "s/__version__ = \"\(.*\)\"/__version__ = \"\1+dev${COMMIT_VERSION}\"/" "
echo "Updated version:"
grep "__version__" "$VERSION_FILE"
# Stage the install_nixl.sh kept next to this script into the TensorRT-LLM
# checkout's docker/common/ directory (paths are relative to the current
# working directory, which is expected to be the TensorRT-LLM source tree).
echo "Copying install_nixl.sh from $MAIN_DIR to ${PWD}/docker/common/"
# MAIN_DIR is quoted so a launch directory containing spaces does not split
# into several cp arguments.
cp "${MAIN_DIR}/deps/tensorrt_llm/install_nixl.sh" docker/common/install_nixl.sh
# Pin NIXL to the commit passed with -n. When no commit was supplied, keep the
# default already pinned inside install_nixl.sh instead of overwriting it with
# an empty value.
if [ -n "${NIXL_COMMIT}" ]; then
    sed -i "s/NIXL_COMMIT=\"[^\"]*\"/NIXL_COMMIT=\"${NIXL_COMMIT}\"/" docker/common/install_nixl.sh
fi
make -C docker wheel_build
# Need to build in the Triton Devel Image for NIXL support.
make -C docker tritondevel_build
make -C docker wheel_build DEVEL_IMAGE=tritondevel BUILD_WHEEL_OPTS='--extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl'
# Copy the wheel to the host
mkdir -p $OUTPUT_DIR
......
#!/bin/bash -e
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Install NIXL for TensorRT-LLM.
# This script is an adapted version of the NIXL install script from the TensorRT-LLM repository.
# The original script is located at:
# https://github.com/NVIDIA/TensorRT-LLM/blob/main/docker/common/install_nixl.sh
# Abort on the first failing command and trace every command as it runs.
set -ex

# Source locations, install prefixes and pinned versions.
GITHUB_URL="https://github.com"
UCX_VERSION="v1.18.1"
UCX_INSTALL_PATH="/usr/local/ucx/"
CUDA_PATH="/usr/local/cuda"
# The wheel build script rewrites the value of the next assignment in place;
# keep it on a single line with a double-quoted value.
NIXL_COMMIT="16348080f5bdeb9fe6058a23be140cec020ef3f3"
UCX_REPO="https://github.com/openucx/ucx.git"
NIXL_REPO="https://github.com/ai-dynamo/nixl.git"

# Build and install UCX only when the install prefix does not exist yet.
if [ ! -d "${UCX_INSTALL_PATH}" ]; then
  # ENV is not set by this script; it must name the shell init file that the
  # LD_LIBRARY_PATH export is appended to. Check it before the long build so a
  # missing value fails immediately instead of after compiling UCX.
  : "${ENV:?ENV must be set to the shell init file that receives LD_LIBRARY_PATH exports}"

  git clone --depth 1 -b "${UCX_VERSION}" "${UCX_REPO}"
  cd ucx
  ./autogen.sh
  ./contrib/configure-release \
    --prefix="${UCX_INSTALL_PATH}" \
    --enable-shared \
    --disable-static \
    --disable-doxygen-doc \
    --enable-optimizations \
    --enable-cma \
    --enable-devel-headers \
    --with-cuda="${CUDA_PATH}" \
    --with-verbs \
    --with-dm \
    --enable-mt
  make install -j"$(nproc)"
  cd ..
  rm -rf ucx  # Remove UCX source to save space
  echo "export LD_LIBRARY_PATH=${UCX_INSTALL_PATH}/lib:\$LD_LIBRARY_PATH" >> "${ENV}"
fi

# Pick the multiarch library directory name; anything that is not x86_64/amd64
# is treated as aarch64.
ARCH_NAME="x86_64-linux-gnu"
machine="$(uname -m)"
if [ "${machine}" != "amd64" ] && [ "${machine}" != "x86_64" ]; then
  ARCH_NAME="aarch64-linux-gnu"
  EXTRA_NIXL_ARGS="-Ddisable_gds_backend=true"
fi

# NIXL is only built on x86_64; other platforms stop here successfully.
if [ "${ARCH_NAME}" != "x86_64-linux-gnu" ]; then
  echo "The NIXL backend is temporarily unavailable on the aarch64 platform. Exiting script."
  exit 0
fi

# Same early check as above: the final step appends to ${ENV}, so verify it
# before cloning and compiling NIXL.
: "${ENV:?ENV must be set to the shell init file that receives LD_LIBRARY_PATH exports}"

pip3 install --no-cache-dir meson ninja pybind11
git clone "${NIXL_REPO}" nixl
cd nixl
git checkout "${NIXL_COMMIT}"
# EXTRA_NIXL_ARGS is intentionally unquoted: it may be empty or hold several
# whitespace-separated meson options.
# shellcheck disable=SC2086
meson setup builddir -Ducx_path="${UCX_INSTALL_PATH}" -Dstatic_plugins=UCX -Dbuildtype=release ${EXTRA_NIXL_ARGS}
cd builddir && ninja install
cd ../..
rm -rf nixl*  # Remove NIXL source tree to save space

# Make the installed NIXL libraries discoverable by shells that read ${ENV}.
echo "export LD_LIBRARY_PATH=/opt/nvidia/nvda_nixl/lib/${ARCH_NAME}:/opt/nvidia/nvda_nixl/lib64:\$LD_LIBRARY_PATH" >> "${ENV}"
\ No newline at end of file
......@@ -69,15 +69,6 @@ apt-get update && apt-get -y install git git-lfs
./container/build.sh --framework tensorrtllm --use-default-experimental-tensorrtllm-commit
```
> [!NOTE]
> Because of a known issue of C++11 ABI compatibility within the NGC pytorch container,
> we rebuild TensorRT-LLM from source. See [here](https://nvidia.github.io/TensorRT-LLM/installation/linux.html)
> for more information.
>
> Hence, when running this script for the first time, the time taken by this script can be
> quite long.
### Run container
```
......@@ -306,13 +297,54 @@ See [close deployment](../../docs/guides/dynamo_serve.md#close-deployment) secti
To benchmark your deployment with GenAI-Perf, see this utility script, configuring the
`model` name and `host` based on your deployment: [perf.sh](../../benchmarks/llm/perf.sh)
### Future Work
Remaining tasks:
- [x] Add support for the disaggregated serving.
- [x] Add multi-node support.
- [x] Add instructions for benchmarking.
- [x] Use processor from dynamo-llm framework.
- [ ] Add integration test coverage.
- [ ] Merge the code base with llm example to reduce the code duplication.
- [ ] Enable NIXL integration with TensorRT-LLM once available. Currently, TensorRT-LLM uses UCX to transfer KV cache.
### KV Cache Transfer for Disaggregated Serving
In disaggregated serving architectures, KV cache must be transferred between prefill and decode nodes. TensorRT-LLM supports two methods for this transfer:
#### Default Method: UCX
By default, TensorRT-LLM uses UCX (Unified Communication X) for KV cache transfer between prefill and decode nodes. UCX provides high-performance communication optimized for GPU-to-GPU transfers.
#### Experimental Method: NIXL
TensorRT-LLM also provides experimental support for using **NIXL** (NVIDIA Inference Xfer Library) for KV cache transfer. [NIXL](https://github.com/ai-dynamo/nixl) is NVIDIA's high-performance communication library designed for efficient data transfer in distributed GPU environments.
**Note:** NIXL support in TensorRT-LLM is experimental and is not suitable for production environments yet.
#### Using NIXL for KV Cache Transfer
To enable NIXL for KV cache transfer in disaggregated serving:
1. **Build the container with NIXL support:**
The TensorRT-LLM wheel must be built from source with NIXL support. The `./container/build.sh` script caches previously built TensorRT-LLM wheels to reduce build time. If you have previously built a TensorRT-LLM wheel without NIXL support, you must delete the cached wheel to force a rebuild with NIXL support.
**Remove cached TensorRT-LLM wheel (only if previously built without NIXL support):**
```bash
rm -rf /tmp/trtllm_wheel
```
**Build the container with NIXL support:**
```bash
./container/build.sh --framework tensorrtllm \
--use-default-experimental-tensorrtllm-commit \
--trtllm-use-nixl-kvcache-experimental
```
**Note:** Both `--use-default-experimental-tensorrtllm-commit` and `--trtllm-use-nixl-kvcache-experimental` flags are required to enable NIXL support.
2. **Run the containerized environment:**
See the [run container](#run-container) section to learn how to start the container image built in the previous step.
3. **Start the disaggregated service:**
See [disaggregated serving](#disaggregated-serving) to see how to start the deployment.
4. **Send the request:**
See the [client](#client) section to learn how to send a request to the deployment.
**Important:** Ensure that ETCD and NATS services are running before starting the service.
The container will automatically configure the appropriate environment variable (`TRTLLM_USE_NIXL_KVCACHE=1`) when built with the NIXL flag. The same container image can still use UCX for KV cache transfer; to switch back to UCX, override the variables before starting the service:
```bash
unset TRTLLM_USE_NIXL_KVCACHE
export TRTLLM_USE_UCX_KVCACHE=1
```
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment