"lib/runtime/vscode:/vscode.git/clone" did not exist on "113f4d91259ecc01933f6732d8915a7ae86cacb4"
Unverified Commit 7dd79013 authored by Tanmay Verma, committed by GitHub

build: Cleans the TensorRTLLM + Dynamo container build (#968)


Signed-off-by: Tanmay Verma <tanmay2592@gmail.com>
Co-authored-by: Ryan McCormick <rmccormick@nvidia.com>
parent 412ec843
@@ -13,8 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-ARG BASE_IMAGE="tensorrt_llm/release"
-ARG BASE_IMAGE_TAG="latest_squashed"
+ARG BASE_IMAGE="nvcr.io/nvidia/pytorch"
+ARG BASE_IMAGE_TAG="25.03-py3"
 ARG RELEASE_BUILD

 # Define general architecture ARGs for supporting both x86 and aarch64 builds.
@@ -59,10 +59,31 @@ RUN wget https://github.com/etcd-io/etcd/releases/download/$ETCD_VERSION/etcd-$E
     rm /tmp/etcd.tar.gz
 ENV PATH=/usr/local/bin/etcd/:$PATH

-# TODO: Try using uv to install tensorrtllm
-ARG TENSORRTLLM_PIP_WHEEL_PATH=""
-COPY ${TENSORRTLLM_PIP_WHEEL_PATH}/*.whl /tmp/
-RUN find /tmp -name "*.whl" -exec pip install {} +
+ARG HAS_TRTLLM_CONTEXT=0
+ARG TENSORRTLLM_PIP_WHEEL="tensorrt-llm"
+ARG TENSORRTLLM_INDEX_URL="https://pypi.python.org/simple"
+
+COPY --from=trtllm_wheel . /trtllm_wheel/
+
+# TODO: Currently, ABI compatibility issues with TRTLLM wheel and NGC PyTorch prevent us
+# from using the TRTLLM wheel in a uv venv. Once the issues are resolved, we can
+# use uv to install the TensorRT-LLM wheel within the uv venv.
+RUN [ -f /etc/pip/constraint.txt ] && : > /etc/pip/constraint.txt || true && \
+    if [ "$HAS_TRTLLM_CONTEXT" = "1" ]; then \
+        # Install from local wheel directory in build context
+        WHEEL_FILE=$(find /trtllm_wheel -name "*.whl" | head -n 1); \
+        if [ -n "$WHEEL_FILE" ]; then \
+            pip install "$WHEEL_FILE"; \
+        else \
+            echo "No wheel file found in /trtllm_wheel directory."; \
+            exit 1; \
+        fi; \
+    else \
+        # Install TensorRT-LLM wheel from the provided index URL, allow dependencies from PyPI
+        pip install --index-url "${TENSORRTLLM_INDEX_URL}" \
+            --extra-index-url https://pypi.org/simple \
+            "${TENSORRTLLM_PIP_WHEEL}" ; \
+    fi

 # Install genai-perf for benchmarking
 # TODO: Move to published pypi tags
@@ -90,7 +111,8 @@ RUN apt-get update && \
     protobuf-compiler \
     cmake \
     libssl-dev \
-    pkg-config
+    pkg-config \
+    libclang-dev
@@ -237,12 +259,7 @@ ENV DYNAMO_HOME=/workspace
 # Use UCX for TRTLLM KV Cache Transfer
 ENV TRTLLM_USE_UCX_KVCACHE=1
-# Needed to use NVLink for TRTLLM KV Cache Transfer
-# https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/advanced/disaggregated-service.md
-ENV UCX_CUDA_COPY_ASYNC_MEM_TYPE=cuda
-ENV UCX_CUDA_COPY_DMABUF=no
-ENV UCX_MEMTYPE_CACHE=no
-ENV UCX_RNDV_PIPELINE_ERROR_HANDLING=y

 # Copy launch banner
 RUN --mount=type=bind,source=./container/launch_message.txt,target=/workspace/launch_message.txt \
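The `COPY --from=trtllm_wheel . /trtllm_wheel/` line reads from a BuildKit named build context rather than the main build context, which is how the wheel directory can live outside the repository. A minimal sketch of a matching `docker build` invocation; the Dockerfile path and image tag here are illustrative, not the exact values `build.sh` uses:

```bash
# Sketch under assumed paths: HAS_TRTLLM_CONTEXT=1 selects the local-wheel
# branch of the RUN step above, and the named context "trtllm_wheel" backs
# the COPY --from=trtllm_wheel line. Requires BuildKit (docker buildx).
docker buildx build \
    -f container/Dockerfile.tensorrtllm \
    --build-arg HAS_TRTLLM_CONTEXT=1 \
    --build-context trtllm_wheel=/tmp/trtllm_wheel/ \
    -t dynamo:tensorrtllm .
```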
@@ -57,9 +57,43 @@ DOCKERFILE=${SOURCE_DIR}/Dockerfile
 BUILD_CONTEXT=$(dirname "$(readlink -f "$SOURCE_DIR")")

 # Base Images
-TENSORRTLLM_BASE_IMAGE=tensorrt_llm/release
-TENSORRTLLM_BASE_IMAGE_TAG=latest_squashed
-TENSORRTLLM_PIP_WHEEL_PATH=""
+TENSORRTLLM_BASE_IMAGE=nvcr.io/nvidia/pytorch
+TENSORRTLLM_BASE_IMAGE_TAG=25.03-py3
+
+# Important Note: Because of ABI compatibility issues between TensorRT-LLM and NGC PyTorch,
+# we need to build the TensorRT-LLM wheel from source.
+#
+# There are two ways to build the dynamo image with TensorRT-LLM:
+# 1. Use the local TensorRT-LLM wheel directory.
+# 2. Use the TensorRT-LLM wheel on artifactory.
+#
+# If using option 1, TENSORRTLLM_PIP_WHEEL_DIR must be a path to a directory
+# containing the TensorRT-LLM wheel file along with a commit.txt file whose
+# contents are <arch>_<commit ID>. If no valid trtllm wheel is found, the script
+# will attempt to build the wheel from source and store the built wheel in the
+# specified directory. TRTLLM_COMMIT from the TensorRT-LLM main branch will be
+# used to build the wheel.
+#
+# If using option 2, TENSORRTLLM_PIP_WHEEL must be the TensorRT-LLM wheel
+# package that will be installed from the specified TensorRT-LLM PyPI index URL.
+# This option ignores TRTLLM_COMMIT. As the TensorRT-LLM wheel from PyPI
+# is not ABI compatible with NGC PyTorch, you can use TENSORRTLLM_INDEX_URL to specify
+# a private PyPI index URL which has your pre-built TensorRT-LLM wheel.
+#
+# By default, we use option 1. If you want to use option 2, set
+# TENSORRTLLM_PIP_WHEEL to the TensorRT-LLM wheel on artifactory.
+#
+# Path to the local TensorRT-LLM wheel directory or the wheel on artifactory.
+TENSORRTLLM_PIP_WHEEL_DIR="/tmp/trtllm_wheel/"
+
+# TensorRT-LLM commit to use for building the trtllm wheel if not provided.
+# Important Note: This commit is not used in our CI pipeline. See the CI
+# variables to learn how to run a pipeline with a specific commit.
+TRTLLM_COMMIT=83f37614ef735d251281136c3c05b1fecf8ef68b
+
+# TensorRT-LLM PyPI index URL
+TENSORRTLLM_INDEX_URL="https://pypi.python.org/simple"
+TENSORRTLLM_PIP_WHEEL=""

 VLLM_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
 VLLM_BASE_IMAGE_TAG="25.03-cuda12.8-devel-ubuntu24.04"
@@ -70,6 +104,8 @@ NONE_BASE_IMAGE_TAG="24.04"
 NIXL_COMMIT=d247e88c72db75dc00e4e37aa21ed8d99e60c27d
 NIXL_REPO=ai-dynamo/nixl.git

+NO_CACHE=""
+
 get_options() {
     while :; do
         case $1 in
@@ -93,9 +129,33 @@ get_options() {
                 missing_requirement "$1"
             fi
             ;;
-        --tensorrtllm-pip-wheel-path)
+        --tensorrtllm-pip-wheel-dir)
             if [ "$2" ]; then
-                TENSORRTLLM_PIP_WHEEL_PATH=$2
+                TENSORRTLLM_PIP_WHEEL_DIR=$2
+                shift
+            else
+                missing_requirement "$1"
+            fi
+            ;;
+        --tensorrtllm-commit)
+            if [ "$2" ]; then
+                TRTLLM_COMMIT=$2
+                shift
+            else
+                missing_requirement "$1"
+            fi
+            ;;
+        --tensorrtllm-pip-wheel)
+            if [ "$2" ]; then
+                TENSORRTLLM_PIP_WHEEL=$2
+                shift
+            else
+                missing_requirement "$1"
+            fi
+            ;;
+        --tensorrtllm-index-url)
+            if [ "$2" ]; then
+                TENSORRTLLM_INDEX_URL=$2
                 shift
             else
                 missing_requirement "$1"
@@ -252,7 +312,7 @@ show_image_options() {
     echo "    Base: '${BASE_IMAGE}'"
     echo "    Base_Image_Tag: '${BASE_IMAGE_TAG}'"
     if [[ $FRAMEWORK == "TENSORRTLLM" ]]; then
-        echo "    Tensorrtllm_Pip_Wheel_Path: '${TENSORRTLLM_PIP_WHEEL_PATH}'"
+        echo "    Tensorrtllm_Pip_Wheel: '${TENSORRTLLM_PIP_WHEEL}'"
     fi
     echo "    Build Context: '${BUILD_CONTEXT}'"
     echo "    Build Arguments: '${BUILD_ARGS}'"
@@ -266,7 +326,10 @@ show_help() {
     echo "  [--base-image-tag base image tag]"
     echo "  [--platform platform for docker build]"
     echo "  [--framework framework one of ${!FRAMEWORKS[*]}]"
-    echo "  [--tensorrtllm-pip-wheel-path path to tensorrtllm pip wheel]"
+    echo "  [--tensorrtllm-pip-wheel-dir path to tensorrtllm pip wheel directory]"
+    echo "  [--tensorrtllm-commit tensorrtllm commit to use for building the trtllm wheel if the wheel is not provided]"
+    echo "  [--tensorrtllm-pip-wheel tensorrtllm pip wheel on artifactory]"
+    echo "  [--tensorrtllm-index-url tensorrtllm PyPI index URL if providing the wheel from artifactory]"
     echo "  [--build-arg additional build args to pass to docker build]"
     echo "  [--cache-from cache location to start from]"
     echo "  [--cache-to location where to cache the build output]"
@@ -289,7 +352,9 @@ error() {
 get_options "$@"

 # Automatically set ARCH and ARCH_ALT if PLATFORM is linux/arm64
+ARCH="amd64"
 if [[ "$PLATFORM" == *"linux/arm64"* ]]; then
+    ARCH="arm64"
     BUILD_ARGS+=" --build-arg ARCH=arm64 --build-arg ARCH_ALT=aarch64 "
 fi
@@ -349,9 +414,73 @@ if [ -n "${GITLAB_TOKEN}" ]; then
     BUILD_ARGS+=" --build-arg GITLAB_TOKEN=${GITLAB_TOKEN} "
 fi
+
+check_wheel_file() {
+    local wheel_dir="$1"
+    # Check if directory exists
+    if [ ! -d "$wheel_dir" ]; then
+        echo "Error: Directory '$wheel_dir' does not exist"
+        return 1
+    fi
+
+    # Look for .whl files
+    wheel_count=$(find "$wheel_dir" -name "*.whl" | wc -l)
+    if [ "$wheel_count" -eq 0 ]; then
+        echo "WARN: No .whl files found in '$wheel_dir'"
+        return 1
+    elif [ "$wheel_count" -gt 1 ]; then
+        echo "Warning: Multiple wheel files found in '$wheel_dir'. Will use the first one found."
+        find "$wheel_dir" -name "*.whl" | head -n 1
+        return 0
+    else
+        echo "Found $wheel_count wheel file in '$wheel_dir'"
+        # Check if commit file exists
+        commit_file="$wheel_dir/commit.txt"
+        if [ ! -f "$commit_file" ]; then
+            echo "Error: Commit file '$commit_file' does not exist"
+            return 1
+        fi
+        # Check if the commit ID matches, otherwise re-build the wheel.
+        # Commit ID is of the form <arch>_<commit_id>.
+        commit_id=$(cat "$commit_file")
+        if [ "$commit_id" != "$2" ]; then
+            echo "Error: Commit ID mismatch. Expected '$2', got '$commit_id'"
+            rm -rf "$wheel_dir"/*.whl
+            return 1
+        fi
+        return 0
+    fi
+}
+
 if [[ $FRAMEWORK == "TENSORRTLLM" ]]; then
-    if [ -n "${TENSORRTLLM_PIP_WHEEL_PATH}" ]; then
-        BUILD_ARGS+=" --build-arg TENSORRTLLM_PIP_WHEEL_PATH=${TENSORRTLLM_PIP_WHEEL_PATH} "
+    if [ -z "${TENSORRTLLM_PIP_WHEEL}" ]; then
+        # Use option 1
+        if [ ! -d "${TENSORRTLLM_PIP_WHEEL_DIR}" ]; then
+            # Create the directory if it doesn't exist
+            mkdir -p ${TENSORRTLLM_PIP_WHEEL_DIR}
+        fi
+        BUILD_ARGS+=" --build-arg HAS_TRTLLM_CONTEXT=1"
+        echo "Checking for TensorRT-LLM wheel in ${TENSORRTLLM_PIP_WHEEL_DIR}"
+        if ! check_wheel_file "${TENSORRTLLM_PIP_WHEEL_DIR}" "${ARCH}_${TRTLLM_COMMIT}"; then
+            echo "WARN: Valid trtllm wheel file not found in ${TENSORRTLLM_PIP_WHEEL_DIR}, attempting to build from source"
+            if ! env -i ${SOURCE_DIR}/build_trtllm_wheel.sh -o ${TENSORRTLLM_PIP_WHEEL_DIR} -c ${TRTLLM_COMMIT} -a ${ARCH}; then
+                error "ERROR: Failed to build TensorRT-LLM wheel"
+            fi
+        fi
+        echo "Installing TensorRT-LLM from local wheel directory"
+        BUILD_CONTEXT_ARG+=" --build-context trtllm_wheel=${TENSORRTLLM_PIP_WHEEL_DIR}"
+    else
+        BUILD_ARGS+=" --build-arg HAS_TRTLLM_CONTEXT=0"
+        BUILD_ARGS+=" --build-arg TENSORRTLLM_PIP_WHEEL=${TENSORRTLLM_PIP_WHEEL}"
+        BUILD_ARGS+=" --build-arg TENSORRTLLM_INDEX_URL=${TENSORRTLLM_INDEX_URL}"
+        # Create a dummy directory to satisfy the build context requirement;
+        # there is no way to conditionally copy a build context in a Dockerfile.
+        mkdir -p /tmp/dummy_dir
+        BUILD_CONTEXT_ARG+=" --build-context trtllm_wheel=/tmp/dummy_dir"
     fi
 fi
@@ -374,19 +503,6 @@ if [ -z "$RUN_PREFIX" ]; then
     set -x
 fi
-
-# Check if the TensorRT-LLM base image exists
-if [[ $FRAMEWORK == "TENSORRTLLM" ]]; then
-    if docker inspect --type=image "$BASE_IMAGE:$BASE_IMAGE_TAG" > /dev/null 2>&1; then
-        echo "Image '$BASE_IMAGE:$BASE_IMAGE_TAG' is found."
-    else
-        echo "Image '$BASE_IMAGE:$BASE_IMAGE_TAG' is not found." >&2
-        echo "Please build the TensorRT-LLM base image first. Run ./build_trtllm_base_image.sh" >&2
-        echo "or use --base-image and --base-image-tag to an existing TensorRT-LLM base image." >&2
-        echo "See https://nvidia.github.io/TensorRT-LLM/installation/build-from-source-linux.html for more information." >&2
-        exit 1
-    fi
-fi

 $RUN_PREFIX docker build -f $DOCKERFILE $TARGET_STR $PLATFORM $BUILD_ARGS $CACHE_FROM $CACHE_TO $TAG $LATEST_TAG $BUILD_CONTEXT_ARG $BUILD_CONTEXT $NO_CACHE

 { set +x; } 2>/dev/null
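Taken together, the new flags map onto the two install paths described in the comment block above. A pair of example invocations; the flag values are placeholders, and the private index URL is hypothetical:

```bash
# Option 1 (default): build or reuse a local wheel keyed by commit.txt.
./container/build.sh --framework tensorrtllm \
    --tensorrtllm-pip-wheel-dir /tmp/trtllm_wheel/ \
    --tensorrtllm-commit 83f37614ef735d251281136c3c05b1fecf8ef68b

# Option 2: skip the source build and install a pre-built wheel from an index.
./container/build.sh --framework tensorrtllm \
    --tensorrtllm-pip-wheel tensorrt-llm \
    --tensorrtllm-index-url https://pypi.example.com/simple
```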
@@ -14,22 +14,28 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-# Build the TRT-LLM base image.
+# Build the TRT-LLM wheel.
 # This script builds the TRT-LLM base image for Dynamo with TensorRT-LLM.
-TRTLLM_COMMIT=dfbcb543

-while getopts "c:" opt; do
+while getopts "c:o:a:" opt; do
     case ${opt} in
         c) TRTLLM_COMMIT=$OPTARG ;;
-        *) echo "Invalid option" ;;
+        o) OUTPUT_DIR=$OPTARG ;;
+        a) ARCH=$OPTARG ;;
+        *) echo "Usage: $(basename $0) [-c commit] [-o output_dir] [-a arch]"
+           echo "  -c: TensorRT-LLM commit to build"
+           echo "  -o: Output directory for wheel files"
+           echo "  -a: Architecture (amd64 or arm64)"
+           exit 1 ;;
     esac
 done

-python3 -m venv /tmp/squash-env
-source /tmp/squash-env/bin/activate
-pip3 install docker-squash
+# Set default output directory if not specified
+if [ -z "$OUTPUT_DIR" ]; then
+    OUTPUT_DIR="/tmp/trtllm_wheel"
+fi

 (cd /tmp && \
 # Clone the TensorRT-LLM repository.
@@ -50,9 +56,20 @@ git submodule update --init --recursive
 git lfs pull

 # Build the TRT-LLM base image.
-make -C docker release_build)
+make -C docker wheel_build
+
+# Copy the wheel to the host
+mkdir -p $OUTPUT_DIR
+docker create --name trtllm_wheel_container docker.io/tensorrt_llm/wheel:latest
+docker cp trtllm_wheel_container:/src/tensorrt_llm/build $OUTPUT_DIR/
+cp $OUTPUT_DIR/build/*.whl $OUTPUT_DIR/
+docker rm trtllm_wheel_container || true
+)

-pip3 install docker-squash
-docker-squash -t tensorrt_llm/release:latest_squashed tensorrt_llm/release:latest
-deactivate
\ No newline at end of file
+# Store the commit hash in the output directory to ensure the wheel is built from the correct commit.
+rm -rf $OUTPUT_DIR/commit.txt
+echo ${ARCH}_${TRTLLM_COMMIT} > $OUTPUT_DIR/commit.txt
+
+echo "TRT-LLM wheel built successfully."
+ls -al $OUTPUT_DIR
\ No newline at end of file
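As invoked from `build.sh`, the wheel builder now takes the commit, output directory, and architecture via the new getopts flags. A standalone example run, assuming the script lives under `container/` as the rest of the diff suggests:

```bash
# Builds the TRT-LLM wheel at the given commit, then leaves the .whl and a
# commit.txt marker ("<arch>_<commit>") in the output directory for reuse.
./container/build_trtllm_wheel.sh \
    -c 83f37614ef735d251281136c3c05b1fecf8ef68b \
    -o /tmp/trtllm_wheel/ \
    -a amd64
```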
@@ -42,30 +42,10 @@ docker compose -f deploy/metrics/docker-compose.yml up -d
 ### Build docker

-#### Step 1: Build TensorRT-LLM base container image
-
-Because of the known issue of C++11 ABI compatibility within the NGC pytorch container, we rebuild TensorRT-LLM from source.
-See [here](https://nvidia.github.io/TensorRT-LLM/installation/linux.html) for more information.
-
-Use the helper script to build a TensorRT-LLM container base image. The script uses a specific commit id from the TensorRT-LLM main branch.
-
 ```bash
 # TensorRT-LLM uses git-lfs, which needs to be installed in advance.
 apt-get update && apt-get -y install git git-lfs
-
-# The script uses python packages like docker-squash to squash image
-# layers within the trtllm base image
-DEBIAN_FRONTEND=noninteractive TZ=America/Los_Angeles apt-get -y install python3 python3-pip python3-venv
-./container/build_trtllm_base_image.sh
-```
-
-See [here](https://nvidia.github.io/TensorRT-LLM/installation/build-from-source-linux.html#option-1-build-tensorrt-llm-in-one-step) for more details on building from source.
-
-If you already have a TensorRT-LLM container image, you can skip this step.
-
-#### Step 2: Build the Dynamo container
-
-```
 # On an x86 machine:
 ./container/build.sh --framework tensorrtllm
@@ -73,14 +53,15 @@ If you already have a TensorRT-LLM container image, you can skip this step.
 ./container/build.sh --framework tensorrtllm --platform linux/arm64
 ```

-This build script internally points to the base container image built with step 1. If you skipped the previous step because you already have the container image available, you can run the build script with that image as a base.
-
-```bash
-# Build dynamo image with other TRTLLM base image.
-./container/build.sh --framework TENSORRTLLM --base-image <trtllm-base-image> --base-image-tag <trtllm-base-image-tag>
-```
+> [!NOTE]
+> Because of a known issue of C++11 ABI compatibility within the NGC pytorch container,
+> we rebuild TensorRT-LLM from source. See [here](https://nvidia.github.io/TensorRT-LLM/installation/linux.html)
+> for more information.
+>
+> Hence, when running this script for the first time, the build can take quite a long time.

 ### Run container

 ```
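The note about long first-time builds follows from the wheel cache: once `build_trtllm_wheel.sh` has populated the wheel directory, later builds reuse it as long as the commit marker matches. A quick sanity check, with example paths and sample output:

```bash
# Example only: default wheel directory and an illustrative marker value.
ls /tmp/trtllm_wheel/
# build/  commit.txt  tensorrt_llm-<version>.whl
cat /tmp/trtllm_wheel/commit.txt
# amd64_83f37614ef735d251281136c3c05b1fecf8ef68b
```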
@@ -38,6 +38,7 @@ from tensorrt_llm.serve.openai_protocol import (
 class DynamoTRTLLMCompletionRequest(CompletionRequest):
     id: str = Field(default_factory=lambda: f"cmpl-{str(uuid.uuid4().hex)}")
     max_completion_tokens: Optional[int] = None
+    nvext: Optional[dict] = Field(default=None)

 class DynamoTRTLLMChatCompletionRequest(ChatCompletionRequest):
@@ -45,6 +46,7 @@ class DynamoTRTLLMChatCompletionRequest(ChatCompletionRequest):
     max_completion_tokens: Optional[int] = None
     max_tokens: Optional[int] = None
     disaggregated_params: Optional[DisaggregatedParams] = Field(default=None)
+    nvext: Optional[dict] = Field(default=None)

 class Tokens(BaseModel):
@@ -74,6 +76,7 @@ class TRTLLMWorkerResponseOutput:
     text: str
     token_ids: list[int]
     logprobs: Optional[List[float]] = None
+    prompt_logprobs: Optional[List[float]] = None
     cumulative_logprob: Optional[float] = None
     finish_reason: Optional[Literal["stop", "length", "timeout", "cancelled"]] = None
     stop_reason: Optional[Union[int, str]] = None
@@ -101,8 +104,6 @@ class TRTLLMWorkerResponse(BaseModel):
     prompt_token_ids: list[int]
     outputs: list[dict]
     finished: bool
-    # TODO
-    # prompt_logprobs: list[float]

 class DisaggregatedTypeConverter:
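The new `nvext` fields let the OpenAI-style request models accept NVIDIA's extension payload instead of rejecting it during schema validation. A hypothetical request against a locally served model; the endpoint, port, model name, and the `ignore_eos` key are placeholders, not confirmed by this diff:

```bash
curl -s http://localhost:8000/v1/chat/completions \
    -H 'Content-Type: application/json' \
    -d '{
          "model": "example-model",
          "messages": [{"role": "user", "content": "Hello"}],
          "nvext": {"ignore_eos": false}
        }'
```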
@@ -31,6 +31,8 @@ context_servers:
     enable_chunked_prefill: false
     kv_cache_config:
       free_gpu_memory_fraction: 0.40
+    cache_transceiver_config:
+      max_num_tokens: 10240
     pytorch_backend_config:
       enable_overlap_scheduler: false
       use_cuda_graph: false
@@ -44,6 +46,8 @@ generation_servers:
     max_batch_size: 256
     kv_cache_config:
       free_gpu_memory_fraction: 0.40
+    cache_transceiver_config:
+      max_num_tokens: 256
     pytorch_backend_config:
       enable_overlap_scheduler: true
       use_cuda_graph: false

@@ -33,6 +33,8 @@ context_servers:
       free_gpu_memory_fraction: 0.40
       event_buffer_max_size: 1024
       enable_block_reuse: true
+    cache_transceiver_config:
+      max_num_tokens: 10240
     pytorch_backend_config:
       enable_overlap_scheduler: false
       use_cuda_graph: false
@@ -49,6 +51,8 @@ generation_servers:
       free_gpu_memory_fraction: 0.40
       event_buffer_max_size: 1024
       enable_block_reuse: true
+    cache_transceiver_config:
+      max_num_tokens: 256
     pytorch_backend_config:
       enable_overlap_scheduler: true
       use_cuda_graph: false