chore: upgrade trtllm 1.2.0rc2 (#4405)

Signed-off-by: Dmitry Tokarev <dtokarev@nvidia.com> Co-authored-by: tanmayv25 <tanmay2592@gmail.com> Co-authored-by: Kyle McGill <kmcgill@nvidia.com> Co-authored-by: Ryan McCormick <rmccormick@nvidia.com> Co-authored-by: Tanmay Verma <tanmayv@nvidia.com>

chore: upgrade trtllm 1.2.0rc2 (#4405)
Signed-off-by: Dmitry Tokarev <dtokarev@nvidia.com> Co-authored-by: tanmayv25 <tanmay2592@gmail.com> Co-authored-by: Kyle McGill <kmcgill@nvidia.com> Co-authored-by: Ryan McCormick <rmccormick@nvidia.com> Co-authored-by: Tanmay Verma <tanmayv@nvidia.com>
9b8b9988 · Dmitry Tokarev · GitHub · 6d69e8c7 · 9b8b9988 · 9b8b9988
Unverified Commit 9b8b9988 authored Nov 21, 2025 by Dmitry Tokarev Committed by GitHub Nov 21, 2025
6 changed files
--- a/components/src/dynamo/trtllm/main.py
+++ b/components/src/dynamo/trtllm/main.py
@@ -182,7 +182,6 @@ async def init(runtime: DistributedRuntime, config: Config):
        "pipeline_parallel_size": config.pipeline_parallel_size,
        "moe_expert_parallel_size": config.expert_parallel_size,
        "backend": Backend.PYTORCH,
-        "skip_tokenizer_init": True,
        "build_config": build_config,
        "kv_cache_config": kv_cache_config,
        "gpus_per_node": gpus_per_node,
@@ -241,12 +240,10 @@ async def init(runtime: DistributedRuntime, config: Config):
    # Populate default sampling params from the model
    tokenizer = tokenizer_factory(arg_map["model"])
    default_sampling_params = SamplingParams()
-    default_sampling_params._setup(tokenizer)
-    default_sampling_params.stop = None
    # Enable perf metrics so prompt_tokens_details can be returned
    if hasattr(default_sampling_params, "return_perf_metrics"):
        default_sampling_params.return_perf_metrics = True
    model_input = ModelInput.Tokens
    # Set model type based on disaggregation mode for unified frontend support

--- a/components/src/dynamo/trtllm/utils/trtllm_utils.py
+++ b/components/src/dynamo/trtllm/utils/trtllm_utils.py
@@ -41,11 +41,11 @@ class Config:
        self.kv_block_size: int = 32
        self.migration_limit: int = 0
        self.gpus_per_node: Optional[int] = None
-        self.max_batch_size: int = BuildConfig.max_batch_size
+        self.max_batch_size: int = BuildConfig.model_fields["max_batch_size"].default
-        self.max_num_tokens: int = BuildConfig.max_num_tokens
+        self.max_num_tokens: int = BuildConfig.model_fields["max_num_tokens"].default
-        self.max_seq_len: int = BuildConfig.max_seq_len
+        self.max_seq_len: int = BuildConfig.model_fields["max_seq_len"].default
-        self.max_beam_width: int = BuildConfig.max_beam_width
+        self.max_beam_width: int = BuildConfig.model_fields["max_beam_width"].default
-        self.free_gpu_memory_fraction: Optional[float] = None
+        self.free_gpu_memory_fraction: float = 0.9
        self.extra_engine_args: str = ""
        self.override_engine_args: str = ""
        self.publish_events_and_metrics: bool = False
@@ -176,26 +176,26 @@ def cmd_line_args():
    parser.add_argument(
        "--max-batch-size",
        type=int,
-        default=BuildConfig.max_batch_size,
+        default=BuildConfig.model_fields["max_batch_size"].default,
        help="Maximum number of requests that the engine can schedule.",
    )
    parser.add_argument(
        "--max-num-tokens",
        type=int,
-        default=BuildConfig.max_num_tokens,
+        default=BuildConfig.model_fields["max_num_tokens"].default,
        help="Maximum number of batched input tokens after padding is removed in each batch.",
    )
    parser.add_argument(
        "--max-seq-len",
        type=int,
-        default=BuildConfig.max_seq_len,
+        default=BuildConfig.model_fields["max_seq_len"].default,
        help="Maximum total length of one request, including prompt and outputs. "
        "If unspecified, the value is deduced from the model config.",
    )
    parser.add_argument(
        "--max-beam-width",
        type=int,
-        default=BuildConfig.max_beam_width,
+        default=BuildConfig.model_fields["max_beam_width"].default,
        help="Maximum number of beams for beam search decoding.",
    )
    parser.add_argument(

--- a/container/Dockerfile.trtllm
+++ b/container/Dockerfile.trtllm
@@ -2,18 +2,18 @@
 # SPDX-License-Identifier: Apache-2.0
 ARG BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
-ARG BASE_IMAGE_TAG="25.06-cuda12.9-devel-ubuntu24.04"
+ARG BASE_IMAGE_TAG="25.10-cuda13.0-devel-ubuntu24.04"
 ARG PYTORCH_BASE_IMAGE="nvcr.io/nvidia/pytorch"
-ARG PYTORCH_BASE_IMAGE_TAG="25.06-py3"
+ARG PYTORCH_BASE_IMAGE_TAG="25.10-py3"
 ARG ENABLE_KVBM=false
-ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda"
+ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda-dl-base"
-ARG RUNTIME_IMAGE_TAG="12.9.1-runtime-ubuntu24.04"
+ARG RUNTIME_IMAGE_TAG="25.10-cuda13.0-runtime-ubuntu24.04"
 # TensorRT-LLM specific configuration
 ARG HAS_TRTLLM_CONTEXT=0
 ARG TENSORRTLLM_PIP_WHEEL="tensorrt-llm"
-ARG TENSORRTLLM_INDEX_URL="https://pypi.python.org/simple"
+ARG TENSORRTLLM_INDEX_URL="https://pypi.nvidia.com/"
 ARG GITHUB_TRTLLM_COMMIT
 # Define general architecture ARGs for supporting both x86 and aarch64 builds.
@@ -72,6 +72,7 @@ RUN apt-get update && \
        git \
        git-lfs \
        ca-certificates && \
+    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
 # Copy uv
@@ -82,16 +83,12 @@ RUN mkdir -p /opt/dynamo/venv && \
    uv venv /opt/dynamo/venv --python $PYTHON_VERSION
 # Copy pytorch installation from NGC PyTorch
-ARG TORCH_VER=2.8.0a0+5228986c39.nv25.6
+ARG TORCH_VER=2.9.0a0+145a3a7bda.nv25.10
-ARG TORCHVISION_VER=0.22.0a0+95f10a4e
+ARG TORCH_TENSORRT_VER=2.9.0a0
-ARG SETUPTOOLS_VER=78.1.1
+ARG TORCHVISION_VER=0.24.0a0+094e7af5
-ARG PYTORCH_TRITON_VER=3.3.0+git96316ce52.nvinternal
 ARG JINJA2_VER=3.1.6
-ARG NETWORKX_VER=3.5
 ARG SYMPY_VER=1.14.0
-ARG PACKAGING_VER=23.2
+ARG FLASH_ATTN_VER=2.7.4.post1+25.10
-ARG FLASH_ATTN_VER=2.7.4.post1
-ARG MPMATH_VER=1.3.0
 COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torch ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torch
 COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torch-${TORCH_VER}.dist-info ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torch-${TORCH_VER}.dist-info
@@ -107,8 +104,8 @@ COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/sy
 COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/flash_attn ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/flash_attn
 COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/flash_attn-${FLASH_ATTN_VER}.dist-info ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/flash_attn-${FLASH_ATTN_VER}.dist-info
 COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/flash_attn_2_cuda.cpython-*-*-linux-gnu.so ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/
-COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/triton ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/triton
+COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torch_tensorrt ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torch_tensorrt
-COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/pytorch_triton-${PYTORCH_TRITON_VER}.dist-info ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/pytorch_triton-${PYTORCH_TRITON_VER}.dist-info
+COPY --from=pytorch_base /usr/local/lib/python${PYTHON_VERSION}/dist-packages/torch_tensorrt-${TORCH_TENSORRT_VER}.dist-info ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/torch_tensorrt-${TORCH_TENSORRT_VER}.dist-info
 # Install TensorRT-LLM and related dependencies
 ARG HAS_TRTLLM_CONTEXT
@@ -120,8 +117,7 @@ ARG GITHUB_TRTLLM_COMMIT
 COPY --from=trtllm_wheel /*.whl /trtllm_wheel/
 COPY --from=trtllm_wheel /*.txt /trtllm_wheel/
-# NOTE: locking cuda-python version to <13 to avoid breaks with tensorrt-llm 1.0.0rc6.
+RUN uv pip install --no-cache "cuda-python==13.0.2"
-RUN uv pip install "cuda-python>=12,<13"
 # Note: TensorRT needs to be uninstalled before installing the TRTLLM wheel
 # because there might be mismatched versions of TensorRT between the NGC PyTorch
@@ -141,7 +137,7 @@ RUN if [ "$HAS_TRTLLM_CONTEXT" = "1" ]; then \
        # Install from local wheel directory in build context
        WHEEL_FILE="$(find /trtllm_wheel -name "*.whl" | head -n 1)"; \
        if [ -n "$WHEEL_FILE" ]; then \
-            uv pip install "$WHEEL_FILE"; \
+            uv pip install --no-cache "$WHEEL_FILE"; \
        else \
            echo "No wheel file found in /trtllm_wheel directory."; \
            exit 1; \
@@ -155,7 +151,10 @@ RUN if [ "$HAS_TRTLLM_CONTEXT" = "1" ]; then \
        sed -i 's/pip3 install/uv pip install/g' /tmp/install_tensorrt.sh && \
        bash /tmp/install_tensorrt.sh && \
        # Install TensorRT-LLM wheel from the provided index URL, allow dependencies from PyPI
-        uv pip install --extra-index-url "${TENSORRTLLM_INDEX_URL}" "${TENSORRTLLM_PIP_WHEEL}"; \
+        # TRTLLM 1.2.0rc2 has issues installing from PyPI with uv. Installing from a direct wheel link works best.
+        # Explicitly install triton 3.5.0 because trtllm only lists triton as a dependency on x86_64 for some reason.
+        export TENSORRTLLM_PIP_WHEEL="https://pypi.nvidia.com/tensorrt-llm/tensorrt_llm-1.2.0rc2-cp312-cp312-linux_${ARCH_ALT}.whl"; \
+        uv pip install --no-cache --index-strategy=unsafe-best-match --extra-index-url "${TENSORRTLLM_INDEX_URL}" "${TENSORRTLLM_PIP_WHEEL}" triton==3.5.0; \
    fi
 ##################################################
@@ -190,12 +189,27 @@ ENV VIRTUAL_ENV=/opt/dynamo/venv
 ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl
 ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib/${ARCH_ALT}-linux-gnu
 ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins
+# workaround for pickle lib issue
+ENV OMPI_MCA_coll_ucc_enable=0
+# Use UCX KVCACHE by default
+ENV TRTLLM_USE_UCX_KVCACHE=1
 ARG DYNAMO_COMMIT_SHA
 ENV DYNAMO_COMMIT_SHA=$DYNAMO_COMMIT_SHA
 # Install Python, build-essential and python3-dev as apt dependencies
-RUN apt-get update && \
+RUN if [ ${ARCH_ALT} = "x86_64" ]; then \
+        ARCH_FOR_GPG=${ARCH_ALT}; \
+    else \
+        ARCH_FOR_GPG="sbsa"; \
+    fi && \
+    curl -fsSL \
+        https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/${ARCH_FOR_GPG}/cuda-archive-keyring.gpg \
+        -o /usr/share/keyrings/cuda-archive-keyring.gpg &&\
+    echo "deb [signed-by=/usr/share/keyrings/cuda-archive-keyring.gpg] \
+        https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/${ARCH_FOR_GPG} /" \
+        | tee /etc/apt/sources.list.d/cuda.repo.list > /dev/null &&\
+    apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        # Build tools
        build-essential \
@@ -209,7 +223,8 @@ RUN apt-get update && \
        # jq for polling various endpoints and health checks
        jq \
        # CUDA/ML libraries
-        libcudnn9-cuda-12 \
+        libcudnn9-cuda-13 \
+        libnvshmem3-cuda-13 \
        # Network and communication libraries
        libzmq3-dev \
        # RDMA/UCX libraries required to find RDMA devices
@@ -228,6 +243,8 @@ RUN apt-get update && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
+ENV LD_LIBRARY_PATH="/usr/lib/${ARCH_ALT}-linux-gnu/nvshmem/13/:${LD_LIBRARY_PATH}"
 # Copy CUDA development tools (nvcc, headers, dependencies, etc.) from PyTorch base image
 COPY --from=pytorch_base /usr/local/cuda/bin/nvcc /usr/local/cuda/bin/nvcc
 COPY --from=pytorch_base /usr/local/cuda/bin/cudafe++ /usr/local/cuda/bin/cudafe++
@@ -238,6 +255,16 @@ COPY --from=pytorch_base /usr/local/cuda/nvvm /usr/local/cuda/nvvm
 COPY --from=pytorch_base /usr/local/cuda/lib64/libcudart.so* /usr/local/cuda/lib64/
 COPY --from=pytorch_base /usr/local/cuda/lib64/libcupti* /usr/local/cuda/lib64/
 COPY --from=pytorch_base /usr/local/lib/lib* /usr/local/lib/
+COPY --from=pytorch_base /usr/local/cuda/bin/cuobjdump /usr/local/cuda/bin/cuobjdump
+COPY --from=pytorch_base /usr/local/cuda/bin/nvdisasm /usr/local/cuda/bin/nvdisasm
+ENV CUDA_HOME=/usr/local/cuda \
+    TRITON_CUPTI_PATH=/usr/local/cuda/include \
+    TRITON_CUDACRT_PATH=/usr/local/cuda/include \
+    TRITON_CUOBJDUMP_PATH=/usr/local/cuda/bin/cuobjdump \
+    TRITON_NVDISASM_PATH=/usr/local/cuda/bin/nvdisasm \
+    TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas \
+    TRITON_CUDART_PATH=/usr/local/cuda/include
 # Copy nats and etcd from dynamo_base image
 COPY --from=dynamo_base /usr/bin/nats-server /usr/bin/nats-server
@@ -255,8 +282,6 @@ COPY --from=pytorch_base /opt/hpcx /opt/hpcx
 # This is needed to make libucc.so visible so pytorch can use it.
 ENV LD_LIBRARY_PATH="/opt/hpcx/ucc/lib:${LD_LIBRARY_PATH}"
 # Might not need to copy cusparseLt in the future once it's included in DLFW cuda container
-# networkx, packaging, setuptools get overridden by trtllm installation, so not copying them
-# pytorch-triton is copied after trtllm installation.
 COPY --from=pytorch_base /usr/local/cuda/lib64/libcusparseLt* /usr/local/cuda/lib64/
 # Copy uv to system /bin
@@ -274,6 +299,7 @@ RUN userdel -r ubuntu > /dev/null 2>&1 || true \
    && chown -R dynamo: /workspace /home/dynamo /opt/dynamo \
    && chmod -R g+w /workspace /home/dynamo/.cache /opt/dynamo
 # Switch to dynamo user
 USER dynamo
 ENV HOME=/home/dynamo
@@ -299,17 +325,18 @@ ENV OPAL_PREFIX=/opt/hpcx/ompi
 COPY --chown=dynamo: --from=framework ${VIRTUAL_ENV} ${VIRTUAL_ENV}
 ENV TENSORRT_LIB_DIR=/usr/local/tensorrt/targets/${ARCH_ALT}-linux-gnu/lib
-ENV LD_LIBRARY_PATH=${TENSORRT_LIB_DIR}:${LD_LIBRARY_PATH}
+ENV LD_LIBRARY_PATH=/opt/dynamo/venv/lib/python3.12/site-packages/torch/lib:/opt/dynamo/venv/lib/python3.12/site-packages/torch_tensorrt/lib:${TENSORRT_LIB_DIR}:${LD_LIBRARY_PATH}
 # Install dynamo, NIXL, and dynamo-specific dependencies
 COPY --chown=dynamo: benchmarks/ /opt/dynamo/benchmarks/
 COPY --chown=dynamo: --from=dynamo_base /opt/dynamo/wheelhouse/ /opt/dynamo/wheelhouse/
 RUN uv pip install \
+      --no-cache \
      /opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
      /opt/dynamo/wheelhouse/ai_dynamo*any.whl \
      /opt/dynamo/wheelhouse/nixl/nixl*.whl \
    && if [ "${ENABLE_KVBM}" = "true" ]; then \
-        uv pip install /opt/dynamo/wheelhouse/kvbm*.whl; \
+        uv pip install --no-cache /opt/dynamo/wheelhouse/kvbm*.whl; \
       fi \
    && cd /opt/dynamo/benchmarks \
    && UV_GIT_LFS=1 uv pip install --no-cache . \
@@ -321,8 +348,11 @@ RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requi
    --mount=type=bind,source=./container/deps/requirements.test.txt,target=/tmp/requirements.test.txt \
    UV_GIT_LFS=1 uv pip install \
        --no-cache \
+        --index-strategy unsafe-best-match \
+        --extra-index-url https://download.pytorch.org/whl/cu130 \
        --requirement /tmp/requirements.txt \
-        --requirement /tmp/requirements.test.txt
+        --requirement /tmp/requirements.test.txt \
+        cupy-cuda13x
 # Copy tests, benchmarks, deploy and components for CI with correct ownership
 COPY --chown=dynamo: tests /workspace/tests
@@ -346,7 +376,6 @@ RUN chmod 755 /opt/dynamo/.launch_screen && \
    echo 'cat /opt/dynamo/.launch_screen' >> /etc/bash.bashrc
 USER dynamo
 ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
 CMD []
@@ -397,6 +426,7 @@ RUN apt-get update -y && \
        clang \
        libclang-dev \
        protobuf-compiler && \
+    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
 # Set workspace directory variable
@@ -412,10 +442,10 @@ COPY --from=dynamo_base /usr/local/rustup /usr/local/rustup
 COPY --from=dynamo_base /usr/local/cargo /usr/local/cargo
 # Install maturin, for maturin develop
-RUN uv pip install maturin[patchelf]
+RUN uv pip install --no-cache maturin[patchelf]
 # Editable install of dynamo
 COPY pyproject.toml README.md hatch_build.py /workspace/
-RUN uv pip install --no-deps -e .
+RUN uv pip install --no-cache --no-deps -e .
 CMD []
--- a/container/build.sh
+++ b/container/build.sh
@@ -59,7 +59,7 @@ BUILD_CONTEXT=$(dirname "$(readlink -f "$SOURCE_DIR")")
 # Base Images
 TRTLLM_BASE_IMAGE=nvcr.io/nvidia/pytorch
-TRTLLM_BASE_IMAGE_TAG=25.06-py3
+TRTLLM_BASE_IMAGE_TAG=25.10-py3
 # Important Note: Because of ABI compatibility issues between TensorRT-LLM and NGC PyTorch,
 # we need to build the TensorRT-LLM wheel from source.
@@ -89,19 +89,18 @@ DEFAULT_TENSORRTLLM_PIP_WHEEL_DIR="/tmp/trtllm_wheel/"
 # TensorRT-LLM commit to use for building the trtllm wheel if not provided.
 # Important Note: This commit is not used in our CI pipeline. See the CI
 # variables to learn how to run a pipeline with a specific commit.
-DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT="0c9430e5a530ba958fc9dca561a3ad865ad9f492"
+DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT="31116825b39f4e6a6a1e127001f5204b73d1dc32" # 1.2.0rc2
 TRTLLM_COMMIT=""
 TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL="0"
 TRTLLM_GIT_URL=""
 # TensorRT-LLM PyPI index URL
-DEFAULT_TENSORRTLLM_INDEX_URL="https://pypi.python.org/simple"
+DEFAULT_TENSORRTLLM_INDEX_URL="https://pypi.nvidia.com/"
 # TODO: Remove the version specification from here and use the ai-dynamo[trtllm] package.
 # Need to update the Dockerfile.trtllm to use the ai-dynamo[trtllm] package.
-DEFAULT_TENSORRTLLM_PIP_WHEEL="tensorrt-llm==1.1.0rc5"
+DEFAULT_TENSORRTLLM_PIP_WHEEL="tensorrt-llm==1.2.0rc2"
 TENSORRTLLM_PIP_WHEEL=""
 VLLM_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
 # FIXME: NCCL will hang with 25.03, so use 25.01 for now
 # Please check https://github.com/ai-dynamo/dynamo/pull/1065

--- a/container/deps/trtllm/install_nixl.sh
+++ b/container/deps/trtllm/install_nixl.sh
@@ -23,11 +23,11 @@ set -ex
 GITHUB_URL="https://github.com"
-UCX_VERSION="v1.18.1"
+UCX_VERSION="v1.19.1"
 UCX_INSTALL_PATH="/usr/local/ucx/"
 CUDA_PATH="/usr/local/cuda"
-NIXL_COMMIT="16348080f5bdeb9fe6058a23be140cec020ef3f3"
+NIXL_COMMIT="97c9b5b48e2ed3f1f2539c461c4971a7db8b1197"
 UCX_REPO="https://github.com/openucx/ucx.git"
 NIXL_REPO="https://github.com/ai-dynamo/nixl.git"

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -49,7 +49,7 @@ Repository = "https://github.com/ai-dynamo/dynamo.git"
 [project.optional-dependencies]
 trtllm =[
    "uvloop",
-    "tensorrt-llm==1.1.0rc5",
+    "tensorrt-llm==1.2.0rc2",
 ]
 vllm = [