chore: Update trtllm version to 1.1.0rc3 (#2930)

Signed-off-by: Indrajit Bhosale <iamindrajitb@gmail.com>

chore: Update trtllm version to 1.1.0rc3 (#2930)
Signed-off-by: Indrajit Bhosale <iamindrajitb@gmail.com>
e8cb972e · Indrajit Bhosale · GitHub · 241bd014 · e8cb972e · e8cb972e
Unverified commit e8cb972e, authored Sep 09, 2025 by Indrajit Bhosale; committed by GitHub on Sep 09, 2025.
8 changed files
--- a/components/backends/trtllm/engine_configs/prefill.yaml
+++ b/components/backends/trtllm/engine_configs/prefill.yaml
@@ -27,4 +27,4 @@ kv_cache_config:
  free_gpu_memory_fraction: 0.85
 cache_transceiver_config:
-  backend: default
+  backend: DEFAULT
\ No newline at end of file
--- a/components/backends/trtllm/multimodal_support.md
+++ b/components/backends/trtllm/multimodal_support.md
@@ -14,24 +14,6 @@ limitations under the License.
 # Multimodal Support
-> [!Important]
-> There are some known issues in tensorrt_llm==1.0.0rc6 version for multimodal support
-> It is important to rebuild the dynamo container with a specific version of tensorrt_llm
-> commit to use multimodal feature.
-## Build Container
-```bash
-./container/build.sh --framework trtllm --tensorrtllm-commit b4065d8ca64a64eee9fdc64b39cb66d73d4be47c
-```
-## Run Container
-```bash
-./container/run.sh --framework trtllm -it
-```
-## Usage Guide
 TRTLLM supports multimodal models with dynamo. You can provide multimodal inputs in the following ways:
 - By sending image URLs

--- a/components/backends/trtllm/src/dynamo/trtllm/main.py
+++ b/components/backends/trtllm/src/dynamo/trtllm/main.py
@@ -8,7 +8,6 @@ import signal
 import sys
 import uvloop
-from tensorrt_llm import SamplingParams
 from tensorrt_llm.llmapi import (
    BuildConfig,
    CapacitySchedulerPolicy,
@@ -16,6 +15,7 @@ from tensorrt_llm.llmapi import (
    KvCacheConfig,
    SchedulerConfig,
 )
+from tensorrt_llm.llmapi.llm import SamplingParams
 from tensorrt_llm.llmapi.llm_utils import update_llm_args_with_extra_options
 from tensorrt_llm.llmapi.tokenizer import tokenizer_factory
 from torch.cuda import device_count

--- a/components/backends/trtllm/src/dynamo/trtllm/request_handlers/handler_base.py
+++ b/components/backends/trtllm/src/dynamo/trtllm/request_handlers/handler_base.py
@@ -21,8 +21,8 @@ from enum import Enum
 from typing import Optional, Union
 import torch
-from tensorrt_llm import SamplingParams
 from tensorrt_llm.llmapi import DisaggregatedParams as LlmDisaggregatedParams
+from tensorrt_llm.llmapi.llm import SamplingParams
 from dynamo.logits_processing.examples import HelloWorldLogitsProcessor
 from dynamo.nixl_connect import Connector

--- a/container/Dockerfile.trtllm
+++ b/container/Dockerfile.trtllm
@@ -140,7 +140,6 @@ COPY --from=trtllm_wheel . /trtllm_wheel/
 # Note: TensorRT needs to be uninstalled before installing the TRTLLM wheel
 # because there might be mismatched versions of TensorRT between the NGC PyTorch
 # and the TRTLLM wheel.
-# Locking triton version to 3.3.1 as 3.4.0 breaks tensorrt-llm 1.0.0rc6
 RUN [ -f /etc/pip/constraint.txt ] && : > /etc/pip/constraint.txt || true && \
    pip uninstall -y tensorrt && \
    if [ "$HAS_TRTLLM_CONTEXT" = "1" ]; then \
@@ -148,9 +147,6 @@ RUN [ -f /etc/pip/constraint.txt ] && : > /etc/pip/constraint.txt || true && \
        WHEEL_FILE=$(find /trtllm_wheel -name "*.whl" | head -n 1); \
        if [ -n "$WHEEL_FILE" ]; then \
            pip install "$WHEEL_FILE"; \
-            if [ "$ARCH" = "amd64" ]; then \
-                pip install "triton==3.3.1"; \
-            fi; \
        else \
            echo "No wheel file found in /trtllm_wheel directory."; \
            exit 1; \
@@ -158,9 +154,6 @@ RUN [ -f /etc/pip/constraint.txt ] && : > /etc/pip/constraint.txt || true && \
    else \
        # Install TensorRT-LLM wheel from the provided index URL, allow dependencies from PyPI
        pip install --extra-index-url "${TENSORRTLLM_INDEX_URL}" "${TENSORRTLLM_PIP_WHEEL}"; \
-        if [ "$ARCH" = "amd64" ]; then \
-            pip install "triton==3.3.1"; \
-        fi; \
    fi
 # Install test dependencies
@@ -477,12 +470,7 @@ COPY --from=dev /workspace/target/release/metrics /usr/local/bin/metrics
 # NOTE: If a package (tensorrt_llm) exists on both --index-url and --extra-index-url,
 # uv will prioritize the --extra-index-url, unless --index-strategy unsafe-best-match
 # is also specified. So set the configurable index as a --extra-index-url for prioritization.
-# NOTE: locking triton version to 3.3.1 as 3.4.0 breaks tensorrt-llm 1.0.0rc6
+RUN python3 -m pip install --no-cache-dir --break-system-packages --extra-index-url "${TENSORRTLLM_INDEX_URL}" "${TENSORRTLLM_PIP_WHEEL}" && \
-# NOTE: locking cuda-python version to <13 to avoid breaks with tensorrt-llm 1.0.0rc6. This
-#       can be removed after https://github.com/NVIDIA/TensorRT-LLM/pull/6703 is merged
-#       we upgrade to a published pip wheel containing this change.
-RUN python3 -m pip install --no-cache-dir --break-system-packages "cuda-python>=12,<13" && \
-    python3 -m pip install --no-cache-dir --break-system-packages --extra-index-url "${TENSORRTLLM_INDEX_URL}" "${TENSORRTLLM_PIP_WHEEL}" && \
    python3 -m pip install --no-cache-dir --break-system-packages \
        /workspace/wheelhouse/ai_dynamo_runtime*cp312*.whl \
        /workspace/wheelhouse/ai_dynamo*any.whl \

--- a/container/build.sh
+++ b/container/build.sh
@@ -89,7 +89,7 @@ TENSORRTLLM_PIP_WHEEL_DIR="/tmp/trtllm_wheel/"
 # TensorRT-LLM commit to use for building the trtllm wheel if not provided.
 # Important Note: This commit is not used in our CI pipeline. See the CI
 # variables to learn how to run a pipeline with a specific commit.
-DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT="a16ba6445c61ed70e7aadfe787d6f316bb422652"
+DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT="e81c50dbd2811ec858eccc2c71b5e7a330ff7e24"
 TRTLLM_COMMIT=""
 TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL="0"
 TRTLLM_GIT_URL=""
@@ -98,7 +98,7 @@ TRTLLM_GIT_URL=""
 TENSORRTLLM_INDEX_URL="https://pypi.python.org/simple"
 # TODO: Remove the version specification from here and use the ai-dynamo[trtllm] package.
 # Need to update the Dockerfile.trtllm to use the ai-dynamo[trtllm] package.
-DEFAULT_TENSORRTLLM_PIP_WHEEL="tensorrt-llm==1.0.0rc6"
+DEFAULT_TENSORRTLLM_PIP_WHEEL="tensorrt-llm==1.1.0rc3"
 TENSORRTLLM_PIP_WHEEL=""

--- a/docs/support_matrix.md
+++ b/docs/support_matrix.md
@@ -67,7 +67,7 @@ If you are using a **GPU**, the following GPU models and architectures are suppo
 | **Build Dependency** | **Version**                                                                      |
 | :------------------- | :------------------------------------------------------------------------------- |
 | **Base Container**   | [25.03](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/cuda-dl-base/tags) |
-| **TensorRT-LLM**     | 1.0.0rc6                                                                         |
+| **TensorRT-LLM**     | 1.1.0rc3                                                                         |
 | **NIXL**             | 0.4.1                                                                            |
 | **vLLM**             | 0.10.1.1                                                                         |
 | **SGLang**           | 0.5.0rc2                                                                         |

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -48,8 +48,7 @@ Repository = "https://github.com/ai-dynamo/dynamo.git"
 [project.optional-dependencies]
 trtllm =[
    "uvloop",
-    "tensorrt-llm==1.0.0rc6",
+    "tensorrt-llm==1.1.0rc3",
-    "triton==3.3.1",  # locking triton as version 3.4.0 breaks tensorrt-llm 1.0.0rc6
 ]
 vllm = [