chore: TRTLLM 1.2.0rc5 (#4836)

Signed-off-by: Dmitry Tokarev <dtokarev@nvidia.com>

chore: TRTLLM 1.2.0rc5 (#4836)
Signed-off-by: Dmitry Tokarev <dtokarev@nvidia.com>
Commit 52503032 (signature unverified), authored by Dmitry Tokarev on Dec 10, 2025 and committed by GitHub on Dec 11, 2025.
Related commit reference: 00f8615e (presumably the parent commit).
8 changed files
--- a/benchmarks/pyproject.toml
+++ b/benchmarks/pyproject.toml
@@ -46,7 +46,7 @@ dependencies = [
    "pydantic>=2",
    "tabulate",
    "types-tabulate",
-    # Satisfies vLLM 0.11.0 (>=4.55.2), vLLM 0.11.2 (>=4.56.0,<5), TRT-LLM 1.2.0rc2/rc3 (==4.56.0), SGLang 0.5.6 (==4.57.1)
+    # Satisfies vLLM 0.11.0 (>=4.55.2), vLLM 0.11.2 (>=4.56.0,<5), TRT-LLM 1.2.0rc5 (==4.56.0), SGLang 0.5.6 (==4.57.1)
    "transformers>=4.56.0,<=4.57.1",
    "pytest-mypy",
 ]

--- a/components/src/dynamo/trtllm/main.py
+++ b/components/src/dynamo/trtllm/main.py
@@ -22,7 +22,6 @@ if "TLLM_LOG_LEVEL" not in os.environ and os.getenv(
 import uvloop
 from prometheus_client import REGISTRY
 from tensorrt_llm.llmapi import (
-    BuildConfig,
    CapacitySchedulerPolicy,
    DynamicBatchConfig,
    KvCacheConfig,
@@ -162,13 +161,6 @@ async def init(runtime: DistributedRuntime, config: Config):
    else:
        gpus_per_node = config.gpus_per_node
-    build_config = BuildConfig(
-        max_batch_size=config.max_batch_size,
-        max_num_tokens=config.max_num_tokens,
-        max_beam_width=config.max_beam_width,
-        max_seq_len=config.max_seq_len,
-    )
    kv_cache_config = KvCacheConfig(
        free_gpu_memory_fraction=config.free_gpu_memory_fraction
    )
@@ -190,7 +182,6 @@ async def init(runtime: DistributedRuntime, config: Config):
        "pipeline_parallel_size": config.pipeline_parallel_size,
        "moe_expert_parallel_size": config.expert_parallel_size,
        "backend": Backend.PYTORCH,
-        "build_config": build_config,
        "kv_cache_config": kv_cache_config,
        "gpus_per_node": gpus_per_node,
        "max_num_tokens": config.max_num_tokens,

--- a/container/Dockerfile.trtllm
+++ b/container/Dockerfile.trtllm
@@ -314,6 +314,7 @@ COPY components/ /opt/dynamo/components/
 # Build dynamo wheels
 ARG ENABLE_KVBM
+ARG USE_SCCACHE
 RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
    --mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
    export SCCACHE_S3_KEY_PREFIX=${SCCACHE_S3_KEY_PREFIX:-${ARCH}} && \
@@ -453,7 +454,7 @@ RUN if [ "$HAS_TRTLLM_CONTEXT" = "1" ]; then \
        sed -i 's/pip3 install/uv pip install/g' /tmp/install_tensorrt.sh && \
        bash /tmp/install_tensorrt.sh && \
        # Install TensorRT-LLM wheel from the provided index URL, allow dependencies from PyPI
-        # TRTLLM 1.2.0rc2 has issues installing from pypi with uv, installing from direct wheel link works best
+        # TRTLLM 1.2.0rc5 has issues installing from pypi with uv, installing from direct wheel link works best
        # explicitly installing triton 3.5.0 as trtllm only lists triton as dependency on x86_64 for some reason
        if echo "${TENSORRTLLM_PIP_WHEEL}" | grep -q '^tensorrt-llm=='; then \
            TRTLLM_VERSION=$(echo "${TENSORRTLLM_PIP_WHEEL}" | sed -E 's/tensorrt-llm==([0-9a-zA-Z.+-]+).*/\1/'); \

--- a/container/build.sh
+++ b/container/build.sh
@@ -89,7 +89,7 @@ DEFAULT_TENSORRTLLM_PIP_WHEEL_DIR="/tmp/trtllm_wheel/"
 # TensorRT-LLM commit to use for building the trtllm wheel if not provided.
 # Important Note: This commit is not used in our CI pipeline. See the CI
 # variables to learn how to run a pipeline with a specific commit.
-DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT="31116825b39f4e6a6a1e127001f5204b73d1dc32" # 1.2.0rc2
+DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT="e4c707845ff58fcc0b1d87afb4dd0e64885c780a" # 1.2.0rc5
 TRTLLM_COMMIT=""
 TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL="0"
 TRTLLM_GIT_URL=""
@@ -98,7 +98,7 @@ TRTLLM_GIT_URL=""
 DEFAULT_TENSORRTLLM_INDEX_URL="https://pypi.nvidia.com/"
 # TODO: Remove the version specification from here and use the ai-dynamo[trtllm] package.
 # Need to update the Dockerfile.trtllm to use the ai-dynamo[trtllm] package.
-DEFAULT_TENSORRTLLM_PIP_WHEEL="tensorrt-llm==1.2.0rc3"
+DEFAULT_TENSORRTLLM_PIP_WHEEL="tensorrt-llm==1.2.0rc5"
 TENSORRTLLM_PIP_WHEEL=""
 VLLM_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"

--- a/container/deps/requirements.txt
+++ b/container/deps/requirements.txt
@@ -52,7 +52,7 @@ tensorboard==2.19.0
 tensorboardX==2.6.2.2
 # Transformers version constraint for container builds
 # - vLLM 0.11.0: >=4.55.2, vLLM 0.11.2: >=4.56.0,<5
-# - TensorRT-LLM 1.2.0rc2/rc3: ==4.56.0
+# - TensorRT-LLM 1.2.0rc5: ==4.56.0
 # - SGLang 0.5.6: ==4.57.1
 # Using >=4.56.0 and <=4.57.1 to satisfy all frameworks
 transformers>=4.56.0,<=4.57.1

--- a/docs/backends/trtllm/multimodal_support.md
+++ b/docs/backends/trtllm/multimodal_support.md
@@ -96,15 +96,6 @@ To deploy `Llama-4-Maverick-17B-128E-Instruct` in disaggregated mode, you will n
 For high-performance multimodal inference, Dynamo supports pre-computed embeddings with an **Encode-Prefill-Decode (EPD)** flow using **NIXL (RDMA)** for zero-copy tensor transfer.
-### Enabling the Feature
-This is an experimental feature that requires using a specific TensorRT-LLM commit.
-To enable it build the dynamo container with the `--tensorrtllm-commit` flag:
-```bash
-./container/build.sh --framework trtllm --tensorrtllm-git-url https://github.com/NVIDIA/TensorRT-LLM.git --tensorrtllm-commit v1.2.0rc3
-```
 ### Supported File Types
 - `.pt` - PyTorch tensor files

--- a/docs/reference/support-matrix.md
+++ b/docs/reference/support-matrix.md
@@ -59,9 +59,9 @@ If you are using a **GPU**, the following GPU models and architectures are suppo
 ### Build Dependency
 | **Build Dependency** | **Version as of Dynamo v0.7.0** |
-| :------------------- | :------------------------------------------------------------------------------- |
+| :------------------- | :------------------------------ |
 | **SGLang**           | 0.5.3.post4                     |
-| **TensorRT-LLM**     | 1.2.0rc2                                                                         |
+| **TensorRT-LLM**     | 1.2.0rc5                        |
 | **vLLM**             | 0.11.0                          |
 | **NIXL**             | 0.7.1                           |

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -50,7 +50,7 @@ Repository = "https://github.com/ai-dynamo/dynamo.git"
 [project.optional-dependencies]
 trtllm =[
    "uvloop",
-    "tensorrt-llm==1.2.0rc3",
+    "tensorrt-llm==1.2.0rc5",
 ]
 vllm = [