chore: upgrade tensorrt-llm to 1.3.0rc8 (#7504)

Signed-off-by: Yuewei Na <nv-yna@users.noreply.github.com>
Co-authored-by: Yuewei Na <nv-yna@users.noreply.github.com>

chore: upgrade tensorrt-llm to 1.3.0rc8 (#7504)
Signed-off-by: Yuewei Na <nv-yna@users.noreply.github.com>
Co-authored-by: Yuewei Na <nv-yna@users.noreply.github.com>
ba3aef8a · Yuewei Na · GitHub · f2fd3a16 · ba3aef8a · ba3aef8a
Unverified Commit ba3aef8a authored Mar 19, 2026 by Yuewei Na Committed by GitHub Mar 19, 2026
8 changed files
--- a/benchmarks/pyproject.toml
+++ b/benchmarks/pyproject.toml
@@ -48,7 +48,7 @@ dependencies = [
    "pandas",
    "pydantic>=2",
    "tabulate",
-    # Satisfies vLLM 0.11.0 (>=4.55.2), vLLM 0.11.2 (>=4.56.0,<5), TRT-LLM 1.3.0rc7 (==4.57.1), SGLang 0.5.8 (==4.57.1)
+    # Satisfies vLLM 0.11.0 (>=4.55.2), vLLM 0.11.2 (>=4.56.0,<5), TRT-LLM 1.3.0rc8 (==4.57.1), SGLang 0.5.8 (==4.57.1)
    "transformers>=4.56.0",
 ]


--- a/components/src/dynamo/trtllm/configs/diffusion_config.py
+++ b/components/src/dynamo/trtllm/configs/diffusion_config.py
@@ -6,8 +6,10 @@
 This module defines the DiffusionConfig dataclass used for configuring
 video and image diffusion workers.

-Fields map to TensorRT-LLM's DiffusionArgs sub-configs:
- PipelineConfig: torch_compile, CUDA graph, warmup, offloading, fuse_qkv
+Fields map to TensorRT-LLM's VisualGenArgs sub-configs:
+- PipelineConfig: offloading, fuse_qkv, NVTX markers
+- TorchCompileConfig: torch_compile, fullgraph
+- CudaGraphConfig: CUDA graph capture
 - AttentionConfig: attention backend (VANILLA, TRTLLM)
 - ParallelConfig: dit_*_size parallelism dimensions
 - TeaCacheConfig: caching optimization
@@ -86,7 +88,7 @@ class DiffusionConfig:
    # Attention backend: "VANILLA" (PyTorch SDPA) or "TRTLLM"
    attn_backend: str = "VANILLA"

-    # ── Quantization config (maps to DiffusionArgs.quant_config) ──
+    # ── Quantization config (maps to VisualGenArgs.quant_config) ──
    # Quantization algorithm. Options:
    #   None (no quantization), "FP8", "FP8_BLOCK_SCALES", "NVFP4",
    #   "W4A16_AWQ", "W4A8_AWQ", "W8A8_SQ_PER_CHANNEL"

--- a/components/src/dynamo/trtllm/engines/diffusion_engine.py
+++ b/components/src/dynamo/trtllm/engines/diffusion_engine.py
@@ -30,7 +30,7 @@ from typing import TYPE_CHECKING, Optional
 import torch

 if TYPE_CHECKING:
-    from tensorrt_llm._torch.visual_gen import DiffusionArgs
+    from tensorrt_llm._torch.visual_gen import VisualGenArgs
    from tensorrt_llm._torch.visual_gen.output import MediaOutput
    from tensorrt_llm._torch.visual_gen.pipeline import BasePipeline

@@ -71,7 +71,7 @@ class DiffusionEngine:
    The old visual_gen standalone package (setup_configs + from_pretrained +
    PIPELINE_REGISTRY) has been replaced by TensorRT-LLM's integrated
    visual_gen module which uses:
-    - DiffusionArgs for configuration
+    - VisualGenArgs for configuration
    - PipelineLoader for model loading (handles MetaInit, weight loading,
      quantization, torch.compile, and warmup)
    - AutoPipeline for pipeline type auto-detection
@@ -117,12 +117,12 @@ class DiffusionEngine:
        # Import TensorRT-LLM visual_gen components
        from tensorrt_llm._torch.visual_gen import PipelineLoader

-        # Build DiffusionArgs from DiffusionConfig
+        # Build VisualGenArgs from DiffusionConfig
        diffusion_args = self._build_diffusion_args()
-        logger.info(f"DiffusionArgs: {diffusion_args}")
+        logger.info(f"VisualGenArgs: {diffusion_args}")

        # Use PipelineLoader for the full loading flow:
-        #   DiffusionArgs → DiffusionModelConfig → AutoPipeline → BasePipeline
+        #   VisualGenArgs → DiffusionModelConfig → AutoPipeline → BasePipeline
        loader = PipelineLoader(diffusion_args)
        self._pipeline = loader.load()

@@ -132,26 +132,29 @@ class DiffusionEngine:
            f"{self._pipeline.__class__.__name__}"
        )

-    def _build_diffusion_args(self) -> "DiffusionArgs":
-        """Build DiffusionArgs from DiffusionConfig.
+    def _build_diffusion_args(self) -> "VisualGenArgs":
+        """Build VisualGenArgs from DiffusionConfig.

-        Maps dynamo's DiffusionConfig fields to TensorRT-LLM's DiffusionArgs
-        structure with its nested sub-configs (PipelineConfig, AttentionConfig,
-        ParallelConfig, TeaCacheConfig, quant_config).
+        Maps dynamo's DiffusionConfig fields to TensorRT-LLM's VisualGenArgs
+        structure with its nested sub-configs (PipelineConfig, TorchCompileConfig,
+        CudaGraphConfig, AttentionConfig, ParallelConfig, TeaCacheConfig,
+        quant_config).

        Returns:
-            DiffusionArgs instance for PipelineLoader.
+            VisualGenArgs instance for PipelineLoader.
        """
        from tensorrt_llm._torch.visual_gen import (
-            DiffusionArgs,
+            CudaGraphConfig,
            ParallelConfig,
            PipelineConfig,
            TeaCacheConfig,
+            TorchCompileConfig,
+            VisualGenArgs,
        )
        from tensorrt_llm._torch.visual_gen.config import AttentionConfig

        # Build quant_config dict if quantization is requested
-        # DiffusionArgs accepts a dict in ModelOpt format and parses it via model_validator
+        # VisualGenArgs accepts a dict in ModelOpt format and parses it via model_validator
        quant_config: dict | None = None
        if self.config.quant_algo:
            quant_config = {
@@ -164,16 +167,19 @@ class DiffusionEngine:
            device=self.device,
            dtype=self.config.torch_dtype,
            skip_components=self.config.skip_components,
+            skip_warmup=(self.config.warmup_steps == 0),
            pipeline=PipelineConfig(
-                enable_torch_compile=not self.config.disable_torch_compile,
-                torch_compile_mode=self.config.torch_compile_mode,
-                enable_fullgraph=self.config.enable_fullgraph,
                fuse_qkv=self.config.fuse_qkv,
-                enable_cuda_graph=self.config.enable_cuda_graph,
                enable_layerwise_nvtx_marker=self.config.enable_layerwise_nvtx_marker,
-                warmup_steps=self.config.warmup_steps,
                enable_offloading=self.config.enable_async_cpu_offload,
            ),
+            torch_compile=TorchCompileConfig(
+                enable_torch_compile=not self.config.disable_torch_compile,
+                enable_fullgraph=self.config.enable_fullgraph,
+            ),
+            cuda_graph=CudaGraphConfig(
+                enable_cuda_graph=self.config.enable_cuda_graph,
+            ),
            attention=AttentionConfig(
                backend=self.config.attn_backend.upper(),
            ),
@@ -198,7 +204,7 @@ class DiffusionEngine:
        if quant_config is not None:
            args_kwargs["quant_config"] = quant_config

-        return DiffusionArgs(**args_kwargs)
+        return VisualGenArgs(**args_kwargs)

    def generate(
        self,

--- a/container/context.yaml
+++ b/container/context.yaml
@@ -96,10 +96,10 @@ trtllm:
  python_version: "3.12"
  index_url: https://pypi.nvidia.com/
  pip_wheel_dir: /tmp/trtllm_wheel/
-  pip_wheel: tensorrt-llm==1.3.0rc7
+  pip_wheel: tensorrt-llm==1.3.0rc8
  trtllm_wheel_image: nvcr.io/nvidia/tensorrt-llm/release:${TENSORRTLLM_PIP_WHEEL#*==}

-  github_trtllm_commit: v1.3.0rc7
+  github_trtllm_commit: v1.3.0rc8
  torch_version: 2.10.0a0+b4e4ee81d3.nv25.12
  torch_tensorrt_version: 2.10.0a0
  torchvision_version: 0.25.0a0+ca221243

--- a/container/deps/requirements.common.txt
+++ b/container/deps/requirements.common.txt
@@ -22,7 +22,7 @@ tensorboard>=2.19.0,<2.21.0
 tensorboardX==2.6.2.2
 # Transformers version constraint for container builds
 # - vLLM 0.11.0: >=4.55.2, vLLM 0.11.2: >=4.56.0,<5
-# - TensorRT-LLM 1.3.0rc7: ==4.57.1
+# - TensorRT-LLM 1.3.0rc8: ==4.57.1
 # - SGLang 0.5.8: ==4.57.1
 # Using >=4.56.0 to satisfy all frameworks
 transformers>=4.56.0

--- a/docs/reference/support-matrix.md
+++ b/docs/reference/support-matrix.md
@@ -29,7 +29,7 @@ The following table shows the backend framework versions included with each Dyna

 | **Dynamo** | **SGLang** | **TensorRT-LLM** | **vLLM** | **NIXL** |
 | :--- | :--- | :--- | :--- | :--- |
-| **main (ToT)** | `0.5.9` | `1.3.0rc7` | `0.17.1` | `0.10.1` |
+| **main (ToT)** | `0.5.9` | `1.3.0rc8` | `0.17.1` | `0.10.1` |
 | **v1.1.0-dev.1** *(experimental)* | `0.5.9` | `1.3.0rc5.post1` | `0.17.1` | `0.10.1` |
 | **v1.0.1** | `0.5.9` | `1.3.0rc5.post1` | `0.16.0` | `0.10.1` |
 | **v1.0.0** | `0.5.9` | `1.3.0rc5.post1` | `0.16.0` | `0.10.1` |

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -44,7 +44,7 @@ Repository = "https://github.com/ai-dynamo/dynamo.git"
 [project.optional-dependencies]
 trtllm =[
    "uvloop",
-    "tensorrt-llm==1.3.0rc7",
+    "tensorrt-llm==1.3.0rc8",
 ]

 vllm = [

--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -168,6 +168,84 @@ def download_models(model_list=None, ignore_weights=False):
        )


+def _enable_offline_with_mistral_patch():
+    """Set HF_HUB_OFFLINE=1 and work around a transformers 4.57.3 regression.
+
+    transformers 4.57.3 (PR #42389) introduced _patch_mistral_regex which calls
+    huggingface_hub.model_info() unconditionally for every tokenizer load — even
+    non-Mistral models with fully cached weights. This API call fails when
+    HF_HUB_OFFLINE=1.
+
+    Since tests launch TRT-LLM workers as subprocesses that inherit env vars but
+    not in-process monkey-patches, we inject the fix via a sitecustomize.py on
+    PYTHONPATH so every subprocess auto-applies it at startup.
+
+    The function is safe to call more than once (several session fixtures call
+    it): the in-process wrapper is installed only once, and the patch directory
+    is prepended to PYTHONPATH only when it is not already listed there.
+
+    If transformers / huggingface_hub cannot be imported, or the installed
+    transformers has no _patch_mistral_regex, only HF_HUB_OFFLINE is set and
+    neither the sitecustomize.py file nor PYTHONPATH is touched.
+
+    Upstream bug: https://github.com/huggingface/transformers/issues/44843
+
+    TODO: Remove this workaround once transformers ships a fix and TRT-LLM (or
+    any other dependency) upgrades to that fixed version.
+    """
+    os.environ["HF_HUB_OFFLINE"] = "1"
+
+    # Apply the patch in this process
+    try:
+        from huggingface_hub.errors import OfflineModeIsEnabled
+        from transformers.tokenization_utils_base import PreTrainedTokenizerBase
+
+        original = PreTrainedTokenizerBase._patch_mistral_regex
+
+        # Attribute lookups on a bound method fall through to the underlying
+        # function, so the marker set below is visible here on later calls.
+        # Skipping the re-wrap avoids stacking one wrapper per fixture.
+        if not getattr(original, "_dynamo_offline_safe", False):
+
+            @classmethod  # type: ignore[misc]
+            def _safe_patch(cls, tokenizer, *args, **kwargs):
+                try:
+                    return original.__func__(cls, tokenizer, *args, **kwargs)
+                except OfflineModeIsEnabled:
+                    # Offline: skip the hub lookup, keep the tokenizer as loaded.
+                    return tokenizer
+
+            _safe_patch.__func__._dynamo_offline_safe = True
+            PreTrainedTokenizerBase._patch_mistral_regex = _safe_patch
+    except (ImportError, AttributeError):
+        return  # transformers version without _patch_mistral_regex — nothing to do
+
+    # Write a sitecustomize.py so subprocesses also get the patch
+    # NOTE(review): Python imports only the first sitecustomize module found on
+    # sys.path, so this file presumably shadows any sitecustomize the
+    # environment already provides — confirm that is acceptable for the tests.
+    patch_dir = os.path.join(tempfile.gettempdir(), "dynamo_test_hf_patch")
+    os.makedirs(patch_dir, exist_ok=True)
+    with open(
+        os.path.join(patch_dir, "sitecustomize.py"), "w", encoding="utf-8"
+    ) as f:
+        f.write(
+            "import os\n"
+            "if os.environ.get('HF_HUB_OFFLINE') == '1':\n"
+            "    try:\n"
+            "        from transformers.tokenization_utils_base import"
+            " PreTrainedTokenizerBase as _T\n"
+            "        from huggingface_hub.errors import"
+            " OfflineModeIsEnabled as _E\n"
+            "        _orig = _T._patch_mistral_regex\n"
+            "        @classmethod\n"
+            "        def _safe(cls, tokenizer, *a, **kw):\n"
+            "            try:\n"
+            "                return _orig.__func__(cls, tokenizer, *a, **kw)\n"
+            "            except _E:\n"
+            "                return tokenizer\n"
+            "        _T._patch_mistral_regex = _safe\n"
+            "    except (ImportError, AttributeError):\n"
+            "        pass\n"
+        )
+    pythonpath = os.environ.get("PYTHONPATH", "")
+    # Compare whole entries so a repeated call does not add a duplicate.
+    if patch_dir not in pythonpath.split(":"):
+        os.environ["PYTHONPATH"] = (
+            f"{patch_dir}:{pythonpath}" if pythonpath else patch_dir
+        )
+    logging.info(
+        "Enabled HF_HUB_OFFLINE with _patch_mistral_regex workaround "
+        "(see https://github.com/huggingface/transformers/issues/44843)"
+    )
+
+
+def _disable_offline_with_mistral_patch():
+    """Undo the environment changes made by _enable_offline_with_mistral_patch.
+
+    Removes HF_HUB_OFFLINE and drops the patch directory from PYTHONPATH.
+    Entries are compared as whole ":"-separated paths, so other entries that
+    merely contain the patch directory as a substring are left intact. When no
+    entry remains, PYTHONPATH is removed rather than left as an empty string.
+
+    Only environment variables are restored: the in-process wrapper around
+    _patch_mistral_regex and the generated sitecustomize.py file are not
+    touched by this function.
+    """
+    os.environ.pop("HF_HUB_OFFLINE", None)
+    patch_dir = os.path.join(tempfile.gettempdir(), "dynamo_test_hf_patch")
+    pythonpath = os.environ.get("PYTHONPATH")
+    if pythonpath is None:
+        return  # nothing was exported, so there is nothing to strip
+
+    # ":" matches the separator the enable helper uses when prepending.
+    remaining = [entry for entry in pythonpath.split(":") if entry != patch_dir]
+    if remaining:
+        os.environ["PYTHONPATH"] = ":".join(remaining)
+    else:
+        os.environ.pop("PYTHONPATH", None)
+
+
 @pytest.fixture(scope="session")
 def predownload_models(pytestconfig):
    """Fixture wrapper around download_models for models used in collected tests"""
@@ -182,9 +260,9 @@ def predownload_models(pytestconfig):
        # Fallback to original behavior if extraction failed
        download_models()

-    os.environ["HF_HUB_OFFLINE"] = "1"
+    _enable_offline_with_mistral_patch()
    yield
-    os.environ.pop("HF_HUB_OFFLINE", None)
+    _disable_offline_with_mistral_patch()


 @pytest.fixture(scope="session")
@@ -204,9 +282,9 @@ def predownload_tokenizers(pytestconfig):
    # Skip redundant HuggingFace API calls in worker subprocesses since
    # tokenizers are already cached. This avoids flaky timeouts from slow
    # HF API responses (the RepoInfo fetch still happens even for cached models).
-    os.environ["HF_HUB_OFFLINE"] = "1"
+    _enable_offline_with_mistral_patch()
    yield
-    os.environ.pop("HF_HUB_OFFLINE", None)
+    _disable_offline_with_mistral_patch()


 @pytest.fixture(autouse=True)