fix: align video diffusion pipeline with TRT-LLM 1.3.0rc9 API changes (#7529)

Signed-off-by: Indrajit Bhosale <iamindrajitb@gmail.com>

fix: align video diffusion pipeline with TRT-LLM 1.3.0rc9 API changes (#7529)
Signed-off-by: Indrajit Bhosale <iamindrajitb@gmail.com>
10fb23d2 · Indrajit Bhosale · GitHub · 8ff0b6e7 · 10fb23d2 · 10fb23d2
Unverified Commit 10fb23d2 authored Mar 31, 2026 by Indrajit Bhosale Committed by GitHub Mar 31, 2026
9 changed files
--- a/components/src/dynamo/trtllm/backend_args.py
+++ b/components/src/dynamo/trtllm/backend_args.py
@@ -329,14 +329,6 @@ class DynamoTrtllmArgGroup(ArgGroup):
            default=False,
            help="Disable torch.compile optimization.",
        )
-        add_argument(
-            diffusion_group,
-            flag_name="--torch-compile-mode",
-            env_var="DYN_TRTLLM_TORCH_COMPILE_MODE",
-            default="default",
-            choices=["default", "reduce-overhead", "max-autotune"],
-            help="torch.compile mode.",
-        )
        add_negatable_bool_argument(
            diffusion_group,
            flag_name="--enable-fullgraph",
@@ -365,13 +357,12 @@ class DynamoTrtllmArgGroup(ArgGroup):
            default=False,
            help="Enable per-layer NVTX markers for profiling with Nsight Systems.",
        )
-        add_argument(
+        add_negatable_bool_argument(
            diffusion_group,
-            flag_name="--warmup-steps",
-            env_var="DYN_TRTLLM_WARMUP_STEPS",
-            default=1,
-            arg_type=int,
-            help="Number of denoising steps to run during warmup (0 to disable).",
+            flag_name="--skip-warmup",
+            env_var="DYN_TRTLLM_SKIP_WARMUP",
+            default=False,
+            help="Skip warmup inference during initialization.",
        )
        add_argument(
            diffusion_group,
@@ -484,12 +475,11 @@ class DynamoTrtllmConfig(ConfigBase):
    quant_algo: Optional[str]
    quant_dynamic: bool
    disable_torch_compile: bool
-    torch_compile_mode: str
    enable_fullgraph: bool
    fuse_qkv: bool
    enable_cuda_graph: bool
    enable_layerwise_nvtx_marker: bool
-    warmup_steps: int
+    skip_warmup: bool
    dit_dp_size: int
    dit_tp_size: int
    dit_ulysses_size: int

--- a/components/src/dynamo/trtllm/configs/diffusion_config.py
+++ b/components/src/dynamo/trtllm/configs/diffusion_config.py
@@ -71,7 +71,6 @@ class DiffusionConfig:

    # ── Pipeline optimization config (maps to PipelineConfig) ──
    disable_torch_compile: bool = False
-    torch_compile_mode: str = "default"
    # Enable torch.compile fullgraph mode (stricter but potentially faster)
    enable_fullgraph: bool = False
    # QKV fusion for transformer attention layers
@@ -81,8 +80,8 @@ class DiffusionConfig:
    enable_cuda_graph: bool = False
    # Enable per-layer NVTX markers for profiling
    enable_layerwise_nvtx_marker: bool = False
-    # Number of denoising steps to run during warmup (0 to disable)
-    warmup_steps: int = 1
+    # Skip warmup inference during initialization (default: run warmup)
+    skip_warmup: bool = False

    # ── Attention config (maps to AttentionConfig) ──
    # Attention backend: "VANILLA" (PyTorch SDPA) or "TRTLLM"
@@ -135,7 +134,7 @@ class DiffusionConfig:
            f"attn_backend={self.attn_backend}, "
            f"quant_algo={self.quant_algo}, "
            f"enable_cuda_graph={self.enable_cuda_graph}, "
-            f"warmup_steps={self.warmup_steps}, "
+            f"skip_warmup={self.skip_warmup}, "
            f"dit_dp_size={self.dit_dp_size}, "
            f"dit_tp_size={self.dit_tp_size})"
        )
--- a/components/src/dynamo/trtllm/engines/diffusion_engine.py
+++ b/components/src/dynamo/trtllm/engines/diffusion_engine.py
@@ -124,7 +124,7 @@ class DiffusionEngine:
        # Use PipelineLoader for the full loading flow:
        #   VisualGenArgs → DiffusionModelConfig → AutoPipeline → BasePipeline
        loader = PipelineLoader(diffusion_args)
-        self._pipeline = loader.load()
+        self._pipeline = loader.load(skip_warmup=self.config.skip_warmup)

        self._initialized = True
        logger.info(
@@ -167,7 +167,7 @@ class DiffusionEngine:
            device=self.device,
            dtype=self.config.torch_dtype,
            skip_components=self.config.skip_components,
-            skip_warmup=(self.config.warmup_steps == 0),
+            skip_warmup=self.config.skip_warmup,
            pipeline=PipelineConfig(
                fuse_qkv=self.config.fuse_qkv,
                enable_layerwise_nvtx_marker=self.config.enable_layerwise_nvtx_marker,
@@ -260,7 +260,7 @@ class DiffusionEngine:

        req = DiffusionRequest(
            request_id=0,
-            prompt=prompt,
+            prompt=[prompt],
            negative_prompt=negative_prompt,
            height=height,
            width=width,

--- a/components/src/dynamo/trtllm/request_handlers/video_diffusion/video_handler.py
+++ b/components/src/dynamo/trtllm/request_handlers/video_diffusion/video_handler.py
@@ -239,8 +239,13 @@ class VideoGenerationHandler(BaseGenerativeHandler):

            # Encode media based on what the pipeline returned
            if output.video is not None:
-                # Video output: torch.Tensor (num_frames, H, W, 3) uint8 → MP4
-                frames_np = output.video.cpu().numpy()
+                # MediaOutput.video is (B, T, H, W, C) uint8 since TRT-LLM rc9;
+                # squeeze the batch dim to get (T, H, W, C) for MP4 encoding.
+                video = output.video
+                assert (
+                    video.ndim == 5 and video.shape[0] == 1
+                ), f"Expected video shape (1, T, H, W, C), got {video.shape}"
+                frames_np = video[0].cpu().numpy()
                logger.info(
                    f"Request {request_id}: encoding video output "
                    f"(shape={frames_np.shape}) to MP4 at {fps} fps"

--- a/components/src/dynamo/trtllm/tests/test_trtllm_video_diffusion.py
+++ b/components/src/dynamo/trtllm/tests/test_trtllm_video_diffusion.py
@@ -109,7 +109,7 @@ class TestDiffusionConfig:
        assert config.attn_backend == "VANILLA"
        assert config.quant_algo is None
        assert config.enable_cuda_graph is False
-        assert config.warmup_steps == 1
+        assert config.skip_warmup is False
        assert config.fuse_qkv is True

        # Parallelism defaults
@@ -484,7 +484,66 @@ class TestNvVideosResponse:


 # =============================================================================
-# Part 5: Concurrency Safety Tests
+# Part 5: DiffusionEngine Unit Tests
+# =============================================================================
+
+
+class TestDiffusionEngineGenerate:
+    """Unit tests covering how DiffusionEngine.generate() builds its request."""
+
+    def _make_engine(self):
+        """Return a pre-initialized DiffusionEngine backed by a mock pipeline.
+
+        The mock pipeline's ``infer`` returns a stand-in media output whose
+        ``video`` is a (1, 4, 64, 64, 3) uint8 tensor, so TRT-LLM is not needed.
+        """
+        from dynamo.trtllm.engines.diffusion_engine import DiffusionEngine
+
+        fake_output = SimpleNamespace(
+            video=torch.zeros((1, 4, 64, 64, 3), dtype=torch.uint8),
+            image=None,
+            audio=None,
+        )
+        mock_pipeline = MagicMock()
+        mock_pipeline.infer.return_value = fake_output
+
+        engine = DiffusionEngine(config=DiffusionConfig())
+        engine._initialized = True
+        engine._pipeline = mock_pipeline
+        return engine
+
+    def test_generate_wraps_prompt_as_list(self):
+        """generate() must hand DiffusionRequest its prompt as a List[str]."""
+        engine = self._make_engine()
+        seen_kwargs = {}
+
+        class FakeDiffusionRequest:
+            """Records its constructor kwargs and mirrors them as attributes."""
+
+            def __init__(self, **kwargs):
+                seen_kwargs.update(kwargs)
+                for name, value in kwargs.items():
+                    setattr(self, name, value)
+
+        # generate() does a function-local
+        #   from tensorrt_llm._torch.visual_gen.executor import DiffusionRequest
+        # so registering a stub module in sys.modules is enough to intercept it.
+        stub_executor = MagicMock(DiffusionRequest=FakeDiffusionRequest)
+        stub_modules = {"tensorrt_llm._torch.visual_gen.executor": stub_executor}
+        with patch.dict("sys.modules", stub_modules):
+            engine.generate(
+                prompt="a golden retriever",
+                height=64,
+                width=64,
+                num_frames=4,
+                num_inference_steps=1,
+            )
+
+        prompt_arg = seen_kwargs["prompt"]
+        assert isinstance(prompt_arg, list), f"Expected list, got {type(prompt_arg)}"
+        assert prompt_arg == ["a golden retriever"]
+
+
+# =============================================================================
+# Part 6: Concurrency Safety Tests
 # =============================================================================


@@ -539,7 +598,7 @@ class ConcurrencyTracker:

        # Return a mock MediaOutput with a video tensor
        return SimpleNamespace(
-            video=torch.zeros((4, 64, 64, 3), dtype=torch.uint8),
+            video=torch.zeros((1, 4, 64, 64, 3), dtype=torch.uint8),
            image=None,
            audio=None,
        )
@@ -672,7 +731,7 @@ class TestVideoHandlerResponseFormats:
        )

        mock_output = SimpleNamespace(
-            video=torch.zeros((4, 64, 64, 3), dtype=torch.uint8),
+            video=torch.zeros((1, 4, 64, 64, 3), dtype=torch.uint8),
            image=None,
            audio=None,
        )

--- a/components/src/dynamo/trtllm/workers/video_diffusion_worker.py
+++ b/components/src/dynamo/trtllm/workers/video_diffusion_worker.py
@@ -85,12 +85,11 @@ async def init_video_diffusion_worker(
        default_guidance_scale=config.default_guidance_scale,
        # Pipeline optimization
        disable_torch_compile=config.disable_torch_compile,
-        torch_compile_mode=config.torch_compile_mode,
        enable_fullgraph=config.enable_fullgraph,
        fuse_qkv=config.fuse_qkv,
        enable_cuda_graph=config.enable_cuda_graph,
        enable_layerwise_nvtx_marker=config.enable_layerwise_nvtx_marker,
-        warmup_steps=config.warmup_steps,
+        skip_warmup=config.skip_warmup,
        # Attention
        attn_backend=config.attn_backend,
        # Quantization

--- a/container/deps/requirements.common.txt
+++ b/container/deps/requirements.common.txt
@@ -7,6 +7,10 @@
 fastapi==0.120.1
 grpcio-tools<=1.76.0  # May have platform-specific builds; pins grpcio ecosystem version
 httpx==0.28.1
+
+# Video generation: encode frames to MP4 (used by TRT-LLM, vLLM-Omni, SGLang diffusion)
+imageio>=2.37.0
+imageio-ffmpeg>=0.6.0
 msgpack==1.1.2
 msgspec==0.19.0
 nvidia-ml-py<=13.580.65  # NVIDIA/CUDA related, may vary by driver version

--- a/examples/backends/trtllm/launch/agg_video_diffusion.sh
+++ b/examples/backends/trtllm/launch/agg_video_diffusion.sh
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Aggregated video diffusion serving with TensorRT-LLM backend.
+# Uses Wan2.1-T2V-1.3B-Diffusers by default (1 GPU).
+#
+# Usage: agg_video_diffusion.sh [-h|--help] [EXTRA_ARGS...]
+#   Every argument other than -h/--help is forwarded unchanged to the
+#   `python3 -m dynamo.trtllm` worker command at the bottom of this script.
+#
+# Environment overrides (defaults in parentheses):
+#   DYNAMO_HOME          (/workspace)
+#   MODEL_PATH           (Wan-AI/Wan2.1-T2V-1.3B-Diffusers)
+#   SERVED_MODEL_NAME    (Wan-AI/Wan2.1-T2V-1.3B-Diffusers)
+#   MEDIA_OUTPUT_FS_URL  (file:///tmp/dynamo_media)
+#   DYN_HTTP_PORT        (8000) -- port shown in the banner and example curl
+#
+# NOTE: the SERVED_MODEL_NAME default is a fixed string, not derived from
+# MODEL_PATH; set both when serving a different model.
+
+# Abort on the first failing foreground command.
+set -e
+# On any script exit, signal the whole process group (`kill 0`) so the
+# background frontend and worker started below do not outlive the script.
+trap 'echo Cleaning up...; kill 0' EXIT
+
+# Resolve this script's real directory (following symlinks) so the relative
+# path to the shared helpers works from any working directory.
+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
+# print_launch_banner, print_curl_footer and wait_any_exit are not defined in
+# this script; presumably they come from launch_utils.sh -- confirm there.
+source "$SCRIPT_DIR/../../../common/launch_utils.sh"
+
+# Environment variables with defaults
+export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
+export MODEL_PATH=${MODEL_PATH:-"Wan-AI/Wan2.1-T2V-1.3B-Diffusers"}
+export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Wan-AI/Wan2.1-T2V-1.3B-Diffusers"}
+export MEDIA_OUTPUT_FS_URL=${MEDIA_OUTPUT_FS_URL:-"file:///tmp/dynamo_media"}
+
+# Parse command line arguments
+# Only -h/--help is handled here; anything else is collected verbatim, in
+# order, into EXTRA_ARGS for the worker.
+EXTRA_ARGS=()
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        -h|--help)
+            echo "Usage: $0 [OPTIONS]"
+            echo "Options:"
+            echo "  -h, --help           Show this help message"
+            echo ""
+            echo "Any additional options are passed through to dynamo.trtllm."
+            exit 0
+            ;;
+        *)
+            EXTRA_ARGS+=("$1")
+            shift
+            ;;
+    esac
+done
+
+# HTTP_PORT is only used for the banner and the example request below; this
+# script does not pass it to the frontend process.
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
+print_launch_banner --no-curl "Launching Video Diffusion Serving (1 GPU)" "$MODEL_PATH" "$HTTP_PORT" \
+    "Media URL:   $MEDIA_OUTPUT_FS_URL"
+
+# The heredoc delimiter is unquoted, so ${...} is expanded by the shell and
+# each `\\` collapses to a single backslash (a line continuation in the
+# printed command). EXAMPLE_PROMPT_VISUAL is not set in this script;
+# presumably launch_utils.sh provides it -- confirm.
+print_curl_footer <<CURL
+  curl http://localhost:${HTTP_PORT}/v1/videos \\
+    -H 'Content-Type: application/json' \\
+    -d '{
+      "model": "${SERVED_MODEL_NAME}",
+      "prompt": "${EXAMPLE_PROMPT_VISUAL}",
+      "size": "832x480",
+      "seconds": 4,
+      "nvext": {"num_inference_steps": 10, "seed": 42}
+    }'
+CURL
+
+# run frontend (background job)
+python3 -m dynamo.frontend &
+
+# run video diffusion worker (background job); EXTRA_ARGS come last so
+# caller-supplied flags follow the fixed ones.
+python3 -m dynamo.trtllm \
+  --model-path "$MODEL_PATH" \
+  --served-model-name "$SERVED_MODEL_NAME" \
+  --modality video_diffusion \
+  --media-output-fs-url "$MEDIA_OUTPUT_FS_URL" \
+  "${EXTRA_ARGS[@]}" &
+
+# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
+wait_any_exit
--- a/tests/serve/test_trtllm.py
+++ b/tests/serve/test_trtllm.py
@@ -5,6 +5,7 @@ import dataclasses
 import logging
 import os
 from dataclasses import dataclass, field
+from typing import Any

 import pytest

@@ -24,10 +25,40 @@ from tests.utils.payload_builder import (
    metric_payload_default,
    multimodal_payload_default,
 )
+from tests.utils.payloads import BasePayload

 logger = logging.getLogger(__name__)


+@dataclass
+class VideoGenerationPayload(BasePayload):
+    """Payload for /v1/videos endpoint (TRT-LLM video diffusion).
+
+    Attributes:
+        endpoint: Request path; defaults to "/v1/videos".
+        timeout: Request timeout, default 300 (presumably seconds; the unit is
+            defined by BasePayload, which is not shown here -- confirm).
+    """
+
+    endpoint: str = "/v1/videos"
+    timeout: int = 300
+
+    def response_handler(self, response: Any) -> str:
+        """Check a /v1/videos response and extract its content.
+
+        Args:
+            response: HTTP response object exposing ``raise_for_status()`` and
+                ``json()``.
+
+        Returns:
+            The ``url`` of the first ``data`` entry when it is non-empty;
+            otherwise the marker string ``"b64_video_returned"`` when that
+            entry carries a non-empty ``b64_json``.
+
+        Raises:
+            AssertionError: If the status is not ``"completed"``, ``data`` is
+                missing, null or empty, or the first entry has neither a
+                non-empty ``url`` nor a non-empty ``b64_json``.
+            Exception: Whatever ``response.raise_for_status()`` raises for a
+                failed HTTP request.
+        """
+        response.raise_for_status()
+        result = response.json()
+        status = result.get("status")
+        assert status == "completed", (
+            f"Video generation not completed. Status: {status}, "
+            f"Error: {result.get('error', 'none')}"
+        )
+        assert (
+            "data" in result
+        ), f"Missing 'data' in response. Keys: {list(result.keys())}"
+        data = result["data"]
+        # Truthiness covers both an empty list and a null "data" field, which
+        # would otherwise surface as a TypeError from len().
+        assert data, "Empty data in video response"
+        entry = data[0]
+        # Branch on the value, not on key presence: a serializer may emit
+        # {"url": null, "b64_json": "..."} for an inline (base64) video.
+        url = entry.get("url")
+        if url:
+            return url
+        assert entry.get("b64_json"), (
+            "Video response has neither a non-empty url nor b64_json. "
+            f"Keys: {list(entry.keys())}"
+        )
+        return "b64_video_returned"
+
+    def validate(self, response: Any, content: str) -> None:
+        """Assert that the extracted content is non-empty.
+
+        Args:
+            response: The HTTP response; unused here.
+            content: Extracted content (presumably the value returned by
+                ``response_handler`` -- confirm against BasePayload).
+
+        Raises:
+            AssertionError: If ``content`` is empty.
+        """
+        assert content, "Video response content is empty"
+
+
 @dataclass
 class TRTLLMConfig(EngineConfig):
    """Configuration for trtllm test scenarios"""
@@ -265,6 +296,56 @@ trtllm_configs = {
            "ENCODE_CUDA_VISIBLE_DEVICES": "0",
        },
    ),
+    # TensorRT-LLM video diffusion test using Wan2.1-T2V-1.3B model.
+    # Validates the end-to-end video generation pipeline (frontend → worker → /v1/videos).
+    # Uses --skip-warmup (warmup at default resolution OOMs on 22 GB L4 GPU),
+    # --disable-torch-compile, and small default resolution (480x272, 17 frames)
+    # to fit within CI GPU memory constraints.
+    "video_diffusion": TRTLLMConfig(
+        name="video_diffusion",
+        directory=trtllm_dir,
+        script_name="agg_video_diffusion.sh",
+        script_args=[
+            "--skip-warmup",
+            "--disable-torch-compile",
+            "--default-height",
+            "272",
+            "--default-width",
+            "480",
+            "--default-num-frames",
+            "17",
+        ],
+        marks=[
+            pytest.mark.gpu_1,
+            pytest.mark.trtllm,
+            pytest.mark.pre_merge,
+            pytest.mark.timeout(
+                600
+            ),  # Video generation is slow even at small resolution
+        ],
+        model="Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
+        frontend_port=DefaultPort.FRONTEND.value,
+        timeout=300,
+        delayed_start=60,  # Model loading takes time
+        request_payloads=[
+            VideoGenerationPayload(
+                body={
+                    "prompt": "A golden retriever running on a beach",
+                    "size": "480x272",
+                    "response_format": "url",
+                    "nvext": {
+                        "num_inference_steps": 10,
+                        "num_frames": 17,
+                        "guidance_scale": 5.0,
+                        "seed": 42,
+                    },
+                },
+                repeat_count=1,
+                expected_response=[],
+                expected_log=[],
+            ),
+        ],
+    ),
    "completions_only": TRTLLMConfig(
        name="completions_only",
        directory=trtllm_dir,