"tests/vscode:/vscode.git/clone" did not exist on "f701319e6933dbd43d6963cea28366c56e2e246c"
Unverified commit 10fb23d2, authored by Indrajit Bhosale and committed by GitHub
Browse files

fix: align video diffusion pipeline with TRT-LLM 1.3.0rc9 API changes (#7529)


Signed-off-by: Indrajit Bhosale <iamindrajitb@gmail.com>
parent 8ff0b6e7
......@@ -329,14 +329,6 @@ class DynamoTrtllmArgGroup(ArgGroup):
default=False,
help="Disable torch.compile optimization.",
)
add_argument(
diffusion_group,
flag_name="--torch-compile-mode",
env_var="DYN_TRTLLM_TORCH_COMPILE_MODE",
default="default",
choices=["default", "reduce-overhead", "max-autotune"],
help="torch.compile mode.",
)
add_negatable_bool_argument(
diffusion_group,
flag_name="--enable-fullgraph",
......@@ -365,13 +357,12 @@ class DynamoTrtllmArgGroup(ArgGroup):
default=False,
help="Enable per-layer NVTX markers for profiling with Nsight Systems.",
)
add_argument(
add_negatable_bool_argument(
diffusion_group,
flag_name="--warmup-steps",
env_var="DYN_TRTLLM_WARMUP_STEPS",
default=1,
arg_type=int,
help="Number of denoising steps to run during warmup (0 to disable).",
flag_name="--skip-warmup",
env_var="DYN_TRTLLM_SKIP_WARMUP",
default=False,
help="Skip warmup inference during initialization.",
)
add_argument(
diffusion_group,
......@@ -484,12 +475,11 @@ class DynamoTrtllmConfig(ConfigBase):
quant_algo: Optional[str]
quant_dynamic: bool
disable_torch_compile: bool
torch_compile_mode: str
enable_fullgraph: bool
fuse_qkv: bool
enable_cuda_graph: bool
enable_layerwise_nvtx_marker: bool
warmup_steps: int
skip_warmup: bool
dit_dp_size: int
dit_tp_size: int
dit_ulysses_size: int
......
......@@ -71,7 +71,6 @@ class DiffusionConfig:
# ── Pipeline optimization config (maps to PipelineConfig) ──
disable_torch_compile: bool = False
torch_compile_mode: str = "default"
# Enable torch.compile fullgraph mode (stricter but potentially faster)
enable_fullgraph: bool = False
# QKV fusion for transformer attention layers
......@@ -81,8 +80,8 @@ class DiffusionConfig:
enable_cuda_graph: bool = False
# Enable per-layer NVTX markers for profiling
enable_layerwise_nvtx_marker: bool = False
# Number of denoising steps to run during warmup (0 to disable)
warmup_steps: int = 1
# Skip warmup inference during initialization (default: run warmup)
skip_warmup: bool = False
# ── Attention config (maps to AttentionConfig) ──
# Attention backend: "VANILLA" (PyTorch SDPA) or "TRTLLM"
......@@ -135,7 +134,7 @@ class DiffusionConfig:
f"attn_backend={self.attn_backend}, "
f"quant_algo={self.quant_algo}, "
f"enable_cuda_graph={self.enable_cuda_graph}, "
f"warmup_steps={self.warmup_steps}, "
f"skip_warmup={self.skip_warmup}, "
f"dit_dp_size={self.dit_dp_size}, "
f"dit_tp_size={self.dit_tp_size})"
)
......@@ -124,7 +124,7 @@ class DiffusionEngine:
# Use PipelineLoader for the full loading flow:
# VisualGenArgs → DiffusionModelConfig → AutoPipeline → BasePipeline
loader = PipelineLoader(diffusion_args)
self._pipeline = loader.load()
self._pipeline = loader.load(skip_warmup=self.config.skip_warmup)
self._initialized = True
logger.info(
......@@ -167,7 +167,7 @@ class DiffusionEngine:
device=self.device,
dtype=self.config.torch_dtype,
skip_components=self.config.skip_components,
skip_warmup=(self.config.warmup_steps == 0),
skip_warmup=self.config.skip_warmup,
pipeline=PipelineConfig(
fuse_qkv=self.config.fuse_qkv,
enable_layerwise_nvtx_marker=self.config.enable_layerwise_nvtx_marker,
......@@ -260,7 +260,7 @@ class DiffusionEngine:
req = DiffusionRequest(
request_id=0,
prompt=prompt,
prompt=[prompt],
negative_prompt=negative_prompt,
height=height,
width=width,
......
......@@ -239,8 +239,13 @@ class VideoGenerationHandler(BaseGenerativeHandler):
# Encode media based on what the pipeline returned
if output.video is not None:
# Video output: torch.Tensor (num_frames, H, W, 3) uint8 → MP4
frames_np = output.video.cpu().numpy()
# MediaOutput.video is (B, T, H, W, C) uint8 since TRT-LLM rc9;
# squeeze the batch dim to get (T, H, W, C) for MP4 encoding.
video = output.video
assert (
video.ndim == 5 and video.shape[0] == 1
), f"Expected video shape (1, T, H, W, C), got {video.shape}"
frames_np = video[0].cpu().numpy()
logger.info(
f"Request {request_id}: encoding video output "
f"(shape={frames_np.shape}) to MP4 at {fps} fps"
......
......@@ -109,7 +109,7 @@ class TestDiffusionConfig:
assert config.attn_backend == "VANILLA"
assert config.quant_algo is None
assert config.enable_cuda_graph is False
assert config.warmup_steps == 1
assert config.skip_warmup is False
assert config.fuse_qkv is True
# Parallelism defaults
......@@ -484,7 +484,66 @@ class TestNvVideosResponse:
# =============================================================================
# Part 5: Concurrency Safety Tests
# Part 5: DiffusionEngine Unit Tests
# =============================================================================
class TestDiffusionEngineGenerate:
    """Unit tests for the request that DiffusionEngine.generate() constructs."""

    def _make_engine(self):
        """Build a DiffusionEngine whose pipeline is a MagicMock.

        The engine is flagged as initialized and its mocked ``infer`` returns
        a namespace carrying a (1, 4, 64, 64, 3) uint8 zero tensor as
        ``video`` and ``None`` for ``image`` and ``audio``, so no real
        TRT-LLM pipeline is loaded.
        """
        from dynamo.trtllm.engines.diffusion_engine import DiffusionEngine

        fake_output = SimpleNamespace(
            video=torch.zeros((1, 4, 64, 64, 3), dtype=torch.uint8),
            image=None,
            audio=None,
        )
        pipeline = MagicMock()
        pipeline.infer.return_value = fake_output

        engine = DiffusionEngine(config=DiffusionConfig())
        engine._initialized = True
        engine._pipeline = pipeline
        return engine

    def test_generate_wraps_prompt_as_list(self):
        """A plain string prompt must reach DiffusionRequest as a one-item list."""
        engine = self._make_engine()
        seen_kwargs = {}

        class FakeDiffusionRequest:
            """Stand-in that records the keyword arguments it was built with."""

            def __init__(self, **kwargs):
                seen_kwargs.update(kwargs)
                for name, value in kwargs.items():
                    setattr(self, name, value)

        # generate() is expected to import DiffusionRequest lazily from
        # tensorrt_llm._torch.visual_gen.executor, so a stub module exposing
        # the fake class is placed in sys.modules for the duration of the call.
        stub_modules = {
            "tensorrt_llm._torch.visual_gen.executor": MagicMock(
                DiffusionRequest=FakeDiffusionRequest
            ),
        }
        with patch.dict("sys.modules", stub_modules):
            engine.generate(
                prompt="a golden retriever",
                height=64,
                width=64,
                num_frames=4,
                num_inference_steps=1,
            )

        sent_prompt = seen_kwargs["prompt"]
        assert isinstance(
            sent_prompt, list
        ), f"Expected list, got {type(sent_prompt)}"
        assert sent_prompt == ["a golden retriever"]
# =============================================================================
# Part 6: Concurrency Safety Tests
# =============================================================================
......@@ -539,7 +598,7 @@ class ConcurrencyTracker:
# Return a mock MediaOutput with a video tensor
return SimpleNamespace(
video=torch.zeros((4, 64, 64, 3), dtype=torch.uint8),
video=torch.zeros((1, 4, 64, 64, 3), dtype=torch.uint8),
image=None,
audio=None,
)
......@@ -672,7 +731,7 @@ class TestVideoHandlerResponseFormats:
)
mock_output = SimpleNamespace(
video=torch.zeros((4, 64, 64, 3), dtype=torch.uint8),
video=torch.zeros((1, 4, 64, 64, 3), dtype=torch.uint8),
image=None,
audio=None,
)
......
......@@ -85,12 +85,11 @@ async def init_video_diffusion_worker(
default_guidance_scale=config.default_guidance_scale,
# Pipeline optimization
disable_torch_compile=config.disable_torch_compile,
torch_compile_mode=config.torch_compile_mode,
enable_fullgraph=config.enable_fullgraph,
fuse_qkv=config.fuse_qkv,
enable_cuda_graph=config.enable_cuda_graph,
enable_layerwise_nvtx_marker=config.enable_layerwise_nvtx_marker,
warmup_steps=config.warmup_steps,
skip_warmup=config.skip_warmup,
# Attention
attn_backend=config.attn_backend,
# Quantization
......
......@@ -7,6 +7,10 @@
fastapi==0.120.1
grpcio-tools<=1.76.0 # May have platform-specific builds; pins grpcio ecosystem version
httpx==0.28.1
# Video generation: encode frames to MP4 (used by TRT-LLM, vLLM-Omni, SGLang diffusion)
imageio>=2.37.0
imageio-ffmpeg>=0.6.0
msgpack==1.1.2
msgspec==0.19.0
nvidia-ml-py<=13.580.65 # NVIDIA/CUDA related, may vary by driver version
......
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Aggregated video diffusion serving with TensorRT-LLM backend.
# Uses Wan2.1-T2V-1.3B-Diffusers by default (1 GPU).
# Abort on the first failing command.
set -e
# On any exit, signal every process in this script's process group so the
# background frontend and worker do not outlive the launcher.
trap 'echo Cleaning up...; kill 0' EXIT

SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"

# Environment defaults; values already set by the caller take precedence.
export DYNAMO_HOME="${DYNAMO_HOME:-/workspace}"
export MODEL_PATH="${MODEL_PATH:-Wan-AI/Wan2.1-T2V-1.3B-Diffusers}"
export SERVED_MODEL_NAME="${SERVED_MODEL_NAME:-Wan-AI/Wan2.1-T2V-1.3B-Diffusers}"
export MEDIA_OUTPUT_FS_URL="${MEDIA_OUTPUT_FS_URL:-file:///tmp/dynamo_media}"

# Command-line handling: -h/--help prints usage and exits; every other
# argument is collected verbatim and forwarded to the worker.
EXTRA_ARGS=()
while (( $# > 0 )); do
    if [[ "$1" == "-h" || "$1" == "--help" ]]; then
        echo "Usage: $0 [OPTIONS]"
        echo "Options:"
        echo " -h, --help Show this help message"
        echo ""
        echo "Any additional options are passed through to dynamo.trtllm."
        exit 0
    fi
    EXTRA_ARGS+=("$1")
    shift
done

HTTP_PORT="${DYN_HTTP_PORT:-8000}"

# Banner helpers are expected to come from launch_utils.sh (sourced above).
print_launch_banner --no-curl "Launching Video Diffusion Serving (1 GPU)" \
    "$MODEL_PATH" "$HTTP_PORT" "Media URL: $MEDIA_OUTPUT_FS_URL"

# Example request shown to the user; the heredoc is unquoted so the port,
# model name and example prompt are expanded when the banner is printed.
print_curl_footer <<CURL
curl http://localhost:${HTTP_PORT}/v1/videos \\
-H 'Content-Type: application/json' \\
-d '{
"model": "${SERVED_MODEL_NAME}",
"prompt": "${EXAMPLE_PROMPT_VISUAL}",
"size": "832x480",
"seconds": 4,
"nvext": {"num_inference_steps": 10, "seed": 42}
}'
CURL

# Start the HTTP frontend in the background.
python3 -m dynamo.frontend &

# Start the video diffusion worker in the background, forwarding extra args.
python3 -m dynamo.trtllm \
    --model-path "$MODEL_PATH" \
    --served-model-name "$SERVED_MODEL_NAME" \
    --modality video_diffusion \
    --media-output-fs-url "$MEDIA_OUTPUT_FS_URL" \
    "${EXTRA_ARGS[@]}" &

# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
......@@ -5,6 +5,7 @@ import dataclasses
import logging
import os
from dataclasses import dataclass, field
from typing import Any
import pytest
......@@ -24,10 +25,40 @@ from tests.utils.payload_builder import (
metric_payload_default,
multimodal_payload_default,
)
from tests.utils.payloads import BasePayload
logger = logging.getLogger(__name__)
@dataclass
class VideoGenerationPayload(BasePayload):
    """Request payload for the /v1/videos endpoint (TRT-LLM video diffusion)."""

    endpoint: str = "/v1/videos"
    timeout: int = 300

    def response_handler(self, response: Any) -> str:
        """Check a video-generation response and return a content marker.

        Args:
            response: HTTP response object exposing ``raise_for_status()``
                and ``json()``.

        Returns:
            The ``url`` of the first data entry when that key is present,
            otherwise the fixed marker ``"b64_video_returned"``.

        Raises:
            AssertionError: If the status is not ``"completed"``, ``data`` is
                missing or empty, or the first entry has an empty ``url`` /
                ``b64_json``.
        """
        response.raise_for_status()
        body = response.json()

        assert body.get("status") == "completed", (
            f"Video generation not completed. Status: {body.get('status')}, "
            f"Error: {body.get('error', 'none')}"
        )
        assert (
            "data" in body
        ), f"Missing 'data' in response. Keys: {list(body.keys())}"

        items = body["data"]
        assert len(items) > 0, "Empty data in video response"

        first = items[0]
        # Entries without a "url" key must carry inline base64 data instead.
        if "url" not in first:
            assert first.get("b64_json"), "Video response b64_json is empty"
            return "b64_video_returned"

        video_url = first["url"]
        assert video_url, "Video response url is empty"
        return video_url

    def validate(self, response: Any, content: str) -> None:
        """Require that ``response_handler`` produced non-empty content."""
        assert content, "Video response content is empty"
@dataclass
class TRTLLMConfig(EngineConfig):
"""Configuration for trtllm test scenarios"""
......@@ -265,6 +296,56 @@ trtllm_configs = {
"ENCODE_CUDA_VISIBLE_DEVICES": "0",
},
),
# TensorRT-LLM video diffusion test using Wan2.1-T2V-1.3B model.
# Validates the end-to-end video generation pipeline (frontend → worker → /v1/videos).
# Uses --skip-warmup (warmup at default resolution OOMs on 22 GB L4 GPU),
# --disable-torch-compile, and small default resolution (480x272, 17 frames)
# to fit within CI GPU memory constraints.
"video_diffusion": TRTLLMConfig(
name="video_diffusion",
directory=trtllm_dir,
script_name="agg_video_diffusion.sh",
script_args=[
"--skip-warmup",
"--disable-torch-compile",
"--default-height",
"272",
"--default-width",
"480",
"--default-num-frames",
"17",
],
marks=[
pytest.mark.gpu_1,
pytest.mark.trtllm,
pytest.mark.pre_merge,
pytest.mark.timeout(
600
), # Video generation is slow even at small resolution
],
model="Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
frontend_port=DefaultPort.FRONTEND.value,
timeout=300,
delayed_start=60, # Model loading takes time
request_payloads=[
VideoGenerationPayload(
body={
"prompt": "A golden retriever running on a beach",
"size": "480x272",
"response_format": "url",
"nvext": {
"num_inference_steps": 10,
"num_frames": 17,
"guidance_scale": 5.0,
"seed": 42,
},
},
repeat_count=1,
expected_response=[],
expected_log=[],
),
],
),
"completions_only": TRTLLMConfig(
name="completions_only",
directory=trtllm_dir,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment