Unverified Commit ba3aef8a authored by Yuewei Na's avatar Yuewei Na Committed by GitHub
Browse files

chore: upgrade tensorrt-llm to 1.3.0rc8 (#7504)


Signed-off-by: Yuewei Na <nv-yna@users.noreply.github.com>
Co-authored-by: Yuewei Na <nv-yna@users.noreply.github.com>
parent f2fd3a16
...@@ -48,7 +48,7 @@ dependencies = [ ...@@ -48,7 +48,7 @@ dependencies = [
"pandas", "pandas",
"pydantic>=2", "pydantic>=2",
"tabulate", "tabulate",
# Satisfies vLLM 0.11.0 (>=4.55.2), vLLM 0.11.2 (>=4.56.0,<5), TRT-LLM 1.3.0rc7 (==4.57.1), SGLang 0.5.8 (==4.57.1) # Satisfies vLLM 0.11.0 (>=4.55.2), vLLM 0.11.2 (>=4.56.0,<5), TRT-LLM 1.3.0rc8 (==4.57.1), SGLang 0.5.8 (==4.57.1)
"transformers>=4.56.0", "transformers>=4.56.0",
] ]
......
...@@ -6,8 +6,10 @@ ...@@ -6,8 +6,10 @@
This module defines the DiffusionConfig dataclass used for configuring This module defines the DiffusionConfig dataclass used for configuring
video and image diffusion workers. video and image diffusion workers.
Fields map to TensorRT-LLM's DiffusionArgs sub-configs: Fields map to TensorRT-LLM's VisualGenArgs sub-configs:
- PipelineConfig: torch_compile, CUDA graph, warmup, offloading, fuse_qkv - PipelineConfig: offloading, fuse_qkv, NVTX markers
- TorchCompileConfig: torch_compile, fullgraph
- CudaGraphConfig: CUDA graph capture
- AttentionConfig: attention backend (VANILLA, TRTLLM) - AttentionConfig: attention backend (VANILLA, TRTLLM)
- ParallelConfig: dit_*_size parallelism dimensions - ParallelConfig: dit_*_size parallelism dimensions
- TeaCacheConfig: caching optimization - TeaCacheConfig: caching optimization
...@@ -86,7 +88,7 @@ class DiffusionConfig: ...@@ -86,7 +88,7 @@ class DiffusionConfig:
# Attention backend: "VANILLA" (PyTorch SDPA) or "TRTLLM" # Attention backend: "VANILLA" (PyTorch SDPA) or "TRTLLM"
attn_backend: str = "VANILLA" attn_backend: str = "VANILLA"
# ── Quantization config (maps to DiffusionArgs.quant_config) ── # ── Quantization config (maps to VisualGenArgs.quant_config) ──
# Quantization algorithm. Options: # Quantization algorithm. Options:
# None (no quantization), "FP8", "FP8_BLOCK_SCALES", "NVFP4", # None (no quantization), "FP8", "FP8_BLOCK_SCALES", "NVFP4",
# "W4A16_AWQ", "W4A8_AWQ", "W8A8_SQ_PER_CHANNEL" # "W4A16_AWQ", "W4A8_AWQ", "W8A8_SQ_PER_CHANNEL"
......
...@@ -30,7 +30,7 @@ from typing import TYPE_CHECKING, Optional ...@@ -30,7 +30,7 @@ from typing import TYPE_CHECKING, Optional
import torch import torch
if TYPE_CHECKING: if TYPE_CHECKING:
from tensorrt_llm._torch.visual_gen import DiffusionArgs from tensorrt_llm._torch.visual_gen import VisualGenArgs
from tensorrt_llm._torch.visual_gen.output import MediaOutput from tensorrt_llm._torch.visual_gen.output import MediaOutput
from tensorrt_llm._torch.visual_gen.pipeline import BasePipeline from tensorrt_llm._torch.visual_gen.pipeline import BasePipeline
...@@ -71,7 +71,7 @@ class DiffusionEngine: ...@@ -71,7 +71,7 @@ class DiffusionEngine:
The old visual_gen standalone package (setup_configs + from_pretrained + The old visual_gen standalone package (setup_configs + from_pretrained +
PIPELINE_REGISTRY) has been replaced by TensorRT-LLM's integrated PIPELINE_REGISTRY) has been replaced by TensorRT-LLM's integrated
visual_gen module which uses: visual_gen module which uses:
- DiffusionArgs for configuration - VisualGenArgs for configuration
- PipelineLoader for model loading (handles MetaInit, weight loading, - PipelineLoader for model loading (handles MetaInit, weight loading,
quantization, torch.compile, and warmup) quantization, torch.compile, and warmup)
- AutoPipeline for pipeline type auto-detection - AutoPipeline for pipeline type auto-detection
...@@ -117,12 +117,12 @@ class DiffusionEngine: ...@@ -117,12 +117,12 @@ class DiffusionEngine:
# Import TensorRT-LLM visual_gen components # Import TensorRT-LLM visual_gen components
from tensorrt_llm._torch.visual_gen import PipelineLoader from tensorrt_llm._torch.visual_gen import PipelineLoader
# Build DiffusionArgs from DiffusionConfig # Build VisualGenArgs from DiffusionConfig
diffusion_args = self._build_diffusion_args() diffusion_args = self._build_diffusion_args()
logger.info(f"DiffusionArgs: {diffusion_args}") logger.info(f"VisualGenArgs: {diffusion_args}")
# Use PipelineLoader for the full loading flow: # Use PipelineLoader for the full loading flow:
# DiffusionArgs → DiffusionModelConfig → AutoPipeline → BasePipeline # VisualGenArgs → DiffusionModelConfig → AutoPipeline → BasePipeline
loader = PipelineLoader(diffusion_args) loader = PipelineLoader(diffusion_args)
self._pipeline = loader.load() self._pipeline = loader.load()
...@@ -132,26 +132,29 @@ class DiffusionEngine: ...@@ -132,26 +132,29 @@ class DiffusionEngine:
f"{self._pipeline.__class__.__name__}" f"{self._pipeline.__class__.__name__}"
) )
def _build_diffusion_args(self) -> "DiffusionArgs": def _build_diffusion_args(self) -> "VisualGenArgs":
"""Build DiffusionArgs from DiffusionConfig. """Build VisualGenArgs from DiffusionConfig.
Maps dynamo's DiffusionConfig fields to TensorRT-LLM's DiffusionArgs Maps dynamo's DiffusionConfig fields to TensorRT-LLM's VisualGenArgs
structure with its nested sub-configs (PipelineConfig, AttentionConfig, structure with its nested sub-configs (PipelineConfig, TorchCompileConfig,
ParallelConfig, TeaCacheConfig, quant_config). CudaGraphConfig, AttentionConfig, ParallelConfig, TeaCacheConfig,
quant_config).
Returns: Returns:
DiffusionArgs instance for PipelineLoader. VisualGenArgs instance for PipelineLoader.
""" """
from tensorrt_llm._torch.visual_gen import ( from tensorrt_llm._torch.visual_gen import (
DiffusionArgs, CudaGraphConfig,
ParallelConfig, ParallelConfig,
PipelineConfig, PipelineConfig,
TeaCacheConfig, TeaCacheConfig,
TorchCompileConfig,
VisualGenArgs,
) )
from tensorrt_llm._torch.visual_gen.config import AttentionConfig from tensorrt_llm._torch.visual_gen.config import AttentionConfig
# Build quant_config dict if quantization is requested # Build quant_config dict if quantization is requested
# DiffusionArgs accepts a dict in ModelOpt format and parses it via model_validator # VisualGenArgs accepts a dict in ModelOpt format and parses it via model_validator
quant_config: dict | None = None quant_config: dict | None = None
if self.config.quant_algo: if self.config.quant_algo:
quant_config = { quant_config = {
...@@ -164,16 +167,19 @@ class DiffusionEngine: ...@@ -164,16 +167,19 @@ class DiffusionEngine:
device=self.device, device=self.device,
dtype=self.config.torch_dtype, dtype=self.config.torch_dtype,
skip_components=self.config.skip_components, skip_components=self.config.skip_components,
skip_warmup=(self.config.warmup_steps == 0),
pipeline=PipelineConfig( pipeline=PipelineConfig(
enable_torch_compile=not self.config.disable_torch_compile,
torch_compile_mode=self.config.torch_compile_mode,
enable_fullgraph=self.config.enable_fullgraph,
fuse_qkv=self.config.fuse_qkv, fuse_qkv=self.config.fuse_qkv,
enable_cuda_graph=self.config.enable_cuda_graph,
enable_layerwise_nvtx_marker=self.config.enable_layerwise_nvtx_marker, enable_layerwise_nvtx_marker=self.config.enable_layerwise_nvtx_marker,
warmup_steps=self.config.warmup_steps,
enable_offloading=self.config.enable_async_cpu_offload, enable_offloading=self.config.enable_async_cpu_offload,
), ),
torch_compile=TorchCompileConfig(
enable_torch_compile=not self.config.disable_torch_compile,
enable_fullgraph=self.config.enable_fullgraph,
),
cuda_graph=CudaGraphConfig(
enable_cuda_graph=self.config.enable_cuda_graph,
),
attention=AttentionConfig( attention=AttentionConfig(
backend=self.config.attn_backend.upper(), backend=self.config.attn_backend.upper(),
), ),
...@@ -198,7 +204,7 @@ class DiffusionEngine: ...@@ -198,7 +204,7 @@ class DiffusionEngine:
if quant_config is not None: if quant_config is not None:
args_kwargs["quant_config"] = quant_config args_kwargs["quant_config"] = quant_config
return DiffusionArgs(**args_kwargs) return VisualGenArgs(**args_kwargs)
def generate( def generate(
self, self,
......
...@@ -96,10 +96,10 @@ trtllm: ...@@ -96,10 +96,10 @@ trtllm:
python_version: "3.12" python_version: "3.12"
index_url: https://pypi.nvidia.com/ index_url: https://pypi.nvidia.com/
pip_wheel_dir: /tmp/trtllm_wheel/ pip_wheel_dir: /tmp/trtllm_wheel/
pip_wheel: tensorrt-llm==1.3.0rc7 pip_wheel: tensorrt-llm==1.3.0rc8
trtllm_wheel_image: nvcr.io/nvidia/tensorrt-llm/release:${TENSORRTLLM_PIP_WHEEL#*==} trtllm_wheel_image: nvcr.io/nvidia/tensorrt-llm/release:${TENSORRTLLM_PIP_WHEEL#*==}
github_trtllm_commit: v1.3.0rc7 github_trtllm_commit: v1.3.0rc8
torch_version: 2.10.0a0+b4e4ee81d3.nv25.12 torch_version: 2.10.0a0+b4e4ee81d3.nv25.12
torch_tensorrt_version: 2.10.0a0 torch_tensorrt_version: 2.10.0a0
torchvision_version: 0.25.0a0+ca221243 torchvision_version: 0.25.0a0+ca221243
......
...@@ -22,7 +22,7 @@ tensorboard>=2.19.0,<2.21.0 ...@@ -22,7 +22,7 @@ tensorboard>=2.19.0,<2.21.0
tensorboardX==2.6.2.2 tensorboardX==2.6.2.2
# Transformers version constraint for container builds # Transformers version constraint for container builds
# - vLLM 0.11.0: >=4.55.2, vLLM 0.11.2: >=4.56.0,<5 # - vLLM 0.11.0: >=4.55.2, vLLM 0.11.2: >=4.56.0,<5
# - TensorRT-LLM 1.3.0rc7: ==4.57.1 # - TensorRT-LLM 1.3.0rc8: ==4.57.1
# - SGLang 0.5.8: ==4.57.1 # - SGLang 0.5.8: ==4.57.1
# Using >=4.56.0 to satisfy all frameworks # Using >=4.56.0 to satisfy all frameworks
transformers>=4.56.0 transformers>=4.56.0
......
...@@ -29,7 +29,7 @@ The following table shows the backend framework versions included with each Dyna ...@@ -29,7 +29,7 @@ The following table shows the backend framework versions included with each Dyna
| **Dynamo** | **SGLang** | **TensorRT-LLM** | **vLLM** | **NIXL** | | **Dynamo** | **SGLang** | **TensorRT-LLM** | **vLLM** | **NIXL** |
| :--- | :--- | :--- | :--- | :--- | | :--- | :--- | :--- | :--- | :--- |
| **main (ToT)** | `0.5.9` | `1.3.0rc7` | `0.17.1` | `0.10.1` | | **main (ToT)** | `0.5.9` | `1.3.0rc8` | `0.17.1` | `0.10.1` |
| **v1.1.0-dev.1** *(experimental)* | `0.5.9` | `1.3.0rc5.post1` | `0.17.1` | `0.10.1` | | **v1.1.0-dev.1** *(experimental)* | `0.5.9` | `1.3.0rc5.post1` | `0.17.1` | `0.10.1` |
| **v1.0.1** | `0.5.9` | `1.3.0rc5.post1` | `0.16.0` | `0.10.1` | | **v1.0.1** | `0.5.9` | `1.3.0rc5.post1` | `0.16.0` | `0.10.1` |
| **v1.0.0** | `0.5.9` | `1.3.0rc5.post1` | `0.16.0` | `0.10.1` | | **v1.0.0** | `0.5.9` | `1.3.0rc5.post1` | `0.16.0` | `0.10.1` |
......
...@@ -44,7 +44,7 @@ Repository = "https://github.com/ai-dynamo/dynamo.git" ...@@ -44,7 +44,7 @@ Repository = "https://github.com/ai-dynamo/dynamo.git"
[project.optional-dependencies] [project.optional-dependencies]
trtllm =[ trtllm =[
"uvloop", "uvloop",
"tensorrt-llm==1.3.0rc7", "tensorrt-llm==1.3.0rc8",
] ]
vllm = [ vllm = [
......
...@@ -168,6 +168,84 @@ def download_models(model_list=None, ignore_weights=False): ...@@ -168,6 +168,84 @@ def download_models(model_list=None, ignore_weights=False):
) )
# Source of the sitecustomize.py that re-applies the _patch_mistral_regex
# workaround inside subprocesses. It only acts when HF_HUB_OFFLINE=1 and
# silently does nothing when transformers/huggingface_hub (or the patched
# attribute) are unavailable. The indentation inside this literal is
# significant: it is the body of a Python file.
_HF_OFFLINE_SITECUSTOMIZE = (
    "import os\n"
    "if os.environ.get('HF_HUB_OFFLINE') == '1':\n"
    "    try:\n"
    "        from transformers.tokenization_utils_base import"
    " PreTrainedTokenizerBase as _T\n"
    "        from huggingface_hub.errors import"
    " OfflineModeIsEnabled as _E\n"
    "        _orig = _T._patch_mistral_regex\n"
    "        @classmethod\n"
    "        def _safe(cls, tokenizer, *a, **kw):\n"
    "            try:\n"
    "                return _orig.__func__(cls, tokenizer, *a, **kw)\n"
    "            except _E:\n"
    "                return tokenizer\n"
    "        _T._patch_mistral_regex = _safe\n"
    "    except (ImportError, AttributeError):\n"
    "        pass\n"
)


def _enable_offline_with_mistral_patch():
    """Set HF_HUB_OFFLINE=1 and work around a transformers 4.57.3 regression.

    transformers 4.57.3 (PR #42389) introduced _patch_mistral_regex which calls
    huggingface_hub.model_info() unconditionally for every tokenizer load — even
    non-Mistral models with fully cached weights. This API call fails when
    HF_HUB_OFFLINE=1.

    Since tests launch TRT-LLM workers as subprocesses that inherit env vars but
    not in-process monkey-patches, we inject the fix via a sitecustomize.py on
    PYTHONPATH so every subprocess auto-applies it at startup.

    Side effects:
        - Sets ``HF_HUB_OFFLINE=1`` in ``os.environ`` (always).
        - Replaces ``PreTrainedTokenizerBase._patch_mistral_regex`` with a
          wrapper that returns the tokenizer unchanged when
          ``OfflineModeIsEnabled`` is raised.
        - Writes ``<tempdir>/dynamo_test_hf_patch/sitecustomize.py`` and
          prepends that directory to ``PYTHONPATH`` (at most once, so repeated
          calls do not pile up duplicate entries).

    If transformers/huggingface_hub cannot be imported, or the installed
    transformers has no ``_patch_mistral_regex``, only the environment variable
    is set and the function returns early.

    Upstream bug: https://github.com/huggingface/transformers/issues/44843

    TODO: Remove this workaround once transformers ships a fix and TRT-LLM (or
    any other dependency) upgrades to that fixed version.
    """
    os.environ["HF_HUB_OFFLINE"] = "1"

    # Apply the patch in this process
    try:
        from huggingface_hub.errors import OfflineModeIsEnabled
        from transformers.tokenization_utils_base import PreTrainedTokenizerBase

        original = PreTrainedTokenizerBase._patch_mistral_regex
    except (ImportError, AttributeError):
        return  # transformers version without _patch_mistral_regex — nothing to do

    @classmethod  # type: ignore[misc]
    def _safe_patch(cls, tokenizer, *args, **kwargs):
        # Delegate to the original implementation; in offline mode the hub
        # lookup raises, in which case the tokenizer is returned untouched.
        try:
            return original.__func__(cls, tokenizer, *args, **kwargs)
        except OfflineModeIsEnabled:
            return tokenizer

    PreTrainedTokenizerBase._patch_mistral_regex = _safe_patch

    # Write a sitecustomize.py so subprocesses also get the patch
    patch_dir = os.path.join(tempfile.gettempdir(), "dynamo_test_hf_patch")
    os.makedirs(patch_dir, exist_ok=True)
    with open(
        os.path.join(patch_dir, "sitecustomize.py"), "w", encoding="utf-8"
    ) as f:
        f.write(_HF_OFFLINE_SITECUSTOMIZE)

    # Prepend the patch directory only if it is not already an entry, so that
    # several fixtures enabling offline mode do not duplicate it.
    pythonpath = os.environ.get("PYTHONPATH", "")
    entries = pythonpath.split(os.pathsep) if pythonpath else []
    if patch_dir not in entries:
        os.environ["PYTHONPATH"] = os.pathsep.join([patch_dir, *entries])

    logging.info(
        "Enabled HF_HUB_OFFLINE with _patch_mistral_regex workaround "
        "(see https://github.com/huggingface/transformers/issues/44843)"
    )
def _disable_offline_with_mistral_patch():
    """Undo the environment changes of _enable_offline_with_mistral_patch.

    Removes ``HF_HUB_OFFLINE`` from ``os.environ`` and drops the
    ``<tempdir>/dynamo_test_hf_patch`` entry from ``PYTHONPATH``. Entries are
    compared as whole path-list elements, so unrelated paths that merely
    contain the patch directory as a prefix are left intact. If nothing
    remains, ``PYTHONPATH`` is removed instead of being left as an empty
    string; if it was not set, it stays unset.

    This function only touches environment variables: it does not restore any
    in-process monkey-patch and does not delete the generated sitecustomize.py.
    """
    os.environ.pop("HF_HUB_OFFLINE", None)

    pythonpath = os.environ.get("PYTHONPATH")
    if pythonpath is None:
        return  # nothing to clean up; do not create an empty PYTHONPATH

    patch_dir = os.path.join(tempfile.gettempdir(), "dynamo_test_hf_patch")
    # Filter whole entries rather than doing a substring replace, which would
    # mangle siblings such as "<patch_dir>_other" and leave stray separators.
    remaining = os.pathsep.join(
        entry for entry in pythonpath.split(os.pathsep) if entry != patch_dir
    )
    if remaining:
        os.environ["PYTHONPATH"] = remaining
    else:
        os.environ.pop("PYTHONPATH", None)
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
def predownload_models(pytestconfig): def predownload_models(pytestconfig):
"""Fixture wrapper around download_models for models used in collected tests""" """Fixture wrapper around download_models for models used in collected tests"""
...@@ -182,9 +260,9 @@ def predownload_models(pytestconfig): ...@@ -182,9 +260,9 @@ def predownload_models(pytestconfig):
# Fallback to original behavior if extraction failed # Fallback to original behavior if extraction failed
download_models() download_models()
os.environ["HF_HUB_OFFLINE"] = "1" _enable_offline_with_mistral_patch()
yield yield
os.environ.pop("HF_HUB_OFFLINE", None) _disable_offline_with_mistral_patch()
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
...@@ -204,9 +282,9 @@ def predownload_tokenizers(pytestconfig): ...@@ -204,9 +282,9 @@ def predownload_tokenizers(pytestconfig):
# Skip redundant HuggingFace API calls in worker subprocesses since # Skip redundant HuggingFace API calls in worker subprocesses since
# tokenizers are already cached. This avoids flaky timeouts from slow # tokenizers are already cached. This avoids flaky timeouts from slow
# HF API responses (the RepoInfo fetch still happens even for cached models). # HF API responses (the RepoInfo fetch still happens even for cached models).
os.environ["HF_HUB_OFFLINE"] = "1" _enable_offline_with_mistral_patch()
yield yield
os.environ.pop("HF_HUB_OFFLINE", None) _disable_offline_with_mistral_patch()
@pytest.fixture(autouse=True) @pytest.fixture(autouse=True)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment