"...git@developer.sourcefind.cn:2222/OpenDAS/vllm_cscc.git" did not exist on "fe2016de2d2d68bb967e9c43ee168db01322b0f1"
Unverified Commit ba3aef8a authored by Yuewei Na, committed by GitHub
Browse files

chore: upgrade tensorrt-llm to 1.3.0rc8 (#7504)


Signed-off-by: Yuewei Na <nv-yna@users.noreply.github.com>
Co-authored-by: Yuewei Na <nv-yna@users.noreply.github.com>
parent f2fd3a16
......@@ -48,7 +48,7 @@ dependencies = [
"pandas",
"pydantic>=2",
"tabulate",
# Satisfies vLLM 0.11.0 (>=4.55.2), vLLM 0.11.2 (>=4.56.0,<5), TRT-LLM 1.3.0rc7 (==4.57.1), SGLang 0.5.8 (==4.57.1)
# Satisfies vLLM 0.11.0 (>=4.55.2), vLLM 0.11.2 (>=4.56.0,<5), TRT-LLM 1.3.0rc8 (==4.57.1), SGLang 0.5.8 (==4.57.1)
"transformers>=4.56.0",
]
......
......@@ -6,8 +6,10 @@
This module defines the DiffusionConfig dataclass used for configuring
video and image diffusion workers.
Fields map to TensorRT-LLM's DiffusionArgs sub-configs:
- PipelineConfig: torch_compile, CUDA graph, warmup, offloading, fuse_qkv
Fields map to TensorRT-LLM's VisualGenArgs sub-configs:
- PipelineConfig: offloading, fuse_qkv, NVTX markers
- TorchCompileConfig: torch_compile, fullgraph
- CudaGraphConfig: CUDA graph capture
- AttentionConfig: attention backend (VANILLA, TRTLLM)
- ParallelConfig: dit_*_size parallelism dimensions
- TeaCacheConfig: caching optimization
......@@ -86,7 +88,7 @@ class DiffusionConfig:
# Attention backend: "VANILLA" (PyTorch SDPA) or "TRTLLM"
attn_backend: str = "VANILLA"
# ── Quantization config (maps to DiffusionArgs.quant_config) ──
# ── Quantization config (maps to VisualGenArgs.quant_config) ──
# Quantization algorithm. Options:
# None (no quantization), "FP8", "FP8_BLOCK_SCALES", "NVFP4",
# "W4A16_AWQ", "W4A8_AWQ", "W8A8_SQ_PER_CHANNEL"
......
......@@ -30,7 +30,7 @@ from typing import TYPE_CHECKING, Optional
import torch
if TYPE_CHECKING:
from tensorrt_llm._torch.visual_gen import DiffusionArgs
from tensorrt_llm._torch.visual_gen import VisualGenArgs
from tensorrt_llm._torch.visual_gen.output import MediaOutput
from tensorrt_llm._torch.visual_gen.pipeline import BasePipeline
......@@ -71,7 +71,7 @@ class DiffusionEngine:
The old visual_gen standalone package (setup_configs + from_pretrained +
PIPELINE_REGISTRY) has been replaced by TensorRT-LLM's integrated
visual_gen module which uses:
- DiffusionArgs for configuration
- VisualGenArgs for configuration
- PipelineLoader for model loading (handles MetaInit, weight loading,
quantization, torch.compile, and warmup)
- AutoPipeline for pipeline type auto-detection
......@@ -117,12 +117,12 @@ class DiffusionEngine:
# Import TensorRT-LLM visual_gen components
from tensorrt_llm._torch.visual_gen import PipelineLoader
# Build DiffusionArgs from DiffusionConfig
# Build VisualGenArgs from DiffusionConfig
diffusion_args = self._build_diffusion_args()
logger.info(f"DiffusionArgs: {diffusion_args}")
logger.info(f"VisualGenArgs: {diffusion_args}")
# Use PipelineLoader for the full loading flow:
# DiffusionArgs → DiffusionModelConfig → AutoPipeline → BasePipeline
# VisualGenArgs → DiffusionModelConfig → AutoPipeline → BasePipeline
loader = PipelineLoader(diffusion_args)
self._pipeline = loader.load()
......@@ -132,26 +132,29 @@ class DiffusionEngine:
f"{self._pipeline.__class__.__name__}"
)
def _build_diffusion_args(self) -> "DiffusionArgs":
"""Build DiffusionArgs from DiffusionConfig.
def _build_diffusion_args(self) -> "VisualGenArgs":
"""Build VisualGenArgs from DiffusionConfig.
Maps dynamo's DiffusionConfig fields to TensorRT-LLM's DiffusionArgs
structure with its nested sub-configs (PipelineConfig, AttentionConfig,
ParallelConfig, TeaCacheConfig, quant_config).
Maps dynamo's DiffusionConfig fields to TensorRT-LLM's VisualGenArgs
structure with its nested sub-configs (PipelineConfig, TorchCompileConfig,
CudaGraphConfig, AttentionConfig, ParallelConfig, TeaCacheConfig,
quant_config).
Returns:
DiffusionArgs instance for PipelineLoader.
VisualGenArgs instance for PipelineLoader.
"""
from tensorrt_llm._torch.visual_gen import (
DiffusionArgs,
CudaGraphConfig,
ParallelConfig,
PipelineConfig,
TeaCacheConfig,
TorchCompileConfig,
VisualGenArgs,
)
from tensorrt_llm._torch.visual_gen.config import AttentionConfig
# Build quant_config dict if quantization is requested
# DiffusionArgs accepts a dict in ModelOpt format and parses it via model_validator
# VisualGenArgs accepts a dict in ModelOpt format and parses it via model_validator
quant_config: dict | None = None
if self.config.quant_algo:
quant_config = {
......@@ -164,16 +167,19 @@ class DiffusionEngine:
device=self.device,
dtype=self.config.torch_dtype,
skip_components=self.config.skip_components,
skip_warmup=(self.config.warmup_steps == 0),
pipeline=PipelineConfig(
enable_torch_compile=not self.config.disable_torch_compile,
torch_compile_mode=self.config.torch_compile_mode,
enable_fullgraph=self.config.enable_fullgraph,
fuse_qkv=self.config.fuse_qkv,
enable_cuda_graph=self.config.enable_cuda_graph,
enable_layerwise_nvtx_marker=self.config.enable_layerwise_nvtx_marker,
warmup_steps=self.config.warmup_steps,
enable_offloading=self.config.enable_async_cpu_offload,
),
torch_compile=TorchCompileConfig(
enable_torch_compile=not self.config.disable_torch_compile,
enable_fullgraph=self.config.enable_fullgraph,
),
cuda_graph=CudaGraphConfig(
enable_cuda_graph=self.config.enable_cuda_graph,
),
attention=AttentionConfig(
backend=self.config.attn_backend.upper(),
),
......@@ -198,7 +204,7 @@ class DiffusionEngine:
if quant_config is not None:
args_kwargs["quant_config"] = quant_config
return DiffusionArgs(**args_kwargs)
return VisualGenArgs(**args_kwargs)
def generate(
self,
......
......@@ -96,10 +96,10 @@ trtllm:
python_version: "3.12"
index_url: https://pypi.nvidia.com/
pip_wheel_dir: /tmp/trtllm_wheel/
pip_wheel: tensorrt-llm==1.3.0rc7
pip_wheel: tensorrt-llm==1.3.0rc8
trtllm_wheel_image: nvcr.io/nvidia/tensorrt-llm/release:${TENSORRTLLM_PIP_WHEEL#*==}
github_trtllm_commit: v1.3.0rc7
github_trtllm_commit: v1.3.0rc8
torch_version: 2.10.0a0+b4e4ee81d3.nv25.12
torch_tensorrt_version: 2.10.0a0
torchvision_version: 0.25.0a0+ca221243
......
......@@ -22,7 +22,7 @@ tensorboard>=2.19.0,<2.21.0
tensorboardX==2.6.2.2
# Transformers version constraint for container builds
# - vLLM 0.11.0: >=4.55.2, vLLM 0.11.2: >=4.56.0,<5
# - TensorRT-LLM 1.3.0rc7: ==4.57.1
# - TensorRT-LLM 1.3.0rc8: ==4.57.1
# - SGLang 0.5.8: ==4.57.1
# Using >=4.56.0 to satisfy all frameworks
transformers>=4.56.0
......
......@@ -29,7 +29,7 @@ The following table shows the backend framework versions included with each Dyna
| **Dynamo** | **SGLang** | **TensorRT-LLM** | **vLLM** | **NIXL** |
| :--- | :--- | :--- | :--- | :--- |
| **main (ToT)** | `0.5.9` | `1.3.0rc7` | `0.17.1` | `0.10.1` |
| **main (ToT)** | `0.5.9` | `1.3.0rc8` | `0.17.1` | `0.10.1` |
| **v1.1.0-dev.1** *(experimental)* | `0.5.9` | `1.3.0rc5.post1` | `0.17.1` | `0.10.1` |
| **v1.0.1** | `0.5.9` | `1.3.0rc5.post1` | `0.16.0` | `0.10.1` |
| **v1.0.0** | `0.5.9` | `1.3.0rc5.post1` | `0.16.0` | `0.10.1` |
......
......@@ -44,7 +44,7 @@ Repository = "https://github.com/ai-dynamo/dynamo.git"
[project.optional-dependencies]
trtllm =[
"uvloop",
"tensorrt-llm==1.3.0rc7",
"tensorrt-llm==1.3.0rc8",
]
vllm = [
......
......@@ -168,6 +168,84 @@ def download_models(model_list=None, ignore_weights=False):
)
def _enable_offline_with_mistral_patch():
    """Set HF_HUB_OFFLINE=1 and work around a transformers 4.57.3 regression.

    transformers 4.57.3 (PR #42389) introduced _patch_mistral_regex which calls
    huggingface_hub.model_info() unconditionally for every tokenizer load — even
    non-Mistral models with fully cached weights. This API call fails when
    HF_HUB_OFFLINE=1.

    Since tests launch TRT-LLM workers as subprocesses that inherit env vars but
    not in-process monkey-patches, we inject the fix via a sitecustomize.py on
    PYTHONPATH so every subprocess auto-applies it at startup.

    The function is safe to call more than once: the patch directory is only
    prepended to PYTHONPATH when it is not already one of its entries.

    Upstream bug: https://github.com/huggingface/transformers/issues/44843

    TODO: Remove this workaround once transformers ships a fix and TRT-LLM (or
    any other dependency) upgrades to that fixed version.
    """
    os.environ["HF_HUB_OFFLINE"] = "1"

    # Apply the patch in this process
    try:
        from huggingface_hub.errors import OfflineModeIsEnabled
        from transformers.tokenization_utils_base import PreTrainedTokenizerBase

        original = PreTrainedTokenizerBase._patch_mistral_regex

        @classmethod  # type: ignore[misc]
        def _safe_patch(cls, tokenizer, *args, **kwargs):
            # Delegate to the real implementation; if it trips over offline
            # mode, hand the tokenizer back untouched instead of failing.
            try:
                return original.__func__(cls, tokenizer, *args, **kwargs)
            except OfflineModeIsEnabled:
                return tokenizer

        PreTrainedTokenizerBase._patch_mistral_regex = _safe_patch
    except (ImportError, AttributeError):
        return  # transformers version without _patch_mistral_regex — nothing to do

    # Source of the sitecustomize.py that re-applies the same patch in every
    # subprocess. The indentation inside these literals is significant: it is
    # the indentation of the generated Python module.
    sitecustomize_src = (
        "import os\n"
        "if os.environ.get('HF_HUB_OFFLINE') == '1':\n"
        "    try:\n"
        "        from transformers.tokenization_utils_base import"
        " PreTrainedTokenizerBase as _T\n"
        "        from huggingface_hub.errors import"
        " OfflineModeIsEnabled as _E\n"
        "        _orig = _T._patch_mistral_regex\n"
        "        @classmethod\n"
        "        def _safe(cls, tokenizer, *a, **kw):\n"
        "            try:\n"
        "                return _orig.__func__(cls, tokenizer, *a, **kw)\n"
        "            except _E:\n"
        "                return tokenizer\n"
        "        _T._patch_mistral_regex = _safe\n"
        "    except (ImportError, AttributeError):\n"
        "        pass\n"
    )

    # Write a sitecustomize.py so subprocesses also get the patch
    patch_dir = os.path.join(tempfile.gettempdir(), "dynamo_test_hf_patch")
    os.makedirs(patch_dir, exist_ok=True)
    with open(
        os.path.join(patch_dir, "sitecustomize.py"), "w", encoding="utf-8"
    ) as f:
        f.write(sitecustomize_src)

    # Prepend the patch directory using the platform separator, skipping the
    # update when a previous call already registered it.
    pythonpath = os.environ.get("PYTHONPATH", "")
    if patch_dir not in pythonpath.split(os.pathsep):
        os.environ["PYTHONPATH"] = (
            f"{patch_dir}{os.pathsep}{pythonpath}" if pythonpath else patch_dir
        )

    logging.info(
        "Enabled HF_HUB_OFFLINE with _patch_mistral_regex workaround "
        "(see https://github.com/huggingface/transformers/issues/44843)"
    )
def _disable_offline_with_mistral_patch():
    """Undo the environment changes of _enable_offline_with_mistral_patch.

    Removes HF_HUB_OFFLINE from the environment and drops the sitecustomize
    patch directory from PYTHONPATH. Only entries that are exactly the patch
    directory are removed, so other paths that merely share its prefix are
    left intact. If nothing else remains, PYTHONPATH is unset rather than
    left as an empty string; if the patch directory is not on PYTHONPATH,
    the variable is not touched at all.

    The in-process monkey-patch and the sitecustomize.py file on disk are
    not reverted by this function.
    """
    os.environ.pop("HF_HUB_OFFLINE", None)

    pythonpath = os.environ.get("PYTHONPATH")
    if pythonpath is None:
        return

    patch_dir = os.path.join(tempfile.gettempdir(), "dynamo_test_hf_patch")
    entries = pythonpath.split(os.pathsep)
    if patch_dir not in entries:
        return  # nothing of ours to remove

    remaining = [entry for entry in entries if entry != patch_dir]
    if remaining:
        os.environ["PYTHONPATH"] = os.pathsep.join(remaining)
    else:
        os.environ.pop("PYTHONPATH", None)
@pytest.fixture(scope="session")
def predownload_models(pytestconfig):
"""Fixture wrapper around download_models for models used in collected tests"""
......@@ -182,9 +260,9 @@ def predownload_models(pytestconfig):
# Fallback to original behavior if extraction failed
download_models()
os.environ["HF_HUB_OFFLINE"] = "1"
_enable_offline_with_mistral_patch()
yield
os.environ.pop("HF_HUB_OFFLINE", None)
_disable_offline_with_mistral_patch()
@pytest.fixture(scope="session")
......@@ -204,9 +282,9 @@ def predownload_tokenizers(pytestconfig):
# Skip redundant HuggingFace API calls in worker subprocesses since
# tokenizers are already cached. This avoids flaky timeouts from slow
# HF API responses (the RepoInfo fetch still happens even for cached models).
os.environ["HF_HUB_OFFLINE"] = "1"
_enable_offline_with_mistral_patch()
yield
os.environ.pop("HF_HUB_OFFLINE", None)
_disable_offline_with_mistral_patch()
@pytest.fixture(autouse=True)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment