chore(deps): bump vLLM to 0.19.1 (#8370)

Signed-off-by: ayushag <ayushag@nvidia.com>

chore(deps): bump vLLM to 0.19.1 (#8370)
Signed-off-by: ayushag <ayushag@nvidia.com>
d59b56be · Ayush Agarwal · GitHub · 47cfbd46 · d59b56be · d59b56be
Unverified Commit d59b56be authored Apr 20, 2026 by Ayush Agarwal Committed by GitHub Apr 20, 2026
6 changed files
--- a/components/src/dynamo/vllm/multimodal_utils/models/qwen.py
+++ b/components/src/dynamo/vllm/multimodal_utils/models/qwen.py
@@ -45,16 +45,19 @@ def load_qwen_grid_params(model_name: str) -> QwenGridParams | None:
        merge_size: int = processor.merge_size
        factor = patch_size * merge_size

-        # Qwen2/2.5-VL use min_pixels/max_pixels directly.
-        # Qwen3-VL sets them to None and uses size.shortest_edge/longest_edge.
+        # Qwen2/2.5-VL expose min_pixels/max_pixels attributes (transformers v4);
+        # transformers v5 and Qwen3-VL omit them or set them to None, relying on
+        # size.shortest_edge / size.longest_edge instead.
+        proc_min_pixels = getattr(processor, "min_pixels", None)
+        proc_max_pixels = getattr(processor, "max_pixels", None)
        min_pixels: int = (
-            processor.min_pixels
-            if processor.min_pixels is not None
+            proc_min_pixels
+            if proc_min_pixels is not None
            else processor.size.get("shortest_edge", factor)
        )
        max_pixels: int = (
-            processor.max_pixels
-            if processor.max_pixels is not None
+            proc_max_pixels
+            if proc_max_pixels is not None
            else processor.size.get("longest_edge", factor * factor * 1280)
        )
        vision_hidden_dim: int = getattr(

--- a/components/src/dynamo/vllm/omni/args.py
+++ b/components/src/dynamo/vllm/omni/args.py
@@ -6,8 +6,11 @@
 import argparse
 import dataclasses
 import logging
+import os
 from typing import Optional

+import huggingface_hub
+from vllm.transformers_utils.repo_utils import get_model_path
 from vllm_omni.engine.arg_utils import OmniEngineArgs

 try:
@@ -423,6 +426,31 @@ def parse_omni_args() -> OmniConfig:
    vllm_args = vllm_parser.parse_args(unknown)
    config.model = vllm_args.model

+    # When HF_HUB_OFFLINE is set, resolve the repo id to its local snapshot
+    # path so vllm-omni diffusion workers don't hit transformers v5's offline
+    # LocalEntryNotFoundError (vLLM's EngineArgs does the same rewrite).
+    if (
+        huggingface_hub.constants.HF_HUB_OFFLINE
+        and config.model
+        and not os.path.exists(config.model)
+    ):
+        model_id = config.model
+        config.model = get_model_path(
+            config.model, getattr(vllm_args, "revision", None)
+        )
+        if model_id != config.model:
+            # Preserve the original repo id as the user-facing model name
+            # so /v1/models still advertises "Wan-AI/..." not the snapshot path.
+            if getattr(config, "served_model_name", None) is None:
+                config.served_model_name = model_id
+            logger.info(
+                "HF_HUB_OFFLINE is True; replaced omni model_id [%s] "
+                "with model_path [%s] so vllm-omni diffusion workers "
+                "see a local snapshot.",
+                model_id,
+                config.model,
+            )
+
    engine_args = OmniEngineArgs.from_cli_args(vllm_args)

    if getattr(engine_args, "served_model_name", None) is not None:

--- a/container/context.yaml
+++ b/container/context.yaml
@@ -44,13 +44,13 @@ vllm:
    runtime_image: nvcr.io/nvidia/cuda
    base_image_tag: 25.06-cuda12.9-devel-ubuntu24.04
    runtime_image_tag: 12.9.1-runtime-ubuntu24.04
-    vllm_ref: v0.19.0
+    vllm_ref: v0.19.1
  cuda13.0:
    base_image: nvcr.io/nvidia/cuda-dl-base
    runtime_image: nvcr.io/nvidia/cuda
    base_image_tag: 25.11-cuda13.0-devel-ubuntu24.04
    runtime_image_tag: 13.0.2-runtime-ubuntu24.04
-    vllm_ref: v0.19.0
+    vllm_ref: v0.19.1
  xpu:
    base_image: intel/deep-learning-essentials
    runtime_image: intel/deep-learning-essentials

--- a/container/deps/vllm/install_vllm.sh
+++ b/container/deps/vllm/install_vllm.sh
@@ -12,7 +12,7 @@

 set -euo pipefail

-VLLM_VER="0.19.0"
+VLLM_VER="0.19.1"
 VLLM_REF="v${VLLM_VER}"
 DEVICE="cuda"


--- a/docs/backends/vllm/vllm-omni.md
+++ b/docs/backends/vllm/vllm-omni.md
@@ -375,8 +375,6 @@ sequenceDiagram
 GLM-Image is a 2-stage text-to-image model with an AR stage (generates prior token IDs) and a DiT stage (diffusion denoising + VAE decode). The built-in vLLM-Omni stage config already assigns each stage to a separate GPU.

 > **Experimental:** GLM-Image support is experimental; generation may fail or produce incorrect/garbled outputs for some prompts and sizes.
->
-> **Known issue:** GLM-Image requires `transformers>=5.0` to recognize the `glm_image` architecture. Older versions fail at model config creation with `The checkpoint you are trying to load has model type 'glm_image' but Transformers does not recognize this architecture`.

 ```bash
 bash examples/backends/vllm/launch/disagg_omni_glm_image.sh

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -50,7 +50,7 @@ trtllm =[
 vllm = [
    "uvloop",
    "nixl[cu12]<=0.10.1",
-    "vllm[flashinfer,runai,otel]==0.19.0",
+    "vllm[flashinfer,runai,otel]==0.19.1",
    # vllm-omni is installed separately in container builds (see
    # container/deps/vllm/install_vllm.sh). Do not add it to ai-dynamo[vllm]:
    # pip/uv dependency resolution for omni can override the vLLM torch stack.