chore: bump vllm to 0.11.0 (#3422)

Signed-off-by: alec-flowers <aflowers@nvidia.com>

chore: bump vllm to 0.11.0 (#3422)
Signed-off-by: alec-flowers <aflowers@nvidia.com>
90dc7589 · Alec · GitHub · 60975b51 · 90dc7589 · 90dc7589
Unverified Commit 90dc7589 authored Oct 12, 2025 by Alec Committed by GitHub Oct 12, 2025
10 changed files
--- a/container/Dockerfile.vllm
+++ b/container/Dockerfile.vllm
@@ -15,9 +15,9 @@ ARG RUNTIME_IMAGE_TAG="12.8.1-runtime-ubuntu24.04"
 ARG CUDA_VERSION="12.8"
 # Make sure to update the dependency version in pyproject.toml when updating this
-ARG VLLM_REF="v0.10.2"
+ARG VLLM_REF="v0.11.0"
 # The FlashInfer ref is only respected when building vLLM from source, i.e. when VLLM_REF does not start with 'v' or for arm64 builds
-ARG FLASHINF_REF="v0.3.0"
+ARG FLASHINF_REF="v0.3.1"
 ARG TORCH_BACKEND="cu128"
 # If left blank, then we will fallback to vLLM defaults

--- a/container/deps/vllm/install_vllm.sh
+++ b/container/deps/vllm/install_vllm.sh
@@ -13,7 +13,7 @@
 set -euo pipefail
-VLLM_REF="v0.10.2"
+VLLM_REF="v0.11.0"
 # Basic Configurations
 ARCH=$(uname -m)
@@ -29,7 +29,7 @@ CUDA_VERSION="12.8" # For DEEPGEMM
 # These flags are applicable when installing vLLM from source code
 EDITABLE=true
 VLLM_GIT_URL="https://github.com/vllm-project/vllm.git"
-FLASHINF_REF="v0.3.0"
+FLASHINF_REF="v0.3.1"
 while [[ $# -gt 0 ]]; do
    case $1 in
@@ -131,10 +131,8 @@ git clone $VLLM_GIT_URL vllm
 cd vllm
 git checkout $VLLM_REF
-# TODO remove in future vLLM release, re-instate ignore torch script
+# NOTE: keep the commented-out cherry-pick command here as a template in case we need to cherry-pick fixes in the future
-# https://github.com/vllm-project/vllm/pull/24729
+# GIT_COMMITTER_NAME="Container Build" GIT_COMMITTER_EMAIL="container@buildkitsandbox.local" git cherry-pick 740f064
-GIT_COMMITTER_NAME="Container Build" GIT_COMMITTER_EMAIL="container@buildkitsandbox.local" git cherry-pick 740f064
 echo "\n=== Installing vLLM & FlashInfer ==="
@@ -243,4 +241,4 @@ echo "\n=== Installing EP Kernels (PPLX and DeepEP) ==="
 cd ep_kernels/
 TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST" bash install_python_libraries.sh
 echo "\n✅ All installations completed successfully!"
\ No newline at end of file
--- a/examples/multimodal/utils/protocol.py
+++ b/examples/multimodal/utils/protocol.py
@@ -22,10 +22,11 @@ from pydantic import BaseModel, ConfigDict, Field, field_validator
 from pydantic_core import core_schema
 from typing_extensions import NotRequired
 from vllm.inputs.data import TokensPrompt
+from vllm.logprobs import PromptLogprobs
 from vllm.multimodal.inputs import MultiModalUUIDDict  # noqa: F401
 from vllm.outputs import CompletionOutput
 from vllm.sampling_params import SamplingParams
-from vllm.sequence import PromptLogprobs, RequestMetrics
+from vllm.sequence import RequestMetrics
 import dynamo.nixl_connect as connect

--- a/lib/bindings/python/src/dynamo/llm/vllm_integration/connector_leader.py
+++ b/lib/bindings/python/src/dynamo/llm/vllm_integration/connector_leader.py
@@ -192,7 +192,9 @@ class KvConnectorLeader:
        if self._connector.has_slot(request.request_id):
            return None
-        if bool(request.mm_positions):
+        if bool(getattr(request, "mm_features", None)) or bool(
+            getattr(request, "mm_positions", None)
+        ):
            raise ValueError("Unsupported request - requires mm extra keys")
        all_token_ids = request.all_token_ids

--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -40,6 +40,21 @@ logging.basicConfig(
 )
+@pytest.fixture()
+def set_ucx_tls_no_mm():
+    """Disable the UCX "mm" shared-memory transport for tests that request this fixture."""
+    mp = pytest.MonkeyPatch()
+    # CI note:
+    # - Affected test: tests/fault_tolerance/cancellation/test_vllm.py::test_request_cancellation_vllm_decode_cancel
+    # - Symptom on L40 CI: UCX/NIXL mm transport assertion during worker init
+    #   (uct_mem.c:482: mem.memh != UCT_MEM_HANDLE_NULL) when two workers
+    #   start on the same node (possibly a shared-memory segment collision or limit).
+    # - Mitigation: disable the UCX "mm" shared-memory transport for the tests that request this fixture
+    mp.setenv("UCX_TLS", "^mm")
+    yield
+    mp.undo()
 def download_models(model_list=None, ignore_weights=False):
    """Download models - can be called directly or via fixture

--- a/tests/fault_tolerance/cancellation/test_vllm.py
+++ b/tests/fault_tolerance/cancellation/test_vllm.py
@@ -193,7 +193,7 @@ def test_request_cancellation_vllm_aggregated(
 @pytest.mark.e2e
 @pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
 def test_request_cancellation_vllm_decode_cancel(
-    request, runtime_services, predownload_models
+    request, runtime_services, predownload_models, set_ucx_tls_no_mm
 ):
    """
    End-to-end test for request cancellation during decode phase.
@@ -266,7 +266,7 @@ def test_request_cancellation_vllm_decode_cancel(
 @pytest.mark.e2e
 @pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
 def test_request_cancellation_vllm_remote_prefill_cancel(
-    request, runtime_services, predownload_models
+    request, runtime_services, predownload_models, set_ucx_tls_no_mm
 ):
    """
    End-to-end test for request cancellation during remote prefill phase.

--- a/tests/fault_tolerance/test_request_migration.py
+++ b/tests/fault_tolerance/test_request_migration.py
@@ -290,7 +290,9 @@ def verify_migration_occurred(frontend_process: DynamoFrontendProcess) -> None:
 @pytest.mark.gpu_1
 @pytest.mark.e2e
 @pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
-def test_request_migration_vllm(request, runtime_services, predownload_models):
+def test_request_migration_vllm(
+    request, runtime_services, predownload_models, set_ucx_tls_no_mm
+):
    """
    End-to-end test for worker fault tolerance with migration support.

--- a/tests/frontend/reasoning_effort/test_reasoning_effort.py
+++ b/tests/frontend/reasoning_effort/test_reasoning_effort.py
@@ -58,6 +58,8 @@ class GPTOSSWorkerProcess(ManagedProcess):
            "dynamo.vllm",
            "--model",
            REASONING_TEST_MODEL,
+            "--connector",
+            "none",  # skip NIXL registration; we are noticing long startup times in CI (potentially a bug)
            "--enforce-eager",
            "--dyn-tool-call-parser",
            "harmony",
@@ -85,7 +87,7 @@ class GPTOSSWorkerProcess(ManagedProcess):
                ("http://localhost:8000/v1/models", check_models_api),
                ("http://localhost:8083/health", self.is_ready),
            ],
-            timeout=300,
+            timeout=500,
            display_output=True,
            terminate_existing=False,
            stragglers=["VLLM::EngineCore"],

--- a/tests/kvbm/test_determinism_agg.py
+++ b/tests/kvbm/test_determinism_agg.py
@@ -111,7 +111,7 @@ class LLMServerManager:
            "--kv-transfer-config",
            '{"kv_connector":"DynamoConnector","kv_role":"kv_both", "kv_connector_module_path": "dynamo.llm.vllm_integration.connector"}',
            os.environ.get("KVBM_MODEL_ID", "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"),
-            "--max-seq-len",
+            "--max-model-len",
            "8000",  # required to fit on L4 GPU when using 8b model
        ]

--- a/tests/kvbm/test_determinism_disagg.py
+++ b/tests/kvbm/test_determinism_disagg.py
@@ -132,7 +132,7 @@ class LLMServerManager:
            os.environ.get("KVBM_MODEL_ID", "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"),
            "--block-size",
            "16",
-            "--max-seq-len",
+            "--max-model-len",
            "8000",  # required to fit on L4 GPU when using 8b model
            "--connector",
            "nixl",
@@ -148,7 +148,7 @@ class LLMServerManager:
            "--is-prefill-worker",
            "--block-size",
            "16",
-            "--max-seq-len",
+            "--max-model-len",
            "8000",  # required to fit on L4 GPU when using 8b model
            "--connector",
            "kvbm",