chore: bump vLLM to 0.11.2 (#4476)

60feb955 · Karen Chung · GitHub · c5e8c4c2 · 60feb955 · 60feb955
Unverified commit 60feb955, authored Dec 03, 2025 by Karen Chung; committed by GitHub on Dec 03, 2025.
3 changed files
--- a/tests/fault_tolerance/deploy/container/Dockerfile.local_vllm
+++ b/tests/fault_tolerance/deploy/container/Dockerfile.local_vllm
@@ -9,7 +9,7 @@
 ARG LOCAL_VLLM_IMAGE="vllm-elastic-ep:latest_all2all_buffer_input"
 ARG DYNAMO_BASE_IMAGE="dynamo:latest-none"
 ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda"
-ARG RUNTIME_IMAGE_TAG="12.8.1-runtime-ubuntu24.04"
+ARG RUNTIME_IMAGE_TAG="12.9.0-runtime-ubuntu24.04"

 # Other build arguments
 ARG PYTHON_VERSION=3.12
@@ -57,7 +57,7 @@ RUN apt-get update && \
        # prometheus dependencies
        ca-certificates \
        # DeepGemm uses 'cuobjdump' which does not come with CUDA image
-        cuda-command-line-tools-12-8 && \
+        cuda-command-line-tools-12-9 && \
    rm -rf /var/lib/apt/lists/*

 # Copy CUDA development tools from vLLM image (for JIT compilation)

--- a/tests/fault_tolerance/deploy/templates/vllm/moe_agg.yaml
+++ b/tests/fault_tolerance/deploy/templates/vllm/moe_agg.yaml
@@ -60,7 +60,6 @@ spec:
            - --model
            - deepseek-ai/DeepSeek-V2-Lite
            - --trust-remote-code
-            - --disable-log-requests
            - --tensor-parallel-size
            - "1"
            - --data-parallel-size

--- a/tests/fault_tolerance/deploy/templates/vllm/moe_disagg.yaml
+++ b/tests/fault_tolerance/deploy/templates/vllm/moe_disagg.yaml
@@ -63,7 +63,6 @@ spec:
            - --model
            - deepseek-ai/DeepSeek-V2-Lite
            - --trust-remote-code
-            - --disable-log-requests
            - --tensor-parallel-size
            - "1"
            - --data-parallel-size
@@ -130,7 +129,6 @@ spec:
            - --model
            - deepseek-ai/DeepSeek-V2-Lite
            - --trust-remote-code
-            - --disable-log-requests
            - --is-prefill-worker
            - --tensor-parallel-size
            - "1"