fix: patch vLLM 0.17.1 multi-node TP init ordering, enable gpu_2 pre-merge CI (#7357)

Signed-off-by: Dmitry Tokarev <dtokarev@nvidia.com> Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

fix: patch vLLM 0.17.1 multi-node TP init ordering, enable gpu_2 pre-merge CI (#7357)
Signed-off-by: Dmitry Tokarev <dtokarev@nvidia.com> Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
c7bac979 · Dmitry Tokarev · GitHub · 0b665150 · c7bac979 · c7bac979
Unverified Commit c7bac979 authored Mar 14, 2026 by Dmitry Tokarev Committed by GitHub Mar 14, 2026
3 changed files
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -197,7 +197,10 @@ jobs:
      run_single_gpu_tests: ${{ needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.vllm == 'true' }}
      single_gpu_test_markers: 'pre_merge and vllm and gpu_1'
      single_gpu_test_timeout_minutes: 35
-      run_multi_gpu_tests: false  # TODO: Dmitry is working on fixing markers for multi-GPU tests, can enable after that is resolved
+      run_multi_gpu_tests: false  # TODO: select which post_merge multi-GPU tests should also run pre_merge, then enable the lines below.
+      # run_multi_gpu_tests: ${{ needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.vllm == 'true' }}
+      # multi_gpu_test_markers: 'pre_merge and vllm and gpu_2'
+      # multi_gpu_test_timeout_minutes: 30
    secrets: inherit
  # ============================================================================

--- a/container/deps/vllm/install_vllm.sh
+++ b/container/deps/vllm/install_vllm.sh
@@ -208,6 +208,37 @@ if [ "$DEVICE" = "cpu" ]; then
 fi
 echo "✓ vLLM installation completed"
+# Apply hotfix for multi-node TP init ordering (vLLM PR #35892).
+# The patch targets vLLM 0.17.1 only. Any other version aborts the install on
+# purpose, so that the patch file and this block get removed when vLLM is bumped.
+#
+# Patch lookup: in Docker builds this script is copied to /tmp while deps are
+# bind-mounted at /tmp/deps, so look next to the script (BASH_SOURCE) first and
+# fall back to the bind-mount path.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+VLLM_PATCH="${SCRIPT_DIR}/multinode-tp-init-order.patch"
+if [ ! -f "$VLLM_PATCH" ]; then
+    VLLM_PATCH="/tmp/deps/vllm/multinode-tp-init-order.patch"
+fi
+if [ "$VLLM_VER" = "0.17.1" ]; then
+    # Report a missing patch file explicitly instead of relying on the less
+    # obvious error that git/patch would print for an unreadable input.
+    if [ ! -f "$VLLM_PATCH" ]; then
+        echo "❌ ERROR: vLLM hotfix patch not found at ${SCRIPT_DIR}/multinode-tp-init-order.patch or ${VLLM_PATCH}"
+        exit 1
+    fi
+    # Patch the cloned repo (used by the CPU/XPU builds, which build from source).
+    # Failures are checked explicitly so the success message below can never be
+    # printed after a failed apply, whether or not the script runs under `set -e`.
+    echo "Applying vLLM multi-node TP hotfix to cloned repo..."
+    git -C "${INSTALLATION_DIR}/vllm" apply --ignore-whitespace "$VLLM_PATCH" || {
+        echo "❌ ERROR: failed to apply ${VLLM_PATCH} to ${INSTALLATION_DIR}/vllm"
+        exit 1
+    }
+    # Also patch site-packages if vLLM was installed from a wheel (CUDA builds).
+    # Skip if the installed location is the clone itself (already patched above).
+    # Use `uv pip show` instead of importing vllm to avoid pulling in torch/CUDA.
+    # Strip only the "Location:" label so a path containing spaces stays intact.
+    VLLM_SITE=$(uv pip show vllm 2>/dev/null | grep -i '^Location:' | sed 's/^[^:]*:[[:space:]]*//')
+    if [ -n "$VLLM_SITE" ] && [ "$VLLM_SITE" != "${INSTALLATION_DIR}/vllm" ]; then
+        echo "Applying vLLM multi-node TP hotfix to ${VLLM_SITE}..."
+        patch -d "$VLLM_SITE" -p1 < "$VLLM_PATCH" || {
+            echo "❌ ERROR: failed to apply ${VLLM_PATCH} to ${VLLM_SITE}"
+            exit 1
+        }
+    fi
+    echo "✓ vLLM multi-node TP hotfix applied"
+else
+    echo "❌ ERROR: vLLM version is ${VLLM_VER}, not 0.17.1."
+    echo "   The multi-node TP hotfix patch (multinode-tp-init-order.patch) and"
+    echo "   this block in install_vllm.sh are no longer needed — please remove them."
+    exit 1
+fi
 echo "\n=== Installing LMCache from source ==="
 # LMCache prebuilt wheels are built against PyTorch <=2.8.0 and fail with PyTorch 2.10+
 # (undefined symbol: c10::cuda::c10_cuda_check_implementation).

--- a/container/deps/vllm/multinode-tp-init-order.patch
+++ b/container/deps/vllm/multinode-tp-init-order.patch
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# SPDX-License-Identifier: Apache-2.0
+#
+# Hotfix for vLLM 0.17.1: multi-node TP init ordering
+# Upstream fix: https://github.com/vllm-project/vllm/commit/86e1060b
+# Upstream PR:  https://github.com/vllm-project/vllm/pull/35892
+# Issue:        https://github.com/vllm-project/vllm/issues/36389
+#
+# WorkerProc.__init__ calls _init_message_queues() before init_device(),
+# but the former needs _INNER_DP_WORLD which the latter creates.
+# Move _init_message_queues() after init_device()+load_model().
+#
+# Remove this patch once vLLM >= 0.17.2 is adopted.
+diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py
+index e3376ba..39a3646 100644
+--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
+@@ -586,7 +586,6 @@ class WorkerProc:
+         )
+         # Load model
+-        self._init_message_queues(input_shm_handle, vllm_config)
+         is_eep_new_worker = envs.VLLM_ELASTIC_EP_SCALE_UP_LAUNCH
+         if not is_eep_new_worker:
+             self.worker.init_device()
+@@ -596,6 +595,10 @@ class WorkerProc:
+             )
+             self.worker.load_model()
++        # Initialize message queues after init_device() since multi-node setups
++        # (nnodes_within_dp > 1) require distributed groups to be initialized
++        self._init_message_queues(input_shm_handle, vllm_config)
+
+         # Enable environment variable cache (e.g. assume no more
+         # environment variable overrides after this point)
+         enable_envs_cache()