"...ssh:/git@developer.sourcefind.cn:2222/OpenDAS/dynamo.git" did not exist on "5816c08289f14ac83b9adbc0c87f991e756547f5"
Unverified Commit c7bac979 authored by Dmitry Tokarev's avatar Dmitry Tokarev Committed by GitHub
Browse files

fix: patch vLLM 0.17.1 multi-node TP init ordering, enable gpu_2 pre-merge CI (#7357)


Signed-off-by: default avatarDmitry Tokarev <dtokarev@nvidia.com>
Co-authored-by: default avatarClaude Opus 4.6 (1M context) <noreply@anthropic.com>
parent 0b665150
...@@ -197,7 +197,10 @@ jobs: ...@@ -197,7 +197,10 @@ jobs:
run_single_gpu_tests: ${{ needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.vllm == 'true' }} run_single_gpu_tests: ${{ needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.vllm == 'true' }}
single_gpu_test_markers: 'pre_merge and vllm and gpu_1' single_gpu_test_markers: 'pre_merge and vllm and gpu_1'
single_gpu_test_timeout_minutes: 35 single_gpu_test_timeout_minutes: 35
run_multi_gpu_tests: false # TODO: Dmitry is working on fixing markers for multi-GPU tests, can enable after that is resolved run_multi_gpu_tests: false # TODO: select multi-GPU tests for pre_merge from post_merge and enable the lines below.
# run_multi_gpu_tests: ${{ needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.vllm == 'true' }}
# multi_gpu_test_markers: 'pre_merge and vllm and gpu_2'
# multi_gpu_test_timeout_minutes: 30
secrets: inherit secrets: inherit
# ============================================================================ # ============================================================================
......
...@@ -208,6 +208,37 @@ if [ "$DEVICE" = "cpu" ]; then ...@@ -208,6 +208,37 @@ if [ "$DEVICE" = "cpu" ]; then
fi fi
echo "✓ vLLM installation completed" echo "✓ vLLM installation completed"
# Apply hotfix for multi-node TP init ordering (vLLM PR #35892).
# Only applies to vLLM 0.17.1 — fail loudly on any other version so the
# patch + this block get cleaned up when vLLM is bumped.
# Note: In Docker builds the script is copied to /tmp but deps are bind-mounted
# at /tmp/deps, so resolve the patch relative to BASH_SOURCE first, then fall
# back to the bind-mount path.
#
# Reads:  VLLM_VER (installed vLLM version), INSTALLATION_DIR (parent of the
#         vLLM clone) — both are set earlier in this script.
# Exits:  1 if the version is not 0.17.1, if the patch file cannot be found,
#         or if applying the patch to either target fails.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
VLLM_PATCH="${SCRIPT_DIR}/multinode-tp-init-order.patch"
if [ ! -f "$VLLM_PATCH" ]; then
    VLLM_PATCH="/tmp/deps/vllm/multinode-tp-init-order.patch"
fi
if [ "$VLLM_VER" = "0.17.1" ]; then
    # Report a missing patch file explicitly instead of letting `git apply`
    # fail with a generic "can't open patch" message.
    if [ ! -f "$VLLM_PATCH" ]; then
        echo "❌ ERROR: vLLM hotfix patch multinode-tp-init-order.patch not found in"
        echo "   ${SCRIPT_DIR} or /tmp/deps/vllm."
        exit 1
    fi

    # Patch the cloned repo (used by CPU/XPU source builds that build from source).
    # The exit status is checked explicitly so a failed apply can never be
    # followed by the success message, whatever shell options are active.
    echo "Applying vLLM multi-node TP hotfix to cloned repo..."
    if ! git -C "${INSTALLATION_DIR}/vllm" apply --ignore-whitespace "$VLLM_PATCH"; then
        echo "❌ ERROR: failed to apply ${VLLM_PATCH} to ${INSTALLATION_DIR}/vllm"
        exit 1
    fi

    # Also patch site-packages if vLLM was installed from a wheel (CUDA builds).
    # Skip if the installed location is the clone itself (already patched above).
    # Use `uv pip show` instead of importing vllm to avoid pulling in torch/CUDA.
    # - awk strips only the "Location:" label, so paths containing spaces survive.
    # - `|| true` keeps a failed lookup from aborting the script under
    #   `set -e`/`pipefail`; the -n test below handles the empty result.
    VLLM_SITE=$(uv pip show vllm 2>/dev/null \
        | awk 'tolower($0) ~ /^location:/ { sub(/^[^:]*:[ \t]*/, ""); print; exit }' \
        || true)
    if [ -n "$VLLM_SITE" ] && [ "$VLLM_SITE" != "${INSTALLATION_DIR}/vllm" ]; then
        echo "Applying vLLM multi-node TP hotfix to ${VLLM_SITE}..."
        if ! patch -d "$VLLM_SITE" -p1 < "$VLLM_PATCH"; then
            echo "❌ ERROR: failed to apply ${VLLM_PATCH} to ${VLLM_SITE}"
            exit 1
        fi
    fi
    echo "✓ vLLM multi-node TP hotfix applied"
else
    echo "❌ ERROR: vLLM version is ${VLLM_VER}, not 0.17.1."
    echo "   The multi-node TP hotfix patch (multinode-tp-init-order.patch) and"
    echo "   this block in install_vllm.sh are no longer needed — please remove them."
    exit 1
fi
echo "\n=== Installing LMCache from source ===" echo "\n=== Installing LMCache from source ==="
# LMCache prebuilt wheels are built against PyTorch <=2.8.0 and fail with PyTorch 2.10+ # LMCache prebuilt wheels are built against PyTorch <=2.8.0 and fail with PyTorch 2.10+
# (undefined symbol: c10::cuda::c10_cuda_check_implementation). # (undefined symbol: c10::cuda::c10_cuda_check_implementation).
......
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-License-Identifier: Apache-2.0
#
# Hotfix for vLLM 0.17.1: multi-node TP init ordering
# Upstream fix: https://github.com/vllm-project/vllm/commit/86e1060b
# Upstream PR: https://github.com/vllm-project/vllm/pull/35892
# Issue: https://github.com/vllm-project/vllm/issues/36389
#
# WorkerProc.__init__ calls _init_message_queues() before init_device(),
# but the former needs _INNER_DP_WORLD which the latter creates.
# Move _init_message_queues() after init_device()+load_model().
#
# Remove this patch once vLLM >= 0.17.2 is adopted.
diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py
index e3376ba..39a3646 100644
--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -586,7 +586,6 @@ class WorkerProc:
)
# Load model
- self._init_message_queues(input_shm_handle, vllm_config)
is_eep_new_worker = envs.VLLM_ELASTIC_EP_SCALE_UP_LAUNCH
if not is_eep_new_worker:
self.worker.init_device()
@@ -596,6 +595,10 @@ class WorkerProc:
)
self.worker.load_model()
+ # Initialize message queues after init_device() since multi-node setups
+ # (nnodes_within_dp > 1) require distributed groups to be initialized
+ self._init_message_queues(input_shm_handle, vllm_config)
+
# Enable environment variable cache (e.g. assume no more
# environment variable overrides after this point)
enable_envs_cache()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment