multinode-tp-init-order.patch

# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-License-Identifier: Apache-2.0
#
# Hotfix for vLLM 0.17.1: multi-node TP init ordering
# Upstream fix: https://github.com/vllm-project/vllm/commit/86e1060b
# Upstream PR:  https://github.com/vllm-project/vllm/pull/35892
# Issue:        https://github.com/vllm-project/vllm/issues/36389
#
# WorkerProc.__init__ calls _init_message_queues() before init_device(),
# but the former needs _INNER_DP_WORLD, which the latter creates.
# This patch moves _init_message_queues() after init_device()+load_model().
#
# Remove this patch once vLLM >= 0.17.2 is adopted.
diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py
index e3376ba..39a3646 100644
--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -586,7 +586,6 @@ class WorkerProc:
         )

         # Load model
-        self._init_message_queues(input_shm_handle, vllm_config)
         is_eep_new_worker = envs.VLLM_ELASTIC_EP_SCALE_UP_LAUNCH
         if not is_eep_new_worker:
             self.worker.init_device()
@@ -596,6 +595,10 @@ class WorkerProc:
             )
             self.worker.load_model()

+        # Initialize message queues after init_device() since multi-node setups
+        # (nnodes_within_dp > 1) require distributed groups to be initialized
+        self._init_message_queues(input_shm_handle, vllm_config)
+
         # Enable environment variable cache (e.g. assume no more
         # environment variable overrides after this point)
         enable_envs_cache()