Merge tag 'v0.18.0' into v0.18.0-ori

3fb4b5fa · zhuwenwen · bcf25339 · 89138b21 · 3fb4b5fa · 3fb4b5fa
Commit 3fb4b5fa authored Mar 23, 2026 by zhuwenwen
20 changed files
--- a/tests/detokenizer/test_min_tokens.py
+++ b/tests/detokenizer/test_min_tokens.py
@@ -39,7 +39,6 @@ def test_min_tokens_with_stop(min_tokens: int, stop: str, truth: str):
        mm_features=None,
        sampling_params=params,
        pooling_params=None,
-        eos_token_id=None,
        arrival_time=0.0,
        lora_request=None,
        cache_salt=None,

--- a/tests/detokenizer/test_stop_string_while_stop_model_terminates.py
+++ b/tests/detokenizer/test_stop_string_while_stop_model_terminates.py
@@ -35,7 +35,6 @@ def _make_request(stop, include_stop_str_in_output: bool, min_tokens: int = 0):
        mm_features=None,
        sampling_params=params,
        pooling_params=None,
-        eos_token_id=None,
        arrival_time=0.0,
        lora_request=None,
        cache_salt=None,

--- a/tests/distributed/eplb_utils.py
+++ b/tests/distributed/eplb_utils.py
@@ -7,6 +7,7 @@ import random
 import torch
 import torch.multiprocessing as mp

+from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.distributed.parallel_state import (
    init_distributed_environment,
 )
@@ -41,8 +42,12 @@ def set_env_vars_and_device(env: dict[str, str]) -> None:
    update_environment_variables(env)
    local_rank = os.environ["LOCAL_RANK"]
    device = torch.device(f"cuda:{local_rank}")
-    torch.cuda.set_device(device)
-    init_distributed_environment()
+    torch.accelerator.set_device_index(device)
+
+    # Create a minimal vllm config for init_distributed_environment
+    vllm_config = VllmConfig()
+    with set_current_vllm_config(vllm_config):
+        init_distributed_environment()

    # Ensure each worker process has the same random seed
    random.seed(42)

--- a/tests/distributed/test_ca_buffer_sharing.py
+++ b/tests/distributed/test_ca_buffer_sharing.py
@@ -32,7 +32,7 @@ pointers = CustomAllreduce.create_shared_buffer(buffer_size_in_bytes)
 print(f"Rank {rank} has pointers {pointers}")

 dist.barrier()
-torch.cuda.synchronize()
+torch.accelerator.synchronize()

 if rank == 0:
    # the first rank tries to write to all buffers
@@ -41,7 +41,7 @@ if rank == 0:
        lib.cudaMemset(pointer, byte_value, buffer_size_in_bytes)

 dist.barrier()
-torch.cuda.synchronize()
+torch.accelerator.synchronize()

 host_data = (ctypes.c_char * buffer_size_in_bytes)()

@@ -59,6 +59,6 @@ for p in pointers:
 print(f"Rank {rank} verified all buffers")

 dist.barrier()
-torch.cuda.synchronize()
+torch.accelerator.synchronize()

 CustomAllreduce.free_shared_buffer(pointers)
--- a/tests/distributed/test_comm_ops.py
+++ b/tests/distributed/test_comm_ops.py
@@ -19,6 +19,8 @@ from vllm.distributed import (
    tensor_model_parallel_all_reduce,
    tensor_model_parallel_reduce_scatter,
 )
+from vllm.distributed.parallel_state import GroupCoordinator, TensorMetadata
+from vllm.v1.worker.gpu_worker import AsyncIntermediateTensors

 from ..utils import (
    init_test_distributed_environment,
@@ -41,7 +43,7 @@ def all_reduce_test_worker(
    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)

    device = torch.device(f"cuda:{rank}")
-    torch.cuda.set_device(device)
+    torch.accelerator.set_device_index(device)
    init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
    num_elements = 8
    all_tensors = [
@@ -67,7 +69,7 @@ def reduce_scatter_test_worker(
    # they will be able to set the device to the correct GPU
    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
    device = torch.device(f"cuda:{rank}")
-    torch.cuda.set_device(device)
+    torch.accelerator.set_device_index(device)
    init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)

    num_elements = 8
@@ -98,7 +100,7 @@ def all_gather_test_worker(
    # they will be able to set the device to the correct GPU
    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
    device = torch.device(f"cuda:{rank}")
-    torch.cuda.set_device(device)
+    torch.accelerator.set_device_index(device)
    init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
    num_dimensions = 3
    tensor_size = list(range(2, num_dimensions + 2))
@@ -132,7 +134,7 @@ def broadcast_tensor_dict_test_worker(
    # they will be able to set the device to the correct GPU
    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
    device = torch.device(f"cuda:{rank}")
-    torch.cuda.set_device(device)
+    torch.accelerator.set_device_index(device)
    init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
    test_dict = {
        # device tensor
@@ -169,7 +171,7 @@ def send_recv_tensor_dict_test_worker(
 ):
    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
    device = torch.device(f"cuda:{rank}")
-    torch.cuda.set_device(device)
+    torch.accelerator.set_device_index(device)
    init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)

    test_dict = {
@@ -200,6 +202,111 @@ def send_recv_tensor_dict_test_worker(
        torch.testing.assert_close(recv_dict["f"], test_dict["f"])


+class _DummyWork:
+    def __init__(self) -> None:
+        self.wait_calls = 0
+
+    def wait(self) -> None:
+        self.wait_calls += 1
+
+
+class _DummyAllGatherGroup:
+    def __init__(self, world_size: int, rank_in_group: int) -> None:
+        self.world_size = world_size
+        self.rank_in_group = rank_in_group
+
+    def all_gather(self, t: torch.Tensor, dim: int = 0) -> torch.Tensor:
+        # duplicate local slice across ranks.
+        assert dim == 0
+        return torch.cat([t for _ in range(self.world_size)], dim=0)
+
+
+def _make_group_for_unit_test(
+    rank_in_group: int = 0, world_size: int = 2
+) -> GroupCoordinator:
+    # avoid running GroupCoordinator.__init__ (it wires up real process groups).
+    g = GroupCoordinator.__new__(GroupCoordinator)
+    g.world_size = world_size
+    g.rank_in_group = rank_in_group
+    g.ranks = list(range(world_size))
+    g.use_cpu_custom_send_recv = False
+    g.device_group = None
+    g.cpu_group = None
+    return g
+
+
+def test_irecv_tensor_dict_send_allgather_postprocess_binds_keys(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    def fake_irecv(t: torch.Tensor, *args: Any, **kwargs: Any) -> _DummyWork:
+        t.fill_(1)
+        return _DummyWork()
+
+    monkeypatch.setattr(torch.distributed, "is_initialized", lambda: True)
+    monkeypatch.setattr(torch.distributed, "irecv", fake_irecv)
+
+    g = _make_group_for_unit_test(rank_in_group=0, world_size=2)
+    # 2 tensors so we can catch late-binding bugs in postprocess closures.
+    metadata_list = [
+        ("a", TensorMetadata("cpu", torch.int32, torch.Size([4]))),
+        ("b", TensorMetadata("cpu", torch.int32, torch.Size([4]))),
+    ]
+    g.recv_object = lambda src=None: metadata_list  # type: ignore[method-assign]
+
+    ag = _DummyAllGatherGroup(world_size=2, rank_in_group=0)
+    td, handles, postprocess = g.irecv_tensor_dict(all_gather_group=ag)
+
+    assert td is not None
+    assert len(handles) == 2
+    assert len(postprocess) == 2
+
+    # before postprocess, dict holds the TP slice (shape 2).
+    assert td["a"].shape == torch.Size([2])
+    assert td["b"].shape == torch.Size([2])
+
+    # simulate worker-side "defer wait": wait + postprocess later.
+    for handle in handles:
+        handle.wait()
+    for fn in postprocess:
+        fn()
+
+    # after postprocess, dict values are reconstructed to full shape (shape 4),
+    # and each key should be updated independently
+    assert td["a"].shape == torch.Size([4])
+    assert td["b"].shape == torch.Size([4])
+    torch.testing.assert_close(td["a"], torch.ones(4, dtype=torch.int32))
+    torch.testing.assert_close(td["b"], torch.ones(4, dtype=torch.int32))
+
+
+def test_async_intermediate_tensors_lazy_wait() -> None:
+    work = _DummyWork()
+    post_calls = {"n": 0}
+
+    def post() -> None:
+        post_calls["n"] += 1
+
+    it = AsyncIntermediateTensors(
+        {"x": torch.tensor([1])},
+        comm_handles=[work],
+        comm_postprocess=[post],
+    )
+
+    # accessing non-tensor attributes should not trigger wait.
+    assert it.kv_connector_output is None
+    assert work.wait_calls == 0
+    assert post_calls["n"] == 0
+
+    # first access of `.tensors` triggers wait + postprocess.
+    _ = it.tensors
+    assert work.wait_calls == 1
+    assert post_calls["n"] == 1
+
+    # subsequent access should not re-wait.
+    _ = it.tensors
+    assert work.wait_calls == 1
+    assert post_calls["n"] == 1
+
+
 @ray.remote(num_gpus=1, max_calls=1)
 def send_recv_test_worker(
    monkeypatch: pytest.MonkeyPatch,
@@ -210,7 +317,7 @@ def send_recv_test_worker(
 ):
    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
    device = torch.device(f"cuda:{rank}")
-    torch.cuda.set_device(device)
+    torch.accelerator.set_device_index(device)
    init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)

    size = 64

--- a/tests/distributed/test_custom_all_reduce.py
+++ b/tests/distributed/test_custom_all_reduce.py
@@ -33,8 +33,9 @@ def graph_allreduce(
 ):
    with monkeypatch.context() as m:
        m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
+        m.delenv("HIP_VISIBLE_DEVICES", raising=False)
        device = torch.device(f"cuda:{rank}")
-        torch.cuda.set_device(device)
+        torch.accelerator.set_device_index(device)
        init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
        ensure_model_parallel_initialized(tp_size, pp_size)
        group = get_tp_group().device_group
@@ -47,7 +48,7 @@ def graph_allreduce(
        data = torch.zeros(1)
        data = data.to(device=device)
        torch.distributed.all_reduce(data, group=group)
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        del data

        # we use the first group to communicate once
@@ -61,13 +62,11 @@ def graph_allreduce(
            for dtype in [torch.float32, torch.float16, torch.bfloat16]:
                with graph_capture(device=device) as graph_capture_context:
                    # use integers so result matches NCCL exactly
-                    inp1 = torch.randint(
-                        1, 16, (sz,), dtype=dtype, device=torch.cuda.current_device()
-                    )
-                    inp2 = torch.randint(
-                        1, 16, (sz,), dtype=dtype, device=torch.cuda.current_device()
-                    )
-                    torch.cuda.synchronize()
+                    device_idx = torch.accelerator.current_device_index()
+                    inp1 = torch.randint(1, 16, (sz,), dtype=dtype, device=device_idx)
+                    inp2 = torch.randint(1, 16, (sz,), dtype=dtype, device=device_idx)
+
+                    torch.accelerator.synchronize()
                    graph = torch.cuda.CUDAGraph()
                    with torch.cuda.graph(graph, stream=graph_capture_context.stream):
                        for i in range(num_communication):
@@ -92,8 +91,9 @@ def eager_allreduce(
 ):
    with monkeypatch.context() as m:
        m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
+        m.delenv("HIP_VISIBLE_DEVICES", raising=False)
        device = torch.device(f"cuda:{rank}")
-        torch.cuda.set_device(device)
+        torch.accelerator.set_device_index(device)
        init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)

        # we use the first group to communicate once
@@ -127,6 +127,6 @@ def test_custom_allreduce(
    test_target,
 ):
    world_size = tp_size * pipeline_parallel_size
-    if world_size > torch.cuda.device_count():
+    if world_size > torch.accelerator.device_count():
        pytest.skip("Not enough GPUs to run the test.")
    multi_process_parallel(monkeypatch, tp_size, pipeline_parallel_size, test_target)
--- a/tests/distributed/test_dcp_a2a.py
+++ b/tests/distributed/test_dcp_a2a.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Unit tests for DCP A2A communication backend (no GPU required).
+
+Tests cover:
+1. DCP A2A config validation (--dcp-comm-backend)
+2. KVP group function exists
+3. LSE-weighted combination correctness
+"""
+
+import math
+
+import pytest
+import torch
+
+from vllm.config.parallel import ParallelConfig
+
+
+class TestDCPCommBackendConfig:
+    """Test --dcp-comm-backend config validation."""
+
+    def test_default_is_ag_rs(self):
+        """Default comm backend is ag_rs."""
+        config = ParallelConfig()
+        assert config.dcp_comm_backend == "ag_rs"
+
+    def test_a2a_requires_dcp_greater_than_1(self):
+        """A2A backend requires decode_context_parallel_size > 1."""
+        with pytest.raises(
+            ValueError, match="requires decode_context_parallel_size > 1"
+        ):
+            ParallelConfig(
+                dcp_comm_backend="a2a",
+                decode_context_parallel_size=1,
+            )
+
+    def test_a2a_with_dcp_valid(self):
+        """A2A backend is valid when DCP > 1."""
+        config = ParallelConfig(
+            dcp_comm_backend="a2a",
+            tensor_parallel_size=8,
+            decode_context_parallel_size=4,
+        )
+        assert config.dcp_comm_backend == "a2a"
+
+    def test_invalid_backend_rejected(self):
+        """Invalid backend values are rejected."""
+        with pytest.raises(ValueError, match="must be one of"):
+            ParallelConfig(
+                dcp_comm_backend="invalid",
+            )
+
+    def test_ag_rs_with_dcp_1_valid(self):
+        """ag_rs backend is valid with DCP=1 (no DCP)."""
+        config = ParallelConfig(
+            dcp_comm_backend="ag_rs",
+            decode_context_parallel_size=1,
+        )
+        assert config.dcp_comm_backend == "ag_rs"
+
+
+class TestLSEWeightedCombine:
+    """Test LSE-weighted combination logic (CPU only, no GPU).
+
+    The _lse_weighted_combine function is the reference implementation
+    that verifies the Triton kernel's correctness. It computes:
+
+        result[b,h,d] = sum_n(w_n * output_n[b,h,d])
+
+    where w_n = softmax(lse_n) = exp(lse_n) / sum_k(exp(lse_k))
+    """
+
+    def test_importable(self):
+        """Verify _lse_weighted_combine is importable."""
+        from vllm.v1.attention.ops.dcp_alltoall import _lse_weighted_combine
+
+        assert callable(_lse_weighted_combine)
+
+    def test_single_rank(self):
+        """Single rank: output unchanged."""
+        from vllm.v1.attention.ops.dcp_alltoall import _lse_weighted_combine
+
+        # N=1, B=2, H=4, D=8
+        outputs = torch.randn(1, 2, 4, 8)
+        lses = torch.randn(1, 2, 4)
+
+        result = _lse_weighted_combine(outputs, lses)
+
+        assert result.shape == (2, 4, 8)
+        torch.testing.assert_close(result, outputs.squeeze(0), rtol=1e-5, atol=1e-5)
+
+    def test_equal_lse(self):
+        """Equal LSE values: outputs averaged equally."""
+        from vllm.v1.attention.ops.dcp_alltoall import _lse_weighted_combine
+
+        _N, B, H, D = 2, 1, 1, 4
+        outputs = torch.tensor(
+            [
+                [[[1.0, 2.0, 3.0, 4.0]]],  # Rank 0
+                [[[5.0, 6.0, 7.0, 8.0]]],  # Rank 1
+            ]
+        )
+        lses = torch.tensor(
+            [
+                [[0.0]],  # Rank 0
+                [[0.0]],  # Rank 1
+            ]
+        )
+
+        result = _lse_weighted_combine(outputs, lses)
+
+        expected = (outputs[0] + outputs[1]) / 2
+        assert result.shape == (B, H, D)
+        torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-5)
+
+    def test_dominant_rank(self):
+        """Different LSE values: larger LSE gets more weight."""
+        from vllm.v1.attention.ops.dcp_alltoall import _lse_weighted_combine
+
+        B, H, D = 1, 1, 2
+        outputs = torch.tensor(
+            [
+                [[[0.0, 0.0]]],  # Rank 0
+                [[[1.0, 1.0]]],  # Rank 1
+            ]
+        )
+        lses = torch.tensor(
+            [
+                [[-100.0]],  # Rank 0: negligible contribution
+                [[0.0]],  # Rank 1: dominant
+            ]
+        )
+
+        result = _lse_weighted_combine(outputs, lses)
+
+        assert result.shape == (B, H, D)
+        torch.testing.assert_close(result, outputs[1].squeeze(0), atol=1e-5, rtol=1e-5)
+
+    def test_mathematically_correct(self):
+        """Verify mathematical correctness of LSE combination."""
+        from vllm.v1.attention.ops.dcp_alltoall import _lse_weighted_combine
+
+        outputs = torch.tensor(
+            [
+                [[[2.0, 4.0]]],
+                [[[6.0, 8.0]]],
+            ]
+        )
+        lses = torch.tensor(
+            [
+                [[1.0]],  # exp(1) ≈ 2.718
+                [[2.0]],  # exp(2) ≈ 7.389
+            ]
+        )
+
+        result = _lse_weighted_combine(outputs, lses)
+
+        w0 = math.exp(1) / (math.exp(1) + math.exp(2))
+        w1 = math.exp(2) / (math.exp(1) + math.exp(2))
+        expected = torch.tensor([[[w0 * 2.0 + w1 * 6.0, w0 * 4.0 + w1 * 8.0]]])
+
+        torch.testing.assert_close(result, expected, rtol=1e-4, atol=1e-4)
+
+    def test_return_lse(self):
+        """return_lse=True returns global LSE (logsumexp of inputs)."""
+        from vllm.v1.attention.ops.dcp_alltoall import _lse_weighted_combine
+
+        B, H, D = 1, 1, 2
+        outputs = torch.tensor(
+            [
+                [[[1.0, 2.0]]],
+                [[[3.0, 4.0]]],
+            ]
+        )
+        lses = torch.tensor(
+            [
+                [[1.0]],
+                [[2.0]],
+            ]
+        )
+
+        result, global_lse = _lse_weighted_combine(outputs, lses, return_lse=True)
+
+        expected_global_lse = math.log(math.exp(1) + math.exp(2))
+
+        assert result.shape == (B, H, D)
+        assert global_lse.shape == (B, H)
+        assert abs(global_lse.item() - expected_global_lse) < 1e-5
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])
--- a/tests/distributed/test_elastic_ep.py
+++ b/tests/distributed/test_elastic_ep.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import os
+import subprocess
+import time
+
+import pytest
+import requests
+
+from ..evals.gsm8k.gsm8k_eval import evaluate_gsm8k
+from ..utils import RemoteOpenAIServer, multi_gpu_test
+
+
+@pytest.fixture(autouse=True)
+def cleanup_ray_between_tests():
+    """Force-stop any lingering Ray processes between tests."""
+    subprocess.run(["ray", "stop", "--force"], timeout=30, capture_output=True)
+    time.sleep(5)
+    yield
+
+
+MODEL_NAME = "deepseek-ai/DeepSeek-V2-Lite-Chat"
+
+NUM_GSM8K_QUESTIONS = 256
+EXPECTED_ACCURACY = 0.58
+ACCURACY_TOL = 0.08
+MAX_NUM_SEQS = 32
+
+
+def _send_scale_command(server: RemoteOpenAIServer, new_dp_size: int) -> bool:
+    url = server.url_for("scale_elastic_ep")
+    payload = {"new_data_parallel_size": new_dp_size}
+    headers = {"Content-Type": "application/json"}
+
+    try:
+        response = requests.post(url, json=payload, headers=headers, timeout=300)
+        return response.status_code == 200
+    except requests.exceptions.RequestException:
+        return False
+
+
+def _run_gsm8k_eval(server: RemoteOpenAIServer, stage: str) -> float:
+    assert server.port is not None
+    result = evaluate_gsm8k(
+        num_questions=NUM_GSM8K_QUESTIONS,
+        host=f"http://{server.host}",
+        port=server.port,
+    )
+    accuracy = result["accuracy"]
+    print(
+        f"[{stage}] GSM8K accuracy: {accuracy:.3f} "
+        f"({result['num_questions']} questions)"
+    )
+    assert accuracy >= EXPECTED_ACCURACY, (
+        f"[{stage}] GSM8K accuracy {accuracy:.3f} is below "
+        f"expected threshold {EXPECTED_ACCURACY}"
+    )
+    return accuracy
+
+
+@multi_gpu_test(num_gpus=4)
+def test_elastic_ep_scaling():
+    vllm_serve_args = [
+        "--trust-remote-code",
+        "--tensor-parallel-size",
+        "1",
+        "--gpu-memory-utilization",
+        "0.8",
+        "--max-model-len",
+        "4096",
+        "--max-num-seqs",
+        str(MAX_NUM_SEQS),
+        "--enable-expert-parallel",
+        "--all2all-backend",
+        "allgather_reducescatter",
+        "--enable-elastic-ep",
+        "--enable-eplb",
+        "--eplb-config.num_redundant_experts",
+        "0",
+        "--data-parallel-backend",
+        "ray",
+        "--data-parallel-size",
+        "2",
+        "--api-server-count",
+        "1",
+    ]
+
+    leader_address = os.environ.get("LEADER_ADDRESS")
+    if leader_address:
+        vllm_serve_args.extend(["--data-parallel-address", leader_address])
+
+    with RemoteOpenAIServer(
+        MODEL_NAME, vllm_serve_args, env_dict={}, max_wait_seconds=1200
+    ) as server:
+        initial_accuracy = _run_gsm8k_eval(server, "Initial (2 GPUs)")
+
+        assert _send_scale_command(server, 4)
+        time.sleep(10)
+        scale_up_accuracy = _run_gsm8k_eval(server, "After scale up (4 GPUs)")
+
+        assert scale_up_accuracy >= initial_accuracy - ACCURACY_TOL, (
+            f"Scale up accuracy {scale_up_accuracy:.3f} dropped more than "
+            f"{ACCURACY_TOL} below initial accuracy {initial_accuracy:.3f}"
+        )
+
+        assert _send_scale_command(server, 2)
+        time.sleep(5)
+        scale_down_accuracy = _run_gsm8k_eval(server, "After scale down (2 GPUs)")
+
+        assert scale_down_accuracy >= initial_accuracy - ACCURACY_TOL, (
+            f"Scale down accuracy {scale_down_accuracy:.3f} dropped more than "
+            f"{ACCURACY_TOL} below initial accuracy {initial_accuracy:.3f}"
+        )
+
+        print("\nAccuracy Summary:")
+        print(f"  Initial:    {initial_accuracy:.3f}")
+        print(
+            f"  Scale up:   {scale_up_accuracy:.3f} "
+            f"(diff: {scale_up_accuracy - initial_accuracy:+.3f})"
+        )
+        print(
+            f"  Scale down: {scale_down_accuracy:.3f} "
+            f"(diff: {scale_down_accuracy - initial_accuracy:+.3f})"
+        )
+        print(f"  Tolerance:  {ACCURACY_TOL:.3f}")
+
+
+@multi_gpu_test(num_gpus=4)
+def test_elastic_ep_scaling_uneven():
+    """Test scale up with uneven worker distribution.
+
+    This tests the case where num_new_workers % old_dp_size != 0,
+    specifically 2 -> 3 where remainder = 1 % 2 = 1.
+    This exercises the remainder handling in sender-receiver pairing.
+    """
+    vllm_serve_args = [
+        "--trust-remote-code",
+        "--tensor-parallel-size",
+        "1",
+        "--gpu-memory-utilization",
+        "0.8",
+        "--max-model-len",
+        "4096",
+        "--max-num-seqs",
+        str(MAX_NUM_SEQS),
+        "--enable-expert-parallel",
+        "--all2all-backend",
+        "allgather_reducescatter",
+        "--enable-elastic-ep",
+        "--enable-eplb",
+        "--eplb-config.num_redundant_experts",
+        "0",
+        "--data-parallel-backend",
+        "ray",
+        "--data-parallel-size",
+        "2",
+        "--api-server-count",
+        "1",
+    ]
+
+    leader_address = os.environ.get("LEADER_ADDRESS")
+    if leader_address:
+        vllm_serve_args.extend(["--data-parallel-address", leader_address])
+
+    with RemoteOpenAIServer(
+        MODEL_NAME, vllm_serve_args, env_dict={}, max_wait_seconds=1200
+    ) as server:
+        initial_accuracy = _run_gsm8k_eval(server, "Initial (2 GPUs)")
+
+        # Scale 2 -> 3: This has remainder = 1 % 2 = 1
+        # Tests uneven sender-receiver pairing
+        assert _send_scale_command(server, 3)
+        time.sleep(10)
+        scale_up_accuracy = _run_gsm8k_eval(server, "After scale up (3 GPUs)")
+
+        assert scale_up_accuracy >= initial_accuracy - ACCURACY_TOL, (
+            f"Scale up accuracy {scale_up_accuracy:.3f} dropped more than "
+            f"{ACCURACY_TOL} below initial accuracy {initial_accuracy:.3f}"
+        )
+
+        # Scale back down to 2
+        assert _send_scale_command(server, 2)
+        time.sleep(5)
+        scale_down_accuracy = _run_gsm8k_eval(server, "After scale down (2 GPUs)")
+
+        assert scale_down_accuracy >= initial_accuracy - ACCURACY_TOL, (
+            f"Scale down accuracy {scale_down_accuracy:.3f} dropped more than "
+            f"{ACCURACY_TOL} below initial accuracy {initial_accuracy:.3f}"
+        )
+
+        print("\nAccuracy Summary (Uneven Scaling):")
+        print(f"  Initial:    {initial_accuracy:.3f}")
+        print(
+            f"  Scale up:   {scale_up_accuracy:.3f} "
+            f"(diff: {scale_up_accuracy - initial_accuracy:+.3f})"
+        )
+        print(
+            f"  Scale down: {scale_down_accuracy:.3f} "
+            f"(diff: {scale_down_accuracy - initial_accuracy:+.3f})"
+        )
+        print(f"  Tolerance:  {ACCURACY_TOL:.3f}")
--- a/tests/distributed/test_eplb_execute.py
+++ b/tests/distributed/test_eplb_execute.py
@@ -8,6 +8,7 @@ import pytest
 import torch
 import torch.distributed

+from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.distributed.eplb.rebalance_execute import (
    move_from_buffer,
    rearrange_expert_weights_inplace,
@@ -244,91 +245,95 @@ def _test_async_transfer_layer_without_mtp_worker(
    num_logical_experts: int,
 ) -> None:
    set_env_vars_and_device(env)
-    ensure_model_parallel_initialized(
-        tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
-    )

-    tp_group = get_tp_group()
-    ep_group = tp_group.device_group
-    ep_rank = torch.distributed.get_rank()
-    device = torch.device(f"cuda:{ep_rank}")
+    vllm_config = VllmConfig()
+    vllm_config.parallel_config.tensor_parallel_size = world_size

-    total_physical_experts = world_size * num_local_experts
-    hidden_sizes = [16, 32]
+    with set_current_vllm_config(vllm_config):
+        ensure_model_parallel_initialized(
+            tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
+        )

-    redundancy_config = create_redundancy_config(
-        num_logical_experts,
-        total_physical_experts,
-    )
-    old_indices = create_expert_indices_with_redundancy(
-        num_layers,
-        num_logical_experts,
-        total_physical_experts,
-        redundancy_config,
-    )
+        tp_group = get_tp_group()
+        ep_group = tp_group.device_group
+        ep_rank = torch.distributed.get_rank()
+        device = torch.device(f"cuda:{ep_rank}")

-    new_redundancy_config = create_redundancy_config(
-        num_logical_experts,
-        total_physical_experts,
-    )
-    new_indices = create_expert_indices_with_redundancy(
-        num_layers,
-        num_logical_experts,
-        total_physical_experts,
-        new_redundancy_config,
-    )
+        total_physical_experts = world_size * num_local_experts
+        hidden_sizes = [16, 32]

-    expert_weights = create_expert_weights(
-        num_layers,
-        num_local_experts,
-        hidden_sizes,
-        ep_rank,
-        device,
-        old_indices,
-    )
-    old_indices_cpu = old_indices.cpu()
-    new_indices_cpu = new_indices.cpu()
-
-    expert_buffer = [torch.empty_like(w) for w in expert_weights[0]]
-    cuda_stream = torch.cuda.Stream(device=device)
-
-    for layer_idx in range(num_layers):
-        is_unchanged, is_received_locally, recv_metadata = asyncio.run(
-            transfer_layer(
-                old_global_expert_indices=old_indices_cpu,
-                new_global_expert_indices=new_indices_cpu,
-                expert_weights=expert_weights,
-                expert_weights_buffer=expert_buffer,
-                ep_group=ep_group,
-                layer=layer_idx,
-                cuda_stream=cuda_stream,
-            )
+        redundancy_config = create_redundancy_config(
+            num_logical_experts,
+            total_physical_experts,
        )
-        cuda_stream.synchronize()
-        move_from_buffer(
-            expert_weights=expert_weights[layer_idx],
-            expert_weights_buffers=expert_buffer,
-            is_unchanged=is_unchanged,
-            is_received_locally=is_received_locally,
-            recv_metadata=recv_metadata,
-            new_indices=new_indices_cpu[layer_idx].numpy(),
-            ep_rank=ep_rank,
+        old_indices = create_expert_indices_with_redundancy(
+            num_layers,
+            num_logical_experts,
+            total_physical_experts,
+            redundancy_config,
        )

-    verify_expert_weights_after_shuffle(
-        expert_weights,
-        new_indices,
-        hidden_sizes,
-        ep_rank,
-        num_local_experts,
-    )
-    verify_redundant_experts_have_same_weights(
-        expert_weights,
-        new_indices,
-        hidden_sizes,
-        world_size,
-        num_local_experts,
-    )
+        new_redundancy_config = create_redundancy_config(
+            num_logical_experts,
+            total_physical_experts,
+        )
+        new_indices = create_expert_indices_with_redundancy(
+            num_layers,
+            num_logical_experts,
+            total_physical_experts,
+            new_redundancy_config,
+        )
+
+        expert_weights = create_expert_weights(
+            num_layers,
+            num_local_experts,
+            hidden_sizes,
+            ep_rank,
+            device,
+            old_indices,
+        )
+        old_indices_cpu = old_indices.cpu()
+        new_indices_cpu = new_indices.cpu()
+
+        expert_buffer = [torch.empty_like(w) for w in expert_weights[0]]
+        cuda_stream = torch.cuda.Stream(device=device)
+
+        for layer_idx in range(num_layers):
+            is_unchanged, is_received_locally, recv_metadata = asyncio.run(
+                transfer_layer(
+                    old_layer_indices=old_indices_cpu[layer_idx],
+                    new_layer_indices=new_indices_cpu[layer_idx],
+                    expert_weights=expert_weights[layer_idx],
+                    expert_weights_buffer=expert_buffer,
+                    ep_group=ep_group,
+                    cuda_stream=cuda_stream,
+                )
+            )
+            cuda_stream.synchronize()
+            move_from_buffer(
+                expert_weights=expert_weights[layer_idx],
+                expert_weights_buffers=expert_buffer,
+                is_unchanged=is_unchanged,
+                is_received_locally=is_received_locally,
+                recv_metadata=recv_metadata,
+                new_indices=new_indices_cpu[layer_idx].numpy(),
+                ep_rank=ep_rank,
+            )
+
+        verify_expert_weights_after_shuffle(
+            expert_weights,
+            new_indices,
+            hidden_sizes,
+            ep_rank,
+            num_local_experts,
+        )
+        verify_redundant_experts_have_same_weights(
+            expert_weights,
+            new_indices,
+            hidden_sizes,
+            world_size,
+            num_local_experts,
+        )


 def _test_rearrange_expert_weights_with_redundancy(
@@ -337,71 +342,76 @@ def _test_rearrange_expert_weights_with_redundancy(
    # Initialize model parallel (using tensor parallel as an entrypoint
    # to expert parallel)
    set_env_vars_and_device(env)
-    ensure_model_parallel_initialized(
-        tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
-    )

-    ep_group = get_tp_group().cpu_group
-    ep_rank = torch.distributed.get_rank()
-    device = torch.device(f"cuda:{ep_rank}")
+    vllm_config = VllmConfig()
+    vllm_config.parallel_config.tensor_parallel_size = world_size

-    # Test parameters
-    total_physical_experts = world_size * num_local_experts
-    hidden_sizes = [32, 64]  # Two different weight matrices
+    with set_current_vllm_config(vllm_config):
+        ensure_model_parallel_initialized(
+            tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
+        )

-    # Create old expert indices (with redundancy)
-    redundancy_config = create_redundancy_config(
-        num_logical_experts, total_physical_experts
-    )
+        ep_group = get_tp_group().cpu_group
+        ep_rank = torch.distributed.get_rank()
+        device = torch.device(f"cuda:{ep_rank}")

-    old_indices = create_expert_indices_with_redundancy(
-        num_layers,
-        num_logical_experts,
-        total_physical_experts,
-        redundancy_config,
-    )
+        # Test parameters
+        total_physical_experts = world_size * num_local_experts
+        hidden_sizes = [32, 64]  # Two different weight matrices

-    # Create new expert indices (with redundancy)
-    new_redundancy_config = create_redundancy_config(
-        num_logical_experts, total_physical_experts
-    )
-    new_indices = create_expert_indices_with_redundancy(
-        num_layers,
-        num_logical_experts,
-        total_physical_experts,
-        new_redundancy_config,
-    )
+        # Create old expert indices (with redundancy)
+        redundancy_config = create_redundancy_config(
+            num_logical_experts, total_physical_experts
+        )

-    # Create expert weights
-    expert_weights = create_expert_weights(
-        num_layers, num_local_experts, hidden_sizes, ep_rank, device, old_indices
-    )
+        old_indices = create_expert_indices_with_redundancy(
+            num_layers,
+            num_logical_experts,
+            total_physical_experts,
+            redundancy_config,
+        )

-    # Execute weight rearrangement
-    rearrange_expert_weights_inplace(
-        old_indices,
-        new_indices,
-        expert_weights,
-        ep_group,
-        is_profile=False,
-    )
+        # Create new expert indices (with redundancy)
+        new_redundancy_config = create_redundancy_config(
+            num_logical_experts, total_physical_experts
+        )
+        new_indices = create_expert_indices_with_redundancy(
+            num_layers,
+            num_logical_experts,
+            total_physical_experts,
+            new_redundancy_config,
+        )

-    # Verify the rearrangement result
-    verify_expert_weights_after_shuffle(
-        expert_weights,
-        new_indices,
-        hidden_sizes,
-        ep_rank,
-        num_local_experts,
-    )
+        # Create expert weights
+        expert_weights = create_expert_weights(
+            num_layers, num_local_experts, hidden_sizes, ep_rank, device, old_indices
+        )

-    verify_redundant_experts_have_same_weights(
-        expert_weights,
-        new_indices,
-        hidden_sizes,
-        world_size,
-        num_local_experts,
-    )
+        # Execute weight rearrangement
+        rearrange_expert_weights_inplace(
+            old_indices,
+            new_indices,
+            expert_weights,
+            ep_group,
+            is_profile=False,
+        )
+
+        # Verify the rearrangement result
+        verify_expert_weights_after_shuffle(
+            expert_weights,
+            new_indices,
+            hidden_sizes,
+            ep_rank,
+            num_local_experts,
+        )
+
+        verify_redundant_experts_have_same_weights(
+            expert_weights,
+            new_indices,
+            hidden_sizes,
+            world_size,
+            num_local_experts,
+        )


 @pytest.mark.parametrize(
@@ -432,7 +442,7 @@ def test_rearrange_expert_weights_with_redundancy(
 ):
    """Test the functionality of rearranging expert weights with redundancy."""

-    if torch.cuda.device_count() < world_size:
+    if torch.accelerator.device_count() < world_size:
        pytest.skip(f"Need at least {world_size} GPUs to run the test")
    distributed_run(
        _test_rearrange_expert_weights_with_redundancy,
@@ -445,58 +455,63 @@ def test_rearrange_expert_weights_with_redundancy(

 def _test_rearrange_expert_weights_no_change(env, world_size) -> None:
    set_env_vars_and_device(env)
-    ensure_model_parallel_initialized(
-        tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
-    )

-    ep_group = get_tp_group().cpu_group
-    ep_rank = torch.distributed.get_rank()
-    device = torch.device(f"cuda:{ep_rank}")
+    vllm_config = VllmConfig()
+    vllm_config.parallel_config.tensor_parallel_size = world_size

-    num_layers = 2
-    num_local_experts = 2
-    total_physical_experts = world_size * num_local_experts
-    num_logical_experts = total_physical_experts // 2  # Some redundancy
-    hidden_sizes = [32, 64]
+    with set_current_vllm_config(vllm_config):
+        ensure_model_parallel_initialized(
+            tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
+        )

-    # Create redundancy configuration
-    redundancy_config = [2] * num_logical_experts
+        ep_group = get_tp_group().cpu_group
+        ep_rank = torch.distributed.get_rank()
+        device = torch.device(f"cuda:{ep_rank}")

-    # Same indices - no change
-    indices = create_expert_indices_with_redundancy(
-        num_layers, num_logical_experts, total_physical_experts, redundancy_config
-    )
+        num_layers = 2
+        num_local_experts = 2
+        total_physical_experts = world_size * num_local_experts
+        num_logical_experts = total_physical_experts // 2  # Some redundancy
+        hidden_sizes = [32, 64]

-    expert_weights = create_expert_weights(
-        num_layers, num_local_experts, hidden_sizes, ep_rank, device, indices
-    )
+        # Create redundancy configuration
+        redundancy_config = [2] * num_logical_experts

-    # Save original weights
-    original_weights = []
-    for layer_weights in expert_weights:
-        layer_copy = []
-        for weight in layer_weights:
-            layer_copy.append(weight.clone())
-        original_weights.append(layer_copy)
-
-    # Execute rearrangement (should be no change)
-    rearrange_expert_weights_inplace(
-        indices,
-        indices,  # Same indices
-        expert_weights,
-        ep_group,
-        is_profile=False,
-    )
+        # Same indices - no change
+        indices = create_expert_indices_with_redundancy(
+            num_layers, num_logical_experts, total_physical_experts, redundancy_config
+        )

-    # Verify that the weights have not changed
-    for layer in range(num_layers):
-        for weight_idx in range(len(hidden_sizes)):
-            torch.testing.assert_close(
-                expert_weights[layer][weight_idx],
-                original_weights[layer][weight_idx],
-                msg=f"""Layer {layer}, weight {weight_idx}
+        expert_weights = create_expert_weights(
+            num_layers, num_local_experts, hidden_sizes, ep_rank, device, indices
+        )
+
+        # Save original weights
+        original_weights = []
+        for layer_weights in expert_weights:
+            layer_copy = []
+            for weight in layer_weights:
+                layer_copy.append(weight.clone())
+            original_weights.append(layer_copy)
+
+        # Execute rearrangement (should be no change)
+        rearrange_expert_weights_inplace(
+            indices,
+            indices,  # Same indices
+            expert_weights,
+            ep_group,
+            is_profile=False,
+        )
+
+        # Verify that the weights have not changed
+        for layer in range(num_layers):
+            for weight_idx in range(len(hidden_sizes)):
+                torch.testing.assert_close(
+                    expert_weights[layer][weight_idx],
+                    original_weights[layer][weight_idx],
+                    msg=f"""Layer {layer}, weight {weight_idx}
 should remain unchanged""",
-            )
+                )


 @pytest.mark.parametrize(
@@ -513,7 +528,7 @@ def test_async_transfer_layer_without_mtp(
 ):
    """Exercise async EPLB transfer path without MTP/spec decode."""

-    if torch.cuda.device_count() < world_size:
+    if torch.accelerator.device_count() < world_size:
        pytest.skip(f"Need at least {world_size} GPUs to run the test")

    distributed_run(
@@ -532,77 +547,82 @@ def test_rearrange_expert_weights_no_change(world_size):
    unchanged.
    """

-    if torch.cuda.device_count() < world_size:
+    if torch.accelerator.device_count() < world_size:
        pytest.skip(f"Need at least {world_size} GPUs to run the test")
    distributed_run(_test_rearrange_expert_weights_no_change, world_size)


 def _test_rearrange_expert_weights_profile_mode(env, world_size) -> None:
    set_env_vars_and_device(env)
-    ensure_model_parallel_initialized(
-        tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
-    )

-    ep_group = get_tp_group().cpu_group
-    ep_rank = torch.distributed.get_rank()
-    device = torch.device(f"cuda:{ep_rank}")
+    vllm_config = VllmConfig()
+    vllm_config.parallel_config.tensor_parallel_size = world_size

-    num_layers = 1
-    num_local_experts = 2
-    total_physical_experts = world_size * num_local_experts
-    num_logical_experts = total_physical_experts // 2
-    hidden_sizes = [32]
+    with set_current_vllm_config(vllm_config):
+        ensure_model_parallel_initialized(
+            tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
+        )

-    # Create different index distributions
-    old_redundancy = create_redundancy_config(
-        num_logical_experts, total_physical_experts
-    )
-    new_redundancy = create_redundancy_config(
-        num_logical_experts, total_physical_experts
-    )
+        ep_group = get_tp_group().cpu_group
+        ep_rank = torch.distributed.get_rank()
+        device = torch.device(f"cuda:{ep_rank}")

-    old_indices = create_expert_indices_with_redundancy(
-        num_layers, num_logical_experts, total_physical_experts, old_redundancy
-    )
-    new_indices = create_expert_indices_with_redundancy(
-        num_layers, num_logical_experts, total_physical_experts, new_redundancy
-    )
+        num_layers = 1
+        num_local_experts = 2
+        total_physical_experts = world_size * num_local_experts
+        num_logical_experts = total_physical_experts // 2
+        hidden_sizes = [32]

-    expert_weights = create_expert_weights(
-        num_layers, num_local_experts, hidden_sizes, ep_rank, device, old_indices
-    )
+        # Create different index distributions
+        old_redundancy = create_redundancy_config(
+            num_logical_experts, total_physical_experts
+        )
+        new_redundancy = create_redundancy_config(
+            num_logical_experts, total_physical_experts
+        )

-    # Save original weights
-    original_weights = []
-    for layer_weights in expert_weights:
-        layer_copy = []
-        for weight in layer_weights:
-            layer_copy.append(weight.clone())
-        original_weights.append(layer_copy)
-
-    # Execute profile mode rearrangement
-    rearrange_expert_weights_inplace(
-        old_indices,
-        new_indices,
-        expert_weights,
-        ep_group,
-        is_profile=True,  # Profile mode
-    )
+        old_indices = create_expert_indices_with_redundancy(
+            num_layers, num_logical_experts, total_physical_experts, old_redundancy
+        )
+        new_indices = create_expert_indices_with_redundancy(
+            num_layers, num_logical_experts, total_physical_experts, new_redundancy
+        )

-    # In profile mode, the weights should remain unchanged
-    for layer in range(num_layers):
-        for weight_idx in range(len(hidden_sizes)):
-            torch.testing.assert_close(
-                expert_weights[layer][weight_idx],
-                original_weights[layer][weight_idx],
-                msg="In profile mode, the weights should remain unchanged",
-            )
+        expert_weights = create_expert_weights(
+            num_layers, num_local_experts, hidden_sizes, ep_rank, device, old_indices
+        )
+
+        # Save original weights
+        original_weights = []
+        for layer_weights in expert_weights:
+            layer_copy = []
+            for weight in layer_weights:
+                layer_copy.append(weight.clone())
+            original_weights.append(layer_copy)
+
+        # Execute profile mode rearrangement
+        rearrange_expert_weights_inplace(
+            old_indices,
+            new_indices,
+            expert_weights,
+            ep_group,
+            is_profile=True,  # Profile mode
+        )
+
+        # In profile mode, the weights should remain unchanged
+        for layer in range(num_layers):
+            for weight_idx in range(len(hidden_sizes)):
+                torch.testing.assert_close(
+                    expert_weights[layer][weight_idx],
+                    original_weights[layer][weight_idx],
+                    msg="In profile mode, the weights should remain unchanged",
+                )


 @pytest.mark.parametrize("world_size", [2, 4])
 def test_rearrange_expert_weights_profile_mode(world_size):
    """Test profile mode (should not copy actual weights)"""

-    if torch.cuda.device_count() < world_size:
+    if torch.accelerator.device_count() < world_size:
        pytest.skip(f"Need at least {world_size} GPUs to run the test")
    distributed_run(_test_rearrange_expert_weights_profile_mode, world_size)
--- a/tests/distributed/test_eplb_fused_moe_layer.py
+++ b/tests/distributed/test_eplb_fused_moe_layer.py
@@ -257,7 +257,7 @@ def test_eplb_fml(
    intermediate_size: int,
    column_major_scales: bool,
 ):
-    if torch.cuda.device_count() < world_size:
+    if torch.accelerator.device_count() < world_size:
        pytest.skip(f"Need at least {world_size} GPUs to run the test")

    num_local_experts = num_experts // world_size

--- a/tests/distributed/test_eplb_fused_moe_layer_dep_nvfp4.py
+++ b/tests/distributed/test_eplb_fused_moe_layer_dep_nvfp4.py
@@ -253,7 +253,7 @@ def test_eplb_fml(
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", backend)

-    if torch.cuda.device_count() < world_size:
+    if torch.accelerator.device_count() < world_size:
        pytest.skip(f"Need at least {world_size} GPUs to run the test")

    num_local_experts = num_experts // world_size

--- a/tests/distributed/test_mq_connect_ip.py
+++ b/tests/distributed/test_mq_connect_ip.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Test that MessageQueue uses the local node's IP for binding,
+not a remote master_addr. This validates the fix for cross-node
+data-parallel where each DP group leader must bind to its own IP.
+
+The bug: multiproc_executor used `parallel_config.master_addr` as
+`connect_ip` for every DP group's MessageQueue. For DP groups whose
+leader is NOT on the master node, binding to master_addr fails with
+"Cannot assign requested address".
+
+The fix: use `get_ip()` (local node IP) instead of `master_addr`.
+"""
+
+import pytest
+import zmq
+
+from vllm.distributed.device_communicators.shm_broadcast import MessageQueue
+from vllm.utils.network_utils import get_ip
+
+
+def test_mq_bind_with_local_ip():
+    """MessageQueue with remote readers should successfully bind
+    when connect_ip is the local node's IP."""
+    # n_reader=2, n_local_reader=1 means 1 remote reader,
+    # which triggers the remote ZMQ socket bind.
+    mq = MessageQueue(
+        n_reader=2,
+        n_local_reader=1,
+        connect_ip=get_ip(),
+    )
+    handle = mq.export_handle()
+    assert handle.remote_subscribe_addr is not None
+    # The bound address should contain our local IP
+    local_ip = get_ip()
+    assert (
+        local_ip in handle.remote_subscribe_addr
+        or f"[{local_ip}]" in handle.remote_subscribe_addr
+    )
+    del mq
+
+
+def test_mq_bind_with_non_local_ip_fails():
+    """MessageQueue should fail to bind when connect_ip is a
+    non-local IP address (simulating the bug where master_addr
+    from a different node was used)."""
+    # Use a non-local IP that we definitely can't bind to.
+    # 198.51.100.1 is from TEST-NET-2 (RFC 5737), never locally assigned.
+    non_local_ip = "198.51.100.1"
+    with pytest.raises(zmq.error.ZMQError, match="Cannot assign requested address"):
+        MessageQueue(
+            n_reader=2,
+            n_local_reader=1,
+            connect_ip=non_local_ip,
+        )
+
+
+def test_mq_bind_defaults_to_local_ip():
+    """When connect_ip is None, MessageQueue should auto-detect
+    the local IP and bind successfully."""
+    mq = MessageQueue(
+        n_reader=2,
+        n_local_reader=1,
+        connect_ip=None,  # should fallback to get_ip()
+    )
+    handle = mq.export_handle()
+    assert handle.remote_subscribe_addr is not None
+    del mq
+
+
+if __name__ == "__main__":
+    test_mq_bind_with_local_ip()
+    print("PASSED: test_mq_bind_with_local_ip")
+    test_mq_bind_with_non_local_ip_fails()
+    print("PASSED: test_mq_bind_with_non_local_ip_fails")
+    test_mq_bind_defaults_to_local_ip()
+    print("PASSED: test_mq_bind_defaults_to_local_ip")
+    print("\nAll tests passed!")
--- a/tests/distributed/test_multiproc_executor.py
+++ b/tests/distributed/test_multiproc_executor.py
@@ -9,11 +9,11 @@ focusing on executor initialization, RPC calls, and distributed execution.

 import multiprocessing
 import os
+import socket

 from tests.utils import multi_gpu_test
 from vllm.config import VllmConfig
 from vllm.engine.arg_utils import EngineArgs
-from vllm.utils import get_open_port
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.executor.multiproc_executor import MultiprocExecutor

@@ -333,7 +333,9 @@ def test_multiproc_executor_multi_node():
    - Node 1 (rank 1): Uses GPUs 2,3 (CUDA_VISIBLE_DEVICES=2,3) with TP=2
    Total world_size = 4, nnodes = 2
    """
-    port = get_open_port()
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        s.bind(("", 0))
+        port = s.getsockname()[1]
    # symm_mem does not work for simulating multi instance in single node
    os.environ["VLLM_ALLREDUCE_USE_SYMM_MEM"] = "0"


--- a/tests/distributed/test_nccl_symm_mem_allreduce.py
+++ b/tests/distributed/test_nccl_symm_mem_allreduce.py
@@ -10,6 +10,7 @@ import torch.distributed as dist
 import torch.multiprocessing as mp

 import vllm.envs as envs
+from tests.utils import ensure_current_vllm_config
 from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.distributed.device_communicators.cuda_communicator import CudaCommunicator
 from vllm.distributed.device_communicators.pynccl import register_nccl_symmetric_ops
@@ -37,7 +38,7 @@ def nccl_symm_mem_allreduce_worker(local_rank: int, world_size: int):
        m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
        dtype = torch.bfloat16
        device = torch.device(f"cuda:{local_rank}")
-        torch.cuda.set_device(device)
+        torch.accelerator.set_device_index(device)
        torch.set_default_device(device)
        torch.set_default_dtype(dtype)
        update_environment_variables(
@@ -51,7 +52,8 @@ def nccl_symm_mem_allreduce_worker(local_rank: int, world_size: int):
        )

        init_distributed_environment()
-        initialize_model_parallel(tensor_model_parallel_size=world_size)
+        with ensure_current_vllm_config():
+            initialize_model_parallel(tensor_model_parallel_size=world_size)

        cuda_communicator = typing.cast(
            CudaCommunicator, get_tp_group().device_communicator
@@ -82,7 +84,7 @@ def nccl_symm_mem_allreduce_worker(local_rank: int, world_size: int):
 @pytest.mark.parametrize("world_size", [2])
 @pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA")
 def test_nccl_symm_mem_allreduce(monkeypatch: pytest.MonkeyPatch, world_size):
-    if world_size > torch.cuda.device_count():
+    if world_size > torch.accelerator.device_count():
        pytest.skip("Not enough GPUs to run the test.")

    # Enable SymmMemCommunicator

--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@@ -247,6 +247,7 @@ def _compare_tp(
    hf_config = get_config(model_id, trust_remote_code)
    require_embed_inputs = model_info.require_embed_inputs
    max_num_seqs = model_info.max_num_seqs
+    enable_prefix_caching = model_info.enable_prefix_caching

    dtype = "float16"
    if hf_config.model_type in _FLOAT16_NOT_SUPPORTED_MODELS:
@@ -300,6 +301,8 @@ def _compare_tp(
        common_args.extend(["--load-format", load_format])
    if hf_overrides:
        common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
+    if not enable_prefix_caching:
+        common_args.append("--no-enable-prefix-caching")
    if require_embed_inputs:
        common_args.extend(
            [

--- a/tests/distributed/test_pynccl.py
+++ b/tests/distributed/test_pynccl.py
@@ -9,6 +9,7 @@ import pytest
 import torch
 import torch.distributed

+from tests.utils import ensure_current_vllm_config
 from vllm.distributed.communication_op import tensor_model_parallel_all_reduce  # noqa
 from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
 from vllm.distributed.device_communicators.pynccl_wrapper import NCCLLibrary
@@ -53,7 +54,7 @@ def worker_fn_wrapper(fn):
        update_environment_variables(env)
        local_rank = os.environ["LOCAL_RANK"]
        device = torch.device(f"cuda:{local_rank}")
-        torch.cuda.set_device(device)
+        torch.accelerator.set_device_index(device)
        init_distributed_environment()
        fn()

@@ -67,12 +68,12 @@ def worker_fn():
    )
    tensor = torch.ones(16, 1024, 1024, dtype=torch.float32).cuda(pynccl_comm.rank)
    tensor = pynccl_comm.all_reduce(tensor)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    assert torch.all(tensor == pynccl_comm.world_size).cpu().item()


 @pytest.mark.skipif(
-    torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test."
+    torch.accelerator.device_count() < 2, reason="Need at least 2 GPUs to run the test."
 )
 def test_pynccl():
    distributed_run(worker_fn, 2)
@@ -92,16 +93,16 @@ def multiple_allreduce_worker_fn():
    if torch.distributed.get_rank() in [0, 1]:
        tensor = pynccl_comm.all_reduce(tensor)
        tensor = pynccl_comm.all_reduce(tensor)
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        assert torch.all(tensor == 4).cpu().item()
    else:
        tensor = pynccl_comm.all_reduce(tensor)
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        assert torch.all(tensor == 2).cpu().item()


 @pytest.mark.skipif(
-    torch.cuda.device_count() < 4, reason="Need at least 4 GPUs to run the test."
+    torch.accelerator.device_count() < 4, reason="Need at least 4 GPUs to run the test."
 )
 def test_pynccl_multiple_allreduce():
    # this tests pynccl for multiple tp groups, in a standalone way
@@ -112,23 +113,24 @@ def test_pynccl_multiple_allreduce():
 @worker_fn_wrapper
 def multiple_allreduce_with_vllm_worker_fn():
    device = torch.device(f"cuda:{torch.distributed.get_rank()}")
-    ensure_model_parallel_initialized(2, 2)
+    with ensure_current_vllm_config():
+        ensure_model_parallel_initialized(2, 2)
    tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device)
    with graph_capture(device=device):
        # two tp groups can communicate independently
        if torch.distributed.get_rank() in [0, 1]:
            tensor = tensor_model_parallel_all_reduce(tensor)
            tensor = tensor_model_parallel_all_reduce(tensor)
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
            assert torch.all(tensor == 4).cpu().item()
        else:
            tensor = tensor_model_parallel_all_reduce(tensor)
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
            assert torch.all(tensor == 2).cpu().item()


 @pytest.mark.skipif(
-    torch.cuda.device_count() < 4, reason="Need at least 4 GPUs to run the test."
+    torch.accelerator.device_count() < 4, reason="Need at least 4 GPUs to run the test."
 )
 def test_pynccl_multiple_allreduce_with_vllm():
    # this tests pynccl for multiple tp groups, together with vllm
@@ -145,12 +147,12 @@ def worker_fn_with_cudagraph():
        )
        # run something in the default stream to initialize torch engine
        a = torch.ones((4, 4), device=f"cuda:{pynccl_comm.rank}")
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        with torch.cuda.graph(graph):
            a_out = pynccl_comm.all_reduce(a)
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        graph.replay()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        assert torch.all(a_out == pynccl_comm.world_size).cpu().item()


@@ -178,12 +180,12 @@ def all_gather_worker_fn():
    ).to(device)

    pynccl_comm.all_gather(result, tensor)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8)


 @pytest.mark.skipif(
-    torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test."
+    torch.accelerator.device_count() < 2, reason="Need at least 2 GPUs to run the test."
 )
 def test_pynccl_all_gather():
    distributed_run(all_gather_worker_fn, 2)
@@ -213,12 +215,12 @@ def all_gatherv_worker_fn():
    ).to(device)

    pynccl_comm.all_gatherv(result, tensor, sizes=sizes)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8)


 @pytest.mark.skipif(
-    torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test."
+    torch.accelerator.device_count() < 2, reason="Need at least 2 GPUs to run the test."
 )
 def test_pynccl_all_gatherv():
    distributed_run(all_gatherv_worker_fn, 2)
@@ -253,12 +255,12 @@ def reduce_scatter_worker_fn():
    ).to(device)

    pynccl_comm.reduce_scatter(result, tensor)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8)


 @pytest.mark.skipif(
-    torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test."
+    torch.accelerator.device_count() < 2, reason="Need at least 2 GPUs to run the test."
 )
 def test_pynccl_reduce_scatter():
    distributed_run(reduce_scatter_worker_fn, 2)
@@ -291,19 +293,19 @@ def reduce_scatterv_worker_fn():
    expected = sum(tensor[start:end] for tensor in all_tensors).to(device)

    pynccl_comm.reduce_scatterv(result, tensor, sizes=sizes)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8)


 @pytest.mark.skipif(
-    torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test."
+    torch.accelerator.device_count() < 2, reason="Need at least 2 GPUs to run the test."
 )
 def test_pynccl_reduce_scatterv():
    distributed_run(reduce_scatterv_worker_fn, 2)


 @pytest.mark.skipif(
-    torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test."
+    torch.accelerator.device_count() < 2, reason="Need at least 2 GPUs to run the test."
 )
 def test_pynccl_with_cudagraph():
    distributed_run(worker_fn_with_cudagraph, 2)
@@ -323,12 +325,12 @@ def send_recv_worker_fn():
        pynccl_comm.send(tensor, dst=(pynccl_comm.rank + 1) % pynccl_comm.world_size)
    else:
        pynccl_comm.recv(tensor, src=(pynccl_comm.rank - 1) % pynccl_comm.world_size)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    assert torch.all(tensor == 1).cpu().item()


 @pytest.mark.skipif(
-    torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test."
+    torch.accelerator.device_count() < 2, reason="Need at least 2 GPUs to run the test."
 )
 def test_pynccl_send_recv():
    distributed_run(send_recv_worker_fn, 2)
@@ -353,7 +355,7 @@ def multiple_send_recv_worker_fn():
        pynccl_comm.send(tensor, dst=(pynccl_comm.rank + 1) % pynccl_comm.world_size)
    else:
        pynccl_comm.recv(tensor, src=(pynccl_comm.rank - 1) % pynccl_comm.world_size)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    if torch.distributed.get_rank() in [0, 2]:
        assert torch.all(tensor == 1).cpu().item()
    else:
@@ -361,14 +363,14 @@ def multiple_send_recv_worker_fn():


 @pytest.mark.skipif(
-    torch.cuda.device_count() < 4, reason="Need at least 4 GPUs to run the test."
+    torch.accelerator.device_count() < 4, reason="Need at least 4 GPUs to run the test."
 )
 def test_pynccl_multiple_send_recv():
    distributed_run(multiple_send_recv_worker_fn, 4)


 @pytest.mark.skipif(
-    torch.cuda.device_count() < 4, reason="Need at least 4 GPUs to run the test."
+    torch.accelerator.device_count() < 4, reason="Need at least 4 GPUs to run the test."
 )
 def test_pynccl_broadcast():
    distributed_run(broadcast_worker_fn, 4)
@@ -394,7 +396,7 @@ def broadcast_worker_fn():
        pynccl_comm.broadcast(recv_tensors[i], src=i)
        # the broadcast op might be launched in a different stream
        # need to synchronize to make sure the tensor is ready
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        assert torch.all(recv_tensors[i] == i).cpu().item()



--- a/tests/distributed/test_quick_all_reduce.py
+++ b/tests/distributed/test_quick_all_reduce.py
@@ -39,7 +39,7 @@ def graph_quickreduce(
    with monkeypatch.context() as m:
        m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
        device = torch.device(f"cuda:{rank}")
-        torch.cuda.set_device(device)
+        torch.accelerator.set_device_index(device)
        init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
        ensure_model_parallel_initialized(tp_size, pp_size)
        group = get_tp_group().device_group
@@ -52,7 +52,7 @@ def graph_quickreduce(
        data = torch.zeros(1)
        data = data.to(device=device)
        torch.distributed.all_reduce(data, group=group)
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        del data

        # we use the first group to communicate once
@@ -65,13 +65,11 @@ def graph_quickreduce(
        for sz in test_sizes:
            for dtype in [torch.float16, torch.bfloat16]:
                with graph_capture(device=device) as graph_capture_context:
-                    inp1 = torch.randint(
-                        1, 23, (sz,), dtype=dtype, device=torch.cuda.current_device()
-                    )
-                    inp2 = torch.randint(
-                        -23, 1, (sz,), dtype=dtype, device=torch.cuda.current_device()
-                    )
-                    torch.cuda.synchronize()
+                    device_idx = torch.accelerator.current_device_index()
+                    inp1 = torch.randint(1, 23, (sz,), dtype=dtype, device=device_idx)
+                    inp2 = torch.randint(-23, 1, (sz,), dtype=dtype, device=device_idx)
+
+                    torch.accelerator.synchronize()
                    graph = torch.cuda.CUDAGraph()
                    with torch.cuda.graph(graph, stream=graph_capture_context.stream):
                        for _ in range(num_communication):
@@ -95,7 +93,7 @@ def eager_quickreduce(
    with monkeypatch.context() as m:
        m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
        device = torch.device(f"cuda:{rank}")
-        torch.cuda.set_device(device)
+        torch.accelerator.set_device_index(device)

        init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)

@@ -130,7 +128,7 @@ def test_custom_quick_allreduce(
    quant_mode,
 ):
    world_size = tp_size * pipeline_parallel_size
-    if world_size > torch.cuda.device_count():
+    if world_size > torch.accelerator.device_count():
        pytest.skip("Not enough GPUs to run the test.")

    monkeypatch.setenv("VLLM_ROCM_QUICK_REDUCE_QUANTIZATION", quant_mode)
@@ -145,7 +143,7 @@ def qr_variable_input(rank, world_size):
    has been observed with the gpt_oss model).
    """
    device = torch.device(f"cuda:{rank}")
-    torch.cuda.set_device(device)
+    torch.accelerator.set_device_index(device)
    qr_max_size = None  # MB
    _ptr = ops.init_custom_qr(rank, world_size, qr_max_size)
    ranks = []
@@ -169,14 +167,13 @@ def qr_variable_input(rank, world_size):
    s1 = 1024
    while num < 50000:  # 50000 is sufficient to identify issues.
        dtype = torch.float16
+        device_idx = torch.accelerator.current_device_index()
        if num % 2 == 0:
            s2 = 1024
-            inp1 = torch.zeros(
-                (s1, s2), dtype=dtype, device=torch.cuda.current_device()
-            )
+            inp1 = torch.zeros((s1, s2), dtype=dtype, device=device_idx)
        else:
            s2 = 2048
-            inp1 = torch.ones((s1, s2), dtype=dtype, device=torch.cuda.current_device())
+            inp1 = torch.ones((s1, s2), dtype=dtype, device=device_idx)
        result = torch.empty_like(inp1)
        # FP = 0 INT8 = 1 INT6 = 2 INT4 = 3 NONE = 4
        ops.qr_all_reduce(_ptr, inp1, result, 3, cast_bf2half=True)
@@ -198,7 +195,7 @@ def qr_variable_input(rank, world_size):
 @pytest.mark.parametrize("pipeline_parallel_size", [1])
 def test_custom_quick_allreduce_variable_input(tp_size, pipeline_parallel_size):
    world_size = tp_size * pipeline_parallel_size
-    if world_size > torch.cuda.device_count():
+    if world_size > torch.accelerator.device_count():
        pytest.skip("Not enough GPUs to run the test.")

    multiprocessing.set_start_method("spawn", force=True)

--- a/tests/distributed/test_shm_broadcast.py
+++ b/tests/distributed/test_shm_broadcast.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-import multiprocessing
 import random
+import threading
 import time
+from unittest import mock

+import multiprocess as mp
 import numpy as np
+import pytest
 import torch.distributed as dist

 from vllm.distributed.device_communicators.shm_broadcast import MessageQueue
@@ -22,7 +25,14 @@ def get_arrays(n: int, seed: int = 0) -> list[np.ndarray]:
    return [np.random.randint(1, 100, i) for i in sizes]


-def distributed_run(fn, world_size):
+def distributed_run(fn, world_size, timeout=60):
+    """Run a function in multiple processes with proper error handling.
+
+    Args:
+        fn: Function to run in each process
+        world_size: Number of processes to spawn
+        timeout: Maximum time in seconds to wait for processes (default: 60)
+    """
    number_of_processes = world_size
    processes = []
    for i in range(number_of_processes):
@@ -33,19 +43,45 @@ def distributed_run(fn, world_size):
        env["LOCAL_WORLD_SIZE"] = str(number_of_processes)
        env["MASTER_ADDR"] = "localhost"
        env["MASTER_PORT"] = "12345"
-        p = multiprocessing.Process(target=fn, args=(env,))
+        p = mp.Process(target=fn, args=(env,))
        processes.append(p)
        p.start()

-    for p in processes:
-        p.join()
+    # Monitor processes and fail fast if any process fails
+    start_time = time.time()
+    failed_processes = []
+
+    # Wait for all processes, checking for failures
+    while time.time() - start_time < timeout:
+        all_done = True
+        for i, p in enumerate(processes):
+            if p.is_alive():
+                all_done = False
+            elif p.exitcode != 0:
+                # Process failed
+                failed_processes.append((i, p.exitcode))
+                break
+
+        if failed_processes or all_done:
+            break
+        time.sleep(0.1)  # Check every 100ms

-    for p in processes:
-        assert p.exitcode == 0
+    # Check for timeout if no failures detected yet
+    for i, p in enumerate(processes):
+        if p.is_alive():
+            p.kill()
+            p.join()
+
+    # Report failures
+    if failed_processes:
+        error_msg = "Distributed test failed:\n"
+        for rank, status in failed_processes:
+            error_msg += f"  Rank {rank}: Exit code {status}\n"
+        raise AssertionError(error_msg)


 def worker_fn_wrapper(fn):
-    # `multiprocessing.Process` cannot accept environment variables directly
+    # `mp.Process` cannot accept environment variables directly
    # so we need to pass the environment variables as arguments
    # and update the environment variables in the function
    def wrapped_fn(env):
@@ -115,3 +151,244 @@ def worker_fn():

 def test_shm_broadcast():
    distributed_run(worker_fn, 4)
+
+
+@worker_fn_wrapper
+def worker_fn_test_shutdown_busy():
+    rank = dist.get_rank()
+    writer_rank = 2
+    message_queue = MessageQueue.create_from_process_group(
+        dist.group.WORLD, 40 * 1024, 2, writer_rank
+    )
+
+    if not message_queue._is_writer:
+        # Put into busy mode
+        message_queue._spin_condition.busy_loop_s = 9999
+
+        shutdown_event = threading.Event()
+
+        def shutdown_thread(mq, shutdown_event):
+            shutdown_event.wait()
+            mq.shutdown()
+
+        threading.Thread(
+            target=shutdown_thread, args=(message_queue, shutdown_event)
+        ).start()
+
+        with pytest.raises(TimeoutError):
+            message_queue.dequeue(timeout=0.01)
+
+        shutdown_event.set()
+
+        with pytest.raises(RuntimeError, match="cancelled"):
+            message_queue.dequeue(timeout=1)
+
+        assert message_queue.shutting_down
+
+    print(f"torch distributed passed the test! Rank {rank}")
+    dist.barrier()
+
+
+def test_message_queue_shutdown_busy(caplog_vllm):
+    distributed_run(worker_fn_test_shutdown_busy, 4)
+    print(caplog_vllm.text)
+
+
+@worker_fn_wrapper
+def worker_fn_test_shutdown_idle():
+    rank = dist.get_rank()
+    writer_rank = 2
+    message_queue = MessageQueue.create_from_process_group(
+        dist.group.WORLD, 40 * 1024, 2, writer_rank
+    )
+
+    if not message_queue._is_writer:
+        # Put into idle mode
+        message_queue._spin_condition.last_read = 0
+
+        shutdown_event = threading.Event()
+
+        def shutdown_thread(mq, shutdown_event):
+            shutdown_event.wait()
+            mq.shutdown()
+
+        threading.Thread(
+            target=shutdown_thread, args=(message_queue, shutdown_event)
+        ).start()
+
+        with pytest.raises(TimeoutError):
+            message_queue.dequeue(timeout=0.01)
+
+        shutdown_event.set()
+
+        with pytest.raises(RuntimeError, match="cancelled"):
+            message_queue.dequeue(timeout=1)
+
+        assert message_queue.shutting_down
+
+    print(f"torch distributed passed the test! Rank {rank}")
+    dist.barrier()
+
+
+def test_message_queue_shutdown_idle():
+    distributed_run(worker_fn_test_shutdown_idle, 4)
+
+
+@worker_fn_wrapper
+def worker_fn_test_idle_to_busy():
+    rank = dist.get_rank()
+    writer_rank = 2
+    message_queue = MessageQueue.create_from_process_group(
+        dist.group.WORLD, 40 * 1024, 2, writer_rank
+    )
+
+    message1 = "hello world"
+    message2 = np.random.randint(1, 100, 100)
+    with mock.patch.object(
+        message_queue._spin_condition, "wait", wraps=message_queue._spin_condition.wait
+    ) as wrapped_wait:
+        if not message_queue._is_writer:
+            # Put into idle mode
+            message_queue._spin_condition.last_read = 0
+
+            # no messages, so expect a TimeoutError
+            with pytest.raises(TimeoutError):
+                message_queue.dequeue(timeout=0.01)
+            # wait should only be called once while idle
+            assert wrapped_wait.call_count == 1
+
+            # sync with the writer and wait for message1
+            dist.barrier()
+            recv_message = message_queue.dequeue(timeout=5)
+            assert recv_message == message1
+            # second call to wait, with a message read, this puts in a busy spin
+            assert wrapped_wait.call_count == 2
+
+            # sync with the writer and wait for message2
+            dist.barrier()
+            recv_message = message_queue.dequeue(timeout=1)
+            assert np.array_equal(recv_message, message2)
+            # in busy mode, we expect wait to have been called multiple times
+            assert wrapped_wait.call_count > 3
+        else:
+            # writer writes two messages in sync with the reader
+            dist.barrier()
+            # sleep delays the send to ensure reader enters the read loop
+            time.sleep(0.1)
+            message_queue.enqueue(message1)
+
+            dist.barrier()
+            time.sleep(0.1)
+            message_queue.enqueue(message2)
+
+    message_queue.shutdown()
+    assert message_queue.shutting_down
+    print(f"torch distributed passed the test! Rank {rank}")
+
+
+def test_message_queue_idle_wake():
+    distributed_run(worker_fn_test_idle_to_busy, 4)
+
+
+@worker_fn_wrapper
+def worker_fn_test_busy_to_idle():
+    rank = dist.get_rank()
+    writer_rank = 2
+    message_queue = MessageQueue.create_from_process_group(
+        dist.group.WORLD, 40 * 1024, 2, writer_rank
+    )
+
+    message1 = 12345
+    message2 = list(range(3))
+    with mock.patch.object(
+        message_queue._spin_condition, "wait", wraps=message_queue._spin_condition.wait
+    ) as wrapped_wait:
+        if not message_queue._is_writer:
+            # Put into busy mode
+            message_queue._spin_condition.busy_loop_s = 9999
+
+            # sync with the writer and wait for message1
+            dist.barrier()
+            recv_message = message_queue.dequeue(timeout=1)
+            assert recv_message == message1
+            # in busy mode, we expect wait to have been called many times
+            assert wrapped_wait.call_count > 1
+
+            # simulate busy loop ending
+            message_queue._spin_condition.busy_loop_s = 0
+            # ensure we enter idle mode, then record call count
+            with pytest.raises(TimeoutError):
+                message_queue.dequeue(timeout=0.01)
+            call_count = wrapped_wait.call_count
+
+            # sync with the writer and wait for message2
+            dist.barrier()
+            recv_message = message_queue.dequeue(timeout=1)
+            assert recv_message == message2
+
+            # call to wait after idle should only happen once
+            assert wrapped_wait.call_count == call_count + 1
+        else:
+            # writer writes two messages in sync with the reader
+            dist.barrier()
+            # sleep delays the send to ensure reader enters the read loop
+            time.sleep(0.1)
+            message_queue.enqueue(message1)
+
+            dist.barrier()
+            time.sleep(0.1)
+            message_queue.enqueue(message2)
+
+    message_queue.shutdown()
+    assert message_queue.shutting_down
+    print(f"torch distributed passed the test! Rank {rank}")
+
+
+def test_message_queue_busy_to_idle():
+    distributed_run(worker_fn_test_busy_to_idle, 4)
+
+
+def test_warning_logs(caplog_vllm):
+    """
+    Test that warning logs are emitted at VLLM_RINGBUFFER_WARNING_INTERVAL intervals
+    when indefinite=False, and are not emitted when indefinite=True.
+    """
+
+    # Patch the warning log interval to every 1 ms during reads
+    with mock.patch(
+        "vllm.distributed.device_communicators.shm_broadcast.VLLM_RINGBUFFER_WARNING_INTERVAL",
+        new=0.001,  # 1 ms
+    ):
+        writer = MessageQueue(
+            n_reader=1,
+            n_local_reader=1,
+            max_chunk_bytes=1024 * 1024,  # 1MB chunks
+            max_chunks=10,
+        )
+        reader = MessageQueue.create_from_handle(writer.export_handle(), rank=0)
+        writer.wait_until_ready()
+        reader.wait_until_ready()
+
+        # We should have at least one warning log here
+        # "0 seconds" expected due to rounding of 1ms test interval
+        with pytest.raises(TimeoutError):
+            reader.dequeue(timeout=0.01, indefinite=False)
+        assert any(
+            "No available shared memory broadcast block found in 0 seconds"
+            in record.message
+            for record in caplog_vllm.records
+        )
+        caplog_vllm.clear()
+
+        # We should have no warnings this time
+        with pytest.raises(TimeoutError):
+            reader.dequeue(timeout=0.01, indefinite=True)
+        assert all(
+            "No available shared memory broadcast block found in 0 seconds"
+            not in record.message
+            for record in caplog_vllm.records
+        )
+
+        # Clean up when done
+        writer.shutdown()
+        reader.shutdown()
--- a/tests/distributed/test_symm_mem_allreduce.py
+++ b/tests/distributed/test_symm_mem_allreduce.py
@@ -39,7 +39,7 @@ def symm_mem_allreduce_worker(local_rank: int, world_size: int, q: mp.Queue):
        m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
        dtype = torch.bfloat16
        device = torch.device(f"cuda:{local_rank}")
-        torch.cuda.set_device(device)
+        torch.accelerator.set_device_index(device)
        torch.set_default_device(device)
        torch.set_default_dtype(dtype)
        update_environment_variables(
@@ -105,7 +105,7 @@ def test_symm_mem_allreduce(
    monkeypatch: pytest.MonkeyPatch, tp_size, pipeline_parallel_size
 ):
    world_size = tp_size * pipeline_parallel_size
-    if world_size > torch.cuda.device_count():
+    if world_size > torch.accelerator.device_count():
        pytest.skip("Not enough GPUs to run the test.")
    q = mp.get_context("spawn").Queue()
    mp.spawn(symm_mem_allreduce_worker, args=(world_size, q), nprocs=world_size)
@@ -126,7 +126,7 @@ def test_symm_mem_allreduce(
 @pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA")
 def test_dp_with_symm_mem_allreduce(monkeypatch: pytest.MonkeyPatch):
    world_size = 4
-    if world_size > torch.cuda.device_count():
+    if world_size > torch.accelerator.device_count():
        pytest.skip("Not enough GPUs to run the test.")
    # Verify that the DataParallel runs without error
    engine_args = EngineArgs(

--- a/tests/distributed/test_torchrun_example.py
+++ b/tests/distributed/test_torchrun_example.py
@@ -22,7 +22,7 @@ prompts = [

 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

-# set different `gpu_memory_utilization` and `swap_space` for different ranks,
+# set different `gpu_memory_utilization` for different ranks,
 # to test if all ranks agree on the same kv cache configuration.
 llm = LLM(
    model="facebook/opt-125m",
@@ -30,7 +30,6 @@ llm = LLM(
    pipeline_parallel_size=int(os.getenv("PP_SIZE", 1)),
    distributed_executor_backend="external_launcher",
    gpu_memory_utilization=random.uniform(0.7, 0.9),
-    swap_space=random.randint(1, 4),
    seed=0,
 )