[1/N] Elastic EP Milestone 2 (#34861)

Signed-off-by: Yongji Wu <wuyongji317@gmail.com> Signed-off-by: Itay Alroy <ialroy@nvidia.com> Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com> Signed-off-by: Ron Tourgeman <rtourgeman@nvidia.com> Co-authored-by: Yongji Wu <wuyongji317@gmail.com> Co-authored-by: Tyler Michael Smith <tlrmchlsmth@gmail.com> Co-authored-by: Ron Tourgeman <rtourgeman@nvidia.com>

[1/N] Elastic EP Milestone 2 (#34861)
Signed-off-by: Yongji Wu <wuyongji317@gmail.com> Signed-off-by: Itay Alroy <ialroy@nvidia.com> Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com> Signed-off-by: Ron Tourgeman <rtourgeman@nvidia.com> Co-authored-by: Yongji Wu <wuyongji317@gmail.com> Co-authored-by: Tyler Michael Smith <tlrmchlsmth@gmail.com> Co-authored-by: Ron Tourgeman <rtourgeman@nvidia.com>
dea26833 · Itay Alroy · GitHub · 90805ff4 · dea26833 · dea26833
Unverified Commit dea26833 authored Feb 28, 2026 by Itay Alroy Committed by GitHub Feb 28, 2026
20 changed files
--- a/.buildkite/test_areas/expert_parallelism.yaml
+++ b/.buildkite/test_areas/expert_parallelism.yaml
@@ -20,4 +20,19 @@ steps:
  - tests/distributed/test_eplb_execute.py
  commands:
  - pytest -v -s distributed/test_eplb_execute.py
-  - pytest -v -s distributed/test_eplb_spec_decode.py
\ No newline at end of file
+  - pytest -v -s distributed/test_eplb_spec_decode.py
+
+- label: Elastic EP Scaling Test
+  timeout_in_minutes: 20
+  device: b200
+  optional: true
+  working_dir: "/vllm-workspace/tests"
+  num_devices: 4
+  source_file_dependencies:
+  - vllm/distributed/
+  - vllm/engine/
+  - vllm/executor/
+  - vllm/compilation/
+  - tests/distributed/
+  commands:
+  - pytest -v -s distributed/test_elastic_ep.py
--- a/tests/compile/passes/distributed/test_async_tp.py
+++ b/tests/compile/passes/distributed/test_async_tp.py
@@ -316,7 +316,6 @@ def async_tp_pass_on_test_model(

    # initialize distributed
    init_distributed_environment()
-    initialize_model_parallel(tensor_model_parallel_size=world_size)

    # configure vllm config for SequenceParallelismPass
    vllm_config = VllmConfig()
@@ -334,11 +333,10 @@ def async_tp_pass_on_test_model(
        model=model_name, trust_remote_code=True, dtype=dtype, seed=42
    )

-    async_tp_pass = AsyncTPPass(vllm_config)
-
-    # Set the global vllm_config for TestBackend which calls
-    # get_current_vllm_config()
    with set_current_vllm_config(vllm_config):
+        initialize_model_parallel(tensor_model_parallel_size=world_size)
+
+        async_tp_pass = AsyncTPPass(vllm_config)
        backend = TestBackend(async_tp_pass)

        assert (

--- a/tests/compile/passes/distributed/test_fusion_all_reduce.py
+++ b/tests/compile/passes/distributed/test_fusion_all_reduce.py
@@ -278,7 +278,6 @@ def all_reduce_fusion_pass_on_test_model(
    )

    init_distributed_environment()
-    initialize_model_parallel(tensor_model_parallel_size=world_size)

    custom_ops = []
    if enable_rms_norm_custom_op:
@@ -304,6 +303,7 @@ def all_reduce_fusion_pass_on_test_model(
        model=model_name, trust_remote_code=True, dtype=dtype, seed=42
    )
    with set_current_vllm_config(vllm_config):
+        initialize_model_parallel(tensor_model_parallel_size=world_size)
        all_reduce_fusion_pass = AllReduceFusionPass(vllm_config)
        noop_pass = NoOpEliminationPass(vllm_config)
        func_pass = FixFunctionalizationPass(vllm_config)

--- a/tests/compile/passes/distributed/test_sequence_parallelism.py
+++ b/tests/compile/passes/distributed/test_sequence_parallelism.py
@@ -242,7 +242,6 @@ def sequence_parallelism_pass_on_test_model(

    # initialize distributed
    init_distributed_environment()
-    initialize_model_parallel(tensor_model_parallel_size=world_size)

    # configure vllm config for SequenceParallelismPass
    custom_ops_list = custom_ops.split(",") if custom_ops else []
@@ -272,6 +271,7 @@ def sequence_parallelism_pass_on_test_model(
    )

    with set_current_vllm_config(vllm_config):
+        initialize_model_parallel(tensor_model_parallel_size=world_size)
        noop_pass = NoOpEliminationPass(vllm_config)
        sequence_parallelism_pass = SequenceParallelismPass(vllm_config)
        cleanup_pass = PostCleanupPass(vllm_config)

--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -176,16 +176,20 @@ def init_test_http_connection():

 @pytest.fixture
 def dist_init():
+    from tests.utils import ensure_current_vllm_config
+
    temp_file = tempfile.mkstemp()[1]
-    init_distributed_environment(
-        world_size=1,
-        rank=0,
-        distributed_init_method=f"file://{temp_file}",
-        local_rank=0,
-        backend="nccl",
-    )
-    initialize_model_parallel(1, 1)
-    yield
+
+    with ensure_current_vllm_config():
+        init_distributed_environment(
+            world_size=1,
+            rank=0,
+            distributed_init_method=f"file://{temp_file}",
+            local_rank=0,
+            backend="nccl",
+        )
+        initialize_model_parallel(1, 1)
+        yield
    cleanup_dist_env_and_memory()



--- a/tests/distributed/eplb_utils.py
+++ b/tests/distributed/eplb_utils.py
@@ -7,6 +7,7 @@ import random
 import torch
 import torch.multiprocessing as mp

+from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.distributed.parallel_state import (
    init_distributed_environment,
 )
@@ -42,7 +43,11 @@ def set_env_vars_and_device(env: dict[str, str]) -> None:
    local_rank = os.environ["LOCAL_RANK"]
    device = torch.device(f"cuda:{local_rank}")
    torch.cuda.set_device(device)
-    init_distributed_environment()
+
+    # Create a minimal vllm config for init_distributed_environment
+    vllm_config = VllmConfig()
+    with set_current_vllm_config(vllm_config):
+        init_distributed_environment()

    # Ensure each worker process has the same random seed
    random.seed(42)

--- a/tests/distributed/test_elastic_ep.py
+++ b/tests/distributed/test_elastic_ep.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import os
+import subprocess
+import time
+
+import pytest
+import requests
+
+from ..evals.gsm8k.gsm8k_eval import evaluate_gsm8k
+from ..utils import RemoteOpenAIServer, multi_gpu_test
+
+
+@pytest.fixture(autouse=True)
+def cleanup_ray_between_tests():
+    """Force-stop any lingering Ray processes between tests."""
+    subprocess.run(["ray", "stop", "--force"], timeout=30, capture_output=True)
+    time.sleep(5)
+    yield
+
+
+MODEL_NAME = "deepseek-ai/DeepSeek-V2-Lite-Chat"
+
+NUM_GSM8K_QUESTIONS = 256
+EXPECTED_ACCURACY = 0.58
+ACCURACY_TOL = 0.08
+MAX_NUM_SEQS = 32
+
+
+def _send_scale_command(server: RemoteOpenAIServer, new_dp_size: int) -> bool:
+    url = server.url_for("scale_elastic_ep")
+    payload = {"new_data_parallel_size": new_dp_size}
+    headers = {"Content-Type": "application/json"}
+
+    try:
+        response = requests.post(url, json=payload, headers=headers, timeout=300)
+        return response.status_code == 200
+    except requests.exceptions.RequestException:
+        return False
+
+
+def _run_gsm8k_eval(server: RemoteOpenAIServer, stage: str) -> float:
+    assert server.port is not None
+    result = evaluate_gsm8k(
+        num_questions=NUM_GSM8K_QUESTIONS,
+        host=f"http://{server.host}",
+        port=server.port,
+    )
+    accuracy = result["accuracy"]
+    print(
+        f"[{stage}] GSM8K accuracy: {accuracy:.3f} "
+        f"({result['num_questions']} questions)"
+    )
+    assert accuracy >= EXPECTED_ACCURACY, (
+        f"[{stage}] GSM8K accuracy {accuracy:.3f} is below "
+        f"expected threshold {EXPECTED_ACCURACY}"
+    )
+    return accuracy
+
+
+@multi_gpu_test(num_gpus=4)
+def test_elastic_ep_scaling():
+    vllm_serve_args = [
+        "--trust-remote-code",
+        "--tensor-parallel-size",
+        "1",
+        "--gpu-memory-utilization",
+        "0.8",
+        "--max-model-len",
+        "4096",
+        "--max-num-seqs",
+        str(MAX_NUM_SEQS),
+        "--enable-expert-parallel",
+        "--all2all-backend",
+        "allgather_reducescatter",
+        "--enable-elastic-ep",
+        "--enable-eplb",
+        "--eplb-config.num_redundant_experts",
+        "0",
+        "--data-parallel-backend",
+        "ray",
+        "--data-parallel-size",
+        "2",
+        "--api-server-count",
+        "1",
+    ]
+
+    leader_address = os.environ.get("LEADER_ADDRESS")
+    if leader_address:
+        vllm_serve_args.extend(["--data-parallel-address", leader_address])
+
+    with RemoteOpenAIServer(
+        MODEL_NAME, vllm_serve_args, env_dict={}, max_wait_seconds=1200
+    ) as server:
+        initial_accuracy = _run_gsm8k_eval(server, "Initial (2 GPUs)")
+
+        assert _send_scale_command(server, 4)
+        time.sleep(10)
+        scale_up_accuracy = _run_gsm8k_eval(server, "After scale up (4 GPUs)")
+
+        assert scale_up_accuracy >= initial_accuracy - ACCURACY_TOL, (
+            f"Scale up accuracy {scale_up_accuracy:.3f} dropped more than "
+            f"{ACCURACY_TOL} below initial accuracy {initial_accuracy:.3f}"
+        )
+
+        assert _send_scale_command(server, 2)
+        time.sleep(5)
+        scale_down_accuracy = _run_gsm8k_eval(server, "After scale down (2 GPUs)")
+
+        assert scale_down_accuracy >= initial_accuracy - ACCURACY_TOL, (
+            f"Scale down accuracy {scale_down_accuracy:.3f} dropped more than "
+            f"{ACCURACY_TOL} below initial accuracy {initial_accuracy:.3f}"
+        )
+
+        print("\nAccuracy Summary:")
+        print(f"  Initial:    {initial_accuracy:.3f}")
+        print(
+            f"  Scale up:   {scale_up_accuracy:.3f} "
+            f"(diff: {scale_up_accuracy - initial_accuracy:+.3f})"
+        )
+        print(
+            f"  Scale down: {scale_down_accuracy:.3f} "
+            f"(diff: {scale_down_accuracy - initial_accuracy:+.3f})"
+        )
+        print(f"  Tolerance:  {ACCURACY_TOL:.3f}")
+
+
+@multi_gpu_test(num_gpus=4)
+def test_elastic_ep_scaling_uneven():
+    """Test scale up with uneven worker distribution.
+
+    This tests the case where num_new_workers % old_dp_size != 0,
+    specifically 2 -> 3 where remainder = 1 % 2 = 1.
+    This exercises the remainder handling in sender-receiver pairing.
+    """
+    vllm_serve_args = [
+        "--trust-remote-code",
+        "--tensor-parallel-size",
+        "1",
+        "--gpu-memory-utilization",
+        "0.8",
+        "--max-model-len",
+        "4096",
+        "--max-num-seqs",
+        str(MAX_NUM_SEQS),
+        "--enable-expert-parallel",
+        "--all2all-backend",
+        "allgather_reducescatter",
+        "--enable-elastic-ep",
+        "--enable-eplb",
+        "--eplb-config.num_redundant_experts",
+        "0",
+        "--data-parallel-backend",
+        "ray",
+        "--data-parallel-size",
+        "2",
+        "--api-server-count",
+        "1",
+    ]
+
+    leader_address = os.environ.get("LEADER_ADDRESS")
+    if leader_address:
+        vllm_serve_args.extend(["--data-parallel-address", leader_address])
+
+    with RemoteOpenAIServer(
+        MODEL_NAME, vllm_serve_args, env_dict={}, max_wait_seconds=1200
+    ) as server:
+        initial_accuracy = _run_gsm8k_eval(server, "Initial (2 GPUs)")
+
+        # Scale 2 -> 3: This has remainder = 1 % 2 = 1
+        # Tests uneven sender-receiver pairing
+        assert _send_scale_command(server, 3)
+        time.sleep(10)
+        scale_up_accuracy = _run_gsm8k_eval(server, "After scale up (3 GPUs)")
+
+        assert scale_up_accuracy >= initial_accuracy - ACCURACY_TOL, (
+            f"Scale up accuracy {scale_up_accuracy:.3f} dropped more than "
+            f"{ACCURACY_TOL} below initial accuracy {initial_accuracy:.3f}"
+        )
+
+        # Scale back down to 2
+        assert _send_scale_command(server, 2)
+        time.sleep(5)
+        scale_down_accuracy = _run_gsm8k_eval(server, "After scale down (2 GPUs)")
+
+        assert scale_down_accuracy >= initial_accuracy - ACCURACY_TOL, (
+            f"Scale down accuracy {scale_down_accuracy:.3f} dropped more than "
+            f"{ACCURACY_TOL} below initial accuracy {initial_accuracy:.3f}"
+        )
+
+        print("\nAccuracy Summary (Uneven Scaling):")
+        print(f"  Initial:    {initial_accuracy:.3f}")
+        print(
+            f"  Scale up:   {scale_up_accuracy:.3f} "
+            f"(diff: {scale_up_accuracy - initial_accuracy:+.3f})"
+        )
+        print(
+            f"  Scale down: {scale_down_accuracy:.3f} "
+            f"(diff: {scale_down_accuracy - initial_accuracy:+.3f})"
+        )
+        print(f"  Tolerance:  {ACCURACY_TOL:.3f}")
--- a/tests/distributed/test_eplb_execute.py
+++ b/tests/distributed/test_eplb_execute.py
@@ -8,6 +8,7 @@ import pytest
 import torch
 import torch.distributed

+from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.distributed.eplb.rebalance_execute import (
    move_from_buffer,
    rearrange_expert_weights_inplace,
@@ -244,91 +245,96 @@ def _test_async_transfer_layer_without_mtp_worker(
    num_logical_experts: int,
 ) -> None:
    set_env_vars_and_device(env)
-    ensure_model_parallel_initialized(
-        tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
-    )

-    tp_group = get_tp_group()
-    ep_group = tp_group.device_group
-    ep_rank = torch.distributed.get_rank()
-    device = torch.device(f"cuda:{ep_rank}")
+    vllm_config = VllmConfig()
+    vllm_config.parallel_config.tensor_parallel_size = world_size

-    total_physical_experts = world_size * num_local_experts
-    hidden_sizes = [16, 32]
+    with set_current_vllm_config(vllm_config):
+        ensure_model_parallel_initialized(
+            tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
+        )

-    redundancy_config = create_redundancy_config(
-        num_logical_experts,
-        total_physical_experts,
-    )
-    old_indices = create_expert_indices_with_redundancy(
-        num_layers,
-        num_logical_experts,
-        total_physical_experts,
-        redundancy_config,
-    )
+        tp_group = get_tp_group()
+        ep_group = tp_group.device_group
+        ep_rank = torch.distributed.get_rank()
+        device = torch.device(f"cuda:{ep_rank}")

-    new_redundancy_config = create_redundancy_config(
-        num_logical_experts,
-        total_physical_experts,
-    )
-    new_indices = create_expert_indices_with_redundancy(
-        num_layers,
-        num_logical_experts,
-        total_physical_experts,
-        new_redundancy_config,
-    )
+        total_physical_experts = world_size * num_local_experts
+        hidden_sizes = [16, 32]

-    expert_weights = create_expert_weights(
-        num_layers,
-        num_local_experts,
-        hidden_sizes,
-        ep_rank,
-        device,
-        old_indices,
-    )
-    old_indices_cpu = old_indices.cpu()
-    new_indices_cpu = new_indices.cpu()
+        redundancy_config = create_redundancy_config(
+            num_logical_experts,
+            total_physical_experts,
+        )
+        old_indices = create_expert_indices_with_redundancy(
+            num_layers,
+            num_logical_experts,
+            total_physical_experts,
+            redundancy_config,
+        )

-    expert_buffer = [torch.empty_like(w) for w in expert_weights[0]]
-    cuda_stream = torch.cuda.Stream(device=device)
+        new_redundancy_config = create_redundancy_config(
+            num_logical_experts,
+            total_physical_experts,
+        )
+        new_indices = create_expert_indices_with_redundancy(
+            num_layers,
+            num_logical_experts,
+            total_physical_experts,
+            new_redundancy_config,
+        )

-    for layer_idx in range(num_layers):
-        is_unchanged, is_received_locally, recv_metadata = asyncio.run(
-            transfer_layer(
-                old_layer_indices=old_indices_cpu[layer_idx],
-                new_layer_indices=new_indices_cpu[layer_idx],
+        expert_weights = create_expert_weights(
+            num_layers,
+            num_local_experts,
+            hidden_sizes,
+            ep_rank,
+            device,
+            old_indices,
+        )
+        old_indices_cpu = old_indices.cpu()
+        new_indices_cpu = new_indices.cpu()
+
+        expert_buffer = [torch.empty_like(w) for w in expert_weights[0]]
+        cuda_stream = torch.cuda.Stream(device=device)
+
+        for layer_idx in range(num_layers):
+            is_unchanged, is_received_locally, recv_metadata = asyncio.run(
+                transfer_layer(
+                    old_layer_indices=old_indices_cpu[layer_idx],
+                    new_layer_indices=new_indices_cpu[layer_idx],
+                    expert_weights=expert_weights[layer_idx],
+                    expert_weights_buffer=expert_buffer,
+                    ep_group=ep_group,
+                    cuda_stream=cuda_stream,
+                )
+            )
+            cuda_stream.synchronize()
+            move_from_buffer(
                expert_weights=expert_weights[layer_idx],
-                expert_weights_buffer=expert_buffer,
-                ep_group=ep_group,
-                cuda_stream=cuda_stream,
+                expert_weights_buffers=expert_buffer,
+                is_unchanged=is_unchanged,
+                is_received_locally=is_received_locally,
+                recv_metadata=recv_metadata,
+                new_indices=new_indices_cpu[layer_idx].numpy(),
+                ep_rank=ep_rank,
            )
+
+        verify_expert_weights_after_shuffle(
+            expert_weights,
+            new_indices,
+            hidden_sizes,
+            ep_rank,
+            num_local_experts,
        )
-        cuda_stream.synchronize()
-        move_from_buffer(
-            expert_weights=expert_weights[layer_idx],
-            expert_weights_buffers=expert_buffer,
-            is_unchanged=is_unchanged,
-            is_received_locally=is_received_locally,
-            recv_metadata=recv_metadata,
-            new_indices=new_indices_cpu[layer_idx].numpy(),
-            ep_rank=ep_rank,
+        verify_redundant_experts_have_same_weights(
+            expert_weights,
+            new_indices,
+            hidden_sizes,
+            world_size,
+            num_local_experts,
        )

-    verify_expert_weights_after_shuffle(
-        expert_weights,
-        new_indices,
-        hidden_sizes,
-        ep_rank,
-        num_local_experts,
-    )
-    verify_redundant_experts_have_same_weights(
-        expert_weights,
-        new_indices,
-        hidden_sizes,
-        world_size,
-        num_local_experts,
-    )
-

 def _test_rearrange_expert_weights_with_redundancy(
    env, world_size, num_layers, num_local_experts, num_logical_experts
@@ -336,71 +342,76 @@ def _test_rearrange_expert_weights_with_redundancy(
    # Initialize model parallel (using tensor parallel as an entrypoint
    # to expert parallel)
    set_env_vars_and_device(env)
-    ensure_model_parallel_initialized(
-        tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
-    )

-    ep_group = get_tp_group().cpu_group
-    ep_rank = torch.distributed.get_rank()
-    device = torch.device(f"cuda:{ep_rank}")
+    vllm_config = VllmConfig()
+    vllm_config.parallel_config.tensor_parallel_size = world_size

-    # Test parameters
-    total_physical_experts = world_size * num_local_experts
-    hidden_sizes = [32, 64]  # Two different weight matrices
+    with set_current_vllm_config(vllm_config):
+        ensure_model_parallel_initialized(
+            tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
+        )

-    # Create old expert indices (with redundancy)
-    redundancy_config = create_redundancy_config(
-        num_logical_experts, total_physical_experts
-    )
+        ep_group = get_tp_group().cpu_group
+        ep_rank = torch.distributed.get_rank()
+        device = torch.device(f"cuda:{ep_rank}")

-    old_indices = create_expert_indices_with_redundancy(
-        num_layers,
-        num_logical_experts,
-        total_physical_experts,
-        redundancy_config,
-    )
+        # Test parameters
+        total_physical_experts = world_size * num_local_experts
+        hidden_sizes = [32, 64]  # Two different weight matrices

-    # Create new expert indices (with redundancy)
-    new_redundancy_config = create_redundancy_config(
-        num_logical_experts, total_physical_experts
-    )
-    new_indices = create_expert_indices_with_redundancy(
-        num_layers,
-        num_logical_experts,
-        total_physical_experts,
-        new_redundancy_config,
-    )
+        # Create old expert indices (with redundancy)
+        redundancy_config = create_redundancy_config(
+            num_logical_experts, total_physical_experts
+        )

-    # Create expert weights
-    expert_weights = create_expert_weights(
-        num_layers, num_local_experts, hidden_sizes, ep_rank, device, old_indices
-    )
+        old_indices = create_expert_indices_with_redundancy(
+            num_layers,
+            num_logical_experts,
+            total_physical_experts,
+            redundancy_config,
+        )

-    # Execute weight rearrangement
-    rearrange_expert_weights_inplace(
-        old_indices,
-        new_indices,
-        expert_weights,
-        ep_group,
-        is_profile=False,
-    )
+        # Create new expert indices (with redundancy)
+        new_redundancy_config = create_redundancy_config(
+            num_logical_experts, total_physical_experts
+        )
+        new_indices = create_expert_indices_with_redundancy(
+            num_layers,
+            num_logical_experts,
+            total_physical_experts,
+            new_redundancy_config,
+        )

-    # Verify the rearrangement result
-    verify_expert_weights_after_shuffle(
-        expert_weights,
-        new_indices,
-        hidden_sizes,
-        ep_rank,
-        num_local_experts,
-    )
+        # Create expert weights
+        expert_weights = create_expert_weights(
+            num_layers, num_local_experts, hidden_sizes, ep_rank, device, old_indices
+        )

-    verify_redundant_experts_have_same_weights(
-        expert_weights,
-        new_indices,
-        hidden_sizes,
-        world_size,
-        num_local_experts,
-    )
+        # Execute weight rearrangement
+        rearrange_expert_weights_inplace(
+            old_indices,
+            new_indices,
+            expert_weights,
+            ep_group,
+            is_profile=False,
+        )
+
+        # Verify the rearrangement result
+        verify_expert_weights_after_shuffle(
+            expert_weights,
+            new_indices,
+            hidden_sizes,
+            ep_rank,
+            num_local_experts,
+        )
+
+        verify_redundant_experts_have_same_weights(
+            expert_weights,
+            new_indices,
+            hidden_sizes,
+            world_size,
+            num_local_experts,
+        )


 @pytest.mark.parametrize(
@@ -444,58 +455,63 @@ def test_rearrange_expert_weights_with_redundancy(

 def _test_rearrange_expert_weights_no_change(env, world_size) -> None:
    set_env_vars_and_device(env)
-    ensure_model_parallel_initialized(
-        tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
-    )

-    ep_group = get_tp_group().cpu_group
-    ep_rank = torch.distributed.get_rank()
-    device = torch.device(f"cuda:{ep_rank}")
+    vllm_config = VllmConfig()
+    vllm_config.parallel_config.tensor_parallel_size = world_size

-    num_layers = 2
-    num_local_experts = 2
-    total_physical_experts = world_size * num_local_experts
-    num_logical_experts = total_physical_experts // 2  # Some redundancy
-    hidden_sizes = [32, 64]
+    with set_current_vllm_config(vllm_config):
+        ensure_model_parallel_initialized(
+            tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
+        )

-    # Create redundancy configuration
-    redundancy_config = [2] * num_logical_experts
+        ep_group = get_tp_group().cpu_group
+        ep_rank = torch.distributed.get_rank()
+        device = torch.device(f"cuda:{ep_rank}")

-    # Same indices - no change
-    indices = create_expert_indices_with_redundancy(
-        num_layers, num_logical_experts, total_physical_experts, redundancy_config
-    )
+        num_layers = 2
+        num_local_experts = 2
+        total_physical_experts = world_size * num_local_experts
+        num_logical_experts = total_physical_experts // 2  # Some redundancy
+        hidden_sizes = [32, 64]

-    expert_weights = create_expert_weights(
-        num_layers, num_local_experts, hidden_sizes, ep_rank, device, indices
-    )
+        # Create redundancy configuration
+        redundancy_config = [2] * num_logical_experts

-    # Save original weights
-    original_weights = []
-    for layer_weights in expert_weights:
-        layer_copy = []
-        for weight in layer_weights:
-            layer_copy.append(weight.clone())
-        original_weights.append(layer_copy)
-
-    # Execute rearrangement (should be no change)
-    rearrange_expert_weights_inplace(
-        indices,
-        indices,  # Same indices
-        expert_weights,
-        ep_group,
-        is_profile=False,
-    )
+        # Same indices - no change
+        indices = create_expert_indices_with_redundancy(
+            num_layers, num_logical_experts, total_physical_experts, redundancy_config
+        )

-    # Verify that the weights have not changed
-    for layer in range(num_layers):
-        for weight_idx in range(len(hidden_sizes)):
-            torch.testing.assert_close(
-                expert_weights[layer][weight_idx],
-                original_weights[layer][weight_idx],
-                msg=f"""Layer {layer}, weight {weight_idx}
+        expert_weights = create_expert_weights(
+            num_layers, num_local_experts, hidden_sizes, ep_rank, device, indices
+        )
+
+        # Save original weights
+        original_weights = []
+        for layer_weights in expert_weights:
+            layer_copy = []
+            for weight in layer_weights:
+                layer_copy.append(weight.clone())
+            original_weights.append(layer_copy)
+
+        # Execute rearrangement (should be no change)
+        rearrange_expert_weights_inplace(
+            indices,
+            indices,  # Same indices
+            expert_weights,
+            ep_group,
+            is_profile=False,
+        )
+
+        # Verify that the weights have not changed
+        for layer in range(num_layers):
+            for weight_idx in range(len(hidden_sizes)):
+                torch.testing.assert_close(
+                    expert_weights[layer][weight_idx],
+                    original_weights[layer][weight_idx],
+                    msg=f"""Layer {layer}, weight {weight_idx}
 should remain unchanged""",
-            )
+                )


 @pytest.mark.parametrize(
@@ -538,64 +554,69 @@ def test_rearrange_expert_weights_no_change(world_size):

 def _test_rearrange_expert_weights_profile_mode(env, world_size) -> None:
    set_env_vars_and_device(env)
-    ensure_model_parallel_initialized(
-        tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
-    )

-    ep_group = get_tp_group().cpu_group
-    ep_rank = torch.distributed.get_rank()
-    device = torch.device(f"cuda:{ep_rank}")
+    vllm_config = VllmConfig()
+    vllm_config.parallel_config.tensor_parallel_size = world_size

-    num_layers = 1
-    num_local_experts = 2
-    total_physical_experts = world_size * num_local_experts
-    num_logical_experts = total_physical_experts // 2
-    hidden_sizes = [32]
+    with set_current_vllm_config(vllm_config):
+        ensure_model_parallel_initialized(
+            tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
+        )

-    # Create different index distributions
-    old_redundancy = create_redundancy_config(
-        num_logical_experts, total_physical_experts
-    )
-    new_redundancy = create_redundancy_config(
-        num_logical_experts, total_physical_experts
-    )
+        ep_group = get_tp_group().cpu_group
+        ep_rank = torch.distributed.get_rank()
+        device = torch.device(f"cuda:{ep_rank}")

-    old_indices = create_expert_indices_with_redundancy(
-        num_layers, num_logical_experts, total_physical_experts, old_redundancy
-    )
-    new_indices = create_expert_indices_with_redundancy(
-        num_layers, num_logical_experts, total_physical_experts, new_redundancy
-    )
+        num_layers = 1
+        num_local_experts = 2
+        total_physical_experts = world_size * num_local_experts
+        num_logical_experts = total_physical_experts // 2
+        hidden_sizes = [32]

-    expert_weights = create_expert_weights(
-        num_layers, num_local_experts, hidden_sizes, ep_rank, device, old_indices
-    )
+        # Create different index distributions
+        old_redundancy = create_redundancy_config(
+            num_logical_experts, total_physical_experts
+        )
+        new_redundancy = create_redundancy_config(
+            num_logical_experts, total_physical_experts
+        )

-    # Save original weights
-    original_weights = []
-    for layer_weights in expert_weights:
-        layer_copy = []
-        for weight in layer_weights:
-            layer_copy.append(weight.clone())
-        original_weights.append(layer_copy)
-
-    # Execute profile mode rearrangement
-    rearrange_expert_weights_inplace(
-        old_indices,
-        new_indices,
-        expert_weights,
-        ep_group,
-        is_profile=True,  # Profile mode
-    )
+        old_indices = create_expert_indices_with_redundancy(
+            num_layers, num_logical_experts, total_physical_experts, old_redundancy
+        )
+        new_indices = create_expert_indices_with_redundancy(
+            num_layers, num_logical_experts, total_physical_experts, new_redundancy
+        )

-    # In profile mode, the weights should remain unchanged
-    for layer in range(num_layers):
-        for weight_idx in range(len(hidden_sizes)):
-            torch.testing.assert_close(
-                expert_weights[layer][weight_idx],
-                original_weights[layer][weight_idx],
-                msg="In profile mode, the weights should remain unchanged",
-            )
+        expert_weights = create_expert_weights(
+            num_layers, num_local_experts, hidden_sizes, ep_rank, device, old_indices
+        )
+
+        # Save original weights
+        original_weights = []
+        for layer_weights in expert_weights:
+            layer_copy = []
+            for weight in layer_weights:
+                layer_copy.append(weight.clone())
+            original_weights.append(layer_copy)
+
+        # Execute profile mode rearrangement
+        rearrange_expert_weights_inplace(
+            old_indices,
+            new_indices,
+            expert_weights,
+            ep_group,
+            is_profile=True,  # Profile mode
+        )
+
+        # In profile mode, the weights should remain unchanged
+        for layer in range(num_layers):
+            for weight_idx in range(len(hidden_sizes)):
+                torch.testing.assert_close(
+                    expert_weights[layer][weight_idx],
+                    original_weights[layer][weight_idx],
+                    msg="In profile mode, the weights should remain unchanged",
+                )


 @pytest.mark.parametrize("world_size", [2, 4])

--- a/tests/distributed/test_nccl_symm_mem_allreduce.py
+++ b/tests/distributed/test_nccl_symm_mem_allreduce.py
@@ -10,6 +10,7 @@ import torch.distributed as dist
 import torch.multiprocessing as mp

 import vllm.envs as envs
+from tests.utils import ensure_current_vllm_config
 from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.distributed.device_communicators.cuda_communicator import CudaCommunicator
 from vllm.distributed.device_communicators.pynccl import register_nccl_symmetric_ops
@@ -51,7 +52,8 @@ def nccl_symm_mem_allreduce_worker(local_rank: int, world_size: int):
        )

        init_distributed_environment()
-        initialize_model_parallel(tensor_model_parallel_size=world_size)
+        with ensure_current_vllm_config():
+            initialize_model_parallel(tensor_model_parallel_size=world_size)

        cuda_communicator = typing.cast(
            CudaCommunicator, get_tp_group().device_communicator

--- a/tests/distributed/test_pynccl.py
+++ b/tests/distributed/test_pynccl.py
@@ -9,6 +9,7 @@ import pytest
 import torch
 import torch.distributed

+from tests.utils import ensure_current_vllm_config
 from vllm.distributed.communication_op import tensor_model_parallel_all_reduce  # noqa
 from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
 from vllm.distributed.device_communicators.pynccl_wrapper import NCCLLibrary
@@ -112,7 +113,8 @@ def test_pynccl_multiple_allreduce():
 @worker_fn_wrapper
 def multiple_allreduce_with_vllm_worker_fn():
    device = torch.device(f"cuda:{torch.distributed.get_rank()}")
-    ensure_model_parallel_initialized(2, 2)
+    with ensure_current_vllm_config():
+        ensure_model_parallel_initialized(2, 2)
    tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device)
    with graph_capture(device=device):
        # two tp groups can communicate independently

--- a/tests/kernels/mamba/test_mamba_mixer2.py
+++ b/tests/kernels/mamba/test_mamba_mixer2.py
@@ -6,7 +6,7 @@ import unittest
 import pytest
 import torch

-from tests.utils import multi_gpu_test
+from tests.utils import ensure_current_vllm_config, multi_gpu_test
 from vllm.distributed.parallel_state import (
    init_distributed_environment,
    initialize_model_parallel,
@@ -87,7 +87,8 @@ def mixer2_gated_norm_tensor_parallel(

    # initialize distributed
    init_distributed_environment()
-    initialize_model_parallel(tensor_model_parallel_size=world_size)
+    with ensure_current_vllm_config():
+        initialize_model_parallel(tensor_model_parallel_size=world_size)

    # create random weights an inputs
    weight = torch.rand((hidden_size,), dtype=dtype, device=device)

--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -45,21 +45,24 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool):

 @pytest.fixture
 def dist_init():
+    from tests.utils import ensure_current_vllm_config
+
    temp_file = tempfile.mkstemp()[1]

    backend = "nccl"
    if current_platform.is_cpu() or current_platform.is_tpu():
        backend = "gloo"

-    init_distributed_environment(
-        world_size=1,
-        rank=0,
-        distributed_init_method=f"file://{temp_file}",
-        local_rank=0,
-        backend=backend,
-    )
-    initialize_model_parallel(1, 1)
-    yield
+    with ensure_current_vllm_config():
+        init_distributed_environment(
+            world_size=1,
+            rank=0,
+            distributed_init_method=f"file://{temp_file}",
+            local_rank=0,
+            backend=backend,
+        )
+        initialize_model_parallel(1, 1)
+        yield
    cleanup_dist_env_and_memory(shutdown_ray=True)



--- a/tests/lora/test_fused_moe_lora_kernel.py
+++ b/tests/lora/test_fused_moe_lora_kernel.py
@@ -6,7 +6,7 @@ import random
 import pytest
 import torch

-from tests.utils import multi_gpu_test
+from tests.utils import ensure_current_vllm_config, multi_gpu_test
 from vllm import _custom_ops as ops
 from vllm.distributed import (
    init_distributed_environment,
@@ -631,7 +631,8 @@ def use_fused_moe_lora_kernel_tensor_parallel(
        local_rank=local_rank,
        distributed_init_method=init_method,
    )
-    initialize_model_parallel(world_size, 1)
+    with ensure_current_vllm_config():
+        initialize_model_parallel(world_size, 1)
    tp_size = get_tensor_model_parallel_world_size()

    input_dim = K if column_parallel else N

--- a/tests/lora/test_worker.py
+++ b/tests/lora/test_worker.py
@@ -13,6 +13,7 @@ from vllm.config import (
    ParallelConfig,
    SchedulerConfig,
    VllmConfig,
+    set_current_vllm_config,
 )
 from vllm.config.load import LoadConfig
 from vllm.config.lora import LoRAConfig
@@ -77,8 +78,9 @@ def test_worker_apply_lora(qwen3_lora_files):
        distributed_init_method=f"file://{tempfile.mkstemp()[1]}",
    )

-    worker.init_device()
-    worker.load_model()
+    with set_current_vllm_config(vllm_config):
+        worker.init_device()
+        worker.load_model()

    set_active_loras(worker, [])
    assert worker.list_loras() == set()

--- a/tests/models/test_vision.py
+++ b/tests/models/test_vision.py
@@ -6,7 +6,7 @@ import pytest
 import torch
 import torch.multiprocessing as mp

-from tests.utils import multi_gpu_test
+from tests.utils import ensure_current_vllm_config, multi_gpu_test
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.distributed.parallel_state import (
    init_distributed_environment,
@@ -117,7 +117,8 @@ def run_dp_sharded_vision_model_vs_direct(

    # initialize distributed
    init_distributed_environment()
-    initialize_model_parallel(tensor_model_parallel_size=world_size)
+    with ensure_current_vllm_config():
+        initialize_model_parallel(tensor_model_parallel_size=world_size)

    # Create a test input tensor
    image_input = torch.randn(batch_size, 3, 224, 224)
@@ -302,7 +303,8 @@ def run_dp_sharded_mrope_vision_model_vs_direct(

    # initialize distributed
    init_distributed_environment()
-    initialize_model_parallel(tensor_model_parallel_size=world_size)
+    with ensure_current_vllm_config():
+        initialize_model_parallel(tensor_model_parallel_size=world_size)

    # Create test data
    grid_thw_list = []
@@ -377,7 +379,8 @@ def run_dp_sharded_mrope_vision_model_empty_input_worker(
    )

    init_distributed_environment()
-    initialize_model_parallel(tensor_model_parallel_size=world_size)
+    with ensure_current_vllm_config():
+        initialize_model_parallel(tensor_model_parallel_size=world_size)

    # Create empty inputs
    pixel_values = torch.empty((0, 768))
@@ -425,7 +428,8 @@ def run_dp_sharded_mrope_vision_model_uneven_load_worker(
    )

    init_distributed_environment()
-    initialize_model_parallel(tensor_model_parallel_size=world_size)
+    with ensure_current_vllm_config():
+        initialize_model_parallel(tensor_model_parallel_size=world_size)

    # Create images with very different sizes
    grid_thw_list = [

--- a/tests/utils.py
+++ b/tests/utils.py
@@ -895,6 +895,36 @@ def compare_all_settings(
                    )


+@contextmanager
+def ensure_current_vllm_config():
+    """Ensures a vllm config is set for the duration of the context.
+
+    If a config is already set, this is a no-op. Otherwise, it creates a default
+    VllmConfig and sets it for the duration of the context.
+
+    Used for tests that call functions which require a vllm config but don't
+    need a specific config.
+
+    Example:
+        with ensure_current_vllm_config():
+            init_distributed_environment(...)
+            ensure_model_parallel_initialized(...)
+    """
+    from vllm.config import (
+        VllmConfig,
+        get_current_vllm_config_or_none,
+        set_current_vllm_config,
+    )
+
+    if get_current_vllm_config_or_none() is not None:
+        # Config already set, just yield
+        yield
+    else:
+        # No config set, create a default one for the duration
+        with set_current_vllm_config(VllmConfig()):
+            yield
+
+
 def init_test_distributed_environment(
    tp_size: int,
    pp_size: int,
@@ -921,6 +951,7 @@ def init_test_distributed_environment(
            distributed_init_method=distributed_init_method,
            local_rank=local_rank,
        )
+        ensure_model_parallel_initialized(tp_size, pp_size)
    else:
        # No config set, create a default one for the test
        with set_current_vllm_config(VllmConfig()):
@@ -930,7 +961,7 @@ def init_test_distributed_environment(
                distributed_init_method=distributed_init_method,
                local_rank=local_rank,
            )
-    ensure_model_parallel_initialized(tp_size, pp_size)
+            ensure_model_parallel_initialized(tp_size, pp_size)


 def multi_process_parallel(

--- a/tests/v1/worker/test_gpu_model_runner.py
+++ b/tests/v1/worker/test_gpu_model_runner.py
@@ -789,8 +789,11 @@ def test_hybrid_attention_mamba_tensor_shapes():
            "MASTER_PORT": "12345",
        }
    )
-    init_distributed_environment()
-    initialize_model_parallel(tensor_model_parallel_size=1)
+    from tests.utils import ensure_current_vllm_config
+
+    with ensure_current_vllm_config():
+        init_distributed_environment()
+        initialize_model_parallel(tensor_model_parallel_size=1)
    torch.set_default_dtype(torch.float16)

    model_config = ModelConfig(

--- a/tests/v1/worker/test_worker_memory_snapshot.py
+++ b/tests/v1/worker/test_worker_memory_snapshot.py
@@ -10,6 +10,7 @@ from unittest.mock import patch
 import pytest
 import torch

+from vllm.config import set_current_vllm_config
 from vllm.engine.arg_utils import EngineArgs
 from vllm.utils.mem_utils import MemorySnapshot
 from vllm.v1.worker.gpu_worker import Worker, init_worker_distributed_environment
@@ -95,7 +96,12 @@ def worker_process(
            side_effect=make_operation_tracker("nccl_all_reduce", original_all_reduce),
        )

-        with init_patch, memory_patch, all_reduce_patch:
+        with (
+            init_patch,
+            memory_patch,
+            all_reduce_patch,
+            set_current_vllm_config(vllm_config),
+        ):
            # Initialize device (this is where we test the order)
            worker.init_device()


--- a/vllm/compilation/wrapper.py
+++ b/vllm/compilation/wrapper.py
@@ -319,3 +319,52 @@ class TorchCompileWithNoGuardsWrapper:
            yield
        finally:
            self.__class__.forward.__code__ = original
+
+
+def reset_compile_wrapper(model: torch.nn.Module) -> None:
+    """
+    Clean up compiled model and captured CUDA graphs for elastic EP.
+    """
+    if not isinstance(model, TorchCompileWithNoGuardsWrapper) and hasattr(
+        model, "model"
+    ):
+        model = model.model
+    if not isinstance(model, TorchCompileWithNoGuardsWrapper):
+        return
+    # model.do_not_compile is set by the @support_torch_compile decorator
+    if hasattr(model, "do_not_compile") and model.do_not_compile:
+        return
+    from vllm.compilation.counter import compilation_counter
+
+    # reset the compilation counter
+    compilation_counter.num_models_seen = 0
+    compilation_counter.num_graphs_seen = 0
+    compilation_counter.num_piecewise_graphs_seen = 0
+    compilation_counter.num_piecewise_capturable_graphs_seen = 0
+    compilation_counter.num_backend_compilations = 0
+    compilation_counter.num_gpu_runner_capture_triggers = 0
+    compilation_counter.num_cudagraph_captured = 0
+    compilation_counter.num_inductor_compiles = 0
+    compilation_counter.num_eager_compiles = 0
+    compilation_counter.num_cache_entries_updated = 0
+    compilation_counter.num_compiled_artifacts_saved = 0
+    compilation_counter.stock_torch_compile_count = 0
+
+    # Clear the AOT compiled function so the model is forced to
+    # recompile on the next call. Without this, decorators.py
+    # __call__ uses the stale aot_compiled_fn whose torchinductor
+    # kernels have old parameters (expert_map size for example)
+    # baked in as compile-time constants.
+    if hasattr(model, "aot_compiled_fn"):
+        model.aot_compiled_fn = None
+    if hasattr(model, "was_aot_compile_fn_loaded_from_disk"):
+        model.was_aot_compile_fn_loaded_from_disk = False
+
+    # Reset the cache_dir so VllmBackend recomputes the hash
+    # (data_parallel_size changed, so the config hash differs).
+    compilation_config = model.vllm_config.compilation_config
+    compilation_config.cache_dir = ""
+    compilation_config.local_cache_dir = ""
+
+    model.__class__.forward.__code__ = model.original_code_object()
+    TorchCompileWithNoGuardsWrapper.__init__(model)
--- a/vllm/config/parallel.py
+++ b/vllm/config/parallel.py
@@ -165,6 +165,9 @@ class ParallelConfig:
    disable_custom_all_reduce: bool = False
    """Disable the custom all-reduce kernel and fall back to NCCL."""

+    enable_elastic_ep: bool = False
+    """Enable elastic expert parallelism with stateless NCCL groups for DP/EP."""
+
    enable_dbo: bool = False
    """Enable dual batch overlap for the model executor."""
    ubatch_size: int = 0
@@ -244,6 +247,34 @@ class ParallelConfig:
    Set to be private as it's not intended to be configured by users.
    """

+    _stateless_dp_group_port_list: list[list[int]] = Field(default_factory=list)
+    """List of open ports for stateless DP groups when enable_elastic_ep is True.
+    Set to be private as it's not intended to be configured by users.
+    It is a list of list[int], with each inner list contains a set of 3 ports
+    to be used for setting up the stateless CPU/device/TCPStore groups
+    in StatelessGroupCoordinator. The number of inner lists is equal to
+    the number of DP groups, 
+    i.e., len(self._stateless_dp_group_port_list) == world_size_across_dp // dp_size,
+    and len(self._stateless_dp_group_port_list[i]) == 3 for all i.
+    """
+
+    _stateless_ep_group_port_list: list[list[int]] = Field(default_factory=list)
+    """List of open ports for stateless EP groups when enable_elastic_ep is True.
+    Set to be private as it's not intended to be configured by users.
+    len(self._stateless_ep_group_port_list) == world_size_across_dp // ep_size,
+    """
+
+    _stateless_eplb_group_port_list: list[list[int]] = Field(default_factory=list)
+    """List of open ports for stateless EPLB groups when enable_elastic_ep is True.
+    Same topology as EP but separate NCCL communicator to avoid deadlocks.
+    """
+
+    _stateless_world_group_port_list: list[list[int]] = Field(default_factory=list)
+    """List of open ports for stateless world group when enable_elastic_ep is True.
+    Set to be private as it's not intended to be configured by users.
+    len(self._stateless_world_group_port_list) == 1,
+    """
+
    decode_context_parallel_size: int = 1
    """Number of decode context parallel groups, because the world size does
    not change by dcp, it simply reuse the GPUs of TP group, and tp_size
@@ -402,7 +433,67 @@ class ParallelConfig:

        return answer

-    def stateless_init_dp_group(self) -> ProcessGroup:
+    def allocate_elastic_ep_ports(self) -> None:
+        """Allocate all ports for elastic EP (stateless groups + DP master).
+
+        Must be called AFTER ray.init() so that ports claimed by Ray's
+        idle worker pool are already in use and won't be returned by
+        get_open_ports_list().
+        """
+        if not self.enable_elastic_ep:
+            return
+        if self._stateless_world_group_port_list:
+            return
+
+        num_world_groups = 1
+        dp_size = self.data_parallel_size
+        ep_size = self.data_parallel_size * self.world_size_across_dp
+        num_dp_groups = max(1, self.world_size_across_dp // dp_size)
+        num_ep_groups = max(1, self.world_size_across_dp // ep_size)
+        num_eplb_groups = num_ep_groups
+        total_stateless_ports = (
+            num_world_groups + num_dp_groups + num_ep_groups + num_eplb_groups
+        ) * 3
+        num_dp_master_ports = 5
+
+        all_ports = get_open_ports_list(total_stateless_ports + num_dp_master_ports)
+
+        self._data_parallel_master_port_list = all_ports[-num_dp_master_ports:]
+        self.data_parallel_master_port = self._data_parallel_master_port_list.pop()
+        all_ports = all_ports[:-num_dp_master_ports]
+
+        self._stateless_world_group_port_list = [
+            all_ports[i : i + 3] for i in range(0, num_world_groups * 3, 3)
+        ]
+        start_idx = num_world_groups * 3
+        self._stateless_dp_group_port_list = [
+            all_ports[i : i + 3]
+            for i in range(start_idx, start_idx + num_dp_groups * 3, 3)
+        ]
+        start_idx += num_dp_groups * 3
+        self._stateless_ep_group_port_list = [
+            all_ports[i : i + 3]
+            for i in range(start_idx, start_idx + num_ep_groups * 3, 3)
+        ]
+        start_idx += num_ep_groups * 3
+        self._stateless_eplb_group_port_list = [
+            all_ports[i : i + 3]
+            for i in range(start_idx, start_idx + num_eplb_groups * 3, 3)
+        ]
+
+    def get_next_stateless_world_group_port(self) -> list[int]:
+        return self._stateless_world_group_port_list.pop()
+
+    def get_next_stateless_dp_group_port(self) -> list[int]:
+        return self._stateless_dp_group_port_list.pop()
+
+    def get_next_stateless_ep_group_port(self) -> list[int]:
+        return self._stateless_ep_group_port_list.pop()
+
+    def get_next_stateless_eplb_group_port(self) -> list[int]:
+        return self._stateless_eplb_group_port_list.pop()
+
+    def stateless_init_dp_group(self, return_store: bool = False) -> ProcessGroup:
        # NOTE: In high-concurrency scenarios multiple processes
        # can pick the same (currently free) port through a race
        # condition when calling `get_open_port()`. When the first
@@ -426,7 +517,8 @@ class ParallelConfig:
                    self.get_next_dp_init_port(),
                    self.data_parallel_rank,
                    self.data_parallel_size,
-                    backend=current_platform.dist_backend,
+                    backend="gloo",
+                    return_store=return_store,
                )
            except DistNetworkError as e:
                # We only want to retry when the root cause is EADDRINUSE.
@@ -561,6 +653,21 @@ class ParallelConfig:
            logger.info("Using external launcher for distributed inference.")
            self.world_size *= self.data_parallel_size

+        if self.enable_elastic_ep:
+            if not self.enable_eplb:
+                raise ValueError("Elastic EP is only supported with enable_eplb=True.")
+            if self.pipeline_parallel_size > 1:
+                raise ValueError(
+                    "Elastic EP is not supported with pipeline parallelism "
+                    f"(pipeline_parallel_size={self.pipeline_parallel_size})."
+                )
+            if self.data_parallel_external_lb or self.data_parallel_hybrid_lb:
+                raise NotImplementedError(
+                    "Elastic EP is not compatible with data_parallel_external_lb "
+                    "or data_parallel_hybrid_lb. Elastic EP relies on a single API "
+                    "server and core client to coordinate scale up/down."
+                )
+
        if self.data_parallel_size > 1 or self.data_parallel_size_local == 0:
            # Data parallel was specified in the engine args.
            if self.distributed_executor_backend == "external_launcher":
@@ -573,9 +680,12 @@ class ParallelConfig:
                    "Set data_parallel_rank to %d automatically.",
                    self.data_parallel_rank,
                )
-            if not self._data_parallel_master_port_list:
-                self._data_parallel_master_port_list = get_open_ports_list(5)
-            self.data_parallel_master_port = self._data_parallel_master_port_list.pop()
+            if not self.enable_elastic_ep:
+                if not self._data_parallel_master_port_list:
+                    self._data_parallel_master_port_list = get_open_ports_list(5)
+                self.data_parallel_master_port = (
+                    self._data_parallel_master_port_list.pop()
+                )

            if not (0 <= self.data_parallel_rank < self.data_parallel_size):
                raise ValueError(
@@ -602,7 +712,7 @@ class ParallelConfig:
            os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
            logger.info("Disabling V1 multiprocessing for external launcher.")

-        if self.distributed_executor_backend is None and self.world_size > 1:
+        if self.distributed_executor_backend is None and self.world_size_across_dp > 1:
            # We use multiprocessing by default if world_size fits on the
            # current node and we aren't in a ray placement group.