Merge branch 'v0.9.2-dev-ds-deepep-yql' into 'v0.9.2-dev-ds'

Add VLLM_ALLOW_MNNVL (default: false); set VLLM_ALLOW_MNNVL=1 to enable it. See merge request dcutoolkit/deeplearing/vllm!286

Merge branch 'v0.9.2-dev-ds-deepep-yql' into 'v0.9.2-dev-ds'
Add VLLM_ALLOW_MNNVL (default: false); set VLLM_ALLOW_MNNVL=1 to enable it. See merge request dcutoolkit/deeplearing/vllm!286
ba1999c2 · zhuwenwen · 52f895ab · ddb01cd9 · ba1999c2 · ba1999c2
Commit ba1999c2 authored Dec 07, 2025 by zhuwenwen
Show whitespace changes
Inline Side-by-side

Showing with 10 additions and 2 deletions

vllm/distributed/device_communicators/all2all.py vllm/distributed/device_communicators/all2all.py +4 -2

vllm/envs.py vllm/envs.py +6 -0

No files found.
--- a/vllm/distributed/device_communicators/all2all.py
+++ b/vllm/distributed/device_communicators/all2all.py
@@ -8,7 +8,7 @@ import torch.distributed as dist
 from vllm.forward_context import get_forward_context
 from vllm.logger import init_logger
 from vllm.utils import has_deep_ep, has_pplx
+import vllm.envs as envs
 from .base_device_communicator import All2AllManagerBase, Cache
 logger = init_logger(__name__)
@@ -253,7 +253,9 @@ class DeepEPLLAll2AllManager(DeepEPAll2AllManagerBase):
                    num_nvl_bytes=num_nvl_bytes,
                    num_rdma_bytes=num_rdma_bytes,
                    low_latency_mode=True,
-                    num_qps_per_rank=num_qps_per_rank)
+                    num_qps_per_rank=num_qps_per_rank,
+                    allow_mnnvl=envs.VLLM_ALLOW_MNNVL,
+                    )
    def get_handle(self, kwargs):
        """

--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -128,6 +128,7 @@ if TYPE_CHECKING:
    VLLM_NIXL_SIDE_CHANNEL_HOST: str = "localhost"
    VLLM_NIXL_SIDE_CHANNEL_PORT: int = 5557
    VLLM_ALL2ALL_BACKEND: str = "naive"
+    VLLM_ALLOW_MNNVL: bool = False
    VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: int = 163840
    VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS: int = 1
    VLLM_SLEEP_WHEN_IDLE: bool = False
@@ -952,6 +953,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_ALL2ALL_BACKEND":
    lambda: os.getenv("VLLM_ALL2ALL_BACKEND", "naive"),
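+    # If "1" or "true" (case-insensitive), pass allow_mnnvl=True to the DeepEP low-latency all2all buffer; off by default.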
+    "VLLM_ALLOW_MNNVL":
+    lambda: (os.environ.get("VLLM_ALLOW_MNNVL", "False").lower() in
+             ("true", "1")),
    # Control the maximum number of tokens per expert supported by the
    # NVFP4 MoE CUTLASS Kernel. This value is used to create a buffer for
    # the blockscale tensor of activations NVFP4 Quantization.