Commit ba1999c2 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge branch 'v0.9.2-dev-ds-deepep-yql' into 'v0.9.2-dev-ds'

Add VLLM_ALLOW_MNNVL env var (default: false); set VLLM_ALLOW_MNNVL=1 to enable

See merge request dcutoolkit/deeplearing/vllm!286
parents 52f895ab ddb01cd9
...@@ -8,7 +8,7 @@ import torch.distributed as dist ...@@ -8,7 +8,7 @@ import torch.distributed as dist
from vllm.forward_context import get_forward_context from vllm.forward_context import get_forward_context
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.utils import has_deep_ep, has_pplx from vllm.utils import has_deep_ep, has_pplx
import vllm.envs as envs
from .base_device_communicator import All2AllManagerBase, Cache from .base_device_communicator import All2AllManagerBase, Cache
logger = init_logger(__name__) logger = init_logger(__name__)
...@@ -253,7 +253,9 @@ class DeepEPLLAll2AllManager(DeepEPAll2AllManagerBase): ...@@ -253,7 +253,9 @@ class DeepEPLLAll2AllManager(DeepEPAll2AllManagerBase):
num_nvl_bytes=num_nvl_bytes, num_nvl_bytes=num_nvl_bytes,
num_rdma_bytes=num_rdma_bytes, num_rdma_bytes=num_rdma_bytes,
low_latency_mode=True, low_latency_mode=True,
num_qps_per_rank=num_qps_per_rank) num_qps_per_rank=num_qps_per_rank,
allow_mnnvl=envs.VLLM_ALLOW_MNNVL,
)
def get_handle(self, kwargs): def get_handle(self, kwargs):
""" """
......
...@@ -128,6 +128,7 @@ if TYPE_CHECKING: ...@@ -128,6 +128,7 @@ if TYPE_CHECKING:
VLLM_NIXL_SIDE_CHANNEL_HOST: str = "localhost" VLLM_NIXL_SIDE_CHANNEL_HOST: str = "localhost"
VLLM_NIXL_SIDE_CHANNEL_PORT: int = 5557 VLLM_NIXL_SIDE_CHANNEL_PORT: int = 5557
VLLM_ALL2ALL_BACKEND: str = "naive" VLLM_ALL2ALL_BACKEND: str = "naive"
VLLM_ALLOW_MNNVL: bool = False
VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: int = 163840 VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: int = 163840
VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS: int = 1 VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS: int = 1
VLLM_SLEEP_WHEN_IDLE: bool = False VLLM_SLEEP_WHEN_IDLE: bool = False
...@@ -952,6 +953,11 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -952,6 +953,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_ALL2ALL_BACKEND": "VLLM_ALL2ALL_BACKEND":
lambda: os.getenv("VLLM_ALL2ALL_BACKEND", "naive"), lambda: os.getenv("VLLM_ALL2ALL_BACKEND", "naive"),
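# Value forwarded as the `allow_mnnvl` argument by DeepEPLLAll2AllManager.
# Disabled by default; set VLLM_ALLOW_MNNVL to "1" or "true"
# (case-insensitive) to enable it.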
"VLLM_ALLOW_MNNVL":
lambda: (os.environ.get("VLLM_ALLOW_MNNVL", "False").lower() in
("true", "1")),
# Control the maximum number of tokens per expert supported by the # Control the maximum number of tokens per expert supported by the
# NVFP4 MoE CUTLASS Kernel. This value is used to create a buffer for # NVFP4 MoE CUTLASS Kernel. This value is used to create a buffer for
# the blockscale tensor of activations NVFP4 Quantization. # the blockscale tensor of activations NVFP4 Quantization.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment