"vllm/vscode:/vscode.git/clone" did not exist on "15aba081f33e6d048422df6dcdb94301d08d13e6"
Unverified commit c9d3c6e6, authored by Itay Alroy, committed by GitHub
Browse files

fused_moe: treat NIXL EP as batched experts (#40412)


Signed-off-by: Itay Alroy <ialroy@nvidia.com>
parent 51adca74
......@@ -993,6 +993,10 @@ class FusedMoEParallelConfig:
def use_batched_activation_format(self):
return self.use_deepep_ll_kernels or self.use_nixl_ep_kernels
@property
def needs_round_robin_routing_tables(self):
return self.use_deepep_ll_kernels or self.use_nixl_ep_kernels
@property
def use_ag_rs_all2all_kernels(self):
return (
......@@ -1294,3 +1298,7 @@ class FusedMoEConfig:
@property
def use_nixl_ep_kernels(self):
return self.moe_parallel_config.use_nixl_ep_kernels
@property
def needs_round_robin_routing_tables(self):
return self.moe_parallel_config.needs_round_robin_routing_tables
......@@ -180,8 +180,7 @@ def determine_expert_placement_strategy(
return "linear"
if (
moe_parallel_config.use_all2all_kernels
and not moe_parallel_config.use_deepep_ll_kernels
and not moe_parallel_config.use_nixl_ep_kernels
and not moe_parallel_config.needs_round_robin_routing_tables
):
logger.warning(
"Round-robin expert placement currently only supports "
......@@ -687,8 +686,7 @@ class FusedMoE(PluggableLayer):
# Currently routing_tables only needed for round-robin expert placement
# with DeepEP-ll or NIXL EP all2all backends.
if self.expert_placement_strategy != "round_robin" or (
not self.moe_parallel_config.use_deepep_ll_kernels
and not self.moe_parallel_config.use_nixl_ep_kernels
not self.moe_parallel_config.needs_round_robin_routing_tables
):
return None
......
......@@ -884,7 +884,7 @@ def make_mxfp4_moe_kernel(
experts,
shared_experts=(
shared_experts
if moe_config.moe_parallel_config.use_deepep_ll_kernels
if moe_config.moe_parallel_config.use_batched_activation_format
else None
),
inplace=(
......
......@@ -168,10 +168,7 @@ def select_nvfp4_moe_backend(
NvFp4MoeBackend.EMULATION,
]
# NOTE(rob): this is kind of a hack. We need to peek into
# the prepare-finalize selection to determine if we are using
# the batched or standard expert format.
use_batched = config.moe_parallel_config.use_deepep_ll_kernels
use_batched = config.moe_parallel_config.use_batched_activation_format
activation_format = (
mk.FusedMoEActivationFormat.BatchedExperts
if use_batched
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment