Unverified commit c9d3c6e6, authored by Itay Alroy, committed by GitHub
Browse files

fused_moe: treat NIXL EP as batched experts (#40412)


Signed-off-by: Itay Alroy <ialroy@nvidia.com>
parent 51adca74
...@@ -993,6 +993,10 @@ class FusedMoEParallelConfig: ...@@ -993,6 +993,10 @@ class FusedMoEParallelConfig:
def use_batched_activation_format(self): def use_batched_activation_format(self):
return self.use_deepep_ll_kernels or self.use_nixl_ep_kernels return self.use_deepep_ll_kernels or self.use_nixl_ep_kernels
@property
def needs_round_robin_routing_tables(self):
return self.use_deepep_ll_kernels or self.use_nixl_ep_kernels
@property @property
def use_ag_rs_all2all_kernels(self): def use_ag_rs_all2all_kernels(self):
return ( return (
...@@ -1294,3 +1298,7 @@ class FusedMoEConfig: ...@@ -1294,3 +1298,7 @@ class FusedMoEConfig:
@property @property
def use_nixl_ep_kernels(self): def use_nixl_ep_kernels(self):
return self.moe_parallel_config.use_nixl_ep_kernels return self.moe_parallel_config.use_nixl_ep_kernels
@property
def needs_round_robin_routing_tables(self):
return self.moe_parallel_config.needs_round_robin_routing_tables
...@@ -180,8 +180,7 @@ def determine_expert_placement_strategy( ...@@ -180,8 +180,7 @@ def determine_expert_placement_strategy(
return "linear" return "linear"
if ( if (
moe_parallel_config.use_all2all_kernels moe_parallel_config.use_all2all_kernels
and not moe_parallel_config.use_deepep_ll_kernels and not moe_parallel_config.needs_round_robin_routing_tables
and not moe_parallel_config.use_nixl_ep_kernels
): ):
logger.warning( logger.warning(
"Round-robin expert placement currently only supports " "Round-robin expert placement currently only supports "
...@@ -687,8 +686,7 @@ class FusedMoE(PluggableLayer): ...@@ -687,8 +686,7 @@ class FusedMoE(PluggableLayer):
# Currently routing_tables only needed for round-robin expert placement # Currently routing_tables only needed for round-robin expert placement
# with DeepEP-ll or NIXL EP all2all backends. # with DeepEP-ll or NIXL EP all2all backends.
if self.expert_placement_strategy != "round_robin" or ( if self.expert_placement_strategy != "round_robin" or (
not self.moe_parallel_config.use_deepep_ll_kernels not self.moe_parallel_config.needs_round_robin_routing_tables
and not self.moe_parallel_config.use_nixl_ep_kernels
): ):
return None return None
......
...@@ -884,7 +884,7 @@ def make_mxfp4_moe_kernel( ...@@ -884,7 +884,7 @@ def make_mxfp4_moe_kernel(
experts, experts,
shared_experts=( shared_experts=(
shared_experts shared_experts
if moe_config.moe_parallel_config.use_deepep_ll_kernels if moe_config.moe_parallel_config.use_batched_activation_format
else None else None
), ),
inplace=( inplace=(
......
...@@ -168,10 +168,7 @@ def select_nvfp4_moe_backend( ...@@ -168,10 +168,7 @@ def select_nvfp4_moe_backend(
NvFp4MoeBackend.EMULATION, NvFp4MoeBackend.EMULATION,
] ]
# NOTE(rob): this is kind of a hack. We need to peak into use_batched = config.moe_parallel_config.use_batched_activation_format
# the prepare-finalize selection to determine if we are using
# the batched or standard expert format.
use_batched = config.moe_parallel_config.use_deepep_ll_kernels
activation_format = ( activation_format = (
mk.FusedMoEActivationFormat.BatchedExperts mk.FusedMoEActivationFormat.BatchedExperts
if use_batched if use_batched
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment