skip moe_fused_gate

7a6ab87a · zhuwenwen · 19071331 · 7a6ab87a · 7a6ab87a · 7a6ab87a
Commit 7a6ab87a authored Aug 19, 2025 by zhuwenwen
7 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -802,8 +802,8 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
 set(VLLM_MOE_EXT_SRC
  "csrc/moe/torch_bindings.cpp"
  "csrc/moe/moe_align_sum_kernels.cu"
-  "csrc/moe/topk_softmax_kernels.cu"
+  "csrc/moe/topk_softmax_kernels.cu")
-  "csrc/moe/moe_fused_gate.cu")
+  # "csrc/moe/moe_fused_gate.cu")
 if(VLLM_GPU_LANG STREQUAL "CUDA")
  list(APPEND VLLM_MOE_EXT_SRC "csrc/moe/moe_wna16.cu")

--- a/csrc/moe/moe_ops.h
+++ b/csrc/moe/moe_ops.h
@@ -30,11 +30,11 @@ void shuffle_rows(const torch::Tensor& input_tensor,
                  const torch::Tensor& dst2src_map,
                  torch::Tensor& output_tensor);
-std::vector<torch::Tensor> moe_fused_gate(
+// std::vector<torch::Tensor> moe_fused_gate(
-    torch::Tensor& input,
+//     torch::Tensor& input,
-    torch::Tensor& bias,
+//     torch::Tensor& bias,
-    int64_t num_expert_group,
+//     int64_t num_expert_group,
-    int64_t topk_group,
+//     int64_t topk_group,
-    int64_t topk,
+//     int64_t topk,
-    int64_t n_share_experts_fusion,
+//     int64_t n_share_experts_fusion,
-    double routed_scaling_factor);
+//     double routed_scaling_factor);
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@@ -22,11 +22,11 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
      "                     Tensor! num_tokens_post_pad) -> ()");
  m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size);
-  m.def(
+//   m.def(
-      "moe_fused_gate(Tensor input, Tensor bias, int num_expert_group, int topk_group, int topk, int "
+//       "moe_fused_gate(Tensor input, Tensor bias, int num_expert_group, int topk_group, int topk, int "
-      "n_share_experts_fusion, float routed_scaling_factor) -> "
+//       "n_share_experts_fusion, float routed_scaling_factor) -> "
-      "(Tensor[])");
+//       "(Tensor[])");
-  m.impl("moe_fused_gate", torch::kCUDA, &moe_fused_gate);
+//   m.impl("moe_fused_gate", torch::kCUDA, &moe_fused_gate);
 #ifndef USE_ROCM

--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -2260,51 +2260,51 @@ def flash_mla_with_kvcache(
 #     return out
-def moe_fused_gate(
+# def moe_fused_gate(
-    input_tensor,
+#     input_tensor,
-    bias,
+#     bias,
-    num_expert_group,
+#     num_expert_group,
-    topk_group,
+#     topk_group,
-    topk,
+#     topk,
-    n_share_experts_fusion=0,
+#     n_share_experts_fusion=0,
-    routed_scaling_factor=0,
+#     routed_scaling_factor=0,
-):
+# ):
-    # This fused kernel function is used to select topk expert in a hierarchical 2-layer fashion
+#     # This fused kernel function is used to select topk expert in a hierarchical 2-layer fashion
-    # it split group of expert into num_expert_group, and use top2 expert weight sum in each group
+#     # it splits the experts into num_expert_group groups, and uses the top2 expert weight sum in each group
-    # as the group weight to select exerpt groups and then select topk experts within the selected groups
+#     # as the group weight to select expert groups and then select topk experts within the selected groups
-    # the #experts is decided by the input tensor shape and we currently only support power of 2 #experts
+#     # the #experts is decided by the input tensor shape and we currently only support power of 2 #experts
-    # and #experts should be divisible by num_expert_group. #expert/num_expert_group <= 32 is limitted for now.
+#     # and #experts should be divisible by num_expert_group. #expert/num_expert_group <= 32 is limited for now.
-    # for non-supported case, we suggestion to use the biased_grouped_topk func in sglang.srt.layers.moe.topk
+#     # for unsupported cases, we suggest using the biased_grouped_topk func in sglang.srt.layers.moe.topk
-    # n_share_experts_fusion: if > 0, the last expert will be replaced with a round-robin shared expert
+#     # n_share_experts_fusion: if > 0, the last expert will be replaced with a round-robin shared expert
-    # routed_scaling_factor: if > 0, the last expert will be scaled by this factor
+#     # routed_scaling_factor: if > 0, the last expert will be scaled by this factor
-    return torch.ops._moe_C.moe_fused_gate(
+#     return torch.ops._moe_C.moe_fused_gate(
-        input_tensor,
+#         input_tensor,
-        bias,
+#         bias,
-        num_expert_group,
+#         num_expert_group,
-        topk_group,
+#         topk_group,
-        topk,
+#         topk,
-        n_share_experts_fusion,
+#         n_share_experts_fusion,
-        routed_scaling_factor,
+#         routed_scaling_factor,
-    )
+#     )
-if hasattr(torch.ops._moe_C, "moe_fused_gate"):
+# if hasattr(torch.ops._moe_C, "moe_fused_gate"):
-    @register_fake("_moe_C::moe_fused_gate")
+#     @register_fake("_moe_C::moe_fused_gate")
-    def moe_fused_gate_fake(
+#     def moe_fused_gate_fake(
-        input_tensor: torch.Tensor,
+#         input_tensor: torch.Tensor,
-        bias: torch.Tensor,
+#         bias: torch.Tensor,
-        num_expert_group: int,
+#         num_expert_group: int,
-        topk_group: int,
+#         topk_group: int,
-        topk: int,
+#         topk: int,
-        n_share_experts_fusion: int,
+#         n_share_experts_fusion: int,
-        routed_scaling_factor: int,
+#         routed_scaling_factor: int,
-    ):
+#     ):
-        return torch.empty((input_tensor.size(0), topk),
+#         return torch.empty((input_tensor.size(0), topk),
-                           dtype=input_tensor.dtype,
+#                            dtype=input_tensor.dtype,
-                           device=input_tensor.device), \
+#                            device=input_tensor.device), \
-                    torch.empty((input_tensor.size(0), topk),
+#                     torch.empty((input_tensor.size(0), topk),
-                           dtype=input_tensor.dtype,
+#                            dtype=input_tensor.dtype,
-                           device=input_tensor.device)
+#                            device=input_tensor.device)
 def sm100_cutlass_mla_decode(out: torch.Tensor, q_nope: torch.Tensor,

--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -1278,7 +1278,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
    # If set, vLLM will enable the moe_fused_gate kernel.
    "VLLM_ENABLE_MOE_FUSED_GATE":
-    lambda: bool(int(os.getenv("VLLM_ENABLE_MOE_FUSED_GATE", "1"))),
+    lambda: bool(int(os.getenv("VLLM_ENABLE_MOE_FUSED_GATE", "0"))),
    # vLLM will use FlashAttention Backend for page attention computation on rocm
    "VLLM_USE_FLASH_ATTN_PA":

--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -455,7 +455,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
            logical_to_physical_map=logical_to_physical_map,
            logical_replica_count=logical_replica_count,
            use_nn_moe=use_nn_moe,
-            routed_scaling_factor=routed_scaling_factor,
+            # routed_scaling_factor=routed_scaling_factor,
            use_fused_gate=use_fused_gate
        )
@@ -481,7 +481,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
        logical_to_physical_map: Optional[torch.Tensor] = None,
        logical_replica_count: Optional[torch.Tensor] = None,
        use_nn_moe: Optional[bool] = False,
-        routed_scaling_factor: Optional[float] = None,
+        # routed_scaling_factor: Optional[float] = None,
        use_fused_gate: Optional[bool] = False,
    ) -> torch.Tensor:
@@ -502,7 +502,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
            expert_load_view=expert_load_view,
            logical_to_physical_map=logical_to_physical_map,
            logical_replica_count=logical_replica_count,
-            routed_scaling_factor=routed_scaling_factor,
+            # routed_scaling_factor=routed_scaling_factor,
            use_fused_gate=use_fused_gate)
        if self.rocm_aiter_moe_enabled:
@@ -571,7 +571,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
        logical_to_physical_map: Optional[torch.Tensor] = None,
        logical_replica_count: Optional[torch.Tensor] = None,
        use_nn_moe: Optional[bool] = False,
-        routed_scaling_factor: Optional[float] = None,
+        # routed_scaling_factor: Optional[float] = None,
        use_fused_gate: Optional[bool] = False,
    ):
        if enable_eplb is not False or expert_load_view is not None or \
@@ -782,7 +782,7 @@ class FusedMoE(CustomOp):
        enable_eplb: bool = False,
        num_redundant_experts: int = 0,
        has_bias: bool = False,
-        routed_scaling_factor: Optional[float] = None,
+        # routed_scaling_factor: Optional[float] = None,
    ):
        super().__init__()
        if params_dtype is None:
@@ -856,7 +856,7 @@ class FusedMoE(CustomOp):
        self.e_score_correction_bias = e_score_correction_bias
        self.apply_router_weight_on_input = apply_router_weight_on_input
        self.activation = activation
-        self.routed_scaling_factor = routed_scaling_factor
+        # self.routed_scaling_factor = routed_scaling_factor
        if self.scoring_func != "softmax" and not self.use_grouped_topk:
            raise ValueError("Only softmax scoring function is supported for "
@@ -1466,7 +1466,7 @@ class FusedMoE(CustomOp):
        expert_load_view: Optional[torch.Tensor] = None,
        logical_to_physical_map: Optional[torch.Tensor] = None,
        logical_replica_count: Optional[torch.Tensor] = None,
-        routed_scaling_factor: Optional[float] = None,
+        # routed_scaling_factor: Optional[float] = None,
        use_fused_gate: Optional[bool] = False
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """
@@ -1504,7 +1504,7 @@ class FusedMoE(CustomOp):
                    num_expert_group,
                    topk_group,
                    top_k,
-                    routed_scaling_factor=routed_scaling_factor,
+                    # routed_scaling_factor=routed_scaling_factor,
                    n_share_experts_fusion=0,
                )
            else:
@@ -1759,7 +1759,7 @@ class FusedMoE(CustomOp):
            logical_to_physical_map=self.logical_to_physical_map,
            logical_replica_count=self.logical_replica_count,
            use_nn_moe=self.use_nn_moe,
-            routed_scaling_factor=self.routed_scaling_factor,
+            # routed_scaling_factor=self.routed_scaling_factor,
            use_fused_gate=self.use_fused_gate
        )

--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -168,8 +168,8 @@ class DeepseekV2MoE(nn.Module):
            scoring_func=config.scoring_func,
            e_score_correction_bias=self.gate.e_score_correction_bias,
            enable_eplb=self.enable_eplb,
-            num_redundant_experts=self.n_redundant_experts,
+            num_redundant_experts=self.n_redundant_experts)
-            routed_scaling_factor=self.routed_scaling_factor)
+            # routed_scaling_factor=self.routed_scaling_factor)
        if config.n_shared_experts is not None:
            intermediate_size = (config.moe_intermediate_size *