Commit 7a6ab87a authored by zhuwenwen
Browse files

skip moe_fused_gate

parent 19071331
...@@ -802,8 +802,8 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1) ...@@ -802,8 +802,8 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
set(VLLM_MOE_EXT_SRC set(VLLM_MOE_EXT_SRC
"csrc/moe/torch_bindings.cpp" "csrc/moe/torch_bindings.cpp"
"csrc/moe/moe_align_sum_kernels.cu" "csrc/moe/moe_align_sum_kernels.cu"
"csrc/moe/topk_softmax_kernels.cu" "csrc/moe/topk_softmax_kernels.cu")
"csrc/moe/moe_fused_gate.cu") # "csrc/moe/moe_fused_gate.cu")
if(VLLM_GPU_LANG STREQUAL "CUDA") if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_MOE_EXT_SRC "csrc/moe/moe_wna16.cu") list(APPEND VLLM_MOE_EXT_SRC "csrc/moe/moe_wna16.cu")
......
...@@ -30,11 +30,11 @@ void shuffle_rows(const torch::Tensor& input_tensor, ...@@ -30,11 +30,11 @@ void shuffle_rows(const torch::Tensor& input_tensor,
const torch::Tensor& dst2src_map, const torch::Tensor& dst2src_map,
torch::Tensor& output_tensor); torch::Tensor& output_tensor);
std::vector<torch::Tensor> moe_fused_gate( // std::vector<torch::Tensor> moe_fused_gate(
torch::Tensor& input, // torch::Tensor& input,
torch::Tensor& bias, // torch::Tensor& bias,
int64_t num_expert_group, // int64_t num_expert_group,
int64_t topk_group, // int64_t topk_group,
int64_t topk, // int64_t topk,
int64_t n_share_experts_fusion, // int64_t n_share_experts_fusion,
double routed_scaling_factor); // double routed_scaling_factor);
...@@ -22,11 +22,11 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { ...@@ -22,11 +22,11 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
" Tensor! num_tokens_post_pad) -> ()"); " Tensor! num_tokens_post_pad) -> ()");
m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size); m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size);
m.def( // m.def(
"moe_fused_gate(Tensor input, Tensor bias, int num_expert_group, int topk_group, int topk, int " // "moe_fused_gate(Tensor input, Tensor bias, int num_expert_group, int topk_group, int topk, int "
"n_share_experts_fusion, float routed_scaling_factor) -> " // "n_share_experts_fusion, float routed_scaling_factor) -> "
"(Tensor[])"); // "(Tensor[])");
m.impl("moe_fused_gate", torch::kCUDA, &moe_fused_gate); // m.impl("moe_fused_gate", torch::kCUDA, &moe_fused_gate);
#ifndef USE_ROCM #ifndef USE_ROCM
......
...@@ -2260,51 +2260,51 @@ def flash_mla_with_kvcache( ...@@ -2260,51 +2260,51 @@ def flash_mla_with_kvcache(
# return out # return out
def moe_fused_gate( # def moe_fused_gate(
input_tensor, # input_tensor,
bias, # bias,
num_expert_group, # num_expert_group,
topk_group, # topk_group,
topk, # topk,
n_share_experts_fusion=0, # n_share_experts_fusion=0,
routed_scaling_factor=0, # routed_scaling_factor=0,
): # ):
# This fused kernel function is used to select topk expert in a hierarchical 2-layer fashion # # This fused kernel function is used to select topk expert in a hierarchical 2-layer fashion
# it splits the experts into num_expert_group groups, and uses the top-2 expert weight sum in each group # # it splits the experts into num_expert_group groups, and uses the top-2 expert weight sum in each group
# as the group weight to select expert groups and then select topk experts within the selected groups # # as the group weight to select expert groups and then select topk experts within the selected groups
# the #experts is decided by the input tensor shape and we currently only support power of 2 #experts # # the #experts is decided by the input tensor shape and we currently only support power of 2 #experts
# and #experts should be divisible by num_expert_group. #experts/num_expert_group is limited to <= 32 for now. # # and #experts should be divisible by num_expert_group. #experts/num_expert_group is limited to <= 32 for now.
# for unsupported cases, we suggest using the biased_grouped_topk func in sglang.srt.layers.moe.topk # # for unsupported cases, we suggest using the biased_grouped_topk func in sglang.srt.layers.moe.topk
# n_share_experts_fusion: if > 0, the last expert will be replaced with a round-robin shared expert # # n_share_experts_fusion: if > 0, the last expert will be replaced with a round-robin shared expert
# routed_scaling_factor: if > 0, the last expert will be scaled by this factor # # routed_scaling_factor: if > 0, the last expert will be scaled by this factor
return torch.ops._moe_C.moe_fused_gate( # return torch.ops._moe_C.moe_fused_gate(
input_tensor, # input_tensor,
bias, # bias,
num_expert_group, # num_expert_group,
topk_group, # topk_group,
topk, # topk,
n_share_experts_fusion, # n_share_experts_fusion,
routed_scaling_factor, # routed_scaling_factor,
) # )
if hasattr(torch.ops._moe_C, "moe_fused_gate"): # if hasattr(torch.ops._moe_C, "moe_fused_gate"):
@register_fake("_moe_C::moe_fused_gate") # @register_fake("_moe_C::moe_fused_gate")
def moe_fused_gate_fake( # def moe_fused_gate_fake(
input_tensor: torch.Tensor, # input_tensor: torch.Tensor,
bias: torch.Tensor, # bias: torch.Tensor,
num_expert_group: int, # num_expert_group: int,
topk_group: int, # topk_group: int,
topk: int, # topk: int,
n_share_experts_fusion: int, # n_share_experts_fusion: int,
routed_scaling_factor: int, # routed_scaling_factor: int,
): # ):
return torch.empty((input_tensor.size(0), topk), # return torch.empty((input_tensor.size(0), topk),
dtype=input_tensor.dtype, # dtype=input_tensor.dtype,
device=input_tensor.device), \ # device=input_tensor.device), \
torch.empty((input_tensor.size(0), topk), # torch.empty((input_tensor.size(0), topk),
dtype=input_tensor.dtype, # dtype=input_tensor.dtype,
device=input_tensor.device) # device=input_tensor.device)
def sm100_cutlass_mla_decode(out: torch.Tensor, q_nope: torch.Tensor, def sm100_cutlass_mla_decode(out: torch.Tensor, q_nope: torch.Tensor,
......
...@@ -1278,7 +1278,7 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -1278,7 +1278,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
# If set, vLLM will enable the moe_fused_gate kernel. # If set, vLLM will enable the moe_fused_gate kernel.
"VLLM_ENABLE_MOE_FUSED_GATE": "VLLM_ENABLE_MOE_FUSED_GATE":
lambda: bool(int(os.getenv("VLLM_ENABLE_MOE_FUSED_GATE", "1"))), lambda: bool(int(os.getenv("VLLM_ENABLE_MOE_FUSED_GATE", "0"))),
# vLLM will use FlashAttention Backend for page attention computation on rocm # vLLM will use FlashAttention Backend for page attention computation on rocm
"VLLM_USE_FLASH_ATTN_PA": "VLLM_USE_FLASH_ATTN_PA":
......
...@@ -455,7 +455,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): ...@@ -455,7 +455,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
logical_to_physical_map=logical_to_physical_map, logical_to_physical_map=logical_to_physical_map,
logical_replica_count=logical_replica_count, logical_replica_count=logical_replica_count,
use_nn_moe=use_nn_moe, use_nn_moe=use_nn_moe,
routed_scaling_factor=routed_scaling_factor, # routed_scaling_factor=routed_scaling_factor,
use_fused_gate=use_fused_gate use_fused_gate=use_fused_gate
) )
...@@ -481,7 +481,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): ...@@ -481,7 +481,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
logical_to_physical_map: Optional[torch.Tensor] = None, logical_to_physical_map: Optional[torch.Tensor] = None,
logical_replica_count: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None,
use_nn_moe: Optional[bool] = False, use_nn_moe: Optional[bool] = False,
routed_scaling_factor: Optional[float] = None, # routed_scaling_factor: Optional[float] = None,
use_fused_gate: Optional[bool] = False, use_fused_gate: Optional[bool] = False,
) -> torch.Tensor: ) -> torch.Tensor:
...@@ -502,7 +502,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): ...@@ -502,7 +502,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
expert_load_view=expert_load_view, expert_load_view=expert_load_view,
logical_to_physical_map=logical_to_physical_map, logical_to_physical_map=logical_to_physical_map,
logical_replica_count=logical_replica_count, logical_replica_count=logical_replica_count,
routed_scaling_factor=routed_scaling_factor, # routed_scaling_factor=routed_scaling_factor,
use_fused_gate=use_fused_gate) use_fused_gate=use_fused_gate)
if self.rocm_aiter_moe_enabled: if self.rocm_aiter_moe_enabled:
...@@ -571,7 +571,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): ...@@ -571,7 +571,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
logical_to_physical_map: Optional[torch.Tensor] = None, logical_to_physical_map: Optional[torch.Tensor] = None,
logical_replica_count: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None,
use_nn_moe: Optional[bool] = False, use_nn_moe: Optional[bool] = False,
routed_scaling_factor: Optional[float] = None, # routed_scaling_factor: Optional[float] = None,
use_fused_gate: Optional[bool] = False, use_fused_gate: Optional[bool] = False,
): ):
if enable_eplb is not False or expert_load_view is not None or \ if enable_eplb is not False or expert_load_view is not None or \
...@@ -782,7 +782,7 @@ class FusedMoE(CustomOp): ...@@ -782,7 +782,7 @@ class FusedMoE(CustomOp):
enable_eplb: bool = False, enable_eplb: bool = False,
num_redundant_experts: int = 0, num_redundant_experts: int = 0,
has_bias: bool = False, has_bias: bool = False,
routed_scaling_factor: Optional[float] = None, # routed_scaling_factor: Optional[float] = None,
): ):
super().__init__() super().__init__()
if params_dtype is None: if params_dtype is None:
...@@ -856,7 +856,7 @@ class FusedMoE(CustomOp): ...@@ -856,7 +856,7 @@ class FusedMoE(CustomOp):
self.e_score_correction_bias = e_score_correction_bias self.e_score_correction_bias = e_score_correction_bias
self.apply_router_weight_on_input = apply_router_weight_on_input self.apply_router_weight_on_input = apply_router_weight_on_input
self.activation = activation self.activation = activation
self.routed_scaling_factor = routed_scaling_factor # self.routed_scaling_factor = routed_scaling_factor
if self.scoring_func != "softmax" and not self.use_grouped_topk: if self.scoring_func != "softmax" and not self.use_grouped_topk:
raise ValueError("Only softmax scoring function is supported for " raise ValueError("Only softmax scoring function is supported for "
...@@ -1466,7 +1466,7 @@ class FusedMoE(CustomOp): ...@@ -1466,7 +1466,7 @@ class FusedMoE(CustomOp):
expert_load_view: Optional[torch.Tensor] = None, expert_load_view: Optional[torch.Tensor] = None,
logical_to_physical_map: Optional[torch.Tensor] = None, logical_to_physical_map: Optional[torch.Tensor] = None,
logical_replica_count: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None,
routed_scaling_factor: Optional[float] = None, # routed_scaling_factor: Optional[float] = None,
use_fused_gate: Optional[bool] = False use_fused_gate: Optional[bool] = False
) -> tuple[torch.Tensor, torch.Tensor]: ) -> tuple[torch.Tensor, torch.Tensor]:
""" """
...@@ -1504,7 +1504,7 @@ class FusedMoE(CustomOp): ...@@ -1504,7 +1504,7 @@ class FusedMoE(CustomOp):
num_expert_group, num_expert_group,
topk_group, topk_group,
top_k, top_k,
routed_scaling_factor=routed_scaling_factor, # routed_scaling_factor=routed_scaling_factor,
n_share_experts_fusion=0, n_share_experts_fusion=0,
) )
else: else:
...@@ -1759,7 +1759,7 @@ class FusedMoE(CustomOp): ...@@ -1759,7 +1759,7 @@ class FusedMoE(CustomOp):
logical_to_physical_map=self.logical_to_physical_map, logical_to_physical_map=self.logical_to_physical_map,
logical_replica_count=self.logical_replica_count, logical_replica_count=self.logical_replica_count,
use_nn_moe=self.use_nn_moe, use_nn_moe=self.use_nn_moe,
routed_scaling_factor=self.routed_scaling_factor, # routed_scaling_factor=self.routed_scaling_factor,
use_fused_gate=self.use_fused_gate use_fused_gate=self.use_fused_gate
) )
......
...@@ -168,8 +168,8 @@ class DeepseekV2MoE(nn.Module): ...@@ -168,8 +168,8 @@ class DeepseekV2MoE(nn.Module):
scoring_func=config.scoring_func, scoring_func=config.scoring_func,
e_score_correction_bias=self.gate.e_score_correction_bias, e_score_correction_bias=self.gate.e_score_correction_bias,
enable_eplb=self.enable_eplb, enable_eplb=self.enable_eplb,
num_redundant_experts=self.n_redundant_experts, num_redundant_experts=self.n_redundant_experts)
routed_scaling_factor=self.routed_scaling_factor) # routed_scaling_factor=self.routed_scaling_factor)
if config.n_shared_experts is not None: if config.n_shared_experts is not None:
intermediate_size = (config.moe_intermediate_size * intermediate_size = (config.moe_intermediate_size *
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment