Commit 9c663e50 authored by zhuwenwen's avatar zhuwenwen
Browse files

skip moe_fused_gate

parent 4f6c0cd4
......@@ -829,8 +829,9 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
set(VLLM_MOE_EXT_SRC
"csrc/moe/torch_bindings.cpp"
"csrc/moe/moe_align_sum_kernels.cu"
"csrc/moe/topk_softmax_kernels.cu"
"csrc/moe/moe_fused_gate.cu")
"csrc/moe/topk_softmax_kernels.cu")
# "csrc/moe/moe_fused_gate.cu"
if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_MOE_EXT_SRC
......
......@@ -36,11 +36,11 @@ void shuffle_rows(const torch::Tensor& input_tensor,
const torch::Tensor& dst2src_map,
torch::Tensor& output_tensor);
// Declaration of the fused MoE gate op. It takes an `input` and a `bias`
// tensor plus the integer grouping / top-k parameters and a floating-point
// `routed_scaling_factor`, and returns a vector of tensors.
// The torch bindings register this function as the CUDA implementation of the
// "moe_fused_gate" op.
// NOTE(review): the definition presumably lives in csrc/moe/moe_fused_gate.cu;
// the number and meaning of the returned tensors are not visible from this
// declaration -- confirm against the kernel source.
std::vector<torch::Tensor> moe_fused_gate(
torch::Tensor& input,
torch::Tensor& bias,
int64_t num_expert_group,
int64_t topk_group,
int64_t topk,
int64_t n_share_experts_fusion,
double routed_scaling_factor);
// std::vector<torch::Tensor> moe_fused_gate(
// torch::Tensor& input,
// torch::Tensor& bias,
// int64_t num_expert_group,
// int64_t topk_group,
// int64_t topk,
// int64_t n_share_experts_fusion,
// double routed_scaling_factor);
......@@ -25,11 +25,11 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
" Tensor! num_tokens_post_pad) -> ()");
m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size);
m.def(
"moe_fused_gate(Tensor input, Tensor bias, int num_expert_group, int topk_group, int topk, int "
"n_share_experts_fusion, float routed_scaling_factor) -> "
"(Tensor[])");
m.impl("moe_fused_gate", torch::kCUDA, &moe_fused_gate);
// m.def(
// "moe_fused_gate(Tensor input, Tensor bias, int num_expert_group, int topk_group, int topk, int "
// "n_share_experts_fusion, float routed_scaling_factor) -> "
// "(Tensor[])");
// m.impl("moe_fused_gate", torch::kCUDA, &moe_fused_gate);
#ifndef USE_ROCM
......
......@@ -2335,51 +2335,51 @@ def flash_mla_with_kvcache(
# return out
def moe_fused_gate(
    input_tensor,
    bias,
    num_expert_group,
    topk_group,
    topk,
    n_share_experts_fusion=0,
    routed_scaling_factor=0,
):
    """Select the top-k experts with the fused, hierarchical two-level gate kernel.

    The experts are split into ``num_expert_group`` groups. Each group is
    scored by the sum of its top-2 expert weights, the best groups are
    selected, and the top-k experts are then chosen within those groups.

    The number of experts is taken from the shape of ``input_tensor``. Only a
    power-of-two number of experts is currently supported, it must be
    divisible by ``num_expert_group``, and the number of experts per group is
    limited to 32 for now. For unsupported cases, use the
    ``biased_grouped_topk`` function in ``sglang.srt.layers.moe.topk``.

    Args:
        n_share_experts_fusion: if > 0, the last expert is replaced with a
            round-robin shared expert.
        routed_scaling_factor: if > 0, the last expert is scaled by this
            factor.

    Returns:
        Whatever ``torch.ops._moe_C.moe_fused_gate`` returns for the given
        arguments.
    """
    fused_gate_op = torch.ops._moe_C.moe_fused_gate
    op_args = (
        input_tensor,
        bias,
        num_expert_group,
        topk_group,
        topk,
        n_share_experts_fusion,
        routed_scaling_factor,
    )
    return fused_gate_op(*op_args)
# Register the fake (meta) implementation only when the compiled extension
# actually exposes the op; builds that skip moe_fused_gate have nothing to
# attach it to.
if hasattr(torch.ops._moe_C, "moe_fused_gate"):

    @register_fake("_moe_C::moe_fused_gate")
    def moe_fused_gate_fake(
        input_tensor: torch.Tensor,
        bias: torch.Tensor,
        num_expert_group: int,
        topk_group: int,
        topk: int,
        n_share_experts_fusion: int,
        # The registered op schema declares this argument as `float`.
        routed_scaling_factor: float,
    ):
        """Fake implementation of ``_moe_C::moe_fused_gate``.

        Allocates two uninitialized tensors of shape
        ``(input_tensor.size(0), topk)`` on the device of ``input_tensor``
        and returns them as a pair. No gating is computed.
        """
        out_shape = (input_tensor.size(0), topk)
        # NOTE(review): both outputs use the input dtype here; confirm that
        # this matches the dtypes the real kernel produces.
        return torch.empty(out_shape,
                           dtype=input_tensor.dtype,
                           device=input_tensor.device), \
            torch.empty(out_shape,
                        dtype=input_tensor.dtype,
                        device=input_tensor.device)
# def moe_fused_gate(
# input_tensor,
# bias,
# num_expert_group,
# topk_group,
# topk,
# n_share_experts_fusion=0,
# routed_scaling_factor=0,
# ):
# # This fused kernel function is used to select the topk experts in a hierarchical 2-layer fashion.
# # It splits the experts into num_expert_group groups, and uses the sum of the top-2 expert weights in each group
# # as the group weight to select expert groups, and then selects the topk experts within the selected groups.
# # The #experts is decided by the input tensor shape and we currently only support power of 2 #experts,
# # and #experts should be divisible by num_expert_group. #experts/num_expert_group <= 32 is the limit for now.
# # For unsupported cases, we suggest using the biased_grouped_topk func in sglang.srt.layers.moe.topk.
# # n_share_experts_fusion: if > 0, the last expert will be replaced with a round-robin shared expert
# # routed_scaling_factor: if > 0, the last expert will be scaled by this factor
# return torch.ops._moe_C.moe_fused_gate(
# input_tensor,
# bias,
# num_expert_group,
# topk_group,
# topk,
# n_share_experts_fusion,
# routed_scaling_factor,
# )
# if hasattr(torch.ops._moe_C, "moe_fused_gate"):
# @register_fake("_moe_C::moe_fused_gate")
# def moe_fused_gate_fake(
# input_tensor: torch.Tensor,
# bias: torch.Tensor,
# num_expert_group: int,
# topk_group: int,
# topk: int,
# n_share_experts_fusion: int,
# routed_scaling_factor: int,
# ):
# return torch.empty((input_tensor.size(0), topk),
# dtype=input_tensor.dtype,
# device=input_tensor.device), \
# torch.empty((input_tensor.size(0), topk),
# dtype=input_tensor.dtype,
# device=input_tensor.device)
def sm100_cutlass_mla_decode(out: torch.Tensor, q_nope: torch.Tensor,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment