Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
7a6ab87a
Commit
7a6ab87a
authored
Aug 19, 2025
by
zhuwenwen
Browse files
skip moe_fused_gate
parent
19071331
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
72 additions
and
72 deletions
+72
-72
CMakeLists.txt
CMakeLists.txt
+2
-2
csrc/moe/moe_ops.h
csrc/moe/moe_ops.h
+8
-8
csrc/moe/torch_bindings.cpp
csrc/moe/torch_bindings.cpp
+5
-5
vllm/_custom_ops.py
vllm/_custom_ops.py
+45
-45
vllm/envs.py
vllm/envs.py
+1
-1
vllm/model_executor/layers/fused_moe/layer.py
vllm/model_executor/layers/fused_moe/layer.py
+9
-9
vllm/model_executor/models/deepseek_v2.py
vllm/model_executor/models/deepseek_v2.py
+2
-2
No files found.
CMakeLists.txt
View file @
7a6ab87a
...
...
@@ -802,8 +802,8 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
set
(
VLLM_MOE_EXT_SRC
"csrc/moe/torch_bindings.cpp"
"csrc/moe/moe_align_sum_kernels.cu"
"csrc/moe/topk_softmax_kernels.cu"
"csrc/moe/moe_fused_gate.cu"
)
"csrc/moe/topk_softmax_kernels.cu"
)
#
"csrc/moe/moe_fused_gate.cu")
if
(
VLLM_GPU_LANG STREQUAL
"CUDA"
)
list
(
APPEND VLLM_MOE_EXT_SRC
"csrc/moe/moe_wna16.cu"
)
...
...
csrc/moe/moe_ops.h
View file @
7a6ab87a
...
...
@@ -30,11 +30,11 @@ void shuffle_rows(const torch::Tensor& input_tensor,
const
torch
::
Tensor
&
dst2src_map
,
torch
::
Tensor
&
output_tensor
);
std
::
vector
<
torch
::
Tensor
>
moe_fused_gate
(
torch
::
Tensor
&
input
,
torch
::
Tensor
&
bias
,
int64_t
num_expert_group
,
int64_t
topk_group
,
int64_t
topk
,
int64_t
n_share_experts_fusion
,
double
routed_scaling_factor
);
//
std::vector<torch::Tensor> moe_fused_gate(
//
torch::Tensor& input,
//
torch::Tensor& bias,
//
int64_t num_expert_group,
//
int64_t topk_group,
//
int64_t topk,
//
int64_t n_share_experts_fusion,
//
double routed_scaling_factor);
csrc/moe/torch_bindings.cpp
View file @
7a6ab87a
...
...
@@ -22,11 +22,11 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
" Tensor! num_tokens_post_pad) -> ()"
);
m
.
impl
(
"moe_align_block_size"
,
torch
::
kCUDA
,
&
moe_align_block_size
);
m
.
def
(
"moe_fused_gate(Tensor input, Tensor bias, int num_expert_group, int topk_group, int topk, int "
"n_share_experts_fusion, float routed_scaling_factor) -> "
"(Tensor[])"
);
m
.
impl
(
"moe_fused_gate"
,
torch
::
kCUDA
,
&
moe_fused_gate
);
//
m.def(
//
"moe_fused_gate(Tensor input, Tensor bias, int num_expert_group, int topk_group, int topk, int "
//
"n_share_experts_fusion, float routed_scaling_factor) -> "
//
"(Tensor[])");
//
m.impl("moe_fused_gate", torch::kCUDA, &moe_fused_gate);
#ifndef USE_ROCM
...
...
vllm/_custom_ops.py
View file @
7a6ab87a
...
...
@@ -2260,51 +2260,51 @@ def flash_mla_with_kvcache(
# return out
def
moe_fused_gate
(
input_tensor
,
bias
,
num_expert_group
,
topk_group
,
topk
,
n_share_experts_fusion
=
0
,
routed_scaling_factor
=
0
,
):
# This fused kernel function is used to select the topk experts in a hierarchical 2-layer fashion
# it splits the experts into num_expert_group groups, and uses the sum of the top2 expert weights in each group
# as the group weight to select expert groups and then select topk experts within the selected groups
# the #experts is decided by the input tensor shape and we currently only support power of 2 #experts
# and #experts should be divisible by num_expert_group. #expert/num_expert_group <= 32 is the limit for now.
# for non-supported cases, we suggest using the biased_grouped_topk func in sglang.srt.layers.moe.topk
# n_share_experts_fusion: if > 0, the last expert will be replaced with a round-robin shared expert
# routed_scaling_factor: if > 0, the last expert will be scaled by this factor
return
torch
.
ops
.
_moe_C
.
moe_fused_gate
(
input_tensor
,
bias
,
num_expert_group
,
topk_group
,
topk
,
n_share_experts_fusion
,
routed_scaling_factor
,
)
if
hasattr
(
torch
.
ops
.
_moe_C
,
"moe_fused_gate"
):
@
register_fake
(
"_moe_C::moe_fused_gate"
)
def
moe_fused_gate_fake
(
input_tensor
:
torch
.
Tensor
,
bias
:
torch
.
Tensor
,
num_expert_group
:
int
,
topk_group
:
int
,
topk
:
int
,
n_share_experts_fusion
:
int
,
routed_scaling_factor
:
int
,
):
return
torch
.
empty
((
input_tensor
.
size
(
0
),
topk
),
dtype
=
input_tensor
.
dtype
,
device
=
input_tensor
.
device
),
\
torch
.
empty
((
input_tensor
.
size
(
0
),
topk
),
dtype
=
input_tensor
.
dtype
,
device
=
input_tensor
.
device
)
#
def moe_fused_gate(
#
input_tensor,
#
bias,
#
num_expert_group,
#
topk_group,
#
topk,
#
n_share_experts_fusion=0,
#
routed_scaling_factor=0,
#
):
#
# This fused kernel function is used to select topk expert in a hierarchical 2-layer fashion
#
# it splits the experts into num_expert_group groups, and uses the sum of the top2 expert weights in each group
#
# as the group weight to select expert groups and then select topk experts within the selected groups
#
# the #experts is decided by the input tensor shape and we currently only support power of 2 #experts
#
# and #experts should be divisible by num_expert_group. #expert/num_expert_group <= 32 is the limit for now.
#
# for non-supported cases, we suggest using the biased_grouped_topk func in sglang.srt.layers.moe.topk
#
# n_share_experts_fusion: if > 0, the last expert will be replaced with a round-robin shared expert
#
# routed_scaling_factor: if > 0, the last expert will be scaled by this factor
#
return torch.ops._moe_C.moe_fused_gate(
#
input_tensor,
#
bias,
#
num_expert_group,
#
topk_group,
#
topk,
#
n_share_experts_fusion,
#
routed_scaling_factor,
#
)
#
if hasattr(torch.ops._moe_C, "moe_fused_gate"):
#
@register_fake("_moe_C::moe_fused_gate")
#
def moe_fused_gate_fake(
#
input_tensor: torch.Tensor,
#
bias: torch.Tensor,
#
num_expert_group: int,
#
topk_group: int,
#
topk: int,
#
n_share_experts_fusion: int,
#
routed_scaling_factor: int,
#
):
#
return torch.empty((input_tensor.size(0), topk),
#
dtype=input_tensor.dtype,
#
device=input_tensor.device), \
#
torch.empty((input_tensor.size(0), topk),
#
dtype=input_tensor.dtype,
#
device=input_tensor.device)
def
sm100_cutlass_mla_decode
(
out
:
torch
.
Tensor
,
q_nope
:
torch
.
Tensor
,
...
...
vllm/envs.py
View file @
7a6ab87a
...
...
@@ -1278,7 +1278,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
# If set, vLLM will enable the moe_fused_gate kernel.
"VLLM_ENABLE_MOE_FUSED_GATE"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_ENABLE_MOE_FUSED_GATE"
,
"
1
"
))),
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_ENABLE_MOE_FUSED_GATE"
,
"
0
"
))),
# vLLM will use FlashAttention Backend for page attention computation on rocm
"VLLM_USE_FLASH_ATTN_PA"
:
...
...
vllm/model_executor/layers/fused_moe/layer.py
View file @
7a6ab87a
...
...
@@ -455,7 +455,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
logical_to_physical_map
=
logical_to_physical_map
,
logical_replica_count
=
logical_replica_count
,
use_nn_moe
=
use_nn_moe
,
routed_scaling_factor
=
routed_scaling_factor
,
#
routed_scaling_factor=routed_scaling_factor,
use_fused_gate
=
use_fused_gate
)
...
...
@@ -481,7 +481,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
logical_to_physical_map
:
Optional
[
torch
.
Tensor
]
=
None
,
logical_replica_count
:
Optional
[
torch
.
Tensor
]
=
None
,
use_nn_moe
:
Optional
[
bool
]
=
False
,
routed_scaling_factor
:
Optional
[
float
]
=
None
,
#
routed_scaling_factor: Optional[float] = None,
use_fused_gate
:
Optional
[
bool
]
=
False
,
)
->
torch
.
Tensor
:
...
...
@@ -502,7 +502,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
expert_load_view
=
expert_load_view
,
logical_to_physical_map
=
logical_to_physical_map
,
logical_replica_count
=
logical_replica_count
,
routed_scaling_factor
=
routed_scaling_factor
,
#
routed_scaling_factor=routed_scaling_factor,
use_fused_gate
=
use_fused_gate
)
if
self
.
rocm_aiter_moe_enabled
:
...
...
@@ -571,7 +571,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
logical_to_physical_map
:
Optional
[
torch
.
Tensor
]
=
None
,
logical_replica_count
:
Optional
[
torch
.
Tensor
]
=
None
,
use_nn_moe
:
Optional
[
bool
]
=
False
,
routed_scaling_factor
:
Optional
[
float
]
=
None
,
#
routed_scaling_factor: Optional[float] = None,
use_fused_gate
:
Optional
[
bool
]
=
False
,
):
if
enable_eplb
is
not
False
or
expert_load_view
is
not
None
or
\
...
...
@@ -782,7 +782,7 @@ class FusedMoE(CustomOp):
enable_eplb
:
bool
=
False
,
num_redundant_experts
:
int
=
0
,
has_bias
:
bool
=
False
,
routed_scaling_factor
:
Optional
[
float
]
=
None
,
#
routed_scaling_factor: Optional[float] = None,
):
super
().
__init__
()
if
params_dtype
is
None
:
...
...
@@ -856,7 +856,7 @@ class FusedMoE(CustomOp):
self
.
e_score_correction_bias
=
e_score_correction_bias
self
.
apply_router_weight_on_input
=
apply_router_weight_on_input
self
.
activation
=
activation
self
.
routed_scaling_factor
=
routed_scaling_factor
#
self.routed_scaling_factor = routed_scaling_factor
if
self
.
scoring_func
!=
"softmax"
and
not
self
.
use_grouped_topk
:
raise
ValueError
(
"Only softmax scoring function is supported for "
...
...
@@ -1466,7 +1466,7 @@ class FusedMoE(CustomOp):
expert_load_view
:
Optional
[
torch
.
Tensor
]
=
None
,
logical_to_physical_map
:
Optional
[
torch
.
Tensor
]
=
None
,
logical_replica_count
:
Optional
[
torch
.
Tensor
]
=
None
,
routed_scaling_factor
:
Optional
[
float
]
=
None
,
#
routed_scaling_factor: Optional[float] = None,
use_fused_gate
:
Optional
[
bool
]
=
False
)
->
tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
"""
...
...
@@ -1504,7 +1504,7 @@ class FusedMoE(CustomOp):
num_expert_group
,
topk_group
,
top_k
,
routed_scaling_factor
=
routed_scaling_factor
,
#
routed_scaling_factor=routed_scaling_factor,
n_share_experts_fusion
=
0
,
)
else
:
...
...
@@ -1759,7 +1759,7 @@ class FusedMoE(CustomOp):
logical_to_physical_map
=
self
.
logical_to_physical_map
,
logical_replica_count
=
self
.
logical_replica_count
,
use_nn_moe
=
self
.
use_nn_moe
,
routed_scaling_factor
=
self
.
routed_scaling_factor
,
#
routed_scaling_factor=self.routed_scaling_factor,
use_fused_gate
=
self
.
use_fused_gate
)
...
...
vllm/model_executor/models/deepseek_v2.py
View file @
7a6ab87a
...
...
@@ -168,8 +168,8 @@ class DeepseekV2MoE(nn.Module):
scoring_func
=
config
.
scoring_func
,
e_score_correction_bias
=
self
.
gate
.
e_score_correction_bias
,
enable_eplb
=
self
.
enable_eplb
,
num_redundant_experts
=
self
.
n_redundant_experts
,
routed_scaling_factor
=
self
.
routed_scaling_factor
)
num_redundant_experts
=
self
.
n_redundant_experts
)
#
routed_scaling_factor=self.routed_scaling_factor)
if
config
.
n_shared_experts
is
not
None
:
intermediate_size
=
(
config
.
moe_intermediate_size
*
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment