Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
257015d5
"vscode:/vscode.git/clone" did not exist on "031a7995f38d3c73b0790280cc0fa1fe25d33bff"
Unverified
Commit
257015d5
authored
Apr 20, 2026
by
milesial
Committed by
GitHub
Apr 21, 2026
Browse files
[MoE] Triton MoE Perf regression - restore low latency path (#39016)
parent
b4784001
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
72 additions
and
31 deletions
+72
-31
vllm/model_executor/layers/fused_moe/fused_moe.py
vllm/model_executor/layers/fused_moe/fused_moe.py
+72
-31
No files found.
vllm/model_executor/layers/fused_moe/fused_moe.py
View file @
257015d5
...
@@ -1551,6 +1551,55 @@ def dispatch_fused_experts_func(inplace: bool) -> Callable[..., torch.Tensor]:
...
@@ -1551,6 +1551,55 @@ def dispatch_fused_experts_func(inplace: bool) -> Callable[..., torch.Tensor]:
return
torch_vllm_outplace_fused_experts
return
torch_vllm_outplace_fused_experts
def
_prepare_expert_assignment
(
topk_ids
:
torch
.
Tensor
,
config
:
dict
[
str
,
Any
],
num_tokens
:
int
,
top_k_num
:
int
,
global_num_experts
:
int
,
expert_map
:
torch
.
Tensor
|
None
,
*
,
use_int8_w8a16
:
bool
=
False
,
use_int4_w4a16
:
bool
=
False
,
block_shape
:
list
[
int
]
|
None
=
None
,
ignore_invalid_experts
:
bool
=
False
,
)
->
tuple
[
torch
.
Tensor
|
None
,
torch
.
Tensor
,
torch
.
Tensor
]:
"""Prepare expert assignments for the aligned and low-latency Triton paths."""
# SPARSITY_FACTOR is a heuristic margin ensuring tokens_in_chunk * top_k
# activates only a small fraction of total experts
# Skips moe_align_block_size and activates the `sorted_token_ids is None`
# path of the fused_moe_kernel kernel
naive_block_assignment
=
(
expert_map
is
None
and
num_tokens
*
top_k_num
*
4
<=
global_num_experts
and
not
(
(
use_int8_w8a16
or
use_int4_w4a16
)
and
block_shape
is
not
None
and
block_shape
[
1
]
>
0
)
)
if
naive_block_assignment
:
return
(
None
,
topk_ids
.
view
(
-
1
),
torch
.
full
(
(
1
,),
topk_ids
.
numel
()
*
config
[
"BLOCK_SIZE_M"
],
dtype
=
torch
.
int32
,
device
=
topk_ids
.
device
,
),
)
return
moe_align_block_size
(
topk_ids
,
config
[
"BLOCK_SIZE_M"
],
global_num_experts
,
expert_map
,
ignore_invalid_experts
=
ignore_invalid_experts
,
)
# TODO (bnell): replace this with modular op. Can get rid of inplace/outplace
# TODO (bnell): replace this with modular op. Can get rid of inplace/outplace
# torch ops.
# torch ops.
def
fused_experts
(
def
fused_experts
(
...
@@ -1791,36 +1840,18 @@ def fused_experts_impl(
...
@@ -1791,36 +1840,18 @@ def fused_experts_impl(
ocp_mx_scheme
=
ocp_mx_scheme
,
ocp_mx_scheme
=
ocp_mx_scheme
,
)
)
# SPARSITY_FACTOR is a heuristic margin ensuring num_tokens * top_k
sorted_token_ids
,
expert_ids
,
num_tokens_post_padded
=
_prepare_expert_assignment
(
# activates only a small fraction of total experts
SPARSITY_FACTOR
=
4
# block quantized code path is not implemented yet.
naive_block_assignment
=
(
expert_map
is
None
and
num_tokens
*
top_k_num
*
SPARSITY_FACTOR
<=
global_num_experts
and
not
(
(
use_int8_w8a16
or
use_int4_w4a16
)
and
block_shape
is
not
None
and
block_shape
[
1
]
>
0
)
)
if
not
naive_block_assignment
:
sorted_token_ids
,
expert_ids
,
num_tokens_post_padded
=
moe_align_block_size
(
topk_ids
,
topk_ids
,
config
[
"BLOCK_SIZE_M"
],
config
,
num_tokens
,
top_k_num
,
global_num_experts
,
global_num_experts
,
expert_map
,
expert_map
,
use_int8_w8a16
=
use_int8_w8a16
,
use_int4_w4a16
=
use_int4_w4a16
,
block_shape
=
block_shape
,
ignore_invalid_experts
=
True
,
ignore_invalid_experts
=
True
,
)
)
else
:
max_num_tokens_padded
=
topk_ids
.
numel
()
*
config
[
"BLOCK_SIZE_M"
]
expert_ids
=
topk_ids
.
view
(
-
1
)
num_tokens_post_padded
=
torch
.
empty
(
(
1
),
dtype
=
torch
.
int32
,
device
=
topk_ids
.
device
)
num_tokens_post_padded
.
fill_
(
max_num_tokens_padded
)
sorted_token_ids
=
None
dispatch_fused_moe_kernel
(
dispatch_fused_moe_kernel
(
qhidden_states
,
qhidden_states
,
...
@@ -2073,8 +2104,18 @@ class TritonExperts(mk.FusedMoEExpertsModular):
...
@@ -2073,8 +2104,18 @@ class TritonExperts(mk.FusedMoEExpertsModular):
)
)
intermediate_cache3
=
_resize_cache
(
workspace2
,
(
num_tokens
,
top_k_num
,
K
))
intermediate_cache3
=
_resize_cache
(
workspace2
,
(
num_tokens
,
top_k_num
,
K
))
sorted_token_ids
,
expert_ids
,
num_tokens_post_padded
=
moe_align_block_size
(
sorted_token_ids
,
expert_ids
,
num_tokens_post_padded
=
(
topk_ids
,
config
[
"BLOCK_SIZE_M"
],
global_num_experts
,
expert_map
_prepare_expert_assignment
(
topk_ids
,
config
,
num_tokens
,
top_k_num
,
global_num_experts
,
expert_map
,
use_int8_w8a16
=
self
.
quant_config
.
use_int8_w8a16
,
use_int4_w4a16
=
self
.
quant_config
.
use_int4_w4a16
,
block_shape
=
self
.
block_shape
,
)
)
)
invoke_fused_moe_triton_kernel
(
invoke_fused_moe_triton_kernel
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment