Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
fc16f1c4
Unverified
Commit
fc16f1c4
authored
Oct 31, 2025
by
Shu Wang
Committed by
GitHub
Oct 31, 2025
Browse files
Flashinfer_CUTLASS_MOE fuses quantization for TP (#27223)
Signed-off-by:
Shu Wang.
<
shuw@nvidia.com
>
parent
bc306fe5
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
15 additions
and
32 deletions
+15
-32
vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
...model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
+5
-1
vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
...r/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
+9
-8
vllm/model_executor/layers/quantization/modelopt.py
vllm/model_executor/layers/quantization/modelopt.py
+0
-23
vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py
..._executor/layers/quantization/utils/flashinfer_fp4_moe.py
+1
-0
No files found.
vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
View file @
fc16f1c4
...
...
@@ -56,6 +56,7 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute):
ep_size
:
int
=
1
,
tp_rank
:
int
=
0
,
tp_size
:
int
=
1
,
use_dp
:
bool
=
False
,
):
super
().
__init__
(
quant_config
)
assert
quant_config
.
quant_dtype
in
(
"nvfp4"
,
torch
.
float8_e4m3fn
,
None
),
(
...
...
@@ -67,6 +68,7 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute):
self
.
tp_rank
=
tp_rank
self
.
tp_size
=
tp_size
self
.
out_dtype
=
out_dtype
self
.
use_dp
=
use_dp
@
property
def
activation_formats
(
...
...
@@ -117,7 +119,8 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute):
"""
workspace1
=
(
M
,
K
)
workspace2
=
(
0
,)
output_shape
=
(
M
,
K
*
2
if
self
.
quant_dtype
==
"nvfp4"
else
K
)
# For TP, the quantization is fused with fused_moe call.
output_shape
=
(
M
,
K
*
2
if
self
.
quant_dtype
==
"nvfp4"
and
self
.
use_dp
else
K
)
# The workspace is determined by `aq`, since it comes after any
# potential communication op and is involved in the expert computation.
return
(
workspace1
,
workspace2
,
output_shape
)
...
...
@@ -214,6 +217,7 @@ def flashinfer_cutlass_moe_fp4(
FlashInferExperts
(
out_dtype
=
hidden_states
.
dtype
,
quant_config
=
quant_config
,
use_dp
=
False
,
),
)
...
...
vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
View file @
fc16f1c4
...
...
@@ -170,6 +170,8 @@ class FlashInferAllGatherMoEPrepareAndFinalize(FlashInferCutlassMoEPrepareAndFin
self
.
_apply_router_weight_on_input
(
a1
,
topk_weights
,
topk_ids
,
apply_router_weight_on_input
)
if
not
self
.
use_dp
:
return
a1
,
None
,
None
,
topk_ids
,
topk_weights
a1q
,
a1q_scale
=
moe_kernel_quantize_input
(
a1
,
...
...
@@ -179,7 +181,6 @@ class FlashInferAllGatherMoEPrepareAndFinalize(FlashInferCutlassMoEPrepareAndFin
quant_config
.
block_shape
,
is_fp4_scale_swizzled
=
not
self
.
use_dp
,
)
if
self
.
use_dp
:
topk_weights
,
topk_ids
,
a1q
,
a1q_scale
=
get_dp_group
().
all_gatherv
(
[
topk_weights
,
topk_ids
,
a1q
,
a1q_scale
],
dim
=
0
,
...
...
vllm/model_executor/layers/quantization/modelopt.py
View file @
fc16f1c4
...
...
@@ -1769,29 +1769,6 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
expert_map
=
expert_map
,
apply_router_weight_on_input
=
apply_router_weight_on_input
,
)
elif
(
self
.
allow_flashinfer
and
self
.
flashinfer_moe_backend
==
FlashinferMoeBackend
.
CUTLASS
):
from
vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe
import
(
# noqa: E501
flashinfer_cutlass_moe_fp4
,
)
assert
self
.
moe_quant_config
is
not
None
return
flashinfer_cutlass_moe_fp4
(
hidden_states
=
x
,
w1
=
layer
.
w13_weight
,
w2
=
layer
.
w2_weight
,
topk_weights
=
topk_weights
,
topk_ids
=
topk_ids
,
quant_config
=
self
.
moe_quant_config
,
inplace
=
False
,
activation
=
activation
,
global_num_experts
=
global_num_experts
,
expert_map
=
expert_map
,
apply_router_weight_on_input
=
apply_router_weight_on_input
,
)
else
:
# If no modular kernel is provided, use cutlass_moe_fp4 for TP case
# only (no EP).
...
...
vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py
View file @
fc16f1c4
...
...
@@ -79,6 +79,7 @@ def select_nvfp4_gemm_impl(
ep_size
=
moe
.
moe_parallel_config
.
ep_size
,
tp_rank
=
moe
.
moe_parallel_config
.
tp_rank
,
tp_size
=
moe
.
moe_parallel_config
.
tp_size
,
use_dp
=
moe
.
moe_parallel_config
.
dp_size
>
1
,
)
# native cutlass experts currently don't support DP; TP case won't call this
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment