Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
95089607
Unverified
Commit
95089607
authored
Aug 28, 2025
by
Po-Han Huang (NVIDIA)
Committed by
GitHub
Aug 28, 2025
Browse files
[Model][gpt-oss] Support DP+EP for GPT-OSS with FlashInfer trtllm-gen MoE (#23819)
Signed-off-by:
Po-Han Huang
<
pohanh@nvidia.com
>
parent
1f096f9b
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
14 additions
and
15 deletions
+14
-15
vllm/model_executor/layers/fused_moe/config.py
vllm/model_executor/layers/fused_moe/config.py
+8
-7
vllm/model_executor/layers/fused_moe/layer.py
vllm/model_executor/layers/fused_moe/layer.py
+4
-4
vllm/model_executor/layers/quantization/mxfp4.py
vllm/model_executor/layers/quantization/mxfp4.py
+2
-4
No files found.
vllm/model_executor/layers/fused_moe/config.py
View file @
95089607
...
...
@@ -190,12 +190,6 @@ class FusedMoEParallelConfig:
return
(
self
.
use_all2all_kernels
and
envs
.
VLLM_ALL2ALL_BACKEND
==
"deepep_low_latency"
)
@
property
def
use_flashinfer_cutlass_kernels
(
self
):
return
(
envs
.
VLLM_USE_FLASHINFER_MOE_FP4
and
has_flashinfer_cutlass_fused_moe
()
and
envs
.
VLLM_FLASHINFER_MOE_BACKEND
==
"throughput"
)
@
staticmethod
def
make
(
tp_size_
:
int
,
dp_size_
:
int
,
vllm_parallel_config
:
ParallelConfig
)
->
"FusedMoEParallelConfig"
:
...
...
@@ -404,7 +398,14 @@ class FusedMoEConfig:
@
property
def
use_flashinfer_cutlass_kernels
(
self
):
return
self
.
moe_parallel_config
.
use_flashinfer_cutlass_kernels
"""
Whether to use FlashInfer cutlass kernels for NVFP4 MoE.
"""
return
(
self
.
quant_config
is
not
None
and
self
.
quant_config
.
quant_dtype
==
"nvfp4"
and
envs
.
VLLM_USE_FLASHINFER_MOE_FP4
and
has_flashinfer_cutlass_fused_moe
()
and
envs
.
VLLM_FLASHINFER_MOE_BACKEND
==
"throughput"
)
@
staticmethod
def
make
(
...
...
vllm/model_executor/layers/fused_moe/layer.py
View file @
95089607
...
...
@@ -920,7 +920,7 @@ class FusedMoE(CustomOp):
self
.
batched_router_logits
:
Optional
[
torch
.
Tensor
]
=
None
if
(
self
.
moe_parallel_config
.
use_pplx_kernels
or
self
.
moe_parallel_config
.
use_deepep_ll_kernels
or
self
.
moe_
parallel_
config
.
use_flashinfer_cutlass_kernels
):
or
self
.
moe_config
.
use_flashinfer_cutlass_kernels
):
self
.
batched_hidden_states
=
torch
.
zeros
(
(
moe
.
max_num_tokens
,
self
.
hidden_size
),
dtype
=
moe
.
in_dtype
,
...
...
@@ -974,7 +974,7 @@ class FusedMoE(CustomOp):
@
property
def
use_flashinfer_cutlass_kernels
(
self
):
return
self
.
moe_
parallel_
config
.
use_flashinfer_cutlass_kernels
return
self
.
moe_config
.
use_flashinfer_cutlass_kernels
def
update_expert_map
(
self
):
# ep_size and ep_rank should already be updated
...
...
@@ -1665,7 +1665,7 @@ class FusedMoE(CustomOp):
# only when data parallelism (DP) is enabled.
use_flashinfer_cutlass_kernels
=
(
self
.
dp_size
>
1
and
self
.
moe_
parallel_
config
.
use_flashinfer_cutlass_kernels
)
and
self
.
moe_config
.
use_flashinfer_cutlass_kernels
)
if
(
self
.
moe_parallel_config
.
use_pplx_kernels
or
self
.
moe_parallel_config
.
use_deepep_ll_kernels
or
use_flashinfer_cutlass_kernels
):
...
...
@@ -1674,7 +1674,7 @@ class FusedMoE(CustomOp):
do_naive_dispatch_combine
:
bool
=
(
self
.
dp_size
>
1
and
not
self
.
moe_parallel_config
.
use_deepep_ht_kernels
and
not
self
.
moe_
parallel_
config
.
use_flashinfer_cutlass_kernels
)
and
not
self
.
moe_config
.
use_flashinfer_cutlass_kernels
)
if
do_naive_dispatch_combine
:
hidden_states
,
router_logits
=
get_ep_group
().
dispatch
(
hidden_states
,
router_logits
)
...
...
vllm/model_executor/layers/quantization/mxfp4.py
View file @
95089607
...
...
@@ -623,8 +623,6 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
if
should_use_flashinfer_mxfp4
():
from
flashinfer
import
mxfp8_quantize
,
trtllm_fp4_block_scale_moe
assert
not
self
.
moe
.
use_ep
,
(
"EP is not supported for flashinfer mxfp4 moe backend yet."
)
if
_should_use_flashinfer_mxfp4_bf16
():
assert
x
.
dtype
==
torch
.
bfloat16
x_quant
=
x
...
...
@@ -650,12 +648,12 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
None
,
# output1_scale_scalar
None
,
# output1_scale_gate_scalar
None
,
# output2_scale_scalar
self
.
num_experts
,
global_
num_experts
,
top_k
,
None
,
# n_group
None
,
# topk_group
self
.
intermediate_size
,
# padded to multiple of 256
0
,
# local_expert_offset
layer
.
ep_rank
*
layer
.
local_num_experts
,
# local_expert_offset
self
.
num_experts
,
# local num experts
None
,
self
.
_get_tile_tokens_dim
(
x
,
top_k
),
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment