Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
67c153b8
Unverified
Commit 67c153b8, authored Aug 12, 2025 by Po-Han Huang (NVIDIA)
Committed by GitHub, Aug 12, 2025
Browse files
Fix Llama4 FlashInfer FP4 MoE issues (#22511)
Signed-off-by: Po-Han Huang <pohanh@nvidia.com>
parent
f7ad6a1e
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing 3 changed files with 9 additions and 5 deletions
+9
-5
vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
...model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
+0
-2
vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
...r/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
+6
-1
vllm/model_executor/layers/quantization/modelopt.py
vllm/model_executor/layers/quantization/modelopt.py
+3
-2
No files found.
vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
View file @
67c153b8
...
...
@@ -170,8 +170,6 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute):
"w1_scale and w2_scale must not "
"be None for FlashInferExperts"
)
assert
not
apply_router_weight_on_input
quant_scales
=
[
a1_gscale
,
w1_scale
.
view
(
torch
.
int32
),
...
...
vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
View file @
67c153b8
...
...
@@ -60,7 +60,12 @@ class FlashInferCutlassMoEPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
)
->
tuple
[
torch
.
Tensor
,
Optional
[
torch
.
Tensor
],
Optional
[
torch
.
Tensor
],
Optional
[
torch
.
Tensor
],
Optional
[
torch
.
Tensor
]]:
assert
not
apply_router_weight_on_input
if
apply_router_weight_on_input
:
topk
=
topk_ids
.
size
(
1
)
# TODO: this only works for topK=1, will need to update for topK>1
assert
topk
==
1
,
\
"apply_router_weight_on_input is only implemented for topk=1"
a1
.
mul_
(
topk_weights
.
to
(
a1
.
dtype
))
(
a1_gscale
,
use_dp
,
local_tokens
)
=
extract_required_args
(
extra_prepare_args
,
[
'a1_gscale'
,
'use_dp'
,
'local_tokens'
])
...
...
vllm/model_executor/layers/quantization/modelopt.py
View file @
67c153b8
...
...
@@ -1299,8 +1299,9 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
output2_scale_scalar
=
layer
.
g2_alphas
.
data
,
num_experts
=
global_num_experts
,
top_k
=
top_k
,
n_group
=
num_expert_group
,
topk_group
=
topk_group
,
n_group
=
num_expert_group
if
num_expert_group
is
not
None
else
0
,
topk_group
=
topk_group
if
topk_group
is
not
None
else
0
,
intermediate_size
=
layer
.
intermediate_size_per_partition
,
local_expert_offset
=
layer
.
ep_rank
*
layer
.
local_num_experts
,
local_num_experts
=
layer
.
local_num_experts
,
...
...
Write
Preview
Markdown
is supported
0%
Try again or attach a new file.
Attach a file
Cancel
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please register or sign in to comment