Commit 58c468f4 (Unverified)
Authored Jul 25, 2025 by Trevor Morris; committed by GitHub, Jul 25, 2025

Fix FP4 MoE accuracy from missing routed_scaling_factor (#8333)
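For context: in MoE models that define a routed scaling factor, the weighted combination of expert outputs must be multiplied by that factor before it is returned; the FP4 fused-MoE paths changed below were dropping it. A minimal sketch of where the factor enters, with illustrative names only (this is not code from the repo):

```python
import torch

def combine_experts(
    expert_outputs: torch.Tensor,   # [num_tokens, top_k, hidden_dim]
    topk_weights: torch.Tensor,     # [num_tokens, top_k] router weights
    routed_scaling_factor: float | None = None,
) -> torch.Tensor:
    # Weighted sum of each token's top-k expert outputs.
    output = (expert_outputs * topk_weights.unsqueeze(-1)).sum(dim=1)
    # The step this commit restores in the FP4 fused-MoE paths:
    if routed_scaling_factor is not None:
        output *= routed_scaling_factor
    return output
```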
parent f8ca2368

Showing 2 changed files with 8 additions and 8 deletions
python/sglang/srt/layers/quantization/modelopt_quant.py  +8 -4
python/sglang/srt/server_args.py  +0 -4
python/sglang/srt/layers/quantization/modelopt_quant.py
@@ -952,7 +952,6 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
         tp_rank: Optional[int] = None,
         tp_size: Optional[int] = None,
     ) -> torch.Tensor:
-
         assert activation == "silu", "Only SiLU activation is supported."
         if self.enable_flashinfer_moe:
@@ -982,13 +981,15 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
                 tp_size=tp_size,
                 tp_rank=tp_rank,
                 tune_max_num_tokens=next_power_of_2(x.shape[0]),
-            )
-            return output[0]
+            )[0]
+            if routed_scaling_factor is not None:
+                output *= routed_scaling_factor
+            return output

         from sglang.srt.layers.moe.cutlass_moe import cutlass_moe_fp4

         topk_weights, topk_ids, _ = topk_output
-        return cutlass_moe_fp4(
+        output = cutlass_moe_fp4(
             a=x,
             a1_gscale=layer.w13_input_scale_quant,
             w1_fp4=layer.w13_weight,
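The flashinfer branch previously returned the kernel's first output directly, leaving no place to apply the factor; the hunk above rebinds it to `output` first. A condensed, runnable sketch of the post-change pattern, where `fused_moe_kernel_stub` stands in for the real kernel call that sits in the elided context above the hunk:

```python
from typing import Optional
import torch

def fused_moe_kernel_stub(x: torch.Tensor) -> tuple[torch.Tensor, ...]:
    # Stand-in: the real kernel returns a tuple whose first element is the
    # MoE output, hence the [0] indexing in the diff.
    return (x.clone(),)

def moe_forward(x: torch.Tensor, routed_scaling_factor: Optional[float]) -> torch.Tensor:
    output = fused_moe_kernel_stub(x)[0]
    # Before this commit the branch returned the tuple element directly,
    # skipping the scaling below.
    if routed_scaling_factor is not None:
        output *= routed_scaling_factor
    return output
```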
@@ -1003,3 +1004,6 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
             params=layer.cutlass_moe_params,
             apply_router_weight_on_input=apply_router_weight_on_input,
         ).to(x.dtype)
+        if routed_scaling_factor is not None:
+            output *= routed_scaling_factor
+        return output
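The same two lines close out the cutlass branch, scaling after the `.to(x.dtype)` cast. A toy illustration of the error magnitude when the factor is dropped; 2.5 is a made-up example value here:

```python
import torch

torch.manual_seed(0)
kernel_out = torch.randn(4, 8)   # stand-in for cutlass_moe_fp4(...).to(x.dtype)
routed_scaling_factor = 2.5      # hypothetical model config value

pre_fix = kernel_out                           # factor silently dropped
post_fix = kernel_out * routed_scaling_factor  # post-commit behavior

# Every routed activation is off by the same multiplicative factor, and the
# error compounds through the residual stream at each MoE layer.
rel_err = (post_fix - pre_fix).norm() / post_fix.norm()
print(f"relative error from dropping the factor: {rel_err:.2f}")  # 0.60
```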
python/sglang/srt/server_args.py
@@ -433,10 +433,6 @@ class ServerArgs:
                 self.quantization == "modelopt_fp4"
             ), "modelopt_fp4 quantization is required for Flashinfer MOE"
             os.environ["TRTLLM_ENABLE_PDL"] = "1"
-            self.disable_shared_experts_fusion = True
-            logger.warning(
-                f"Flashinfer MoE is enabled. Shared expert fusion is disabled."
-            )

         # DeepEP MoE
         if self.enable_deepep_moe:
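With those four lines removed, enabling Flashinfer MoE still requires modelopt_fp4 quantization and sets TRTLLM_ENABLE_PDL, but it no longer force-disables shared expert fusion. A condensed, hypothetical stand-in for the post-change check (the real logic lives on ServerArgs):

```python
import os

def check_flashinfer_moe(quantization: str) -> None:
    # Hypothetical stand-in for the post-change ServerArgs validation.
    assert (
        quantization == "modelopt_fp4"
    ), "modelopt_fp4 quantization is required for Flashinfer MOE"
    os.environ["TRTLLM_ENABLE_PDL"] = "1"
    # Before this commit, the block also set disable_shared_experts_fusion
    # to True and logged a warning; that override is gone.

check_flashinfer_moe("modelopt_fp4")
```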