Unverified commit 258d02c8
Authored Sep 15, 2025 by fzyzcjy; committed by GitHub on Sep 14, 2025
Parent: 60d7beda

Fix correction bias undefined behavior for nvfp4 models (#10426)
Showing 2 changed files with 5 additions and 1 deletion:

python/sglang/srt/models/deepseek_v2.py (+3, -1)
sgl-kernel/csrc/moe/moe_fused_gate.cu (+2, -0)
python/sglang/srt/models/deepseek_v2.py

@@ -65,6 +65,7 @@ from sglang.srt.layers.moe import (
     get_deepep_mode,
     get_moe_a2a_backend,
     should_use_flashinfer_cutlass_moe_fp4_allgather,
+    should_use_flashinfer_trtllm_moe,
 )
 from sglang.srt.layers.moe.ep_moe.layer import DeepEPMoE, get_moe_impl_class
 from sglang.srt.layers.moe.fused_moe_triton.layer import (

@@ -375,7 +376,8 @@ class DeepseekV2MoE(nn.Module):
         )

         correction_bias = self.gate.e_score_correction_bias
-        if _is_fp4_quantization_enabled():
+        # https://github.com/sgl-project/sglang/pull/9834#discussion_r2324480643
+        if _is_fp4_quantization_enabled() and should_use_flashinfer_trtllm_moe():
             correction_bias = correction_bias.to(torch.bfloat16)
         self.topk = TopK(
             top_k=config.num_experts_per_tok + self.num_fused_shared_experts,
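Note on the Python change: the old guard cast correction_bias to bfloat16 whenever nvfp4 quantization was enabled; the fix narrows it to the case where the FlashInfer TRT-LLM MoE backend is also in use, so other fp4 setups keep the bias in its original dtype. A minimal standalone sketch of that guard follows; the wrapper function is hypothetical, while the helper names and the cast mirror the diff above.

import torch


def prepare_correction_bias(
    correction_bias: torch.Tensor,
    fp4_enabled: bool,
    use_flashinfer_trtllm_moe: bool,
) -> torch.Tensor:
    # Hypothetical wrapper, not the sglang API: mirrors the guard added in
    # this commit. The bias is cast to bfloat16 only when both the nvfp4
    # path and the FlashInfer TRT-LLM MoE backend are active; in every
    # other case it keeps its original dtype.
    if fp4_enabled and use_flashinfer_trtllm_moe:
        return correction_bias.to(torch.bfloat16)
    return correction_bias


bias = torch.zeros(256, dtype=torch.float32)
# Before this commit, the fp4-only condition cast the bias even when the
# TRT-LLM MoE backend was not used, which could leave its dtype out of sync
# with the routing logits handed to the fused gate kernel.
assert prepare_correction_bias(bias, True, False).dtype == torch.float32
assert prepare_correction_bias(bias, True, True).dtype == torch.bfloat16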
sgl-kernel/csrc/moe/moe_fused_gate.cu

@@ -385,6 +385,8 @@ std::vector<at::Tensor> moe_fused_gate(
     int64_t num_fused_shared_experts,
     double routed_scaling_factor,
     bool apply_routed_scaling_factor_on_output) {
+  TORCH_CHECK(
+      input.dtype() == bias.dtype(), "input and bias should have the same dtype");
   int64_t num_rows = input.size(0);
   int32_t num_experts = input.size(1);
   auto options = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
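Note on the kernel change: the added TORCH_CHECK turns a dtype mismatch between the routing logits (input) and the correction bias into an explicit error at the C++ boundary, which is presumably the undefined behavior the commit title refers to. A Python-side sketch of the same fail-fast check follows; the function name is hypothetical and only illustrates the guard, while the real check lives in moe_fused_gate.

import torch


def check_gate_inputs(router_logits: torch.Tensor, correction_bias: torch.Tensor) -> None:
    # Hypothetical Python mirror of the TORCH_CHECK added in moe_fused_gate.cu.
    # The fused gate kernel reads both tensors through a single element type,
    # so a bias whose dtype differs from the logits could be misread; failing
    # fast replaces that with a clear error message.
    if router_logits.dtype != correction_bias.dtype:
        raise RuntimeError("input and bias should have the same dtype")


# Example mismatch (dtypes chosen for illustration): logits in one dtype,
# correction bias cast to another, as the old Python-side guard could produce
# for nvfp4 models not using the FlashInfer TRT-LLM MoE backend.
logits = torch.randn(4, 256, dtype=torch.float32)
bias = torch.zeros(256, dtype=torch.bfloat16)
try:
    check_gate_inputs(logits, bias)
except RuntimeError as exc:
    print(exc)  # input and bias should have the same dtype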