Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
0d1e27a0
Unverified
Commit
0d1e27a0
authored
Aug 08, 2025
by
Xiaoyu Zhang
Committed by
GitHub
Aug 08, 2025
Browse files
Better optimization log for gpt-oss model (#8953)
parent
774b47f3
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
11 additions
and
4 deletions
+11
-4
python/sglang/srt/layers/quantization/mxfp4.py
python/sglang/srt/layers/quantization/mxfp4.py
+5
-4
python/sglang/srt/server_args.py
python/sglang/srt/server_args.py
+6
-0
No files found.
python/sglang/srt/layers/quantization/mxfp4.py
View file @
0d1e27a0
...
...
@@ -24,6 +24,7 @@ from sglang.srt.utils import (
is_cuda
,
is_flashinfer_available
,
is_hip
,
log_info_on_rank0
,
next_power_of_2
,
round_up
,
set_weight_attrs
,
...
...
@@ -34,7 +35,6 @@ has_triton_kernels = importlib.util.find_spec("triton_kernels") is not None
if
is_flashinfer_available
():
# from flashinfer.fused_moe import cutlass_fused_moe
from
flashinfer
import
(
mxfp8_quantize
,
shuffle_matrix_a
,
...
...
@@ -63,7 +63,7 @@ def _swizzle_mxfp4(quant_tensor, scale, num_warps):
scale_layout
,
scale_layout_opts
=
layout
.
make_default_matmul_mxfp4_w_scale_layout
(
mx_axis
=
1
,
num_warps
=
num_warps
)
if
is_
cuda
()
and
torch
.
cuda
.
get_device_capability
()[
0
]
==
10
:
if
_
is_
sm100_supported
:
constraints
=
{
"is_persistent"
:
True
,
"epilogue_subtile"
:
1
,
...
...
@@ -331,8 +331,9 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
def
process_weights_after_loading
(
self
,
layer
):
if
self
.
use_flashinfer
:
logger
.
info
(
"Shuffling MoE weights for FlashInfer, it might take a while..."
log_info_on_rank0
(
logger
,
"Shuffling MoE weights for FlashInfer MXFP4 moe kernel, it might take a while..."
,
)
layer
.
gemm1_alpha
=
Parameter
(
torch
.
tensor
([
1.702
]
*
self
.
num_experts
,
dtype
=
torch
.
float32
).
cuda
(),
...
...
python/sglang/srt/server_args.py
View file @
0d1e27a0
...
...
@@ -488,8 +488,14 @@ class ServerArgs:
if
is_sm100_supported
()
and
is_mxfp4_quant_format
:
self
.
enable_flashinfer_mxfp4_moe
=
True
self
.
enable_triton_kernel_moe
=
False
logger
.
info
(
"Detected SM100 and MXFP4 quantization format for GPT-OSS model, enabling FlashInfer MXFP4 MOE kernel."
)
else
:
self
.
enable_triton_kernel_moe
=
True
logger
.
info
(
"Detected GPT-OSS model, enabling triton_kernels MOE kernel."
)
self
.
disable_hybrid_swa_memory
=
True
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment