zhaoyu6 / sglang · Commits · fb2e816e

Commit fb2e816e (unverified), authored Nov 05, 2025 by Lianmin Zheng, committed via GitHub on Nov 05, 2025

Fix server args for gpt oss so users can override the moe runner backend (#12696)
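The fix makes the GPT-OSS auto-selection respect an explicitly chosen MoE runner backend. As a hedged illustration (the flag spelling is assumed from the moe_runner_backend field and sglang's usual CLI conventions, and the model path is only an example), a launch such as

    python -m sglang.launch_server --model-path openai/gpt-oss-120b --moe-runner-backend triton_kernel

should now keep triton_kernel instead of being silently switched to flashinfer_mxfp4 on Blackwell GPUs with MXFP4 weights.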
Parent: 7c45b8b4

Showing 1 changed file with 15 additions and 18 deletions.

python/sglang/srt/server_args.py @ fb2e816e (+15, -18)
@@ -959,30 +959,27 @@ class ServerArgs:
                 quantization_config is not None
                 and quantization_config.get("quant_method") == "mxfp4"
             )
-            if is_mxfp4_quant_format:
-                # use bf16 for mxfp4 triton kernels
-                self.dtype = "bfloat16"
-            if is_blackwell_supported() and is_mxfp4_quant_format:
-                self.moe_runner_backend = "flashinfer_mxfp4"
-                logger.warning(
-                    "Detected SM100 and MXFP4 quantization format for GPT-OSS model, enabling FlashInfer MXFP4 MOE kernel."
-                )
-            else:
-                if self.moe_runner_backend == "triton_kernel":
-                    assert (
-                        self.ep_size == 1
-                    ), "Triton kernel MoE is only supported when ep_size == 1"
-                if (
-                    self.moe_runner_backend == "auto"
-                    and self.ep_size == 1
-                    and is_triton_kernels_available()
-                ):
+            if self.moe_runner_backend == "auto":
+                if is_blackwell_supported() and is_mxfp4_quant_format:
+                    self.moe_runner_backend = "flashinfer_mxfp4"
+                    logger.warning(
+                        "Detected SM100 and MXFP4 quantization format for GPT-OSS model, enabling FlashInfer MXFP4 MOE kernel."
+                    )
+                elif self.ep_size == 1 and is_triton_kernels_available():
                     self.moe_runner_backend = "triton_kernel"
                     logger.warning(
                         "Detected GPT-OSS model, enabling triton_kernels MOE kernel."
                     )
+            if self.moe_runner_backend == "triton_kernel":
+                assert (
+                    self.ep_size == 1
+                ), "Triton kernel MoE is only supported when ep_size == 1"
+                self.disable_hybrid_swa_memory = True
+            if is_mxfp4_quant_format:
+                # use bf16 for mxfp4 triton kernels
+                self.dtype = "bfloat16"
         elif "Llama4" in model_arch and self.device != "cpu":
             assert self.attention_backend in {
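For reference, below is a minimal standalone sketch of the selection order after this change. The helper names mirror those in the diff (is_blackwell_supported, is_triton_kernels_available), but the free-function form and the resolve_moe_runner_backend wrapper are illustrative, not the module's actual layout.

    # Sketch of the post-fix backend selection for GPT-OSS (illustrative, not sglang's API).
    def resolve_moe_runner_backend(
        moe_runner_backend: str,
        ep_size: int,
        blackwell: bool,                 # stands in for is_blackwell_supported()
        mxfp4: bool,                     # stands in for is_mxfp4_quant_format
        triton_kernels_available: bool,  # stands in for is_triton_kernels_available()
    ) -> str:
        # Auto-selection now runs only when the user left the backend at "auto",
        # so an explicit user choice is never overridden.
        if moe_runner_backend == "auto":
            if blackwell and mxfp4:
                moe_runner_backend = "flashinfer_mxfp4"
            elif ep_size == 1 and triton_kernels_available:
                moe_runner_backend = "triton_kernel"
        # The ep_size constraint still applies however the backend was chosen.
        if moe_runner_backend == "triton_kernel":
            assert ep_size == 1, "Triton kernel MoE is only supported when ep_size == 1"
        return moe_runner_backend

    # A user-forced backend survives; "auto" still picks flashinfer_mxfp4 on Blackwell + MXFP4.
    assert resolve_moe_runner_backend("triton_kernel", 1, True, True, True) == "triton_kernel"
    assert resolve_moe_runner_backend("auto", 1, True, True, True) == "flashinfer_mxfp4"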