"test/git@developer.sourcefind.cn:change/sglang.git" did not exist on "a11f8d5f6a80595cd90982b369284a5b87d50163"
Unverified Commit fb2e816e authored by Lianmin Zheng, committed by GitHub

Fix server args for gpt oss so users can override the moe runner backend (#12696)

parent 7c45b8b4
@@ -959,30 +959,27 @@ class ServerArgs:
                 quantization_config is not None
                 and quantization_config.get("quant_method") == "mxfp4"
             )
-            if is_blackwell_supported() and is_mxfp4_quant_format:
-                self.moe_runner_backend = "flashinfer_mxfp4"
-                logger.warning(
-                    "Detected SM100 and MXFP4 quantization format for GPT-OSS model, enabling FlashInfer MXFP4 MOE kernel."
-                )
-            else:
-                if self.moe_runner_backend == "triton_kernel":
-                    assert (
-                        self.ep_size == 1
-                    ), "Triton kernel MoE is only supported when ep_size == 1"
-                if (
-                    self.moe_runner_backend == "auto"
-                    and self.ep_size == 1
-                    and is_triton_kernels_available()
-                ):
+            if is_mxfp4_quant_format:
+                # use bf16 for mxfp4 triton kernels
+                self.dtype = "bfloat16"
+
+            if self.moe_runner_backend == "auto":
+                if is_blackwell_supported() and is_mxfp4_quant_format:
+                    self.moe_runner_backend = "flashinfer_mxfp4"
+                    logger.warning(
+                        "Detected SM100 and MXFP4 quantization format for GPT-OSS model, enabling FlashInfer MXFP4 MOE kernel."
+                    )
+                elif self.ep_size == 1 and is_triton_kernels_available():
                     self.moe_runner_backend = "triton_kernel"
                     logger.warning(
                         "Detected GPT-OSS model, enabling triton_kernels MOE kernel."
                     )
+            if self.moe_runner_backend == "triton_kernel":
+                assert (
+                    self.ep_size == 1
+                ), "Triton kernel MoE is only supported when ep_size == 1"
             self.disable_hybrid_swa_memory = True
-            if is_mxfp4_quant_format:
-                # use bf16 for mxfp4 triton kernels
-                self.dtype = "bfloat16"
         elif "Llama4" in model_arch and self.device != "cpu":
             assert self.attention_backend in {
...
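For readers skimming the diff, the behavioral change is easiest to see in isolation. Below is a minimal, self-contained sketch of the new selection flow; `MoeArgs`, `pick_moe_runner_backend`, and the boolean parameters (standing in for `is_blackwell_supported()`, `is_mxfp4_quant_format`, and `is_triton_kernels_available()`) are illustrative names for this example, not part of the sglang codebase.

```python
from dataclasses import dataclass


@dataclass
class MoeArgs:
    """Hypothetical stand-in for the ServerArgs fields touched by this commit."""

    moe_runner_backend: str = "auto"
    ep_size: int = 1


def pick_moe_runner_backend(
    args: MoeArgs,
    blackwell_supported: bool,
    mxfp4_quant_format: bool,
    triton_kernels_available: bool,
) -> str:
    """Sketch of the post-fix backend selection for GPT-OSS models."""
    # Auto-selection only runs when the backend was left on "auto", so an
    # explicit user choice is no longer overwritten.
    if args.moe_runner_backend == "auto":
        if blackwell_supported and mxfp4_quant_format:
            args.moe_runner_backend = "flashinfer_mxfp4"
        elif args.ep_size == 1 and triton_kernels_available:
            args.moe_runner_backend = "triton_kernel"

    # The ep_size restriction now applies whether triton_kernel was chosen
    # automatically or by the user, matching the assert moved out of the
    # old else branch.
    if args.moe_runner_backend == "triton_kernel":
        assert args.ep_size == 1, "Triton kernel MoE is only supported when ep_size == 1"

    return args.moe_runner_backend


if __name__ == "__main__":
    # User explicitly requested triton_kernel: the auto branch is skipped,
    # so the setting survives even on Blackwell with MXFP4 weights.
    print(pick_moe_runner_backend(MoeArgs("triton_kernel"), True, True, True))

    # Backend left on "auto" under the same conditions: FlashInfer MXFP4 wins.
    print(pick_moe_runner_backend(MoeArgs(), True, True, True))
```

The design point of the commit is that auto-detection is now gated on `moe_runner_backend == "auto"`, so a backend passed explicitly in the server arguments (presumably via `--moe-runner-backend`) is respected, while the `ep_size == 1` assertion is enforced on both paths.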