Unverified commit 76915d68 authored by Xiaoyu Zhang, committed by GitHub

Fix enable flashinfer mxfp4 moe bf16 check (#8950)

parent 39fd1788
@@ -476,8 +476,15 @@ class ServerArgs:
                 self.attention_backend == "trtllm_mha"
                 or self.attention_backend == "triton"
             )
+            quantization_config = getattr(
+                self.get_hf_config(), "quantization_config", None
+            )
+            is_mxfp4_quant_format = (
+                quantization_config is not None
+                and quantization_config.get("quant_method") == "mxfp4"
+            )
-            if is_sm100_supported():
+            if is_sm100_supported() and is_mxfp4_quant_format:
                 self.enable_flashinfer_mxfp4_moe = True
                 self.enable_triton_kernel_moe = False
             else:
@@ -485,13 +492,7 @@ class ServerArgs:
                 self.disable_hybrid_swa_memory = True
-                quantization_config = getattr(
-                    self.get_hf_config(), "quantization_config", None
-                )
-                if (
-                    quantization_config is not None
-                    and quantization_config.get("quant_method") == "mxfp4"
-                ):
+                if is_mxfp4_quant_format:
                     # use bf16 for mxfp4 triton kernels
                     self.dtype = "bfloat16"
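Read outside the diff, the change consolidates the mxfp4 detection into a single is_mxfp4_quant_format flag and gates the FlashInfer MoE path on it. The sketch below is a simplified, standalone rendering of that logic, not the actual ServerArgs method: the function name, the args/hf_config parameters, and the collapsed else-branch details are assumptions made for illustration.

# Minimal sketch of the adjusted check, using assumed names (not the real ServerArgs API).
def adjust_mxfp4_moe_settings(args, hf_config, sm100_supported: bool):
    # The HF config may not carry a quantization_config at all (unquantized
    # checkpoints), so fall back to None and guard the lookup.
    quantization_config = getattr(hf_config, "quantization_config", None)
    is_mxfp4_quant_format = (
        quantization_config is not None
        and quantization_config.get("quant_method") == "mxfp4"
    )

    if sm100_supported and is_mxfp4_quant_format:
        # Enable the FlashInfer mxfp4 MoE path only when both the GPU generation
        # and the checkpoint's quantization format match.
        args.enable_flashinfer_mxfp4_moe = True
        args.enable_triton_kernel_moe = False
    else:
        # Fallback path (details elided in the diff); the Triton mxfp4 kernels
        # expect bf16 activations, so force the model dtype in that case.
        if is_mxfp4_quant_format:
            args.dtype = "bfloat16"
    return args

Before this fix, enable_flashinfer_mxfp4_moe was switched on for any model running on SM100-class hardware, even when the checkpoint was not mxfp4-quantized; after the fix, both the FlashInfer branch and the bf16 dtype override key off the same is_mxfp4_quant_format check.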