"git@developer.sourcefind.cn:change/sglang.git" did not exist on "66283dbc0c052c6f32bde68451addc5b0d00cf3b"
Unverified commit cd135bfe authored by Ke Bao, committed by GitHub

Update dsv3 quantization auto setting for sm100 (#12778)

parent fc84b073
@@ -912,19 +912,32 @@ class ServerArgs:
                 logger.info(
                     "Enable FlashInfer AllReduce Fusion on sm100 for DeepseekV3ForCausalLM"
                 )
-                if self.moe_a2a_backend == "none" and self.moe_runner_backend == "auto":
-                    self.moe_runner_backend = "flashinfer_trtllm"
-                    logger.info(
-                        "Use flashinfer_trtllm as MoE runner backend on sm100 for DeepseekV3ForCausalLM"
-                    )
-                if self.quantization is None:
-                    # Default DeepSeek V3/R1 native FP8 when not explicitly set,
-                    # Because we need this condition for an assertion in
-                    # flashinfer_trtllm MoE runner backend.
-                    self.quantization = "fp8"
-                    logger.info(
-                        "Quantization not specified, default to fp8 for DeepSeek on sm100"
-                    )
+                quantization_config = getattr(hf_config, "quantization_config", None)
+                quant_method = (
+                    quantization_config.get("quant_method")
+                    if quantization_config is not None
+                    else None
+                )
+                if self.quantization is None:
+                    # Default DeepSeek V3/R1 native FP8 when not explicitly set,
+                    # Because we need this condition for an assertion in
+                    # flashinfer_trtllm MoE runner backend.
+                    if quant_method is None:
+                        self.quantization = "fp8"
+                        logger.info(
+                            "Quantization not specified, default to fp8 for DeepSeek on sm100"
+                        )
+                    else:
+                        self.quantization = quant_method
+                if (
+                    self.moe_a2a_backend == "none"
+                    and self.moe_runner_backend == "auto"
+                    and self.quantization in ["fp8", "modelopt_fp8", "modelopt_fp4"]
+                ):
+                    self.moe_runner_backend = "flashinfer_trtllm"
+                    logger.info(
+                        "Use flashinfer_trtllm as MoE runner backend on sm100 for DeepseekV3ForCausalLM"
+                    )
         elif model_arch in ["GptOssForCausalLM"]:
             if (
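In short: on sm100, the server now resolves DeepSeek V3/R1 quantization from the checkpoint's HuggingFace quantization_config before picking a MoE runner backend, and it only auto-selects flashinfer_trtllm when the resolved quantization is one of fp8, modelopt_fp8, or modelopt_fp4. The sketch below restates that logic in isolation; ServerArgsLike, resolve_dsv3_sm100_defaults, and the plain-dict hf_config are hypothetical stand-ins for illustration, not sglang API.

from dataclasses import dataclass
from typing import Optional

@dataclass
class ServerArgsLike:
    # Hypothetical stand-in for the relevant ServerArgs fields.
    quantization: Optional[str] = None
    moe_a2a_backend: str = "none"
    moe_runner_backend: str = "auto"

def resolve_dsv3_sm100_defaults(args: ServerArgsLike, hf_config: dict) -> ServerArgsLike:
    # Pull quant_method from the checkpoint's quantization_config, if present.
    quantization_config = hf_config.get("quantization_config")
    quant_method = (
        quantization_config.get("quant_method")
        if quantization_config is not None
        else None
    )
    if args.quantization is None:
        # DeepSeek V3/R1 ships native FP8 weights, so an unannotated
        # checkpoint defaults to "fp8"; otherwise trust the config.
        args.quantization = "fp8" if quant_method is None else quant_method
    # Auto-select flashinfer_trtllm only for quantizations it supports.
    if (
        args.moe_a2a_backend == "none"
        and args.moe_runner_backend == "auto"
        and args.quantization in ("fp8", "modelopt_fp8", "modelopt_fp4")
    ):
        args.moe_runner_backend = "flashinfer_trtllm"
    return args

# A ModelOpt FP4 checkpoint: both fields are inferred.
a = resolve_dsv3_sm100_defaults(
    ServerArgsLike(), {"quantization_config": {"quant_method": "modelopt_fp4"}}
)
assert (a.quantization, a.moe_runner_backend) == ("modelopt_fp4", "flashinfer_trtllm")

# An unsupported quant method: the backend is left on "auto".
b = resolve_dsv3_sm100_defaults(
    ServerArgsLike(), {"quantization_config": {"quant_method": "awq"}}
)
assert (b.quantization, b.moe_runner_backend) == ("awq", "auto")

The design point the commit captures is the second branch: before this change the backend was switched to flashinfer_trtllm without checking quantization, which could trip an assertion inside that runner for checkpoints quantized with an unsupported method.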