Unverified Commit a2b3d9b9 authored by Qiaolin Yu's avatar Qiaolin Yu Committed by GitHub
Browse files

Update DeepSeek-R1-FP4 default config on blackwell (#11512)

parent 9a30914e
...@@ -802,7 +802,32 @@ class ServerArgs: ...@@ -802,7 +802,32 @@ class ServerArgs:
hf_config = self.get_hf_config() hf_config = self.get_hf_config()
model_arch = hf_config.architectures[0] model_arch = hf_config.architectures[0]
if model_arch in ["GptOssForCausalLM"]: if model_arch in ["DeepseekV3ForCausalLM"]:
if is_cuda() and is_sm100_supported():
if (
self.attention_backend is None
and self.prefill_attention_backend is None
and self.decode_attention_backend is None
):
self.attention_backend = "trtllm_mla"
logger.info(
"Use trtllm_mla as attention backend on sm100 for DeepseekV3ForCausalLM"
)
if not self.enable_dp_attention:
self.enable_flashinfer_allreduce_fusion = True
logger.info(
"Enable FlashInfer AllReduce Fusion on sm100 for DeepseekV3ForCausalLM"
)
if (
self.quantization == "modelopt_fp4"
and self.moe_runner_backend == "auto"
):
self.moe_runner_backend = "flashinfer_trtllm"
logger.info(
"Use flashinfer_trtllm as moe runner backend on sm100 for DeepseekV3ForCausalLM"
)
elif model_arch in ["GptOssForCausalLM"]:
if ( if (
self.attention_backend is None self.attention_backend is None
and self.prefill_attention_backend is None and self.prefill_attention_backend is None
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment