Unverified Commit a2b3d9b9 authored by Qiaolin Yu's avatar Qiaolin Yu Committed by GitHub
Browse files

Update DeepSeek-R1-FP4 default config on blackwell (#11512)

parent 9a30914e
...@@ -802,7 +802,32 @@ class ServerArgs: ...@@ -802,7 +802,32 @@ class ServerArgs:
hf_config = self.get_hf_config() hf_config = self.get_hf_config()
model_arch = hf_config.architectures[0] model_arch = hf_config.architectures[0]
if model_arch in ["GptOssForCausalLM"]: if model_arch in ["DeepseekV3ForCausalLM"]:
if is_cuda() and is_sm100_supported():
if (
self.attention_backend is None
and self.prefill_attention_backend is None
and self.decode_attention_backend is None
):
self.attention_backend = "trtllm_mla"
logger.info(
"Use trtllm_mla as attention backend on sm100 for DeepseekV3ForCausalLM"
)
if not self.enable_dp_attention:
self.enable_flashinfer_allreduce_fusion = True
logger.info(
"Enable FlashInfer AllReduce Fusion on sm100 for DeepseekV3ForCausalLM"
)
if (
self.quantization == "modelopt_fp4"
and self.moe_runner_backend == "auto"
):
self.moe_runner_backend = "flashinfer_trtllm"
logger.info(
"Use flashinfer_trtllm as moe runner backend on sm100 for DeepseekV3ForCausalLM"
)
elif model_arch in ["GptOssForCausalLM"]:
if ( if (
self.attention_backend is None self.attention_backend is None
and self.prefill_attention_backend is None and self.prefill_attention_backend is None
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment