"git@developer.sourcefind.cn:change/sglang.git" did not exist on "66283dbc0c052c6f32bde68451addc5b0d00cf3b"
Unverified commit cd135bfe authored by Ke Bao, committed by GitHub

Update dsv3 quantization auto setting for sm100 (#12778)

parent fc84b073
@@ -912,19 +912,32 @@ class ServerArgs:
                 logger.info(
                     "Enable FlashInfer AllReduce Fusion on sm100 for DeepseekV3ForCausalLM"
                 )
-                if self.moe_a2a_backend == "none" and self.moe_runner_backend == "auto":
-                    self.moe_runner_backend = "flashinfer_trtllm"
-                    logger.info(
-                        "Use flashinfer_trtllm as MoE runner backend on sm100 for DeepseekV3ForCausalLM"
-                    )
-                if self.quantization is None:
-                    # Default DeepSeek V3/R1 native FP8 when not explicitly set,
-                    # Because we need this condition for an assertion in
-                    # flashinfer_trtllm MoE runner backend.
-                    self.quantization = "fp8"
-                    logger.info(
-                        "Quantization not specified, default to fp8 for DeepSeek on sm100"
-                    )
+                quantization_config = getattr(hf_config, "quantization_config", None)
+                quant_method = (
+                    quantization_config.get("quant_method")
+                    if quantization_config is not None
+                    else None
+                )
+                if self.quantization is None:
+                    # Default DeepSeek V3/R1 native FP8 when not explicitly set,
+                    # Because we need this condition for an assertion in
+                    # flashinfer_trtllm MoE runner backend.
+                    if quant_method is None:
+                        self.quantization = "fp8"
+                        logger.info(
+                            "Quantization not specified, default to fp8 for DeepSeek on sm100"
+                        )
+                    else:
+                        self.quantization = quant_method
+                if (
+                    self.moe_a2a_backend == "none"
+                    and self.moe_runner_backend == "auto"
+                    and self.quantization in ["fp8", "modelopt_fp8", "modelopt_fp4"]
+                ):
+                    self.moe_runner_backend = "flashinfer_trtllm"
+                    logger.info(
+                        "Use flashinfer_trtllm as MoE runner backend on sm100 for DeepseekV3ForCausalLM"
+                    )
         elif model_arch in ["GptOssForCausalLM"]:
             if (
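In short: on sm100, the server now resolves DeepSeek V3/R1 quantization from the checkpoint's HuggingFace quantization_config before picking a MoE runner backend, and it only auto-selects flashinfer_trtllm when the resolved quantization is one of fp8, modelopt_fp8, or modelopt_fp4. The sketch below restates that logic in isolation; ServerArgsLike, resolve_dsv3_sm100_defaults, and the plain-dict hf_config are hypothetical stand-ins for illustration, not sglang API.

from dataclasses import dataclass
from typing import Optional

@dataclass
class ServerArgsLike:
    # Hypothetical stand-in for the relevant ServerArgs fields.
    quantization: Optional[str] = None
    moe_a2a_backend: str = "none"
    moe_runner_backend: str = "auto"

def resolve_dsv3_sm100_defaults(args: ServerArgsLike, hf_config: dict) -> ServerArgsLike:
    # Pull quant_method from the checkpoint's quantization_config, if present.
    quantization_config = hf_config.get("quantization_config")
    quant_method = (
        quantization_config.get("quant_method")
        if quantization_config is not None
        else None
    )
    if args.quantization is None:
        # DeepSeek V3/R1 ships native FP8 weights, so an unannotated
        # checkpoint defaults to "fp8"; otherwise trust the config.
        args.quantization = "fp8" if quant_method is None else quant_method
    # Auto-select flashinfer_trtllm only for quantizations it supports.
    if (
        args.moe_a2a_backend == "none"
        and args.moe_runner_backend == "auto"
        and args.quantization in ("fp8", "modelopt_fp8", "modelopt_fp4")
    ):
        args.moe_runner_backend = "flashinfer_trtllm"
    return args

# A ModelOpt FP4 checkpoint: both fields are inferred.
a = resolve_dsv3_sm100_defaults(
    ServerArgsLike(), {"quantization_config": {"quant_method": "modelopt_fp4"}}
)
assert (a.quantization, a.moe_runner_backend) == ("modelopt_fp4", "flashinfer_trtllm")

# An unsupported quant method: the backend is left on "auto".
b = resolve_dsv3_sm100_defaults(
    ServerArgsLike(), {"quantization_config": {"quant_method": "awq"}}
)
assert (b.quantization, b.moe_runner_backend) == ("awq", "auto")

The design point the commit captures is the second branch: before this change the backend was switched to flashinfer_trtllm without checking quantization, which could trip an assertion inside that runner for checkpoints quantized with an unsupported method.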