Unverified commit 76915d68 authored by Xiaoyu Zhang, committed by GitHub

Fix enable flashinfer mxfp4 moe bf16 check (#8950)

parent 39fd1788
@@ -476,8 +476,15 @@ class ServerArgs:
                 self.attention_backend == "trtllm_mha"
                 or self.attention_backend == "triton"
             )
+            quantization_config = getattr(
+                self.get_hf_config(), "quantization_config", None
+            )
+            is_mxfp4_quant_format = (
+                quantization_config is not None
+                and quantization_config.get("quant_method") == "mxfp4"
+            )
-            if is_sm100_supported():
+            if is_sm100_supported() and is_mxfp4_quant_format:
                 self.enable_flashinfer_mxfp4_moe = True
                 self.enable_triton_kernel_moe = False
             else:
@@ -485,13 +492,7 @@ class ServerArgs:
                 self.disable_hybrid_swa_memory = True
-                quantization_config = getattr(
-                    self.get_hf_config(), "quantization_config", None
-                )
-                if (
-                    quantization_config is not None
-                    and quantization_config.get("quant_method") == "mxfp4"
-                ):
+                if is_mxfp4_quant_format:
                     # use bf16 for mxfp4 triton kernels
                     self.dtype = "bfloat16"
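Read outside the diff, the change consolidates the mxfp4 detection into a single is_mxfp4_quant_format flag and gates the FlashInfer MoE path on it. The sketch below is a simplified, standalone rendering of that logic, not the actual ServerArgs method: the function name, the args/hf_config parameters, and the collapsed else-branch details are assumptions made for illustration.

# Minimal sketch of the adjusted check, using assumed names (not the real ServerArgs API).
def adjust_mxfp4_moe_settings(args, hf_config, sm100_supported: bool):
    # The HF config may not carry a quantization_config at all (unquantized
    # checkpoints), so fall back to None and guard the lookup.
    quantization_config = getattr(hf_config, "quantization_config", None)
    is_mxfp4_quant_format = (
        quantization_config is not None
        and quantization_config.get("quant_method") == "mxfp4"
    )

    if sm100_supported and is_mxfp4_quant_format:
        # Enable the FlashInfer mxfp4 MoE path only when both the GPU generation
        # and the checkpoint's quantization format match.
        args.enable_flashinfer_mxfp4_moe = True
        args.enable_triton_kernel_moe = False
    else:
        # Fallback path (details elided in the diff); the Triton mxfp4 kernels
        # expect bf16 activations, so force the model dtype in that case.
        if is_mxfp4_quant_format:
            args.dtype = "bfloat16"
    return args

Before this fix, enable_flashinfer_mxfp4_moe was switched on for any model running on SM100-class hardware, even when the checkpoint was not mxfp4-quantized; after the fix, both the FlashInfer branch and the bf16 dtype override key off the same is_mxfp4_quant_format check.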