Unverified commit c143f416 authored by b8zhong, committed by GitHub

fix: Llama 4 BF16 load on Blackwell (#12308)

parent b48354c5
@@ -972,10 +972,11 @@ class ServerArgs:
                     "Use trtllm_mha as attention backend on sm100 for Llama4 model"
                 )
             if is_sm100_supported() and self.moe_runner_backend == "auto":
-                self.moe_runner_backend = "flashinfer_trtllm"
-                logger.info(
-                    "Use flashinfer_trtllm as MoE runner backend on SM100 for Llama4"
-                )
+                if self.quantization in {"fp8", "modelopt_fp8"}:
+                    self.moe_runner_backend = "flashinfer_trtllm"
+                    logger.info(
+                        "Use flashinfer_trtllm as MoE runner backend on SM100 for Llama4"
+                    )
         elif model_arch in [
             "Gemma2ForCausalLM",
             "Gemma3ForCausalLM",
...
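In essence, the fix gates the SM100 (Blackwell) auto-selection of the flashinfer_trtllm MoE runner on the checkpoint actually being FP8-quantized, so a BF16 Llama 4 checkpoint keeps the default MoE path and loads correctly. Below is a minimal, self-contained sketch of that selection logic; `MoeRunnerArgs`, `resolve_moe_runner_backend`, and the stubbed `is_sm100_supported` are illustrative stand-ins for this example, not SGLang's actual API.

```python
from dataclasses import dataclass
from typing import Optional


def is_sm100_supported() -> bool:
    # Stub for this sketch: pretend we are running on an SM100 (Blackwell) GPU.
    # SGLang's real helper probes the CUDA device instead.
    return True


@dataclass
class MoeRunnerArgs:
    quantization: Optional[str] = None  # e.g. "fp8", "modelopt_fp8", or None for BF16
    moe_runner_backend: str = "auto"

    def resolve_moe_runner_backend(self) -> str:
        # Only promote the MoE runner to flashinfer_trtllm when the checkpoint
        # is FP8-quantized; BF16 checkpoints keep the default ("auto") path,
        # which is what lets BF16 Llama 4 load on Blackwell.
        if is_sm100_supported() and self.moe_runner_backend == "auto":
            if self.quantization in {"fp8", "modelopt_fp8"}:
                self.moe_runner_backend = "flashinfer_trtllm"
        return self.moe_runner_backend


if __name__ == "__main__":
    print(MoeRunnerArgs(quantization="fp8").resolve_moe_runner_backend())  # flashinfer_trtllm
    print(MoeRunnerArgs(quantization=None).resolve_moe_runner_backend())   # auto (BF16 checkpoint)
```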