Unverified commit c143f416 authored by b8zhong, committed by GitHub
Browse files

fix: Llama 4 BF16 load on Blackwell (#12308)

parent b48354c5
......@@ -972,10 +972,11 @@ class ServerArgs:
"Use trtllm_mha as attention backend on sm100 for Llama4 model"
)
if is_sm100_supported() and self.moe_runner_backend == "auto":
self.moe_runner_backend = "flashinfer_trtllm"
logger.info(
"Use flashinfer_trtllm as MoE runner backend on SM100 for Llama4"
)
if self.quantization in {"fp8", "modelopt_fp8"}:
self.moe_runner_backend = "flashinfer_trtllm"
logger.info(
"Use flashinfer_trtllm as MoE runner backend on SM100 for Llama4"
)
elif model_arch in [
"Gemma2ForCausalLM",
"Gemma3ForCausalLM",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.