"vscode:/vscode.git/clone" did not exist on "75b3839a71b79efde600f7e08b63aa4466008c4a"
Unverified commit bacb3825 authored by b8zhong, committed by GitHub

fix: Llama 4 + trtllm gen + FP8 KV cache incompatibility (#12347)

parent b53d9e11
@@ -971,6 +971,13 @@ class ServerArgs:
                 logger.warning(
                     "Use trtllm_mha as attention backend on sm100 for Llama4 model"
                 )
+            if is_sm100_supported() and self.attention_backend == "trtllm_mha":
+                # TODO(brayden): remove this once TRTLLM MHA kernel for FP8 w/ tileSizeKv=128 is available.
+                # This is a Llama 4 specific issue only.
+                self.kv_cache_dtype = "bfloat16"
+                logger.warning(
+                    "Setting kv_cache_dtype to bfloat16 for Llama4 with trtllm_mha backend, due to a missing FlashInfer TRTLLM MHA kernel for FP8 KV Cache"
+                )
             if is_sm100_supported() and self.moe_runner_backend == "auto":
                 if self.quantization in {"fp8", "modelopt_fp8"}:
                     self.moe_runner_backend = "flashinfer_trtllm"
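For reference, below is a minimal standalone sketch of the fallback this commit adds. MockServerArgs, apply_llama4_kv_cache_fallback, and the stubbed is_sm100_supported helper are hypothetical simplifications for illustration; only the attention_backend / kv_cache_dtype fields and the "trtllm_mha" / "bfloat16" values come from the diff above, and the real logic lives inside sglang's ServerArgs handling.

import logging
from dataclasses import dataclass
from typing import Optional

logger = logging.getLogger(__name__)


def is_sm100_supported() -> bool:
    """Hypothetical stub for the real capability check; pretend we are on an SM100 (Blackwell) GPU."""
    return True


@dataclass
class MockServerArgs:
    """Simplified stand-in for sglang's ServerArgs, holding only the fields used here."""
    attention_backend: Optional[str] = None
    kv_cache_dtype: str = "auto"


def apply_llama4_kv_cache_fallback(args: MockServerArgs) -> MockServerArgs:
    """Mirror the fallback from the diff: on SM100 with the trtllm_mha attention
    backend, force the KV cache dtype to bfloat16, because the FlashInfer TRTLLM MHA
    kernel for an FP8 KV cache (tileSizeKv=128) is not yet available for Llama 4."""
    if is_sm100_supported() and args.attention_backend == "trtllm_mha":
        args.kv_cache_dtype = "bfloat16"
        logger.warning(
            "Setting kv_cache_dtype to bfloat16 for Llama4 with trtllm_mha backend"
        )
    return args


if __name__ == "__main__":
    logging.basicConfig(level=logging.WARNING)
    # Example FP8-style KV cache request that the fallback overrides on SM100.
    args = MockServerArgs(attention_backend="trtllm_mha", kv_cache_dtype="fp8_e4m3")
    apply_llama4_kv_cache_fallback(args)
    print(args.kv_cache_dtype)  # prints "bfloat16"

Note that the commit downgrades the KV cache dtype and logs a warning rather than raising an error, so a default Llama 4 launch on SM100 still starts; the TODO marks the override for removal once the FP8 tileSizeKv=128 kernel lands in FlashInfer.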