Unverified Commit a076ec1a authored by b8zhong, committed by GitHub

Revert "fix llama4 kv cache layout" (#12437)

parent 72b5f3d0
@@ -21,7 +21,7 @@ The support matrix is split into two parts: MHA (standard attention) and MLA (multi-head latent attention)
| **Triton** | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ |
| **Torch Native (SDPA)** | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| **FlexAttention (PyTorch)** | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| **TRTLLM MHA** | 16, 32 or 64 | | ✅ | ❌ | ❌ | ❌ |
| **Dual Chunk FlashAttention** | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
| **AITER (ROCm)** | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ |
| **Wave (ROCm)** | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
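For context, the backend rows in this matrix are selected at launch time. Below is a minimal sketch of picking the TRTLLM MHA backend through sglang's offline engine API; the model path, page size, and sampling parameters are chosen purely for illustration and are not part of this diff.

```python
# Sketch only: the model path and the sm100 (Blackwell) hardware assumption
# are illustrative, not taken from this commit.
import sglang as sgl

llm = sgl.Engine(
    model_path="meta-llama/Llama-4-Scout-17B-16E-Instruct",  # hypothetical example model
    attention_backend="trtllm_mha",  # one of the backend rows in the matrix above
    page_size=64,  # TRTLLM MHA supports page sizes 16, 32, or 64 per the matrix
)
print(llm.generate("The capital of France is", {"max_new_tokens": 16}))
llm.shutdown()
```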
@@ -980,13 +980,6 @@ class ServerArgs:
             logger.warning(
                 "Use trtllm_mha as attention backend on sm100 for Llama4 model"
             )
-        if is_sm100_supported() and self.attention_backend == "trtllm_mha":
-            # TODO(brayden): remove this once TRTLLM MHA kernel for FP8 w/ tileSizeKv=128 is available.
-            # This is a Llama 4 specific issue only.
-            self.kv_cache_dtype = "bfloat16"
-            logger.warning(
-                "Setting kv_cache_dtype to bfloat16 for Llama4 with trtllm_mha backend, due to a missing FlashInfer TRTLLM MHA kernel for FP8 KV Cache"
-            )
         if is_sm100_supported() and self.moe_runner_backend == "auto":
             if self.quantization in {"fp8", "modelopt_fp8"}:
                 self.moe_runner_backend = "flashinfer_trtllm"
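To make the behavioral change concrete, here is a hedged, standalone distillation of the guard this revert deletes. The function name and signature are hypothetical; only the condition and the forced "bfloat16" value come from the diff above.

```python
# Illustrative distillation, not sglang API: shows what the removed guard did
# for Llama 4 before this revert.
def resolve_kv_cache_dtype(sm100: bool, attention_backend: str, kv_cache_dtype: str) -> str:
    # Pre-revert: Llama 4 with trtllm_mha on sm100 forced a bfloat16 KV cache
    # because the FP8 TRTLLM MHA kernel (tileSizeKv=128) was unavailable.
    if sm100 and attention_backend == "trtllm_mha":
        return "bfloat16"
    # Post-revert: the user-configured KV cache dtype is left untouched.
    return kv_cache_dtype


# Example: on sm100 with trtllm_mha, the old guard overrode the default "auto".
assert resolve_kv_cache_dtype(True, "trtllm_mha", "auto") == "bfloat16"
assert resolve_kv_cache_dtype(False, "trtllm_mha", "auto") == "auto"
```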