Allow use of TRTLLM_MHA backend for hybrid attention on Blackwell (#11138)

e8100774 · Dom Brown · GitHub · 963175d5 · e8100774 · e8100774
Unverified commit e8100774, authored Oct 03, 2025 by Dom Brown; committed by GitHub Oct 02, 2025.
Showing 2 changed files with 3 additions and 2 deletions

python/sglang/srt/layers/attention/attention_registry.py (+2 -1)

python/sglang/srt/model_executor/model_runner.py (+1 -1)

No files found.
--- a/python/sglang/srt/layers/attention/attention_registry.py
+++ b/python/sglang/srt/layers/attention/attention_registry.py
@@ -178,7 +178,8 @@ def attn_backend_wrapper(runner, full_attn_backend):
        if is_blackwell():
            assert (
                runner.server_args.attention_backend == "triton"
-            ), "triton backend is the only supported backend on Blackwell GPUs for hybrid GDN models, use --attention-backend triton to specify the backend."
+                or runner.server_args.attention_backend == "trtllm_mha"
+            ), "triton or trtllm_mha backend are the only supported backends on Blackwell GPUs for hybrid GDN models, use --attention-backend triton or --attention-backend trtllm_mha to specify the backend."
        if is_npu():
            assert (
                runner.server_args.attention_backend == "ascend"

--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -1620,7 +1620,7 @@ class ModelRunner:
                )
            elif self.is_hybrid_gdn:
                self.token_to_kv_pool = HybridLinearKVPool(
-                    page_size=self.page_size if _is_npu else 1,
+                    page_size=self.page_size,
                    size=self.max_total_num_tokens,
                    dtype=self.kv_cache_dtype,
                    head_num=self.model_config.get_num_kv_heads(