Unverified Commit 9ec314c6 authored by Qiaolin Yu, committed by GitHub

Support speculative decoding in the trtllm_mha attention backend (#9331)


Co-authored-by: ispobock <ispobaoke@gmail.com>
parent fedfe91c
@@ -500,11 +500,6 @@ class ServerArgs:
             )
             self.page_size = 64
-            if self.speculative_algorithm is not None:
-                raise ValueError(
-                    "trtllm_mha backend does not support speculative decoding yet."
-                )
 
         if self.attention_backend == "dual_chunk_flash_attn":
             logger.warning(
                 "Mixed chunk, radix cache, and cuda graphs are disabled because of using dual chunk flash attention backend"
@@ -653,6 +648,16 @@ class ServerArgs:
                 self.speculative_num_draft_tokens,
             ) = auto_choose_speculative_params(self)
 
+            if (
+                self.attention_backend == "trtllm_mha"
+                or self.decode_attention_backend == "trtllm_mha"
+                or self.prefill_attention_backend == "trtllm_mha"
+            ):
+                if self.speculative_eagle_topk > 1:
+                    raise ValueError(
+                        "trtllm_mha backend only supports topk = 1 for speculative decoding."
+                    )
+
             if (
                 self.speculative_eagle_topk == 1
                 and self.speculative_num_draft_tokens != self.speculative_num_steps + 1
...
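For readers skimming the hunk: the new block rejects topk > 1 whenever any of the three attention-backend knobs selects trtllm_mha, and the pre-existing check below it (body collapsed here) ties speculative_num_draft_tokens to speculative_num_steps + 1 when topk == 1. Both constraints can be restated as a standalone hypothetical helper; since the second check's original body is truncated above, raising there is an assumption.

```python
# Hypothetical restatement of the speculative-decoding constraints enforced by
# ServerArgs for trtllm_mha; not part of the diff. The second check's original
# handling is collapsed above, so raising here is an assumption.
def check_trtllm_mha_spec_args(
    attention_backend: str,
    speculative_eagle_topk: int,
    speculative_num_steps: int,
    speculative_num_draft_tokens: int,
) -> None:
    if attention_backend == "trtllm_mha" and speculative_eagle_topk > 1:
        raise ValueError(
            "trtllm_mha backend only supports topk = 1 for speculative decoding."
        )
    # With topk == 1, the verify pass sees one token per draft step plus the root.
    if (
        speculative_eagle_topk == 1
        and speculative_num_draft_tokens != speculative_num_steps + 1
    ):
        raise ValueError(
            "speculative_num_draft_tokens must equal speculative_num_steps + 1."
        )

# Example: topk = 1 with 3 draft steps and 4 draft tokens passes both checks.
check_trtllm_mha_spec_args("trtllm_mha", 1, 3, 4)
```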
@@ -266,6 +266,22 @@ class EAGLEWorker(TpModelWorker):
                 self.topk,
                 self.speculative_num_steps,
             )
+        elif self.server_args.attention_backend == "trtllm_mha":
+            from sglang.srt.layers.attention.trtllm_mha_backend import (
+                TRTLLMHAAttnBackend,
+                TRTLLMHAAttnMultiStepDraftBackend,
+            )
+
+            self.draft_attn_backend = TRTLLMHAAttnMultiStepDraftBackend(
+                self.draft_model_runner,
+                self.topk,
+                self.speculative_num_steps,
+            )
+            self.draft_extend_attn_backend = TRTLLMHAAttnBackend(
+                self.draft_model_runner,
+                skip_prefill=False,
+            )
+            self.has_prefill_wrapper_verify = True
         elif self.server_args.attention_backend == "trtllm_mla":
             if not global_server_args_dict["use_mla_backend"]:
                 raise ValueError(
...
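The new branch mirrors the wiring EAGLEWorker already uses for the other attention backends: a multi-step backend holding one per-step attention backend for the draft loop, plus a single skip_prefill=False backend for the draft-extend pass. A simplified, hypothetical sketch of that pattern (class and method names are illustrative, not the real trtllm_mha_backend API):

```python
# Simplified, hypothetical sketch of the multi-step draft-backend pattern used
# above; names are illustrative, not the real trtllm_mha_backend API.
class MultiStepDraftBackendSketch:
    def __init__(self, model_runner, topk, speculative_num_steps, backend_cls):
        self.topk = topk
        # One backend instance per draft step, so each decode step of the
        # EAGLE draft loop keeps its own attention metadata.
        self.attn_backends = [
            backend_cls(model_runner, skip_prefill=True)
            for _ in range(speculative_num_steps)
        ]

    def init_forward_metadata(self, forward_batch):
        # Prepare per-step metadata before running the draft loop.
        for backend in self.attn_backends:
            backend.init_forward_metadata(forward_batch)
```

The separate draft_extend_attn_backend instance (skip_prefill=False) serves the draft-extend pass, and setting has_prefill_wrapper_verify appears to signal that the verify path has a prefill-style wrapper available.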