Fix 1-step draft model forward (#11653)

Signed-off-by: Shangming Cai <csmthu@gmail.com>
Co-authored-by: Liangsheng Yin <lsyincs@gmail.com>

Fix 1-step draft model forward (#11653)
Signed-off-by: Shangming Cai <csmthu@gmail.com>
Co-authored-by: Liangsheng Yin <lsyincs@gmail.com>
6d036468 · Shangming Cai · GitHub · 8221f9ae · 6d036468 · 6d036468
Unverified commit 6d036468, authored Oct 15, 2025 by Shangming Cai; committed by GitHub on Oct 15, 2025.
4 changed files
--- a/python/sglang/srt/speculative/draft_utils.py
+++ b/python/sglang/srt/speculative/draft_utils.py
@@ -33,15 +33,7 @@ class DraftBackendFactory:
    def create_decode_backend(self):
        if self.speculative_num_steps == 1:
+            return None
-            class DummyAttnBackend:
-                def __init__(self):
-                    pass
-                def init_forward_metadata(*args, **kwargs):
-                    pass
-            return DummyAttnBackend()
        backend_map = {
            "flashinfer": self._create_flashinfer_decode_backend,

--- a/python/sglang/srt/speculative/eagle_info_v2.py
+++ b/python/sglang/srt/speculative/eagle_info_v2.py
@@ -276,7 +276,7 @@ class EagleVerifyInputV2Mixin:
                accept_length=accept_length,  # mutable
                simulate_acc_len=SIMULATE_ACC_LEN,
                bs=bs,
-                spec_steps=self.draft_token_num,
+                spec_steps=self.spec_steps,
            )
        # Include the bonus token

--- a/python/sglang/srt/speculative/eagle_worker.py
+++ b/python/sglang/srt/speculative/eagle_worker.py
@@ -218,6 +218,7 @@ class EAGLEWorker(TpModelWorker):
            return
        # Capture draft
+        if self.speculative_num_steps > 1:
            tic = time.perf_counter()
            before_mem = get_available_gpu_memory(self.device, self.gpu_id)
            logger.info(
@@ -500,8 +501,11 @@ class EAGLEWorker(TpModelWorker):
            )
        else:
            forward_batch.can_run_dp_cuda_graph = False
-            if not forward_batch.forward_mode.is_idle():
+            if (
-                # Initialize attention backend
+                not forward_batch.forward_mode.is_idle()
+                and self.speculative_num_steps > 1
+            ):
+                # Skip attention backend init for idle mode or 1-step draft
                self.draft_attn_backend.init_forward_metadata(forward_batch)
            # Run forward steps
            parent_list, top_scores_index, draft_tokens = self.draft_forward(

--- a/python/sglang/srt/speculative/eagle_worker_v2.py
+++ b/python/sglang/srt/speculative/eagle_worker_v2.py
@@ -97,6 +97,9 @@ class EAGLEWorkerV2(EAGLEWorker):
                forward_batch,
            )
        else:
+            if self.speculative_num_steps > 1:
+                # Skip attention backend init for 1-step draft,
+                # `draft_forward` only does sample in this case.
                self.draft_attn_backend.init_forward_metadata(forward_batch)
            parent_list, top_scores_index, draft_tokens = self.draft_forward(
                forward_batch