Unverified commit e5ce395a, authored Feb 18, 2025 by Ke Bao; committed by GitHub on Feb 18, 2025.
Fix draft decode max batch size (#3676)
Parent: f983213a
Showing 3 changed files, with 5 additions and 2 deletions:

python/sglang/srt/layers/attention/flashinfer_backend.py   +1 −1
python/sglang/srt/layers/attention/triton_backend.py   +1 −1
python/sglang/srt/layers/attention/triton_ops/decode_attention.py   +3 −0
python/sglang/srt/layers/attention/flashinfer_backend.py

@@ -1094,7 +1094,7 @@ class FlashInferMultiStepDraftBackend:
         self.topk = topk
         self.speculative_num_steps = speculative_num_steps
         self.generate_draft_decode_kv_indices = generate_draft_decode_kv_indices
-        max_bs = model_runner.req_to_token_pool.size
+        max_bs = model_runner.req_to_token_pool.size * self.topk
         self.kv_indptr = torch.zeros(
             (
                 self.speculative_num_steps,
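FlashInferMultiStepDraftBackend preallocates its per-step KV index buffers at construction time. With top-k speculative drafting, each running request fans out into topk draft sequences during draft decode, so the worst-case decode batch is req_to_token_pool.size * topk rather than req_to_token_pool.size; sizing the buffers for the smaller number is what this commit fixes. A minimal sketch of the sizing logic under that reading (the helper below and the exact kv_indptr layout are illustrative, not sglang's API):

```python
import torch

def draft_decode_buffer_shapes(pool_size: int, topk: int, speculative_num_steps: int):
    # Worst case: every request slot in the pool is active and each one
    # contributes `topk` draft branches to the decode batch.
    max_bs = pool_size * topk
    # Assumed CSR-style layout: one row of offsets per draft step, with
    # max_bs + 1 offsets per row (entries i and i+1 bound request i's KV indices).
    kv_indptr = torch.zeros((speculative_num_steps, max_bs + 1), dtype=torch.int32)
    return max_bs, tuple(kv_indptr.shape)

# e.g. a pool of 64 request slots with topk=4 and 5 draft steps:
print(draft_decode_buffer_shapes(64, 4, 5))   # (256, (5, 257))
```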
python/sglang/srt/layers/attention/triton_backend.py

@@ -474,7 +474,7 @@ class TritonMultiStepDraftBackend:
         self.topk = topk
         self.speculative_num_steps = speculative_num_steps
         self.generate_draft_decode_kv_indices = generate_draft_decode_kv_indices
-        max_bs = model_runner.req_to_token_pool.size
+        max_bs = model_runner.req_to_token_pool.size * self.topk
         self.kv_indptr = torch.zeros(
             (
                 self.speculative_num_steps,
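The Triton multi-step draft backend preallocates its kv_indptr buffer the same way, so it receives the identical one-line sizing fix; the buffer-shape sketch above applies unchanged.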
python/sglang/srt/layers/attention/triton_ops/decode_attention.py

@@ -635,6 +635,9 @@ def decode_attention_fwd(
     logit_cap=0.0,
 ):
     assert num_kv_splits == attn_logits.shape[2]
+    assert q.shape[0] <= kv_indptr.shape[0] - 1
+    assert q.shape[0] <= attn_logits.shape[0]
+
     kv_group_num = q.shape[1] // v_buffer.shape[1]

     if kv_group_num == 1:
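The new assertions document the shape contract that the oversized draft-decode buffers rely on: kv_indptr is a CSR-style offset array, so a batch of q.shape[0] queries needs at least q.shape[0] + 1 offsets, and attn_logits needs at least one row per query. They use <= rather than == because the preallocated buffers may be larger than the live batch. A shape-only illustration of the invariants (the attn_logits layout here is an assumption made for the example, not taken from the kernel):

```python
import torch

# Shape-contract illustration only; decode_attention_fwd itself launches Triton kernels.
batch, num_heads, head_dim, num_kv_splits = 8, 32, 128, 4
max_bs = 256  # preallocated for the worst-case draft batch (see the backend fix above)

q = torch.randn(batch, num_heads, head_dim)
kv_indptr = torch.zeros(max_bs + 1, dtype=torch.int32)  # CSR offsets, one extra entry
# Assumed intermediate-logits layout: (buffer batch, heads, kv splits, head_dim + 1).
attn_logits = torch.empty(max_bs, num_heads, num_kv_splits, head_dim + 1)

# The invariants added by this commit: the live batch must fit inside the
# (possibly larger) preallocated index and logits buffers.
assert num_kv_splits == attn_logits.shape[2]
assert q.shape[0] <= kv_indptr.shape[0] - 1
assert q.shape[0] <= attn_logits.shape[0]
```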