Unverified commit b26bc86b, authored by Lianmin Zheng, committed by GitHub

Support page size > 1 + eagle (#4908)

parent 5ec5eaf7
...@@ -33,6 +33,7 @@ runtime_common = [ ...@@ -33,6 +33,7 @@ runtime_common = [
"prometheus-client>=0.20.0", "prometheus-client>=0.20.0",
"psutil", "psutil",
"pydantic", "pydantic",
"pynvml",
"python-multipart", "python-multipart",
"pyzmq>=25.1.2", "pyzmq>=25.1.2",
"soundfile==0.13.1", "soundfile==0.13.1",
......
...@@ -14,7 +14,6 @@ from functools import partial ...@@ -14,7 +14,6 @@ from functools import partial
from typing import TYPE_CHECKING, Callable, List, Optional, Union from typing import TYPE_CHECKING, Callable, List, Optional, Union
import torch import torch
import triton
from sglang.global_config import global_config from sglang.global_config import global_config
from sglang.srt.layers.attention.base_attn_backend import AttentionBackend from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
...@@ -22,7 +21,7 @@ from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_trito ...@@ -22,7 +21,7 @@ from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_trito
from sglang.srt.layers.dp_attention import get_attention_tp_size from sglang.srt.layers.dp_attention import get_attention_tp_size
from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
-from sglang.srt.utils import get_bool_env_var, is_flashinfer_available
+from sglang.srt.utils import is_flashinfer_available, next_power_of_2
if TYPE_CHECKING: if TYPE_CHECKING:
from sglang.srt.layers.radix_attention import RadixAttention from sglang.srt.layers.radix_attention import RadixAttention
...@@ -932,6 +931,7 @@ class FlashInferMultiStepDraftBackend: ...@@ -932,6 +931,7 @@ class FlashInferMultiStepDraftBackend:
self.topk = topk self.topk = topk
self.speculative_num_steps = speculative_num_steps self.speculative_num_steps = speculative_num_steps
self.generate_draft_decode_kv_indices = generate_draft_decode_kv_indices self.generate_draft_decode_kv_indices = generate_draft_decode_kv_indices
self.page_size = model_runner.page_size
max_bs = model_runner.req_to_token_pool.size * self.topk max_bs = model_runner.req_to_token_pool.size * self.topk
self.kv_indptr = torch.zeros( self.kv_indptr = torch.zeros(
...@@ -985,9 +985,9 @@ class FlashInferMultiStepDraftBackend: ...@@ -985,9 +985,9 @@ class FlashInferMultiStepDraftBackend:
self.pool_len, self.pool_len,
kv_indices_buffer.shape[1], kv_indices_buffer.shape[1],
self.kv_indptr.shape[1], self.kv_indptr.shape[1],
-triton.next_power_of_2(num_seqs),
-triton.next_power_of_2(self.speculative_num_steps),
-triton.next_power_of_2(bs),
+next_power_of_2(num_seqs),
+next_power_of_2(self.speculative_num_steps),
+next_power_of_2(bs),
) )
assert forward_batch.spec_info is not None assert forward_batch.spec_info is not None
...@@ -1018,8 +1018,6 @@ class FlashInferMultiStepDraftBackend: ...@@ -1018,8 +1018,6 @@ class FlashInferMultiStepDraftBackend:
) )
def call_fn(i, forward_batch): def call_fn(i, forward_batch):
assert forward_batch.spec_info is not None
assert isinstance(forward_batch.spec_info, EagleDraftInput)
forward_batch.spec_info.kv_indptr = ( forward_batch.spec_info.kv_indptr = (
forward_batch.spec_info.kv_indptr.clone() forward_batch.spec_info.kv_indptr.clone()
) )
......
...@@ -740,11 +740,14 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin): ...@@ -740,11 +740,14 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
) )
return req_pool_indices return req_pool_indices
-def alloc_token_slots(self, num_tokens: int):
+def alloc_token_slots(self, num_tokens: int, backup_state: bool = False):
if self.token_to_kv_pool_allocator.available_size() < num_tokens: if self.token_to_kv_pool_allocator.available_size() < num_tokens:
if self.tree_cache is not None: if self.tree_cache is not None:
self.tree_cache.evict(num_tokens) self.tree_cache.evict(num_tokens)
if backup_state:
state = self.token_to_kv_pool_allocator.backup_state()
out_cache_loc = self.token_to_kv_pool_allocator.alloc(num_tokens) out_cache_loc = self.token_to_kv_pool_allocator.alloc(num_tokens)
if out_cache_loc is None: if out_cache_loc is None:
phase_str = "Prefill" if self.forward_mode.is_extend() else "Decode" phase_str = "Prefill" if self.forward_mode.is_extend() else "Decode"
...@@ -758,7 +761,10 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin): ...@@ -758,7 +761,10 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
self.tree_cache.pretty_print() self.tree_cache.pretty_print()
raise RuntimeError(error_msg) raise RuntimeError(error_msg)
-return out_cache_loc
+if backup_state:
+    return out_cache_loc, state
+else:
+    return out_cache_loc
def alloc_paged_token_slots_extend( def alloc_paged_token_slots_extend(
self, self,
...@@ -766,6 +772,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin): ...@@ -766,6 +772,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
seq_lens: torch.Tensor, seq_lens: torch.Tensor,
last_loc: torch.Tensor, last_loc: torch.Tensor,
extend_num_tokens: int, extend_num_tokens: int,
backup_state: bool = False,
): ):
if ( if (
self.token_to_kv_pool_allocator.available_size() self.token_to_kv_pool_allocator.available_size()
...@@ -778,6 +785,9 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin): ...@@ -778,6 +785,9 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
+ len(seq_lens) * self.token_to_kv_pool_allocator.page_size, + len(seq_lens) * self.token_to_kv_pool_allocator.page_size,
) )
if backup_state:
state = self.token_to_kv_pool_allocator.backup_state()
out_cache_loc = self.token_to_kv_pool_allocator.alloc_extend( out_cache_loc = self.token_to_kv_pool_allocator.alloc_extend(
prefix_lens, seq_lens, last_loc, extend_num_tokens prefix_lens, seq_lens, last_loc, extend_num_tokens
) )
...@@ -791,12 +801,17 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin): ...@@ -791,12 +801,17 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
) )
logger.error(error_msg) logger.error(error_msg)
raise RuntimeError(error_msg) raise RuntimeError(error_msg)
-return out_cache_loc
+if backup_state:
+    return out_cache_loc, state
+else:
+    return out_cache_loc
def alloc_paged_token_slots_decode( def alloc_paged_token_slots_decode(
self, self,
seq_lens: torch.Tensor, seq_lens: torch.Tensor,
last_loc: torch.Tensor, last_loc: torch.Tensor,
backup_state: bool = False,
): ):
if ( if (
self.token_to_kv_pool_allocator.available_size() self.token_to_kv_pool_allocator.available_size()
...@@ -806,8 +821,11 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin): ...@@ -806,8 +821,11 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
self.tree_cache.evict( self.tree_cache.evict(
len(seq_lens) * self.token_to_kv_pool_allocator.page_size, len(seq_lens) * self.token_to_kv_pool_allocator.page_size,
) )
+if backup_state:
+    state = self.token_to_kv_pool_allocator.backup_state()
out_cache_loc = self.token_to_kv_pool_allocator.alloc_decode(seq_lens, last_loc)
if out_cache_loc is None: if out_cache_loc is None:
error_msg = ( error_msg = (
f"Decode out of memory. Try to lower your batch size.\n" f"Decode out of memory. Try to lower your batch size.\n"
...@@ -818,7 +836,11 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin): ...@@ -818,7 +836,11 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
) )
logger.error(error_msg) logger.error(error_msg)
raise RuntimeError(error_msg) raise RuntimeError(error_msg)
-return out_cache_loc
+if backup_state:
+    return out_cache_loc, state
+else:
+    return out_cache_loc
def prepare_encoder_info_extend(self, input_ids: List[int], seq_lens: List[int]): def prepare_encoder_info_extend(self, input_ids: List[int], seq_lens: List[int]):
self.encoder_lens_cpu = [] self.encoder_lens_cpu = []
......
...@@ -1110,7 +1110,7 @@ class Scheduler( ...@@ -1110,7 +1110,7 @@ class Scheduler(
) )
if memory_leak: if memory_leak:
msg = ( msg = (
"KV cache pool leak detected! " "token_to_kv_pool_allocator memory leak detected! "
f"{available_size=}, {protected_size=}, {self.max_total_num_tokens=}\n" f"{available_size=}, {protected_size=}, {self.max_total_num_tokens=}\n"
f"{self.token_to_kv_pool_allocator.available_size()=}\n" f"{self.token_to_kv_pool_allocator.available_size()=}\n"
f"{self.tree_cache.evictable_size()=}\n" f"{self.tree_cache.evictable_size()=}\n"
...@@ -1121,7 +1121,7 @@ class Scheduler( ...@@ -1121,7 +1121,7 @@ class Scheduler(
if len(self.req_to_token_pool.free_slots) != self.req_to_token_pool.size: if len(self.req_to_token_pool.free_slots) != self.req_to_token_pool.size:
msg = ( msg = (
"Memory pool leak detected!" "req_to_token_pool memory leak detected!"
f"available_size={len(self.req_to_token_pool.free_slots)}, " f"available_size={len(self.req_to_token_pool.free_slots)}, "
f"total_size={self.req_to_token_pool.size}\n" f"total_size={self.req_to_token_pool.size}\n"
) )
......
...@@ -185,6 +185,12 @@ class TokenToKVPoolAllocator: ...@@ -185,6 +185,12 @@ class TokenToKVPoolAllocator:
if self.free_group: if self.free_group:
self.free(torch.cat(self.free_group)) self.free(torch.cat(self.free_group))
def backup_state(self):
return self.free_slots
def restore_state(self, free_slots):
self.free_slots = free_slots
def clear(self): def clear(self):
# The padded slot 0 is used for writing dummy outputs from padded tokens. # The padded slot 0 is used for writing dummy outputs from padded tokens.
self.free_slots = torch.arange( self.free_slots = torch.arange(
......
...@@ -218,6 +218,9 @@ class PagedTokenToKVPoolAllocator: ...@@ -218,6 +218,9 @@ class PagedTokenToKVPoolAllocator:
next_power_of_2(extend_num_tokens), next_power_of_2(extend_num_tokens),
) )
if self.debug_mode:
assert len(torch.unique(out_indices)) == len(out_indices)
merged_value = self.ret_values.item() merged_value = self.ret_values.item()
num_new_pages = merged_value >> 32 num_new_pages = merged_value >> 32
if num_new_pages > len(self.free_pages): if num_new_pages > len(self.free_pages):
...@@ -248,6 +251,9 @@ class PagedTokenToKVPoolAllocator: ...@@ -248,6 +251,9 @@ class PagedTokenToKVPoolAllocator:
self.page_size, self.page_size,
) )
if self.debug_mode:
assert len(torch.unique(out_indices)) == len(out_indices)
num_new_pages = self.ret_values.item() num_new_pages = self.ret_values.item()
if num_new_pages > len(self.free_pages): if num_new_pages > len(self.free_pages):
return None return None
...@@ -265,6 +271,9 @@ class PagedTokenToKVPoolAllocator: ...@@ -265,6 +271,9 @@ class PagedTokenToKVPoolAllocator:
else: else:
self.free_group.append(free_index) self.free_group.append(free_index)
if self.debug_mode:
assert len(torch.unique(self.free_pages)) == len(self.free_pages)
def free_group_begin(self): def free_group_begin(self):
self.is_not_in_free_group = False self.is_not_in_free_group = False
self.free_group = [] self.free_group = []
...@@ -274,6 +283,12 @@ class PagedTokenToKVPoolAllocator: ...@@ -274,6 +283,12 @@ class PagedTokenToKVPoolAllocator:
if self.free_group: if self.free_group:
self.free(torch.cat(self.free_group)) self.free(torch.cat(self.free_group))
def backup_state(self):
return self.free_pages
def restore_state(self, free_pages):
self.free_pages = free_pages
def clear(self): def clear(self):
# The padded slot 0 is used for writing dummy outputs from padded tokens. # The padded slot 0 is used for writing dummy outputs from padded tokens.
self.free_pages = torch.arange( self.free_pages = torch.arange(
......
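Both allocators now expose a snapshot hook. A minimal standalone sketch of the pattern this enables (a toy free-list allocator, not the real TokenToKVPoolAllocator): snapshot the free list, allocate KV slots for the draft forward passes, then restore the snapshot so the same space is free again for the verify pass.

import torch

class ToyAllocator:
    def __init__(self, size: int):
        self.free_slots = torch.arange(1, size + 1)

    def alloc(self, n: int):
        out, self.free_slots = self.free_slots[:n], self.free_slots[n:]
        return out

    def backup_state(self):
        return self.free_slots

    def restore_state(self, free_slots):
        self.free_slots = free_slots

allocator = ToyAllocator(16)
state = allocator.backup_state()   # snapshot before allocating draft KV slots
draft_slots = allocator.alloc(8)   # slots only needed during the draft forward passes
allocator.restore_state(state)     # roll back: the draft slots are free again for verify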
...@@ -116,16 +116,18 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner): ...@@ -116,16 +116,18 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner):
if capture_bs is None: if capture_bs is None:
if server_args.speculative_algorithm is None: if server_args.speculative_algorithm is None:
if server_args.disable_cuda_graph_padding: if server_args.disable_cuda_graph_padding:
-capture_bs = list(range(1, 33)) + [64, 96, 128, 160]
+capture_bs = list(range(1, 33)) + list(range(40, 161, 16))
else:
-capture_bs = [1, 2, 4] + [i * 8 for i in range(1, 21)]
+capture_bs = [1, 2, 4, 8] + list(range(16, 161, 8))
else:
# Since speculative decoding requires more cuda graph memory, we
# capture less.
-capture_bs = list(range(1, 9)) + list(range(9, 33, 2)) + [64, 96, 128, 160]
+capture_bs = (
+    list(range(1, 9)) + list(range(10, 33, 2)) + list(range(40, 161, 16))
+)
if _is_hip:
-capture_bs += [i * 8 for i in range(21, 33)]
+capture_bs += list(range(160, 257, 8))
if max(capture_bs) > model_runner.req_to_token_pool.size: if max(capture_bs) > model_runner.req_to_token_pool.size:
# In some case (e.g., with a small GPU or --max-running-requests), the #max-running-requests # In some case (e.g., with a small GPU or --max-running-requests), the #max-running-requests
......
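For reference, a quick evaluation of the new capture lists (values copied from this diff; illustration only) shows which batch sizes the CUDA graph runner would capture:

# Illustration only: evaluate the new capture_bs expressions from this diff.
default_bs = [1, 2, 4, 8] + list(range(16, 161, 8))
spec_bs = list(range(1, 9)) + list(range(10, 33, 2)) + list(range(40, 161, 16))
print(default_bs)  # [1, 2, 4, 8, 16, 24, ..., 152, 160]
print(spec_bs)     # [1, ..., 8, 10, 12, ..., 32, 40, 56, ..., 136, 152]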
...@@ -17,7 +17,7 @@ ...@@ -17,7 +17,7 @@
"""Inference-only LLaMA model compatible with HuggingFace weights.""" """Inference-only LLaMA model compatible with HuggingFace weights."""
import logging import logging
-from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
import torch import torch
from torch import nn from torch import nn
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
import argparse import argparse
import dataclasses import dataclasses
import json
import logging import logging
import os import os
import random import random
...@@ -132,9 +133,9 @@ class ServerArgs: ...@@ -132,9 +133,9 @@ class ServerArgs:
# Speculative decoding # Speculative decoding
speculative_algorithm: Optional[str] = None speculative_algorithm: Optional[str] = None
speculative_draft_model_path: Optional[str] = None speculative_draft_model_path: Optional[str] = None
-speculative_num_steps: int = 5
-speculative_eagle_topk: int = 4
-speculative_num_draft_tokens: int = 8
+speculative_num_steps: Optional[int] = None
+speculative_eagle_topk: Optional[int] = None
+speculative_num_draft_tokens: Optional[int] = None
speculative_accept_threshold_single: float = 1.0 speculative_accept_threshold_single: float = 1.0
speculative_accept_threshold_acc: float = 1.0 speculative_accept_threshold_acc: float = 1.0
speculative_token_map: Optional[str] = None speculative_token_map: Optional[str] = None
...@@ -313,12 +314,29 @@ class ServerArgs: ...@@ -313,12 +314,29 @@ class ServerArgs:
or self.speculative_algorithm == "EAGLE3" or self.speculative_algorithm == "EAGLE3"
): ):
if self.max_running_requests is None: if self.max_running_requests is None:
-self.max_running_requests = 32
+self.max_running_requests = 48
self.disable_overlap_schedule = True self.disable_overlap_schedule = True
logger.info( logger.info(
"Overlap scheduler is disabled because of using " "Overlap scheduler is disabled because of using "
"eagle speculative decoding." "eagle speculative decoding."
) )
# Auto choose parameters
if self.speculative_num_steps is None:
assert (
self.speculative_eagle_topk is None
and self.speculative_num_draft_tokens is None
)
(
self.speculative_num_steps,
self.speculative_eagle_topk,
self.speculative_num_draft_tokens,
) = auto_choose_speculative_params(self)
if self.page_size > 1 and self.speculative_eagle_topk > 1:
self.speculative_eagle_topk = 1
logger.info("speculative_eagle_topk is changed to 1 when page_size > 1")
# The token generated from the verify step is counted. # The token generated from the verify step is counted.
# If speculative_num_steps >= speculative_num_draft_tokens, the additional tokens will definitely be discarded.
# assert self.speculative_num_steps < self.speculative_num_draft_tokens # assert self.speculative_num_steps < self.speculative_num_draft_tokens
...@@ -1253,3 +1271,33 @@ class DeprecatedAction(argparse.Action): ...@@ -1253,3 +1271,33 @@ class DeprecatedAction(argparse.Action):
def __call__(self, parser, namespace, values, option_string=None): def __call__(self, parser, namespace, values, option_string=None):
raise ValueError(self.help) raise ValueError(self.help)
def auto_choose_speculative_params(self: ServerArgs):
"""
Automatically choose the parameters for speculative decoding.
You can tune them on your own models and prompts with scripts/playground/bench_speculative.py
"""
if self.decrypted_config_file:
config_path = self.decrypted_config_file
else:
config_path = os.path.join(self.model_path, "config.json")
if not os.path.exists(config_path):
raise ValueError(f"{config_path} is not found.")
config = json.load(open(config_path))
arch = config.get("architectures", ["Unknown"])[0]
if arch in ["LlamaForCausalLM"]:
# The default value for llama
return (5, 4, 8)
elif arch in ["DeepseekV3ForCausalLM", "DeepseekV2ForCausalLM"]:
# The default value for deepseek
return (5, 4, 8)
elif arch in ["Grok1ForCausalLM", "Grok1VForCausalLM"]:
return (5, 4, 8)
else:
# The default value for all other models
return (5, 4, 8)
from __future__ import annotations from __future__ import annotations
import os
from dataclasses import dataclass from dataclasses import dataclass
from typing import TYPE_CHECKING, List, Optional from typing import TYPE_CHECKING, List, Optional
...@@ -10,11 +11,15 @@ import triton.language as tl ...@@ -10,11 +11,15 @@ import triton.language as tl
from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_triton from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_triton
from sglang.srt.layers.logits_processor import LogitsProcessorOutput from sglang.srt.layers.logits_processor import LogitsProcessorOutput
from sglang.srt.managers.schedule_batch import global_server_args_dict from sglang.srt.managers.schedule_batch import (
ScheduleBatch,
get_last_loc,
global_server_args_dict,
)
from sglang.srt.mem_cache.memory_pool import TokenToKVPoolAllocator from sglang.srt.mem_cache.memory_pool import TokenToKVPoolAllocator
from sglang.srt.model_executor.forward_batch_info import CaptureHiddenMode from sglang.srt.model_executor.forward_batch_info import CaptureHiddenMode
from sglang.srt.speculative.build_eagle_tree import build_tree_kernel_efficient from sglang.srt.speculative.build_eagle_tree import build_tree_kernel_efficient
from sglang.srt.utils import is_cuda_available, is_hip from sglang.srt.utils import is_cuda_available, is_hip, next_power_of_2
if is_cuda_available(): if is_cuda_available():
from sgl_kernel import ( from sgl_kernel import (
...@@ -34,6 +39,9 @@ import logging ...@@ -34,6 +39,9 @@ import logging
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
SIMULATE_ACC_LEN = os.environ.get("SIMULATE_ACC_LEN")
@dataclass @dataclass
class EagleDraftInput: class EagleDraftInput:
# The inputs for decode # The inputs for decode
...@@ -93,7 +101,7 @@ class EagleDraftInput: ...@@ -93,7 +101,7 @@ class EagleDraftInput:
torch.cumsum(self.accept_length, axis=0, dtype=torch.int), torch.cumsum(self.accept_length, axis=0, dtype=torch.int),
self.positions, self.positions,
new_verified_id, new_verified_id,
-triton.next_power_of_2(speculative_num_steps + 1),
+next_power_of_2(speculative_num_steps + 1),
) )
batch.seq_lens_sum = sum(seq_lens_cpu) batch.seq_lens_sum = sum(seq_lens_cpu)
...@@ -225,18 +233,34 @@ class EagleVerifyInput: ...@@ -225,18 +233,34 @@ class EagleVerifyInput:
CaptureHiddenMode.FULL, CaptureHiddenMode.FULL,
) )
-def prepare_for_verify(self, batch: ScheduleBatch):
+def prepare_for_verify(self, batch: ScheduleBatch, page_size: int):
batch.input_ids = self.draft_token batch.input_ids = self.draft_token
batch.out_cache_loc = batch.alloc_token_slots(batch.input_ids.numel())
if page_size == 1:
batch.out_cache_loc = batch.alloc_token_slots(len(batch.input_ids))
end_offset = batch.seq_lens + self.draft_token_num
else:
prefix_lens = batch.seq_lens
end_offset = prefix_lens + self.draft_token_num
last_loc = get_last_loc(
batch.req_to_token_pool.req_to_token,
batch.req_pool_indices,
prefix_lens,
)
batch.out_cache_loc = batch.alloc_paged_token_slots_extend(
prefix_lens, end_offset, last_loc, len(batch.input_ids)
)
self.last_loc = last_loc
bs = batch.batch_size() bs = batch.batch_size()
assign_req_to_token_pool[(bs,)]( assign_req_to_token_pool[(bs,)](
batch.req_pool_indices, batch.req_pool_indices,
batch.req_to_token_pool.req_to_token, batch.req_to_token_pool.req_to_token,
batch.seq_lens, batch.seq_lens,
-batch.seq_lens + self.draft_token_num,
+end_offset,
batch.out_cache_loc,
batch.req_to_token_pool.req_to_token.shape[1],
-triton.next_power_of_2(bs),
+next_power_of_2(bs),
) )
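prepare_for_verify now needs the location of each request's last prefix token so the paged allocator can keep filling the last partially used page. A hedged, plain-PyTorch reading of what get_last_loc is assumed to return (the real helper lives in schedule_batch and may use a Triton kernel):

import torch

def get_last_loc_ref(req_to_token, req_pool_indices, prefix_lens):
    # Assumed semantics: the KV-cache slot of the last prefix token per request,
    # or -1 when the prefix is empty.
    return torch.where(
        prefix_lens > 0,
        req_to_token[req_pool_indices, prefix_lens - 1],
        torch.full_like(prefix_lens, -1),
    )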
def generate_attn_arg_prefill( def generate_attn_arg_prefill(
...@@ -282,6 +306,7 @@ class EagleVerifyInput: ...@@ -282,6 +306,7 @@ class EagleVerifyInput:
batch: ScheduleBatch, batch: ScheduleBatch,
logits_output: torch.Tensor, logits_output: torch.Tensor,
token_to_kv_pool_allocator: TokenToKVPoolAllocator, token_to_kv_pool_allocator: TokenToKVPoolAllocator,
page_size: int,
) -> torch.Tensor: ) -> torch.Tensor:
""" """
Verify and find accepted tokens based on logits output and batch Verify and find accepted tokens based on logits output and batch
...@@ -305,6 +330,7 @@ class EagleVerifyInput: ...@@ -305,6 +330,7 @@ class EagleVerifyInput:
) )
accept_length = torch.empty((bs,), dtype=torch.int32, device="cuda") accept_length = torch.empty((bs,), dtype=torch.int32, device="cuda")
# Apply penalty
if sampling_info.penalizer_orchestrator.is_required: if sampling_info.penalizer_orchestrator.is_required:
# This is a relaxed version of penalties for speculative decoding. # This is a relaxed version of penalties for speculative decoding.
linear_penalty = torch.zeros( linear_penalty = torch.zeros(
...@@ -317,6 +343,7 @@ class EagleVerifyInput: ...@@ -317,6 +343,7 @@ class EagleVerifyInput:
torch.repeat_interleave(linear_penalty, self.draft_token_num, dim=0) torch.repeat_interleave(linear_penalty, self.draft_token_num, dim=0)
) )
# Sample tokens
if batch.sampling_info.is_all_greedy: if batch.sampling_info.is_all_greedy:
target_predict = torch.argmax(logits_output.next_token_logits, dim=-1) target_predict = torch.argmax(logits_output.next_token_logits, dim=-1)
target_predict = target_predict.reshape(bs, self.draft_token_num) target_predict = target_predict.reshape(bs, self.draft_token_num)
...@@ -378,13 +405,24 @@ class EagleVerifyInput: ...@@ -378,13 +405,24 @@ class EagleVerifyInput:
deterministic=True, deterministic=True,
) )
if SIMULATE_ACC_LEN:
# Do simulation
accept_index = _generate_simulated_accept_index(
accept_index=accept_index,
predict=predict, # mutable
accept_length=accept_length, # mutable
simulate_acc_len=SIMULATE_ACC_LEN,
bs=bs,
spec_steps=self.spec_steps,
)
new_accept_index = [] new_accept_index = []
unfinished_index = [] unfinished_index = []
accept_index_cpu = accept_index.tolist() accept_index_cpu = accept_index.tolist()
predict_cpu = predict.tolist() predict_cpu = predict.tolist()
has_finished = False has_finished = False
# Iterate every accepted token and check if the request has finished after appending the token.
# This should be checked BEFORE freeing the KV cache slots.
for i, (req, accept_index_row) in enumerate(zip(batch.reqs, accept_index_cpu)): for i, (req, accept_index_row) in enumerate(zip(batch.reqs, accept_index_cpu)):
new_accept_index_ = [] new_accept_index_ = []
...@@ -407,13 +445,28 @@ class EagleVerifyInput: ...@@ -407,13 +445,28 @@ class EagleVerifyInput:
unfinished_index.append(i) unfinished_index.append(i)
req.spec_verify_ct += 1 req.spec_verify_ct += 1
if has_finished:
accept_length = (accept_index != -1).sum(dim=1) - 1
# Free the KV cache for unaccepted tokens
accept_index = accept_index[accept_index != -1]
verified_id = predict[accept_index]
evict_mask = torch.full_like(self.draft_token, True, dtype=torch.bool)
evict_mask[accept_index] = False
if page_size != 1:
align_evict_mask_to_page_size[len(batch.seq_lens),](
batch.seq_lens,
evict_mask,
page_size,
self.draft_token_num,
next_power_of_2(self.draft_token_num),
)
token_to_kv_pool_allocator.free(batch.out_cache_loc[evict_mask])
# Construct EagleVerifyOutput
if not has_finished: if not has_finished:
accept_index = accept_index[accept_index != -1]
verified_id = predict[accept_index]
evict_mask = torch.full_like(self.draft_token, True, dtype=torch.bool)
evict_mask[accept_index] = False
mem_need_free_idx = batch.out_cache_loc[evict_mask]
token_to_kv_pool_allocator.free(mem_need_free_idx)
batch.out_cache_loc = batch.out_cache_loc[accept_index] batch.out_cache_loc = batch.out_cache_loc[accept_index]
assign_req_to_token_pool[(bs,)]( assign_req_to_token_pool[(bs,)](
batch.req_pool_indices, batch.req_pool_indices,
...@@ -422,7 +475,7 @@ class EagleVerifyInput: ...@@ -422,7 +475,7 @@ class EagleVerifyInput:
batch.seq_lens + accept_length + 1, batch.seq_lens + accept_length + 1,
batch.out_cache_loc, batch.out_cache_loc,
batch.req_to_token_pool.req_to_token.shape[1], batch.req_to_token_pool.req_to_token.shape[1],
-triton.next_power_of_2(bs),
+next_power_of_2(bs),
) )
batch.seq_lens.add_(accept_length + 1) batch.seq_lens.add_(accept_length + 1)
accept_length_cpu = accept_length.tolist() accept_length_cpu = accept_length.tolist()
...@@ -443,13 +496,6 @@ class EagleVerifyInput: ...@@ -443,13 +496,6 @@ class EagleVerifyInput:
accepeted_indices=accept_index, accepeted_indices=accept_index,
) )
else: else:
accept_length = (accept_index != -1).sum(dim=1) - 1
accept_index = accept_index[accept_index != -1]
verified_id = predict[accept_index]
evict_mask = torch.full_like(self.draft_token, True, dtype=torch.bool)
evict_mask[accept_index] = False
mem_need_free_idx = batch.out_cache_loc[evict_mask]
token_to_kv_pool_allocator.free(mem_need_free_idx)
assign_req_to_token_pool[(bs,)]( assign_req_to_token_pool[(bs,)](
batch.req_pool_indices, batch.req_pool_indices,
batch.req_to_token_pool.req_to_token, batch.req_to_token_pool.req_to_token,
...@@ -457,7 +503,7 @@ class EagleVerifyInput: ...@@ -457,7 +503,7 @@ class EagleVerifyInput:
batch.seq_lens + accept_length + 1, batch.seq_lens + accept_length + 1,
batch.out_cache_loc[accept_index], batch.out_cache_loc[accept_index],
batch.req_to_token_pool.req_to_token.shape[1], batch.req_to_token_pool.req_to_token.shape[1],
-triton.next_power_of_2(bs),
+next_power_of_2(bs),
) )
batch.seq_lens.add_(accept_length + 1) batch.seq_lens.add_(accept_length + 1)
accept_length_cpu = accept_length.tolist() accept_length_cpu = accept_length.tolist()
...@@ -465,20 +511,21 @@ class EagleVerifyInput: ...@@ -465,20 +511,21 @@ class EagleVerifyInput:
draft_input = EagleDraftInput() draft_input = EagleDraftInput()
if len(new_accept_index) > 0: if len(new_accept_index) > 0:
new_accept_index = torch.tensor(new_accept_index, device="cuda") new_accept_index = torch.tensor(new_accept_index, device="cuda")
unfinished_index_device = torch.tensor(unfinished_index, device="cuda")
draft_input.hidden_states = batch.spec_info.hidden_states[ draft_input.hidden_states = batch.spec_info.hidden_states[
new_accept_index new_accept_index
] ]
draft_input.verified_id = predict[new_accept_index] draft_input.verified_id = predict[new_accept_index]
-draft_input.accept_length = accept_length[unfinished_index]
draft_input.accept_length_cpu = [
    accept_length_cpu[i] for i in unfinished_index
]
+draft_input.accept_length = accept_length[unfinished_index_device]
if has_finished: if has_finished:
draft_input.seq_lens_for_draft_extend = batch.seq_lens[ draft_input.seq_lens_for_draft_extend = batch.seq_lens[
-unfinished_index
+unfinished_index_device
]
draft_input.req_pool_indices_for_draft_extend = (
-batch.req_pool_indices[unfinished_index]
+batch.req_pool_indices[unfinished_index_device]
) )
else: else:
draft_input.seq_lens_for_draft_extend = batch.seq_lens draft_input.seq_lens_for_draft_extend = batch.seq_lens
...@@ -564,13 +611,24 @@ def assign_draft_cache_locs( ...@@ -564,13 +611,24 @@ def assign_draft_cache_locs(
pool_len: tl.constexpr, pool_len: tl.constexpr,
topk: tl.constexpr, topk: tl.constexpr,
speculative_num_steps: tl.constexpr, speculative_num_steps: tl.constexpr,
page_size: tl.constexpr,
): ):
BLOCK_SIZE: tl.constexpr = 32 BLOCK_SIZE: tl.constexpr = 32
pid = tl.program_id(axis=0) pid = tl.program_id(axis=0)
kv_start = tl.load(seq_lens + pid) kv_start = tl.load(seq_lens + pid)
-kv_end = tl.load(seq_lens + pid) + topk * speculative_num_steps
+if page_size == 1 or topk == 1:
+    kv_end = tl.load(seq_lens + pid) + topk * speculative_num_steps
+    out_cache_ptr = out_cache_loc + pid * topk * speculative_num_steps
+else:
+    prefix_len = tl.load(seq_lens + pid)
+    last_page_len = prefix_len % page_size
+    num_new_page = (
+        last_page_len + speculative_num_steps + page_size - 1
+    ) // page_size
+    kv_end = prefix_len // page_size * page_size + num_new_page * (page_size * topk)
token_pool = req_to_token + tl.load(req_pool_indices + pid) * pool_len
-out_cache_ptr = out_cache_loc + pid * topk * speculative_num_steps
num_loop = tl.cdiv(topk * speculative_num_steps, BLOCK_SIZE) num_loop = tl.cdiv(topk * speculative_num_steps, BLOCK_SIZE)
for i in range(num_loop): for i in range(num_loop):
...@@ -642,6 +700,29 @@ def generate_draft_decode_kv_indices( ...@@ -642,6 +700,29 @@ def generate_draft_decode_kv_indices(
tl.store(kv_indptr + zid, base + zid * iters) tl.store(kv_indptr + zid, base + zid * iters)
@triton.jit
def align_evict_mask_to_page_size(
seq_lens,
evict_mask,
page_size: tl.constexpr,
num_draft_tokens: tl.constexpr,
BLOCK_SIZE: tl.constexpr,
):
t_range = tl.arange(0, BLOCK_SIZE)
bid = tl.program_id(axis=0)
seq_len = tl.load(seq_lens + bid)
io_mask = t_range < num_draft_tokens
mask_row = tl.load(evict_mask + bid * num_draft_tokens + t_range, mask=io_mask)
num_trues = tl.sum(mask_row)
num_false = num_draft_tokens - num_trues
start = (seq_len + num_false - 1) // page_size * page_size - seq_len
for i in range(max(start, 0), min(start + page_size, num_draft_tokens)):
tl.store(evict_mask + bid * num_draft_tokens + i, False)
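A pure-Python reference of the kernel above may help: draft slots that share a page with accepted tokens must not be freed, so the eviction mask is cleared for the whole trailing, possibly partial, page (a sketch mirroring the Triton code, not used by the runtime):

def align_evict_mask_ref(seq_lens, evict_mask, page_size, num_draft_tokens):
    # evict_mask: flat list of bools, length len(seq_lens) * num_draft_tokens,
    # True = slot can be freed.
    for bid, seq_len in enumerate(seq_lens):
        base = bid * num_draft_tokens
        num_kept = num_draft_tokens - sum(evict_mask[base : base + num_draft_tokens])
        # First draft slot that falls on the last (possibly partial) page.
        start = (seq_len + num_kept - 1) // page_size * page_size - seq_len
        for i in range(max(start, 0), min(start + page_size, num_draft_tokens)):
            evict_mask[base + i] = False  # keep the whole page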
@torch.compile(dynamic=True) @torch.compile(dynamic=True)
def select_top_k_tokens( def select_top_k_tokens(
i: int, i: int,
...@@ -699,3 +780,34 @@ def fast_topk(values, topk, dim): ...@@ -699,3 +780,34 @@ def fast_topk(values, topk, dim):
else: else:
# Use topk for efficiency with larger k values # Use topk for efficiency with larger k values
return torch.topk(values, topk, dim=dim) return torch.topk(values, topk, dim=dim)
def _generate_simulated_accept_index(
accept_index,
predict,
accept_length,
simulate_acc_len,
bs,
spec_steps,
):
simulate_acc_len_float = float(simulate_acc_len)
simulated_values = torch.normal(
mean=simulate_acc_len_float,
std=1.0,
size=(1,),
device="cpu",
)
# clamp simulated values to be between 1 and spec_steps
simulated_values = torch.clamp(simulated_values, min=1.0, max=spec_steps)
simulate_acc_len = int(simulated_values.round().item())
accept_indx_first_col = accept_index[:, 0].view(-1, 1)
sim_accept_index = torch.full(
(bs, spec_steps + 1), -1, dtype=torch.int32, device="cuda"
)
sim_accept_index[:, :simulate_acc_len] = accept_indx_first_col + torch.arange(
simulate_acc_len, device=accept_index.device
)
accept_length.fill_(simulate_acc_len - 1)
predict.fill_(100) # some legit token id
return sim_accept_index
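Since SIMULATE_ACC_LEN is read from the environment when eagle_utils is imported (see above), it has to be set before that import, e.g. when benchmarking how throughput scales with the accepted length. A small sketch:

import os
# Must be set before sglang.srt.speculative.eagle_utils is imported.
os.environ["SIMULATE_ACC_LEN"] = "3"  # target mean accepted length per verify step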
...@@ -11,7 +11,7 @@ from sglang.srt.distributed import GroupCoordinator, patch_tensor_parallel_group ...@@ -11,7 +11,7 @@ from sglang.srt.distributed import GroupCoordinator, patch_tensor_parallel_group
from sglang.srt.layers.dp_attention import disable_dp_size from sglang.srt.layers.dp_attention import disable_dp_size
from sglang.srt.layers.logits_processor import LogitsProcessorOutput from sglang.srt.layers.logits_processor import LogitsProcessorOutput
from sglang.srt.layers.sampler import get_token_ids_logprobs, get_top_logprobs from sglang.srt.layers.sampler import get_token_ids_logprobs, get_top_logprobs
from sglang.srt.managers.schedule_batch import ScheduleBatch from sglang.srt.managers.schedule_batch import ScheduleBatch, get_last_loc
from sglang.srt.managers.tp_worker import TpModelWorker from sglang.srt.managers.tp_worker import TpModelWorker
from sglang.srt.model_executor.forward_batch_info import ( from sglang.srt.model_executor.forward_batch_info import (
CaptureHiddenMode, CaptureHiddenMode,
...@@ -67,6 +67,7 @@ class EAGLEWorker(TpModelWorker): ...@@ -67,6 +67,7 @@ class EAGLEWorker(TpModelWorker):
self.gpu_id = gpu_id self.gpu_id = gpu_id
self.device = server_args.device self.device = server_args.device
self.target_worker = target_worker self.target_worker = target_worker
self.page_size = server_args.page_size
self.speculative_algorithm = SpeculativeAlgorithm.from_string( self.speculative_algorithm = SpeculativeAlgorithm.from_string(
server_args.speculative_algorithm server_args.speculative_algorithm
) )
...@@ -234,14 +235,11 @@ class EAGLEWorker(TpModelWorker): ...@@ -234,14 +235,11 @@ class EAGLEWorker(TpModelWorker):
""" """
if batch.forward_mode.is_decode(): if batch.forward_mode.is_decode():
with self.draft_tp_context(self.draft_model_runner.tp_group): with self.draft_tp_context(self.draft_model_runner.tp_group):
-spec_info, to_free_cache_loc = self.draft(batch)
+spec_info = self.draft(batch)
logits_output, verify_output, model_worker_batch = self.verify(
    batch, spec_info
)
-# Free cache loc (we put it here to avoid synchronization and hide kernel launch overhead.)
-self.token_to_kv_pool_allocator.free(to_free_cache_loc)
# If it is None, it means all requests are finished # If it is None, it means all requests are finished
if batch.spec_info.verified_id is not None: if batch.spec_info.verified_id is not None:
with self.draft_tp_context(self.draft_model_runner.tp_group): with self.draft_tp_context(self.draft_model_runner.tp_group):
...@@ -305,9 +303,59 @@ class EAGLEWorker(TpModelWorker): ...@@ -305,9 +303,59 @@ class EAGLEWorker(TpModelWorker):
) )
# Allocate cache locations # Allocate cache locations
-out_cache_loc = batch.alloc_token_slots(
-    num_seqs * self.topk * self.speculative_num_steps
-)
+if self.page_size == 1:
+    out_cache_loc, token_to_kv_pool_state_backup = batch.alloc_token_slots(
+        num_seqs * self.topk * self.speculative_num_steps, backup_state=True
+    )
else:
if self.topk == 1:
prefix_lens = batch.seq_lens
seq_lens = prefix_lens + self.speculative_num_steps
extend_num_tokens = num_seqs * self.speculative_num_steps
else:
# In this case, the last partial page needs to be duplicated.
# KV cache layout in batch.req_to_token_pool.req_to_token:
#
# | -------- | -- xxxx .. | -- xxxx .. | -- xxxx .. |
# prefix top-k = 0 tok-k = 1 top-k = 2
#
# "-" means prefix tokens
# "x" means speculative draft tokens
# "." means padded tokens
# TODO: fuse these ops
prefix_lens = batch.seq_lens
last_page_lens = prefix_lens % self.page_size
num_new_pages = (
last_page_lens + self.speculative_num_steps + self.page_size - 1
) // self.page_size
seq_lens = (
prefix_lens // self.page_size * self.page_size
+ num_new_pages * (self.page_size * self.topk)
)
extend_num_tokens = torch.sum(seq_lens - prefix_lens).item()
raise NotImplementedError(
"page_size > 1 and top_k > 1 are not supported."
)
# TODO: Support page_size > 1 and top_k > 1
# 1. Duplicate the KV cache in the last partial page for all top-k segments
# 2. Modify generate_draft_decode_kv_indices accordingly
last_loc = get_last_loc(
batch.req_to_token_pool.req_to_token,
batch.req_pool_indices,
prefix_lens,
)
out_cache_loc, token_to_kv_pool_state_backup = (
batch.alloc_paged_token_slots_extend(
prefix_lens,
seq_lens,
last_loc,
extend_num_tokens,
backup_state=True,
)
)
assign_draft_cache_locs[(num_seqs,)]( assign_draft_cache_locs[(num_seqs,)](
batch.req_pool_indices, batch.req_pool_indices,
batch.req_to_token_pool.req_to_token, batch.req_to_token_pool.req_to_token,
...@@ -316,6 +364,7 @@ class EAGLEWorker(TpModelWorker): ...@@ -316,6 +364,7 @@ class EAGLEWorker(TpModelWorker):
batch.req_to_token_pool.req_to_token.shape[1], batch.req_to_token_pool.req_to_token.shape[1],
self.topk, self.topk,
self.speculative_num_steps, self.speculative_num_steps,
self.page_size,
) )
batch.out_cache_loc = out_cache_loc batch.out_cache_loc = out_cache_loc
batch.seq_lens_sum = torch.sum(batch.seq_lens).item() batch.seq_lens_sum = torch.sum(batch.seq_lens).item()
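A worked example of the formulas used above (illustration of the page-aligned layout sketched in the comment; the top-k > 1 path itself still raises NotImplementedError):

page_size, topk, speculative_num_steps = 4, 2, 5
prefix_len = 13                                    # tokens already in the KV cache
last_page_len = prefix_len % page_size             # 1 token on the partial last page
num_new_pages = (last_page_len + speculative_num_steps + page_size - 1) // page_size  # 2
seq_len = prefix_len // page_size * page_size + num_new_pages * (page_size * topk)    # 12 + 16 = 28
extend_num_tokens = seq_len - prefix_len           # 15 KV slots allocated for this request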
...@@ -343,6 +392,8 @@ class EAGLEWorker(TpModelWorker): ...@@ -343,6 +392,8 @@ class EAGLEWorker(TpModelWorker):
# Run forward steps # Run forward steps
score_list, token_list, parents_list = self.draft_forward(forward_batch) score_list, token_list, parents_list = self.draft_forward(forward_batch)
self.token_to_kv_pool_allocator.restore_state(token_to_kv_pool_state_backup)
ret = EagleVerifyInput.create( ret = EagleVerifyInput.create(
spec_info.verified_id, spec_info.verified_id,
score_list, score_list,
...@@ -354,7 +405,7 @@ class EAGLEWorker(TpModelWorker): ...@@ -354,7 +405,7 @@ class EAGLEWorker(TpModelWorker):
self.speculative_num_steps, self.speculative_num_steps,
self.server_args.speculative_num_draft_tokens, self.server_args.speculative_num_draft_tokens,
) )
-return ret, out_cache_loc
+return ret
def draft_forward(self, forward_batch: ForwardBatch): def draft_forward(self, forward_batch: ForwardBatch):
# Parse args # Parse args
...@@ -411,7 +462,7 @@ class EAGLEWorker(TpModelWorker): ...@@ -411,7 +462,7 @@ class EAGLEWorker(TpModelWorker):
return score_list, token_list, parents_list return score_list, token_list, parents_list
def verify(self, batch: ScheduleBatch, spec_info: EagleVerifyInput): def verify(self, batch: ScheduleBatch, spec_info: EagleVerifyInput):
-spec_info.prepare_for_verify(batch)
+spec_info.prepare_for_verify(batch, self.page_size)
batch.forward_mode = ForwardMode.TARGET_VERIFY batch.forward_mode = ForwardMode.TARGET_VERIFY
batch.spec_info = spec_info batch.spec_info = spec_info
model_worker_batch = batch.get_model_worker_batch() model_worker_batch = batch.get_model_worker_batch()
...@@ -421,7 +472,10 @@ class EAGLEWorker(TpModelWorker): ...@@ -421,7 +472,10 @@ class EAGLEWorker(TpModelWorker):
self._detect_nan_if_needed(logits_output) self._detect_nan_if_needed(logits_output)
spec_info.hidden_states = logits_output.hidden_states spec_info.hidden_states = logits_output.hidden_states
res: EagleVerifyOutput = spec_info.verify( res: EagleVerifyOutput = spec_info.verify(
-batch, logits_output, self.token_to_kv_pool_allocator
+batch,
+logits_output,
+self.token_to_kv_pool_allocator,
+self.page_size,
) )
# Post process based on verified outputs. # Post process based on verified outputs.
......
...@@ -76,11 +76,14 @@ def is_in_ci(): ...@@ -76,11 +76,14 @@ def is_in_ci():
if is_in_ci(): if is_in_ci():
-DEFAULT_PORT_FOR_SRT_TEST_RUNNER = 5157
-DEFAULT_URL_FOR_TEST = "http://127.0.0.1:6157"
+DEFAULT_PORT_FOR_SRT_TEST_RUNNER = (
+    5000 + int(os.environ.get("CUDA_VISIBLE_DEVICES", "0")[0]) * 100
+)
else:
-DEFAULT_PORT_FOR_SRT_TEST_RUNNER = 1157
-DEFAULT_URL_FOR_TEST = "http://127.0.0.1:2157"
+DEFAULT_PORT_FOR_SRT_TEST_RUNNER = (
+    7000 + int(os.environ.get("CUDA_VISIBLE_DEVICES", "0")[0]) * 100
+)
+DEFAULT_URL_FOR_TEST = f"http://127.0.0.1:{DEFAULT_PORT_FOR_SRT_TEST_RUNNER + 1000}"
def call_generate_lightllm(prompt, temperature, max_tokens, stop=None, url=None): def call_generate_lightllm(prompt, temperature, max_tokens, stop=None, url=None):
...@@ -1009,6 +1012,9 @@ def run_logprob_check(self: unittest.TestCase, arg: Tuple): ...@@ -1009,6 +1012,9 @@ def run_logprob_check(self: unittest.TestCase, arg: Tuple):
class CustomTestCase(unittest.TestCase): class CustomTestCase(unittest.TestCase):
pass
"""
def _callTestMethod(self, method): def _callTestMethod(self, method):
max_retry = int( max_retry = int(
os.environ.get("SGLANG_TEST_MAX_RETRY", "2" if is_in_ci() else "0") os.environ.get("SGLANG_TEST_MAX_RETRY", "2" if is_in_ci() else "0")
...@@ -1017,3 +1023,4 @@ class CustomTestCase(unittest.TestCase): ...@@ -1017,3 +1023,4 @@ class CustomTestCase(unittest.TestCase):
lambda: super(CustomTestCase, self)._callTestMethod(method), lambda: super(CustomTestCase, self)._callTestMethod(method),
max_retry=max_retry, max_retry=max_retry,
) )
"""
...@@ -18,7 +18,7 @@ pip install flashinfer_python==0.2.3 --find-links ${FLASHINFER_REPO} --force-rei ...@@ -18,7 +18,7 @@ pip install flashinfer_python==0.2.3 --find-links ${FLASHINFER_REPO} --force-rei
pip install sgl-kernel==0.0.5.post4 --force-reinstall pip install sgl-kernel==0.0.5.post4 --force-reinstall
pip install torch_memory_saver pip install torch_memory_saver
-pip install transformers==4.50.0 sentence_transformers accelerate==1.4.0 peft pandas datasets timm
+pip install transformers==4.50.0 sentence_transformers accelerate==1.4.0 peft pandas datasets timm torchaudio
# For compiling xgrammar kernels
pip install cuda-python nvidia-cuda-nvrtc-cu12 pip install cuda-python nvidia-cuda-nvrtc-cu12
......
...@@ -26,7 +26,7 @@ suites = { ...@@ -26,7 +26,7 @@ suites = {
TestFile("test_abort.py", 51), TestFile("test_abort.py", 51),
TestFile("test_block_int8.py", 22), TestFile("test_block_int8.py", 22),
TestFile("test_chunked_prefill.py", 336), TestFile("test_chunked_prefill.py", 336),
TestFile("test_eagle_infer.py", 447), TestFile("test_eagle_infer.py", 500),
TestFile("test_ebnf_constrained.py"), TestFile("test_ebnf_constrained.py"),
TestFile("test_fp8_kernel.py", 2), TestFile("test_fp8_kernel.py", 2),
TestFile("test_embedding_openai_server.py", 36), TestFile("test_embedding_openai_server.py", 36),
......
...@@ -298,10 +298,16 @@ class TestEAGLEServer(CustomTestCase): ...@@ -298,10 +298,16 @@ class TestEAGLEServer(CustomTestCase):
print(f"{metrics=}") print(f"{metrics=}")
self.assertGreater(metrics["accuracy"], 0.20) self.assertGreater(metrics["accuracy"], 0.20)
-server_info = requests.get(self.base_url + "/get_server_info")
-avg_spec_accept_length = server_info.json()["avg_spec_accept_length"]
+server_info = requests.get(self.base_url + "/get_server_info").json()
+avg_spec_accept_length = server_info["avg_spec_accept_length"]
print(f"{avg_spec_accept_length=}")
-self.assertGreater(avg_spec_accept_length, 3.5)
speculative_eagle_topk = server_info["speculative_eagle_topk"]
if speculative_eagle_topk == 1:
self.assertGreater(avg_spec_accept_length, 2.5)
else:
self.assertGreater(avg_spec_accept_length, 3.5)
# Wait a little bit so that the memory check happens. # Wait a little bit so that the memory check happens.
time.sleep(4) time.sleep(4)
...@@ -535,5 +541,36 @@ class TestEAGLEServerTriton(TestEAGLEServer): ...@@ -535,5 +541,36 @@ class TestEAGLEServerTriton(TestEAGLEServer):
) )
class TestEAGLEServerPageSize(TestEAGLEServer):
@classmethod
def setUpClass(cls):
cls.base_url = DEFAULT_URL_FOR_TEST
cls.process = popen_launch_server(
DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
cls.base_url,
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
other_args=[
"--speculative-algorithm",
"EAGLE",
"--speculative-draft-model-path",
DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
"--speculative-num-steps",
5,
"--speculative-eagle-topk",
1,
"--speculative-num-draft-tokens",
6,
"--mem-fraction-static",
0.7,
"--chunked-prefill-size",
128,
"--max-running-requests",
8,
"--page-size",
4,
],
)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -157,6 +157,7 @@ class TestFlashinferMLAMTP(CustomTestCase): ...@@ -157,6 +157,7 @@ class TestFlashinferMLAMTP(CustomTestCase):
self.assertGreater(metrics["accuracy"], 0.60) self.assertGreater(metrics["accuracy"], 0.60)
server_info = requests.get(self.base_url + "/get_server_info") server_info = requests.get(self.base_url + "/get_server_info")
print(f"{server_info=}")
avg_spec_accept_length = server_info.json()["avg_spec_accept_length"] avg_spec_accept_length = server_info.json()["avg_spec_accept_length"]
print(f"{avg_spec_accept_length=}") print(f"{avg_spec_accept_length=}")
self.assertGreater(avg_spec_accept_length, 2.5) self.assertGreater(avg_spec_accept_length, 2.5)
......