@@ -195,3 +195,4 @@ Please consult the documentation below to learn more about the parameters you ma
* `triton_attention_num_kv_splits`: Use to adjust the number of KV splits in Triton kernels. Default is 8.
* `enable_flashinfer_mla`: Use the attention backend with the FlashInfer MLA wrapper for DeepSeek models. **This argument will be deprecated in the next release. Please use `--attention-backend flashinfer` instead to enable FlashInfer MLA.**
* `flashinfer_mla_disable_ragged`: Disable the use of the ragged prefill wrapper for the FlashInfer MLA attention backend. Only use it when FlashInfer is being used as the MLA backend.
* `disable_chunked_prefix_cache`: Disable the use of the chunked prefix cache for DeepSeek models. Only use it when FA3 is the attention backend. (See the example launch commands after this list.)
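For reference, a launch command combining these flags might look like the sketch below. The flag spellings assume the usual dash-separated CLI form of the parameters above, and the model path is only a placeholder.

```bash
# Sketch: raise the number of Triton KV splits above the default of 8.
python3 -m sglang.launch_server \
  --model-path deepseek-ai/DeepSeek-V3 \
  --attention-backend triton \
  --triton-attention-num-kv-splits 16

# Sketch: enable FlashInfer MLA for a DeepSeek model and skip the ragged prefill wrapper.
python3 -m sglang.launch_server \
  --model-path deepseek-ai/DeepSeek-V3 \
  --attention-backend flashinfer \
  --flashinfer-mla-disable-ragged
```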
@@ -92,13 +92,15 @@ Please refer to [the example](https://github.com/sgl-project/sglang/tree/main/be
- **CUDA Graph & Torch.compile**: Both MLA and Mixture of Experts (MoE) are compatible with CUDA Graph and Torch.compile, which reduces latency and accelerates decoding speed for small batch sizes.
- **Chunked Prefix Cache**: The chunked prefix cache optimization can increase throughput by splitting the prefix cache into chunks, processing them with multi-head attention, and merging their states (see the state-merging identity sketched below). The improvement can be significant when doing chunked prefill on long sequences. Currently this optimization is only available for the FlashAttention3 backend.
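For intuition, the merge step relies on the standard identity for combining partial softmax-attention results computed over disjoint chunks of keys and values. The notation below ($o_i$ for the attention output over chunk $i$, $l_i$ for the log-sum-exp of its attention scores) is ours rather than taken from the SGLang code:

$$
l = \log \sum_i e^{l_i}, \qquad o = \sum_i e^{\,l_i - l}\, o_i
$$

Each chunk contributes its partial output weighted by its share of the total softmax mass, so the merged result matches attention over the full prefix.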
Overall, with these optimizations, we have achieved up to **7x** acceleration in output throughput compared to the previous version.
<p align="center">
<img src="https://lmsys.org/images/blog/sglang_v0_3/deepseek_mla.svg" alt="Multi-head Latent Attention for DeepSeek Series Models">
</p>
**Usage**: MLA optimization is enabled by default. To disable it, use `--disable-mla`. To disable the chunked prefix cache feature for MLA, use `--disable-chunked-prefix-cache`.
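As a hedged illustration (flag spellings assume the dash-separated CLI form, and the model path is only a placeholder), the opt-out flags can be passed at launch time:

```bash
# Sketch: disable MLA entirely.
python3 -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3 --disable-mla

# Sketch: keep MLA but turn off the chunked prefix cache (FA3 attention backend).
python3 -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3 \
  --attention-backend fa3 --disable-chunked-prefix-cache
```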
**Reference**: Check [Blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/#deepseek-multi-head-latent-attention-mla-throughput-optimizations) and [Slides](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/lmsys_1st_meetup_deepseek_mla.pdf) for more details.