Merge branch 'v0.5.4_dev' into 'v0.5.4_dev_shangxl'

# Conflicts: # python/sglang/srt/layers/attention/dcu_mla_backend.py # python/sglang/srt/layers/attention/flashattention_backend.py # python/sglang/srt/model_executor/forward_batch_info.py # sgl-kernel/csrc/common_extension_rocm.cc # sgl-kernel/include/sgl_kernel_ops.h

Merge branch 'v0.5.4_dev' into 'v0.5.4_dev_shangxl'
# Conflicts: # python/sglang/srt/layers/attention/dcu_mla_backend.py # python/sglang/srt/layers/attention/flashattention_backend.py # python/sglang/srt/model_executor/forward_batch_info.py # sgl-kernel/csrc/common_extension_rocm.cc # sgl-kernel/include/sgl_kernel_ops.h
bdda6c42 · niuhb · b08d561e · 769353e6 · bdda6c42 · bdda6c42
Commit bdda6c42 authored Nov 19, 2025 by niuhb
18 changed files
--- a/python/sglang/srt/layers/attention/dcu_mla_backend.py
+++ b/python/sglang/srt/layers/attention/dcu_mla_backend.py
@@ -105,14 +105,14 @@ class DCUMLABackend(AttentionBackend):
                skip_prefill=False,
            )

-    def _build_decode_metadata(
-        self, 
-        forward_batch: ForwardBatch,
-        seq_lens: torch.Tensor
-    ) -> Tuple[Tuple[torch.Tensor, torch.Tensor], torch.Tensor, torch.Tensor]:
+    def init_forward_metadata(self, forward_batch: ForwardBatch):

        bs = forward_batch.batch_size
-        max_seqlen_pad = triton.cdiv(seq_lens.max().item(), PAGE_SIZE)
+        if forward_batch.forward_mode.is_decode_or_idle():
+            
+            max_seqlen_pad = triton.cdiv(
+                forward_batch.seq_lens_cpu.max().item(), PAGE_SIZE
+            )
        
        # 参考vllm官方博客分页
        block_kv_indices = torch.full(
@@ -487,7 +487,7 @@ class DCUMLABackend(AttentionBackend):
        )
        return o

-    @torch._dynamo.disable()
+    @torch._dynamo.disable()  # NOTE: FP8 cache decode不支持compile
    def forward_decode(
        self,
        q: torch.Tensor,
@@ -514,7 +514,7 @@ class DCUMLABackend(AttentionBackend):

        reshape_q = q.view(bs, -1, layer.tp_q_head_num, layer.head_dim)
        k_cache_reshaped = k_cache.view(-1, PAGE_SIZE, 1, self.kv_cache_dim)
-
+        num_draft_tokens = self.num_draft_tokens if self.num_draft_tokens is not None else 0
        if self.data_type in (torch.float8_e4m3fn, torch.float8_e4m3fnuz, 
                              torch.float8_e5m2, torch.float8_e5m2fnuz):
            if self.data_type in (torch.float8_e4m3fnuz, torch.float8_e4m3fn):
@@ -526,7 +526,7 @@ class DCUMLABackend(AttentionBackend):
                reshape_q, 
                k_cache_reshaped, 
                self.forward_metadata.block_kv_indices[:bs],
-                forward_batch.seq_lens.to(torch.int32), 
+                (forward_batch.seq_lens + num_draft_tokens).to(torch.int32), 
                layer.scaling, 
                k_scale, 
                kv_cache_dtype=kv_cache_dtype,
@@ -536,13 +536,13 @@ class DCUMLABackend(AttentionBackend):
                reshape_q, 
                k_cache_reshaped, 
                self.forward_metadata.block_kv_indices[:bs],
-                forward_batch.seq_lens.to(torch.int32), 
+                (forward_batch.seq_lens + num_draft_tokens).to(torch.int32), 
                layer.scaling,
            )

        return o.view(-1, layer.tp_q_head_num * layer.v_head_dim)

-    @torch._dynamo.disable()  # NOTE: untested
+    @torch._dynamo.disable()
    def forward_extend(
        self,
        q: torch.Tensor,
@@ -556,12 +556,9 @@ class DCUMLABackend(AttentionBackend):
        k_rope: Optional[torch.Tensor] = None,
        sinks=None,
    ):
-        if save_kv_cache and self.num_draft_tokens == 0:  #nhb
-            return self.forward_decode(q,k,v,layer,forward_batch, save_kv_cache)
-    
-        if ((
+        if (
            forward_batch.forward_mode == ForwardMode.EXTEND
-            or forward_batch.forward_mode == ForwardMode.DRAFT_EXTEND)
+            or forward_batch.forward_mode == ForwardMode.DRAFT_EXTEND
        ):
            if not self.skip_prefill:
                return self.flashattn_backend.forward_extend(
@@ -574,14 +571,19 @@ class DCUMLABackend(AttentionBackend):
        if k is not None:
            assert v is not None
            if save_kv_cache:
-                forward_batch.token_to_kv_pool.set_kv_buffer(layer, cache_loc, k, v)
+                forward_batch.token_to_kv_pool.set_kv_buffer(
+                    layer,
+                    cache_loc,
+                    k,
+                    v,
+                )

        bs = forward_batch.batch_size
        k_cache = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id)

        reshape_q = q.view(bs, -1, layer.tp_q_head_num, layer.head_dim)
        k_cache_reshaped = k_cache.view(-1, PAGE_SIZE, 1, self.kv_cache_dim)
-
+        num_draft_tokens = self.num_draft_tokens if self.num_draft_tokens is not None else 0
        if self.data_type in (torch.float8_e4m3fn, torch.float8_e4m3fnuz, 
                              torch.float8_e5m2, torch.float8_e5m2fnuz):
            if self.data_type in (torch.float8_e4m3fnuz, torch.float8_e4m3fn):
@@ -593,7 +595,7 @@ class DCUMLABackend(AttentionBackend):
                reshape_q, 
                k_cache_reshaped, 
                self.forward_metadata.block_kv_indices[:bs],
-                (forward_batch.seq_lens + self.num_draft_tokens).to(torch.int32),
+                (forward_batch.seq_lens + num_draft_tokens).to(torch.int32),
                layer.scaling,
                k_scale,
                kv_cache_dtype=kv_cache_dtype,
@@ -603,13 +605,12 @@ class DCUMLABackend(AttentionBackend):
                reshape_q, 
                k_cache_reshaped, 
                self.forward_metadata.block_kv_indices[:bs],
-                (forward_batch.seq_lens + self.num_draft_tokens).to(torch.int32),
+                (forward_batch.seq_lens + num_draft_tokens).to(torch.int32),
                layer.scaling,
            )

        return o.view(-1, layer.tp_q_head_num * layer.v_head_dim)

-
 class DCUMLAMultiStepDraftBackend:
    """
    Wrap multiple flashmla attention backends as one for multiple consecutive

--- a/python/sglang/srt/layers/attention/flashattention_backend.py
+++ b/python/sglang/srt/layers/attention/flashattention_backend.py
@@ -329,6 +329,8 @@ class FlashAttentionBackend(AttentionBackend):
        self.use_mla = model_runner.model_config.attention_arch == AttentionArch.MLA
        self.skip_prefill = skip_prefill
        self.is_hybrid = model_runner.is_hybrid
+        self.k_scale = torch.tensor([1.0], dtype=torch.float32, device=self.device)
+        self.v_scale = torch.tensor([1.0], dtype=torch.float32, device=self.device)
        if self.is_hybrid:
            self.full_to_swa_index_mapping = (
                model_runner.token_to_kv_pool.full_to_swa_index_mapping
@@ -609,10 +611,13 @@ class FlashAttentionBackend(AttentionBackend):
                metadata.max_seq_len_q = metadata.max_seq_len_k
                metadata.cu_seqlens_q = metadata.cu_seqlens_k

-            # Setup local attention if enabled
-            if forward_batch.forward_mode == ForwardMode.EXTEND:
+            # # Setup local attention if enabled
+            # if forward_batch.forward_mode == ForwardMode.EXTEND:
+            #     self._init_local_attn_metadata(forward_batch, metadata, device)
+            if forward_batch.forward_mode in (ForwardMode.EXTEND, ForwardMode.DRAFT_EXTEND_V2):
                self._init_local_attn_metadata(forward_batch, metadata, device)

+
        # Encoder metadata for cross attention
        if forward_batch.encoder_lens is not None:
            assert (
@@ -691,7 +696,8 @@ class FlashAttentionBackend(AttentionBackend):
            layer.sliding_window_size is not None and layer.sliding_window_size > -1
        )
        window_size = (layer.sliding_window_size, 0) if is_swa else (-1, -1)
-        k_descale, v_descale = None, None
+        # k_descale, v_descale = None, None
+        k_descale, v_descale = self.k_scale, self.v_scale
        # only use kv scaling if: 1) fp8 kv is explicitly enabled, 2) RadixAttention
        # has corresponding quantization method so that layer.k_scale is not None,
        # 3) layer.head_dim <= 256 since fa3 kernel require fp16 and bf16 data type in this case,
@@ -775,55 +781,53 @@ class FlashAttentionBackend(AttentionBackend):
                cu_seqlens_k = metadata.encoder_cu_seqlens_k
                window_size = (-1, -1)

-            result = flash_attn_with_kvcache(
-                q=q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim),
-                k_cache=key_cache,
-                v_cache=value_cache,
-                page_table=page_table,
-                cache_seqlens=cache_seqlens,
-                cu_seqlens_q=cu_seqlens_q,
-                cu_seqlens_k_new=cu_seqlens_k if not use_local_attn else None,
-                max_seqlen_q=max_seqlen_q,
-                softmax_scale=layer.scaling,
-                causal=False if use_cascade_attn else causal,
-                window_size=window_size,
-                softcap=layer.logit_cap,
-                k_descale=k_descale,
-                v_descale=v_descale,
-                return_softmax_lse=use_cascade_attn,
-                num_splits=self.num_splits,
-                **kwargs,
-            )
-
-            if use_cascade_attn:
-                o, softmax_lse, *rest = result
-                o_expand, softmax_lse_expand, *rest_expand = flash_attn_with_kvcache(
-                    q=q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim),
-                    k_cache=key_cache,
-                    v_cache=value_cache,
-                    page_table=self.forward_metadata_spec_decode_expand.page_table,
-                    cache_seqlens=self.forward_metadata_spec_decode_expand.cache_seqlens_int32,
-                    cu_seqlens_q=self.forward_metadata_spec_decode_expand.cu_seqlens_q,
-                    cu_seqlens_k_new=self.forward_metadata_spec_decode_expand.cu_seqlens_k,
-                    max_seqlen_q=self.forward_metadata_spec_decode_expand.max_seq_len_q,
+            if forward_batch.attn_attend_prefix_cache:
+                assert not get_global_server_args().disable_chunked_prefix_cache
+                # MHA for chunked prefix kv cache when running model with MLA
+                assert forward_batch.prefix_chunk_idx is not None
+                assert forward_batch.prefix_chunk_cu_seq_lens is not None
+                assert forward_batch.prefix_chunk_max_seq_lens is not None
+
+                chunk_idx = forward_batch.prefix_chunk_idx
+                assert chunk_idx >= 0
+
+                assert forward_batch.mha_return_lse
+                output = flash_attn_varlen_func(
+                    q=q.view(-1, layer.tp_q_head_num, layer.head_dim),
+                    k=k.view(-1, layer.tp_k_head_num, layer.head_dim).view(q.dtype),
+                    v=v.view(-1, layer.tp_k_head_num, layer.v_head_dim).view(q.dtype),
+                    cu_seqlens_q=metadata.cu_seqlens_q,
+                    cu_seqlens_k=forward_batch.prefix_chunk_cu_seq_lens[chunk_idx],
+                    max_seqlen_q=metadata.max_seq_len_q,
+                    max_seqlen_k=forward_batch.prefix_chunk_max_seq_lens[chunk_idx],
                    softmax_scale=layer.scaling,
                    causal=False,
-                    window_size=window_size,
-                    softcap=layer.logit_cap,
                    k_descale=k_descale,
                    v_descale=v_descale,
                    return_softmax_lse=True,
-                    num_splits=self.num_splits,
                    **kwargs,
                )
-                o, _ = merge_state_v2_wrapper(
-                    o,
-                    softmax_lse.T.contiguous(),
-                    o_expand,
-                    softmax_lse_expand.T.contiguous(),
-                )
            else:
-                o = result
+                output = flash_attn_varlen_func(
+                    q=q.view(-1, layer.tp_q_head_num, layer.head_dim),
+                    k=k.view(-1, layer.tp_k_head_num, layer.head_dim).view(q.dtype),
+                    v=v.view(-1, layer.tp_k_head_num, layer.v_head_dim).view(q.dtype),
+                    cu_seqlens_q=metadata.cu_seqlens_q,
+                    cu_seqlens_k=metadata.cu_seqlens_q,
+                    max_seqlen_q=metadata.max_seq_len_q,
+                    max_seqlen_k=metadata.max_seq_len_q,
+                    softmax_scale=layer.scaling,
+                    causal=True,
+                    k_descale=k_descale,
+                    v_descale=v_descale,
+                    return_softmax_lse=forward_batch.mha_return_lse,
+                    **kwargs,
+                )
+            if forward_batch.mha_return_lse:
+                output, lse, *rest = output
+                lse = torch.transpose(lse, 0, 1).contiguous()
+                return output, lse
+            return output.view(-1, layer.tp_q_head_num * layer.v_head_dim)
        else:
            if (
                forward_batch.attn_attend_prefix_cache is not None
@@ -852,6 +856,8 @@ class FlashAttentionBackend(AttentionBackend):
                        max_seqlen_k=forward_batch.prefix_chunk_max_seq_lens[chunk_idx],
                        softmax_scale=layer.scaling,
                        causal=False,
+                        k_descale=k_descale,
+                        v_descale=v_descale,
                        return_softmax_lse=True,
                        **kwargs,
                    )
@@ -866,6 +872,8 @@ class FlashAttentionBackend(AttentionBackend):
                        max_seqlen_k=metadata.max_seq_len_q,
                        softmax_scale=layer.scaling,
                        causal=True,
+                        k_descale=k_descale,
+                        v_descale=v_descale,
                        return_softmax_lse=forward_batch.mha_return_lse,
                        **kwargs,
                    )
@@ -975,10 +983,16 @@ class FlashAttentionBackend(AttentionBackend):
                    if not layer.is_cross_attention
                    else forward_batch.encoder_out_cache_loc
                )
-                if not self.use_mla:
-                    forward_batch.token_to_kv_pool.set_kv_buffer(
-                        layer, cache_loc, k, v, layer.k_scale, layer.v_scale
-                    )
+                # if not self.use_mla:
+                if k_rope is None:
+                    if not self.use_mla:
+                        forward_batch.token_to_kv_pool.set_kv_buffer(
+                            layer, cache_loc, k, v, layer.k_scale, layer.v_scale
+                        )
+                    else:
+                        forward_batch.token_to_kv_pool.set_kv_buffer(
+                            layer, cache_loc, k, v
+                        )
                else:
                    forward_batch.token_to_kv_pool.set_mla_kv_buffer(
                        layer,
@@ -1020,7 +1034,8 @@ class FlashAttentionBackend(AttentionBackend):
        if sinks is not None:
            kwargs["sinks"] = sinks

-        k_descale, v_descale = None, None
+        # k_descale, v_descale = None, None
+        k_descale, v_descale = self.k_scale, self.v_scale
        # only use kv scaling if: 1) fp8 kv is explicitly enabled, 2) RadixAttention
        # has corresponding quantization method so that layer.k_scale is not None,
        # 3) layer.head_dim <= 256 since fa3 kernel require fp16 and bf16 data type in this case.
@@ -1034,7 +1049,6 @@ class FlashAttentionBackend(AttentionBackend):
            k_rope = k_rope.to(self.kv_cache_dtype) if k_rope is not None else None
        if not self.use_mla:
            # Do multi-head attention
-
            key_cache, value_cache = forward_batch.token_to_kv_pool.get_kv_buffer(
                layer.layer_id
            )
@@ -1086,65 +1100,62 @@ class FlashAttentionBackend(AttentionBackend):
                    **kwargs,
                )
            else:
+                cu_seqlens_q = metadata.cu_seqlens_q
+                max_seqlen_q = metadata.max_seq_len_q
                page_table = metadata.page_table
-                cache_seqlens = metadata.cache_seqlens_int32
                cu_seqlens_k = metadata.cu_seqlens_k
-                max_seqlen_q = metadata.max_seq_len_q
-                q_reshaped = q.contiguous().view(
-                    -1, layer.tp_q_head_num, layer.head_dim
+                cache_seqlens = metadata.cache_seqlens_int32
+                key_cache = key_cache.view(
+                    -1, self.page_size, layer.tp_k_head_num, layer.head_dim
                )
-
-                # Default: single-token self-attention
-                result = flash_attn_with_kvcache(
-                    q=q_reshaped,
-                    k_cache=key_cache,
-                    v_cache=value_cache,
-                    page_table=page_table,
-                    cache_seqlens=cache_seqlens,
-                    cu_seqlens_q=metadata.cu_seqlens_q,
-                    cu_seqlens_k_new=cu_seqlens_k,
-                    max_seqlen_q=max_seqlen_q,
-                    softmax_scale=layer.scaling,
-                    causal=False if use_cascade_attn else causal,
-                    window_size=window_size,
-                    softcap=layer.logit_cap,
-                    k_descale=k_descale,
-                    v_descale=v_descale,
-                    return_softmax_lse=use_cascade_attn,
-                    num_splits=self.num_splits,
-                    **kwargs,
+                value_cache = value_cache.view(
+                    -1, self.page_size, layer.tp_v_head_num, layer.head_dim
                )
-                if use_cascade_attn:
-                    o, softmax_lse, *rest = result
-                    o_expand, softmax_lse_expand, *rest_expand = (
-                        flash_attn_with_kvcache(
-                            q=q_reshaped,
-                            k_cache=key_cache,
-                            v_cache=value_cache,
-                            page_table=self.forward_metadata_spec_decode_expand.page_table,
-                            cache_seqlens=self.forward_metadata_spec_decode_expand.cache_seqlens_int32,
-                            cu_seqlens_q=self.forward_metadata_spec_decode_expand.cu_seqlens_q,
-                            cu_seqlens_k_new=self.forward_metadata_spec_decode_expand.cu_seqlens_k,
-                            max_seqlen_q=self.forward_metadata_spec_decode_expand.max_seq_len_q,
-                            softmax_scale=layer.scaling,
-                            causal=False,
-                            window_size=window_size,
-                            softcap=layer.logit_cap,
-                            k_descale=k_descale,
-                            v_descale=v_descale,
-                            return_softmax_lse=True,
-                            num_splits=self.num_splits,
-                            **kwargs,
-                        )
-                    )
-                    o, _ = merge_state_v2(
-                        o,
-                        softmax_lse.T.contiguous(),
-                        o_expand,
-                        softmax_lse_expand.T.contiguous(),
+                if layer.is_cross_attention:
+                    page_table = metadata.encoder_page_table
+                    cache_seqlens = metadata.encoder_lens_int32
+                    cu_seqlens_k = metadata.encoder_cu_seqlens_k
+                    window_size = (-1, -1)
+                if max_seqlen_q > 1:
+                    result = flash_attn_varlen_func(
+                        q=q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim),
+                        k=k.view(-1, layer.tp_k_head_num, layer.head_dim).view(q.dtype),
+                        v=v.view(-1, layer.tp_k_head_num, layer.v_head_dim).view(q.dtype),
+                        cu_seqlens_q=cu_seqlens_q,
+                        cu_seqlens_k=cu_seqlens_k,
+                        max_seqlen_q=max_seqlen_q,
+                        max_seqlen_k=max_seqlen_q,
+                        softmax_scale=layer.scaling,
+                        causal=True,
+                        window_size=window_size,
+                        softcap=layer.logit_cap,
+                        k_descale=k_descale,
+                        v_descale=v_descale,
+                        return_softmax_lse=use_cascade_attn,
+                        num_splits=self.num_splits,
+                        **kwargs,
                    )
                else:
-                    o = result
+                    result = flash_attn_with_kvcache(
+                        q=q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim),
+                        k_cache=key_cache,
+                        v_cache=value_cache,
+                        page_table=page_table,
+                        cache_seqlens=cache_seqlens,
+                        cu_seqlens_q=cu_seqlens_q,
+                        cu_seqlens_k_new=cu_seqlens_k if not use_local_attn else None,
+                        max_seqlen_q=max_seqlen_q,
+                        softmax_scale=layer.scaling,
+                        causal=True,
+                        window_size=window_size,
+                        softcap=layer.logit_cap,
+                        k_descale=k_descale,
+                        v_descale=v_descale,
+                        return_softmax_lse=use_cascade_attn,
+                        num_splits=self.num_splits,
+                        **kwargs,
+                    )
+                o = result
        else:
            # Do absorbed multi-latent attention
            kv_cache = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id).to(

--- a/python/sglang/srt/layers/attention/flashattention_interface.py
+++ b/python/sglang/srt/layers/attention/flashattention_interface.py
@@ -41,18 +41,18 @@ def flash_attn_with_kvcache(
    ver=3,
 ):
    return flash_attn_with_kvcache_interface(
-                q=q.contiguous().view(-1, max_seqlen_q, q.shape[-2], q.shape[-1]), 
-                k_cache=k_cache,                                            
-                v_cache=v_cache,                                               
-                block_table=page_table,                                     
-                cache_seqlens=cache_seqlens,                                  
-                softmax_scale=softmax_scale,                                   
-                causal=causal,                
-                window_size=window_size,                                     
-                softcap=softcap,                                
-                return_softmax_lse=return_softmax_lse,                           
-                num_splits=num_splits,                                   
-            )
+            q=q.contiguous().view(-1, max_seqlen_q, q.shape[-2], q.shape[-1]), 
+            k_cache=k_cache.view(q.dtype),                                            
+            v_cache=v_cache.view(q.dtype),                                               
+            block_table=page_table,                                     
+            cache_seqlens=cache_seqlens,                                  
+            softmax_scale=softmax_scale,                                   
+            causal=causal,                
+            window_size=window_size,                                     
+            softcap=softcap,                                
+            return_softmax_lse=return_softmax_lse,                           
+            num_splits=num_splits,                                   
+        )

 def flash_attn_varlen_func(
    q,
@@ -83,8 +83,8 @@ def flash_attn_varlen_func(
 ):
    return flash_attn_varlen_func_interface(
                q=q,
-                k=k,
-                v=v,
+                k=k.view(q.dtype),
+                v=v.view(q.dtype),
                cu_seqlens_q=cu_seqlens_q,
                cu_seqlens_k=cu_seqlens_k,
                max_seqlen_q=max_seqlen_q,
@@ -92,4 +92,5 @@ def flash_attn_varlen_func(
                softmax_scale=softmax_scale,
                causal=causal,
                return_attn_probs=return_softmax_lse,
+                softcap=softcap,
            )
\ No newline at end of file
--- a/python/sglang/srt/layers/attention/lightop_concat.py
+++ b/python/sglang/srt/layers/attention/lightop_concat.py
@@ -4,7 +4,7 @@ import warnings

 import torch

-from sglang.srt.utils import get_bool_env_var
+from sglang.srt.utils import get_bool_env_var, direct_register_custom_op

 _USE_OPT_CAT = get_bool_env_var("SGLANG_USE_OPT_CAT")

@@ -18,15 +18,50 @@ if _USE_OPT_CAT:
        )
 else:
    ds_cat = None
+    


-def concat_decode_opt(A:torch.Tensor, B:torch.Tensor, dim:int):
-    assert dim==2 , "tensor dim must be 3 and concat dim must be 2"
+
+
+# TODO: 单独注册有些问题
+def ds_cat_wrapper(A: torch.Tensor,
+                   B: torch.Tensor,
+                   dim: int,
+                   mode: int) -> torch.Tensor:
    output_shape = list(A.shape)
-    output_shape[dim] = A.shape[dim] + B.shape[dim]  
+    output_shape[dim] = A.shape[dim] + B.shape[dim]
    C = torch.empty(output_shape, device=A.device, dtype=A.dtype)
+    ds_cat(A, B, C, mode)
+    return C
+    
+def ds_cat_fake(A: torch.Tensor,
+                B: torch.Tensor,
+                dim: int,
+                mode: int) -> torch.Tensor:
+    # 使用标准cat作为fake实现
+    return torch.cat([A, B], dim=dim)
+
+direct_register_custom_op(
+    op_name="ds_cat",
+    op_func=ds_cat_wrapper,
+    mutates_args=[],  # 没有修改参数，只有返回值
+    fake_impl=ds_cat_fake
+)
+
+def concat_decode_opt(A: torch.Tensor, B: torch.Tensor, dim: int):
+    assert dim == 2, "tensor dim must be 3 and concat dim must be 2"
    mode = 0
-    if dim!=0 :
-        ds_cat( A, B, C, mode)
-        return C
-    assert False, "not support"
\ No newline at end of file
+    if dim != 0:
+        return torch.ops.sglang.ds_cat(A, B, dim, mode)
+    assert False, "not support"
+
+# def concat_decode_opt(A:torch.Tensor, B:torch.Tensor, dim:int):
+#     assert dim==2 , "tensor dim must be 3 and concat dim must be 2"
+#     output_shape = list(A.shape)
+#     output_shape[dim] = A.shape[dim] + B.shape[dim]  
+#     C = torch.empty(output_shape, device=A.device, dtype=A.dtype)
+#     mode=0
+#     if dim!=0 :
+#         ds_cat(A, B, C, mode)
+#         return C
+#     assert False, "not support"
--- a/python/sglang/srt/layers/moe/ep_moe/layer.py
+++ b/python/sglang/srt/layers/moe/ep_moe/layer.py
@@ -31,7 +31,7 @@ from sglang.srt.layers.quantization.fp8_kernel import (
 )
 from sglang.srt.layers.quantization.w4afp8 import W4AFp8Config, W4AFp8MoEMethod
 from sglang.srt.single_batch_overlap import DownGemmOverlapArgs
-from sglang.srt.utils import ceil_div, dispose_tensor, get_bool_env_var, is_hip, is_npu
+from sglang.srt.utils import ceil_div, dispose_tensor, get_bool_env_var, is_hip, is_npu, direct_register_custom_op
 from sglang.srt.utils.offloader import get_offloader

 if TYPE_CHECKING:
@@ -57,6 +57,105 @@ if _use_aiter:

 logger = logging.getLogger(__name__)

+#------ custom op for lightop
+def m_grouped_w4a8_gemm_nt_masked_wrapper(
+    a0: torch.Tensor, a1: torch.Tensor,
+    b0: torch.Tensor, b1: torch.Tensor,
+    d: torch.Tensor,
+    masked_m: torch.Tensor,
+    expected_m_per_group: int
+    ) -> torch.Tensor:
+    return m_grouped_w4a8_gemm_nt_masked(
+            (a0, a1),
+            (b0, b1),
+            d,
+            masked_m,
+            expected_m_per_group,
+            config={"MODE": 1000,}
+        )
+def m_grouped_w4a8_gemm_nt_masked_fake(
+    a0: torch.Tensor, a1: torch.Tensor,
+    b0: torch.Tensor, b1: torch.Tensor,
+    d: torch.Tensor,
+    masked_m: torch.Tensor,
+    expected_m_per_group: int
+    ) -> torch.Tensor:
+    return d
+def m_grouped_w8a8_gemm_nt_masked_wrapper(
+    a0: torch.Tensor, a1: torch.Tensor,
+    b0: torch.Tensor, b1: torch.Tensor,
+    d: torch.Tensor,
+    masked_m: torch.Tensor,
+    expected_m_per_group: int
+    ) -> torch.Tensor:
+    return m_grouped_w8a8_gemm_nt_masked(
+            (a0, a1),
+            (b0, b1),
+            d,
+            masked_m,
+            expected_m_per_group,
+            config={"MODE": 1000,}
+        )
+def m_grouped_w8a8_gemm_nt_masked_fake(
+    a0: torch.Tensor, a1: torch.Tensor,
+    b0: torch.Tensor, b1: torch.Tensor,
+    d: torch.Tensor,
+    masked_m: torch.Tensor,
+    expected_m_per_group: int
+    ) -> torch.Tensor:
+    return d
+
+def fuse_silu_mul_quant_ep_wrapper(
+    input: torch.Tensor,
+    tokens_per_expert: Optional[torch.Tensor] = None,
+    num_local_tokens_tensor: Optional[torch.Tensor] = None,
+    topk:int=1,
+    expect_m:int=-1) -> tuple[torch.Tensor, torch.Tensor]:
+    return fuse_silu_mul_quant_ep(
+        input,
+        tokens_per_expert,
+        num_local_tokens_tensor,
+        topk,
+        expect_m
+    )
+    
+    
+def fuse_silu_mul_quant_ep_fake(
+    input: torch.Tensor,
+    tokens_per_expert: Optional[torch.Tensor] = None,
+    num_local_tokens_tensor: Optional[torch.Tensor] = None,
+    topk:int=1,
+    expect_m:int=-1) -> tuple[torch.Tensor, torch.Tensor]:
+    E, T, H = input.shape
+    d = H // 2
+    output = torch.empty(E, T, d, dtype=torch.int8, device=input.device)
+    scales = torch.empty((E, T, 1),
+                        device=input.device,
+                        dtype=torch.float32)
+    return output, scales
+
+
+direct_register_custom_op(
+    op_name="m_grouped_w4a8_gemm_nt_masked",
+    op_func=m_grouped_w4a8_gemm_nt_masked_wrapper,
+    mutates_args=[],
+    fake_impl=m_grouped_w4a8_gemm_nt_masked_fake
+)
+direct_register_custom_op(
+    op_name="m_grouped_w8a8_gemm_nt_masked",
+    op_func=m_grouped_w8a8_gemm_nt_masked_wrapper,
+    mutates_args=[],
+    fake_impl=m_grouped_w8a8_gemm_nt_masked_fake
+)
+direct_register_custom_op(
+    op_name="fuse_silu_mul_quant_ep",
+    op_func=fuse_silu_mul_quant_ep_wrapper,
+    mutates_args=[],
+    fake_impl=fuse_silu_mul_quant_ep_fake
+)
+
+#------
+

 # TODO(kaixih@nvidia): ideally we should merge this logic into
 # `fill_gateup_input_triton_kernel` to directly generate e8m0 scale.
@@ -815,23 +914,23 @@ class DeepEPMoE(EPMoE):
        gateup_output = torch.empty((num_groups, m, n1), device=hidden_states.device, dtype=torch.bfloat16)

        # ---- first GEMM ----
-        m_grouped_w4a8_gemm_nt_masked(
-            (q_a1_all, q_a1_scale),
-            (w13_weight, w13_scales),
+        torch.ops.sglang.m_grouped_w4a8_gemm_nt_masked(
+            q_a1_all, q_a1_scale,
+            w13_weight, w13_scales,
            gateup_output,
            masked_m,
            expected_m,
        )

-        q_a2_all, q_a2_scale = fuse_silu_mul_quant_ep(gateup_output, masked_m)
+        q_a2_all, q_a2_scale = torch.ops.sglang.fuse_silu_mul_quant_ep(gateup_output, masked_m)

        # ---- second GEMM ----
        n2 = w2_scales.size(1)
        down_output = torch.empty((num_groups, m, n2), device=q_a2_all.device, dtype=torch.bfloat16)

-        m_grouped_w4a8_gemm_nt_masked(
-            (q_a2_all, q_a2_scale),
-            (w2_weight, w2_scales),
+        torch.ops.sglang.m_grouped_w4a8_gemm_nt_masked(
+            q_a2_all, q_a2_scale,
+            w2_weight, w2_scales,
            down_output,
            masked_m,
            expected_m,
@@ -865,23 +964,23 @@ class DeepEPMoE(EPMoE):
        gateup_output = torch.empty((num_groups, m, n1), device=hidden_states.device, dtype=torch.bfloat16)

        # ---- first GEMM ----
-        m_grouped_w8a8_gemm_nt_masked(
-            (q_a1_all, q_a1_scale),
-            (w13_weight, w13_scales),
+        torch.ops.sglang.m_grouped_w8a8_gemm_nt_masked(
+            q_a1_all, q_a1_scale,
+            w13_weight, w13_scales,
            gateup_output,
            masked_m,
            expected_m,
        )

-        q_a2_all, q_a2_scale = fuse_silu_mul_quant_ep(gateup_output, masked_m)
+        q_a2_all, q_a2_scale = torch.ops.sglang.fuse_silu_mul_quant_ep(gateup_output, masked_m)

        # ---- second GEMM ----
        n2 = w2_scales.size(1)
        down_output = torch.empty((num_groups, m, n2), device=q_a2_all.device, dtype=torch.bfloat16)

-        m_grouped_w8a8_gemm_nt_masked(
-            (q_a2_all, q_a2_scale),
-            (w2_weight, w2_scales),
+        torch.ops.sglang.m_grouped_w8a8_gemm_nt_masked(
+            q_a2_all, q_a2_scale,
+            w2_weight, w2_scales,
            down_output,
            masked_m,
            expected_m,

--- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe_marlin.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe_marlin.py
+      
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from __future__ import annotations
@@ -15,6 +16,7 @@ from sglang.srt.layers.quantization.base_config import FusedMoEMethodBase

 from sglang.srt.utils import set_weight_attrs
 from sglang.srt.layers.moe import MoeRunner, MoeRunnerBackend, MoeRunnerConfig
+from sglang.srt.layers.moe.utils import get_moe_a2a_backend
 try:
    from lmslim.layers.fused_moe.fuse_moe_int8_marlin import fused_experts_impl_int8_marlin
 except Exception:
@@ -77,7 +79,7 @@ class CompressedTensorsW8A8Int8MarlinMoEMethod(CompressedTensorsMarlinMoEMethod)
            "weights")
        self.input_quant = self.quant_config.target_scheme_map["Linear"].get(
            "input_activations")
-        self.use_deepep = True
+        self.use_deepep = get_moe_a2a_backend().is_deepep()
        per_channel = (
            self.weight_quant.strategy == QuantizationStrategy.CHANNEL
            and self.input_quant.strategy == QuantizationStrategy.TOKEN)

--- a/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
@@ -163,12 +163,14 @@ class CompressedTensorsW8A8Int8(CompressedTensorsScheme):
                )
                layer.register_parameter("input_zero_point", input_zero_point)

+    @torch._dynamo.disable()
    def apply_weights(
        self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor]
    ) -> torch.Tensor:
        # TODO: add cutlass_scaled_mm_azp support
        x_q, x_scale = per_token_quant_int8(x)

+        # TODO: fix with lmslim/lightop
        return quant_ops.triton_scaled_mm(
            x_q, layer.weight, x_scale, layer.weight_scale, out_dtype=x.dtype, bias=bias
        )
--- a/python/sglang/srt/layers/quantization/slimquant_w4a8.py
+++ b/python/sglang/srt/layers/quantization/slimquant_w4a8.py
@@ -157,7 +157,7 @@ class SlimQuantW4A8Int8LinearMethod(LinearMethodBase):
        )
        layer.register_parameter("weight_scale", weight_scale)

-    @torch._dynamo.disable()
+    @torch._dynamo.disable()  # TODO: 性能优化需要lmslim/lightop配合
    def apply(
        self,
        layer: torch.nn.Module,

--- a/python/sglang/srt/layers/quantization/slimquant_w4a8_marlin.py
+++ b/python/sglang/srt/layers/quantization/slimquant_w4a8_marlin.py
@@ -214,7 +214,7 @@ class SlimQuantW4A8Int8MarlinMoEMethod:
        self.moe_runner_config = moe_runner_config
        self.runner = MoeRunner(MoeRunnerBackend.TRITON, moe_runner_config)
        
-    @torch._dynamo.disable()
+    @torch._dynamo.disable()  # TODO: 性能优化需lmslim/lightop配合
    def apply(
        self,
        layer: torch.nn.Module,
@@ -253,6 +253,7 @@ class SlimQuantW4A8Int8MarlinMoEMethod:
        )
        return StandardCombineInput(hidden_states=output)

+    @torch._dynamo.disable()  # TODO: 性能优化需lmslim/lightop配合
    def apply_with_shared_output(
        self,
        layer: torch.nn.Module,

--- a/python/sglang/srt/managers/schedule_batch.py
+++ b/python/sglang/srt/managers/schedule_batch.py
@@ -1193,6 +1193,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
        self.seq_lens = seq_lens_tensor
        self.seq_lens_cpu = seq_lens_cpu
        self.extend_num_tokens = extend_num_tokens
+        self.loc_tensor = torch.tensor([-1], device=self.device) 

        # Allocate memory
        out_cache_loc, req_pool_indices_tensor, req_pool_indices = alloc_for_extend(

--- a/python/sglang/srt/mem_cache/common.py
+++ b/python/sglang/srt/mem_cache/common.py
@@ -361,7 +361,8 @@ def alloc_for_extend(
    else:
        # Paged allocation - build last_loc
        last_loc = [
-            (t[-1:] if len(t) > 0 else torch.tensor([-1], device=batch.device))
+            # (t[-1:] if len(t) > 0 else torch.tensor([-1], device=batch.device))
+            (t[-1:] if len(t) > 0 else batch.loc_tensor)
            for t in prefix_tensors
        ]
        out_cache_loc = alloc_paged_token_slots_extend(

--- a/python/sglang/srt/model_executor/forward_batch_info.py
+++ b/python/sglang/srt/model_executor/forward_batch_info.py
@@ -127,7 +127,7 @@ class ForwardMode(IntEnum):
        # For fixed shape logits output in v2 eagle worker
        return self == ForwardMode.DRAFT_EXTEND_V2

-    def is_extend_or_draft_extend_or_mixed(self):
+    def is_extend_or_draft_extend_or_mixed(self): #nhb
        return (
            self == ForwardMode.EXTEND
            or self == ForwardMode.DRAFT_EXTEND

--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -2241,6 +2241,7 @@ class ModelRunner:
            and self.graph_runner
            and self.graph_runner.can_run(forward_batch)
        )
+
        if can_run_graph:
            ret = self.graph_runner.replay(
                forward_batch,

--- a/python/sglang/srt/speculative/eagle_info.py
+++ b/python/sglang/srt/speculative/eagle_info.py
@@ -37,7 +37,8 @@ from sglang.srt.speculative.spec_utils import (
    get_src_tgt_cache_loc,
    get_target_cache_loc,
 )
-from sglang.srt.utils import is_cuda, is_hip, next_power_of_2
+from sglang.srt.utils import is_cuda, is_hip, next_power_of_2, get_bool_env_var
+from sgl_kernel.kvcacheio import dcu_create_extend_after_decode_spec_info

 if is_cuda():
    from sgl_kernel import (
@@ -615,6 +616,8 @@ class EagleDraftInput(SpecInput, EagleDraftInputV2Mixin):
    new_seq_lens: Optional[torch.Tensor] = None
    verify_done: Optional[torch.cuda.Event] = None

+    use_sglang_create_extend_after_decode_spec_info = get_bool_env_var("SGLANG_CREATE_EXTEND_AFTER_DECODE_SPEC_INFO")
+
    def __post_init__(self):
        super().__init__(SpecInputType.EAGLE_DRAFT)

@@ -679,14 +682,24 @@ class EagleDraftInput(SpecInput, EagleDraftInputV2Mixin):
        self.positions = torch.empty_like(batch.input_ids, dtype=torch.long)
        self.verified_id = torch.empty_like(self.accept_length, dtype=torch.int32)

-        create_extend_after_decode_spec_info[(len(batch.seq_lens),)](
-            batch.input_ids,
-            batch.seq_lens,
-            self.accept_length,
-            self.positions,
-            self.verified_id,
-            next_power_of_2(max(speculative_num_steps + 1, len(batch.seq_lens))),
-        )
+        if self.use_sglang_create_extend_after_decode_spec_info:
+            dcu_create_extend_after_decode_spec_info(
+                verified_id = batch.input_ids,
+                seq_lens = batch.seq_lens,
+                accept_lens = self.accept_length,
+                positions = self.positions,
+                new_verified_id = self.verified_id,
+                bs = max(speculative_num_steps + 1, len(batch.seq_lens)),
+            )
+        else:
+            create_extend_after_decode_spec_info[(len(batch.seq_lens),)](
+                batch.input_ids,
+                batch.seq_lens,
+                self.accept_length,
+                self.positions,
+                self.verified_id,
+                next_power_of_2(max(speculative_num_steps + 1, len(batch.seq_lens))),
+            )

    def generate_attn_arg_prefill(
        self,

--- a/sgl-kernel/csrc/common_extension_rocm.cc
+++ b/sgl-kernel/csrc/common_extension_rocm.cc
@@ -139,6 +139,8 @@ TORCH_LIBRARY_EXPAND(sgl_kernel, m) {
  /*
   * From csrc/kvcacheio
   */
+  m.def("dcu_create_extend_after_decode_spec_info(Tensor verified_id, Tensor seq_lens, Tensor accept_lens, Tensor positions, Tensor new_verified_id, int bs) -> ()");
+  m.impl("dcu_create_extend_after_decode_spec_info", torch::kCUDA, &dcu_create_extend_after_decode_spec_info);
  m.def("dcu_create_chunked_prefix_cache_kv_indices(Tensor req_to_token, Tensor req_pool_indices, Tensor chunk_starts, Tensor chunk_seq_lens, Tensor chunk_cu_seq_lens, Tensor chunk_kv_indices, int col_num, int bs) -> ()");
  m.impl("dcu_create_chunked_prefix_cache_kv_indices", torch::kCUDA, &dcu_create_chunked_prefix_cache_kv_indices);
  m.def("dcu_assign_extend_cache_locs(Tensor req_pool_indices, Tensor req_to_token, Tensor start_offset, Tensor end_offset, Tensor out_cache_loc, int pool_len, int bs) -> ()");
@@ -146,7 +148,7 @@ TORCH_LIBRARY_EXPAND(sgl_kernel, m) {
  m.def("dcu_get_last_loc(Tensor req_to_token, Tensor req_pool_indices, Tensor prefix_lens) -> Tensor");
  m.impl("dcu_get_last_loc", torch::kCUDA, &dcu_get_last_loc); 
  m.def("dcu_assign_req_to_token_pool(Tensor req_pool_indices_ptr,Tensor req_to_token_ptr,Tensor allocate_lens_ptr,Tensor new_allocate_lens,Tensor out_cache_loc_ptr,int shape,int bs) -> ()");
-  m.impl("dcu_assign_req_to_token_pool",torch::kCUDA,&dcu_assign_req_to_token_pool);   
+  m.impl("dcu_assign_req_to_token_pool",torch::kCUDA,&dcu_assign_req_to_token_pool);  


  m.def("dcu_alloc_extend_kernel(Tensor pre_lens_ptr, Tensor seq_lens_ptr, Tensor last_loc_ptr, Tensor free_page_ptr, Tensor out_indices, int bs, int page_size) -> ()");

--- a/sgl-kernel/csrc/kvcacheio/transfer.cu
+++ b/sgl-kernel/csrc/kvcacheio/transfer.cu
@@ -693,6 +693,65 @@ __global__ void launch_alloc_extend_kernel(
      out_indices[output_idx] = start_loc * page_size + offset;
    }
 }
+__global__ void launch_create_extend_after_decode_spec_info_int32_kernel(
+    const int32_t* verified_id_ptr,
+    const int64_t* seq_lens_ptr,
+    const int32_t* accept_lens_ptr,
+    int64_t* positions_ptr,
+    int32_t* new_verified_id_ptr,
+    int64_t bs) {
+    
+    int64_t pid = blockIdx.x * blockDim.x + threadIdx.x;
+    
+    if (pid >= bs) return;
+    
+    int64_t seq_length = seq_lens_ptr[pid];
+    int32_t accept_length = accept_lens_ptr[pid];
+
+    int32_t accept_len_cumsum = 0;
+    for (int32_t offset = 0; offset < pid; offset++) {
+        accept_len_cumsum += accept_lens_ptr[offset];
+    }
+
+    int64_t* positions_ptr1 = positions_ptr + accept_len_cumsum;
+    for (int32_t offset = 0; offset < accept_length && offset < bs; offset++) 
+    {
+      positions_ptr1[offset] = seq_length - accept_length + offset;
+    }
+
+    int32_t verified_idx = accept_len_cumsum + accept_length - 1;
+    new_verified_id_ptr[pid] = verified_id_ptr[verified_idx];
+}
+
+__global__ void launch_create_extend_after_decode_spec_info_int64_kernel(
+    const int32_t* verified_id_ptr,
+    const int64_t* seq_lens_ptr,
+    const int64_t* accept_lens_ptr,
+    int64_t* positions_ptr,
+    int32_t* new_verified_id_ptr,
+    int64_t bs) {
+    
+    int64_t pid = blockIdx.x * blockDim.x + threadIdx.x;
+    
+    if (pid >= bs) return;
+    
+    int64_t seq_length = seq_lens_ptr[pid];
+    int64_t accept_length = accept_lens_ptr[pid];
+
+    int64_t accept_len_cumsum = 0;
+    for (int64_t offset = 0; offset < pid; offset++) {
+        accept_len_cumsum += accept_lens_ptr[offset];
+    }
+
+    int64_t* positions_ptr1 = positions_ptr + accept_len_cumsum;
+    for (int64_t offset = 0; offset < accept_length && offset < bs; offset++) 
+    {
+      positions_ptr1[offset] = seq_length - accept_length + offset;
+    }
+
+    int64_t verified_idx = accept_len_cumsum + accept_length - 1;
+    new_verified_id_ptr[pid] = verified_id_ptr[verified_idx];
+}

 void dcu_alloc_decode_kernel(
  const at::Tensor seq_lens_ptr,   
@@ -714,6 +773,49 @@ void dcu_alloc_decode_kernel(
    C10_CUDA_KERNEL_LAUNCH_CHECK();
 }

+void dcu_create_extend_after_decode_spec_info(
+    const at::Tensor verified_id,
+    const at::Tensor seq_lens,
+    const at::Tensor accept_lens,
+    at::Tensor positions,
+    at::Tensor new_verified_id,
+    int64_t bs) {
+
+    const int32_t* verified_id_ptr;
+    const int64_t* seq_lens_ptr;
+    const int32_t* accept_lens_ptr_int32;
+    const int64_t* accept_lens_ptr_int64;
+    int64_t* positions_ptr;
+    int32_t* new_verified_id_ptr;
+
+    int64_t block_size = 64;
+    int64_t grid_size = (bs + block_size - 1) / block_size;
+    cudaStream_t torch_current_stream = at::cuda::getCurrentCUDAStream();
+
+    if (accept_lens.dtype() == torch::kInt32)
+    {
+      verified_id_ptr       = static_cast<const int32_t*>(verified_id.data_ptr());
+      seq_lens_ptr          = static_cast<const int64_t*>(seq_lens.data_ptr());
+      accept_lens_ptr_int32 = static_cast<const int32_t*>(accept_lens.data_ptr());
+      positions_ptr         = static_cast<int64_t*>(positions.data_ptr());
+      new_verified_id_ptr   = static_cast<int32_t*>(new_verified_id.data_ptr());
+
+      launch_create_extend_after_decode_spec_info_int32_kernel<<<grid_size, block_size, 0, torch_current_stream>>>(verified_id_ptr, seq_lens_ptr, accept_lens_ptr_int32, positions_ptr, new_verified_id_ptr, bs);
+      C10_CUDA_KERNEL_LAUNCH_CHECK();
+    }
+    else
+    {
+      verified_id_ptr       = static_cast<const int32_t*>(verified_id.data_ptr());
+      seq_lens_ptr          = static_cast<const int64_t*>(seq_lens.data_ptr());
+      accept_lens_ptr_int64 = static_cast<const int64_t*>(accept_lens.data_ptr());
+      positions_ptr         = static_cast<int64_t*>(positions.data_ptr());
+      new_verified_id_ptr   = static_cast<int32_t*>(new_verified_id.data_ptr());
+
+      launch_create_extend_after_decode_spec_info_int64_kernel<<<grid_size, block_size, 0, torch_current_stream>>>(verified_id_ptr, seq_lens_ptr, accept_lens_ptr_int64, positions_ptr, new_verified_id_ptr, bs);
+      C10_CUDA_KERNEL_LAUNCH_CHECK();
+    }
+};
+
 void dcu_alloc_extend_kernel(
    const at::Tensor pre_lens_ptr,
    const at::Tensor seq_lens_ptr,

--- a/sgl-kernel/include/sgl_kernel_ops.h
+++ b/sgl-kernel/include/sgl_kernel_ops.h
@@ -538,6 +538,13 @@ void segment_packbits(
 /*
 * From csrc/kvcacheio
 */
+void dcu_create_extend_after_decode_spec_info(
+    const at::Tensor verified_id,
+    const at::Tensor seq_lens,
+    const at::Tensor accept_lens,
+    at::Tensor positions,
+    at::Tensor new_verified_id,
+    int64_t bs);
 void dcu_create_chunked_prefix_cache_kv_indices(
    at::Tensor req_to_token,
    const at::Tensor req_pool_indices,

--- a/sgl-kernel/python/sgl_kernel/kvcacheio.py
+++ b/sgl-kernel/python/sgl_kernel/kvcacheio.py
@@ -9,6 +9,22 @@ def is_hip() -> bool:

 _is_hip = is_hip()

+def dcu_create_extend_after_decode_spec_info(
+    verified_id: torch.Tensor,
+    seq_lens: torch.Tensor,
+    accept_lens: torch.Tensor,
+    positions: torch.Tensor,
+    new_verified_id: torch.Tensor,
+    bs: int,
+):
+    torch.ops.sgl_kernel.dcu_create_extend_after_decode_spec_info(
+        verified_id,
+        seq_lens,
+        accept_lens,
+        positions,
+        new_verified_id,
+        bs,
+    )

 def dcu_alloc_extend_kernel(
    pre_lens_ptr: torch.Tensor,