Replace pad with cat for better performance (#11388)

Co-authored-by: luoyuan.luo <luoyuan.luo@antgroup.com>

Replace pad with cat for better performance (#11388)
Co-authored-by: luoyuan.luo <luoyuan.luo@antgroup.com>
b5044fbf · Yuan Luo · GitHub · 70fbb3ad · b5044fbf · b5044fbf
Unverified Commit b5044fbf authored Oct 10, 2025 by Yuan Luo Committed by GitHub Oct 10, 2025
5 changed files
--- a/python/sglang/srt/models/dots_vlm_vit.py
+++ b/python/sglang/srt/models/dots_vlm_vit.py
@@ -323,7 +323,7 @@ class DotsVisionTransformer(PreTrainedModel):
            dim=0,
            dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
        )
-        cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
+        cu_seqlens = torch.cat([cu_seqlens.new_zeros(1), cu_seqlens])
        for blk in self.blocks:
            hidden_states = blk(

--- a/python/sglang/srt/models/glm4v.py
+++ b/python/sglang/srt/models/glm4v.py
@@ -434,7 +434,7 @@ class Glm4vVisionModel(nn.Module):
        cu_seqlens = torch.repeat_interleave(
            grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]
        ).cumsum(dim=0, dtype=torch.int32)
-        cu_seqlens = F.pad(cu_seqlens, (1, 0), "constant", 0)
+        cu_seqlens = torch.cat([cu_seqlens.new_zeros(1), cu_seqlens])
        seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
        x = self.embeddings(

--- a/python/sglang/srt/models/qwen2_5_vl.py
+++ b/python/sglang/srt/models/qwen2_5_vl.py
@@ -436,7 +436,7 @@ class Qwen2_5_VisionTransformer(nn.Module):
                .to(device=x.device, dtype=torch.int32),
            ]
        )
-        cu_seqlens = F.pad(cu_seqlens, (1, 0), "constant", 0)
+        cu_seqlens = torch.cat([cu_seqlens.new_zeros(1), cu_seqlens])
        # transformers
        x = x.unsqueeze(1)

--- a/python/sglang/srt/models/qwen2_vl.py
+++ b/python/sglang/srt/models/qwen2_vl.py
@@ -407,7 +407,7 @@ class Qwen2VisionTransformer(nn.Module):
        cu_seqlens = torch.repeat_interleave(
            grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]
        ).cumsum(dim=0, dtype=torch.int32)
-        cu_seqlens = F.pad(cu_seqlens, (1, 0), "constant", 0)
+        cu_seqlens = torch.cat([cu_seqlens.new_zeros(1), cu_seqlens])
        # transformers
        x = x.unsqueeze(1)

--- a/python/sglang/srt/models/qwen3_vl.py
+++ b/python/sglang/srt/models/qwen3_vl.py
@@ -458,7 +458,7 @@ class Qwen3_VisionTransformer(nn.Module):
                (grid_thw[:, 0] * grid_thw[:, 1] * grid_thw[:, 2]).cumsum(dim=0),
            ]
        )
-        cu_seqlens = F.pad(cu_seqlens, (1, 0), "constant", 0)
+        cu_seqlens = torch.cat([cu_seqlens.new_zeros(1), cu_seqlens])
        # max_seqlen, seqlens = self.compute_attn_mask_seqlen(cu_seqlens)
        x = x.unsqueeze(1)