Optimize a pad operation to save 25 us (#5945)

6fc17596 · Stefan He · GitHub · ad506a4e · 6fc17596
Unverified Commit 6fc17596 authored May 01, 2025 by Stefan He Committed by GitHub May 01, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 3 additions and 2 deletions

python/sglang/srt/layers/attention/flashattention_backend.py python/sglang/srt/layers/attention/flashattention_backend.py +3 -2

No files found.
--- a/python/sglang/srt/layers/attention/flashattention_backend.py
+++ b/python/sglang/srt/layers/attention/flashattention_backend.py
@@ -1587,8 +1587,9 @@ class FlashAttentionBackend(AttentionBackend):
                metadata.max_seq_len_k = max_len
                metadata.cache_seqlens_int32 = seq_lens.to(torch.int32)
-                metadata.cu_seqlens_k = torch.nn.functional.pad(
-                    torch.cumsum(seq_lens, dim=0, dtype=torch.int32), (1, 0)
+                # Optimize cumulative sequence length calculation
+                metadata.cu_seqlens_k[1:].copy_(
+                    torch.cumsum(seq_lens, dim=0, dtype=torch.int32)
                )
                max_seq_pages = (