perf: optimize local_block_table memory allocation (#6273)

912788c0 · Chang Su · GitHub · 0f75b907 · 912788c0
Unverified commit 912788c0 — authored by Chang Su on May 13, 2025; committed by GitHub on May 13, 2025.
Hide whitespace changes
Inline Side-by-side

Showing with 1 addition and 2 deletions

python/sglang/srt/layers/attention/flashattention_backend.py (+1 -2)

No files found.
--- a/python/sglang/srt/layers/attention/flashattention_backend.py
+++ b/python/sglang/srt/layers/attention/flashattention_backend.py
@@ -1165,7 +1165,6 @@ class FlashAttentionBackend(AttentionBackend):
            max_virtual_batches = max_bs * (
                (max_seq_len + attn_chunk_size - 1) // attn_chunk_size
            )
-            max_blocks_per_seq = (max_seq_len + attn_chunk_size - 1) // attn_chunk_size
            max_pages_per_block = (attn_chunk_size + page_size - 1) // page_size

            self.decode_cuda_graph_local_attn_metadata = {
@@ -1177,7 +1176,7 @@ class FlashAttentionBackend(AttentionBackend):
                ),
                "local_block_table": torch.zeros(
                    max_virtual_batches,
-                    max_blocks_per_seq * max_pages_per_block,
+                    max_pages_per_block,
                    dtype=torch.int32,
                    device=self.device,
                ),