Commit 912788c0 (unverified), authored by Chang Su, committed by GitHub
Browse files

perf: optimize local_block_table memory allocation (#6273)

parent 0f75b907
@@ -1165,7 +1165,6 @@ class FlashAttentionBackend(AttentionBackend):
 max_virtual_batches = max_bs * (
 (max_seq_len + attn_chunk_size - 1) // attn_chunk_size
 )
-max_blocks_per_seq = (max_seq_len + attn_chunk_size - 1) // attn_chunk_size
 max_pages_per_block = (attn_chunk_size + page_size - 1) // page_size
 self.decode_cuda_graph_local_attn_metadata = {
@@ -1177,7 +1176,7 @@ class FlashAttentionBackend(AttentionBackend):
 ),
 "local_block_table": torch.zeros(
 max_virtual_batches,
-max_blocks_per_seq * max_pages_per_block,
+max_pages_per_block,
 dtype=torch.int32,
 device=self.device,
 ),
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment