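Update use_rocm_custom_paged_attention: restore the call in rocm_flash_attn.py and make the helper in rocm.py return False unconditionally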

df877aad · zhuwenwen · 1e302221 · df877aad · df877aad
Commit df877aad authored May 28, 2025 by zhuwenwen
Hide whitespace changes
Inline Side-by-side

Showing with 13 additions and 13 deletions

vllm/attention/backends/rocm_flash_attn.py vllm/attention/backends/rocm_flash_attn.py +3 -4

vllm/platforms/rocm.py vllm/platforms/rocm.py +10 -9

No files found.
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -922,10 +922,9 @@ class ROCmFlashAttentionImpl(AttentionImpl):
            num_seqs, num_heads, head_size = decode_query.shape
            block_size = value_cache.shape[3]
            gqa_ratio = num_heads // self.num_kv_heads
-            # use_custom = use_rocm_custom_paged_attention(
-            #     decode_query.dtype, head_size, block_size, gqa_ratio,
-            #     decode_meta.max_decode_seq_len, self.sliding_window)
-            use_custom = False
+            use_custom = use_rocm_custom_paged_attention(
+                decode_query.dtype, head_size, block_size, gqa_ratio,
+                decode_meta.max_decode_seq_len, self.sliding_window)
            if use_custom:
                max_seq_len = (decode_meta.max_decode_seq_len if self.attn_type
                               != AttentionType.ENCODER_DECODER else

--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -110,15 +110,16 @@ def use_rocm_custom_paged_attention(qtype: torch.dtype, head_size: int,
    # ROCm custom paged attention is not supported on gfx1*
    # custom paged attn always supported on V0. On V1, requires sliding window
    # disabled due to observed numerical discrepancy.
-    return (on_mi250_mi300() and (not envs.VLLM_USE_V1 or sliding_window == 0
-                                  or sliding_window == (-1, -1))
-            and (qtype == torch.half or qtype == torch.bfloat16)
-            and (head_size == 64 or head_size == 128)
-            and (block_size == 16 or block_size == 32)
-            and (gqa_ratio >= 1 and gqa_ratio <= 16) and max_seq_len <= 32768
-            and (envs.VLLM_ROCM_CUSTOM_PAGED_ATTN)
-            and not (envs.VLLM_ROCM_USE_AITER_PAGED_ATTN
-                     and envs.VLLM_ROCM_USE_AITER))
+    return False
+    # return (on_mi250_mi300() and (not envs.VLLM_USE_V1 or sliding_window == 0
+    #                               or sliding_window == (-1, -1))
+    #         and (qtype == torch.half or qtype == torch.bfloat16)
+    #         and (head_size == 64 or head_size == 128)
+    #         and (block_size == 16 or block_size == 32)
+    #         and (gqa_ratio >= 1 and gqa_ratio <= 16) and max_seq_len <= 32768
+    #         and (envs.VLLM_ROCM_CUSTOM_PAGED_ATTN)
+    #         and not (envs.VLLM_ROCM_USE_AITER_PAGED_ATTN
+    #                  and envs.VLLM_ROCM_USE_AITER))


 class RocmPlatform(Platform):