Fix: MI100 Support By Bypassing Custom Paged Attention (#9560)

55137e8e · ErkinSagiroglu · GitHub · 5cbdccd1 · 55137e8e
Unverified Commit 55137e8e authored Oct 26, 2024 by ErkinSagiroglu Committed by GitHub Oct 26, 2024
Show whitespace changes
Inline Side-by-side

Showing with 6 additions and 2 deletions

vllm/attention/backends/rocm_flash_attn.py vllm/attention/backends/rocm_flash_attn.py +6 -2

No files found.
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -21,7 +21,10 @@ if TYPE_CHECKING:
 logger = init_logger(__name__)
 _PARTITION_SIZE_ROCM = 512
-_ON_NAVI = "gfx1" in torch.cuda.get_device_properties("cuda").gcnArchName
+_GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName
+_ON_NAVI = "gfx1" in _GPU_ARCH
+_ON_MI250_MI300 = any(arch in _GPU_ARCH
+                      for arch in ["gfx90a", "gfx940", "gfx941", "gfx942"])
 class ROCmFlashAttentionBackend(AttentionBackend):
@@ -662,7 +665,8 @@ def _use_rocm_custom_paged_attention(qtype: torch.dtype, head_size: int,
                                     block_size: int, gqa_ratio: int,
                                     max_seq_len: int) -> bool:
    # rocm custom page attention not support on navi (gfx1*)
-    return (not _ON_NAVI and (qtype == torch.half or qtype == torch.bfloat16)
+    return (_ON_MI250_MI300 and not _ON_NAVI
+            and (qtype == torch.half or qtype == torch.bfloat16)
            and (head_size == 64 or head_size == 128)
            and (block_size == 16 or block_size == 32)
            and (gqa_ratio >= 1 and gqa_ratio <= 16) and max_seq_len <= 32768)