[ROCm] Env variable to trigger custom PA (#15557)

Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>

[ROCm] Env variable to trigger custom PA (#15557)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
ecff8309 · Gregory Shtrasberg · GitHub · dcf2a590 · ecff8309 · ecff8309
Unverified commit ecff8309, authored Mar 27, 2025 by Gregory Shtrasberg; committed by GitHub Mar 26, 2025
Show whitespace changes
Inline Side-by-side

Showing 2 changed files with 8 additions and 1 deletion

vllm/attention/backends/rocm_flash_attn.py +2 -1

vllm/envs.py +6 -0

No files found.
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -908,4 +908,5 @@ def _use_rocm_custom_paged_attention(qtype: torch.dtype, head_size: int,
            and (qtype == torch.half or qtype == torch.bfloat16)
            and (head_size == 64 or head_size == 128)
            and (block_size == 16 or block_size == 32)
-            and (gqa_ratio >= 1 and gqa_ratio <= 16) and max_seq_len <= 32768)
+            and (gqa_ratio >= 1 and gqa_ratio <= 16) and max_seq_len <= 32768
+            and envs.VLLM_ROCM_CUSTOM_PAGED_ATTN)
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -78,6 +78,7 @@ if TYPE_CHECKING:
    VLLM_ROCM_USE_AITER_RMSNORM: bool = True
    VLLM_ROCM_FP8_PADDING: bool = True
    VLLM_ROCM_MOE_PADDING: bool = True
+    VLLM_ROCM_CUSTOM_PAGED_ATTN: bool = True
    VLLM_ENABLE_V1_MULTIPROCESSING: bool = True
    VLLM_LOG_BATCHSIZE_INTERVAL: float = -1
    VLLM_DISABLE_COMPILE_CACHE: bool = False
@@ -541,6 +542,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_ROCM_MOE_PADDING":
    lambda: bool(int(os.getenv("VLLM_ROCM_MOE_PADDING", "1"))),

+    # custom paged attention kernel for MI3* cards
+    "VLLM_ROCM_CUSTOM_PAGED_ATTN":
+    lambda: (os.getenv("VLLM_ROCM_CUSTOM_PAGED_ATTN", "True").lower() in
+             ("true", "1")),
+
    # Divisor for dynamic query scale factor calculation for FP8 KV Cache
    "Q_SCALE_CONSTANT":
    lambda: int(os.getenv("Q_SCALE_CONSTANT", "200")),