Only K100_AI can use the CUTLASS prefix cache

14945681 · zhuwenwen · bbd14169 · 14945681
Commit 14945681 authored Jun 21, 2025 by zhuwenwen
Show whitespace changes
Inline Side-by-side

Showing 1 changed file with 1 addition and 1 deletion

vllm/attention/backends/rocm_flash_attn.py +1 -1

No files found.
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -857,7 +857,7 @@ class ROCmFlashAttentionImpl(AttentionImpl):
            else:
                # prefix-enabled attention -
                # not applicable for encoder-only models
-                if envs.VLLM_USE_TRITON_PREFIX_FLASH_ATTN or gpuname.startswith('BW'):   
+                if envs.VLLM_USE_TRITON_PREFIX_FLASH_ATTN or (not gpuname.startswith('K100_AI')):   
                    version_key = triton_key()
                    if self.attn_type != AttentionType.ENCODER_ONLY:
                        output[:num_prefill_tokens] = paged_attn.forward_prefix(