update the conditions for pad_v on v0

1f9aadcf · zhuwenwen · 5f09420d · 1f9aadcf
Commit 1f9aadcf authored Sep 10, 2025 by zhuwenwen
Show whitespace changes
Inline Side-by-side

Showing with 6 additions and 3 deletions

vllm/attention/backends/mla/common.py +6 -3

No files found.
--- a/vllm/attention/backends/mla/common.py
+++ b/vllm/attention/backends/mla/common.py
@@ -957,9 +957,12 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]):
        # v with 0s to match the qk head dim for attention backends that do
        # not support different headdims
        # We don't need to pad V if we are on a hopper system with FA3
-        self._pad_v = self.vllm_flash_attn_version is None or torch.cuda.get_device_properties(torch.cuda.current_device()).multi_processor_count == 120 or not (
+        if not current_platform.is_rocm():
+            self._pad_v = self.vllm_flash_attn_version is None or not (
                self.vllm_flash_attn_version == 3
                and current_platform.get_device_capability()[0] == 9)
+        else:
+            self._pad_v = torch.cuda.get_device_properties(torch.cuda.current_device()).multi_processor_count == 120

    def _flash_attn_varlen_diff_headdims(self, q, k, v, softmax_scale,
                                         return_softmax_lse, **kwargs):