Commit 5f09420d authored by zhuwenwen
Browse files

update the conditions for pad_v

parent e005bce5
...@@ -920,10 +920,12 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): ...@@ -920,10 +920,12 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
# v with 0s to match the qk head dim for attention backends that do # v with 0s to match the qk head dim for attention backends that do
# not support different headdims # not support different headdims
# We don't need to pad V if we are on a hopper system with FA3 # We don't need to pad V if we are on a hopper system with FA3
if not current_platform.is_rocm():
self._pad_v = self.vllm_flash_attn_version is None or not ( self._pad_v = self.vllm_flash_attn_version is None or not (
self.vllm_flash_attn_version == 3 self.vllm_flash_attn_version == 3
and current_platform.get_device_capability()[0] == 9 and current_platform.get_device_capability()[0] == 9)
and torch.cuda.get_device_properties(torch.cuda.current_device()).multi_processor_count == 120) else:
self._pad_v = torch.cuda.get_device_properties(torch.cuda.current_device()).multi_processor_count == 120
def _flash_attn_varlen_diff_headdims(self, def _flash_attn_varlen_diff_headdims(self,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment