vdim pad 32

cf28e5a4 · zhuwenwen · 9c3190d0 · cf28e5a4
Commit cf28e5a4 authored Apr 15, 2025 by zhuwenwen
Hide whitespace changes
Inline Side-by-side

Showing with 8 additions and 18 deletions

vllm/attention/backends/mla/utils.py vllm/attention/backends/mla/utils.py +8 -18

No files found.
--- a/vllm/attention/backends/mla/utils.py
+++ b/vllm/attention/backends/mla/utils.py
@@ -533,29 +533,16 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]):
        # For MLA the v head dim is smaller than the qk head dim, so we pad
        # v with zeros to match the qk head dim
-        v_padded = torch.nn.functional.pad(v, [0, q.shape[-1] - v.shape[-1]],
+        # v_padded = torch.nn.functional.pad(v, [0, q.shape[-1] - v.shape[-1]],
+        #                                    value=0)
+        v_padded = torch.nn.functional.pad(v, [0, (q.shape[-1] - v.shape[-1] -32)],
                                           value=0)
+        v_tmp = v_padded[..., :-32].reshape(v.shape[0], v.shape[1],v.shape[2])
-        # if torch.cuda.get_device_properties(torch.cuda.current_device()).multi_processor_count == 120:
-        #     attn_output = flash_attn_varlen_func(
-        #         q=q,
-        #         k=k,
-        #         v=v_padded,
-        #         cu_seqlens_q=seq_start_loc,
-        #         cu_seqlens_k=seq_start_loc,
-        #         max_seqlen_q=max_prefill_seq_len,
-        #         max_seqlen_k=max_prefill_seq_len,
-        #         softmax_scale=self.scale,
-        #         causal=True,
-        #     )
-        #     attn_output = attn_output\
-        #         .view(-1, self.num_heads, q.shape[-1])[..., :v.shape[-1]]\
-        #             .reshape(-1, self.num_heads * v.shape[-1])
-        # else:
        attn_output = flash_attn_varlen_func(
            q=q,
            k=k,
-            v=v,
+            v=v_tmp if torch.cuda.get_device_properties(torch.cuda.current_device()).multi_processor_count == 120 else v,
            cu_seqlens_q=seq_start_loc,
            cu_seqlens_k=seq_start_loc,
            max_seqlen_q=max_prefill_seq_len,
@@ -563,6 +550,9 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]):
            softmax_scale=self.scale,
            causal=True,
        )
+        # output = output\
+        #     .view(-1, self.num_heads, q.shape[-1])[..., :v.shape[-1]]\
+        #         .reshape(-1, self.num_heads * v.shape[-1])
        attn_output = attn_output\
                .reshape(-1, self.num_heads * v.shape[-1])