Update the prefix-cache interface and the V head-dim padding strategy

4564b2f5 · zhuwenwen · b99b5676 · 4564b2f5 · 4564b2f5
Commit 4564b2f5 authored Apr 14, 2025 by zhuwenwen
Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 33 additions and 18 deletions

vllm/attention/backends/mla/utils.py vllm/attention/backends/mla/utils.py +32 -18

vllm/attention/backends/rocm_flash_attn.py vllm/attention/backends/rocm_flash_attn.py +1 -0

No files found.
--- a/vllm/attention/backends/mla/utils.py
+++ b/vllm/attention/backends/mla/utils.py
@@ -535,23 +535,37 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]):
        # v with 0s to match the qk head dim
        # v_padded = torch.nn.functional.pad(v, [0, q.shape[-1] - v.shape[-1]],
        #                                    value=0)
+        v_padded = torch.nn.functional.pad(v, [0, (q.shape[-1] - v.shape[-1]-32)],
-        attn_output = flash_attn_varlen_func(
+                                           value=0)
-            q=q,
-            k=k,
+        if torch.cuda.get_device_properties(torch.cuda.current_device()).multi_processor_count == 120:
-            # v=v_padded,
+            attn_output = flash_attn_varlen_func(
-            v=v,
+                q=q,
-            cu_seqlens_q=seq_start_loc,
+                k=k,
-            cu_seqlens_k=seq_start_loc,
+                v=v_padded,
-            max_seqlen_q=max_prefill_seq_len,
+                cu_seqlens_q=seq_start_loc,
-            max_seqlen_k=max_prefill_seq_len,
+                cu_seqlens_k=seq_start_loc,
-            softmax_scale=self.scale,
+                max_seqlen_q=max_prefill_seq_len,
-            causal=True,
+                max_seqlen_k=max_prefill_seq_len,
-        )
+                softmax_scale=self.scale,
-        # attn_output = attn_output\
+                causal=True,
-        #     .view(-1, self.num_heads, q.shape[-1])[..., :v.shape[-1]]\
+            )
-        #         .reshape(-1, self.num_heads * v.shape[-1])
+            attn_output = attn_output\
-        attn_output = attn_output\
+                .view(-1, self.num_heads, q.shape[-1])[..., :v.shape[-1]]\
-                .reshape(-1, self.num_heads * v.shape[-1])
+                    .reshape(-1, self.num_heads * v.shape[-1])
+        else:
+            attn_output = flash_attn_varlen_func(
+                q=q,
+                k=k,
+                v=v,
+                cu_seqlens_q=seq_start_loc,
+                cu_seqlens_k=seq_start_loc,
+                max_seqlen_q=max_prefill_seq_len,
+                max_seqlen_k=max_prefill_seq_len,
+                softmax_scale=self.scale,
+                causal=True,
+            )
+            attn_output = attn_output\
+                    .reshape(-1, self.num_heads * v.shape[-1])
        return self.o_proj(attn_output)[0]
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -790,6 +790,7 @@ class ROCmFlashAttentionImpl(AttentionImpl):
                               prefill_meta.block_tables,
                               prefill_meta.query_start_loc,
                               prefill_meta.seq_lens_tensor,
+                               prefill_meta.context_lens_tensor,
                               prefill_meta.max_query_len,
                               self.alibi_slopes,
                               self.sliding_window[0],