Update _forward_encoder_attention interface: call vllm_flash_attn_varlen_func on ROCm, flash_attn_varlen_func otherwise

8b1077ba · zhuwenwen · 98f111f9 · 8b1077ba
Commit 8b1077ba authored Jan 05, 2026 by zhuwenwen
Show whitespace changes
Inline Side-by-side

Showing with 41 additions and 19 deletions

vllm/v1/attention/backends/flash_attn.py vllm/v1/attention/backends/flash_attn.py +41 -19

No files found.
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -736,6 +736,7 @@ class FlashAttentionImpl(AttentionImpl):
            self.num_kv_heads)

        # Call flash attention directly on Q, K, V tensors
+        if not current_platform.is_rocm():
            flash_attn_varlen_func(
                q=query,
                k=key,
@@ -755,6 +756,27 @@ class FlashAttentionImpl(AttentionImpl):
                k_descale=layer._k_scale.expand(descale_shape),
                v_descale=layer._v_scale.expand(descale_shape),
            )
+        else:
+            vllm_flash_attn_varlen_func(
+                q=query,
+                k=key,
+                v=value,
+                out=output,
+                cu_seqlens_q=cu_seqlens_q,
+                cu_seqlens_k=cu_seqlens_k,
+                max_seqlen_q=max_seqlen_q,
+                max_seqlen_k=max_seqlen_k,
+                softmax_scale=self.scale,
+                causal=False,  # Encoder attention is bidirectional
+                alibi_slopes=self.alibi_slopes,
+                window_size=self.sliding_window,
+                softcap=self.logits_soft_cap,
+                # fa_version=self.vllm_flash_attn_version,
+                # q_descale=layer._q_scale.expand(descale_shape),
+                # k_descale=layer._k_scale.expand(descale_shape),
+                # v_descale=layer._v_scale.expand(descale_shape),
+                is_prefix_cache=True,
+            )

        return output