add softcap interface for gemma-2

f9b567df · zhuwenwen · 6dc7aa42 · f9b567df
Commit f9b567df authored Jan 02, 2025 by zhuwenwen
Show whitespace changes
Inline Side-by-side

Showing with 8 additions and 0 deletions

vllm/attention/backends/rocm_flash_attn.py vllm/attention/backends/rocm_flash_attn.py +8 -0

No files found.
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -350,10 +350,17 @@ class ROCmFlashAttentionImpl(AttentionImpl):
        if blocksparse_params is not None:
            raise ValueError(
                "ROCmFlashAttention does not support blocksparse attention.")
+        '''
        if logits_soft_cap is not None:
            raise ValueError(
                "ROCmFlashAttention does not support attention logits soft "
                "capping.")
+        '''
+        if logits_soft_cap is None:
+            # In flash-attn, setting logits_soft_cap to 0 means no soft cap.
+            logits_soft_cap = 0
+        self.logits_soft_cap = logits_soft_cap
+
        self.num_heads = num_heads
        self.head_size = head_size
        self.scale = float(scale)
@@ -566,6 +573,7 @@ class ROCmFlashAttentionImpl(AttentionImpl):
                        causal=True,
                        window_size=self.sliding_window,
                        alibi_slopes=self.alibi_slopes,
+                        softcap=self.logits_soft_cap,
                    )

                # common code for prefill