Unverified commit 5e89b335 authored by hoshi-hiyouga, committed by GitHub

Fix Gemma2 4d attention mask (#31674)



Update modeling_gemma2.py
Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
parent 0142aab7
@@ -629,11 +629,13 @@ class Gemma2DecoderLayer(nn.Module):
         if (
             self.config._attn_implementation != "flash_attention_2" and self.is_sliding and attention_mask is not None
         ):  # efficient SDPA and no padding
-            attention_mask = attention_mask * torch.tril(
-                torch.ones_like(attention_mask), diagonal=-self.sliding_window
-            )
-            if attention_mask.shape[1] <= 1:  # when decoding
-                attention_mask = attention_mask[:, -self.sliding_window :]
+            min_dtype = torch.finfo(hidden_states.dtype).min
+            sliding_window_mask = torch.tril(
+                torch.ones_like(attention_mask, dtype=torch.bool), diagonal=-self.sliding_window
+            )
+            attention_mask = torch.where(sliding_window_mask, min_dtype, attention_mask)
+            if attention_mask.shape[-1] <= 1:  # when decoding
+                attention_mask = attention_mask[:, :, :, -self.sliding_window :]

         residual = hidden_states
...
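For context, the sketch below is not part of the commit; it illustrates on a toy 4D additive mask why the old elementwise multiplication destroyed the mask while torch.where applies the sliding window correctly. The tensor shapes, dtype, and sliding_window value are made up for illustration.

# Minimal sketch (not from the commit). A 4D additive mask holds 0.0 where
# attention is allowed and dtype-min where it is blocked; multiplying such a
# mask by a 0/1 tril zeroes the dtype-min entries instead of masking anything.
import torch

batch, heads, q_len, kv_len = 1, 1, 6, 6
sliding_window = 3
dtype = torch.float32
min_dtype = torch.finfo(dtype).min

# Toy 4D additive causal mask: 0.0 = allowed, min_dtype = blocked.
causal = torch.triu(torch.ones(q_len, kv_len, dtype=torch.bool), diagonal=1)
attention_mask = torch.zeros(batch, heads, q_len, kv_len, dtype=dtype)
attention_mask.masked_fill_(causal, min_dtype)

# Old behaviour: the product is all zeros, so every position (including
# future tokens) becomes attendable.
old = attention_mask * torch.tril(
    torch.ones_like(attention_mask), diagonal=-sliding_window
)

# New behaviour: positions more than sliding_window behind the query are
# forced to min_dtype; the rest of the additive mask is left untouched.
sliding_window_mask = torch.tril(
    torch.ones_like(attention_mask, dtype=torch.bool), diagonal=-sliding_window
)
new = torch.where(sliding_window_mask, min_dtype, attention_mask)

print((old == 0).all())                        # tensor(True): causal mask lost
print((new[0, 0, -1, :3] == min_dtype).all())  # tokens outside the window masked

The commit also fixes the decode-time branch to check the key/value dimension (shape[-1]) and to slice the last axis of the 4D mask ([:, :, :, -sliding_window:]).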