Unverified Commit 838d141f authored by Raushan Turganbay, committed by GitHub

Gemma2: fix FA2 generation (#32553)

fix FA2: skip the HybridCache 2D-to-4D attention-mask conversion when `config._attn_implementation == "flash_attention_2"`, since FlashAttention-2 consumes the raw 2D padding mask directly
parent 85817d98
@@ -1093,7 +1093,11 @@ class Gemma2ForCausalLM(Gemma2PreTrainedModel):
             # The clone here is for the same reason as for `position_ids`.
             model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}
 
-        if isinstance(past_key_values, HybridCache) and attention_mask.ndim == 2:
+        if (
+            isinstance(past_key_values, HybridCache)
+            and attention_mask.ndim == 2
+            and not self.config._attn_implementation == "flash_attention_2"
+        ):
             if model_inputs["inputs_embeds"] is not None:
                 batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
                 device = model_inputs["inputs_embeds"].device
...
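
For context, a minimal sketch of the generation setup this commit fixes, assuming a CUDA GPU with the flash-attn package installed; the checkpoint name is illustrative and any Gemma-2 checkpoint would do:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "google/gemma-2-9b"  # illustrative; any Gemma-2 checkpoint works
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",  # triggers the code path guarded above
).to("cuda")

inputs = tokenizer("The capital of France is", return_tensors="pt").to("cuda")
# Before this commit, FA2 generation went through the HybridCache 2D-to-4D
# mask conversion; FA2 expects the raw 2D padding mask, so the converted mask
# broke generation. The added condition skips the conversion for FA2.
output = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(output[0], skip_special_tokens=True))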