Unverified Commit f4e4b4d0 authored by Joao Gante's avatar Joao Gante Committed by GitHub
Browse files

Generate: force cache with `inputs_embeds` forwarding (#24639)

parent 9934bb1f
...@@ -1304,6 +1304,11 @@ class GenerationMixin:
        # 4. Define other model kwargs
        model_kwargs["output_attentions"] = generation_config.output_attentions
        model_kwargs["output_hidden_states"] = generation_config.output_hidden_states
# decoder-only models with inputs_embeds forwarding must use caching (otherwise we can't detect whether we are
# generating the first new token or not, and we only want to use the embeddings for the first new token)
if not self.config.is_encoder_decoder and model_input_name == "inputs_embeds":
model_kwargs["use_cache"] = True
else:
            model_kwargs["use_cache"] = generation_config.use_cache
        accepts_attention_mask = "attention_mask" in set(inspect.signature(self.forward).parameters.keys())
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment