Unverified Commit e737446e authored by Younes Belkada's avatar Younes Belkada Committed by GitHub
Browse files

[`Modeling` / `Mixtral`] Fix GC + PEFT issues with Mixtral (#28061)

fix for Mixtral
parent 1e209317
......@@ -1016,6 +1016,13 @@ class MixtralModel(MixtralPreTrainedModel):
past_key_values_length = 0
if self.gradient_checkpointing and self.training:
if use_cache:
logger.warning_once(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache = False
if use_cache:
use_legacy_cache = not isinstance(past_key_values, Cache)
if use_legacy_cache:
......@@ -1058,13 +1065,6 @@ class MixtralModel(MixtralPreTrainedModel):
hidden_states = inputs_embeds
if self.gradient_checkpointing and self.training:
if use_cache:
logger.warning_once(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache = False
# decoder layers
all_hidden_states = () if output_hidden_states else None
all_self_attns = () if output_attentions else None
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment