Unverified Commit a220f160 authored by Patrick von Platen, committed by GitHub

[FlaxBart] make sure no grads are computed on the bias (#16345)

* [FlaxBart] make sure no grads are computed on the bias

* correct all other seq2seq models
parent 4975002d
@@ -1292,7 +1292,7 @@ class FlaxBartForConditionalGenerationModule(nn.Module):
         else:
             lm_logits = self.lm_head(hidden_states)
 
-        lm_logits += self.final_logits_bias.astype(self.dtype)
+        lm_logits += jax.lax.stop_gradient(self.final_logits_bias.astype(self.dtype))
 
         if not return_dict:
             output = (lm_logits,) + outputs[1:]
@@ -1270,7 +1270,7 @@ class FlaxBlenderbotForConditionalGenerationModule(nn.Module):
         else:
             lm_logits = self.lm_head(hidden_states)
 
-        lm_logits += self.final_logits_bias.astype(self.dtype)
+        lm_logits += jax.lax.stop_gradient(self.final_logits_bias.astype(self.dtype))
 
         if not return_dict:
             output = (lm_logits,) + outputs[1:]
@@ -1267,7 +1267,7 @@ class FlaxBlenderbotSmallForConditionalGenerationModule(nn.Module):
         else:
             lm_logits = self.lm_head(hidden_states)
 
-        lm_logits += self.final_logits_bias.astype(self.dtype)
+        lm_logits += jax.lax.stop_gradient(self.final_logits_bias.astype(self.dtype))
 
         if not return_dict:
             output = (lm_logits,) + outputs[1:]
@@ -1329,7 +1329,7 @@ class FlaxMBartForConditionalGenerationModule(nn.Module):
         else:
             lm_logits = self.lm_head(hidden_states)
 
-        lm_logits += self.final_logits_bias.astype(self.dtype)
+        lm_logits += jax.lax.stop_gradient(self.final_logits_bias.astype(self.dtype))
 
         if not return_dict:
             output = (lm_logits,) + outputs[1:]
@@ -1280,7 +1280,7 @@ class FlaxPegasusForConditionalGenerationModule(nn.Module):
         else:
             lm_logits = self.lm_head(hidden_states)
 
-        lm_logits += self.final_logits_bias.astype(self.dtype)
+        lm_logits += jax.lax.stop_gradient(self.final_logits_bias.astype(self.dtype))
 
         if not return_dict:
             output = (lm_logits,) + outputs[1:]
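For readers unfamiliar with the API: `jax.lax.stop_gradient` passes its argument through unchanged in the forward pass but treats it as a constant under differentiation. Below is a minimal sketch of the effect; it is illustrative only and not part of this commit (the toy loss functions and the `bias` variable are invented for the demo):

```python
import jax
import jax.numpy as jnp

def loss_plain(bias):
    # bias participates in both the forward and the backward pass
    logits = jnp.ones(3) + bias
    return jnp.sum(logits ** 2)

def loss_stopped(bias):
    # same forward value, but no gradient flows back into bias
    logits = jnp.ones(3) + jax.lax.stop_gradient(bias)
    return jnp.sum(logits ** 2)

bias = jnp.array([0.1, 0.2, 0.3])
print(jax.grad(loss_plain)(bias))    # [2.2 2.4 2.6] -- bias would receive updates
print(jax.grad(loss_stopped)(bias))  # [0. 0. 0.]    -- bias is frozen
```

This mirrors the change applied to each model above: `final_logits_bias` still shifts the logits in the forward pass, but the optimizer never receives a gradient for it.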