Unverified commit 4f1788b3 authored by Matt, committed by GitHub

Fix AdamWeightDecay for TF 2.11 (#20735)

* Fix AdamWeightDecay for TF

* Fix AdamWeightDecay for TF

* make fixup
parent a12c5cbc
@@ -21,6 +21,12 @@ from typing import Callable, List, Optional, Union

 import tensorflow as tf

+if hasattr(tf.keras, "optimizers") and hasattr(tf.keras.optimizers, "legacy"):
+    Adam = tf.keras.optimizers.legacy.Adam
+else:
+    Adam = tf.keras.optimizers.Adam
+
 class WarmUp(tf.keras.optimizers.schedules.LearningRateSchedule):
     """
     Applies a warmup schedule on a given learning rate decay schedule.
@@ -163,7 +169,7 @@ def create_optimizer(
     return optimizer, lr_schedule

-class AdamWeightDecay(tf.keras.optimizers.Adam):
+class AdamWeightDecay(Adam):
""" """
Adam enables L2 weight decay and clip_by_global_norm on gradients. Just adding the square of the weights to the Adam enables L2 weight decay and clip_by_global_norm on gradients. Just adding the square of the weights to the
loss function is *not* the correct way of using L2 regularization/weight decay with Adam, since that will interact loss function is *not* the correct way of using L2 regularization/weight decay with Adam, since that will interact
...
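For context, TF 2.11 repointed `tf.keras.optimizers.Adam` at the new optimizer implementation, whose internals differ from the legacy class that `AdamWeightDecay` subclasses, while the original classes moved under `tf.keras.optimizers.legacy`. The sketch below is a minimal standalone illustration of the same compatibility pattern, not the library code; the `WeightDecayAdam` subclass and its stored decay rate are simplified assumptions made for the example.

```python
import tensorflow as tf

# TF >= 2.11 keeps the original optimizer classes under tf.keras.optimizers.legacy,
# while tf.keras.optimizers.Adam points at the new implementation. Older TF
# releases have no "legacy" submodule, so the plain class is used there.
if hasattr(tf.keras, "optimizers") and hasattr(tf.keras.optimizers, "legacy"):
    Adam = tf.keras.optimizers.legacy.Adam
else:
    Adam = tf.keras.optimizers.Adam


class WeightDecayAdam(Adam):
    """Hypothetical stand-in for AdamWeightDecay: reuses the selected base class."""

    def __init__(self, weight_decay_rate=0.01, **kwargs):
        super().__init__(**kwargs)
        # Simplified: only record the decay rate; the real class applies it per step.
        self.weight_decay_rate = weight_decay_rate


if __name__ == "__main__":
    opt = WeightDecayAdam(learning_rate=1e-3, weight_decay_rate=0.01)
    # Which base class was resolved depends on the installed TF version.
    print(type(opt).__mro__[1])
```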