ModelZoo / ResNet50_tensorflow

Commit 19113a57, authored Oct 16, 2020 by Ruoxin Sang, committed by A. Unique TensorFlower on Oct 16, 2020.

Internal change

PiperOrigin-RevId: 337609198
Parent: a3e847b6
Showing 1 changed file with 16 additions and 4 deletions:

official/nlp/optimization.py  +16 -4
official/nlp/optimization.py  (view file @ 19113a57)

@@ -194,15 +194,27 @@ class AdamWeightDecay(tf.keras.optimizers.Adam):
     return coefficients['lr_t'], dict(apply_state=apply_state)
 
   def _resource_apply_dense(self, grad, var, apply_state=None):
-    lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
-    decay = self._decay_weights_op(var, lr_t, apply_state)
+    # As the weight decay doesn't take any tensors from forward pass as inputs,
+    # add a control dependency here to make sure it happens strictly in the
+    # backward pass.
+    # TODO(b/171088214): Remove it after the control dependency in
+    # nested function is fixed.
+    with tf.control_dependencies([grad]):
+      lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
+    decay = self._decay_weights_op(var, lr_t, apply_state)
     with tf.control_dependencies([decay]):
       return super(AdamWeightDecay,
                    self)._resource_apply_dense(grad, var, **kwargs)
 
   def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
-    lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
-    decay = self._decay_weights_op(var, lr_t, apply_state)
+    # As the weight decay doesn't take any tensors from forward pass as inputs,
+    # add a control dependency here to make sure it happens strictly in the
+    # backward pass.
+    # TODO(b/171088214): Remove it after the control dependency in
+    # nested function is fixed.
+    with tf.control_dependencies([grad]):
+      lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
+    decay = self._decay_weights_op(var, lr_t, apply_state)
     with tf.control_dependencies([decay]):
       return super(AdamWeightDecay,
                    self)._resource_apply_sparse(grad, var, indices, **kwargs)
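The diff's own comments carry the rationale: `_decay_weights_op` reads only the variable and the learning rate, never the incoming gradient, so no data dependency ties the decay to the backward pass and TensorFlow is otherwise free to schedule it anywhere. The snippet below is a minimal standalone sketch of the same ordering trick, not code from this repository; the function name `apply_update` and the constants are invented for illustration.

import tensorflow as tf

v = tf.Variable([1.0, 2.0])

@tf.function
def apply_update(grad, lr, weight_decay):
  # Illustrative only. `decay` has no data dependency on `grad`, so without
  # the first control dependency the runtime could legally run it before the
  # gradient exists; pinning it to `grad` keeps it in the backward pass.
  with tf.control_dependencies([grad]):
    decay = v.assign_sub(lr * weight_decay * v)
  # The parameter update is then ordered after the decay, exactly as in
  # _resource_apply_dense above.
  with tf.control_dependencies([decay]):
    return v.assign_sub(lr * grad)

apply_update(tf.constant([0.1, 0.1]), lr=0.01, weight_decay=0.01)

Per the TODO in the patch, this ordering is a workaround for b/171088214 (control dependencies inside nested functions) rather than a permanent design.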
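For context, `AdamWeightDecay` implements decoupled weight decay in the AdamW style: each step subtracts learning_rate * weight_decay_rate * var from the variable, separately from the Adam moment updates. A hedged usage sketch follows, assuming the constructor keeps the argument names visible in this file (`weight_decay_rate`, `exclude_from_weight_decay`); exact defaults may differ across versions of the repository.

import tensorflow as tf
from official.nlp import optimization

# Sketch, not a verbatim recipe from the repo: the values are typical
# BERT fine-tuning settings, and the exclude patterns skip decay for
# variables whose names contain them (e.g. layer norm and bias terms).
optimizer = optimization.AdamWeightDecay(
    learning_rate=3e-5,
    weight_decay_rate=0.01,
    exclude_from_weight_decay=['LayerNorm', 'layer_norm', 'bias'])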