Unverified commit 45b708d4 authored by Hongkun Yu, committed by GitHub

Merged commit includes the following changes: (#7398)

262039434  by A. Unique TensorFlower<gardener@tensorflow.org>:

    Internal change

262024241  by hongkuny<hongkuny@google.com>:

    Adds __init__.py

--
262021128  by isaprykin<isaprykin@google.com>:

    Internal change

PiperOrigin-RevId: 262039434
parent 4697163b
@@ -23,73 +23,6 @@ import tensorflow as tf
K = tf.keras.backend


class LazyAdam(tf.keras.optimizers.Adam):
  """Variant of the Adam optimizer that handles sparse updates more efficiently.

  The original Adam algorithm maintains two moving-average accumulators for
  each trainable variable; the accumulators are updated at every step.
  This class provides lazier handling of gradient updates for sparse
  variables. It only updates moving-average accumulators for sparse variable
  indices that appear in the current batch, rather than updating the
  accumulators for all indices. Compared with the original Adam optimizer,
  it can provide large improvements in model training throughput for some
  applications. However, it provides slightly different semantics than the
  original Adam algorithm, and may lead to different empirical results.

  Note, amsgrad is currently not supported and the argument can only be
  False.

  This class is borrowed from:
  https://github.com/tensorflow/addons/blob/master/tensorflow_addons/optimizers/lazy_adam.py
  """

  def _resource_apply_sparse(self, grad, var, indices):
    """Applies grad for one step."""
    var_dtype = var.dtype.base_dtype
    lr_t = self._decayed_lr(var_dtype)
    beta_1_t = self._get_hyper('beta_1', var_dtype)
    beta_2_t = self._get_hyper('beta_2', var_dtype)
    local_step = tf.cast(self.iterations + 1, var_dtype)
    beta_1_power = tf.math.pow(beta_1_t, local_step)
    beta_2_power = tf.math.pow(beta_2_t, local_step)
    epsilon_t = tf.convert_to_tensor(self.epsilon, var_dtype)
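    # Fold the standard Adam bias correction for the moment estimates into the
    # step size: lr = lr_t * sqrt(1 - beta_2^t) / (1 - beta_1^t).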
    lr = (lr_t * tf.math.sqrt(1 - beta_2_power) / (1 - beta_1_power))

    # \\(m := beta1 * m + (1 - beta1) * g_t\\)
    m = self.get_slot(var, 'm')
    m_t_slice = beta_1_t * tf.gather(m, indices) + (1 - beta_1_t) * grad
    m_update_kwargs = {
        'resource': m.handle,
        'indices': indices,
        'updates': m_t_slice
    }
    m_update_op = tf.raw_ops.ResourceScatterUpdate(**m_update_kwargs)

    # \\(v := beta2 * v + (1 - beta2) * (g_t * g_t)\\)
    v = self.get_slot(var, 'v')
    v_t_slice = (beta_2_t * tf.gather(v, indices) +
                 (1 - beta_2_t) * tf.math.square(grad))
    v_update_kwargs = {
        'resource': v.handle,
        'indices': indices,
        'updates': v_t_slice
    }
    v_update_op = tf.raw_ops.ResourceScatterUpdate(**v_update_kwargs)

    # \\(variable -= learning_rate * m_t / (epsilon_t + sqrt(v_t))\\)
    var_slice = lr * m_t_slice / (tf.math.sqrt(v_t_slice) + epsilon_t)
    var_update_kwargs = {
        'resource': var.handle,
        'indices': indices,
        'updates': var_slice
    }
    var_update_op = tf.raw_ops.ResourceScatterSub(**var_update_kwargs)

    return tf.group(*[var_update_op, m_update_op, v_update_op])
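
For context, a minimal usage sketch of the class above (which this commit removes). The variable names and values are illustrative and not part of the diff; the lazy behaviour only matters when gradients arrive as tf.IndexedSlices, e.g. from an embedding lookup, so that only the touched rows of the 'm' and 'v' slots are updated:

import tensorflow as tf

embedding = tf.Variable(tf.random.normal([1000, 64]))
opt = LazyAdam(learning_rate=1e-3)  # class shown above; amsgrad must stay False

with tf.GradientTape() as tape:
  rows = tf.nn.embedding_lookup(embedding, tf.constant([3, 7, 42]))
  loss = tf.reduce_sum(tf.square(rows))

# The gradient w.r.t. `embedding` is a tf.IndexedSlices covering rows 3, 7 and
# 42 only, so _resource_apply_sparse updates just those rows of the
# accumulators rather than every row.
grads = tape.gradient(loss, [embedding])
opt.apply_gradients(zip(grads, [embedding]))
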
class LearningRateFn(object):
  """Creates learning rate function."""
......
@@ -250,7 +250,7 @@ class TransformerTask(object):
  def _create_optimizer(self):
    """Creates optimizer."""
    params = self.params
-    opt = optimizer.LazyAdam(
+    opt = tf.keras.optimizers.Adam(
        params["learning_rate"],
        params["optimizer_adam_beta1"],
        params["optimizer_adam_beta2"],
......
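Below is a minimal sketch of what the new construction amounts to with keyword arguments spelled out, assuming the standard tf.keras.optimizers.Adam signature; the hyperparameter values are placeholders, and the remaining arguments of the call are truncated in the hunk above:

import tensorflow as tf

# Placeholder hyperparameters standing in for the Transformer params dict.
params = {
    "learning_rate": 0.001,
    "optimizer_adam_beta1": 0.9,
    "optimizer_adam_beta2": 0.999,
}

# The positional arguments in the diff map onto these keyword arguments.
opt = tf.keras.optimizers.Adam(
    learning_rate=params["learning_rate"],
    beta_1=params["optimizer_adam_beta1"],
    beta_2=params["optimizer_adam_beta2"])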