"torchvision/git@developer.sourcefind.cn:OpenDAS/vision.git" did not exist on "87d54c4e583207e7b003d6b59f1e7f49167f68f1"
Unverified Commit 1fb34e76 authored by Hongkun Yu's avatar Hongkun Yu Committed by GitHub
Browse files

Merged commit includes the following changes: (#7252)

258597234  by rxsang<rxsang@google.com>:

    Update all the TPUStrategy examples to use the new v2 APIs, i.e.
    make_dataset_iterator -> experimental_distribute_dataset,
    make_input_fn_iterator -> experimental_distribute_datasets_from_function,
    unwrap -> experimental_local_results,
    experimental_run -> experimental_run_v2

--
258581998  by taylorrobie<taylorrobie@google.com>:

    Update keras v2 optimizers to reuse coefficients which are shared across all updates, which reduces the total number of ops created by between 5% (for simple optimizers such as SGD and Adagrad) and 25% (for complicated optimizers such as Adam and NAdam). Separate copies are made for each device and dtype.

    The effect of this change on run time is fairly minimal since Grappler is expected to consolidate most of these ops; however it does improve graph construction time.

--

PiperOrigin-RevId: 258597234
parent 79b87be6
...@@ -134,29 +134,29 @@ class AdamWeightDecay(tf.keras.optimizers.Adam): ...@@ -134,29 +134,29 @@ class AdamWeightDecay(tf.keras.optimizers.Adam):
(grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0) (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
return super(AdamWeightDecay, self).apply_gradients(zip(grads, tvars)) return super(AdamWeightDecay, self).apply_gradients(zip(grads, tvars))
def _resource_apply_dense(self, grad, var): def _get_lr(self, var_device, var_dtype, apply_state):
var_dtype = var.dtype.base_dtype if apply_state is None:
return self._decayed_lr_t[var_dtype], {}
try: apply_state = apply_state or {}
lr_t = self.apply_cache[var.device, var.dtype.base_dtype].lr_t coefficients = apply_state.get((var_device, var_dtype))
except AttributeError: if coefficients is None:
lr_t = self._decayed_lr_t[var_dtype] coefficients = self._fallback_apply_state(var_device, var_dtype)
apply_state[(var_device, var_dtype)] = coefficients
return coefficients['lr_t'], dict(apply_state=apply_state)
def _resource_apply_dense(self, grad, var, apply_state=None):
lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
with tf.control_dependencies([self._decay_weights_op(var, lr_t)]): with tf.control_dependencies([self._decay_weights_op(var, lr_t)]):
return super(AdamWeightDecay, self)._resource_apply_dense( return super(AdamWeightDecay, self)._resource_apply_dense(
grad, var) grad, var, **kwargs)
def _resource_apply_sparse(self, grad, var, indices):
var_dtype = var.dtype.base_dtype
try:
lr_t = self.apply_cache[var.device, var.dtype.base_dtype].lr_t
except AttributeError:
lr_t = self._decayed_lr_t[var_dtype]
def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
with tf.control_dependencies([self._decay_weights_op(var, lr_t)]): with tf.control_dependencies([self._decay_weights_op(var, lr_t)]):
return super(AdamWeightDecay, self)._resource_apply_sparse( return super(AdamWeightDecay, self)._resource_apply_sparse(
grad, var, indices) grad, var, indices, **kwargs)
def get_config(self): def get_config(self):
config = super(AdamWeightDecay, self).get_config() config = super(AdamWeightDecay, self).get_config()
......
...@@ -161,7 +161,7 @@ def predict_squad_customized(strategy, input_meta_data, bert_config, ...@@ -161,7 +161,7 @@ def predict_squad_customized(strategy, input_meta_data, bert_config,
outputs = strategy.experimental_run_v2( outputs = strategy.experimental_run_v2(
_replicated_step, args=(next(iterator),)) _replicated_step, args=(next(iterator),))
return tf.nest.map_structure(strategy.unwrap, outputs) return tf.nest.map_structure(strategy.experimental_local_results, outputs)
all_results = [] all_results = []
for _ in range(num_steps): for _ in range(num_steps):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment