"examples/vscode:/vscode.git/clone" did not exist on "d5983a67799d561304d0ee111795da2328814bdb"
Commit 901c4cc4 authored by Vinh Nguyen

Merge remote-tracking branch 'upstream/master' into amp_resnet50

parents ef30de93 824ff2d6
@@ -189,26 +189,26 @@ class NcfTest(tf.test.TestCase):
     self.assertAlmostEqual(ndcg, (1 + math.log(2) / math.log(3) +
                                   2 * math.log(2) / math.log(4)) / 4)

   _BASE_END_TO_END_FLAGS = ['-batch_size', '1024', '-train_epochs', '1']

   @unittest.skipIf(keras_utils.is_v2_0(), "TODO(b/136018594)")
   @mock.patch.object(rconst, "SYNTHETIC_BATCHES_PER_EPOCH", 100)
   def test_end_to_end_estimator(self):
     integration.run_synthetic(
-        ncf_estimator_main.main, tmp_root=self.get_temp_dir(), max_train=None,
+        ncf_estimator_main.main, tmp_root=self.get_temp_dir(),
         extra_flags=self._BASE_END_TO_END_FLAGS)

   @unittest.skipIf(keras_utils.is_v2_0(), "TODO(b/136018594)")
   @mock.patch.object(rconst, "SYNTHETIC_BATCHES_PER_EPOCH", 100)
   def test_end_to_end_estimator_mlperf(self):
     integration.run_synthetic(
-        ncf_estimator_main.main, tmp_root=self.get_temp_dir(), max_train=None,
+        ncf_estimator_main.main, tmp_root=self.get_temp_dir(),
         extra_flags=self._BASE_END_TO_END_FLAGS + ['-ml_perf', 'True'])

   @mock.patch.object(rconst, "SYNTHETIC_BATCHES_PER_EPOCH", 100)
   def test_end_to_end_keras_no_dist_strat(self):
     integration.run_synthetic(
-        ncf_keras_main.main, tmp_root=self.get_temp_dir(), max_train=None,
+        ncf_keras_main.main, tmp_root=self.get_temp_dir(),
         extra_flags=self._BASE_END_TO_END_FLAGS +
                     ['-distribution_strategy', 'off'])

@@ -216,7 +216,7 @@ class NcfTest(tf.test.TestCase):
   @unittest.skipUnless(keras_utils.is_v2_0(), 'TF 2.0 only test.')
   def test_end_to_end_keras_dist_strat(self):
     integration.run_synthetic(
-        ncf_keras_main.main, tmp_root=self.get_temp_dir(), max_train=None,
+        ncf_keras_main.main, tmp_root=self.get_temp_dir(),
         extra_flags=self._BASE_END_TO_END_FLAGS + ['-num_gpus', '0'])

   @mock.patch.object(rconst, "SYNTHETIC_BATCHES_PER_EPOCH", 100)
@@ -226,7 +226,7 @@ class NcfTest(tf.test.TestCase):
                     ['-num_gpus', '0'] +
                     ['-keras_use_ctl', 'True'])
     integration.run_synthetic(
-        ncf_keras_main.main, tmp_root=self.get_temp_dir(), max_train=None,
+        ncf_keras_main.main, tmp_root=self.get_temp_dir(),
         extra_flags=flags)

   @mock.patch.object(rconst, "SYNTHETIC_BATCHES_PER_EPOCH", 100)
@@ -238,7 +238,7 @@ class NcfTest(tf.test.TestCase):
                     format(1, context.num_gpus()))
     integration.run_synthetic(
-        ncf_keras_main.main, tmp_root=self.get_temp_dir(), max_train=None,
+        ncf_keras_main.main, tmp_root=self.get_temp_dir(),
         extra_flags=self._BASE_END_TO_END_FLAGS + ['-num_gpus', '1'])

   @mock.patch.object(rconst, "SYNTHETIC_BATCHES_PER_EPOCH", 100)
@@ -250,7 +250,7 @@ class NcfTest(tf.test.TestCase):
                     format(2, context.num_gpus()))
     integration.run_synthetic(
-        ncf_keras_main.main, tmp_root=self.get_temp_dir(), max_train=None,
+        ncf_keras_main.main, tmp_root=self.get_temp_dir(),
         extra_flags=self._BASE_END_TO_END_FLAGS + ['-num_gpus', '2'])

 if __name__ == "__main__":
...
@@ -109,7 +109,6 @@ def neumf_model_fn(features, labels, mode, params):
     mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_HP_ADAM_EPSILON,
                             value=params["epsilon"])
     optimizer = tf.compat.v1.train.AdamOptimizer(
         learning_rate=params["learning_rate"],
         beta1=params["beta1"],
@@ -151,7 +150,7 @@ def _strip_first_and_last_dimension(x, batch_size):
   return tf.reshape(x[0, :], (batch_size,))


-def construct_model(user_input, item_input, params, need_strip=False):
+def construct_model(user_input, item_input, params):
   # type: (tf.Tensor, tf.Tensor, dict) -> tf.keras.Model
   """Initialize NeuMF model.
@@ -184,34 +183,33 @@ def construct_model(user_input, item_input, params):
   # Initializer for embedding layers
   embedding_initializer = "glorot_uniform"

-  if need_strip:
-    batch_size = params["batch_size"]
-
-    user_input_reshaped = tf.keras.layers.Lambda(
-        lambda x: _strip_first_and_last_dimension(
-            x, batch_size))(user_input)
-    item_input_reshaped = tf.keras.layers.Lambda(
-        lambda x: _strip_first_and_last_dimension(
-            x, batch_size))(item_input)
+  def mf_slice_fn(x):
+    x = tf.squeeze(x, [1])
+    return x[:, :mf_dim]
+
+  def mlp_slice_fn(x):
+    x = tf.squeeze(x, [1])
+    return x[:, mf_dim:]

   # It turns out to be significantly more efficient to store the MF and MLP
   # embedding portions in the same table, and then slice as needed.
-  mf_slice_fn = lambda x: x[:, :mf_dim]
-  mlp_slice_fn = lambda x: x[:, mf_dim:]
   embedding_user = tf.keras.layers.Embedding(
-      num_users, mf_dim + model_layers[0] // 2,
+      num_users,
+      mf_dim + model_layers[0] // 2,
       embeddings_initializer=embedding_initializer,
       embeddings_regularizer=tf.keras.regularizers.l2(mf_regularization),
-      input_length=1, name="embedding_user")(
-          user_input_reshaped if need_strip else user_input)
+      input_length=1,
+      name="embedding_user")(
+          user_input)

   embedding_item = tf.keras.layers.Embedding(
-      num_items, mf_dim + model_layers[0] // 2,
+      num_items,
+      mf_dim + model_layers[0] // 2,
       embeddings_initializer=embedding_initializer,
       embeddings_regularizer=tf.keras.regularizers.l2(mf_regularization),
-      input_length=1, name="embedding_item")(
-          item_input_reshaped if need_strip else item_input)
+      input_length=1,
+      name="embedding_item")(
+          item_input)

   # GMF part
   mf_user_latent = tf.keras.layers.Lambda(
...
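Aside for readers of this hunk: the refactor replaces the `need_strip` reshaping with two slice functions over one shared embedding table. A minimal sketch of that idea, using hypothetical sizes rather than the model's real parameters:

```python
import tensorflow as tf

# Hypothetical sizes, for illustration only.
num_users = 100
mf_dim = 8             # width of the GMF slice
mlp_first_layer = 32   # first MLP layer; half of it shares the table with GMF

user_input = tf.keras.layers.Input(shape=(1,), dtype=tf.int32, name="user_id")

# One embedding table stores both portions; slicing avoids a second lookup.
embedding_user = tf.keras.layers.Embedding(
    num_users, mf_dim + mlp_first_layer // 2, input_length=1)(user_input)

def mf_slice_fn(x):
  x = tf.squeeze(x, [1])      # drop the length-1 sequence axis
  return x[:, :mf_dim]        # first mf_dim columns -> GMF branch

def mlp_slice_fn(x):
  x = tf.squeeze(x, [1])
  return x[:, mf_dim:]        # remaining columns -> MLP branch

mf_user_latent = tf.keras.layers.Lambda(mf_slice_fn)(embedding_user)
mlp_user_latent = tf.keras.layers.Lambda(mlp_slice_fn)(embedding_user)
```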
 # ResNet in TensorFlow

 * For the Keras version of the ResNet model, see
-  [`official/resnet/keras`](keras).
+  [`official/vision/image_classification`](../vision/image_classification).
 * For the Keras custom training loop version, see
   [`official/resnet/ctl`](ctl).
 * For the Estimator version, see [`official/r1/resnet`](../r1/resnet).
\ No newline at end of file
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
...
@@ -283,4 +283,6 @@ if __name__ == '__main__':
   logging.set_verbosity(logging.INFO)
   keras_common.define_keras_flags()
   ctl_common.define_ctl_flags()
+  flags.adopt_module_key_flags(keras_common)
+  flags.adopt_module_key_flags(ctl_common)

   absl_app.run(main)
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Bring in the shared Keras ResNet modules into this module.
+
+The TensorFlow official Keras models are moved under
+official/vision/image_classification. In order to be backward compatible with
+models that directly import its modules, we import the Keras ResNet modules
+under official.resnet.keras.
+
+New TF models should not depend on modules directly under this path.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from official.vision.image_classification import cifar_preprocessing
+from official.vision.image_classification import common as keras_common
+from official.vision.image_classification import imagenet_preprocessing
+from official.vision.image_classification import resnet_cifar_main as keras_cifar_main
+from official.vision.image_classification import resnet_cifar_model
+from official.vision.image_classification import resnet_imagenet_main as keras_imagenet_main
+from official.vision.image_classification import resnet_model
+
+del absolute_import
+del division
+del print_function
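Assuming this shim lives at `official/resnet/keras/__init__.py` (the path is implied by the docstring but not shown in this view), legacy imports keep working unchanged, for example:

```python
# Hypothetical legacy call sites; they now resolve to the relocated modules.
from official.resnet.keras import keras_common   # official.vision.image_classification.common
from official.resnet.keras import resnet_model   # official.vision.image_classification.resnet_model
```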
@@ -208,21 +208,6 @@ class ShakespeareAccuracy(ShakespeareBenchmarkBase):
     FLAGS.model_dir = ''
     self._run_and_report_benchmark()

-  def benchmark_xla_8_gpu(self):
-    """Benchmark 8 gpu w/xla.
-
-    This is test is for accuracy not scaling. The batch-size is not scaled to
-    the number of gpus.
-    """
-    self._setup()
-    FLAGS.num_gpus = 8
-    FLAGS.training_data = self.train_data
-    FLAGS.batch_size = 64
-    FLAGS.train_epochs = 43
-    FLAGS.model_dir = ''
-    FLAGS.enable_xla = True
-    self._run_and_report_benchmark()
-

 class ShakespeareKerasBenchmarkReal(ShakespeareBenchmarkBase):
   """Benchmark accuracy tests."""
...
@@ -273,7 +273,7 @@ def _generate_synthetic_data(params):
       label_value=1,
       label_dtype=tf.int64,
   )
-  return dataset.batch(batch)
+  return dataset.batch(batch, drop_remainder=True)


 def train_input_fn(params):
...
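The `drop_remainder=True` change above trades the final partial batch for a statically known batch dimension, which TPU-style execution and some distribution strategies require. A quick TF 2.x illustration (not part of the commit):

```python
import tensorflow as tf

ds = tf.data.Dataset.range(10)

# Without drop_remainder the final partial batch leaves the batch dim unknown.
print(ds.batch(4).element_spec)                       # shape=(None,)

# Dropping the remainder (here, 2 elements) makes the shape fully static.
print(ds.batch(4, drop_remainder=True).element_spec)  # shape=(4,)
```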
@@ -176,6 +176,21 @@ def define_transformer_flags():
   flags.DEFINE_string(
       name='mode', default='train',
       help=flags_core.help_wrap('mode: train, eval, or predict'))
+  flags.DEFINE_bool(
+      name='use_ctl',
+      default=False,
+      help=flags_core.help_wrap(
+          'Whether the model runs with custom training loop.'))
+  flags.DEFINE_bool(
+      name='is_tpu_pod',
+      default=False,
+      help=flags_core.help_wrap('Whether the model runs on a TPU pod.'))
+  flags.DEFINE_bool(
+      name='use_tpu_2vm_config',
+      default=False,
+      help=flags_core.help_wrap(
+          'Whether the model runs in 2VM mode, Headless server and unit test '
+          'all use 1VM config.'))

   flags_core.set_defaults(data_dir='/tmp/translate_ende',
                           model_dir='/tmp/transformer_model',
@@ -216,8 +231,6 @@ def define_transformer_flags():
     return True
   # pylint: enable=unused-variable

-  flags_core.require_cloud_storage(['data_dir', 'model_dir', 'export_dir'])
-

 def get_callbacks():
   """Returns common callbacks."""
...
@@ -23,6 +23,51 @@ import tensorflow as tf

 K = tf.keras.backend

+
+class LearningRateSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
+  """Learning rate schedule."""
+
+  def __init__(self, initial_learning_rate, hidden_size, warmup_steps):
+    """Initialize configuration of the learning rate schedule.
+
+    Args:
+      initial_learning_rate: A float, the initial learning rate.
+      hidden_size: An integer, the model dimension in the hidden layers.
+      warmup_steps: An integer, the number of steps required for linear warmup.
+    """
+    super(LearningRateSchedule, self).__init__()
+    self.initial_learning_rate = initial_learning_rate
+    self.hidden_size = hidden_size
+    self.warmup_steps = tf.cast(warmup_steps, tf.float32)
+
+  def __call__(self, global_step):
+    """Calculate learning rate with linear warmup and rsqrt decay.
+
+    Args:
+      global_step: An integer, the current global step used for learning rate
+        calculation.
+
+    Returns:
+      A float, the learning rate needs to be used for current global step.
+    """
+    with tf.name_scope('learning_rate_schedule'):
+      global_step = tf.cast(global_step, tf.float32)
+      learning_rate = self.initial_learning_rate
+      learning_rate *= (self.hidden_size**-0.5)
+      # Apply linear warmup
+      learning_rate *= tf.minimum(1.0, global_step / self.warmup_steps)
+      # Apply rsqrt decay
+      learning_rate /= tf.sqrt(tf.maximum(global_step, self.warmup_steps))
+      return learning_rate
+
+  def get_config(self):
+    """Get the configuration of the learning rate schedule."""
+    return {
+        'initial_learning_rate': self.initial_learning_rate,
+        'hidden_size': self.hidden_size,
+        'warmup_steps': self.warmup_steps,
+    }
+
+
 class LearningRateFn(object):
   """Creates learning rate function."""
...
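A sketch of how the schedule added above can be used; the hyperparameter values below are illustrative, not the Transformer defaults pulled from this commit:

```python
import tensorflow as tf
from official.transformer.v2 import optimizer  # module modified in the hunk above

# Illustrative values only.
lr_schedule = optimizer.LearningRateSchedule(
    initial_learning_rate=2.0, hidden_size=512, warmup_steps=16000)

# A Keras LearningRateSchedule can be passed directly as the learning rate.
opt = tf.keras.optimizers.Adam(
    learning_rate=lr_schedule, beta_1=0.9, beta_2=0.997, epsilon=1e-9)

# The schedule is callable on a step count: linear warmup, then rsqrt decay.
for step in [1, 8000, 16000, 64000]:
  print(step, float(lr_schedule(step)))
```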
@@ -27,12 +27,16 @@ import tempfile

 from absl import app as absl_app  # pylint: disable=unused-import
 from absl import flags
+from absl import logging
 import tensorflow as tf
+from tensorflow.python.util import object_identity

 # pylint: disable=g-bad-import-order
 from official.transformer import compute_bleu
 from official.transformer.utils import tokenizer
 from official.transformer.v2 import data_pipeline
+from official.transformer.v2 import metrics
 from official.transformer.v2 import misc
 from official.transformer.v2 import optimizer
 from official.transformer.v2 import transformer
@@ -75,8 +79,8 @@ def evaluate_and_log_bleu(model, bleu_source, bleu_ref, vocab_file):
   uncased_score, cased_score = translate_and_compute_bleu(
       model, subtokenizer, bleu_source, bleu_ref)

-  tf.compat.v1.logging.info("Bleu score (uncased): %s", uncased_score)
-  tf.compat.v1.logging.info("Bleu score (cased): %s", cased_score)
+  logging.info("Bleu score (uncased): %s", uncased_score)
+  logging.info("Bleu score (cased): %s", cased_score)
   return uncased_score, cased_score
@@ -88,26 +92,20 @@ class TransformerTask(object):
     Args:
       flags_obj: Object containing parsed flag values, i.e., FLAGS.
+
+    Raises:
+      ValueError: if not using static batch for input data on TPU.
     """
     self.flags_obj = flags_obj
     self.predict_model = None

     # Add flag-defined parameters to params object
     num_gpus = flags_core.get_num_gpus(flags_obj)
-    self.distribution_strategy = distribution_utils.get_distribution_strategy(
-        distribution_strategy=flags_obj.distribution_strategy,
-        num_gpus=flags_core.get_num_gpus(flags_obj))
-    print("Running transformer with num_gpus =", num_gpus)
-    if self.distribution_strategy:
-      print("For training, using distribution strategy: ",
-            self.distribution_strategy)
-    else:
-      print("Not using any distribution strategy.")
     self.params = params = misc.get_model_params(flags_obj.param_set, num_gpus)

     params["num_gpus"] = num_gpus
+    params["use_ctl"] = flags_obj.use_ctl
+    params["is_tpu_pod"] = flags_obj.is_tpu_pod
     params["data_dir"] = flags_obj.data_dir
     params["model_dir"] = flags_obj.model_dir
     params["static_batch"] = flags_obj.static_batch
@@ -130,33 +128,113 @@ class TransformerTask(object):
                              "infer_float32_vars")
       tf.keras.mixed_precision.experimental.set_policy(policy)

+    self.distribution_strategy = distribution_utils.get_distribution_strategy(
+        distribution_strategy=flags_obj.distribution_strategy,
+        num_gpus=num_gpus,
+        tpu_address=flags_obj.tpu or "")
+    if self.use_tpu:
+      if not params["static_batch"]:
+        raise ValueError("TPU requires static batch for input data.")
+    else:
+      print("Running transformer with num_gpus =", num_gpus)
+
+    if self.distribution_strategy:
+      print("For training, using distribution strategy: ",
+            self.distribution_strategy)
+    else:
+      print("Not using any distribution strategy.")
+
+  @property
+  def use_tpu(self):
+    if self.distribution_strategy:
+      return isinstance(self.distribution_strategy,
+                        tf.distribute.experimental.TPUStrategy)
+    return False
+
   def train(self):
     """Trains the model."""
-    params, flags_obj, is_train = self.params, self.flags_obj, True
+    params = self.params
+    flags_obj = self.flags_obj
     # Sets config options.
     keras_utils.set_session_config(
         enable_xla=flags_obj.enable_xla)

     _ensure_dir(flags_obj.model_dir)
-    if self.distribution_strategy:
-      with self.distribution_strategy.scope():
-        model = transformer.create_model(params, is_train)
-        opt = self._create_optimizer()
-        model.compile(opt)
-    else:
-      model = transformer.create_model(params, is_train)
+    with distribution_utils.get_strategy_scope(self.distribution_strategy):
+      model = transformer.create_model(params, is_train=True)
       opt = self._create_optimizer()
-      model.compile(opt)
+
+      if params["use_ctl"]:
+        train_loss_metric = tf.keras.metrics.Mean(
+            "training_loss", dtype=tf.float32)
+      else:
+        model.compile(opt)
+
     model.summary()

-    train_ds = data_pipeline.train_input_fn(params)
-    map_data_fn = data_pipeline.map_data_for_transformer_fn
-    train_ds = train_ds.map(map_data_fn,
-                            num_parallel_calls=params["num_parallel_calls"])
+    if self.use_tpu:
+      # Different from experimental_distribute_dataset,
+      # experimental_distribute_datasets_from_function requires
+      # per-replica/local batch size.
+      params["batch_size"] /= self.distribution_strategy.num_replicas_in_sync
+      train_ds = (
+          self.distribution_strategy
+          .experimental_distribute_datasets_from_function(
+              lambda ctx: data_pipeline.train_input_fn(params)))
+    else:
+      train_ds = data_pipeline.train_input_fn(params)
+      map_data_fn = data_pipeline.map_data_for_transformer_fn
+      train_ds = train_ds.map(
+          map_data_fn, num_parallel_calls=params["num_parallel_calls"])
+    if params["use_ctl"]:
+      train_ds_iterator = iter(train_ds)

     callbacks = self._create_callbacks(flags_obj.model_dir, 0, params)

+    # TODO(b/139418525): Refactor the custom training loop logic.
+    @tf.function
+    def train_steps(iterator, steps):
+      """Training steps function for TPU runs.
+
+      Args:
+        iterator: The input iterator of the training dataset.
+        steps: An integer, the number of training steps.
+
+      Returns:
+        A float, the loss value.
+      """
+
+      def _step_fn(inputs):
+        """Per-replica step function."""
+        inputs, targets = inputs
+        with tf.GradientTape() as tape:
+          logits = model([inputs, targets], training=True)
+          loss = metrics.transformer_loss(logits, targets,
+                                          params["label_smoothing"],
+                                          params["vocab_size"])
+          # Scales the loss, which results in using the average loss across all
+          # of the replicas for backprop.
+          scaled_loss = loss / self.distribution_strategy.num_replicas_in_sync
+
+        # De-dupes variables due to keras tracking issues.
+        tvars = list(
+            object_identity.ObjectIdentitySet(model.trainable_variables))
+        grads = tape.gradient(scaled_loss, tvars)
+        opt.apply_gradients(zip(grads, tvars))
+        # For reporting, the metric takes the mean of losses.
+        train_loss_metric.update_state(loss)
+
+      for _ in tf.range(steps):
+        train_loss_metric.reset_states()
+        self.distribution_strategy.experimental_run_v2(
+            _step_fn, args=(next(iterator),))
+
+    if self.use_tpu:
+      checkpoint = tf.train.Checkpoint(model=model, optimizer=opt)
+      latest_checkpoint = tf.train.latest_checkpoint(flags_obj.model_dir)
+      if latest_checkpoint:
+        checkpoint.restore(latest_checkpoint)
+        logging.info("Loaded checkpoint %s", latest_checkpoint)
+
     if flags_obj.train_steps < flags_obj.steps_between_evals:
       flags_obj.steps_between_evals = flags_obj.train_steps
     iterations = flags_obj.train_steps // flags_obj.steps_between_evals
@@ -165,28 +243,54 @@ class TransformerTask(object):
     cased_score_history, uncased_score_history = [], []
     for i in range(1, iterations + 1):
       print("Start train iteration:{}/{}".format(i, iterations))
-      history = model.fit(
-          train_ds,
-          initial_epoch=i-1,
-          epochs=i,
-          steps_per_epoch=flags_obj.steps_between_evals,
-          callbacks=callbacks,
-          # If TimeHistory is enabled, progress bar would be messy. Increase the
-          # verbose level to get rid of it.
-          verbose=(2 if flags_obj.enable_time_history else 1))
+      history = None
+      if params["use_ctl"]:
+        if not self.use_tpu:
+          raise NotImplementedError(
+              "Custom training loop on GPUs is not implemented.")
+        train_steps_per_eval = tf.convert_to_tensor(
+            flags_obj.steps_between_evals, dtype=tf.int32)
+
+        # Runs training steps.
+        train_steps(train_ds_iterator, train_steps_per_eval)
+        train_loss = train_loss_metric.result().numpy().astype(float)
+        logging.info("Train Step: %d/%d / loss = %s",
+                     i * flags_obj.steps_between_evals, flags_obj.train_steps,
+                     train_loss)
+
+        checkpoint_name = checkpoint.save(
+            os.path.join(
+                flags_obj.model_dir,
+                "ctl_step_{}.ckpt".format(i * flags_obj.steps_between_evals)))
+        logging.info("Saved checkpoint to %s", checkpoint_name)
+      else:
+        if self.use_tpu:
+          raise NotImplementedError(
+              "Keras model.fit on TPUs is not implemented.")
+        history = model.fit(
+            train_ds,
+            initial_epoch=i - 1,
+            epochs=i,
+            steps_per_epoch=flags_obj.steps_between_evals,
+            callbacks=callbacks,
+            # If TimeHistory is enabled, progress bar would be messy. Increase
+            # the verbose level to get rid of it.
+            verbose=(2 if flags_obj.enable_time_history else 1))
+        logging.info("Train history: {}".format(history.history))
+
       print("End train iteration:{}/{} global step:{}".format(
           i,
           iterations,
           i*flags_obj.steps_between_evals))
-      tf.compat.v1.logging.info("Train history: {}".format(history.history))
-      stats = misc.build_stats(history, callbacks)

       if (flags_obj.bleu_source and flags_obj.bleu_ref):
         uncased_score, cased_score = self.eval()
         cased_score_history.append([i, cased_score])
         uncased_score_history.append([i, uncased_score])

-    stats = misc.build_stats(history, callbacks)
+    stats = ({
+        "loss": train_loss
+    } if history is None else misc.build_stats(history, callbacks))
     if uncased_score and cased_score:
       stats["bleu_uncased"] = uncased_score
       stats["bleu_cased"] = cased_score
@@ -209,10 +313,11 @@ class TransformerTask(object):
   def predict(self):
     """Predicts result from the model."""
-    params, flags_obj, is_train = self.params, self.flags_obj, False
+    params = self.params
+    flags_obj = self.flags_obj

     with tf.name_scope("model"):
-      model = transformer.create_model(params, is_train)
+      model = transformer.create_model(params, is_train=False)
       self._load_weights_if_possible(
           model, tf.train.latest_checkpoint(self.flags_obj.model_dir))
       model.summary()
@@ -242,16 +347,28 @@ class TransformerTask(object):
   def _load_weights_if_possible(self, model, init_weight_path=None):
     """Loads model weights when it is provided."""
     if init_weight_path:
-      tf.compat.v1.logging.info("Load weights: {}".format(init_weight_path))
-      model.load_weights(init_weight_path)
+      logging.info("Load weights: {}".format(init_weight_path))
+      # TODO(b/139414977): Having the same variable restoring method for both
+      # TPU and GPU.
+      if self.use_tpu:
+        checkpoint = tf.train.Checkpoint(
+            model=model, optimizer=self._create_optimizer())
+        checkpoint.restore(init_weight_path)
+      else:
+        model.load_weights(init_weight_path)
     else:
       print("Weights not loaded from path:{}".format(init_weight_path))

   def _create_optimizer(self):
     """Creates optimizer."""
     params = self.params
+    # TODO(b/139414679): Explore the difference between using
+    # LearningRateSchedule and callback for GPU runs, and try to merge them.
+    lr_schedule = optimizer.LearningRateSchedule(
+        params["learning_rate"], params["hidden_size"],
+        params["learning_rate_warmup_steps"])
     opt = tf.keras.optimizers.Adam(
-        params["learning_rate"],
+        lr_schedule if self.use_tpu else params["learning_rate"],
         params["optimizer_adam_beta1"],
         params["optimizer_adam_beta2"],
         epsilon=params["optimizer_adam_epsilon"])
@@ -264,25 +381,34 @@ class TransformerTask(object):

 def _ensure_dir(log_dir):
   """Makes log dir if not existed."""
-  if not os.path.exists(log_dir):
-    os.makedirs(log_dir)
+  if not tf.io.gfile.exists(log_dir):
+    tf.io.gfile.makedirs(log_dir)


 def main(_):
   flags_obj = flags.FLAGS
   with logger.benchmark_context(flags_obj):
     task = TransformerTask(flags_obj)
-    if flags_obj.mode == "train":
-      task.train()
-    elif flags_obj.mode == "predict":
-      task.predict()
-    elif flags_obj.mode == "eval":
-      task.eval()
-    else:
-      raise ValueError("Invalid mode {}".format(flags_obj.mode))
+
+    def _run_task(task):
+      if flags_obj.mode == "train":
+        task.train()
+      elif flags_obj.mode == "predict":
+        task.predict()
+      elif flags_obj.mode == "eval":
+        task.eval()
+      else:
+        raise ValueError("Invalid mode {}".format(flags_obj.mode))
+
+    if flags_obj.distribution_strategy != "tpu":
+      _run_task(task)
+    else:
+      primary_cpu_task = "/job:worker" if flags_obj.use_tpu_2vm_config else ""
+      with tf.device(primary_cpu_task):
+        _run_task(task)


 if __name__ == "__main__":
-  tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
+  logging.set_verbosity(logging.INFO)
   misc.define_transformer_flags()
   absl_app.run(main)
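The custom training loop above is tied to Transformer specifics (TPU check, checkpointing, the shared loss metric). Stripped to its core, the per-replica pattern looks roughly like the sketch below; `strategy`, `model`, `opt`, `loss_fn`, and `iterator` are assumed to already exist, and newer TF releases spell the commit's `experimental_run_v2` as `Strategy.run`:

```python
import tensorflow as tf

@tf.function
def train_steps(strategy, model, opt, loss_fn, iterator, steps):
  """Runs `steps` distributed steps; a generic sketch of the loop above."""

  def step_fn(inputs):
    features, targets = inputs
    with tf.GradientTape() as tape:
      logits = model(features, training=True)
      # Scale by the replica count so the summed gradients match the mean loss.
      loss = loss_fn(targets, logits) / strategy.num_replicas_in_sync
    grads = tape.gradient(loss, model.trainable_variables)
    opt.apply_gradients(zip(grads, model.trainable_variables))
    return loss

  loss = tf.constant(0.0)
  for _ in tf.range(steps):
    per_replica_loss = strategy.run(step_fn, args=(next(iterator),))
    loss = strategy.reduce(
        tf.distribute.ReduceOp.SUM, per_replica_loss, axis=None)
  return loss
```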
@@ -30,7 +30,7 @@ from official.transformer.v2 import misc
 from official.transformer.v2 import transformer_main as tm
 from official.utils.misc import keras_utils
 from tensorflow.python.eager import context  # pylint: disable=ungrouped-imports

 FLAGS = flags.FLAGS
 FIXED_TIMESTAMP = 'my_time_stamp'
@@ -80,11 +80,19 @@ class TransformerTaskTest(tf.test.TestCase):
     self.assertTrue(os.path.exists(filepath))

   def test_train_no_dist_strat(self):
+    if context.num_gpus() >= 2:
+      self.skipTest('No need to test 2+ GPUs without a distribution strategy.')
     t = tm.TransformerTask(FLAGS)
     t.train()

   def test_train_static_batch(self):
+    if context.num_gpus() >= 2:
+      self.skipTest('No need to test 2+ GPUs without a distribution strategy.')
     FLAGS.distribution_strategy = 'one_device'
+    if tf.test.is_built_with_cuda():
+      FLAGS.num_gpus = 1
+    else:
+      FLAGS.num_gpus = 0
     FLAGS.static_batch = True
     t = tm.TransformerTask(FLAGS)
     t.train()
@@ -97,6 +105,7 @@ class TransformerTaskTest(tf.test.TestCase):
   @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
   def test_train_fp16(self):
+    FLAGS.distribution_strategy = 'one_device'
     FLAGS.dtype = 'fp16'
     t = tm.TransformerTask(FLAGS)
     t.train()
@@ -105,8 +114,8 @@ class TransformerTaskTest(tf.test.TestCase):
   def test_train_2_gpu(self):
     if context.num_gpus() < 2:
       self.skipTest(
-          '{} GPUs are not available for this test. {} GPUs are available'.
-          format(2, context.num_gpus()))
+          '{} GPUs are not available for this test. {} GPUs are available'
+          .format(2, context.num_gpus()))
     FLAGS.distribution_strategy = 'mirrored'
     FLAGS.num_gpus = 2
     FLAGS.param_set = 'base'
@@ -117,8 +126,8 @@ class TransformerTaskTest(tf.test.TestCase):
   def test_train_2_gpu_fp16(self):
     if context.num_gpus() < 2:
       self.skipTest(
-          '{} GPUs are not available for this test. {} GPUs are available'.
-          format(2, context.num_gpus()))
+          '{} GPUs are not available for this test. {} GPUs are available'
+          .format(2, context.num_gpus()))
     FLAGS.distribution_strategy = 'mirrored'
     FLAGS.num_gpus = 2
     FLAGS.param_set = 'base'
@@ -153,16 +162,22 @@ class TransformerTaskTest(tf.test.TestCase):
     FLAGS(update_flags)

   def test_predict(self):
+    if context.num_gpus() >= 2:
+      self.skipTest('No need to test 2+ GPUs without a distribution strategy.')
     self._prepare_files_and_flags()
     t = tm.TransformerTask(FLAGS)
     t.predict()

   def test_predict_fp16(self):
+    if context.num_gpus() >= 2:
+      self.skipTest('No need to test 2+ GPUs without a distribution strategy.')
     self._prepare_files_and_flags('--dtype=fp16')
     t = tm.TransformerTask(FLAGS)
     t.predict()

   def test_eval(self):
+    if context.num_gpus() >= 2:
+      self.skipTest('No need to test 2+ GPUs without a distribution strategy.')
     self._prepare_files_and_flags()
     t = tm.TransformerTask(FLAGS)
     t.eval()
...
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Flags related to distributed execution."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl import flags
+import tensorflow as tf
+
+from official.utils.flags._conventions import help_wrap
+
+
+def define_distribution(worker_hosts=True, task_index=True):
+  """Register distributed execution flags.
+
+  Args:
+    worker_hosts: Create a flag for specifying comma-separated list of workers.
+    task_index: Create a flag for specifying index of task.
+
+  Returns:
+    A list of flags for core.py to mark as key flags.
+  """
+  key_flags = []
+
+  if worker_hosts:
+    flags.DEFINE_string(
+        name='worker_hosts', default=None,
+        help=help_wrap(
+            'Comma-separated list of worker ip:port pairs for running '
+            'multi-worker models with DistributionStrategy. The user would '
+            'start the program on each host with identical value for this '
+            'flag.'))
+
+  if task_index:
+    flags.DEFINE_integer(
+        name='task_index', default=-1,
+        help=help_wrap('If multi-worker training, the task_index of this '
+                       'worker.'))
+
+  return key_flags
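A sketch of how a training script might pick these flags up through the `official.utils.flags.core` wrapper registered later in this commit; the `main` body is purely illustrative:

```python
from absl import app, flags
from official.utils.flags import core as flags_core

flags_core.define_distribution()  # registers --worker_hosts and --task_index
FLAGS = flags.FLAGS

def main(_):
  # Illustrative use: derive this worker's address from the shared flag value.
  if FLAGS.worker_hosts:
    workers = FLAGS.worker_hosts.split(',')
    print('task %d of %d workers: %s'
          % (FLAGS.task_index, len(workers), workers[FLAGS.task_index]))

if __name__ == '__main__':
  app.run(main)
```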
@@ -54,7 +54,7 @@ def get_loss_scale(flags_obj, default_for_fp16):

 def define_performance(num_parallel_calls=True, inter_op=True, intra_op=True,
-                       synthetic_data=True, max_train_steps=True, dtype=True,
+                       synthetic_data=True, max_train_steps=False, dtype=True,
                        all_reduce_alg=True, num_packs=True,
                        tf_gpu_thread_mode=False,
                        datasets_num_private_threads=False,
...
@@ -32,6 +32,7 @@ from official.utils.flags import _base
 from official.utils.flags import _benchmark
 from official.utils.flags import _conventions
 from official.utils.flags import _device
+from official.utils.flags import _distribution
 from official.utils.flags import _misc
 from official.utils.flags import _performance
@@ -77,6 +78,8 @@ define_benchmark = register_key_flags_in_core(_benchmark.define_benchmark)
 define_device = register_key_flags_in_core(_device.define_device)
 define_image = register_key_flags_in_core(_misc.define_image)
 define_performance = register_key_flags_in_core(_performance.define_performance)
+define_distribution = register_key_flags_in_core(
+    _distribution.define_distribution)

 help_wrap = _conventions.help_wrap
...
@@ -24,6 +24,8 @@ import random
 import string

 import tensorflow as tf

+from official.utils.misc import tpu_lib
+

 def _collective_communication(all_reduce_alg):
   """Return a CollectiveCommunication based on all_reduce_alg.
@@ -83,16 +85,18 @@ def get_distribution_strategy(distribution_strategy="default",
                               num_gpus=0,
                               num_workers=1,
                               all_reduce_alg=None,
-                              num_packs=1):
+                              num_packs=1,
+                              tpu_address=None):
   """Return a DistributionStrategy for running the model.

   Args:
     distribution_strategy: a string specifying which distribution strategy to
       use. Accepted values are 'off', 'default', 'one_device', 'mirrored',
-      'parameter_server', 'multi_worker_mirrored', case insensitive. 'off' means
-      not to use Distribution Strategy; 'default' means to choose from
+      'parameter_server', 'multi_worker_mirrored', and 'tpu' -- case insensitive.
+      'off' means not to use Distribution Strategy; 'default' means to choose from
       `MirroredStrategy`, `MultiWorkerMirroredStrategy`, or `OneDeviceStrategy`
-      according to the number of GPUs and number of workers.
+      according to the number of GPUs and number of workers. 'tpu' means to use
+      TPUStrategy using `tpu_address`.
     num_gpus: Number of GPUs to run this model.
     num_workers: Number of workers to run this model.
     all_reduce_alg: Optional. Specifies which algorithm to use when performing
@@ -102,12 +106,14 @@ def get_distribution_strategy(distribution_strategy="default",
       device topology.
     num_packs: Optional. Sets the `num_packs` in `tf.distribute.NcclAllReduce`
       or `tf.distribute.HierarchicalCopyAllReduce` for `MirroredStrategy`.
+    tpu_address: Optional. String that represents TPU to connect to. Must not
+      be None if `distribution_strategy` is set to `tpu`.

   Returns:
     tf.distribute.DistributionStrategy object.
   Raises:
     ValueError: if `distribution_strategy` is 'off' or 'one_device' and
-      `num_gpus` is larger than 1; or `num_gpus` is negative.
+      `num_gpus` is larger than 1; or `num_gpus` is negative or if
+      `distribution_strategy` is `tpu` but `tpu_address` is not specified.
   """
   if num_gpus < 0:
     raise ValueError("`num_gpus` can not be negative.")
@@ -120,6 +126,12 @@ def get_distribution_strategy(distribution_strategy="default",
           "flag cannot be set to 'off'.".format(num_gpus, num_workers))
     return None

+  if distribution_strategy == "tpu":
+    # When tpu_address is an empty string, we communicate with local TPUs.
+    # Initialize TPU System.
+    cluster_resolver = tpu_lib.tpu_initialize(tpu_address)
+    return tf.distribute.experimental.TPUStrategy(cluster_resolver)
+
   if distribution_strategy == "multi_worker_mirrored":
     return tf.distribute.experimental.MultiWorkerMirroredStrategy(
         communication=_collective_communication(all_reduce_alg))
@@ -190,38 +202,64 @@ class SyntheticDataset(object):
   """A dataset that generates synthetic data on each device."""

   def __init__(self, dataset, split_by=1):
-    self._input_data = {}
     # dataset.take(1) doesn't have GPU kernel.
     with tf.device('device:CPU:0'):
       tensor = tf.data.experimental.get_single_element(dataset.take(1))
     flat_tensor = tf.nest.flatten(tensor)
     variable_data = []
-    self._initializers = []
+    initializers = []
     for t in flat_tensor:
       rebatched_t = tf.split(t, num_or_size_splits=split_by, axis=0)[0]
       assert rebatched_t.shape.is_fully_defined(), rebatched_t.shape
-      v = tf.compat.v1.get_local_variable(self.random_name(),
+      v = tf.compat.v1.get_local_variable(self._random_name(),
                                           initializer=rebatched_t)
       variable_data.append(v)
-      self._initializers.append(v.initializer)
-    self._input_data = tf.nest.pack_sequence_as(tensor, variable_data)
+      initializers.append(v.initializer)
+    input_data = tf.nest.pack_sequence_as(tensor, variable_data)
+    self._iterator = SyntheticIterator(input_data, initializers)
+
+  def _random_name(self, size=10, chars=string.ascii_uppercase + string.digits):
+    return ''.join(random.choice(chars) for _ in range(size))
+
+  def __iter__(self):
+    return self._iterator
+
+  def make_one_shot_iterator(self):
+    return self._iterator
+
+  def make_initializable_iterator(self):
+    return self._iterator
+
+
+class SyntheticIterator(object):
+  """A dataset that generates synthetic data on each device."""
+
+  def __init__(self, input_data, initializers):
+    self._input_data = input_data
+    self._initializers = initializers

   def get_next(self):
     return self._input_data

+  def next(self):
+    return self.__next__()
+
+  def __next__(self):
+    try:
+      return self.get_next()
+    except tf.errors.OutOfRangeError:
+      raise StopIteration
+
   def initialize(self):
     if tf.executing_eagerly():
       return tf.no_op()
     else:
       return self._initializers

-  def random_name(self, size=10, chars=string.ascii_uppercase + string.digits):
-    return ''.join(random.choice(chars) for _ in range(size))
-

 def _monkey_patch_dataset_method(strategy):
   """Monkey-patch `strategy`'s `make_dataset_iterator` method."""
-  def make_dataset_iterator(self, dataset):
+  def make_dataset(self, dataset):
     tf.compat.v1.logging.info('Using pure synthetic data.')
     with self.scope():
       if self.extended._global_batch_size:  # pylint: disable=protected-access
@@ -229,22 +267,34 @@ def _monkey_patch_dataset_method(strategy):
       else:
         return SyntheticDataset(dataset)

-  strategy.org_make_dataset_iterator = strategy.make_dataset_iterator
-  strategy.make_dataset_iterator = make_dataset_iterator
+  def make_iterator(self, dataset):
+    dist_dataset = make_dataset(self, dataset)
+    return iter(dist_dataset)
+
+  strategy.orig_make_dataset_iterator = strategy.make_dataset_iterator
+  strategy.make_dataset_iterator = make_iterator
+  strategy.orig_distribute_dataset = strategy.experimental_distribute_dataset
+  strategy.experimental_distribute_dataset = make_dataset


 def _undo_monkey_patch_dataset_method(strategy):
-  if hasattr(strategy, 'org_make_dataset_iterator'):
-    strategy.make_dataset_iterator = strategy.org_make_dataset_iterator
+  if hasattr(strategy, 'orig_make_dataset_iterator'):
+    strategy.make_dataset_iterator = strategy.orig_make_dataset_iterator
+  if hasattr(strategy, 'orig_distribute_dataset'):
+    strategy.experimental_distribute_dataset = strategy.orig_distribute_dataset


 def set_up_synthetic_data():
   _monkey_patch_dataset_method(tf.distribute.OneDeviceStrategy)
   _monkey_patch_dataset_method(tf.distribute.MirroredStrategy)
+  _monkey_patch_dataset_method(
+      tf.distribute.experimental.MultiWorkerMirroredStrategy)
   # TODO(tobyboyd): Remove when contrib.distribute is all in core.
   if hasattr(tf, 'contrib'):
     _monkey_patch_dataset_method(tf.contrib.distribute.MirroredStrategy)
     _monkey_patch_dataset_method(tf.contrib.distribute.OneDeviceStrategy)
+    _monkey_patch_dataset_method(
+        tf.contrib.distribute.CollectiveAllReduceStrategy)
   else:
     print('Contrib missing: Skip monkey patch tf.contrib.distribute.*')
@@ -252,10 +302,14 @@ def set_up_synthetic_data():

 def undo_set_up_synthetic_data():
   _undo_monkey_patch_dataset_method(tf.distribute.OneDeviceStrategy)
   _undo_monkey_patch_dataset_method(tf.distribute.MirroredStrategy)
+  _undo_monkey_patch_dataset_method(
+      tf.distribute.experimental.MultiWorkerMirroredStrategy)
   # TODO(tobyboyd): Remove when contrib.distribute is all in core.
   if hasattr(tf, 'contrib'):
     _undo_monkey_patch_dataset_method(tf.contrib.distribute.MirroredStrategy)
     _undo_monkey_patch_dataset_method(tf.contrib.distribute.OneDeviceStrategy)
+    _undo_monkey_patch_dataset_method(
+        tf.contrib.distribute.CollectiveAllReduceStrategy)
   else:
     print('Contrib missing: Skip remove monkey patch tf.contrib.distribute.*')
...
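Per the new `'tpu'` branch and docstring above, a hedged usage sketch (the TPU address is a placeholder, and the model is a trivial stand-in):

```python
import tensorflow as tf
from official.utils.misc import distribution_utils

# An empty tpu_address means "local TPUs", per the comment in the new branch.
strategy = distribution_utils.get_distribution_strategy(
    distribution_strategy="tpu",
    tpu_address="grpc://10.0.0.2:8470")  # placeholder address

# get_strategy_scope() is the helper the Transformer change in this commit
# relies on; variables created inside the scope are placed by the strategy.
with distribution_utils.get_strategy_scope(strategy):
  model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
```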
@@ -31,3 +31,8 @@ def tpu_initialize(tpu_address):
   tf.config.experimental_connect_to_host(cluster_resolver.master())
   tf.tpu.experimental.initialize_tpu_system(cluster_resolver)
   return cluster_resolver
+
+
+def get_primary_cpu_task(use_remote_tpu=False):
+  """Returns remote TPU worker address. No-op for GPU/CPU training."""
+  return "/job:worker" if use_remote_tpu else ""
@@ -29,7 +29,7 @@ from absl import flags
 from official.utils.flags import core as flags_core


-def run_synthetic(main, tmp_root, extra_flags=None, synth=True, max_train=1):
+def run_synthetic(main, tmp_root, extra_flags=None, synth=True):
   """Performs a minimal run of a model.

   This function is intended to test for syntax errors throughout a model. A
@@ -41,7 +41,6 @@ def run_synthetic(main, tmp_root, extra_flags=None, synth=True):
     tmp_root: Root path for the temp directory created by the test class.
     extra_flags: Additional flags passed by the caller of this function.
     synth: Use synthetic data.
-    max_train: Maximum number of allowed training steps.
   """
   extra_flags = [] if extra_flags is None else extra_flags
@@ -54,9 +53,6 @@ def run_synthetic(main, tmp_root, extra_flags=None, synth=True):
   if synth:
     args.append("--use_synthetic_data")

-  if max_train is not None:
-    args.extend(["--max_train_steps", str(max_train)])
-
   try:
     flags_core.parse_flags(argv=args)
     main(flags.FLAGS)
...
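With `max_train` gone, a smoke test in the spirit of the NCF tests earlier in this commit now looks like the sketch below; the module paths follow the repository layout assumed by those tests:

```python
import tensorflow as tf
from official.recommendation import ncf_keras_main
from official.utils.testing import integration

class NcfSmokeTest(tf.test.TestCase):

  def test_end_to_end_keras(self):
    # No max_train argument anymore; step limits come from the flags instead.
    integration.run_synthetic(
        ncf_keras_main.main, tmp_root=self.get_temp_dir(),
        extra_flags=['-batch_size', '1024', '-train_epochs', '1',
                     '-distribution_strategy', 'off'])

if __name__ == '__main__':
  tf.test.main()
```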
 This folder contains the Keras implementation of the ResNet models. For more
-information about the models, please refer to this [README file](../README.md).
+information about the models, please refer to this [README file](../../README.md).

-Similar to the [estimator implementation](/official/resnet), the Keras
+Similar to the [estimator implementation](../../r1/resnet), the Keras
 implementation has code for both CIFAR-10 data and ImageNet data. The CIFAR-10
 version uses a ResNet56 model implemented in
 [`resnet_cifar_model.py`](./resnet_cifar_model.py), and the ImageNet version
 uses a ResNet50 model implemented in [`resnet_model.py`](./resnet_model.py).

 To use either dataset, make sure that you have the latest version of TensorFlow
 installed and
 [add the models folder to your Python path](/official/#running-the-models),
 otherwise you may encounter an error like `ImportError: No module named
 official.resnet`.

 ## CIFAR-10
@@ -36,7 +36,7 @@ python keras_cifar_main.py --data_dir=/path/to/cifar
 ## ImageNet

 Download the ImageNet dataset and convert it to TFRecord format.
 The following [script](https://github.com/tensorflow/tpu/blob/master/tools/datasets/imagenet_to_gcs.py)
 and [README](https://github.com/tensorflow/tpu/tree/master/tools/datasets#imagenet_to_gcspy)
 provide a few options.
@@ -56,17 +56,17 @@ python keras_imagenet_main.py --data_dir=/path/to/imagenet
 There are more flag options you can specify. Here are some examples:
 - `--use_synthetic_data`: when set to true, synthetic data, rather than real
   data, are used;
 - `--batch_size`: the batch size used for the model;
 - `--model_dir`: the directory to save the model checkpoint;
 - `--train_epochs`: number of epochs to run for training the model;
 - `--train_steps`: number of steps to run for training the model. We now only
   support a number that is smaller than the number of batches in an epoch.
 - `--skip_eval`: when set to true, evaluation as well as validation during
   training is skipped

 For example, this is a typical command line to run with ImageNet data with
 batch size 128 per GPU:

 ```bash
@@ -82,19 +82,19 @@ python -m keras_imagenet_main \
 See [`keras_common.py`](keras_common.py) for full list of options.

 ## Using multiple GPUs

 You can train these models on multiple GPUs using the `tf.distribute.Strategy` API.
 You can read more about them in this
 [guide](https://www.tensorflow.org/guide/distribute_strategy).

 In this example, we have made it easier to use with just a command line flag
 `--num_gpus`. By default this flag is 1 if TensorFlow is compiled with CUDA,
 and 0 otherwise.

 - --num_gpus=0: Uses tf.distribute.OneDeviceStrategy with CPU as the device.
 - --num_gpus=1: Uses tf.distribute.OneDeviceStrategy with GPU as the device.
 - --num_gpus=2+: Uses tf.distribute.MirroredStrategy to run synchronous
   distributed training across the GPUs.

 If you wish to run without `tf.distribute.Strategy`, you can do so by setting
 `--distribution_strategy=off`.