Unverified commit 965cc3ee authored by Ayushman Kumar, committed by GitHub

Merge pull request #7 from tensorflow/master

updated
parents 1f3247f4 1f685c54
![No Maintenance Intended](https://img.shields.io/badge/No%20Maintenance%20Intended-%E2%9C%95-red.svg)
![TensorFlow Requirement: 1.x](https://img.shields.io/badge/TensorFlow%20Requirement-1.x-brightgreen)
![TensorFlow 2 Not Supported](https://img.shields.io/badge/TensorFlow%202%20Not%20Supported-%E2%9C%95-red.svg)
# ResNet in TensorFlow

Deep residual networks, or ResNets for short, provided the breakthrough idea of
......
...@@ -329,6 +329,37 @@ def learning_rate_with_decay(
  return learning_rate_fn
def per_replica_batch_size(batch_size, num_gpus):
  """For multi-gpu, batch-size must be a multiple of the number of GPUs.

  Note that distribution strategy handles this automatically when used with
  Keras. For using with Estimator, we need to get per GPU batch.

  Args:
    batch_size: Global batch size to be divided among devices. This should be
      equal to num_gpus times the single-GPU batch_size for multi-gpu training.
    num_gpus: How many GPUs are used with DistributionStrategies.

  Returns:
    Batch size per device.

  Raises:
    ValueError: if batch_size is not divisible by number of devices
  """
  if num_gpus <= 1:
    return batch_size

  remainder = batch_size % num_gpus
  if remainder:
    err = ('When running with multiple GPUs, batch size '
           'must be a multiple of the number of available GPUs. Found {} '
           'GPUs with a batch size of {}; try --batch_size={} instead.'
          ).format(num_gpus, batch_size, batch_size - remainder)
    raise ValueError(err)
  return int(batch_size / num_gpus)
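As a quick, illustrative check of the arithmetic above (not part of the patch; the numbers mirror the old unit test), the helper divides the global batch size evenly across GPUs and rejects sizes that do not divide cleanly:

```python
# Illustrative only.
assert per_replica_batch_size(147, num_gpus=0) == 147   # single device: unchanged
assert per_replica_batch_size(147, num_gpus=7) == 21    # 147 / 7 examples per GPU
try:
  per_replica_batch_size(147, num_gpus=5)               # 147 % 5 != 0
except ValueError as e:
  print(e)  # suggests --batch_size=145 instead
```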
def resnet_model_fn(features, labels, mode, model_class,
                    resnet_size, weight_decay, learning_rate_fn, momentum,
                    data_format, resnet_version, loss_scale,
...@@ -620,7 +651,7 @@ def resnet_main(
    return input_function(
        is_training=True,
        data_dir=flags_obj.data_dir,
        batch_size=per_replica_batch_size(
            flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
        num_epochs=num_epochs,
        dtype=flags_core.get_tf_dtype(flags_obj),
...@@ -631,7 +662,7 @@ def resnet_main(
    return input_function(
        is_training=False,
        data_dir=flags_obj.data_dir,
        batch_size=per_replica_batch_size(
            flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
        num_epochs=1,
        dtype=flags_core.get_tf_dtype(flags_obj))
......
![No Maintenance Intended](https://img.shields.io/badge/No%20Maintenance%20Intended-%E2%9C%95-red.svg)
![TensorFlow Requirement: 1.x](https://img.shields.io/badge/TensorFlow%20Requirement-1.x-brightgreen)
![TensorFlow 2 Not Supported](https://img.shields.io/badge/TensorFlow%202%20Not%20Supported-%E2%9C%95-red.svg)
# Transformer Translation Model

This is an implementation of the Transformer translation model as described in the [Attention is All You Need](https://arxiv.org/abs/1706.03762) paper. Based on the code provided by the authors: [Transformer code](https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py) from [Tensor2Tensor](https://github.com/tensorflow/tensor2tensor). Also, check out the [tutorial](https://www.tensorflow.org/beta/tutorials/text/transformer) on Transformer in TF 2.0.
......
...@@ -562,6 +562,36 @@ def construct_estimator(flags_obj, params, schedule_manager):
      },
      config=run_config)
def per_replica_batch_size(batch_size, num_gpus):
  """For multi-gpu, batch-size must be a multiple of the number of GPUs.

  Note that distribution strategy handles this automatically when used with
  Keras. For using with Estimator, we need to get per GPU batch.

  Args:
    batch_size: Global batch size to be divided among devices. This should be
      equal to num_gpus times the single-GPU batch_size for multi-gpu training.
    num_gpus: How many GPUs are used with DistributionStrategies.

  Returns:
    Batch size per device.

  Raises:
    ValueError: if batch_size is not divisible by number of devices
  """
  if num_gpus <= 1:
    return batch_size

  remainder = batch_size % num_gpus
  if remainder:
    err = ('When running with multiple GPUs, batch size '
           'must be a multiple of the number of available GPUs. Found {} '
           'GPUs with a batch size of {}; try --batch_size={} instead.'
          ).format(num_gpus, batch_size, batch_size - remainder)
    raise ValueError(err)
  return int(batch_size / num_gpus)
def run_transformer(flags_obj):
  """Create tf.Estimator to train and evaluate transformer model.
...@@ -605,8 +635,8 @@ def run_transformer(flags_obj):
  total_batch_size = params["batch_size"]
  if not params["use_tpu"]:
    params["batch_size"] = per_replica_batch_size(params["batch_size"],
                                                  num_gpus)

  schedule_manager = schedule.Manager(
      train_steps=flags_obj.train_steps,
......
![No Maintenance Intended](https://img.shields.io/badge/No%20Maintenance%20Intended-%E2%9C%95-red.svg)
![TensorFlow Requirement: 1.x](https://img.shields.io/badge/TensorFlow%20Requirement-1.x-brightgreen)
![TensorFlow 2 Not Supported](https://img.shields.io/badge/TensorFlow%202%20Not%20Supported-%E2%9C%95-red.svg)
# Predicting Income with the Census Income Dataset

The implementation is based on TensorFlow 1.x.
It is subject to being moved to the R1 archive folder.

## Overview

The [Census Income Data Set](https://archive.ics.uci.edu/ml/datasets/Census+Income) contains over 48,000 samples with attributes including age, occupation, education, and income (a binary label, either `>50K` or `<=50K`). The dataset is split into roughly 32,000 training and 16,000 testing samples.
......
...@@ -331,7 +331,7 @@ class DatasetManager(object):
    """Returns batches for training."""
    # Estimator passes batch_size during training and eval_batch_size during
    # eval.
    param_batch_size = (params["batch_size"] if self._is_training else
                        params.get("eval_batch_size") or params["batch_size"])
    if batch_size != param_batch_size:
...@@ -713,7 +713,7 @@ class DummyConstructor(threading.Thread):
    """Returns dummy input batches for training."""
    # Estimator passes batch_size during training and eval_batch_size during
    # eval.
    batch_size = (params["batch_size"] if is_training else
                  params.get("eval_batch_size") or params["batch_size"])
    num_users = params["num_users"]
......
...@@ -149,7 +149,7 @@ def define_ncf_flags():
  flags_core.define_base(model_dir=True, clean=True, train_epochs=True,
                         epochs_between_evals=True, export_dir=False,
                         run_eagerly=True, stop_threshold=True, num_gpu=True,
                         distribution_strategy=True)
  flags_core.define_performance(
      synthetic_data=True,
      dtype=True,
...@@ -167,8 +167,7 @@ def define_ncf_flags():
      model_dir="/tmp/ncf/",
      data_dir="/tmp/movielens-data/",
      train_epochs=2,
      batch_size=99000,
      hooks="ProfilerHook",
      tpu=None
  )
......
...@@ -29,7 +29,7 @@ import os
from absl import app
from absl import flags
from absl import logging
import tensorflow.compat.v2 as tf
# pylint: enable=g-bad-import-order

from official.recommendation import constants as rconst
......
...@@ -23,18 +23,15 @@ import unittest
import numpy as np
import tensorflow as tf

from tensorflow.python.eager import context  # pylint: disable=ungrouped-imports
from official.recommendation import constants as rconst
from official.recommendation import data_pipeline
from official.recommendation import neumf_model
from official.recommendation import ncf_common
from official.recommendation import ncf_estimator_main
from official.recommendation import ncf_keras_main
from official.recommendation import neumf_model
from official.utils.misc import keras_utils
from official.utils.testing import integration
from tensorflow.python.eager import context  # pylint: disable=ungrouped-imports

NUM_TRAIN_NEG = 4
...@@ -190,20 +187,6 @@ class NcfTest(tf.test.TestCase):
  _BASE_END_TO_END_FLAGS = ['-batch_size', '1044', '-train_epochs', '1']

  @unittest.skipIf(keras_utils.is_v2_0(), "TODO(b/136018594)")
  @unittest.mock.patch.object(rconst, "SYNTHETIC_BATCHES_PER_EPOCH", 100)
  def test_end_to_end_estimator(self):
    integration.run_synthetic(
        ncf_estimator_main.main, tmp_root=self.get_temp_dir(),
        extra_flags=self._BASE_END_TO_END_FLAGS)

  @unittest.skipIf(keras_utils.is_v2_0(), "TODO(b/136018594)")
  @unittest.mock.patch.object(rconst, "SYNTHETIC_BATCHES_PER_EPOCH", 100)
  def test_end_to_end_estimator_mlperf(self):
    integration.run_synthetic(
        ncf_estimator_main.main, tmp_root=self.get_temp_dir(),
        extra_flags=self._BASE_END_TO_END_FLAGS + ['-ml_perf', 'True'])

  @unittest.mock.patch.object(rconst, "SYNTHETIC_BATCHES_PER_EPOCH", 100)
  def test_end_to_end_keras_no_dist_strat(self):
    integration.run_synthetic(
......
...@@ -126,7 +126,6 @@ def neumf_model_fn(features, labels, mode, params):
        weights=tf.cast(valid_pt_mask, tf.float32)
    )

    # This tensor is used by logging hooks.
    tf.identity(loss, name="cross_entropy")

    global_step = tf.compat.v1.train.get_global_step()
......
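Aside (not in the diff): the `tf.identity(loss, name="cross_entropy")` above gives the loss a stable graph name so that Estimator hooks can log it by name. A hedged, illustrative sketch of such a hook wiring (the Estimator object itself is assumed):

```python
import tensorflow as tf  # TF 1.x style, matching this file

# Log the tensor named above every 100 training steps.
logging_hook = tf.estimator.LoggingTensorHook(
    tensors={"cross_entropy": "cross_entropy"}, every_n_iter=100)
# estimator.train(input_fn=train_input_fn, hooks=[logging_hook])
```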
#!/bin/bash
set -e
# Example settings:
# export TPU="taylorrobie-tpu-0"
# export BUCKET="gs://taylorrobie-tpu-test-bucket-2"
# Remove IDE "not assigned" warning highlights.
TPU=${TPU:-""}
BUCKET=${BUCKET:-""}
if [[ -z ${TPU} ]]; then
echo "Please set 'TPU' to the name of the TPU to be used."
exit 1
fi
if [[ -z ${BUCKET} ]]; then
echo "Please set 'BUCKET' to the GCS bucket to be used."
exit 1
fi
./run.sh
...@@ -139,13 +139,10 @@ class StandardEvaluable(runnable.AbstractEvaluable):
      eval_fn = tf.function(eval_fn)
    self.eval_loop_fn = utils.create_loop_fn(eval_fn)

    # TODO(b/147718615): When async RPC is enabled in eager runtime, we make
    # eval iterator as a class member so it doesn't get destroyed when out of
    # the function scope.
    self.eval_iter = tf.nest.map_structure(iter, self.eval_dataset)
    eval_iter = tf.nest.map_structure(iter, self.eval_dataset)

    self.eval_begin()

    self.eval_loop_fn(eval_iter, num_steps)
    return self.eval_end()

  def eval_begin(self):
......
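For intuition only (not from the patch), the pattern changed above, building a fresh local iterator on every evaluation call and driving a compiled step function over it, looks roughly like this minimal sketch; `eval_step`, the dataset, and the step count are made-up stand-ins:

```python
import tensorflow as tf

dataset = tf.data.Dataset.from_tensor_slices(tf.range(10)).batch(2)

@tf.function
def eval_step(iterator):
  batch = next(iterator)        # pull one batch inside the compiled function
  return tf.reduce_sum(batch)

def evaluate(num_steps):
  eval_iter = iter(dataset)     # local iterator, rebuilt on every call
  total = tf.constant(0)
  for _ in range(num_steps):
    total += eval_step(eval_iter)
  return total

print(evaluate(num_steps=3).numpy())  # 1 + 5 + 9 = 15
```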
...@@ -94,6 +94,7 @@ def initialize_common_flags():
def strategy_flags_dict():
  """Returns TPU and/or GPU related flags in a dictionary."""
  return {
      'distribution_strategy': FLAGS.strategy_type,
      # TPUStrategy related flags.
      'tpu': FLAGS.tpu,
      # MultiWorkerMirroredStrategy related flags.
......
...@@ -40,7 +40,7 @@ def _collective_communication(all_reduce_alg):
    tf.distribute.experimental.CollectiveCommunication object

  Raises:
    ValueError: if `all_reduce_alg` not in [None, "ring", "nccl"]
  """
  collective_communication_options = {
      None: tf.distribute.experimental.CollectiveCommunication.AUTO,
...@@ -50,7 +50,7 @@ def _collective_communication(all_reduce_alg):
  if all_reduce_alg not in collective_communication_options:
    raise ValueError(
        "When used with `multi_worker_mirrored`, valid values for "
        "all_reduce_alg are [`ring`, `nccl`]. Supplied value: {}".format(
            all_reduce_alg))
  return collective_communication_options[all_reduce_alg]
...@@ -66,7 +66,7 @@ def _mirrored_cross_device_ops(all_reduce_alg, num_packs):
    tf.distribute.CrossDeviceOps object or None.

  Raises:
    ValueError: if `all_reduce_alg` not in [None, "nccl", "hierarchical_copy"].
  """
  if all_reduce_alg is None:
    return None
...@@ -77,7 +77,7 @@ def _mirrored_cross_device_ops(all_reduce_alg, num_packs):
  if all_reduce_alg not in mirrored_all_reduce_options:
    raise ValueError(
        "When used with `mirrored`, valid values for all_reduce_alg are "
        "[`nccl`, `hierarchical_copy`]. Supplied value: {}".format(
            all_reduce_alg))
  cross_device_ops_class = mirrored_all_reduce_options[all_reduce_alg]
  return cross_device_ops_class(num_packs=num_packs)
...@@ -92,9 +92,9 @@ def get_distribution_strategy(distribution_strategy="mirrored",
  Args:
    distribution_strategy: a string specifying which distribution strategy to
      use. Accepted values are "off", "one_device", "mirrored",
      "parameter_server", "multi_worker_mirrored", and "tpu" -- case insensitive.
      "off" means not to use Distribution Strategy; "tpu" means to use
      TPUStrategy using `tpu_address`.
    num_gpus: Number of GPUs to run this model.
    all_reduce_alg: Optional. Specifies which algorithm to use when performing
...@@ -109,7 +109,7 @@ def get_distribution_strategy(distribution_strategy="mirrored",
  Returns:
    tf.distribute.DistributionStrategy object.

  Raises:
    ValueError: if `distribution_strategy` is "off" or "one_device" and
      `num_gpus` is larger than 1; or `num_gpus` is negative or if
      `distribution_strategy` is `tpu` but `tpu_address` is not specified.
  """
...@@ -121,7 +121,7 @@ def get_distribution_strategy(distribution_strategy="mirrored",
    if num_gpus > 1:
      raise ValueError(
          "When {} GPUs are specified, distribution_strategy "
          "flag cannot be set to `off`.".format(num_gpus))
    return None

  if distribution_strategy == "tpu":
...@@ -157,141 +157,6 @@ def get_distribution_strategy(distribution_strategy="mirrored",
        "Unrecognized Distribution Strategy: %r" % distribution_strategy)
def per_replica_batch_size(batch_size, num_gpus):
  """For multi-gpu, batch-size must be a multiple of the number of GPUs.

  Note that distribution strategy handles this automatically when used with
  Keras. For using with Estimator, we need to get per GPU batch.

  Args:
    batch_size: Global batch size to be divided among devices. This should be
      equal to num_gpus times the single-GPU batch_size for multi-gpu training.
    num_gpus: How many GPUs are used with DistributionStrategies.

  Returns:
    Batch size per device.

  Raises:
    ValueError: if batch_size is not divisible by number of devices
  """
  if num_gpus <= 1:
    return batch_size

  remainder = batch_size % num_gpus
  if remainder:
    err = ('When running with multiple GPUs, batch size '
           'must be a multiple of the number of available GPUs. Found {} '
           'GPUs with a batch size of {}; try --batch_size={} instead.'
          ).format(num_gpus, batch_size, batch_size - remainder)
    raise ValueError(err)
  return int(batch_size / num_gpus)
# The `SyntheticDataset` is a temporary solution for generating synthetic data
# directly on devices. It is only useful for Keras with Distribution
# Strategies. We will have better support in `tf.data` or Distribution Strategy
# later.
class SyntheticDataset(object):
  """A dataset that generates synthetic data on each device."""

  def __init__(self, dataset, split_by=1):
    # dataset.take(1) doesn't have GPU kernel.
    with tf.device('device:CPU:0'):
      tensor = tf.data.experimental.get_single_element(dataset.take(1))
    flat_tensor = tf.nest.flatten(tensor)
    variable_data = []
    initializers = []
    for t in flat_tensor:
      rebatched_t = tf.split(t, num_or_size_splits=split_by, axis=0)[0]
      assert rebatched_t.shape.is_fully_defined(), rebatched_t.shape
      v = tf.compat.v1.get_local_variable(self._random_name(),
                                          initializer=rebatched_t)
      variable_data.append(v)
      initializers.append(v.initializer)
    input_data = tf.nest.pack_sequence_as(tensor, variable_data)
    self._iterator = SyntheticIterator(input_data, initializers)

  def _random_name(self, size=10, chars=string.ascii_uppercase + string.digits):
    return ''.join(random.choice(chars) for _ in range(size))

  def __iter__(self):
    return self._iterator

  def make_one_shot_iterator(self):
    return self._iterator

  def make_initializable_iterator(self):
    return self._iterator


class SyntheticIterator(object):
  """A dataset that generates synthetic data on each device."""

  def __init__(self, input_data, initializers):
    self._input_data = input_data
    self._initializers = initializers

  def get_next(self):
    return self._input_data

  def next(self):
    return self.__next__()

  def __next__(self):
    try:
      return self.get_next()
    except tf.errors.OutOfRangeError:
      raise StopIteration

  def initialize(self):
    if tf.executing_eagerly():
      return tf.no_op()
    else:
      return self._initializers
def _monkey_patch_dataset_method(strategy):
  """Monkey-patch `strategy`'s `make_dataset_iterator` method."""
  def make_dataset(self, dataset):
    logging.info('Using pure synthetic data.')
    with self.scope():
      if self.extended._global_batch_size:  # pylint: disable=protected-access
        return SyntheticDataset(dataset, self.num_replicas_in_sync)
      else:
        return SyntheticDataset(dataset)

  def make_iterator(self, dataset):
    dist_dataset = make_dataset(self, dataset)
    return iter(dist_dataset)

  strategy.orig_make_dataset_iterator = strategy.make_dataset_iterator
  strategy.make_dataset_iterator = make_iterator
  strategy.orig_distribute_dataset = strategy.experimental_distribute_dataset
  strategy.experimental_distribute_dataset = make_dataset


def _undo_monkey_patch_dataset_method(strategy):
  if hasattr(strategy, 'orig_make_dataset_iterator'):
    strategy.make_dataset_iterator = strategy.orig_make_dataset_iterator
  if hasattr(strategy, 'orig_distribute_dataset'):
    # Restore the original distribute-dataset method.
    strategy.experimental_distribute_dataset = strategy.orig_distribute_dataset


def set_up_synthetic_data():
  _monkey_patch_dataset_method(tf.distribute.OneDeviceStrategy)
  _monkey_patch_dataset_method(tf.distribute.MirroredStrategy)
  _monkey_patch_dataset_method(
      tf.distribute.experimental.MultiWorkerMirroredStrategy)


def undo_set_up_synthetic_data():
  _undo_monkey_patch_dataset_method(tf.distribute.OneDeviceStrategy)
  _undo_monkey_patch_dataset_method(tf.distribute.MirroredStrategy)
  _undo_monkey_patch_dataset_method(
      tf.distribute.experimental.MultiWorkerMirroredStrategy)
def configure_cluster(worker_hosts=None, task_index=-1):
  """Set multi-worker cluster spec in TF_CONFIG environment variable.
...@@ -301,21 +166,21 @@ def configure_cluster(worker_hosts=None, task_index=-1):
  Returns:
    Number of workers in the cluster.
  """
  tf_config = json.loads(os.environ.get("TF_CONFIG", "{}"))
  if tf_config:
    num_workers = (len(tf_config["cluster"].get("chief", [])) +
                   len(tf_config["cluster"].get("worker", [])))
  elif worker_hosts:
    workers = worker_hosts.split(",")
    num_workers = len(workers)
    if num_workers > 1 and task_index < 0:
      raise ValueError("Must specify task_index when number of workers > 1")
    task_index = 0 if num_workers == 1 else task_index
    os.environ["TF_CONFIG"] = json.dumps({
        "cluster": {
            "worker": workers
        },
        "task": {"type": "worker", "index": task_index}
    })
  else:
    num_workers = 1
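As an illustration (not part of the change), calling the helper above with a two-worker host list leaves `TF_CONFIG` set roughly as follows. It assumes `TF_CONFIG` was not already set, and the hosts and index here are made-up values:

```python
import json
import os

num_workers = configure_cluster(
    worker_hosts="10.0.0.1:2222,10.0.0.2:2222", task_index=0)
print(num_workers)  # 2, per the docstring above
print(json.loads(os.environ["TF_CONFIG"]))
# {'cluster': {'worker': ['10.0.0.1:2222', '10.0.0.2:2222']},
#  'task': {'type': 'worker', 'index': 0}}
```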
......
...@@ -45,21 +45,5 @@ class GetDistributionStrategyTest(tf.test.TestCase):
      self.assertIn('GPU', device)


class PerReplicaBatchSizeTest(tf.test.TestCase):
  """Tests for per_replica_batch_size."""

  def test_batch_size(self):
    self.assertEquals(
        distribution_utils.per_replica_batch_size(147, num_gpus=0), 147)
    self.assertEquals(
        distribution_utils.per_replica_batch_size(147, num_gpus=1), 147)
    self.assertEquals(
        distribution_utils.per_replica_batch_size(147, num_gpus=7), 21)

  def test_batch_size_with_remainder(self):
    with self.assertRaises(ValueError):
      distribution_utils.per_replica_batch_size(147, num_gpus=5)


if __name__ == "__main__":
  tf.test.main()
...@@ -164,6 +164,18 @@ def get_profiler_callback(model_dir, profile_steps, enable_tensorboard,
  return ProfilerCallback(model_dir, start_step, stop_step, steps_per_epoch)


class SimpleCheckpoint(tf.keras.callbacks.Callback):
  """Keras callback to save tf.train.Checkpoints."""

  def __init__(self, checkpoint_manager):
    super(SimpleCheckpoint, self).__init__()
    self.checkpoint_manager = checkpoint_manager

  def on_epoch_end(self, epoch, logs=None):
    step_counter = self.checkpoint_manager._step_counter.numpy()  # pylint: disable=protected-access
    self.checkpoint_manager.save(checkpoint_number=step_counter)
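A minimal sketch (not from the patch) of how the new callback might be wired into Keras training. The tiny model, the temp directory, and the one-step checkpoint interval are made-up, and it assumes a TF 2.x runtime where `CheckpointManager` accepts `step_counter` and `checkpoint_interval`:

```python
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
model.compile(optimizer="sgd", loss="mse")

checkpoint = tf.train.Checkpoint(model=model, optimizer=model.optimizer)
manager = tf.train.CheckpointManager(
    checkpoint, directory="/tmp/ckpts", max_to_keep=3,
    step_counter=model.optimizer.iterations, checkpoint_interval=1)

# The callback saves a checkpoint numbered by the optimizer step at each epoch end.
model.fit(tf.zeros([8, 4]), tf.zeros([8, 1]),
          epochs=2,
          callbacks=[SimpleCheckpoint(manager)])
```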
class ProfilerCallback(tf.keras.callbacks.Callback):
  """Save profiles in specified step range to log directory."""
......
# Object Detection Models on TensorFlow 2

**Note**: This repository is still under construction.
More features and instructions will be added soon.

## Prerequisite

To get started, download the code from the TensorFlow models GitHub repository or
use the pre-installed Google Cloud VM.

```bash
git clone https://github.com/tensorflow/models.git
```

Next, make sure to use TensorFlow 2.1+ on Google Cloud. Also, here are
a few packages you need to install to get started:

```bash
sudo apt-get install -y python-tk && \
pip3 install -r ~/models/official/requirements.txt
```
## Train RetinaNet on TPU

### Train a vanilla ResNet-50 based RetinaNet.

```bash
...@@ -30,7 +30,7 @@ RESNET_CHECKPOINT="<path to the pre-trained Resnet-50 checkpoint>"
TRAIN_FILE_PATTERN="<path to the TFRecord training data>"
EVAL_FILE_PATTERN="<path to the TFRecord validation data>"
VAL_JSON_FILE="<path to the validation annotation JSON file>"
python3 ~/models/official/vision/detection/main.py \
--strategy_type=tpu \
--tpu="${TPU_NAME?}" \
--model_dir="${MODEL_DIR?}" \
...@@ -60,7 +60,7 @@ following command.
```bash
TPU_NAME="<your GCP TPU name>"
MODEL_DIR="<path to the directory to store model files>"
python3 ~/models/official/vision/detection/main.py \
--strategy_type=tpu \
--tpu="${TPU_NAME?}" \
--model_dir="${MODEL_DIR?}" \
...@@ -86,7 +86,6 @@ python3 ~/models/official/vision/detection/main.py \
--config_file="my_retinanet.yaml"
```

```bash
MODEL_DIR="<path to the directory to store model files>"
python3 ~/models/official/vision/detection/main.py \
...@@ -123,6 +122,118 @@ use_tpu: False
"
```
---
## Train Mask R-CNN on TPU
### Train a vanilla ResNet-50 based Mask R-CNN.
```bash
TPU_NAME="<your GCP TPU name>"
MODEL_DIR="<path to the directory to store model files>"
RESNET_CHECKPOINT="<path to the pre-trained Resnet-50 checkpoint>"
TRAIN_FILE_PATTERN="<path to the TFRecord training data>"
EVAL_FILE_PATTERN="<path to the TFRecord validation data>"
VAL_JSON_FILE="<path to the validation annotation JSON file>"
python3 ~/models/official/vision/detection/main.py \
--strategy_type=tpu \
--tpu=${TPU_NAME} \
--model_dir=${MODEL_DIR} \
--mode=train \
--model=mask_rcnn \
--params_override="{train: { checkpoint: { path: ${RESNET_CHECKPOINT}, prefix: resnet50/ }, train_file_pattern: ${TRAIN_FILE_PATTERN} }, eval: { val_json_file: ${VAL_JSON_FILE}, eval_file_pattern: ${EVAL_FILE_PATTERN} } }"
```
### Train a custom Mask R-CNN using the config file.
First, create a YAML config file, e.g. *my_maskrcnn.yaml*.
This file specifies the parameters to be overridden,
which should at least include the following fields.
```YAML
# my_maskrcnn.yaml
train:
  train_file_pattern: <path to the TFRecord training data>
eval:
  eval_file_pattern: <path to the TFRecord validation data>
  val_json_file: <path to the validation annotation JSON file>
```
Once the YAML config file is created, you can launch the training using the
following command.
```bash
TPU_NAME="<your GCP TPU name>"
MODEL_DIR="<path to the directory to store model files>"
python3 ~/models/official/vision/detection/main.py \
--strategy_type=tpu \
--tpu=${TPU_NAME} \
--model_dir=${MODEL_DIR} \
--mode=train \
--model=mask_rcnn \
--config_file="my_maskrcnn.yaml"
```
## Train Mask R-CNN on GPU
Training on GPU is similar to that on TPU. The major change is the strategy type
(use
"[mirrored](https://www.tensorflow.org/api_docs/python/tf/distribute/MirroredStrategy)"
for multiple GPUs and
"[one_device](https://www.tensorflow.org/api_docs/python/tf/distribute/OneDeviceStrategy)"
for a single GPU).

Multi-GPU example (assuming there are 8 GPUs connected to the host):
```bash
MODEL_DIR="<path to the directory to store model files>"
python3 ~/models/official/vision/detection/main.py \
--strategy_type=mirrored \
--num_gpus=8 \
--model_dir=${MODEL_DIR} \
--mode=train \
--model=mask_rcnn \
--config_file="my_maskrcnn.yaml"
```
Single-GPU example:

```bash
MODEL_DIR="<path to the directory to store model files>"
python3 ~/models/official/vision/detection/main.py \
--strategy_type=one_device \
--num_gpus=1 \
--model_dir=${MODEL_DIR} \
--mode=train \
--model=mask_rcnn \
--config_file="my_maskrcnn.yaml"
```
An example with inline configuration (YAML or JSON format):
```
python3 ~/models/official/vision/detection/main.py \
--model_dir=<model folder> \
--strategy_type=one_device \
--num_gpus=1 \
--mode=train \
--model=mask_rcnn \
--params_override="eval:
eval_file_pattern: <Eval TFRecord file pattern>
batch_size: 8
val_json_file: <COCO format groundtruth JSON file>
predict:
predict_batch_size: 8
architecture:
use_bfloat16: False
maskrcnn_parser:
use_bfloat16: Flase
train:
total_steps: 1000
batch_size: 8
train_file_pattern: <Eval TFRecord file pattern>
use_tpu: False
"
```
Note: The JSON groundtruth file is useful for the [COCO dataset](http://cocodataset.org/#home) and can be
downloaded from the [COCO website](http://cocodataset.org/#download). For custom datasets, it is unnecessary because the groundtruth can be included in the TFRecord files.
......
...@@ -14,8 +14,16 @@
# ==============================================================================
"""Base config template."""
# pylint: disable=line-too-long
BACKBONES = [
    'resnet',
]

MULTILEVEL_FEATURES = [
    'fpn',
]

# pylint: disable=line-too-long
# For ResNet, this freezes the variables of the first conv1 and conv2_x
# layers [1], which leads to higher training speed and slightly better testing
# accuracy. The intuition is that the low-level architecture (e.g., ResNet-50)
...@@ -24,7 +32,6 @@
# Note that we need the trailing `/` to avoid an incorrect match.
# [1]: https://github.com/facebookresearch/Detectron/blob/master/detectron/core/config.py#L198
RESNET_FROZEN_VAR_PREFIX = r'(resnet\d+)\/(conv2d(|_([1-9]|10))|batch_normalization(|_([1-9]|10)))\/'
REGULARIZATION_VAR_REGEX = r'.*(kernel|weight):0$'
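As a quick, illustrative check (not part of the config), the frozen-variable prefix matches conv1/conv2_x variable names and stops at `conv2d_10`, while the regularization regex picks out kernel and weight variables. The variable names below are hypothetical examples:

```python
import re

assert re.match(RESNET_FROZEN_VAR_PREFIX, 'resnet50/conv2d_3/kernel:0')
assert re.match(RESNET_FROZEN_VAR_PREFIX, 'resnet50/batch_normalization_10/gamma:0')
assert re.match(RESNET_FROZEN_VAR_PREFIX, 'resnet50/conv2d_11/kernel:0') is None
assert re.match(REGULARIZATION_VAR_REGEX, 'resnet50/conv2d_3/kernel:0')
assert re.match(REGULARIZATION_VAR_REGEX, 'resnet50/conv2d_3/bias:0') is None
```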
BASE_CFG = {
...@@ -41,6 +48,7 @@ BASE_CFG = {
    'optimizer': {
        'type': 'momentum',
        'momentum': 0.9,
        'nesterov': True,  # `False` is better for TPU v3-128.
    },
    'learning_rate': {
        'type': 'step',
...@@ -49,21 +57,25 @@ BASE_CFG = {
        'init_learning_rate': 0.08,
        'learning_rate_levels': [0.008, 0.0008],
        'learning_rate_steps': [15000, 20000],
        'total_steps': 22500,
    },
    'checkpoint': {
        'path': '',
        'prefix': '',
    },
    # One can use 'RESNET_FROZEN_VAR_PREFIX' to speed up ResNet training
    # when loading from the checkpoint.
    'frozen_variable_prefix': '',
    'train_file_pattern': '',
    'train_dataset_type': 'tfrecord',
    # TODO(b/142174042): Support transpose_input option.
    'transpose_input': False,
    'regularization_variable_regex': REGULARIZATION_VAR_REGEX,
    'l2_weight_decay': 0.0001,
    'gradient_clip_norm': 0.0,
    'input_sharding': False,
},
    'eval': {
        'input_sharding': True,
        'batch_size': 8,
        'eval_samples': 5000,
        'min_eval_interval': 180,
...@@ -74,38 +86,42 @@ BASE_CFG = {
        'val_json_file': '',
        'eval_file_pattern': '',
        'eval_dataset_type': 'tfrecord',
        # When visualizing images, set evaluation batch size to 40 to avoid
        # potential OOM.
        'num_images_to_visualize': 0,
    },
    'predict': {
        'batch_size': 8,
    },
    'architecture': {
        'backbone': 'resnet',
        'min_level': 3,
        'max_level': 7,
        'multilevel_features': 'fpn',
        'use_bfloat16': True,
        # Note that `num_classes` is the total number of classes including
        # one background class whose index is 0.
        'num_classes': 91,
    },
    'anchor': {
        'num_scales': 3,
        'aspect_ratios': [1.0, 2.0, 0.5],
        'anchor_size': 4.0,
    },
    'norm_activation': {
        'activation': 'relu',
        'batch_norm_momentum': 0.997,
        'batch_norm_epsilon': 1e-4,
        'batch_norm_trainable': True,
        'use_sync_bn': False,
    },
    'resnet': {
        'resnet_depth': 50,
        'batch_norm': {
            'batch_norm_momentum': 0.997,
            'batch_norm_epsilon': 1e-4,
            'batch_norm_trainable': True,
            'use_sync_bn': False,
        },
    },
    'fpn': {
        'min_level': 3,
        'max_level': 7,
        'fpn_feat_dims': 256,
        'use_separable_conv': False,
        'use_batch_norm': True,
        'batch_norm': {
            'batch_norm_momentum': 0.997,
            'batch_norm_epsilon': 1e-4,
            'batch_norm_trainable': True,
            'use_sync_bn': False,
        },
    },
    'postprocess': {
        'use_batched_nms': False,
...@@ -116,5 +132,4 @@ BASE_CFG = {
    },
    'enable_summary': False,
}
# pylint: enable=line-too-long
...@@ -28,13 +28,12 @@ MASKRCNN_CFG.override({
    },
    'architecture': {
        'parser': 'maskrcnn_parser',
        'min_level': 2,
        'max_level': 6,
        'use_bfloat16': True,
        'include_mask': True,
        'mask_target_size': 28,
    },
    'maskrcnn_parser': {
        'use_bfloat16': True,
        'output_size': [1024, 1024],
        'num_channels': 3,
        'rpn_match_threshold': 0.7,
...@@ -46,74 +45,32 @@ MASKRCNN_CFG.override({
        'aug_scale_max': 1.0,
        'skip_crowd_during_training': True,
        'max_num_instances': 100,
        'include_mask': True,
        'mask_crop_size': 112,
    },
    'anchor': {
        'min_level': 2,
        'max_level': 6,
        'num_scales': 1,
        'anchor_size': 8,
    },
    'fpn': {
        'min_level': 2,
        'max_level': 6,
    },
    'nasfpn': {
        'min_level': 2,
        'max_level': 6,
    },
    # tunable_nasfpn:strip_begin
    'tunable_nasfpn_v1': {
        'min_level': 2,
        'max_level': 6,
    },
    # tunable_nasfpn:strip_end
    'rpn_head': {
        'min_level': 2,
        'max_level': 6,
        'anchors_per_location': 3,
        'num_convs': 2,
        'num_filters': 256,
        'use_separable_conv': False,
        'use_batch_norm': False,
        'batch_norm': {
            'batch_norm_momentum': 0.997,
            'batch_norm_epsilon': 1e-4,
            'batch_norm_trainable': True,
            'use_sync_bn': False,
        },
    },
    'frcnn_head': {
        # Note that `num_classes` is the total number of classes including
        # one background class whose index is 0.
        'num_classes': 91,
        'num_convs': 0,
        'num_filters': 256,
        'use_separable_conv': False,
        'num_fcs': 2,
        'fc_dims': 1024,
        'use_batch_norm': False,
        'batch_norm': {
            'batch_norm_momentum': 0.997,
            'batch_norm_epsilon': 1e-4,
            'batch_norm_trainable': True,
            'use_sync_bn': False,
        },
    },
    'mrcnn_head': {
        'num_classes': 91,
        'mask_target_size': 28,
        'num_convs': 4,
        'num_filters': 256,
        'use_separable_conv': False,
        'use_batch_norm': False,
        'batch_norm': {
            'batch_norm_momentum': 0.997,
            'batch_norm_epsilon': 1e-4,
            'batch_norm_trainable': True,
            'use_sync_bn': False,
        },
    },
    'rpn_score_loss': {
        'rpn_batch_size_per_im': 256,
...@@ -147,23 +104,10 @@ MASKRCNN_CFG.override({
    },
    'mask_sampling': {
        'num_mask_samples_per_image': 128,  # Typically = `num_samples_per_image` * `fg_fraction`.
        'mask_target_size': 28,
    },
    'postprocess': {
        'use_batched_nms': False,
        'max_total_size': 100,
        'nms_iou_threshold': 0.5,
        'score_threshold': 0.05,
        'pre_nms_num_boxes': 1000,
    },
}, is_strict=False)

MASKRCNN_RESTRICTIONS = [
    'architecture.use_bfloat16 == maskrcnn_parser.use_bfloat16',
    'architecture.include_mask == maskrcnn_parser.include_mask',
    'anchor.min_level == rpn_head.min_level',
    'anchor.max_level == rpn_head.max_level',
    'mrcnn_head.mask_target_size == mask_sampling.mask_target_size',
]
# pylint: enable=line-too-long