Commit 901c4cc4 authored by Vinh Nguyen

Merge remote-tracking branch 'upstream/master' into amp_resnet50

parents ef30de93 824ff2d6
......@@ -26,7 +26,7 @@ import pandas as pd
import tensorflow as tf
# pylint: disable=g-bad-import-order
from official.boosted_trees import train_higgs
from official.r1.boosted_trees import train_higgs
from official.utils.misc import keras_utils
from official.utils.testing import integration
......@@ -133,7 +133,7 @@ class BaseTest(tf.test.TestCase):
"--eval_start", "12",
"--eval_count", "8",
],
synth=False, max_train=None)
synth=False)
self.assertTrue(tf.gfile.Exists(os.path.join(model_dir, "checkpoint")))
@unittest.skipIf(keras_utils.is_v2_0(), "TF 1.0 only test.")
......@@ -152,7 +152,7 @@ class BaseTest(tf.test.TestCase):
"--eval_start", "12",
"--eval_count", "8",
],
synth=False, max_train=None)
synth=False)
self.assertTrue(tf.gfile.Exists(os.path.join(model_dir, "checkpoint")))
self.assertTrue(tf.gfile.Exists(os.path.join(export_dir)))
......
......@@ -168,13 +168,15 @@ class BaseTest(tf.test.TestCase):
def test_cifar10_end_to_end_synthetic_v1(self):
integration.run_synthetic(
main=cifar10_main.run_cifar, tmp_root=self.get_temp_dir(),
extra_flags=['-resnet_version', '1', '-batch_size', '4']
extra_flags=['-resnet_version', '1', '-batch_size', '4',
'--max_train_steps', '1']
)
def test_cifar10_end_to_end_synthetic_v2(self):
integration.run_synthetic(
main=cifar10_main.run_cifar, tmp_root=self.get_temp_dir(),
extra_flags=['-resnet_version', '2', '-batch_size', '4']
extra_flags=['-resnet_version', '2', '-batch_size', '4',
'--max_train_steps', '1']
)
......
......@@ -282,41 +282,43 @@ class BaseTest(tf.test.TestCase):
def test_imagenet_end_to_end_synthetic_v1(self):
integration.run_synthetic(
main=imagenet_main.run_imagenet, tmp_root=self.get_temp_dir(),
extra_flags=['-resnet_version', '1', '-batch_size', '4']
extra_flags=['-resnet_version', '1', '-batch_size', '4',
'--max_train_steps', '1']
)
def test_imagenet_end_to_end_synthetic_v2(self):
integration.run_synthetic(
main=imagenet_main.run_imagenet, tmp_root=self.get_temp_dir(),
extra_flags=['-resnet_version', '2', '-batch_size', '4']
extra_flags=['-resnet_version', '2', '-batch_size', '4',
'--max_train_steps', '1']
)
def test_imagenet_end_to_end_synthetic_v1_tiny(self):
integration.run_synthetic(
main=imagenet_main.run_imagenet, tmp_root=self.get_temp_dir(),
extra_flags=['-resnet_version', '1', '-batch_size', '4',
'-resnet_size', '18']
'-resnet_size', '18', '--max_train_steps', '1']
)
def test_imagenet_end_to_end_synthetic_v2_tiny(self):
integration.run_synthetic(
main=imagenet_main.run_imagenet, tmp_root=self.get_temp_dir(),
extra_flags=['-resnet_version', '2', '-batch_size', '4',
'-resnet_size', '18']
'-resnet_size', '18', '--max_train_steps', '1']
)
def test_imagenet_end_to_end_synthetic_v1_huge(self):
integration.run_synthetic(
main=imagenet_main.run_imagenet, tmp_root=self.get_temp_dir(),
extra_flags=['-resnet_version', '1', '-batch_size', '4',
'-resnet_size', '200']
'-resnet_size', '200', '--max_train_steps', '1']
)
def test_imagenet_end_to_end_synthetic_v2_huge(self):
integration.run_synthetic(
main=imagenet_main.run_imagenet, tmp_root=self.get_temp_dir(),
extra_flags=['-resnet_version', '2', '-batch_size', '4',
'-resnet_size', '200']
'-resnet_size', '200', '--max_train_steps', '1']
)
......
......@@ -730,9 +730,11 @@ def define_resnet_flags(resnet_size_choices=None, dynamic_loss_scale=False,
dynamic_loss_scale=dynamic_loss_scale,
fp16_implementation=fp16_implementation,
loss_scale=True,
tf_data_experimental_slack=True)
tf_data_experimental_slack=True,
max_train_steps=True)
flags_core.define_image()
flags_core.define_benchmark()
flags_core.define_distribution()
flags.adopt_module_key_flags(flags_core)
flags.DEFINE_enum(
......@@ -768,16 +770,6 @@ def define_resnet_flags(resnet_size_choices=None, dynamic_loss_scale=False,
'If True, uses `tf.estimator.train_and_evaluate` for the training '
'and evaluation loop, instead of separate calls to `classifier.train` '
'and `classifier.evaluate`, which is the default behavior.'))
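For context, a hedged sketch (not part of this file) of the two loops this flag selects between, using the public TF 1.x Estimator API; the function name and step counts are illustrative only:
import tensorflow as tf

def run_training(classifier, train_input_fn, eval_input_fn,
                 use_train_and_evaluate):
  if use_train_and_evaluate:
    # Single combined loop driven by tf.estimator.train_and_evaluate.
    train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                        max_steps=1000)
    eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn)
    tf.estimator.train_and_evaluate(classifier, train_spec, eval_spec)
  else:
    # Default behavior: separate train and evaluate calls.
    classifier.train(input_fn=train_input_fn, steps=1000)
    classifier.evaluate(input_fn=eval_input_fn)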
flags.DEFINE_string(
name='worker_hosts', default=None,
help=flags_core.help_wrap(
'Comma-separated list of worker ip:port pairs for running '
'multi-worker models with DistributionStrategy. The user would '
'start the program on each host with identical value for this flag.'))
flags.DEFINE_integer(
name='task_index', default=-1,
help=flags_core.help_wrap('If multi-worker training, the task_index of '
'this worker.'))
flags.DEFINE_bool(
name='enable_lars', default=False,
help=flags_core.help_wrap(
......
# Predicting Income with the Census Income Dataset
Note that this implementation is based on TF 1.x.
It is subject to being moved to the R1 archive folder.
## Overview
The [Census Income Data Set](https://archive.ics.uci.edu/ml/datasets/Census+Income) contains over 48,000 samples with attributes including age, occupation, education, and income (a binary label, either `>50K` or `<=50K`). The dataset is split into roughly 32,000 training and 16,000 testing samples.
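As a quick illustration of the binary label described above, here is a minimal, hedged sketch of loading the raw CSV with pandas and deriving the `>50K` label. The local file path and exact column ordering are assumptions for illustration only; the repository's own input pipeline lives in `census_dataset.py`.

```python
import pandas as pd

# Standard UCI Census Income (Adult) columns, assumed ordering.
COLUMNS = [
    "age", "workclass", "fnlwgt", "education", "education_num",
    "marital_status", "occupation", "relationship", "race", "gender",
    "capital_gain", "capital_loss", "hours_per_week", "native_country",
    "income_bracket",
]

# "adult.data" is a placeholder path for the downloaded training split.
train_df = pd.read_csv(
    "adult.data", names=COLUMNS, skipinitialspace=True, na_values="?")

# The label is binary: 1 if income is ">50K", else 0.
train_df["label"] = (
    train_df["income_bracket"].str.strip() == ">50K").astype(int)

print(train_df.shape)            # roughly 32,000 rows for the training split
print(train_df["label"].mean())  # fraction of >50K samples
```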
......
......@@ -22,8 +22,8 @@ import tensorflow as tf
from official.utils.flags import core as flags_core
from official.utils.logs import logger
from official.wide_deep import census_dataset
from official.wide_deep import wide_deep_run_loop
from official.r1.wide_deep import census_dataset
from official.r1.wide_deep import wide_deep_run_loop
def define_census_flags():
......
......@@ -24,8 +24,8 @@ import tensorflow as tf # pylint: disable=g-bad-import-order
from official.utils.misc import keras_utils
from official.utils.testing import integration
from official.wide_deep import census_dataset
from official.wide_deep import census_main
from official.r1.wide_deep import census_dataset
from official.r1.wide_deep import census_main
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
......@@ -139,7 +139,7 @@ class BaseTest(tf.test.TestCase):
'--model_type', 'wide',
'--download_if_missing=false'
],
synth=False, max_train=None)
synth=False)
@unittest.skipIf(keras_utils.is_v2_0(), 'TF 1.0 only test.')
def test_end_to_end_deep(self):
......@@ -150,7 +150,7 @@ class BaseTest(tf.test.TestCase):
'--model_type', 'deep',
'--download_if_missing=false'
],
synth=False, max_train=None)
synth=False)
@unittest.skipIf(keras_utils.is_v2_0(), 'TF 1.0 only test.')
def test_end_to_end_wide_deep(self):
......@@ -161,7 +161,7 @@ class BaseTest(tf.test.TestCase):
'--model_type', 'wide_deep',
'--download_if_missing=false'
],
synth=False, max_train=None)
synth=False)
if __name__ == '__main__':
......
......@@ -27,8 +27,8 @@ import tensorflow as tf
from official.datasets import movielens
from official.utils.flags import core as flags_core
from official.utils.logs import logger
from official.wide_deep import movielens_dataset
from official.wide_deep import wide_deep_run_loop
from official.r1.wide_deep import movielens_dataset
from official.r1.wide_deep import wide_deep_run_loop
def define_movie_flags():
......
......@@ -26,8 +26,8 @@ import tensorflow as tf # pylint: disable=g-bad-import-order
from official.datasets import movielens
from official.utils.misc import keras_utils
from official.utils.testing import integration
from official.wide_deep import movielens_dataset
from official.wide_deep import movielens_main
from official.r1.wide_deep import movielens_dataset
from official.r1.wide_deep import movielens_main
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
......@@ -112,7 +112,7 @@ class BaseTest(tf.test.TestCase):
"--train_epochs", "1",
"--epochs_between_evals", "1"
],
synth=False, max_train=None)
synth=False)
if __name__ == "__main__":
......
......@@ -143,37 +143,32 @@ class DatasetManager(object):
if is_training:
return {
movielens.USER_COLUMN:
tf.io.FixedLenFeature([batch_size], dtype=tf.int64),
tf.io.FixedLenFeature([batch_size, 1], dtype=tf.int64),
movielens.ITEM_COLUMN:
tf.io.FixedLenFeature([batch_size], dtype=tf.int64),
tf.io.FixedLenFeature([batch_size, 1], dtype=tf.int64),
rconst.VALID_POINT_MASK:
tf.io.FixedLenFeature([batch_size], dtype=tf.int64),
tf.io.FixedLenFeature([batch_size, 1], dtype=tf.int64),
"labels":
tf.io.FixedLenFeature([batch_size], dtype=tf.int64)
tf.io.FixedLenFeature([batch_size, 1], dtype=tf.int64)
}
else:
return {
movielens.USER_COLUMN:
tf.io.FixedLenFeature([batch_size], dtype=tf.int64),
tf.io.FixedLenFeature([batch_size, 1], dtype=tf.int64),
movielens.ITEM_COLUMN:
tf.io.FixedLenFeature([batch_size], dtype=tf.int64),
tf.io.FixedLenFeature([batch_size, 1], dtype=tf.int64),
rconst.DUPLICATE_MASK:
tf.io.FixedLenFeature([batch_size], dtype=tf.int64)
tf.io.FixedLenFeature([batch_size, 1], dtype=tf.int64)
}
features = tf.io.parse_single_example(
serialized_data, _get_feature_map(batch_size, is_training=is_training))
users = tf.reshape(
tf.cast(features[movielens.USER_COLUMN], rconst.USER_DTYPE),
(batch_size,))
items = tf.reshape(
tf.cast(features[movielens.ITEM_COLUMN], rconst.ITEM_DTYPE),
(batch_size,))
users = tf.cast(features[movielens.USER_COLUMN], rconst.USER_DTYPE)
items = tf.cast(features[movielens.ITEM_COLUMN], rconst.ITEM_DTYPE)
if is_training:
valid_point_mask = tf.reshape(
tf.cast(features[movielens.ITEM_COLUMN], tf.bool), (batch_size,))
fake_dup_mask = tf.zeros_like(features[movielens.USER_COLUMN])
valid_point_mask = tf.cast(features[rconst.VALID_POINT_MASK], tf.bool)
fake_dup_mask = tf.zeros_like(users)
return {
movielens.USER_COLUMN: users,
movielens.ITEM_COLUMN: items,
......@@ -184,20 +179,15 @@ class DatasetManager(object):
rconst.DUPLICATE_MASK: fake_dup_mask
}
else:
labels = tf.reshape(
tf.cast(tf.zeros_like(features[movielens.USER_COLUMN]), tf.bool),
(batch_size, 1))
fake_valid_pt_mask = tf.cast(
tf.zeros_like(features[movielens.USER_COLUMN]), tf.bool)
labels = tf.cast(tf.zeros_like(users), tf.bool)
fake_valid_pt_mask = tf.cast(tf.zeros_like(users), tf.bool)
return {
movielens.USER_COLUMN:
users,
movielens.ITEM_COLUMN:
items,
rconst.DUPLICATE_MASK:
tf.reshape(
tf.cast(features[rconst.DUPLICATE_MASK], tf.bool),
(batch_size,)),
tf.cast(features[rconst.DUPLICATE_MASK], tf.bool),
rconst.VALID_POINT_MASK:
fake_valid_pt_mask,
rconst.TRAIN_LABEL_KEY:
......@@ -221,8 +211,8 @@ class DatasetManager(object):
if self._is_training:
mask_start_index = data.pop(rconst.MASK_START_INDEX)
batch_size = data[movielens.ITEM_COLUMN].shape[0]
data[rconst.VALID_POINT_MASK] = np.less(
np.arange(batch_size), mask_start_index)
data[rconst.VALID_POINT_MASK] = np.expand_dims(
np.less(np.arange(batch_size), mask_start_index), -1)
if self._stream_files:
example_bytes = self.serialize(data)
......@@ -313,19 +303,21 @@ class DatasetManager(object):
else:
types = {movielens.USER_COLUMN: rconst.USER_DTYPE,
movielens.ITEM_COLUMN: rconst.ITEM_DTYPE}
shapes = {movielens.USER_COLUMN: tf.TensorShape([batch_size]),
movielens.ITEM_COLUMN: tf.TensorShape([batch_size])}
shapes = {
movielens.USER_COLUMN: tf.TensorShape([batch_size, 1]),
movielens.ITEM_COLUMN: tf.TensorShape([batch_size, 1])
}
if self._is_training:
types[rconst.VALID_POINT_MASK] = np.bool
shapes[rconst.VALID_POINT_MASK] = tf.TensorShape([batch_size])
shapes[rconst.VALID_POINT_MASK] = tf.TensorShape([batch_size, 1])
types = (types, np.bool)
shapes = (shapes, tf.TensorShape([batch_size]))
shapes = (shapes, tf.TensorShape([batch_size, 1]))
else:
types[rconst.DUPLICATE_MASK] = np.bool
shapes[rconst.DUPLICATE_MASK] = tf.TensorShape([batch_size])
shapes[rconst.DUPLICATE_MASK] = tf.TensorShape([batch_size, 1])
data_generator = functools.partial(
self.data_generator, epochs_between_evals=epochs_between_evals)
......@@ -554,12 +546,17 @@ class BaseDataConstructor(threading.Thread):
items = np.concatenate([items, item_pad])
labels = np.concatenate([labels, label_pad])
self._train_dataset.put(i, {
movielens.USER_COLUMN: users,
movielens.ITEM_COLUMN: items,
rconst.MASK_START_INDEX: np.array(mask_start_index, dtype=np.int32),
"labels": labels,
})
self._train_dataset.put(
i, {
movielens.USER_COLUMN:
np.reshape(users, (self.train_batch_size, 1)),
movielens.ITEM_COLUMN:
np.reshape(items, (self.train_batch_size, 1)),
rconst.MASK_START_INDEX:
np.array(mask_start_index, dtype=np.int32),
"labels":
np.reshape(labels, (self.train_batch_size, 1)),
})
def _wait_to_construct_train_epoch(self):
count = 0
......@@ -649,11 +646,15 @@ class BaseDataConstructor(threading.Thread):
users, items, duplicate_mask = self._assemble_eval_batch(
users, positive_items, negative_items, self._eval_users_per_batch)
self._eval_dataset.put(i, {
movielens.USER_COLUMN: users.flatten(),
movielens.ITEM_COLUMN: items.flatten(),
rconst.DUPLICATE_MASK: duplicate_mask.flatten(),
})
self._eval_dataset.put(
i, {
movielens.USER_COLUMN:
np.reshape(users.flatten(), (self.eval_batch_size, 1)),
movielens.ITEM_COLUMN:
np.reshape(items.flatten(), (self.eval_batch_size, 1)),
rconst.DUPLICATE_MASK:
np.reshape(duplicate_mask.flatten(), (self.eval_batch_size, 1)),
})
def _construct_eval_epoch(self):
"""Loop to construct data for evaluation."""
......@@ -720,24 +721,37 @@ class DummyConstructor(threading.Thread):
num_users = params["num_users"]
num_items = params["num_items"]
users = tf.random.uniform([batch_size], dtype=tf.int32, minval=0,
users = tf.random.uniform([batch_size, 1],
dtype=tf.int32,
minval=0,
maxval=num_users)
items = tf.random.uniform([batch_size], dtype=tf.int32, minval=0,
items = tf.random.uniform([batch_size, 1],
dtype=tf.int32,
minval=0,
maxval=num_items)
if is_training:
valid_point_mask = tf.cast(tf.random.uniform(
[batch_size], dtype=tf.int32, minval=0, maxval=2), tf.bool)
labels = tf.cast(tf.random.uniform(
[batch_size], dtype=tf.int32, minval=0, maxval=2), tf.bool)
valid_point_mask = tf.cast(
tf.random.uniform([batch_size, 1],
dtype=tf.int32,
minval=0,
maxval=2), tf.bool)
labels = tf.cast(
tf.random.uniform([batch_size, 1],
dtype=tf.int32,
minval=0,
maxval=2), tf.bool)
data = {
movielens.USER_COLUMN: users,
movielens.ITEM_COLUMN: items,
rconst.VALID_POINT_MASK: valid_point_mask,
}, labels
else:
dupe_mask = tf.cast(tf.random.uniform([batch_size], dtype=tf.int32,
minval=0, maxval=2), tf.bool)
dupe_mask = tf.cast(
tf.random.uniform([batch_size, 1],
dtype=tf.int32,
minval=0,
maxval=2), tf.bool)
data = {
movielens.USER_COLUMN: users,
movielens.ITEM_COLUMN: items,
......
......@@ -168,8 +168,11 @@ class BaseTest(tf.test.TestCase):
md5 = hashlib.md5()
for features, labels in first_epoch:
data_list = [
features[movielens.USER_COLUMN], features[movielens.ITEM_COLUMN],
features[rconst.VALID_POINT_MASK], labels]
features[movielens.USER_COLUMN].flatten(),
features[movielens.ITEM_COLUMN].flatten(),
features[rconst.VALID_POINT_MASK].flatten(),
labels.flatten()
]
for i in data_list:
md5.update(i.tobytes())
......@@ -216,8 +219,10 @@ class BaseTest(tf.test.TestCase):
md5 = hashlib.md5()
for features in eval_data:
data_list = [
features[movielens.USER_COLUMN], features[movielens.ITEM_COLUMN],
features[rconst.DUPLICATE_MASK]]
features[movielens.USER_COLUMN].flatten(),
features[movielens.ITEM_COLUMN].flatten(),
features[rconst.DUPLICATE_MASK].flatten()
]
for i in data_list:
md5.update(i.tobytes())
......@@ -276,8 +281,11 @@ class BaseTest(tf.test.TestCase):
md5 = hashlib.md5()
for features, labels in results:
data_list = [
features[movielens.USER_COLUMN], features[movielens.ITEM_COLUMN],
features[rconst.VALID_POINT_MASK], labels]
features[movielens.USER_COLUMN].flatten(),
features[movielens.ITEM_COLUMN].flatten(),
features[rconst.VALID_POINT_MASK].flatten(),
labels.flatten()
]
for i in data_list:
md5.update(i.tobytes())
......
......@@ -37,7 +37,6 @@ from official.utils.flags import core as flags_core
from official.utils.misc import distribution_utils
from official.utils.misc import keras_utils
FLAGS = flags.FLAGS
......@@ -60,13 +59,8 @@ def get_inputs(params):
dataset=FLAGS.dataset, data_dir=FLAGS.data_dir, params=params,
constructor_type=FLAGS.constructor_type,
deterministic=FLAGS.seed is not None)
num_train_steps = (producer.train_batches_per_epoch //
params["batches_per_step"])
num_eval_steps = (producer.eval_batches_per_epoch //
params["batches_per_step"])
assert not producer.train_batches_per_epoch % params["batches_per_step"]
assert not producer.eval_batches_per_epoch % params["batches_per_step"]
num_train_steps = producer.train_batches_per_epoch
num_eval_steps = producer.eval_batches_per_epoch
return num_users, num_items, num_train_steps, num_eval_steps, producer
......@@ -74,18 +68,13 @@ def get_inputs(params):
def parse_flags(flags_obj):
"""Convenience function to turn flags into params."""
num_gpus = flags_core.get_num_gpus(flags_obj)
num_devices = FLAGS.num_tpu_shards if FLAGS.tpu else num_gpus or 1
batch_size = (flags_obj.batch_size + num_devices - 1) // num_devices
eval_divisor = (rconst.NUM_EVAL_NEGATIVES + 1) * num_devices
batch_size = flags_obj.batch_size
eval_batch_size = flags_obj.eval_batch_size or flags_obj.batch_size
eval_batch_size = ((eval_batch_size + eval_divisor - 1) //
eval_divisor * eval_divisor // num_devices)
return {
"train_epochs": flags_obj.train_epochs,
"batches_per_step": num_devices,
"batches_per_step": 1,
"use_seed": flags_obj.seed is not None,
"batch_size": batch_size,
"eval_batch_size": eval_batch_size,
......@@ -95,6 +84,7 @@ def parse_flags(flags_obj):
"mf_regularization": flags_obj.mf_regularization,
"mlp_reg_layers": [float(reg) for reg in flags_obj.mlp_regularization],
"num_neg": flags_obj.num_neg,
"distribution_strategy": flags_obj.distribution_strategy,
"num_gpus": num_gpus,
"use_tpu": flags_obj.tpu is not None,
"tpu": flags_obj.tpu,
......@@ -115,7 +105,7 @@ def parse_flags(flags_obj):
}
def get_distribution_strategy(params):
def get_v1_distribution_strategy(params):
"""Returns the distribution strategy to use."""
if params["use_tpu"]:
# Some of the networking libraries are quite chatty.
......
......@@ -66,7 +66,7 @@ def construct_estimator(model_dir, params):
Returns:
An Estimator or TPUEstimator.
"""
distribution = ncf_common.get_distribution_strategy(params)
distribution = ncf_common.get_v1_distribution_strategy(params)
run_config = tf.estimator.RunConfig(train_distribute=distribution,
eval_distribute=distribution)
......
......@@ -82,7 +82,6 @@ def create_dataset_from_data_producer(producer, params):
Returns:
Processed training features.
"""
labels = tf.expand_dims(labels, -1)
fake_dup_mask = tf.zeros_like(features[movielens.USER_COLUMN])
features[rconst.DUPLICATE_MASK] = fake_dup_mask
features[rconst.TRAIN_LABEL_KEY] = labels
......@@ -106,7 +105,6 @@ def create_dataset_from_data_producer(producer, params):
Processed evaluation features.
"""
labels = tf.cast(tf.zeros_like(features[movielens.USER_COLUMN]), tf.bool)
labels = tf.expand_dims(labels, -1)
fake_valid_pt_mask = tf.cast(
tf.zeros_like(features[movielens.USER_COLUMN]), tf.bool)
features[rconst.VALID_POINT_MASK] = fake_valid_pt_mask
......@@ -119,7 +117,10 @@ def create_dataset_from_data_producer(producer, params):
return train_input_dataset, eval_input_dataset
def create_ncf_input_data(params, producer=None, input_meta_data=None):
def create_ncf_input_data(params,
producer=None,
input_meta_data=None,
strategy=None):
"""Creates NCF training/evaluation dataset.
Args:
......@@ -130,13 +131,31 @@ def create_ncf_input_data(params, producer=None, input_meta_data=None):
input_meta_data: A dictionary of input metadata to be used when reading data
from tf record files. Must be specified when params["train_input_dataset"]
is specified.
strategy: Distribution strategy used for distributed training. If specified,
it is used to check that the evaluation batch size is a multiple of the
total number of devices used.
Returns:
(training dataset, evaluation dataset, train steps per epoch,
eval steps per epoch)
Raises:
ValueError: If data is being generated online when using TPUs.
"""
# NCF evaluation metric calculation logic assumes that the evaluation data
# sample size is a multiple of (1 + number of negative samples in
# evaluation) for each device. As such, the evaluation batch size must be a
# multiple of (number of replicas * (1 + number of negative samples)).
num_devices = strategy.num_replicas_in_sync if strategy else 1
if (params["eval_batch_size"] % (num_devices *
(1 + rconst.NUM_EVAL_NEGATIVES))):
raise ValueError("Evaluation batch size must be divisible by {} "
"times {}".format(num_devices,
(1 + rconst.NUM_EVAL_NEGATIVES)))
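For illustration, a minimal standalone sketch of the divisibility rule enforced above (not part of this file); NUM_EVAL_NEGATIVES = 999 is assumed here to stand in for rconst.NUM_EVAL_NEGATIVES, and 160000 mirrors the eval_batch_size set by the benchmarks later in this commit:
NUM_EVAL_NEGATIVES = 999  # assumed value of rconst.NUM_EVAL_NEGATIVES

def check_eval_batch_size(eval_batch_size, num_devices):
  # Each device must receive whole groups of (1 positive + 999 negatives).
  divisor = num_devices * (1 + NUM_EVAL_NEGATIVES)
  if eval_batch_size % divisor:
    raise ValueError("Evaluation batch size must be divisible by {} "
                     "times {}".format(num_devices, 1 + NUM_EVAL_NEGATIVES))

check_eval_batch_size(160000, num_devices=8)   # OK: 160000 == 20 * 8 * 1000
# check_eval_batch_size(100000, num_devices=8) # raises: 100000 % 8000 != 0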
if params["train_dataset_path"]:
assert params["eval_dataset_path"]
train_dataset = create_dataset_from_tf_record_files(
params["train_dataset_path"],
input_meta_data["train_prebatch_size"],
......@@ -148,34 +167,18 @@ def create_ncf_input_data(params, producer=None, input_meta_data=None):
params["eval_batch_size"],
is_training=False)
# TODO(b/259377621): Remove number of devices (i.e.
# params["batches_per_step"]) in input pipeline logic and only use
# global batch size instead.
num_train_steps = int(
np.ceil(input_meta_data["num_train_steps"] /
params["batches_per_step"]))
num_eval_steps = (
input_meta_data["num_eval_steps"] // params["batches_per_step"])
num_train_steps = int(input_meta_data["num_train_steps"])
num_eval_steps = int(input_meta_data["num_eval_steps"])
else:
assert producer
if params["use_tpu"]:
raise ValueError("TPU training does not support data producer yet. "
"Use pre-processed data.")
assert producer
# Start retrieving data from producer.
train_dataset, eval_dataset = create_dataset_from_data_producer(
producer, params)
num_train_steps = (
producer.train_batches_per_epoch // params["batches_per_step"])
num_eval_steps = (
producer.eval_batches_per_epoch // params["batches_per_step"])
assert not producer.train_batches_per_epoch % params["batches_per_step"]
assert not producer.eval_batches_per_epoch % params["batches_per_step"]
# It is required that for distributed training, the dataset must call
# batch(). The parameter of batch() here is the number of replicas involved,
# such that each replica evenly gets a slice of data.
# drop_remainder = True, as we would like the batch call to return a fixed
# shape vs None; this prevents an expensive broadcast during weighted_loss
batches_per_step = params["batches_per_step"]
train_dataset = train_dataset.batch(batches_per_step, drop_remainder=True)
eval_dataset = eval_dataset.batch(batches_per_step, drop_remainder=True)
num_train_steps = producer.train_batches_per_epoch
num_eval_steps = producer.eval_batches_per_epoch
return train_dataset, eval_dataset, num_train_steps, num_eval_steps
......@@ -181,6 +181,13 @@ class NCFKerasAccuracy(NCFKerasBenchmarkBase):
FLAGS.early_stopping = True
self._run_and_report_benchmark()
def benchmark_1_gpu_ctl_run_eagerly_early_stop(self):
self._setup()
FLAGS.keras_use_ctl = True
FLAGS.early_stopping = True
FLAGS.run_eagerly = True
self._run_and_report_benchmark()
def benchmark_xla_1_gpu_ctl_early_stop(self):
self._setup()
FLAGS.keras_use_ctl = True
......@@ -192,6 +199,7 @@ class NCFKerasAccuracy(NCFKerasBenchmarkBase):
self._setup()
FLAGS.early_stopping = True
FLAGS.num_gpus = 2
FLAGS.eval_batch_size = 160000
self._run_and_report_benchmark()
def benchmark_2_gpus_ctl_early_stop(self):
......@@ -200,10 +208,11 @@ class NCFKerasAccuracy(NCFKerasBenchmarkBase):
FLAGS.keras_use_ctl = True
FLAGS.early_stopping = True
FLAGS.num_gpus = 2
FLAGS.eval_batch_size = 160000
self._run_and_report_benchmark()
#############################################
# Tests below with mlperf in the test name are of two types
# Tests below with mlperf in the test name are of two types:
# 1) 1 GPU tests are based on MLPerf 0.5 and the TensorFlow pulled submission.
# 2) 8 GPU tests are based on MLPerf 0.5 and use NVIDIA's hyper parameters.
#
......@@ -254,6 +263,14 @@ class NCFKerasAccuracy(NCFKerasBenchmarkBase):
FLAGS.train_epochs = 7
self._run_and_report_benchmark_mlperf_like()
def benchmark_1_gpu_ctl_run_eagerly_mlperf_like(self):
"""1 GPU using CTL with eager and distribution strategy."""
self._setup()
FLAGS.keras_use_ctl = True
FLAGS.run_eagerly = True
FLAGS.train_epochs = 7
self._run_and_report_benchmark()
def benchmark_xla_1_gpu_ctl_mlperf_like(self):
"""1 GPU using CTL with XLA."""
self._setup()
......@@ -268,6 +285,7 @@ class NCFKerasAccuracy(NCFKerasBenchmarkBase):
FLAGS.num_gpus = 8
FLAGS.train_epochs = 17
FLAGS.batch_size = 1048576
FLAGS.eval_batch_size = 160000
FLAGS.learning_rate = 0.0045
FLAGS.beta1 = 0.25
FLAGS.beta2 = 0.5
......@@ -280,6 +298,7 @@ class NCFKerasAccuracy(NCFKerasBenchmarkBase):
FLAGS.num_gpus = 8
FLAGS.train_epochs = 17
FLAGS.batch_size = 1048576
FLAGS.eval_batch_size = 160000
FLAGS.learning_rate = 0.0045
FLAGS.beta1 = 0.25
FLAGS.beta2 = 0.5
......@@ -287,19 +306,6 @@ class NCFKerasAccuracy(NCFKerasBenchmarkBase):
FLAGS.force_v2_in_keras_compile = False
self._run_and_report_benchmark_mlperf_like()
def benchmark_xla_8_gpu_mlperf_like(self):
"""8 GPU using keras fit/compile with XLA."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.enable_xla = True
FLAGS.train_epochs = 17
FLAGS.batch_size = 1048576
FLAGS.learning_rate = 0.0045
FLAGS.beta1 = 0.25
FLAGS.beta2 = 0.5
FLAGS.epsilon = 1e-8
self._run_and_report_benchmark_mlperf_like()
def benchmark_8_gpu_ctl_mlperf_like(self):
"""8 GPU using CTL."""
self._setup()
......@@ -307,20 +313,7 @@ class NCFKerasAccuracy(NCFKerasBenchmarkBase):
FLAGS.num_gpus = 8
FLAGS.train_epochs = 17
FLAGS.batch_size = 1048576
FLAGS.learning_rate = 0.0045
FLAGS.beta1 = 0.25
FLAGS.beta2 = 0.5
FLAGS.epsilon = 1e-8
self._run_and_report_benchmark_mlperf_like()
def benchmark_xla_8_gpu_ctl_mlperf_like(self):
"""8 GPU using CTL with XLA."""
self._setup()
FLAGS.keras_use_ctl = True
FLAGS.enable_xla = True
FLAGS.num_gpus = 8
FLAGS.train_epochs = 17
FLAGS.batch_size = 1048576
FLAGS.eval_batch_size = 160000
FLAGS.learning_rate = 0.0045
FLAGS.beta1 = 0.25
FLAGS.beta2 = 0.5
......@@ -341,6 +334,7 @@ class NCFKerasSynth(NCFKerasBenchmarkBase):
default_flags['num_gpus'] = 1
default_flags['train_epochs'] = 8
default_flags['batch_size'] = 99000
default_flags['eval_batch_size'] = 160000
default_flags['learning_rate'] = 0.00382059
default_flags['beta1'] = 0.783529
default_flags['beta2'] = 0.909003
......