Commit 901c4cc4 authored by Vinh Nguyen

Merge remote-tracking branch 'upstream/master' into amp_resnet50

parents ef30de93 824ff2d6
@@ -47,13 +47,18 @@ To make Official Models easier to use, we are planning to create a pip installab
* [bert](bert): A powerful pre-trained language representation model: BERT, which
stands for Bidirectional Encoder Representations from Transformers.
* [boosted_trees](boosted_trees): A Gradient Boosted Trees model to classify the Higgs boson process from the HIGGS Data Set.
* [mnist](mnist): A basic model to classify digits from the MNIST dataset.
* [resnet](resnet): A deep residual network that can be used to classify both the CIFAR-10 dataset and the 1000-class ImageNet dataset.
* [transformer](transformer): A transformer model to translate the WMT English-to-German dataset.
* [wide_deep](wide_deep): A model that combines a wide model and deep network to classify census income data.
* More models to come!
Models that will not be updated to TensorFlow 2.x stay inside the r1 directory:
* [boosted_trees](r1/boosted_trees): A Gradient Boosted Trees model to classify the
Higgs boson process from the HIGGS Data Set.
If you would like to make any fixes or improvements to the models, please [submit a pull request](https://github.com/tensorflow/models/compare).
## New Models
@@ -22,8 +22,8 @@ import time
from absl import flags
import tensorflow as tf # pylint: disable=g-bad-import-order
from official.resnet.keras import keras_benchmark
from official.resnet.keras import keras_cifar_main
from official.benchmark import keras_benchmark
from official.vision.image_classification import resnet_cifar_main
MIN_TOP_1_ACCURACY = 0.929
MAX_TOP_1_ACCURACY = 0.938
@@ -47,7 +47,7 @@ class Resnet56KerasAccuracy(keras_benchmark.KerasBenchmark):
"""
self.data_dir = os.path.join(root_data_dir, CIFAR_DATA_DIR_NAME)
flag_methods = [keras_cifar_main.define_cifar_flags]
flag_methods = [resnet_cifar_main.define_cifar_flags]
super(Resnet56KerasAccuracy, self).__init__(
output_dir=output_dir, flag_methods=flag_methods)
@@ -199,7 +199,7 @@ class Resnet56KerasAccuracy(keras_benchmark.KerasBenchmark):
def _run_and_report_benchmark(self):
start_time_sec = time.time()
stats = keras_cifar_main.run(FLAGS)
stats = resnet_cifar_main.run(FLAGS)
wall_time_sec = time.time() - start_time_sec
super(Resnet56KerasAccuracy, self)._report_benchmark(
@@ -215,7 +215,7 @@ class Resnet56KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
"""Short performance tests for ResNet56 via Keras and CIFAR-10."""
def __init__(self, output_dir=None, default_flags=None):
flag_methods = [keras_cifar_main.define_cifar_flags]
flag_methods = [resnet_cifar_main.define_cifar_flags]
super(Resnet56KerasBenchmarkBase, self).__init__(
output_dir=output_dir,
@@ -224,7 +224,7 @@ class Resnet56KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
def _run_and_report_benchmark(self):
start_time_sec = time.time()
stats = keras_cifar_main.run(FLAGS)
stats = resnet_cifar_main.run(FLAGS)
wall_time_sec = time.time() - start_time_sec
super(Resnet56KerasBenchmarkBase, self)._report_benchmark(
@@ -248,6 +248,7 @@ class Resnet56KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.run_eagerly = False
FLAGS.enable_xla = True
FLAGS.distribution_strategy = 'default'
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_xla')
@@ -270,6 +271,7 @@ class Resnet56KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = False
FLAGS.run_eagerly = False
FLAGS.distribution_strategy = 'default'
FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu')
FLAGS.batch_size = 128
@@ -340,6 +342,7 @@ class Resnet56KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
self._setup()
FLAGS.num_gpus = 2
FLAGS.enable_eager = True
FLAGS.run_eagerly = False
FLAGS.distribution_strategy = 'default'
FLAGS.model_dir = self._get_model_dir('benchmark_2_gpu')
FLAGS.batch_size = 128 * 2 # 2 GPUs
@@ -350,6 +353,7 @@ class Resnet56KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
self._setup()
FLAGS.num_gpus = 2
FLAGS.enable_eager = False
FLAGS.run_eagerly = False
FLAGS.distribution_strategy = 'default'
FLAGS.model_dir = self._get_model_dir('benchmark_graph_2_gpu')
FLAGS.batch_size = 128 * 2 # 2 GPUs
@@ -21,8 +21,8 @@ import time
from absl import flags
import tensorflow as tf # pylint: disable=g-bad-import-order
from official.resnet.keras import keras_benchmark
from official.resnet.keras import keras_imagenet_main
from official.benchmark import keras_benchmark
from official.vision.image_classification import resnet_imagenet_main
MIN_TOP_1_ACCURACY = 0.76
MAX_TOP_1_ACCURACY = 0.77
@@ -44,7 +44,7 @@ class Resnet50KerasAccuracy(keras_benchmark.KerasBenchmark):
named arguments before updating the constructor.
"""
flag_methods = [keras_imagenet_main.define_imagenet_keras_flags]
flag_methods = [resnet_imagenet_main.define_imagenet_keras_flags]
self.data_dir = os.path.join(root_data_dir, 'imagenet')
super(Resnet50KerasAccuracy, self).__init__(
@@ -129,32 +129,6 @@ class Resnet50KerasAccuracy(keras_benchmark.KerasBenchmark):
FLAGS.use_tensor_lr = True
self._run_and_report_benchmark()
def benchmark_8_gpu_mlperf_like_tweaked(self):
"""Test similar to the rules for MLPerf 0.5.
Listed below are reasons this comparison does not follow the MLPerf spec, but it is
still a decent directional measurement:
- Eval runs every 4 epochs and again at the end, i.e. ~2 extra times.
- The learning rate is not tuned to hit 75%, but we know the model is correct.
- We measure total time, whereas MLPerf 0.5 excluded some startup time.
- Eval is not on the full set; eval batch_size must be set so that
8*batch_size divides 50K evenly. 250 is a good number.
"""
self._setup()
FLAGS.num_gpus = 8
FLAGS.data_dir = self.data_dir
FLAGS.batch_size = 256 * 8
FLAGS.train_epochs = 61
FLAGS.epochs_between_evals = 4
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_mlperf_like_tweaked')
FLAGS.dtype = 'fp16'
FLAGS.enable_eager = True
FLAGS.enable_xla = True
FLAGS.use_tensor_lr = True
FLAGS.tf_gpu_thread_mode = 'gpu_private'
self._run_and_report_benchmark(top_1_min=0.736)
def benchmark_8_gpu_mlperf_like(self):
"""Test similar to the rules for MLPerf 0.5.
@@ -201,7 +175,7 @@ class Resnet50KerasAccuracy(keras_benchmark.KerasBenchmark):
top_1_min=MIN_TOP_1_ACCURACY,
top_1_max=MAX_TOP_1_ACCURACY):
start_time_sec = time.time()
stats = keras_imagenet_main.run(flags.FLAGS)
stats = resnet_imagenet_main.run(flags.FLAGS)
wall_time_sec = time.time() - start_time_sec
super(Resnet50KerasAccuracy, self)._report_benchmark(
@@ -220,7 +194,7 @@ class Resnet50KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
"""Resnet50 benchmarks."""
def __init__(self, output_dir=None, default_flags=None):
flag_methods = [keras_imagenet_main.define_imagenet_keras_flags]
flag_methods = [resnet_imagenet_main.define_imagenet_keras_flags]
super(Resnet50KerasBenchmarkBase, self).__init__(
output_dir=output_dir,
@@ -229,7 +203,7 @@ class Resnet50KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
def _run_and_report_benchmark(self):
start_time_sec = time.time()
stats = keras_imagenet_main.run(FLAGS)
stats = resnet_imagenet_main.run(FLAGS)
wall_time_sec = time.time() - start_time_sec
# Number of logged step time entries that are excluded in performance
# report. We keep results from last 100 batches in this case.
@@ -294,48 +268,6 @@ class Resnet50KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
FLAGS.batch_size = 64
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat_force_v1_path_run_eagerly(self):
"""Forced v1 execution in tf.compile path and force eager."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.run_eagerly = True
FLAGS.distribution_strategy = 'off'
FLAGS.model_dir = self._get_model_dir(
'benchmark_1_gpu_no_dist_strat_force_v1_path_run_eagerly')
FLAGS.batch_size = 64
FLAGS.force_v2_in_keras_compile = False
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat_force_v1_path_run_eagerly_tweaked(self):
"""Forced v1 execution in tf.compile path and force eager."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.run_eagerly = True
FLAGS.explicit_gpu_placement = True
FLAGS.distribution_strategy = 'off'
FLAGS.model_dir = self._get_model_dir(
'benchmark_1_gpu_no_dist_strat_force_v1_path_run_eagerly_tweaked')
FLAGS.batch_size = 64
FLAGS.force_v2_in_keras_compile = False
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat_force_v1_path(self):
"""No dist strat but forced v1 execution tf.compile path."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'off'
FLAGS.model_dir = self._get_model_dir(
'benchmark_1_gpu_no_dist_strat_force_v1_path')
FLAGS.batch_size = 128
FLAGS.force_v2_in_keras_compile = False
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat_run_eagerly_fp16(self):
"""Test with 1 GPU, no distribution strategy, fp16, run eagerly."""
self._setup()
@@ -478,20 +410,6 @@ class Resnet50KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
FLAGS.tf_gpu_thread_mode = 'gpu_private'
self._run_and_report_benchmark()
def benchmark_xla_1_gpu_fp16_slack(self):
"""Test Keras model tf.data's experimental_slack functionality."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.enable_xla = True
FLAGS.distribution_strategy = 'default'
FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu_fp16_slack')
FLAGS.dtype = 'fp16'
FLAGS.batch_size = 256
FLAGS.tf_data_experimental_slack = True
self._run_and_report_benchmark()
def benchmark_xla_1_gpu_fp16_dynamic(self):
"""Test Keras model with XLA, 1 GPU, fp16, and dynamic loss scaling."""
self._setup()
@@ -570,21 +488,6 @@ class Resnet50KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
FLAGS.tf_gpu_thread_mode = 'gpu_private'
self._run_and_report_benchmark()
def benchmark_graph_xla_1_gpu_fp16_slack(self):
"""Test model in legacy graph with tf.data's experimental_slack."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = False
FLAGS.enable_xla = True
FLAGS.distribution_strategy = 'default'
FLAGS.model_dir = self._get_model_dir(
'benchmark_graph_xla_1_gpu_fp16_slack')
FLAGS.dtype = 'fp16'
FLAGS.batch_size = 256
FLAGS.tf_data_experimental_slack = True
self._run_and_report_benchmark()
def benchmark_8_gpu(self):
"""Test Keras model with 8 GPUs."""
self._setup()
@@ -621,18 +524,6 @@ class Resnet50KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
FLAGS.datasets_num_private_threads = 14
self._run_and_report_benchmark()
def benchmark_8_gpu_slack(self):
"""Test Keras model with tf.data's experimental_slack and 8 GPUs."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'default'
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_slack')
FLAGS.batch_size = 128 * 8 # 8 GPUs
FLAGS.tf_data_experimental_slack = True
self._run_and_report_benchmark()
def benchmark_xla_8_gpu(self):
"""Test Keras model with XLA and 8 GPUs."""
self._setup()
@@ -715,24 +606,6 @@ class Resnet50KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
FLAGS.tf_gpu_thread_mode = 'gpu_private'
self._run_and_report_benchmark()
def benchmark_xla_8_gpu_fp16_optional_next(self):
"""Test Keras model with XLA, 8 GPUs and fp16.
This test also enables get_next_as_optional.
"""
self._setup()
FLAGS.num_gpus = 8
FLAGS.dtype = 'fp16'
FLAGS.enable_eager = True
FLAGS.enable_xla = True
FLAGS.distribution_strategy = 'default'
FLAGS.model_dir = self._get_model_dir(
'benchmark_xla_8_gpu_fp16_optional_next')
FLAGS.batch_size = 256 * 8 # 8 GPUs
FLAGS.enable_get_next_as_optional = True
self._run_and_report_benchmark()
def benchmark_xla_8_gpu_fp16(self):
"""Test Keras model with XLA, 8 GPUs and fp16."""
self._setup()
@@ -782,44 +655,6 @@ class Resnet50KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
FLAGS.train_steps = 310
self._run_and_report_benchmark()
def benchmark_xla_8_gpu_fp16_tweaked_optional_next(self):
"""Test Keras model with manual config tuning, XLA, 8 GPUs, fp16.
This test also enables get_next_as_optional.
"""
self._setup()
FLAGS.num_gpus = 8
FLAGS.dtype = 'fp16'
FLAGS.enable_eager = True
FLAGS.enable_xla = True
FLAGS.distribution_strategy = 'default'
FLAGS.model_dir = self._get_model_dir(
'benchmark_xla_8_gpu_fp16_tweaked_optional_next')
FLAGS.batch_size = 256 * 8 # 8 GPUs
FLAGS.use_tensor_lr = True
FLAGS.tf_gpu_thread_mode = 'gpu_private'
FLAGS.datasets_num_private_threads = 48
FLAGS.enable_get_next_as_optional = True
self._run_and_report_benchmark()
def benchmark_xla_8_gpu_fp16_slack(self):
"""Test Keras model with XLA, 8 GPUs and fp16.
This test also enables tf.data's experimental_slack functionality.
"""
self._setup()
FLAGS.num_gpus = 8
FLAGS.dtype = 'fp16'
FLAGS.enable_eager = True
FLAGS.enable_xla = True
FLAGS.distribution_strategy = 'default'
FLAGS.model_dir = self._get_model_dir('benchmark_xla_8_gpu_fp16_slack')
FLAGS.batch_size = 256 * 8 # 8 GPUs
FLAGS.tf_data_experimental_slack = True
self._run_and_report_benchmark()
def benchmark_xla_8_gpu_fp16_dynamic_tweaked(self):
"""Test Keras model with config tuning, XLA, 8 GPUs and dynamic fp16."""
self._setup()
@@ -838,24 +673,6 @@ class Resnet50KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
FLAGS.datasets_num_private_threads = 48
self._run_and_report_benchmark()
def benchmark_xla_8_gpu_fp16_tensorboard_tweaked(self):
"""Test to track Tensorboard performance overhead."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.dtype = 'fp16'
FLAGS.enable_eager = True
FLAGS.enable_xla = True
FLAGS.distribution_strategy = 'default'
FLAGS.model_dir = self._get_model_dir(
'benchmark_xla_8_gpu_fp16_tensorboard_tweaked')
FLAGS.batch_size = 256 * 8 # 8 GPUs
FLAGS.use_tensor_lr = True
FLAGS.tf_gpu_thread_mode = 'gpu_private'
FLAGS.datasets_num_private_threads = 48
FLAGS.enable_tensorboard = True
self._run_and_report_benchmark()
def benchmark_graph_8_gpu(self):
"""Test Keras model in legacy graph mode with 8 GPUs."""
self._setup()
@@ -954,41 +771,6 @@ class Resnet50KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
FLAGS.train_steps = 310
self._run_and_report_benchmark()
def benchmark_graph_xla_8_gpu_fp16_tweaked_optional_next(self):
"""Test in legacy graph mode with manual config tuning, XLA, 8 GPUs, fp16.
This test also enables get_next_as_optional.
"""
self._setup()
FLAGS.num_gpus = 8
FLAGS.dtype = 'fp16'
FLAGS.enable_eager = False
FLAGS.enable_xla = True
FLAGS.distribution_strategy = 'default'
FLAGS.model_dir = self._get_model_dir(
'benchmark_graph_xla_8_gpu_fp16_tweaked_optional_next')
FLAGS.batch_size = 256 * 8 # 8 GPUs
FLAGS.use_tensor_lr = True
FLAGS.tf_gpu_thread_mode = 'gpu_private'
FLAGS.enable_get_next_as_optional = True
self._run_and_report_benchmark()
def benchmark_graph_xla_8_gpu_fp16_slack(self):
"""Test legacy graph mode with tf.data's experimental_slack."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.dtype = 'fp16'
FLAGS.enable_eager = False
FLAGS.enable_xla = True
FLAGS.distribution_strategy = 'default'
FLAGS.model_dir = self._get_model_dir(
'benchmark_graph_xla_8_gpu_fp16_slack')
FLAGS.batch_size = 256 * 8 # 8 GPUs
FLAGS.tf_data_experimental_slack = True
self._run_and_report_benchmark()
def benchmark_graph_8_gpu_fp16_dynamic_tweaked(self):
"""Test graph Keras with config tuning, 8 GPUs and dynamic fp16."""
self._setup()
@@ -1063,7 +845,7 @@ class TrivialKerasBenchmarkReal(keras_benchmark.KerasBenchmark):
"""Trivial model with real data benchmark tests."""
def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
flag_methods = [keras_imagenet_main.define_imagenet_keras_flags]
flag_methods = [resnet_imagenet_main.define_imagenet_keras_flags]
def_flags = {}
def_flags['use_trivial_model'] = True
@@ -1083,7 +865,7 @@ class TrivialKerasBenchmarkReal(keras_benchmark.KerasBenchmark):
def _run_and_report_benchmark(self):
start_time_sec = time.time()
stats = keras_imagenet_main.run(FLAGS)
stats = resnet_imagenet_main.run(FLAGS)
wall_time_sec = time.time() - start_time_sec
super(TrivialKerasBenchmarkReal, self)._report_benchmark(
@@ -1180,5 +962,96 @@ class TrivialKerasBenchmarkReal(keras_benchmark.KerasBenchmark):
log_steps=FLAGS.log_steps)
class Resnet50MultiWorkerKerasBenchmark(Resnet50KerasBenchmarkBase):
"""Resnet50 distributed benchmark tests with multiple workers."""
def __init__(self, output_dir=None, default_flags=None):
super(Resnet50MultiWorkerKerasBenchmark, self).__init__(
output_dir=output_dir, default_flags=default_flags)
def _benchmark_common(self, eager, num_workers, all_reduce_alg):
"""Common to all benchmarks in this class."""
self._setup()
num_gpus = 8
FLAGS.num_gpus = num_gpus
FLAGS.dtype = 'fp16'
FLAGS.enable_eager = eager
FLAGS.enable_xla = False
FLAGS.distribution_strategy = 'multi_worker_mirrored'
FLAGS.use_tensor_lr = True
FLAGS.tf_gpu_thread_mode = 'gpu_private'
FLAGS.model_dir = self._get_model_dir(
'benchmark_graph_8_gpu_{}_worker_fp16_{}_tweaked'.format(
num_workers, all_reduce_alg))
FLAGS.batch_size = 256 * num_gpus * num_workers
FLAGS.all_reduce_alg = all_reduce_alg
self._run_and_report_benchmark()
def benchmark_graph_8_gpu_1_worker_fp16_ring_tweaked(self):
"""Legacy graph, 8 GPUs per worker, 1 worker, fp16, ring all-reduce."""
self._benchmark_common(eager=False, num_workers=1, all_reduce_alg='ring')
def benchmark_graph_8_gpu_1_worker_fp16_nccl_tweaked(self):
"""Legacy graph, 8 GPUs per worker, 1 worker, fp16, nccl all-reduce."""
self._benchmark_common(eager=False, num_workers=1, all_reduce_alg='nccl')
def benchmark_graph_8_gpu_2_workers_fp16_ring_tweaked(self):
"""Legacy graph, 8 GPUs per worker, 2 workers, fp16, ring all-reduce."""
self._benchmark_common(eager=False, num_workers=2, all_reduce_alg='ring')
def benchmark_graph_8_gpu_2_workers_fp16_nccl_tweaked(self):
"""Legacy graph, 8 GPUs per worker, 2 workers, fp16, nccl all-reduce."""
self._benchmark_common(eager=False, num_workers=2, all_reduce_alg='nccl')
def benchmark_graph_8_gpu_8_workers_fp16_ring_tweaked(self):
"""Legacy graph, 8 GPUs per worker, 8 workers, fp16, ring all-reduce."""
self._benchmark_common(eager=False, num_workers=8, all_reduce_alg='ring')
def benchmark_graph_8_gpu_8_workers_fp16_nccl_tweaked(self):
"""Legacy graph, 8 GPUs per worker, 8 workers, fp16, nccl all-reduce."""
self._benchmark_common(eager=False, num_workers=8, all_reduce_alg='nccl')
def benchmark_eager_8_gpu_1_worker_fp16_ring_tweaked(self):
"""Eager, 8 GPUs per worker, 1 worker, fp16, ring all-reduce."""
self._benchmark_common(eager=True, num_workers=1, all_reduce_alg='ring')
def benchmark_eager_8_gpu_1_worker_fp16_nccl_tweaked(self):
"""Eager, 8 GPUs per worker, 1 worker, fp16, nccl all-reduce."""
self._benchmark_common(eager=True, num_workers=1, all_reduce_alg='nccl')
def benchmark_eager_8_gpu_2_workers_fp16_ring_tweaked(self):
"""Eager, 8 GPUs per worker, 2 workers, fp16, ring all-reduce."""
self._benchmark_common(eager=True, num_workers=2, all_reduce_alg='ring')
def benchmark_eager_8_gpu_2_workers_fp16_nccl_tweaked(self):
"""Eager, 8 GPUs per worker, 2 workers, fp16, nccl all-reduce."""
self._benchmark_common(eager=True, num_workers=2, all_reduce_alg='nccl')
def benchmark_eager_8_gpu_8_workers_fp16_ring_tweaked(self):
"""Eager, 8 GPUs per worker, 8 workers, fp16, ring all-reduce."""
self._benchmark_common(eager=True, num_workers=8, all_reduce_alg='ring')
def benchmark_eager_8_gpu_8_workers_fp16_nccl_tweaked(self):
"""Eager, 8 GPUs per worker, 8 workers, fp16, nccl all-reduce."""
self._benchmark_common(eager=True, num_workers=8, all_reduce_alg='nccl')
class Resnet50MultiWorkerKerasBenchmarkSynth(Resnet50MultiWorkerKerasBenchmark):
"""Resnet50 multi-worker synthetic benchmark tests."""
def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
def_flags = {}
def_flags['skip_eval'] = True
def_flags['report_accuracy_metrics'] = False
def_flags['use_synthetic_data'] = True
def_flags['train_steps'] = 110
def_flags['log_steps'] = 10
super(Resnet50MultiWorkerKerasBenchmarkSynth, self).__init__(
output_dir=output_dir, default_flags=def_flags)
if __name__ == '__main__':
tf.test.main()
# BERT (Bidirectional Encoder Representations from Transformers)
Note: Please do not create pull requests. This model is still under development
and testing.
The academic paper that describes BERT in detail and provides full results on a
number of tasks can be found here: https://arxiv.org/abs/1810.04805.
@@ -30,6 +27,31 @@ Our currently released checkpoints are exactly the same as those in the TF 1.x official BERT
repository; thus, `BertConfig` has `backward_compatible=True`. We
are going to release new pre-trained checkpoints soon.
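As a hedged illustration of the compatibility note above (the config filename is an assumption, and `from_json_file` mirrors the TF 1.x BERT API):

```python
from official.bert import modeling

# Hypothetical sketch: load the config shipped alongside a converted
# checkpoint. The JSON path is a placeholder for illustration.
bert_config = modeling.BertConfig.from_json_file(
    "uncased_L-12_H-768_A-12/bert_config.json")
# Converted TF 1.x checkpoints keep this flag set for shape compatibility.
assert bert_config.backward_compatible
```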
### Access to Pretrained Checkpoints
We provide checkpoints converted from [google-research/bert](https://github.com/google-research/bert)
in order to stay consistent with the BERT paper.
* **[`BERT-Large, Uncased (Whole Word Masking)`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/tf_20/wwm_uncased_L-24_H-1024_A-16.tar.gz)**:
24-layer, 1024-hidden, 16-heads, 340M parameters
* **[`BERT-Large, Cased (Whole Word Masking)`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/tf_20/wwm_cased_L-24_H-1024_A-16.tar.gz)**:
24-layer, 1024-hidden, 16-heads, 340M parameters
* **[`BERT-Base, Uncased`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/tf_20/uncased_L-12_H-768_A-12.tar.gz)**:
12-layer, 768-hidden, 12-heads, 110M parameters
* **[`BERT-Large, Uncased`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/tf_20/uncased_L-24_H-1024_A-16.tar.gz)**:
24-layer, 1024-hidden, 16-heads, 340M parameters
* **[`BERT-Base, Cased`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/tf_20/cased_L-12_H-768_A-12.tar.gz)**:
12-layer, 768-hidden, 12-heads, 110M parameters
* **[`BERT-Large, Cased`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/tf_20/cased_L-24_H-1024_A-16.tar.gz)**:
24-layer, 1024-hidden, 16-heads, 340M parameters
We recommend hosting checkpoints on Google Cloud Storage buckets when you use
Cloud GPUs/TPUs. For example, in the following tutorial, we use:
```shell
export BERT_BASE_DIR=gs://cloud-tpu-checkpoints/bert/tf_20/uncased_L-24_H-1024_A-16
```
### Restoring from Checkpoints
`tf.train.Checkpoint` is used to manage model checkpoints in TF 2.0. To restore
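A minimal TF 2.0 restore sketch (the model and the model directory below are placeholders, not this repo's actual code):

```python
import tensorflow as tf

# Minimal sketch: restore the latest checkpoint from a model directory.
model = tf.keras.Sequential([tf.keras.layers.Dense(2)])
checkpoint = tf.train.Checkpoint(model=model)
latest_ckpt = tf.train.latest_checkpoint('/tmp/bert_model_dir')
if latest_ckpt:
  # expect_partial() silences warnings about checkpointed objects that are
  # not present in this object graph (e.g. optimizer slots).
  checkpoint.restore(latest_ckpt).expect_partial()
```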
@@ -70,9 +92,9 @@ Second, you need to install the TF 2.0 nightly build (`tf-nightly`) on your VM:
pip install tf-nightly-2.0-preview
```
Warning: More detailed TPU-specific set-up instructions and a tutorial for TF 2.0
are coming. Note that this repo is not officially supported by the Google Cloud TPU
team yet.
Warning: More detailed TPU-specific set-up instructions and a tutorial should come
along with the official TF 2.x release for TPU. Note that this repo is not officially
supported by the Google Cloud TPU team yet.
## Process Datasets
@@ -152,7 +152,7 @@ class BertSquadBenchmarkReal(BertSquadBenchmarkBase):
self._setup()
self.num_gpus = 1
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_squad')
FLAGS.train_batch_size = 4
FLAGS.train_batch_size = 3
self._run_and_report_benchmark()
@@ -174,7 +174,7 @@ class BertSquadBenchmarkReal(BertSquadBenchmarkBase):
self._setup()
self.num_gpus = 1
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_no_dist_strat_squad')
FLAGS.train_batch_size = 4
FLAGS.train_batch_size = 3
self._run_and_report_benchmark(use_ds=False)
@@ -185,7 +185,7 @@ class BertSquadBenchmarkReal(BertSquadBenchmarkBase):
self.num_gpus = 1
FLAGS.model_dir = self._get_model_dir(
'benchmark_1_gpu_eager_no_dist_strat_squad')
FLAGS.train_batch_size = 4
FLAGS.train_batch_size = 3
self._run_and_report_benchmark(use_ds=False, run_eagerly=True)
@@ -195,7 +195,7 @@ class BertSquadBenchmarkReal(BertSquadBenchmarkBase):
self._setup()
self.num_gpus = 2
FLAGS.model_dir = self._get_model_dir('benchmark_2_gpu_squad')
FLAGS.train_batch_size = 8
FLAGS.train_batch_size = 6
self._run_and_report_benchmark()
@@ -205,7 +205,7 @@ class BertSquadBenchmarkReal(BertSquadBenchmarkBase):
self._setup()
self.num_gpus = 4
FLAGS.model_dir = self._get_model_dir('benchmark_4_gpu_squad')
FLAGS.train_batch_size = 16
FLAGS.train_batch_size = 12
self._run_and_report_benchmark()
@@ -215,7 +215,7 @@ class BertSquadBenchmarkReal(BertSquadBenchmarkBase):
self._setup()
self.num_gpus = 8
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_squad')
FLAGS.train_batch_size = 32
FLAGS.train_batch_size = 24
self._run_and_report_benchmark()
@@ -231,6 +231,19 @@ class BertSquadBenchmarkReal(BertSquadBenchmarkBase):
self._run_and_report_benchmark()
def benchmark_1_gpu_xla_fp16(self):
"""Tests BERT SQuAD model performance with 1 GPU with XLA and FP16."""
self._setup()
self.num_gpus = 1
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_xla_squad_fp16')
FLAGS.train_batch_size = 4
FLAGS.enable_xla = True
FLAGS.dtype = 'fp16'
FLAGS.loss_scale = 'dynamic'
self._run_and_report_benchmark()
def benchmark_2_gpu_fp16(self):
"""Tests BERT SQuAD model performance with 2 GPUs and FP16."""
@@ -324,7 +337,7 @@ class BertSquadAccuracy(BertSquadBenchmarkBase):
self._setup()
self.num_gpus = 8
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_squad')
FLAGS.train_batch_size = 32
FLAGS.train_batch_size = 24
self._run_and_report_benchmark()
@@ -25,18 +25,12 @@ from absl import logging
import tensorflow as tf
from tensorflow.python.util import object_identity
from official.utils.misc import distribution_utils
from official.utils.misc import tpu_lib
_SUMMARY_TXT = 'training_summary.txt'
_MIN_SUMMARY_STEPS = 10
def get_primary_cpu_task(use_remote_tpu=False):
"""Returns primary CPU task to which input pipeline Ops are put."""
# Remote Eager Borg job configures the TPU worker with job name 'worker'.
return '/job:worker' if use_remote_tpu else ''
def _save_checkpoint(checkpoint, model_dir, checkpoint_prefix):
"""Saves model to with provided checkpoint prefix."""
@@ -195,7 +189,7 @@ def run_customized_training_loop(
# To reduce unnecessary send/receive of input pipeline ops, we place the
# input pipeline ops on the worker task.
with tf.device(get_primary_cpu_task(use_remote_tpu)):
with tf.device(tpu_lib.get_primary_cpu_task(use_remote_tpu)):
train_iterator = _get_input_iterator(train_input_fn, strategy)
with distribution_utils.get_strategy_scope(strategy):
@@ -276,6 +276,7 @@ class EmbeddingPostprocessor(tf.keras.layers.Layer):
max_position_embeddings=512,
dropout_prob=0.0,
initializer_range=0.02,
initializer=None,
**kwargs):
super(EmbeddingPostprocessor, self).__init__(**kwargs)
self.use_type_embeddings = use_type_embeddings
@@ -285,6 +286,11 @@ class EmbeddingPostprocessor(tf.keras.layers.Layer):
self.dropout_prob = dropout_prob
self.initializer_range = initializer_range
if not initializer:
self.initializer = get_initializer(self.initializer_range)
else:
self.initializer = initializer
if self.use_type_embeddings and not self.token_type_vocab_size:
raise ValueError("If `use_type_embeddings` is True, then "
"`token_type_vocab_size` must be specified.")
@@ -723,6 +729,15 @@ class TransformerBlock(tf.keras.layers.Layer):
name="output_layer_norm", axis=-1, epsilon=1e-12)
super(TransformerBlock, self).build(unused_input_shapes)
def common_layers(self):
"""Explicitly gets all layer objects inside a Transformer encoder block."""
return [
self.attention_layer, self.attention_output_dense,
self.attention_dropout, self.attention_layer_norm,
self.intermediate_dense, self.output_dense, self.output_dropout,
self.output_layer_norm
]
def __call__(self, input_tensor, attention_mask=None):
inputs = pack_inputs([input_tensor, attention_mask])
return super(TransformerBlock, self).__call__(inputs)
@@ -35,8 +35,8 @@ from official.bert import model_saving_utils
from official.bert import model_training_utils
from official.bert import modeling
from official.bert import optimization
from official.bert import tpu_lib
from official.utils.misc import keras_utils
from official.utils.misc import tpu_lib
flags.DEFINE_enum(
'mode', 'train_and_eval', ['train_and_eval', 'export_only'],
@@ -210,7 +210,7 @@ def run_bert(strategy, input_meta_data):
run_eagerly=FLAGS.run_eagerly)
if FLAGS.model_export_path:
with tf.device(model_training_utils.get_primary_cpu_task(use_remote_tpu)):
with tf.device(tpu_lib.get_primary_cpu_task(use_remote_tpu)):
model_saving_utils.export_bert_model(
FLAGS.model_export_path, model=trained_model)
return trained_model
@@ -33,7 +33,7 @@ from official.bert import model_saving_utils
from official.bert import model_training_utils
from official.bert import modeling
from official.bert import optimization
from official.bert import tpu_lib
from official.utils.misc import tpu_lib
flags.DEFINE_string('input_files', None,
'File path to retrieve training data for pre-training.')
@@ -36,8 +36,8 @@ from official.bert import modeling
from official.bert import optimization
from official.bert import squad_lib
from official.bert import tokenization
from official.bert import tpu_lib
from official.utils.misc import keras_utils
from official.utils.misc import tpu_lib
flags.DEFINE_bool('do_train', False, 'Whether to run training.')
flags.DEFINE_bool('do_predict', False, 'Whether to run eval on the dev set.')
# Keras Application Models Benchmark
## Overview
This provides a single scaffold to benchmark the Keras built-in application [models](https://keras.io/applications/). All of the models are for image classification, and they include the following (see the sketch after this list):
- Xception
- VGG16
- VGG19
- ResNet50
- InceptionV3
- InceptionResNetV2
- MobileNet
- DenseNet
- NASNet
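Each name resolves to a `tf.keras.applications` class through the `MODELS` dictionary in [benchmark_main.py](benchmark_main.py); a minimal sketch:

```python
import tensorflow as tf

# Minimal sketch mirroring benchmark_main.py: resolve a model class by name
# and instantiate it with random (untrained) weights.
MODELS = {"resnet50": tf.keras.applications.ResNet50}
model = MODELS["resnet50"](weights=None)
```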
## Dataset
A synthetic dataset is used for the benchmark by default; with `--use_synthetic_data` disabled, CIFAR-10 is used instead.
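For reference, a minimal sketch of building the synthetic data with the `generate_synthetic_input_dataset` helper from [dataset.py](dataset.py):

```python
from official.keras_application_models import dataset

# Synthetic ImageNet-shaped batches for ResNet50: images of shape
# (batch_size, 224, 224, 3) with one-hot labels over 1000 classes.
train_ds = dataset.generate_synthetic_input_dataset("resnet50", batch_size=32)
```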
## Callbacks
Two custom callbacks are provided for model benchmarking: ExamplesPerSecondCallback and LoggingMetricCallback. For each callback, `epoch_based` and `batch_based` options are available to set the benchmark level. Check [model_callbacks.py](model_callbacks.py) for more details.
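For example, a minimal sketch of building both callbacks by name via `get_model_callbacks`:

```python
from official.keras_application_models import model_callbacks

# Names are matched case-insensitively against the CALLBACKS dictionary;
# extra keyword arguments are forwarded to each callback factory.
callbacks = model_callbacks.get_model_callbacks(
    ["ExamplesPerSecondCallback", "LoggingMetricCallback"],
    batch_size=32, every_n_steps=1)
```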
## Running Code
To benchmark a model, use `--model` to specify the model name. To perform the benchmark with eager execution, issue the following command:
```
python benchmark_main.py --model resnet50 --eager
```
Note that if eager execution is enabled, only one GPU is utilized even if multiple GPUs are provided and `multi_gpu_model` is used.
To use distribution strategy in the benchmark, run the following:
```
python benchmark_main.py --model resnet50 --dist_strat
```
Currently, only one of the `--eager` and `--dist_strat` arguments can be set, as DistributionStrategy is not yet supported in eager execution.
Arguments:
* `--model`: The model to benchmark. Valid names are the keys of `MODELS` in [benchmark_main.py](benchmark_main.py).
* `--callbacks`: Specifies a list of callbacks.
Use the `--help` or `-h` flag to get a full list of possible arguments.
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Benchmark on the keras built-in application models."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
# pylint: disable=g-bad-import-order
import numpy as np
from absl import app as absl_app
from absl import flags
import tensorflow as tf
# pylint: enable=g-bad-import-order
from official.keras_application_models import dataset
from official.keras_application_models import model_callbacks
from official.utils.flags import core as flags_core
from official.utils.logs import logger
from official.utils.misc import distribution_utils
# Define a dictionary that maps model names to their model classes inside Keras
MODELS = {
"vgg16": tf.keras.applications.VGG16,
"vgg19": tf.keras.applications.VGG19,
"inceptionv3": tf.keras.applications.InceptionV3,
"xception": tf.keras.applications.Xception,
"resnet50": tf.keras.applications.ResNet50,
"inceptionresnetv2": tf.keras.applications.InceptionResNetV2,
"mobilenet": tf.keras.applications.MobileNet,
"densenet121": tf.keras.applications.DenseNet121,
"densenet169": tf.keras.applications.DenseNet169,
"densenet201": tf.keras.applications.DenseNet201,
"nasnetlarge": tf.keras.applications.NASNetLarge,
"nasnetmobile": tf.keras.applications.NASNetMobile,
}
def run_keras_model_benchmark(_):
"""Run the benchmark on keras model."""
# Ensure a valid model name was supplied via command line argument
if FLAGS.model not in MODELS.keys():
raise AssertionError("The --model command line argument should "
"be a key in the `MODELS` dictionary.")
# Check if eager execution is enabled
if FLAGS.eager:
tf.logging.info("Eager execution is enabled...")
tf.enable_eager_execution()
# Load the model
tf.logging.info("Benchmark on {} model...".format(FLAGS.model))
keras_model = MODELS[FLAGS.model]
# Get dataset
dataset_name = "ImageNet"
if FLAGS.use_synthetic_data:
tf.logging.info("Using synthetic dataset...")
dataset_name += "_Synthetic"
train_dataset = dataset.generate_synthetic_input_dataset(
FLAGS.model, FLAGS.batch_size)
val_dataset = dataset.generate_synthetic_input_dataset(
FLAGS.model, FLAGS.batch_size)
model = keras_model(weights=None)
else:
tf.logging.info("Using CIFAR-10 dataset...")
dataset_name = "CIFAR-10"
ds = dataset.Cifar10Dataset(FLAGS.batch_size)
train_dataset = ds.train_dataset
val_dataset = ds.test_dataset
model = keras_model(
weights=None, input_shape=ds.input_shape, classes=ds.num_classes)
num_gpus = flags_core.get_num_gpus(FLAGS)
distribution = None
# Use distribution strategy
if FLAGS.dist_strat:
distribution = distribution_utils.get_distribution_strategy(
distribution_strategy=FLAGS.distribution_strategy,
num_gpus=num_gpus)
elif num_gpus > 1:
# Run with multi_gpu_model
# If eager execution is enabled, only one GPU is utilized even if multiple
# GPUs are provided.
if FLAGS.eager:
tf.logging.warning(
"{} GPUs are provided, but only one GPU is utilized as "
"eager execution is enabled.".format(num_gpus))
model = tf.keras.utils.multi_gpu_model(model, gpus=num_gpus)
# The Adam optimizer and some other optimizers don't work well with
# distribution strategy (b/113076709).
# Use GradientDescentOptimizer here.
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
model.compile(loss="categorical_crossentropy",
optimizer=optimizer,
metrics=["accuracy"],
distribute=distribution)
# Create benchmark logger for benchmark logging
run_params = {
"batch_size": FLAGS.batch_size,
"synthetic_data": FLAGS.use_synthetic_data,
"train_epochs": FLAGS.train_epochs,
"num_train_images": FLAGS.num_train_images,
"num_eval_images": FLAGS.num_eval_images,
}
benchmark_logger = logger.get_benchmark_logger()
benchmark_logger.log_run_info(
model_name=FLAGS.model,
dataset_name=dataset_name,
run_params=run_params,
test_id=FLAGS.benchmark_test_id)
# Create callbacks that log metric values about the training and evaluation
callbacks = model_callbacks.get_model_callbacks(
FLAGS.callbacks,
batch_size=FLAGS.batch_size,
metric_logger=benchmark_logger)
# Train and evaluate the model
history = model.fit(
train_dataset,
epochs=FLAGS.train_epochs,
callbacks=callbacks,
validation_data=val_dataset,
steps_per_epoch=int(np.ceil(FLAGS.num_train_images / FLAGS.batch_size)),
validation_steps=int(np.ceil(FLAGS.num_eval_images / FLAGS.batch_size))
)
tf.logging.info("Logging the evaluation results...")
for epoch in range(FLAGS.train_epochs):
eval_results = {
"accuracy": history.history["val_acc"][epoch],
"loss": history.history["val_loss"][epoch],
tf.GraphKeys.GLOBAL_STEP: (epoch + 1) * np.ceil(
FLAGS.num_eval_images/FLAGS.batch_size)
}
benchmark_logger.log_evaluation_result(eval_results)
# Clear the session explicitly to avoid session delete error
tf.keras.backend.clear_session()
def define_keras_benchmark_flags():
"""Add flags for keras built-in application models."""
flags_core.define_base(hooks=False)
flags_core.define_performance()
flags_core.define_image()
flags_core.define_benchmark()
flags.adopt_module_key_flags(flags_core)
flags_core.set_defaults(
data_format="channels_last",
use_synthetic_data=True,
batch_size=32,
train_epochs=2)
flags.DEFINE_enum(
name="model", default=None,
enum_values=MODELS.keys(), case_sensitive=False,
help=flags_core.help_wrap(
"Model to be benchmarked."))
flags.DEFINE_integer(
name="num_train_images", default=1000,
help=flags_core.help_wrap(
"The number of synthetic images for training. The default value is "
"1000."))
flags.DEFINE_integer(
name="num_eval_images", default=50,
help=flags_core.help_wrap(
"The number of synthetic images for evaluation. The default value is "
"50."))
flags.DEFINE_boolean(
name="eager", default=False, help=flags_core.help_wrap(
"To enable eager execution. Note that if eager execution is enabled, "
"only one GPU is utilized even if multiple GPUs are provided and "
"multi_gpu_model is used."))
flags.DEFINE_boolean(
name="dist_strat", default=False, help=flags_core.help_wrap(
"To enable distribution strategy for model training and evaluation. "
"Number of GPUs used for distribution strategy can be set by the "
"argument --num_gpus."))
flags.DEFINE_list(
name="callbacks",
default=["ExamplesPerSecondCallback", "LoggingMetricCallback"],
help=flags_core.help_wrap(
"A list of (case insensitive) strings to specify the names of "
"callbacks. For example: `--callbacks ExamplesPerSecondCallback,"
"LoggingMetricCallback`"))
@flags.multi_flags_validator(
["eager", "dist_strat"],
message="Both --eager and --dist_strat were set. Only one can be "
"defined, as DistributionStrategy is not supported in Eager "
"execution currently.")
# pylint: disable=unused-variable
def _check_eager_dist_strat(flag_dict):
return not(flag_dict["eager"] and flag_dict["dist_strat"])
def main(_):
with logger.benchmark_context(FLAGS):
run_keras_model_benchmark(FLAGS)
if __name__ == "__main__":
tf.logging.set_verbosity(tf.logging.INFO)
define_keras_benchmark_flags()
FLAGS = flags.FLAGS
absl_app.run(main)
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Prepare dataset for keras model benchmark."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import tensorflow as tf
from official.utils.misc import model_helpers # pylint: disable=g-bad-import-order
# Default values for dataset.
_NUM_CHANNELS = 3
_NUM_CLASSES = 1000
def _get_default_image_size(model):
"""Provide default image size for each model."""
image_size = (224, 224)
if model in ["inceptionv3", "xception", "inceptionresnetv2"]:
image_size = (299, 299)
elif model in ["nasnetlarge"]:
image_size = (331, 331)
return image_size
def generate_synthetic_input_dataset(model, batch_size):
"""Generate synthetic dataset."""
image_size = _get_default_image_size(model)
image_shape = (batch_size,) + image_size + (_NUM_CHANNELS,)
label_shape = (batch_size, _NUM_CLASSES)
dataset = model_helpers.generate_synthetic_data(
input_shape=tf.TensorShape(image_shape),
label_shape=tf.TensorShape(label_shape),
)
return dataset
class Cifar10Dataset(object):
"""CIFAR10 dataset, including train and test set.
Each sample consists of a 32x32 color image, and its label is one of 10 classes.
"""
def __init__(self, batch_size):
"""Initializes train/test datasets.
Args:
batch_size: int, the batch size.
"""
self.input_shape = (32, 32, 3)
self.num_classes = 10
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar10.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0
y_train, y_test = y_train.astype(np.int64), y_test.astype(np.int64)
y_train = tf.keras.utils.to_categorical(y_train, self.num_classes)
y_test = tf.keras.utils.to_categorical(y_test, self.num_classes)
self.train_dataset = tf.data.Dataset.from_tensor_slices(
(x_train, y_train)).shuffle(2000).batch(batch_size).repeat()
self.test_dataset = tf.data.Dataset.from_tensor_slices(
(x_test, y_test)).shuffle(2000).batch(batch_size).repeat()
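A short usage sketch of this wrapper, assuming eager execution (the batch size is arbitrary):

```python
# Usage sketch: build the CIFAR-10 pipelines and pull one batch eagerly.
ds = Cifar10Dataset(batch_size=32)
images, labels = next(iter(ds.train_dataset))
# images: (32, 32, 32, 3) floats in [0, 1]; labels: (32, 10) one-hot.
```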
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Callbacks for Keras built-in application models.
Note that, in the callbacks, the global_step is initialized in the __init__ of
each callback rather than in on_train_begin, because on_train_begin gets called
inside the fit loop and is reset with each call to fit(). To keep the
global_step persistent across all training sessions, it is initialized in
__init__.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import time
import tensorflow as tf # pylint: disable=g-bad-import-order
from official.utils.logs import logger
# Metrics to log after each batch and epoch
_PER_BATCH_METRICS = {
"loss": "train_loss",
"acc": "train_accuracy",
}
_PER_EPOCH_METRICS = {
"loss": "train_loss",
"acc": "train_accuracy",
"val_loss": "loss",
"val_acc": "accuracy"
}
class ExamplesPerSecondCallback(tf.keras.callbacks.Callback):
"""ExamplesPerSecond callback.
This callback records the average_examples_per_sec and
current_examples_per_sec during training.
"""
def __init__(self, batch_size, every_n_steps=1, metric_logger=None):
self._batch_size = batch_size
self._every_n_steps = every_n_steps
self._logger = metric_logger or logger.BaseBenchmarkLogger()
self._global_step = 0 # Initialize it in __init__
super(ExamplesPerSecondCallback, self).__init__()
def on_train_begin(self, logs=None):
self._train_start_time = time.time()
self._last_recorded_time = time.time()
def on_batch_end(self, batch, logs=None):
"""Log the examples_per_sec metric every_n_steps."""
self._global_step += 1
current_time = time.time()
if self._global_step % self._every_n_steps == 0:
average_examples_per_sec = self._batch_size * (
self._global_step / (current_time - self._train_start_time))
self._logger.log_metric(
"average_examples_per_sec", average_examples_per_sec,
global_step=self._global_step)
current_examples_per_sec = self._batch_size * (
self._every_n_steps / (current_time - self._last_recorded_time))
self._logger.log_metric(
"current_examples_per_sec", current_examples_per_sec,
global_step=self._global_step)
self._last_recorded_time = current_time # Update last_recorded_time
class LoggingMetricCallback(tf.keras.callbacks.Callback):
"""LoggingMetric callback.
Log the predefined _PER_BATCH_METRICS after each batch, and log the predefined
_PER_EPOCH_METRICS after each epoch.
"""
def __init__(self, metric_logger=None):
self._logger = metric_logger or logger.BaseBenchmarkLogger()
self._per_batch_metrics = _PER_BATCH_METRICS
self._per_epoch_metrics = _PER_EPOCH_METRICS
self._global_step = 0 # Initialize it in __init__
super(LoggingMetricCallback, self).__init__()
def on_batch_end(self, batch, logs=None):
"""Log metrics after each batch."""
self._global_step += 1
for metric in _PER_BATCH_METRICS:
self._logger.log_metric(
_PER_BATCH_METRICS[metric],
logs.get(metric),
global_step=self._global_step)
def on_epoch_end(self, epoch, logs=None):
"""Log metrics after each epoch."""
for metric in _PER_EPOCH_METRICS:
self._logger.log_metric(
_PER_EPOCH_METRICS[metric],
logs.get(metric),
global_step=self._global_step)
def get_model_callbacks(name_list, **kwargs):
"""Factory for getting a list of TensorFlow hooks for training by name.
Args:
name_list: a list of strings to name desired callback classes. Allowed:
ExamplesPerSecondCallback, LoggingMetricCallback, which are defined
as keys in CALLBACKS.
**kwargs: a dictionary of arguments to the callbacks.
Returns:
list of instantiated callbacks, ready to be used in a model.fit call.
Raises:
ValueError: if an unrecognized name is passed.
"""
if not name_list:
return []
callbacks = []
for name in name_list:
callback_name = CALLBACKS.get(name.strip().lower())
if callback_name is None:
raise ValueError(
"Unrecognized training callback requested: {}".format(name))
else:
callbacks.append(callback_name(**kwargs))
return callbacks
def get_examples_per_second_callback(
every_n_steps=1, batch_size=32, metric_logger=None, **kwargs): # pylint: disable=unused-argument
"""Function to get ExamplesPerSecondCallback."""
return ExamplesPerSecondCallback(
batch_size=batch_size, every_n_steps=every_n_steps,
metric_logger=metric_logger or logger.get_benchmark_logger())
def get_logging_metric_callback(metric_logger=None, **kwargs): # pylint: disable=unused-argument
"""Function to get LoggingMetricCallback."""
return LoggingMetricCallback(
metric_logger=metric_logger or logger.get_benchmark_logger())
# A dictionary to map the callback name and its corresponding function
CALLBACKS = {
"examplespersecondcallback": get_examples_per_second_callback,
"loggingmetriccallback": get_logging_metric_callback,
}