
# ResNet in TensorFlow
Deep residual networks, or ResNets for short, introduced the breakthrough idea of
identity mappings, which makes it possible to train very deep convolutional neural
networks. This folder contains an implementation of ResNet for the ImageNet
dataset written in TensorFlow.
See the following papers for more background:
[1] [Deep Residual Learning for Image Recognition](https://arxiv.org/pdf/1512.03385.pdf) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun, Dec 2015.
[2] [Identity Mappings in Deep Residual Networks](https://arxiv.org/pdf/1603.05027.pdf) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun, Jul 2016.
In code, v1 refers to the ResNet defined in [1], except that a stride of 2 is used on
the 3x3 convolution rather than on the first 1x1 convolution in the bottleneck. This change
results in higher and more stable accuracy with fewer epochs than the original v1 and has
been shown to scale to larger batch sizes with minimal degradation in accuracy.
There is no originating paper; the earliest mention we are aware of is in the
Torch version of [ResNetv1](https://github.com/facebook/fb.resnet.torch). Most
popular "v1" implementations are actually this variant, which we call ResNet v1.5.
In testing we found that v1.5 requires ~12% more compute to train and has 6% lower
inference throughput than ResNet v1. The CIFAR-10 ResNet does not use the
bottleneck block and is therefore identical for v1 and v1.5.
v2 refers to [2]. The principal difference between the two versions is that v1
applies batch normalization and activation after the convolution, while v2 applies
batch normalization, then activation, and finally the convolution. A schematic
comparison is presented in Figure 1 (left) of [2].
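The ordering difference is easy to see in code. Below is a minimal, illustrative sketch of the two orderings using `tf.keras` layers; it is not the implementation in this folder, only the block structure described above:

```
import tensorflow as tf

def conv_bn_relu_v1(inputs, filters, training):
  """v1 ordering: convolution, then batch normalization, then activation."""
  x = tf.keras.layers.Conv2D(filters, 3, padding="same", use_bias=False)(inputs)
  x = tf.keras.layers.BatchNormalization()(x, training=training)
  return tf.nn.relu(x)

def bn_relu_conv_v2(inputs, filters, training):
  """v2 (pre-activation) ordering: batch normalization, then activation, then convolution."""
  x = tf.keras.layers.BatchNormalization()(inputs, training=training)
  x = tf.nn.relu(x)
  return tf.keras.layers.Conv2D(filters, 3, padding="same", use_bias=False)(x)
```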
Please proceed according to which dataset you would like to train/evaluate on:
## CIFAR-10
### Setup
You need to have the latest version of TensorFlow installed.
First, make sure [the models folder is in your Python path](/official/#running-the-models); otherwise you may encounter `ImportError: No module named official.resnet`.
Then, download and extract the CIFAR-10 data from Alex's website, specifying the location with the `--data_dir` flag. Run the following:
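(The download helper's name, `cifar10_download_and_extract.py`, is assumed here from this folder's layout; adjust it if your copy differs.)

```
python cifar10_download_and_extract.py
```

Then, to train the model, run `cifar10_main.py`:

```
python cifar10_main.py
```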
Use `--data_dir` to specify the location of the CIFAR-10 data used in the previous step. There are more flag options as described in `cifar10_main.py`.
To export a `SavedModel` from the trained checkpoint:
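A sketch of the export invocation, assuming `cifar10_main.py` accepts an `--export_dir` flag alongside the training flags above:

```
python cifar10_main.py --data_dir=/path/to/cifar10_data --model_dir=/path/to/model --export_dir=<EXPORT_DIR>
```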
Note: The `<EXPORT_DIR>` must be present. You might want to run `mkdir <EXPORT_DIR>` beforehand.
The `SavedModel` can then be [loaded](https://www.tensorflow.org/guide/saved_model#loading_a_savedmodel_in_python) in order to use the ResNet for prediction.
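For instance, a minimal TF 1.x loading sketch (the export path below is a placeholder; running predictions additionally requires the input/output tensor names of the exported serving signature):

```
import tensorflow as tf

export_dir = "<EXPORT_DIR>/1538687457"  # placeholder timestamped SavedModel folder

with tf.Session(graph=tf.Graph()) as sess:
  # Load the graph and variables saved under the serving tag.
  tf.saved_model.loader.load(sess, [tf.saved_model.tag_constants.SERVING], export_dir)
  # sess.run(...) can now be used with the signature's input/output tensors.
```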
## ImageNet
### Setup
To begin, you will need to download the ImageNet dataset and convert it to
TFRecord format. The following [script](https://github.com/tensorflow/tpu/blob/master/tools/datasets/imagenet_to_gcs.py)
and [README](https://github.com/tensorflow/tpu/tree/master/tools/datasets#imagenet_to_gcspy)
provide a few options.
Once your dataset is ready, you can begin training the model as follows:
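A minimal invocation, assuming the ImageNet entry point is `imagenet_main.py` in this folder (analogous to `cifar10_main.py`) and the TFRecords live under `--data_dir`:

```
python imagenet_main.py --data_dir=/path/to/imagenet
```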
The model will begin training and will automatically evaluate itself on the
validation data roughly once per epoch.
Note that there are a number of other options you can specify, including
`--model_dir` to choose where to store the model and `--resnet_size` to choose
the model size (options include ResNet-18 through ResNet-200). See
[`resnet_run_loop.py`](resnet_run_loop.py) for the full list of options.
## Compute Devices
Training is accomplished using the [DistributionStrategies API](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/distribute/README.md).
The appropriate distribution strategy is chosen based on the `--num_gpus` flag.
By default this flag is one if TensorFlow is compiled with CUDA, and zero
otherwise.
num_gpus:
+ 0: Use OneDeviceStrategy and train on CPU.
+ 1: Use OneDeviceStrategy and train on GPU.
+ 2+: Use MirroredStrategy (data parallelism) to distribute a batch between devices.
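For example, reusing the (assumed) ImageNet entry point from above, a two-GPU MirroredStrategy run would be requested with:

```
python imagenet_main.py --data_dir=/path/to/imagenet --num_gpus=2
```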
### Pre-trained model
You can download pre-trained versions of ResNet-50. Reported accuracies are top-1 single-crop accuracy for the ImageNet validation set.
Models are provided both as checkpoints produced by Estimator during training and as SavedModels, which are more portable. Checkpoints are fragile
and are not guaranteed to work with future versions of the code. Both ResNet v1
and ResNet v2 have been trained in both fp16 and fp32 precision. (Here v1 refers to "v1.5". See the note above.) Furthermore, SavedModels
are generated to accept either tensor or JPG inputs, and with channels_first (NCHW) and channels_last (NHWC) convolutions. NCHW is generally
better for GPUs, while NHWC is generally better for CPUs. See the TensorFlow [performance guide](https://www.tensorflow.org/performance/performance_guide#data_formats) for more details.
You can use a pre-trained model to initialize a training process, and you can optionally freeze all but the final fully connected layer to fine-tune the model. Transfer learning is useful when training on your own small datasets. For a brief look at transfer learning in the context of convolutional neural networks, we recommend reading these [short notes](http://cs231n.github.io/transfer-learning/).
To fine-tune a pre-trained ResNet you must make three changes to your training procedure:
1) Build the same model as before, except change the number of labels in the final classification layer.
2) Restore all weights from the pre-trained ResNet except for the final classification layer, which is randomly initialized instead.
3) Freeze the earlier layers of the network.
These three operations can be performed by specifying two flags: `--pretrained_model_checkpoint_path` and `--fine_tune`. The first flag is a string that points to the path of a pre-trained ResNet checkpoint. If it is specified, all weights except those of the final classification layer are loaded. A key thing to note: if both `--pretrained_model_checkpoint_path` and a non-empty `model_dir` directory are passed, the TensorFlow Estimator will load only from `model_dir`. For more on this, please see [WarmStartSettings](https://www.tensorflow.org/versions/master/api_docs/python/tf/estimator/WarmStartSettings) and [Estimators](https://www.tensorflow.org/guide/estimators).
The second flag, `--fine_tune`, is a boolean that indicates whether the earlier layers of the network should be frozen. Set it to false to continue training the whole pre-trained model from a checkpoint; set it to true to freeze the earlier layers and train only the new classification layer from scratch.
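Putting the two flags together, a fine-tuning run might look like the following (the entry-point script and all paths are placeholders for your own setup):

```
python imagenet_main.py --data_dir=/path/to/your/dataset --model_dir=/path/to/new/model_dir \
  --pretrained_model_checkpoint_path=/path/to/pretrained/resnet/checkpoint --fine_tune
```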

# Transformer Translation Model
This is an implementation of the Transformer translation model as described in the [Attention is All You Need](https://arxiv.org/abs/1706.03762) paper. Based on the code provided by the authors: [Transformer code](https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py) from [Tensor2Tensor](https://github.com/tensorflow/tensor2tensor). Also, check out the [tutorial](https://www.tensorflow.org/beta/tutorials/text/transformer) on Transformer in TF 2.0.
**Please follow the [README](https://github.com/tensorflow/models/official/transformer/README.md) of the new Keras-based TF 2 implementation to walk through the new Transformer.**
Transformer is a neural network architecture that solves sequence to sequence problems using attention mechanisms. Unlike traditional neural seq2seq models, Transformer does not involve recurrent connections. The attention mechanism learns dependencies between tokens in two sequences. Since attention weights apply to all tokens in the sequences, the Transformer model is able to easily capture long-distance dependencies.
Transformer's overall structure follows the standard encoder-decoder pattern. The encoder uses self-attention to compute a representation of the input sequence. The decoder generates the output sequence one token at a time, taking the encoder output and the previously generated tokens as inputs.
The model also applies embeddings on the input and output tokens, and adds a constant positional encoding. The positional encoding adds information about the position of each token.
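For reference, the sinusoidal positional encoding described in the paper can be computed in a few lines of NumPy. This is an illustrative sketch only (assuming an even `hidden_size`), not the code used by the model in this folder:

```
import numpy as np

def positional_encoding(length, hidden_size):
  """Sinusoidal position encodings (illustrative sketch, even hidden_size)."""
  positions = np.arange(length)[:, np.newaxis]                 # (length, 1)
  dims = np.arange(0, hidden_size, 2)[np.newaxis, :]           # (1, hidden_size/2)
  angles = positions / np.power(10000.0, dims / hidden_size)   # (length, hidden_size/2)
  encoding = np.zeros((length, hidden_size))
  encoding[:, 0::2] = np.sin(angles)   # even dimensions use sine
  encoding[:, 1::2] = np.cos(angles)   # odd dimensions use cosine
  return encoding
```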
Currently, both big and base parameter sets run on a single GPU. The measurements below
are reported from running the model on a P100 GPU.
Param Set | batches/sec | batches per epoch | time per epoch
--- | --- | --- | ---
base | 4.8 | 83244 | 4 hr
big | 1.1 | 41365 | 10 hr
### Evaluation results
Below are the case-insensitive BLEU scores after 10 epochs.
Param Set | Score
--- | ---
base | 27.7
big | 28.9
## Detailed instructions
0. ### Environment preparation
#### Add models repo to PYTHONPATH
Follow the instructions described in the [Requirements](https://github.com/tensorflow/models/tree/master/official#requirements) section to add the models folder to the python path.
#### Export variables (optional)
Export the following variables, or modify the values in each of the snippets below:
```
PARAM_SET=big
DATA_DIR=$HOME/transformer/data
MODEL_DIR=$HOME/transformer/model_$PARAM_SET
VOCAB_FILE=$DATA_DIR/vocab.ende.32768
```
1. ### Download and preprocess datasets
[data_download.py](data_download.py) downloads and preprocesses the training and evaluation WMT datasets. After the data is downloaded and extracted, the training data is used to generate a vocabulary of subtokens. The evaluation and training strings are tokenized, and the resulting data is sharded, shuffled, and saved as TFRecords.
1.75GB of compressed data will be downloaded. In total, the raw files (compressed, extracted, and combined files) take up 8.4GB of disk space. The resulting TFRecord and vocabulary files are 722MB. The script takes around 40 minutes to run, with the bulk of the time spent downloading and ~15 minutes spent on preprocessing.
Command to run:
```
python data_download.py --data_dir=$DATA_DIR
```
Arguments:
* `--data_dir`: Path where the preprocessed TFRecord data and vocab file will be saved.
* Use the `--help` or `-h` flag to get a full list of possible arguments.
2. ### Model training and evaluation
[transformer_main.py](transformer_main.py) creates a Transformer model and trains it using the TensorFlow `Estimator` API.
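Command to run (using the variables exported in the environment-preparation step):

```
python transformer_main.py --data_dir=$DATA_DIR --model_dir=$MODEL_DIR \
    --vocab_file=$VOCAB_FILE --param_set=$PARAM_SET
```

Arguments: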
* `--data_dir`: This should be set to the same directory given to `data_download.py`'s `data_dir` argument.
* `--model_dir`: Directory to save Transformer model training checkpoints.
* `--vocab_file`: Path to the subtoken vocabulary file. If `data_download.py` was used, you can find the file in `data_dir`.
* `--param_set`: Parameter set to use when creating and training the model. Options are `base` and `big` (default).
* Use the `--help` or `-h` flag to get a full list of possible arguments.
#### Customizing training schedule
By default, the model will train for 10 epochs, and evaluate after every epoch. The training schedule may be defined through the flags:
* Training with epochs (default):
  * `--train_epochs`: The total number of complete passes to make through the dataset.
  * `--epochs_between_evals`: The number of epochs to train between evaluations.
* Training with steps:
  * `--train_steps`: Sets the total number of training steps to run.
  * `--steps_between_evals`: Number of training steps to run between evaluations.
Only one of `train_epochs` or `train_steps` may be set. Since the default option is to evaluate the model after training for an epoch, it may take 4 or more hours between model evaluations. To get more frequent evaluations, use the flags `--train_steps=250000 --steps_between_evals=1000`.
Note: At the beginning of each training session, the training dataset is reloaded and shuffled. Stopping training before completing an epoch may result in worse model quality, since some examples may be seen more often than others. Therefore, it is recommended to use epochs when model quality is important.
#### Compute BLEU score during model evaluation
Use these flags to compute the BLEU when the model evaluates:
* `--bleu_source`: Path to file containing text to translate.
* `--bleu_ref`: Path to file containing the reference translation.
* `--stop_threshold`: Train until the BLEU score reaches this lower bound. This setting overrides the `--train_steps` and `--train_epochs` flags.
When running `transformer_main.py`, use the flags: `--bleu_source=$DATA_DIR/newstest2014.en --bleu_ref=$DATA_DIR/newstest2014.de`
#### TensorBoard
Training and evaluation metrics (loss, accuracy, approximate BLEU score, etc.) are logged and can be displayed in the browser using TensorBoard.
```
tensorboard --logdir=$MODEL_DIR
```
The values are displayed at [localhost:6006](http://localhost:6006).
3. ### Translate using the model
[translate.py](translate.py) contains the script that uses the trained model to translate input text or a text file; each line in the file is translated separately. An example invocation is shown after the argument list below.
* `--translation`: Path to file containing generated translations.
* `--reference`: Path to file containing reference translations.
* Use the `--help` or `-h` flag to get a full list of possible arguments.
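As an illustration, translating a single sentence with the trained model might look like the following; the `--text` flag name is an assumption here, so check `python translate.py --help` for the exact arguments:

```
python translate.py --model_dir=$MODEL_DIR --vocab_file=$VOCAB_FILE \
    --param_set=$PARAM_SET --text="hello world"
```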
5. ### TPU
TPU support for this version of Transformer is experimental. Currently it is present for
demonstration purposes only, but will be optimized in the coming weeks.
## Export trained model
To export the model in the TensorFlow [SavedModel](https://www.tensorflow.org/guide/saved_model) format, use the `--export_dir` argument when running `transformer_main.py`. A folder named with the current timestamp will be created inside that directory (e.g. `$EXPORT_DIR/1526427396`).
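For example, with `EXPORT_DIR` set to the desired output directory:

```
python transformer_main.py --data_dir=$DATA_DIR --model_dir=$MODEL_DIR \
    --vocab_file=$VOCAB_FILE --param_set=$PARAM_SET --export_dir=$EXPORT_DIR
```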
*`"Would you like some pie?" = [9092, 72, 155, 202, 19851, 102, 1]`
2. #### Run `saved_model_cli` to obtain the predicted translations
The encoded inputs should be padded so that they are the same length. The padding token is `0`.
```
ENCODED_INPUTS="[[26228, 145, 178, 1, 0, 0, 0], \
[15431, 13966, 36, 178, 3, 1, 0], \
[9092, 72, 155, 202, 19851, 102, 1]]"
```
Now, use the `run` command with `saved_model_cli` to get the outputs.
```
saved_model_cli run --dir=$SAVED_MODEL_DIR --tag_set=serve --signature_def=translate \
--input_expr="input=$ENCODED_INPUTS"
```
The outputs will look similar to:
```
Result for output key outputs:
[[18744 145 297 1 0 0 0 0 0 0 0 0
0 0]
[ 5450 4642 21 11 297 3 1 0 0 0 0 0
0 0]
[25940 22 66 103 21713 31 102 1 0 0 0 0
0 0]]
Result for output key scores:
[-1.5493642 -1.4032784 -3.252089 ]
```
3. #### Decode the outputs to strings.
Use the `Subtokenizer` and vocab file as described in step 1 to decode the output integer arrays.
```
from official.transformer.utils.tokenizer import Subtokenizer
s = Subtokenizer(PATH_TO_VOCAB_FILE)
print(s.decode([18744, 145, 297, 1]))
```
The decoded outputs from above are:
* `[18744, 145, 297, 1] = "Hallo Welt<EOS>"`
* `[5450, 4642, 21, 11, 297, 3, 1] = "Abschied von der Welt.<EOS>"`
* `[25940, 22, 66, 103, 21713, 31, 102, 1] = "Möchten Sie einen Kuchen?<EOS>"`
## Implementation overview
A brief look at each component in the code:
### Model Definition
The [model](model) subdirectory contains the implementation of the Transformer model. The following files define the Transformer model and its layers:
* [transformer.py](model/transformer.py): Defines the Transformer model and its encoder/decoder layer stacks.
* [embedding_layer.py](model/embedding_layer.py): Contains the layer that calculates the embeddings. The embedding weights are also used to calculate the pre-softmax probabilities from the decoder output.
* [attention_layer.py](model/attention_layer.py): Defines the multi-headed attention and self-attention layers that are used in the encoder/decoder stacks.
* [ffn_layer.py](model/ffn_layer.py): Defines the feed-forward network that is used in the encoder/decoder stacks. The network is composed of 2 fully connected layers.
Other files:
* [beam_search.py](model/beam_search.py): contains the beam search implementation, which is used during model inference to find high scoring translations.
* [model_params.py](model/model_params.py): contains the parameters used for the big and base models.
* [model_utils.py](model/model_utils.py): defines some helper functions used in the model (calculating padding, bias, etc.).
### Model Estimator
[transformer_main.py](transformer_main.py) creates an `Estimator` to train and evaluate the model.
Helper functions:
* [utils/dataset.py](utils/dataset.py): contains functions for creating a `dataset` that is passed to the `Estimator`.
* [utils/metrics.py](utils/metrics.py): defines metrics functions used by the `Estimator` to evaluate the model.
### Other scripts
Aside from the main file to train the Transformer model, we provide other scripts for using the model or downloading the data:
#### Data download and preprocessing
[data_download.py](data_download.py) downloads and extracts data, then uses `Subtokenizer` to tokenize strings into arrays of integer IDs. The integer arrays are converted to `tf.Example` protos and saved in the TFRecord format.
The data is downloaded from the Workshop of Machine Translation (WMT) [news translation task](http://www.statmt.org/wmt17/translation-task.html). The following datasets are used:
* Europarl v7
* Common Crawl corpus
* News Commentary v12
See the [download section](http://www.statmt.org/wmt17/translation-task.html#download) to explore the raw datasets. The parameters in this model are tuned to fit the English-German translation data, so the EN-DE texts are extracted from the downloaded compressed files.
The text is transformed into arrays of integer IDs using the `Subtokenizer` defined in [`utils/tokenizer.py`](utils/tokenizer.py). During initialization of the `Subtokenizer`, the raw training data is used to generate a vocabulary list containing common subtokens.
The target vocabulary size of the WMT dataset is 32,768. The set of subtokens is found through binary search on the minimum number of times a subtoken appears in the data. The actual vocabulary size is 33,708, and is stored in a 324kB file.
#### Translation
Translation is defined in [translate.py](translate.py). First, `Subtokenizer` tokenizes the input; the vocabulary file is the same one used to tokenize the training/eval files. Next, beam search is used to find the combination of tokens that maximizes the probability assigned by the model decoder. The tokens are then converted back to strings with `Subtokenizer`.
#### BLEU computation
[compute_bleu.py](compute_bleu.py): Implementation from [https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/bleu_hook.py](https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/bleu_hook.py).
### Test dataset
The [newstest2014 files](https://storage.googleapis.com/tf-perf-public/official_transformer/test_data/newstest2014.tgz)
are extracted from the [NMT Seq2Seq tutorial](https://google.github.io/seq2seq/nmt/#download-data).
The raw text files are converted from the SGM format of the
[WMT 2016](http://www.statmt.org/wmt16/translation-task.html) test sets. The
newstest2014 files are placed in `$DATA_DIR` when `data_download.py` is executed.
## Term definitions
**Steps / Epochs**:
* Step: unit for processing a single batch of data
* Epoch: a complete run through the dataset
Example: Consider a dataset with 100 examples that is divided into 20 batches of 5 examples each. A single training step trains the model on one batch. After 20 training steps, the model will have trained on every batch in the dataset, completing one epoch.
**Subtoken**: Words are referred to as tokens, and parts of words are referred to as 'subtokens'. For example, the word 'inclined' may be split into `['incline', 'd_']`. The '\_' indicates the end of the token. The subtoken vocabulary list is guaranteed to contain the alphabet (including numbers and special characters), so all words can be tokenized.