Remove this r1 folder from the master branch in June, 2020.

PiperOrigin-RevId: 317772122

Remove this r1 folder from the master branch in June, 2020.
PiperOrigin-RevId: 317772122
f9ac9618 · Hongkun Yu · A. Unique TensorFlower · d4f5c193 · d4f5c193 · d4f5c193
Commit f9ac9618 authored Jun 22, 2020 by Hongkun Yu Committed by A. Unique TensorFlower Jun 22, 2020
20 changed files
--- a/official/r1/resnet/cifar10_download_and_extract.py
+++ b/official/r1/resnet/cifar10_download_and_extract.py
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""Downloads and extracts the binary version of the CIFAR-10 dataset."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-import os
-import sys
-import tarfile
-
-from six.moves import urllib
-import tensorflow as tf
-
-DATA_URL = 'https://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz'
-
-parser = argparse.ArgumentParser()
-
-parser.add_argument(
-    '--data_dir', type=str, default='/tmp/cifar10_data',
-    help='Directory to download data and extract the tarball')
-
-
-def main(_):
-  """Download and extract the tarball from Alex's website."""
-  if not os.path.exists(FLAGS.data_dir):
-    os.makedirs(FLAGS.data_dir)
-
-  filename = DATA_URL.split('/')[-1]
-  filepath = os.path.join(FLAGS.data_dir, filename)
-
-  if not os.path.exists(filepath):
-    def _progress(count, block_size, total_size):
-      sys.stdout.write('\r>> Downloading %s %.1f%%' % (
-          filename, 100.0 * count * block_size / total_size))
-      sys.stdout.flush()
-
-    filepath, _ = urllib.request.urlretrieve(DATA_URL, filepath, _progress)
-    print()
-    statinfo = os.stat(filepath)
-    print('Successfully downloaded', filename, statinfo.st_size, 'bytes.')
-
-  tarfile.open(filepath, 'r:gz').extractall(FLAGS.data_dir)
-
-
-if __name__ == '__main__':
-  FLAGS, unparsed = parser.parse_known_args()
-  tf.compat.v1.app.run(argv=[sys.argv[0]] + unparsed)
--- a/official/r1/resnet/cifar10_main.py
+++ b/official/r1/resnet/cifar10_main.py
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Runs a ResNet model on the CIFAR-10 dataset."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-
-from absl import app as absl_app
-from absl import flags
-from absl import logging
-from six.moves import range
-import tensorflow as tf
-
-from official.r1.resnet import resnet_model
-from official.r1.resnet import resnet_run_loop
-from official.r1.utils.logs import logger
-from official.utils.flags import core as flags_core
-
-HEIGHT = 32
-WIDTH = 32
-NUM_CHANNELS = 3
-_DEFAULT_IMAGE_BYTES = HEIGHT * WIDTH * NUM_CHANNELS
-# The record is the image plus a one-byte label
-_RECORD_BYTES = _DEFAULT_IMAGE_BYTES + 1
-NUM_CLASSES = 10
-_NUM_DATA_FILES = 5
-
-# TODO(tobyboyd): Change to best practice 45K(train)/5K(val)/10K(test) splits.
-NUM_IMAGES = {
-    'train': 50000,
-    'validation': 10000,
-}
-
-DATASET_NAME = 'CIFAR-10'
-
-
-###############################################################################
-# Data processing
-###############################################################################
-def get_filenames(is_training, data_dir):
-  """Returns a list of filenames."""
-  assert tf.io.gfile.exists(data_dir), (
-      'Run cifar10_download_and_extract.py first to download and extract the '
-      'CIFAR-10 data.')
-
-  if is_training:
-    return [
-        os.path.join(data_dir, 'data_batch_%d.bin' % i)
-        for i in range(1, _NUM_DATA_FILES + 1)
-    ]
-  else:
-    return [os.path.join(data_dir, 'test_batch.bin')]
-
-
-def parse_record(raw_record, is_training, dtype):
-  """Parse CIFAR-10 image and label from a raw record."""
-  # Convert bytes to a vector of uint8 that is record_bytes long.
-  record_vector = tf.io.decode_raw(raw_record, tf.uint8)
-
-  # The first byte represents the label, which we convert from uint8 to int32
-  # and then to one-hot.
-  label = tf.cast(record_vector[0], tf.int32)
-
-  # The remaining bytes after the label represent the image, which we reshape
-  # from [depth * height * width] to [depth, height, width].
-  depth_major = tf.reshape(record_vector[1:_RECORD_BYTES],
-                           [NUM_CHANNELS, HEIGHT, WIDTH])
-
-  # Convert from [depth, height, width] to [height, width, depth], and cast as
-  # float32.
-  image = tf.cast(tf.transpose(a=depth_major, perm=[1, 2, 0]), tf.float32)
-
-  image = preprocess_image(image, is_training)
-  image = tf.cast(image, dtype)
-
-  return image, label
-
-
-def preprocess_image(image, is_training):
-  """Preprocess a single image of layout [height, width, depth]."""
-  if is_training:
-    # Resize the image to add four extra pixels on each side.
-    image = tf.image.resize_with_crop_or_pad(
-        image, HEIGHT + 8, WIDTH + 8)
-
-    # Randomly crop a [HEIGHT, WIDTH] section of the image.
-    image = tf.image.random_crop(image, [HEIGHT, WIDTH, NUM_CHANNELS])
-
-    # Randomly flip the image horizontally.
-    image = tf.image.random_flip_left_right(image)
-
-  # Subtract off the mean and divide by the variance of the pixels.
-  image = tf.image.per_image_standardization(image)
-  return image
-
-
-def input_fn(is_training,
-             data_dir,
-             batch_size,
-             num_epochs=1,
-             dtype=tf.float32,
-             datasets_num_private_threads=None,
-             parse_record_fn=parse_record,
-             input_context=None,
-             drop_remainder=False):
-  """Input function which provides batches for train or eval.
-
-  Args:
-    is_training: A boolean denoting whether the input is for training.
-    data_dir: The directory containing the input data.
-    batch_size: The number of samples per batch.
-    num_epochs: The number of epochs to repeat the dataset.
-    dtype: Data type to use for images/features
-    datasets_num_private_threads: Number of private threads for tf.data.
-    parse_record_fn: Function to use for parsing the records.
-    input_context: A `tf.distribute.InputContext` object passed in by
-      `tf.distribute.Strategy`.
-    drop_remainder: A boolean indicates whether to drop the remainder of the
-      batches. If True, the batch dimension will be static.
-
-  Returns:
-    A dataset that can be used for iteration.
-  """
-  filenames = get_filenames(is_training, data_dir)
-  dataset = tf.data.FixedLengthRecordDataset(filenames, _RECORD_BYTES)
-
-  if input_context:
-    logging.info(
-        'Sharding the dataset: input_pipeline_id=%d num_input_pipelines=%d',
-        input_context.input_pipeline_id, input_context.num_input_pipelines)
-    dataset = dataset.shard(input_context.num_input_pipelines,
-                            input_context.input_pipeline_id)
-
-  return resnet_run_loop.process_record_dataset(
-      dataset=dataset,
-      is_training=is_training,
-      batch_size=batch_size,
-      shuffle_buffer=NUM_IMAGES['train'],
-      parse_record_fn=parse_record_fn,
-      num_epochs=num_epochs,
-      dtype=dtype,
-      datasets_num_private_threads=datasets_num_private_threads,
-      drop_remainder=drop_remainder
-  )
-
-
-def get_synth_input_fn(dtype):
-  return resnet_run_loop.get_synth_input_fn(
-      HEIGHT, WIDTH, NUM_CHANNELS, NUM_CLASSES, dtype=dtype)
-
-
-###############################################################################
-# Running the model
-###############################################################################
-class Cifar10Model(resnet_model.Model):
-  """Model class with appropriate defaults for CIFAR-10 data."""
-
-  def __init__(self, resnet_size, data_format=None, num_classes=NUM_CLASSES,
-               resnet_version=resnet_model.DEFAULT_VERSION,
-               dtype=resnet_model.DEFAULT_DTYPE):
-    """These are the parameters that work for CIFAR-10 data.
-
-    Args:
-      resnet_size: The number of convolutional layers needed in the model.
-      data_format: Either 'channels_first' or 'channels_last', specifying which
-        data format to use when setting up the model.
-      num_classes: The number of output classes needed from the model. This
-        enables users to extend the same model to their own datasets.
-      resnet_version: Integer representing which version of the ResNet network
-      to use. See README for details. Valid values: [1, 2]
-      dtype: The TensorFlow dtype to use for calculations.
-
-    Raises:
-      ValueError: if invalid resnet_size is chosen
-    """
-    if resnet_size % 6 != 2:
-      raise ValueError('resnet_size must be 6n + 2:', resnet_size)
-
-    num_blocks = (resnet_size - 2) // 6
-
-    super(Cifar10Model, self).__init__(
-        resnet_size=resnet_size,
-        bottleneck=False,
-        num_classes=num_classes,
-        num_filters=16,
-        kernel_size=3,
-        conv_stride=1,
-        first_pool_size=None,
-        first_pool_stride=None,
-        block_sizes=[num_blocks] * 3,
-        block_strides=[1, 2, 2],
-        resnet_version=resnet_version,
-        data_format=data_format,
-        dtype=dtype
-    )
-
-
-def cifar10_model_fn(features, labels, mode, params):
-  """Model function for CIFAR-10."""
-  features = tf.reshape(features, [-1, HEIGHT, WIDTH, NUM_CHANNELS])
-  # Learning rate schedule follows arXiv:1512.03385 for ResNet-56 and under.
-  learning_rate_fn = resnet_run_loop.learning_rate_with_decay(
-      batch_size=params['batch_size'] * params.get('num_workers', 1),
-      batch_denom=128, num_images=NUM_IMAGES['train'],
-      boundary_epochs=[91, 136, 182], decay_rates=[1, 0.1, 0.01, 0.001])
-
-  # Weight decay of 2e-4 diverges from 1e-4 decay used in the ResNet paper
-  # and seems more stable in testing. The difference was nominal for ResNet-56.
-  weight_decay = 2e-4
-
-  # Empirical testing showed that including batch_normalization variables
-  # in the calculation of regularized loss helped validation accuracy
-  # for the CIFAR-10 dataset, perhaps because the regularization prevents
-  # overfitting on the small data set. We therefore include all vars when
-  # regularizing and computing loss during training.
-  def loss_filter_fn(_):
-    return True
-
-  return resnet_run_loop.resnet_model_fn(
-      features=features,
-      labels=labels,
-      mode=mode,
-      model_class=Cifar10Model,
-      resnet_size=params['resnet_size'],
-      weight_decay=weight_decay,
-      learning_rate_fn=learning_rate_fn,
-      momentum=0.9,
-      data_format=params['data_format'],
-      resnet_version=params['resnet_version'],
-      loss_scale=params['loss_scale'],
-      loss_filter_fn=loss_filter_fn,
-      dtype=params['dtype'],
-      fine_tune=params['fine_tune']
-  )
-
-
-def define_cifar_flags():
-  resnet_run_loop.define_resnet_flags()
-  flags.adopt_module_key_flags(resnet_run_loop)
-  flags_core.set_defaults(data_dir='/tmp/cifar10_data/cifar-10-batches-bin',
-                          model_dir='/tmp/cifar10_model',
-                          resnet_size='56',
-                          train_epochs=182,
-                          epochs_between_evals=10,
-                          batch_size=128,
-                          image_bytes_as_serving_input=False)
-
-
-def run_cifar(flags_obj):
-  """Run ResNet CIFAR-10 training and eval loop.
-
-  Args:
-    flags_obj: An object containing parsed flag values.
-
-  Returns:
-    Dictionary of results. Including final accuracy.
-  """
-  if flags_obj.image_bytes_as_serving_input:
-    logging.fatal(
-        '--image_bytes_as_serving_input cannot be set to True for CIFAR. '
-        'This flag is only applicable to ImageNet.')
-    return
-
-  input_function = (flags_obj.use_synthetic_data and
-                    get_synth_input_fn(flags_core.get_tf_dtype(flags_obj)) or
-                    input_fn)
-  result = resnet_run_loop.resnet_main(
-      flags_obj, cifar10_model_fn, input_function, DATASET_NAME,
-      shape=[HEIGHT, WIDTH, NUM_CHANNELS])
-
-  return result
-
-
-def main(_):
-  with logger.benchmark_context(flags.FLAGS):
-    run_cifar(flags.FLAGS)
-
-
-if __name__ == '__main__':
-  logging.set_verbosity(logging.INFO)
-  define_cifar_flags()
-  absl_app.run(main)
--- a/official/r1/resnet/cifar10_test.py
+++ b/official/r1/resnet/cifar10_test.py
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tempfile import mkstemp
-
-from absl import logging
-import numpy as np
-import tensorflow as tf
-
-from official.r1.resnet import cifar10_main
-from official.utils.testing import integration
-
-logging.set_verbosity(logging.ERROR)
-
-_BATCH_SIZE = 128
-_HEIGHT = 32
-_WIDTH = 32
-_NUM_CHANNELS = 3
-
-
-class BaseTest(tf.test.TestCase):
-  """Tests for the Cifar10 version of Resnet.
-  """
-
-  _num_validation_images = None
-
-  @classmethod
-  def setUpClass(cls):  # pylint: disable=invalid-name
-    super(BaseTest, cls).setUpClass()
-    tf.compat.v1.disable_eager_execution()
-    cifar10_main.define_cifar_flags()
-
-  def setUp(self):
-    super(BaseTest, self).setUp()
-    self._num_validation_images = cifar10_main.NUM_IMAGES['validation']
-    cifar10_main.NUM_IMAGES['validation'] = 4
-
-  def tearDown(self):
-    super(BaseTest, self).tearDown()
-    tf.io.gfile.rmtree(self.get_temp_dir())
-    cifar10_main.NUM_IMAGES['validation'] = self._num_validation_images
-
-  def test_dataset_input_fn(self):
-    fake_data = bytearray()
-    fake_data.append(7)
-    for i in range(_NUM_CHANNELS):
-      for _ in range(_HEIGHT * _WIDTH):
-        fake_data.append(i)
-
-    _, filename = mkstemp(dir=self.get_temp_dir())
-    data_file = open(filename, 'wb')
-    data_file.write(fake_data)
-    data_file.close()
-
-    fake_dataset = tf.data.FixedLengthRecordDataset(
-        filename, cifar10_main._RECORD_BYTES)  # pylint: disable=protected-access
-    fake_dataset = fake_dataset.map(
-        lambda val: cifar10_main.parse_record(val, False, tf.float32))
-    image, label = tf.compat.v1.data.make_one_shot_iterator(
-        fake_dataset).get_next()
-
-    self.assertAllEqual(label.shape, ())
-    self.assertAllEqual(image.shape, (_HEIGHT, _WIDTH, _NUM_CHANNELS))
-
-    with self.session() as sess:
-      image, label = sess.run([image, label])
-
-      self.assertEqual(label, 7)
-
-      for row in image:
-        for pixel in row:
-          self.assertAllClose(pixel, np.array([-1.225, 0., 1.225]), rtol=1e-3)
-
-  def cifar10_model_fn_helper(self, mode, resnet_version, dtype):
-    input_fn = cifar10_main.get_synth_input_fn(dtype)
-    dataset = input_fn(True, '', _BATCH_SIZE)
-    iterator = tf.compat.v1.data.make_initializable_iterator(dataset)
-    features, labels = iterator.get_next()
-    spec = cifar10_main.cifar10_model_fn(
-        features, labels, mode, {
-            'dtype': dtype,
-            'resnet_size': 32,
-            'data_format': 'channels_last',
-            'batch_size': _BATCH_SIZE,
-            'resnet_version': resnet_version,
-            'loss_scale': 128 if dtype == tf.float16 else 1,
-            'fine_tune': False,
-        })
-
-    predictions = spec.predictions
-    self.assertAllEqual(predictions['probabilities'].shape,
-                        (_BATCH_SIZE, 10))
-    self.assertEqual(predictions['probabilities'].dtype, tf.float32)
-    self.assertAllEqual(predictions['classes'].shape, (_BATCH_SIZE,))
-    self.assertEqual(predictions['classes'].dtype, tf.int64)
-
-    if mode != tf.estimator.ModeKeys.PREDICT:
-      loss = spec.loss
-      self.assertAllEqual(loss.shape, ())
-      self.assertEqual(loss.dtype, tf.float32)
-
-    if mode == tf.estimator.ModeKeys.EVAL:
-      eval_metric_ops = spec.eval_metric_ops
-      self.assertAllEqual(eval_metric_ops['accuracy'][0].shape, ())
-      self.assertAllEqual(eval_metric_ops['accuracy'][1].shape, ())
-      self.assertEqual(eval_metric_ops['accuracy'][0].dtype, tf.float32)
-      self.assertEqual(eval_metric_ops['accuracy'][1].dtype, tf.float32)
-
-  def test_cifar10_model_fn_train_mode_v1(self):
-    self.cifar10_model_fn_helper(tf.estimator.ModeKeys.TRAIN, resnet_version=1,
-                                 dtype=tf.float32)
-
-  def test_cifar10_model_fn_trainmode__v2(self):
-    self.cifar10_model_fn_helper(tf.estimator.ModeKeys.TRAIN, resnet_version=2,
-                                 dtype=tf.float32)
-
-  def test_cifar10_model_fn_eval_mode_v1(self):
-    self.cifar10_model_fn_helper(tf.estimator.ModeKeys.EVAL, resnet_version=1,
-                                 dtype=tf.float32)
-
-  def test_cifar10_model_fn_eval_mode_v2(self):
-    self.cifar10_model_fn_helper(tf.estimator.ModeKeys.EVAL, resnet_version=2,
-                                 dtype=tf.float32)
-
-  def test_cifar10_model_fn_predict_mode_v1(self):
-    self.cifar10_model_fn_helper(tf.estimator.ModeKeys.PREDICT,
-                                 resnet_version=1, dtype=tf.float32)
-
-  def test_cifar10_model_fn_predict_mode_v2(self):
-    self.cifar10_model_fn_helper(tf.estimator.ModeKeys.PREDICT,
-                                 resnet_version=2, dtype=tf.float32)
-
-  def _test_cifar10model_shape(self, resnet_version):
-    batch_size = 135
-    num_classes = 246
-
-    model = cifar10_main.Cifar10Model(32, data_format='channels_last',
-                                      num_classes=num_classes,
-                                      resnet_version=resnet_version)
-    fake_input = tf.random.uniform([batch_size, _HEIGHT, _WIDTH, _NUM_CHANNELS])
-    output = model(fake_input, training=True)
-
-    self.assertAllEqual(output.shape, (batch_size, num_classes))
-
-  def test_cifar10model_shape_v1(self):
-    self._test_cifar10model_shape(resnet_version=1)
-
-  def test_cifar10model_shape_v2(self):
-    self._test_cifar10model_shape(resnet_version=2)
-
-  def test_cifar10_end_to_end_synthetic_v1(self):
-    integration.run_synthetic(
-        main=cifar10_main.run_cifar, tmp_root=self.get_temp_dir(),
-        extra_flags=['-resnet_version', '1', '-batch_size', '4',
-                     '--max_train_steps', '1']
-    )
-
-  def test_cifar10_end_to_end_synthetic_v2(self):
-    integration.run_synthetic(
-        main=cifar10_main.run_cifar, tmp_root=self.get_temp_dir(),
-        extra_flags=['-resnet_version', '2', '-batch_size', '4',
-                     '--max_train_steps', '1']
-    )
-
-
-if __name__ == '__main__':
-  tf.test.main()
--- a/official/r1/resnet/estimator_benchmark.py
+++ b/official/r1/resnet/estimator_benchmark.py
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Executes Estimator benchmarks and accuracy tests."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import time
-
-from absl import flags
-from absl import logging
-from absl.testing import flagsaver
-import tensorflow as tf
-
-from official.r1.resnet import cifar10_main as cifar_main
-from official.r1.resnet import imagenet_main
-from official.r1.utils.logs import hooks
-from official.utils.flags import core as flags_core
-
-IMAGENET_DATA_DIR_NAME = 'imagenet'
-CIFAR_DATA_DIR_NAME = 'cifar-10-batches-bin'
-FLAGS = flags.FLAGS
-
-
-class EstimatorBenchmark(tf.test.Benchmark):
-  """Base class to hold methods common to test classes in the module.
-
-     Code under test for Estimator models (ResNet50 and 56) report mostly the
-     same data and require the same FLAG setup.
-  """
-
-  local_flags = None
-
-  def __init__(self, output_dir=None, default_flags=None, flag_methods=None):
-    if not output_dir:
-      output_dir = '/tmp'
-    self.output_dir = output_dir
-    self.default_flags = default_flags or {}
-    self.flag_methods = flag_methods or {}
-
-  def _get_model_dir(self, folder_name):
-    """Returns directory to store info, e.g. saved model and event log."""
-    return os.path.join(self.output_dir, folder_name)
-
-  def _setup(self):
-    """Sets up and resets flags before each test."""
-    logging.set_verbosity(logging.INFO)
-    if EstimatorBenchmark.local_flags is None:
-      for flag_method in self.flag_methods:
-        flag_method()
-      # Loads flags to get defaults to then override. List cannot be empty.
-      flags.FLAGS(['foo'])
-      # Overrides flag values with defaults for the class of tests.
-      for k, v in self.default_flags.items():
-        setattr(FLAGS, k, v)
-      saved_flag_values = flagsaver.save_flag_values()
-      EstimatorBenchmark.local_flags = saved_flag_values
-    else:
-      flagsaver.restore_flag_values(EstimatorBenchmark.local_flags)
-
-  def _report_benchmark(self,
-                        stats,
-                        wall_time_sec,
-                        top_1_max=None,
-                        top_1_min=None):
-    """Report benchmark results by writing to local protobuf file.
-
-    Args:
-      stats: dict returned from estimator models with known entries.
-      wall_time_sec: the during of the benchmark execution in seconds
-      top_1_max: highest passing level for top_1 accuracy.
-      top_1_min: lowest passing level for top_1 accuracy.
-    """
-
-    examples_per_sec_hook = None
-    for hook in stats['train_hooks']:
-      if isinstance(hook, hooks.ExamplesPerSecondHook):
-        examples_per_sec_hook = hook
-        break
-
-    eval_results = stats['eval_results']
-    metrics = []
-    if 'accuracy' in eval_results:
-      metrics.append({'name': 'accuracy_top_1',
-                      'value': float(eval_results['accuracy']),
-                      'min_value': top_1_min,
-                      'max_value': top_1_max})
-    if 'accuracy_top_5' in eval_results:
-      metrics.append({'name': 'accuracy_top_5',
-                      'value': float(eval_results['accuracy_top_5'])})
-
-    if examples_per_sec_hook:
-      exp_per_second_list = examples_per_sec_hook.current_examples_per_sec_list
-      # ExamplesPerSecondHook skips the first 10 steps.
-      exp_per_sec = sum(exp_per_second_list) / (len(exp_per_second_list))
-      metrics.append({'name': 'exp_per_second',
-                      'value': exp_per_sec})
-    flags_str = flags_core.get_nondefault_flags_as_str()
-    self.report_benchmark(
-        iters=eval_results.get('global_step', None),
-        wall_time=wall_time_sec,
-        metrics=metrics,
-        extras={'flags': flags_str})
-
-
-class Resnet50EstimatorAccuracy(EstimatorBenchmark):
-  """Benchmark accuracy tests for ResNet50 w/ Estimator."""
-
-  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
-    """Benchmark accuracy tests for ResNet50 w/ Estimator.
-
-    Args:
-      output_dir: directory where to output e.g. log files
-      root_data_dir: directory under which to look for dataset
-      **kwargs: arbitrary named arguments. This is needed to make the
-                constructor forward compatible in case PerfZero provides more
-                named arguments before updating the constructor.
-    """
-    flag_methods = [imagenet_main.define_imagenet_flags]
-
-    self.data_dir = os.path.join(root_data_dir, IMAGENET_DATA_DIR_NAME)
-    super(Resnet50EstimatorAccuracy, self).__init__(
-        output_dir=output_dir, flag_methods=flag_methods)
-
-  def benchmark_graph_8_gpu(self):
-    """Test 8 GPUs graph mode."""
-    self._setup()
-    FLAGS.num_gpus = 8
-    FLAGS.data_dir = self.data_dir
-    FLAGS.batch_size = 128 * 8
-    FLAGS.train_epochs = 90
-    FLAGS.epochs_between_evals = 10
-    FLAGS.model_dir = self._get_model_dir('benchmark_graph_8_gpu')
-    FLAGS.dtype = 'fp32'
-    FLAGS.hooks = ['ExamplesPerSecondHook']
-    self._run_and_report_benchmark()
-
-  def benchmark_graph_fp16_8_gpu(self):
-    """Test FP16 8 GPUs graph mode."""
-    self._setup()
-    FLAGS.num_gpus = 8
-    FLAGS.data_dir = self.data_dir
-    FLAGS.batch_size = 256 * 8
-    FLAGS.train_epochs = 90
-    FLAGS.epochs_between_evals = 10
-    FLAGS.model_dir = self._get_model_dir('benchmark_graph_fp16_8_gpu')
-    FLAGS.dtype = 'fp16'
-    FLAGS.hooks = ['ExamplesPerSecondHook']
-    self._run_and_report_benchmark()
-
-  def benchmark_graph_fp16_graph_rewrite_8_gpu(self):
-    """Test FP16 graph rewrite 8 GPUs graph mode."""
-    self._setup()
-    FLAGS.num_gpus = 8
-    FLAGS.data_dir = self.data_dir
-    FLAGS.batch_size = 256 * 8
-    FLAGS.train_epochs = 90
-    FLAGS.epochs_between_evals = 10
-    FLAGS.model_dir = self._get_model_dir(
-        'benchmark_graph_fp16_graph_rewrite_8_gpu')
-    FLAGS.dtype = 'fp16'
-    FLAGS.fp16_implementation = 'graph_rewrite'
-    FLAGS.hooks = ['ExamplesPerSecondHook']
-    self._run_and_report_benchmark()
-
-  def _run_and_report_benchmark(self):
-    start_time_sec = time.time()
-    stats = imagenet_main.run_imagenet(flags.FLAGS)
-    wall_time_sec = time.time() - start_time_sec
-    self._report_benchmark(stats,
-                           wall_time_sec,
-                           top_1_min=0.762,
-                           top_1_max=0.766)
-
-
-class Resnet50EstimatorBenchmarkBase(EstimatorBenchmark):
-  """Base class for benchmarks for ResNet50 using Estimator."""
-  local_flags = None
-
-  def __init__(self, output_dir=None, default_flags=None):
-    flag_methods = [imagenet_main.define_imagenet_flags]
-
-    super(Resnet50EstimatorBenchmarkBase, self).__init__(
-        output_dir=output_dir,
-        default_flags=default_flags,
-        flag_methods=flag_methods)
-
-  def _run_and_report_benchmark(self):
-    start_time_sec = time.time()
-    stats = imagenet_main.run_imagenet(FLAGS)
-    wall_time_sec = time.time() - start_time_sec
-    print(stats)
-    # Remove values to skip triggering accuracy check.
-    stats['eval_results'].pop('accuracy', None)
-    stats['eval_results'].pop('accuracy_top_5', None)
-
-    self._report_benchmark(stats, wall_time_sec)
-
-
-class Resnet50EstimatorBenchmark(Resnet50EstimatorBenchmarkBase):
-  """Benchmarks for ResNet50 using Estimator with 1 worker."""
-
-  def __init__(self, output_dir=None, default_flags=None):
-    super(Resnet50EstimatorBenchmark, self).__init__(
-        output_dir=output_dir,
-        default_flags=default_flags)
-
-  def benchmark_graph_fp16_1_gpu(self):
-    """Benchmarks graph fp16 1 gpu."""
-    self._setup()
-
-    FLAGS.num_gpus = 1
-    FLAGS.model_dir = self._get_model_dir('benchmark_graph_fp16_1_gpu')
-    FLAGS.batch_size = 128
-    FLAGS.dtype = 'fp16'
-    FLAGS.hooks = ['ExamplesPerSecondHook']
-    self._run_and_report_benchmark()
-
-  def benchmark_graph_fp16_1_gpu_tweaked(self):
-    """Benchmarks graph fp16 1 gpu tweaked."""
-    self._setup()
-
-    FLAGS.num_gpus = 1
-    FLAGS.tf_gpu_thread_mode = 'gpu_private'
-    FLAGS.intra_op_parallelism_threads = 1
-    FLAGS.model_dir = self._get_model_dir('benchmark_graph_fp16_1_gpu_tweaked')
-    FLAGS.batch_size = 256
-    FLAGS.dtype = 'fp16'
-    FLAGS.hooks = ['ExamplesPerSecondHook']
-    self._run_and_report_benchmark()
-
-  def benchmark_graph_fp16_graph_rewrite_1_gpu_tweaked(self):
-    """Benchmarks graph fp16 graph rewrite 1 gpu tweaked."""
-    self._setup()
-
-    FLAGS.num_gpus = 1
-    FLAGS.tf_gpu_thread_mode = 'gpu_private'
-    FLAGS.intra_op_parallelism_threads = 1
-    FLAGS.model_dir = self._get_model_dir(
-        'benchmark_graph_fp16_graph_rewrite_1_gpu_tweaked')
-    FLAGS.batch_size = 256
-    FLAGS.dtype = 'fp16'
-    FLAGS.fp16_implementation = 'graph_rewrite'
-    FLAGS.hooks = ['ExamplesPerSecondHook']
-    self._run_and_report_benchmark()
-
-  def benchmark_graph_1_gpu(self):
-    """Benchmarks graph 1 gpu."""
-    self._setup()
-
-    FLAGS.num_gpus = 1
-    FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu')
-    FLAGS.batch_size = 128
-    FLAGS.dtype = 'fp32'
-    FLAGS.hooks = ['ExamplesPerSecondHook']
-    self._run_and_report_benchmark()
-
-  def benchmark_graph_8_gpu(self):
-    """Benchmarks graph 8 gpus."""
-    self._setup()
-
-    FLAGS.num_gpus = 8
-    FLAGS.model_dir = self._get_model_dir('benchmark_graph_8_gpu')
-    FLAGS.batch_size = 128*8
-    FLAGS.dtype = 'fp32'
-    FLAGS.hooks = ['ExamplesPerSecondHook']
-    self._run_and_report_benchmark()
-
-  def benchmark_graph_fp16_8_gpu(self):
-    """Benchmarks graph fp16 8 gpus."""
-    self._setup()
-
-    FLAGS.num_gpus = 8
-    FLAGS.model_dir = self._get_model_dir('benchmark_graph_fp16_8_gpu')
-    FLAGS.batch_size = 256*8
-    FLAGS.dtype = 'fp16'
-    FLAGS.hooks = ['ExamplesPerSecondHook']
-    self._run_and_report_benchmark()
-
-  def benchmark_graph_fp16_8_gpu_tweaked(self):
-    """Benchmarks graph fp16 8 gpus tweaked."""
-    self._setup()
-
-    FLAGS.num_gpus = 8
-    FLAGS.tf_gpu_thread_mode = 'gpu_private'
-    FLAGS.intra_op_parallelism_threads = 1
-    FLAGS.model_dir = self._get_model_dir('benchmark_graph_fp16_8_gpu_tweaked')
-    FLAGS.batch_size = 256*8
-    FLAGS.dtype = 'fp16'
-    FLAGS.hooks = ['ExamplesPerSecondHook']
-    self._run_and_report_benchmark()
-
-  def benchmark_graph_fp16_graph_rewrite_8_gpu_tweaked(self):
-    """Benchmarks graph fp16 graph rewrite 8 gpus tweaked."""
-    self._setup()
-
-    FLAGS.num_gpus = 8
-    FLAGS.tf_gpu_thread_mode = 'gpu_private'
-    FLAGS.intra_op_parallelism_threads = 1
-    FLAGS.model_dir = self._get_model_dir(
-        'benchmark_graph_fp16_graph_rewrite_8_gpu_tweaked')
-    FLAGS.batch_size = 256*8
-    FLAGS.dtype = 'fp16'
-    FLAGS.fp16_implementation = 'graph_rewrite'
-    FLAGS.hooks = ['ExamplesPerSecondHook']
-    self._run_and_report_benchmark()
-
-
-class Resnet50EstimatorBenchmarkSynth(Resnet50EstimatorBenchmark):
-  """Resnet50 synthetic benchmark tests."""
-
-  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
-    def_flags = {}
-    def_flags['use_synthetic_data'] = True
-    def_flags['max_train_steps'] = 110
-    def_flags['train_epochs'] = 1
-
-    super(Resnet50EstimatorBenchmarkSynth, self).__init__(
-        output_dir=output_dir, default_flags=def_flags)
-
-
-class Resnet50EstimatorBenchmarkReal(Resnet50EstimatorBenchmark):
-  """Resnet50 real data benchmark tests."""
-
-  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
-    def_flags = {}
-    def_flags['data_dir'] = os.path.join(root_data_dir, IMAGENET_DATA_DIR_NAME)
-    def_flags['max_train_steps'] = 110
-    def_flags['train_epochs'] = 1
-
-    super(Resnet50EstimatorBenchmarkReal, self).__init__(
-        output_dir=output_dir, default_flags=def_flags)
-
-
-class Resnet50MultiWorkerEstimatorBenchmark(Resnet50EstimatorBenchmarkBase):
-  """Benchmarks for ResNet50 using Estimator with multiple workers."""
-
-  def __init__(self, output_dir=None, default_flags=None):
-    super(Resnet50MultiWorkerEstimatorBenchmark, self).__init__(
-        output_dir=output_dir,
-        default_flags=default_flags)
-
-  def benchmark_graph_fp16_8_gpu_ring_tweaked(self):
-    """Benchmarks graph fp16 8 gpus with ring collective tweaked."""
-    self._setup()
-
-    FLAGS.num_gpus = 8
-    FLAGS.distribution_strategy = 'multi_worker_mirrored'
-    FLAGS.all_reduce_alg = 'ring'
-    FLAGS.tf_gpu_thread_mode = 'gpu_private'
-    FLAGS.intra_op_parallelism_threads = 1
-    FLAGS.datasets_num_private_threads = 32
-    FLAGS.model_dir = self._get_model_dir(
-        folder_name='benchmark_graph_fp16_8_gpu_ring_tweaked')
-    FLAGS.batch_size = 256*8
-    FLAGS.dtype = 'fp16'
-    FLAGS.hooks = ['ExamplesPerSecondHook']
-    self._run_and_report_benchmark()
-
-  def benchmark_graph_fp16_8_gpu_nccl_tweaked(self):
-    """Benchmarks graph fp16 8 gpus with nccl collective tweaked."""
-    self._setup()
-
-    FLAGS.num_gpus = 8
-    FLAGS.distribution_strategy = 'multi_worker_mirrored'
-    FLAGS.all_reduce_alg = 'nccl'
-    FLAGS.tf_gpu_thread_mode = 'gpu_private'
-    FLAGS.intra_op_parallelism_threads = 1
-    FLAGS.datasets_num_private_threads = 32
-    FLAGS.model_dir = self._get_model_dir(
-        folder_name='benchmark_graph_fp16_8_gpu_nccl_tweaked')
-    FLAGS.batch_size = 256*8
-    FLAGS.dtype = 'fp16'
-    FLAGS.hooks = ['ExamplesPerSecondHook']
-    self._run_and_report_benchmark()
-
-
-class Resnet50MultiWorkerEstimatorBenchmarkSynth(
-    Resnet50MultiWorkerEstimatorBenchmark):
-  """ResNet50, multi-worker, Estimator, synthetic data."""
-
-  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
-    def_flags = {}
-    def_flags['use_synthetic_data'] = True
-    def_flags['max_train_steps'] = 110
-    def_flags['train_epochs'] = 1
-
-    super(Resnet50MultiWorkerEstimatorBenchmarkSynth, self).__init__(
-        output_dir=output_dir, default_flags=def_flags)
-
-
-class Resnet56EstimatorAccuracy(EstimatorBenchmark):
-  """Accuracy tests for Estimator ResNet56."""
-
-  local_flags = None
-
-  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
-    """A benchmark class.
-
-    Args:
-      output_dir: directory where to output e.g. log files
-      root_data_dir: directory under which to look for dataset
-      **kwargs: arbitrary named arguments. This is needed to make the
-                constructor forward compatible in case PerfZero provides more
-                named arguments before updating the constructor.
-    """
-    flag_methods = [cifar_main.define_cifar_flags]
-
-    self.data_dir = os.path.join(root_data_dir, CIFAR_DATA_DIR_NAME)
-    super(Resnet56EstimatorAccuracy, self).__init__(
-        output_dir=output_dir, flag_methods=flag_methods)
-
-  def benchmark_graph_1_gpu(self):
-    """Test layers model with Estimator and distribution strategies."""
-    self._setup()
-    flags.FLAGS.num_gpus = 1
-    flags.FLAGS.data_dir = self.data_dir
-    flags.FLAGS.batch_size = 128
-    flags.FLAGS.train_epochs = 182
-    flags.FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu')
-    flags.FLAGS.resnet_size = 56
-    flags.FLAGS.dtype = 'fp32'
-    flags.FLAGS.hooks = ['ExamplesPerSecondHook']
-    self._run_and_report_benchmark()
-
-  def benchmark_graph_fp16_1_gpu(self):
-    """Test layers FP16 model with Estimator and distribution strategies."""
-    self._setup()
-    flags.FLAGS.num_gpus = 1
-    flags.FLAGS.data_dir = self.data_dir
-    flags.FLAGS.batch_size = 128
-    flags.FLAGS.train_epochs = 182
-    flags.FLAGS.model_dir = self._get_model_dir('benchmark_graph_fp16_1_gpu')
-    flags.FLAGS.resnet_size = 56
-    flags.FLAGS.dtype = 'fp16'
-    flags.FLAGS.hooks = ['ExamplesPerSecondHook']
-    self._run_and_report_benchmark()
-
-  def benchmark_graph_2_gpu(self):
-    """Test layers model with Estimator and dist_strat. 2 GPUs."""
-    self._setup()
-    flags.FLAGS.num_gpus = 2
-    flags.FLAGS.data_dir = self.data_dir
-    flags.FLAGS.batch_size = 128
-    flags.FLAGS.train_epochs = 182
-    flags.FLAGS.model_dir = self._get_model_dir('benchmark_graph_2_gpu')
-    flags.FLAGS.resnet_size = 56
-    flags.FLAGS.dtype = 'fp32'
-    flags.FLAGS.hooks = ['ExamplesPerSecondHook']
-    self._run_and_report_benchmark()
-
-  def benchmark_graph_fp16_2_gpu(self):
-    """Test layers FP16 model with Estimator and dist_strat. 2 GPUs."""
-    self._setup()
-    flags.FLAGS.num_gpus = 2
-    flags.FLAGS.data_dir = self.data_dir
-    flags.FLAGS.batch_size = 128
-    flags.FLAGS.train_epochs = 182
-    flags.FLAGS.model_dir = self._get_model_dir('benchmark_graph_fp16_2_gpu')
-    flags.FLAGS.resnet_size = 56
-    flags.FLAGS.dtype = 'fp16'
-    flags.FLAGS.hooks = ['ExamplesPerSecondHook']
-    self._run_and_report_benchmark()
-
-  def unit_test(self):
-    """A lightweight test that can finish quickly."""
-    self._setup()
-    flags.FLAGS.num_gpus = 1
-    flags.FLAGS.data_dir = self.data_dir
-    flags.FLAGS.batch_size = 128
-    flags.FLAGS.train_epochs = 1
-    flags.FLAGS.model_dir = self._get_model_dir('unit_test')
-    flags.FLAGS.resnet_size = 8
-    flags.FLAGS.dtype = 'fp32'
-    flags.FLAGS.hooks = ['ExamplesPerSecondHook']
-    self._run_and_report_benchmark()
-
-  def _run_and_report_benchmark(self):
-    """Executes benchmark and reports result."""
-    start_time_sec = time.time()
-    stats = cifar_main.run_cifar(flags.FLAGS)
-    wall_time_sec = time.time() - start_time_sec
-
-    self._report_benchmark(stats,
-                           wall_time_sec,
-                           top_1_min=0.926,
-                           top_1_max=0.938)
--- a/official/r1/resnet/imagenet_main.py
+++ b/official/r1/resnet/imagenet_main.py
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Runs a ResNet model on the ImageNet dataset."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-
-from absl import app as absl_app
-from absl import flags
-from absl import logging
-from six.moves import range
-import tensorflow as tf
-
-from official.r1.resnet import imagenet_preprocessing
-from official.r1.resnet import resnet_model
-from official.r1.resnet import resnet_run_loop
-from official.r1.utils.logs import logger
-from official.utils.flags import core as flags_core
-
-DEFAULT_IMAGE_SIZE = 224
-NUM_CHANNELS = 3
-NUM_CLASSES = 1001
-
-NUM_IMAGES = {
-    'train': 1281167,
-    'validation': 50000,
-}
-
-_NUM_TRAIN_FILES = 1024
-_SHUFFLE_BUFFER = 10000
-
-DATASET_NAME = 'ImageNet'
-
-###############################################################################
-# Data processing
-###############################################################################
-def get_filenames(is_training, data_dir):
-  """Return filenames for dataset."""
-  if is_training:
-    return [
-        os.path.join(data_dir, 'train-%05d-of-01024' % i)
-        for i in range(_NUM_TRAIN_FILES)]
-  else:
-    return [
-        os.path.join(data_dir, 'validation-%05d-of-00128' % i)
-        for i in range(128)]
-
-
-def _parse_example_proto(example_serialized):
-  """Parses an Example proto containing a training example of an image.
-
-  The output of the build_image_data.py image preprocessing script is a dataset
-  containing serialized Example protocol buffers. Each Example proto contains
-  the following fields (values are included as examples):
-
-    image/height: 462
-    image/width: 581
-    image/colorspace: 'RGB'
-    image/channels: 3
-    image/class/label: 615
-    image/class/synset: 'n03623198'
-    image/class/text: 'knee pad'
-    image/object/bbox/xmin: 0.1
-    image/object/bbox/xmax: 0.9
-    image/object/bbox/ymin: 0.2
-    image/object/bbox/ymax: 0.6
-    image/object/bbox/label: 615
-    image/format: 'JPEG'
-    image/filename: 'ILSVRC2012_val_00041207.JPEG'
-    image/encoded: <JPEG encoded string>
-
-  Args:
-    example_serialized: scalar Tensor tf.string containing a serialized
-      Example protocol buffer.
-
-  Returns:
-    image_buffer: Tensor tf.string containing the contents of a JPEG file.
-    label: Tensor tf.int32 containing the label.
-    bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
-      where each coordinate is [0, 1) and the coordinates are arranged as
-      [ymin, xmin, ymax, xmax].
-  """
-  # Dense features in Example proto.
-  feature_map = {
-      'image/encoded': tf.io.FixedLenFeature([], dtype=tf.string,
-                                             default_value=''),
-      'image/class/label': tf.io.FixedLenFeature([], dtype=tf.int64,
-                                                 default_value=-1),
-      'image/class/text': tf.io.FixedLenFeature([], dtype=tf.string,
-                                                default_value=''),
-  }
-  sparse_float32 = tf.io.VarLenFeature(dtype=tf.float32)
-  # Sparse features in Example proto.
-  feature_map.update(
-      {k: sparse_float32 for k in ['image/object/bbox/xmin',
-                                   'image/object/bbox/ymin',
-                                   'image/object/bbox/xmax',
-                                   'image/object/bbox/ymax']})
-
-  features = tf.io.parse_single_example(serialized=example_serialized,
-                                        features=feature_map)
-  label = tf.cast(features['image/class/label'], dtype=tf.int32)
-
-  xmin = tf.expand_dims(features['image/object/bbox/xmin'].values, 0)
-  ymin = tf.expand_dims(features['image/object/bbox/ymin'].values, 0)
-  xmax = tf.expand_dims(features['image/object/bbox/xmax'].values, 0)
-  ymax = tf.expand_dims(features['image/object/bbox/ymax'].values, 0)
-
-  # Note that we impose an ordering of (y, x) just to make life difficult.
-  bbox = tf.concat([ymin, xmin, ymax, xmax], 0)
-
-  # Force the variable number of bounding boxes into the shape
-  # [1, num_boxes, coords].
-  bbox = tf.expand_dims(bbox, 0)
-  bbox = tf.transpose(a=bbox, perm=[0, 2, 1])
-
-  return features['image/encoded'], label, bbox
-
-
-def parse_record(raw_record, is_training, dtype):
-  """Parses a record containing a training example of an image.
-
-  The input record is parsed into a label and image, and the image is passed
-  through preprocessing steps (cropping, flipping, and so on).
-
-  Args:
-    raw_record: scalar Tensor tf.string containing a serialized
-      Example protocol buffer.
-    is_training: A boolean denoting whether the input is for training.
-    dtype: data type to use for images/features.
-
-  Returns:
-    Tuple with processed image tensor and one-hot-encoded label tensor.
-  """
-  image_buffer, label, bbox = _parse_example_proto(raw_record)
-
-  image = imagenet_preprocessing.preprocess_image(
-      image_buffer=image_buffer,
-      bbox=bbox,
-      output_height=DEFAULT_IMAGE_SIZE,
-      output_width=DEFAULT_IMAGE_SIZE,
-      num_channels=NUM_CHANNELS,
-      is_training=is_training)
-  image = tf.cast(image, dtype)
-
-  return image, label
-
-
-def input_fn(is_training,
-             data_dir,
-             batch_size,
-             num_epochs=1,
-             dtype=tf.float32,
-             datasets_num_private_threads=None,
-             parse_record_fn=parse_record,
-             input_context=None,
-             drop_remainder=False,
-             tf_data_experimental_slack=False):
-  """Input function which provides batches for train or eval.
-
-  Args:
-    is_training: A boolean denoting whether the input is for training.
-    data_dir: The directory containing the input data.
-    batch_size: The number of samples per batch.
-    num_epochs: The number of epochs to repeat the dataset.
-    dtype: Data type to use for images/features
-    datasets_num_private_threads: Number of private threads for tf.data.
-    parse_record_fn: Function to use for parsing the records.
-    input_context: A `tf.distribute.InputContext` object passed in by
-      `tf.distribute.Strategy`.
-    drop_remainder: A boolean indicates whether to drop the remainder of the
-      batches. If True, the batch dimension will be static.
-    tf_data_experimental_slack: Whether to enable tf.data's
-      `experimental_slack` option.
-
-  Returns:
-    A dataset that can be used for iteration.
-  """
-  filenames = get_filenames(is_training, data_dir)
-  dataset = tf.data.Dataset.from_tensor_slices(filenames)
-
-  if input_context:
-    logging.info(
-        'Sharding the dataset: input_pipeline_id=%d num_input_pipelines=%d',
-        input_context.input_pipeline_id, input_context.num_input_pipelines)
-    dataset = dataset.shard(input_context.num_input_pipelines,
-                            input_context.input_pipeline_id)
-
-  if is_training:
-    # Shuffle the input files
-    dataset = dataset.shuffle(buffer_size=_NUM_TRAIN_FILES)
-
-  # Convert to individual records.
-  # cycle_length = 10 means that up to 10 files will be read and deserialized in
-  # parallel. You may want to increase this number if you have a large number of
-  # CPU cores.
-  dataset = dataset.interleave(
-      tf.data.TFRecordDataset,
-      cycle_length=10,
-      num_parallel_calls=tf.data.experimental.AUTOTUNE)
-
-  return resnet_run_loop.process_record_dataset(
-      dataset=dataset,
-      is_training=is_training,
-      batch_size=batch_size,
-      shuffle_buffer=_SHUFFLE_BUFFER,
-      parse_record_fn=parse_record_fn,
-      num_epochs=num_epochs,
-      dtype=dtype,
-      datasets_num_private_threads=datasets_num_private_threads,
-      drop_remainder=drop_remainder,
-      tf_data_experimental_slack=tf_data_experimental_slack,
-  )
-
-
-def get_synth_input_fn(dtype):
-  return resnet_run_loop.get_synth_input_fn(
-      DEFAULT_IMAGE_SIZE, DEFAULT_IMAGE_SIZE, NUM_CHANNELS, NUM_CLASSES,
-      dtype=dtype)
-
-
-###############################################################################
-# Running the model
-###############################################################################
-class ImagenetModel(resnet_model.Model):
-  """Model class with appropriate defaults for Imagenet data."""
-
-  def __init__(self, resnet_size, data_format=None, num_classes=NUM_CLASSES,
-               resnet_version=resnet_model.DEFAULT_VERSION,
-               dtype=resnet_model.DEFAULT_DTYPE):
-    """These are the parameters that work for Imagenet data.
-
-    Args:
-      resnet_size: The number of convolutional layers needed in the model.
-      data_format: Either 'channels_first' or 'channels_last', specifying which
-        data format to use when setting up the model.
-      num_classes: The number of output classes needed from the model. This
-        enables users to extend the same model to their own datasets.
-      resnet_version: Integer representing which version of the ResNet network
-        to use. See README for details. Valid values: [1, 2]
-      dtype: The TensorFlow dtype to use for calculations.
-    """
-
-    # For bigger models, we want to use "bottleneck" layers
-    if resnet_size < 50:
-      bottleneck = False
-    else:
-      bottleneck = True
-
-    super(ImagenetModel, self).__init__(
-        resnet_size=resnet_size,
-        bottleneck=bottleneck,
-        num_classes=num_classes,
-        num_filters=64,
-        kernel_size=7,
-        conv_stride=2,
-        first_pool_size=3,
-        first_pool_stride=2,
-        block_sizes=_get_block_sizes(resnet_size),
-        block_strides=[1, 2, 2, 2],
-        resnet_version=resnet_version,
-        data_format=data_format,
-        dtype=dtype
-    )
-
-
-def _get_block_sizes(resnet_size):
-  """Retrieve the size of each block_layer in the ResNet model.
-
-  The number of block layers used for the Resnet model varies according
-  to the size of the model. This helper grabs the layer set we want, throwing
-  an error if a non-standard size has been selected.
-
-  Args:
-    resnet_size: The number of convolutional layers needed in the model.
-
-  Returns:
-    A list of block sizes to use in building the model.
-
-  Raises:
-    KeyError: if invalid resnet_size is received.
-  """
-  choices = {
-      18: [2, 2, 2, 2],
-      34: [3, 4, 6, 3],
-      50: [3, 4, 6, 3],
-      101: [3, 4, 23, 3],
-      152: [3, 8, 36, 3],
-      200: [3, 24, 36, 3]
-  }
-
-  try:
-    return choices[resnet_size]
-  except KeyError:
-    err = ('Could not find layers for selected Resnet size.\n'
-           'Size received: {}; sizes allowed: {}.'.format(
-               resnet_size, list(choices.keys())))
-    raise ValueError(err)
-
-
-def imagenet_model_fn(features, labels, mode, params):
-  """Our model_fn for ResNet to be used with our Estimator."""
-
-  # Warmup and higher lr may not be valid for fine tuning with small batches
-  # and smaller numbers of training images.
-  if params['fine_tune']:
-    warmup = False
-    base_lr = .1
-  else:
-    warmup = True
-    base_lr = .128
-
-  learning_rate_fn = resnet_run_loop.learning_rate_with_decay(
-      batch_size=params['batch_size'] * params.get('num_workers', 1),
-      batch_denom=256, num_images=NUM_IMAGES['train'],
-      boundary_epochs=[30, 60, 80, 90], decay_rates=[1, 0.1, 0.01, 0.001, 1e-4],
-      warmup=warmup, base_lr=base_lr)
-
-  return resnet_run_loop.resnet_model_fn(
-      features=features,
-      labels=labels,
-      mode=mode,
-      model_class=ImagenetModel,
-      resnet_size=params['resnet_size'],
-      weight_decay=flags.FLAGS.weight_decay,
-      learning_rate_fn=learning_rate_fn,
-      momentum=0.9,
-      data_format=params['data_format'],
-      resnet_version=params['resnet_version'],
-      loss_scale=params['loss_scale'],
-      loss_filter_fn=None,
-      dtype=params['dtype'],
-      fine_tune=params['fine_tune'],
-      label_smoothing=flags.FLAGS.label_smoothing
-  )
-
-
-def define_imagenet_flags():
-  resnet_run_loop.define_resnet_flags(
-      resnet_size_choices=['18', '34', '50', '101', '152', '200'],
-      dynamic_loss_scale=True,
-      fp16_implementation=True)
-  flags.adopt_module_key_flags(resnet_run_loop)
-  flags_core.set_defaults(train_epochs=90)
-
-
-def run_imagenet(flags_obj):
-  """Run ResNet ImageNet training and eval loop.
-
-  Args:
-    flags_obj: An object containing parsed flag values.
-
-  Returns:
-    Dict of results of the run.  Contains the keys `eval_results` and
-      `train_hooks`. `eval_results` contains accuracy (top_1) and
-      accuracy_top_5. `train_hooks` is a list the instances of hooks used during
-      training.
-  """
-  input_function = (flags_obj.use_synthetic_data and
-                    get_synth_input_fn(flags_core.get_tf_dtype(flags_obj)) or
-                    input_fn)
-
-  result = resnet_run_loop.resnet_main(
-      flags_obj, imagenet_model_fn, input_function, DATASET_NAME,
-      shape=[DEFAULT_IMAGE_SIZE, DEFAULT_IMAGE_SIZE, NUM_CHANNELS])
-
-  return result
-
-
-def main(_):
-  with logger.benchmark_context(flags.FLAGS):
-    run_imagenet(flags.FLAGS)
-
-
-if __name__ == '__main__':
-  logging.set_verbosity(logging.INFO)
-  define_imagenet_flags()
-  absl_app.run(main)
--- a/official/r1/resnet/imagenet_preprocessing.py
+++ b/official/r1/resnet/imagenet_preprocessing.py
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Provides utilities to preprocess images.
-
-Training images are sampled using the provided bounding boxes, and subsequently
-cropped to the sampled bounding box. Images are additionally flipped randomly,
-then resized to the target output size (without aspect-ratio preservation).
-
-Images used during evaluation are resized (with aspect-ratio preservation) and
-centrally cropped.
-
-All images undergo mean color subtraction.
-
-Note that these steps are colloquially referred to as "ResNet preprocessing,"
-and they differ from "VGG preprocessing," which does not use bounding boxes
-and instead does an aspect-preserving resize followed by random crop during
-training. (These both differ from "Inception preprocessing," which introduces
-color distortion steps.)
-
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-
-_R_MEAN = 123.68
-_G_MEAN = 116.78
-_B_MEAN = 103.94
-_CHANNEL_MEANS = [_R_MEAN, _G_MEAN, _B_MEAN]
-
-# The lower bound for the smallest side of the image for aspect-preserving
-# resizing. For example, if an image is 500 x 1000, it will be resized to
-# _RESIZE_MIN x (_RESIZE_MIN * 2).
-_RESIZE_MIN = 256
-
-
-def _decode_crop_and_flip(image_buffer, bbox, num_channels):
-  """Crops the given image to a random part of the image, and randomly flips.
-
-  We use the fused decode_and_crop op, which performs better than the two ops
-  used separately in series, but note that this requires that the image be
-  passed in as an un-decoded string Tensor.
-
-  Args:
-    image_buffer: scalar string Tensor representing the raw JPEG image buffer.
-    bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
-      where each coordinate is [0, 1) and the coordinates are arranged as
-      [ymin, xmin, ymax, xmax].
-    num_channels: Integer depth of the image buffer for decoding.
-
-  Returns:
-    3-D tensor with cropped image.
-
-  """
-  # A large fraction of image datasets contain a human-annotated bounding box
-  # delineating the region of the image containing the object of interest.  We
-  # choose to create a new bounding box for the object which is a randomly
-  # distorted version of the human-annotated bounding box that obeys an
-  # allowed range of aspect ratios, sizes and overlap with the human-annotated
-  # bounding box. If no box is supplied, then we assume the bounding box is
-  # the entire image.
-  sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box(
-      tf.image.extract_jpeg_shape(image_buffer),
-      bounding_boxes=bbox,
-      min_object_covered=0.1,
-      aspect_ratio_range=[0.75, 1.33],
-      area_range=[0.05, 1.0],
-      max_attempts=100,
-      use_image_if_no_bounding_boxes=True)
-  bbox_begin, bbox_size, _ = sample_distorted_bounding_box
-
-  # Reassemble the bounding box in the format the crop op requires.
-  offset_y, offset_x, _ = tf.unstack(bbox_begin)
-  target_height, target_width, _ = tf.unstack(bbox_size)
-  crop_window = tf.stack([offset_y, offset_x, target_height, target_width])
-
-  # Use the fused decode and crop op here, which is faster than each in series.
-  cropped = tf.image.decode_and_crop_jpeg(
-      image_buffer, crop_window, channels=num_channels)
-
-  # Flip to add a little more random distortion in.
-  cropped = tf.image.random_flip_left_right(cropped)
-  return cropped
-
-
-def _central_crop(image, crop_height, crop_width):
-  """Performs central crops of the given image list.
-
-  Args:
-    image: a 3-D image tensor
-    crop_height: the height of the image following the crop.
-    crop_width: the width of the image following the crop.
-
-  Returns:
-    3-D tensor with cropped image.
-  """
-  shape = tf.shape(input=image)
-  height, width = shape[0], shape[1]
-
-  amount_to_be_cropped_h = (height - crop_height)
-  crop_top = amount_to_be_cropped_h // 2
-  amount_to_be_cropped_w = (width - crop_width)
-  crop_left = amount_to_be_cropped_w // 2
-  return tf.slice(
-      image, [crop_top, crop_left, 0], [crop_height, crop_width, -1])
-
-
-def _mean_image_subtraction(image, means, num_channels):
-  """Subtracts the given means from each image channel.
-
-  For example:
-    means = [123.68, 116.779, 103.939]
-    image = _mean_image_subtraction(image, means)
-
-  Note that the rank of `image` must be known.
-
-  Args:
-    image: a tensor of size [height, width, C].
-    means: a C-vector of values to subtract from each channel.
-    num_channels: number of color channels in the image that will be distorted.
-
-  Returns:
-    the centered image.
-
-  Raises:
-    ValueError: If the rank of `image` is unknown, if `image` has a rank other
-      than three or if the number of channels in `image` doesn't match the
-      number of values in `means`.
-  """
-  if image.get_shape().ndims != 3:
-    raise ValueError('Input must be of size [height, width, C>0]')
-
-  if len(means) != num_channels:
-    raise ValueError('len(means) must match the number of channels')
-
-  # We have a 1-D tensor of means; convert to 3-D.
-  # Note(b/130245863): we explicitly call `broadcast` instead of simply
-  # expanding dimensions for better performance.
-  means = tf.broadcast_to(means, tf.shape(image))
-
-  return image - means
-
-
-def _smallest_size_at_least(height, width, resize_min):
-  """Computes new shape with the smallest side equal to `smallest_side`.
-
-  Computes new shape with the smallest side equal to `smallest_side` while
-  preserving the original aspect ratio.
-
-  Args:
-    height: an int32 scalar tensor indicating the current height.
-    width: an int32 scalar tensor indicating the current width.
-    resize_min: A python integer or scalar `Tensor` indicating the size of
-      the smallest side after resize.
-
-  Returns:
-    new_height: an int32 scalar tensor indicating the new height.
-    new_width: an int32 scalar tensor indicating the new width.
-  """
-  resize_min = tf.cast(resize_min, tf.float32)
-
-  # Convert to floats to make subsequent calculations go smoothly.
-  height, width = tf.cast(height, tf.float32), tf.cast(width, tf.float32)
-
-  smaller_dim = tf.minimum(height, width)
-  scale_ratio = resize_min / smaller_dim
-
-  # Convert back to ints to make heights and widths that TF ops will accept.
-  new_height = tf.cast(height * scale_ratio, tf.int32)
-  new_width = tf.cast(width * scale_ratio, tf.int32)
-
-  return new_height, new_width
-
-
-def _aspect_preserving_resize(image, resize_min):
-  """Resize images preserving the original aspect ratio.
-
-  Args:
-    image: A 3-D image `Tensor`.
-    resize_min: A python integer or scalar `Tensor` indicating the size of
-      the smallest side after resize.
-
-  Returns:
-    resized_image: A 3-D tensor containing the resized image.
-  """
-  shape = tf.shape(input=image)
-  height, width = shape[0], shape[1]
-
-  new_height, new_width = _smallest_size_at_least(height, width, resize_min)
-
-  return _resize_image(image, new_height, new_width)
-
-
-def _resize_image(image, height, width):
-  """Simple wrapper around tf.resize_images.
-
-  This is primarily to make sure we use the same `ResizeMethod` and other
-  details each time.
-
-  Args:
-    image: A 3-D image `Tensor`.
-    height: The target height for the resized image.
-    width: The target width for the resized image.
-
-  Returns:
-    resized_image: A 3-D tensor containing the resized image. The first two
-      dimensions have the shape [height, width].
-  """
-  return tf.compat.v1.image.resize(
-      image, [height, width], method=tf.image.ResizeMethod.BILINEAR,
-      align_corners=False)
-
-
-def preprocess_image(image_buffer, bbox, output_height, output_width,
-                     num_channels, is_training=False):
-  """Preprocesses the given image.
-
-  Preprocessing includes decoding, cropping, and resizing for both training
-  and eval images. Training preprocessing, however, introduces some random
-  distortion of the image to improve accuracy.
-
-  Args:
-    image_buffer: scalar string Tensor representing the raw JPEG image buffer.
-    bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
-      where each coordinate is [0, 1) and the coordinates are arranged as
-      [ymin, xmin, ymax, xmax].
-    output_height: The height of the image after preprocessing.
-    output_width: The width of the image after preprocessing.
-    num_channels: Integer depth of the image buffer for decoding.
-    is_training: `True` if we're preprocessing the image for training and
-      `False` otherwise.
-
-  Returns:
-    A preprocessed image.
-  """
-  if is_training:
-    # For training, we want to randomize some of the distortions.
-    image = _decode_crop_and_flip(image_buffer, bbox, num_channels)
-    image = _resize_image(image, output_height, output_width)
-  else:
-    # For validation, we want to decode, resize, then just crop the middle.
-    image = tf.image.decode_jpeg(image_buffer, channels=num_channels)
-    image = _aspect_preserving_resize(image, _RESIZE_MIN)
-    image = _central_crop(image, output_height, output_width)
-
-  image.set_shape([output_height, output_width, num_channels])
-
-  return _mean_image_subtraction(image, _CHANNEL_MEANS, num_channels)
--- a/official/r1/resnet/imagenet_test.py
+++ b/official/r1/resnet/imagenet_test.py
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import unittest
-
-import tensorflow as tf  # pylint: disable=g-bad-import-order
-from absl import logging
-
-from official.r1.resnet import imagenet_main
-from official.utils.testing import integration
-
-logging.set_verbosity(logging.ERROR)
-
-_BATCH_SIZE = 32
-_LABEL_CLASSES = 1001
-
-
-class BaseTest(tf.test.TestCase):
-
-  _num_validation_images = None
-
-  @classmethod
-  def setUpClass(cls):  # pylint: disable=invalid-name
-    super(BaseTest, cls).setUpClass()
-    imagenet_main.define_imagenet_flags()
-
-  def setUp(self):
-    super(BaseTest, self).setUp()
-    tf.compat.v1.disable_eager_execution()
-    self._num_validation_images = imagenet_main.NUM_IMAGES['validation']
-    imagenet_main.NUM_IMAGES['validation'] = 4
-
-  def tearDown(self):
-    super(BaseTest, self).tearDown()
-    tf.io.gfile.rmtree(self.get_temp_dir())
-    imagenet_main.NUM_IMAGES['validation'] = self._num_validation_images
-
-  def _tensor_shapes_helper(self, resnet_size, resnet_version, dtype, with_gpu):
-    """Checks the tensor shapes after each phase of the ResNet model."""
-    def reshape(shape):
-      """Returns the expected dimensions depending on if a GPU is being used."""
-
-      # If a GPU is used for the test, the shape is returned (already in NCHW
-      # form). When GPU is not used, the shape is converted to NHWC.
-      if with_gpu:
-        return shape
-      return shape[0], shape[2], shape[3], shape[1]
-
-    graph = tf.Graph()
-
-    with graph.as_default(), self.test_session(
-        graph=graph, use_gpu=with_gpu, force_gpu=with_gpu):
-      model = imagenet_main.ImagenetModel(
-          resnet_size=resnet_size,
-          data_format='channels_first' if with_gpu else 'channels_last',
-          resnet_version=resnet_version,
-          dtype=dtype
-      )
-      inputs = tf.random.uniform([1, 224, 224, 3])
-      output = model(inputs, training=True)
-
-      initial_conv = graph.get_tensor_by_name('resnet_model/initial_conv:0')
-      max_pool = graph.get_tensor_by_name('resnet_model/initial_max_pool:0')
-      block_layer1 = graph.get_tensor_by_name('resnet_model/block_layer1:0')
-      block_layer2 = graph.get_tensor_by_name('resnet_model/block_layer2:0')
-      block_layer3 = graph.get_tensor_by_name('resnet_model/block_layer3:0')
-      block_layer4 = graph.get_tensor_by_name('resnet_model/block_layer4:0')
-      reduce_mean = graph.get_tensor_by_name('resnet_model/final_reduce_mean:0')
-      dense = graph.get_tensor_by_name('resnet_model/final_dense:0')
-
-      self.assertAllEqual(initial_conv.shape, reshape((1, 64, 112, 112)))
-      self.assertAllEqual(max_pool.shape, reshape((1, 64, 56, 56)))
-
-      # The number of channels after each block depends on whether we're
-      # using the building_block or the bottleneck_block.
-      if resnet_size < 50:
-        self.assertAllEqual(block_layer1.shape, reshape((1, 64, 56, 56)))
-        self.assertAllEqual(block_layer2.shape, reshape((1, 128, 28, 28)))
-        self.assertAllEqual(block_layer3.shape, reshape((1, 256, 14, 14)))
-        self.assertAllEqual(block_layer4.shape, reshape((1, 512, 7, 7)))
-        self.assertAllEqual(reduce_mean.shape, reshape((1, 512, 1, 1)))
-      else:
-        self.assertAllEqual(block_layer1.shape, reshape((1, 256, 56, 56)))
-        self.assertAllEqual(block_layer2.shape, reshape((1, 512, 28, 28)))
-        self.assertAllEqual(block_layer3.shape, reshape((1, 1024, 14, 14)))
-        self.assertAllEqual(block_layer4.shape, reshape((1, 2048, 7, 7)))
-        self.assertAllEqual(reduce_mean.shape, reshape((1, 2048, 1, 1)))
-
-      self.assertAllEqual(dense.shape, (1, _LABEL_CLASSES))
-      self.assertAllEqual(output.shape, (1, _LABEL_CLASSES))
-
-  def tensor_shapes_helper(self, resnet_size, resnet_version, with_gpu=False):
-    self._tensor_shapes_helper(resnet_size=resnet_size,
-                               resnet_version=resnet_version,
-                               dtype=tf.float32, with_gpu=with_gpu)
-    self._tensor_shapes_helper(resnet_size=resnet_size,
-                               resnet_version=resnet_version,
-                               dtype=tf.float16, with_gpu=with_gpu)
-
-  def test_tensor_shapes_resnet_18_v1(self):
-    self.tensor_shapes_helper(18, resnet_version=1)
-
-  def test_tensor_shapes_resnet_18_v2(self):
-    self.tensor_shapes_helper(18, resnet_version=2)
-
-  def test_tensor_shapes_resnet_34_v1(self):
-    self.tensor_shapes_helper(34, resnet_version=1)
-
-  def test_tensor_shapes_resnet_34_v2(self):
-    self.tensor_shapes_helper(34, resnet_version=2)
-
-  def test_tensor_shapes_resnet_50_v1(self):
-    self.tensor_shapes_helper(50, resnet_version=1)
-
-  def test_tensor_shapes_resnet_50_v2(self):
-    self.tensor_shapes_helper(50, resnet_version=2)
-
-  def test_tensor_shapes_resnet_101_v1(self):
-    self.tensor_shapes_helper(101, resnet_version=1)
-
-  def test_tensor_shapes_resnet_101_v2(self):
-    self.tensor_shapes_helper(101, resnet_version=2)
-
-  def test_tensor_shapes_resnet_152_v1(self):
-    self.tensor_shapes_helper(152, resnet_version=1)
-
-  def test_tensor_shapes_resnet_152_v2(self):
-    self.tensor_shapes_helper(152, resnet_version=2)
-
-  def test_tensor_shapes_resnet_200_v1(self):
-    self.tensor_shapes_helper(200, resnet_version=1)
-
-  def test_tensor_shapes_resnet_200_v2(self):
-    self.tensor_shapes_helper(200, resnet_version=2)
-
-  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
-  def test_tensor_shapes_resnet_18_with_gpu_v1(self):
-    self.tensor_shapes_helper(18, resnet_version=1, with_gpu=True)
-
-  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
-  def test_tensor_shapes_resnet_18_with_gpu_v2(self):
-    self.tensor_shapes_helper(18, resnet_version=2, with_gpu=True)
-
-  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
-  def test_tensor_shapes_resnet_34_with_gpu_v1(self):
-    self.tensor_shapes_helper(34, resnet_version=1, with_gpu=True)
-
-  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
-  def test_tensor_shapes_resnet_34_with_gpu_v2(self):
-    self.tensor_shapes_helper(34, resnet_version=2, with_gpu=True)
-
-  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
-  def test_tensor_shapes_resnet_50_with_gpu_v1(self):
-    self.tensor_shapes_helper(50, resnet_version=1, with_gpu=True)
-
-  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
-  def test_tensor_shapes_resnet_50_with_gpu_v2(self):
-    self.tensor_shapes_helper(50, resnet_version=2, with_gpu=True)
-
-  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
-  def test_tensor_shapes_resnet_101_with_gpu_v1(self):
-    self.tensor_shapes_helper(101, resnet_version=1, with_gpu=True)
-
-  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
-  def test_tensor_shapes_resnet_101_with_gpu_v2(self):
-    self.tensor_shapes_helper(101, resnet_version=2, with_gpu=True)
-
-  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
-  def test_tensor_shapes_resnet_152_with_gpu_v1(self):
-    self.tensor_shapes_helper(152, resnet_version=1, with_gpu=True)
-
-  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
-  def test_tensor_shapes_resnet_152_with_gpu_v2(self):
-    self.tensor_shapes_helper(152, resnet_version=2, with_gpu=True)
-
-  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
-  def test_tensor_shapes_resnet_200_with_gpu_v1(self):
-    self.tensor_shapes_helper(200, resnet_version=1, with_gpu=True)
-
-  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
-  def test_tensor_shapes_resnet_200_with_gpu_v2(self):
-    self.tensor_shapes_helper(200, resnet_version=2, with_gpu=True)
-
-  def resnet_model_fn_helper(self, mode, resnet_version, dtype):
-    """Tests that the EstimatorSpec is given the appropriate arguments."""
-    tf.compat.v1.train.create_global_step()
-
-    input_fn = imagenet_main.get_synth_input_fn(dtype)
-    dataset = input_fn(True, '', _BATCH_SIZE)
-    iterator = tf.compat.v1.data.make_initializable_iterator(dataset)
-    features, labels = iterator.get_next()
-    spec = imagenet_main.imagenet_model_fn(
-        features, labels, mode, {
-            'dtype': dtype,
-            'resnet_size': 50,
-            'data_format': 'channels_last',
-            'batch_size': _BATCH_SIZE,
-            'resnet_version': resnet_version,
-            'loss_scale': 128 if dtype == tf.float16 else 1,
-            'fine_tune': False,
-        })
-
-    predictions = spec.predictions
-    self.assertAllEqual(predictions['probabilities'].shape,
-                        (_BATCH_SIZE, _LABEL_CLASSES))
-    self.assertEqual(predictions['probabilities'].dtype, tf.float32)
-    self.assertAllEqual(predictions['classes'].shape, (_BATCH_SIZE,))
-    self.assertEqual(predictions['classes'].dtype, tf.int64)
-
-    if mode != tf.estimator.ModeKeys.PREDICT:
-      loss = spec.loss
-      self.assertAllEqual(loss.shape, ())
-      self.assertEqual(loss.dtype, tf.float32)
-
-    if mode == tf.estimator.ModeKeys.EVAL:
-      eval_metric_ops = spec.eval_metric_ops
-      self.assertAllEqual(eval_metric_ops['accuracy'][0].shape, ())
-      self.assertAllEqual(eval_metric_ops['accuracy'][1].shape, ())
-      self.assertEqual(eval_metric_ops['accuracy'][0].dtype, tf.float32)
-      self.assertEqual(eval_metric_ops['accuracy'][1].dtype, tf.float32)
-
-  def test_resnet_model_fn_train_mode_v1(self):
-    self.resnet_model_fn_helper(tf.estimator.ModeKeys.TRAIN, resnet_version=1,
-                                dtype=tf.float32)
-
-  def test_resnet_model_fn_train_mode_v2(self):
-    self.resnet_model_fn_helper(tf.estimator.ModeKeys.TRAIN, resnet_version=2,
-                                dtype=tf.float32)
-
-  def test_resnet_model_fn_eval_mode_v1(self):
-    self.resnet_model_fn_helper(tf.estimator.ModeKeys.EVAL, resnet_version=1,
-                                dtype=tf.float32)
-
-  def test_resnet_model_fn_eval_mode_v2(self):
-    self.resnet_model_fn_helper(tf.estimator.ModeKeys.EVAL, resnet_version=2,
-                                dtype=tf.float32)
-
-  def test_resnet_model_fn_predict_mode_v1(self):
-    self.resnet_model_fn_helper(tf.estimator.ModeKeys.PREDICT, resnet_version=1,
-                                dtype=tf.float32)
-
-  def test_resnet_model_fn_predict_mode_v2(self):
-    self.resnet_model_fn_helper(tf.estimator.ModeKeys.PREDICT, resnet_version=2,
-                                dtype=tf.float32)
-
-  def _test_imagenetmodel_shape(self, resnet_version):
-    batch_size = 135
-    num_classes = 246
-
-    model = imagenet_main.ImagenetModel(
-        50, data_format='channels_last', num_classes=num_classes,
-        resnet_version=resnet_version)
-
-    fake_input = tf.random.uniform([batch_size, 224, 224, 3])
-    output = model(fake_input, training=True)
-
-    self.assertAllEqual(output.shape, (batch_size, num_classes))
-
-  def test_imagenetmodel_shape_v1(self):
-    self._test_imagenetmodel_shape(resnet_version=1)
-
-  def test_imagenetmodel_shape_v2(self):
-    self._test_imagenetmodel_shape(resnet_version=2)
-
-  def test_imagenet_end_to_end_synthetic_v1(self):
-    integration.run_synthetic(
-        main=imagenet_main.run_imagenet, tmp_root=self.get_temp_dir(),
-        extra_flags=['-resnet_version', '1', '-batch_size', '4',
-                     '--max_train_steps', '1']
-    )
-
-  def test_imagenet_end_to_end_synthetic_v2(self):
-    integration.run_synthetic(
-        main=imagenet_main.run_imagenet, tmp_root=self.get_temp_dir(),
-        extra_flags=['-resnet_version', '2', '-batch_size', '4',
-                     '--max_train_steps', '1']
-    )
-
-  def test_imagenet_end_to_end_synthetic_v1_tiny(self):
-    integration.run_synthetic(
-        main=imagenet_main.run_imagenet, tmp_root=self.get_temp_dir(),
-        extra_flags=['-resnet_version', '1', '-batch_size', '4',
-                     '-resnet_size', '18', '--max_train_steps', '1']
-    )
-
-  def test_imagenet_end_to_end_synthetic_v2_tiny(self):
-    integration.run_synthetic(
-        main=imagenet_main.run_imagenet, tmp_root=self.get_temp_dir(),
-        extra_flags=['-resnet_version', '2', '-batch_size', '4',
-                     '-resnet_size', '18', '--max_train_steps', '1']
-    )
-
-  def test_imagenet_end_to_end_synthetic_v1_huge(self):
-    integration.run_synthetic(
-        main=imagenet_main.run_imagenet, tmp_root=self.get_temp_dir(),
-        extra_flags=['-resnet_version', '1', '-batch_size', '4',
-                     '-resnet_size', '200', '--max_train_steps', '1']
-    )
-
-  def test_imagenet_end_to_end_synthetic_v2_huge(self):
-    integration.run_synthetic(
-        main=imagenet_main.run_imagenet, tmp_root=self.get_temp_dir(),
-        extra_flags=['-resnet_version', '2', '-batch_size', '4',
-                     '-resnet_size', '200', '--max_train_steps', '1']
-    )
-
-
-if __name__ == '__main__':
-  tf.test.main()
--- a/official/r1/resnet/resnet_model.py
+++ b/official/r1/resnet/resnet_model.py
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Contains definitions for Residual Networks.
-
-Residual networks ('v1' ResNets) were originally proposed in:
-[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
-    Deep Residual Learning for Image Recognition. arXiv:1512.03385
-
-The full preactivation 'v2' ResNet variant was introduced by:
-[2] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
-    Identity Mappings in Deep Residual Networks. arXiv: 1603.05027
-
-The key difference of the full preactivation 'v2' variant compared to the
-'v1' variant in [1] is the use of batch normalization before every weight layer
-rather than after.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-
-_BATCH_NORM_DECAY = 0.997
-_BATCH_NORM_EPSILON = 1e-5
-DEFAULT_VERSION = 2
-DEFAULT_DTYPE = tf.float32
-CASTABLE_TYPES = (tf.float16,)
-ALLOWED_TYPES = (DEFAULT_DTYPE,) + CASTABLE_TYPES
-
-
-################################################################################
-# Convenience functions for building the ResNet model.
-################################################################################
-def batch_norm(inputs, training, data_format):
-  """Performs a batch normalization using a standard set of parameters."""
-  # We set fused=True for a significant performance boost. See
-  # https://www.tensorflow.org/performance/performance_guide#common_fused_ops
-  return tf.compat.v1.layers.batch_normalization(
-      inputs=inputs, axis=1 if data_format == 'channels_first' else 3,
-      momentum=_BATCH_NORM_DECAY, epsilon=_BATCH_NORM_EPSILON, center=True,
-      scale=True, training=training, fused=True)
-
-
-def fixed_padding(inputs, kernel_size, data_format):
-  """Pads the input along the spatial dimensions independently of input size.
-
-  Args:
-    inputs: A tensor of size [batch, channels, height_in, width_in] or
-      [batch, height_in, width_in, channels] depending on data_format.
-    kernel_size: The kernel to be used in the conv2d or max_pool2d operation.
-                 Should be a positive integer.
-    data_format: The input format ('channels_last' or 'channels_first').
-
-  Returns:
-    A tensor with the same format as the input with the data either intact
-    (if kernel_size == 1) or padded (if kernel_size > 1).
-  """
-  pad_total = kernel_size - 1
-  pad_beg = pad_total // 2
-  pad_end = pad_total - pad_beg
-
-  if data_format == 'channels_first':
-    padded_inputs = tf.pad(tensor=inputs,
-                           paddings=[[0, 0], [0, 0], [pad_beg, pad_end],
-                                     [pad_beg, pad_end]])
-  else:
-    padded_inputs = tf.pad(tensor=inputs,
-                           paddings=[[0, 0], [pad_beg, pad_end],
-                                     [pad_beg, pad_end], [0, 0]])
-  return padded_inputs
-
-
-def conv2d_fixed_padding(inputs, filters, kernel_size, strides, data_format):
-  """Strided 2-D convolution with explicit padding."""
-  # The padding is consistent and is based only on `kernel_size`, not on the
-  # dimensions of `inputs` (as opposed to using `tf.layers.conv2d` alone).
-  if strides > 1:
-    inputs = fixed_padding(inputs, kernel_size, data_format)
-
-  return tf.compat.v1.layers.conv2d(
-      inputs=inputs, filters=filters, kernel_size=kernel_size, strides=strides,
-      padding=('SAME' if strides == 1 else 'VALID'), use_bias=False,
-      kernel_initializer=tf.compat.v1.variance_scaling_initializer(),
-      data_format=data_format)
-
-
-################################################################################
-# ResNet block definitions.
-################################################################################
-def _building_block_v1(inputs, filters, training, projection_shortcut, strides,
-                       data_format):
-  """A single block for ResNet v1, without a bottleneck.
-
-  Convolution then batch normalization then ReLU as described by:
-    Deep Residual Learning for Image Recognition
-    https://arxiv.org/pdf/1512.03385.pdf
-    by Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun, Dec 2015.
-
-  Args:
-    inputs: A tensor of size [batch, channels, height_in, width_in] or
-      [batch, height_in, width_in, channels] depending on data_format.
-    filters: The number of filters for the convolutions.
-    training: A Boolean for whether the model is in training or inference
-      mode. Needed for batch normalization.
-    projection_shortcut: The function to use for projection shortcuts
-      (typically a 1x1 convolution when downsampling the input).
-    strides: The block's stride. If greater than 1, this block will ultimately
-      downsample the input.
-    data_format: The input format ('channels_last' or 'channels_first').
-
-  Returns:
-    The output tensor of the block; shape should match inputs.
-  """
-  shortcut = inputs
-
-  if projection_shortcut is not None:
-    shortcut = projection_shortcut(inputs)
-    shortcut = batch_norm(inputs=shortcut, training=training,
-                          data_format=data_format)
-
-  inputs = conv2d_fixed_padding(
-      inputs=inputs, filters=filters, kernel_size=3, strides=strides,
-      data_format=data_format)
-  inputs = batch_norm(inputs, training, data_format)
-  inputs = tf.nn.relu(inputs)
-
-  inputs = conv2d_fixed_padding(
-      inputs=inputs, filters=filters, kernel_size=3, strides=1,
-      data_format=data_format)
-  inputs = batch_norm(inputs, training, data_format)
-  inputs += shortcut
-  inputs = tf.nn.relu(inputs)
-
-  return inputs
-
-
-def _building_block_v2(inputs, filters, training, projection_shortcut, strides,
-                       data_format):
-  """A single block for ResNet v2, without a bottleneck.
-
-  Batch normalization then ReLu then convolution as described by:
-    Identity Mappings in Deep Residual Networks
-    https://arxiv.org/pdf/1603.05027.pdf
-    by Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun, Jul 2016.
-
-  Args:
-    inputs: A tensor of size [batch, channels, height_in, width_in] or
-      [batch, height_in, width_in, channels] depending on data_format.
-    filters: The number of filters for the convolutions.
-    training: A Boolean for whether the model is in training or inference
-      mode. Needed for batch normalization.
-    projection_shortcut: The function to use for projection shortcuts
-      (typically a 1x1 convolution when downsampling the input).
-    strides: The block's stride. If greater than 1, this block will ultimately
-      downsample the input.
-    data_format: The input format ('channels_last' or 'channels_first').
-
-  Returns:
-    The output tensor of the block; shape should match inputs.
-  """
-  shortcut = inputs
-  inputs = batch_norm(inputs, training, data_format)
-  inputs = tf.nn.relu(inputs)
-
-  # The projection shortcut should come after the first batch norm and ReLU
-  # since it performs a 1x1 convolution.
-  if projection_shortcut is not None:
-    shortcut = projection_shortcut(inputs)
-
-  inputs = conv2d_fixed_padding(
-      inputs=inputs, filters=filters, kernel_size=3, strides=strides,
-      data_format=data_format)
-
-  inputs = batch_norm(inputs, training, data_format)
-  inputs = tf.nn.relu(inputs)
-  inputs = conv2d_fixed_padding(
-      inputs=inputs, filters=filters, kernel_size=3, strides=1,
-      data_format=data_format)
-
-  return inputs + shortcut
-
-
-def _bottleneck_block_v1(inputs, filters, training, projection_shortcut,
-                         strides, data_format):
-  """A single block for ResNet v1, with a bottleneck.
-
-  Similar to _building_block_v1(), except using the "bottleneck" blocks
-  described in:
-    Convolution then batch normalization then ReLU as described by:
-      Deep Residual Learning for Image Recognition
-      https://arxiv.org/pdf/1512.03385.pdf
-      by Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun, Dec 2015.
-
-  Args:
-    inputs: A tensor of size [batch, channels, height_in, width_in] or
-      [batch, height_in, width_in, channels] depending on data_format.
-    filters: The number of filters for the convolutions.
-    training: A Boolean for whether the model is in training or inference
-      mode. Needed for batch normalization.
-    projection_shortcut: The function to use for projection shortcuts
-      (typically a 1x1 convolution when downsampling the input).
-    strides: The block's stride. If greater than 1, this block will ultimately
-      downsample the input.
-    data_format: The input format ('channels_last' or 'channels_first').
-
-  Returns:
-    The output tensor of the block; shape should match inputs.
-  """
-  shortcut = inputs
-
-  if projection_shortcut is not None:
-    shortcut = projection_shortcut(inputs)
-    shortcut = batch_norm(inputs=shortcut, training=training,
-                          data_format=data_format)
-
-  inputs = conv2d_fixed_padding(
-      inputs=inputs, filters=filters, kernel_size=1, strides=1,
-      data_format=data_format)
-  inputs = batch_norm(inputs, training, data_format)
-  inputs = tf.nn.relu(inputs)
-
-  inputs = conv2d_fixed_padding(
-      inputs=inputs, filters=filters, kernel_size=3, strides=strides,
-      data_format=data_format)
-  inputs = batch_norm(inputs, training, data_format)
-  inputs = tf.nn.relu(inputs)
-
-  inputs = conv2d_fixed_padding(
-      inputs=inputs, filters=4 * filters, kernel_size=1, strides=1,
-      data_format=data_format)
-  inputs = batch_norm(inputs, training, data_format)
-  inputs += shortcut
-  inputs = tf.nn.relu(inputs)
-
-  return inputs
-
-
-def _bottleneck_block_v2(inputs, filters, training, projection_shortcut,
-                         strides, data_format):
-  """A single block for ResNet v2, with a bottleneck.
-
-  Similar to _building_block_v2(), except using the "bottleneck" blocks
-  described in:
-    Convolution then batch normalization then ReLU as described by:
-      Deep Residual Learning for Image Recognition
-      https://arxiv.org/pdf/1512.03385.pdf
-      by Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun, Dec 2015.
-
-  Adapted to the ordering conventions of:
-    Batch normalization then ReLu then convolution as described by:
-      Identity Mappings in Deep Residual Networks
-      https://arxiv.org/pdf/1603.05027.pdf
-      by Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun, Jul 2016.
-
-  Args:
-    inputs: A tensor of size [batch, channels, height_in, width_in] or
-      [batch, height_in, width_in, channels] depending on data_format.
-    filters: The number of filters for the convolutions.
-    training: A Boolean for whether the model is in training or inference
-      mode. Needed for batch normalization.
-    projection_shortcut: The function to use for projection shortcuts
-      (typically a 1x1 convolution when downsampling the input).
-    strides: The block's stride. If greater than 1, this block will ultimately
-      downsample the input.
-    data_format: The input format ('channels_last' or 'channels_first').
-
-  Returns:
-    The output tensor of the block; shape should match inputs.
-  """
-  shortcut = inputs
-  inputs = batch_norm(inputs, training, data_format)
-  inputs = tf.nn.relu(inputs)
-
-  # The projection shortcut should come after the first batch norm and ReLU
-  # since it performs a 1x1 convolution.
-  if projection_shortcut is not None:
-    shortcut = projection_shortcut(inputs)
-
-  inputs = conv2d_fixed_padding(
-      inputs=inputs, filters=filters, kernel_size=1, strides=1,
-      data_format=data_format)
-
-  inputs = batch_norm(inputs, training, data_format)
-  inputs = tf.nn.relu(inputs)
-  inputs = conv2d_fixed_padding(
-      inputs=inputs, filters=filters, kernel_size=3, strides=strides,
-      data_format=data_format)
-
-  inputs = batch_norm(inputs, training, data_format)
-  inputs = tf.nn.relu(inputs)
-  inputs = conv2d_fixed_padding(
-      inputs=inputs, filters=4 * filters, kernel_size=1, strides=1,
-      data_format=data_format)
-
-  return inputs + shortcut
-
-
-def block_layer(inputs, filters, bottleneck, block_fn, blocks, strides,
-                training, name, data_format):
-  """Creates one layer of blocks for the ResNet model.
-
-  Args:
-    inputs: A tensor of size [batch, channels, height_in, width_in] or
-      [batch, height_in, width_in, channels] depending on data_format.
-    filters: The number of filters for the first convolution of the layer.
-    bottleneck: Is the block created a bottleneck block.
-    block_fn: The block to use within the model, either `building_block` or
-      `bottleneck_block`.
-    blocks: The number of blocks contained in the layer.
-    strides: The stride to use for the first convolution of the layer. If
-      greater than 1, this layer will ultimately downsample the input.
-    training: Either True or False, whether we are currently training the
-      model. Needed for batch norm.
-    name: A string name for the tensor output of the block layer.
-    data_format: The input format ('channels_last' or 'channels_first').
-
-  Returns:
-    The output tensor of the block layer.
-  """
-
-  # Bottleneck blocks end with 4x the number of filters as they start with
-  filters_out = filters * 4 if bottleneck else filters
-
-  def projection_shortcut(inputs):
-    return conv2d_fixed_padding(
-        inputs=inputs, filters=filters_out, kernel_size=1, strides=strides,
-        data_format=data_format)
-
-  # Only the first block per block_layer uses projection_shortcut and strides
-  inputs = block_fn(inputs, filters, training, projection_shortcut, strides,
-                    data_format)
-
-  for _ in range(1, blocks):
-    inputs = block_fn(inputs, filters, training, None, 1, data_format)
-
-  return tf.identity(inputs, name)
-
-
-class Model(object):
-  """Base class for building the Resnet Model."""
-
-  def __init__(self, resnet_size, bottleneck, num_classes, num_filters,
-               kernel_size,
-               conv_stride, first_pool_size, first_pool_stride,
-               block_sizes, block_strides,
-               resnet_version=DEFAULT_VERSION, data_format=None,
-               dtype=DEFAULT_DTYPE):
-    """Creates a model for classifying an image.
-
-    Args:
-      resnet_size: A single integer for the size of the ResNet model.
-      bottleneck: Use regular blocks or bottleneck blocks.
-      num_classes: The number of classes used as labels.
-      num_filters: The number of filters to use for the first block layer
-        of the model. This number is then doubled for each subsequent block
-        layer.
-      kernel_size: The kernel size to use for convolution.
-      conv_stride: stride size for the initial convolutional layer
-      first_pool_size: Pool size to be used for the first pooling layer.
-        If none, the first pooling layer is skipped.
-      first_pool_stride: stride size for the first pooling layer. Not used
-        if first_pool_size is None.
-      block_sizes: A list containing n values, where n is the number of sets of
-        block layers desired. Each value should be the number of blocks in the
-        i-th set.
-      block_strides: List of integers representing the desired stride size for
-        each of the sets of block layers. Should be same length as block_sizes.
-      resnet_version: Integer representing which version of the ResNet network
-        to use. See README for details. Valid values: [1, 2]
-      data_format: Input format ('channels_last', 'channels_first', or None).
-        If set to None, the format is dependent on whether a GPU is available.
-      dtype: The TensorFlow dtype to use for calculations. If not specified
-        tf.float32 is used.
-
-    Raises:
-      ValueError: if invalid version is selected.
-    """
-    self.resnet_size = resnet_size
-
-    if not data_format:
-      data_format = ('channels_first' if tf.config.list_physical_devices('GPU')
-                     else 'channels_last')
-
-    self.resnet_version = resnet_version
-    if resnet_version not in (1, 2):
-      raise ValueError(
-          'Resnet version should be 1 or 2. See README for citations.')
-
-    self.bottleneck = bottleneck
-    if bottleneck:
-      if resnet_version == 1:
-        self.block_fn = _bottleneck_block_v1
-      else:
-        self.block_fn = _bottleneck_block_v2
-    else:
-      if resnet_version == 1:
-        self.block_fn = _building_block_v1
-      else:
-        self.block_fn = _building_block_v2
-
-    if dtype not in ALLOWED_TYPES:
-      raise ValueError('dtype must be one of: {}'.format(ALLOWED_TYPES))
-
-    self.data_format = data_format
-    self.num_classes = num_classes
-    self.num_filters = num_filters
-    self.kernel_size = kernel_size
-    self.conv_stride = conv_stride
-    self.first_pool_size = first_pool_size
-    self.first_pool_stride = first_pool_stride
-    self.block_sizes = block_sizes
-    self.block_strides = block_strides
-    self.dtype = dtype
-    self.pre_activation = resnet_version == 2
-
-  def _custom_dtype_getter(self, getter, name, shape=None, dtype=DEFAULT_DTYPE,
-                           *args, **kwargs):
-    """Creates variables in fp32, then casts to fp16 if necessary.
-
-    This function is a custom getter. A custom getter is a function with the
-    same signature as tf.get_variable, except it has an additional getter
-    parameter. Custom getters can be passed as the `custom_getter` parameter of
-    tf.variable_scope. Then, tf.get_variable will call the custom getter,
-    instead of directly getting a variable itself. This can be used to change
-    the types of variables that are retrieved with tf.get_variable.
-    The `getter` parameter is the underlying variable getter, that would have
-    been called if no custom getter was used. Custom getters typically get a
-    variable with `getter`, then modify it in some way.
-
-    This custom getter will create an fp32 variable. If a low precision
-    (e.g. float16) variable was requested it will then cast the variable to the
-    requested dtype. The reason we do not directly create variables in low
-    precision dtypes is that applying small gradients to such variables may
-    cause the variable not to change.
-
-    Args:
-      getter: The underlying variable getter, that has the same signature as
-        tf.get_variable and returns a variable.
-      name: The name of the variable to get.
-      shape: The shape of the variable to get.
-      dtype: The dtype of the variable to get. Note that if this is a low
-        precision dtype, the variable will be created as a tf.float32 variable,
-        then cast to the appropriate dtype
-      *args: Additional arguments to pass unmodified to getter.
-      **kwargs: Additional keyword arguments to pass unmodified to getter.
-
-    Returns:
-      A variable which is cast to fp16 if necessary.
-    """
-
-    if dtype in CASTABLE_TYPES:
-      var = getter(name, shape, tf.float32, *args, **kwargs)
-      return tf.cast(var, dtype=dtype, name=name + '_cast')
-    else:
-      return getter(name, shape, dtype, *args, **kwargs)
-
-  def _model_variable_scope(self):
-    """Returns a variable scope that the model should be created under.
-
-    If self.dtype is a castable type, model variable will be created in fp32
-    then cast to self.dtype before being used.
-
-    Returns:
-      A variable scope for the model.
-    """
-
-    return tf.compat.v1.variable_scope('resnet_model',
-                                       custom_getter=self._custom_dtype_getter)
-
-  def __call__(self, inputs, training):
-    """Add operations to classify a batch of input images.
-
-    Args:
-      inputs: A Tensor representing a batch of input images.
-      training: A boolean. Set to True to add operations required only when
-        training the classifier.
-
-    Returns:
-      A logits Tensor with shape [<batch_size>, self.num_classes].
-    """
-
-    with self._model_variable_scope():
-      if self.data_format == 'channels_first':
-        # Convert the inputs from channels_last (NHWC) to channels_first (NCHW).
-        # This provides a large performance boost on GPU. See
-        # https://www.tensorflow.org/performance/performance_guide#data_formats
-        inputs = tf.transpose(a=inputs, perm=[0, 3, 1, 2])
-
-      inputs = conv2d_fixed_padding(
-          inputs=inputs, filters=self.num_filters, kernel_size=self.kernel_size,
-          strides=self.conv_stride, data_format=self.data_format)
-      inputs = tf.identity(inputs, 'initial_conv')
-
-      # We do not include batch normalization or activation functions in V2
-      # for the initial conv1 because the first ResNet unit will perform these
-      # for both the shortcut and non-shortcut paths as part of the first
-      # block's projection. Cf. Appendix of [2].
-      if self.resnet_version == 1:
-        inputs = batch_norm(inputs, training, self.data_format)
-        inputs = tf.nn.relu(inputs)
-
-      if self.first_pool_size:
-        inputs = tf.compat.v1.layers.max_pooling2d(
-            inputs=inputs, pool_size=self.first_pool_size,
-            strides=self.first_pool_stride, padding='SAME',
-            data_format=self.data_format)
-        inputs = tf.identity(inputs, 'initial_max_pool')
-
-      for i, num_blocks in enumerate(self.block_sizes):
-        num_filters = self.num_filters * (2**i)
-        inputs = block_layer(
-            inputs=inputs, filters=num_filters, bottleneck=self.bottleneck,
-            block_fn=self.block_fn, blocks=num_blocks,
-            strides=self.block_strides[i], training=training,
-            name='block_layer{}'.format(i + 1), data_format=self.data_format)
-
-      # Only apply the BN and ReLU for model that does pre_activation in each
-      # building/bottleneck block, eg resnet V2.
-      if self.pre_activation:
-        inputs = batch_norm(inputs, training, self.data_format)
-        inputs = tf.nn.relu(inputs)
-
-      # The current top layer has shape
-      # `batch_size x pool_size x pool_size x final_size`.
-      # ResNet does an Average Pooling layer over pool_size,
-      # but that is the same as doing a reduce_mean. We do a reduce_mean
-      # here because it performs better than AveragePooling2D.
-      axes = [2, 3] if self.data_format == 'channels_first' else [1, 2]
-      inputs = tf.reduce_mean(input_tensor=inputs, axis=axes, keepdims=True)
-      inputs = tf.identity(inputs, 'final_reduce_mean')
-
-      inputs = tf.squeeze(inputs, axes)
-      inputs = tf.compat.v1.layers.dense(inputs=inputs, units=self.num_classes)
-      inputs = tf.identity(inputs, 'final_dense')
-      return inputs
--- a/official/r1/resnet/resnet_run_loop.py
+++ b/official/r1/resnet/resnet_run_loop.py
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Contains utility and supporting functions for ResNet.
-
-  This module contains ResNet code which does not directly build layers. This
-includes dataset management, hyperparameter and optimizer code, and argument
-parsing. Code for defining the ResNet layers can be found in resnet_model.py.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import functools
-import math
-import multiprocessing
-import os
-
-from absl import flags
-from absl import logging
-import tensorflow as tf
-
-from official.r1.resnet import imagenet_preprocessing
-from official.r1.resnet import resnet_model
-from official.r1.utils import export
-from official.r1.utils.logs import hooks_helper
-from official.r1.utils.logs import logger
-from official.utils.flags import core as flags_core
-from official.utils.misc import distribution_utils
-from official.utils.misc import model_helpers
-
-
-################################################################################
-# Functions for input processing.
-################################################################################
-def process_record_dataset(dataset,
-                           is_training,
-                           batch_size,
-                           shuffle_buffer,
-                           parse_record_fn,
-                           num_epochs=1,
-                           dtype=tf.float32,
-                           datasets_num_private_threads=None,
-                           drop_remainder=False,
-                           tf_data_experimental_slack=False):
-  """Given a Dataset with raw records, return an iterator over the records.
-
-  Args:
-    dataset: A Dataset representing raw records
-    is_training: A boolean denoting whether the input is for training.
-    batch_size: The number of samples per batch.
-    shuffle_buffer: The buffer size to use when shuffling records. A larger
-      value results in better randomness, but smaller values reduce startup
-      time and use less memory.
-    parse_record_fn: A function that takes a raw record and returns the
-      corresponding (image, label) pair.
-    num_epochs: The number of epochs to repeat the dataset.
-    dtype: Data type to use for images/features.
-    datasets_num_private_threads: Number of threads for a private
-      threadpool created for all datasets computation.
-    drop_remainder: A boolean indicates whether to drop the remainder of the
-      batches. If True, the batch dimension will be static.
-    tf_data_experimental_slack: Whether to enable tf.data's
-      `experimental_slack` option.
-
-  Returns:
-    Dataset of (image, label) pairs ready for iteration.
-  """
-  # Defines a specific size thread pool for tf.data operations.
-  if datasets_num_private_threads:
-    options = tf.data.Options()
-    options.experimental_threading.private_threadpool_size = (
-        datasets_num_private_threads)
-    dataset = dataset.with_options(options)
-    logging.info('datasets_num_private_threads: %s',
-                 datasets_num_private_threads)
-
-  # Disable intra-op parallelism to optimize for throughput instead of latency.
-  options = tf.data.Options()
-  options.experimental_threading.max_intra_op_parallelism = 1
-  dataset = dataset.with_options(options)
-
-  # Prefetches a batch at a time to smooth out the time taken to load input
-  # files for shuffling and processing.
-  dataset = dataset.prefetch(buffer_size=batch_size)
-  if is_training:
-    # Shuffles records before repeating to respect epoch boundaries.
-    dataset = dataset.shuffle(buffer_size=shuffle_buffer)
-
-  # Repeats the dataset for the number of epochs to train.
-  dataset = dataset.repeat(num_epochs)
-
-  # Parses the raw records into images and labels.
-  dataset = dataset.map(
-      lambda value: parse_record_fn(value, is_training, dtype),
-      num_parallel_calls=tf.data.experimental.AUTOTUNE)
-  dataset = dataset.batch(batch_size, drop_remainder=drop_remainder)
-
-  # Operations between the final prefetch and the get_next call to the iterator
-  # will happen synchronously during run time. We prefetch here again to
-  # background all of the above processing work and keep it out of the
-  # critical training path. Setting buffer_size to tf.data.experimental.AUTOTUNE
-  # allows DistributionStrategies to adjust how many batches to fetch based
-  # on how many devices are present.
-  dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
-
-  if tf_data_experimental_slack:
-    options = tf.data.Options()
-    options.experimental_slack = True
-    dataset = dataset.with_options(options)
-
-  return dataset
-
-
-def get_synth_input_fn(height, width, num_channels, num_classes,
-                       dtype=tf.float32):
-  """Returns an input function that returns a dataset with random data.
-
-  This input_fn returns a data set that iterates over a set of random data and
-  bypasses all preprocessing, e.g. jpeg decode and copy. The host to device
-  copy is still included. This used to find the upper throughput bound when
-  tunning the full input pipeline.
-
-  Args:
-    height: Integer height that will be used to create a fake image tensor.
-    width: Integer width that will be used to create a fake image tensor.
-    num_channels: Integer depth that will be used to create a fake image tensor.
-    num_classes: Number of classes that should be represented in the fake labels
-      tensor
-    dtype: Data type for features/images.
-
-  Returns:
-    An input_fn that can be used in place of a real one to return a dataset
-    that can be used for iteration.
-  """
-  # pylint: disable=unused-argument
-  def input_fn(is_training, data_dir, batch_size, *args, **kwargs):
-    """Returns dataset filled with random data."""
-    # Synthetic input should be within [0, 255].
-    inputs = tf.random.truncated_normal(
-        [batch_size] + [height, width, num_channels],
-        dtype=dtype,
-        mean=127,
-        stddev=60,
-        name='synthetic_inputs')
-
-    labels = tf.random.uniform(
-        [batch_size],
-        minval=0,
-        maxval=num_classes - 1,
-        dtype=tf.int32,
-        name='synthetic_labels')
-    data = tf.data.Dataset.from_tensors((inputs, labels)).repeat()
-    data = data.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
-    return data
-
-  return input_fn
-
-
-def image_bytes_serving_input_fn(image_shape, dtype=tf.float32):
-  """Serving input fn for raw jpeg images."""
-
-  def _preprocess_image(image_bytes):
-    """Preprocess a single raw image."""
-    # Bounding box around the whole image.
-    bbox = tf.constant([0.0, 0.0, 1.0, 1.0], dtype=dtype, shape=[1, 1, 4])
-    height, width, num_channels = image_shape
-    image = imagenet_preprocessing.preprocess_image(
-        image_bytes, bbox, height, width, num_channels, is_training=False)
-    return image
-
-  image_bytes_list = tf.compat.v1.placeholder(
-      shape=[None], dtype=tf.string, name='input_tensor')
-  images = tf.map_fn(
-      _preprocess_image, image_bytes_list, back_prop=False, dtype=dtype)
-  return tf.estimator.export.TensorServingInputReceiver(
-      images, {'image_bytes': image_bytes_list})
-
-
-def override_flags_and_set_envars_for_gpu_thread_pool(flags_obj):
-  """Override flags and set env_vars for performance.
-
-  These settings exist to test the difference between using stock settings
-  and manual tuning. It also shows some of the ENV_VARS that can be tweaked to
-  squeeze a few extra examples per second.  These settings are defaulted to the
-  current platform of interest, which changes over time.
-
-  On systems with small numbers of cpu cores, e.g. under 8 logical cores,
-  setting up a gpu thread pool with `tf_gpu_thread_mode=gpu_private` may perform
-  poorly.
-
-  Args:
-    flags_obj: Current flags, which will be adjusted possibly overriding
-    what has been set by the user on the command-line.
-  """
-  cpu_count = multiprocessing.cpu_count()
-  logging.info('Logical CPU cores: %s', cpu_count)
-
-  # Sets up thread pool for each GPU for op scheduling.
-  per_gpu_thread_count = 1
-  total_gpu_thread_count = per_gpu_thread_count * flags_obj.num_gpus
-  os.environ['TF_GPU_THREAD_MODE'] = flags_obj.tf_gpu_thread_mode
-  os.environ['TF_GPU_THREAD_COUNT'] = str(per_gpu_thread_count)
-  logging.info('TF_GPU_THREAD_COUNT: %s', os.environ['TF_GPU_THREAD_COUNT'])
-  logging.info('TF_GPU_THREAD_MODE: %s', os.environ['TF_GPU_THREAD_MODE'])
-
-  # Reduces general thread pool by number of threads used for GPU pool.
-  main_thread_count = cpu_count - total_gpu_thread_count
-  flags_obj.inter_op_parallelism_threads = main_thread_count
-
-  # Sets thread count for tf.data. Logical cores minus threads assign to the
-  # private GPU pool along with 2 thread per GPU for event monitoring and
-  # sending / receiving tensors.
-  num_monitoring_threads = 2 * flags_obj.num_gpus
-  flags_obj.datasets_num_private_threads = (cpu_count - total_gpu_thread_count
-                                            - num_monitoring_threads)
-
-
-################################################################################
-# Functions for running training/eval/validation loops for the model.
-################################################################################
-def learning_rate_with_decay(
-    batch_size, batch_denom, num_images, boundary_epochs, decay_rates,
-    base_lr=0.1, warmup=False):
-  """Get a learning rate that decays step-wise as training progresses.
-
-  Args:
-    batch_size: the number of examples processed in each training batch.
-    batch_denom: this value will be used to scale the base learning rate.
-      `0.1 * batch size` is divided by this number, such that when
-      batch_denom == batch_size, the initial learning rate will be 0.1.
-    num_images: total number of images that will be used for training.
-    boundary_epochs: list of ints representing the epochs at which we
-      decay the learning rate.
-    decay_rates: list of floats representing the decay rates to be used
-      for scaling the learning rate. It should have one more element
-      than `boundary_epochs`, and all elements should have the same type.
-    base_lr: Initial learning rate scaled based on batch_denom.
-    warmup: Run a 5 epoch warmup to the initial lr.
-  Returns:
-    Returns a function that takes a single argument - the number of batches
-    trained so far (global_step)- and returns the learning rate to be used
-    for training the next batch.
-  """
-  initial_learning_rate = base_lr * batch_size / batch_denom
-  batches_per_epoch = num_images / batch_size
-
-  # Reduce the learning rate at certain epochs.
-  # CIFAR-10: divide by 10 at epoch 100, 150, and 200
-  # ImageNet: divide by 10 at epoch 30, 60, 80, and 90
-  boundaries = [int(batches_per_epoch * epoch) for epoch in boundary_epochs]
-  vals = [initial_learning_rate * decay for decay in decay_rates]
-
-  def learning_rate_fn(global_step):
-    """Builds scaled learning rate function with 5 epoch warm up."""
-    lr = tf.compat.v1.train.piecewise_constant(global_step, boundaries, vals)
-    if warmup:
-      warmup_steps = int(batches_per_epoch * 5)
-      warmup_lr = (
-          initial_learning_rate * tf.cast(global_step, tf.float32) / tf.cast(
-              warmup_steps, tf.float32))
-      return tf.cond(pred=global_step < warmup_steps,
-                     true_fn=lambda: warmup_lr,
-                     false_fn=lambda: lr)
-    return lr
-
-  def poly_rate_fn(global_step):
-    """Handles linear scaling rule, gradual warmup, and LR decay.
-
-    The learning rate starts at 0, then it increases linearly per step.  After
-    FLAGS.poly_warmup_epochs, we reach the base learning rate (scaled to account
-    for batch size). The learning rate is then decayed using a polynomial rate
-    decay schedule with power 2.0.
-
-    Args:
-      global_step: the current global_step
-
-    Returns:
-      returns the current learning rate
-    """
-
-    # Learning rate schedule for LARS polynomial schedule
-    if flags.FLAGS.batch_size < 8192:
-      plr = 5.0
-      w_epochs = 5
-    elif flags.FLAGS.batch_size < 16384:
-      plr = 10.0
-      w_epochs = 5
-    elif flags.FLAGS.batch_size < 32768:
-      plr = 25.0
-      w_epochs = 5
-    else:
-      plr = 32.0
-      w_epochs = 14
-
-    w_steps = int(w_epochs * batches_per_epoch)
-    wrate = (plr * tf.cast(global_step, tf.float32) / tf.cast(
-        w_steps, tf.float32))
-
-    # TODO(pkanwar): use a flag to help calc num_epochs.
-    num_epochs = 90
-    train_steps = batches_per_epoch * num_epochs
-
-    min_step = tf.constant(1, dtype=tf.int64)
-    decay_steps = tf.maximum(min_step, tf.subtract(global_step, w_steps))
-    poly_rate = tf.train.polynomial_decay(
-        plr,
-        decay_steps,
-        train_steps - w_steps + 1,
-        power=2.0)
-    return tf.where(global_step <= w_steps, wrate, poly_rate)
-
-  # For LARS we have a new learning rate schedule
-  if flags.FLAGS.enable_lars:
-    return poly_rate_fn
-
-  return learning_rate_fn
-
-
-def per_replica_batch_size(batch_size, num_gpus):
-  """For multi-gpu, batch-size must be a multiple of the number of GPUs.
-
-
-  Note that distribution strategy handles this automatically when used with
-  Keras. For using with Estimator, we need to get per GPU batch.
-
-  Args:
-    batch_size: Global batch size to be divided among devices. This should be
-      equal to num_gpus times the single-GPU batch_size for multi-gpu training.
-    num_gpus: How many GPUs are used with DistributionStrategies.
-
-  Returns:
-    Batch size per device.
-
-  Raises:
-    ValueError: if batch_size is not divisible by number of devices
-  """
-  if num_gpus <= 1:
-    return batch_size
-
-  remainder = batch_size % num_gpus
-  if remainder:
-    err = ('When running with multiple GPUs, batch size '
-           'must be a multiple of the number of available GPUs. Found {} '
-           'GPUs with a batch size of {}; try --batch_size={} instead.'
-          ).format(num_gpus, batch_size, batch_size - remainder)
-    raise ValueError(err)
-  return int(batch_size / num_gpus)
-
-
-def resnet_model_fn(features, labels, mode, model_class,
-                    resnet_size, weight_decay, learning_rate_fn, momentum,
-                    data_format, resnet_version, loss_scale,
-                    loss_filter_fn=None, dtype=resnet_model.DEFAULT_DTYPE,
-                    fine_tune=False, label_smoothing=0.0):
-  """Shared functionality for different resnet model_fns.
-
-  Initializes the ResnetModel representing the model layers
-  and uses that model to build the necessary EstimatorSpecs for
-  the `mode` in question. For training, this means building losses,
-  the optimizer, and the train op that get passed into the EstimatorSpec.
-  For evaluation and prediction, the EstimatorSpec is returned without
-  a train op, but with the necessary parameters for the given mode.
-
-  Args:
-    features: tensor representing input images
-    labels: tensor representing class labels for all input images
-    mode: current estimator mode; should be one of
-      `tf.estimator.ModeKeys.TRAIN`, `EVALUATE`, `PREDICT`
-    model_class: a class representing a TensorFlow model that has a __call__
-      function. We assume here that this is a subclass of ResnetModel.
-    resnet_size: A single integer for the size of the ResNet model.
-    weight_decay: weight decay loss rate used to regularize learned variables.
-    learning_rate_fn: function that returns the current learning rate given
-      the current global_step
-    momentum: momentum term used for optimization
-    data_format: Input format ('channels_last', 'channels_first', or None).
-      If set to None, the format is dependent on whether a GPU is available.
-    resnet_version: Integer representing which version of the ResNet network to
-      use. See README for details. Valid values: [1, 2]
-    loss_scale: The factor to scale the loss for numerical stability. A detailed
-      summary is present in the arg parser help text.
-    loss_filter_fn: function that takes a string variable name and returns
-      True if the var should be included in loss calculation, and False
-      otherwise. If None, batch_normalization variables will be excluded
-      from the loss.
-    dtype: the TensorFlow dtype to use for calculations.
-    fine_tune: If True only train the dense layers(final layers).
-    label_smoothing: If greater than 0 then smooth the labels.
-
-  Returns:
-    EstimatorSpec parameterized according to the input params and the
-    current mode.
-  """
-
-  # Generate a summary node for the images
-  tf.compat.v1.summary.image('images', features, max_outputs=6)
-  # Checks that features/images have same data type being used for calculations.
-  assert features.dtype == dtype
-
-  model = model_class(resnet_size, data_format, resnet_version=resnet_version,
-                      dtype=dtype)
-
-  logits = model(features, mode == tf.estimator.ModeKeys.TRAIN)
-
-  # This acts as a no-op if the logits are already in fp32 (provided logits are
-  # not a SparseTensor). If dtype is is low precision, logits must be cast to
-  # fp32 for numerical stability.
-  logits = tf.cast(logits, tf.float32)
-
-  predictions = {
-      'classes': tf.argmax(input=logits, axis=1),
-      'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
-  }
-
-  if mode == tf.estimator.ModeKeys.PREDICT:
-    # Return the predictions and the specification for serving a SavedModel
-    return tf.estimator.EstimatorSpec(
-        mode=mode,
-        predictions=predictions,
-        export_outputs={
-            'predict': tf.estimator.export.PredictOutput(predictions)
-        })
-
-  # Calculate loss, which includes softmax cross entropy and L2 regularization.
-  if label_smoothing != 0.0:
-    one_hot_labels = tf.one_hot(labels, 1001)
-    cross_entropy = tf.losses.softmax_cross_entropy(
-        logits=logits, onehot_labels=one_hot_labels,
-        label_smoothing=label_smoothing)
-  else:
-    cross_entropy = tf.compat.v1.losses.sparse_softmax_cross_entropy(
-        logits=logits, labels=labels)
-
-  # Create a tensor named cross_entropy for logging purposes.
-  tf.identity(cross_entropy, name='cross_entropy')
-  tf.compat.v1.summary.scalar('cross_entropy', cross_entropy)
-
-  # If no loss_filter_fn is passed, assume we want the default behavior,
-  # which is that batch_normalization variables are excluded from loss.
-  def exclude_batch_norm(name):
-    return 'batch_normalization' not in name
-  loss_filter_fn = loss_filter_fn or exclude_batch_norm
-
-  # Add weight decay to the loss.
-  l2_loss = weight_decay * tf.add_n(
-      # loss is computed using fp32 for numerical stability.
-      [
-          tf.nn.l2_loss(tf.cast(v, tf.float32))
-          for v in tf.compat.v1.trainable_variables()
-          if loss_filter_fn(v.name)
-      ])
-  tf.compat.v1.summary.scalar('l2_loss', l2_loss)
-  loss = cross_entropy + l2_loss
-
-  if mode == tf.estimator.ModeKeys.TRAIN:
-    global_step = tf.compat.v1.train.get_or_create_global_step()
-
-    learning_rate = learning_rate_fn(global_step)
-
-    # Create a tensor named learning_rate for logging purposes
-    tf.identity(learning_rate, name='learning_rate')
-    tf.compat.v1.summary.scalar('learning_rate', learning_rate)
-
-    if flags.FLAGS.enable_lars:
-      from tensorflow.contrib import opt as contrib_opt  # pylint: disable=g-import-not-at-top
-      optimizer = contrib_opt.LARSOptimizer(
-          learning_rate,
-          momentum=momentum,
-          weight_decay=weight_decay,
-          skip_list=['batch_normalization', 'bias'])
-    else:
-      optimizer = tf.compat.v1.train.MomentumOptimizer(
-          learning_rate=learning_rate,
-          momentum=momentum
-      )
-
-    fp16_implementation = getattr(flags.FLAGS, 'fp16_implementation', None)
-    if fp16_implementation == 'graph_rewrite':
-      optimizer = (
-          tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite(
-              optimizer, loss_scale=loss_scale))
-
-    def _dense_grad_filter(gvs):
-      """Only apply gradient updates to the final layer.
-
-      This function is used for fine tuning.
-
-      Args:
-        gvs: list of tuples with gradients and variable info
-      Returns:
-        filtered gradients so that only the dense layer remains
-      """
-      return [(g, v) for g, v in gvs if 'dense' in v.name]
-
-    if loss_scale != 1 and fp16_implementation != 'graph_rewrite':
-      # When computing fp16 gradients, often intermediate tensor values are
-      # so small, they underflow to 0. To avoid this, we multiply the loss by
-      # loss_scale to make these tensor values loss_scale times bigger.
-      scaled_grad_vars = optimizer.compute_gradients(loss * loss_scale)
-
-      if fine_tune:
-        scaled_grad_vars = _dense_grad_filter(scaled_grad_vars)
-
-      # Once the gradient computation is complete we can scale the gradients
-      # back to the correct scale before passing them to the optimizer.
-      unscaled_grad_vars = [(grad / loss_scale, var)
-                            for grad, var in scaled_grad_vars]
-      minimize_op = optimizer.apply_gradients(unscaled_grad_vars, global_step)
-    else:
-      grad_vars = optimizer.compute_gradients(loss)
-      if fine_tune:
-        grad_vars = _dense_grad_filter(grad_vars)
-      minimize_op = optimizer.apply_gradients(grad_vars, global_step)
-
-    update_ops = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS)
-    train_op = tf.group(minimize_op, update_ops)
-  else:
-    train_op = None
-
-  accuracy = tf.compat.v1.metrics.accuracy(labels, predictions['classes'])
-  accuracy_top_5 = tf.compat.v1.metrics.mean(
-      tf.nn.in_top_k(predictions=logits, targets=labels, k=5, name='top_5_op'))
-  metrics = {'accuracy': accuracy,
-             'accuracy_top_5': accuracy_top_5}
-
-  # Create a tensor named train_accuracy for logging purposes
-  tf.identity(accuracy[1], name='train_accuracy')
-  tf.identity(accuracy_top_5[1], name='train_accuracy_top_5')
-  tf.compat.v1.summary.scalar('train_accuracy', accuracy[1])
-  tf.compat.v1.summary.scalar('train_accuracy_top_5', accuracy_top_5[1])
-
-  return tf.estimator.EstimatorSpec(
-      mode=mode,
-      predictions=predictions,
-      loss=loss,
-      train_op=train_op,
-      eval_metric_ops=metrics)
-
-
-def resnet_main(
-    flags_obj, model_function, input_function, dataset_name, shape=None):
-  """Shared main loop for ResNet Models.
-
-  Args:
-    flags_obj: An object containing parsed flags. See define_resnet_flags()
-      for details.
-    model_function: the function that instantiates the Model and builds the
-      ops for train/eval. This will be passed directly into the estimator.
-    input_function: the function that processes the dataset and returns a
-      dataset that the estimator can train on. This will be wrapped with
-      all the relevant flags for running and passed to estimator.
-    dataset_name: the name of the dataset for training and evaluation. This is
-      used for logging purpose.
-    shape: list of ints representing the shape of the images used for training.
-      This is only used if flags_obj.export_dir is passed.
-
-  Returns:
-     Dict of results of the run.  Contains the keys `eval_results` and
-    `train_hooks`. `eval_results` contains accuracy (top_1) and accuracy_top_5.
-    `train_hooks` is a list the instances of hooks used during training.
-  """
-
-  model_helpers.apply_clean(flags.FLAGS)
-
-  # Ensures flag override logic is only executed if explicitly triggered.
-  if flags_obj.tf_gpu_thread_mode:
-    override_flags_and_set_envars_for_gpu_thread_pool(flags_obj)
-
-  # Configures cluster spec for distribution strategy.
-  num_workers = distribution_utils.configure_cluster(flags_obj.worker_hosts,
-                                                     flags_obj.task_index)
-
-  # Creates session config. allow_soft_placement = True, is required for
-  # multi-GPU and is not harmful for other modes.
-  session_config = tf.compat.v1.ConfigProto(
-      inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads,
-      intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads,
-      allow_soft_placement=True)
-
-  distribution_strategy = distribution_utils.get_distribution_strategy(
-      distribution_strategy=flags_obj.distribution_strategy,
-      num_gpus=flags_core.get_num_gpus(flags_obj),
-      all_reduce_alg=flags_obj.all_reduce_alg,
-      num_packs=flags_obj.num_packs)
-
-  # Creates a `RunConfig` that checkpoints every 24 hours which essentially
-  # results in checkpoints determined only by `epochs_between_evals`.
-  run_config = tf.estimator.RunConfig(
-      train_distribute=distribution_strategy,
-      session_config=session_config,
-      save_checkpoints_secs=60*60*24,
-      save_checkpoints_steps=None)
-
-  # Initializes model with all but the dense layer from pretrained ResNet.
-  if flags_obj.pretrained_model_checkpoint_path is not None:
-    warm_start_settings = tf.estimator.WarmStartSettings(
-        flags_obj.pretrained_model_checkpoint_path,
-        vars_to_warm_start='^(?!.*dense)')
-  else:
-    warm_start_settings = None
-
-  classifier = tf.estimator.Estimator(
-      model_fn=model_function, model_dir=flags_obj.model_dir, config=run_config,
-      warm_start_from=warm_start_settings, params={
-          'resnet_size': int(flags_obj.resnet_size),
-          'data_format': flags_obj.data_format,
-          'batch_size': flags_obj.batch_size,
-          'resnet_version': int(flags_obj.resnet_version),
-          'loss_scale': flags_core.get_loss_scale(flags_obj,
-                                                  default_for_fp16=128),
-          'dtype': flags_core.get_tf_dtype(flags_obj),
-          'fine_tune': flags_obj.fine_tune,
-          'num_workers': num_workers,
-      })
-
-  run_params = {
-      'batch_size': flags_obj.batch_size,
-      'dtype': flags_core.get_tf_dtype(flags_obj),
-      'resnet_size': flags_obj.resnet_size,
-      'resnet_version': flags_obj.resnet_version,
-      'synthetic_data': flags_obj.use_synthetic_data,
-      'train_epochs': flags_obj.train_epochs,
-      'num_workers': num_workers,
-  }
-  if flags_obj.use_synthetic_data:
-    dataset_name = dataset_name + '-synthetic'
-
-  benchmark_logger = logger.get_benchmark_logger()
-  benchmark_logger.log_run_info('resnet', dataset_name, run_params,
-                                test_id=flags_obj.benchmark_test_id)
-
-  train_hooks = hooks_helper.get_train_hooks(
-      flags_obj.hooks,
-      model_dir=flags_obj.model_dir,
-      batch_size=flags_obj.batch_size)
-
-  def input_fn_train(num_epochs, input_context=None):
-    return input_function(
-        is_training=True,
-        data_dir=flags_obj.data_dir,
-        batch_size=per_replica_batch_size(
-            flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
-        num_epochs=num_epochs,
-        dtype=flags_core.get_tf_dtype(flags_obj),
-        datasets_num_private_threads=flags_obj.datasets_num_private_threads,
-        input_context=input_context)
-
-  def input_fn_eval():
-    return input_function(
-        is_training=False,
-        data_dir=flags_obj.data_dir,
-        batch_size=per_replica_batch_size(
-            flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
-        num_epochs=1,
-        dtype=flags_core.get_tf_dtype(flags_obj))
-
-  train_epochs = (0 if flags_obj.eval_only or not flags_obj.train_epochs else
-                  flags_obj.train_epochs)
-
-  use_train_and_evaluate = flags_obj.use_train_and_evaluate or num_workers > 1
-  if use_train_and_evaluate:
-    train_spec = tf.estimator.TrainSpec(
-        input_fn=lambda input_context=None: input_fn_train(
-            train_epochs, input_context=input_context),
-        hooks=train_hooks,
-        max_steps=flags_obj.max_train_steps)
-    eval_spec = tf.estimator.EvalSpec(input_fn=input_fn_eval)
-    logging.info('Starting to train and evaluate.')
-    tf.estimator.train_and_evaluate(classifier, train_spec, eval_spec)
-    # tf.estimator.train_and_evalute doesn't return anything in multi-worker
-    # case.
-    eval_results = {}
-  else:
-    if train_epochs == 0:
-      # If --eval_only is set, perform a single loop with zero train epochs.
-      schedule, n_loops = [0], 1
-    else:
-      # Compute the number of times to loop while training. All but the last
-      # pass will train for `epochs_between_evals` epochs, while the last will
-      # train for the number needed to reach `training_epochs`. For instance if
-      #   train_epochs = 25 and epochs_between_evals = 10
-      # schedule will be set to [10, 10, 5]. That is to say, the loop will:
-      #   Train for 10 epochs and then evaluate.
-      #   Train for another 10 epochs and then evaluate.
-      #   Train for a final 5 epochs (to reach 25 epochs) and then evaluate.
-      n_loops = math.ceil(train_epochs / flags_obj.epochs_between_evals)
-      schedule = [flags_obj.epochs_between_evals for _ in range(int(n_loops))]
-      schedule[-1] = train_epochs - sum(schedule[:-1])  # over counting.
-
-    for cycle_index, num_train_epochs in enumerate(schedule):
-      logging.info('Starting cycle: %d/%d', cycle_index, int(n_loops))
-
-      if num_train_epochs:
-        # Since we are calling classifier.train immediately in each loop, the
-        # value of num_train_epochs in the lambda function will not be changed
-        # before it is used. So it is safe to ignore the pylint error here
-        # pylint: disable=cell-var-from-loop
-        classifier.train(
-            input_fn=lambda input_context=None: input_fn_train(
-                num_train_epochs, input_context=input_context),
-            hooks=train_hooks,
-            max_steps=flags_obj.max_train_steps)
-
-      # flags_obj.max_train_steps is generally associated with testing and
-      # profiling. As a result it is frequently called with synthetic data,
-      # which will iterate forever. Passing steps=flags_obj.max_train_steps
-      # allows the eval (which is generally unimportant in those circumstances)
-      # to terminate.  Note that eval will run for max_train_steps each loop,
-      # regardless of the global_step count.
-      logging.info('Starting to evaluate.')
-      eval_results = classifier.evaluate(input_fn=input_fn_eval,
-                                         steps=flags_obj.max_train_steps)
-
-      benchmark_logger.log_evaluation_result(eval_results)
-
-      if model_helpers.past_stop_threshold(
-          flags_obj.stop_threshold, eval_results['accuracy']):
-        break
-
-  if flags_obj.export_dir is not None:
-    # Exports a saved model for the given classifier.
-    export_dtype = flags_core.get_tf_dtype(flags_obj)
-    if flags_obj.image_bytes_as_serving_input:
-      input_receiver_fn = functools.partial(
-          image_bytes_serving_input_fn, shape, dtype=export_dtype)
-    else:
-      input_receiver_fn = export.build_tensor_serving_input_receiver_fn(
-          shape, batch_size=flags_obj.batch_size, dtype=export_dtype)
-    classifier.export_savedmodel(flags_obj.export_dir, input_receiver_fn,
-                                 strip_default_attrs=True)
-
-  stats = {}
-  stats['eval_results'] = eval_results
-  stats['train_hooks'] = train_hooks
-
-  return stats
-
-
-def define_resnet_flags(resnet_size_choices=None, dynamic_loss_scale=False,
-                        fp16_implementation=False):
-  """Add flags and validators for ResNet."""
-  flags_core.define_base(clean=True, train_epochs=True,
-                         epochs_between_evals=True, stop_threshold=True,
-                         num_gpu=True, hooks=True, export_dir=True,
-                         distribution_strategy=True)
-  flags_core.define_performance(num_parallel_calls=False,
-                                inter_op=True,
-                                intra_op=True,
-                                synthetic_data=True,
-                                dtype=True,
-                                all_reduce_alg=True,
-                                num_packs=True,
-                                tf_gpu_thread_mode=True,
-                                datasets_num_private_threads=True,
-                                dynamic_loss_scale=dynamic_loss_scale,
-                                fp16_implementation=fp16_implementation,
-                                loss_scale=True,
-                                tf_data_experimental_slack=True,
-                                max_train_steps=True)
-  flags_core.define_image()
-  flags_core.define_benchmark()
-  flags_core.define_distribution()
-  flags.adopt_module_key_flags(flags_core)
-
-  flags.DEFINE_enum(
-      name='resnet_version', short_name='rv', default='1',
-      enum_values=['1', '2'],
-      help=flags_core.help_wrap(
-          'Version of ResNet. (1 or 2) See README.md for details.'))
-  flags.DEFINE_bool(
-      name='fine_tune', short_name='ft', default=False,
-      help=flags_core.help_wrap(
-          'If True do not train any parameters except for the final layer.'))
-  flags.DEFINE_string(
-      name='pretrained_model_checkpoint_path', short_name='pmcp', default=None,
-      help=flags_core.help_wrap(
-          'If not None initialize all the network except the final layer with '
-          'these values'))
-  flags.DEFINE_boolean(
-      name='eval_only', default=False,
-      help=flags_core.help_wrap('Skip training and only perform evaluation on '
-                                'the latest checkpoint.'))
-  flags.DEFINE_boolean(
-      name='image_bytes_as_serving_input', default=False,
-      help=flags_core.help_wrap(
-          'If True exports savedmodel with serving signature that accepts '
-          'JPEG image bytes instead of a fixed size [HxWxC] tensor that '
-          'represents the image. The former is easier to use for serving at '
-          'the expense of image resize/cropping being done as part of model '
-          'inference. Note, this flag only applies to ImageNet and cannot '
-          'be used for CIFAR.'))
-  flags.DEFINE_boolean(
-      name='use_train_and_evaluate', default=False,
-      help=flags_core.help_wrap(
-          'If True, uses `tf.estimator.train_and_evaluate` for the training '
-          'and evaluation loop, instead of separate calls to `classifier.train '
-          'and `classifier.evaluate`, which is the default behavior.'))
-  flags.DEFINE_bool(
-      name='enable_lars', default=False,
-      help=flags_core.help_wrap(
-          'Enable LARS optimizer for large batch training.'))
-  flags.DEFINE_float(
-      name='label_smoothing', default=0.0,
-      help=flags_core.help_wrap(
-          'Label smoothing parameter used in the softmax_cross_entropy'))
-  flags.DEFINE_float(
-      name='weight_decay', default=1e-4,
-      help=flags_core.help_wrap(
-          'Weight decay coefficiant for l2 regularization.'))
-
-  choice_kwargs = dict(
-      name='resnet_size', short_name='rs', default='50',
-      help=flags_core.help_wrap('The size of the ResNet model to use.'))
-
-  if resnet_size_choices is None:
-    flags.DEFINE_string(**choice_kwargs)
-  else:
-    flags.DEFINE_enum(enum_values=resnet_size_choices, **choice_kwargs)
--- a/official/r1/transformer/README.md
+++ b/official/r1/transformer/README.md
-![No Maintenance Intended](https://img.shields.io/badge/No%20Maintenance%20Intended-%E2%9C%95-red.svg)
-![TensorFlow Requirement: 1.x](https://img.shields.io/badge/TensorFlow%20Requirement-1.x-brightgreen)
-![TensorFlow 2 Not Supported](https://img.shields.io/badge/TensorFlow%202%20Not%20Supported-%E2%9C%95-red.svg)
-
-# Transformer Translation Model
-This is an implementation of the Transformer translation model as described in the [Attention is All You Need](https://arxiv.org/abs/1706.03762) paper. Based on the code provided by the authors: [Transformer code](https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py) from [Tensor2Tensor](https://github.com/tensorflow/tensor2tensor). Also, check out the [tutorial](https://www.tensorflow.org/beta/tutorials/text/transformer) on Transformer in TF 2.0.
-
-**Please follow the [README](https://github.com/tensorflow/models/official/transformer/README.md), the new Keras-based TF 2 implementation, to walk through the new Transformer.**
-
-Transformer is a neural network architecture that solves sequence to sequence problems using attention mechanisms. Unlike traditional neural seq2seq models, Transformer does not involve recurrent connections. The attention mechanism learns dependencies between tokens in two sequences. Since attention weights apply to all tokens in the sequences, the Transformer model is able to easily capture long-distance dependencies.
-
-Transformer's overall structure follows the standard encoder-decoder pattern. The encoder uses self-attention to compute a representation of the input sequence. The decoder generates the output sequence one token at a time, taking the encoder output and previous decoder-outputted tokens as inputs.
-
-The model also applies embeddings on the input and output tokens, and adds a constant positional encoding. The positional encoding adds information about the position of each token.
-
-## Contents
-  * [Contents](#contents)
-  * [Walkthrough](#walkthrough)
-  * [Benchmarks](#benchmarks)
-    * [Training times](#training-times)
-    * [Evaluation results](#evaluation-results)
-  * [Detailed instructions](#detailed-instructions)
-    * [Environment preparation](#environment-preparation)
-    * [Download and preprocess datasets](#download-and-preprocess-datasets)
-    * [Model training and evaluation](#model-training-and-evaluation)
-    * [Translate using the model](#translate-using-the-model)
-    * [Compute official BLEU score](#compute-official-bleu-score)
-    * [TPU](#tpu)
-  * [Export trained model](#export-trained-model)
-    * [Example translation](#example-translation)
-  * [Implementation overview](#implementation-overview)
-    * [Model Definition](#model-definition)
-    * [Model Estimator](#model-estimator)
-    * [Other scripts](#other-scripts)
-    * [Test dataset](#test-dataset)
-  * [Term definitions](#term-definitions)
-
-## Walkthrough
-
-Below are the commands for running the Transformer model. See the
-[Detailed instructions](#detailed-instructions) for more details on running the
-model.
-
-```
-cd /path/to/models/official/transformer
-
-# Ensure that PYTHONPATH is correctly defined as described in
-# https://github.com/tensorflow/models/tree/master/official#requirements
-# export PYTHONPATH="$PYTHONPATH:/path/to/models"
-
-# Export variables
-PARAM_SET=big
-DATA_DIR=$HOME/transformer/data
-MODEL_DIR=$HOME/transformer/model_$PARAM_SET
-VOCAB_FILE=$DATA_DIR/vocab.ende.32768
-
-# Download training/evaluation/test datasets
-python data_download.py --data_dir=$DATA_DIR
-
-# Train the model for 10 epochs, and evaluate after every epoch.
-python transformer_main.py --data_dir=$DATA_DIR --model_dir=$MODEL_DIR \
-    --vocab_file=$VOCAB_FILE --param_set=$PARAM_SET \
-    --bleu_source=$DATA_DIR/newstest2014.en --bleu_ref=$DATA_DIR/newstest2014.de
-
-# Run during training in a separate process to get continuous updates,
-# or after training is complete.
-tensorboard --logdir=$MODEL_DIR
-
-# Translate some text using the trained model
-python translate.py --model_dir=$MODEL_DIR --vocab_file=$VOCAB_FILE \
-    --param_set=$PARAM_SET --text="hello world"
-
-# Compute model's BLEU score using the newstest2014 dataset.
-python translate.py --model_dir=$MODEL_DIR --vocab_file=$VOCAB_FILE \
-    --param_set=$PARAM_SET --file=$DATA_DIR/newstest2014.en --file_out=translation.en
-python compute_bleu.py --translation=translation.en --reference=$DATA_DIR/newstest2014.de
-```
-
-## Benchmarks
-### Training times
-
-Currently, both big and base parameter sets run on a single GPU. The measurements below
-are reported from running the model on a P100 GPU.
-
-Param Set | batches/sec | batches per epoch | time per epoch
--- | --- | --- | ---
-base | 4.8 | 83244 | 4 hr
-big | 1.1 | 41365 | 10 hr
-
-### Evaluation results
-Below are the case-insensitive BLEU scores after 10 epochs.
-
-Param Set | Score
--- | --- |
-base | 27.7
-big | 28.9
-
-
-## Detailed instructions
-
-
-0. ### Environment preparation
-
-   #### Add models repo to PYTHONPATH
-   Follow the instructions described in the [Requirements](https://github.com/tensorflow/models/tree/master/official#requirements) section to add the models folder to the python path.
-
-   #### Export variables (optional)
-
-   Export the following variables, or modify the values in each of the snippets below:
-   ```
-   PARAM_SET=big
-   DATA_DIR=$HOME/transformer/data
-   MODEL_DIR=$HOME/transformer/model_$PARAM_SET
-   VOCAB_FILE=$DATA_DIR/vocab.ende.32768
-   ```
-
-1. ### Download and preprocess datasets
-
-   [data_download.py](data_download.py) downloads and preprocesses the training and evaluation WMT datasets. After the data is downloaded and extracted, the training data is used to generate a vocabulary of subtokens. The evaluation and training strings are tokenized, and the resulting data is sharded, shuffled, and saved as TFRecords.
-
-   1.75GB of compressed data will be downloaded. In total, the raw files (compressed, extracted, and combined files) take up 8.4GB of disk space. The resulting TFRecord and vocabulary files are 722MB. The script takes around 40 minutes to run, with the bulk of the time spent downloading and ~15 minutes spent on preprocessing.
-
-   Command to run:
-   ```
-   python data_download.py --data_dir=$DATA_DIR
-   ```
-
-   Arguments:
-   * `--data_dir`: Path where the preprocessed TFRecord data, and vocab file will be saved.
-   * Use the `--help` or `-h` flag to get a full list of possible arguments.
-
-2. ### Model training and evaluation
-
-   [transformer_main.py](transformer_main.py) creates a Transformer model, and trains it using Tensorflow Estimator.
-
-   Command to run:
-   ```
-   python transformer_main.py --data_dir=$DATA_DIR --model_dir=$MODEL_DIR \
-       --vocab_file=$VOCAB_FILE --param_set=$PARAM_SET
-   ```
-
-   Arguments:
-   * `--data_dir`: This should be set to the same directory given to the `data_download`'s `data_dir` argument.
-   * `--model_dir`: Directory to save Transformer model training checkpoints.
-   * `--vocab_file`: Path to subtoken vocabulary file. If data_download was used, you may find the file in `data_dir`.
-   * `--param_set`: Parameter set to use when creating and training the model. Options are `base` and `big` (default).
-   * Use the `--help` or `-h` flag to get a full list of possible arguments.
-
-   #### Customizing training schedule
-
-   By default, the model will train for 10 epochs, and evaluate after every epoch. The training schedule may be defined through the flags:
-   * Training with epochs (default):
-     * `--train_epochs`: The total number of complete passes to make through the dataset
-     * `--epochs_between_evals`: The number of epochs to train between evaluations.
-   * Training with steps:
-     * `--train_steps`: sets the total number of training steps to run.
-     * `--steps_between_evals`: Number of training steps to run between evaluations.
-
-   Only one of `train_epochs` or `train_steps` may be set. Since the default option is to evaluate the model after training for an epoch, it may take 4 or more hours between model evaluations. To get more frequent evaluations, use the flags `--train_steps=250000 --steps_between_evals=1000`.
-
-   Note: At the beginning of each training session, the training dataset is reloaded and shuffled. Stopping the training before completing an epoch may result in worse model quality, due to the chance that some examples may be seen more than others. Therefore, it is recommended to use epochs when the model quality is important.
-
-   #### Compute BLEU score during model evaluation
-
-   Use these flags to compute the BLEU when the model evaluates:
-   * `--bleu_source`: Path to file containing text to translate.
-   * `--bleu_ref`: Path to file containing the reference translation.
-   * `--stop_threshold`: Train until the BLEU score reaches this lower bound. This setting overrides the `--train_steps` and `--train_epochs` flags.
-
-   When running `transformer_main.py`, use the flags: `--bleu_source=$DATA_DIR/newstest2014.en --bleu_ref=$DATA_DIR/newstest2014.de`
-
-   #### Tensorboard
-   Training and evaluation metrics (loss, accuracy, approximate BLEU score, etc.) are logged, and can be displayed in the browser using Tensorboard.
-   ```
-   tensorboard --logdir=$MODEL_DIR
-   ```
-   The values are displayed at [localhost:6006](localhost:6006).
-
-3. ### Translate using the model
-   [translate.py](translate.py) contains the script to use the trained model to translate input text or file. Each line in the file is translated separately.
-
-   Command to run:
-   ```
-   python translate.py --model_dir=$MODEL_DIR --vocab_file=$VOCAB_FILE \
-       --param_set=$PARAM_SET --text="hello world"
-   ```
-
-   Arguments for initializing the Subtokenizer and trained model:
-   * `--model_dir` and `--param_set`: These parameters are used to rebuild the trained model
-   * `--vocab_file`: Path to subtoken vocabulary file. If data_download was used, you may find the file in `data_dir`.
-
-   Arguments for specifying what to translate:
-   * `--text`: Text to translate
-   * `--file`: Path to file containing text to translate
-   * `--file_out`: If `--file` is set, then this file will store the input file's translations.
-
-   To translate the newstest2014 data, run:
-   ```
-   python translate.py --model_dir=$MODEL_DIR --vocab_file=$VOCAB_FILE \
-       --param_set=$PARAM_SET --file=$DATA_DIR/newstest2014.en --file_out=translation.en
-   ```
-
-   Translating the file takes around 15 minutes on a GTX1080, or 5 minutes on a P100.
-
-4. ### Compute official BLEU score
-   Use [compute_bleu.py](compute_bleu.py) to compute the BLEU by comparing generated translations to the reference translation.
-
-   Command to run:
-   ```
-   python compute_bleu.py --translation=translation.en --reference=$DATA_DIR/newstest2014.de
-   ```
-
-   Arguments:
-   * `--translation`: Path to file containing generated translations.
-   * `--reference`: Path to file containing reference translations.
-   * Use the `--help` or `-h` flag to get a full list of possible arguments.
-
-5. ### TPU
-   TPU support for this version of Transformer is experimental. Currently it is present for
-   demonstration purposes only, but will be optimized in the coming weeks.
-
-## Export trained model
-To export the model as a Tensorflow [SavedModel](https://www.tensorflow.org/guide/saved_model) format, use the argument `--export_dir` when running `transformer_main.py`. A folder will be created in the directory with the name as the timestamp (e.g. $EXPORT_DIR/1526427396).
-
-```
-EXPORT_DIR=$HOME/transformer/saved_model
-python transformer_main.py --data_dir=$DATA_DIR --model_dir=$MODEL_DIR \
-  --vocab_file=$VOCAB_FILE --param_set=$PARAM_SET --export_model=$EXPORT_DIR
-```
-
-To inspect the SavedModel, use saved_model_cli:
-```
-SAVED_MODEL_DIR=$EXPORT_DIR/{TIMESTAMP}  # replace {TIMESTAMP} with the name of the folder created
-saved_model_cli show --dir=$SAVED_MODEL_DIR  --all
-```
-
-### Example translation
-Let's translate **"hello world!"**, **"goodbye world."**, and **"Would you like some pie?"**.
-
-The SignatureDef for "translate" is:
-
-    signature_def['translate']:
-        The given SavedModel SignatureDef contains the following input(s):
-          inputs['input'] tensor_info:
-              dtype: DT_INT64
-              shape: (-1, -1)
-              name: Placeholder:0
-        The given SavedModel SignatureDef contains the following output(s):
-          outputs['outputs'] tensor_info:
-              dtype: DT_INT32
-              shape: (-1, -1)
-              name: model/Transformer/strided_slice_19:0
-          outputs['scores'] tensor_info:
-              dtype: DT_FLOAT
-              shape: (-1)
-              name: model/Transformer/strided_slice_20:0
-
-Follow the steps below to use the translate signature def:
-
-1. #### Encode the inputs to integer arrays.
-   This can be done using `utils.tokenizer.Subtokenizer`, and the vocab file in the SavedModel assets (`$SAVED_MODEL_DIR/assets.extra/vocab.txt`).
-
-   ```
-   from official.transformer.utils.tokenizer import Subtokenizer
-   s = Subtokenizer(PATH_TO_VOCAB_FILE)
-   print(s.encode("hello world!", add_eos=True))
-   ```
-
-   The encoded inputs are:
-   * `"hello world!" = [6170, 3731, 178, 207, 1]`
-   * `"goodbye world." = [15431, 13966, 36, 178, 3, 1]`
-   * `"Would you like some pie?" = [9092, 72, 155, 202, 19851, 102, 1]`
-
-2. #### Run `saved_model_cli` to obtain the predicted translations
-   The encoded inputs should be padded so that they are the same length. The padding token is `0`.
-   ```
-   ENCODED_INPUTS="[[26228, 145, 178, 1, 0, 0, 0], \
-                   [15431, 13966, 36, 178, 3, 1, 0], \
-                   [9092, 72, 155, 202, 19851, 102, 1]]"
-   ```
-
-   Now, use the `run` command with `saved_model_cli` to get the outputs.
-
-   ```
-   saved_model_cli run --dir=$SAVED_MODEL_DIR --tag_set=serve --signature_def=translate \
-     --input_expr="input=$ENCODED_INPUTS"
-   ```
-
-   The outputs will look similar to:
-   ```
-   Result for output key outputs:
-   [[18744   145   297     1     0     0     0     0     0     0     0     0
-         0     0]
-    [ 5450  4642    21    11   297     3     1     0     0     0     0     0
-         0     0]
-    [25940    22    66   103 21713    31   102     1     0     0     0     0
-         0     0]]
-   Result for output key scores:
-   [-1.5493642 -1.4032784 -3.252089 ]
-   ```
-
-3. #### Decode the outputs to strings.
-   Use the `Subtokenizer` and vocab file as described in step 1 to decode the output integer arrays.
-   ```
-   from official.transformer.utils.tokenizer import Subtokenizer
-   s = Subtokenizer(PATH_TO_VOCAB_FILE)
-   print(s.decode([18744, 145, 297, 1]))
-   ```
-   The decoded outputs from above are:
-   * `[18744, 145, 297, 1] = "Hallo Welt<EOS>"`
-   * `[5450, 4642, 21, 11, 297, 3, 1] = "Abschied von der Welt.<EOS>"`
-   * `[25940, 22, 66, 103, 21713, 31, 102, 1] = "Möchten Sie einen Kuchen?<EOS>"`
-
-## Implementation overview
-
-A brief look at each component in the code:
-
-### Model Definition
-The [model](model) subdirectory contains the implementation of the Transformer model. The following files define the Transformer model and its layers:
-* [transformer.py](model/transformer.py): Defines the transformer model and its encoder/decoder layer stacks.
-* [embedding_layer.py](model/embedding_layer.py): Contains the layer that calculates the embeddings. The embedding weights are also used to calculate the pre-softmax probabilities from the decoder output.
-* [attention_layer.py](model/attention_layer.py): Defines the multi-headed and self attention layers that are used in the encoder/decoder stacks.
-* [ffn_layer.py](model/ffn_layer.py): Defines the feedforward network that is used in the encoder/decoder stacks. The network is composed of 2 fully connected layers.
-
-Other files:
-* [beam_search.py](model/beam_search.py) contains the beam search implementation, which is used during model inference to find high scoring translations.
-* [model_params.py](model/model_params.py) contains the parameters used for the big and base models.
-* [model_utils.py](model/model_utils.py) defines some helper functions used in the model (calculating padding, bias, etc.).
-
-
-### Model Estimator
-[transformer_main.py](model/transformer.py) creates an `Estimator` to train and evaluate the model.
-
-Helper functions:
-* [utils/dataset.py](utils/dataset.py): contains functions for creating a `dataset` that is passed to the `Estimator`.
-* [utils/metrics.py](utils/metrics.py): defines metrics functions used by the `Estimator` to evaluate the
-
-### Other scripts
-
-Aside from the main file to train the Transformer model, we provide other scripts for using the model or downloading the data:
-
-#### Data download and preprocessing
-
-[data_download.py](data_download.py) downloads and extracts data, then uses `Subtokenizer` to tokenize strings into arrays of int IDs. The int arrays are converted to `tf.Examples` and saved in the `tf.RecordDataset` format.
-
- The data is downloaded from the Workshop of Machine Translation (WMT) [news translation task](http://www.statmt.org/wmt17/translation-task.html). The following datasets are used:
-
- * Europarl v7
- * Common Crawl corpus
- * News Commentary v12
-
- See the [download section](http://www.statmt.org/wmt17/translation-task.html#download) to explore the raw datasets. The parameters in this model are tuned to fit the English-German translation data, so the EN-DE texts are extracted from the downloaded compressed files.
-
-The text is transformed into arrays of integer IDs using the `Subtokenizer` defined in [`utils/tokenizer.py`](util/tokenizer.py). During initialization of the `Subtokenizer`, the raw training data is used to generate a vocabulary list containing common subtokens.
-
-The target vocabulary size of the WMT dataset is 32,768. The set of subtokens is found through binary search on the minimum number of times a subtoken appears in the data. The actual vocabulary size is 33,708, and is stored in a 324kB file.
-
-#### Translation
-Translation is defined in [translate.py](translate.py). First, `Subtokenizer` tokenizes the input. The vocabulary file is the same used to tokenize the training/eval files. Next, beam search is used to find the combination of tokens that maximizes the probability outputted by the model decoder. The tokens are then converted back to strings with `Subtokenizer`.
-
-#### BLEU computation
-[compute_bleu.py](compute_bleu.py): Implementation from [https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/bleu_hook.py](https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/bleu_hook.py).
-
-### Test dataset
-The [newstest2014 files](https://storage.googleapis.com/tf-perf-public/official_transformer/test_data/newstest2014.tgz)
-are extracted from the [NMT Seq2Seq tutorial](https://google.github.io/seq2seq/nmt/#download-data).
-The raw text files are converted from the SGM format of the
-[WMT 2016](http://www.statmt.org/wmt16/translation-task.html) test sets. The
-newstest2014 files are put into the `$DATA_DIR` when executing
-`data_download.py`
-
-## Term definitions
-
-**Steps / Epochs**:
-* Step: unit for processing a single batch of data
-* Epoch: a complete run through the dataset
-
-Example: Consider a training a dataset with 100 examples that is divided into 20 batches with 5 examples per batch. A single training step trains the model on one batch. After 20 training steps, the model will have trained on every batch in the dataset, or one epoch.
-
-**Subtoken**: Words are referred to as tokens, and parts of words are referred to as 'subtokens'. For example, the word 'inclined' may be split into `['incline', 'd_']`. The '\_' indicates the end of the token. The subtoken vocabulary list is guaranteed to contain the alphabet (including numbers and special characters), so all words can be tokenized.
--- a/official/r1/transformer/__init__.py
+++ b/official/r1/transformer/__init__.py
--- a/official/r1/transformer/attention_layer.py
+++ b/official/r1/transformer/attention_layer.py
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Implementation of multiheaded attention and self-attention layers."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow.compat.v1 as tf
-
-
-class Attention(tf.layers.Layer):
-  """Multi-headed attention layer."""
-
-  def __init__(self, hidden_size, num_heads, attention_dropout, train):
-    if hidden_size % num_heads != 0:
-      raise ValueError("Hidden size must be evenly divisible by the number of "
-                       "heads.")
-
-    super(Attention, self).__init__()
-    self.hidden_size = hidden_size
-    self.num_heads = num_heads
-    self.attention_dropout = attention_dropout
-    self.train = train
-
-    # Layers for linearly projecting the queries, keys, and values.
-    self.q_dense_layer = tf.layers.Dense(hidden_size, use_bias=False, name="q")
-    self.k_dense_layer = tf.layers.Dense(hidden_size, use_bias=False, name="k")
-    self.v_dense_layer = tf.layers.Dense(hidden_size, use_bias=False, name="v")
-
-    self.output_dense_layer = tf.layers.Dense(hidden_size, use_bias=False,
-                                              name="output_transform")
-
-  def split_heads(self, x):
-    """Split x into different heads, and transpose the resulting value.
-
-    The tensor is transposed to insure the inner dimensions hold the correct
-    values during the matrix multiplication.
-
-    Args:
-      x: A tensor with shape [batch_size, length, hidden_size]
-
-    Returns:
-      A tensor with shape [batch_size, num_heads, length, hidden_size/num_heads]
-    """
-    with tf.name_scope("split_heads"):
-      batch_size = tf.shape(x)[0]
-      length = tf.shape(x)[1]
-
-      # Calculate depth of last dimension after it has been split.
-      depth = (self.hidden_size // self.num_heads)
-
-      # Split the last dimension
-      x = tf.reshape(x, [batch_size, length, self.num_heads, depth])
-
-      # Transpose the result
-      return tf.transpose(x, [0, 2, 1, 3])
-
-  def combine_heads(self, x):
-    """Combine tensor that has been split.
-
-    Args:
-      x: A tensor [batch_size, num_heads, length, hidden_size/num_heads]
-
-    Returns:
-      A tensor with shape [batch_size, length, hidden_size]
-    """
-    with tf.name_scope("combine_heads"):
-      batch_size = tf.shape(x)[0]
-      length = tf.shape(x)[2]
-      x = tf.transpose(x, [0, 2, 1, 3])  # --> [batch, length, num_heads, depth]
-      return tf.reshape(x, [batch_size, length, self.hidden_size])
-
-  def call(self, x, y, bias, cache=None):
-    """Apply attention mechanism to x and y.
-
-    Args:
-      x: a tensor with shape [batch_size, length_x, hidden_size]
-      y: a tensor with shape [batch_size, length_y, hidden_size]
-      bias: attention bias that will be added to the result of the dot product.
-      cache: (Used during prediction) dictionary with tensors containing results
-        of previous attentions. The dictionary must have the items:
-            {"k": tensor with shape [batch_size, i, key_channels],
-             "v": tensor with shape [batch_size, i, value_channels]}
-        where i is the current decoded length.
-
-    Returns:
-      Attention layer output with shape [batch_size, length_x, hidden_size]
-    """
-    # Linearly project the query (q), key (k) and value (v) using different
-    # learned projections. This is in preparation of splitting them into
-    # multiple heads. Multi-head attention uses multiple queries, keys, and
-    # values rather than regular attention (which uses a single q, k, v).
-    q = self.q_dense_layer(x)
-    k = self.k_dense_layer(y)
-    v = self.v_dense_layer(y)
-
-    if cache is not None:
-      # Combine cached keys and values with new keys and values.
-      k = tf.concat([cache["k"], k], axis=1)
-      v = tf.concat([cache["v"], v], axis=1)
-
-      # Update cache
-      cache["k"] = k
-      cache["v"] = v
-
-    # Split q, k, v into heads.
-    q = self.split_heads(q)
-    k = self.split_heads(k)
-    v = self.split_heads(v)
-
-    # Scale q to prevent the dot product between q and k from growing too large.
-    depth = (self.hidden_size // self.num_heads)
-    q *= depth ** -0.5
-
-    # Calculate dot product attention
-    logits = tf.matmul(q, k, transpose_b=True)
-    logits += bias
-    weights = tf.nn.softmax(logits, name="attention_weights")
-    if self.train:
-      weights = tf.nn.dropout(weights, 1.0 - self.attention_dropout)
-    attention_output = tf.matmul(weights, v)
-
-    # Recombine heads --> [batch_size, length, hidden_size]
-    attention_output = self.combine_heads(attention_output)
-
-    # Run the combined outputs through another linear projection layer.
-    attention_output = self.output_dense_layer(attention_output)
-    return attention_output
-
-
-class SelfAttention(Attention):
-  """Multiheaded self-attention layer."""
-
-  def call(self, x, bias, cache=None):
-    return super(SelfAttention, self).call(x, x, bias, cache)
--- a/official/r1/transformer/dataset.py
+++ b/official/r1/transformer/dataset.py
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Input pipeline for the transformer model to read, filter, and batch examples.
-
-Two things to note in the pipeline:
-
-1. Batching scheme
-
-   The examples encoded in the TFRecord files contain data in the format:
-     {"inputs": [variable length array of integers],
-      "targets": [variable length array of integers]}
-   Where integers in the arrays refer to tokens in the English and German vocab
-   file (named `vocab.ende.32768`).
-
-   Prior to batching, elements in the dataset are grouped by length (max between
-   "inputs" and "targets" length). Each group is then batched such that:
-     group_batch_size * length <= batch_size.
-
-   Another way to view batch_size is the maximum number of tokens in each batch.
-
-   Once batched, each element in the dataset will have the shape:
-     {"inputs": [group_batch_size, padded_input_length],
-      "targets": [group_batch_size, padded_target_length]}
-   Lengths are padded to the longest "inputs" or "targets" sequence in the batch
-   (padded_input_length and padded_target_length can be different).
-
-   This batching scheme decreases the fraction of padding tokens per training
-   batch, thus improving the training speed significantly.
-
-2. Shuffling
-
-   While training, the dataset is shuffled in two places in the code. The first
-   is the list of training files. Second, while reading records using
-   `parallel_interleave`, the `sloppy` argument is used to generate randomness
-   in the order of the examples.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import math
-import os
-
-import tensorflow.compat.v1 as tf
-
-from official.utils.misc import model_helpers
-
-# Buffer size for reading records from a TFRecord file. Each training file is
-# 7.2 MB, so 8 MB allows an entire file to be kept in memory.
-_READ_RECORD_BUFFER = 8 * 1000 * 1000
-
-# Example grouping constants. Defines length boundaries for each group.
-# These values are the defaults used in Tensor2Tensor.
-_MIN_BOUNDARY = 8
-_BOUNDARY_SCALE = 1.1
-
-
-def _load_records(filename):
-  """Read file and return a dataset of tf.Examples."""
-  return tf.data.TFRecordDataset(filename, buffer_size=_READ_RECORD_BUFFER)
-
-
-def _parse_example(serialized_example):
-  """Return inputs and targets Tensors from a serialized tf.Example."""
-  data_fields = {
-      "inputs": tf.VarLenFeature(tf.int64),
-      "targets": tf.VarLenFeature(tf.int64)
-  }
-  parsed = tf.parse_single_example(serialized_example, data_fields)
-  inputs = tf.sparse_tensor_to_dense(parsed["inputs"])
-  targets = tf.sparse_tensor_to_dense(parsed["targets"])
-  return inputs, targets
-
-
-def _filter_max_length(example, max_length=256):
-  """Indicates whether the example's length is lower than the maximum length."""
-  return tf.logical_and(tf.size(example[0]) <= max_length,
-                        tf.size(example[1]) <= max_length)
-
-
-def _get_example_length(example):
-  """Returns the maximum length between the example inputs and targets."""
-  length = tf.maximum(tf.shape(example[0])[0], tf.shape(example[1])[0])
-  return length
-
-
-def _create_min_max_boundaries(
-    max_length, min_boundary=_MIN_BOUNDARY, boundary_scale=_BOUNDARY_SCALE):
-  """Create min and max boundary lists up to max_length.
-
-  For example, when max_length=24, min_boundary=4 and boundary_scale=2, the
-  returned values will be:
-    buckets_min = [0, 4, 8, 16, 24]
-    buckets_max = [4, 8, 16, 24, 25]
-
-  Args:
-    max_length: The maximum length of example in dataset.
-    min_boundary: Minimum length in boundary.
-    boundary_scale: Amount to scale consecutive boundaries in the list.
-
-  Returns:
-    min and max boundary lists
-
-  """
-  # Create bucket boundaries list by scaling the previous boundary or adding 1
-  # (to ensure increasing boundary sizes).
-  bucket_boundaries = []
-  x = min_boundary
-  while x < max_length:
-    bucket_boundaries.append(x)
-    x = max(x + 1, int(x * boundary_scale))
-
-  # Create min and max boundary lists from the initial list.
-  buckets_min = [0] + bucket_boundaries
-  buckets_max = bucket_boundaries + [max_length + 1]
-  return buckets_min, buckets_max
-
-
-def _batch_examples(dataset, batch_size, max_length):
-  """Group examples by similar lengths, and return batched dataset.
-
-  Each batch of similar-length examples are padded to the same length, and may
-  have different number of elements in each batch, such that:
-    group_batch_size * padded_length <= batch_size.
-
-  This decreases the number of padding tokens per batch, which improves the
-  training speed.
-
-  Args:
-    dataset: Dataset of unbatched examples.
-    batch_size: Max number of tokens per batch of examples.
-    max_length: Max number of tokens in an example input or target sequence.
-
-  Returns:
-    Dataset of batched examples with similar lengths.
-  """
-  # Get min and max boundary lists for each example. These are used to calculate
-  # the `bucket_id`, which is the index at which:
-  # buckets_min[bucket_id] <= len(example) < buckets_max[bucket_id]
-  # Note that using both min and max lists improves the performance.
-  buckets_min, buckets_max = _create_min_max_boundaries(max_length)
-
-  # Create list of batch sizes for each bucket_id, so that
-  # bucket_batch_size[bucket_id] * buckets_max[bucket_id] <= batch_size
-  bucket_batch_sizes = [batch_size // x for x in buckets_max]
-  # bucket_id will be a tensor, so convert this list to a tensor as well.
-  bucket_batch_sizes = tf.constant(bucket_batch_sizes, dtype=tf.int64)
-
-  def example_to_bucket_id(example_input, example_target):
-    """Return int64 bucket id for this example, calculated based on length."""
-    seq_length = _get_example_length((example_input, example_target))
-
-    # TODO: investigate whether removing code branching improves performance.
-    conditions_c = tf.logical_and(
-        tf.less_equal(buckets_min, seq_length),
-        tf.less(seq_length, buckets_max))
-    bucket_id = tf.reduce_min(tf.where(conditions_c))
-    return bucket_id
-
-  def window_size_fn(bucket_id):
-    """Return number of examples to be grouped when given a bucket id."""
-    return bucket_batch_sizes[bucket_id]
-
-  def batching_fn(bucket_id, grouped_dataset):
-    """Batch and add padding to a dataset of elements with similar lengths."""
-    bucket_batch_size = window_size_fn(bucket_id)
-
-    # Batch the dataset and add padding so that all input sequences in the
-    # examples have the same length, and all target sequences have the same
-    # lengths as well. Resulting lengths of inputs and targets can differ.
-    return grouped_dataset.padded_batch(bucket_batch_size, ([None], [None]))
-
-  return dataset.apply(tf.data.experimental.group_by_window(
-      key_func=example_to_bucket_id,
-      reduce_func=batching_fn,
-      window_size=None,
-      window_size_func=window_size_fn))
-
-
-def _read_and_batch_from_files(
-    file_pattern, batch_size, max_length, num_parallel_calls, shuffle, repeat,
-    static_batch=False):
-  """Create dataset where each item is a dict of "inputs" and "targets".
-
-  Args:
-    file_pattern: String used to match the input TFRecord files.
-    batch_size: Maximum number of tokens per batch of examples
-    max_length: Maximum number of tokens per example
-    num_parallel_calls: Number of cpu cores for parallel input processing.
-    shuffle: If true, randomizes order of elements.
-    repeat: Number of times to repeat the dataset. If None, the dataset is
-      repeated forever.
-    static_batch: Whether the batches in the dataset should have static shapes.
-      If True, the input is batched so that every batch has the
-      shape [batch_size // max_length, max_length]. If False, the input is
-      grouped by length, and batched so that batches may have different
-      shapes [N, M], where:
-        N * M <= batch_size
-        M <= max_length
-      In general, this setting should be False. Dynamic shapes allow the inputs
-      to be grouped so that the number of padding tokens is minimized, and helps
-      model training. In cases where the input shape must be static
-      (e.g. running on TPU), this setting should be set to True.
-
-  Returns:
-    tf.data.Dataset object containing examples loaded from the files.
-  """
-  dataset = tf.data.Dataset.list_files(file_pattern, shuffle=shuffle)
-
-  # Read files and interleave results. When training, the order of the examples
-  # will be non-deterministic.
-  dataset = dataset.apply(
-      tf.data.experimental.parallel_interleave(
-          _load_records, sloppy=shuffle, cycle_length=num_parallel_calls))
-
-  # Parse each tf.Example into a dictionary
-  # TODO: Look into prefetch_input_elements for performance optimization.
-  dataset = dataset.map(_parse_example,
-                        num_parallel_calls=num_parallel_calls)
-
-  # Remove examples where the input or target length exceeds the maximum length,
-  dataset = dataset.filter(lambda x, y: _filter_max_length((x, y), max_length))
-
-  if static_batch:
-    dataset = dataset.padded_batch(
-        batch_size // max_length, ([max_length], [max_length]),
-        drop_remainder=True)
-  else:
-    # Group and batch such that each batch has examples of similar length.
-    dataset = _batch_examples(dataset, batch_size, max_length)
-
-  dataset = dataset.repeat(repeat)
-
-  # Prefetch the next element to improve speed of input pipeline.
-  dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
-  return dataset
-
-
-def _generate_synthetic_data(params):
-  """Create synthetic data based on the parameter batch size."""
-  batch = length = int(math.sqrt(params["batch_size"]))
-  return model_helpers.generate_synthetic_data(
-      input_shape=tf.TensorShape([batch, length]),
-      input_value=1,
-      input_dtype=tf.int32,
-      label_shape=tf.TensorShape([batch, length]),
-      label_value=1,
-      label_dtype=tf.int32,
-  )
-
-
-def train_input_fn(params):
-  """Load and return dataset of batched examples for use during training."""
-  file_pattern = os.path.join(params["data_dir"] or "", "*train*")
-  if params["use_synthetic_data"]:
-    return _generate_synthetic_data(params)
-  return _read_and_batch_from_files(
-      file_pattern, params["batch_size"], params["max_length"],
-      params["num_parallel_calls"], shuffle=True,
-      repeat=params["repeat_dataset"], static_batch=params["static_batch"])
-
-
-def eval_input_fn(params):
-  """Load and return dataset of batched examples for use during evaluation."""
-  file_pattern = os.path.join(params["data_dir"] or "", "*dev*")
-  if params["use_synthetic_data"]:
-    return _generate_synthetic_data(params)
-  return _read_and_batch_from_files(
-      file_pattern, params["batch_size"], params["max_length"],
-      params["num_parallel_calls"], shuffle=False, repeat=1,
-      static_batch=params["static_batch"])
--- a/official/r1/transformer/embedding_layer.py
+++ b/official/r1/transformer/embedding_layer.py
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Implementation of embedding layer with shared weights."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow.compat.v1 as tf  # pylint: disable=g-bad-import-order
-
-from official.r1.utils import tpu as tpu_utils
-
-
-class EmbeddingSharedWeights(tf.layers.Layer):
-  """Calculates input embeddings and pre-softmax linear with shared weights."""
-
-  def __init__(self, vocab_size, hidden_size, method="gather"):
-    """Specify characteristic parameters of embedding layer.
-
-    Args:
-      vocab_size: Number of tokens in the embedding. (Typically ~32,000)
-      hidden_size: Dimensionality of the embedding. (Typically 512 or 1024)
-      method: Strategy for performing embedding lookup. "gather" uses tf.gather
-        which performs well on CPUs and GPUs, but very poorly on TPUs. "matmul"
-        one-hot encodes the indicies and formulates the embedding as a sparse
-        matrix multiplication. The matmul formulation is wasteful as it does
-        extra work, however matrix multiplication is very fast on TPUs which
-        makes "matmul" considerably faster than "gather" on TPUs.
-    """
-    super(EmbeddingSharedWeights, self).__init__()
-    self.vocab_size = vocab_size
-    self.hidden_size = hidden_size
-    if method not in ("gather", "matmul"):
-      raise ValueError("method {} must be 'gather' or 'matmul'".format(method))
-    self.method = method
-
-  def build(self, _):
-    with tf.variable_scope("embedding_and_softmax", reuse=tf.AUTO_REUSE):
-      # Create and initialize weights. The random normal initializer was chosen
-      # randomly, and works well.
-      self.shared_weights = tf.get_variable(
-          "weights", [self.vocab_size, self.hidden_size],
-          initializer=tf.random_normal_initializer(
-              0., self.hidden_size ** -0.5))
-
-    self.built = True
-
-  def call(self, x):
-    """Get token embeddings of x.
-
-    Args:
-      x: An int64 tensor with shape [batch_size, length]
-    Returns:
-      embeddings: float32 tensor with shape [batch_size, length, embedding_size]
-      padding: float32 tensor with shape [batch_size, length] indicating the
-        locations of the padding tokens in x.
-    """
-    with tf.name_scope("embedding"):
-      # Create binary mask of size [batch_size, length]
-      mask = tf.to_float(tf.not_equal(x, 0))
-
-      if self.method == "gather":
-        embeddings = tf.gather(self.shared_weights, x)
-        embeddings *= tf.expand_dims(mask, -1)
-      else:  # matmul
-        embeddings = tpu_utils.embedding_matmul(
-            embedding_table=self.shared_weights,
-            values=tf.cast(x, dtype=tf.int32),
-            mask=mask
-        )
-        # embedding_matmul already zeros out masked positions, so
-        # `embeddings *= tf.expand_dims(mask, -1)` is unnecessary.
-
-
-      # Scale embedding by the sqrt of the hidden size
-      embeddings *= self.hidden_size ** 0.5
-
-      return embeddings
-
-
-  def linear(self, x):
-    """Computes logits by running x through a linear layer.
-
-    Args:
-      x: A float32 tensor with shape [batch_size, length, hidden_size]
-    Returns:
-      float32 tensor with shape [batch_size, length, vocab_size].
-    """
-    with tf.name_scope("presoftmax_linear"):
-      batch_size = tf.shape(x)[0]
-      length = tf.shape(x)[1]
-
-      x = tf.reshape(x, [-1, self.hidden_size])
-      logits = tf.matmul(x, self.shared_weights, transpose_b=True)
-
-      return tf.reshape(logits, [batch_size, length, self.vocab_size])
--- a/official/r1/transformer/ffn_layer.py
+++ b/official/r1/transformer/ffn_layer.py
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Implementation of fully connected network."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow.compat.v1 as tf
-
-
-class FeedFowardNetwork(tf.layers.Layer):
-  """Fully connected feedforward network."""
-
-  def __init__(self, hidden_size, filter_size, relu_dropout, train, allow_pad):
-    super(FeedFowardNetwork, self).__init__()
-    self.hidden_size = hidden_size
-    self.filter_size = filter_size
-    self.relu_dropout = relu_dropout
-    self.train = train
-    self.allow_pad = allow_pad
-
-    self.filter_dense_layer = tf.layers.Dense(
-        filter_size, use_bias=True, activation=tf.nn.relu, name="filter_layer")
-    self.output_dense_layer = tf.layers.Dense(
-        hidden_size, use_bias=True, name="output_layer")
-
-  def call(self, x, padding=None):
-    """Return outputs of the feedforward network.
-
-    Args:
-      x: tensor with shape [batch_size, length, hidden_size]
-      padding: (optional) If set, the padding values are temporarily removed
-        from x (provided self.allow_pad is set). The padding values are placed
-        back in the output tensor in the same locations.
-        shape [batch_size, length]
-
-    Returns:
-      Output of the feedforward network.
-      tensor with shape [batch_size, length, hidden_size]
-    """
-    padding = None if not self.allow_pad else padding
-
-    # Retrieve dynamically known shapes
-    batch_size = tf.shape(x)[0]
-    length = tf.shape(x)[1]
-
-    if padding is not None:
-      with tf.name_scope("remove_padding"):
-        # Flatten padding to [batch_size*length]
-        pad_mask = tf.reshape(padding, [-1])
-
-        nonpad_ids = tf.to_int32(tf.where(pad_mask < 1e-9))
-
-        # Reshape x to [batch_size*length, hidden_size] to remove padding
-        x = tf.reshape(x, [-1, self.hidden_size])
-        x = tf.gather_nd(x, indices=nonpad_ids)
-
-        # Reshape x from 2 dimensions to 3 dimensions.
-        x.set_shape([None, self.hidden_size])
-        x = tf.expand_dims(x, axis=0)
-
-    output = self.filter_dense_layer(x)
-    if self.train:
-      output = tf.nn.dropout(output, 1.0 - self.relu_dropout)
-    output = self.output_dense_layer(output)
-
-    if padding is not None:
-      with tf.name_scope("re_add_padding"):
-        output = tf.squeeze(output, axis=0)
-        output = tf.scatter_nd(
-            indices=nonpad_ids,
-            updates=output,
-            shape=[batch_size * length, self.hidden_size]
-        )
-        output = tf.reshape(output, [batch_size, length, self.hidden_size])
-    return output
--- a/official/r1/transformer/schedule.py
+++ b/official/r1/transformer/schedule.py
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Abstract training on a step or epoch basis."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import math
-
-import tensorflow.compat.v1 as tf
-
-
-_TRAIN, _EVAL = tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL
-
-
-NUM_EXAMPLES = {
-    tf.estimator.ModeKeys.TRAIN: 4572160,
-    # # Examples that are too long are filtered out, thus the total is less
-    # # than the total number of lines.
-    # 2399123 +  # news-commentary-v12.de-en
-    # 1920209 +  # commoncrawl.de-en
-    # 270769,    # europarl-v7.de-en
-    tf.estimator.ModeKeys.EVAL: 3000,  # newstest2013
-}
-
-
-class Manager(object):
-  """Container for convenience functions to abstract step or epoch basis.
-  Transformer allows users to specify an epoch basis (generally recommended for
-  full training) or a number of steps basis (convenient since epochs are rather
-  large). TPUs furthermore require a step basis; however epochs are the norm in
-  the machine learning community and it is desirable to allow users to specify
-  epochs even when running with TPUS which requires behind the scenes
-  conversions.
-  This container simply groups what are largely mundane checks and conversions
-  rather than interspersing them throughout the run loop code.
-  """
-
-  def __init__(self, train_steps, steps_between_evals, train_epochs,
-               epochs_between_evals, default_train_epochs, batch_size,
-               max_length, use_tpu=False, num_tpu_shards=8):
-    if train_steps and train_epochs:
-      raise ValueError("Both train_steps or train_epochs were be defined.")
-
-    # Determine training schedule based on flags.
-    if train_steps:
-      self.train_eval_iterations = train_steps // steps_between_evals
-      self._single_iteration_train_steps = steps_between_evals
-      self._single_iteration_train_epochs = None
-    else:
-      train_epochs = train_epochs or default_train_epochs
-      self.train_eval_iterations = train_epochs // epochs_between_evals
-      self._single_iteration_train_steps = None
-      self._single_iteration_train_epochs = epochs_between_evals
-
-    self.max_length = max_length
-    self.batch_size = batch_size
-    self.use_tpu = use_tpu
-    self.num_tpu_shards = num_tpu_shards
-
-    if self.use_tpu:
-      assert (self.batch_size // self.max_length) % self.num_tpu_shards == 0
-
-  @property
-  def single_iteration_train_steps(self):
-    if self._single_iteration_train_steps or not self.use_tpu:
-      return self._single_iteration_train_steps
-
-    return self.epochs_to_steps(
-        num_epochs=self._single_iteration_train_epochs, mode=_TRAIN)
-
-  @property
-  def single_iteration_eval_steps(self):
-    if not self.use_tpu:
-      return None
-
-    return self.epochs_to_steps(num_epochs=1, mode=_EVAL)
-
-  @property
-  def train_increment_str(self):
-    if self._single_iteration_train_steps:
-      return "{} steps.".format(self._single_iteration_train_steps)
-
-    if not self.use_tpu:
-      return "{} epochs.".format(self._single_iteration_train_epochs)
-
-    return "~{} epochs. ({} steps)".format(
-        self._single_iteration_train_epochs,
-        self.single_iteration_train_steps)
-
-  @property
-  def repeat_dataset(self):
-    if (self._single_iteration_train_epochs is None and
-        self._single_iteration_train_steps > NUM_EXAMPLES[_TRAIN]):
-      return math.ceil(self._single_iteration_train_steps /
-                       NUM_EXAMPLES[_TRAIN])
-    return self._single_iteration_train_epochs
-
-  def epochs_to_steps(self, num_epochs, mode):
-    """Converts a number of epochs to a number of training steps.
-
-    TPU only: This function assumes that static_batch is True.
-
-      TPU can not tolerate an OutOfRange error from a dataset. As a result the
-    number of examples to be processed must be known ahead of time. TPUs also
-    do not allow partial batches, so this function rounds down.
-
-    Args:
-      num_epochs: An integer of the number of epochs to convert to steps.
-      mode: The estimator ModeKey of the computation
-
-    Returns:
-      An integer of the number of equivalent steps rounded down.
-    """
-    assert self.use_tpu, "epochs_to_steps should only be reached when using TPU"
-    total_num_tokens = NUM_EXAMPLES[mode] * self.max_length * num_epochs
-    return total_num_tokens // self.batch_size
--- a/official/r1/transformer/schedule_test.py
+++ b/official/r1/transformer/schedule_test.py
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Test Transformer's schedule manager."""
-
-import tensorflow.compat.v1 as tf
-
-from official.r1.transformer import schedule
-
-
-class ScheduleBaseTester(tf.test.TestCase):
-  def test_mutual_exclusivity(self):
-    with self.assertRaises(ValueError):
-      schedule.Manager(
-          train_steps=100, steps_between_evals=100, train_epochs=2,
-          epochs_between_evals=1, default_train_epochs=None, batch_size=2048,
-          max_length=256)
-
-  def test_step_basis(self):
-    manager = schedule.Manager(
-        train_steps=1000, steps_between_evals=100, train_epochs=None,
-        epochs_between_evals=None, default_train_epochs=None, batch_size=2048,
-        max_length=256)
-
-    self.assertEqual(manager.single_iteration_train_steps, 100)
-
-    # Evaluation uses the full set
-    self.assertIsNone(manager.single_iteration_eval_steps)
-
-    self.assertIsNone(manager.repeat_dataset)
-
-  def test_epoch_basis(self):
-    manager = schedule.Manager(
-        train_steps=None, steps_between_evals=None, train_epochs=10,
-        epochs_between_evals=2, default_train_epochs=None, batch_size=2048,
-        max_length=256)
-
-    # For non-TPU, estimator relies on dataset exhausion
-    self.assertIsNone(manager.single_iteration_train_steps)
-    self.assertIsNone(manager.single_iteration_eval_steps)
-
-    self.assertEqual(manager.repeat_dataset, 2)
-
-  def test_step_basis_tpu(self):
-    manager = schedule.Manager(
-        train_steps=1000, steps_between_evals=100, train_epochs=None,
-        epochs_between_evals=None, default_train_epochs=None, batch_size=2048,
-        max_length=256, use_tpu=True)
-
-    self.assertEqual(manager.single_iteration_train_steps, 100)
-    # num_eval_examples / (batch_size / max_length) == 3000 / (2048 / 256)
-    self.assertEqual(manager.single_iteration_eval_steps, 375)
-    self.assertIsNone(manager.repeat_dataset)
-
-  def test_epoch_basis_tpu(self):
-    manager = schedule.Manager(
-        train_steps=None, steps_between_evals=None, train_epochs=10,
-        epochs_between_evals=2, default_train_epochs=None, batch_size=2048,
-        max_length=256, use_tpu=True)
-
-    self.assertEqual(
-        manager.single_iteration_train_steps,
-        schedule.NUM_EXAMPLES[tf.estimator.ModeKeys.TRAIN] * 2 // (2048 / 256)
-    )
-
-    # num_eval_examples / (batch_size / max_length) == 3000 / (2048 / 256)
-    self.assertEqual(manager.single_iteration_eval_steps, 375)
-
-    self.assertEqual(manager.repeat_dataset, 2)
-
-
-if __name__ == "__main__":
-  tf.test.main()
--- a/official/r1/transformer/transformer.py
+++ b/official/r1/transformer/transformer.py
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Defines the Transformer model, and its encoder and decoder stacks.
-
-Model paper: https://arxiv.org/pdf/1706.03762.pdf
-Transformer model code source: https://github.com/tensorflow/tensor2tensor
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow.compat.v1 as tf
-
-from official.nlp.transformer import beam_search_v1 as beam_search
-from official.nlp.transformer import model_utils
-from official.nlp.transformer.utils.tokenizer import EOS_ID
-from official.r1.transformer import attention_layer
-from official.r1.transformer import embedding_layer
-from official.r1.transformer import ffn_layer
-
-_NEG_INF = -1e9
-
-
-class Transformer(object):
-  """Transformer model for sequence to sequence data.
-
-  Implemented as described in: https://arxiv.org/pdf/1706.03762.pdf
-
-  The Transformer model consists of an encoder and decoder. The input is an int
-  sequence (or a batch of sequences). The encoder produces a continous
-  representation, and the decoder uses the encoder output to generate
-  probabilities for the output sequence.
-  """
-
-  def __init__(self, params, train):
-    """Initialize layers to build Transformer model.
-
-    Args:
-      params: hyperparameter object defining layer sizes, dropout values, etc.
-      train: boolean indicating whether the model is in training mode. Used to
-        determine if dropout layers should be added.
-    """
-    self.train = train
-    self.params = params
-
-    self.embedding_softmax_layer = embedding_layer.EmbeddingSharedWeights(
-        params["vocab_size"], params["hidden_size"],
-        method="matmul" if params["tpu"] else "gather")
-    self.encoder_stack = EncoderStack(params, train)
-    self.decoder_stack = DecoderStack(params, train)
-
-  def __call__(self, inputs, targets=None):
-    """Calculate target logits or inferred target sequences.
-
-    Args:
-      inputs: int tensor with shape [batch_size, input_length].
-      targets: None or int tensor with shape [batch_size, target_length].
-
-    Returns:
-      If targets is defined, then return logits for each word in the target
-      sequence. float tensor with shape [batch_size, target_length, vocab_size]
-      If target is none, then generate output sequence one token at a time.
-        returns a dictionary {
-          output: [batch_size, decoded length]
-          score: [batch_size, float]}
-    """
-    # Variance scaling is used here because it seems to work in many problems.
-    # Other reasonable initializers may also work just as well.
-    initializer = tf.variance_scaling_initializer(
-        self.params["initializer_gain"], mode="fan_avg", distribution="uniform")
-    with tf.variable_scope("Transformer", initializer=initializer):
-      # Calculate attention bias for encoder self-attention and decoder
-      # multi-headed attention layers.
-      attention_bias = model_utils.get_padding_bias(inputs)
-
-      # Run the inputs through the encoder layer to map the symbol
-      # representations to continuous representations.
-      encoder_outputs = self.encode(inputs, attention_bias)
-
-      # Generate output sequence if targets is None, or return logits if target
-      # sequence is known.
-      if targets is None:
-        return self.predict(encoder_outputs, attention_bias)
-      else:
-        logits = self.decode(targets, encoder_outputs, attention_bias)
-        return logits
-
-  def encode(self, inputs, attention_bias):
-    """Generate continuous representation for inputs.
-
-    Args:
-      inputs: int tensor with shape [batch_size, input_length].
-      attention_bias: float tensor with shape [batch_size, 1, 1, input_length]
-
-    Returns:
-      float tensor with shape [batch_size, input_length, hidden_size]
-    """
-    with tf.name_scope("encode"):
-      # Prepare inputs to the layer stack by adding positional encodings and
-      # applying dropout.
-      embedded_inputs = self.embedding_softmax_layer(inputs)
-      inputs_padding = model_utils.get_padding(inputs)
-
-      with tf.name_scope("add_pos_encoding"):
-        length = tf.shape(embedded_inputs)[1]
-        pos_encoding = model_utils.get_position_encoding(
-            length, self.params["hidden_size"])
-        encoder_inputs = embedded_inputs + pos_encoding
-
-      if self.train:
-        encoder_inputs = tf.nn.dropout(
-            encoder_inputs, 1 - self.params["layer_postprocess_dropout"])
-
-      return self.encoder_stack(encoder_inputs, attention_bias, inputs_padding)
-
-  def decode(self, targets, encoder_outputs, attention_bias):
-    """Generate logits for each value in the target sequence.
-
-    Args:
-      targets: target values for the output sequence.
-        int tensor with shape [batch_size, target_length]
-      encoder_outputs: continuous representation of input sequence.
-        float tensor with shape [batch_size, input_length, hidden_size]
-      attention_bias: float tensor with shape [batch_size, 1, 1, input_length]
-
-    Returns:
-      float32 tensor with shape [batch_size, target_length, vocab_size]
-    """
-    with tf.name_scope("decode"):
-      # Prepare inputs to decoder layers by shifting targets, adding positional
-      # encoding and applying dropout.
-      decoder_inputs = self.embedding_softmax_layer(targets)
-      with tf.name_scope("shift_targets"):
-        # Shift targets to the right, and remove the last element
-        decoder_inputs = tf.pad(
-            decoder_inputs, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
-      with tf.name_scope("add_pos_encoding"):
-        length = tf.shape(decoder_inputs)[1]
-        decoder_inputs += model_utils.get_position_encoding(
-            length, self.params["hidden_size"])
-      if self.train:
-        decoder_inputs = tf.nn.dropout(
-            decoder_inputs, 1 - self.params["layer_postprocess_dropout"])
-
-      # Run values
-      decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
-          length)
-      outputs = self.decoder_stack(
-          decoder_inputs, encoder_outputs, decoder_self_attention_bias,
-          attention_bias)
-      logits = self.embedding_softmax_layer.linear(outputs)
-      return logits
-
-  def _get_symbols_to_logits_fn(self, max_decode_length):
-    """Returns a decoding function that calculates logits of the next tokens."""
-
-    timing_signal = model_utils.get_position_encoding(
-        max_decode_length + 1, self.params["hidden_size"])
-    decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
-        max_decode_length)
-
-    def symbols_to_logits_fn(ids, i, cache):
-      """Generate logits for next potential IDs.
-
-      Args:
-        ids: Current decoded sequences.
-          int tensor with shape [batch_size * beam_size, i + 1]
-        i: Loop index
-        cache: dictionary of values storing the encoder output, encoder-decoder
-          attention bias, and previous decoder attention values.
-
-      Returns:
-        Tuple of
-          (logits with shape [batch_size * beam_size, vocab_size],
-           updated cache values)
-      """
-      # Set decoder input to the last generated IDs
-      decoder_input = ids[:, -1:]
-
-      # Preprocess decoder input by getting embeddings and adding timing signal.
-      decoder_input = self.embedding_softmax_layer(decoder_input)
-      decoder_input += timing_signal[i:i + 1]
-
-      self_attention_bias = decoder_self_attention_bias[:, :, i:i + 1, :i + 1]
-      decoder_outputs = self.decoder_stack(
-          decoder_input, cache.get("encoder_outputs"), self_attention_bias,
-          cache.get("encoder_decoder_attention_bias"), cache)
-      logits = self.embedding_softmax_layer.linear(decoder_outputs)
-      logits = tf.squeeze(logits, axis=[1])
-      return logits, cache
-    return symbols_to_logits_fn
-
-  def predict(self, encoder_outputs, encoder_decoder_attention_bias):
-    """Return predicted sequence."""
-    batch_size = tf.shape(encoder_outputs)[0]
-    input_length = tf.shape(encoder_outputs)[1]
-    max_decode_length = input_length + self.params["extra_decode_length"]
-
-    symbols_to_logits_fn = self._get_symbols_to_logits_fn(max_decode_length)
-
-    # Create initial set of IDs that will be passed into symbols_to_logits_fn.
-    initial_ids = tf.zeros([batch_size], dtype=tf.int32)
-
-    # Create cache storing decoder attention values for each layer.
-    cache = {
-        "layer_%d" % layer: {
-            "k": tf.zeros([batch_size, 0, self.params["hidden_size"]]),
-            "v": tf.zeros([batch_size, 0, self.params["hidden_size"]]),
-        } for layer in range(self.params["num_hidden_layers"])}
-
-    # Add encoder output and attention bias to the cache.
-    cache["encoder_outputs"] = encoder_outputs
-    cache["encoder_decoder_attention_bias"] = encoder_decoder_attention_bias
-
-    # Use beam search to find the top beam_size sequences and scores.
-    decoded_ids, scores = beam_search.sequence_beam_search(
-        symbols_to_logits_fn=symbols_to_logits_fn,
-        initial_ids=initial_ids,
-        initial_cache=cache,
-        vocab_size=self.params["vocab_size"],
-        beam_size=self.params["beam_size"],
-        alpha=self.params["alpha"],
-        max_decode_length=max_decode_length,
-        eos_id=EOS_ID)
-
-    # Get the top sequence for each batch element
-    top_decoded_ids = decoded_ids[:, 0, 1:]
-    top_scores = scores[:, 0]
-
-    return {"outputs": top_decoded_ids, "scores": top_scores}
-
-
-class LayerNormalization(tf.layers.Layer):
-  """Applies layer normalization."""
-
-  def __init__(self, hidden_size):
-    super(LayerNormalization, self).__init__()
-    self.hidden_size = hidden_size
-
-  def build(self, _):
-    self.scale = tf.get_variable("layer_norm_scale", [self.hidden_size],
-                                 initializer=tf.ones_initializer())
-    self.bias = tf.get_variable("layer_norm_bias", [self.hidden_size],
-                                initializer=tf.zeros_initializer())
-    self.built = True
-
-  def call(self, x, epsilon=1e-6):
-    mean = tf.reduce_mean(x, axis=[-1], keepdims=True)
-    variance = tf.reduce_mean(tf.square(x - mean), axis=[-1], keepdims=True)
-    norm_x = (x - mean) * tf.rsqrt(variance + epsilon)
-    return norm_x * self.scale + self.bias
-
-
-class PrePostProcessingWrapper(object):
-  """Wrapper class that applies layer pre-processing and post-processing."""
-
-  def __init__(self, layer, params, train):
-    self.layer = layer
-    self.postprocess_dropout = params["layer_postprocess_dropout"]
-    self.train = train
-
-    # Create normalization layer
-    self.layer_norm = LayerNormalization(params["hidden_size"])
-
-  def __call__(self, x, *args, **kwargs):
-    # Preprocessing: apply layer normalization
-    y = self.layer_norm(x)
-
-    # Get layer output
-    y = self.layer(y, *args, **kwargs)
-
-    # Postprocessing: apply dropout and residual connection
-    if self.train:
-      y = tf.nn.dropout(y, 1 - self.postprocess_dropout)
-    return x + y
-
-
-class EncoderStack(tf.layers.Layer):
-  """Transformer encoder stack.
-
-  The encoder stack is made up of N identical layers. Each layer is composed
-  of the sublayers:
-    1. Self-attention layer
-    2. Feedforward network (which is 2 fully-connected layers)
-  """
-
-  def __init__(self, params, train):
-    super(EncoderStack, self).__init__()
-    self.layers = []
-    for _ in range(params["num_hidden_layers"]):
-      # Create sublayers for each layer.
-      self_attention_layer = attention_layer.SelfAttention(
-          params["hidden_size"], params["num_heads"],
-          params["attention_dropout"], train)
-      feed_forward_network = ffn_layer.FeedFowardNetwork(
-          params["hidden_size"], params["filter_size"],
-          params["relu_dropout"], train, params["allow_ffn_pad"])
-
-      self.layers.append([
-          PrePostProcessingWrapper(self_attention_layer, params, train),
-          PrePostProcessingWrapper(feed_forward_network, params, train)])
-
-    # Create final layer normalization layer.
-    self.output_normalization = LayerNormalization(params["hidden_size"])
-
-  def call(self, encoder_inputs, attention_bias, inputs_padding):
-    """Return the output of the encoder layer stacks.
-
-    Args:
-      encoder_inputs: tensor with shape [batch_size, input_length, hidden_size]
-      attention_bias: bias for the encoder self-attention layer.
-        [batch_size, 1, 1, input_length]
-      inputs_padding: P
-
-    Returns:
-      Output of encoder layer stack.
-      float32 tensor with shape [batch_size, input_length, hidden_size]
-    """
-    for n, layer in enumerate(self.layers):
-      # Run inputs through the sublayers.
-      self_attention_layer = layer[0]
-      feed_forward_network = layer[1]
-
-      with tf.variable_scope("layer_%d" % n):
-        with tf.variable_scope("self_attention"):
-          encoder_inputs = self_attention_layer(encoder_inputs, attention_bias)
-        with tf.variable_scope("ffn"):
-          encoder_inputs = feed_forward_network(encoder_inputs, inputs_padding)
-
-    return self.output_normalization(encoder_inputs)
-
-
-class DecoderStack(tf.layers.Layer):
-  """Transformer decoder stack.
-
-  Like the encoder stack, the decoder stack is made up of N identical layers.
-  Each layer is composed of the sublayers:
-    1. Self-attention layer
-    2. Multi-headed attention layer combining encoder outputs with results from
-       the previous self-attention layer.
-    3. Feedforward network (2 fully-connected layers)
-  """
-
-  def __init__(self, params, train):
-    super(DecoderStack, self).__init__()
-    self.layers = []
-    for _ in range(params["num_hidden_layers"]):
-      self_attention_layer = attention_layer.SelfAttention(
-          params["hidden_size"], params["num_heads"],
-          params["attention_dropout"], train)
-      enc_dec_attention_layer = attention_layer.Attention(
-          params["hidden_size"], params["num_heads"],
-          params["attention_dropout"], train)
-      feed_forward_network = ffn_layer.FeedFowardNetwork(
-          params["hidden_size"], params["filter_size"],
-          params["relu_dropout"], train, params["allow_ffn_pad"])
-
-      self.layers.append([
-          PrePostProcessingWrapper(self_attention_layer, params, train),
-          PrePostProcessingWrapper(enc_dec_attention_layer, params, train),
-          PrePostProcessingWrapper(feed_forward_network, params, train)])
-
-    self.output_normalization = LayerNormalization(params["hidden_size"])
-
-  def call(self, decoder_inputs, encoder_outputs, decoder_self_attention_bias,
-           attention_bias, cache=None):
-    """Return the output of the decoder layer stacks.
-
-    Args:
-      decoder_inputs: tensor with shape [batch_size, target_length, hidden_size]
-      encoder_outputs: tensor with shape [batch_size, input_length, hidden_size]
-      decoder_self_attention_bias: bias for decoder self-attention layer.
-        [1, 1, target_len, target_length]
-      attention_bias: bias for encoder-decoder attention layer.
-        [batch_size, 1, 1, input_length]
-      cache: (Used for fast decoding) A nested dictionary storing previous
-        decoder self-attention values. The items are:
-          {layer_n: {"k": tensor with shape [batch_size, i, key_channels],
-                     "v": tensor with shape [batch_size, i, value_channels]},
-           ...}
-
-    Returns:
-      Output of decoder layer stack.
-      float32 tensor with shape [batch_size, target_length, hidden_size]
-    """
-    for n, layer in enumerate(self.layers):
-      self_attention_layer = layer[0]
-      enc_dec_attention_layer = layer[1]
-      feed_forward_network = layer[2]
-
-      # Run inputs through the sublayers.
-      layer_name = "layer_%d" % n
-      layer_cache = cache[layer_name] if cache is not None else None
-      with tf.variable_scope(layer_name):
-        with tf.variable_scope("self_attention"):
-          decoder_inputs = self_attention_layer(
-              decoder_inputs, decoder_self_attention_bias, cache=layer_cache)
-        with tf.variable_scope("encdec_attention"):
-          decoder_inputs = enc_dec_attention_layer(
-              decoder_inputs, encoder_outputs, attention_bias)
-        with tf.variable_scope("ffn"):
-          decoder_inputs = feed_forward_network(decoder_inputs)
-
-    return self.output_normalization(decoder_inputs)
--- a/official/r1/transformer/transformer_main.py
+++ b/official/r1/transformer/transformer_main.py
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Train and evaluate the Transformer model.
-
-See README for description of setting the training schedule and evaluating the
-BLEU score.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import tempfile
-
-# pylint: disable=g-bad-import-order
-from six.moves import xrange  # pylint: disable=redefined-builtin
-from absl import app as absl_app
-from absl import flags
-import tensorflow.compat.v1 as tf
-# pylint: enable=g-bad-import-order
-
-from official.nlp.transformer import model_params
-from official.r1.utils import export
-from official.r1.utils import tpu as tpu_util
-from official.r1.transformer import translate
-from official.r1.transformer import transformer
-from official.r1.transformer import dataset
-from official.r1.transformer import schedule
-from official.nlp.transformer import compute_bleu
-from official.nlp.transformer.utils import metrics
-from official.nlp.transformer.utils import tokenizer
-from official.utils.flags import core as flags_core
-from official.r1.utils.logs import hooks_helper
-from official.r1.utils.logs import logger
-from official.utils.misc import distribution_utils
-from official.utils.misc import model_helpers
-
-PARAMS_MAP = {
-    "tiny": model_params.TINY_PARAMS,
-    "base": model_params.BASE_PARAMS,
-    "big": model_params.BIG_PARAMS,
-}
-
-
-DEFAULT_TRAIN_EPOCHS = 10
-INF = 1000000000  # 1e9
-BLEU_DIR = "bleu"
-
-# Dictionary containing tensors that are logged by the logging hooks. Each item
-# maps a string to the tensor name.
-TENSORS_TO_LOG = {
-    "learning_rate": "model/get_train_op/learning_rate/learning_rate",
-    "cross_entropy_loss": "model/cross_entropy"}
-
-
-def model_fn(features, labels, mode, params):
-  """Defines how to train, evaluate and predict from the transformer model."""
-  with tf.variable_scope("model"):
-    inputs, targets = features, labels
-
-    # Create model and get output logits.
-    model = transformer.Transformer(params, mode == tf.estimator.ModeKeys.TRAIN)
-
-    logits = model(inputs, targets)
-
-    # When in prediction mode, the labels/targets is None. The model output
-    # is the prediction
-    if mode == tf.estimator.ModeKeys.PREDICT:
-      if params["use_tpu"]:
-        raise NotImplementedError("Prediction is not yet supported on TPUs.")
-      return tf.estimator.EstimatorSpec(
-          tf.estimator.ModeKeys.PREDICT,
-          predictions=logits,
-          export_outputs={
-              "translate": tf.estimator.export.PredictOutput(logits)
-          })
-
-    # Explicitly set the shape of the logits for XLA (TPU). This is needed
-    # because the logits are passed back to the host VM CPU for metric
-    # evaluation, and the shape of [?, ?, vocab_size] is too vague. However
-    # it is known from Transformer that the first two dimensions of logits
-    # are the dimensions of targets. Note that the ambiguous shape of logits is
-    # not a problem when computing xentropy, because padded_cross_entropy_loss
-    # resolves the shape on the TPU.
-    logits.set_shape(targets.shape.as_list() + logits.shape.as_list()[2:])
-
-    # Calculate model loss.
-    # xentropy contains the cross entropy loss of every nonpadding token in the
-    # targets.
-    xentropy, weights = metrics.padded_cross_entropy_loss(
-        logits, targets, params["label_smoothing"], params["vocab_size"])
-    loss = tf.reduce_sum(xentropy) / tf.reduce_sum(weights)
-
-    # Save loss as named tensor that will be logged with the logging hook.
-    tf.identity(loss, "cross_entropy")
-
-    if mode == tf.estimator.ModeKeys.EVAL:
-      if params["use_tpu"]:
-        # host call functions should only have tensors as arguments.
-        # This lambda pre-populates params so that metric_fn is
-        # TPUEstimator compliant.
-        metric_fn = lambda logits, labels: (
-            metrics.get_eval_metrics(logits, labels, params=params))
-        eval_metrics = (metric_fn, [logits, labels])
-        return tf.estimator.tpu.TPUEstimatorSpec(
-            mode=mode,
-            loss=loss,
-            predictions={"predictions": logits},
-            eval_metrics=eval_metrics)
-      return tf.estimator.EstimatorSpec(
-          mode=mode, loss=loss, predictions={"predictions": logits},
-          eval_metric_ops=metrics.get_eval_metrics(logits, labels, params))
-    else:
-      train_op, metric_dict = get_train_op_and_metrics(loss, params)
-
-      # Epochs can be quite long. This gives some intermediate information
-      # in TensorBoard.
-      metric_dict["minibatch_loss"] = loss
-      if params["use_tpu"]:
-        return tf.estimator.tpu.TPUEstimatorSpec(
-            mode=mode,
-            loss=loss,
-            train_op=train_op,
-            host_call=tpu_util.construct_scalar_host_call(
-                metric_dict=metric_dict,
-                model_dir=params["model_dir"],
-                prefix="training/"))
-      record_scalars(metric_dict)
-      return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
-
-
-def record_scalars(metric_dict):
-  for key, value in metric_dict.items():
-    tf.summary.scalar(name=key, tensor=value)
-
-
-def get_learning_rate(learning_rate, hidden_size, learning_rate_warmup_steps):
-  """Calculate learning rate with linear warmup and rsqrt decay."""
-  with tf.name_scope("learning_rate"):
-    warmup_steps = tf.to_float(learning_rate_warmup_steps)
-    step = tf.to_float(tf.train.get_or_create_global_step())
-
-    learning_rate *= (hidden_size ** -0.5)
-    # Apply linear warmup
-    learning_rate *= tf.minimum(1.0, step / warmup_steps)
-    # Apply rsqrt decay
-    learning_rate *= tf.rsqrt(tf.maximum(step, warmup_steps))
-
-    # Create a named tensor that will be logged using the logging hook.
-    # The full name includes variable and names scope. In this case, the name
-    # is model/get_train_op/learning_rate/learning_rate
-    tf.identity(learning_rate, "learning_rate")
-
-    return learning_rate
-
-
-def get_train_op_and_metrics(loss, params):
-  """Generate training op and metrics to save in TensorBoard."""
-  with tf.variable_scope("get_train_op"):
-    learning_rate = get_learning_rate(
-        learning_rate=params["learning_rate"],
-        hidden_size=params["hidden_size"],
-        learning_rate_warmup_steps=params["learning_rate_warmup_steps"])
-
-    # Create optimizer. Use LazyAdamOptimizer from TF contrib, which is faster
-    # than the TF core Adam optimizer.
-    from tensorflow.contrib import opt as contrib_opt  # pylint: disable=g-import-not-at-top
-    optimizer = contrib_opt.LazyAdamOptimizer(
-        learning_rate,
-        beta1=params["optimizer_adam_beta1"],
-        beta2=params["optimizer_adam_beta2"],
-        epsilon=params["optimizer_adam_epsilon"])
-
-    if params["use_tpu"] and params["tpu"] != tpu_util.LOCAL:
-      optimizer = tf.compat.v1.tpu.CrossShardOptimizer(optimizer)
-
-    # Uses automatic mixed precision FP16 training if on GPU.
-    if params["dtype"] == "fp16":
-      optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
-          optimizer)
-
-    # Calculate and apply gradients using LazyAdamOptimizer.
-    global_step = tf.train.get_global_step()
-    tvars = tf.trainable_variables()
-    gradients = optimizer.compute_gradients(
-        loss, tvars, colocate_gradients_with_ops=True)
-    minimize_op = optimizer.apply_gradients(
-        gradients, global_step=global_step, name="train")
-    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
-    train_op = tf.group(minimize_op, update_ops)
-
-    train_metrics = {"learning_rate": learning_rate}
-
-    if not params["use_tpu"]:
-      # gradient norm is not included as a summary when running on TPU, as
-      # it can cause instability between the TPU and the host controller.
-      gradient_norm = tf.global_norm(list(zip(*gradients))[0])
-      train_metrics["global_norm/gradient_norm"] = gradient_norm
-
-    return train_op, train_metrics
-
-
-def translate_and_compute_bleu(estimator, subtokenizer, bleu_source, bleu_ref):
-  """Translate file and report the cased and uncased bleu scores."""
-  # Create temporary file to store translation.
-  tmp = tempfile.NamedTemporaryFile(delete=False)
-  tmp_filename = tmp.name
-
-  translate.translate_file(
-      estimator, subtokenizer, bleu_source, output_file=tmp_filename,
-      print_all_translations=False)
-
-  # Compute uncased and cased bleu scores.
-  uncased_score = compute_bleu.bleu_wrapper(bleu_ref, tmp_filename, False)
-  cased_score = compute_bleu.bleu_wrapper(bleu_ref, tmp_filename, True)
-  os.remove(tmp_filename)
-  return uncased_score, cased_score
-
-
-def get_global_step(estimator):
-  """Return estimator's last checkpoint."""
-  return int(estimator.latest_checkpoint().split("-")[-1])
-
-
-def evaluate_and_log_bleu(estimator, bleu_source, bleu_ref, vocab_file):
-  """Calculate and record the BLEU score."""
-  subtokenizer = tokenizer.Subtokenizer(vocab_file)
-
-  uncased_score, cased_score = translate_and_compute_bleu(
-      estimator, subtokenizer, bleu_source, bleu_ref)
-
-  tf.logging.info("Bleu score (uncased): %f", uncased_score)
-  tf.logging.info("Bleu score (cased): %f", cased_score)
-  return uncased_score, cased_score
-
-
-def _validate_file(filepath):
-  """Make sure that file exists."""
-  if not tf.io.gfile.exists(filepath):
-    raise tf.errors.NotFoundError(None, None, "File %s not found." % filepath)
-
-
-def run_loop(
-    estimator, schedule_manager, train_hooks=None, benchmark_logger=None,
-    bleu_source=None, bleu_ref=None, bleu_threshold=None, vocab_file=None):
-  """Train and evaluate model, and optionally compute model's BLEU score.
-
-  **Step vs. Epoch vs. Iteration**
-
-  Steps and epochs are canonical terms used in TensorFlow and general machine
-  learning. They are used to describe running a single process (train/eval):
-    - Step refers to running the process through a single or batch of examples.
-    - Epoch refers to running the process through an entire dataset.
-
-  E.g. training a dataset with 100 examples. The dataset is
-  divided into 20 batches with 5 examples per batch. A single training step
-  trains the model on one batch. After 20 training steps, the model will have
-  trained on every batch in the dataset, or, in other words, one epoch.
-
-  Meanwhile, iteration is used in this implementation to describe running
-  multiple processes (training and eval).
-    - A single iteration:
-      1. trains the model for a specific number of steps or epochs.
-      2. evaluates the model.
-      3. (if source and ref files are provided) compute BLEU score.
-
-  This function runs through multiple train+eval+bleu iterations.
-
-  Args:
-    estimator: tf.Estimator containing model to train.
-    schedule_manager: A schedule.Manager object to guide the run loop.
-    train_hooks: List of hooks to pass to the estimator during training.
-    benchmark_logger: a BenchmarkLogger object that logs evaluation data
-    bleu_source: File containing text to be translated for BLEU calculation.
-    bleu_ref: File containing reference translations for BLEU calculation.
-    bleu_threshold: minimum BLEU score before training is stopped.
-    vocab_file: Path to vocab file that will be used to subtokenize bleu_source.
-
-  Returns:
-    Dict of results of the run.  Contains the keys `eval_results`,
-    `train_hooks`, `bleu_cased`, and `bleu_uncased`. `train_hooks` is a list the
-    instances of hooks used during training.
-
-  Raises:
-    ValueError: if both or none of single_iteration_train_steps and
-      single_iteration_train_epochs were defined.
-    NotFoundError: if the vocab file or bleu files don't exist.
-  """
-  if bleu_source:
-    _validate_file(bleu_source)
-  if bleu_ref:
-    _validate_file(bleu_ref)
-  if vocab_file:
-    _validate_file(vocab_file)
-
-  evaluate_bleu = bleu_source is not None and bleu_ref is not None
-  if evaluate_bleu and schedule_manager.use_tpu:
-    raise ValueError("BLEU score can not be computed when training with a TPU, "
-                     "as it requires estimator.predict which is not yet "
-                     "supported.")
-
-  # Print details of training schedule.
-  tf.logging.info("Training schedule:")
-  tf.logging.info(
-      "\t1. Train for {}".format(schedule_manager.train_increment_str))
-  tf.logging.info("\t2. Evaluate model.")
-  if evaluate_bleu:
-    tf.logging.info("\t3. Compute BLEU score.")
-    if bleu_threshold is not None:
-      tf.logging.info("Repeat above steps until the BLEU score reaches %f" %
-                      bleu_threshold)
-  if not evaluate_bleu or bleu_threshold is None:
-    tf.logging.info("Repeat above steps %d times." %
-                    schedule_manager.train_eval_iterations)
-
-  if evaluate_bleu:
-    # Create summary writer to log bleu score (values can be displayed in
-    # Tensorboard).
-    bleu_writer = tf.summary.FileWriter(
-        os.path.join(estimator.model_dir, BLEU_DIR))
-    if bleu_threshold is not None:
-      # Change loop stopping condition if bleu_threshold is defined.
-      schedule_manager.train_eval_iterations = INF
-
-  # Loop training/evaluation/bleu cycles
-  stats = {}
-  for i in xrange(schedule_manager.train_eval_iterations):
-    tf.logging.info("Starting iteration %d" % (i + 1))
-
-    # Train the model for single_iteration_train_steps or until the input fn
-    # runs out of examples (if single_iteration_train_steps is None).
-    estimator.train(
-        dataset.train_input_fn,
-        steps=schedule_manager.single_iteration_train_steps,
-        hooks=train_hooks)
-
-    eval_results = None
-    eval_results = estimator.evaluate(
-        input_fn=dataset.eval_input_fn,
-        steps=schedule_manager.single_iteration_eval_steps)
-
-    tf.logging.info("Evaluation results (iter %d/%d):" %
-                    (i + 1, schedule_manager.train_eval_iterations))
-    tf.logging.info(eval_results)
-    benchmark_logger.log_evaluation_result(eval_results)
-
-    # The results from estimator.evaluate() are measured on an approximate
-    # translation, which utilize the target golden values provided. The actual
-    # bleu score must be computed using the estimator.predict() path, which
-    # outputs translations that are not based on golden values. The translations
-    # are compared to reference file to get the actual bleu score.
-    if evaluate_bleu:
-      uncased_score, cased_score = evaluate_and_log_bleu(
-          estimator, bleu_source, bleu_ref, vocab_file)
-
-      stats["bleu_uncased"] = uncased_score
-      stats["bleu_cased"] = cased_score
-
-      # Write actual bleu scores using summary writer and benchmark logger
-      global_step = get_global_step(estimator)
-      summary = tf.Summary(value=[
-          tf.Summary.Value(tag="bleu/uncased", simple_value=uncased_score),
-          tf.Summary.Value(tag="bleu/cased", simple_value=cased_score),
-      ])
-      bleu_writer.add_summary(summary, global_step)
-      bleu_writer.flush()
-      benchmark_logger.log_metric(
-          "bleu_uncased", uncased_score, global_step=global_step)
-      benchmark_logger.log_metric(
-          "bleu_cased", cased_score, global_step=global_step)
-
-      # Stop training if bleu stopping threshold is met.
-      if model_helpers.past_stop_threshold(bleu_threshold, uncased_score):
-        bleu_writer.close()
-        break
-
-  stats["eval_results"] = eval_results
-  stats["train_hooks"] = train_hooks
-
-  return stats
-
-
-def define_transformer_flags():
-  """Add flags and flag validators for running transformer_main."""
-  # Add common flags (data_dir, model_dir, train_epochs, etc.).
-  flags.DEFINE_integer(
-      name="max_length", short_name="ml", default=None,
-      help=flags_core.help_wrap("Max length."))
-
-  flags_core.define_base(clean=True, train_epochs=True,
-                         epochs_between_evals=True, stop_threshold=True,
-                         num_gpu=True, hooks=True, export_dir=True,
-                         distribution_strategy=True)
-  flags_core.define_performance(
-      num_parallel_calls=True,
-      inter_op=False,
-      intra_op=False,
-      synthetic_data=True,
-      max_train_steps=False,
-      dtype=True,
-      all_reduce_alg=True
-  )
-  flags_core.define_benchmark()
-  flags_core.define_device(tpu=True)
-
-  # Set flags from the flags_core module as "key flags" so they're listed when
-  # the '-h' flag is used. Without this line, the flags defined above are
-  # only shown in the full `--helpful` help text.
-  flags.adopt_module_key_flags(flags_core)
-
-  # Add transformer-specific flags
-  flags.DEFINE_enum(
-      name="param_set", short_name="mp", default="big",
-      enum_values=PARAMS_MAP.keys(),
-      help=flags_core.help_wrap(
-          "Parameter set to use when creating and training the model. The "
-          "parameters define the input shape (batch size and max length), "
-          "model configuration (size of embedding, # of hidden layers, etc.), "
-          "and various other settings. The big parameter set increases the "
-          "default batch size, embedding/hidden size, and filter size. For a "
-          "complete list of parameters, please see model/model_params.py."))
-
-  flags.DEFINE_bool(
-      name="static_batch", default=False,
-      help=flags_core.help_wrap(
-          "Whether the batches in the dataset should have static shapes. In "
-          "general, this setting should be False. Dynamic shapes allow the "
-          "inputs to be grouped so that the number of padding tokens is "
-          "minimized, and helps model training. In cases where the input shape "
-          "must be static (e.g. running on TPU), this setting will be ignored "
-          "and static batching will always be used."))
-
-  # Flags for training with steps (may be used for debugging)
-  flags.DEFINE_integer(
-      name="train_steps", short_name="ts", default=None,
-      help=flags_core.help_wrap("The number of steps used to train."))
-  flags.DEFINE_integer(
-      name="steps_between_evals", short_name="sbe", default=1000,
-      help=flags_core.help_wrap(
-          "The Number of training steps to run between evaluations. This is "
-          "used if --train_steps is defined."))
-
-  # BLEU score computation
-  flags.DEFINE_string(
-      name="bleu_source", short_name="bls", default=None,
-      help=flags_core.help_wrap(
-          "Path to source file containing text translate when calculating the "
-          "official BLEU score. Both --bleu_source and --bleu_ref must be set. "
-          "Use the flag --stop_threshold to stop the script based on the "
-          "uncased BLEU score."))
-  flags.DEFINE_string(
-      name="bleu_ref", short_name="blr", default=None,
-      help=flags_core.help_wrap(
-          "Path to source file containing text translate when calculating the "
-          "official BLEU score. Both --bleu_source and --bleu_ref must be set. "
-          "Use the flag --stop_threshold to stop the script based on the "
-          "uncased BLEU score."))
-  flags.DEFINE_string(
-      name="vocab_file", short_name="vf", default=None,
-      help=flags_core.help_wrap(
-          "Path to subtoken vocabulary file. If data_download.py was used to "
-          "download and encode the training data, look in the data_dir to find "
-          "the vocab file."))
-
-  flags_core.set_defaults(data_dir="/tmp/translate_ende",
-                          model_dir="/tmp/transformer_model",
-                          batch_size=None,
-                          train_epochs=None)
-
-  @flags.multi_flags_validator(
-      ["train_epochs", "train_steps"],
-      message="Both --train_steps and --train_epochs were set. Only one may be "
-              "defined.")
-  def _check_train_limits(flag_dict):
-    return flag_dict["train_epochs"] is None or flag_dict["train_steps"] is None
-
-  @flags.multi_flags_validator(
-      ["bleu_source", "bleu_ref"],
-      message="Both or neither --bleu_source and --bleu_ref must be defined.")
-  def _check_bleu_files(flags_dict):
-    return (flags_dict["bleu_source"] is None) == (
-        flags_dict["bleu_ref"] is None)
-
-  @flags.multi_flags_validator(
-      ["bleu_source", "bleu_ref", "vocab_file"],
-      message="--vocab_file must be defined if --bleu_source and --bleu_ref "
-              "are defined.")
-  def _check_bleu_vocab_file(flags_dict):
-    if flags_dict["bleu_source"] and flags_dict["bleu_ref"]:
-      return flags_dict["vocab_file"] is not None
-    return True
-
-  @flags.multi_flags_validator(
-      ["export_dir", "vocab_file"],
-      message="--vocab_file must be defined if --export_dir is set.")
-  def _check_export_vocab_file(flags_dict):
-    if flags_dict["export_dir"]:
-      return flags_dict["vocab_file"] is not None
-    return True
-
-  flags_core.require_cloud_storage(["data_dir", "model_dir", "export_dir"])
-
-
-def construct_estimator(flags_obj, params, schedule_manager):
-  """Construct an estimator from either Estimator or TPUEstimator.
-
-  Args:
-    flags_obj: The FLAGS object parsed from command line.
-    params: A dict of run specific parameters.
-    schedule_manager: A schedule.Manager object containing the run schedule.
-
-  Returns:
-    An estimator object to be used for training and eval.
-  """
-  if not params["use_tpu"]:
-    distribution_strategy = distribution_utils.get_distribution_strategy(
-        distribution_strategy=flags_obj.distribution_strategy,
-        num_gpus=flags_core.get_num_gpus(flags_obj),
-        all_reduce_alg=flags_obj.all_reduce_alg)
-    return tf.estimator.Estimator(
-        model_fn=model_fn, model_dir=flags_obj.model_dir, params=params,
-        config=tf.estimator.RunConfig(train_distribute=distribution_strategy))
-
-  tpu_cluster_resolver = tf.compat.v1.cluster_resolver.TPUClusterResolver(
-      tpu=flags_obj.tpu,
-      zone=flags_obj.tpu_zone,
-      project=flags_obj.tpu_gcp_project
-  )
-
-  tpu_config = tf.estimator.tpu.TPUConfig(
-      iterations_per_loop=schedule_manager.single_iteration_train_steps,
-      num_shards=flags_obj.num_tpu_shards)
-
-  run_config = tf.estimator.tpu.RunConfig(
-      cluster=tpu_cluster_resolver,
-      model_dir=flags_obj.model_dir,
-      session_config=tf.ConfigProto(
-          allow_soft_placement=True, log_device_placement=True),
-      tpu_config=tpu_config)
-
-  return tf.estimator.tpu.TPUEstimator(
-      model_fn=model_fn,
-      use_tpu=params["use_tpu"] and flags_obj.tpu != tpu_util.LOCAL,
-      train_batch_size=schedule_manager.batch_size,
-      eval_batch_size=schedule_manager.batch_size,
-      params={
-          # TPUEstimator needs to populate batch_size itself due to sharding.
-          key: value for key, value in params.items() if key != "batch_size"
-      },
-      config=run_config)
-
-def per_replica_batch_size(batch_size, num_gpus):
-  """For multi-gpu, batch-size must be a multiple of the number of GPUs.
-
-
-  Note that distribution strategy handles this automatically when used with
-  Keras. For using with Estimator, we need to get per GPU batch.
-
-  Args:
-    batch_size: Global batch size to be divided among devices. This should be
-      equal to num_gpus times the single-GPU batch_size for multi-gpu training.
-    num_gpus: How many GPUs are used with DistributionStrategies.
-
-  Returns:
-    Batch size per device.
-
-  Raises:
-    ValueError: if batch_size is not divisible by number of devices
-  """
-  if num_gpus <= 1:
-    return batch_size
-
-  remainder = batch_size % num_gpus
-  if remainder:
-    err = ('When running with multiple GPUs, batch size '
-           'must be a multiple of the number of available GPUs. Found {} '
-           'GPUs with a batch size of {}; try --batch_size={} instead.'
-          ).format(num_gpus, batch_size, batch_size - remainder)
-    raise ValueError(err)
-  return int(batch_size / num_gpus)
-
-
-def run_transformer(flags_obj):
-  """Create tf.Estimator to train and evaluate transformer model.
-
-  Args:
-    flags_obj: Object containing parsed flag values.
-
-  Returns:
-    Dict of results of the run.  Contains the keys `eval_results`,
-    `train_hooks`, `bleu_cased`, and `bleu_uncased`. `train_hooks` is a list the
-    instances of hooks used during training.
-  """
-  num_gpus = flags_core.get_num_gpus(flags_obj)
-
-  # Add flag-defined parameters to params object
-  params = PARAMS_MAP[flags_obj.param_set]
-  if num_gpus > 1:
-    if flags_obj.param_set == "big":
-      params = model_params.BIG_MULTI_GPU_PARAMS
-    elif flags_obj.param_set == "base":
-      params = model_params.BASE_MULTI_GPU_PARAMS
-
-  params["data_dir"] = flags_obj.data_dir
-  params["model_dir"] = flags_obj.model_dir
-  params["num_parallel_calls"] = flags_obj.num_parallel_calls
-
-  params["tpu"] = flags_obj.tpu
-  params["use_tpu"] = bool(flags_obj.tpu)  # was a tpu specified.
-  params["static_batch"] = flags_obj.static_batch or params["use_tpu"]
-  params["allow_ffn_pad"] = not params["use_tpu"]
-
-  params["max_length"] = flags_obj.max_length or params["max_length"]
-
-  params["use_synthetic_data"] = flags_obj.use_synthetic_data
-
-  # Set batch size parameter, which depends on the availability of
-  # TPU and GPU, and distribution settings.
-  params["batch_size"] = (flags_obj.batch_size or (
-      params["default_batch_size_tpu"] if params["use_tpu"]
-      else params["default_batch_size"]))
-
-  total_batch_size = params["batch_size"]
-  if not params["use_tpu"]:
-    params["batch_size"] = per_replica_batch_size(params["batch_size"],
-                                                  num_gpus)
-
-  schedule_manager = schedule.Manager(
-      train_steps=flags_obj.train_steps,
-      steps_between_evals=flags_obj.steps_between_evals,
-      train_epochs=flags_obj.train_epochs,
-      epochs_between_evals=flags_obj.epochs_between_evals,
-      default_train_epochs=DEFAULT_TRAIN_EPOCHS,
-      batch_size=params["batch_size"],
-      max_length=params["max_length"],
-      use_tpu=params["use_tpu"],
-      num_tpu_shards=flags_obj.num_tpu_shards
-  )
-
-  params["repeat_dataset"] = schedule_manager.repeat_dataset
-
-  model_helpers.apply_clean(flags.FLAGS)
-
-  # Create hooks that log information about the training and metric values
-  train_hooks = hooks_helper.get_train_hooks(
-      flags_obj.hooks,
-      model_dir=flags_obj.model_dir,
-      tensors_to_log=TENSORS_TO_LOG,  # used for logging hooks
-      batch_size=total_batch_size,  # for ExamplesPerSecondHook
-      use_tpu=params["use_tpu"]  # Not all hooks can run with TPUs
-  )
-  benchmark_logger = logger.get_benchmark_logger()
-  benchmark_logger.log_run_info(
-      model_name="transformer",
-      dataset_name="wmt_translate_ende",
-      run_params=params,
-      test_id=flags_obj.benchmark_test_id)
-
-  # Train and evaluate transformer model
-  estimator = construct_estimator(flags_obj, params, schedule_manager)
-  stats = run_loop(
-      estimator=estimator,
-      # Training arguments
-      schedule_manager=schedule_manager,
-      train_hooks=train_hooks,
-      benchmark_logger=benchmark_logger,
-      # BLEU calculation arguments
-      bleu_source=flags_obj.bleu_source,
-      bleu_ref=flags_obj.bleu_ref,
-      bleu_threshold=flags_obj.stop_threshold,
-      vocab_file=flags_obj.vocab_file)
-
-  if flags_obj.export_dir and not params["use_tpu"]:
-    serving_input_fn = export.build_tensor_serving_input_receiver_fn(
-        shape=[None], dtype=tf.int64, batch_size=None)
-    # Export saved model, and save the vocab file as an extra asset. The vocab
-    # file is saved to allow consistent input encoding and output decoding.
-    # (See the "Export trained model" section in the README for an example of
-    # how to use the vocab file.)
-    # Since the model itself does not use the vocab file, this file is saved as
-    # an extra asset rather than a core asset.
-    estimator.export_savedmodel(
-        flags_obj.export_dir, serving_input_fn,
-        assets_extra={"vocab.txt": flags_obj.vocab_file},
-        strip_default_attrs=True)
-  return stats
-
-
-def main(_):
-  with logger.benchmark_context(flags.FLAGS):
-    run_transformer(flags.FLAGS)
-
-
-if __name__ == "__main__":
-  tf.logging.set_verbosity(tf.logging.INFO)
-  define_transformer_flags()
-  absl_app.run(main)
--- a/official/r1/transformer/translate.py
+++ b/official/r1/transformer/translate.py
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Translate text or files using trained transformer model."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-
-# pylint: disable=g-bad-import-order
-from absl import app as absl_app
-from absl import flags
-import tensorflow.compat.v1 as tf
-# pylint: enable=g-bad-import-order
-
-from official.nlp.transformer.utils import tokenizer
-from official.utils.flags import core as flags_core
-
-_DECODE_BATCH_SIZE = 32
-_EXTRA_DECODE_LENGTH = 100
-_BEAM_SIZE = 4
-_ALPHA = 0.6
-
-
-def _get_sorted_inputs(filename):
-  """Read and sort lines from the file sorted by decreasing length.
-
-  Args:
-    filename: String name of file to read inputs from.
-  Returns:
-    Sorted list of inputs, and dictionary mapping original index->sorted index
-    of each element.
-  """
-  with tf.io.gfile.GFile(filename) as f:
-    records = f.read().split("\n")
-    inputs = [record.strip() for record in records]
-    if not inputs[-1]:
-      inputs.pop()
-
-  input_lens = [(i, len(line.split())) for i, line in enumerate(inputs)]
-  sorted_input_lens = sorted(input_lens, key=lambda x: x[1], reverse=True)
-
-  sorted_inputs = [None] * len(sorted_input_lens)
-  sorted_keys = [0] * len(sorted_input_lens)
-  for i, (index, _) in enumerate(sorted_input_lens):
-    sorted_inputs[i] = inputs[index]
-    sorted_keys[index] = i
-
-  return sorted_inputs, sorted_keys
-
-
-def _encode_and_add_eos(line, subtokenizer):
-  """Encode line with subtokenizer, and add EOS id to the end."""
-  return subtokenizer.encode(line) + [tokenizer.EOS_ID]
-
-
-def _trim_and_decode(ids, subtokenizer):
-  """Trim EOS and PAD tokens from ids, and decode to return a string."""
-  try:
-    index = list(ids).index(tokenizer.EOS_ID)
-    return subtokenizer.decode(ids[:index])
-  except ValueError:  # No EOS found in sequence
-    return subtokenizer.decode(ids)
-
-
-def translate_file(
-    estimator, subtokenizer, input_file, output_file=None,
-    print_all_translations=True):
-  """Translate lines in file, and save to output file if specified.
-
-  Args:
-    estimator: tf.Estimator used to generate the translations.
-    subtokenizer: Subtokenizer object for encoding and decoding source and
-       translated lines.
-    input_file: file containing lines to translate
-    output_file: file that stores the generated translations.
-    print_all_translations: If true, all translations are printed to stdout.
-
-  Raises:
-    ValueError: if output file is invalid.
-  """
-  batch_size = _DECODE_BATCH_SIZE
-
-  # Read and sort inputs by length. Keep dictionary (original index-->new index
-  # in sorted list) to write translations in the original order.
-  sorted_inputs, sorted_keys = _get_sorted_inputs(input_file)
-  num_decode_batches = (len(sorted_inputs) - 1) // batch_size + 1
-
-  def input_generator():
-    """Yield encoded strings from sorted_inputs."""
-    for i, line in enumerate(sorted_inputs):
-      if i % batch_size == 0:
-        batch_num = (i // batch_size) + 1
-
-        tf.logging.info("Decoding batch %d out of %d." %
-                        (batch_num, num_decode_batches))
-      yield _encode_and_add_eos(line, subtokenizer)
-
-  def input_fn():
-    """Created batched dataset of encoded inputs."""
-    ds = tf.data.Dataset.from_generator(
-        input_generator, tf.int64, tf.TensorShape([None]))
-    ds = ds.padded_batch(batch_size, [None])
-    return ds
-
-  translations = []
-  for i, prediction in enumerate(estimator.predict(input_fn)):
-    translation = _trim_and_decode(prediction["outputs"], subtokenizer)
-    translations.append(translation)
-
-    if print_all_translations:
-      tf.logging.info("Translating:\n\tInput: %s\n\tOutput: %s" %
-                      (sorted_inputs[i], translation))
-
-  # Write translations in the order they appeared in the original file.
-  if output_file is not None:
-    if tf.io.gfile.isdir(output_file):
-      raise ValueError("File output is a directory, will not save outputs to "
-                       "file.")
-    tf.logging.info("Writing to file %s" % output_file)
-    with tf.io.gfile.GFile(output_file, "w") as f:
-      for i in sorted_keys:
-        f.write("%s\n" % translations[i])
-
-
-def translate_text(estimator, subtokenizer, txt):
-  """Translate a single string."""
-  encoded_txt = _encode_and_add_eos(txt, subtokenizer)
-
-  def input_fn():
-    ds = tf.data.Dataset.from_tensors(encoded_txt)
-    ds = ds.batch(_DECODE_BATCH_SIZE)
-    return ds
-
-  predictions = estimator.predict(input_fn)
-  translation = next(predictions)["outputs"]
-  translation = _trim_and_decode(translation, subtokenizer)
-  tf.logging.info("Translation of \"%s\": \"%s\"" % (txt, translation))
-
-
-def main(unused_argv):
-  from official.transformer import transformer_main
-
-  tf.logging.set_verbosity(tf.logging.INFO)
-
-  if FLAGS.text is None and FLAGS.file is None:
-    tf.logging.warn("Nothing to translate. Make sure to call this script using "
-                    "flags --text or --file.")
-    return
-
-  subtokenizer = tokenizer.Subtokenizer(FLAGS.vocab_file)
-
-  # Set up estimator and params
-  params = transformer_main.PARAMS_MAP[FLAGS.param_set]
-  params["beam_size"] = _BEAM_SIZE
-  params["alpha"] = _ALPHA
-  params["extra_decode_length"] = _EXTRA_DECODE_LENGTH
-  params["batch_size"] = _DECODE_BATCH_SIZE
-  estimator = tf.estimator.Estimator(
-      model_fn=transformer_main.model_fn, model_dir=FLAGS.model_dir,
-      params=params)
-
-  if FLAGS.text is not None:
-    tf.logging.info("Translating text: %s" % FLAGS.text)
-    translate_text(estimator, subtokenizer, FLAGS.text)
-
-  if FLAGS.file is not None:
-    input_file = os.path.abspath(FLAGS.file)
-    tf.logging.info("Translating file: %s" % input_file)
-    if not tf.gfile.Exists(FLAGS.file):
-      raise ValueError("File does not exist: %s" % input_file)
-
-    output_file = None
-    if FLAGS.file_out is not None:
-      output_file = os.path.abspath(FLAGS.file_out)
-      tf.logging.info("File output specified: %s" % output_file)
-
-    translate_file(estimator, subtokenizer, input_file, output_file)
-
-
-def define_translate_flags():
-  """Define flags used for translation script."""
-  # Model flags
-  flags.DEFINE_string(
-      name="model_dir", short_name="md", default="/tmp/transformer_model",
-      help=flags_core.help_wrap(
-          "Directory containing Transformer model checkpoints."))
-  flags.DEFINE_enum(
-      name="param_set", short_name="mp", default="big",
-      enum_values=["base", "big"],
-      help=flags_core.help_wrap(
-          "Parameter set to use when creating and training the model. The "
-          "parameters define the input shape (batch size and max length), "
-          "model configuration (size of embedding, # of hidden layers, etc.), "
-          "and various other settings. The big parameter set increases the "
-          "default batch size, embedding/hidden size, and filter size. For a "
-          "complete list of parameters, please see model/model_params.py."))
-  flags.DEFINE_string(
-      name="vocab_file", short_name="vf", default=None,
-      help=flags_core.help_wrap(
-          "Path to subtoken vocabulary file. If data_download.py was used to "
-          "download and encode the training data, look in the data_dir to find "
-          "the vocab file."))
-  flags.mark_flag_as_required("vocab_file")
-
-  flags.DEFINE_string(
-      name="text", default=None,
-      help=flags_core.help_wrap(
-          "Text to translate. Output will be printed to console."))
-  flags.DEFINE_string(
-      name="file", default=None,
-      help=flags_core.help_wrap(
-          "File containing text to translate. Translation will be printed to "
-          "console and, if --file_out is provided, saved to an output file."))
-  flags.DEFINE_string(
-      name="file_out", default=None,
-      help=flags_core.help_wrap(
-          "If --file flag is specified, save translation to this file."))
-
-
-if __name__ == "__main__":
-  define_translate_flags()
-  FLAGS = flags.FLAGS
-  absl_app.run(main)