Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
ModelZoo
ResNet50_tensorflow
Commits
0cceabfc
Unverified
Commit
0cceabfc
authored
Aug 03, 2020
by
Yiming Shi
Committed by
GitHub
Aug 03, 2020
Browse files
Merge branch 'master' into move_to_keraslayers_fasterrcnn_fpn_keras_feature_extractor
parents
17821c0d
39ee0ac9
Changes
339
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
0 additions
and
2863 deletions
+0
-2863
official/r1/transformer/__init__.py
official/r1/transformer/__init__.py
+0
-0
official/r1/transformer/attention_layer.py
official/r1/transformer/attention_layer.py
+0
-148
official/r1/transformer/dataset.py
official/r1/transformer/dataset.py
+0
-284
official/r1/transformer/embedding_layer.py
official/r1/transformer/embedding_layer.py
+0
-108
official/r1/transformer/ffn_layer.py
official/r1/transformer/ffn_layer.py
+0
-89
official/r1/transformer/schedule.py
official/r1/transformer/schedule.py
+0
-130
official/r1/transformer/schedule_test.py
official/r1/transformer/schedule_test.py
+0
-84
official/r1/transformer/transformer.py
official/r1/transformer/transformer.py
+0
-417
official/r1/transformer/transformer_main.py
official/r1/transformer/transformer_main.py
+0
-710
official/r1/transformer/translate.py
official/r1/transformer/translate.py
+0
-237
official/r1/utils/__init__.py
official/r1/utils/__init__.py
+0
-0
official/r1/utils/data/__init__.py
official/r1/utils/data/__init__.py
+0
-0
official/r1/utils/data/file_io.py
official/r1/utils/data/file_io.py
+0
-207
official/r1/utils/data/file_io_test.py
official/r1/utils/data/file_io_test.py
+0
-197
official/r1/utils/export.py
official/r1/utils/export.py
+0
-49
official/r1/utils/export_test.py
official/r1/utils/export_test.py
+0
-63
official/r1/utils/logs/__init__.py
official/r1/utils/logs/__init__.py
+0
-0
official/r1/utils/logs/cloud_lib.py
official/r1/utils/logs/cloud_lib.py
+0
-34
official/r1/utils/logs/cloud_lib_test.py
official/r1/utils/logs/cloud_lib_test.py
+0
-48
official/r1/utils/logs/guidelines.md
official/r1/utils/logs/guidelines.md
+0
-58
No files found.
Too many changes to show.
To preserve performance only
339 of 339+
files are displayed.
Plain diff
Email patch
official/r1/transformer/__init__.py
deleted
100644 → 0
View file @
17821c0d
official/r1/transformer/attention_layer.py
deleted
100644 → 0
View file @
17821c0d
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Implementation of multiheaded attention and self-attention layers."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
tensorflow.compat.v1
as
tf
class Attention(tf.layers.Layer):
  """Multi-headed attention layer."""

  def __init__(self, hidden_size, num_heads, attention_dropout, train):
    """Creates the projection layers for multi-headed attention.

    Args:
      hidden_size: Output dimension of the layer; must be divisible by
        num_heads.
      num_heads: Number of attention heads.
      attention_dropout: Dropout rate applied to the attention weights while
        training.
      train: Whether the layer is used in training mode (enables dropout).

    Raises:
      ValueError: If hidden_size is not evenly divisible by num_heads.
    """
    if hidden_size % num_heads != 0:
      raise ValueError(
          "Hidden size must be evenly divisible by the number of heads.")

    super(Attention, self).__init__()
    self.hidden_size = hidden_size
    self.num_heads = num_heads
    self.attention_dropout = attention_dropout
    self.train = train

    # Bias-free linear projections for queries, keys, values, and the final
    # output transform.
    self.q_dense_layer = tf.layers.Dense(hidden_size, use_bias=False, name="q")
    self.k_dense_layer = tf.layers.Dense(hidden_size, use_bias=False, name="k")
    self.v_dense_layer = tf.layers.Dense(hidden_size, use_bias=False, name="v")
    self.output_dense_layer = tf.layers.Dense(
        hidden_size, use_bias=False, name="output_transform")

  def split_heads(self, x):
    """Split x into different heads, and transpose the resulting value.

    The tensor is transposed so the inner dimensions hold the correct values
    during the matrix multiplication.

    Args:
      x: A tensor with shape [batch_size, length, hidden_size]

    Returns:
      A tensor with shape [batch_size, num_heads, length, hidden_size/num_heads]
    """
    with tf.name_scope("split_heads"):
      n_batch = tf.shape(x)[0]
      seq_len = tf.shape(x)[1]
      # Per-head depth after the hidden dimension is split.
      per_head = self.hidden_size // self.num_heads
      reshaped = tf.reshape(x, [n_batch, seq_len, self.num_heads, per_head])
      # [batch, length, heads, depth] -> [batch, heads, length, depth]
      return tf.transpose(reshaped, [0, 2, 1, 3])

  def combine_heads(self, x):
    """Combine a tensor that has been split into heads.

    Args:
      x: A tensor [batch_size, num_heads, length, hidden_size/num_heads]

    Returns:
      A tensor with shape [batch_size, length, hidden_size]
    """
    with tf.name_scope("combine_heads"):
      n_batch = tf.shape(x)[0]
      seq_len = tf.shape(x)[2]
      # [batch, heads, length, depth] -> [batch, length, heads, depth]
      merged = tf.transpose(x, [0, 2, 1, 3])
      return tf.reshape(merged, [n_batch, seq_len, self.hidden_size])

  def call(self, x, y, bias, cache=None):
    """Apply the attention mechanism to x and y.

    Args:
      x: a tensor with shape [batch_size, length_x, hidden_size]
      y: a tensor with shape [batch_size, length_y, hidden_size]
      bias: attention bias added to the result of the dot product.
      cache: (Used during prediction) dictionary with tensors containing the
        results of previous attentions. The dictionary must have the items:
            {"k": tensor with shape [batch_size, i, key_channels],
             "v": tensor with shape [batch_size, i, value_channels]}
        where i is the current decoded length.

    Returns:
      Attention layer output with shape [batch_size, length_x, hidden_size]
    """
    # Project queries from x and keys/values from y; the projections are then
    # split into multiple heads.
    q = self.q_dense_layer(x)
    k = self.k_dense_layer(y)
    v = self.v_dense_layer(y)

    if cache is not None:
      # Prepend previously decoded keys/values and write the result back into
      # the cache for the next decoding step.
      k = tf.concat([cache["k"], k], axis=1)
      v = tf.concat([cache["v"], v], axis=1)
      cache["k"] = k
      cache["v"] = v

    q = self.split_heads(q)
    k = self.split_heads(k)
    v = self.split_heads(v)

    # Scale q so the q.k dot products do not grow with the head depth.
    per_head = self.hidden_size // self.num_heads
    q *= per_head ** -0.5

    # Scaled dot-product attention.
    logits = tf.matmul(q, k, transpose_b=True)
    logits += bias
    weights = tf.nn.softmax(logits, name="attention_weights")
    if self.train:
      # tf.nn.dropout's second positional arg is keep_prob in tf.compat.v1.
      weights = tf.nn.dropout(weights, 1.0 - self.attention_dropout)
    attention_output = tf.matmul(weights, v)

    # Recombine heads --> [batch_size, length, hidden_size], then run the
    # result through the final linear projection.
    attention_output = self.combine_heads(attention_output)
    attention_output = self.output_dense_layer(attention_output)
    return attention_output
class SelfAttention(Attention):
  """Multiheaded self-attention layer."""

  def call(self, x, bias, cache=None):
    # Self-attention: queries, keys, and values are all derived from `x`.
    return super(SelfAttention, self).call(x, x, bias, cache)
official/r1/transformer/dataset.py
deleted
100644 → 0
View file @
17821c0d
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Input pipeline for the transformer model to read, filter, and batch examples.
Two things to note in the pipeline:
1. Batching scheme
The examples encoded in the TFRecord files contain data in the format:
{"inputs": [variable length array of integers],
"targets": [variable length array of integers]}
Where integers in the arrays refer to tokens in the English and German vocab
file (named `vocab.ende.32768`).
Prior to batching, elements in the dataset are grouped by length (max between
"inputs" and "targets" length). Each group is then batched such that:
group_batch_size * length <= batch_size.
Another way to view batch_size is the maximum number of tokens in each batch.
Once batched, each element in the dataset will have the shape:
{"inputs": [group_batch_size, padded_input_length],
"targets": [group_batch_size, padded_target_length]}
Lengths are padded to the longest "inputs" or "targets" sequence in the batch
(padded_input_length and padded_target_length can be different).
This batching scheme decreases the fraction of padding tokens per training
batch, thus improving the training speed significantly.
2. Shuffling
While training, the dataset is shuffled in two places in the code. The first
is the list of training files. Second, while reading records using
`parallel_interleave`, the `sloppy` argument is used to generate randomness
in the order of the examples.
"""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
math
import
os
import
tensorflow.compat.v1
as
tf
from
official.utils.misc
import
model_helpers
# Buffer size for reading records from a TFRecord file. Each training file is
# 7.2 MB, so 8 MB allows an entire file to be kept in memory.
_READ_RECORD_BUFFER
=
8
*
1000
*
1000
# Example grouping constants. Defines length boundaries for each group.
# These values are the defaults used in Tensor2Tensor.
_MIN_BOUNDARY
=
8
_BOUNDARY_SCALE
=
1.1
def _load_records(filename):
  """Read a TFRecord file and return a dataset of serialized tf.Examples."""
  return tf.data.TFRecordDataset(filename, buffer_size=_READ_RECORD_BUFFER)
def _parse_example(serialized_example):
  """Return inputs and targets Tensors from a serialized tf.Example."""
  # Both features are variable-length int64 token-id sequences.
  data_fields = {
      key: tf.VarLenFeature(tf.int64) for key in ("inputs", "targets")
  }
  parsed = tf.parse_single_example(serialized_example, data_fields)
  # VarLenFeature yields sparse tensors; densify before returning.
  return (tf.sparse_tensor_to_dense(parsed["inputs"]),
          tf.sparse_tensor_to_dense(parsed["targets"]))
def _filter_max_length(example, max_length=256):
  """Indicates whether the example's length is lower than the maximum length."""
  inputs_ok = tf.size(example[0]) <= max_length
  targets_ok = tf.size(example[1]) <= max_length
  return tf.logical_and(inputs_ok, targets_ok)
def _get_example_length(example):
  """Returns the maximum length between the example inputs and targets."""
  input_len = tf.shape(example[0])[0]
  target_len = tf.shape(example[1])[0]
  return tf.maximum(input_len, target_len)
def _create_min_max_boundaries(
    max_length, min_boundary=_MIN_BOUNDARY, boundary_scale=_BOUNDARY_SCALE):
  """Create min and max boundary lists up to max_length.

  For example, when max_length=24, min_boundary=4 and boundary_scale=2, the
  returned values will be:
    buckets_min = [0, 4, 8, 16]
    buckets_max = [4, 8, 16, 25]
  (The loop stops appending boundaries once the running boundary reaches
  max_length, so the last interior boundary here is 16, not 24.)

  Args:
    max_length: The maximum length of example in dataset.
    min_boundary: Minimum length in boundary.
    boundary_scale: Amount to scale consecutive boundaries in the list.

  Returns:
    min and max boundary lists
  """
  # Create bucket boundaries list by scaling the previous boundary or adding 1
  # (to ensure increasing boundary sizes).
  bucket_boundaries = []
  x = min_boundary
  while x < max_length:
    bucket_boundaries.append(x)
    x = max(x + 1, int(x * boundary_scale))

  # Create min and max boundary lists from the initial list. buckets_max is
  # capped at max_length + 1 so examples of exactly max_length fit in the
  # final bucket.
  buckets_min = [0] + bucket_boundaries
  buckets_max = bucket_boundaries + [max_length + 1]
  return buckets_min, buckets_max
def _batch_examples(dataset, batch_size, max_length):
  """Group examples by similar lengths, and return batched dataset.

  Each batch of similar-length examples is padded to the same length, and
  batches may have different numbers of elements, such that:
    group_batch_size * padded_length <= batch_size.
  This decreases the number of padding tokens per batch, which improves the
  training speed.

  Args:
    dataset: Dataset of unbatched examples.
    batch_size: Max number of tokens per batch of examples.
    max_length: Max number of tokens in an example input or target sequence.

  Returns:
    Dataset of batched examples with similar lengths.
  """
  # Bucket boundaries such that an example with bucket_id satisfies
  #   buckets_min[bucket_id] <= len(example) < buckets_max[bucket_id].
  # Using both min and max lists improves the performance.
  buckets_min, buckets_max = _create_min_max_boundaries(max_length)

  # Per-bucket batch sizes so that
  #   bucket_batch_sizes[bucket_id] * buckets_max[bucket_id] <= batch_size.
  # bucket_id will be a tensor, so this list is converted to a tensor too.
  bucket_batch_sizes = tf.constant(
      [batch_size // upper for upper in buckets_max], dtype=tf.int64)

  def example_to_bucket_id(example_input, example_target):
    """Return int64 bucket id for this example, calculated based on length."""
    seq_length = _get_example_length((example_input, example_target))
    # TODO: investigate whether removing code branching improves performance.
    in_bucket = tf.logical_and(
        tf.less_equal(buckets_min, seq_length),
        tf.less(seq_length, buckets_max))
    return tf.reduce_min(tf.where(in_bucket))

  def window_size_fn(bucket_id):
    """Return number of examples to be grouped when given a bucket id."""
    return bucket_batch_sizes[bucket_id]

  def batching_fn(bucket_id, grouped_dataset):
    """Batch and add padding to a dataset of elements with similar lengths."""
    # Pad each batch so all input sequences share a length and all target
    # sequences share a (possibly different) length.
    return grouped_dataset.padded_batch(
        window_size_fn(bucket_id), ([None], [None]))

  return dataset.apply(
      tf.data.experimental.group_by_window(
          key_func=example_to_bucket_id,
          reduce_func=batching_fn,
          window_size=None,
          window_size_func=window_size_fn))
def _read_and_batch_from_files(
    file_pattern, batch_size, max_length, num_parallel_calls, shuffle, repeat,
    static_batch=False):
  """Create dataset where each item is a dict of "inputs" and "targets".

  Args:
    file_pattern: String used to match the input TFRecord files.
    batch_size: Maximum number of tokens per batch of examples
    max_length: Maximum number of tokens per example
    num_parallel_calls: Number of cpu cores for parallel input processing.
    shuffle: If true, randomizes order of elements.
    repeat: Number of times to repeat the dataset. If None, the dataset is
      repeated forever.
    static_batch: Whether the batches in the dataset should have static shapes.
      If True, every batch has shape [batch_size // max_length, max_length].
      If False, the input is grouped by length and batched into shapes [N, M]
      where N * M <= batch_size and M <= max_length. Dynamic shapes minimize
      padding and help training; static shapes are required on TPU.

  Returns:
    tf.data.Dataset object containing examples loaded from the files.
  """
  dataset = tf.data.Dataset.list_files(file_pattern, shuffle=shuffle)

  # Interleave records from the files. When training (`shuffle` True), `sloppy`
  # makes the example order non-deterministic, which adds shuffling.
  dataset = dataset.apply(
      tf.data.experimental.parallel_interleave(
          _load_records, sloppy=shuffle, cycle_length=num_parallel_calls))

  # Parse each tf.Example into a pair of dense tensors.
  # TODO: Look into prefetch_input_elements for performance optimization.
  dataset = dataset.map(_parse_example, num_parallel_calls=num_parallel_calls)

  # Drop examples whose input or target exceeds the maximum length.
  dataset = dataset.filter(
      lambda x, y: _filter_max_length((x, y), max_length))

  if static_batch:
    dataset = dataset.padded_batch(
        batch_size // max_length, ([max_length], [max_length]),
        drop_remainder=True)
  else:
    # Group and batch such that each batch has examples of similar length.
    dataset = _batch_examples(dataset, batch_size, max_length)

  dataset = dataset.repeat(repeat)

  # Prefetch the next element to improve speed of input pipeline.
  return dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
def _generate_synthetic_data(params):
  """Create synthetic data based on the parameter batch size."""
  # Use a square batch: side length is sqrt(batch_size) tokens.
  side = int(math.sqrt(params["batch_size"]))
  shape = tf.TensorShape([side, side])
  return model_helpers.generate_synthetic_data(
      input_shape=shape,
      input_value=1,
      input_dtype=tf.int32,
      label_shape=shape,
      label_value=1,
      label_dtype=tf.int32,
  )
def train_input_fn(params):
  """Load and return dataset of batched examples for use during training."""
  if params["use_synthetic_data"]:
    return _generate_synthetic_data(params)
  file_pattern = os.path.join(params["data_dir"] or "", "*train*")
  return _read_and_batch_from_files(
      file_pattern, params["batch_size"], params["max_length"],
      params["num_parallel_calls"], shuffle=True,
      repeat=params["repeat_dataset"], static_batch=params["static_batch"])
def eval_input_fn(params):
  """Load and return dataset of batched examples for use during evaluation."""
  if params["use_synthetic_data"]:
    return _generate_synthetic_data(params)
  file_pattern = os.path.join(params["data_dir"] or "", "*dev*")
  # Evaluation reads the data in order, exactly once.
  return _read_and_batch_from_files(
      file_pattern, params["batch_size"], params["max_length"],
      params["num_parallel_calls"], shuffle=False,
      repeat=1, static_batch=params["static_batch"])
official/r1/transformer/embedding_layer.py
deleted
100644 → 0
View file @
17821c0d
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Implementation of embedding layer with shared weights."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
tensorflow.compat.v1
as
tf
# pylint: disable=g-bad-import-order
from
official.r1.utils
import
tpu
as
tpu_utils
class EmbeddingSharedWeights(tf.layers.Layer):
  """Calculates input embeddings and pre-softmax linear with shared weights."""

  def __init__(self, vocab_size, hidden_size, method="gather"):
    """Specify characteristic parameters of embedding layer.

    Args:
      vocab_size: Number of tokens in the embedding. (Typically ~32,000)
      hidden_size: Dimensionality of the embedding. (Typically 512 or 1024)
      method: Strategy for performing embedding lookup. "gather" uses tf.gather
        which performs well on CPUs and GPUs, but very poorly on TPUs. "matmul"
        one-hot encodes the indices and formulates the embedding as a sparse
        matrix multiplication. The matmul formulation is wasteful as it does
        extra work, however matrix multiplication is very fast on TPUs which
        makes "matmul" considerably faster than "gather" on TPUs.

    Raises:
      ValueError: If method is not "gather" or "matmul".
    """
    super(EmbeddingSharedWeights, self).__init__()
    self.vocab_size = vocab_size
    self.hidden_size = hidden_size
    if method not in ("gather", "matmul"):
      raise ValueError("method {} must be 'gather' or 'matmul'".format(method))
    self.method = method

  def build(self, _):
    # AUTO_REUSE lets the embedding and the pre-softmax projection share the
    # same variable.
    with tf.variable_scope("embedding_and_softmax", reuse=tf.AUTO_REUSE):
      # The random normal initializer was chosen randomly, and works well.
      self.shared_weights = tf.get_variable(
          "weights", [self.vocab_size, self.hidden_size],
          initializer=tf.random_normal_initializer(
              0., self.hidden_size ** -0.5))
    self.built = True

  def call(self, x):
    """Get token embeddings of x.

    Args:
      x: An int64 tensor with shape [batch_size, length]

    Returns:
      float32 tensor with shape [batch_size, length, embedding_size], with
      embeddings at padding (id 0) positions zeroed out.
    """
    with tf.name_scope("embedding"):
      # Binary mask of size [batch_size, length]; 0.0 marks padding tokens.
      mask = tf.to_float(tf.not_equal(x, 0))

      if self.method == "gather":
        embeddings = tf.gather(self.shared_weights, x)
        embeddings *= tf.expand_dims(mask, -1)
      else:  # "matmul"
        # embedding_matmul already zeros out masked positions, so no extra
        # multiplication by the mask is needed here.
        embeddings = tpu_utils.embedding_matmul(
            embedding_table=self.shared_weights,
            values=tf.cast(x, dtype=tf.int32),
            mask=mask
        )

      # Scale embedding by the sqrt of the hidden size
      embeddings *= self.hidden_size ** 0.5
      return embeddings

  def linear(self, x):
    """Computes logits by running x through a linear layer.

    Args:
      x: A float32 tensor with shape [batch_size, length, hidden_size]

    Returns:
      float32 tensor with shape [batch_size, length, vocab_size].
    """
    with tf.name_scope("presoftmax_linear"):
      n_batch = tf.shape(x)[0]
      seq_len = tf.shape(x)[1]

      # Flatten to 2-D, multiply by the transposed embedding table, restore.
      flat = tf.reshape(x, [-1, self.hidden_size])
      logits = tf.matmul(flat, self.shared_weights, transpose_b=True)
      return tf.reshape(logits, [n_batch, seq_len, self.vocab_size])
official/r1/transformer/ffn_layer.py
deleted
100644 → 0
View file @
17821c0d
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Implementation of fully connected network."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
tensorflow.compat.v1
as
tf
class FeedFowardNetwork(tf.layers.Layer):
  """Fully connected feedforward network."""
  # NOTE(review): the class name misspells "Forward" but is kept for
  # compatibility with existing callers.

  def __init__(self, hidden_size, filter_size, relu_dropout, train, allow_pad):
    """Builds the two-layer (filter + output) feedforward network.

    Args:
      hidden_size: Output dimension of the network.
      filter_size: Dimension of the inner (ReLU) layer.
      relu_dropout: Dropout rate applied after the ReLU while training.
      train: Whether the layer is in training mode (enables dropout).
      allow_pad: Whether padding removal in `call` is permitted.
    """
    super(FeedFowardNetwork, self).__init__()
    self.hidden_size = hidden_size
    self.filter_size = filter_size
    self.relu_dropout = relu_dropout
    self.train = train
    self.allow_pad = allow_pad

    self.filter_dense_layer = tf.layers.Dense(
        filter_size, use_bias=True, activation=tf.nn.relu, name="filter_layer")
    self.output_dense_layer = tf.layers.Dense(
        hidden_size, use_bias=True, name="output_layer")

  def call(self, x, padding=None):
    """Return outputs of the feedforward network.

    Args:
      x: tensor with shape [batch_size, length, hidden_size]
      padding: (optional) If set, the padding values are temporarily removed
        from x (provided self.allow_pad is set). The padding values are placed
        back in the output tensor in the same locations.
        shape [batch_size, length]

    Returns:
      Output of the feedforward network.
      tensor with shape [batch_size, length, hidden_size]
    """
    # Ignore the padding argument entirely when padding removal is disabled.
    padding = None if not self.allow_pad else padding

    # Dynamically known shapes, needed to restore padding at the end.
    n_batch = tf.shape(x)[0]
    seq_len = tf.shape(x)[1]

    if padding is not None:
      with tf.name_scope("remove_padding"):
        # Flatten padding to [batch_size*length]; near-zero entries mark
        # non-padding positions.
        flat_padding = tf.reshape(padding, [-1])
        nonpad_ids = tf.to_int32(tf.where(flat_padding < 1e-9))

        # Flatten x and keep only the non-padding rows.
        x = tf.reshape(x, [-1, self.hidden_size])
        x = tf.gather_nd(x, indices=nonpad_ids)

        # Restore rank 3 with a dummy batch dimension of 1.
        x.set_shape([None, self.hidden_size])
        x = tf.expand_dims(x, axis=0)

    output = self.filter_dense_layer(x)
    if self.train:
      # tf.nn.dropout's second positional arg is keep_prob in tf.compat.v1.
      output = tf.nn.dropout(output, 1.0 - self.relu_dropout)
    output = self.output_dense_layer(output)

    if padding is not None:
      with tf.name_scope("re_add_padding"):
        # Drop the dummy batch dimension, scatter rows back to their original
        # positions, and restore the [batch, length, hidden] shape.
        output = tf.squeeze(output, axis=0)
        output = tf.scatter_nd(
            indices=nonpad_ids,
            updates=output,
            shape=[n_batch * seq_len, self.hidden_size]
        )
        output = tf.reshape(output, [n_batch, seq_len, self.hidden_size])
    return output
official/r1/transformer/schedule.py
deleted
100644 → 0
View file @
17821c0d
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Abstract training on a step or epoch basis."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
math
import
tensorflow.compat.v1
as
tf
# Short aliases for the estimator mode keys used throughout this module.
_TRAIN, _EVAL = tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL

NUM_EXAMPLES = {
    tf.estimator.ModeKeys.TRAIN: 4572160,
    # Examples that are too long are filtered out, thus the total is less
    # than the total number of lines:
    #   2399123 + # news-commentary-v12.de-en
    #   1920209 + # commoncrawl.de-en
    #   270769,   # europarl-v7.de-en
    tf.estimator.ModeKeys.EVAL: 3000,  # newstest2013
}
class Manager(object):
  """Container for convenience functions to abstract step or epoch basis.

  Transformer allows users to specify an epoch basis (generally recommended for
  full training) or a number of steps basis (convenient since epochs are rather
  large). TPUs furthermore require a step basis; however epochs are the norm in
  the machine learning community and it is desirable to allow users to specify
  epochs even when running with TPUs, which requires behind-the-scenes
  conversions.

  This container simply groups what are largely mundane checks and conversions
  rather than interspersing them throughout the run loop code.
  """

  def __init__(self, train_steps, steps_between_evals, train_epochs,
               epochs_between_evals, default_train_epochs, batch_size,
               max_length, use_tpu=False, num_tpu_shards=8):
    """Sets up the training schedule from a step or an epoch specification.

    Args:
      train_steps: Total number of training steps, or None for an epoch basis.
      steps_between_evals: Steps per train/eval iteration (step basis only).
      train_epochs: Total number of training epochs, or None for a step basis.
      epochs_between_evals: Epochs per train/eval iteration (epoch basis only).
      default_train_epochs: Epoch count used when neither basis is given.
      batch_size: Number of tokens per batch.
      max_length: Maximum number of tokens per example.
      use_tpu: Whether training runs on TPU (forces step-based conversions).
      num_tpu_shards: Number of TPU shards; the per-shard batch must divide
        evenly when use_tpu is set.

    Raises:
      ValueError: If both train_steps and train_epochs are specified.
    """
    if train_steps and train_epochs:
      # Bug fix: the original message read "Both train_steps or train_epochs
      # were be defined." which was garbled.
      raise ValueError("Both train_steps and train_epochs were defined. "
                       "Only one may be set.")

    # Determine training schedule based on flags.
    if train_steps:
      self.train_eval_iterations = train_steps // steps_between_evals
      self._single_iteration_train_steps = steps_between_evals
      self._single_iteration_train_epochs = None
    else:
      train_epochs = train_epochs or default_train_epochs
      self.train_eval_iterations = train_epochs // epochs_between_evals
      self._single_iteration_train_steps = None
      self._single_iteration_train_epochs = epochs_between_evals

    self.max_length = max_length
    self.batch_size = batch_size
    self.use_tpu = use_tpu
    self.num_tpu_shards = num_tpu_shards
    if self.use_tpu:
      # TPUs do not allow partial batches, so each shard must get an integral
      # number of examples.
      assert (self.batch_size // self.max_length) % self.num_tpu_shards == 0, (
          "batch_size // max_length must be divisible by num_tpu_shards")

  @property
  def single_iteration_train_steps(self):
    """Steps per train iteration; None on a non-TPU epoch basis."""
    if self._single_iteration_train_steps or not self.use_tpu:
      return self._single_iteration_train_steps
    # TPU with an epoch basis: convert epochs to an explicit step count.
    return self.epochs_to_steps(
        num_epochs=self._single_iteration_train_epochs, mode=_TRAIN)

  @property
  def single_iteration_eval_steps(self):
    """Eval steps per iteration; None (full dataset) except on TPU."""
    if not self.use_tpu:
      return None
    return self.epochs_to_steps(num_epochs=1, mode=_EVAL)

  @property
  def train_increment_str(self):
    """Human-readable description of one train iteration's length."""
    if self._single_iteration_train_steps:
      return "{} steps.".format(self._single_iteration_train_steps)
    if not self.use_tpu:
      return "{} epochs.".format(self._single_iteration_train_epochs)
    return "~{} epochs. ({} steps)".format(
        self._single_iteration_train_epochs,
        self.single_iteration_train_steps)

  @property
  def repeat_dataset(self):
    """Number of dataset repeats needed for a single train iteration."""
    if (self._single_iteration_train_epochs is None and
        self._single_iteration_train_steps > NUM_EXAMPLES[_TRAIN]):
      # Step basis with more steps than examples: repeat enough times to
      # avoid exhausting the dataset mid-iteration.
      return math.ceil(self._single_iteration_train_steps /
                       NUM_EXAMPLES[_TRAIN])
    return self._single_iteration_train_epochs

  def epochs_to_steps(self, num_epochs, mode):
    """Converts a number of epochs to a number of training steps.

    TPU only: This function assumes that static_batch is True.

    TPU can not tolerate an OutOfRange error from a dataset. As a result the
    number of examples to be processed must be known ahead of time. TPUs also
    do not allow partial batches, so this function rounds down.

    Args:
      num_epochs: An integer of the number of epochs to convert to steps.
      mode: The estimator ModeKey of the computation

    Returns:
      An integer of the number of equivalent steps rounded down.
    """
    assert self.use_tpu, "epochs_to_steps should only be reached when using TPU"
    total_num_tokens = NUM_EXAMPLES[mode] * self.max_length * num_epochs
    return total_num_tokens // self.batch_size
official/r1/transformer/schedule_test.py
deleted
100644 → 0
View file @
17821c0d
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Test Transformer's schedule manager."""
import
tensorflow.compat.v1
as
tf
from
official.r1.transformer
import
schedule
class ScheduleBaseTester(tf.test.TestCase):
  """Unit tests for schedule.Manager step/epoch bookkeeping."""

  def test_mutual_exclusivity(self):
    # Supplying both a step-based and an epoch-based schedule is an error.
    with self.assertRaises(ValueError):
      schedule.Manager(
          train_steps=100, steps_between_evals=100, train_epochs=2,
          epochs_between_evals=1, default_train_epochs=None, batch_size=2048,
          max_length=256)

  def test_step_basis(self):
    mgr = schedule.Manager(
        train_steps=1000, steps_between_evals=100, train_epochs=None,
        epochs_between_evals=None, default_train_epochs=None, batch_size=2048,
        max_length=256)
    self.assertEqual(mgr.single_iteration_train_steps, 100)
    # Evaluation uses the full set
    self.assertIsNone(mgr.single_iteration_eval_steps)
    self.assertIsNone(mgr.repeat_dataset)

  def test_epoch_basis(self):
    mgr = schedule.Manager(
        train_steps=None, steps_between_evals=None, train_epochs=10,
        epochs_between_evals=2, default_train_epochs=None, batch_size=2048,
        max_length=256)
    # For non-TPU, the estimator relies on dataset exhaustion
    self.assertIsNone(mgr.single_iteration_train_steps)
    self.assertIsNone(mgr.single_iteration_eval_steps)
    self.assertEqual(mgr.repeat_dataset, 2)

  def test_step_basis_tpu(self):
    mgr = schedule.Manager(
        train_steps=1000, steps_between_evals=100, train_epochs=None,
        epochs_between_evals=None, default_train_epochs=None, batch_size=2048,
        max_length=256, use_tpu=True)
    self.assertEqual(mgr.single_iteration_train_steps, 100)
    # num_eval_examples / (batch_size / max_length) == 3000 / (2048 / 256)
    self.assertEqual(mgr.single_iteration_eval_steps, 375)
    self.assertIsNone(mgr.repeat_dataset)

  def test_epoch_basis_tpu(self):
    mgr = schedule.Manager(
        train_steps=None, steps_between_evals=None, train_epochs=10,
        epochs_between_evals=2, default_train_epochs=None, batch_size=2048,
        max_length=256, use_tpu=True)
    self.assertEqual(
        mgr.single_iteration_train_steps,
        schedule.NUM_EXAMPLES[tf.estimator.ModeKeys.TRAIN] * 2 // (2048 / 256))
    # num_eval_examples / (batch_size / max_length) == 3000 / (2048 / 256)
    self.assertEqual(mgr.single_iteration_eval_steps, 375)
    self.assertEqual(mgr.repeat_dataset, 2)
# Standard test entry point: discovers and runs the TestCase above.
if __name__ == "__main__":
  tf.test.main()
official/r1/transformer/transformer.py
deleted
100644 → 0
View file @
17821c0d
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Defines the Transformer model, and its encoder and decoder stacks.
Model paper: https://arxiv.org/pdf/1706.03762.pdf
Transformer model code source: https://github.com/tensorflow/tensor2tensor
"""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
tensorflow.compat.v1
as
tf
from
official.nlp.transformer
import
beam_search_v1
as
beam_search
from
official.nlp.transformer
import
model_utils
from
official.nlp.transformer.utils.tokenizer
import
EOS_ID
from
official.r1.transformer
import
attention_layer
from
official.r1.transformer
import
embedding_layer
from
official.r1.transformer
import
ffn_layer
# Large negative value used as "negative infinity" when masking logits or
# attention scores so that masked positions get ~zero softmax probability.
# NOTE(review): not referenced in the visible portion of this file —
# presumably consumed by bias/mask helpers; confirm before removing.
_NEG_INF = -1e9
class Transformer(object):
  """Transformer model for sequence to sequence data.

  Implemented as described in: https://arxiv.org/pdf/1706.03762.pdf

  The Transformer model consists of an encoder and decoder. The input is an int
  sequence (or a batch of sequences). The encoder produces a continous
  representation, and the decoder uses the encoder output to generate
  probabilities for the output sequence.
  """

  def __init__(self, params, train):
    """Initialize layers to build Transformer model.

    Args:
      params: hyperparameter object defining layer sizes, dropout values, etc.
      train: boolean indicating whether the model is in training mode. Used to
        determine if dropout layers should be added.
    """
    self.train = train
    self.params = params
    # Input embedding and output softmax share one weight matrix. On TPU the
    # "matmul" (one-hot) lookup is used instead of "gather".
    self.embedding_softmax_layer = embedding_layer.EmbeddingSharedWeights(
        params["vocab_size"], params["hidden_size"],
        method="matmul" if params["tpu"] else "gather")
    self.encoder_stack = EncoderStack(params, train)
    self.decoder_stack = DecoderStack(params, train)

  def __call__(self, inputs, targets=None):
    """Calculate target logits or inferred target sequences.

    Args:
      inputs: int tensor with shape [batch_size, input_length].
      targets: None or int tensor with shape [batch_size, target_length].

    Returns:
      If targets is defined, then return logits for each word in the target
      sequence. float tensor with shape [batch_size, target_length, vocab_size]
      If target is none, then generate output sequence one token at a time.
        returns a dictionary {
          output: [batch_size, decoded length]
          score: [batch_size, float]}
    """
    # Variance scaling is used here because it seems to work in many problems.
    # Other reasonable initializers may also work just as well.
    initializer = tf.variance_scaling_initializer(
        self.params["initializer_gain"], mode="fan_avg",
        distribution="uniform")
    with tf.variable_scope("Transformer", initializer=initializer):
      # Calculate attention bias for encoder self-attention and decoder
      # multi-headed attention layers.
      attention_bias = model_utils.get_padding_bias(inputs)

      # Run the inputs through the encoder layer to map the symbol
      # representations to continuous representations.
      encoder_outputs = self.encode(inputs, attention_bias)

      # Generate output sequence if targets is None, or return logits if target
      # sequence is known.
      if targets is None:
        return self.predict(encoder_outputs, attention_bias)
      else:
        logits = self.decode(targets, encoder_outputs, attention_bias)
        return logits

  def encode(self, inputs, attention_bias):
    """Generate continuous representation for inputs.

    Args:
      inputs: int tensor with shape [batch_size, input_length].
      attention_bias: float tensor with shape [batch_size, 1, 1, input_length]

    Returns:
      float tensor with shape [batch_size, input_length, hidden_size]
    """
    with tf.name_scope("encode"):
      # Prepare inputs to the layer stack by adding positional encodings and
      # applying dropout.
      embedded_inputs = self.embedding_softmax_layer(inputs)
      inputs_padding = model_utils.get_padding(inputs)

      with tf.name_scope("add_pos_encoding"):
        length = tf.shape(embedded_inputs)[1]
        pos_encoding = model_utils.get_position_encoding(
            length, self.params["hidden_size"])
        encoder_inputs = embedded_inputs + pos_encoding

      if self.train:
        # tf.nn.dropout takes a *keep* probability in TF1, hence 1 - rate.
        encoder_inputs = tf.nn.dropout(
            encoder_inputs, 1 - self.params["layer_postprocess_dropout"])

      return self.encoder_stack(encoder_inputs, attention_bias, inputs_padding)

  def decode(self, targets, encoder_outputs, attention_bias):
    """Generate logits for each value in the target sequence.

    Args:
      targets: target values for the output sequence.
        int tensor with shape [batch_size, target_length]
      encoder_outputs: continuous representation of input sequence.
        float tensor with shape [batch_size, input_length, hidden_size]
      attention_bias: float tensor with shape [batch_size, 1, 1, input_length]

    Returns:
      float32 tensor with shape [batch_size, target_length, vocab_size]
    """
    with tf.name_scope("decode"):
      # Prepare inputs to decoder layers by shifting targets, adding positional
      # encoding and applying dropout.
      decoder_inputs = self.embedding_softmax_layer(targets)
      with tf.name_scope("shift_targets"):
        # Shift targets to the right, and remove the last element
        # (teacher forcing: position t attends to targets < t).
        decoder_inputs = tf.pad(
            decoder_inputs, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
      with tf.name_scope("add_pos_encoding"):
        length = tf.shape(decoder_inputs)[1]
        decoder_inputs += model_utils.get_position_encoding(
            length, self.params["hidden_size"])
      if self.train:
        decoder_inputs = tf.nn.dropout(
            decoder_inputs, 1 - self.params["layer_postprocess_dropout"])

      # Run values
      decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
          length)
      outputs = self.decoder_stack(
          decoder_inputs, encoder_outputs, decoder_self_attention_bias,
          attention_bias)
      logits = self.embedding_softmax_layer.linear(outputs)
      return logits

  def _get_symbols_to_logits_fn(self, max_decode_length):
    """Returns a decoding function that calculates logits of the next tokens."""

    # Precompute position encodings and the causal bias once, outside the
    # per-step closure.
    timing_signal = model_utils.get_position_encoding(
        max_decode_length + 1, self.params["hidden_size"])
    decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
        max_decode_length)

    def symbols_to_logits_fn(ids, i, cache):
      """Generate logits for next potential IDs.

      Args:
        ids: Current decoded sequences.
          int tensor with shape [batch_size * beam_size, i + 1]
        i: Loop index
        cache: dictionary of values storing the encoder output, encoder-decoder
          attention bias, and previous decoder attention values.

      Returns:
        Tuple of
          (logits with shape [batch_size * beam_size, vocab_size],
           updated cache values)
      """
      # Set decoder input to the last generated IDs
      decoder_input = ids[:, -1:]

      # Preprocess decoder input by getting embeddings and adding timing signal.
      decoder_input = self.embedding_softmax_layer(decoder_input)
      decoder_input += timing_signal[i:i + 1]

      self_attention_bias = decoder_self_attention_bias[:, :, i:i + 1, :i + 1]
      decoder_outputs = self.decoder_stack(
          decoder_input, cache.get("encoder_outputs"), self_attention_bias,
          cache.get("encoder_decoder_attention_bias"), cache)
      logits = self.embedding_softmax_layer.linear(decoder_outputs)
      # Drop the length-1 time axis: each step emits one token's logits.
      logits = tf.squeeze(logits, axis=[1])
      return logits, cache

    return symbols_to_logits_fn

  def predict(self, encoder_outputs, encoder_decoder_attention_bias):
    """Return predicted sequence."""
    batch_size = tf.shape(encoder_outputs)[0]
    input_length = tf.shape(encoder_outputs)[1]
    max_decode_length = input_length + self.params["extra_decode_length"]

    symbols_to_logits_fn = self._get_symbols_to_logits_fn(max_decode_length)

    # Create initial set of IDs that will be passed into symbols_to_logits_fn.
    initial_ids = tf.zeros([batch_size], dtype=tf.int32)

    # Create cache storing decoder attention values for each layer.
    # The length-0 time dimension grows as tokens are decoded.
    cache = {
        "layer_%d" % layer: {
            "k": tf.zeros([batch_size, 0, self.params["hidden_size"]]),
            "v": tf.zeros([batch_size, 0, self.params["hidden_size"]]),
        } for layer in range(self.params["num_hidden_layers"])}

    # Add encoder output and attention bias to the cache.
    cache["encoder_outputs"] = encoder_outputs
    cache["encoder_decoder_attention_bias"] = encoder_decoder_attention_bias

    # Use beam search to find the top beam_size sequences and scores.
    decoded_ids, scores = beam_search.sequence_beam_search(
        symbols_to_logits_fn=symbols_to_logits_fn,
        initial_ids=initial_ids,
        initial_cache=cache,
        vocab_size=self.params["vocab_size"],
        beam_size=self.params["beam_size"],
        alpha=self.params["alpha"],
        max_decode_length=max_decode_length,
        eos_id=EOS_ID)

    # Get the top sequence for each batch element (drop the initial-ID slot).
    top_decoded_ids = decoded_ids[:, 0, 1:]
    top_scores = scores[:, 0]

    return {"outputs": top_decoded_ids, "scores": top_scores}
class LayerNormalization(tf.layers.Layer):
  """Applies layer normalization over the last (hidden) dimension."""

  def __init__(self, hidden_size):
    super(LayerNormalization, self).__init__()
    self.hidden_size = hidden_size

  def build(self, _):
    """Create the learned scale and bias variables."""
    self.scale = tf.get_variable(
        "layer_norm_scale", [self.hidden_size],
        initializer=tf.ones_initializer())
    self.bias = tf.get_variable(
        "layer_norm_bias", [self.hidden_size],
        initializer=tf.zeros_initializer())
    self.built = True

  def call(self, x, epsilon=1e-6):
    """Normalize x over the last axis, then apply scale and shift."""
    mu = tf.reduce_mean(x, axis=[-1], keepdims=True)
    # epsilon guards against division by zero for constant inputs.
    var = tf.reduce_mean(tf.square(x - mu), axis=[-1], keepdims=True)
    normalized = (x - mu) * tf.rsqrt(var + epsilon)
    return normalized * self.scale + self.bias
class PrePostProcessingWrapper(object):
  """Wrapper class that applies layer pre-processing and post-processing."""

  def __init__(self, layer, params, train):
    self.layer = layer
    self.postprocess_dropout = params["layer_postprocess_dropout"]
    self.train = train

    # Create normalization layer
    self.layer_norm = LayerNormalization(params["hidden_size"])

  def __call__(self, x, *args, **kwargs):
    """Run pre-norm -> wrapped layer -> dropout, then add the residual."""
    normalized = self.layer_norm(x)
    output = self.layer(normalized, *args, **kwargs)
    if self.train:
      # TF1 dropout takes a keep probability, hence 1 - rate.
      output = tf.nn.dropout(output, 1 - self.postprocess_dropout)
    return x + output
class EncoderStack(tf.layers.Layer):
  """Transformer encoder stack.

  The encoder stack is made up of N identical layers. Each layer is composed
  of the sublayers:
    1. Self-attention layer
    2. Feedforward network (which is 2 fully-connected layers)
  """

  def __init__(self, params, train):
    super(EncoderStack, self).__init__()
    self.layers = []
    for _ in range(params["num_hidden_layers"]):
      # Each encoder layer is a self-attention sublayer followed by a
      # feed-forward sublayer, both wrapped with pre-norm/dropout/residual.
      attention = attention_layer.SelfAttention(
          params["hidden_size"], params["num_heads"],
          params["attention_dropout"], train)
      ffn = ffn_layer.FeedFowardNetwork(
          params["hidden_size"], params["filter_size"],
          params["relu_dropout"], train, params["allow_ffn_pad"])
      self.layers.append([
          PrePostProcessingWrapper(attention, params, train),
          PrePostProcessingWrapper(ffn, params, train)])

    # Final layer normalization applied to the stack output.
    self.output_normalization = LayerNormalization(params["hidden_size"])

  def call(self, encoder_inputs, attention_bias, inputs_padding):
    """Return the output of the encoder layer stacks.

    Args:
      encoder_inputs: tensor with shape [batch_size, input_length, hidden_size]
      attention_bias: bias for the encoder self-attention layer.
        [batch_size, 1, 1, input_length]
      inputs_padding: padding indicator forwarded to the feed-forward sublayer
        (presumably the float mask from model_utils.get_padding — confirm
        against the caller).

    Returns:
      Output of encoder layer stack.
      float32 tensor with shape [batch_size, input_length, hidden_size]
    """
    for layer_idx, (self_attention, feed_forward) in enumerate(self.layers):
      with tf.variable_scope("layer_%d" % layer_idx):
        with tf.variable_scope("self_attention"):
          encoder_inputs = self_attention(encoder_inputs, attention_bias)
        with tf.variable_scope("ffn"):
          encoder_inputs = feed_forward(encoder_inputs, inputs_padding)

    return self.output_normalization(encoder_inputs)
class DecoderStack(tf.layers.Layer):
  """Transformer decoder stack.

  Like the encoder stack, the decoder stack is made up of N identical layers.
  Each layer is composed of the sublayers:
    1. Self-attention layer
    2. Multi-headed attention layer combining encoder outputs with results from
       the previous self-attention layer.
    3. Feedforward network (2 fully-connected layers)
  """

  def __init__(self, params, train):
    super(DecoderStack, self).__init__()
    self.layers = []
    for _ in range(params["num_hidden_layers"]):
      # Three sublayers per decoder layer, each wrapped with
      # pre-norm/dropout/residual handling.
      self_attn = attention_layer.SelfAttention(
          params["hidden_size"], params["num_heads"],
          params["attention_dropout"], train)
      cross_attn = attention_layer.Attention(
          params["hidden_size"], params["num_heads"],
          params["attention_dropout"], train)
      ffn = ffn_layer.FeedFowardNetwork(
          params["hidden_size"], params["filter_size"],
          params["relu_dropout"], train, params["allow_ffn_pad"])
      self.layers.append([
          PrePostProcessingWrapper(self_attn, params, train),
          PrePostProcessingWrapper(cross_attn, params, train),
          PrePostProcessingWrapper(ffn, params, train)])

    self.output_normalization = LayerNormalization(params["hidden_size"])

  def call(self, decoder_inputs, encoder_outputs, decoder_self_attention_bias,
           attention_bias, cache=None):
    """Return the output of the decoder layer stacks.

    Args:
      decoder_inputs: tensor with shape [batch_size, target_length, hidden_size]
      encoder_outputs: tensor with shape [batch_size, input_length, hidden_size]
      decoder_self_attention_bias: bias for decoder self-attention layer.
        [1, 1, target_len, target_length]
      attention_bias: bias for encoder-decoder attention layer.
        [batch_size, 1, 1, input_length]
      cache: (Used for fast decoding) A nested dictionary storing previous
        decoder self-attention values. The items are:
          {layer_n: {"k": tensor with shape [batch_size, i, key_channels],
                     "v": tensor with shape [batch_size, i, value_channels]},
           ...}

    Returns:
      Output of decoder layer stack.
      float32 tensor with shape [batch_size, target_length, hidden_size]
    """
    for layer_idx, sublayers in enumerate(self.layers):
      self_attention, enc_dec_attention, feed_forward = sublayers
      layer_name = "layer_%d" % layer_idx
      # During incremental decoding each layer gets its own k/v cache entry.
      layer_cache = cache[layer_name] if cache is not None else None
      with tf.variable_scope(layer_name):
        with tf.variable_scope("self_attention"):
          decoder_inputs = self_attention(
              decoder_inputs, decoder_self_attention_bias, cache=layer_cache)
        with tf.variable_scope("encdec_attention"):
          decoder_inputs = enc_dec_attention(
              decoder_inputs, encoder_outputs, attention_bias)
        with tf.variable_scope("ffn"):
          decoder_inputs = feed_forward(decoder_inputs)

    return self.output_normalization(decoder_inputs)
official/r1/transformer/transformer_main.py
deleted
100644 → 0
View file @
17821c0d
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Train and evaluate the Transformer model.
See README for description of setting the training schedule and evaluating the
BLEU score.
"""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
os
import
tempfile
# pylint: disable=g-bad-import-order
from
six.moves
import
xrange
# pylint: disable=redefined-builtin
from
absl
import
app
as
absl_app
from
absl
import
flags
import
tensorflow.compat.v1
as
tf
# pylint: enable=g-bad-import-order
from
official.nlp.transformer
import
model_params
from
official.r1.utils
import
export
from
official.r1.utils
import
tpu
as
tpu_util
from
official.r1.transformer
import
translate
from
official.r1.transformer
import
transformer
from
official.r1.transformer
import
dataset
from
official.r1.transformer
import
schedule
from
official.nlp.transformer
import
compute_bleu
from
official.nlp.transformer.utils
import
metrics
from
official.nlp.transformer.utils
import
tokenizer
from
official.utils.flags
import
core
as
flags_core
from
official.r1.utils.logs
import
hooks_helper
from
official.r1.utils.logs
import
logger
from
official.utils.misc
import
distribution_utils
from
official.utils.misc
import
model_helpers
# Maps the --param_set flag value to a hyperparameter set.
PARAMS_MAP = {
    "tiny": model_params.TINY_PARAMS,
    "base": model_params.BASE_PARAMS,
    "big": model_params.BIG_PARAMS,
}

# Default number of epochs when neither train_steps nor train_epochs is set.
DEFAULT_TRAIN_EPOCHS = 10
# Sentinel iteration count used when training runs until a BLEU threshold.
INF = 1000000000  # 1e9
# Subdirectory (under model_dir) for BLEU summary events.
BLEU_DIR = "bleu"

# Dictionary containing tensors that are logged by the logging hooks. Each item
# maps a string to the tensor name.
TENSORS_TO_LOG = {
    "learning_rate": "model/get_train_op/learning_rate/learning_rate",
    "cross_entropy_loss": "model/cross_entropy"}
def model_fn(features, labels, mode, params):
  """Defines how to train, evaluate and predict from the transformer model.

  Args:
    features: int tensor of input token ids, [batch_size, input_length].
    labels: int tensor of target token ids, or None in PREDICT mode.
    mode: a tf.estimator.ModeKeys value.
    params: dict of hyperparameters and run options (see PARAMS_MAP).

  Returns:
    A tf.estimator.EstimatorSpec (or TPUEstimatorSpec when use_tpu).

  Raises:
    NotImplementedError: in PREDICT mode on TPU.
  """
  with tf.variable_scope("model"):
    inputs, targets = features, labels

    # Create model and get output logits.
    # Dropout is only enabled in TRAIN mode.
    model = transformer.Transformer(params, mode == tf.estimator.ModeKeys.TRAIN)

    logits = model(inputs, targets)

    # When in prediction mode, the labels/targets is None. The model output
    # is the prediction
    if mode == tf.estimator.ModeKeys.PREDICT:
      if params["use_tpu"]:
        raise NotImplementedError("Prediction is not yet supported on TPUs.")
      return tf.estimator.EstimatorSpec(
          tf.estimator.ModeKeys.PREDICT,
          predictions=logits,
          export_outputs={
              "translate": tf.estimator.export.PredictOutput(logits)
          })

    # Explicitly set the shape of the logits for XLA (TPU). This is needed
    # because the logits are passed back to the host VM CPU for metric
    # evaluation, and the shape of [?, ?, vocab_size] is too vague. However
    # it is known from Transformer that the first two dimensions of logits
    # are the dimensions of targets. Note that the ambiguous shape of logits is
    # not a problem when computing xentropy, because padded_cross_entropy_loss
    # resolves the shape on the TPU.
    logits.set_shape(targets.shape.as_list() + logits.shape.as_list()[2:])

    # Calculate model loss.
    # xentropy contains the cross entropy loss of every nonpadding token in the
    # targets.
    xentropy, weights = metrics.padded_cross_entropy_loss(
        logits, targets, params["label_smoothing"], params["vocab_size"])
    loss = tf.reduce_sum(xentropy) / tf.reduce_sum(weights)

    # Save loss as named tensor that will be logged with the logging hook.
    tf.identity(loss, "cross_entropy")

    if mode == tf.estimator.ModeKeys.EVAL:
      if params["use_tpu"]:
        # host call functions should only have tensors as arguments.
        # This lambda pre-populates params so that metric_fn is
        # TPUEstimator compliant.
        metric_fn = lambda logits, labels: (
            metrics.get_eval_metrics(logits, labels, params=params))
        eval_metrics = (metric_fn, [logits, labels])
        return tf.estimator.tpu.TPUEstimatorSpec(
            mode=mode, loss=loss, predictions={"predictions": logits},
            eval_metrics=eval_metrics)
      return tf.estimator.EstimatorSpec(
          mode=mode, loss=loss, predictions={"predictions": logits},
          eval_metric_ops=metrics.get_eval_metrics(logits, labels, params))
    else:
      train_op, metric_dict = get_train_op_and_metrics(loss, params)

      # Epochs can be quite long. This gives some intermediate information
      # in TensorBoard.
      metric_dict["minibatch_loss"] = loss
      if params["use_tpu"]:
        # TPU summaries must flow through a host_call rather than tf.summary.
        return tf.estimator.tpu.TPUEstimatorSpec(
            mode=mode, loss=loss, train_op=train_op,
            host_call=tpu_util.construct_scalar_host_call(
                metric_dict=metric_dict, model_dir=params["model_dir"],
                prefix="training/"))
      record_scalars(metric_dict)
      return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
def record_scalars(metric_dict):
  """Emit a TensorBoard scalar summary for each entry of metric_dict."""
  for metric_name, metric_tensor in metric_dict.items():
    tf.summary.scalar(name=metric_name, tensor=metric_tensor)
def get_learning_rate(learning_rate, hidden_size, learning_rate_warmup_steps):
  """Calculate learning rate with linear warmup and rsqrt decay."""
  with tf.name_scope("learning_rate"):
    warmup_steps = tf.to_float(learning_rate_warmup_steps)
    step = tf.to_float(tf.train.get_or_create_global_step())

    # Base scale: 1/sqrt(hidden_size), as in the Transformer paper.
    lr = learning_rate * (hidden_size ** -0.5)
    # Apply linear warmup
    lr = lr * tf.minimum(1.0, step / warmup_steps)
    # Apply rsqrt decay
    lr = lr * tf.rsqrt(tf.maximum(step, warmup_steps))

    # Create a named tensor that will be logged using the logging hook.
    # The full name includes variable and names scope. In this case, the name
    # is model/get_train_op/learning_rate/learning_rate
    tf.identity(lr, "learning_rate")

    return lr
def get_train_op_and_metrics(loss, params):
  """Generate training op and metrics to save in TensorBoard.

  Args:
    loss: scalar loss tensor to minimize.
    params: dict of hyperparameters and run options.

  Returns:
    Tuple of (train_op, dict of scalar training metrics for TensorBoard).
  """
  with tf.variable_scope("get_train_op"):
    learning_rate = get_learning_rate(
        learning_rate=params["learning_rate"],
        hidden_size=params["hidden_size"],
        learning_rate_warmup_steps=params["learning_rate_warmup_steps"])

    # Create optimizer. Use LazyAdamOptimizer from TF contrib, which is faster
    # than the TF core Adam optimizer.
    from tensorflow.contrib import opt as contrib_opt  # pylint: disable=g-import-not-at-top
    optimizer = contrib_opt.LazyAdamOptimizer(
        learning_rate,
        beta1=params["optimizer_adam_beta1"],
        beta2=params["optimizer_adam_beta2"],
        epsilon=params["optimizer_adam_epsilon"])

    if params["use_tpu"] and params["tpu"] != tpu_util.LOCAL:
      # Average gradients across TPU shards.
      optimizer = tf.compat.v1.tpu.CrossShardOptimizer(optimizer)

    # Uses automatic mixed precision FP16 training if on GPU.
    if params["dtype"] == "fp16":
      optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
          optimizer)

    # Calculate and apply gradients using LazyAdamOptimizer.
    global_step = tf.train.get_global_step()
    tvars = tf.trainable_variables()
    gradients = optimizer.compute_gradients(
        loss, tvars, colocate_gradients_with_ops=True)
    minimize_op = optimizer.apply_gradients(
        gradients, global_step=global_step, name="train")
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    train_op = tf.group(minimize_op, update_ops)

    train_metrics = {"learning_rate": learning_rate}

    if not params["use_tpu"]:
      # gradient norm is not included as a summary when running on TPU, as
      # it can cause instability between the TPU and the host controller.
      gradient_norm = tf.global_norm(list(zip(*gradients))[0])
      train_metrics["global_norm/gradient_norm"] = gradient_norm

    return train_op, train_metrics
def translate_and_compute_bleu(estimator, subtokenizer, bleu_source, bleu_ref):
  """Translate file and report the cased and uncased bleu scores."""
  # Translations are written to a temporary file that survives close()
  # (delete=False) so bleu_wrapper can re-open it by path.
  translation_path = tempfile.NamedTemporaryFile(delete=False).name

  translate.translate_file(
      estimator, subtokenizer, bleu_source, output_file=translation_path,
      print_all_translations=False)

  # Compute uncased and cased bleu scores.
  uncased_score = compute_bleu.bleu_wrapper(bleu_ref, translation_path, False)
  cased_score = compute_bleu.bleu_wrapper(bleu_ref, translation_path, True)
  os.remove(translation_path)
  return uncased_score, cased_score
def get_global_step(estimator):
  """Return the global step encoded in the estimator's latest checkpoint.

  Checkpoint paths end in "-<step>", e.g. "model.ckpt-1234".
  """
  checkpoint_path = estimator.latest_checkpoint()
  return int(checkpoint_path.rsplit("-", 1)[-1])
def evaluate_and_log_bleu(estimator, bleu_source, bleu_ref, vocab_file):
  """Calculate and record the BLEU score."""
  # Build the subtokenizer used to encode the source sentences.
  sub_tokenizer = tokenizer.Subtokenizer(vocab_file)

  uncased_score, cased_score = translate_and_compute_bleu(
      estimator, sub_tokenizer, bleu_source, bleu_ref)

  tf.logging.info("Bleu score (uncased): %f", uncased_score)
  tf.logging.info("Bleu score (cased): %f", cased_score)

  return uncased_score, cased_score
def _validate_file(filepath):
  """Raise NotFoundError unless filepath exists."""
  if tf.io.gfile.exists(filepath):
    return
  raise tf.errors.NotFoundError(None, None, "File %s not found." % filepath)
def
run_loop
(
estimator
,
schedule_manager
,
train_hooks
=
None
,
benchmark_logger
=
None
,
bleu_source
=
None
,
bleu_ref
=
None
,
bleu_threshold
=
None
,
vocab_file
=
None
):
"""Train and evaluate model, and optionally compute model's BLEU score.
**Step vs. Epoch vs. Iteration**
Steps and epochs are canonical terms used in TensorFlow and general machine
learning. They are used to describe running a single process (train/eval):
- Step refers to running the process through a single or batch of examples.
- Epoch refers to running the process through an entire dataset.
E.g. training a dataset with 100 examples. The dataset is
divided into 20 batches with 5 examples per batch. A single training step
trains the model on one batch. After 20 training steps, the model will have
trained on every batch in the dataset, or, in other words, one epoch.
Meanwhile, iteration is used in this implementation to describe running
multiple processes (training and eval).
- A single iteration:
1. trains the model for a specific number of steps or epochs.
2. evaluates the model.
3. (if source and ref files are provided) compute BLEU score.
This function runs through multiple train+eval+bleu iterations.
Args:
estimator: tf.Estimator containing model to train.
schedule_manager: A schedule.Manager object to guide the run loop.
train_hooks: List of hooks to pass to the estimator during training.
benchmark_logger: a BenchmarkLogger object that logs evaluation data
bleu_source: File containing text to be translated for BLEU calculation.
bleu_ref: File containing reference translations for BLEU calculation.
bleu_threshold: minimum BLEU score before training is stopped.
vocab_file: Path to vocab file that will be used to subtokenize bleu_source.
Returns:
Dict of results of the run. Contains the keys `eval_results`,
`train_hooks`, `bleu_cased`, and `bleu_uncased`. `train_hooks` is a list the
instances of hooks used during training.
Raises:
ValueError: if both or none of single_iteration_train_steps and
single_iteration_train_epochs were defined.
NotFoundError: if the vocab file or bleu files don't exist.
"""
if
bleu_source
:
_validate_file
(
bleu_source
)
if
bleu_ref
:
_validate_file
(
bleu_ref
)
if
vocab_file
:
_validate_file
(
vocab_file
)
evaluate_bleu
=
bleu_source
is
not
None
and
bleu_ref
is
not
None
if
evaluate_bleu
and
schedule_manager
.
use_tpu
:
raise
ValueError
(
"BLEU score can not be computed when training with a TPU, "
"as it requires estimator.predict which is not yet "
"supported."
)
# Print details of training schedule.
tf
.
logging
.
info
(
"Training schedule:"
)
tf
.
logging
.
info
(
"
\t
1. Train for {}"
.
format
(
schedule_manager
.
train_increment_str
))
tf
.
logging
.
info
(
"
\t
2. Evaluate model."
)
if
evaluate_bleu
:
tf
.
logging
.
info
(
"
\t
3. Compute BLEU score."
)
if
bleu_threshold
is
not
None
:
tf
.
logging
.
info
(
"Repeat above steps until the BLEU score reaches %f"
%
bleu_threshold
)
if
not
evaluate_bleu
or
bleu_threshold
is
None
:
tf
.
logging
.
info
(
"Repeat above steps %d times."
%
schedule_manager
.
train_eval_iterations
)
if
evaluate_bleu
:
# Create summary writer to log bleu score (values can be displayed in
# Tensorboard).
bleu_writer
=
tf
.
summary
.
FileWriter
(
os
.
path
.
join
(
estimator
.
model_dir
,
BLEU_DIR
))
if
bleu_threshold
is
not
None
:
# Change loop stopping condition if bleu_threshold is defined.
schedule_manager
.
train_eval_iterations
=
INF
# Loop training/evaluation/bleu cycles
stats
=
{}
for
i
in
xrange
(
schedule_manager
.
train_eval_iterations
):
tf
.
logging
.
info
(
"Starting iteration %d"
%
(
i
+
1
))
# Train the model for single_iteration_train_steps or until the input fn
# runs out of examples (if single_iteration_train_steps is None).
estimator
.
train
(
dataset
.
train_input_fn
,
steps
=
schedule_manager
.
single_iteration_train_steps
,
hooks
=
train_hooks
)
eval_results
=
None
eval_results
=
estimator
.
evaluate
(
input_fn
=
dataset
.
eval_input_fn
,
steps
=
schedule_manager
.
single_iteration_eval_steps
)
tf
.
logging
.
info
(
"Evaluation results (iter %d/%d):"
%
(
i
+
1
,
schedule_manager
.
train_eval_iterations
))
tf
.
logging
.
info
(
eval_results
)
benchmark_logger
.
log_evaluation_result
(
eval_results
)
# The results from estimator.evaluate() are measured on an approximate
# translation, which utilize the target golden values provided. The actual
# bleu score must be computed using the estimator.predict() path, which
# outputs translations that are not based on golden values. The translations
# are compared to reference file to get the actual bleu score.
if
evaluate_bleu
:
uncased_score
,
cased_score
=
evaluate_and_log_bleu
(
estimator
,
bleu_source
,
bleu_ref
,
vocab_file
)
stats
[
"bleu_uncased"
]
=
uncased_score
stats
[
"bleu_cased"
]
=
cased_score
# Write actual bleu scores using summary writer and benchmark logger
global_step
=
get_global_step
(
estimator
)
summary
=
tf
.
Summary
(
value
=
[
tf
.
Summary
.
Value
(
tag
=
"bleu/uncased"
,
simple_value
=
uncased_score
),
tf
.
Summary
.
Value
(
tag
=
"bleu/cased"
,
simple_value
=
cased_score
),
])
bleu_writer
.
add_summary
(
summary
,
global_step
)
bleu_writer
.
flush
()
benchmark_logger
.
log_metric
(
"bleu_uncased"
,
uncased_score
,
global_step
=
global_step
)
benchmark_logger
.
log_metric
(
"bleu_cased"
,
cased_score
,
global_step
=
global_step
)
# Stop training if bleu stopping threshold is met.
if
model_helpers
.
past_stop_threshold
(
bleu_threshold
,
uncased_score
):
bleu_writer
.
close
()
break
stats
[
"eval_results"
]
=
eval_results
stats
[
"train_hooks"
]
=
train_hooks
return
stats
def define_transformer_flags():
  """Add flags and flag validators for running transformer_main."""
  # Add common flags (data_dir, model_dir, train_epochs, etc.).
  flags.DEFINE_integer(
      name="max_length", short_name="ml", default=None,
      help=flags_core.help_wrap("Max length."))

  flags_core.define_base(clean=True, train_epochs=True,
                         epochs_between_evals=True, stop_threshold=True,
                         num_gpu=True, hooks=True, export_dir=True,
                         distribution_strategy=True)
  flags_core.define_performance(
      num_parallel_calls=True,
      inter_op=False,
      intra_op=False,
      synthetic_data=True,
      max_train_steps=False,
      dtype=True,
      all_reduce_alg=True
  )
  flags_core.define_benchmark()
  flags_core.define_device(tpu=True)

  # Set flags from the flags_core module as "key flags" so they're listed when
  # the '-h' flag is used. Without this line, the flags defined above are
  # only shown in the full `--helpful` help text.
  flags.adopt_module_key_flags(flags_core)

  # Add transformer-specific flags
  flags.DEFINE_enum(
      name="param_set", short_name="mp", default="big",
      enum_values=PARAMS_MAP.keys(),
      help=flags_core.help_wrap(
          "Parameter set to use when creating and training the model. The "
          "parameters define the input shape (batch size and max length), "
          "model configuration (size of embedding, # of hidden layers, etc.), "
          "and various other settings. The big parameter set increases the "
          "default batch size, embedding/hidden size, and filter size. For a "
          "complete list of parameters, please see model/model_params.py."))

  flags.DEFINE_bool(
      name="static_batch", default=False,
      help=flags_core.help_wrap(
          "Whether the batches in the dataset should have static shapes. In "
          "general, this setting should be False. Dynamic shapes allow the "
          "inputs to be grouped so that the number of padding tokens is "
          "minimized, and helps model training. In cases where the input shape "
          "must be static (e.g. running on TPU), this setting will be ignored "
          "and static batching will always be used."))

  # Flags for training with steps (may be used for debugging)
  flags.DEFINE_integer(
      name="train_steps", short_name="ts", default=None,
      help=flags_core.help_wrap("The number of steps used to train."))
  flags.DEFINE_integer(
      name="steps_between_evals", short_name="sbe", default=1000,
      help=flags_core.help_wrap(
          "The Number of training steps to run between evaluations. This is "
          "used if --train_steps is defined."))

  # BLEU score computation
  flags.DEFINE_string(
      name="bleu_source", short_name="bls", default=None,
      help=flags_core.help_wrap(
          "Path to source file containing text translate when calculating the "
          "official BLEU score. Both --bleu_source and --bleu_ref must be set. "
          "Use the flag --stop_threshold to stop the script based on the "
          "uncased BLEU score."))
  flags.DEFINE_string(
      name="bleu_ref", short_name="blr", default=None,
      help=flags_core.help_wrap(
          "Path to source file containing text translate when calculating the "
          "official BLEU score. Both --bleu_source and --bleu_ref must be set. "
          "Use the flag --stop_threshold to stop the script based on the "
          "uncased BLEU score."))
  flags.DEFINE_string(
      name="vocab_file", short_name="vf", default=None,
      help=flags_core.help_wrap(
          "Path to subtoken vocabulary file. If data_download.py was used to "
          "download and encode the training data, look in the data_dir to find "
          "the vocab file."))

  flags_core.set_defaults(data_dir="/tmp/translate_ende",
                          model_dir="/tmp/transformer_model",
                          batch_size=None,
                          train_epochs=None)

  # Each validator below returns True when the flag combination is valid;
  # absl aborts startup with `message` when a validator returns False.

  @flags.multi_flags_validator(
      ["train_epochs", "train_steps"],
      message="Both --train_steps and --train_epochs were set. Only one may be "
              "defined.")
  def _check_train_limits(flag_dict):
    # At most one of the two training-duration flags may be set.
    return flag_dict["train_epochs"] is None or flag_dict["train_steps"] is None

  @flags.multi_flags_validator(
      ["bleu_source", "bleu_ref"],
      message="Both or neither --bleu_source and --bleu_ref must be defined.")
  def _check_bleu_files(flags_dict):
    # Either both BLEU files are provided or neither is.
    return (flags_dict["bleu_source"] is None) == (
        flags_dict["bleu_ref"] is None)

  @flags.multi_flags_validator(
      ["bleu_source", "bleu_ref", "vocab_file"],
      message="--vocab_file must be defined if --bleu_source and --bleu_ref "
              "are defined.")
  def _check_bleu_vocab_file(flags_dict):
    # BLEU evaluation needs the subtoken vocab to encode/decode text.
    if flags_dict["bleu_source"] and flags_dict["bleu_ref"]:
      return flags_dict["vocab_file"] is not None
    return True

  @flags.multi_flags_validator(
      ["export_dir", "vocab_file"],
      message="--vocab_file must be defined if --export_dir is set.")
  def _check_export_vocab_file(flags_dict):
    # SavedModel export bundles the vocab file as an extra asset.
    if flags_dict["export_dir"]:
      return flags_dict["vocab_file"] is not None
    return True

  flags_core.require_cloud_storage(["data_dir", "model_dir", "export_dir"])
def construct_estimator(flags_obj, params, schedule_manager):
  """Construct an estimator from either Estimator or TPUEstimator.

  Args:
    flags_obj: The FLAGS object parsed from command line.
    params: A dict of run specific parameters.
    schedule_manager: A schedule.Manager object containing the run schedule.

  Returns:
    An estimator object to be used for training and eval.
  """
  if not params["use_tpu"]:
    # CPU/GPU path: build a regular Estimator, optionally wrapped in a
    # distribution strategy for multi-GPU training.
    distribution_strategy = distribution_utils.get_distribution_strategy(
        distribution_strategy=flags_obj.distribution_strategy,
        num_gpus=flags_core.get_num_gpus(flags_obj),
        all_reduce_alg=flags_obj.all_reduce_alg)
    return tf.estimator.Estimator(
        model_fn=model_fn, model_dir=flags_obj.model_dir, params=params,
        config=tf.estimator.RunConfig(train_distribute=distribution_strategy))

  # TPU path: resolve the TPU cluster and build a TPUEstimator.
  tpu_cluster_resolver = tf.compat.v1.cluster_resolver.TPUClusterResolver(
      tpu=flags_obj.tpu,
      zone=flags_obj.tpu_zone,
      project=flags_obj.tpu_gcp_project
  )

  # One training loop on the TPU runs a full train increment between
  # host round-trips.
  tpu_config = tf.estimator.tpu.TPUConfig(
      iterations_per_loop=schedule_manager.single_iteration_train_steps,
      num_shards=flags_obj.num_tpu_shards)
  run_config = tf.estimator.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=flags_obj.model_dir,
      session_config=tf.ConfigProto(
          allow_soft_placement=True, log_device_placement=True),
      tpu_config=tpu_config)
  return tf.estimator.tpu.TPUEstimator(
      model_fn=model_fn,
      # use_tpu is disabled when targeting a local (CPU-emulated) TPU.
      use_tpu=params["use_tpu"] and flags_obj.tpu != tpu_util.LOCAL,
      train_batch_size=schedule_manager.batch_size,
      eval_batch_size=schedule_manager.batch_size,
      params={
          # TPUEstimator needs to populate batch_size itself due to sharding.
          key: value for key, value in params.items() if key != "batch_size"},
      config=run_config)
def per_replica_batch_size(batch_size, num_gpus):
  """For multi-gpu, batch-size must be a multiple of the number of GPUs.

  Note that distribution strategy handles this automatically when used with
  Keras. For using with Estimator, we need to get per GPU batch.

  Args:
    batch_size: Global batch size to be divided among devices. This should be
      equal to num_gpus times the single-GPU batch_size for multi-gpu training.
    num_gpus: How many GPUs are used with DistributionStrategies.

  Returns:
    Batch size per device.

  Raises:
    ValueError: if batch_size is not divisible by number of devices
  """
  # With zero or one GPU the global batch size is the per-device batch size.
  if num_gpus <= 1:
    return batch_size

  remainder = batch_size % num_gpus
  if remainder:
    err = ('When running with multiple GPUs, batch size '
           'must be a multiple of the number of available GPUs. Found {} '
           'GPUs with a batch size of {}; try --batch_size={} instead.'
          ).format(num_gpus, batch_size, batch_size - remainder)
    raise ValueError(err)
  # Use floor division: the original `int(batch_size / num_gpus)` goes through
  # float true-division, which loses precision for very large integers.
  return batch_size // num_gpus
def run_transformer(flags_obj):
  """Create tf.Estimator to train and evaluate transformer model.

  Args:
    flags_obj: Object containing parsed flag values.

  Returns:
    Dict of results of the run. Contains the keys `eval_results`,
    `train_hooks`, `bleu_cased`, and `bleu_uncased`. `train_hooks` is a list the
    instances of hooks used during training.
  """
  num_gpus = flags_core.get_num_gpus(flags_obj)

  # Add flag-defined parameters to params object
  # NOTE(review): `params` aliases a module-level dict (PARAMS_MAP entry or a
  # model_params constant); the assignments below mutate it in place — confirm
  # no other caller relies on the pristine defaults within the same process.
  params = PARAMS_MAP[flags_obj.param_set]
  if num_gpus > 1:
    if flags_obj.param_set == "big":
      params = model_params.BIG_MULTI_GPU_PARAMS
    elif flags_obj.param_set == "base":
      params = model_params.BASE_MULTI_GPU_PARAMS

  params["data_dir"] = flags_obj.data_dir
  params["model_dir"] = flags_obj.model_dir
  params["num_parallel_calls"] = flags_obj.num_parallel_calls

  params["tpu"] = flags_obj.tpu
  params["use_tpu"] = bool(flags_obj.tpu)  # was a tpu specified.
  # TPUs require static shapes, so static batching is forced on.
  params["static_batch"] = flags_obj.static_batch or params["use_tpu"]
  params["allow_ffn_pad"] = not params["use_tpu"]

  params["max_length"] = flags_obj.max_length or params["max_length"]

  params["use_synthetic_data"] = flags_obj.use_synthetic_data

  # Set batch size parameter, which depends on the availability of
  # TPU and GPU, and distribution settings.
  params["batch_size"] = (flags_obj.batch_size or (
      params["default_batch_size_tpu"] if params["use_tpu"]
      else params["default_batch_size"]))

  # Keep the global batch size for logging; params["batch_size"] becomes
  # per-replica below.
  total_batch_size = params["batch_size"]
  if not params["use_tpu"]:
    params["batch_size"] = per_replica_batch_size(
        params["batch_size"], num_gpus)

  schedule_manager = schedule.Manager(
      train_steps=flags_obj.train_steps,
      steps_between_evals=flags_obj.steps_between_evals,
      train_epochs=flags_obj.train_epochs,
      epochs_between_evals=flags_obj.epochs_between_evals,
      default_train_epochs=DEFAULT_TRAIN_EPOCHS,
      batch_size=params["batch_size"],
      max_length=params["max_length"],
      use_tpu=params["use_tpu"],
      num_tpu_shards=flags_obj.num_tpu_shards
  )

  params["repeat_dataset"] = schedule_manager.repeat_dataset

  # Remove stale model/data directories if --clean was passed.
  model_helpers.apply_clean(flags.FLAGS)

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks,
      model_dir=flags_obj.model_dir,
      tensors_to_log=TENSORS_TO_LOG,  # used for logging hooks
      batch_size=total_batch_size,  # for ExamplesPerSecondHook
      use_tpu=params["use_tpu"]  # Not all hooks can run with TPUs
  )
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name="transformer",
      dataset_name="wmt_translate_ende",
      run_params=params,
      test_id=flags_obj.benchmark_test_id)

  # Train and evaluate transformer model
  estimator = construct_estimator(flags_obj, params, schedule_manager)
  stats = run_loop(
      estimator=estimator,
      # Training arguments
      schedule_manager=schedule_manager,
      train_hooks=train_hooks,
      benchmark_logger=benchmark_logger,
      # BLEU calculation arguments
      bleu_source=flags_obj.bleu_source,
      bleu_ref=flags_obj.bleu_ref,
      bleu_threshold=flags_obj.stop_threshold,
      vocab_file=flags_obj.vocab_file)

  if flags_obj.export_dir and not params["use_tpu"]:
    serving_input_fn = export.build_tensor_serving_input_receiver_fn(
        shape=[None], dtype=tf.int64, batch_size=None)
    # Export saved model, and save the vocab file as an extra asset. The vocab
    # file is saved to allow consistent input encoding and output decoding.
    # (See the "Export trained model" section in the README for an example of
    # how to use the vocab file.)
    # Since the model itself does not use the vocab file, this file is saved as
    # an extra asset rather than a core asset.
    estimator.export_savedmodel(
        flags_obj.export_dir, serving_input_fn,
        assets_extra={"vocab.txt": flags_obj.vocab_file},
        strip_default_attrs=True)
  return stats
def main(_):
  """absl entry point: run the transformer inside a benchmark logging scope."""
  flags_obj = flags.FLAGS
  with logger.benchmark_context(flags_obj):
    run_transformer(flags_obj)
if __name__ == "__main__":
  # Log at INFO level so the training-schedule messages are visible.
  tf.logging.set_verbosity(tf.logging.INFO)
  define_transformer_flags()
  absl_app.run(main)
official/r1/transformer/translate.py
deleted
100644 → 0
View file @
17821c0d
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Translate text or files using trained transformer model."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
os
# pylint: disable=g-bad-import-order
from
absl
import
app
as
absl_app
from
absl
import
flags
import
tensorflow.compat.v1
as
tf
# pylint: enable=g-bad-import-order
from
official.nlp.transformer.utils
import
tokenizer
from
official.utils.flags
import
core
as
flags_core
# Number of input lines decoded per estimator.predict batch.
_DECODE_BATCH_SIZE = 32
# Extra tokens allowed beyond the input length during decoding.
_EXTRA_DECODE_LENGTH = 100
# Beam search width.
_BEAM_SIZE = 4
# Beam search length-normalization strength.
_ALPHA = 0.6
def _get_sorted_inputs(filename):
  """Read and sort lines from the file sorted by decreasing length.

  Args:
    filename: String name of file to read inputs from.

  Returns:
    Sorted list of inputs, and dictionary mapping original index->sorted index
    of each element.
  """
  with tf.io.gfile.GFile(filename) as f:
    lines = [record.strip() for record in f.read().split("\n")]
  # A trailing newline produces one empty final entry; drop it.
  if not lines[-1]:
    lines.pop()

  # Pair each original position with its whitespace-token count, then order
  # longest-first (Python's sort is stable, so ties keep file order).
  by_length = sorted(
      ((position, len(line.split())) for position, line in enumerate(lines)),
      key=lambda pair: pair[1], reverse=True)

  sorted_inputs = []
  sorted_keys = [0] * len(by_length)
  for new_pos, (orig_pos, _) in enumerate(by_length):
    sorted_inputs.append(lines[orig_pos])
    sorted_keys[orig_pos] = new_pos
  return sorted_inputs, sorted_keys
def _encode_and_add_eos(line, subtokenizer):
  """Encode line with subtokenizer, and add EOS id to the end."""
  # Build a fresh list so the subtokenizer's return value is never mutated.
  return [*subtokenizer.encode(line), tokenizer.EOS_ID]
def _trim_and_decode(ids, subtokenizer):
  """Trim EOS and PAD tokens from ids, and decode to return a string."""
  id_list = list(ids)
  if tokenizer.EOS_ID in id_list:
    # Decode only the tokens before the first EOS marker.
    eos_pos = id_list.index(tokenizer.EOS_ID)
    return subtokenizer.decode(ids[:eos_pos])
  # No EOS found in sequence: decode everything.
  return subtokenizer.decode(ids)
def translate_file(
    estimator, subtokenizer, input_file, output_file=None,
    print_all_translations=True):
  """Translate lines in file, and save to output file if specified.

  Args:
    estimator: tf.Estimator used to generate the translations.
    subtokenizer: Subtokenizer object for encoding and decoding source and
      translated lines.
    input_file: file containing lines to translate
    output_file: file that stores the generated translations.
    print_all_translations: If true, all translations are printed to stdout.

  Raises:
    ValueError: if output file is invalid.
  """
  batch_size = _DECODE_BATCH_SIZE

  # Read and sort inputs by length. Keep dictionary (original index-->new index
  # in sorted list) to write translations in the original order.
  sorted_inputs, sorted_keys = _get_sorted_inputs(input_file)
  # Ceiling division: number of predict batches.
  num_decode_batches = (len(sorted_inputs) - 1) // batch_size + 1

  def input_generator():
    """Yield encoded strings from sorted_inputs."""
    for i, line in enumerate(sorted_inputs):
      if i % batch_size == 0:
        batch_num = (i // batch_size) + 1
        tf.logging.info("Decoding batch %d out of %d." %
                        (batch_num, num_decode_batches))
      yield _encode_and_add_eos(line, subtokenizer)

  def input_fn():
    """Created batched dataset of encoded inputs."""
    ds = tf.data.Dataset.from_generator(
        input_generator, tf.int64, tf.TensorShape([None]))
    # Pad each batch to the length of its longest sequence.
    ds = ds.padded_batch(batch_size, [None])
    return ds

  translations = []
  for i, prediction in enumerate(estimator.predict(input_fn)):
    translation = _trim_and_decode(prediction["outputs"], subtokenizer)
    translations.append(translation)

    if print_all_translations:
      tf.logging.info("Translating:\n\tInput: %s\n\tOutput: %s" %
                      (sorted_inputs[i], translation))

  # Write translations in the order they appeared in the original file.
  if output_file is not None:
    if tf.io.gfile.isdir(output_file):
      raise ValueError("File output is a directory, will not save outputs to "
                       "file.")
    tf.logging.info("Writing to file %s" % output_file)
    with tf.io.gfile.GFile(output_file, "w") as f:
      # sorted_keys maps original line index -> position in `translations`.
      for i in sorted_keys:
        f.write("%s\n" % translations[i])
def translate_text(estimator, subtokenizer, txt):
  """Translate a single string."""
  token_ids = _encode_and_add_eos(txt, subtokenizer)

  def input_fn():
    # A single example, wrapped in a (trivially padded) batch.
    return tf.data.Dataset.from_tensors(token_ids).batch(_DECODE_BATCH_SIZE)

  first_prediction = next(estimator.predict(input_fn))
  decoded = _trim_and_decode(first_prediction["outputs"], subtokenizer)
  tf.logging.info("Translation of \"%s\": \"%s\"" % (txt, decoded))
def main(unused_argv):
  """Translates --text and/or --file using a trained transformer checkpoint."""
  # NOTE(review): imported inside main, presumably to avoid pulling in
  # transformer_main's flag definitions at module import time — confirm.
  from official.transformer import transformer_main

  tf.logging.set_verbosity(tf.logging.INFO)

  if FLAGS.text is None and FLAGS.file is None:
    tf.logging.warn("Nothing to translate. Make sure to call this script using "
                    "flags --text or --file.")
    return

  subtokenizer = tokenizer.Subtokenizer(FLAGS.vocab_file)

  # Set up estimator and params
  params = transformer_main.PARAMS_MAP[FLAGS.param_set]
  params["beam_size"] = _BEAM_SIZE
  params["alpha"] = _ALPHA
  params["extra_decode_length"] = _EXTRA_DECODE_LENGTH
  params["batch_size"] = _DECODE_BATCH_SIZE
  estimator = tf.estimator.Estimator(
      model_fn=transformer_main.model_fn, model_dir=FLAGS.model_dir,
      params=params)

  if FLAGS.text is not None:
    tf.logging.info("Translating text: %s" % FLAGS.text)
    translate_text(estimator, subtokenizer, FLAGS.text)

  if FLAGS.file is not None:
    input_file = os.path.abspath(FLAGS.file)
    tf.logging.info("Translating file: %s" % input_file)
    if not tf.gfile.Exists(FLAGS.file):
      raise ValueError("File does not exist: %s" % input_file)

    output_file = None
    if FLAGS.file_out is not None:
      output_file = os.path.abspath(FLAGS.file_out)
      tf.logging.info("File output specified: %s" % output_file)

    translate_file(estimator, subtokenizer, input_file, output_file)
def define_translate_flags():
  """Define flags used for translation script."""
  # Model flags
  flags.DEFINE_string(
      name="model_dir", short_name="md", default="/tmp/transformer_model",
      help=flags_core.help_wrap(
          "Directory containing Transformer model checkpoints."))
  flags.DEFINE_enum(
      name="param_set", short_name="mp", default="big",
      enum_values=["base", "big"],
      help=flags_core.help_wrap(
          "Parameter set to use when creating and training the model. The "
          "parameters define the input shape (batch size and max length), "
          "model configuration (size of embedding, # of hidden layers, etc.), "
          "and various other settings. The big parameter set increases the "
          "default batch size, embedding/hidden size, and filter size. For a "
          "complete list of parameters, please see model/model_params.py."))
  flags.DEFINE_string(
      name="vocab_file", short_name="vf", default=None,
      help=flags_core.help_wrap(
          "Path to subtoken vocabulary file. If data_download.py was used to "
          "download and encode the training data, look in the data_dir to find "
          "the vocab file."))
  # The vocab file is always needed to encode inputs and decode outputs.
  flags.mark_flag_as_required("vocab_file")

  # Input flags: at least one of --text / --file must be given at runtime.
  flags.DEFINE_string(
      name="text", default=None,
      help=flags_core.help_wrap(
          "Text to translate. Output will be printed to console."))
  flags.DEFINE_string(
      name="file", default=None,
      help=flags_core.help_wrap(
          "File containing text to translate. Translation will be printed to "
          "console and, if --file_out is provided, saved to an output file."))
  flags.DEFINE_string(
      name="file_out", default=None,
      help=flags_core.help_wrap(
          "If --file flag is specified, save translation to this file."))
if __name__ == "__main__":
  define_translate_flags()
  # Bind the module-level FLAGS name used by main() before absl parses argv.
  FLAGS = flags.FLAGS
  absl_app.run(main)
official/r1/utils/__init__.py
deleted
100644 → 0
View file @
17821c0d
official/r1/utils/data/__init__.py
deleted
100644 → 0
View file @
17821c0d
official/r1/utils/data/file_io.py
deleted
100644 → 0
View file @
17821c0d
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Convenience functions for managing dataset file buffers."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
atexit
import
multiprocessing
import
multiprocessing.dummy
import
os
import
tempfile
import
uuid
from
absl
import
logging
import
numpy
as
np
import
six
import
tensorflow
as
tf
# pylint:disable=logging-format-interpolation
class
_GarbageCollector
(
object
):
"""Deletes temporary buffer files at exit.
Certain tasks (such as NCF Recommendation) require writing buffers to
temporary files. (Which may be local or distributed.) It is not generally safe
to delete these files during operation, but they should be cleaned up. This
class keeps track of temporary files created, and deletes them at exit.
"""
def
__init__
(
self
):
self
.
temp_buffers
=
[]
def
register
(
self
,
filepath
):
self
.
temp_buffers
.
append
(
filepath
)
def
purge
(
self
):
try
:
for
i
in
self
.
temp_buffers
:
if
tf
.
io
.
gfile
.
exists
(
i
):
tf
.
io
.
gfile
.
remove
(
i
)
logging
.
info
(
"Buffer file {} removed"
.
format
(
i
))
except
Exception
as
e
:
logging
.
error
(
"Failed to cleanup buffer files: {}"
.
format
(
e
))
# Module-level collector; purge() runs automatically at interpreter exit.
_GARBAGE_COLLECTOR = _GarbageCollector()
atexit.register(_GARBAGE_COLLECTOR.purge)

# Rows serialized per core in each write_to_buffer shard pass.
_ROWS_PER_CORE = 50000
def write_to_temp_buffer(dataframe, buffer_folder, columns):
  """Serialize a dataframe into a temp buffer that is deleted at exit."""
  if buffer_folder is None:
    # No folder requested: use a system temp file.
    buffer_path = tempfile.mkstemp()[1]
  else:
    tf.io.gfile.makedirs(buffer_folder)
    buffer_path = os.path.join(buffer_folder, str(uuid.uuid4()))
  _GARBAGE_COLLECTOR.register(buffer_path)

  return write_to_buffer(dataframe, buffer_path, columns)
def iter_shard_dataframe(df, rows_per_core=1000):
  """Two way shard of a dataframe.

  This function evenly shards a dataframe so that it can be mapped efficiently.
  It yields a list of dataframes with length equal to the number of CPU cores,
  with each dataframe having rows_per_core rows. (Except for the last batch
  which may have fewer rows in the dataframes.) Passing vectorized inputs to
  a pool is more effecient than iterating through a dataframe in serial and
  passing a list of inputs to the pool.

  Args:
    df: Pandas dataframe to be sharded.
    rows_per_core: Number of rows in each shard.

  Returns:
    A list of dataframe shards.
  """
  total_rows = len(df)
  num_cores = min(multiprocessing.cpu_count(), total_rows)

  rows_per_block = num_cores * rows_per_core
  num_blocks = int(np.ceil(total_rows / num_cores / rows_per_core))

  for block in range(num_blocks):
    start = block * rows_per_block
    stop = min(start + rows_per_block, total_rows)
    block_df = df[start:stop]

    # Split this block into num_cores near-equal contiguous slices.
    cuts = np.linspace(0, len(block_df), num_cores + 1, dtype=np.int64)
    yield [block_df[cuts[j]:cuts[j + 1]] for j in range(num_cores)]
def _shard_dict_to_examples(shard_dict):
  """Converts a dict of arrays into a list of example bytes."""
  num_rows = next(iter(shard_dict.values())).shape[0]
  per_row_features = [{} for _ in range(num_rows)]

  for column, values in shard_dict.items():
    if len(values.shape) == 1:
      # Promote 1-D columns to column vectors so each row is a length-1 list.
      values = np.reshape(values, values.shape + (1,))

    kind = values.dtype.kind
    if kind == "i":
      def to_feature(row):
        return tf.train.Feature(int64_list=tf.train.Int64List(value=row))
    elif kind == "f":
      def to_feature(row):
        return tf.train.Feature(float_list=tf.train.FloatList(value=row))
    else:
      raise ValueError("Invalid dtype")

    for row_idx in range(num_rows):
      per_row_features[row_idx][column] = to_feature(values[row_idx])

  return [
      tf.train.Example(
          features=tf.train.Features(feature=row_features)).SerializeToString()
      for row_features in per_row_features
  ]
def _serialize_shards(df_shards, columns, pool, writer):
  """Map sharded dataframes to bytes, and write them to a buffer.

  Args:
    df_shards: A list of pandas dataframes. (Should be of similar size)
    columns: The dataframe columns to be serialized.
    pool: A pool to serialize in parallel.
    writer: A TFRecordWriter to write the serialized shards.
  """
  # Pandas does not store columns of arrays as nd arrays. stack remedies this.
  map_inputs = [{c: np.stack(shard[c].values, axis=0) for c in columns}
                for shard in df_shards]

  # Failure within pools is very irksome. Thus, it is better to thoroughly check
  # inputs in the main process.
  # NOTE(review): these validations use `assert`, which is stripped when Python
  # runs with -O; inputs would then reach the pool unchecked.
  for inp in map_inputs:
    # Check that all fields have the same number of rows.
    assert len(set([v.shape[0] for v in inp.values()])) == 1
    for val in inp.values():
      # Only numpy-like int/float arrays of rank 1 or 2 can be serialized.
      assert hasattr(val, "dtype")
      assert hasattr(val.dtype, "kind")
      assert val.dtype.kind in ("i", "f")
      assert len(val.shape) in (1, 2)

  shard_bytes = pool.map(_shard_dict_to_examples, map_inputs)

  for s in shard_bytes:
    for example in s:
      writer.write(example)
def write_to_buffer(dataframe, buffer_path, columns, expected_size=None):
  """Write a dataframe to a binary file for a dataset to consume.

  Args:
    dataframe: The pandas dataframe to be serialized.
    buffer_path: The path where the serialized results will be written.
    columns: The dataframe columns to be serialized.
    expected_size: The size in bytes of the serialized results. This is used to
      lazily construct the buffer.

  Returns:
    The path of the buffer.

  Raises:
    ValueError: if dataframe is None and no valid existing buffer was found.
  """
  if (tf.io.gfile.exists(buffer_path) and
      tf.io.gfile.stat(buffer_path).length > 0):
    actual_size = tf.io.gfile.stat(buffer_path).length
    if expected_size == actual_size:
      # Buffer is already built and matches the expected size; reuse it.
      return buffer_path
    logging.warning(
        "Existing buffer {} has size {}. Expected size {}. Deleting and "
        "rebuilding buffer.".format(buffer_path, actual_size, expected_size))
    tf.io.gfile.remove(buffer_path)

  if dataframe is None:
    raise ValueError(
        "dataframe was None but a valid existing buffer was not found.")

  tf.io.gfile.makedirs(os.path.split(buffer_path)[0])

  logging.info("Constructing TFRecordDataset buffer: {}".format(buffer_path))

  count = 0
  # Thread (dummy) pool: serialization releases the GIL in protobuf C code.
  pool = multiprocessing.dummy.Pool(multiprocessing.cpu_count())
  try:
    with tf.io.TFRecordWriter(buffer_path) as writer:
      for df_shards in iter_shard_dataframe(df=dataframe,
                                            rows_per_core=_ROWS_PER_CORE):
        _serialize_shards(df_shards, columns, pool, writer)
        count += sum([len(s) for s in df_shards])
        logging.info("{}/{} examples written."
                     .format(str(count).ljust(8), len(dataframe)))
  finally:
    # Always release the worker threads, even if serialization fails.
    pool.terminate()

  logging.info("Buffer write complete.")
  return buffer_path
official/r1/utils/data/file_io_test.py
deleted
100644 → 0
View file @
17821c0d
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for binary data file utilities."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
contextlib
import
multiprocessing
# pylint: disable=wrong-import-order
import
numpy
as
np
import
pandas
as
pd
import
tensorflow
as
tf
# pylint: enable=wrong-import-order
from
official.r1.utils.data
import
file_io
# Column names used by the serialization round-trip tests.
_RAW_ROW = "raw_row"
_DUMMY_COL = "column_0"
_DUMMY_VEC_COL = "column_1"
_DUMMY_VEC_LEN = 4

# Shard size used when exercising iter_shard_dataframe in the tests below.
_ROWS_PER_CORE = 4

# Each case: dataframe row count, claimed CPU count, and the expected
# per-block / per-core row-index sharding (rows_per_core=4).
_TEST_CASES = [
    # One batch of one
    dict(row_count=1, cpu_count=1, expected=[
        [[0]]
    ]),
    dict(row_count=10, cpu_count=1, expected=[
        [[0, 1, 2, 3]], [[4, 5, 6, 7]], [[8, 9]]
    ]),
    dict(row_count=21, cpu_count=1, expected=[
        [[0, 1, 2, 3]], [[4, 5, 6, 7]], [[8, 9, 10, 11]],
        [[12, 13, 14, 15]], [[16, 17, 18, 19]], [[20]]
    ]),
    dict(row_count=1, cpu_count=4, expected=[
        [[0]]
    ]),
    dict(row_count=10, cpu_count=4, expected=[
        [[0, 1], [2, 3, 4], [5, 6], [7, 8, 9]]
    ]),
    dict(row_count=21, cpu_count=4, expected=[
        [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]],
        [[16], [17], [18], [19, 20]]
    ]),
    dict(row_count=10, cpu_count=8, expected=[
        [[0], [1], [2], [3, 4], [5], [6], [7], [8, 9]]
    ]),
    dict(row_count=40, cpu_count=8, expected=[
        [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15],
         [16, 17, 18, 19], [20, 21, 22, 23], [24, 25, 26, 27],
         [28, 29, 30, 31]],
        [[32], [33], [34], [35], [36], [37], [38], [39]]
    ]),
]

# Parsing spec for deserializing the TFRecord examples written by file_io.
_FEATURE_MAP = {
    _RAW_ROW: tf.io.FixedLenFeature([1], dtype=tf.int64),
    _DUMMY_COL: tf.io.FixedLenFeature([1], dtype=tf.int64),
    _DUMMY_VEC_COL: tf.io.FixedLenFeature([_DUMMY_VEC_LEN], dtype=tf.float32)
}
@contextlib.contextmanager
def fixed_core_count(cpu_count):
  """Override CPU count.

  file_io.py uses the cpu_count function to scale to the size of the instance.
  However, this is not desirable for testing because it can make the test
  flaky. Instead, this context manager fixes the count for more robust
  testing.

  Args:
    cpu_count: How many cores multiprocessing claims to have.

  Yields:
    Nothing. (for context manager only)
  """
  old_count_fn = multiprocessing.cpu_count
  multiprocessing.cpu_count = lambda: cpu_count
  try:
    yield
  finally:
    # Restore the real implementation even if the managed body raises;
    # otherwise a single failing test would leak the patched cpu_count into
    # every subsequent test in the process.
    multiprocessing.cpu_count = old_count_fn
class BaseTest(tf.test.TestCase):
  """Tests for file_io: DataFrame sharding and TFRecord round-trips."""

  def setUp(self):
    super(BaseTest, self).setUp()
    # file_io's buffer path is exercised through the TF1 graph/session API.
    tf.compat.v1.disable_eager_execution()

  def _test_sharding(self, row_count, cpu_count, expected):
    """Shards a trivial DataFrame and compares against the expected layout.

    Args:
      row_count: Number of rows in the generated DataFrame.
      cpu_count: CPU count that multiprocessing is mocked to report.
      expected: Nested list of row indices (batches -> per-core groups).
    """
    df = pd.DataFrame({_DUMMY_COL: list(range(row_count))})
    with fixed_core_count(cpu_count):
      shards = list(file_io.iter_shard_dataframe(df, _ROWS_PER_CORE))
    # Flatten each shard's sub-DataFrames back into plain index lists so they
    # can be compared directly with the expected structure.
    result = [[j[_DUMMY_COL].tolist() for j in i] for i in shards]
    self.assertAllEqual(expected, result)

  def test_tiny_rows_low_core(self):
    self._test_sharding(**_TEST_CASES[0])

  def test_small_rows_low_core(self):
    self._test_sharding(**_TEST_CASES[1])

  def test_large_rows_low_core(self):
    self._test_sharding(**_TEST_CASES[2])

  def test_tiny_rows_medium_core(self):
    self._test_sharding(**_TEST_CASES[3])

  def test_small_rows_medium_core(self):
    self._test_sharding(**_TEST_CASES[4])

  def test_large_rows_medium_core(self):
    self._test_sharding(**_TEST_CASES[5])

  def test_small_rows_large_core(self):
    self._test_sharding(**_TEST_CASES[6])

  def test_large_rows_large_core(self):
    self._test_sharding(**_TEST_CASES[7])

  def _serialize_deserialize(self, num_cores=1, num_rows=20):
    """Writes a random DataFrame to a TFRecord buffer and reads it back.

    Verifies that every row appears exactly once and that scalar and vector
    column values survive the round-trip.

    Args:
      num_cores: CPU count that multiprocessing is mocked to report while
        serializing.
      num_rows: Number of rows in the generated DataFrame.
    """
    np.random.seed(1)
    df = pd.DataFrame({
        # Serialization order is only deterministic for num_cores=1. raw_row is
        # used in validation after the deserialization.
        _RAW_ROW: np.array(range(num_rows), dtype=np.int64),
        _DUMMY_COL: np.random.randint(0, 35, size=(num_rows,)),
        _DUMMY_VEC_COL: [
            np.array([np.random.random() for _ in range(_DUMMY_VEC_LEN)])
            for i in range(num_rows)  # pylint: disable=unused-variable
        ]
    })

    with fixed_core_count(num_cores):
      buffer_path = file_io.write_to_temp_buffer(
          df, self.get_temp_dir(), [_RAW_ROW, _DUMMY_COL, _DUMMY_VEC_COL])

    with self.session(graph=tf.Graph()) as sess:
      dataset = tf.data.TFRecordDataset(buffer_path)
      dataset = dataset.batch(1).map(
          lambda x: tf.io.parse_example(serialized=x, features=_FEATURE_MAP))
      data_iter = tf.compat.v1.data.make_one_shot_iterator(dataset)
      seen_rows = set()
      # Iterate past num_rows on purpose: the extra iterations must raise
      # OutOfRangeError, proving the dataset contains exactly num_rows rows.
      for i in range(num_rows + 5):
        row = data_iter.get_next()
        try:
          row_id, val_0, val_1 = sess.run(
              [row[_RAW_ROW], row[_DUMMY_COL], row[_DUMMY_VEC_COL]])
          # Strip the [batch=1, feature] wrapping down to scalars / the vector.
          row_id, val_0, val_1 = row_id[0][0], val_0[0][0], val_1[0]
          # Each raw_row id must be seen exactly once.
          assert row_id not in seen_rows
          seen_rows.add(row_id)
          self.assertEqual(val_0, df[_DUMMY_COL][row_id])
          self.assertAllClose(val_1, df[_DUMMY_VEC_COL][row_id])
          self.assertLess(i, num_rows, msg="Too many rows.")
        except tf.errors.OutOfRangeError:
          self.assertGreaterEqual(i, num_rows, msg="Too few rows.")

    # Purging the module-level garbage collector must remove the temp buffer.
    file_io._GARBAGE_COLLECTOR.purge()
    assert not tf.io.gfile.exists(buffer_path)

  def test_serialize_deserialize_0(self):
    self._serialize_deserialize(num_cores=1)

  def test_serialize_deserialize_1(self):
    self._serialize_deserialize(num_cores=2)

  def test_serialize_deserialize_2(self):
    self._serialize_deserialize(num_cores=8)
if __name__ == "__main__":
  # Run the tests above under the TensorFlow test runner.
  tf.test.main()
official/r1/utils/export.py
deleted
100644 → 0
View file @
17821c0d
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Convenience functions for exporting models as SavedModels or other types."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
tensorflow
as
tf
def build_tensor_serving_input_receiver_fn(shape, dtype=tf.float32,
                                           batch_size=1):
  """Returns an input_receiver_fn that can be used during serving.

  This expects examples to come through as float tensors, and simply
  wraps them as TensorServingInputReceivers.

  Arguably, this should live in tf.estimator.export. Testing here first.

  Args:
    shape: list representing target size of a single example.
    dtype: the expected datatype for the input example
    batch_size: number of input tensors that will be passed for prediction

  Returns:
    A function that itself returns a TensorServingInputReceiver.
  """
  def serving_input_receiver_fn():
    # The serving signature takes a single dense tensor; prepend the batch
    # dimension to the per-example shape when building the placeholder.
    full_shape = [batch_size] + shape
    placeholder = tf.compat.v1.placeholder(
        dtype=dtype, shape=full_shape, name='input_tensor')

    # The placeholder serves double duty: it is both the feature handed to
    # the model and the receiver tensor fed at serving time, so no further
    # parsing step is required.
    return tf.estimator.export.TensorServingInputReceiver(
        features=placeholder, receiver_tensors=placeholder)

  return serving_input_receiver_fn
official/r1/utils/export_test.py
deleted
100644 → 0
View file @
17821c0d
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for exporting utils."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
tensorflow
as
tf
# pylint: disable=g-bad-import-order
from
official.r1.utils
import
export
class ExportUtilsTest(tf.test.TestCase):
  """Tests for the ExportUtils."""

  def test_build_tensor_serving_input_receiver_fn(self):
    # Default dtype (float32) and batch size (1).
    receiver_fn = export.build_tensor_serving_input_receiver_fn(shape=[4, 5])
    with tf.Graph().as_default():
      receiver = receiver_fn()
      self.assertIsInstance(
          receiver, tf.estimator.export.TensorServingInputReceiver)

      self.assertIsInstance(receiver.features, tf.Tensor)
      # Batch dimension (1) is prepended to the requested example shape.
      self.assertEqual(receiver.features.shape, tf.TensorShape([1, 4, 5]))
      self.assertEqual(receiver.features.dtype, tf.float32)
      self.assertIsInstance(receiver.receiver_tensors, dict)
      # Note that Python 3 can no longer index .values() directly; cast to list.
      self.assertEqual(list(receiver.receiver_tensors.values())[0].shape,
                       tf.TensorShape([1, 4, 5]))

  def test_build_tensor_serving_input_receiver_fn_batch_dtype(self):
    # Explicit dtype and batch size must be reflected in the placeholder.
    receiver_fn = export.build_tensor_serving_input_receiver_fn(
        shape=[4, 5], dtype=tf.int8, batch_size=10)
    with tf.Graph().as_default():
      receiver = receiver_fn()
      self.assertIsInstance(
          receiver, tf.estimator.export.TensorServingInputReceiver)

      self.assertIsInstance(receiver.features, tf.Tensor)
      self.assertEqual(receiver.features.shape, tf.TensorShape([10, 4, 5]))
      self.assertEqual(receiver.features.dtype, tf.int8)
      self.assertIsInstance(receiver.receiver_tensors, dict)
      # Note that Python 3 can no longer index .values() directly; cast to list.
      self.assertEqual(list(receiver.receiver_tensors.values())[0].shape,
                       tf.TensorShape([10, 4, 5]))
if __name__ == "__main__":
  # Run the tests above under the TensorFlow test runner.
  tf.test.main()
official/r1/utils/logs/__init__.py
deleted
100644 → 0
View file @
17821c0d
official/r1/utils/logs/cloud_lib.py
deleted
100644 → 0
View file @
17821c0d
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities that interact with cloud service.
"""
import
requests
# GCE metadata-server endpoint; only resolvable from inside a GCP instance.
GCP_METADATA_URL = "http://metadata/computeMetadata/v1/instance/hostname"
# The metadata server requires this header and rejects requests without it.
GCP_METADATA_HEADER = {"Metadata-Flavor": "Google"}
def on_gcp():
  """Detect whether the current running environment is on GCP."""
  try:
    # requests has no default timeout and could block forever; cap the
    # metadata probe at five seconds so environments with connectivity
    # issues fail fast instead of hanging the caller.
    reply = requests.get(GCP_METADATA_URL,
                         headers=GCP_METADATA_HEADER,
                         timeout=5)
  except requests.exceptions.RequestException:
    # Any connection-level failure means the metadata server is unreachable,
    # i.e. we are not running on GCP.
    return False
  # The metadata server answers 200 only from inside a GCP instance.
  return reply.status_code == 200
official/r1/utils/logs/cloud_lib_test.py
deleted
100644 → 0
View file @
17821c0d
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for cloud_lib."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
unittest
import
mock
import
requests
from
official.r1.utils.logs
import
cloud_lib
class CloudLibTest(unittest.TestCase):
  """Unit tests for cloud_lib.on_gcp with requests.get patched out."""

  @mock.patch("requests.get")
  def test_on_gcp(self, patched_get):
    # A 200 reply from the metadata server means we are running on GCP.
    fake_response = mock.MagicMock()
    fake_response.status_code = 200
    patched_get.return_value = fake_response
    self.assertEqual(cloud_lib.on_gcp(), True)

  @mock.patch("requests.get")
  def test_not_on_gcp(self, patched_get):
    # A connection failure is interpreted as "not on GCP".
    patched_get.side_effect = requests.exceptions.ConnectionError()
    self.assertEqual(cloud_lib.on_gcp(), False)
if __name__ == "__main__":
  # Run the tests above with the standard unittest runner.
  unittest.main()
official/r1/utils/logs/guidelines.md
deleted
100644 → 0
View file @
17821c0d
# Logging in official models
This library adds logging functions that print or save tensor values. Official models should define all common hooks
(using hooks helper) and a benchmark logger.
1.
**Training Hooks**
Hooks are a TensorFlow concept that define specific actions at certain points of the execution. We use them to obtain and log
tensor values during training.
hooks_helper.py provides an easy way to create common hooks. The following hooks are currently defined:
*
LoggingTensorHook: Logs tensor values
*
ProfilerHook: Writes a timeline json that can be loaded into chrome://tracing.
*
ExamplesPerSecondHook: Logs the number of examples processed per second.
*
LoggingMetricHook: Similar to LoggingTensorHook, except that the tensors are logged in a format defined by our data
analysis pipeline.
2.
**Benchmarks**
The benchmark logger provides useful functions for logging environment information, and evaluation results.
The module also contains a context which is used to update the status of the run.
Example usage:
```
from absl import app as absl_app
from official.utils.logs import hooks_helper
from official.utils.logs import logger
def model_main(flags_obj):
estimator = ...
benchmark_logger = logger.get_benchmark_logger()
benchmark_logger.log_run_info(...)
train_hooks = hooks_helper.get_train_hooks(...)
for epoch in range(10):
estimator.train(..., hooks=train_hooks)
eval_results = estimator.evaluate(...)
# Log a dictionary of metrics
benchmark_logger.log_evaluation_result(eval_results)
# Log an individual metric
benchmark_logger.log_metric(...)
def main(_):
with logger.benchmark_context(flags.FLAGS):
model_main(flags.FLAGS)
if __name__ == "__main__":
# define flags
absl_app.run(main)
```
Prev
1
…
5
6
7
8
9
10
11
12
13
…
17
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment