Update code to v2.11.0

32e4ca51 · qianyj · 9485aa1d · 71060f67 · 9485aa1d · 9485aa1d
Commit 32e4ca51 authored Nov 28, 2023 by qianyj
20 changed files
--- a/official/nlp/transformer/data_pipeline.py
+++ b/official/nlp/transformer/data_pipeline.py
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Input pipeline for the transformer model to read, filter, and batch examples.
-
-Two things to note in the pipeline:
-
-1. Batching scheme
-
-   The examples encoded in the TFRecord files contain data in the format:
-     {"inputs": [variable length array of integers],
-      "targets": [variable length array of integers]}
-   Where integers in the arrays refer to tokens in the English and German vocab
-   file (named `vocab.ende.32768`).
-
-   Prior to batching, elements in the dataset are grouped by length (max between
-   "inputs" and "targets" length). Each group is then batched such that:
-     group_batch_size * length <= batch_size.
-
-   Another way to view batch_size is the maximum number of tokens in each batch.
-
-   Once batched, each element in the dataset will have the shape:
-     {"inputs": [group_batch_size, padded_input_length],
-      "targets": [group_batch_size, padded_target_length]}
-   Lengths are padded to the longest "inputs" or "targets" sequence in the batch
-   (padded_input_length and padded_target_length can be different).
-
-   This batching scheme decreases the fraction of padding tokens per training
-   batch, thus improving the training speed significantly.
-
-2. Shuffling
-
-   While training, the dataset is shuffled in two places in the code. The first
-   is the list of training files. Second, records are read with a parallel
-   `interleave` whose determinism is disabled
-   (`options.experimental_deterministic = False`), which introduces randomness
-   in the order of the examples.
-"""
-
-import os
-
-from absl import logging
-import tensorflow as tf
-
-from official.utils.misc import model_helpers
-
-# Buffer size for reading records from a TFRecord file. Each training file is
-# 7.2 MB, so 8 MB allows an entire file to be kept in memory.
-_READ_RECORD_BUFFER = 8 * 1000 * 1000
-
-# Example grouping constants. Defines length boundaries for each group.
-# These values are the defaults used in Tensor2Tensor.
-_MIN_BOUNDARY = 8
-_BOUNDARY_SCALE = 1.1
-
-
-def _load_records(filename):
-  """Read file and return a dataset of tf.Examples."""
-  return tf.data.TFRecordDataset(filename, buffer_size=_READ_RECORD_BUFFER)
-
-
-def _parse_example(serialized_example):
-  """Return inputs and targets Tensors from a serialized tf.Example."""
-  data_fields = {
-      "inputs": tf.io.VarLenFeature(tf.int64),
-      "targets": tf.io.VarLenFeature(tf.int64)
-  }
-  parsed = tf.io.parse_single_example(serialized_example, data_fields)
-  inputs = tf.sparse.to_dense(parsed["inputs"])
-  targets = tf.sparse.to_dense(parsed["targets"])
-  return inputs, targets
-
-
-def _filter_max_length(example, max_length=256):
-  """Indicates whether both inputs and targets are at most max_length long."""
-  return tf.logical_and(
-      tf.size(example[0]) <= max_length,
-      tf.size(example[1]) <= max_length)
-
-
-def _get_example_length(example):
-  """Returns the maximum length between the example inputs and targets."""
-  length = tf.maximum(tf.shape(example[0])[0], tf.shape(example[1])[0])
-  return length
-
-
-def _create_min_max_boundaries(max_length,
-                               min_boundary=_MIN_BOUNDARY,
-                               boundary_scale=_BOUNDARY_SCALE):
-  """Create min and max boundary lists up to max_length.
-
-  For example, when max_length=24, min_boundary=4 and boundary_scale=2, the
-  returned values will be:
-    buckets_min = [0, 4, 8, 16]
-    buckets_max = [4, 8, 16, 25]
-
-  Args:
-    max_length: The maximum length of example in dataset.
-    min_boundary: Minimum length in boundary.
-    boundary_scale: Amount to scale consecutive boundaries in the list.
-
-  Returns:
-    min and max boundary lists
-
-  """
-  # Create bucket boundaries list by scaling the previous boundary or adding 1
-  # (to ensure increasing boundary sizes).
-  bucket_boundaries = []
-  x = min_boundary
-  while x < max_length:
-    bucket_boundaries.append(x)
-    x = max(x + 1, int(x * boundary_scale))
-
-  # Create min and max boundary lists from the initial list.
-  buckets_min = [0] + bucket_boundaries
-  buckets_max = bucket_boundaries + [max_length + 1]
-  return buckets_min, buckets_max
-
-
-def _batch_examples(dataset, batch_size, max_length):
-  """Group examples by similar lengths, and return batched dataset.
-
-  Each batch of similar-length examples are padded to the same length, and may
-  have different number of elements in each batch, such that:
-    group_batch_size * padded_length <= batch_size.
-
-  This decreases the number of padding tokens per batch, which improves the
-  training speed.
-
-  Args:
-    dataset: Dataset of unbatched examples.
-    batch_size: Max number of tokens per batch of examples.
-    max_length: Max number of tokens in an example input or target sequence.
-
-  Returns:
-    Dataset of batched examples with similar lengths.
-  """
-  # Get min and max boundary lists for each example. These are used to calculate
-  # the `bucket_id`, which is the index at which:
-  # buckets_min[bucket_id] <= len(example) < buckets_max[bucket_id]
-  # Note that using both min and max lists improves the performance.
-  buckets_min, buckets_max = _create_min_max_boundaries(max_length)
-
-  # Create list of batch sizes for each bucket_id, so that
-  # bucket_batch_size[bucket_id] * buckets_max[bucket_id] <= batch_size
-  bucket_batch_sizes = [int(batch_size) // x for x in buckets_max]
-  # bucket_id will be a tensor, so convert this list to a tensor as well.
-  bucket_batch_sizes = tf.constant(bucket_batch_sizes, dtype=tf.int64)
-
-  def example_to_bucket_id(example_input, example_target):
-    """Return int64 bucket id for this example, calculated based on length."""
-    seq_length = _get_example_length((example_input, example_target))
-
-    # TODO(xunkai): investigate if removing code branching improves performance.
-    conditions_c = tf.logical_and(
-        tf.less_equal(buckets_min, seq_length), tf.less(seq_length,
-                                                        buckets_max))
-    bucket_id = tf.reduce_min(tf.where(conditions_c))
-    return bucket_id
-
-  def window_size_fn(bucket_id):
-    """Return number of examples to be grouped when given a bucket id."""
-    return bucket_batch_sizes[bucket_id]
-
-  def batching_fn(bucket_id, grouped_dataset):
-    """Batch and add padding to a dataset of elements with similar lengths."""
-    bucket_batch_size = window_size_fn(bucket_id)
-
-    # Batch the dataset and add padding so that all input sequences in the
-    # examples have the same length, and all target sequences have the same
-    # lengths as well. Resulting lengths of inputs and targets can differ.
-    return grouped_dataset.padded_batch(bucket_batch_size, ([None], [None]))
-
-  return dataset.apply(
-      tf.data.experimental.group_by_window(
-          key_func=example_to_bucket_id,
-          reduce_func=batching_fn,
-          window_size=None,
-          window_size_func=window_size_fn))
-
-
-def _read_and_batch_from_files(file_pattern,
-                               batch_size,
-                               max_length,
-                               max_io_parallelism,
-                               shuffle,
-                               repeat,
-                               static_batch=False,
-                               num_replicas=1,
-                               ctx=None):
-  """Create dataset where each item is a batched (inputs, targets) tuple.
-
-  Args:
-    file_pattern: String used to match the input TFRecord files.
-    batch_size: Maximum number of tokens per global batch of examples.
-    max_length: Maximum number of tokens per example
-    max_io_parallelism: Max number of cpu cores for parallel input processing.
-    shuffle: If true, randomizes order of elements.
-    repeat: Number of times to repeat the dataset. If None, the dataset is
-      repeated forever.
-    static_batch: Whether the batches in the dataset should have static shapes.
-      If True, the input is batched so that every batch has the shape
-      [batch_size // max_length, max_length]. If False, the input is grouped by
-      length, and batched so that batches may have different shapes [N, M],
-      where N * M <= batch_size and M <= max_length. In general, this setting
-      should be False. Dynamic shapes allow the inputs to be grouped so that
-      the number of padding tokens is minimized, and helps model training. In
-      cases where the input shape must be static (e.g. running on TPU), this
-      setting should be set to True.
-    num_replicas: Number of GPUs or other workers. We will generate global
-      batches, and each global batch is evenly divisible by the number of
-      replicas. Currently it is only effective when static_batch==True.
-      TODO: make it effective when static_batch=False.
-    ctx: Input context.
-
-  Returns:
-    tf.data.Dataset object containing examples loaded from the files.
-  """
-  dataset = tf.data.Dataset.list_files(file_pattern, shuffle=shuffle)
-
-  if ctx and ctx.num_input_pipelines > 1:
-    logging.info("Shard %d of the dataset.", ctx.input_pipeline_id)
-    dataset = dataset.shard(ctx.num_input_pipelines, ctx.input_pipeline_id)
-
-  # Read files and interleave results. When training, the order of the examples
-  # will be non-deterministic.
-  options = tf.data.Options()
-  options.experimental_deterministic = False
-  dataset = dataset.interleave(
-      _load_records,
-      cycle_length=max_io_parallelism,
-      num_parallel_calls=tf.data.experimental.AUTOTUNE).with_options(options)
-
-  # Parse each serialized tf.Example into an (inputs, targets) tuple.
-  # TODO: Look into prefetch_input_elements for performance optimization.  # pylint: disable=g-bad-todo
-  dataset = dataset.map(
-      _parse_example, num_parallel_calls=tf.data.experimental.AUTOTUNE)
-
-  # Remove examples where the input or target length exceeds the maximum length.
-  dataset = dataset.filter(lambda x, y: _filter_max_length((x, y), max_length))
-
-  if static_batch:
-    dataset = dataset.padded_batch(
-        # First calculate the batch size (in tokens) per worker, then divide it
-        # into sentences, and finally expand to a global batch. This keeps the
-        # global batch evenly divisible by num_replicas.
-        int(batch_size // num_replicas // max_length * num_replicas),
-        ([max_length], [max_length]),
-        drop_remainder=True)
-  else:
-    # Group and batch such that each batch has examples of similar length.
-    # TODO(xunkai): _batch_examples might need to do something special for
-    # num_replicas.
-    dataset = _batch_examples(dataset, batch_size, max_length)
-
-  dataset = dataset.repeat(repeat)
-
-  # Prefetch the next element to improve speed of input pipeline.
-  dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
-  return dataset
-
-
-def _generate_synthetic_data(params):
-  """Create synthetic data based on the parameter batch size."""
-  batch_size = int(params["batch_size"] // params["max_length"])
-  length = params["max_length"]
-  dataset = model_helpers.generate_synthetic_data(
-      input_shape=tf.TensorShape([length]),
-      input_value=1,
-      input_dtype=tf.int64,
-      label_shape=tf.TensorShape([length]),
-      label_value=1,
-      label_dtype=tf.int64,
-  )
-  if params["static_batch"]:
-    dataset = dataset.batch(batch_size, drop_remainder=True)
-  else:
-    dataset = dataset.padded_batch(batch_size, ([None], [None]))
-  return dataset
-
-
-def train_input_fn(params, ctx=None):
-  """Load and return dataset of batched examples for use during training."""
-  file_pattern = os.path.join(params["data_dir"] or "", "*train*")
-  if params["use_synthetic_data"]:
-    return _generate_synthetic_data(params)
-  return _read_and_batch_from_files(
-      file_pattern,
-      params["batch_size"],
-      params["max_length"],
-      params["max_io_parallelism"],
-      shuffle=True,
-      repeat=params["repeat_dataset"],
-      static_batch=params["static_batch"],
-      num_replicas=params["num_gpus"],
-      ctx=ctx)
-
-
-def eval_input_fn(params, ctx=None):
-  """Load and return dataset of batched examples for use during evaluation."""
-  file_pattern = os.path.join(params["data_dir"] or "", "*dev*")
-  if params["use_synthetic_data"]:
-    return _generate_synthetic_data(params)
-  return _read_and_batch_from_files(
-      file_pattern,
-      params["batch_size"],
-      params["max_length"],
-      params["max_io_parallelism"],
-      shuffle=False,
-      repeat=1,
-      static_batch=params["static_batch"],
-      num_replicas=params["num_gpus"],
-      ctx=ctx)
-
-
-def map_data_for_transformer_fn(x, y):
-  """Maps data for training, and handles weird behaviors for different versions."""
-  # Will transform input x and targets y into tuple(x, y) as new model inputs.
-  # For TF v2, the 2nd parameter is omitted to make Keras training work.
-  return ((x, y),)
--- a/official/nlp/transformer/embedding_layer.py
+++ b/official/nlp/transformer/embedding_layer.py
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Implementation of embedding layer with shared weights."""
-
-import tensorflow as tf
-
-
-class EmbeddingSharedWeights(tf.keras.layers.Layer):
-  """Calculates input embeddings and pre-softmax linear with shared weights."""
-
-  def __init__(self, vocab_size, hidden_size):
-    """Specify characteristic parameters of embedding layer.
-
-    Args:
-      vocab_size: Number of tokens in the embedding. (Typically ~32,000)
-      hidden_size: Dimensionality of the embedding. (Typically 512 or 1024)
-    """
-    super(EmbeddingSharedWeights, self).__init__()
-    self.vocab_size = vocab_size
-    self.hidden_size = hidden_size
-
-  def build(self, input_shape):
-    """Build embedding layer."""
-    with tf.name_scope("embedding_and_softmax"):
-      # Create and initialize weights. The random normal initializer was chosen
-      # arbitrarily, and works well.
-      self.shared_weights = self.add_weight(
-          "weights",
-          shape=[self.vocab_size, self.hidden_size],
-          dtype=tf.float32,
-          initializer=tf.random_normal_initializer(
-              mean=0., stddev=self.hidden_size**-0.5))
-    super(EmbeddingSharedWeights, self).build(input_shape)
-
-  def get_config(self):
-    return {
-        "vocab_size": self.vocab_size,
-        "hidden_size": self.hidden_size,
-    }
-
-  def call(self, inputs, mode="embedding"):
-    """Get token embeddings of inputs.
-
-    Args:
-      inputs: An int64 tensor with shape [batch_size, length]
-      mode: string, a valid value is one of "embedding" and "linear".
-
-    Returns:
-      outputs: (1) If mode == "embedding", output embedding tensor, float32 with
-        shape [batch_size, length, embedding_size]; (2) mode == "linear", output
-        linear tensor, float32 with shape [batch_size, length, vocab_size].
-    Raises:
-      ValueError: if mode is not valid.
-    """
-    if mode == "embedding":
-      return self._embedding(inputs)
-    elif mode == "linear":
-      return self._linear(inputs)
-    else:
-      raise ValueError("mode {} is not valid.".format(mode))
-
-  def _embedding(self, inputs):
-    """Applies embedding based on inputs tensor."""
-    with tf.name_scope("embedding"):
-      # Look up the embedding of each token id. The binary padding mask of
-      # size [batch_size, length] (commented out below) is not applied.
-      embeddings = tf.gather(self.shared_weights, inputs)
-      # mask = tf.cast(tf.not_equal(inputs, 0), embeddings.dtype)
-      # embeddings *= tf.expand_dims(mask, -1)
-      # Scale embedding by the sqrt of the hidden size
-      embeddings *= self.hidden_size**0.5
-
-      return embeddings
-
-  def _linear(self, inputs):
-    """Computes logits by running inputs through a linear layer.
-
-    Args:
-      inputs: A float32 tensor with shape [batch_size, length, hidden_size]
-
-    Returns:
-      float32 tensor with shape [batch_size, length, vocab_size].
-    """
-    with tf.name_scope("presoftmax_linear"):
-      batch_size = tf.shape(inputs)[0]
-      length = tf.shape(inputs)[1]
-
-      x = tf.reshape(inputs, [-1, self.hidden_size])
-      logits = tf.matmul(x, self.shared_weights, transpose_b=True)
-
-      return tf.reshape(logits, [batch_size, length, self.vocab_size])
--- a/official/nlp/transformer/ffn_layer.py
+++ b/official/nlp/transformer/ffn_layer.py
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Implementation of fully connected network."""
-
-import tensorflow as tf
-
-
-class FeedForwardNetwork(tf.keras.layers.Layer):
-  """Fully connected feedforward network."""
-
-  def __init__(self, hidden_size, filter_size, relu_dropout):
-    """Initialize FeedForwardNetwork.
-
-    Args:
-      hidden_size: int, output dim of hidden layer.
-      filter_size: int, filter size for the inner (first) dense layer.
-      relu_dropout: float, dropout rate for training.
-    """
-    super(FeedForwardNetwork, self).__init__()
-    self.hidden_size = hidden_size
-    self.filter_size = filter_size
-    self.relu_dropout = relu_dropout
-
-  def build(self, input_shape):
-    self.filter_dense_layer = tf.keras.layers.Dense(
-        self.filter_size,
-        use_bias=True,
-        activation=tf.nn.relu,
-        name="filter_layer")
-    self.output_dense_layer = tf.keras.layers.Dense(
-        self.hidden_size, use_bias=True, name="output_layer")
-    super(FeedForwardNetwork, self).build(input_shape)
-
-  def get_config(self):
-    return {
-        "hidden_size": self.hidden_size,
-        "filter_size": self.filter_size,
-        "relu_dropout": self.relu_dropout,
-    }
-
-  def call(self, x, training):
-    """Return outputs of the feedforward network.
-
-    Args:
-      x: tensor with shape [batch_size, length, hidden_size]
-      training: boolean, whether in training mode or not.
-
-    Returns:
-      Output of the feedforward network.
-      tensor with shape [batch_size, length, hidden_size]
-    """
-    # Inner ReLU dense layer, dropout (training only), then output projection.
-
-    output = self.filter_dense_layer(x)
-    if training:
-      output = tf.nn.dropout(output, rate=self.relu_dropout)
-    output = self.output_dense_layer(output)
-
-    return output
--- a/official/nlp/transformer/metrics.py
+++ b/official/nlp/transformer/metrics.py
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Functions for calculating loss, accuracy, and other model metrics.
-
-Metrics:
- - Padded loss, accuracy, and negative log perplexity. Source:
-     https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/metrics.py
- - BLEU approximation. Source:
-     https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/bleu_hook.py
- - ROUGE score. Source:
-     https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/rouge.py
-"""
-
-import functools
-
-import tensorflow as tf
-
-
-def _pad_tensors_to_same_length(x, y):
-  """Pad x and y so that the results have the same length (second dimension)."""
-  with tf.name_scope("pad_to_same_length"):
-    x_length = tf.shape(x)[1]
-    y_length = tf.shape(y)[1]
-
-    max_length = tf.maximum(x_length, y_length)
-
-    x = tf.pad(x, [[0, 0], [0, max_length - x_length], [0, 0]])
-    y = tf.pad(y, [[0, 0], [0, max_length - y_length]])
-    return x, y
-
-
-def padded_cross_entropy_loss(logits, labels, smoothing, vocab_size):
-  """Calculate cross entropy loss while ignoring padding.
-
-  Args:
-    logits: Tensor of size [batch_size, length_logits, vocab_size]
-    labels: Tensor of size [batch_size, length_labels]
-    smoothing: Label smoothing constant, used to determine the on and off values
-    vocab_size: int size of the vocabulary
-
-  Returns:
-    Returns the cross entropy loss and weight tensors: float32 tensors with
-      shape [batch_size, max(length_logits, length_labels)]
-  """
-  with tf.name_scope("loss"):
-    logits, labels = _pad_tensors_to_same_length(logits, labels)
-
-    # Calculate smoothing cross entropy
-    with tf.name_scope("smoothing_cross_entropy"):
-      confidence = 1.0 - smoothing
-      low_confidence = (1.0 - confidence) / tf.cast(vocab_size - 1, tf.float32)
-      soft_targets = tf.one_hot(
-          tf.cast(labels, tf.int32),
-          depth=vocab_size,
-          on_value=confidence,
-          off_value=low_confidence)
-      xentropy = tf.nn.softmax_cross_entropy_with_logits(
-          logits=logits, labels=soft_targets)
-
-      # Calculate the best (lowest) possible value of cross entropy, and
-      # subtract from the cross entropy loss.
-      normalizing_constant = -(
-          confidence * tf.math.log(confidence) +
-          tf.cast(vocab_size - 1, tf.float32) * low_confidence *
-          tf.math.log(low_confidence + 1e-20))
-      xentropy -= normalizing_constant
-
-    weights = tf.cast(tf.not_equal(labels, 0), tf.float32)
-    return xentropy * weights, weights
-
-
-def padded_accuracy(logits, labels):
-  """Percentage of times that predictions match labels on non-0s."""
-  with tf.name_scope("padded_accuracy"):
-    logits, labels = _pad_tensors_to_same_length(logits, labels)
-    weights = tf.cast(tf.not_equal(labels, 0), tf.float32)
-    outputs = tf.cast(tf.argmax(logits, axis=-1), tf.int32)
-    padded_labels = tf.cast(labels, tf.int32)
-    return tf.cast(tf.equal(outputs, padded_labels), tf.float32), weights
-
-
-def padded_accuracy_topk(logits, labels, k):
-  """Percentage of times that top-k predictions match labels on non-0s."""
-  with tf.name_scope("padded_accuracy_topk"):
-    logits, labels = _pad_tensors_to_same_length(logits, labels)
-    weights = tf.cast(tf.not_equal(labels, 0), tf.float32)
-    effective_k = tf.minimum(k, tf.shape(logits)[-1])
-    _, outputs = tf.nn.top_k(logits, k=effective_k)
-    outputs = tf.cast(outputs, tf.int32)
-    padded_labels = tf.cast(labels, tf.int32)
-    padded_labels = tf.expand_dims(padded_labels, axis=-1)
-    padded_labels += tf.zeros_like(outputs)  # Pad to same shape.
-    same = tf.cast(tf.equal(outputs, padded_labels), tf.float32)
-    same_topk = tf.reduce_sum(same, axis=-1)
-    return same_topk, weights
-
-
-def padded_accuracy_top5(logits, labels):
-  return padded_accuracy_topk(logits, labels, 5)
-
-
-def padded_sequence_accuracy(logits, labels):
-  """Percentage of times that predictions match labels everywhere (non-0)."""
-  with tf.name_scope("padded_sequence_accuracy"):
-    logits, labels = _pad_tensors_to_same_length(logits, labels)
-    weights = tf.cast(tf.not_equal(labels, 0), tf.float32)
-    outputs = tf.cast(tf.argmax(logits, axis=-1), tf.int32)
-    padded_labels = tf.cast(labels, tf.int32)
-    not_correct = tf.cast(tf.not_equal(outputs, padded_labels),
-                          tf.float32) * weights
-    axis = list(range(1, len(outputs.get_shape())))
-    correct_seq = 1.0 - tf.minimum(1.0, tf.reduce_sum(not_correct, axis=axis))
-    return correct_seq, tf.constant(1.0)
-
-
-def padded_neg_log_perplexity(logits, labels, vocab_size):
-  """Average log-perplexity excluding padding 0s. No smoothing."""
-  num, den = padded_cross_entropy_loss(logits, labels, 0, vocab_size)
-  return -num, den
-
-
-class MetricLayer(tf.keras.layers.Layer):
-  """Custom layer of metrics for the Transformer model."""
-
-  def __init__(self, vocab_size):
-    super(MetricLayer, self).__init__()
-    self.vocab_size = vocab_size
-    self.metric_mean_fns = []
-
-  def build(self, input_shape):
-    """Builds metric layer."""
-    neg_log_perplexity = functools.partial(
-        padded_neg_log_perplexity, vocab_size=self.vocab_size)
-    self.metric_mean_fns = [
-        (tf.keras.metrics.Mean("accuracy"), padded_accuracy),
-        (tf.keras.metrics.Mean("accuracy_top5"), padded_accuracy_top5),
-        (tf.keras.metrics.Mean("accuracy_per_sequence"),
-         padded_sequence_accuracy),
-        (tf.keras.metrics.Mean("neg_log_perplexity"), neg_log_perplexity),
-    ]
-    super(MetricLayer, self).build(input_shape)
-
-  def get_config(self):
-    return {"vocab_size": self.vocab_size}
-
-  def call(self, inputs):
-    logits, targets = inputs[0], inputs[1]
-    for mean, fn in self.metric_mean_fns:
-      m = mean(*fn(logits, targets))
-      self.add_metric(m)
-    return logits
-
-
-def transformer_loss(logits, labels, smoothing, vocab_size):
-  """Calculates total loss containing cross entropy with padding ignored.
-
-  Args:
-    logits: Tensor of size [batch_size, length_logits, vocab_size]
-    labels: Tensor of size [batch_size, length_labels]
-    smoothing: Label smoothing constant, used to determine the on and off values
-    vocab_size: int size of the vocabulary
-
-  Returns:
-    A scalar float tensor for loss.
-  """
-  xentropy, weights = padded_cross_entropy_loss(logits, labels, smoothing,
-                                                vocab_size)
-  return tf.reduce_sum(xentropy) / tf.reduce_sum(weights)
--- a/official/nlp/transformer/misc.py
+++ b/official/nlp/transformer/misc.py
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Misc for Transformer."""
-
-# pylint: disable=g-bad-import-order
-
-from absl import flags
-import tensorflow as tf
-
-from official.nlp.transformer import model_params
-from official.utils.flags import core as flags_core
-from official.utils.misc import keras_utils
-
-FLAGS = flags.FLAGS
-
-PARAMS_MAP = {
-    'tiny': model_params.TINY_PARAMS,
-    'base': model_params.BASE_PARAMS,
-    'big': model_params.BIG_PARAMS,
-}
-
-
-def get_model_params(param_set, num_gpus):
-  """Gets predefined model params."""
-  if num_gpus > 1:
-    if param_set == 'big':
-      return model_params.BIG_MULTI_GPU_PARAMS.copy()
-    elif param_set == 'base':
-      return model_params.BASE_MULTI_GPU_PARAMS.copy()
-    else:
-      raise ValueError('Not valid params: param_set={} num_gpus={}'.format(
-          param_set, num_gpus))
-
-  return PARAMS_MAP[param_set].copy()
-
-
-def define_transformer_flags():
-  """Add flags and flag validators for running transformer_main."""
-  # Add common flags (data_dir, model_dir, etc.).
-  flags_core.define_base(num_gpu=True, distribution_strategy=True)
-  flags_core.define_performance(
-      num_parallel_calls=True,
-      inter_op=False,
-      intra_op=False,
-      synthetic_data=True,
-      max_train_steps=False,
-      dtype=True,
-      loss_scale=True,
-      all_reduce_alg=True,
-      num_packs=True,
-      tf_gpu_thread_mode=True,
-      datasets_num_private_threads=True,
-      enable_xla=True,
-      fp16_implementation=True)
-
-  flags_core.define_benchmark()
-  flags_core.define_device(tpu=True)
-
-  flags.DEFINE_integer(
-      name='train_steps',
-      short_name='ts',
-      default=300000,
-      help=flags_core.help_wrap('The number of steps used to train.'))
-  flags.DEFINE_integer(
-      name='steps_between_evals',
-      short_name='sbe',
-      default=5000,
-      help=flags_core.help_wrap(
-          'The Number of training steps to run between evaluations. This is '
-          'used if --train_steps is defined.'))
-  flags.DEFINE_boolean(
-      name='enable_time_history',
-      default=True,
-      help='Whether to enable TimeHistory callback.')
-  flags.DEFINE_boolean(
-      name='enable_tensorboard',
-      default=False,
-      help='Whether to enable Tensorboard callback.')
-  flags.DEFINE_boolean(
-      name='enable_metrics_in_training',
-      default=False,
-      help='Whether to enable metrics during training.')
-  flags.DEFINE_boolean(
-      name='enable_mlir_bridge',
-      default=False,
-      help='Whether to enable the TF to XLA bridge.')
-  # Set flags from the flags_core module as 'key flags' so they're listed when
-  # the '-h' flag is used. Without this line, the flags defined above are
-  # only shown in the full `--helpful` help text.
-  flags.adopt_module_key_flags(flags_core)
-
-  # Add transformer-specific flags
-  flags.DEFINE_enum(
-      name='param_set',
-      short_name='mp',
-      default='big',
-      enum_values=PARAMS_MAP.keys(),
-      help=flags_core.help_wrap(
-          'Parameter set to use when creating and training the model. The '
-          'parameters define the input shape (batch size and max length), '
-          'model configuration (size of embedding, # of hidden layers, etc.), '
-          'and various other settings. The big parameter set increases the '
-          'default batch size, embedding/hidden size, and filter size. For a '
-          'complete list of parameters, please see model/model_params.py.'))
-
-  flags.DEFINE_bool(
-      name='static_batch',
-      short_name='sb',
-      default=False,
-      help=flags_core.help_wrap(
-          'Whether the batches in the dataset should have static shapes. In '
-          'general, this setting should be False. Dynamic shapes allow the '
-          'inputs to be grouped so that the number of padding tokens is '
-          'minimized, and helps model training. In cases where the input shape '
-          'must be static (e.g. running on TPU), this setting will be ignored '
-          'and static batching will always be used.'))
-  flags.DEFINE_integer(
-      name='max_length',
-      short_name='ml',
-      default=256,
-      help=flags_core.help_wrap(
-          'Max sentence length for Transformer. Default is 256. Note: Usually '
-          'it is more effective to use a smaller max length if static_batch is '
-          'enabled, e.g. 64.'))
-
-  # Flags for training with steps (may be used for debugging)
-  flags.DEFINE_integer(
-      name='validation_steps',
-      short_name='vs',
-      default=64,
-      help=flags_core.help_wrap('The number of steps used in validation.'))
-
-  # BLEU score computation
-  flags.DEFINE_string(
-      name='bleu_source',
-      short_name='bls',
-      default=None,
-      help=flags_core.help_wrap(
-          'Path to source file containing text translate when calculating the '
-          'official BLEU score. Both --bleu_source and --bleu_ref must be set. '
-      ))
-  flags.DEFINE_string(
-      name='bleu_ref',
-      short_name='blr',
-      default=None,
-      help=flags_core.help_wrap(
-          'Path to source file containing text translate when calculating the '
-          'official BLEU score. Both --bleu_source and --bleu_ref must be set. '
-      ))
-  flags.DEFINE_string(
-      name='vocab_file',
-      short_name='vf',
-      default=None,
-      help=flags_core.help_wrap(
-          'Path to subtoken vocabulary file. If data_download.py was used to '
-          'download and encode the training data, look in the data_dir to find '
-          'the vocab file.'))
-  flags.DEFINE_string(
-      name='mode',
-      default='train',
-      help=flags_core.help_wrap('mode: train, eval, or predict'))
-  flags.DEFINE_bool(
-      name='use_ctl',
-      default=False,
-      help=flags_core.help_wrap(
-          'Whether the model runs with custom training loop.'))
-  flags.DEFINE_integer(
-      name='decode_batch_size',
-      default=32,
-      help=flags_core.help_wrap(
-          'Global batch size used for Transformer autoregressive decoding on '
-          'TPU.'))
-  flags.DEFINE_integer(
-      name='decode_max_length',
-      default=97,
-      help=flags_core.help_wrap(
-          'Max sequence length of the decode/eval data. This is used by '
-          'Transformer autoregressive decoding on TPU to have minimum '
-          'paddings.'))
-  flags.DEFINE_bool(
-      name='padded_decode',
-      default=False,
-      help=flags_core.help_wrap(
-          'Whether the autoregressive decoding runs with input data padded to '
-          'the decode_max_length. For TPU/XLA-GPU runs, this flag has to be '
-          'set due the static shape requirement. Although CPU/GPU could also '
-          'use padded_decode, it has not been tested. In addition, this method '
-          'will introduce unnecessary overheads which grow quadratically with '
-          'the max sequence length.'))
-  flags.DEFINE_bool(
-      name='enable_checkpointing',
-      default=True,
-      help=flags_core.help_wrap(
-          'Whether to do checkpointing during training. When running under '
-          'benchmark harness, we will avoid checkpointing.'))
-  flags.DEFINE_bool(
-      name='save_weights_only',
-      default=True,
-      help=flags_core.help_wrap(
-          'Only used when above `enable_checkpointing` is True. '
-          'If True, then only the model\'s weights will be saved '
-          '(`model.save_weights(filepath)`), else the full model is saved '
-          '(`model.save(filepath)`)'))
-
-  flags_core.set_defaults(
-      data_dir='/tmp/translate_ende',
-      model_dir='/tmp/transformer_model',
-      batch_size=None)
-
-  # pylint: disable=unused-variable
-  @flags.multi_flags_validator(
-      ['bleu_source', 'bleu_ref'],
-      message='Both or neither --bleu_source and --bleu_ref must be defined.')
-  def _check_bleu_files(flags_dict):
-    return (flags_dict['bleu_source'] is None) == (
-        flags_dict['bleu_ref'] is None)
-
-  @flags.multi_flags_validator(
-      ['bleu_source', 'bleu_ref', 'vocab_file'],
-      message='--vocab_file must be defined if --bleu_source and --bleu_ref '
-      'are defined.')
-  def _check_bleu_vocab_file(flags_dict):
-    if flags_dict['bleu_source'] and flags_dict['bleu_ref']:
-      return flags_dict['vocab_file'] is not None
-    return True
-
-  # pylint: enable=unused-variable
-
-
-def get_callbacks():
-  """Builds the list of Keras callbacks selected by command-line flags.
-
-  Returns:
-    A list holding, in order, a `keras_utils.TimeHistory` callback when
-    `--enable_time_history` is set and a `tf.keras.callbacks.TensorBoard`
-    callback when `--enable_tensorboard` is set. The list is empty when
-    neither flag is set.
-  """
-  selected = []
-
-  if FLAGS.enable_time_history:
-    # The timing callback only receives a log directory when TensorBoard
-    # logging is enabled as well.
-    if FLAGS.enable_tensorboard:
-      time_history_logdir = FLAGS.model_dir
-    else:
-      time_history_logdir = None
-    selected.append(
-        keras_utils.TimeHistory(
-            FLAGS.batch_size, FLAGS.log_steps, logdir=time_history_logdir))
-
-  if FLAGS.enable_tensorboard:
-    selected.append(tf.keras.callbacks.TensorBoard(log_dir=FLAGS.model_dir))
-
-  return selected
-
-
-def update_stats(history, stats, callbacks):
-  """Records the final loss and timing statistics into `stats` in place.
-
-  Args:
-    history: Result of training. When it and its `history` dict are non-empty,
-      the last value stored under the 'loss' key is recorded as a float.
-    stats: Dict with pre-existing training stats; updated in place.
-    callbacks: Optional list of callbacks. Every `keras_utils.TimeHistory`
-      instance in it contributes its timestamp log, its finish time and, when
-      more than one timestamp was logged, the average examples per second.
-  """
-  if history and history.history:
-    # The final loss is the last entry recorded under 'loss'.
-    stats['loss'] = float(history.history['loss'][-1])
-
-  # A missing or empty callback list leaves the timing stats untouched.
-  for cb in callbacks or ():
-    if not isinstance(cb, keras_utils.TimeHistory):
-      continue
-
-    log = cb.timestamp_log
-    stats['step_timestamp_log'] = log
-    stats['train_finish_time'] = cb.train_finish_time
-    if len(log) <= 1:
-      # A rate needs at least two timestamps.
-      continue
-
-    examples = cb.batch_size * cb.log_steps * (len(cb.timestamp_log) - 1)
-    elapsed = log[-1].timestamp - log[0].timestamp
-    stats['avg_exp_per_second'] = examples / elapsed
--- a/official/nlp/transformer/model_params.py
+++ b/official/nlp/transformer/model_params.py
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Defines Transformer model parameters."""
-
-import collections
-
-
-# Baseline hyperparameters. Reading a key that was never set yields None
-# rather than raising KeyError.
-BASE_PARAMS = collections.defaultdict(
-    lambda: None,
-    {
-        # Input params
-        "default_batch_size": 2048,  # Maximum number of tokens per batch.
-        "default_batch_size_tpu": 32768,
-        "max_length": 256,  # Maximum number of tokens per example.
-
-        # Model params
-        "initializer_gain": 1.0,  # Used in trainable variable initialization.
-        "vocab_size": 33708,  # Number of tokens in the vocabulary file.
-        "hidden_size": 512,  # Model dimension in the hidden layers.
-        "num_hidden_layers": 6,  # Layers in the encoder and decoder stacks.
-        "num_heads": 8,  # Number of heads in multi-headed attention.
-        "filter_size": 2048,  # Inner layer dimension of the feedforward net.
-
-        # Dropout values (only used when training)
-        "layer_postprocess_dropout": 0.1,
-        "attention_dropout": 0.1,
-        "relu_dropout": 0.1,
-
-        # Training params
-        "label_smoothing": 0.1,
-        "learning_rate": 2.0,
-        "learning_rate_decay_rate": 1.0,
-        "learning_rate_warmup_steps": 16000,
-
-        # Optimizer params
-        "optimizer_adam_beta1": 0.9,
-        "optimizer_adam_beta2": 0.997,
-        "optimizer_adam_epsilon": 1e-09,
-
-        # Default prediction params
-        "extra_decode_length": 50,
-        "beam_size": 4,
-        "alpha": 0.6,  # Length normalization strength in beam search.
-
-        # TPU specific parameters
-        "use_tpu": False,
-        "static_batch": False,
-        "allow_ffn_pad": True,
-    })
-
-# The "big" model: wider layers and more attention heads than the baseline.
-BIG_PARAMS = BASE_PARAMS.copy()
-BIG_PARAMS.update({
-    "default_batch_size": 4096,
-    # The TPU batch size is smaller than for BASE_PARAMS due to memory limits.
-    "default_batch_size_tpu": 16384,
-    "hidden_size": 1024,
-    "filter_size": 4096,
-    "num_heads": 16,
-})
-
-# Parameters for running the model in multi gpu. These should not change the
-# params that modify the model shape (such as the hidden_size or num_heads).
-BASE_MULTI_GPU_PARAMS = BASE_PARAMS.copy()
-BASE_MULTI_GPU_PARAMS.update({"learning_rate_warmup_steps": 8000})
-
-BIG_MULTI_GPU_PARAMS = BIG_PARAMS.copy()
-BIG_MULTI_GPU_PARAMS.update({
-    "layer_postprocess_dropout": 0.3,
-    "learning_rate_warmup_steps": 8000,
-})
-
-# Parameters for testing the model
-TINY_PARAMS = BASE_PARAMS.copy()
-TINY_PARAMS.update({
-    "default_batch_size": 1024,
-    "default_batch_size_tpu": 1024,
-    "hidden_size": 32,
-    "num_heads": 4,
-    "filter_size": 256,
-})
--- a/official/nlp/transformer/model_utils.py
+++ b/official/nlp/transformer/model_utils.py
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Transformer model helper methods."""
-
-import math
-
-import numpy as np
-import tensorflow as tf
-
-# Very low numbers to represent -infinity. We do not actually use -Inf, since we
-# want to be able to multiply these values by zero to get zero. (-Inf * 0 = NaN)
-# The float16 sentinel is the most negative finite float16 value, because -1e9
-# lies outside the finite range of float16.
-_NEG_INF_FP32 = -1e9
-_NEG_INF_FP16 = np.finfo(np.float16).min
-
-
-def get_position_encoding(length,
-                          hidden_size,
-                          min_timescale=1.0,
-                          max_timescale=1.0e4):
-  """Return positional encoding.
-
-  Calculates the position encoding as a mix of sine and cosine functions with
-  geometrically increasing wavelengths.
-  Defined and formulized in Attention is All You Need, section 3.5.
-
-  Args:
-    length: Sequence length.
-    hidden_size: Size of the encoding vector produced for each position.
-    min_timescale: Minimum scale that will be applied at each position
-    max_timescale: Maximum scale that will be applied at each position
-
-  Returns:
-    float32 Tensor with shape [length, hidden_size]. The first
-    `hidden_size // 2` channels hold the sines, the next `hidden_size // 2`
-    the cosines; when `hidden_size` is odd the last channel is zero.
-  """
-  # We compute the positional encoding in float32 even if the model uses
-  # float16, as many of the ops used, like log and exp, are numerically unstable
-  # in float16.
-  position = tf.cast(tf.range(length), tf.float32)
-  num_timescales = hidden_size // 2
-  # Guard the denominator: with a single timescale (hidden_size of 2 or 3)
-  # `num_timescales - 1` is zero, which would turn the whole encoding into NaN.
-  log_timescale_increment = (
-      math.log(float(max_timescale) / float(min_timescale)) /
-      tf.maximum(tf.cast(num_timescales, tf.float32) - 1, 1.0))
-  inv_timescales = min_timescale * tf.exp(
-      tf.cast(tf.range(num_timescales), tf.float32) * -log_timescale_increment)
-  scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0)
-  signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)
-  # sin/cos only fill 2 * (hidden_size // 2) channels. Zero-pad the remaining
-  # channel of an odd hidden_size so the result matches the documented shape;
-  # for an even hidden_size the padding amount is zero.
-  signal = tf.pad(signal, [[0, 0], [0, hidden_size % 2]])
-  return signal
-
-
-def get_decoder_self_attention_bias(length, dtype=tf.float32):
-  """Builds the causal bias that keeps the decoder autoregressive.
-
-  Position i may only attend to positions <= i; every later position receives
-  a very large negative bias so that it is masked out.
-
-  Args:
-    length: int length of sequences in batch.
-    dtype: The dtype of the return value.
-
-  Returns:
-    float tensor of shape [1, 1, length, length]
-  """
-  if dtype == tf.float16:
-    masked_value = _NEG_INF_FP16
-  else:
-    masked_value = _NEG_INF_FP32
-
-  with tf.name_scope("decoder_self_attention_bias"):
-    # Ones on and below the diagonal mark the positions that stay visible.
-    lower_triangle = tf.linalg.band_part(
-        tf.ones([length, length], dtype=dtype), -1, 0)
-    visible = tf.reshape(lower_triangle, [1, 1, length, length])
-    return masked_value * (1.0 - visible)
-
-
-def get_padding(x, padding_value=0, dtype=tf.float32):
-  """Marks which entries of x are padding.
-
-  Args:
-    x: int tensor with any shape
-    padding_value: int which represents padded values in input
-    dtype: The dtype of the return value.
-
-  Returns:
-    Tensor of `dtype` with the same shape as x: 1 where x equals
-    `padding_value`, 0 everywhere else.
-  """
-  with tf.name_scope("padding"):
-    is_padding = tf.equal(x, padding_value)
-    return tf.cast(is_padding, dtype)
-
-
-def get_padding_bias(x, padding_value=0, dtype=tf.float32):
-  """Calculate bias tensor from padding values in tensor.
-
-  Bias tensor that is added to the pre-softmax multi-headed attention logits,
-  which has shape [batch_size, num_heads, length, length]. The tensor is zero at
-  non-padding locations, and a very large negative number at padding locations:
-  -1e9 by default, or the most negative finite float16 value when `dtype` is
-  tf.float16.
-
-  Args:
-    x: int tensor with shape [batch_size, length]
-    padding_value: int which represents padded values in input
-    dtype: The dtype of the return value
-
-  Returns:
-    Attention bias tensor of shape [batch_size, 1, 1, length].
-  """
-  # -1e9 is not representable as a finite float16, and an infinite sentinel
-  # multiplied by the zeros of the padding mask would yield NaN. Pick the
-  # sentinel that fits the requested dtype, as the decoder bias helper does.
-  neg_inf = _NEG_INF_FP16 if dtype == tf.float16 else _NEG_INF_FP32
-  with tf.name_scope("attention_bias"):
-    padding = get_padding(x, padding_value, dtype)
-    attention_bias = padding * neg_inf
-    # Insert the two broadcast axes: [batch_size, length] ->
-    # [batch_size, 1, 1, length].
-    attention_bias = tf.expand_dims(
-        tf.expand_dims(attention_bias, axis=1), axis=1)
-  return attention_bias
--- a/official/nlp/transformer/model_utils_test.py
+++ b/official/nlp/transformer/model_utils_test.py
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Test Transformer model helper methods."""
-
-import tensorflow as tf
-
-from official.nlp.transformer import model_utils
-
-# Value the tests below expect at masked positions of the bias tensors.
-NEG_INF = -1e9
-
-
-class ModelUtilsTest(tf.test.TestCase):
-  """Checks the padding and attention-bias helpers of model_utils."""
-
-  def test_get_padding(self):
-    token_ids = tf.constant(
-        [[1, 0, 0, 0, 2], [3, 4, 0, 0, 0], [0, 5, 6, 0, 7]])
-    expected_mask = [[0, 1, 1, 1, 0], [0, 0, 1, 1, 1], [1, 0, 0, 1, 0]]
-
-    mask = model_utils.get_padding(token_ids, padding_value=0)
-
-    self.assertAllEqual(expected_mask, mask)
-
-  def test_get_padding_bias(self):
-    token_ids = tf.constant(
-        [[1, 0, 0, 0, 2], [3, 4, 0, 0, 0], [0, 5, 6, 0, 7]])
-    expected_flat = [
-        [0, NEG_INF, NEG_INF, NEG_INF, 0],
-        [0, 0, NEG_INF, NEG_INF, NEG_INF],
-        [NEG_INF, 0, 0, NEG_INF, 0],
-    ]
-
-    bias = model_utils.get_padding_bias(token_ids)
-    shape_of_bias = tf.shape(bias)
-    flat_bias = tf.reshape(bias, [3, 5])
-
-    self.assertAllEqual(expected_flat, flat_bias)
-    self.assertAllEqual([3, 1, 1, 5], shape_of_bias)
-
-  def test_get_decoder_self_attention_bias(self):
-    length = 5
-    # Row i is zero up to and including column i, NEG_INF after it.
-    causal_rows = [[0 if col <= row else NEG_INF
-                    for col in range(length)]
-                   for row in range(length)]
-
-    bias = model_utils.get_decoder_self_attention_bias(length)
-
-    self.assertAllEqual([[causal_rows]], bias)
-
-
-# Run the test cases above when this file is executed as a script.
-if __name__ == "__main__":
-  tf.test.main()
--- a/official/nlp/transformer/optimizer.py
+++ b/official/nlp/transformer/optimizer.py
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Optimizer from addons and learning rate scheduler."""
-
-import tensorflow as tf
-
-
-class LearningRateSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
-  """Learning rate schedule with linear warmup followed by rsqrt decay."""
-
-  def __init__(self, initial_learning_rate, hidden_size, warmup_steps):
-    """Stores the configuration of the schedule.
-
-    Args:
-      initial_learning_rate: A float, the initial learning rate.
-      hidden_size: An integer, the model dimension in the hidden layers.
-      warmup_steps: An integer, the number of steps required for linear warmup.
-    """
-    super().__init__()
-    self.initial_learning_rate = initial_learning_rate
-    self.hidden_size = hidden_size
-    self.warmup_steps = warmup_steps
-    # Float copy of warmup_steps used by the arithmetic in __call__.
-    self.warmup_steps_tensor = tf.cast(warmup_steps, tf.float32)
-
-  def __call__(self, global_step):
-    """Computes the learning rate for `global_step`.
-
-    Args:
-      global_step: An integer, the current global step used for learning rate
-        calculation.
-
-    Returns:
-      The learning rate to use at the given step: the initial rate scaled by
-      hidden_size**-0.5, by the warmup fraction (capped at 1) and divided by
-      sqrt(max(global_step, warmup_steps)).
-    """
-    with tf.name_scope('learning_rate_schedule'):
-      step = tf.cast(global_step, tf.float32)
-      warmup = self.warmup_steps_tensor
-      base_rate = self.initial_learning_rate * (self.hidden_size**-0.5)
-      # Linear warmup: ramps from 0 to 1 over the warmup steps.
-      rate = base_rate * tf.minimum(1.0, step / warmup)
-      # Rsqrt decay: constant during warmup, shrinking afterwards.
-      return rate / tf.sqrt(tf.maximum(step, warmup))
-
-  def get_config(self):
-    """Returns the constructor arguments as a dict."""
-    return dict(
-        initial_learning_rate=self.initial_learning_rate,
-        hidden_size=self.hidden_size,
-        warmup_steps=self.warmup_steps,
-    )
--- a/official/nlp/transformer/transformer.py
+++ b/official/nlp/transformer/transformer.py
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Defines the Transformer model in TF 2.0.
-
-Model paper: https://arxiv.org/pdf/1706.03762.pdf
-Transformer model code source: https://github.com/tensorflow/tensor2tensor
-"""
-
-import tensorflow as tf
-from official.nlp.modeling.layers import position_embedding
-from official.nlp.modeling.ops import beam_search
-from official.nlp.transformer import attention_layer
-from official.nlp.transformer import embedding_layer
-from official.nlp.transformer import ffn_layer
-from official.nlp.transformer import metrics
-from official.nlp.transformer import model_utils
-from official.nlp.transformer.utils.tokenizer import EOS_ID
-
-# Disable the not-callable lint error, since it claims many objects are not
-# callable when they actually are.
-# pylint: disable=not-callable
-
-
-def create_model(params, is_train):
-  """Creates the Keras functional model wrapping a `Transformer`.
-
-  Args:
-    params: Hyperparameter mapping. The training model reads "vocab_size",
-      "label_smoothing" and "enable_metrics_in_training" from it; it is also
-      handed to the `Transformer` constructor.
-    is_train: Whether to build the training model (inputs and targets in,
-      logits out, loss attached) or the inference model (inputs in, decoded
-      outputs and scores out).
-
-  Returns:
-    A `tf.keras.Model`.
-  """
-  with tf.name_scope("model"):
-    source = tf.keras.layers.Input((None,), dtype="int64", name="inputs")
-
-    if not is_train:
-      # Inference: the transformer decodes on its own and returns a dict.
-      transformer = Transformer(params, name="transformer_v2")
-      decoded = transformer([source], training=is_train)
-      return tf.keras.Model(source, [decoded["outputs"], decoded["scores"]])
-
-    target = tf.keras.layers.Input((None,), dtype="int64", name="targets")
-    transformer = Transformer(params, name="transformer_v2")
-    logits = transformer([source, target], training=is_train)
-    vocab_size = params["vocab_size"]
-    label_smoothing = params["label_smoothing"]
-    if params["enable_metrics_in_training"]:
-      logits = metrics.MetricLayer(vocab_size)([logits, target])
-    # Identity layer that names the output "logits" and sets its dtype to
-    # float32.
-    to_float32 = tf.keras.layers.Lambda(
-        lambda x: x, name="logits", dtype=tf.float32)
-    logits = to_float32(logits)
-
-    model = tf.keras.Model([source, target], logits)
-    model.add_loss(
-        metrics.transformer_loss(logits, target, label_smoothing, vocab_size))
-    return model
-
-
-class Transformer(tf.keras.Model):
-  """Transformer model with Keras.
-
-  Implemented as described in: https://arxiv.org/pdf/1706.03762.pdf
-
-  The Transformer model consists of an encoder and decoder. The input is an int
-  sequence (or a batch of sequences). The encoder produces a continuous
-  representation, and the decoder uses the encoder output to generate
-  probabilities for the output sequence.
-  """
-
-  def __init__(self, params, name=None):
-    """Initialize layers to build Transformer model.
-
-    Args:
-      params: hyperparameter object defining layer sizes, dropout values, etc.
-      name: name of the model.
-    """
-    super(Transformer, self).__init__(name=name)
-    self.params = params
-    # One layer object serves both the input embedding and (with
-    # mode="linear") the output projection to logits.
-    self.embedding_softmax_layer = embedding_layer.EmbeddingSharedWeights(
-        params["vocab_size"], params["hidden_size"])
-    self.encoder_stack = EncoderStack(params)
-    self.decoder_stack = DecoderStack(params)
-    self.position_embedding = position_embedding.RelativePositionEmbedding(
-        hidden_size=self.params["hidden_size"])
-
-  def get_config(self):
-    """Returns the config dict, which holds only the `params` object."""
-    return {
-        "params": self.params,
-    }
-
-  def call(self, inputs, training):
-    """Calculate target logits or inferred target sequences.
-
-    Args:
-      inputs: input tensor list of size 1 or 2.
-        First item, inputs: int tensor with shape [batch_size, input_length].
-        Second item (optional), targets: None or int tensor with shape
-          [batch_size, target_length].
-      training: boolean, whether in training mode or not.
-
-    Returns:
-      If targets is defined, then return logits for each word in the target
-      sequence. float tensor with shape [batch_size, target_length, vocab_size]
-      If target is none, then generate output sequence one token at a time.
-        returns a dictionary {
-          outputs: int tensor with shape [batch_size, decoded_length]
-          scores: float tensor with shape [batch_size]}
-      Even when float16 is used, the output tensor(s) are always float32.
-
-    Raises:
-      NotImplementedError: If try to use padded decode method on CPU/GPUs.
-    """
-    # Only a `list` is taken as-is; any other value (including a tuple) is
-    # wrapped as a single-element list.
-    inputs = inputs if isinstance(inputs, list) else [inputs]
-    if len(inputs) == 2:
-      inputs, targets = inputs[0], inputs[1]
-    else:
-      # Decoding path.
-      inputs, targets = inputs[0], None
-      if self.params["padded_decode"]:
-        # A falsy num_replicas (None or 0) is treated as "not on TPU".
-        if not self.params["num_replicas"]:
-          raise NotImplementedError(
-              "Padded decoding on CPU/GPUs is not supported.")
-        # Per-replica batch size; the static shape is pinned on the inputs.
-        decode_batch_size = int(self.params["decode_batch_size"] /
-                                self.params["num_replicas"])
-        inputs.set_shape([decode_batch_size, self.params["decode_max_length"]])
-
-    # Variance scaling is used here because it seems to work in many problems.
-    # Other reasonable initializers may also work just as well.
-    with tf.name_scope("Transformer"):
-      # Calculate attention bias for encoder self-attention and decoder
-      # multi-headed attention layers.
-      attention_bias = model_utils.get_padding_bias(inputs)
-
-      # Run the inputs through the encoder layer to map the symbol
-      # representations to continuous representations.
-      encoder_outputs = self.encode(inputs, attention_bias, training)
-      # Generate output sequence if targets is None, or return logits if target
-      # sequence is known.
-      if targets is None:
-        return self.predict(encoder_outputs, attention_bias, training)
-      else:
-        logits = self.decode(targets, encoder_outputs, attention_bias, training)
-        return logits
-
-  def encode(self, inputs, attention_bias, training):
-    """Generate continuous representation for inputs.
-
-    Args:
-      inputs: int tensor with shape [batch_size, input_length].
-      attention_bias: float tensor with shape [batch_size, 1, 1, input_length].
-      training: boolean, whether in training mode or not.
-
-    Returns:
-      float tensor with shape [batch_size, input_length, hidden_size]
-    """
-    with tf.name_scope("encode"):
-      # Prepare inputs to the layer stack by adding positional encodings and
-      # applying dropout.
-      embedded_inputs = self.embedding_softmax_layer(inputs)
-      embedded_inputs = tf.cast(embedded_inputs, self.params["dtype"])
-      inputs_padding = model_utils.get_padding(inputs)
-      attention_bias = tf.cast(attention_bias, self.params["dtype"])
-
-      with tf.name_scope("add_pos_encoding"):
-        pos_encoding = self.position_embedding(inputs=embedded_inputs)
-        pos_encoding = tf.cast(pos_encoding, self.params["dtype"])
-        encoder_inputs = embedded_inputs + pos_encoding
-
-      if training:
-        encoder_inputs = tf.nn.dropout(
-            encoder_inputs, rate=self.params["layer_postprocess_dropout"])
-
-      return self.encoder_stack(
-          encoder_inputs, attention_bias, inputs_padding, training=training)
-
-  def decode(self, targets, encoder_outputs, attention_bias, training):
-    """Generate logits for each value in the target sequence.
-
-    Args:
-      targets: target values for the output sequence. int tensor with shape
-        [batch_size, target_length]
-      encoder_outputs: continuous representation of input sequence. float tensor
-        with shape [batch_size, input_length, hidden_size]
-      attention_bias: float tensor with shape [batch_size, 1, 1, input_length]
-      training: boolean, whether in training mode or not.
-
-    Returns:
-      float32 tensor with shape [batch_size, target_length, vocab_size]
-    """
-    with tf.name_scope("decode"):
-      # Prepare inputs to decoder layers by shifting targets, adding positional
-      # encoding and applying dropout.
-      with tf.name_scope("shift_targets"):
-        # Shift targets to the right, and remove the last element
-        targets = tf.pad(targets, [[0, 0], [1, 0]])[:, :-1]
-      decoder_inputs = self.embedding_softmax_layer(targets)
-      decoder_inputs = tf.cast(decoder_inputs, self.params["dtype"])
-      attention_bias = tf.cast(attention_bias, self.params["dtype"])
-      with tf.name_scope("add_pos_encoding"):
-        # Dynamic target length; reused below to size the self-attention bias.
-        length = tf.shape(decoder_inputs)[1]
-        pos_encoding = self.position_embedding(decoder_inputs)
-        pos_encoding = tf.cast(pos_encoding, self.params["dtype"])
-        decoder_inputs += pos_encoding
-      if training:
-        decoder_inputs = tf.nn.dropout(
-            decoder_inputs, rate=self.params["layer_postprocess_dropout"])
-
-      # Run values
-      decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
-          length, dtype=self.params["dtype"])
-      outputs = self.decoder_stack(
-          decoder_inputs,
-          encoder_outputs,
-          decoder_self_attention_bias,
-          attention_bias,
-          training=training)
-      logits = self.embedding_softmax_layer(outputs, mode="linear")
-      logits = tf.cast(logits, tf.float32)
-      return logits
-
-  def _get_symbols_to_logits_fn(self, max_decode_length, training):
-    """Returns a decoding function that calculates logits of the next tokens.
-
-    Args:
-      max_decode_length: Maximum number of decoding steps. The timing signal is
-        built for `max_decode_length + 1` positions and the decoder
-        self-attention bias for `max_decode_length` positions.
-      training: Forwarded as `training` to the decoder stack on every step.
-
-    Returns:
-      A function `(ids, i, cache) -> (logits, cache)`.
-    """
-    # Both tensors are computed once here and sliced per step inside the
-    # closure below.
-    timing_signal = self.position_embedding(
-        inputs=None, length=max_decode_length + 1)
-    timing_signal = tf.cast(timing_signal, self.params["dtype"])
-    decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
-        max_decode_length, dtype=self.params["dtype"])
-
-    def symbols_to_logits_fn(ids, i, cache):
-      """Generate logits for next potential IDs.
-
-      Args:
-        ids: Current decoded sequences. int tensor with shape [batch_size *
-          beam_size, i + 1].
-        i: Loop index.
-        cache: dictionary of values storing the encoder output, encoder-decoder
-          attention bias, and previous decoder attention values.
-
-      Returns:
-        Tuple of
-          (logits with shape [batch_size * beam_size, vocab_size],
-           updated cache values)
-      """
-      # Set decoder input to the last generated IDs
-      decoder_input = ids[:, -1:]
-
-      # Preprocess decoder input by getting embeddings and adding timing signal.
-      decoder_input = self.embedding_softmax_layer(decoder_input)
-      decoder_input += timing_signal[i]
-      if self.params["padded_decode"]:
-        # Padded decode: take row i of the bias at its full static width.
-        bias_shape = decoder_self_attention_bias.shape.as_list()
-        self_attention_bias = tf.slice(
-            decoder_self_attention_bias, [0, 0, i, 0],
-            [bias_shape[0], bias_shape[1], 1, bias_shape[3]])
-      else:
-        # Otherwise take row i restricted to the first i + 1 columns.
-        self_attention_bias = decoder_self_attention_bias[:, :, i:i + 1, :i + 1]
-
-      decoder_outputs = self.decoder_stack(
-          decoder_input,
-          cache.get("encoder_outputs"),
-          self_attention_bias,
-          cache.get("encoder_decoder_attention_bias"),
-          training=training,
-          cache=cache,
-          decode_loop_step=i if self.params["padded_decode"] else None)
-      logits = self.embedding_softmax_layer(decoder_outputs, mode="linear")
-      # Drop the length-1 axis at position 1.
-      logits = tf.squeeze(logits, axis=[1])
-      return logits, cache
-
-    return symbols_to_logits_fn
-
-  def predict(self, encoder_outputs, encoder_decoder_attention_bias, training):
-    """Return predicted sequence.
-
-    Args:
-      encoder_outputs: Encoder output tensor; its first two dimensions are read
-        as the batch size and the input length.
-      encoder_decoder_attention_bias: Attention bias that is stored in the
-        decoding cache under "encoder_decoder_attention_bias".
-      training: Forwarded to the decoder stack through the per-step function.
-
-    Returns:
-      Dict with "outputs" (`decoded_ids[:, 0, 1:]` of the beam search result)
-      and "scores" (`scores[:, 0]`).
-    """
-    encoder_outputs = tf.cast(encoder_outputs, self.params["dtype"])
-    if self.params["padded_decode"]:
-      # Padded decode reads the static shape as Python values.
-      batch_size = encoder_outputs.shape.as_list()[0]
-      input_length = encoder_outputs.shape.as_list()[1]
-    else:
-      batch_size = tf.shape(encoder_outputs)[0]
-      input_length = tf.shape(encoder_outputs)[1]
-    max_decode_length = input_length + self.params["extra_decode_length"]
-    encoder_decoder_attention_bias = tf.cast(encoder_decoder_attention_bias,
-                                             self.params["dtype"])
-
-    symbols_to_logits_fn = self._get_symbols_to_logits_fn(
-        max_decode_length, training)
-
-    # Create initial set of IDs that will be passed into symbols_to_logits_fn.
-    initial_ids = tf.zeros([batch_size], dtype=tf.int32)
-
-    # Create cache storing decoder attention values for each layer.
-    # With padded decode the key/value buffers are allocated at the full
-    # decode length up front; otherwise they start with length 0.
-    # pylint: disable=g-complex-comprehension
-    init_decode_length = (
-        max_decode_length if self.params["padded_decode"] else 0)
-    num_heads = self.params["num_heads"]
-    dim_per_head = self.params["hidden_size"] // num_heads
-    cache = {
-        "layer_%d" % layer: {
-            "k":
-                tf.zeros(
-                    [batch_size, init_decode_length, num_heads, dim_per_head],
-                    dtype=self.params["dtype"]),
-            "v":
-                tf.zeros(
-                    [batch_size, init_decode_length, num_heads, dim_per_head],
-                    dtype=self.params["dtype"])
-        } for layer in range(self.params["num_hidden_layers"])
-    }
-    # pylint: enable=g-complex-comprehension
-
-    # Add encoder output and attention bias to the cache.
-    cache["encoder_outputs"] = encoder_outputs
-    cache["encoder_decoder_attention_bias"] = encoder_decoder_attention_bias
-
-    # Use beam search to find the top beam_size sequences and scores.
-    decoded_ids, scores = beam_search.sequence_beam_search(
-        symbols_to_logits_fn=symbols_to_logits_fn,
-        initial_ids=initial_ids,
-        initial_cache=cache,
-        vocab_size=self.params["vocab_size"],
-        beam_size=self.params["beam_size"],
-        alpha=self.params["alpha"],
-        max_decode_length=max_decode_length,
-        eos_id=EOS_ID,
-        padded_decode=self.params["padded_decode"],
-        dtype=self.params["dtype"])
-
-    # Get the top sequence for each batch element
-    # (index 0 on the beam axis; the first position is skipped, presumably
-    # because it holds the initial ID -- confirm against beam_search).
-    top_decoded_ids = decoded_ids[:, 0, 1:]
-    top_scores = scores[:, 0]
-
-    return {"outputs": top_decoded_ids, "scores": top_scores}
-
-
-class PrePostProcessingWrapper(tf.keras.layers.Layer):
-  """Wraps a layer with layer normalization, dropout and a residual add.
-
-  The input is normalized, passed through the wrapped layer, optionally
-  dropped out (training only) and finally added back onto the original input.
-  """
-
-  def __init__(self, layer, params):
-    """Stores the wrapped layer and reads the post-process dropout rate."""
-    super().__init__()
-    self.layer = layer
-    self.params = params
-    self.postprocess_dropout = params["layer_postprocess_dropout"]
-
-  def build(self, input_shape):
-    """Creates the float32 layer-normalization sublayer."""
-    self.layer_norm = tf.keras.layers.LayerNormalization(
-        epsilon=1e-6, dtype="float32")
-    super().build(input_shape)
-
-  def get_config(self):
-    """Returns the config dict; only `params` is recorded."""
-    return {"params": self.params}
-
-  def call(self, x, *args, **kwargs):
-    """Applies norm -> wrapped layer -> dropout (if training) -> residual.
-
-    Args:
-      x: input tensor; also used as the residual branch.
-      *args: extra positional arguments forwarded to the wrapped layer.
-      **kwargs: keyword arguments forwarded to the wrapped layer. Must contain
-        "training"; a KeyError is raised otherwise.
-
-    Returns:
-      `x` plus the (possibly dropped-out) output of the wrapped layer.
-    """
-    # Looked up first so a missing flag fails before any computation.
-    is_training = kwargs["training"]
-
-    normalized = self.layer_norm(x)
-    sublayer_output = self.layer(normalized, *args, **kwargs)
-
-    if is_training:
-      sublayer_output = tf.nn.dropout(
-          sublayer_output, rate=self.postprocess_dropout)
-    return x + sublayer_output
-
-
-class EncoderStack(tf.keras.layers.Layer):
-  """Stack of identical Transformer encoder layers.
-
-  Every layer consists of two wrapped sublayers:
-    1. Self-attention
-    2. Feedforward network
-  A final layer normalization is applied to the output of the last layer.
-  """
-
-  def __init__(self, params):
-    """Stores the hyperparameters; sublayers are created in `build`."""
-    super().__init__()
-    self.params = params
-    self.layers = []
-
-  def build(self, input_shape):
-    """Creates params["num_hidden_layers"] layers plus the output norm."""
-    hparams = self.params
-    hidden_size = hparams["hidden_size"]
-    for _ in range(hparams["num_hidden_layers"]):
-      attention = attention_layer.SelfAttention(
-          hidden_size, hparams["num_heads"], hparams["attention_dropout"])
-      ffn = ffn_layer.FeedForwardNetwork(
-          hidden_size, hparams["filter_size"], hparams["relu_dropout"])
-
-      # Each entry is [self-attention, feedforward], both wrapped with
-      # pre/post-processing.
-      self.layers.append([
-          PrePostProcessingWrapper(attention, hparams),
-          PrePostProcessingWrapper(ffn, hparams),
-      ])
-
-    self.output_normalization = tf.keras.layers.LayerNormalization(
-        epsilon=1e-6, dtype="float32")
-    super().build(input_shape)
-
-  def get_config(self):
-    """Returns the config dict; only `params` is recorded."""
-    return {"params": self.params}
-
-  def call(self, encoder_inputs, attention_bias, inputs_padding, training):
-    """Runs the inputs through every encoder layer and the final norm.
-
-    Args:
-      encoder_inputs: tensor with shape [batch_size, input_length, hidden_size]
-      attention_bias: bias for the encoder self-attention layer. [batch_size, 1,
-        1, input_length]
-      inputs_padding: tensor with shape [batch_size, input_length], inputs with
-        zero paddings. Not read by this method.
-      training: boolean, whether in training mode or not.
-
-    Returns:
-      Output of encoder layer stack.
-      float32 tensor with shape [batch_size, input_length, hidden_size]
-    """
-    hidden = encoder_inputs
-    for index, (self_attention, feed_forward) in enumerate(self.layers):
-      with tf.name_scope("layer_%d" % index):
-        with tf.name_scope("self_attention"):
-          hidden = self_attention(hidden, attention_bias, training=training)
-        with tf.name_scope("ffn"):
-          hidden = feed_forward(hidden, training=training)
-
-    return self.output_normalization(hidden)
-
-
-class DecoderStack(tf.keras.layers.Layer):
-  """Stack of identical Transformer decoder layers.
-
-  Every layer consists of three wrapped sublayers:
-    1. Self-attention
-    2. Multi-headed attention over the encoder outputs, fed with the result of
-       the preceding self-attention
-    3. Feedforward network
-  A final layer normalization is applied to the output of the last layer.
-  """
-
-  def __init__(self, params):
-    """Stores the hyperparameters; sublayers are created in `build`."""
-    super().__init__()
-    self.params = params
-    self.layers = []
-
-  def build(self, input_shape):
-    """Creates params["num_hidden_layers"] layers plus the output norm."""
-    hparams = self.params
-    hidden_size = hparams["hidden_size"]
-    num_heads = hparams["num_heads"]
-    attention_dropout = hparams["attention_dropout"]
-    for _ in range(hparams["num_hidden_layers"]):
-      self_attention = attention_layer.SelfAttention(
-          hidden_size, num_heads, attention_dropout)
-      encdec_attention = attention_layer.Attention(
-          hidden_size, num_heads, attention_dropout)
-      ffn = ffn_layer.FeedForwardNetwork(
-          hidden_size, hparams["filter_size"], hparams["relu_dropout"])
-
-      # Each entry is [self-attention, enc-dec attention, feedforward], all
-      # wrapped with pre/post-processing.
-      self.layers.append([
-          PrePostProcessingWrapper(self_attention, hparams),
-          PrePostProcessingWrapper(encdec_attention, hparams),
-          PrePostProcessingWrapper(ffn, hparams),
-      ])
-    self.output_normalization = tf.keras.layers.LayerNormalization(
-        epsilon=1e-6, dtype="float32")
-    super().build(input_shape)
-
-  def get_config(self):
-    """Returns the config dict; only `params` is recorded."""
-    return {"params": self.params}
-
-  def call(self,
-           decoder_inputs,
-           encoder_outputs,
-           decoder_self_attention_bias,
-           attention_bias,
-           training,
-           cache=None,
-           decode_loop_step=None):
-    """Runs the inputs through every decoder layer and the final norm.
-
-    Args:
-      decoder_inputs: A tensor with shape [batch_size, target_length,
-        hidden_size].
-      encoder_outputs: A tensor with shape [batch_size, input_length,
-        hidden_size]
-      decoder_self_attention_bias: A tensor with shape [1, 1, target_length,
-        target_length], the bias for decoder self-attention layer.
-      attention_bias: A tensor with shape [batch_size, 1, 1, input_length], the
-        bias for encoder-decoder attention layer.
-      training: A bool, whether in training mode or not.
-      cache: (Used for fast decoding) A nested dictionary storing previous
-        decoder self-attention values, indexed here by "layer_<n>". The cache
-        built by `predict` has the form:
-          {layer_n: {"k": A tensor with shape [batch_size, i, num_heads,
-                          dim_per_head],
-                     "v": A tensor with shape [batch_size, i, num_heads,
-                          dim_per_head]},
-                       ...}
-      decode_loop_step: An integer, the step number of the decoding loop. Used
-        only for autoregressive inference on TPU.
-
-    Returns:
-      Output of decoder layer stack.
-      float32 tensor with shape [batch_size, target_length, hidden_size]
-    """
-    hidden = decoder_inputs
-    for index, sublayers in enumerate(self.layers):
-      self_attention, encdec_attention, feed_forward = sublayers
-
-      scope = "layer_%d" % index
-      # Only the self-attention sublayer receives the per-layer cache.
-      layer_cache = None if cache is None else cache[scope]
-      with tf.name_scope(scope):
-        with tf.name_scope("self_attention"):
-          hidden = self_attention(
-              hidden,
-              decoder_self_attention_bias,
-              training=training,
-              cache=layer_cache,
-              decode_loop_step=decode_loop_step)
-        with tf.name_scope("encdec_attention"):
-          hidden = encdec_attention(
-              hidden, encoder_outputs, attention_bias, training=training)
-        with tf.name_scope("ffn"):
-          hidden = feed_forward(hidden, training=training)
-
-    return self.output_normalization(hidden)
--- a/official/nlp/transformer/transformer_forward_test.py
+++ b/official/nlp/transformer/transformer_forward_test.py
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Forward pass test for Transformer model refactoring."""
-
-import numpy as np
-
-import tensorflow as tf
-
-from official.nlp.modeling import models
-from official.nlp.transformer import metrics
-from official.nlp.transformer import model_params
-from official.nlp.transformer import transformer
-
-
-def _count_params(layer, trainable_only=True):
-  """Counts the parameters of `layer`.
-
-  Args:
-    layer: object exposing `count_params()` and `trainable_weights`.
-    trainable_only: if True, sum only over `layer.trainable_weights`;
-      otherwise return `layer.count_params()`.
-
-  Returns:
-    The parameter count.
-  """
-  if trainable_only:
-    per_weight_counts = [
-        tf.keras.backend.count_params(weight)
-        for weight in layer.trainable_weights
-    ]
-    return int(np.sum(per_weight_counts))
-  return layer.count_params()
-
-
-def _create_model(params, is_train):
-  """Builds the `models.Seq2SeqTransformer`-based Keras model.
-
-  Args:
-    params: hyperparameter mapping.
-    is_train: if True, build the training model taking [inputs, targets] and
-      producing logits, with the transformer loss attached; otherwise build the
-      inference model taking inputs and producing [outputs, scores].
-
-  Returns:
-    A `tf.keras.Model`.
-  """
-  stack_kwargs = {
-      "num_layers": params["num_hidden_layers"],
-      "num_attention_heads": params["num_heads"],
-      "intermediate_size": params["filter_size"],
-      "activation": "relu",
-      "dropout_rate": params["relu_dropout"],
-      "attention_dropout_rate": params["attention_dropout"],
-      "use_bias": False,
-      "norm_first": True,
-      "norm_epsilon": 1e-6,
-      "intermediate_dropout": params["relu_dropout"],
-  }
-  # Encoder and decoder stacks share the same construction arguments.
-  encoder_layer = models.TransformerEncoder(**stack_kwargs)
-  decoder_layer = models.TransformerDecoder(**stack_kwargs)
-
-  seq2seq_kwargs = {
-      "vocab_size": params["vocab_size"],
-      "embedding_width": params["hidden_size"],
-      "dropout_rate": params["layer_postprocess_dropout"],
-      "padded_decode": params["padded_decode"],
-      "decode_max_length": params["decode_max_length"],
-      "dtype": params["dtype"],
-      "extra_decode_length": params["extra_decode_length"],
-      "beam_size": params["beam_size"],
-      "alpha": params["alpha"],
-      "encoder_layer": encoder_layer,
-      "decoder_layer": decoder_layer,
-      "name": "transformer_v2",
-  }
-
-  if not is_train:
-    # Inference: the batch size is fixed only when padded decoding is on.
-    if params["padded_decode"]:
-      batch_size = params["decode_batch_size"]
-    else:
-      batch_size = None
-    inputs = tf.keras.layers.Input((None,),
-                                   batch_size=batch_size,
-                                   dtype="int64",
-                                   name="inputs")
-    internal_model = models.Seq2SeqTransformer(**seq2seq_kwargs)
-    decoded = internal_model({"inputs": inputs}, training=is_train)
-    return tf.keras.Model(inputs, [decoded["outputs"], decoded["scores"]])
-
-  # Training: the model consumes both inputs and targets and emits logits.
-  inputs = tf.keras.layers.Input((None,), dtype="int64", name="inputs")
-  targets = tf.keras.layers.Input((None,), dtype="int64", name="targets")
-  internal_model = models.Seq2SeqTransformer(**seq2seq_kwargs)
-  logits = internal_model(
-      {"inputs": inputs, "targets": targets}, training=is_train)
-  vocab_size = params["vocab_size"]
-  label_smoothing = params["label_smoothing"]
-  if params["enable_metrics_in_training"]:
-    logits = metrics.MetricLayer(vocab_size)([logits, targets])
-  # Identity layer that names the output "logits" and sets its dtype to
-  # float32.
-  identity = tf.keras.layers.Lambda(
-      lambda x: x, name="logits", dtype=tf.float32)
-  logits = identity(logits)
-  model = tf.keras.Model([inputs, targets], logits)
-  model.add_loss(
-      metrics.transformer_loss(logits, targets, label_smoothing, vocab_size))
-  return model
-
-
-class TransformerForwardTest(tf.test.TestCase):
-  """Checks the refactored model reproduces the original forward pass.
-
-  Each test builds the original model (`transformer.create_model`) and the
-  refactored one (`_create_model`), copies the weights from the former into
-  the latter, and asserts both produce equal outputs for the same inputs.
-  """
-
-  def setUp(self):
-    """Prepares a small, dropout-free hyperparameter set."""
-    super().setUp()
-    # Work on a copy so the overrides below do not leak into the module-level
-    # TINY_PARAMS shared with every other user of model_params.
-    # NOTE(review): assumes TINY_PARAMS is a dict-like exposing .copy().
-    self.params = params = model_params.TINY_PARAMS.copy()
-    params["batch_size"] = params["default_batch_size"] = 16
-    params["hidden_size"] = 12
-    params["num_hidden_layers"] = 3
-    params["filter_size"] = 14
-    params["num_heads"] = 2
-    params["vocab_size"] = 41
-    params["extra_decode_length"] = 0
-    params["beam_size"] = 3
-    params["dtype"] = tf.float32
-    # All dropout is disabled so that both models are deterministic.
-    params["layer_postprocess_dropout"] = 0.0
-    params["attention_dropout"] = 0.0
-    params["relu_dropout"] = 0.0
-
-  def test_forward_pass_train(self):
-    """Training-mode logits of both models must be equal."""
-    # Set input_len different from target_len
-    inputs = np.asarray([[5, 2, 1], [7, 5, 0], [1, 4, 0], [7, 5, 11]])
-    targets = np.asarray([[4, 3, 4, 0], [13, 19, 17, 8], [20, 14, 1, 2],
-                          [5, 7, 3, 0]])
-
-    # src_model is the original model before refactored.
-    src_model = transformer.create_model(self.params, True)
-    src_num_weights = _count_params(src_model)
-    src_weights = src_model.get_weights()
-    src_model_output = src_model([inputs, targets], training=True)
-
-    # dest_model is the refactored model.
-    dest_model = _create_model(self.params, True)
-    dest_num_weights = _count_params(dest_model)
-    self.assertEqual(src_num_weights, dest_num_weights)
-    dest_model.set_weights(src_weights)
-    dest_model_output = dest_model([inputs, targets], training=True)
-    self.assertAllEqual(src_model_output, dest_model_output)
-
-  def test_forward_pass_not_train(self):
-    """Inference-mode outputs and scores of both models must be equal."""
-    inputs = np.asarray([[5, 2, 1], [7, 5, 0], [1, 4, 0], [7, 5, 11]])
-
-    # src_model is the original model before refactored.
-    src_model = transformer.create_model(self.params, False)
-    src_num_weights = _count_params(src_model)
-    src_weights = src_model.get_weights()
-    src_model_output = src_model([inputs], training=False)
-
-    # dest_model is the refactored model.
-    dest_model = _create_model(self.params, False)
-    dest_num_weights = _count_params(dest_model)
-    self.assertEqual(src_num_weights, dest_num_weights)
-    dest_model.set_weights(src_weights)
-    dest_model_output = dest_model([inputs], training=False)
-    # Index 0 holds the decoded outputs, index 1 the scores.
-    self.assertAllEqual(src_model_output[0], dest_model_output[0])
-    self.assertAllEqual(src_model_output[1], dest_model_output[1])
-
-
-# Run the test cases through the TensorFlow test runner when executed directly.
-if __name__ == "__main__":
-  tf.test.main()
--- a/official/nlp/transformer/transformer_layers_test.py
+++ b/official/nlp/transformer/transformer_layers_test.py
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Tests for layers in Transformer."""
-
-import tensorflow as tf
-
-from official.nlp.transformer import attention_layer
-from official.nlp.transformer import embedding_layer
-from official.nlp.transformer import ffn_layer
-from official.nlp.transformer import metrics
-
-
-class TransformerLayersTest(tf.test.TestCase):
-  """Config and output-shape checks for the individual Transformer layers."""
-
-  def test_attention_layer(self):
-    """SelfAttention reports its config and grows the cache it is given."""
-    hidden_size = 64
-    num_heads = 4
-    dropout = 0.5
-    dim_per_head = hidden_size // num_heads
-    layer = attention_layer.SelfAttention(hidden_size, num_heads, dropout)
-    expected_config = {
-        "hidden_size": hidden_size,
-        "num_heads": num_heads,
-        "attention_dropout": dropout,
-    }
-    self.assertDictEqual(layer.get_config(), expected_config)
-
-    length = 2
-    x = tf.ones([1, length, hidden_size])
-    bias = tf.ones([1])
-    # The cache starts with zero cached positions.
-    empty_kv_shape = [1, 0, num_heads, dim_per_head]
-    cache = {
-        "k": tf.zeros(empty_kv_shape),
-        "v": tf.zeros(empty_kv_shape),
-    }
-    y = layer(x, bias, training=True, cache=cache)
-    self.assertEqual(y.shape, (1, length, 64))
-    # After the call the cache must hold `length` positions.
-    filled_kv_shape = (1, length, num_heads, dim_per_head)
-    self.assertEqual(cache["k"].shape, filled_kv_shape)
-    self.assertEqual(cache["v"].shape, filled_kv_shape)
-
-  def test_embedding_shared_weights(self):
-    """EmbeddingSharedWeights embeds ids and projects in "linear" mode."""
-    vocab_size = 50
-    hidden_size = 64
-    length = 2
-    layer = embedding_layer.EmbeddingSharedWeights(vocab_size, hidden_size)
-    expected_config = {
-        "vocab_size": 50,
-        "hidden_size": 64,
-    }
-    self.assertDictEqual(layer.get_config(), expected_config)
-
-    # Default mode: integer ids in, hidden vectors out.
-    idx = tf.ones([1, length], dtype="int32")
-    embedded = layer(idx)
-    self.assertEqual(embedded.shape, (1, length, hidden_size))
-
-    # "linear" mode: hidden vectors in, vocabulary-sized output.
-    x = tf.ones([1, length, hidden_size])
-    output = layer(x, "linear")
-    self.assertEqual(output.shape, (1, length, vocab_size))
-
-  def test_feed_forward_network(self):
-    """FeedForwardNetwork reports its config and keeps the input shape."""
-    hidden_size = 64
-    filter_size = 32
-    relu_dropout = 0.5
-    layer = ffn_layer.FeedForwardNetwork(hidden_size, filter_size, relu_dropout)
-    expected_config = {
-        "hidden_size": hidden_size,
-        "filter_size": filter_size,
-        "relu_dropout": relu_dropout,
-    }
-    self.assertDictEqual(layer.get_config(), expected_config)
-
-    length = 2
-    x = tf.ones([1, length, hidden_size])
-    y = layer(x, training=True)
-    self.assertEqual(y.shape, (1, length, hidden_size))
-
-  def test_metric_layer(self):
-    """MetricLayer returns logits with an unchanged symbolic shape."""
-    vocab_size = 50
-    logits = tf.keras.layers.Input((None, vocab_size),
-                                   dtype="float32",
-                                   name="logits")
-    targets = tf.keras.layers.Input((None,), dtype="int64", name="targets")
-    output_logits = metrics.MetricLayer(vocab_size)([logits, targets])
-    expected_shape = [None, None, vocab_size]
-    self.assertEqual(output_logits.shape.as_list(), expected_shape)
-
-
-# Run the test cases through the TensorFlow test runner when executed directly.
-if __name__ == "__main__":
-  tf.test.main()
--- a/official/nlp/transformer/transformer_main.py
+++ b/official/nlp/transformer/transformer_main.py
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Train and evaluate the Transformer model.
-
-See README for description of setting the training schedule and evaluating the
-BLEU score.
-"""
-
-import os
-import tempfile
-
-# Import libraries
-from absl import app
-from absl import flags
-from absl import logging
-import tensorflow as tf
-from official.common import distribute_utils
-from official.modeling import performance
-from official.nlp.transformer import compute_bleu
-from official.nlp.transformer import data_pipeline
-from official.nlp.transformer import metrics
-from official.nlp.transformer import misc
-from official.nlp.transformer import optimizer
-from official.nlp.transformer import transformer
-from official.nlp.transformer import translate
-from official.nlp.transformer.utils import tokenizer
-from official.utils.flags import core as flags_core
-from official.utils.misc import keras_utils
-# pylint:disable=logging-format-interpolation
-
-# Module-level constants.
-# NOTE(review): none of these is referenced in the part of the file reviewed
-# here; confirm their meaning against the call sites.
-INF = int(1e9)  # 1,000,000,000 as an int.
-BLEU_DIR = "bleu"
-_SINGLE_SAMPLE = 1
-
-
-def translate_and_compute_bleu(model,
-                               params,
-                               subtokenizer,
-                               bleu_source,
-                               bleu_ref,
-                               distribution_strategy=None):
-  """Translate file and report the cased and uncased bleu scores.
-
-  The translations are written to a temporary file which is always removed
-  before returning, including when translation or scoring raises.
-
-  Args:
-    model: A Keras model, used to generate the translations.
-    params: A dictionary, containing the translation related parameters.
-    subtokenizer: A subtokenizer object, used for encoding and decoding source
-      and translated lines.
-    bleu_source: A file containing source sentences for translation.
-    bleu_ref: A file containing the reference for the translated sentences.
-    distribution_strategy: A platform distribution strategy, used for TPU based
-      translation.
-
-  Returns:
-    uncased_score: A float, the case insensitive BLEU score.
-    cased_score: A float, the case sensitive BLEU score.
-  """
-  # Create temporary file to store translation. Only its path is needed, so
-  # the handle is closed right away instead of being leaked; delete=False
-  # keeps the file on disk for translate_file to write into.
-  tmp = tempfile.NamedTemporaryFile(delete=False)
-  tmp_filename = tmp.name
-  tmp.close()
-
-  try:
-    translate.translate_file(
-        model,
-        params,
-        subtokenizer,
-        bleu_source,
-        output_file=tmp_filename,
-        print_all_translations=False,
-        distribution_strategy=distribution_strategy)
-
-    # Compute uncased and cased bleu scores.
-    uncased_score = compute_bleu.bleu_wrapper(bleu_ref, tmp_filename, False)
-    cased_score = compute_bleu.bleu_wrapper(bleu_ref, tmp_filename, True)
-  finally:
-    # Remove the temporary file even when translation or scoring fails.
-    os.remove(tmp_filename)
-  return uncased_score, cased_score
-
-
-def evaluate_and_log_bleu(model,
-                          params,
-                          bleu_source,
-                          bleu_ref,
-                          vocab_file,
-                          distribution_strategy=None):
-  """Computes the BLEU scores for a source/reference pair and logs them.
-
-  Args:
-    model: A Keras model, used to generate the translations.
-    params: A dictionary, containing the translation related parameters.
-    bleu_source: A file containing source sentences for translation.
-    bleu_ref: A file containing the reference for the translated sentences.
-    vocab_file: A file containing the vocabulary for translation; used to
-      build the subtokenizer.
-    distribution_strategy: A platform distribution strategy, used for TPU based
-      translation.
-
-  Returns:
-    uncased_score: A float, the case insensitive BLEU score.
-    cased_score: A float, the case sensitive BLEU score.
-  """
-  scores = translate_and_compute_bleu(
-      model,
-      params,
-      tokenizer.Subtokenizer(vocab_file),
-      bleu_source,
-      bleu_ref,
-      distribution_strategy)
-  uncased_score, cased_score = scores
-
-  logging.info("Bleu score (uncased): %s", uncased_score)
-  logging.info("Bleu score (cased): %s", cased_score)
-  return uncased_score, cased_score
-
-
-class TransformerTask(object):
-  """Main entry of Transformer model."""
-
-  def __init__(self, flags_obj):
-    """Builds the parameter dict and distribution strategy from the flags.
-
-    Args:
-      flags_obj: Object containing parsed flag values, i.e., FLAGS.
-
-    Raises:
-      ValueError: if not using static batch for input data on TPU.
-    """
-    self.flags_obj = flags_obj
-    self.predict_model = None
-
-    # Start from the named parameter set, then overlay flag-defined values.
-    num_gpus = flags_core.get_num_gpus(flags_obj)
-    self.params = params = misc.get_model_params(flags_obj.param_set, num_gpus)
-
-    params.update({
-        "num_gpus": num_gpus,
-        "use_ctl": flags_obj.use_ctl,
-        "data_dir": flags_obj.data_dir,
-        "model_dir": flags_obj.model_dir,
-        "static_batch": flags_obj.static_batch,
-        "max_length": flags_obj.max_length,
-        "decode_batch_size": flags_obj.decode_batch_size,
-        "decode_max_length": flags_obj.decode_max_length,
-        "padded_decode": flags_obj.padded_decode,
-        # Falls back to AUTOTUNE when the flag is unset (falsy).
-        "max_io_parallelism": (
-            flags_obj.num_parallel_calls or tf.data.experimental.AUTOTUNE),
-    })
-
-    params.update({
-        "use_synthetic_data": flags_obj.use_synthetic_data,
-        # Falls back to the parameter set's default when the flag is unset.
-        "batch_size": flags_obj.batch_size or params["default_batch_size"],
-        "repeat_dataset": None,
-        "dtype": flags_core.get_tf_dtype(flags_obj),
-        "enable_tensorboard": flags_obj.enable_tensorboard,
-        "enable_metrics_in_training": flags_obj.enable_metrics_in_training,
-        "steps_between_evals": flags_obj.steps_between_evals,
-        "enable_checkpointing": flags_obj.enable_checkpointing,
-        "save_weights_only": flags_obj.save_weights_only,
-    })
-
-    strategy = distribute_utils.get_distribution_strategy(
-        distribution_strategy=flags_obj.distribution_strategy,
-        num_gpus=num_gpus,
-        all_reduce_alg=flags_obj.all_reduce_alg,
-        num_packs=flags_obj.num_packs,
-        tpu_address=flags_obj.tpu or "")
-    self.distribution_strategy = strategy
-
-    if not self.use_tpu:
-      logging.info("Running transformer with num_gpus = %d", num_gpus)
-    else:
-      params["num_replicas"] = strategy.num_replicas_in_sync
-
-    if not strategy:
-      logging.info("Not using any distribution strategy.")
-    else:
-      logging.info("For training, using distribution strategy: %s", strategy)
-
-    performance.set_mixed_precision_policy(params["dtype"])
-
-  @property
-  def use_tpu(self):
-    """True when a distribution strategy is set and it is a TPUStrategy."""
-    strategy = self.distribution_strategy
-    if not strategy:
-      return False
-    return isinstance(strategy, tf.distribute.TPUStrategy)
-
-  def train(self):
-    """Trains the model, evaluating BLEU between iterations when configured.
-
-    The model and optimizer are created under the distribution strategy scope
-    and the latest checkpoint found in `flags_obj.model_dir` is restored.
-    Training then proceeds in iterations of at most
-    `flags_obj.steps_between_evals` steps until `flags_obj.train_steps` is
-    reached. With params["use_ctl"] a custom training loop is used (TPU only);
-    otherwise `model.fit` is used (non-TPU only). When both
-    `flags_obj.bleu_source` and `flags_obj.bleu_ref` are set, `self.eval()` is
-    run after every iteration.
-
-    Returns:
-      A stats dict. It holds "loss" when the custom training loop ran, whatever
-      `misc.update_stats` adds, and the latest BLEU scores plus their history
-      when BLEU evaluation ran.
-
-    Raises:
-      NotImplementedError: if the custom training loop is requested without a
-        TPU strategy, or `model.fit` is requested with one.
-    """
-    params = self.params
-    flags_obj = self.flags_obj
-    # Sets config options.
-    keras_utils.set_session_config(enable_xla=flags_obj.enable_xla)
-
-    _ensure_dir(flags_obj.model_dir)
-    with distribute_utils.get_strategy_scope(self.distribution_strategy):
-      model = transformer.create_model(params, is_train=True)
-      opt = self._create_optimizer()
-
-      current_step = 0
-      checkpoint = tf.train.Checkpoint(model=model, optimizer=opt)
-      latest_checkpoint = tf.train.latest_checkpoint(flags_obj.model_dir)
-      if latest_checkpoint:
-        checkpoint.restore(latest_checkpoint)
-        logging.info("Loaded checkpoint %s", latest_checkpoint)
-        # Resume the step count from the restored optimizer.
-        current_step = opt.iterations.numpy()
-
-      if params["use_ctl"]:
-        train_loss_metric = tf.keras.metrics.Mean(
-            "training_loss", dtype=tf.float32)
-        if params["enable_tensorboard"]:
-          summary_writer = tf.summary.create_file_writer(
-              os.path.join(flags_obj.model_dir, "summary"))
-        else:
-          summary_writer = tf.summary.create_noop_writer()
-        train_metrics = [train_loss_metric]
-        if params["enable_metrics_in_training"]:
-          train_metrics = train_metrics + model.metrics
-      else:
-        model.compile(opt)
-
-    model.summary()
-
-    if self.use_tpu:
-      # Different from experimental_distribute_dataset,
-      # distribute_datasets_from_function requires
-      # per-replica/local batch size.
-      # NOTE(review): true division turns batch_size into a float; confirm the
-      # data pipeline expects that before changing it to floor division.
-      params["batch_size"] /= self.distribution_strategy.num_replicas_in_sync
-      train_ds = (
-          self.distribution_strategy.distribute_datasets_from_function(
-              lambda ctx: data_pipeline.train_input_fn(params, ctx)))
-    else:
-      train_ds = data_pipeline.train_input_fn(params)
-      map_data_fn = data_pipeline.map_data_for_transformer_fn
-      train_ds = train_ds.map(
-          map_data_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)
-    if params["use_ctl"]:
-      train_ds_iterator = iter(train_ds)
-
-    callbacks = self._create_callbacks(flags_obj.model_dir, params)
-
-    # Only TimeHistory callback is supported for CTL
-    if params["use_ctl"]:
-      callbacks = [cb for cb in callbacks
-                   if isinstance(cb, keras_utils.TimeHistory)]
-
-    @tf.function
-    def train_steps(iterator, steps):
-      """Training steps function for TPU runs.
-
-      Args:
-        iterator: The input iterator of the training dataset.
-        steps: An integer, the number of training steps.
-
-      Returns:
-        Nothing; the loss is accumulated in `train_loss_metric`.
-      """
-
-      def _step_fn(inputs):
-        """Per-replica step function."""
-        inputs, targets = inputs
-        with tf.GradientTape() as tape:
-          logits = model([inputs, targets], training=True)
-          loss = metrics.transformer_loss(logits, targets,
-                                          params["label_smoothing"],
-                                          params["vocab_size"])
-          # Scales the loss, which results in using the average loss across all
-          # of the replicas for backprop.
-          scaled_loss = loss / self.distribution_strategy.num_replicas_in_sync
-
-        # De-dupes variables due to keras tracking issues.
-        tvars = list({id(v): v for v in model.trainable_variables}.values())
-        grads = tape.gradient(scaled_loss, tvars)
-        opt.apply_gradients(zip(grads, tvars))
-        # For reporting, the metric takes the mean of losses.
-        train_loss_metric.update_state(loss)
-
-      for _ in tf.range(steps):
-        # The metric is reset every step, so it reports the last step's loss.
-        train_loss_metric.reset_states()
-        self.distribution_strategy.run(
-            _step_fn, args=(next(iterator),))
-
-    cased_score, uncased_score = None, None
-    cased_score_history, uncased_score_history = [], []
-    # Bound up front so the stats code below also works when the restored
-    # checkpoint is already at or past train_steps and the loop never runs.
-    history = None
-    train_loss = None
-    while current_step < flags_obj.train_steps:
-      remaining_steps = flags_obj.train_steps - current_step
-      train_steps_per_eval = (
-          remaining_steps if remaining_steps < flags_obj.steps_between_evals
-          else flags_obj.steps_between_evals)
-      current_iteration = current_step // flags_obj.steps_between_evals
-
-      logging.info(
-          "Start train iteration at global step:{}".format(current_step))
-      history = None
-      if params["use_ctl"]:
-        if not self.use_tpu:
-          raise NotImplementedError(
-              "Custom training loop on GPUs is not implemented.")
-
-        # Runs training steps.
-        with summary_writer.as_default():
-          for cb in callbacks:
-            cb.on_epoch_begin(current_iteration)
-            cb.on_batch_begin(0)
-
-          train_steps(
-              train_ds_iterator,
-              tf.convert_to_tensor(train_steps_per_eval, dtype=tf.int32))
-          current_step += train_steps_per_eval
-          train_loss = train_loss_metric.result().numpy().astype(float)
-          logging.info("Train Step: %d/%d / loss = %s", current_step,
-                       flags_obj.train_steps, train_loss)
-
-          for cb in callbacks:
-            cb.on_batch_end(train_steps_per_eval - 1)
-            cb.on_epoch_end(current_iteration)
-
-          if params["enable_tensorboard"]:
-            for metric_obj in train_metrics:
-              tf.summary.scalar(metric_obj.name, metric_obj.result(),
-                                current_step)
-            # One flush after all metrics are written is sufficient.
-            summary_writer.flush()
-
-        for cb in callbacks:
-          cb.on_train_end()
-
-        if flags_obj.enable_checkpointing:
-          # avoid check-pointing when running for benchmarking.
-          checkpoint_name = checkpoint.save(
-              os.path.join(flags_obj.model_dir,
-                           "ctl_step_{}.ckpt".format(current_step)))
-          logging.info("Saved checkpoint to %s", checkpoint_name)
-      else:
-        if self.use_tpu:
-          raise NotImplementedError(
-              "Keras model.fit on TPUs is not implemented.")
-        history = model.fit(
-            train_ds,
-            initial_epoch=current_iteration,
-            epochs=current_iteration + 1,
-            steps_per_epoch=train_steps_per_eval,
-            callbacks=callbacks,
-            # If TimeHistory is enabled, progress bar would be messy. Increase
-            # the verbose level to get rid of it.
-            verbose=(2 if flags_obj.enable_time_history else 1))
-        current_step += train_steps_per_eval
-        logging.info("Train history: {}".format(history.history))
-
-      logging.info("End train iteration at global step:{}".format(current_step))
-
-      if (flags_obj.bleu_source and flags_obj.bleu_ref):
-        uncased_score, cased_score = self.eval()
-        cased_score_history.append([current_iteration + 1, cased_score])
-        uncased_score_history.append([current_iteration + 1, uncased_score])
-
-    # The loss is reported directly only for the custom training loop; with
-    # model.fit it is left to misc.update_stats and the history object.
-    stats = {}
-    if history is None and train_loss is not None:
-      stats["loss"] = train_loss
-    misc.update_stats(history, stats, callbacks)
-    # Compare against None so that a legitimate BLEU score of 0.0 is reported.
-    if uncased_score is not None and cased_score is not None:
-      stats["bleu_uncased"] = uncased_score
-      stats["bleu_cased"] = cased_score
-      stats["bleu_uncased_history"] = uncased_score_history
-      stats["bleu_cased_history"] = cased_score_history
-    return stats
-
-  def eval(self):
-    """Builds (or reuses) the prediction model and scores it with BLEU.
-
-    The newest checkpoint found under `flags_obj.model_dir` is loaded into the
-    prediction model before scoring.
-
-    Returns:
-      Whatever `evaluate_and_log_bleu` returns for the configured BLEU source
-      and reference files.
-    """
-    # Only the TPU case passes a real strategy; otherwise None is handed to
-    # both the scope helper and the BLEU helper.
-    strategy = None
-    if self.use_tpu:
-      strategy = self.distribution_strategy
-
-    with distribute_utils.get_strategy_scope(strategy):
-      if not self.predict_model:
-        self.predict_model = transformer.create_model(self.params, False)
-      latest_ckpt = tf.train.latest_checkpoint(self.flags_obj.model_dir)
-      self._load_weights_if_possible(self.predict_model, latest_ckpt)
-      self.predict_model.summary()
-
-    flags_obj = self.flags_obj
-    return evaluate_and_log_bleu(
-        self.predict_model, self.params, flags_obj.bleu_source,
-        flags_obj.bleu_ref, flags_obj.vocab_file, strategy)
-
-  def predict(self):
-    """Runs inference on a capped evaluation dataset and logs the output."""
-    params, flags_obj = self.params, self.flags_obj
-
-    with tf.name_scope("model"):
-      model = transformer.create_model(params, is_train=False)
-      latest_ckpt = tf.train.latest_checkpoint(self.flags_obj.model_dir)
-      self._load_weights_if_possible(model, latest_ckpt)
-      model.summary()
-    subtokenizer = tokenizer.Subtokenizer(flags_obj.vocab_file)
-
-    # Keep only the first element of each (x, y) pair and cap the dataset at
-    # _SINGLE_SAMPLE entries.
-    dataset = data_pipeline.eval_input_fn(params)
-    dataset = dataset.map(lambda x, y: x).take(_SINGLE_SAMPLE)
-    val_outputs, _ = model.predict(dataset)
-    for decoded_ids in val_outputs:
-      translate.translate_from_input(decoded_ids, subtokenizer)
-
-  def _create_callbacks(self, cur_log_dir, params):
-    """Returns the training callbacks, adding checkpointing when enabled.
-
-    Args:
-      cur_log_dir: Directory under which the per-epoch checkpoint files go.
-      params: Mapping read for "enable_checkpointing" and "save_weights_only".
-
-    Returns:
-      The result of `misc.get_callbacks()`, with a `ModelCheckpoint` callback
-      appended when params["enable_checkpointing"] is truthy.
-    """
-    callbacks = misc.get_callbacks()
-    if not params["enable_checkpointing"]:
-      return callbacks
-
-    checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
-        os.path.join(cur_log_dir, "cp-{epoch:04d}.ckpt"),
-        save_weights_only=params["save_weights_only"])
-    callbacks.append(checkpoint_callback)
-    return callbacks
-
-  def _load_weights_if_possible(self, model, init_weight_path=None):
-    """Restores `model` from `init_weight_path` when a path is given.
-
-    Args:
-      model: Model whose weights should be restored.
-      init_weight_path: Checkpoint path. When falsy, nothing is loaded and the
-        fact is only logged.
-    """
-    if not init_weight_path:
-      logging.info("Weights not loaded from path:{}".format(init_weight_path))
-      return
-
-    logging.info("Load weights: {}".format(init_weight_path))
-    if not self.use_tpu:
-      model.load_weights(init_weight_path)
-      return
-
-    # On TPU the restore goes through a tf.train.Checkpoint that tracks the
-    # model together with a freshly created optimizer.
-    tpu_checkpoint = tf.train.Checkpoint(
-        model=model, optimizer=self._create_optimizer())
-    tpu_checkpoint.restore(init_weight_path)
-
-  def _create_optimizer(self):
-    """Builds the Adam optimizer configured from `self.params`.
-
-    Returns:
-      The optimizer returned by `performance.configure_optimizer`.
-    """
-    hparams = self.params
-    schedule = optimizer.LearningRateSchedule(
-        hparams["learning_rate"], hparams["hidden_size"],
-        hparams["learning_rate_warmup_steps"])
-    adam = tf.keras.optimizers.Adam(
-        schedule,
-        hparams["optimizer_adam_beta1"],
-        hparams["optimizer_adam_beta2"],
-        epsilon=hparams["optimizer_adam_epsilon"])
-
-    # The loss scale is resolved from the flags, with "dynamic" as the fp16
-    # default.
-    wants_fp16 = hparams["dtype"] == tf.float16
-    loss_scale = flags_core.get_loss_scale(
-        self.flags_obj, default_for_fp16="dynamic")
-    return performance.configure_optimizer(
-        adam, use_float16=wants_fp16, loss_scale=loss_scale)
-
-
-def _ensure_dir(log_dir):
-  """Creates `log_dir` through tf.io.gfile unless it already exists."""
-  if tf.io.gfile.exists(log_dir):
-    return
-  tf.io.gfile.makedirs(log_dir)
-
-
-def main(_):
-  """Builds a TransformerTask from the flags and runs the requested mode.
-
-  Raises:
-    ValueError: If `flags_obj.mode` is not "train", "predict" or "eval".
-  """
-  flags_obj = flags.FLAGS
-  if flags_obj.enable_mlir_bridge:
-    tf.config.experimental.enable_mlir_bridge()
-  task = TransformerTask(flags_obj)
-
-  # Execute flag override logic for better model performance
-  if flags_obj.tf_gpu_thread_mode:
-    keras_utils.set_gpu_thread_mode_and_count(
-        per_gpu_thread_count=flags_obj.per_gpu_thread_count,
-        gpu_thread_mode=flags_obj.tf_gpu_thread_mode,
-        num_gpus=flags_obj.num_gpus,
-        datasets_num_private_threads=flags_obj.datasets_num_private_threads)
-
-  # Map each supported mode name onto the task method that implements it.
-  mode = flags_obj.mode
-  runners = {
-      "train": task.train,
-      "predict": task.predict,
-      "eval": task.eval,
-  }
-  if mode not in runners:
-    raise ValueError("Invalid mode {}".format(mode))
-  runners[mode]()
-
-
-# Script entry point: raise log verbosity to INFO, register the transformer
-# flags, then run main() through app.run.
-if __name__ == "__main__":
-  logging.set_verbosity(logging.INFO)
-  misc.define_transformer_flags()
-  app.run(main)
--- a/official/nlp/transformer/transformer_main_test.py
+++ b/official/nlp/transformer/transformer_main_test.py
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Test Transformer model."""
-
-import os
-import re
-import sys
-import unittest
-
-from absl import flags
-from absl.testing import flagsaver
-import tensorflow as tf
-from tensorflow.python.eager import context  # pylint: disable=ungrouped-imports
-from official.nlp.transformer import misc
-from official.nlp.transformer import transformer_main
-
-FLAGS = flags.FLAGS
-FIXED_TIMESTAMP = 'my_time_stamp'
-WEIGHT_PATTERN = re.compile(r'weights-epoch-.+\.hdf5')
-
-
-def _generate_file(filepath, lines):
-  """Writes every item of `lines` to `filepath`, one item per line."""
-  with open(filepath, 'w') as out:
-    out.writelines('{}\n'.format(entry) for entry in lines)
-
-
-class TransformerTaskTest(tf.test.TestCase):
-  """Smoke tests that run TransformerTask train/predict/eval on tiny settings."""
-
-  # Snapshot of the flag values taken the first time setUp runs; later tests
-  # restore from it so flag changes made by one test do not leak into the next.
-  local_flags = None
-
-  def setUp(self):  # pylint: disable=g-missing-super-call
-    """Resets flags to a tiny synthetic-data configuration for each test."""
-    temp_dir = self.get_temp_dir()
-    if TransformerTaskTest.local_flags is None:
-      # First test in the process: define the flags once and remember their
-      # initial values.
-      misc.define_transformer_flags()
-      # Loads flags, array cannot be blank.
-      flags.FLAGS(['foo'])
-      TransformerTaskTest.local_flags = flagsaver.save_flag_values()
-    else:
-      flagsaver.restore_flag_values(TransformerTaskTest.local_flags)
-    FLAGS.model_dir = os.path.join(temp_dir, FIXED_TIMESTAMP)
-    FLAGS.param_set = 'tiny'
-    FLAGS.use_synthetic_data = True
-    FLAGS.steps_between_evals = 1
-    FLAGS.train_steps = 1
-    FLAGS.validation_steps = 1
-    FLAGS.batch_size = 4
-    FLAGS.max_length = 1
-    FLAGS.num_gpus = 1
-    FLAGS.distribution_strategy = 'off'
-    FLAGS.dtype = 'fp32'
-    self.model_dir = FLAGS.model_dir
-    self.temp_dir = temp_dir
-    self.vocab_file = os.path.join(temp_dir, 'vocab')
-    self.vocab_size = misc.get_model_params(FLAGS.param_set, 0)['vocab_size']
-    self.bleu_source = os.path.join(temp_dir, 'bleu_source')
-    self.bleu_ref = os.path.join(temp_dir, 'bleu_ref')
-    # Remember the global mixed-precision policy so tearDown can put it back.
-    self.orig_policy = (
-        tf.compat.v2.keras.mixed_precision.global_policy())
-
-  def tearDown(self):  # pylint: disable=g-missing-super-call
-    """Restores the mixed-precision policy captured in setUp."""
-    tf.compat.v2.keras.mixed_precision.set_global_policy(self.orig_policy)
-
-  def _assert_exists(self, filepath):
-    """Asserts that `filepath` exists on the local filesystem."""
-    self.assertTrue(os.path.exists(filepath))
-
-  # The training tests below only check that train() completes; they make no
-  # assertions on the returned stats.
-  def test_train_no_dist_strat(self):
-    if context.num_gpus() >= 2:
-      self.skipTest('No need to test 2+ GPUs without a distribution strategy.')
-    t = transformer_main.TransformerTask(FLAGS)
-    t.train()
-
-  def test_train_save_full_model(self):
-    if context.num_gpus() >= 2:
-      self.skipTest('No need to test 2+ GPUs without a distribution strategy.')
-    FLAGS.save_weights_only = False
-    t = transformer_main.TransformerTask(FLAGS)
-    t.train()
-
-  def test_train_static_batch(self):
-    if context.num_gpus() >= 2:
-      self.skipTest('No need to test 2+ GPUs without a distribution strategy.')
-    FLAGS.distribution_strategy = 'one_device'
-    # Use one GPU on CUDA builds and none otherwise.
-    if tf.test.is_built_with_cuda():
-      FLAGS.num_gpus = 1
-    else:
-      FLAGS.num_gpus = 0
-    FLAGS.static_batch = True
-    t = transformer_main.TransformerTask(FLAGS)
-    t.train()
-
-  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
-  def test_train_1_gpu_with_dist_strat(self):
-    FLAGS.distribution_strategy = 'one_device'
-    t = transformer_main.TransformerTask(FLAGS)
-    t.train()
-
-  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
-  def test_train_fp16(self):
-    FLAGS.distribution_strategy = 'one_device'
-    FLAGS.dtype = 'fp16'
-    t = transformer_main.TransformerTask(FLAGS)
-    t.train()
-
-  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
-  def test_train_2_gpu(self):
-    if context.num_gpus() < 2:
-      self.skipTest(
-          '{} GPUs are not available for this test. {} GPUs are available'
-          .format(2, context.num_gpus()))
-    FLAGS.distribution_strategy = 'mirrored'
-    FLAGS.num_gpus = 2
-    FLAGS.param_set = 'base'
-    t = transformer_main.TransformerTask(FLAGS)
-    t.train()
-
-  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
-  def test_train_2_gpu_fp16(self):
-    if context.num_gpus() < 2:
-      self.skipTest(
-          '{} GPUs are not available for this test. {} GPUs are available'
-          .format(2, context.num_gpus()))
-    FLAGS.distribution_strategy = 'mirrored'
-    FLAGS.num_gpus = 2
-    FLAGS.param_set = 'base'
-    FLAGS.dtype = 'fp16'
-    t = transformer_main.TransformerTask(FLAGS)
-    t.train()
-
-  def _prepare_files_and_flags(self, *extra_flags):
-    """Writes fake vocab/BLEU files and points the flags at them.
-
-    Args:
-      *extra_flags: Additional command-line style flag strings (for example
-        '--dtype=fp16') appended to the flag list that is parsed.
-    """
-    # Make log dir.
-    if not os.path.exists(self.temp_dir):
-      os.makedirs(self.temp_dir)
-
-    # Fake vocab, bleu_source and bleu_ref.
-    tokens = [
-        "'<pad>'", "'<EOS>'", "'_'", "'a'", "'b'", "'c'", "'d'", "'a_'", "'b_'",
-        "'c_'", "'d_'"
-    ]
-    # Fill the vocabulary with numbered placeholder tokens up to vocab_size.
-    tokens += ["'{}'".format(i) for i in range(self.vocab_size - len(tokens))]
-    _generate_file(self.vocab_file, tokens)
-    _generate_file(self.bleu_source, ['a b', 'c d'])
-    _generate_file(self.bleu_ref, ['a b', 'd c'])
-
-    # Update flags.
-    update_flags = [
-        'ignored_program_name',
-        '--vocab_file={}'.format(self.vocab_file),
-        '--bleu_source={}'.format(self.bleu_source),
-        '--bleu_ref={}'.format(self.bleu_ref),
-    ]
-    if extra_flags:
-      update_flags.extend(extra_flags)
-    FLAGS(update_flags)
-
-  def test_predict(self):
-    if context.num_gpus() >= 2:
-      self.skipTest('No need to test 2+ GPUs without a distribution strategy.')
-    self._prepare_files_and_flags()
-    t = transformer_main.TransformerTask(FLAGS)
-    t.predict()
-
-  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
-  def test_predict_fp16(self):
-    if context.num_gpus() >= 2:
-      self.skipTest('No need to test 2+ GPUs without a distribution strategy.')
-    self._prepare_files_and_flags('--dtype=fp16')
-    t = transformer_main.TransformerTask(FLAGS)
-    t.predict()
-
-  def test_eval(self):
-    if context.num_gpus() >= 2:
-      self.skipTest('No need to test 2+ GPUs without a distribution strategy.')
-    # Skipped when the test binary name contains 'test_xla' (see TODO below).
-    if 'test_xla' in sys.argv[0]:
-      self.skipTest('TODO(xla): Make this test faster under XLA.')
-    self._prepare_files_and_flags()
-    t = transformer_main.TransformerTask(FLAGS)
-    t.eval()
-
-
-# Run every test case in this module through tf.test.main().
-if __name__ == '__main__':
-  tf.test.main()
--- a/official/nlp/transformer/transformer_test.py
+++ b/official/nlp/transformer/transformer_test.py
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Test Transformer model."""
-
-import tensorflow as tf
-
-from official.nlp.transformer import model_params
-from official.nlp.transformer import transformer
-
-
-class TransformerV2Test(tf.test.TestCase):
-  """Checks model construction and SavedModel export on a tiny configuration."""
-
-  def setUp(self):
-    """Builds a small parameter set used by every test in this class."""
-    super().setUp()
-    # Work on a private copy: mutating model_params.TINY_PARAMS in place would
-    # leak these overrides into any other code in the same process that reads
-    # the module-level parameter set.
-    self.params = params = model_params.TINY_PARAMS.copy()
-    params["batch_size"] = params["default_batch_size"] = 16
-    params["use_synthetic_data"] = True
-    params["hidden_size"] = 12
-    params["num_hidden_layers"] = 2
-    params["filter_size"] = 14
-    params["num_heads"] = 2
-    params["vocab_size"] = 41
-    params["extra_decode_length"] = 2
-    params["beam_size"] = 3
-    params["dtype"] = tf.float32
-
-  def test_create_model_train(self):
-    """The training model takes two int64 inputs and emits float32 logits."""
-    model = transformer.create_model(self.params, True)
-    inputs, outputs = model.inputs, model.outputs
-    self.assertEqual(len(inputs), 2)
-    self.assertEqual(len(outputs), 1)
-    self.assertEqual(inputs[0].shape.as_list(), [None, None])
-    self.assertEqual(inputs[0].dtype, tf.int64)
-    self.assertEqual(inputs[1].shape.as_list(), [None, None])
-    self.assertEqual(inputs[1].dtype, tf.int64)
-    # The last output dimension equals the vocab_size configured in setUp.
-    self.assertEqual(outputs[0].shape.as_list(), [None, None, 41])
-    self.assertEqual(outputs[0].dtype, tf.float32)
-
-  def test_create_model_not_train(self):
-    """The inference model takes one int64 input and emits two outputs."""
-    model = transformer.create_model(self.params, False)
-    inputs, outputs = model.inputs, model.outputs
-    self.assertEqual(len(inputs), 1)
-    self.assertEqual(len(outputs), 2)
-    self.assertEqual(inputs[0].shape.as_list(), [None, None])
-    self.assertEqual(inputs[0].dtype, tf.int64)
-    self.assertEqual(outputs[0].shape.as_list(), [None, None])
-    self.assertEqual(outputs[0].dtype, tf.int32)
-    self.assertEqual(outputs[1].shape.as_list(), [None])
-    self.assertEqual(outputs[1].dtype, tf.float32)
-
-  def test_export(self):
-    """A saved and reloaded model keeps the expected output shape."""
-    model = transformer.Transformer(self.params, name="transformer_v2")
-    export_dir = self.get_temp_dir()
-    batch_size = 5
-    max_length = 6
-
-    class SaveModule(tf.Module):
-      """Thin tf.Module wrapper exposing the model through a serve function."""
-
-      def __init__(self, model):
-        super().__init__()
-        self.model = model
-
-      @tf.function
-      def serve(self, x):
-        return self.model.call([x], training=False)
-
-    save_module = SaveModule(model)
-    tensor_shape = (None, None)
-    sample_input = tf.zeros((batch_size, max_length), dtype=tf.int64)
-    # Call once with a concrete input before exporting.
-    _ = save_module.serve(sample_input)
-    signatures = dict(
-        serving_default=save_module.serve.get_concrete_function(
-            tf.TensorSpec(shape=tensor_shape, dtype=tf.int64, name="x")))
-    tf.saved_model.save(save_module, export_dir, signatures=signatures)
-    imported = tf.saved_model.load(export_dir)
-    serving_fn = imported.signatures["serving_default"]
-    all_outputs = serving_fn(sample_input)
-    output = all_outputs["outputs"]
-    output_shapes = output.shape.as_list()
-    self.assertEqual(output_shapes[0], batch_size)
-    self.assertEqual(output_shapes[1],
-                     max_length + model.params["extra_decode_length"])
-
-
-# Run every test case in this module through tf.test.main().
-if __name__ == "__main__":
-  tf.test.main()
--- a/official/nlp/transformer/translate.py
+++ b/official/nlp/transformer/translate.py
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Translate text or files using trained transformer model."""
-
-# Import libraries
-from absl import logging
-import numpy as np
-import tensorflow as tf
-
-from official.nlp.transformer.utils import tokenizer
-
-_EXTRA_DECODE_LENGTH = 100
-_BEAM_SIZE = 4
-_ALPHA = 0.6
-
-
-def _get_sorted_inputs(filename):
-  """Reads the lines of a file and orders them from longest to shortest.
-
-  Length is measured in whitespace-separated tokens. A trailing empty line
-  (as left by a final newline) is dropped.
-
-  Args:
-    filename: String name of file to read inputs from.
-
-  Returns:
-    A pair `(sorted_inputs, sorted_keys)`: the stripped lines in decreasing
-    length order, and a list where `sorted_keys[original_index]` is the
-    position of that line inside `sorted_inputs`.
-  """
-  with tf.io.gfile.GFile(filename) as f:
-    inputs = [record.strip() for record in f.read().split("\n")]
-  if not inputs[-1]:
-    inputs.pop()
-
-  # Stable descending sort of the original indices by token count, so lines of
-  # equal length keep their file order.
-  order = sorted(
-      range(len(inputs)),
-      key=lambda index: len(inputs[index].split()),
-      reverse=True)
-
-  sorted_inputs = [inputs[index] for index in order]
-  sorted_keys = [0] * len(order)
-  for rank, index in enumerate(order):
-    sorted_keys[index] = rank
-  return sorted_inputs, sorted_keys
-
-
-def _encode_and_add_eos(line, subtokenizer):
-  """Returns the subtoken ids of `line` followed by the EOS id."""
-  token_ids = subtokenizer.encode(line)
-  return token_ids + [tokenizer.EOS_ID]
-
-
-def _trim_and_decode(ids, subtokenizer):
-  """Decodes `ids` up to (not including) the first EOS id.
-
-  When no EOS id is present the whole sequence is decoded.
-  """
-  try:
-    eos_position = list(ids).index(tokenizer.EOS_ID)
-    trimmed = ids[:eos_position]
-    return subtokenizer.decode(trimmed)
-  except ValueError:  # No EOS found in sequence
-    return subtokenizer.decode(ids)
-
-
-def translate_file(model,
-                   params,
-                   subtokenizer,
-                   input_file,
-                   output_file=None,
-                   print_all_translations=True,
-                   distribution_strategy=None):
-  """Translate lines in file, and save to output file if specified.
-
-  Lines are decoded in batches of params["decode_batch_size"], longest lines
-  first, and written back in the order they appeared in `input_file`.
-
-  Args:
-    model: A Keras model, used to generate the translations.
-    params: A dictionary, containing the translation related parameters. Read
-      for "decode_batch_size" and "decode_max_length".
-    subtokenizer: A subtokenizer object, used for encoding and decoding source
-      and translated lines.
-    input_file: A file containing lines to translate.
-    output_file: A file that stores the generated translations.
-    print_all_translations: A bool. If true, every input/translation pair is
-      logged at INFO level.
-    distribution_strategy: A distribution strategy, used to perform inference
-      directly with tf.function instead of Keras model.predict().
-
-  Raises:
-    ValueError: if output file is invalid.
-  """
-  batch_size = params["decode_batch_size"]
-
-  # Read and sort inputs by length. Keep dictionary (original index-->new index
-  # in sorted list) to write translations in the original order.
-  sorted_inputs, sorted_keys = _get_sorted_inputs(input_file)
-  total_samples = len(sorted_inputs)
-  # Ceiling division: number of batches needed to cover every sample.
-  num_decode_batches = (total_samples - 1) // batch_size + 1
-
-  def input_generator():
-    """Yield encoded strings from sorted_inputs."""
-    for i in range(num_decode_batches):
-      # The last batch may hold fewer than batch_size lines.
-      lines = [
-          sorted_inputs[j + i * batch_size]
-          for j in range(batch_size)
-          if j + i * batch_size < total_samples
-      ]
-      lines = [_encode_and_add_eos(l, subtokenizer) for l in lines]
-      if distribution_strategy:
-        # Fill a short batch with EOS-only rows so it can be reshaped per
-        # replica by the caller below.
-        for j in range(batch_size - len(lines)):
-          lines.append([tokenizer.EOS_ID])
-      batch = tf.keras.preprocessing.sequence.pad_sequences(
-          lines,
-          maxlen=params["decode_max_length"],
-          dtype="int32",
-          padding="post")
-      logging.info("Decoding batch %d out of %d.", i, num_decode_batches)
-      yield batch
-
-  @tf.function
-  def predict_step(inputs):
-    """Decoding step function for TPU runs."""
-
-    def _step_fn(inputs):
-      """Per replica step function."""
-      # inputs is the (replica_id, batch_slice) pair built by
-      # text_as_per_replica below.
-      tag = inputs[0]
-      val_inputs = inputs[1]
-      val_outputs, _ = model([val_inputs], training=False)
-      return tag, val_outputs
-
-    return distribution_strategy.run(_step_fn, args=(inputs,))
-
-  translations = []
-  if distribution_strategy:
-    num_replicas = distribution_strategy.num_replicas_in_sync
-    # NOTE(review): the reshape in the loop below presumably requires
-    # decode_batch_size to be divisible by num_replicas -- confirm.
-    local_batch_size = params["decode_batch_size"] // num_replicas
-  for i, text in enumerate(input_generator()):
-    if distribution_strategy:
-      text = np.reshape(text, [num_replicas, local_batch_size, -1])
-      # Add tag to the input of each replica with the reordering logic after
-      # outputs, to ensure the output order matches the input order.
-      text = tf.constant(text)
-
-      @tf.function
-      def text_as_per_replica():
-        replica_context = tf.distribute.get_replica_context()
-        replica_id = replica_context.replica_id_in_sync_group
-        return replica_id, text[replica_id]  # pylint: disable=cell-var-from-loop
-
-      text = distribution_strategy.run(text_as_per_replica)
-      outputs = distribution_strategy.experimental_local_results(
-          predict_step(text))
-      # Drop the replica tag and keep only the decoded ids.
-      val_outputs = [output for _, output in outputs]
-
-      val_outputs = np.reshape(val_outputs, [params["decode_batch_size"], -1])
-    else:
-      val_outputs, _ = model.predict(text)
-
-    length = len(val_outputs)
-    for j in range(length):
-      # Skip rows past the real sample count (filler rows from
-      # input_generator).
-      if j + i * batch_size < total_samples:
-        translation = _trim_and_decode(val_outputs[j], subtokenizer)
-        translations.append(translation)
-        if print_all_translations:
-          logging.info("Translating:\n\tInput: %s\n\tOutput: %s",
-                       sorted_inputs[j + i * batch_size], translation)
-
-  # Write translations in the order they appeared in the original file.
-  if output_file is not None:
-    if tf.io.gfile.isdir(output_file):
-      raise ValueError("File output is a directory, will not save outputs to "
-                       "file.")
-    logging.info("Writing to file %s", output_file)
-    with tf.io.gfile.GFile(output_file, "w") as f:
-      # sorted_keys[k] is the index in `translations` of original line k.
-      for i in sorted_keys:
-        f.write("%s\n" % translations[i])
-
-
-def translate_from_text(model, subtokenizer, txt):
-  """Encodes `txt`, runs `model.predict` on it and logs the translation.
-
-  Args:
-    model: Model whose `predict` method is called with the encoded ids.
-    subtokenizer: Subtokenizer used for encoding and decoding.
-    txt: Source text to translate.
-  """
-  token_ids = _encode_and_add_eos(txt, subtokenizer)
-  prediction = model.predict(token_ids)
-  # NOTE(review): the prediction is indexed by "outputs" here, whereas
-  # translate_file unpacks model.predict() as a pair -- confirm which result
-  # type the model actually returns.
-  decoded_ids = prediction["outputs"]
-  logging.info("Original: \"%s\"", txt)
-  translate_from_input(decoded_ids, subtokenizer)
-
-
-def translate_from_input(outputs, subtokenizer):
-  """Decodes one sequence of output ids and logs the resulting text."""
-  logging.info(
-      "Translation: \"%s\"", _trim_and_decode(outputs, subtokenizer))
--- a/official/nlp/transformer/utils/__init__.py
+++ b/official/nlp/transformer/utils/__init__.py
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
--- a/official/nlp/transformer/utils/metrics.py
+++ b/official/nlp/transformer/utils/metrics.py
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Functions for calculating loss, accuracy, and other model metrics.
-
-Metrics:
- - Padded loss, accuracy, and negative log perplexity. Source:
-     https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/metrics.py
- - BLEU approximation. Source:
-     https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/bleu_hook.py
- - ROUGE score. Source:
-     https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/rouge.py
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import collections
-import math
-
-import numpy as np
-import six
-from six.moves import xrange  # pylint: disable=redefined-builtin
-import tensorflow.compat.v1 as tf
-
-
-def _pad_tensors_to_same_length(x, y):
-  """Pads x and y along axis 1 so both reach the longer of the two lengths.
-
-  x is padded with three-axis paddings and y with two-axis paddings; padding
-  is only ever added at the end of axis 1.
-  """
-  with tf.name_scope("pad_to_same_length"):
-    x_length = tf.shape(x)[1]
-    y_length = tf.shape(y)[1]
-    target_length = tf.maximum(x_length, y_length)
-
-    x_paddings = [[0, 0], [0, target_length - x_length], [0, 0]]
-    y_paddings = [[0, 0], [0, target_length - y_length]]
-    return tf.pad(x, x_paddings), tf.pad(y, y_paddings)
-
-
-def padded_cross_entropy_loss(logits, labels, smoothing, vocab_size):
-  """Calculate cross entropy loss while ignoring padding.
-
-  Labels are turned into smoothed one-hot targets; positions whose label id is
-  0 are treated as padding and get weight 0.
-
-  Args:
-    logits: Tensor of size [batch_size, length_logits, vocab_size]
-    labels: Tensor of size [batch_size, length_labels]
-    smoothing: Label smoothing constant, used to determine the on and off values
-    vocab_size: int size of the vocabulary
-  Returns:
-    Returns the cross entropy loss and weight tensors: float32 tensors with
-      shape [batch_size, max(length_logits, length_labels)]
-  """
-  with tf.name_scope("loss", values=[logits, labels]):
-    logits, labels = _pad_tensors_to_same_length(logits, labels)
-
-    # Calculate smoothing cross entropy
-    with tf.name_scope("smoothing_cross_entropy", values=[logits, labels]):
-      # The true class gets `confidence`; the remaining `smoothing` mass is
-      # spread evenly over the other vocab_size - 1 classes.
-      confidence = 1.0 - smoothing
-      low_confidence = (1.0 - confidence) / tf.cast(vocab_size - 1, tf.float32)
-      soft_targets = tf.one_hot(
-          tf.cast(labels, tf.int32),
-          depth=vocab_size,
-          on_value=confidence,
-          off_value=low_confidence)
-      xentropy = tf.nn.softmax_cross_entropy_with_logits_v2(
-          logits=logits, labels=soft_targets)
-
-      # Calculate the best (lowest) possible value of cross entropy, and
-      # subtract from the cross entropy loss.
-      # The 1e-20 keeps the log finite when low_confidence is 0 (smoothing of
-      # 0).
-      normalizing_constant = -(
-          confidence * tf.log(confidence) + tf.cast(vocab_size - 1, tf.float32)
-          * low_confidence * tf.log(low_confidence + 1e-20))
-      xentropy -= normalizing_constant
-
-    # Label id 0 marks padding: those positions contribute nothing to the loss.
-    weights = tf.cast(tf.not_equal(labels, 0), tf.float32)
-    return xentropy * weights, weights
-
-
-def _convert_to_eval_metric(metric_fn):
-  """Turns a per-batch (scores, weights) function into an aggregated metric.
-
-  Args:
-    metric_fn: function returning a `(scores, weights)` pair for the arguments
-      it is called with (the current batch's logits and labels).
-
-  Returns:
-    A function with the same arguments as `metric_fn` that feeds the returned
-    scores and weights to `tf.metrics.mean`, which aggregates them over all
-    evaluated batches.
-  """
-  def problem_metric_fn(*args):
-    """Returns the weighted mean metric for metric_fn's scores and weights."""
-    scores, weights = metric_fn(*args)
-    # tf.metrics.mean takes care of the aggregation across batches.
-    aggregated = tf.metrics.mean(scores, weights)
-    return aggregated
-  return problem_metric_fn
-
-
-def get_eval_metrics(logits, labels, params):
-  """Builds the dictionary of evaluation metrics, keyed "metrics/<name>".
-
-  Args:
-    logits: Model logits handed to every metric function.
-    labels: Target ids handed to every metric function.
-    params: Mapping read for "vocab_size" and "use_tpu".
-
-  Returns:
-    Dict mapping "metrics/<name>" to the aggregated metric for that name.
-  """
-  def aggregate(metric_fn, *fn_args):
-    return _convert_to_eval_metric(metric_fn)(*fn_args)
-
-  metrics = {}
-  metrics["accuracy"] = aggregate(padded_accuracy, logits, labels)
-  metrics["accuracy_top5"] = aggregate(padded_accuracy_top5, logits, labels)
-  metrics["accuracy_per_sequence"] = aggregate(
-      padded_sequence_accuracy, logits, labels)
-  metrics["neg_log_perplexity"] = aggregate(
-      padded_neg_log_perplexity, logits, labels, params["vocab_size"])
-
-  if not params["use_tpu"]:
-    # TPU does not support tf.py_func
-    metrics["approx_bleu_score"] = aggregate(bleu_score, logits, labels)
-    metrics["rouge_2_fscore"] = aggregate(rouge_2_fscore, logits, labels)
-    metrics["rouge_L_fscore"] = aggregate(rouge_l_fscore, logits, labels)
-
-  # The "metrics/" prefix groups the graphs under one "metrics" category in
-  # TensorBoard.
-  return {"metrics/%s" % name: op for name, op in six.iteritems(metrics)}
-
-
-def padded_accuracy(logits, labels):
-  """Per-position match between argmax predictions and labels.
-
-  Returns:
-    A `(scores, weights)` pair: scores are 1.0 where the argmax of the logits
-    equals the label, and weights are 0.0 wherever the label id is 0.
-  """
-  with tf.variable_scope("padded_accuracy", values=[logits, labels]):
-    logits, labels = _pad_tensors_to_same_length(logits, labels)
-    weights = tf.cast(tf.not_equal(labels, 0), tf.float32)
-    predictions = tf.cast(tf.argmax(logits, axis=-1), tf.int32)
-    targets = tf.cast(labels, tf.int32)
-    hits = tf.equal(predictions, targets)
-    return tf.cast(hits, tf.float32), weights
-
-
-def padded_accuracy_topk(logits, labels, k):
-  """Per-position check of whether the label is among the top-k predictions.
-
-  Returns:
-    A `(scores, weights)` pair: scores count the matches between the label and
-    the top-k ids at each position, and weights are 0.0 wherever the label id
-    is 0.
-  """
-  with tf.variable_scope("padded_accuracy_topk", values=[logits, labels]):
-    logits, labels = _pad_tensors_to_same_length(logits, labels)
-    weights = tf.cast(tf.not_equal(labels, 0), tf.float32)
-    # k is capped at the size of the last logits dimension.
-    effective_k = tf.minimum(k, tf.shape(logits)[-1])
-    _, top_ids = tf.nn.top_k(logits, k=effective_k)
-    top_ids = tf.cast(top_ids, tf.int32)
-    targets = tf.expand_dims(tf.cast(labels, tf.int32), axis=-1)
-    targets = targets + tf.zeros_like(top_ids)  # Pad to same shape.
-    matches = tf.cast(tf.equal(top_ids, targets), tf.float32)
-    return tf.reduce_sum(matches, axis=-1), weights
-
-
-def padded_accuracy_top5(logits, labels):
-  """Top-k accuracy scores and weights with k fixed at 5."""
-  return padded_accuracy_topk(logits, labels, k=5)
-
-
-def padded_sequence_accuracy(logits, labels):
-  """Whether each sequence is predicted correctly at every non-0 position.
-
-  Returns:
-    A `(scores, weights)` pair: scores are 1.0 for a sequence with no
-    mismatches at non-padding positions and 0.0 otherwise; the weight is the
-    constant 1.0.
-  """
-  with tf.variable_scope("padded_sequence_accuracy", values=[logits, labels]):
-    logits, labels = _pad_tensors_to_same_length(logits, labels)
-    weights = tf.cast(tf.not_equal(labels, 0), tf.float32)
-    predictions = tf.cast(tf.argmax(logits, axis=-1), tf.int32)
-    targets = tf.cast(labels, tf.int32)
-    mismatches = tf.cast(tf.not_equal(predictions, targets), tf.float32)
-    mismatches = mismatches * weights
-    # Sum the mismatches over every axis except the leading one.
-    reduce_axes = list(range(1, len(predictions.get_shape())))
-    errors_per_sequence = tf.reduce_sum(mismatches, axis=reduce_axes)
-    return 1.0 - tf.minimum(1.0, errors_per_sequence), tf.constant(1.0)
-
-
-def padded_neg_log_perplexity(logits, labels, vocab_size):
-  """Negated unsmoothed cross entropy together with its padding weights."""
-  xentropy, weights = padded_cross_entropy_loss(logits, labels, 0, vocab_size)
-  return -xentropy, weights
-
-
-def bleu_score(logits, labels):
-  """Approximate BLEU score computation between labels and predictions.
-
-  The score is approximate because word pieces are not glued back together
-  and the ids are neither decoded nor re-tokenized. `compute_bleu` is called
-  with its defaults (n-gram order 4, brevity penalty on). Predictions are the
-  per-position argmax of the logits; no beam search is involved.
-
-  Args:
-    logits: Tensor of size [batch_size, length_logits, vocab_size]
-    labels: Tensor of size [batch-size, length_labels]
-
-  Returns:
-    A (bleu, weight) pair: the approximate BLEU score as a float32 tensor and
-    the constant weight 1.0.
-  """
-  predicted_ids = tf.cast(tf.argmax(logits, axis=-1), tf.int32)
-  # TODO: Look into removing use of py_func  # pylint: disable=g-bad-todo
-  score = tf.py_func(compute_bleu, (labels, predicted_ids), tf.float32)
-  weight = tf.constant(1.0)
-  return score, weight
-
-
-def _get_ngrams_with_counter(segment, max_order):
-  """Counts every n-gram of length 1..max_order found in a segment.
-
-  Args:
-    segment: sequence of tokens from which n-grams will be extracted.
-    max_order: maximum length in tokens of the n-grams that are counted.
-
-  Returns:
-    A collections.Counter keyed by n-gram (as a tuple of tokens) whose values
-    are the number of times that n-gram occurs in the segment.
-  """
-  counter = collections.Counter()
-  segment_len = len(segment)
-  for n in xrange(1, max_order + 1):
-    # An n-gram can start at positions 0 .. segment_len - n inclusive.
-    counter.update(
-        tuple(segment[start:start + n])
-        for start in xrange(segment_len - n + 1))
-  return counter
-
-
-def compute_bleu(reference_corpus, translation_corpus, max_order=4,
-                 use_bp=True):
-  """Computes BLEU score of translated segments against one or more references.
-
-  N-gram matches are clipped to the reference counts and accumulated over the
-  whole corpus. Orders that have candidate n-grams but no match receive a
-  smoothed precision of 1 / (2^j * possible_matches), where j counts the
-  zero-match orders seen so far.
-
-  Args:
-    reference_corpus: list of references for each translation. Each
-        reference should be tokenized into a list of tokens.
-    translation_corpus: list of translations to score. Each translation
-        should be tokenized into a list of tokens.
-    max_order: Maximum n-gram order to use when computing BLEU score.
-    use_bp: boolean, whether to apply brevity penalty.
-
-  Returns:
-    BLEU score as a np.float32. Empty corpora score 0.0 instead of raising
-    ZeroDivisionError.
-  """
-  reference_length = 0
-  translation_length = 0
-  bp = 1.0
-  geo_mean = 0
-
-  matches_by_order = [0] * max_order
-  possible_matches_by_order = [0] * max_order
-
-  for (references, translations) in zip(reference_corpus, translation_corpus):
-    reference_length += len(references)
-    translation_length += len(translations)
-    ref_ngram_counts = _get_ngrams_with_counter(references, max_order)
-    translation_ngram_counts = _get_ngrams_with_counter(translations, max_order)
-
-    # Clip each match count to what the translation actually contains.
-    for ngram, count in ref_ngram_counts.items():
-      matches_by_order[len(ngram) - 1] += min(
-          count, translation_ngram_counts[ngram])
-    for ngram, count in translation_ngram_counts.items():
-      possible_matches_by_order[len(ngram) - 1] += count
-
-  precisions = [0] * max_order
-  smooth = 1.0
-
-  for i in xrange(0, max_order):
-    if possible_matches_by_order[i] > 0:
-      if matches_by_order[i] > 0:
-        precisions[i] = float(matches_by_order[i]) / possible_matches_by_order[
-            i]
-      else:
-        # No match at this order: back off to a progressively smaller value.
-        smooth *= 2
-        precisions[i] = 1.0 / (smooth * possible_matches_by_order[i])
-    else:
-      precisions[i] = 0.0
-
-  if max(precisions) > 0:
-    p_log_sum = sum(math.log(p) for p in precisions if p)
-    geo_mean = math.exp(p_log_sum / max_order)
-
-  if use_bp:
-    if translation_length == 0:
-      # Nothing was translated; 1 / ratio would divide by zero.
-      bp = 0.0
-    elif reference_length > 0:
-      # float() keeps this a true division even without
-      # `from __future__ import division`.
-      ratio = float(translation_length) / reference_length
-      bp = math.exp(1 - 1. / ratio) if ratio < 1.0 else 1.0
-    # With an empty reference and a non-empty translation the translation is
-    # not shorter than the reference, so bp stays at 1.0.
-  bleu = geo_mean * bp
-  return np.float32(bleu)
-
-
-def rouge_2_fscore(logits, labels):
-  """ROUGE-2 F1 score computation between labels and predictions.
-
-  The score is approximate because word pieces are not glued back together
-  and the ids are neither decoded nor re-tokenized.
-
-  Args:
-    logits: tensor, model predictions
-    labels: tensor, gold output.
-
-  Returns:
-    A (rouge2_fscore, weight) pair: the approximate ROUGE-2 F1 score as a
-    float32 tensor and the constant weight 1.0.
-  """
-  predicted_ids = tf.cast(tf.argmax(logits, axis=-1), tf.int32)
-  # TODO: Look into removing use of py_func  # pylint: disable=g-bad-todo
-  score = tf.py_func(rouge_n, (predicted_ids, labels), tf.float32)
-  weight = tf.constant(1.0)
-  return score, weight
-
-
-def _get_ngrams(n, text):
-  """Collects the distinct n-grams of a token sequence.
-
-  Args:
-    n: which n-grams to calculate
-    text: An array of tokens
-
-  Returns:
-    A set of n-grams, each represented as a tuple of tokens.
-  """
-  last_start = len(text) - n
-  return {tuple(text[start:start + n]) for start in range(last_start + 1)}
-
-
-def rouge_n(eval_sentences, ref_sentences, n=2):
-  """Computes ROUGE-N f1 score of two text collections of sentences.
-
-  Source: https://www.microsoft.com/en-us/research/publication/
-  rouge-a-package-for-automatic-evaluation-of-summaries/
-
-  Args:
-    eval_sentences: Predicted sentences.
-    ref_sentences: Sentences from the reference set
-    n: Size of ngram.  Defaults to 2.
-
-  Returns:
-    Mean over sentence pairs of the ROUGE-N f1 score, as a np.float32.
-  """
-  per_pair_f1 = []
-  for candidate, reference in zip(eval_sentences, ref_sentences):
-    candidate_ngrams = _get_ngrams(n, candidate)
-    reference_ngrams = _get_ngrams(n, reference)
-
-    # Distinct n-grams shared by the prediction and the reference.
-    num_shared = len(candidate_ngrams & reference_ngrams)
-
-    # Handle edge case. This isn't mathematically correct, but it's good enough
-    if candidate_ngrams:
-      precision = float(num_shared) / len(candidate_ngrams)
-    else:
-      precision = 0.0
-    if reference_ngrams:
-      recall = float(num_shared) / len(reference_ngrams)
-    else:
-      recall = 0.0
-    # The 1e-8 term keeps the division defined when both values are 0.
-    per_pair_f1.append(
-        2.0 * ((precision * recall) / (precision + recall + 1e-8)))
-
-  return np.mean(per_pair_f1, dtype=np.float32)
-
-
-def rouge_l_fscore(predictions, labels):
-  """ROUGE scores computation between labels and predictions.
-
-  The score is approximate because word pieces are not glued back together
-  and the ids are neither decoded nor re-tokenized.
-
-  Args:
-    predictions: tensor, model predictions; the argmax over the last axis is
-      used as the predicted ids.
-    labels: tensor, gold output.
-
-  Returns:
-    A (rouge_l_fscore, weight) pair: the approximate ROUGE-L F1 score as a
-    float32 tensor and the constant weight 1.0.
-  """
-  predicted_ids = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)
-  score = tf.py_func(
-      rouge_l_sentence_level, (predicted_ids, labels), tf.float32)
-  weight = tf.constant(1.0)
-  return score, weight
-
-
-def rouge_l_sentence_level(eval_sentences, ref_sentences):
-  """Computes ROUGE-L (sentence level) of two collections of sentences.
-
-  Source: https://www.microsoft.com/en-us/research/publication/
-  rouge-a-package-for-automatic-evaluation-of-summaries/
-
-  Calculated according to:
-  R_lcs = LCS(X,Y)/m
-  P_lcs = LCS(X,Y)/n
-  F_lcs = ((1 + beta^2)*R_lcs*P_lcs) / (R_lcs + (beta^2) * P_lcs)
-
-  where:
-  X = reference summary
-  Y = Candidate summary
-  m = length of reference summary
-  n = length of candidate summary
-
-  Args:
-    eval_sentences: The sentences that have been picked by the summarizer
-    ref_sentences: The sentences from the reference set
-
-  Returns:
-    Mean F_lcs over all sentence pairs, as a np.float32.
-  """
-  per_pair_scores = []
-  for candidate, reference in zip(eval_sentences, ref_sentences):
-    reference_len = float(len(reference))
-    candidate_len = float(len(candidate))
-    lcs_len = _len_lcs(candidate, reference)
-    per_pair_scores.append(_f_lcs(lcs_len, reference_len, candidate_len))
-  return np.mean(per_pair_scores, dtype=np.float32)
-
-
-def _len_lcs(x, y):
-  """Returns the length of the Longest Common Subsequence between two seqs.
-
-  Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence
-
-  Args:
-    x: sequence of words
-    y: sequence of words
-
-  Returns:
-    integer: Length of LCS between x and y
-  """
-  # The table entry at (len(x), len(y)) covers both full sequences.
-  return _lcs(x, y)[len(x), len(y)]
-
-
-def _lcs(x, y):
-  """Builds the dynamic-programming table of LCS lengths for two seqs.
-
-  Fills one cell per (i, j) pair with 0 <= i <= len(x) and 0 <= j <= len(y),
-  so it runs in O(nm) time where n = len(x) and m = len(y).
-  Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence
-
-  Args:
-    x: collection of words
-    y: collection of words
-
-  Returns:
-    Dict mapping (i, j) to the LCS length of x[:i] and y[:j].
-  """
-  rows, cols = len(x), len(y)
-  table = {}
-  for i in range(rows + 1):
-    for j in range(cols + 1):
-      if not i or not j:
-        # An empty prefix shares nothing with any sequence.
-        cell = 0
-      elif x[i - 1] == y[j - 1]:
-        cell = table[i - 1, j - 1] + 1
-      else:
-        cell = max(table[i - 1, j], table[i, j - 1])
-      table[i, j] = cell
-  return table
-
-
-def _f_lcs(llcs, m, n):
-  """Computes the LCS-based F-measure score.
-
-  Source: http://research.microsoft.com/en-us/um/people/cyl/download/papers/
-  rouge-working-note-v1.3.1.pdf
-
-  Args:
-    llcs: Length of LCS
-    m: number of words in reference summary
-    n: number of words in candidate summary
-
-  Returns:
-    Float. LCS-based F-measure score. 0.0 when either summary is empty, since
-    recall or precision is then undefined.
-  """
-  if not m or not n:
-    # An empty summary would otherwise divide by zero below.
-    return 0.0
-  r_lcs = llcs / float(m)
-  p_lcs = llcs / float(n)
-  # The 1e-12 terms keep both divisions defined when there is no overlap.
-  beta = p_lcs / (r_lcs + 1e-12)
-  num = (1 + (beta ** 2)) * r_lcs * p_lcs
-  denom = r_lcs + ((beta ** 2) * p_lcs)
-  f_lcs = num / (denom + 1e-12)
-  return f_lcs
--- a/official/nlp/transformer/utils/tokenizer.py
+++ b/official/nlp/transformer/utils/tokenizer.py
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Defines Subtokenizer class to encode and decode strings."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import collections
-import re
-import sys
-import unicodedata
-
-from absl import logging
-
-import numpy as np
-import six
-from six.moves import xrange  # pylint: disable=redefined-builtin
-import tensorflow as tf
-
-# pylint: disable=g-complex-comprehension
-# Reserved tokens. PAD_ID and EOS_ID equal the positions of PAD and EOS in
-# RESERVED_TOKENS, which _load_vocab_file() places at the top of the vocabulary.
-PAD = "<pad>"
-PAD_ID = 0
-EOS = "<EOS>"
-EOS_ID = 1
-RESERVED_TOKENS = [PAD, EOS]
-
-# Set of characters that will be used in the function _escape_token() (see func
-# docstring for more details).
-# This set is added to the alphabet list to ensure that all escaped tokens can
-# be encoded.
-_ESCAPE_CHARS = set(u"\\_u;0123456789")
-# Regex for the function _unescape_token(), the inverse of _escape_token().
-# This is used to find "\u", "\\", and "\###;" substrings in the token.
-_UNESCAPE_REGEX = re.compile(r"\\u|\\\\|\\([0-9]+);")
-
-# Replacement character returned by _unescape_token() when a "\###;" escape
-# cannot be converted into a unicode character.
-_UNDEFINED_UNICODE = u"\u3013"
-
-
-def alphanumeric_char_set():
-  """Returns the set of all unicode letter and number characters.
-
-  A character is included when its `unicodedata` general category starts with
-  "L" or "N". Code points 0 .. sys.maxunicode - 1 are examined.
-  """
-  chars = set()
-  for code_point in xrange(sys.maxunicode):
-    char = six.unichr(code_point)
-    if unicodedata.category(char).startswith(("L", "N")):
-      chars.add(char)
-  return chars
-
-
-# Set contains all letter and number characters. It is built once at import
-# time and is the default `master_char_set` throughout this module.
-_ALPHANUMERIC_CHAR_SET = alphanumeric_char_set()
-
-# min_count is the minimum number of times a subtoken must appear in the data
-# before it is added to the vocabulary. The value is found using binary
-# search to obtain the target vocabulary size.
-_MIN_MIN_COUNT = 1  # min value to use when binary searching for min_count
-_MAX_MIN_COUNT = 1000  # max value to use when binary searching for min_count
-
-
-class Subtokenizer(object):
-  """Encodes and decodes strings to/from integer IDs."""
-
-  def __init__(self, vocab_file, reserved_tokens=None, master_char_set=None):
-    """Initializes the Subtokenizer from an existing vocabulary file.
-
-    Args:
-      vocab_file: String name of the vocab file to load subtokens from.
-      reserved_tokens: List of string tokens placed at the beginning of the
-        subtoken list. Defaults to RESERVED_TOKENS.
-      master_char_set: Set of characters used to split text into tokens.
-        Defaults to _ALPHANUMERIC_CHAR_SET.
-    """
-    logging.info("Initializing Subtokenizer from file %s.", vocab_file)
-
-    if master_char_set is None:
-      master_char_set = _ALPHANUMERIC_CHAR_SET
-
-    if reserved_tokens is None:
-      reserved_tokens = RESERVED_TOKENS
-
-    self.subtoken_list = _load_vocab_file(vocab_file, reserved_tokens)
-    self.alphabet = _generate_alphabet_dict(self.subtoken_list)
-    self.subtoken_to_id_dict = _list_to_index_dict(self.subtoken_list)
-
-    self.max_subtoken_length = 0
-    for subtoken in self.subtoken_list:
-      self.max_subtoken_length = max(self.max_subtoken_length, len(subtoken))
-
-    # Create cache to speed up subtokenization
-    self._cache_size = 2**20
-    self._cache = [(None, None)] * self._cache_size
-    self._master_char_set = master_char_set
-
-  @staticmethod
-  def init_from_files(vocab_file,
-                      files,
-                      target_vocab_size,
-                      threshold,
-                      min_count=None,
-                      file_byte_limit=1e6,
-                      reserved_tokens=None,
-                      correct_strip=True,
-                      master_char_set=None):
-    """Create subtoken vocabulary based on files, and save vocab to file.
-
-    If `vocab_file` already exists it is reused and no vocabulary is
-    generated.
-
-    Args:
-      vocab_file: String name of vocab file to store subtoken vocabulary.
-      files: List of file paths that will be used to generate vocabulary.
-      target_vocab_size: target vocabulary size to generate.
-      threshold: int threshold of vocabulary size to accept.
-      min_count: int minimum count to use for generating the vocabulary. The min
-        count is the minimum number of times a subtoken should appear in the
-        files before it is added to the vocabulary. If set to none, this value
-        is found using binary search.
-      file_byte_limit: (Default 1e6) Maximum number of bytes of sample text that
-        will be drawn from the files.
-      reserved_tokens: List of string tokens that are guaranteed to be at the
-        beginning of the subtoken vocabulary list.
-      correct_strip: Whether to convert text to unicode before strip.
-      master_char_set: the char set.
-
-    Returns:
-      Subtokenizer object
-    """
-    if master_char_set is None:
-      master_char_set = _ALPHANUMERIC_CHAR_SET
-    if reserved_tokens is None:
-      reserved_tokens = RESERVED_TOKENS
-
-    if tf.io.gfile.exists(vocab_file):
-      logging.info("Vocab file already exists (%s)", vocab_file)
-    else:
-      logging.info("Begin steps to create subtoken vocabulary...")
-      token_counts = _count_tokens(files, file_byte_limit, correct_strip,
-                                   master_char_set)
-      alphabet = _generate_alphabet_dict(token_counts)
-      subtoken_list = _generate_subtokens_with_target_vocab_size(
-          token_counts, alphabet, target_vocab_size, threshold, min_count,
-          reserved_tokens)
-      logging.info("Generated vocabulary with %d subtokens.",
-                   len(subtoken_list))
-      _save_vocab_file(vocab_file, subtoken_list)
-    # Load with the same reserved tokens that were used to build the
-    # vocabulary; otherwise the defaults would be put on top and the ids of
-    # the saved subtokens would shift.
-    return Subtokenizer(
-        vocab_file,
-        reserved_tokens=reserved_tokens,
-        master_char_set=master_char_set)
-
-  def encode(self, raw_string, add_eos=False):
-    """Encodes a string into a list of int subtoken ids.
-
-    Args:
-      raw_string: string to encode.
-      add_eos: whether to append EOS_ID to the returned ids.
-
-    Returns:
-      List of int subtoken ids.
-    """
-    ret = []
-    tokens = _split_string_to_tokens(
-        native_to_unicode(raw_string), self._master_char_set)
-    for token in tokens:
-      ret.extend(self._token_to_subtoken_ids(token))
-    if add_eos:
-      assert EOS in self.subtoken_list, \
-          "Can't append 'EOS' because it is not in list of known subtokens."
-      ret.append(EOS_ID)
-    return ret
-
-  def _token_to_subtoken_ids(self, token):
-    """Encode a single token into a list of subtoken ids."""
-    # Fixed-size cache indexed by hash; a colliding token simply overwrites
-    # the previous entry.
-    cache_location = hash(token) % self._cache_size
-    cache_key, cache_value = self._cache[cache_location]
-    if cache_key == token:
-      return cache_value
-
-    subtokens = _split_token_to_subtokens(
-        _escape_token(token, self.alphabet), self.subtoken_to_id_dict,
-        self.max_subtoken_length)
-    ret = [self.subtoken_to_id_dict[subtoken] for subtoken in subtokens]
-
-    self._cache[cache_location] = (token, ret)
-    return ret
-
-  def decode(self, subtokens):
-    """Converts list of int subtokens ids into a string.
-
-    Args:
-      subtokens: list of ints or np.ndarray of subtoken ids.
-
-    Returns:
-      The decoded string; "" when `subtokens` is empty.
-    """
-    if isinstance(subtokens, np.ndarray):
-      # Note that list(subtokens) converts subtokens to a python list, but the
-      # items remain as np.int32. This converts both the array and its items.
-      subtokens = subtokens.tolist()
-
-    if not subtokens:
-      return ""
-
-    assert isinstance(subtokens, list) and isinstance(subtokens[0], int), (
-        "Subtokens argument passed into decode() must be a list of integers.")
-
-    return _unicode_to_native(
-        _join_tokens_to_string(
-            self._subtoken_ids_to_tokens(subtokens), self._master_char_set))
-
-  def _subtoken_ids_to_tokens(self, subtokens):
-    """Convert list of int subtoken ids to a list of string tokens."""
-    # Ids at or beyond the end of the vocabulary are skipped.
-    escaped_tokens = "".join([
-        self.subtoken_list[s] for s in subtokens if s < len(self.subtoken_list)
-    ])
-    # _escape_token() terminates every token with "_".
-    escaped_tokens = escaped_tokens.split("_")
-
-    # All tokens in the vocabulary list have been escaped (see _escape_token())
-    # so each token must be unescaped when decoding.
-    ret = []
-    for token in escaped_tokens:
-      if token:
-        ret.append(_unescape_token(token))
-    return ret
-
-
-def _save_vocab_file(vocab_file, subtoken_list):
-  """Writes one subtoken per line, wrapped in single quotes, to vocab_file."""
-  with tf.io.gfile.GFile(vocab_file, mode="w") as writer:
-    for entry in subtoken_list:
-      quoted_line = "'%s'\n" % _unicode_to_native(entry)
-      writer.write(quoted_line)
-
-
-def _load_vocab_file(vocab_file, reserved_tokens=None):
-  """Loads the vocabulary, keeping the reserved tokens at the top.
-
-  Args:
-    vocab_file: path of a file with one single-quoted subtoken per line.
-    reserved_tokens: list of tokens to put first; defaults to RESERVED_TOKENS.
-
-  Returns:
-    `reserved_tokens` followed by every subtoken from the file that is not a
-    reserved token, in file order.
-  """
-  if reserved_tokens is None:
-    reserved_tokens = RESERVED_TOKENS
-
-  loaded = []
-  with tf.io.gfile.GFile(vocab_file, mode="r") as reader:
-    for raw_line in reader:
-      # Strip whitespace, then drop the surrounding single quotes.
-      entry = native_to_unicode(raw_line.strip())[1:-1]
-      if entry not in reserved_tokens:
-        loaded.append(native_to_unicode(entry))
-  return reserved_tokens + loaded
-
-
-def native_to_unicode(s):
-  """Returns s as unicode, decoding from UTF-8 when needed (Python 2 only).
-
-  In Python 3 the name `unicode` does not exist, so the NameError path returns
-  s unchanged.
-  """
-  try:  # Python 2
-    if isinstance(s, unicode):
-      return s
-    return s.decode("utf-8")
-  except NameError:  # Python 3
-    return s
-
-
-def _unicode_to_native(s):
-  """Returns s in native format, encoding unicode to UTF-8 (Python 2 only).
-
-  In Python 3 the name `unicode` does not exist, so the NameError path returns
-  s unchanged.
-  """
-  try:  # Python 2
-    if isinstance(s, unicode):
-      return s.encode("utf-8")
-    return s
-  except NameError:  # Python 3
-    return s
-
-
-def _split_string_to_tokens(text, master_char_set):
-  """Splits text into tokens wherever master-set membership changes.
-
-  Args:
-    text: unicode string to split.
-    master_char_set: set of characters; a run of characters that are all
-      inside or all outside this set forms one token.
-
-  Returns:
-    List of string tokens. A token consisting of a single space is dropped
-    unless it is the first or the last token.
-  """
-  if not text:
-    return []
-  tokens = []
-  start = 0
-  # Classify each character in the input string
-  in_master = [ch in master_char_set for ch in text]
-  for pos in xrange(1, len(text)):
-    if in_master[pos] == in_master[pos - 1]:
-      continue
-    piece = text[start:pos]
-    if start == 0 or piece != u" ":
-      tokens.append(piece)
-    start = pos
-  tokens.append(text[start:])
-  return tokens
-
-
-def _join_tokens_to_string(tokens, master_char_set):
-  """Joins string tokens into one string.
-
-  A single space is inserted between two consecutive tokens that both start
-  with a character from `master_char_set`.
-  """
-  starts_in_master = [t[0] in master_char_set for t in tokens]
-  pieces = []
-  previous_in_master = False
-  for token, in_master in zip(tokens, starts_in_master):
-    if previous_in_master and in_master:
-      pieces.append(u" ")
-    pieces.append(token)
-    previous_in_master = in_master
-  return "".join(pieces)
-
-
-def _escape_token(token, alphabet):
-  r"""Escapes a token so it only uses alphabet characters, and appends "_".
-
-  Three transformations are applied to the token:
-    1. Backslash "\" becomes "\\" and underline "_" becomes "\u".
-    2. Every character outside of the alphabet (and the newline character)
-       becomes "\###;", where ### is the character's Unicode code point.
-    3. "_" is appended to mark the end of the token.
-
-  Args:
-    token: unicode string to be escaped
-    alphabet: list of all known characters
-
-  Returns:
-    escaped string
-  """
-  escaped = token.replace(u"\\", u"\\\\").replace(u"_", u"\\u")
-  pieces = []
-  for ch in escaped:
-    if ch in alphabet and ch != u"\n":
-      pieces.append(ch)
-    else:
-      pieces.append(r"\%d;" % ord(ch))
-  return u"".join(pieces) + "_"
-
-
-def _unescape_token(token):
-  r"""Replaces escaped characters in the token with their unescaped versions.
-
-  Applies the inverse transformations of _escape_token():
-    1. Replace "\u" with "_", and "\\" with "\".
-    2. Replace "\###;" with the unicode character the ### refers to.
-
-  Args:
-    token: escaped string
-
-  Returns:
-    unescaped string
-  """
-
-  def replacement(found):
-    r"""Returns the replacement string for one regex match.
-
-    The match is one of '\u', '\\', or '\###;' (### is any digit number), as
-    defined by the pattern r"\\u|\\\\|\\([0-9]+);".
-
-    found.group(0) is the entire matched string. found.group(1) is the
-    parenthesized digit group and is only set for '\###;'.
-
-    When found.group(1) is not set, the match must be '\u' or '\\', and the
-    corresponding replacement ('_' or '\') is returned. Note that in python, a
-    single backslash is written as '\\', and double backslash as '\\\\'.
-
-    When found.group(1) is set, its integer value is converted to a unicode
-    character.
-
-    Args:
-      found: match object
-
-    Returns:
-      String to replace matched object with.
-    """
-    digits = found.group(1)
-    if digits is None:
-      # The match is '\u' or '\\'.
-      if found.group(0) == u"\\u":
-        return u"_"
-      return u"\\"
-
-    # The match is '\###;': try to return the unicode character.
-    try:
-      return six.unichr(int(digits))
-    except (ValueError, OverflowError):
-      return _UNDEFINED_UNICODE
-
-  # Replace every escaped substring in the token.
-  return _UNESCAPE_REGEX.sub(replacement, token)
-
-
-def _count_tokens(files,
-                  file_byte_limit=1e6,
-                  correct_strip=True,
-                  master_char_set=None):
-  """Return token counts of words in the files.
-
-  Samples roughly file_byte_limit bytes from each file, and counts the words
-  that appear in the samples. The samples are spread across the file by
-  skipping a fixed number of lines between sampled lines.
-
-  Args:
-    files: List of filepaths
-    file_byte_limit: Max number of bytes that will be read from each file.
-    correct_strip: Whether to convert text to unicode before strip. This affects
-      vocabulary generation for PY2. Sets correct_strip to False in PY2 to
-      reproduce previous common public result. Sets correct_strip to True will
-      let PY2 and PY3 get a consistent vocabulary.
-    master_char_set: the char set.
-
-  Returns:
-    Dictionary mapping tokens to the number of times they appear in the sampled
-    lines from the files.
-  """
-  if master_char_set is None:
-    master_char_set = _ALPHANUMERIC_CHAR_SET
-
-  token_counts = collections.defaultdict(int)
-
-  for filepath in files:
-    with tf.io.gfile.GFile(filepath, mode="r") as reader:
-      file_byte_budget = file_byte_limit
-      skipped = 0
-      lines_to_skip = int(reader.size() / (file_byte_budget * 2))
-      for line in reader:
-        if skipped < lines_to_skip:
-          skipped += 1
-          continue
-        if file_byte_budget < 0:
-          break
-        if correct_strip:
-          line = native_to_unicode(line)
-        line = line.strip()
-        # The budget is charged with the stripped length of each sampled line.
-        file_byte_budget -= len(line)
-        skipped = 0
-
-        # Add words to token counts
-        sampled_tokens = _split_string_to_tokens(
-            native_to_unicode(line), master_char_set)
-        for token in sampled_tokens:
-          token_counts[token] += 1
-  return token_counts
-
-
-def _list_to_index_dict(lst):
-  """Maps each list item to its index; a repeated item keeps its last index."""
-  return dict((item, index) for index, item in enumerate(lst))
-
-
-def _split_token_to_subtokens(token, subtoken_dict, max_subtoken_length):
-  """Greedily splits a token into the longest subtokens found in the dict.
-
-  Args:
-    token: (escaped) string to split.
-    subtoken_dict: container of known subtokens, tested with `in`.
-    max_subtoken_length: length of the longest candidate substring to try.
-
-  Returns:
-    List of subtoken strings whose concatenation equals `token`.
-
-  Raises:
-    ValueError: if no known subtoken matches at some position.
-  """
-  pieces = []
-  pos = 0
-  total = len(token)
-  while pos < total:
-    # Try the longest candidate first, then shrink one character at a time.
-    longest_end = min(total, pos + max_subtoken_length)
-    matched_end = None
-    for end in xrange(longest_end, pos, -1):
-      if token[pos:end] in subtoken_dict:
-        matched_end = end
-        break
-    if matched_end is None:
-      # If there is no possible encoding of the escaped token then one of the
-      # characters in the token is not in the alphabet. This should be
-      # impossible and would be indicative of a bug.
-      raise ValueError("Was unable to split token \"%s\" into subtokens." %
-                       token)
-    pieces.append(token[pos:matched_end])
-    pos = matched_end
-  return pieces
-
-
-def _generate_subtokens_with_target_vocab_size(token_counts,
-                                               alphabet,
-                                               target_size,
-                                               threshold,
-                                               min_count=None,
-                                               reserved_tokens=None):
-  """Generate subtoken vocabulary close to the target size.
-
-  Args:
-    token_counts: dict mapping tokens to the number of times they appear.
-    alphabet: set of characters passed on to _generate_subtokens().
-    target_size: desired number of subtokens in the vocabulary.
-    threshold: the search stops once the vocabulary size differs from
-      `target_size` by less than this amount.
-    min_count: if given, the vocabulary is generated once with this value and
-      no search takes place.
-    reserved_tokens: list of tokens passed on to _generate_subtokens();
-      defaults to RESERVED_TOKENS.
-
-  Returns:
-    List of subtokens.
-  """
-  if reserved_tokens is None:
-    reserved_tokens = RESERVED_TOKENS
-
-  if min_count is not None:
-    logging.info("Using min_count=%d to generate vocab with target size %d",
-                 min_count, target_size)
-    return _generate_subtokens(
-        token_counts, alphabet, min_count, reserved_tokens=reserved_tokens)
-
-  def bisect(min_val, max_val):
-    """Recursive function to binary search for subtoken vocabulary."""
-    midpoint = (min_val + max_val) // 2
-    logging.info("Binary search: trying min_count=%d (%d %d)", midpoint,
-                 min_val, max_val)
-    candidate = _generate_subtokens(
-        token_counts, alphabet, midpoint, reserved_tokens=reserved_tokens)
-
-    size = len(candidate)
-    logging.info("Binary search: min_count=%d resulted in %d tokens", midpoint,
-                 size)
-
-    error = abs(size - target_size)
-    if error < threshold or min_val >= max_val or midpoint < 2:
-      return candidate
-    # A larger min_count yields a smaller vocabulary, and vice versa.
-    if size > target_size:
-      alternative = bisect(midpoint + 1, max_val)
-    else:
-      alternative = bisect(min_val, midpoint - 1)
-
-    # Return vocabulary dictionary with the closest number of tokens.
-    if abs(len(alternative) - target_size) < error:
-      return alternative
-    return candidate
-
-  logging.info("Finding best min_count to get target size of %d", target_size)
-  return bisect(_MIN_MIN_COUNT, _MAX_MIN_COUNT)
-
-
-def _generate_alphabet_dict(iterable, reserved_tokens=None):
-  """Builds the set of characters that appear in any element of the iterable.
-
-  The characters of the reserved tokens (RESERVED_TOKENS by default) and the
-  escape characters are always included.
-  """
-  if reserved_tokens is None:
-    reserved_tokens = RESERVED_TOKENS
-  alphabet = set()
-  for token in iterable:
-    alphabet.update(token)
-  for token in reserved_tokens:
-    alphabet.update(token)
-  alphabet.update(_ESCAPE_CHARS)  # Add escape characters to alphabet set.
-  return alphabet
-
-
-def _count_and_gen_subtokens(token_counts, alphabet, subtoken_dict,
-                             max_subtoken_length):
-  """Count number of times subtokens appear, and generate new subtokens.
-
-  Args:
-    token_counts: dict mapping tokens to the number of times they appear in the
-      original files.
-    alphabet: list of allowed characters. Used to escape the tokens, which
-      guarantees that all tokens can be split into subtokens.
-    subtoken_dict: dict mapping subtokens to ids.
-    max_subtoken_length: maximum length of subtoken in subtoken_dict.
-
-  Returns:
-    A defaultdict mapping subtokens to the number of times they appear in the
-    tokens. The dict may contain new subtokens.
-  """
-  counts = collections.defaultdict(int)
-  for token, count in six.iteritems(token_counts):
-    escaped = _escape_token(token, alphabet)
-    pieces = _split_token_to_subtokens(escaped, subtoken_dict,
-                                       max_subtoken_length)
-
-    # Every substring that starts at a subtoken boundary and runs up to any
-    # later position in the token becomes a candidate subtoken.
-    begin = 0
-    for piece in pieces:
-      for end in xrange(begin + 1, len(escaped) + 1):
-        counts[escaped[begin:end]] += count
-      begin += len(piece)
-
-  return counts
-
-
-def _filter_and_bucket_subtokens(subtoken_counts, min_count):
-  """Return a bucketed list of subtokens that are filtered by count.
-
-  Args:
-    subtoken_counts: defaultdict mapping subtokens to their counts
-    min_count: int count used to filter subtokens
-
-  Returns:
-    List of subtoken sets, where subtokens in set i have the same length=i.
-  """
-  # buckets[i] holds the subtokens of length i.
-  buckets = []
-  for subtoken, count in six.iteritems(subtoken_counts):
-    if count >= min_count:  # Keep only subtokens that appear often enough
-      length = len(subtoken)
-      missing = length + 1 - len(buckets)
-      if missing > 0:
-        # Grow the list so that index `length` exists.
-        buckets.extend(set() for _ in range(missing))
-      buckets[length].add(subtoken)
-  return buckets
-
-
-def _gen_new_subtoken_list(subtoken_counts,
-                           min_count,
-                           alphabet,
-                           reserved_tokens=None):
-  """Generate candidate subtokens ordered by count, and new max subtoken length.
-
-  Subtokens are considered in order of length (longest subtokens first). When a
-  subtoken is processed, the counts of each of its prefixes are decreased.
-  Prefixes that don't appear much outside the subtoken are therefore not added
-  to the candidate list. Note that `subtoken_counts` is modified in place.
-
-  For example:
-    subtoken being added to candidate list: 'translate'
-    subtoken_counts: {'translate':10, 't':40, 'tr':16, 'tra':12, ...}
-    min_count: 5
-
-  When 'translate' is added, subtoken_counts is updated to:
-    {'translate':0, 't':30, 'tr':6, 'tra': 2, ...}
-
-  The subtoken 'tra' will not be added to the candidate list, because it appears
-  twice (less than min_count) outside of 'translate'.
-
-  Args:
-    subtoken_counts: defaultdict mapping str subtokens to int counts
-    min_count: int minimum count requirement for subtokens
-    alphabet: set of characters. Each character is added to the subtoken list to
-      guarantee that all tokens can be encoded.
-    reserved_tokens: list of tokens that will be added to the beginning of the
-      returned subtoken list.
-
-  Returns:
-    List of candidate subtokens in decreasing count order, and maximum subtoken
-    length
-  """
-  if reserved_tokens is None:
-    reserved_tokens = RESERVED_TOKENS
-
-  # (count, subtoken) pairs for each candidate subtoken.
-  candidates = []
-
-  # buckets[i] = set(subtokens), where each subtoken has length i. The buckets
-  # make it possible to walk the subtokens in order of length.
-  buckets = _filter_and_bucket_subtokens(subtoken_counts, min_count)
-  max_subtoken_length = len(buckets) - 1
-
-  # Go through the list in reverse order to consider longer subtokens first.
-  for length in xrange(max_subtoken_length, 0, -1):
-    for subtoken in buckets[length]:
-      count = subtoken_counts[subtoken]
-
-      # Possible if this subtoken is a prefix of another token.
-      if count < min_count:
-        continue
-
-      # Ignore alphabet/reserved tokens, which will be added manually later.
-      if not (subtoken in alphabet or subtoken in reserved_tokens):
-        candidates.append((count, subtoken))
-
-      # Decrement count of the subtoken's prefixes (if a longer subtoken is
-      # added, its prefixes lose priority to be added).
-      for end in xrange(1, length):
-        subtoken_counts[subtoken[:end]] -= count
-
-  # Add alphabet subtokens (guarantees that all strings are encodable).
-  candidates.extend((subtoken_counts.get(a, 0), a) for a in alphabet)
-
-  # Order subtoken candidates by decreasing count.
-  ordered = [t for _, t in sorted(candidates, reverse=True)]
-
-  # Reserved tokens go at the beginning of the list.
-  return reserved_tokens + ordered, max_subtoken_length
-
-
-def _generate_subtokens(token_counts,
-                        alphabet,
-                        min_count,
-                        num_iterations=4,
-                        reserved_tokens=None):
-  """Iteratively build a subtoken vocabulary ordered by decreasing count.
-
-  Starts from the reserved tokens plus every alphabet character, then refines
-  the vocabulary `num_iterations` times. Each pass indexes the current
-  vocabulary, counts subtokens over `token_counts`, and regenerates the list
-  from those counts.
-
-  Args:
-    token_counts: dict mapping str tokens -> int count
-    alphabet: set of characters
-    min_count: int minimum number of times a subtoken must appear before it is
-      added to the vocabulary.
-    num_iterations: int number of refinement passes to run.
-    reserved_tokens: list of tokens placed at the beginning of the returned
-      subtoken list. Defaults to RESERVED_TOKENS when None.
-
-  Returns:
-    Sorted list of subtokens (most frequent first)
-  """
-  reserved = RESERVED_TOKENS if reserved_tokens is None else reserved_tokens
-
-  # Seed vocabulary: reserved tokens followed by the single characters.
-  vocab = reserved + list(alphabet)
-  longest = 1
-
-  # Every pass re-counts subtokens against the vocabulary from the previous
-  # pass and keeps only those whose counts are high enough.
-  for pass_index in xrange(num_iterations):
-    logging.info("\tGenerating subtokens: iteration %d", pass_index)
-
-    # subtoken -> id lookup for the vocabulary produced by the previous pass.
-    index_by_subtoken = _list_to_index_dict(vocab)
-
-    # subtoken -> count, including new candidate substrings of the tokens.
-    counts = _count_and_gen_subtokens(
-        token_counts, alphabet, index_by_subtoken, longest)
-
-    # Replace the vocabulary with the candidates sorted by their counts.
-    vocab, longest = _gen_new_subtoken_list(
-        counts, min_count, alphabet, reserved)
-
-    logging.info("\tVocab size: %d", len(vocab))
-  return vocab
--- a/official/nlp/transformer/utils/tokenizer_test.py
+++ b/official/nlp/transformer/utils/tokenizer_test.py
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Test Subtokenizer and string helper methods."""
-
-import collections
-import tempfile
-
-import tensorflow as tf
-
-from official.nlp.transformer.utils import tokenizer
-
-
-class SubtokenizerTest(tf.test.TestCase):
-  """Tests Subtokenizer encode/decode using a small vocab file."""
-
-  def _init_subtokenizer(self, vocab_list):
-    """Writes `vocab_list` to a temporary vocab file and loads a Subtokenizer.
-
-    Each subtoken is written on its own line, wrapped in single quotes.
-
-    Args:
-      vocab_list: list of str subtokens to write to the vocab file.
-
-    Returns:
-      A tokenizer.Subtokenizer built from the file with no reserved tokens.
-    """
-    # Only the path is needed. Close the handle right away so no descriptor is
-    # leaked and the file can be re-opened by name on every platform.
-    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
-      vocab_path = temp_file.name
-    # delete=False leaves the file on disk; remove it once the test finishes.
-    self.addCleanup(tf.io.gfile.remove, vocab_path)
-
-    with tf.io.gfile.GFile(vocab_path, "w") as w:
-      for subtoken in vocab_list:
-        w.write("'%s'" % subtoken)
-        w.write("\n")
-    return tokenizer.Subtokenizer(vocab_path, reserved_tokens=[])
-
-  def test_encode(self):
-    """Encoding a string yields the vocab positions of its subtokens."""
-    vocab_list = ["123_", "test", "ing_"]
-    subtokenizer = self._init_subtokenizer(vocab_list)
-    s = "testing 123"
-    encoded_list = subtokenizer.encode(s)
-    self.assertEqual([1, 2, 0], encoded_list)
-
-  def test_decode(self):
-    """Decoding a list of subtoken ids reproduces the original string."""
-    vocab_list = ["123_", "test", "ing_"]
-    subtokenizer = self._init_subtokenizer(vocab_list)
-    encoded_list = [1, 2, 0]  # testing 123
-    decoded_str = subtokenizer.decode(encoded_list)
-    self.assertEqual("testing 123", decoded_str)
-
-  def test_subtoken_ids_to_tokens(self):
-    """Subtoken ids are merged back into whole tokens."""
-    vocab_list = ["123_", "test", "ing_"]
-    subtokenizer = self._init_subtokenizer(vocab_list)
-    encoded_list = [1, 2, 0]  # testing 123
-    token_list = subtokenizer._subtoken_ids_to_tokens(encoded_list)
-    self.assertEqual([u"testing", u"123"], token_list)
-
-
-class StringHelperTest(tf.test.TestCase):
-  """Unit tests for the module-level helper functions in `tokenizer`."""
-
-  def test_split_string_to_tokens(self):
-    """Checks the token list produced for a short punctuated sentence."""
-    result = tokenizer._split_string_to_tokens(
-        "test? testing 123.", tokenizer._ALPHANUMERIC_CHAR_SET)
-    self.assertEqual(["test", "? ", "testing", "123", "."], result)
-
-  def test_join_tokens_to_string(self):
-    """Checks that joining the tokens rebuilds the sentence."""
-    pieces = ["test", "? ", "testing", "123", "."]
-    joined = tokenizer._join_tokens_to_string(
-        pieces, tokenizer._ALPHANUMERIC_CHAR_SET)
-    self.assertEqual("test? testing 123.", joined)
-
-  def test_escape_token(self):
-    """Checks the escaped form of a token containing '_', '\\' and '4'."""
-    known_chars = set("abc_\\u;")
-    result = tokenizer._escape_token(u"abc_\\4", known_chars)
-    self.assertEqual("abc\\u\\\\\\52;_", result)
-
-  def test_unescape_token(self):
-    """Checks that each escape sequence maps back to its character."""
-    raw = u"Underline: \\u, Backslash: \\\\, Unicode: \\52;"
-    self.assertEqual("Underline: _, Backslash: \\, Unicode: 4",
-                     tokenizer._unescape_token(raw))
-
-  def test_list_to_index_dict(self):
-    """Checks that each list item maps to its position."""
-    index_of = tokenizer._list_to_index_dict(["test", "strings"])
-    self.assertDictEqual({"test": 0, "strings": 1}, index_of)
-
-  def test_split_token_to_subtokens(self):
-    """Checks that "abc" splits into "ab" + "c" with the given vocab."""
-    vocab = {"a": 0, "b": 1, "c": 2, "ab": 3}
-    pieces = tokenizer._split_token_to_subtokens("abc", vocab, 2)
-    self.assertEqual(["ab", "c"], pieces)
-
-  def test_generate_alphabet_dict(self):
-    """Checks that characters of the inputs and reserved tokens are present."""
-    alphabet = tokenizer._generate_alphabet_dict(["testing", "123"], ["???"])
-    for char in ("?", "t", "e", "s", "i", "n", "g", "1", "2", "3"):
-      self.assertIn(char, alphabet)
-
-  def test_count_and_gen_subtokens(self):
-    """Checks the counts generated for a single token seen five times."""
-    vocab = {"a": 0, "b": 1, "c": 2, "_": 3}
-    counts = tokenizer._count_and_gen_subtokens(
-        {"abc": 5}, set("abc_"), vocab, 2)
-
-    self.assertIsInstance(counts, collections.defaultdict)
-    # Every expected subtoken carries the count of the one source token.
-    expected = dict.fromkeys(
-        ["a", "b", "c", "_", "ab", "bc", "c_", "abc", "bc_", "abc_"], 5)
-    self.assertDictEqual(expected, counts)
-
-  def test_filter_and_bucket_subtokens(self):
-    """Checks that subtokens below min_count are dropped from the buckets."""
-    counts = collections.defaultdict(int)
-    counts.update({"a": 2, "b": 4, "c": 1, "ab": 6, "ac": 3, "abbc": 5})
-
-    buckets = tokenizer._filter_and_bucket_subtokens(counts, 3)
-
-    # buckets[i] holds the surviving subtokens of length i.
-    self.assertEqual(len(buckets[0]), 0)
-    self.assertEqual({"b"}, buckets[1])
-    self.assertEqual({"ab", "ac"}, buckets[2])
-    self.assertEqual(len(buckets[3]), 0)
-    self.assertEqual({"abbc"}, buckets[4])
-
-  def test_gen_new_subtoken_list(self):
-    """Checks that prefixes of a longer subtoken lose that subtoken's count."""
-    counts = collections.defaultdict(
-        int, {"translate": 10, "t": 40, "tr": 16, "tra": 12})
-
-    vocab, longest = tokenizer._gen_new_subtoken_list(
-        counts, 5, set("translate"), ["reserved", "tokens"])
-
-    # "tra" must be absent: its count is decremented to 2, which is below
-    # min_count, so it should not be added to the candidate list.
-    self.assertNotIn("tra", vocab)
-
-    self.assertIn("tr", vocab)
-    self.assertIn("t", vocab)
-
-    self.assertEqual(len("translate"), longest)
-
-  def test_generate_subtokens(self):
-    """Checks reserved tokens lead the vocab and the alphabet is included."""
-    reserved = ["reserved", "tokens"]
-    letters = set("abc_")
-
-    vocab = tokenizer._generate_subtokens(
-        {"ab": 1, "bc": 3, "abc": 5}, letters, 100, 1, reserved)
-
-    # Reserved tokens come first in the list.
-    self.assertEqual(vocab[:2], reserved)
-
-    # Every alphabet character appears in the vocab list.
-    for letter in letters:
-      self.assertIn(letter, vocab)
-
-
-# Run every test case in this module when the file is executed as a script.
-if __name__ == "__main__":
-  tf.test.main()