Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
ModelZoo
ResNet50_tensorflow
Commits
9485aa1d
Commit
9485aa1d
authored
Nov 28, 2023
by
qianyj
Browse files
Update code to v2.8.0
parents
89cfa348
f5fc733a
Changes
246
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
3849 additions
and
0 deletions
+3849
-0
official/legacy/transformer/beam_search_v1.py
official/legacy/transformer/beam_search_v1.py
+82
-0
official/legacy/transformer/compute_bleu.py
official/legacy/transformer/compute_bleu.py
+148
-0
official/legacy/transformer/compute_bleu_test.py
official/legacy/transformer/compute_bleu_test.py
+72
-0
official/legacy/transformer/data_download.py
official/legacy/transformer/data_download.py
+443
-0
official/legacy/transformer/data_pipeline.py
official/legacy/transformer/data_pipeline.py
+330
-0
official/legacy/transformer/embedding_layer.py
official/legacy/transformer/embedding_layer.py
+102
-0
official/legacy/transformer/ffn_layer.py
official/legacy/transformer/ffn_layer.py
+71
-0
official/legacy/transformer/metrics.py
official/legacy/transformer/metrics.py
+180
-0
official/legacy/transformer/misc.py
official/legacy/transformer/misc.py
+288
-0
official/legacy/transformer/model_params.py
official/legacy/transformer/model_params.py
+96
-0
official/legacy/transformer/model_utils.py
official/legacy/transformer/model_utils.py
+121
-0
official/legacy/transformer/model_utils_test.py
official/legacy/transformer/model_utils_test.py
+55
-0
official/legacy/transformer/optimizer.py
official/legacy/transformer/optimizer.py
+64
-0
official/legacy/transformer/transformer.py
official/legacy/transformer/transformer.py
+550
-0
official/legacy/transformer/transformer_forward_test.py
official/legacy/transformer/transformer_forward_test.py
+156
-0
official/legacy/transformer/transformer_layers_test.py
official/legacy/transformer/transformer_layers_test.py
+125
-0
official/legacy/transformer/transformer_main.py
official/legacy/transformer/transformer_main.py
+485
-0
official/legacy/transformer/transformer_main_test.py
official/legacy/transformer/transformer_main_test.py
+193
-0
official/legacy/transformer/transformer_test.py
official/legacy/transformer/transformer_test.py
+98
-0
official/legacy/transformer/translate.py
official/legacy/transformer/translate.py
+190
-0
No files found.
Too many changes to show.
To preserve performance only
246 of 246+
files are displayed.
Plain diff
Email patch
official/legacy/transformer/beam_search_v1.py
0 → 100644
View file @
9485aa1d
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Beam search to find the translated sequence with the highest probability."""
import
tensorflow.compat.v1
as
tf
from
official.nlp.modeling.ops
import
beam_search
# Alias the (protected) beam-search state-key constants from the shared
# implementation so the subclass below can index the finished_state dict.
_StateKeys = beam_search._StateKeys  # pylint: disable=protected-access
class SequenceBeamSearch(beam_search.SequenceBeamSearch):
  """Implementation of beam search loop."""

  def _process_finished_state(self, finished_state):
    """Extract final sequences and scores from the terminal search state.

    Falls back to the alive beams for any batch item that never produced a
    finished sequence.
    """
    alive_seq = finished_state[_StateKeys.ALIVE_SEQ]
    alive_log_probs = finished_state[_StateKeys.ALIVE_LOG_PROBS]
    finished_seq = finished_state[_StateKeys.FINISHED_SEQ]
    finished_scores = finished_state[_StateKeys.FINISHED_SCORES]
    finished_flags = finished_state[_StateKeys.FINISHED_FLAGS]

    # Corner case: a batch item may have no finished sequences at all. For
    # such items, substitute the alive sequences and their log probs.
    has_finished = tf.reduce_any(finished_flags, 1)
    finished_seq = tf.where(has_finished, finished_seq, alive_seq)
    finished_scores = tf.where(has_finished, finished_scores, alive_log_probs)
    return finished_seq, finished_scores
def sequence_beam_search(symbols_to_logits_fn,
                         initial_ids,
                         initial_cache,
                         vocab_size,
                         beam_size,
                         alpha,
                         max_decode_length,
                         eos_id,
                         padded_decode=False):
  """Search for sequence of subtoken ids with the largest probability.

  Args:
    symbols_to_logits_fn: A function taking (ids, index, cache), where
      ids has shape [batch_size * beam_size, index], index is a scalar, and
      cache is a nested dictionary of tensors
      [batch_size * beam_size, ...]. It must return (logits, new_cache):
      logits of shape [batch * beam_size, vocab_size] and new_cache with the
      same shape/structure as the input cache.
    initial_ids: An int32 tensor with shape [batch_size]. Starting ids for
      each batch item.
    initial_cache: A dictionary, containing starting decoder variables
      information.
    vocab_size: An integer, the size of the vocabulary, used for topk
      computation.
    beam_size: An integer, the number of beams.
    alpha: A float, defining the strength of length normalization.
    max_decode_length: An integer, the maximum length to decoded a sequence.
    eos_id: An integer, ID of eos token, used to determine when a sequence
      has finished.
    padded_decode: A bool, indicating if max_sequence_length padding is used
      for beam search.

  Returns:
    Top decoded sequences [batch_size, beam_size, max_decode_length]
    sequence scores [batch_size, beam_size]
  """
  searcher = SequenceBeamSearch(symbols_to_logits_fn, vocab_size, beam_size,
                                alpha, max_decode_length, eos_id,
                                padded_decode)
  return searcher.search(initial_ids, initial_cache)
official/legacy/transformer/compute_bleu.py
0 → 100644
View file @
9485aa1d
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Script to compute official BLEU score.
Source:
https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/bleu_hook.py
"""
import
re
import
sys
import
unicodedata
from
absl
import
app
from
absl
import
flags
from
absl
import
logging
import
six
from
six.moves
import
range
import
tensorflow
as
tf
from
official.legacy.transformer.utils
import
metrics
from
official.legacy.transformer.utils
import
tokenizer
from
official.utils.flags
import
core
as
flags_core
class UnicodeRegex(object):
  """Ad-hoc hack to recognize all punctuation and symbols."""

  def __init__(self):
    """Precompile the punctuation/symbol regexes used by bleu_tokenize."""
    punctuation = self.property_chars("P")
    # Punctuation preceded by a non-digit (so "3.14" style separators are
    # left alone).
    self.nondigit_punct_re = re.compile(r"([^\d])([" + punctuation + r"])")
    # Punctuation followed by a non-digit.
    self.punct_nondigit_re = re.compile(r"([" + punctuation + r"])([^\d])")
    # Any Unicode symbol character (category "S").
    self.symbol_re = re.compile("([" + self.property_chars("S") + "])")

  def property_chars(self, prefix):
    """Return a string of all Unicode chars whose category starts with prefix.

    Args:
      prefix: Unicode general-category prefix, e.g. "P" (punctuation) or
        "S" (symbols).

    Returns:
      A string containing every matching code point, suitable for embedding
      in a regex character class.
    """
    # `chr` replaces the Python 2 compatibility shim `six.unichr`; the two
    # are identical on Python 3.
    return "".join(
        chr(x)
        for x in range(sys.maxunicode)
        if unicodedata.category(chr(x)).startswith(prefix))
# Module-level singleton: building the Unicode character classes scans the
# whole code-point range, so compile the regexes once at import time.
uregex = UnicodeRegex()
def bleu_tokenize(string):
  r"""Tokenize a string following the official BLEU implementation.

  See https://github.com/moses-smt/mosesdecoder/'
  'blob/master/scripts/generic/mteval-v14.pl#L954-L983

  The input string is expected to be a single line with no HTML entities to
  de-escape, so we only split on punctuation and symbols — except when a
  punctuation mark is both preceded and followed by a digit (e.g. a
  comma/dot used as a thousands/decimal separator).

  Note that a number (e.g. a year) followed by a dot at the end of a
  sentence is NOT tokenized: the dot stays with the number because
  `s/(\p{P})(\P{N})/ $1 $2/g` does not match that case (unless a space is
  appended after each sentence). This quirk exists in the original
  mteval-v14.pl as well, and we stay consistent with it.

  Args:
    string: the input string

  Returns:
    a list of tokens
  """
  for pattern, replacement in (
      (uregex.nondigit_punct_re, r"\1 \2 "),
      (uregex.punct_nondigit_re, r" \1 \2"),
      (uregex.symbol_re, r" \1 ")):
    string = pattern.sub(replacement, string)
  return string.split()
def bleu_wrapper(ref_filename, hyp_filename, case_sensitive=False):
  """Compute BLEU for two files (reference and hypothesis translation)."""

  def read_lines(filename):
    # Decode the whole file, drop surrounding whitespace, split into lines.
    return tokenizer.native_to_unicode(
        tf.io.gfile.GFile(filename).read()).strip().splitlines()

  return bleu_on_list(
      read_lines(ref_filename), read_lines(hyp_filename), case_sensitive)
def bleu_on_list(ref_lines, hyp_lines, case_sensitive=False):
  """Compute BLEU for two list of strings (reference and hypothesis)."""
  if len(ref_lines) != len(hyp_lines):
    raise ValueError(
        "Reference and translation files have different number of "
        "lines (%d VS %d). If training only a few steps (100-200), the "
        "translation may be empty." % (len(ref_lines), len(hyp_lines)))
  if not case_sensitive:
    # Uncased BLEU: lowercase both sides before tokenizing.
    ref_lines = list(map(str.lower, ref_lines))
    hyp_lines = list(map(str.lower, hyp_lines))
  score = metrics.compute_bleu(
      [bleu_tokenize(line) for line in ref_lines],
      [bleu_tokenize(line) for line in hyp_lines])
  # Scale the [0, 1] BLEU to the conventional 0-100 range.
  return score * 100
def main(unused_argv):
  """Log the BLEU variants requested via --bleu_variant."""
  # (variant name, case_sensitive flag, log message) — uncased first, to
  # match the reporting order users expect.
  variants = (
      ("uncased", False, "Case-insensitive results: %f"),
      ("cased", True, "Case-sensitive results: %f"),
  )
  for variant_name, case_sensitive, message in variants:
    if FLAGS.bleu_variant in ("both", variant_name):
      score = bleu_wrapper(FLAGS.reference, FLAGS.translation, case_sensitive)
      logging.info(message, score)
def define_compute_bleu_flags():
  """Add flags for computing BLEU score.

  Defines --translation and --reference (both required file paths) and the
  optional --bleu_variant / -bv enum selecting cased/uncased scoring.
  """
  flags.DEFINE_string(
      name="translation",
      default=None,
      help=flags_core.help_wrap("File containing translated text."))
  flags.mark_flag_as_required("translation")

  flags.DEFINE_string(
      name="reference",
      default=None,
      help=flags_core.help_wrap("File containing reference translation."))
  flags.mark_flag_as_required("reference")

  # case_sensitive=False here refers to how the flag VALUE is matched, not
  # to the BLEU computation itself.
  flags.DEFINE_enum(
      name="bleu_variant",
      short_name="bv",
      default="both",
      enum_values=["both", "uncased", "cased"],
      case_sensitive=False,
      help=flags_core.help_wrap(
          "Specify one or more BLEU variants to calculate. Variants: \"cased\""
          ", \"uncased\", or \"both\"."))
if __name__ == "__main__":
  # Flags must be defined before flags.FLAGS is first accessed/parsed.
  define_compute_bleu_flags()
  FLAGS = flags.FLAGS
  app.run(main)
official/legacy/transformer/compute_bleu_test.py
0 → 100644
View file @
9485aa1d
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Test functions in compute_bleu.py."""
import
tempfile
import
tensorflow
as
tf
from
official.legacy.transformer
import
compute_bleu
class ComputeBleuTest(tf.test.TestCase):
  """Unit tests for the BLEU helpers in compute_bleu."""

  def _create_temp_file(self, text):
    # delete=False keeps the file on disk after the handle closes so it can
    # be reopened by path; content is written via tf.io.gfile for
    # consistency with the code under test.
    temp_file = tempfile.NamedTemporaryFile(delete=False)
    with tf.io.gfile.GFile(temp_file.name, "w") as w:
      w.write(text)
    return temp_file.name

  def test_bleu_same(self):
    # Identical reference and hypothesis: perfect 100 for both variants.
    ref = self._create_temp_file("test 1 two 3\nmore tests!")
    hyp = self._create_temp_file("test 1 two 3\nmore tests!")

    uncased_score = compute_bleu.bleu_wrapper(ref, hyp, False)
    cased_score = compute_bleu.bleu_wrapper(ref, hyp, True)
    self.assertEqual(100, uncased_score)
    self.assertEqual(100, cased_score)

  def test_bleu_same_different_case(self):
    # Differ only in letter case: uncased stays perfect, cased is penalized.
    ref = self._create_temp_file("Test 1 two 3\nmore tests!")
    hyp = self._create_temp_file("test 1 two 3\nMore tests!")
    uncased_score = compute_bleu.bleu_wrapper(ref, hyp, False)
    cased_score = compute_bleu.bleu_wrapper(ref, hyp, True)
    self.assertEqual(100, uncased_score)
    self.assertLess(cased_score, 100)

  def test_bleu_different(self):
    # Completely different texts: both variants score below 100.
    ref = self._create_temp_file("Testing\nmore tests!")
    hyp = self._create_temp_file("Dog\nCat")
    uncased_score = compute_bleu.bleu_wrapper(ref, hyp, False)
    cased_score = compute_bleu.bleu_wrapper(ref, hyp, True)
    self.assertLess(uncased_score, 100)
    self.assertLess(cased_score, 100)

  def test_bleu_tokenize(self):
    # Commas adjacent to a non-digit are split into separate tokens; the
    # digits themselves stay attached to their words.
    s = "Test0, 1 two, 3"
    tokenized = compute_bleu.bleu_tokenize(s)
    self.assertEqual(["Test0", ",", "1", "two", ",", "3"], tokenized)

  def test_bleu_list(self):
    # bleu_on_list mirrors bleu_wrapper but takes in-memory line lists.
    ref = ["test 1 two 3", "more tests!"]
    hyp = ["test 1 two 3", "More tests!"]
    uncased_score = compute_bleu.bleu_on_list(ref, hyp, False)
    cased_score = compute_bleu.bleu_on_list(ref, hyp, True)
    self.assertEqual(uncased_score, 100)
    self.assertLess(cased_score, 100)
if __name__ == "__main__":
  # Run all test cases in this module via the TensorFlow test runner.
  tf.test.main()
official/legacy/transformer/data_download.py
0 → 100644
View file @
9485aa1d
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Download and preprocess WMT17 ende training and evaluation datasets."""
import
os
import
random
import
tarfile
# pylint: disable=g-bad-import-order
from
absl
import
app
from
absl
import
flags
from
absl
import
logging
import
six
from
six.moves
import
range
from
six.moves
import
urllib
from
six.moves
import
zip
import
tensorflow.compat.v1
as
tf
from
official.legacy.transformer.utils
import
tokenizer
from
official.utils.flags
import
core
as
flags_core
# pylint: enable=g-bad-import-order
# Data sources for training/evaluating the transformer translation model.
# If any of the training sources are changed, then either:
#   1) use the flag `--search` to find the best min count or
#   2) update the _TRAIN_DATA_MIN_COUNT constant.
# min_count is the minimum number of times a token must appear in the data
# before it is added to the vocabulary. "Best min count" refers to the value
# that generates a vocabulary set that is closest in size to
# _TARGET_VOCAB_SIZE.
_TRAIN_DATA_SOURCES = [
    {
        "url": "http://data.statmt.org/wmt17/translation-task/"
               "training-parallel-nc-v12.tgz",
        "input": "news-commentary-v12.de-en.en",
        "target": "news-commentary-v12.de-en.de",
    },
    {
        "url": "http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz",
        "input": "commoncrawl.de-en.en",
        "target": "commoncrawl.de-en.de",
    },
    {
        "url": "http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz",
        "input": "europarl-v7.de-en.en",
        "target": "europarl-v7.de-en.de",
    },
]
# Use pre-defined minimum count to generate subtoken vocabulary.
_TRAIN_DATA_MIN_COUNT = 6

_EVAL_DATA_SOURCES = [{
    "url": "http://data.statmt.org/wmt17/translation-task/dev.tgz",
    "input": "newstest2013.en",
    "target": "newstest2013.de",
}]

_TEST_DATA_SOURCES = [{
    "url": ("https://storage.googleapis.com/cloud-tpu-test-datasets/"
            "transformer_data/newstest2014.tgz"),
    "input": "newstest2014.en",
    "target": "newstest2014.de",
}]

# Vocabulary constants
_TARGET_VOCAB_SIZE = 32768  # Number of subtokens in the vocabulary list.
_TARGET_THRESHOLD = 327  # Accept vocabulary if size is within this threshold
VOCAB_FILE = "vocab.ende.%d" % _TARGET_VOCAB_SIZE

# Strings to include in the generated files.
_PREFIX = "wmt32k"
_TRAIN_TAG = "train"
_EVAL_TAG = "dev"  # Following WMT and Tensor2Tensor conventions, in which the
# evaluation datasets are tagged as "dev" for development.

# Number of files to split train and evaluation data
_TRAIN_SHARDS = 100
_EVAL_SHARDS = 1
def find_file(path, filename, max_depth=5):
  """Return the full path of `filename` under `path`, or None if absent.

  Walks the tree rooted at `path`, descending no more than `max_depth`
  directory levels below the root.
  """
  prefix_len = len(path) + 1
  for root, subdirs, files in os.walk(path):
    if filename in files:
      return os.path.join(root, filename)

    # Depth of `root` relative to `path`. Once past max_depth, empty the
    # subdirectory list in place so os.walk stops descending.
    if root[prefix_len:].count(os.sep) > max_depth:
      subdirs.clear()
  return None
###############################################################################
# Download and extraction functions
###############################################################################
def get_raw_files(raw_dir, data_source):
  """Return raw files from source. Downloads/extracts if needed.

  Args:
    raw_dir: string directory to store raw files
    data_source: dictionary with
      {"url": url of compressed dataset containing input and target files
       "input": file with data in input language
       "target": file with data in target language}

  Returns:
    dictionary with
      {"inputs": list of files containing data in input language
       "targets": list of files containing corresponding data in target
         language}
  """
  raw_files = {"inputs": [], "targets": []}
  for source in data_source:
    input_file, target_file = download_and_extract(
        raw_dir, source["url"], source["input"], source["target"])
    raw_files["inputs"].append(input_file)
    raw_files["targets"].append(target_file)
  return raw_files
def download_report_hook(count, block_size, total_size):
  """Report hook for download progress.

  Args:
    count: current block number
    block_size: block size
    total_size: total size
  """
  # urllib passes total_size <= 0 (typically -1) when the server does not
  # report a Content-Length. The original code then printed a nonsense
  # negative percentage (and would divide by zero for total_size == 0), so
  # skip reporting entirely in that case.
  if total_size <= 0:
    return
  percent = int(count * block_size * 100 / total_size)
  # Carriage return keeps the progress line updating in place.
  print("\r%d%% completed" % percent, end="\r")
def download_from_url(path, url):
  """Download content from a url.

  Args:
    path: string directory where file will be downloaded
    url: string url

  Returns:
    Full path to downloaded file
  """
  filename = six.ensure_str(url).split("/")[-1]
  found_file = find_file(path, filename, max_depth=0)
  if found_file is not None:
    logging.info("Already downloaded: %s (at %s).", url, found_file)
    return found_file

  filename = os.path.join(path, filename)
  logging.info("Downloading from %s to %s.", url, filename)
  # Download to an ".incomplete" name first so an interrupted transfer is
  # never mistaken for a finished file.
  inprogress_filepath = six.ensure_str(filename) + ".incomplete"
  inprogress_filepath, _ = urllib.request.urlretrieve(
      url, inprogress_filepath, reporthook=download_report_hook)
  # Print newline to clear the carriage return from the download progress.
  print()
  tf.gfile.Rename(inprogress_filepath, filename)
  return filename
def download_and_extract(path, url, input_filename, target_filename):
  """Extract files from downloaded compressed archive file.

  Args:
    path: string directory where the files will be downloaded
    url: url containing the compressed input and target files
    input_filename: name of file containing data in source language
    target_filename: name of file containing data in target language

  Returns:
    Full paths to extracted input and target files.

  Raises:
    OSError: if the the download/extraction fails.
  """
  # Check if extracted files already exist in path
  input_file = find_file(path, input_filename)
  target_file = find_file(path, target_filename)
  if input_file and target_file:
    logging.info("Already downloaded and extracted %s.", url)
    return input_file, target_file

  # Download archive file if it doesn't already exist.
  compressed_file = download_from_url(path, url)

  # Extract compressed files
  logging.info("Extracting %s.", compressed_file)
  with tarfile.open(compressed_file, "r:gz") as corpus_tar:
    # NOTE(review): extractall() trusts member paths inside the archive; a
    # malicious tarball could write outside `path` (path traversal).
    # Consider extractall(path, filter="data") on Python >= 3.12, or
    # validating member names before extraction.
    corpus_tar.extractall(path)

  # Return file paths of the requested files.
  input_file = find_file(path, input_filename)
  target_file = find_file(path, target_filename)

  if input_file and target_file:
    return input_file, target_file

  raise OSError("Download/extraction failed for url %s to path %s" %
                (url, path))
def txt_line_iterator(path):
  """Yield each line of the file at `path`, stripped of whitespace."""
  with tf.io.gfile.GFile(path) as f:
    yield from (raw_line.strip() for raw_line in f)
def compile_files(raw_dir, raw_files, tag):
  """Compile raw files into a single file for each language.

  Args:
    raw_dir: Directory containing downloaded raw files.
    raw_files: Dict containing filenames of input and target data.
      {"inputs": list of files containing data in input language
       "targets": list of files containing corresponding data in target
         language}
    tag: String to append to the compiled filename.

  Returns:
    Full path of compiled input and target files.
  """
  logging.info("Compiling files with tag %s.", tag)
  filename = "%s-%s" % (_PREFIX, tag)
  input_compiled_file = os.path.join(raw_dir,
                                     six.ensure_str(filename) + ".lang1")
  target_compiled_file = os.path.join(raw_dir,
                                      six.ensure_str(filename) + ".lang2")

  # Open both outputs together and concatenate each parallel pair of raw
  # files, keeping input/target lines aligned.
  with tf.io.gfile.GFile(input_compiled_file, mode="w") as input_writer, \
      tf.io.gfile.GFile(target_compiled_file, mode="w") as target_writer:
    for input_file, target_file in zip(raw_files["inputs"],
                                       raw_files["targets"]):
      logging.info("Reading files %s and %s.", input_file, target_file)
      write_file(input_writer, input_file)
      write_file(target_writer, target_file)
  return input_compiled_file, target_compiled_file
def write_file(writer, filename):
  """Copy every line of `filename` to `writer`, newline-terminated."""
  for stripped_line in txt_line_iterator(filename):
    writer.write(stripped_line + "\n")
###############################################################################
# Data preprocessing
###############################################################################
def encode_and_save_files(subtokenizer, data_dir, raw_files, tag,
                          total_shards):
  """Save data from files as encoded Examples in TFrecord format.

  Args:
    subtokenizer: Subtokenizer object that will be used to encode the
      strings.
    data_dir: The directory in which to write the examples
    raw_files: A tuple of (input, target) data files. Each line in the input
      and the corresponding line in target file will be saved in a
      tf.Example.
    tag: String that will be added onto the file names.
    total_shards: Number of files to divide the data into.

  Returns:
    List of all files produced.
  """
  # Create a file for each shard.
  filepaths = [
      shard_filename(data_dir, tag, n + 1, total_shards)
      for n in range(total_shards)
  ]

  if all_exist(filepaths):
    logging.info("Files with tag %s already exist.", tag)
    return filepaths

  logging.info("Saving files with tag %s.", tag)
  input_file = raw_files[0]
  target_file = raw_files[1]

  # Write examples to each shard in round robin order. Writing goes to
  # ".incomplete" temp names first; they are renamed to the final names only
  # after all writers close, so partial runs are never mistaken for output.
  tmp_filepaths = [
      six.ensure_str(fname) + ".incomplete" for fname in filepaths
  ]
  writers = [tf.python_io.TFRecordWriter(fname) for fname in tmp_filepaths]
  # counter also survives the loop: it is used below to log the total number
  # of examples written (counter + 1). Initialized here in case the input
  # files are empty and the loop body never runs.
  counter, shard = 0, 0
  for counter, (input_line, target_line) in enumerate(
      zip(txt_line_iterator(input_file), txt_line_iterator(target_file))):
    if counter > 0 and counter % 100000 == 0:
      logging.info("\tSaving case %d.", counter)
    example = dict_to_example({
        "inputs": subtokenizer.encode(input_line, add_eos=True),
        "targets": subtokenizer.encode(target_line, add_eos=True)
    })
    writers[shard].write(example.SerializeToString())
    shard = (shard + 1) % total_shards
  for writer in writers:
    writer.close()

  for tmp_name, final_name in zip(tmp_filepaths, filepaths):
    tf.gfile.Rename(tmp_name, final_name)

  logging.info("Saved %d Examples", counter + 1)
  return filepaths
def shard_filename(path, tag, shard_num, total_shards):
  """Create filename for data shard."""
  shard_name = "%s-%s-%.5d-of-%.5d" % (_PREFIX, tag, shard_num, total_shards)
  return os.path.join(path, shard_name)
def shuffle_records(fname):
  """Shuffle records in a single file."""
  logging.info("Shuffling records in file %s", fname)

  # Move the file aside before shuffling so the original name can receive
  # the shuffled output.
  tmp_fname = six.ensure_str(fname) + ".unshuffled"
  tf.gfile.Rename(fname, tmp_fname)

  # Read every record into memory, logging progress every 100k records.
  records = []
  for num_read, record in enumerate(tf.io.tf_record_iterator(tmp_fname), 1):
    records.append(record)
    if num_read % 100000 == 0:
      logging.info("\tRead: %d", num_read)

  random.shuffle(records)

  # Write shuffled records back under the original file name.
  with tf.python_io.TFRecordWriter(fname) as w:
    for count, record in enumerate(records):
      w.write(record)
      if count > 0 and count % 100000 == 0:
        logging.info("\tWriting record: %d", count)

  tf.gfile.Remove(tmp_fname)
def dict_to_example(dictionary):
  """Converts a dictionary of string->int to a tf.Example."""
  features = {
      key: tf.train.Feature(int64_list=tf.train.Int64List(value=values))
      for key, values in six.iteritems(dictionary)
  }
  return tf.train.Example(features=tf.train.Features(feature=features))
def all_exist(filepaths):
  """Returns true if all files in the list exist."""
  return all(tf.gfile.Exists(fname) for fname in filepaths)
def make_dir(path):
  """Create directory `path` (with parents) unless it already exists."""
  if tf.gfile.Exists(path):
    return
  logging.info("Creating directory %s", path)
  tf.gfile.MakeDirs(path)
def main(unused_argv):
  """Obtain training and evaluation data for the Transformer model."""
  make_dir(FLAGS.raw_dir)
  make_dir(FLAGS.data_dir)

  # Download test_data
  logging.info("Step 1/5: Downloading test data")
  get_raw_files(FLAGS.data_dir, _TEST_DATA_SOURCES)

  # Get paths of download/extracted training and evaluation files.
  logging.info("Step 2/5: Downloading data from source")
  train_files = get_raw_files(FLAGS.raw_dir, _TRAIN_DATA_SOURCES)
  eval_files = get_raw_files(FLAGS.raw_dir, _EVAL_DATA_SOURCES)

  # Create subtokenizer based on the training files.
  logging.info("Step 3/5: Creating subtokenizer and building vocabulary")
  train_files_flat = train_files["inputs"] + train_files["targets"]
  vocab_file = os.path.join(FLAGS.data_dir, VOCAB_FILE)
  # With --search, min_count=None lets the tokenizer binary-search for the
  # min count whose vocabulary is closest to _TARGET_VOCAB_SIZE; otherwise
  # the precomputed _TRAIN_DATA_MIN_COUNT is used directly.
  subtokenizer = tokenizer.Subtokenizer.init_from_files(
      vocab_file,
      train_files_flat,
      _TARGET_VOCAB_SIZE,
      _TARGET_THRESHOLD,
      min_count=None if FLAGS.search else _TRAIN_DATA_MIN_COUNT)

  logging.info("Step 4/5: Compiling training and evaluation data")
  compiled_train_files = compile_files(FLAGS.raw_dir, train_files, _TRAIN_TAG)
  compiled_eval_files = compile_files(FLAGS.raw_dir, eval_files, _EVAL_TAG)

  # Tokenize and save data as Examples in the TFRecord format.
  logging.info("Step 5/5: Preprocessing and saving data")
  train_tfrecord_files = encode_and_save_files(subtokenizer, FLAGS.data_dir,
                                               compiled_train_files,
                                               _TRAIN_TAG, _TRAIN_SHARDS)
  encode_and_save_files(subtokenizer, FLAGS.data_dir, compiled_eval_files,
                        _EVAL_TAG, _EVAL_SHARDS)

  # Only the training shards are shuffled; evaluation order is preserved.
  for fname in train_tfrecord_files:
    shuffle_records(fname)
def define_data_download_flags():
  """Add flags specifying data download arguments."""
  flags.DEFINE_string(
      name="data_dir",
      short_name="dd",
      default="/tmp/translate_ende",
      help=flags_core.help_wrap(
          "Directory for where the translate_ende_wmt32k dataset is saved."))
  flags.DEFINE_string(
      name="raw_dir",
      short_name="rd",
      default="/tmp/translate_ende_raw",
      help=flags_core.help_wrap(
          "Path where the raw data will be downloaded and extracted."))
  flags.DEFINE_bool(
      name="search",
      default=False,
      # Bug fix: the two implicitly-concatenated string fragments previously
      # joined without a space, rendering "...with sizeclosest to...".
      help=flags_core.help_wrap(
          "If set, use binary search to find the vocabulary set with size "
          "closest to the target size (%d)." % _TARGET_VOCAB_SIZE))
if __name__ == "__main__":
  # Surface the Step 1/5..5/5 progress messages emitted via logging.info.
  logging.set_verbosity(logging.INFO)
  define_data_download_flags()
  FLAGS = flags.FLAGS
  app.run(main)
official/legacy/transformer/data_pipeline.py
0 → 100644
View file @
9485aa1d
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Input pipeline for the transformer model to read, filter, and batch examples.
Two things to note in the pipeline:
1. Batching scheme
The examples encoded in the TFRecord files contain data in the format:
{"inputs": [variable length array of integers],
"targets": [variable length array of integers]}
Where integers in the arrays refer to tokens in the English and German vocab
file (named `vocab.ende.32768`).
Prior to batching, elements in the dataset are grouped by length (max between
"inputs" and "targets" length). Each group is then batched such that:
group_batch_size * length <= batch_size.
Another way to view batch_size is the maximum number of tokens in each batch.
Once batched, each element in the dataset will have the shape:
{"inputs": [group_batch_size, padded_input_length],
"targets": [group_batch_size, padded_target_length]}
Lengths are padded to the longest "inputs" or "targets" sequence in the batch
(padded_input_length and padded_target_length can be different).
This batching scheme decreases the fraction of padding tokens per training
batch, thus improving the training speed significantly.
2. Shuffling
While training, the dataset is shuffled in two places in the code. The first
is the list of training files. Second, while reading records using
`parallel_interleave`, the `sloppy` argument is used to generate randomness
in the order of the examples.
"""
import
os
from
absl
import
logging
import
tensorflow
as
tf
from
official.utils.misc
import
model_helpers
# Buffer size for reading records from a TFRecord file. Each training file
# is 7.2 MB, so 8 MB allows an entire file to be kept in memory.
_READ_RECORD_BUFFER = 8 * 1000 * 1000

# Example grouping constants. Defines length boundaries for each group.
# These values are the defaults used in Tensor2Tensor.
_MIN_BOUNDARY = 8
_BOUNDARY_SCALE = 1.1
def _load_records(filename):
  """Return a TFRecordDataset of serialized tf.Examples from `filename`."""
  # Large read buffer so an entire shard can be held in memory (see
  # _READ_RECORD_BUFFER above).
  return tf.data.TFRecordDataset(filename, buffer_size=_READ_RECORD_BUFFER)
def _parse_example(serialized_example):
  """Return inputs and targets Tensors from a serialized tf.Example."""
  feature_spec = {
      "inputs": tf.io.VarLenFeature(tf.int64),
      "targets": tf.io.VarLenFeature(tf.int64),
  }
  parsed = tf.io.parse_single_example(serialized_example, feature_spec)
  # VarLenFeature parses to SparseTensors; densify before returning.
  return (tf.sparse.to_dense(parsed["inputs"]),
          tf.sparse.to_dense(parsed["targets"]))
def _filter_max_length(example, max_length=256):
  """Indicates whether the example's length is lower than the maximum length."""
  inputs_ok = tf.size(example[0]) <= max_length
  targets_ok = tf.size(example[1]) <= max_length
  return tf.logical_and(inputs_ok, targets_ok)
def _get_example_length(example):
  """Returns the maximum length between the example inputs and targets."""
  input_length = tf.shape(example[0])[0]
  target_length = tf.shape(example[1])[0]
  return tf.maximum(input_length, target_length)
def _create_min_max_boundaries(max_length,
                               min_boundary=_MIN_BOUNDARY,
                               boundary_scale=_BOUNDARY_SCALE):
  """Create min and max boundary lists up to max_length.

  For example, when max_length=24, min_boundary=4 and boundary_scale=2, the
  returned values will be:
    buckets_min = [0, 4, 8, 16]
    buckets_max = [4, 8, 16, 25]
  (The boundary sequence is 4, 8, 16; the last max bucket extends to
  max_length + 1 so an example of exactly max_length still falls in range.)

  Args:
    max_length: The maximum length of example in dataset.
    min_boundary: Minimum length in boundary.
    boundary_scale: Amount to scale consecutive boundaries in the list.

  Returns:
    min and max boundary lists
  """
  # Grow the boundary geometrically, adding at least 1 each step so the
  # sequence strictly increases even when scaling rounds down.
  boundaries = []
  boundary = min_boundary
  while boundary < max_length:
    boundaries.append(boundary)
    boundary = max(boundary + 1, int(boundary * boundary_scale))

  # Derive the per-bucket [min, max) ranges from the shared boundary list.
  return [0] + boundaries, boundaries + [max_length + 1]
def _batch_examples(dataset, batch_size, max_length):
  """Group examples by similar lengths, and return batched dataset.

  Each batch of similar-length examples are padded to the same length, and
  may have different number of elements in each batch, such that:
    group_batch_size * padded_length <= batch_size.
  This decreases the number of padding tokens per batch, which improves the
  training speed.

  Args:
    dataset: Dataset of unbatched examples.
    batch_size: Max number of tokens per batch of examples.
    max_length: Max number of tokens in an example input or target sequence.

  Returns:
    Dataset of batched examples with similar lengths.
  """
  # Get min and max boundary lists for each example. These are used to
  # calculate the `bucket_id`, which is the index at which:
  #   buckets_min[bucket_id] <= len(example) < buckets_max[bucket_id]
  # Note that using both min and max lists improves the performance.
  buckets_min, buckets_max = _create_min_max_boundaries(max_length)

  # Create list of batch sizes for each bucket_id, so that
  #   bucket_batch_size[bucket_id] * buckets_max[bucket_id] <= batch_size
  bucket_batch_sizes = [int(batch_size) // x for x in buckets_max]
  # bucket_id will be a tensor, so convert this list to a tensor as well.
  bucket_batch_sizes = tf.constant(bucket_batch_sizes, dtype=tf.int64)

  def example_to_bucket_id(example_input, example_target):
    """Return int64 bucket id for this example, calculated based on length."""
    seq_length = _get_example_length((example_input, example_target))

    # TODO(xunkai): investigate if removing code branching improves
    # performance.
    # conditions_c is True exactly at the one bucket whose [min, max) range
    # contains seq_length; reduce_min over the matching indices selects it.
    conditions_c = tf.logical_and(
        tf.less_equal(buckets_min, seq_length),
        tf.less(seq_length, buckets_max))
    bucket_id = tf.reduce_min(tf.where(conditions_c))
    return bucket_id

  def window_size_fn(bucket_id):
    """Return number of examples to be grouped when given a bucket id."""
    return bucket_batch_sizes[bucket_id]

  def batching_fn(bucket_id, grouped_dataset):
    """Batch and add padding to a dataset of elements with similar lengths."""
    bucket_batch_size = window_size_fn(bucket_id)

    # Batch the dataset and add padding so that all input sequences in the
    # examples have the same length, and all target sequences have the same
    # lengths as well. Resulting lengths of inputs and targets can differ.
    return grouped_dataset.padded_batch(bucket_batch_size, ([None], [None]))

  return dataset.apply(
      tf.data.experimental.group_by_window(
          key_func=example_to_bucket_id,
          reduce_func=batching_fn,
          window_size=None,
          window_size_func=window_size_fn))
def _read_and_batch_from_files(file_pattern,
                               batch_size,
                               max_length,
                               max_io_parallelism,
                               shuffle,
                               repeat,
                               static_batch=False,
                               num_replicas=1,
                               ctx=None):
  """Create dataset where each item is a dict of "inputs" and "targets".

  Args:
    file_pattern: String used to match the input TFRecord files.
    batch_size: Maximum number of tokens per global batch of examples.
    max_length: Maximum number of tokens per example.
    max_io_parallelism: Max number of cpu cores for parallel input processing.
    shuffle: If true, randomizes order of elements.
    repeat: Number of times to repeat the dataset. If None, the dataset is
      repeated forever.
    static_batch: Whether the batches in the dataset should have static shapes.
      If True, every batch has shape [batch_size // max_length, max_length].
      If False, the input is grouped by length and batched into shapes [N, M]
      with N * M <= batch_size and M <= max_length. Dynamic shapes minimize
      padding and help training; use static shapes where required (e.g. TPU).
    num_replicas: Number of GPUs or other workers. Global batches are made
      equally divisible by the number of replicas. Currently only effective
      when static_batch==True.
    ctx: Input context.

  Returns:
    tf.data.Dataset object containing examples loaded from the files.
  """
  dataset = tf.data.Dataset.list_files(file_pattern, shuffle=shuffle)

  # When the input pipeline is sharded across workers, keep only this
  # worker's shard of the file list.
  if ctx and ctx.num_input_pipelines > 1:
    logging.info("Shard %d of the dataset.", ctx.input_pipeline_id)
    dataset = dataset.shard(ctx.num_input_pipelines, ctx.input_pipeline_id)

  # Interleave reads across files. Ordering is allowed to be
  # non-deterministic, which speeds up training input.
  options = tf.data.Options()
  options.experimental_deterministic = False
  dataset = dataset.interleave(
      _load_records,
      cycle_length=max_io_parallelism,
      num_parallel_calls=tf.data.experimental.AUTOTUNE).with_options(options)

  # Decode each serialized tf.Example.
  # TODO: Look into prefetch_input_elements for performance optimization. # pylint: disable=g-bad-todo
  dataset = dataset.map(
      _parse_example, num_parallel_calls=tf.data.experimental.AUTOTUNE)

  # Drop examples whose input or target exceeds the maximum length.
  dataset = dataset.filter(
      lambda x, y: _filter_max_length((x, y), max_length))

  if static_batch:
    # Compute the per-worker token budget, convert it to a sentence count,
    # then scale back up so the global batch divides evenly across replicas.
    sentences_per_global_batch = int(
        batch_size // num_replicas // max_length * num_replicas)
    dataset = dataset.padded_batch(
        sentences_per_global_batch, ([max_length], [max_length]),
        drop_remainder=True)
  else:
    # Group and batch such that each batch has examples of similar length.
    # TODO(xunkai): _batch_examples might need to do something special for
    # num_replicas.
    dataset = _batch_examples(dataset, batch_size, max_length)

  dataset = dataset.repeat(repeat)

  # Prefetch the next element to improve speed of input pipeline.
  return dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
def _generate_synthetic_data(params):
  """Create synthetic data based on the parameter batch size.

  Args:
    params: Dict with "max_length", "batch_size" and "static_batch" keys.

  Returns:
    tf.data.Dataset of constant-valued (input, label) int64 sequence pairs.
  """
  length = params["max_length"]
  # Convert the token budget into a number of fixed-length sentences.
  sentences_per_batch = int(params["batch_size"] // length)
  dataset = model_helpers.generate_synthetic_data(
      input_shape=tf.TensorShape([length]),
      input_value=1,
      input_dtype=tf.int64,
      label_shape=tf.TensorShape([length]),
      label_value=1,
      label_dtype=tf.int64,
  )
  if params["static_batch"]:
    return dataset.batch(sentences_per_batch, drop_remainder=True)
  return dataset.padded_batch(sentences_per_batch, ([None], [None]))
def train_input_fn(params, ctx=None):
  """Load and return dataset of batched examples for use during training."""
  # Synthetic data bypasses file reading entirely.
  if params["use_synthetic_data"]:
    return _generate_synthetic_data(params)
  file_pattern = os.path.join(params["data_dir"] or "", "*train*")
  return _read_and_batch_from_files(
      file_pattern,
      params["batch_size"],
      params["max_length"],
      params["max_io_parallelism"],
      shuffle=True,
      repeat=params["repeat_dataset"],
      static_batch=params["static_batch"],
      num_replicas=params["num_gpus"],
      ctx=ctx)
def eval_input_fn(params, ctx=None):
  """Load and return dataset of batched examples for use during evaluation."""
  # Synthetic data bypasses file reading entirely.
  if params["use_synthetic_data"]:
    return _generate_synthetic_data(params)
  file_pattern = os.path.join(params["data_dir"] or "", "*dev*")
  # Evaluation reads the data once, in order.
  return _read_and_batch_from_files(
      file_pattern,
      params["batch_size"],
      params["max_length"],
      params["max_io_parallelism"],
      shuffle=False,
      repeat=1,
      static_batch=params["static_batch"],
      num_replicas=params["num_gpus"],
      ctx=ctx)
def map_data_for_transformer_fn(x, y):
  """Maps data for training, and handles weird behaviors for different vers."""
  # Bundle features and labels into a one-element tuple so Keras (TF v2)
  # receives them as a single model input with no separate 2nd argument.
  return ((x, y),)
official/legacy/transformer/embedding_layer.py
0 → 100644
View file @
9485aa1d
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Implementation of embedding layer with shared weights."""
import
tensorflow
as
tf
class EmbeddingSharedWeights(tf.keras.layers.Layer):
  """Calculates input embeddings and pre-softmax linear with shared weights.

  A single [vocab_size, hidden_size] matrix is used both to look up input
  token embeddings and, transposed, to project decoder outputs to vocabulary
  logits (weight tying).
  """

  def __init__(self, vocab_size, hidden_size):
    """Specify characteristic parameters of embedding layer.

    Args:
      vocab_size: Number of tokens in the embedding. (Typically ~32,000)
      hidden_size: Dimensionality of the embedding. (Typically 512 or 1024)
    """
    super(EmbeddingSharedWeights, self).__init__()
    self.vocab_size = vocab_size
    self.hidden_size = hidden_size

  def build(self, input_shape):
    """Create the shared embedding/softmax weight matrix."""
    with tf.name_scope("embedding_and_softmax"):
      # The random normal initializer (stddev = hidden_size**-0.5) was chosen
      # arbitrarily, and works well.
      self.shared_weights = self.add_weight(
          "weights",
          shape=[self.vocab_size, self.hidden_size],
          dtype=tf.float32,
          initializer=tf.random_normal_initializer(
              mean=0., stddev=self.hidden_size**-0.5))
    super(EmbeddingSharedWeights, self).build(input_shape)

  def get_config(self):
    return {
        "vocab_size": self.vocab_size,
        "hidden_size": self.hidden_size,
    }

  def call(self, inputs, mode="embedding"):
    """Get token embeddings of inputs.

    Args:
      inputs: An int64 tensor with shape [batch_size, length]
      mode: string, a valid value is one of "embedding" and "linear".

    Returns:
      outputs: (1) If mode == "embedding", output embedding tensor, float32
        with shape [batch_size, length, embedding_size]; (2) mode == "linear",
        output linear tensor, float32 with shape
        [batch_size, length, vocab_size].

    Raises:
      ValueError: if mode is not valid.
    """
    if mode == "embedding":
      return self._embedding(inputs)
    if mode == "linear":
      return self._linear(inputs)
    raise ValueError("mode {} is not valid.".format(mode))

  def _embedding(self, inputs):
    """Look up and scale token embeddings for `inputs`."""
    with tf.name_scope("embedding"):
      gathered = tf.gather(self.shared_weights, inputs)
      # Scale embedding by the sqrt of the hidden size.
      return gathered * self.hidden_size**0.5

  def _linear(self, inputs):
    """Computes logits by running inputs through a linear layer.

    Args:
      inputs: A float32 tensor with shape [batch_size, length, hidden_size]

    Returns:
      float32 tensor with shape [batch_size, length, vocab_size].
    """
    with tf.name_scope("presoftmax_linear"):
      batch_size = tf.shape(inputs)[0]
      length = tf.shape(inputs)[1]
      # Collapse batch and length so a single matmul produces all logits.
      flattened = tf.reshape(inputs, [-1, self.hidden_size])
      logits = tf.matmul(flattened, self.shared_weights, transpose_b=True)
      return tf.reshape(logits, [batch_size, length, self.vocab_size])
official/legacy/transformer/ffn_layer.py
0 → 100644
View file @
9485aa1d
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Implementation of fully connected network."""
import
tensorflow
as
tf
class FeedForwardNetwork(tf.keras.layers.Layer):
  """Fully connected feedforward network: Dense+ReLU followed by Dense."""

  def __init__(self, hidden_size, filter_size, relu_dropout):
    """Initialize FeedForwardNetwork.

    Args:
      hidden_size: int, output dim of hidden layer.
      filter_size: int, filter size for the inner (first) dense layer.
      relu_dropout: float, dropout rate for training.
    """
    super(FeedForwardNetwork, self).__init__()
    self.hidden_size = hidden_size
    self.filter_size = filter_size
    self.relu_dropout = relu_dropout

  def build(self, input_shape):
    """Create the inner (filter) and outer (output) dense sub-layers."""
    self.filter_dense_layer = tf.keras.layers.Dense(
        self.filter_size,
        use_bias=True,
        activation=tf.nn.relu,
        name="filter_layer")
    self.output_dense_layer = tf.keras.layers.Dense(
        self.hidden_size, use_bias=True, name="output_layer")
    super(FeedForwardNetwork, self).build(input_shape)

  def get_config(self):
    return {
        "hidden_size": self.hidden_size,
        "filter_size": self.filter_size,
        "relu_dropout": self.relu_dropout,
    }

  def call(self, x, training):
    """Return outputs of the feedforward network.

    Args:
      x: tensor with shape [batch_size, length, hidden_size]
      training: boolean, whether in training mode or not.

    Returns:
      Output of the feedforward network.
      tensor with shape [batch_size, length, hidden_size]
    """
    hidden = self.filter_dense_layer(x)
    if training:
      # Dropout is applied only between the two dense layers, during training.
      hidden = tf.nn.dropout(hidden, rate=self.relu_dropout)
    return self.output_dense_layer(hidden)
official/legacy/transformer/metrics.py
0 → 100644
View file @
9485aa1d
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Functions for calculating loss, accuracy, and other model metrics.
Metrics:
- Padded loss, accuracy, and negative log perplexity. Source:
https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/metrics.py
- BLEU approximation. Source:
https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/bleu_hook.py
- ROUGE score. Source:
https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/rouge.py
"""
import
functools
import
tensorflow
as
tf
def _pad_tensors_to_same_length(x, y):
  """Pad x and y so that the results have the same length (second dimension)."""
  with tf.name_scope("pad_to_same_length"):
    len_x = tf.shape(x)[1]
    len_y = tf.shape(y)[1]
    target_len = tf.maximum(len_x, len_y)
    # x carries a trailing (vocab) axis, so its padding spec has three pairs;
    # y is rank-2.
    x = tf.pad(x, [[0, 0], [0, target_len - len_x], [0, 0]])
    y = tf.pad(y, [[0, 0], [0, target_len - len_y]])
    return x, y
def padded_cross_entropy_loss(logits, labels, smoothing, vocab_size):
  """Calculate cross entropy loss while ignoring padding.

  Args:
    logits: Tensor of size [batch_size, length_logits, vocab_size]
    labels: Tensor of size [batch_size, length_labels]
    smoothing: Label smoothing constant, used to determine the on and off
      values
    vocab_size: int size of the vocabulary

  Returns:
    Returns the cross entropy loss and weight tensors: float32 tensors with
    shape [batch_size, max(length_logits, length_labels)]
  """
  with tf.name_scope("loss"):
    logits, labels = _pad_tensors_to_same_length(logits, labels)

    with tf.name_scope("smoothing_cross_entropy"):
      # Spread `smoothing` probability mass uniformly over the off-target
      # classes; the target class keeps `confidence`.
      confidence = 1.0 - smoothing
      low_confidence = (1.0 - confidence) / tf.cast(vocab_size - 1, tf.float32)
      soft_targets = tf.one_hot(
          tf.cast(labels, tf.int32),
          depth=vocab_size,
          on_value=confidence,
          off_value=low_confidence)
      xentropy = tf.nn.softmax_cross_entropy_with_logits(
          logits=logits, labels=soft_targets)

      # Subtract the minimum achievable cross entropy (the entropy of the
      # smoothed target distribution) so a perfect model scores zero. The
      # 1e-20 guards log(0) when smoothing == 0.
      normalizing_constant = -(
          confidence * tf.math.log(confidence) +
          tf.cast(vocab_size - 1, tf.float32) * low_confidence *
          tf.math.log(low_confidence + 1e-20))
      xentropy -= normalizing_constant

    # Zero out positions whose label is the padding id 0.
    weights = tf.cast(tf.not_equal(labels, 0), tf.float32)
    return xentropy * weights, weights
def padded_accuracy(logits, labels):
  """Percentage of times that predictions matches labels on non-0s."""
  with tf.name_scope("padded_accuracy"):
    logits, labels = _pad_tensors_to_same_length(logits, labels)
    # Label id 0 marks padding and gets zero weight.
    weights = tf.cast(tf.not_equal(labels, 0), tf.float32)
    predictions = tf.cast(tf.argmax(logits, axis=-1), tf.int32)
    targets = tf.cast(labels, tf.int32)
    hits = tf.cast(tf.equal(predictions, targets), tf.float32)
    return hits, weights
def padded_accuracy_topk(logits, labels, k):
  """Percentage of times that top-k predictions matches labels on non-0s."""
  with tf.name_scope("padded_accuracy_topk"):
    logits, labels = _pad_tensors_to_same_length(logits, labels)
    # Label id 0 marks padding and gets zero weight.
    weights = tf.cast(tf.not_equal(labels, 0), tf.float32)
    # k may exceed the vocabulary dimension; clip it.
    effective_k = tf.minimum(k, tf.shape(logits)[-1])
    _, top_indices = tf.nn.top_k(logits, k=effective_k)
    top_indices = tf.cast(top_indices, tf.int32)
    # Broadcast labels against the k candidates so each label is compared
    # with every top-k prediction at its position.
    tiled_labels = tf.expand_dims(tf.cast(labels, tf.int32), axis=-1)
    tiled_labels += tf.zeros_like(top_indices)
    hits = tf.cast(tf.equal(top_indices, tiled_labels), tf.float32)
    # At most one candidate can match, so the sum is 0 or 1 per position.
    return tf.reduce_sum(hits, axis=-1), weights
def padded_accuracy_top5(logits, labels):
  """Padded top-5 accuracy; thin wrapper over padded_accuracy_topk."""
  return padded_accuracy_topk(logits, labels, 5)
def padded_sequence_accuracy(logits, labels):
  """Percentage of times that predictions matches labels everywhere (non-0)."""
  with tf.name_scope("padded_sequence_accuracy"):
    logits, labels = _pad_tensors_to_same_length(logits, labels)
    # Label id 0 marks padding; mistakes there do not count.
    weights = tf.cast(tf.not_equal(labels, 0), tf.float32)
    predictions = tf.cast(tf.argmax(logits, axis=-1), tf.int32)
    targets = tf.cast(labels, tf.int32)
    mistakes = tf.cast(
        tf.not_equal(predictions, targets), tf.float32) * weights
    # Collapse all non-batch axes: a sequence is correct iff it has zero
    # mistakes at every non-padding position.
    reduce_axes = list(range(1, len(predictions.get_shape())))
    correct_seq = 1.0 - tf.minimum(
        1.0, tf.reduce_sum(mistakes, axis=reduce_axes))
    return correct_seq, tf.constant(1.0)
def padded_neg_log_perplexity(logits, labels, vocab_size):
  """Average log-perplexity excluding padding 0s. No smoothing."""
  # With smoothing=0 the padded cross entropy is exactly the negative
  # log-likelihood, so negating it yields the (negative) log perplexity.
  xent, weights = padded_cross_entropy_loss(logits, labels, 0, vocab_size)
  return -xent, weights
class MetricLayer(tf.keras.layers.Layer):
  """Custom a layer of metrics for Transformer model."""

  def __init__(self, vocab_size):
    super(MetricLayer, self).__init__()
    self.vocab_size = vocab_size
    self.metric_mean_fns = []

  def build(self, input_shape):
    """Builds metric layer."""
    neg_log_perplexity = functools.partial(
        padded_neg_log_perplexity, vocab_size=self.vocab_size)
    # Pairs of (Mean aggregator, metric fn returning (values, weights)).
    self.metric_mean_fns = [
        (tf.keras.metrics.Mean("accuracy"), padded_accuracy),
        (tf.keras.metrics.Mean("accuracy_top5"), padded_accuracy_top5),
        (tf.keras.metrics.Mean("accuracy_per_sequence"),
         padded_sequence_accuracy),
        (tf.keras.metrics.Mean("neg_log_perplexity"), neg_log_perplexity),
    ]
    super(MetricLayer, self).build(input_shape)

  def get_config(self):
    return {"vocab_size": self.vocab_size}

  def call(self, inputs):
    """Record each metric on (logits, targets) and pass logits through."""
    logits, targets = inputs[0], inputs[1]
    for mean, fn in self.metric_mean_fns:
      self.add_metric(mean(*fn(logits, targets)))
    return logits
def transformer_loss(logits, labels, smoothing, vocab_size):
  """Calculates total loss containing cross entropy with padding ignored.

  Args:
    logits: Tensor of size [batch_size, length_logits, vocab_size]
    labels: Tensor of size [batch_size, length_labels]
    smoothing: Label smoothing constant, used to determine the on and off
      values
    vocab_size: int size of the vocabulary

  Returns:
    A scalar float tensor for loss.
  """
  xentropy, weights = padded_cross_entropy_loss(logits, labels, smoothing,
                                                vocab_size)
  # Weighted mean over non-padding positions.
  total_loss = tf.reduce_sum(xentropy)
  total_weight = tf.reduce_sum(weights)
  return total_loss / total_weight
official/legacy/transformer/misc.py
0 → 100644
View file @
9485aa1d
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Misc for Transformer."""
# pylint: disable=g-bad-import-order
from
absl
import
flags
import
tensorflow
as
tf
from
official.legacy.transformer
import
model_params
from
official.utils.flags
import
core
as
flags_core
from
official.utils.misc
import
keras_utils
FLAGS = flags.FLAGS

# Maps the --param_set flag value to its predefined hyperparameter set
# (see official/legacy/transformer/model_params.py).
PARAMS_MAP = {
    'tiny': model_params.TINY_PARAMS,
    'base': model_params.BASE_PARAMS,
    'big': model_params.BIG_PARAMS,
}
def get_model_params(param_set, num_gpus):
  """Gets predefined model params.

  Args:
    param_set: One of 'tiny', 'base', or 'big'.
    num_gpus: Number of GPUs; >1 selects the multi-GPU param variants.

  Returns:
    A fresh copy of the selected parameter dict.

  Raises:
    ValueError: if num_gpus > 1 and param_set has no multi-GPU variant.
  """
  if num_gpus > 1:
    # Only 'base' and 'big' define multi-GPU variants (adjusted warmup /
    # dropout); any other set is invalid with multiple GPUs.
    if param_set == 'big':
      return model_params.BIG_MULTI_GPU_PARAMS.copy()
    if param_set == 'base':
      return model_params.BASE_MULTI_GPU_PARAMS.copy()
    raise ValueError('Not valid params: param_set={} num_gpus={}'.format(
        param_set, num_gpus))
  return PARAMS_MAP[param_set].copy()
def define_transformer_flags():
  """Add flags and flag validators for running transformer_main."""
  # Add common flags (data_dir, model_dir, etc.).
  flags_core.define_base(num_gpu=True, distribution_strategy=True)
  flags_core.define_performance(
      num_parallel_calls=True,
      inter_op=False,
      intra_op=False,
      synthetic_data=True,
      max_train_steps=False,
      dtype=True,
      loss_scale=True,
      all_reduce_alg=True,
      num_packs=True,
      tf_gpu_thread_mode=True,
      datasets_num_private_threads=True,
      enable_xla=True,
      fp16_implementation=True)

  flags_core.define_benchmark()
  flags_core.define_device(tpu=True)

  # Training schedule flags.
  flags.DEFINE_integer(
      name='train_steps',
      short_name='ts',
      default=300000,
      help=flags_core.help_wrap('The number of steps used to train.'))
  flags.DEFINE_integer(
      name='steps_between_evals',
      short_name='sbe',
      default=5000,
      help=flags_core.help_wrap(
          'The Number of training steps to run between evaluations. This is '
          'used if --train_steps is defined.'))
  # Callback / instrumentation toggles.
  flags.DEFINE_boolean(
      name='enable_time_history',
      default=True,
      help='Whether to enable TimeHistory callback.')
  flags.DEFINE_boolean(
      name='enable_tensorboard',
      default=False,
      help='Whether to enable Tensorboard callback.')
  flags.DEFINE_boolean(
      name='enable_metrics_in_training',
      default=False,
      help='Whether to enable metrics during training.')
  flags.DEFINE_boolean(
      name='enable_mlir_bridge',
      default=False,
      help='Whether to enable the TF to XLA bridge.')
  # Set flags from the flags_core module as 'key flags' so they're listed when
  # the '-h' flag is used. Without this line, the flags defined above are
  # only shown in the full `--helpful` help text.
  flags.adopt_module_key_flags(flags_core)

  # Add transformer-specific flags
  flags.DEFINE_enum(
      name='param_set',
      short_name='mp',
      default='big',
      enum_values=PARAMS_MAP.keys(),
      help=flags_core.help_wrap(
          'Parameter set to use when creating and training the model. The '
          'parameters define the input shape (batch size and max length), '
          'model configuration (size of embedding, # of hidden layers, etc.), '
          'and various other settings. The big parameter set increases the '
          'default batch size, embedding/hidden size, and filter size. For a '
          'complete list of parameters, please see model/model_params.py.'))
  flags.DEFINE_bool(
      name='static_batch',
      short_name='sb',
      default=False,
      help=flags_core.help_wrap(
          'Whether the batches in the dataset should have static shapes. In '
          'general, this setting should be False. Dynamic shapes allow the '
          'inputs to be grouped so that the number of padding tokens is '
          'minimized, and helps model training. In cases where the input shape '
          'must be static (e.g. running on TPU), this setting will be ignored '
          'and static batching will always be used.'))
  flags.DEFINE_integer(
      name='max_length',
      short_name='ml',
      default=256,
      help=flags_core.help_wrap(
          'Max sentence length for Transformer. Default is 256. Note: Usually '
          'it is more effective to use a smaller max length if static_batch is '
          'enabled, e.g. 64.'))

  # Flags for training with steps (may be used for debugging)
  flags.DEFINE_integer(
      name='validation_steps',
      short_name='vs',
      default=64,
      help=flags_core.help_wrap('The number of steps used in validation.'))

  # BLEU score computation
  flags.DEFINE_string(
      name='bleu_source',
      short_name='bls',
      default=None,
      help=flags_core.help_wrap(
          'Path to source file containing text translate when calculating the '
          'official BLEU score. Both --bleu_source and --bleu_ref must be set. '
      ))
  flags.DEFINE_string(
      name='bleu_ref',
      short_name='blr',
      default=None,
      help=flags_core.help_wrap(
          'Path to source file containing text translate when calculating the '
          'official BLEU score. Both --bleu_source and --bleu_ref must be set. '
      ))
  flags.DEFINE_string(
      name='vocab_file',
      short_name='vf',
      default=None,
      help=flags_core.help_wrap(
          'Path to subtoken vocabulary file. If data_download.py was used to '
          'download and encode the training data, look in the data_dir to find '
          'the vocab file.'))
  flags.DEFINE_string(
      name='mode',
      default='train',
      help=flags_core.help_wrap('mode: train, eval, or predict'))
  flags.DEFINE_bool(
      name='use_ctl',
      default=False,
      help=flags_core.help_wrap(
          'Whether the model runs with custom training loop.'))
  # TPU autoregressive-decoding flags.
  flags.DEFINE_integer(
      name='decode_batch_size',
      default=32,
      help=flags_core.help_wrap(
          'Global batch size used for Transformer autoregressive decoding on '
          'TPU.'))
  flags.DEFINE_integer(
      name='decode_max_length',
      default=97,
      help=flags_core.help_wrap(
          'Max sequence length of the decode/eval data. This is used by '
          'Transformer autoregressive decoding on TPU to have minimum '
          'paddings.'))
  flags.DEFINE_bool(
      name='padded_decode',
      default=False,
      help=flags_core.help_wrap(
          'Whether the autoregressive decoding runs with input data padded to '
          'the decode_max_length. For TPU/XLA-GPU runs, this flag has to be '
          'set due the static shape requirement. Although CPU/GPU could also '
          'use padded_decode, it has not been tested. In addition, this method '
          'will introduce unnecessary overheads which grow quadratically with '
          'the max sequence length.'))
  # Checkpointing flags.
  flags.DEFINE_bool(
      name='enable_checkpointing',
      default=True,
      help=flags_core.help_wrap(
          'Whether to do checkpointing during training. When running under '
          'benchmark harness, we will avoid checkpointing.'))
  flags.DEFINE_bool(
      name='save_weights_only',
      default=True,
      help=flags_core.help_wrap(
          'Only used when above `enable_checkpointing` is True. '
          'If True, then only the model\'s weights will be saved '
          '(`model.save_weights(filepath)`), else the full model is saved '
          '(`model.save(filepath)`)'))

  flags_core.set_defaults(
      data_dir='/tmp/translate_ende',
      model_dir='/tmp/transformer_model',
      batch_size=None)

  # Cross-flag validators: registered for their side effect only.
  # pylint: disable=unused-variable
  @flags.multi_flags_validator(
      ['bleu_source', 'bleu_ref'],
      message='Both or neither --bleu_source and --bleu_ref must be defined.')
  def _check_bleu_files(flags_dict):
    # Valid iff the two flags are both set or both unset.
    return (flags_dict['bleu_source'] is None) == (
        flags_dict['bleu_ref'] is None)

  @flags.multi_flags_validator(
      ['bleu_source', 'bleu_ref', 'vocab_file'],
      message='--vocab_file must be defined if --bleu_source and --bleu_ref '
      'are defined.')
  def _check_bleu_vocab_file(flags_dict):
    if flags_dict['bleu_source'] and flags_dict['bleu_ref']:
      return flags_dict['vocab_file'] is not None
    return True

  # pylint: enable=unused-variable
def get_callbacks():
  """Returns common callbacks."""
  callbacks = []
  if FLAGS.enable_time_history:
    # TimeHistory records step timing; it writes to the model dir only when
    # TensorBoard logging is also enabled.
    callbacks.append(
        keras_utils.TimeHistory(
            FLAGS.batch_size,
            FLAGS.log_steps,
            logdir=FLAGS.model_dir if FLAGS.enable_tensorboard else None))
  if FLAGS.enable_tensorboard:
    callbacks.append(
        tf.keras.callbacks.TensorBoard(log_dir=FLAGS.model_dir))
  return callbacks
def update_stats(history, stats, callbacks):
  """Normalizes and updates dictionary of stats.

  Args:
    history: Results of the training step.
    stats: Dict with pre-existing training stats.
    callbacks: a list of callbacks which might include a time history callback
      used during keras.fit.
  """
  if history and history.history:
    # Record the final training loss.
    stats['loss'] = float(history.history['loss'][-1])

  if not callbacks:
    return

  # Pull timing information out of the TimeHistory callback, if present.
  for callback in callbacks:
    if not isinstance(callback, keras_utils.TimeHistory):
      continue
    timestamp_log = callback.timestamp_log
    stats['step_timestamp_log'] = timestamp_log
    stats['train_finish_time'] = callback.train_finish_time
    if len(timestamp_log) > 1:
      # Throughput over the logged interval: examples processed divided by
      # wall time between the first and last timestamps.
      elapsed = timestamp_log[-1].timestamp - timestamp_log[0].timestamp
      num_examples = callback.batch_size * callback.log_steps * (
          len(timestamp_log) - 1)
      stats['avg_exp_per_second'] = num_examples / elapsed
official/legacy/transformer/model_params.py
0 → 100644
View file @
9485aa1d
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Defines Transformer model parameters."""
import
collections
# Base hyperparameter set; unknown keys default to None via the defaultdict.
BASE_PARAMS = collections.defaultdict(
    lambda: None,  # Set default value to None.

    # Input params
    default_batch_size=2048,  # Maximum number of tokens per batch of examples.
    default_batch_size_tpu=32768,
    max_length=256,  # Maximum number of tokens per example.

    # Model params
    initializer_gain=1.0,  # Used in trainable variable initialization.
    vocab_size=33708,  # Number of tokens defined in the vocabulary file.
    hidden_size=512,  # Model dimension in the hidden layers.
    num_hidden_layers=6,  # Number of layers in the encoder and decoder stacks.
    num_heads=8,  # Number of heads to use in multi-headed attention.
    filter_size=2048,  # Inner layer dimension in the feedforward network.

    # Dropout values (only used when training)
    layer_postprocess_dropout=0.1,
    attention_dropout=0.1,
    relu_dropout=0.1,

    # Training params
    label_smoothing=0.1,
    learning_rate=2.0,
    learning_rate_decay_rate=1.0,
    learning_rate_warmup_steps=16000,

    # Optimizer params
    optimizer_adam_beta1=0.9,
    optimizer_adam_beta2=0.997,
    optimizer_adam_epsilon=1e-09,

    # Default prediction params
    extra_decode_length=50,
    beam_size=4,
    alpha=0.6,  # used to calculate length normalization in beam search

    # TPU specific parameters
    use_tpu=False,
    static_batch=False,
    allow_ffn_pad=True,
)

# "Big" model: larger embedding/hidden/filter sizes and more heads.
BIG_PARAMS = BASE_PARAMS.copy()
BIG_PARAMS.update(
    default_batch_size=4096,

    # default batch size is smaller than for BASE_PARAMS due to memory limits.
    default_batch_size_tpu=16384,
    hidden_size=1024,
    filter_size=4096,
    num_heads=16,
)

# Parameters for running the model in multi gpu. These should not change the
# params that modify the model shape (such as the hidden_size or num_heads).
BASE_MULTI_GPU_PARAMS = BASE_PARAMS.copy()
BASE_MULTI_GPU_PARAMS.update(learning_rate_warmup_steps=8000)

BIG_MULTI_GPU_PARAMS = BIG_PARAMS.copy()
BIG_MULTI_GPU_PARAMS.update(
    layer_postprocess_dropout=0.3, learning_rate_warmup_steps=8000)

# Parameters for testing the model
TINY_PARAMS = BASE_PARAMS.copy()
TINY_PARAMS.update(
    default_batch_size=1024,
    default_batch_size_tpu=1024,
    hidden_size=32,
    num_heads=4,
    filter_size=256,
)
official/legacy/transformer/model_utils.py
0 → 100644
View file @
9485aa1d
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Transformer model helper methods."""
import
math
import
numpy
as
np
import
tensorflow
as
tf
# Very low numbers to represent -infinity. We do not actually use -Inf, since we
# want to be able to multiply these values by zero to get zero. (-Inf * 0 = NaN)
_NEG_INF_FP32 = -1e9
# Most negative finite float16 value, used when the model runs in half
# precision (where -1e9 would overflow).
_NEG_INF_FP16 = np.finfo(np.float16).min
def get_position_encoding(length,
                          hidden_size,
                          min_timescale=1.0,
                          max_timescale=1.0e4):
  """Return positional encoding.

  Calculates the position encoding as a mix of sine and cosine functions with
  geometrically increasing wavelengths.
  Defined and formulized in Attention is All You Need, section 3.5.

  Args:
    length: Sequence length.
    hidden_size: Size of the encoding's channel dimension.
    min_timescale: Minimum scale that will be applied at each position
    max_timescale: Maximum scale that will be applied at each position

  Returns:
    Tensor with shape [length, hidden_size]
  """
  # Computed in float32 even for float16 models: log/exp are numerically
  # unstable in half precision.
  positions = tf.cast(tf.range(length), tf.float32)
  num_timescales = hidden_size // 2
  log_timescale_increment = (
      math.log(float(max_timescale) / float(min_timescale)) /
      (tf.cast(num_timescales, tf.float32) - 1))
  inv_timescales = min_timescale * tf.exp(
      tf.cast(tf.range(num_timescales), tf.float32) *
      -log_timescale_increment)
  # Outer product: [length, 1] * [1, num_timescales] -> [length, num_timescales]
  scaled_time = tf.expand_dims(positions, 1) * tf.expand_dims(inv_timescales, 0)
  # First half of the channels is sin, second half is cos.
  return tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)
def get_decoder_self_attention_bias(length, dtype=tf.float32):
  """Calculate bias that preserves the decoder's autoregressive property.

  Produces a mask that blocks attention to future positions, so the
  prediction at position i cannot use information from positions > i.

  Args:
    length: int length of sequences in batch.
    dtype: The dtype of the return value.

  Returns:
    float tensor of shape [1, 1, length, length]; 0 at allowed positions and
    a large negative value at masked (future) positions.
  """
  neg_inf = _NEG_INF_FP16 if dtype == tf.float16 else _NEG_INF_FP32
  with tf.name_scope("decoder_self_attention_bias"):
    # Lower-triangular matrix of ones: position i may attend to j <= i.
    lower_triangle = tf.linalg.band_part(
        tf.ones([length, length], dtype=dtype), -1, 0)
    lower_triangle = tf.reshape(lower_triangle, [1, 1, length, length])
    return neg_inf * (1.0 - lower_triangle)
def get_padding(x, padding_value=0, dtype=tf.float32):
  """Return a float tensor marking the padding positions in x.

  Args:
    x: int tensor with any shape.
    padding_value: int which represents padded values in input.
    dtype: The dtype of the return value.

  Returns:
    float tensor with the same shape as x, holding 1 at padding positions and
    0 everywhere else.
  """
  with tf.name_scope("padding"):
    is_padding = tf.equal(x, padding_value)
    return tf.cast(is_padding, dtype)
def get_padding_bias(x, padding_value=0, dtype=tf.float32):
  """Calculate an attention bias tensor from the padding values in x.

  The bias is added to the pre-softmax multi-headed attention logits (shape
  [batch_size, num_heads, length, length]): zero at non-padding locations and
  -1e9 (effectively negative infinity) at padding locations.

  Args:
    x: int tensor with shape [batch_size, length].
    padding_value: int which represents padded values in input.
    dtype: The dtype of the return value.

  Returns:
    Attention bias tensor of shape [batch_size, 1, 1, length].
  """
  with tf.name_scope("attention_bias"):
    pad_mask = get_padding(x, padding_value, dtype)
    # NOTE(review): the fp32 constant is used regardless of `dtype`; for fp16
    # inputs -1e9 is not representable — confirm callers only rely on fp32 here.
    bias = pad_mask * _NEG_INF_FP32
    # Insert two singleton axes so the bias broadcasts over heads and queries.
    return bias[:, tf.newaxis, tf.newaxis, :]
official/legacy/transformer/model_utils_test.py
0 → 100644
View file @
9485aa1d
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Test Transformer model helper methods."""
import
tensorflow
as
tf
from
official.legacy.transformer
import
model_utils
# Mirrors model_utils._NEG_INF_FP32: the large negative value the library uses
# in place of -infinity inside attention bias tensors.
NEG_INF = -1e9
class ModelUtilsTest(tf.test.TestCase):
  """Unit tests for the helper functions in model_utils."""

  def test_get_padding(self):
    # 0 is the padding value; every other entry is a real token.
    token_ids = tf.constant([[1, 0, 0, 0, 2], [3, 4, 0, 0, 0], [0, 5, 6, 0, 7]])
    pad_mask = model_utils.get_padding(token_ids, padding_value=0)
    expected_mask = [[0, 1, 1, 1, 0],
                     [0, 0, 1, 1, 1],
                     [1, 0, 0, 1, 0]]
    self.assertAllEqual(expected_mask, pad_mask)

  def test_get_padding_bias(self):
    token_ids = tf.constant([[1, 0, 0, 0, 2], [3, 4, 0, 0, 0], [0, 5, 6, 0, 7]])
    bias = model_utils.get_padding_bias(token_ids)
    bias_shape = tf.shape(bias)
    # Collapse the two broadcast axes to compare values directly.
    flattened_bias = tf.reshape(bias, [3, 5])
    expected_bias = [[0, NEG_INF, NEG_INF, NEG_INF, 0],
                     [0, 0, NEG_INF, NEG_INF, NEG_INF],
                     [NEG_INF, 0, 0, NEG_INF, 0]]
    self.assertAllEqual(expected_bias, flattened_bias)
    self.assertAllEqual([3, 1, 1, 5], bias_shape)

  def test_get_decoder_self_attention_bias(self):
    seq_length = 5
    bias = model_utils.get_decoder_self_attention_bias(seq_length)
    # Strictly-upper-triangular positions (the future) are masked out.
    expected_bias = [[[[0, NEG_INF, NEG_INF, NEG_INF, NEG_INF],
                       [0, 0, NEG_INF, NEG_INF, NEG_INF],
                       [0, 0, 0, NEG_INF, NEG_INF],
                       [0, 0, 0, 0, NEG_INF],
                       [0, 0, 0, 0, 0]]]]
    self.assertAllEqual(expected_bias, bias)
# Run all test cases in this module when executed as a script.
if __name__ == "__main__":
  tf.test.main()
official/legacy/transformer/optimizer.py
0 → 100644
View file @
9485aa1d
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Optimizer from addons and learning rate scheduler."""
import
tensorflow
as
tf
class LearningRateSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Transformer learning rate schedule: linear warmup then rsqrt decay."""

  def __init__(self, initial_learning_rate, hidden_size, warmup_steps):
    """Initialize configuration of the learning rate schedule.

    Args:
      initial_learning_rate: A float, the initial learning rate.
      hidden_size: An integer, the model dimension in the hidden layers.
      warmup_steps: An integer, the number of steps required for linear warmup.
    """
    super().__init__()
    self.initial_learning_rate = initial_learning_rate
    self.hidden_size = hidden_size
    self.warmup_steps = warmup_steps
    # Pre-cast once so __call__ avoids repeated conversions.
    self.warmup_steps_tensor = tf.cast(warmup_steps, tf.float32)

  def __call__(self, global_step):
    """Calculate learning rate with linear warmup and rsqrt decay.

    Args:
      global_step: An integer, the current global step used for learning rate
        calculation.

    Returns:
      A float, the learning rate to use for the current global step.
    """
    with tf.name_scope('learning_rate_schedule'):
      step = tf.cast(global_step, tf.float32)
      # Scale the base rate by 1/sqrt(hidden_size), per the Transformer paper.
      base_rate = self.initial_learning_rate * (self.hidden_size**-0.5)
      # Ramp linearly from 0 to 1 over the warmup period.
      warmup_factor = tf.minimum(1.0, step / self.warmup_steps_tensor)
      # After warmup, decay with the inverse square root of the step count.
      decay_denominator = tf.sqrt(tf.maximum(step, self.warmup_steps_tensor))
      return base_rate * warmup_factor / decay_denominator

  def get_config(self):
    """Get the configuration of the learning rate schedule."""
    return {
        'initial_learning_rate': self.initial_learning_rate,
        'hidden_size': self.hidden_size,
        'warmup_steps': self.warmup_steps,
    }
official/legacy/transformer/transformer.py
0 → 100644
View file @
9485aa1d
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Defines the Transformer model in TF 2.0.
Model paper: https://arxiv.org/pdf/1706.03762.pdf
Transformer model code source: https://github.com/tensorflow/tensor2tensor
"""
import
tensorflow
as
tf
from
official.legacy.transformer
import
attention_layer
from
official.legacy.transformer
import
embedding_layer
from
official.legacy.transformer
import
ffn_layer
from
official.legacy.transformer
import
metrics
from
official.legacy.transformer
import
model_utils
from
official.legacy.transformer.utils.tokenizer
import
EOS_ID
from
official.nlp.modeling.layers
import
position_embedding
from
official.nlp.modeling.ops
import
beam_search
# Disable the not-callable lint error, since it claims many objects are not
# callable when they actually are.
# pylint: disable=not-callable
def create_model(params, is_train):
  """Creates a Keras Transformer model.

  Args:
    params: hyperparameter dict defining layer sizes, dropout values, etc.
    is_train: whether to build the training graph (logits + loss) or the
      inference graph (decoded outputs + scores).

  Returns:
    A tf.keras.Model. In training mode it maps [inputs, targets] -> logits and
    carries the transformer loss; in inference mode it maps inputs ->
    [outputs, scores].
  """
  with tf.name_scope("model"):
    if is_train:
      inputs = tf.keras.layers.Input((None,), dtype="int64", name="inputs")
      targets = tf.keras.layers.Input((None,), dtype="int64", name="targets")
      transformer_body = Transformer(params, name="transformer_v2")
      logits = transformer_body([inputs, targets], training=is_train)
      vocab_size = params["vocab_size"]
      label_smoothing = params["label_smoothing"]
      if params["enable_metrics_in_training"]:
        logits = metrics.MetricLayer(vocab_size)([logits, targets])
      # Identity Lambda pins the output name and forces float32 output even
      # under mixed precision.
      logits = tf.keras.layers.Lambda(
          lambda x: x, name="logits", dtype=tf.float32)(logits)
      model = tf.keras.Model([inputs, targets], logits)
      loss = metrics.transformer_loss(logits, targets, label_smoothing,
                                      vocab_size)
      model.add_loss(loss)
      return model

    # Inference graph: no targets, the model decodes autoregressively.
    inputs = tf.keras.layers.Input((None,), dtype="int64", name="inputs")
    transformer_body = Transformer(params, name="transformer_v2")
    decode_result = transformer_body([inputs], training=is_train)
    return tf.keras.Model(
        inputs, [decode_result["outputs"], decode_result["scores"]])
class Transformer(tf.keras.Model):
  """Transformer model with Keras.

  Implemented as described in: https://arxiv.org/pdf/1706.03762.pdf

  The Transformer model consists of an encoder and decoder. The input is an int
  sequence (or a batch of sequences). The encoder produces a continuous
  representation, and the decoder uses the encoder output to generate
  probabilities for the output sequence.
  """

  def __init__(self, params, name=None):
    """Initialize layers to build Transformer model.

    Args:
      params: hyperparameter object defining layer sizes, dropout values, etc.
      name: name of the model.
    """
    super(Transformer, self).__init__(name=name)
    self.params = params
    # Shared weight matrix used both for token embeddings and for the
    # pre-softmax linear projection (weight tying).
    self.embedding_softmax_layer = embedding_layer.EmbeddingSharedWeights(
        params["vocab_size"], params["hidden_size"])
    self.encoder_stack = EncoderStack(params)
    self.decoder_stack = DecoderStack(params)
    self.position_embedding = position_embedding.RelativePositionEmbedding(
        hidden_size=self.params["hidden_size"])

  def get_config(self):
    # `params` alone is sufficient to rebuild the model.
    return {
        "params": self.params,
    }

  def call(self, inputs, training):
    """Calculate target logits or inferred target sequences.

    Args:
      inputs: input tensor list of size 1 or 2.
        First item, inputs: int tensor with shape [batch_size, input_length].
        Second item (optional), targets: None or int tensor with shape
          [batch_size, target_length].
      training: boolean, whether in training mode or not.

    Returns:
      If targets is defined, then return logits for each word in the target
      sequence. float tensor with shape [batch_size, target_length, vocab_size]
      If target is none, then generate output sequence one token at a time.
        returns a dictionary {
          outputs: int tensor with shape [batch_size, decoded_length]
          scores: float tensor with shape [batch_size]}
      Even when float16 is used, the output tensor(s) are always float32.

    Raises:
      NotImplementedError: If try to use padded decode method on CPU/GPUs.
    """
    inputs = inputs if isinstance(inputs, list) else [inputs]
    if len(inputs) == 2:
      inputs, targets = inputs[0], inputs[1]
    else:
      # Decoding path.
      inputs, targets = inputs[0], None
      if self.params["padded_decode"]:
        if not self.params["num_replicas"]:
          raise NotImplementedError(
              "Padded decoding on CPU/GPUs is not supported.")
        # Padded decode runs with a fixed per-replica batch and max length.
        decode_batch_size = int(self.params["decode_batch_size"] /
                                self.params["num_replicas"])
        inputs.set_shape([decode_batch_size, self.params["decode_max_length"]])

    # Variance scaling is used here because it seems to work in many problems.
    # Other reasonable initializers may also work just as well.
    with tf.name_scope("Transformer"):
      # Calculate attention bias for encoder self-attention and decoder
      # multi-headed attention layers.
      attention_bias = model_utils.get_padding_bias(inputs)

      # Run the inputs through the encoder layer to map the symbol
      # representations to continuous representations.
      encoder_outputs = self.encode(inputs, attention_bias, training)
      # Generate output sequence if targets is None, or return logits if target
      # sequence is known.
      if targets is None:
        return self.predict(encoder_outputs, attention_bias, training)
      else:
        logits = self.decode(targets, encoder_outputs, attention_bias, training)
        return logits

  def encode(self, inputs, attention_bias, training):
    """Generate continuous representation for inputs.

    Args:
      inputs: int tensor with shape [batch_size, input_length].
      attention_bias: float tensor with shape [batch_size, 1, 1, input_length].
      training: boolean, whether in training mode or not.

    Returns:
      float tensor with shape [batch_size, input_length, hidden_size]
    """
    with tf.name_scope("encode"):
      # Prepare inputs to the layer stack by adding positional encodings and
      # applying dropout.
      embedded_inputs = self.embedding_softmax_layer(inputs)
      embedded_inputs = tf.cast(embedded_inputs, self.params["dtype"])
      inputs_padding = model_utils.get_padding(inputs)
      attention_bias = tf.cast(attention_bias, self.params["dtype"])

      with tf.name_scope("add_pos_encoding"):
        pos_encoding = self.position_embedding(inputs=embedded_inputs)
        pos_encoding = tf.cast(pos_encoding, self.params["dtype"])
        encoder_inputs = embedded_inputs + pos_encoding

      if training:
        encoder_inputs = tf.nn.dropout(
            encoder_inputs, rate=self.params["layer_postprocess_dropout"])

      return self.encoder_stack(
          encoder_inputs, attention_bias, inputs_padding, training=training)

  def decode(self, targets, encoder_outputs, attention_bias, training):
    """Generate logits for each value in the target sequence.

    Args:
      targets: target values for the output sequence. int tensor with shape
        [batch_size, target_length]
      encoder_outputs: continuous representation of input sequence. float
        tensor with shape [batch_size, input_length, hidden_size]
      attention_bias: float tensor with shape [batch_size, 1, 1, input_length]
      training: boolean, whether in training mode or not.

    Returns:
      float32 tensor with shape [batch_size, target_length, vocab_size]
    """
    with tf.name_scope("decode"):
      # Prepare inputs to decoder layers by shifting targets, adding positional
      # encoding and applying dropout.
      with tf.name_scope("shift_targets"):
        # Shift targets to the right, and remove the last element
        targets = tf.pad(targets, [[0, 0], [1, 0]])[:, :-1]
      decoder_inputs = self.embedding_softmax_layer(targets)
      decoder_inputs = tf.cast(decoder_inputs, self.params["dtype"])
      attention_bias = tf.cast(attention_bias, self.params["dtype"])
      with tf.name_scope("add_pos_encoding"):
        length = tf.shape(decoder_inputs)[1]
        pos_encoding = self.position_embedding(decoder_inputs)
        pos_encoding = tf.cast(pos_encoding, self.params["dtype"])
        decoder_inputs += pos_encoding
      if training:
        decoder_inputs = tf.nn.dropout(
            decoder_inputs, rate=self.params["layer_postprocess_dropout"])

      # Run values
      decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
          length, dtype=self.params["dtype"])
      outputs = self.decoder_stack(
          decoder_inputs,
          encoder_outputs,
          decoder_self_attention_bias,
          attention_bias,
          training=training)
      # Project back to the vocabulary with the shared embedding weights.
      logits = self.embedding_softmax_layer(outputs, mode="linear")
      logits = tf.cast(logits, tf.float32)
      return logits

  def _get_symbols_to_logits_fn(self, max_decode_length, training):
    """Returns a decoding function that calculates logits of the next tokens."""
    # Precompute positional signal and self-attention mask for the whole
    # decode range; the closure indexes into them per step.
    timing_signal = self.position_embedding(
        inputs=None, length=max_decode_length + 1)
    timing_signal = tf.cast(timing_signal, self.params["dtype"])
    decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
        max_decode_length, dtype=self.params["dtype"])

    def symbols_to_logits_fn(ids, i, cache):
      """Generate logits for next potential IDs.

      Args:
        ids: Current decoded sequences. int tensor with shape [batch_size *
          beam_size, i + 1].
        i: Loop index.
        cache: dictionary of values storing the encoder output, encoder-decoder
          attention bias, and previous decoder attention values.

      Returns:
        Tuple of
          (logits with shape [batch_size * beam_size, vocab_size],
           updated cache values)
      """
      # Set decoder input to the last generated IDs
      decoder_input = ids[:, -1:]

      # Preprocess decoder input by getting embeddings and adding timing signal.
      decoder_input = self.embedding_softmax_layer(decoder_input)
      decoder_input += timing_signal[i]
      if self.params["padded_decode"]:
        # tf.slice keeps shapes static, as required on TPU.
        bias_shape = decoder_self_attention_bias.shape.as_list()
        self_attention_bias = tf.slice(
            decoder_self_attention_bias, [0, 0, i, 0],
            [bias_shape[0], bias_shape[1], 1, bias_shape[3]])
      else:
        self_attention_bias = decoder_self_attention_bias[:, :, i:i + 1, :i + 1]
      decoder_outputs = self.decoder_stack(
          decoder_input,
          cache.get("encoder_outputs"),
          self_attention_bias,
          cache.get("encoder_decoder_attention_bias"),
          training=training,
          cache=cache,
          decode_loop_step=i if self.params["padded_decode"] else None)
      logits = self.embedding_softmax_layer(decoder_outputs, mode="linear")
      # Drop the length-1 time axis: one token is decoded per step.
      logits = tf.squeeze(logits, axis=[1])
      return logits, cache

    return symbols_to_logits_fn

  def predict(self, encoder_outputs, encoder_decoder_attention_bias, training):
    """Return predicted sequence via beam search over the decoder."""
    encoder_outputs = tf.cast(encoder_outputs, self.params["dtype"])
    if self.params["padded_decode"]:
      # Static shapes are required for padded (TPU) decoding.
      batch_size = encoder_outputs.shape.as_list()[0]
      input_length = encoder_outputs.shape.as_list()[1]
    else:
      batch_size = tf.shape(encoder_outputs)[0]
      input_length = tf.shape(encoder_outputs)[1]
    max_decode_length = input_length + self.params["extra_decode_length"]
    encoder_decoder_attention_bias = tf.cast(encoder_decoder_attention_bias,
                                             self.params["dtype"])

    symbols_to_logits_fn = self._get_symbols_to_logits_fn(
        max_decode_length, training)

    # Create initial set of IDs that will be passed into symbols_to_logits_fn.
    initial_ids = tf.zeros([batch_size], dtype=tf.int32)

    # Create cache storing decoder attention values for each layer.
    # pylint: disable=g-complex-comprehension
    init_decode_length = (
        max_decode_length if self.params["padded_decode"] else 0)
    num_heads = self.params["num_heads"]
    dim_per_head = self.params["hidden_size"] // num_heads
    cache = {
        "layer_%d" % layer: {
            "k":
                tf.zeros(
                    [batch_size, init_decode_length, num_heads, dim_per_head],
                    dtype=self.params["dtype"]),
            "v":
                tf.zeros(
                    [batch_size, init_decode_length, num_heads, dim_per_head],
                    dtype=self.params["dtype"])
        } for layer in range(self.params["num_hidden_layers"])
    }
    # pylint: enable=g-complex-comprehension

    # Add encoder output and attention bias to the cache.
    cache["encoder_outputs"] = encoder_outputs
    cache["encoder_decoder_attention_bias"] = encoder_decoder_attention_bias

    # Use beam search to find the top beam_size sequences and scores.
    decoded_ids, scores = beam_search.sequence_beam_search(
        symbols_to_logits_fn=symbols_to_logits_fn,
        initial_ids=initial_ids,
        initial_cache=cache,
        vocab_size=self.params["vocab_size"],
        beam_size=self.params["beam_size"],
        alpha=self.params["alpha"],
        max_decode_length=max_decode_length,
        eos_id=EOS_ID,
        padded_decode=self.params["padded_decode"],
        dtype=self.params["dtype"])

    # Get the top sequence for each batch element; drop the initial pad token.
    top_decoded_ids = decoded_ids[:, 0, 1:]
    top_scores = scores[:, 0]

    return {"outputs": top_decoded_ids, "scores": top_scores}
class PrePostProcessingWrapper(tf.keras.layers.Layer):
  """Wraps a sublayer with pre-layer-norm and post-dropout + residual add."""

  def __init__(self, layer, params):
    super().__init__()
    self.layer = layer
    self.params = params
    self.postprocess_dropout = params["layer_postprocess_dropout"]

  def build(self, input_shape):
    # Normalization runs in float32 for numerical stability, even when the
    # wrapped layer computes in a lower precision.
    self.layer_norm = tf.keras.layers.LayerNormalization(
        epsilon=1e-6, dtype="float32")
    super().build(input_shape)

  def get_config(self):
    return {
        "params": self.params,
    }

  def call(self, x, *args, **kwargs):
    """Calls wrapped layer with same parameters."""
    training = kwargs["training"]
    # Pre-processing: layer normalization before the sublayer.
    normalized = self.layer_norm(x)
    sublayer_output = self.layer(normalized, *args, **kwargs)
    # Post-processing: dropout (train only) followed by the residual add.
    if training:
      sublayer_output = tf.nn.dropout(
          sublayer_output, rate=self.postprocess_dropout)
    return x + sublayer_output
class EncoderStack(tf.keras.layers.Layer):
  """Transformer encoder stack.

  The encoder stack is made up of N identical layers. Each layer is composed
  of the sublayers:
    1. Self-attention layer
    2. Feedforward network (which is 2 fully-connected layers)
  """

  def __init__(self, params):
    super().__init__()
    self.params = params
    self.layers = []

  def build(self, input_shape):
    """Builds the encoder stack."""
    params = self.params
    for _ in range(params["num_hidden_layers"]):
      # Each encoder layer = wrapped self-attention + wrapped feed-forward.
      attention_sublayer = attention_layer.SelfAttention(
          params["hidden_size"], params["num_heads"],
          params["attention_dropout"])
      ffn_sublayer = ffn_layer.FeedForwardNetwork(
          params["hidden_size"], params["filter_size"], params["relu_dropout"])
      self.layers.append([
          PrePostProcessingWrapper(attention_sublayer, params),
          PrePostProcessingWrapper(ffn_sublayer, params)
      ])

    # Final layer normalization applied to the stack output.
    self.output_normalization = tf.keras.layers.LayerNormalization(
        epsilon=1e-6, dtype="float32")
    super().build(input_shape)

  def get_config(self):
    return {
        "params": self.params,
    }

  def call(self, encoder_inputs, attention_bias, inputs_padding, training):
    """Return the output of the encoder layer stacks.

    Args:
      encoder_inputs: tensor with shape [batch_size, input_length, hidden_size]
      attention_bias: bias for the encoder self-attention layer. [batch_size,
        1, 1, input_length]
      inputs_padding: tensor with shape [batch_size, input_length], inputs with
        zero paddings.
      training: boolean, whether in training mode or not.

    Returns:
      Output of encoder layer stack.
      float32 tensor with shape [batch_size, input_length, hidden_size]
    """
    for layer_idx, sublayers in enumerate(self.layers):
      attention_wrapper, ffn_wrapper = sublayers
      with tf.name_scope("layer_%d" % layer_idx):
        with tf.name_scope("self_attention"):
          encoder_inputs = attention_wrapper(
              encoder_inputs, attention_bias, training=training)
        with tf.name_scope("ffn"):
          encoder_inputs = ffn_wrapper(encoder_inputs, training=training)

    return self.output_normalization(encoder_inputs)
class DecoderStack(tf.keras.layers.Layer):
  """Transformer decoder stack.

  Like the encoder stack, the decoder stack is made up of N identical layers.
  Each layer is composed of the sublayers:
    1. Self-attention layer
    2. Multi-headed attention layer combining encoder outputs with results from
       the previous self-attention layer.
    3. Feedforward network (2 fully-connected layers)
  """

  def __init__(self, params):
    super().__init__()
    self.params = params
    self.layers = []

  def build(self, input_shape):
    """Builds the decoder stack."""
    params = self.params
    for _ in range(params["num_hidden_layers"]):
      # Each decoder layer = self-attention, encoder-decoder attention, FFN —
      # each wrapped with layer norm, dropout, and a residual connection.
      self_attention_sublayer = attention_layer.SelfAttention(
          params["hidden_size"], params["num_heads"],
          params["attention_dropout"])
      cross_attention_sublayer = attention_layer.Attention(
          params["hidden_size"], params["num_heads"],
          params["attention_dropout"])
      ffn_sublayer = ffn_layer.FeedForwardNetwork(
          params["hidden_size"], params["filter_size"], params["relu_dropout"])
      self.layers.append([
          PrePostProcessingWrapper(self_attention_sublayer, params),
          PrePostProcessingWrapper(cross_attention_sublayer, params),
          PrePostProcessingWrapper(ffn_sublayer, params)
      ])
    self.output_normalization = tf.keras.layers.LayerNormalization(
        epsilon=1e-6, dtype="float32")
    super().build(input_shape)

  def get_config(self):
    return {
        "params": self.params,
    }

  def call(self,
           decoder_inputs,
           encoder_outputs,
           decoder_self_attention_bias,
           attention_bias,
           training,
           cache=None,
           decode_loop_step=None):
    """Return the output of the decoder layer stacks.

    Args:
      decoder_inputs: A tensor with shape [batch_size, target_length,
        hidden_size].
      encoder_outputs: A tensor with shape [batch_size, input_length,
        hidden_size]
      decoder_self_attention_bias: A tensor with shape [1, 1, target_len,
        target_length], the bias for decoder self-attention layer.
      attention_bias: A tensor with shape [batch_size, 1, 1, input_length], the
        bias for encoder-decoder attention layer.
      training: A bool, whether in training mode or not.
      cache: (Used for fast decoding) A nested dictionary storing previous
        decoder self-attention values. The items are:
          {layer_n: {"k": A tensor with shape [batch_size, i, key_channels],
                     "v": A tensor with shape [batch_size, i, value_channels]},
             ...}
      decode_loop_step: An integer, the step number of the decoding loop. Used
        only for autoregressive inference on TPU.

    Returns:
      Output of decoder layer stack.
      float32 tensor with shape [batch_size, target_length, hidden_size]
    """
    for layer_idx, sublayers in enumerate(self.layers):
      self_attention_wrapper, cross_attention_wrapper, ffn_wrapper = sublayers
      layer_name = "layer_%d" % layer_idx
      layer_cache = None if cache is None else cache[layer_name]
      with tf.name_scope(layer_name):
        with tf.name_scope("self_attention"):
          decoder_inputs = self_attention_wrapper(
              decoder_inputs,
              decoder_self_attention_bias,
              training=training,
              cache=layer_cache,
              decode_loop_step=decode_loop_step)
        with tf.name_scope("encdec_attention"):
          decoder_inputs = cross_attention_wrapper(
              decoder_inputs,
              encoder_outputs,
              attention_bias,
              training=training)
        with tf.name_scope("ffn"):
          decoder_inputs = ffn_wrapper(decoder_inputs, training=training)

    return self.output_normalization(decoder_inputs)
official/legacy/transformer/transformer_forward_test.py
0 → 100644
View file @
9485aa1d
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Forward pass test for Transformer model refactoring."""
import
numpy
as
np
import
tensorflow
as
tf
from
official.legacy.transformer
import
metrics
from
official.legacy.transformer
import
model_params
from
official.legacy.transformer
import
transformer
from
official.nlp.modeling
import
models
def _count_params(layer, trainable_only=True):
  """Returns the count of all model parameters, or just trainable ones."""
  if not trainable_only:
    return layer.count_params()
  # Sum the element counts of each trainable weight tensor.
  return int(
      sum(tf.keras.backend.count_params(w) for w in layer.trainable_weights))
def _create_model(params, is_train):
  """Creates transformer model.

  Builds the refactored Seq2SeqTransformer from official.nlp.modeling using
  the same hyperparameters as the legacy model, so the two can be compared
  weight-for-weight in the forward-pass tests.

  Args:
    params: hyperparameter dict (layer sizes, dropout rates, decoding config).
    is_train: whether to build the training graph (logits + loss) or the
      inference graph (decoded outputs + scores).

  Returns:
    A tf.keras.Model mirroring the interface of transformer.create_model.
  """
  # Shared keyword arguments for both the encoder and the decoder stacks.
  encdec_kwargs = dict(
      num_layers=params["num_hidden_layers"],
      num_attention_heads=params["num_heads"],
      intermediate_size=params["filter_size"],
      activation="relu",
      dropout_rate=params["relu_dropout"],
      attention_dropout_rate=params["attention_dropout"],
      use_bias=False,
      norm_first=True,
      norm_epsilon=1e-6,
      intermediate_dropout=params["relu_dropout"])
  encoder_layer = models.TransformerEncoder(**encdec_kwargs)
  decoder_layer = models.TransformerDecoder(**encdec_kwargs)

  model_kwargs = dict(
      vocab_size=params["vocab_size"],
      embedding_width=params["hidden_size"],
      dropout_rate=params["layer_postprocess_dropout"],
      padded_decode=params["padded_decode"],
      decode_max_length=params["decode_max_length"],
      dtype=params["dtype"],
      extra_decode_length=params["extra_decode_length"],
      beam_size=params["beam_size"],
      alpha=params["alpha"],
      encoder_layer=encoder_layer,
      decoder_layer=decoder_layer,
      name="transformer_v2")

  if is_train:
    inputs = tf.keras.layers.Input((None,), dtype="int64", name="inputs")
    targets = tf.keras.layers.Input((None,), dtype="int64", name="targets")
    internal_model = models.Seq2SeqTransformer(**model_kwargs)
    logits = internal_model(
        dict(inputs=inputs, targets=targets), training=is_train)
    vocab_size = params["vocab_size"]
    label_smoothing = params["label_smoothing"]
    if params["enable_metrics_in_training"]:
      logits = metrics.MetricLayer(vocab_size)([logits, targets])
    # Identity Lambda pins the output name and forces float32 output.
    logits = tf.keras.layers.Lambda(
        lambda x: x, name="logits", dtype=tf.float32)(logits)
    model = tf.keras.Model([inputs, targets], logits)
    loss = metrics.transformer_loss(logits, targets, label_smoothing,
                                    vocab_size)
    model.add_loss(loss)
    return model

  # Inference path: padded decode requires a static batch size.
  batch_size = params["decode_batch_size"] if params["padded_decode"] else None
  inputs = tf.keras.layers.Input((None,),
                                 batch_size=batch_size,
                                 dtype="int64",
                                 name="inputs")
  internal_model = models.Seq2SeqTransformer(**model_kwargs)
  ret = internal_model(dict(inputs=inputs), training=is_train)
  outputs, scores = ret["outputs"], ret["scores"]
  return tf.keras.Model(inputs, [outputs, scores])
class TransformerForwardTest(tf.test.TestCase):
  """Checks that the legacy and refactored Transformers agree numerically."""

  def setUp(self):
    super(TransformerForwardTest, self).setUp()
    # Shrink the model and zero all dropout so forward passes are cheap and
    # deterministic, making exact output comparison possible.
    self.params = params = model_params.TINY_PARAMS
    params["batch_size"] = params["default_batch_size"] = 16
    params["hidden_size"] = 12
    params["num_hidden_layers"] = 3
    params["filter_size"] = 14
    params["num_heads"] = 2
    params["vocab_size"] = 41
    params["extra_decode_length"] = 0
    params["beam_size"] = 3
    params["dtype"] = tf.float32
    params["layer_postprocess_dropout"] = 0.0
    params["attention_dropout"] = 0.0
    params["relu_dropout"] = 0.0

  def test_forward_pass_train(self):
    # Set input_len different from target_len
    inputs = np.asarray([[5, 2, 1], [7, 5, 0], [1, 4, 0], [7, 5, 11]])
    targets = np.asarray([[4, 3, 4, 0], [13, 19, 17, 8], [20, 14, 1, 2],
                          [5, 7, 3, 0]])

    # src_model is the original model before refactored.
    src_model = transformer.create_model(self.params, True)
    src_num_weights = _count_params(src_model)
    src_weights = src_model.get_weights()
    src_model_output = src_model([inputs, targets], training=True)

    # dest_model is the refactored model.
    dest_model = _create_model(self.params, True)
    dest_num_weights = _count_params(dest_model)
    self.assertEqual(src_num_weights, dest_num_weights)
    # Copy weights across so both models compute with identical parameters.
    dest_model.set_weights(src_weights)
    dest_model_output = dest_model([inputs, targets], training=True)
    self.assertAllEqual(src_model_output, dest_model_output)

  def test_forward_pass_not_train(self):
    inputs = np.asarray([[5, 2, 1], [7, 5, 0], [1, 4, 0], [7, 5, 11]])

    # src_model is the original model before refactored.
    src_model = transformer.create_model(self.params, False)
    src_num_weights = _count_params(src_model)
    src_weights = src_model.get_weights()
    src_model_output = src_model([inputs], training=False)

    # dest_model is the refactored model.
    dest_model = _create_model(self.params, False)
    dest_num_weights = _count_params(dest_model)
    self.assertEqual(src_num_weights, dest_num_weights)
    dest_model.set_weights(src_weights)
    dest_model_output = dest_model([inputs], training=False)
    # Compare decoded token IDs and beam scores separately.
    self.assertAllEqual(src_model_output[0], dest_model_output[0])
    self.assertAllEqual(src_model_output[1], dest_model_output[1])
# Run all test cases in this module when executed as a script.
if __name__ == "__main__":
  tf.test.main()
official/legacy/transformer/transformer_layers_test.py
0 → 100644
View file @
9485aa1d
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for layers in Transformer."""
import
tensorflow
as
tf
from
official.legacy.transformer
import
attention_layer
from
official.legacy.transformer
import
embedding_layer
from
official.legacy.transformer
import
ffn_layer
from
official.legacy.transformer
import
metrics
class TransformerLayersTest(tf.test.TestCase):
  """Config round-trip and output-shape tests for Transformer layers."""

  def test_attention_layer(self):
    """SelfAttention: get_config round-trip, output shape, and cache growth."""
    hidden_size = 64
    num_heads = 4
    dropout = 0.5
    dim_per_head = hidden_size // num_heads
    layer = attention_layer.SelfAttention(hidden_size, num_heads, dropout)
    # The layer must report exactly its constructor arguments.
    self.assertDictEqual(layer.get_config(), {
        "hidden_size": hidden_size,
        "num_heads": num_heads,
        "attention_dropout": dropout,
    })
    length = 2
    x = tf.ones([1, length, hidden_size])
    bias = tf.ones([1])
    # Decode cache starts empty (zero-length time dimension).
    cache = {
        "k": tf.zeros([1, 0, num_heads, dim_per_head]),
        "v": tf.zeros([1, 0, num_heads, dim_per_head]),
    }
    y = layer(x, bias, training=True, cache=cache)
    self.assertEqual(y.shape, (1, length, 64,))
    # After the call the cache entries are expected to cover `length`
    # positions per head.
    self.assertEqual(cache["k"].shape, (1, length, num_heads, dim_per_head,))
    self.assertEqual(cache["v"].shape, (1, length, num_heads, dim_per_head,))

  def test_embedding_shared_weights(self):
    """EmbeddingSharedWeights: embedding lookup and "linear" projection modes."""
    vocab_size = 50
    hidden_size = 64
    length = 2
    layer = embedding_layer.EmbeddingSharedWeights(vocab_size, hidden_size)
    self.assertDictEqual(layer.get_config(), {
        "vocab_size": 50,
        "hidden_size": 64,
    })
    # Default mode: int ids -> embedding vectors.
    idx = tf.ones([1, length], dtype="int32")
    y = layer(idx)
    self.assertEqual(y.shape, (1, length, hidden_size,))
    # "linear" mode: hidden vectors -> vocab-sized logits.
    x = tf.ones([1, length, hidden_size])
    output = layer(x, "linear")
    self.assertEqual(output.shape, (1, length, vocab_size,))

  def test_feed_forward_network(self):
    """FeedForwardNetwork: get_config round-trip and shape preservation."""
    hidden_size = 64
    filter_size = 32
    relu_dropout = 0.5
    layer = ffn_layer.FeedForwardNetwork(hidden_size, filter_size, relu_dropout)
    self.assertDictEqual(layer.get_config(), {
        "hidden_size": hidden_size,
        "filter_size": filter_size,
        "relu_dropout": relu_dropout,
    })
    length = 2
    x = tf.ones([1, length, hidden_size])
    y = layer(x, training=True)
    # The FFN must map back to the input's hidden size.
    self.assertEqual(y.shape, (1, length, hidden_size,))

  def test_metric_layer(self):
    """MetricLayer passes logits through unchanged in shape."""
    vocab_size = 50
    logits = tf.keras.layers.Input((None, vocab_size),
                                   dtype="float32",
                                   name="logits")
    targets = tf.keras.layers.Input((None,), dtype="int64", name="targets")
    output_logits = metrics.MetricLayer(vocab_size)([logits, targets])
    self.assertEqual(output_logits.shape.as_list(), [
        None,
        None,
        vocab_size,
    ])
if __name__ == "__main__":
  # Delegate to the TensorFlow test runner when executed as a script.
  tf.test.main()
official/legacy/transformer/transformer_main.py
0 → 100644
View file @
9485aa1d
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Train and evaluate the Transformer model.
See README for description of setting the training schedule and evaluating the
BLEU score.
"""
import
os
import
tempfile
# Import libraries
from
absl
import
app
from
absl
import
flags
from
absl
import
logging
import
tensorflow
as
tf
from
official.common
import
distribute_utils
from
official.legacy.transformer
import
compute_bleu
from
official.legacy.transformer
import
data_pipeline
from
official.legacy.transformer
import
metrics
from
official.legacy.transformer
import
misc
from
official.legacy.transformer
import
optimizer
from
official.legacy.transformer
import
transformer
from
official.legacy.transformer
import
translate
from
official.legacy.transformer.utils
import
tokenizer
from
official.modeling
import
performance
from
official.utils.flags
import
core
as
flags_core
from
official.utils.misc
import
keras_utils
# pylint:disable=logging-format-interpolation
# A large integer used as an "infinity" sentinel value.
INF = int(1e9)
# Directory name for BLEU artifacts; not referenced in the visible portion of
# this module — presumably consumed elsewhere (verify against callers).
BLEU_DIR = "bleu"
# Number of batches taken from the eval dataset in TransformerTask.predict().
_SINGLE_SAMPLE = 1
def translate_and_compute_bleu(model,
                               params,
                               subtokenizer,
                               bleu_source,
                               bleu_ref,
                               distribution_strategy=None):
  """Translate file and report the cased and uncased bleu scores.

  Args:
    model: A Keras model, used to generate the translations.
    params: A dictionary, containing the translation related parameters.
    subtokenizer: A subtokenizer object, used for encoding and decoding source
      and translated lines.
    bleu_source: A file containing source sentences for translation.
    bleu_ref: A file containing the reference for the translated sentences.
    distribution_strategy: A platform distribution strategy, used for TPU based
      translation.

  Returns:
    uncased_score: A float, the case insensitive BLEU score.
    cased_score: A float, the case sensitive BLEU score.
  """
  # Create temporary file to store translation.
  tmp = tempfile.NamedTemporaryFile(delete=False)
  tmp_filename = tmp.name
  # Only the path is needed — translate_file opens the file itself — so close
  # the handle immediately instead of leaking it (also required on Windows,
  # where an open NamedTemporaryFile cannot be reopened by name).
  tmp.close()

  try:
    translate.translate_file(
        model,
        params,
        subtokenizer,
        bleu_source,
        output_file=tmp_filename,
        print_all_translations=False,
        distribution_strategy=distribution_strategy)

    # Compute uncased and cased bleu scores.
    uncased_score = compute_bleu.bleu_wrapper(bleu_ref, tmp_filename, False)
    cased_score = compute_bleu.bleu_wrapper(bleu_ref, tmp_filename, True)
  finally:
    # Remove the temporary file even when translation or scoring raises,
    # instead of leaving it behind (the original only removed it on success).
    os.remove(tmp_filename)
  return uncased_score, cased_score
def evaluate_and_log_bleu(model,
                          params,
                          bleu_source,
                          bleu_ref,
                          vocab_file,
                          distribution_strategy=None):
  """Calculate and record the BLEU score.

  Args:
    model: A Keras model, used to generate the translations.
    params: A dictionary, containing the translation related parameters.
    bleu_source: A file containing source sentences for translation.
    bleu_ref: A file containing the reference for the translated sentences.
    vocab_file: A file containing the vocabulary for translation.
    distribution_strategy: A platform distribution strategy, used for TPU based
      translation.

  Returns:
    uncased_score: A float, the case insensitive BLEU score.
    cased_score: A float, the case sensitive BLEU score.
  """
  # Build the subtokenizer from the vocabulary, then delegate the actual
  # translation + scoring to translate_and_compute_bleu.
  subtokenizer = tokenizer.Subtokenizer(vocab_file)

  scores = translate_and_compute_bleu(model, params, subtokenizer, bleu_source,
                                      bleu_ref, distribution_strategy)
  uncased, cased = scores

  logging.info("Bleu score (uncased): %s", uncased)
  logging.info("Bleu score (cased): %s", cased)
  return uncased, cased
class TransformerTask(object):
  """Main entry of Transformer model.

  Wraps model construction, training (both Keras `fit` and a custom training
  loop for TPU), BLEU evaluation, and prediction, all configured from the
  parsed absl flags object.
  """

  def __init__(self, flags_obj):
    """Init function of TransformerMain.

    Args:
      flags_obj: Object containing parsed flag values, i.e., FLAGS.

    Raises:
      ValueError: if not using static batch for input data on TPU.
    """
    self.flags_obj = flags_obj
    # Lazily created by eval(); reused across repeated eval() calls.
    self.predict_model = None

    # Add flag-defined parameters to params object
    num_gpus = flags_core.get_num_gpus(flags_obj)
    self.params = params = misc.get_model_params(flags_obj.param_set, num_gpus)

    params["num_gpus"] = num_gpus
    params["use_ctl"] = flags_obj.use_ctl
    params["data_dir"] = flags_obj.data_dir
    params["model_dir"] = flags_obj.model_dir
    params["static_batch"] = flags_obj.static_batch
    params["max_length"] = flags_obj.max_length
    params["decode_batch_size"] = flags_obj.decode_batch_size
    params["decode_max_length"] = flags_obj.decode_max_length
    params["padded_decode"] = flags_obj.padded_decode
    params["max_io_parallelism"] = (
        flags_obj.num_parallel_calls or tf.data.experimental.AUTOTUNE)

    params["use_synthetic_data"] = flags_obj.use_synthetic_data
    # Fall back to the parameter set's default when --batch_size is unset.
    params["batch_size"] = flags_obj.batch_size or params["default_batch_size"]
    params["repeat_dataset"] = None
    params["dtype"] = flags_core.get_tf_dtype(flags_obj)
    params["enable_tensorboard"] = flags_obj.enable_tensorboard
    params["enable_metrics_in_training"] = flags_obj.enable_metrics_in_training
    params["steps_between_evals"] = flags_obj.steps_between_evals
    params["enable_checkpointing"] = flags_obj.enable_checkpointing
    params["save_weights_only"] = flags_obj.save_weights_only

    self.distribution_strategy = distribute_utils.get_distribution_strategy(
        distribution_strategy=flags_obj.distribution_strategy,
        num_gpus=num_gpus,
        all_reduce_alg=flags_obj.all_reduce_alg,
        num_packs=flags_obj.num_packs,
        tpu_address=flags_obj.tpu or "")
    if self.use_tpu:
      params["num_replicas"] = self.distribution_strategy.num_replicas_in_sync
    else:
      logging.info("Running transformer with num_gpus = %d", num_gpus)

    if self.distribution_strategy:
      logging.info("For training, using distribution strategy: %s",
                   self.distribution_strategy)
    else:
      logging.info("Not using any distribution strategy.")

    performance.set_mixed_precision_policy(params["dtype"])

  @property
  def use_tpu(self):
    # True only when the configured strategy is a TPUStrategy.
    if self.distribution_strategy:
      return isinstance(self.distribution_strategy, tf.distribute.TPUStrategy)
    return False

  def train(self):
    """Trains the model."""
    params = self.params
    flags_obj = self.flags_obj
    # Sets config options.
    keras_utils.set_session_config(enable_xla=flags_obj.enable_xla)

    _ensure_dir(flags_obj.model_dir)
    with distribute_utils.get_strategy_scope(self.distribution_strategy):
      model = transformer.create_model(params, is_train=True)
      opt = self._create_optimizer()

      current_step = 0
      checkpoint = tf.train.Checkpoint(model=model, optimizer=opt)
      latest_checkpoint = tf.train.latest_checkpoint(flags_obj.model_dir)
      if latest_checkpoint:
        checkpoint.restore(latest_checkpoint)
        logging.info("Loaded checkpoint %s", latest_checkpoint)
        # Resume step counting from the restored optimizer state.
        current_step = opt.iterations.numpy()

      if params["use_ctl"]:
        train_loss_metric = tf.keras.metrics.Mean(
            "training_loss", dtype=tf.float32)
        if params["enable_tensorboard"]:
          summary_writer = tf.summary.create_file_writer(
              os.path.join(flags_obj.model_dir, "summary"))
        else:
          summary_writer = tf.summary.create_noop_writer()
        train_metrics = [train_loss_metric]
        if params["enable_metrics_in_training"]:
          train_metrics = train_metrics + model.metrics
      else:
        model.compile(opt)

    model.summary()

    if self.use_tpu:
      # Different from experimental_distribute_dataset,
      # distribute_datasets_from_function requires
      # per-replica/local batch size.
      params["batch_size"] /= self.distribution_strategy.num_replicas_in_sync
      train_ds = (
          self.distribution_strategy.distribute_datasets_from_function(
              lambda ctx: data_pipeline.train_input_fn(params, ctx)))
    else:
      train_ds = data_pipeline.train_input_fn(params)
      map_data_fn = data_pipeline.map_data_for_transformer_fn
      train_ds = train_ds.map(
          map_data_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    if params["use_ctl"]:
      train_ds_iterator = iter(train_ds)

    callbacks = self._create_callbacks(flags_obj.model_dir, params)

    # Only TimeHistory callback is supported for CTL
    if params["use_ctl"]:
      callbacks = [
          cb for cb in callbacks if isinstance(cb, keras_utils.TimeHistory)
      ]

    @tf.function
    def train_steps(iterator, steps):
      """Training steps function for TPU runs.

      Args:
        iterator: The input iterator of the training dataset.
        steps: An integer, the number of training steps.
      """

      def _step_fn(inputs):
        """Per-replica step function."""
        inputs, targets = inputs
        with tf.GradientTape() as tape:
          logits = model([inputs, targets], training=True)
          loss = metrics.transformer_loss(logits, targets,
                                          params["label_smoothing"],
                                          params["vocab_size"])
          # Scales the loss, which results in using the average loss across all
          # of the replicas for backprop.
          scaled_loss = loss / self.distribution_strategy.num_replicas_in_sync

        # De-dupes variables due to keras tracking issues.
        tvars = list({id(v): v for v in model.trainable_variables}.values())
        grads = tape.gradient(scaled_loss, tvars)
        opt.apply_gradients(zip(grads, tvars))
        # For reporting, the metric takes the mean of losses.
        train_loss_metric.update_state(loss)

      for _ in tf.range(steps):
        train_loss_metric.reset_states()
        self.distribution_strategy.run(_step_fn, args=(next(iterator),))

    cased_score, uncased_score = None, None
    cased_score_history, uncased_score_history = [], []
    # Train in chunks of steps_between_evals steps, evaluating BLEU between
    # chunks when bleu files are supplied.
    while current_step < flags_obj.train_steps:
      remaining_steps = flags_obj.train_steps - current_step
      train_steps_per_eval = (
          remaining_steps if remaining_steps < flags_obj.steps_between_evals
          else flags_obj.steps_between_evals)
      current_iteration = current_step // flags_obj.steps_between_evals

      logging.info(
          "Start train iteration at global step:{}".format(current_step))
      history = None
      if params["use_ctl"]:
        if not self.use_tpu:
          raise NotImplementedError(
              "Custom training loop on GPUs is not implemented.")

        # Runs training steps.
        with summary_writer.as_default():
          for cb in callbacks:
            cb.on_epoch_begin(current_iteration)
            cb.on_batch_begin(0)

          train_steps(
              train_ds_iterator,
              tf.convert_to_tensor(train_steps_per_eval, dtype=tf.int32))
          current_step += train_steps_per_eval
          train_loss = train_loss_metric.result().numpy().astype(float)
          logging.info("Train Step: %d/%d / loss = %s", current_step,
                       flags_obj.train_steps, train_loss)

          for cb in callbacks:
            cb.on_batch_end(train_steps_per_eval - 1)
            cb.on_epoch_end(current_iteration)

          if params["enable_tensorboard"]:
            for metric_obj in train_metrics:
              tf.summary.scalar(metric_obj.name, metric_obj.result(),
                                current_step)
              summary_writer.flush()

        for cb in callbacks:
          cb.on_train_end()

        if flags_obj.enable_checkpointing:
          # avoid check-pointing when running for benchmarking.
          checkpoint_name = checkpoint.save(
              os.path.join(flags_obj.model_dir,
                           "ctl_step_{}.ckpt".format(current_step)))
          logging.info("Saved checkpoint to %s", checkpoint_name)
      else:
        if self.use_tpu:
          raise NotImplementedError(
              "Keras model.fit on TPUs is not implemented.")
        history = model.fit(
            train_ds,
            initial_epoch=current_iteration,
            epochs=current_iteration + 1,
            steps_per_epoch=train_steps_per_eval,
            callbacks=callbacks,
            # If TimeHistory is enabled, progress bar would be messy. Increase
            # the verbose level to get rid of it.
            verbose=(2 if flags_obj.enable_time_history else 1))
        current_step += train_steps_per_eval
        logging.info("Train history: {}".format(history.history))

      logging.info("End train iteration at global step:{}".format(current_step))

      if (flags_obj.bleu_source and flags_obj.bleu_ref):
        uncased_score, cased_score = self.eval()
        cased_score_history.append([current_iteration + 1, cased_score])
        uncased_score_history.append([current_iteration + 1, uncased_score])

    # For the CTL path (no Keras History object) report the last train loss.
    stats = ({
        "loss": train_loss
    } if history is None else {})
    misc.update_stats(history, stats, callbacks)
    if uncased_score and cased_score:
      stats["bleu_uncased"] = uncased_score
      stats["bleu_cased"] = cased_score
      stats["bleu_uncased_history"] = uncased_score_history
      stats["bleu_cased_history"] = cased_score_history
    return stats

  def eval(self):
    """Evaluates the model."""
    distribution_strategy = self.distribution_strategy if self.use_tpu else None

    # We only want to create the model under DS scope for TPU case.
    # When 'distribution_strategy' is None, a no-op DummyContextManager will
    # be used.
    with distribute_utils.get_strategy_scope(distribution_strategy):
      if not self.predict_model:
        self.predict_model = transformer.create_model(self.params, False)
      self._load_weights_if_possible(
          self.predict_model,
          tf.train.latest_checkpoint(self.flags_obj.model_dir))
      self.predict_model.summary()
    return evaluate_and_log_bleu(self.predict_model, self.params,
                                 self.flags_obj.bleu_source,
                                 self.flags_obj.bleu_ref,
                                 self.flags_obj.vocab_file,
                                 distribution_strategy)

  def predict(self):
    """Predicts result from the model."""
    params = self.params
    flags_obj = self.flags_obj

    with tf.name_scope("model"):
      model = transformer.create_model(params, is_train=False)
      self._load_weights_if_possible(
          model, tf.train.latest_checkpoint(self.flags_obj.model_dir))
      model.summary()
    subtokenizer = tokenizer.Subtokenizer(flags_obj.vocab_file)

    ds = data_pipeline.eval_input_fn(params)
    # Keep only the inputs and a single batch for a quick sample translation.
    ds = ds.map(lambda x, y: x).take(_SINGLE_SAMPLE)
    ret = model.predict(ds)
    val_outputs, _ = ret
    length = len(val_outputs)
    for i in range(length):
      translate.translate_from_input(val_outputs[i], subtokenizer)

  def _create_callbacks(self, cur_log_dir, params):
    """Creates a list of callbacks."""
    callbacks = misc.get_callbacks()
    if params["enable_checkpointing"]:
      ckpt_full_path = os.path.join(cur_log_dir, "cp-{epoch:04d}.ckpt")
      callbacks.append(
          tf.keras.callbacks.ModelCheckpoint(
              ckpt_full_path, save_weights_only=params["save_weights_only"]))
    return callbacks

  def _load_weights_if_possible(self, model, init_weight_path=None):
    """Loads model weights when it is provided."""
    if init_weight_path:
      logging.info("Load weights: {}".format(init_weight_path))
      if self.use_tpu:
        # On TPU, restore through a Checkpoint object rather than
        # model.load_weights.
        checkpoint = tf.train.Checkpoint(
            model=model, optimizer=self._create_optimizer())
        checkpoint.restore(init_weight_path)
      else:
        model.load_weights(init_weight_path)
    else:
      logging.info("Weights not loaded from path:{}".format(init_weight_path))

  def _create_optimizer(self):
    """Creates optimizer."""
    params = self.params
    lr_schedule = optimizer.LearningRateSchedule(
        params["learning_rate"], params["hidden_size"],
        params["learning_rate_warmup_steps"])
    opt = tf.keras.optimizers.Adam(
        lr_schedule,
        params["optimizer_adam_beta1"],
        params["optimizer_adam_beta2"],
        epsilon=params["optimizer_adam_epsilon"])

    # Wraps the optimizer with loss scaling when training in float16.
    opt = performance.configure_optimizer(
        opt,
        use_float16=params["dtype"] == tf.float16,
        loss_scale=flags_core.get_loss_scale(
            self.flags_obj, default_for_fp16="dynamic"))

    return opt
def _ensure_dir(log_dir):
  """Creates `log_dir` (including parents) when it does not already exist."""
  gfile = tf.io.gfile
  if gfile.exists(log_dir):
    return
  gfile.makedirs(log_dir)
def main(_):
  """App entry point: builds a TransformerTask and dispatches on --mode."""
  flags_obj = flags.FLAGS
  if flags_obj.enable_mlir_bridge:
    tf.config.experimental.enable_mlir_bridge()
  task = TransformerTask(flags_obj)

  # Execute flag override logic for better model performance
  if flags_obj.tf_gpu_thread_mode:
    keras_utils.set_gpu_thread_mode_and_count(
        per_gpu_thread_count=flags_obj.per_gpu_thread_count,
        gpu_thread_mode=flags_obj.tf_gpu_thread_mode,
        num_gpus=flags_obj.num_gpus,
        datasets_num_private_threads=flags_obj.datasets_num_private_threads)

  # Dispatch table replaces the if/elif chain; unknown modes are rejected
  # with the same ValueError as before.
  handlers = {
      "train": task.train,
      "predict": task.predict,
      "eval": task.eval,
  }
  handler = handlers.get(flags_obj.mode)
  if handler is None:
    raise ValueError("Invalid mode {}".format(flags_obj.mode))
  handler()
if __name__ == "__main__":
  # Register the transformer flags before absl parses argv, then hand
  # control to absl's app runner.
  logging.set_verbosity(logging.INFO)
  misc.define_transformer_flags()
  app.run(main)
official/legacy/transformer/transformer_main_test.py
0 → 100644
View file @
9485aa1d
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Test Transformer model."""
import
os
import
re
import
sys
import
unittest
from
absl
import
flags
from
absl.testing
import
flagsaver
import
tensorflow
as
tf
from
tensorflow.python.eager
import
context
# pylint: disable=ungrouped-imports
from
official.legacy.transformer
import
misc
from
official.legacy.transformer
import
transformer_main
# Alias for the absl flag values object used throughout these tests.
FLAGS = flags.FLAGS

# Fixed model_dir subdirectory name used in setUp, keeping test paths stable.
FIXED_TIMESTAMP = 'my_time_stamp'

# Matches Keras weight checkpoint filenames such as "weights-epoch-01.hdf5";
# not referenced in the visible portion of this module.
WEIGHT_PATTERN = re.compile(r'weights-epoch-.+\.hdf5')
def
_generate_file
(
filepath
,
lines
):
with
open
(
filepath
,
'w'
)
as
f
:
for
l
in
lines
:
f
.
write
(
'{}
\n
'
.
format
(
l
))
class TransformerTaskTest(tf.test.TestCase):
  """End-to-end smoke tests for TransformerTask train/eval/predict."""

  # Snapshot of parsed flag values, captured once on first setUp and restored
  # before each subsequent test so per-test flag mutations do not leak.
  local_flags = None

  def setUp(self):  # pylint: disable=g-missing-super-call
    temp_dir = self.get_temp_dir()
    if TransformerTaskTest.local_flags is None:
      misc.define_transformer_flags()
      # Loads flags, array cannot be blank.
      flags.FLAGS(['foo'])
      TransformerTaskTest.local_flags = flagsaver.save_flag_values()
    else:
      flagsaver.restore_flag_values(TransformerTaskTest.local_flags)
    # Minimal, fast configuration: tiny model, synthetic data, single step.
    FLAGS.model_dir = os.path.join(temp_dir, FIXED_TIMESTAMP)
    FLAGS.param_set = 'tiny'
    FLAGS.use_synthetic_data = True
    FLAGS.steps_between_evals = 1
    FLAGS.train_steps = 1
    FLAGS.validation_steps = 1
    FLAGS.batch_size = 4
    FLAGS.max_length = 1
    FLAGS.num_gpus = 1
    FLAGS.distribution_strategy = 'off'
    FLAGS.dtype = 'fp32'
    self.model_dir = FLAGS.model_dir
    self.temp_dir = temp_dir
    self.vocab_file = os.path.join(temp_dir, 'vocab')
    self.vocab_size = misc.get_model_params(FLAGS.param_set, 0)['vocab_size']
    self.bleu_source = os.path.join(temp_dir, 'bleu_source')
    self.bleu_ref = os.path.join(temp_dir, 'bleu_ref')
    # Remember the global mixed-precision policy so tearDown can restore it
    # after fp16 tests change it.
    self.orig_policy = (
        tf.compat.v2.keras.mixed_precision.global_policy())

  def tearDown(self):  # pylint: disable=g-missing-super-call
    tf.compat.v2.keras.mixed_precision.set_global_policy(self.orig_policy)

  def _assert_exists(self, filepath):
    # Helper: assert that a path exists on the local filesystem.
    self.assertTrue(os.path.exists(filepath))

  def test_train_no_dist_strat(self):
    if context.num_gpus() >= 2:
      self.skipTest('No need to test 2+ GPUs without a distribution strategy.')
    t = transformer_main.TransformerTask(FLAGS)
    t.train()

  def test_train_save_full_model(self):
    if context.num_gpus() >= 2:
      self.skipTest('No need to test 2+ GPUs without a distribution strategy.')
    # Exercise full-model checkpointing rather than weights-only.
    FLAGS.save_weights_only = False
    t = transformer_main.TransformerTask(FLAGS)
    t.train()

  def test_train_static_batch(self):
    if context.num_gpus() >= 2:
      self.skipTest('No need to test 2+ GPUs without a distribution strategy.')
    FLAGS.distribution_strategy = 'one_device'
    if tf.test.is_built_with_cuda():
      FLAGS.num_gpus = 1
    else:
      FLAGS.num_gpus = 0
    FLAGS.static_batch = True
    t = transformer_main.TransformerTask(FLAGS)
    t.train()

  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
  def test_train_1_gpu_with_dist_strat(self):
    FLAGS.distribution_strategy = 'one_device'
    t = transformer_main.TransformerTask(FLAGS)
    t.train()

  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
  def test_train_fp16(self):
    FLAGS.distribution_strategy = 'one_device'
    FLAGS.dtype = 'fp16'
    t = transformer_main.TransformerTask(FLAGS)
    t.train()

  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
  def test_train_2_gpu(self):
    if context.num_gpus() < 2:
      self.skipTest(
          '{} GPUs are not available for this test. {} GPUs are available'
          .format(2, context.num_gpus()))
    FLAGS.distribution_strategy = 'mirrored'
    FLAGS.num_gpus = 2
    FLAGS.param_set = 'base'
    t = transformer_main.TransformerTask(FLAGS)
    t.train()

  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
  def test_train_2_gpu_fp16(self):
    if context.num_gpus() < 2:
      self.skipTest(
          '{} GPUs are not available for this test. {} GPUs are available'
          .format(2, context.num_gpus()))
    FLAGS.distribution_strategy = 'mirrored'
    FLAGS.num_gpus = 2
    FLAGS.param_set = 'base'
    FLAGS.dtype = 'fp16'
    t = transformer_main.TransformerTask(FLAGS)
    t.train()

  def _prepare_files_and_flags(self, *extra_flags):
    """Writes fake vocab/BLEU fixture files and re-parses flags to use them."""
    # Make log dir.
    if not os.path.exists(self.temp_dir):
      os.makedirs(self.temp_dir)

    # Fake vocab, bleu_source and bleu_ref.
    tokens = [
        "'<pad>'", "'<EOS>'", "'_'", "'a'", "'b'", "'c'", "'d'", "'a_'",
        "'b_'", "'c_'", "'d_'"
    ]
    # Pad the vocabulary out to the model's expected vocab size.
    tokens += ["'{}'".format(i) for i in range(self.vocab_size - len(tokens))]
    _generate_file(self.vocab_file, tokens)
    _generate_file(self.bleu_source, ['a b', 'c d'])
    _generate_file(self.bleu_ref, ['a b', 'd c'])

    # Update flags.
    update_flags = [
        'ignored_program_name',
        '--vocab_file={}'.format(self.vocab_file),
        '--bleu_source={}'.format(self.bleu_source),
        '--bleu_ref={}'.format(self.bleu_ref),
    ]
    if extra_flags:
      update_flags.extend(extra_flags)
    FLAGS(update_flags)

  def test_predict(self):
    if context.num_gpus() >= 2:
      self.skipTest('No need to test 2+ GPUs without a distribution strategy.')
    self._prepare_files_and_flags()
    t = transformer_main.TransformerTask(FLAGS)
    t.predict()

  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
  def test_predict_fp16(self):
    if context.num_gpus() >= 2:
      self.skipTest('No need to test 2+ GPUs without a distribution strategy.')
    self._prepare_files_and_flags('--dtype=fp16')
    t = transformer_main.TransformerTask(FLAGS)
    t.predict()

  def test_eval(self):
    if context.num_gpus() >= 2:
      self.skipTest('No need to test 2+ GPUs without a distribution strategy.')
    if 'test_xla' in sys.argv[0]:
      self.skipTest('TODO(xla): Make this test faster under XLA.')
    self._prepare_files_and_flags()
    t = transformer_main.TransformerTask(FLAGS)
    t.eval()
if __name__ == '__main__':
  # Delegate to the TensorFlow test runner when executed as a script.
  tf.test.main()
official/legacy/transformer/transformer_test.py
0 → 100644
View file @
9485aa1d
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Test Transformer model."""
import
tensorflow
as
tf
from
official.legacy.transformer
import
model_params
from
official.legacy.transformer
import
transformer
class TransformerV2Test(tf.test.TestCase):
  """Graph-construction and SavedModel-export tests for the Transformer."""

  def setUp(self):
    super().setUp()
    # Start from the TINY parameter set and shrink it further so tests build
    # and run quickly.
    self.params = params = model_params.TINY_PARAMS
    params["batch_size"] = params["default_batch_size"] = 16
    params["use_synthetic_data"] = True
    params["hidden_size"] = 12
    params["num_hidden_layers"] = 2
    params["filter_size"] = 14
    params["num_heads"] = 2
    params["vocab_size"] = 41
    params["extra_decode_length"] = 2
    params["beam_size"] = 3
    params["dtype"] = tf.float32

  def test_create_model_train(self):
    model = transformer.create_model(self.params, True)
    inputs, outputs = model.inputs, model.outputs
    # Training graph: two int64 inputs (presumably sources and targets) and
    # one float32 output of per-position vocab logits.
    self.assertEqual(len(inputs), 2)
    self.assertEqual(len(outputs), 1)
    self.assertEqual(inputs[0].shape.as_list(), [None, None])
    self.assertEqual(inputs[0].dtype, tf.int64)
    self.assertEqual(inputs[1].shape.as_list(), [None, None])
    self.assertEqual(inputs[1].dtype, tf.int64)
    self.assertEqual(outputs[0].shape.as_list(), [None, None, 41])
    self.assertEqual(outputs[0].dtype, tf.float32)

  def test_create_model_not_train(self):
    model = transformer.create_model(self.params, False)
    inputs, outputs = model.inputs, model.outputs
    # Inference graph: a single int64 input and two outputs — an int32
    # sequence tensor and a per-example float32 tensor (presumably decoded
    # ids and their scores).
    self.assertEqual(len(inputs), 1)
    self.assertEqual(len(outputs), 2)
    self.assertEqual(inputs[0].shape.as_list(), [None, None])
    self.assertEqual(inputs[0].dtype, tf.int64)
    self.assertEqual(outputs[0].shape.as_list(), [None, None])
    self.assertEqual(outputs[0].dtype, tf.int32)
    self.assertEqual(outputs[1].shape.as_list(), [None])
    self.assertEqual(outputs[1].dtype, tf.float32)

  def test_export(self):
    """Round-trips the model through SavedModel and checks output shape."""
    model = transformer.Transformer(self.params, name="transformer_v2")
    export_dir = self.get_temp_dir()
    batch_size = 5
    max_length = 6

    class SaveModule(tf.Module):

      def __init__(self, model):
        super(SaveModule, self).__init__()
        self.model = model

      @tf.function
      def serve(self, x):
        return self.model.call([x], training=False)

    save_module = SaveModule(model)
    tensor_shape = (None, None)
    sample_input = tf.zeros((batch_size, max_length), dtype=tf.int64)
    # Trace once before exporting the concrete serving signature.
    _ = save_module.serve(sample_input)
    signatures = dict(
        serving_default=save_module.serve.get_concrete_function(
            tf.TensorSpec(shape=tensor_shape, dtype=tf.int64, name="x")))
    tf.saved_model.save(save_module, export_dir, signatures=signatures)

    imported = tf.saved_model.load(export_dir)
    serving_fn = imported.signatures["serving_default"]
    all_outputs = serving_fn(sample_input)
    output = all_outputs["outputs"]
    output_shapes = output.shape.as_list()
    self.assertEqual(output_shapes[0], batch_size)
    # Decoded length is the input length plus the extra decode budget.
    self.assertEqual(output_shapes[1],
                     max_length + model.params["extra_decode_length"])
if __name__ == "__main__":
  # Delegate to the TensorFlow test runner when executed as a script.
  tf.test.main()
official/legacy/transformer/translate.py
0 → 100644
View file @
9485aa1d
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Translate text or files using trained transformer model."""
# Import libraries
from
absl
import
logging
import
numpy
as
np
import
tensorflow
as
tf
from
official.legacy.transformer.utils
import
tokenizer
# Default decoding hyperparameters. None of these are referenced in the
# visible portion of this module; consumers are presumably elsewhere in the
# file — verify before changing.
_EXTRA_DECODE_LENGTH = 100
_BEAM_SIZE = 4
_ALPHA = 0.6
def _get_sorted_inputs(filename):
  """Read and sort lines from the file sorted by decreasing length.

  Args:
    filename: String name of file to read inputs from.

  Returns:
    Sorted list of inputs, and dictionary mapping original index->sorted index
    of each element.
  """
  with tf.io.gfile.GFile(filename) as infile:
    lines = [raw.strip() for raw in infile.read().split("\n")]
  # Drop the trailing empty entry produced by a final newline.
  if not lines[-1]:
    lines.pop()

  # Stable sort of the original indices by whitespace-token count, longest
  # first; ties keep their original relative order.
  order = sorted(
      range(len(lines)),
      key=lambda pos: len(lines[pos].split()),
      reverse=True)

  sorted_inputs = [lines[pos] for pos in order]
  sorted_keys = [0] * len(order)
  for new_pos, old_pos in enumerate(order):
    sorted_keys[old_pos] = new_pos
  return sorted_inputs, sorted_keys
def _encode_and_add_eos(line, subtokenizer):
  """Returns `line` encoded as subtoken ids, terminated with the EOS id."""
  encoded = subtokenizer.encode(line)
  # Concatenation (not append) so the encoder's returned list is not mutated.
  return encoded + [tokenizer.EOS_ID]
def _trim_and_decode(ids, subtokenizer):
  """Trim EOS and PAD tokens from ids, and decode to return a string.

  Args:
    ids: An iterable of integer token ids.
    subtokenizer: A subtokenizer object providing a `decode` method.

  Returns:
    The decoded string, truncated at the first EOS token when one is present.
  """
  try:
    index = list(ids).index(tokenizer.EOS_ID)
  except ValueError:
    # No EOS found in sequence; decode everything.
    return subtokenizer.decode(ids)
  # The try block above is kept minimal: previously subtokenizer.decode was
  # also inside it, so a ValueError raised by decode() was silently swallowed
  # and the full (untrimmed) sequence was decoded instead.
  return subtokenizer.decode(ids[:index])
def translate_file(model,
                   params,
                   subtokenizer,
                   input_file,
                   output_file=None,
                   print_all_translations=True,
                   distribution_strategy=None):
  """Translate lines in file, and save to output file if specified.

  Inputs are sorted by length before batching (see `_get_sorted_inputs`),
  and the translations are written back in the original line order using
  the index mapping returned alongside the sorted inputs.

  Args:
    model: A Keras model, used to generate the translations.
    params: A dictionary, containing the translation related parameters.
    subtokenizer: A subtokenizer object, used for encoding and decoding source
      and translated lines.
    input_file: A file containing lines to translate.
    output_file: A file that stores the generated translations.
    print_all_translations: A bool. If true, all translations are printed to
      stdout.
    distribution_strategy: A distribution strategy, used to perform inference
      directly with tf.function instead of Keras model.predict().

  Raises:
    ValueError: if output file is invalid.
  """
  batch_size = params["decode_batch_size"]

  # Read and sort inputs by length. Keep dictionary (original index-->new index
  # in sorted list) to write translations in the original order.
  sorted_inputs, sorted_keys = _get_sorted_inputs(input_file)
  total_samples = len(sorted_inputs)
  # Ceiling division: one extra (possibly partial) batch for the remainder.
  num_decode_batches = (total_samples - 1) // batch_size + 1

  def input_generator():
    """Yield encoded strings from sorted_inputs."""
    for i in range(num_decode_batches):
      lines = [
          sorted_inputs[j + i * batch_size]
          for j in range(batch_size)
          if j + i * batch_size < total_samples
      ]
      lines = [_encode_and_add_eos(l, subtokenizer) for l in lines]
      if distribution_strategy:
        # Under a distribution strategy every batch must be full so it can be
        # split evenly across replicas; pad the last batch with EOS-only lines.
        for j in range(batch_size - len(lines)):
          lines.append([tokenizer.EOS_ID])
      batch = tf.keras.preprocessing.sequence.pad_sequences(
          lines,
          maxlen=params["decode_max_length"],
          dtype="int32",
          padding="post")
      logging.info("Decoding batch %d out of %d.", i, num_decode_batches)
      yield batch

  @tf.function
  def predict_step(inputs):
    """Decoding step function for TPU runs."""

    def _step_fn(inputs):
      """Per replica step function."""
      tag = inputs[0]
      val_inputs = inputs[1]
      val_outputs, _ = model([val_inputs], training=False)
      # The tag (replica id) is threaded through so outputs can be reordered
      # to match the input order after gathering per-replica results.
      return tag, val_outputs

    return distribution_strategy.run(_step_fn, args=(inputs,))

  translations = []
  if distribution_strategy:
    num_replicas = distribution_strategy.num_replicas_in_sync
    local_batch_size = params["decode_batch_size"] // num_replicas
  for i, text in enumerate(input_generator()):
    if distribution_strategy:
      # Split the global batch into one slab per replica.
      text = np.reshape(text, [num_replicas, local_batch_size, -1])
      # Add tag to the input of each replica with the reordering logic after
      # outputs, to ensure the output order matches the input order.
      text = tf.constant(text)

      @tf.function
      def text_as_per_replica():
        """Return (replica_id, this replica's slice of the batch)."""
        replica_context = tf.distribute.get_replica_context()
        replica_id = replica_context.replica_id_in_sync_group
        return replica_id, text[replica_id]

      # pylint: disable=cell-var-from-loop
      text = distribution_strategy.run(text_as_per_replica)
      outputs = distribution_strategy.experimental_local_results(
          predict_step(text))
      # Drop the tags and flatten per-replica outputs back into one batch.
      val_outputs = [output for _, output in outputs]
      val_outputs = np.reshape(val_outputs, [params["decode_batch_size"], -1])
    else:
      val_outputs, _ = model.predict(text)

    length = len(val_outputs)
    for j in range(length):
      # Skip padding rows appended to fill the final batch.
      if j + i * batch_size < total_samples:
        translation = _trim_and_decode(val_outputs[j], subtokenizer)
        translations.append(translation)
        if print_all_translations:
          logging.info("Translating:\n\tInput: %s\n\tOutput: %s",
                       sorted_inputs[j + i * batch_size], translation)

  # Write translations in the order they appeared in the original file.
  if output_file is not None:
    if tf.io.gfile.isdir(output_file):
      raise ValueError("File output is a directory, will not save outputs to "
                       "file.")
    logging.info("Writing to file %s", output_file)
    with tf.io.gfile.GFile(output_file, "w") as f:
      for i in sorted_keys:
        f.write("%s\n" % translations[i])
def translate_from_text(model, subtokenizer, txt):
  """Encode a single line, run the model on it, and log the translation."""
  token_ids = _encode_and_add_eos(txt, subtokenizer)
  prediction = model.predict(token_ids)
  decoded_ids = prediction["outputs"]
  logging.info("Original: \"%s\"", txt)
  translate_from_input(decoded_ids, subtokenizer)
def translate_from_input(outputs, subtokenizer):
  """Decode model output ids and log the resulting translation string."""
  decoded_text = _trim_and_decode(outputs, subtokenizer)
  logging.info("Translation: \"%s\"", decoded_text)
Prev
1
…
6
7
8
9
10
11
12
13
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment