Internal change

PiperOrigin-RevId: 417673004

Internal change
PiperOrigin-RevId: 417673004
90dd6310 · Frederick Liu · A. Unique TensorFlower · ddaca60a · 90dd6310 · 90dd6310
Commit 90dd6310 authored Dec 21, 2021 by Frederick Liu Committed by A. Unique TensorFlower Dec 21, 2021
7 changed files
--- a/official/legacy/transformer/transformer_main_test.py
+++ b/official/legacy/transformer/transformer_main_test.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Test Transformer model."""
+
+import os
+import re
+import sys
+import unittest
+
+from absl import flags
+from absl.testing import flagsaver
+import tensorflow as tf
+from tensorflow.python.eager import context  # pylint: disable=ungrouped-imports
+from official.legacy.transformer import misc
+from official.legacy.transformer import transformer_main
+
+# Handle to the process-wide absl flag registry.
+FLAGS = flags.FLAGS
+# Leaf directory name used for FLAGS.model_dir under the test temp dir.
+FIXED_TIMESTAMP = 'my_time_stamp'
+# Matches names of the form "weights-epoch-<anything>.hdf5". Not referenced in
+# the visible part of this file.
+WEIGHT_PATTERN = re.compile(r'weights-epoch-.+\.hdf5')
+
+
+def _generate_file(filepath, lines):
+  """Writes every item of `lines` to `filepath`, one item per line."""
+  with open(filepath, 'w') as out:
+    out.writelines('{}\n'.format(entry) for entry in lines)
+
+
+class TransformerTaskTest(tf.test.TestCase):
+  """Runs TransformerTask train/predict/eval end to end with tiny settings.
+
+  Every test starts from the same flag snapshot (see setUp) and then overrides
+  only the flags it needs.
+  """
+  # Flag snapshot captured the first time setUp runs. Later setUp calls restore
+  # it, so flag changes made by one test do not carry over to the next.
+  local_flags = None
+
+  def setUp(self):  # pylint: disable=g-missing-super-call
+    temp_dir = self.get_temp_dir()
+    if TransformerTaskTest.local_flags is None:
+      # First setUp in this process: define and parse the flags once, then
+      # snapshot their values.
+      misc.define_transformer_flags()
+      # Loads flags, array cannot be blank.
+      flags.FLAGS(['foo'])
+      TransformerTaskTest.local_flags = flagsaver.save_flag_values()
+    else:
+      flagsaver.restore_flag_values(TransformerTaskTest.local_flags)
+    # Baseline configuration shared by all tests: tiny model, synthetic data,
+    # a single train/eval step and no distribution strategy.
+    FLAGS.model_dir = os.path.join(temp_dir, FIXED_TIMESTAMP)
+    FLAGS.param_set = 'tiny'
+    FLAGS.use_synthetic_data = True
+    FLAGS.steps_between_evals = 1
+    FLAGS.train_steps = 1
+    FLAGS.validation_steps = 1
+    FLAGS.batch_size = 4
+    FLAGS.max_length = 1
+    FLAGS.num_gpus = 1
+    FLAGS.distribution_strategy = 'off'
+    FLAGS.dtype = 'fp32'
+    self.model_dir = FLAGS.model_dir
+    self.temp_dir = temp_dir
+    self.vocab_file = os.path.join(temp_dir, 'vocab')
+    self.vocab_size = misc.get_model_params(FLAGS.param_set, 0)['vocab_size']
+    self.bleu_source = os.path.join(temp_dir, 'bleu_source')
+    self.bleu_ref = os.path.join(temp_dir, 'bleu_ref')
+    # Remember the global mixed-precision policy so tearDown can restore it.
+    self.orig_policy = (
+        tf.compat.v2.keras.mixed_precision.global_policy())
+
+  def tearDown(self):  # pylint: disable=g-missing-super-call
+    # Undo any policy change made while the test ran (e.g. by the fp16 tests).
+    tf.compat.v2.keras.mixed_precision.set_global_policy(self.orig_policy)
+
+  def _assert_exists(self, filepath):
+    # Helper assertion; not called by any test in this class.
+    self.assertTrue(os.path.exists(filepath))
+
+  def test_train_no_dist_strat(self):
+    # Trains with the setUp defaults (distribution_strategy='off').
+    if context.num_gpus() >= 2:
+      self.skipTest('No need to test 2+ GPUs without a distribution strategy.')
+    t = transformer_main.TransformerTask(FLAGS)
+    t.train()
+
+  def test_train_save_full_model(self):
+    # Same as the default training run, but with save_weights_only disabled.
+    if context.num_gpus() >= 2:
+      self.skipTest('No need to test 2+ GPUs without a distribution strategy.')
+    FLAGS.save_weights_only = False
+    t = transformer_main.TransformerTask(FLAGS)
+    t.train()
+
+  def test_train_static_batch(self):
+    # Trains with static_batch under the 'one_device' strategy, using one GPU
+    # on CUDA builds and none otherwise.
+    if context.num_gpus() >= 2:
+      self.skipTest('No need to test 2+ GPUs without a distribution strategy.')
+    FLAGS.distribution_strategy = 'one_device'
+    if tf.test.is_built_with_cuda():
+      FLAGS.num_gpus = 1
+    else:
+      FLAGS.num_gpus = 0
+    FLAGS.static_batch = True
+    t = transformer_main.TransformerTask(FLAGS)
+    t.train()
+
+  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
+  def test_train_1_gpu_with_dist_strat(self):
+    # num_gpus stays at the setUp value of 1.
+    FLAGS.distribution_strategy = 'one_device'
+    t = transformer_main.TransformerTask(FLAGS)
+    t.train()
+
+  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
+  def test_train_fp16(self):
+    FLAGS.distribution_strategy = 'one_device'
+    FLAGS.dtype = 'fp16'
+    t = transformer_main.TransformerTask(FLAGS)
+    t.train()
+
+  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
+  def test_train_2_gpu(self):
+    # Mirrored training of the 'base' parameter set on two GPUs; skipped at
+    # run time when fewer than two GPUs are visible.
+    if context.num_gpus() < 2:
+      self.skipTest(
+          '{} GPUs are not available for this test. {} GPUs are available'
+          .format(2, context.num_gpus()))
+    FLAGS.distribution_strategy = 'mirrored'
+    FLAGS.num_gpus = 2
+    FLAGS.param_set = 'base'
+    t = transformer_main.TransformerTask(FLAGS)
+    t.train()
+
+  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
+  def test_train_2_gpu_fp16(self):
+    # Same as test_train_2_gpu, with dtype switched to fp16.
+    if context.num_gpus() < 2:
+      self.skipTest(
+          '{} GPUs are not available for this test. {} GPUs are available'
+          .format(2, context.num_gpus()))
+    FLAGS.distribution_strategy = 'mirrored'
+    FLAGS.num_gpus = 2
+    FLAGS.param_set = 'base'
+    FLAGS.dtype = 'fp16'
+    t = transformer_main.TransformerTask(FLAGS)
+    t.train()
+
+  def _prepare_files_and_flags(self, *extra_flags):
+    # Writes a fake vocab file plus BLEU source/reference files into the temp
+    # dir, then re-parses FLAGS so they point at those files. Any
+    # `extra_flags` ('--name=value' strings) are parsed in the same call.
+    # Make log dir.
+    if not os.path.exists(self.temp_dir):
+      os.makedirs(self.temp_dir)
+
+    # Fake vocab, bleu_source and bleu_ref.
+    tokens = [
+        "'<pad>'", "'<EOS>'", "'_'", "'a'", "'b'", "'c'", "'d'", "'a_'", "'b_'",
+        "'c_'", "'d_'"
+    ]
+    # Fill the rest of the vocabulary with numbered placeholder tokens so the
+    # file has exactly self.vocab_size entries.
+    tokens += ["'{}'".format(i) for i in range(self.vocab_size - len(tokens))]
+    _generate_file(self.vocab_file, tokens)
+    _generate_file(self.bleu_source, ['a b', 'c d'])
+    _generate_file(self.bleu_ref, ['a b', 'd c'])
+
+    # Update flags.
+    update_flags = [
+        'ignored_program_name',
+        '--vocab_file={}'.format(self.vocab_file),
+        '--bleu_source={}'.format(self.bleu_source),
+        '--bleu_ref={}'.format(self.bleu_ref),
+    ]
+    if extra_flags:
+      update_flags.extend(extra_flags)
+    FLAGS(update_flags)
+
+  def test_predict(self):
+    if context.num_gpus() >= 2:
+      self.skipTest('No need to test 2+ GPUs without a distribution strategy.')
+    self._prepare_files_and_flags()
+    t = transformer_main.TransformerTask(FLAGS)
+    t.predict()
+
+  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
+  def test_predict_fp16(self):
+    if context.num_gpus() >= 2:
+      self.skipTest('No need to test 2+ GPUs without a distribution strategy.')
+    self._prepare_files_and_flags('--dtype=fp16')
+    t = transformer_main.TransformerTask(FLAGS)
+    t.predict()
+
+  def test_eval(self):
+    if context.num_gpus() >= 2:
+      self.skipTest('No need to test 2+ GPUs without a distribution strategy.')
+    # Skipped when the test binary name contains 'test_xla' (see TODO below).
+    if 'test_xla' in sys.argv[0]:
+      self.skipTest('TODO(xla): Make this test faster under XLA.')
+    self._prepare_files_and_flags()
+    t = transformer_main.TransformerTask(FLAGS)
+    t.eval()
+
+
+# Run the tests through TensorFlow's test runner when executed as a script.
+if __name__ == '__main__':
+  tf.test.main()
--- a/official/legacy/transformer/transformer_test.py
+++ b/official/legacy/transformer/transformer_test.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Test Transformer model."""
+
+import tensorflow as tf
+
+from official.legacy.transformer import model_params
+from official.legacy.transformer import transformer
+
+
+class TransformerV2Test(tf.test.TestCase):
+  """Checks model construction and SavedModel export for the Transformer."""
+
+  def setUp(self):
+    super().setUp()
+    # Work on a shallow copy: TINY_PARAMS is a module-level object, and
+    # assigning into it directly would leak these overrides into any other
+    # code that reads it in the same process. Only top-level keys are set
+    # below, so a shallow copy is sufficient.
+    self.params = params = model_params.TINY_PARAMS.copy()
+    params["batch_size"] = params["default_batch_size"] = 16
+    params["use_synthetic_data"] = True
+    params["hidden_size"] = 12
+    params["num_hidden_layers"] = 2
+    params["filter_size"] = 14
+    params["num_heads"] = 2
+    params["vocab_size"] = 41
+    params["extra_decode_length"] = 2
+    params["beam_size"] = 3
+    params["dtype"] = tf.float32
+
+  def test_create_model_train(self):
+    # Training model: two int64 inputs and one float32 output whose last
+    # dimension equals the vocab size configured in setUp.
+    model = transformer.create_model(self.params, True)
+    inputs, outputs = model.inputs, model.outputs
+    self.assertEqual(len(inputs), 2)
+    self.assertEqual(len(outputs), 1)
+    self.assertEqual(inputs[0].shape.as_list(), [None, None])
+    self.assertEqual(inputs[0].dtype, tf.int64)
+    self.assertEqual(inputs[1].shape.as_list(), [None, None])
+    self.assertEqual(inputs[1].dtype, tf.int64)
+    self.assertEqual(outputs[0].shape.as_list(), [None, None, 41])
+    self.assertEqual(outputs[0].dtype, tf.float32)
+
+  def test_create_model_not_train(self):
+    # Inference model: one int64 input; an int32 rank-2 output and a float32
+    # rank-1 output.
+    model = transformer.create_model(self.params, False)
+    inputs, outputs = model.inputs, model.outputs
+    self.assertEqual(len(inputs), 1)
+    self.assertEqual(len(outputs), 2)
+    self.assertEqual(inputs[0].shape.as_list(), [None, None])
+    self.assertEqual(inputs[0].dtype, tf.int64)
+    self.assertEqual(outputs[0].shape.as_list(), [None, None])
+    self.assertEqual(outputs[0].dtype, tf.int32)
+    self.assertEqual(outputs[1].shape.as_list(), [None])
+    self.assertEqual(outputs[1].dtype, tf.float32)
+
+  def test_export(self):
+    # Round-trips the model through tf.saved_model.save/load and checks the
+    # shape of the "outputs" tensor returned by the serving signature.
+    model = transformer.Transformer(self.params, name="transformer_v2")
+    export_dir = self.get_temp_dir()
+    batch_size = 5
+    max_length = 6
+
+    class SaveModule(tf.Module):
+      """Wraps the model so it can be exported with a `serve` function."""
+
+      def __init__(self, model):
+        super().__init__()
+        self.model = model
+
+      @tf.function
+      def serve(self, x):
+        return self.model.call([x], training=False)
+
+    save_module = SaveModule(model)
+    tensor_shape = (None, None)
+    sample_input = tf.zeros((batch_size, max_length), dtype=tf.int64)
+    # Call once with a concrete input before exporting.
+    _ = save_module.serve(sample_input)
+    signatures = dict(
+        serving_default=save_module.serve.get_concrete_function(
+            tf.TensorSpec(shape=tensor_shape, dtype=tf.int64, name="x")))
+    tf.saved_model.save(save_module, export_dir, signatures=signatures)
+    imported = tf.saved_model.load(export_dir)
+    serving_fn = imported.signatures["serving_default"]
+    all_outputs = serving_fn(sample_input)
+    output = all_outputs["outputs"]
+    output_shapes = output.shape.as_list()
+    self.assertEqual(output_shapes[0], batch_size)
+    self.assertEqual(output_shapes[1],
+                     max_length + model.params["extra_decode_length"])
+
+
+# Run the tests through TensorFlow's test runner when executed as a script.
+if __name__ == "__main__":
+  tf.test.main()
--- a/official/legacy/transformer/translate.py
+++ b/official/legacy/transformer/translate.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Translate text or files using trained transformer model."""
+
+# Import libraries
+from absl import logging
+import numpy as np
+import tensorflow as tf
+
+from official.legacy.transformer.utils import tokenizer
+
+# Module-level decoding constants. None of them is referenced in the visible
+# part of this module; the names suggest beam-search settings -- TODO confirm
+# whether they are still used anywhere.
+_EXTRA_DECODE_LENGTH = 100
+_BEAM_SIZE = 4
+_ALPHA = 0.6
+
+
+def _get_sorted_inputs(filename):
+  """Reads lines from a file and orders them from longest to shortest.
+
+  Length is measured in whitespace-separated tokens. Every line is stripped,
+  and a single trailing empty line (e.g. left by a final newline) is dropped.
+
+  Args:
+    filename: String name of file to read inputs from.
+  Returns:
+    A tuple (sorted_inputs, sorted_keys): the lines in decreasing-length order,
+    and a list in which sorted_keys[i] is the position within sorted_inputs of
+    the line that was originally at index i.
+  """
+  with tf.io.gfile.GFile(filename) as f:
+    inputs = [record.strip() for record in f.read().split("\n")]
+  if not inputs[-1]:
+    inputs.pop()
+
+  # Original indices, stably ordered by token count with the longest first.
+  order = sorted(
+      range(len(inputs)),
+      key=lambda idx: len(inputs[idx].split()),
+      reverse=True)
+
+  sorted_inputs = [inputs[idx] for idx in order]
+  sorted_keys = [0] * len(order)
+  for rank, idx in enumerate(order):
+    sorted_keys[idx] = rank
+  return sorted_inputs, sorted_keys
+
+
+def _encode_and_add_eos(line, subtokenizer):
+  """Returns the subtoken ids of `line` followed by the EOS id."""
+  encoded = subtokenizer.encode(line)
+  return encoded + [tokenizer.EOS_ID]
+
+
+def _trim_and_decode(ids, subtokenizer):
+  """Decodes `ids` up to, but not including, the first EOS id."""
+  try:
+    eos_position = list(ids).index(tokenizer.EOS_ID)
+    trimmed = ids[:eos_position]
+    return subtokenizer.decode(trimmed)
+  except ValueError:
+    # The sequence holds no EOS id, so decode all of it.
+    return subtokenizer.decode(ids)
+
+
+def translate_file(model,
+                   params,
+                   subtokenizer,
+                   input_file,
+                   output_file=None,
+                   print_all_translations=True,
+                   distribution_strategy=None):
+  """Translate lines in file, and save to output file if specified.
+
+  Inputs are decoded in batches of params["decode_batch_size"], longest lines
+  first, and written back in their original order.
+
+  Args:
+    model: A Keras model, used to generate the translations.
+    params: A dictionary, containing the translation related parameters. This
+      function reads "decode_batch_size" and "decode_max_length".
+    subtokenizer: A subtokenizer object, used for encoding and decoding source
+      and translated lines.
+    input_file: A file containing lines to translate.
+    output_file: A file that stores the generated translations.
+    print_all_translations: A bool. If true, every input/translation pair is
+      logged at INFO level.
+    distribution_strategy: A distribution strategy, used to perform inference
+      directly with tf.function instead of Keras model.predict().
+
+  Raises:
+    ValueError: if output file is invalid.
+  """
+  batch_size = params["decode_batch_size"]
+
+  # Read and sort inputs by length. Keep dictionary (original index-->new index
+  # in sorted list) to write translations in the original order.
+  sorted_inputs, sorted_keys = _get_sorted_inputs(input_file)
+  total_samples = len(sorted_inputs)
+  # Ceiling division; yields 0 batches when there are no inputs.
+  num_decode_batches = (total_samples - 1) // batch_size + 1
+
+  def input_generator():
+    """Yield encoded strings from sorted_inputs."""
+    for i in range(num_decode_batches):
+      # Slice out batch i; the last batch may be shorter than batch_size.
+      lines = [
+          sorted_inputs[j + i * batch_size]
+          for j in range(batch_size)
+          if j + i * batch_size < total_samples
+      ]
+      lines = [_encode_and_add_eos(l, subtokenizer) for l in lines]
+      if distribution_strategy:
+        # Fill a short batch with EOS-only rows so every batch has exactly
+        # batch_size rows, as the per-replica reshape below requires.
+        for j in range(batch_size - len(lines)):
+          lines.append([tokenizer.EOS_ID])
+      batch = tf.keras.preprocessing.sequence.pad_sequences(
+          lines,
+          maxlen=params["decode_max_length"],
+          dtype="int32",
+          padding="post")
+      # Note: i is zero-based here.
+      logging.info("Decoding batch %d out of %d.", i, num_decode_batches)
+      yield batch
+
+  @tf.function
+  def predict_step(inputs):
+    """Decoding step function for TPU runs."""
+
+    def _step_fn(inputs):
+      """Per replica step function."""
+      # inputs is a (tag, batch) pair; the tag is returned unchanged next to
+      # the model output.
+      tag = inputs[0]
+      val_inputs = inputs[1]
+      val_outputs, _ = model([val_inputs], training=False)
+      return tag, val_outputs
+
+    return distribution_strategy.run(_step_fn, args=(inputs,))
+
+  translations = []
+  if distribution_strategy:
+    num_replicas = distribution_strategy.num_replicas_in_sync
+    # NOTE(review): integer division -- the reshape below appears to need
+    # decode_batch_size to be a multiple of num_replicas; confirm with callers.
+    local_batch_size = params["decode_batch_size"] // num_replicas
+  for i, text in enumerate(input_generator()):
+    if distribution_strategy:
+      text = np.reshape(text, [num_replicas, local_batch_size, -1])
+      # Add tag to the input of each replica with the reordering logic after
+      # outputs, to ensure the output order matches the input order.
+      text = tf.constant(text)
+
+      @tf.function
+      def text_as_per_replica():
+        # Each replica picks its own slice of the batch, tagged with its id.
+        replica_context = tf.distribute.get_replica_context()
+        replica_id = replica_context.replica_id_in_sync_group
+        return replica_id, text[replica_id]  # pylint: disable=cell-var-from-loop
+
+      text = distribution_strategy.run(text_as_per_replica)
+      outputs = distribution_strategy.experimental_local_results(
+          predict_step(text))
+      # Drop the tags and flatten the per-replica outputs back into one batch.
+      val_outputs = [output for _, output in outputs]
+
+      val_outputs = np.reshape(val_outputs, [params["decode_batch_size"], -1])
+    else:
+      val_outputs, _ = model.predict(text)
+
+    length = len(val_outputs)
+    for j in range(length):
+      # Skip rows that are only batch padding (past the last real input).
+      if j + i * batch_size < total_samples:
+        translation = _trim_and_decode(val_outputs[j], subtokenizer)
+        translations.append(translation)
+        if print_all_translations:
+          logging.info("Translating:\n\tInput: %s\n\tOutput: %s",
+                       sorted_inputs[j + i * batch_size], translation)
+
+  # Write translations in the order they appeared in the original file.
+  if output_file is not None:
+    if tf.io.gfile.isdir(output_file):
+      raise ValueError("File output is a directory, will not save outputs to "
+                       "file.")
+    logging.info("Writing to file %s", output_file)
+    with tf.io.gfile.GFile(output_file, "w") as f:
+      # sorted_keys[k] is the index in `translations` of original line k.
+      for i in sorted_keys:
+        f.write("%s\n" % translations[i])
+
+
+def translate_from_text(model, subtokenizer, txt):
+  """Translates one string with `model` and logs the source and the result."""
+  token_ids = _encode_and_add_eos(txt, subtokenizer)
+  decoded_ids = model.predict(token_ids)["outputs"]
+  logging.info("Original: \"%s\"", txt)
+  translate_from_input(decoded_ids, subtokenizer)
+
+
+def translate_from_input(outputs, subtokenizer):
+  """Decodes `outputs` (cut at the first EOS id) and logs the translation."""
+  logging.info("Translation: \"%s\"", _trim_and_decode(outputs, subtokenizer))
--- a/official/legacy/transformer/utils/__init__.py
+++ b/official/legacy/transformer/utils/__init__.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
--- a/official/legacy/transformer/utils/metrics.py
+++ b/official/legacy/transformer/utils/metrics.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Functions for calculating loss, accuracy, and other model metrics.
+
+Metrics:
+ - Padded loss, accuracy, and negative log perplexity. Source:
+     https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/metrics.py
+ - BLEU approximation. Source:
+     https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/bleu_hook.py
+ - ROUGE score. Source:
+     https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/rouge.py
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import math
+
+import numpy as np
+import six
+from six.moves import xrange  # pylint: disable=redefined-builtin
+import tensorflow.compat.v1 as tf
+
+
+def _pad_tensors_to_same_length(x, y):
+  """Pad x and y so that the results have the same length (second dimension).
+
+  The padding specs below have three entries for x and two for y, so x is
+  padded as a rank-3 tensor and y as a rank-2 tensor. Only the second
+  dimension grows, and only at its end.
+
+  Args:
+    x: rank-3 tensor.
+    y: rank-2 tensor.
+  Returns:
+    The padded (x, y) pair.
+  """
+  with tf.name_scope("pad_to_same_length"):
+    x_length = tf.shape(x)[1]
+    y_length = tf.shape(y)[1]
+
+    max_length = tf.maximum(x_length, y_length)
+
+    # The longer tensor gets a pad amount of zero and is left unchanged.
+    x = tf.pad(x, [[0, 0], [0, max_length - x_length], [0, 0]])
+    y = tf.pad(y, [[0, 0], [0, max_length - y_length]])
+    return x, y
+
+
+def padded_cross_entropy_loss(logits, labels, smoothing, vocab_size):
+  """Calculate cross entropy loss while ignoring padding.
+
+  Positions whose label is 0 are treated as padding and get weight 0.
+
+  Args:
+    logits: Tensor of size [batch_size, length_logits, vocab_size]
+    labels: Tensor of size [batch_size, length_labels]
+    smoothing: Label smoothing constant, used to determine the on and off values
+    vocab_size: int size of the vocabulary
+  Returns:
+    Returns the cross entropy loss and weight tensors: float32 tensors with
+      shape [batch_size, max(length_logits, length_labels)]
+  """
+  with tf.name_scope("loss", values=[logits, labels]):
+    logits, labels = _pad_tensors_to_same_length(logits, labels)
+
+    # Calculate smoothing cross entropy
+    with tf.name_scope("smoothing_cross_entropy", values=[logits, labels]):
+      # The true class gets probability `confidence`; the remaining mass is
+      # spread evenly over the other vocab_size - 1 classes.
+      confidence = 1.0 - smoothing
+      low_confidence = (1.0 - confidence) / tf.cast(vocab_size - 1, tf.float32)
+      soft_targets = tf.one_hot(
+          tf.cast(labels, tf.int32),
+          depth=vocab_size,
+          on_value=confidence,
+          off_value=low_confidence)
+      xentropy = tf.nn.softmax_cross_entropy_with_logits_v2(
+          logits=logits, labels=soft_targets)
+
+      # Calculate the best (lowest) possible value of cross entropy, and
+      # subtract from the cross entropy loss.
+      # The 1e-20 keeps the log finite when low_confidence is 0 (smoothing=0).
+      normalizing_constant = -(
+          confidence * tf.log(confidence) + tf.cast(vocab_size - 1, tf.float32)
+          * low_confidence * tf.log(low_confidence + 1e-20))
+      xentropy -= normalizing_constant
+
+    # 1.0 for real tokens, 0.0 where the label is the padding id 0.
+    weights = tf.cast(tf.not_equal(labels, 0), tf.float32)
+    return xentropy * weights, weights
+
+
+def _convert_to_eval_metric(metric_fn):
+  """Turns a per-batch (scores, weights) metric fn into an eval metric fn.
+
+  `metric_fn` produces values for a single batch. The returned wrapper feeds
+  them to tf.metrics.mean, which aggregates across all evaluated batches.
+
+  Args:
+    metric_fn: function that returns scores and weights for the current batch's
+      logits and predicted labels.
+
+  Returns:
+    function that aggregates the scores and weights from metric_fn.
+  """
+  def problem_metric_fn(*args):
+    """Returns the weighted-mean aggregation of metric_fn's output."""
+    batch_scores, batch_weights = metric_fn(*args)
+    return tf.metrics.mean(batch_scores, batch_weights)
+
+  return problem_metric_fn
+
+
+def get_eval_metrics(logits, labels, params):
+  """Return dictionary of model evaluation metrics.
+
+  Args:
+    logits: model output scores, passed unchanged to every metric function.
+    labels: target ids, passed unchanged to every metric function.
+    params: dictionary; this function reads "vocab_size" and "use_tpu".
+
+  Returns:
+    Dict mapping "metrics/<name>" to the value returned by tf.metrics.mean for
+    that metric.
+  """
+  metrics = {
+      "accuracy": _convert_to_eval_metric(padded_accuracy)(logits, labels),
+      "accuracy_top5": _convert_to_eval_metric(padded_accuracy_top5)(
+          logits, labels),
+      "accuracy_per_sequence": _convert_to_eval_metric(
+          padded_sequence_accuracy)(logits, labels),
+      "neg_log_perplexity": _convert_to_eval_metric(padded_neg_log_perplexity)(
+          logits, labels, params["vocab_size"]),
+  }
+
+  if not params["use_tpu"]:
+    # TPU does not support tf.py_func
+    # The BLEU and ROUGE metrics below are computed in Python via tf.py_func.
+    metrics.update({
+        "approx_bleu_score": _convert_to_eval_metric(
+            bleu_score)(logits, labels),
+        "rouge_2_fscore": _convert_to_eval_metric(
+            rouge_2_fscore)(logits, labels),
+        "rouge_L_fscore": _convert_to_eval_metric(
+            rouge_l_fscore)(logits, labels),
+    })
+
+  # Prefix each of the metric names with "metrics/". This allows the metric
+  # graphs to display under the "metrics" category in TensorBoard.
+  metrics = {"metrics/%s" % k: v for k, v in six.iteritems(metrics)}
+  return metrics
+
+
+def padded_accuracy(logits, labels):
+  """Percentage of times that predictions matches labels on non-0s.
+
+  Returns:
+    A (scores, weights) pair: scores is 1.0 where the argmax of logits equals
+    the label and 0.0 otherwise; weights is 1.0 where the label is non-zero
+    and 0.0 otherwise.
+  """
+  with tf.variable_scope("padded_accuracy", values=[logits, labels]):
+    logits, labels = _pad_tensors_to_same_length(logits, labels)
+    weights = tf.cast(tf.not_equal(labels, 0), tf.float32)
+    outputs = tf.cast(tf.argmax(logits, axis=-1), tf.int32)
+    padded_labels = tf.cast(labels, tf.int32)
+    return tf.cast(tf.equal(outputs, padded_labels), tf.float32), weights
+
+
+def padded_accuracy_topk(logits, labels, k):
+  """Percentage of times that top-k predictions matches labels on non-0s.
+
+  Returns:
+    A (scores, weights) pair: scores counts, per position, how many of the
+    top-k predicted ids equal the label; weights is 1.0 where the label is
+    non-zero and 0.0 otherwise.
+  """
+  with tf.variable_scope("padded_accuracy_topk", values=[logits, labels]):
+    logits, labels = _pad_tensors_to_same_length(logits, labels)
+    weights = tf.cast(tf.not_equal(labels, 0), tf.float32)
+    # Never ask top_k for more entries than the last dimension of logits has.
+    effective_k = tf.minimum(k, tf.shape(logits)[-1])
+    _, outputs = tf.nn.top_k(logits, k=effective_k)
+    outputs = tf.cast(outputs, tf.int32)
+    padded_labels = tf.cast(labels, tf.int32)
+    padded_labels = tf.expand_dims(padded_labels, axis=-1)
+    padded_labels += tf.zeros_like(outputs)  # Pad to same shape.
+    same = tf.cast(tf.equal(outputs, padded_labels), tf.float32)
+    # Sum over the top-k axis.
+    same_topk = tf.reduce_sum(same, axis=-1)
+    return same_topk, weights
+
+
+def padded_accuracy_top5(logits, labels):
+  """Top-5 variant of padded_accuracy_topk."""
+  top_k = 5
+  return padded_accuracy_topk(logits, labels, top_k)
+
+
+def padded_sequence_accuracy(logits, labels):
+  """Percentage of times that predictions matches labels everywhere (non-0).
+
+  Returns:
+    A (scores, weights) pair: scores holds one value per sequence, 1.0 when
+    every non-padding position is predicted correctly and 0.0 otherwise;
+    weights is the constant 1.0.
+  """
+  with tf.variable_scope("padded_sequence_accuracy", values=[logits, labels]):
+    logits, labels = _pad_tensors_to_same_length(logits, labels)
+    weights = tf.cast(tf.not_equal(labels, 0), tf.float32)
+    outputs = tf.cast(tf.argmax(logits, axis=-1), tf.int32)
+    padded_labels = tf.cast(labels, tf.int32)
+    # Mismatches at padding positions are zeroed out by the weights.
+    not_correct = (tf.cast(tf.not_equal(outputs, padded_labels), tf.float32) *
+                   weights)
+    # Reduce over every axis except the first (batch) one.
+    axis = list(range(1, len(outputs.get_shape())))
+    # Any mismatch clamps the sum to 1.0, making the sequence score 0.0.
+    correct_seq = 1.0 - tf.minimum(1.0, tf.reduce_sum(not_correct, axis=axis))
+    return correct_seq, tf.constant(1.0)
+
+
+def padded_neg_log_perplexity(logits, labels, vocab_size):
+  """Negated unsmoothed padded cross entropy, plus its padding weights."""
+  xentropy, weights = padded_cross_entropy_loss(logits, labels, 0, vocab_size)
+  return -xentropy, weights
+
+
+def bleu_score(logits, labels):
+  """Approximate BLEU score computation between labels and predictions.
+
+  An approximate BLEU scoring method since we do not glue word pieces or
+  decode the ids and tokenize the output. By default, we use ngram order of 4
+  and use brevity penalty. Also, this does not have beam search.
+
+  Args:
+    logits: Tensor of size [batch_size, length_logits, vocab_size]
+    labels: Tensor of size [batch-size, length_labels]
+
+  Returns:
+    A (bleu, weight) pair: bleu is the float32 approximate BLEU score computed
+    by compute_bleu, and weight is the constant 1.0.
+  """
+  # Greedy predictions: the highest-scoring id at each position.
+  predictions = tf.cast(tf.argmax(logits, axis=-1), tf.int32)
+  # TODO: Look into removing use of py_func  # pylint: disable=g-bad-todo
+  # labels are the reference corpus and predictions the translation corpus.
+  bleu = tf.py_func(compute_bleu, (labels, predictions), tf.float32)
+  return bleu, tf.constant(1.0)
+
+
+def _get_ngrams_with_counter(segment, max_order):
+  """Extracts all n-grams up to a given maximum order from an input segment.
+
+  Args:
+    segment: sequence of tokens from which n-grams will be extracted.
+    max_order: maximum length in tokens of the n-grams returned by this
+        method.
+
+  Returns:
+    A collections.Counter mapping every n-gram (as a tuple) of order
+    1..max_order found in segment to the number of times it occurs.
+  """
+  ngram_counts = collections.Counter()
+  for order in range(1, max_order + 1):
+    for i in range(0, len(segment) - order + 1):
+      ngram_counts[tuple(segment[i:i + order])] += 1
+  return ngram_counts
+
+
+def compute_bleu(reference_corpus, translation_corpus, max_order=4,
+                 use_bp=True):
+  """Computes BLEU score of translated segments against one or more references.
+
+  Args:
+    reference_corpus: list of references for each translation. Each
+        reference should be tokenized into a list of tokens.
+    translation_corpus: list of translations to score. Each translation
+        should be tokenized into a list of tokens.
+    max_order: Maximum n-gram order to use when computing BLEU score.
+    use_bp: boolean, whether to apply brevity penalty.
+
+  Returns:
+    BLEU score as a numpy float32. Empty references leave the brevity penalty
+    at 1.0 and empty translations set it to 0.0, instead of raising
+    ZeroDivisionError.
+  """
+  reference_length = 0
+  translation_length = 0
+  matches_by_order = [0] * max_order
+  possible_matches_by_order = [0] * max_order
+
+  for references, translations in zip(reference_corpus, translation_corpus):
+    reference_length += len(references)
+    translation_length += len(translations)
+    ref_ngram_counts = _get_ngrams_with_counter(references, max_order)
+    translation_ngram_counts = _get_ngrams_with_counter(translations, max_order)
+
+    # Counter intersection keeps, per n-gram, the smaller of the two counts.
+    overlap = ref_ngram_counts & translation_ngram_counts
+    for ngram, count in overlap.items():
+      matches_by_order[len(ngram) - 1] += count
+    for ngram, count in translation_ngram_counts.items():
+      possible_matches_by_order[len(ngram) - 1] += count
+
+  # Orders with no candidate n-grams keep a precision of 0.0 and are left out
+  # of the geometric mean's log-sum below.
+  precisions = [0.0] * max_order
+  smooth = 1.0
+  for i in range(max_order):
+    if possible_matches_by_order[i] > 0:
+      if matches_by_order[i] > 0:
+        precisions[i] = (
+            float(matches_by_order[i]) / possible_matches_by_order[i])
+      else:
+        # No match at this order: use a smoothed pseudo-precision that halves
+        # each time this happens.
+        smooth *= 2
+        precisions[i] = 1.0 / (smooth * possible_matches_by_order[i])
+
+  geo_mean = 0.0
+  if max(precisions) > 0:
+    p_log_sum = sum(math.log(p) for p in precisions if p)
+    geo_mean = math.exp(p_log_sum / max_order)
+
+  bp = 1.0
+  # With no reference tokens the length ratio is undefined; leave bp at 1.0.
+  if use_bp and reference_length:
+    ratio = float(translation_length) / reference_length
+    if ratio <= 0.0:
+      # Empty translation: the penalty term exp(1 - 1/ratio) tends to 0.
+      bp = 0.0
+    elif ratio < 1.0:
+      bp = math.exp(1 - 1. / ratio)
+  bleu = geo_mean * bp
+  return np.float32(bleu)
+
+
+def rouge_2_fscore(logits, labels):
+  """ROUGE-2 F1 score computation between labels and predictions.
+
+  This is an approximate ROUGE scoring method since we do not glue word pieces
+  or decode the ids and tokenize the output.
+
+  Args:
+    logits: tensor, model predictions
+    labels: tensor, gold output.
+
+  Returns:
+    A (rouge2_fscore, weight) pair: the float32 approximate ROUGE-2 F1 score
+    computed by rouge_n, and the constant weight 1.0.
+  """
+  # Greedy predictions: the highest-scoring id at each position.
+  predictions = tf.cast(tf.argmax(logits, axis=-1), tf.int32)
+  # TODO: Look into removing use of py_func  # pylint: disable=g-bad-todo
+  # rouge_n is called with its default n=2.
+  rouge_2_f_score = tf.py_func(rouge_n, (predictions, labels), tf.float32)
+  return rouge_2_f_score, tf.constant(1.0)
+
+
+def _get_ngrams(n, text):
+  """Collects the distinct n-grams of a token sequence.
+
+  Args:
+    n: which n-grams to calculate
+    text: An array of tokens
+
+  Returns:
+    A set of n-grams, each represented as a tuple of n tokens. The set is
+    empty when text has fewer than n tokens.
+  """
+  last_start = len(text) - n
+  return {tuple(text[start:start + n]) for start in range(last_start + 1)}
+
+
+def rouge_n(eval_sentences, ref_sentences, n=2):
+  """Computes ROUGE-N f1 score of two text collections of sentences.
+
+  Sentences are paired up in order; the F1 score of each pair is computed over
+  their sets of distinct n-grams, and the scores are averaged.
+
+  Source: https://www.microsoft.com/en-us/research/publication/
+  rouge-a-package-for-automatic-evaluation-of-summaries/
+
+  Args:
+    eval_sentences: Predicted sentences.
+    ref_sentences: Sentences from the reference set
+    n: Size of ngram.  Defaults to 2.
+
+  Returns:
+    f1 score for ROUGE-N
+  """
+  f1_scores = []
+  for candidate, reference in zip(eval_sentences, ref_sentences):
+    candidate_ngrams = _get_ngrams(n, candidate)
+    reference_ngrams = _get_ngrams(n, reference)
+    shared = len(candidate_ngrams & reference_ngrams)
+
+    # An empty n-gram set yields 0.0 rather than a division by zero. This
+    # isn't mathematically correct, but it's good enough.
+    precision = float(shared) / len(candidate_ngrams) if candidate_ngrams else 0.0
+    recall = float(shared) / len(reference_ngrams) if reference_ngrams else 0.0
+
+    # The 1e-8 keeps the denominator non-zero when both terms are 0.
+    f1_scores.append(2.0 * ((precision * recall) / (precision + recall + 1e-8)))
+
+  return np.mean(f1_scores, dtype=np.float32)
+
+
+def rouge_l_fscore(predictions, labels):
+  """Builds an op computing an approximate ROUGE-L F1 score.
+
+  The score is approximate because ids are compared directly: word pieces are
+  not glued back together and the output is not decoded or re-tokenized.
+
+  Args:
+    predictions: tensor, model predictions. The argmax over the last axis is
+      used as the predicted ids.
+    labels: tensor, gold output.
+
+  Returns:
+    A tuple of (approx rouge-l f1 score, constant 1.0).
+  """
+  predicted_ids = tf.argmax(predictions, axis=-1)
+  predicted_ids = tf.cast(predicted_ids, tf.int32)
+  score = tf.py_func(
+      rouge_l_sentence_level, (predicted_ids, labels), tf.float32)
+  weight = tf.constant(1.0)
+  return score, weight
+
+
+def rouge_l_sentence_level(eval_sentences, ref_sentences):
+  """Computes the mean sentence-level ROUGE-L of paired sentences.
+
+  Source: https://www.microsoft.com/en-us/research/publication/
+  rouge-a-package-for-automatic-evaluation-of-summaries/
+
+  Calculated according to:
+  R_lcs = LCS(X,Y)/m
+  P_lcs = LCS(X,Y)/n
+  F_lcs = ((1 + beta^2)*R_lcs*P_lcs) / (R_lcs + (beta^2) * P_lcs)
+
+  where:
+  X = reference summary
+  Y = Candidate summary
+  m = length of reference summary
+  n = length of candidate summary
+
+  Args:
+    eval_sentences: The sentences that have been picked by the summarizer
+    ref_sentences: The sentences from the reference set
+
+  Returns:
+    A float: F_lcs averaged over all positionally paired sentences.
+  """
+  scores = []
+  for candidate, reference in zip(eval_sentences, ref_sentences):
+    reference_len = float(len(reference))
+    candidate_len = float(len(candidate))
+    common_len = _len_lcs(candidate, reference)
+    scores.append(_f_lcs(common_len, reference_len, candidate_len))
+  return np.mean(scores, dtype=np.float32)
+
+
+def _len_lcs(x, y):
+  """Returns the length of the Longest Common Subsequence between two seqs.
+
+  Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence
+
+  Args:
+    x: sequence of words
+    y: sequence of words
+
+  Returns:
+    integer: Length of LCS between x and y
+  """
+  # The cell keyed by both full lengths holds the LCS of the whole sequences.
+  return _lcs(x, y)[len(x), len(y)]
+
+
+def _lcs(x, y):
+  """Builds the dynamic-programming table of LCS lengths for two seqs.
+
+  Runs in O(nm) time where n = len(x) and m = len(y).
+  Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence
+
+  Args:
+    x: collection of words
+    y: collection of words
+
+  Returns:
+    Dict keyed by (i, j) whose value is the LCS length of x[:i] and y[:j],
+    for every 0 <= i <= len(x) and 0 <= j <= len(y).
+  """
+  rows, cols = len(x), len(y)
+  table = {}
+  for i in range(rows + 1):
+    for j in range(cols + 1):
+      if not i or not j:
+        # An empty prefix shares nothing with anything.
+        cell = 0
+      elif x[i - 1] == y[j - 1]:
+        cell = table[i - 1, j - 1] + 1
+      else:
+        cell = max(table[i - 1, j], table[i, j - 1])
+      table[i, j] = cell
+  return table
+
+
+def _f_lcs(llcs, m, n):
+  """Computes the LCS-based F-measure score.
+
+  Source: http://research.microsoft.com/en-us/um/people/cyl/download/papers/
+  rouge-working-note-v1.3.1.pdf
+
+  Args:
+    llcs: Length of LCS
+    m: number of words in reference summary
+    n: number of words in candidate summary
+
+  Returns:
+    Float. LCS-based F-measure score. 0.0 when either summary is empty.
+  """
+  # An empty reference or candidate cannot share a subsequence with anything;
+  # score it 0.0 instead of dividing by zero.
+  if not m or not n:
+    return 0.0
+  r_lcs = llcs / m
+  p_lcs = llcs / n
+  # The small epsilons keep both divisions defined when llcs is 0.
+  beta = p_lcs / (r_lcs + 1e-12)
+  num = (1 + (beta ** 2)) * r_lcs * p_lcs
+  denom = r_lcs + ((beta ** 2) * p_lcs)
+  f_lcs = num / (denom + 1e-12)
+  return f_lcs
--- a/official/legacy/transformer/utils/tokenizer.py
+++ b/official/legacy/transformer/utils/tokenizer.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Defines Subtokenizer class to encode and decode strings."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import re
+import sys
+import unicodedata
+
+from absl import logging
+
+import numpy as np
+import six
+from six.moves import xrange  # pylint: disable=redefined-builtin
+import tensorflow as tf
+
+# pylint: disable=g-complex-comprehension
+# Reserved tokens and their ids. Each id equals the token's index in
+# RESERVED_TOKENS, which _load_vocab_file() places at the start of the
+# subtoken list by default.
+PAD = "<pad>"
+PAD_ID = 0
+EOS = "<EOS>"
+EOS_ID = 1
+RESERVED_TOKENS = [PAD, EOS]
+
+# Set of characters that will be used in the function _escape_token() (see func
+# docstring for more details).
+# This set is added to the alphabet list to ensure that all escaped tokens can
+# be encoded.
+_ESCAPE_CHARS = set(u"\\_u;0123456789")
+# Regex for the function _unescape_token(), the inverse of _escape_token().
+# This is used to find "\u", "\\", and "\###;" substrings in the token.
+_UNESCAPE_REGEX = re.compile(r"\\u|\\\\|\\([0-9]+);")
+
+# Placeholder returned by _unescape_token() when an escaped "\###;" code point
+# cannot be converted into a character.
+_UNDEFINED_UNICODE = u"\u3013"
+
+
+def alphanumeric_char_set():
+  """Returns the set of all Unicode letter and number characters.
+
+  A character is included when its unicodedata general category starts with
+  "L" (letter) or "N" (number). Code points in [0, sys.maxunicode) are scanned.
+  """
+  chars = set()
+  for code_point in xrange(sys.maxunicode):
+    char = six.unichr(code_point)
+    if unicodedata.category(char).startswith(("L", "N")):
+      chars.add(char)
+  return chars
+
+
+# Set contains all letter and number characters.
+# Built once when this module is imported; used as the default
+# `master_char_set` by the functions and classes in this module.
+_ALPHANUMERIC_CHAR_SET = alphanumeric_char_set()
+
+# min_count is the minimum number of times a subtoken must appear in the data
+# before it is added to the vocabulary. The value is found using binary
+# search to obtain the target vocabulary size.
+_MIN_MIN_COUNT = 1  # min value to use when binary searching for min_count
+_MAX_MIN_COUNT = 1000  # max value to use when binary searching for min_count
+
+
+class Subtokenizer(object):
+  """Encodes and decodes strings to/from integer IDs."""
+
+  def __init__(self, vocab_file, reserved_tokens=None, master_char_set=None):
+    """Initializes class by loading the subtoken vocabulary from vocab_file.
+
+    Args:
+      vocab_file: String name of the vocab file to load.
+      reserved_tokens: List of string tokens placed at the beginning of the
+        subtoken list. Defaults to RESERVED_TOKENS.
+      master_char_set: Set of characters used to split text into tokens.
+        Defaults to _ALPHANUMERIC_CHAR_SET.
+    """
+    logging.info("Initializing Subtokenizer from file %s.", vocab_file)
+
+    if master_char_set is None:
+      master_char_set = _ALPHANUMERIC_CHAR_SET
+
+    if reserved_tokens is None:
+      reserved_tokens = RESERVED_TOKENS
+
+    self.subtoken_list = _load_vocab_file(vocab_file, reserved_tokens)
+    self.alphabet = _generate_alphabet_dict(self.subtoken_list)
+    self.subtoken_to_id_dict = _list_to_index_dict(self.subtoken_list)
+
+    self.max_subtoken_length = 0
+    for subtoken in self.subtoken_list:
+      self.max_subtoken_length = max(self.max_subtoken_length, len(subtoken))
+
+    # Create cache to speed up subtokenization
+    self._cache_size = 2**20
+    self._cache = [(None, None)] * self._cache_size
+    self._master_char_set = master_char_set
+
+  @staticmethod
+  def init_from_files(vocab_file,
+                      files,
+                      target_vocab_size,
+                      threshold,
+                      min_count=None,
+                      file_byte_limit=1e6,
+                      reserved_tokens=None,
+                      correct_strip=True,
+                      master_char_set=None):
+    """Create subtoken vocabulary based on files, and save vocab to file.
+
+    If vocab_file already exists it is loaded as-is and no vocabulary is
+    generated.
+
+    Args:
+      vocab_file: String name of vocab file to store subtoken vocabulary.
+      files: List of file paths that will be used to generate vocabulary.
+      target_vocab_size: target vocabulary size to generate.
+      threshold: int threshold of vocabulary size to accept.
+      min_count: int minimum count to use for generating the vocabulary. The min
+        count is the minimum number of times a subtoken should appear in the
+        files before it is added to the vocabulary. If set to none, this value
+        is found using binary search.
+      file_byte_limit: (Default 1e6) Maximum number of bytes of sample text that
+        will be drawn from the files.
+      reserved_tokens: List of string tokens that are guaranteed to be at the
+        beginning of the subtoken vocabulary list.
+      correct_strip: Whether to convert text to unicode before strip.
+      master_char_set: the char set.
+
+    Returns:
+      Subtokenizer object
+    """
+    if master_char_set is None:
+      master_char_set = _ALPHANUMERIC_CHAR_SET
+    if reserved_tokens is None:
+      reserved_tokens = RESERVED_TOKENS
+
+    if tf.io.gfile.exists(vocab_file):
+      logging.info("Vocab file already exists (%s)", vocab_file)
+    else:
+      logging.info("Begin steps to create subtoken vocabulary...")
+      token_counts = _count_tokens(files, file_byte_limit, correct_strip,
+                                   master_char_set)
+      alphabet = _generate_alphabet_dict(token_counts)
+      subtoken_list = _generate_subtokens_with_target_vocab_size(
+          token_counts, alphabet, target_vocab_size, threshold, min_count,
+          reserved_tokens)
+      logging.info("Generated vocabulary with %d subtokens.",
+                   len(subtoken_list))
+      _save_vocab_file(vocab_file, subtoken_list)
+    # Load with the same reserved tokens the vocabulary was generated with so
+    # that subtoken ids stay consistent between generation and loading.
+    return Subtokenizer(
+        vocab_file,
+        reserved_tokens=reserved_tokens,
+        master_char_set=master_char_set)
+
+  def encode(self, raw_string, add_eos=False):
+    """Encodes a string into a list of int subtoken ids.
+
+    Args:
+      raw_string: String to encode.
+      add_eos: Whether to append EOS_ID to the returned ids.
+
+    Returns:
+      List of int subtoken ids.
+    """
+    ret = []
+    tokens = _split_string_to_tokens(
+        native_to_unicode(raw_string), self._master_char_set)
+    for token in tokens:
+      ret.extend(self._token_to_subtoken_ids(token))
+    if add_eos:
+      # The dict has the same keys as subtoken_list; lookup avoids a list scan.
+      assert EOS in self.subtoken_to_id_dict, \
+          "Can't append 'EOS' because it is not in list of known subtokens."
+      ret.append(EOS_ID)
+    return ret
+
+  def _token_to_subtoken_ids(self, token):
+    """Encode a single token into a list of subtoken ids."""
+    # Fixed-size cache indexed by hash; a colliding token simply overwrites
+    # the previous entry.
+    cache_location = hash(token) % self._cache_size
+    cache_key, cache_value = self._cache[cache_location]
+    if cache_key == token:
+      return cache_value
+
+    ret = _split_token_to_subtokens(
+        _escape_token(token, self.alphabet), self.subtoken_to_id_dict,
+        self.max_subtoken_length)
+    ret = [self.subtoken_to_id_dict[subtoken_id] for subtoken_id in ret]
+
+    self._cache[cache_location] = (token, ret)
+    return ret
+
+  def decode(self, subtokens):
+    """Converts list of int subtokens ids into a string.
+
+    Args:
+      subtokens: List of ints, or a numpy array that is converted to one.
+
+    Returns:
+      The decoded string; an empty string when subtokens is empty.
+    """
+    if isinstance(subtokens, np.ndarray):
+      # Note that list(subtokens) converts subtokens to a python list, but the
+      # items remain as np.int32. This converts both the array and its items.
+      subtokens = subtokens.tolist()
+
+    if not subtokens:
+      return ""
+
+    assert isinstance(subtokens, list) and isinstance(subtokens[0], int), (
+        "Subtokens argument passed into decode() must be a list of integers.")
+
+    return _unicode_to_native(
+        _join_tokens_to_string(
+            self._subtoken_ids_to_tokens(subtokens), self._master_char_set))
+
+  def _subtoken_ids_to_tokens(self, subtokens):
+    """Convert list of int subtoken ids to a list of string tokens."""
+    # Ids at or beyond the vocabulary size are skipped.
+    escaped_tokens = "".join([
+        self.subtoken_list[s] for s in subtokens if s < len(self.subtoken_list)
+    ])
+    escaped_tokens = escaped_tokens.split("_")
+
+    # All tokens in the vocabulary list have been escaped (see _escape_token())
+    # so each token must be unescaped when decoding.
+    ret = []
+    for token in escaped_tokens:
+      if token:
+        ret.append(_unescape_token(token))
+    return ret
+
+
+def _save_vocab_file(vocab_file, subtoken_list):
+  """Writes each subtoken to vocab_file on its own single-quoted line."""
+  line_template = "'%s'\n"
+  with tf.io.gfile.GFile(vocab_file, mode="w") as writer:
+    for entry in subtoken_list:
+      writer.write(line_template % _unicode_to_native(entry))
+
+
+def _load_vocab_file(vocab_file, reserved_tokens=None):
+  """Reads the vocabulary, keeping reserved tokens at the top of the list.
+
+  Args:
+    vocab_file: String name of the vocab file to read.
+    reserved_tokens: List of tokens placed first in the result. Entries of the
+      file equal to one of them are skipped. Defaults to RESERVED_TOKENS.
+
+  Returns:
+    List of reserved tokens followed by the remaining subtokens in file order.
+  """
+  if reserved_tokens is None:
+    reserved_tokens = RESERVED_TOKENS
+
+  loaded = []
+  with tf.io.gfile.GFile(vocab_file, mode="r") as reader:
+    for raw_line in reader:
+      # Drop the first and last character: the surrounding single-quotes.
+      entry = native_to_unicode(raw_line.strip())[1:-1]
+      if entry not in reserved_tokens:
+        loaded.append(native_to_unicode(entry))
+  return reserved_tokens + loaded
+
+
+def native_to_unicode(s):
+  """Convert string to unicode text.
+
+  Byte strings (`str` on Python 2, `bytes` on Python 3) are decoded as UTF-8.
+  Values that are already unicode text are returned unchanged.
+
+  Args:
+    s: byte string or unicode string.
+
+  Returns:
+    The unicode version of s.
+  """
+  # `bytes` is an alias of `str` on Python 2, so this check decodes native
+  # strings there and raw byte strings on Python 3.
+  return s.decode("utf-8") if isinstance(s, bytes) else s
+
+
+def _unicode_to_native(s):
+  """Convert string from unicode to native format (required in Python 2).
+
+  On Python 2 unicode strings are encoded to UTF-8 bytes. On Python 3 the
+  value is returned unchanged.
+  """
+  try:
+    text_type = unicode  # pylint: disable=undefined-variable
+  except NameError:  # Python 3 has no `unicode` builtin.
+    return s
+  if isinstance(s, text_type):
+    return s.encode("utf-8")
+  return s
+
+
+def _split_string_to_tokens(text, master_char_set):
+  """Splits text to a list of string tokens.
+
+  A token boundary is placed wherever membership in master_char_set changes
+  between neighbouring characters. A token consisting of exactly one space is
+  dropped unless it starts the text; the final token is always kept.
+
+  Args:
+    text: unicode string to split.
+    master_char_set: set of characters that form one class of tokens.
+
+  Returns:
+    List of string tokens; empty when text is empty.
+  """
+  if not text:
+    return []
+  in_master = [ch in master_char_set for ch in text]
+  tokens = []
+  start = 0
+  for pos, (prev, cur) in enumerate(zip(in_master, in_master[1:]), 1):
+    if prev == cur:
+      continue
+    piece = text[start:pos]
+    if start == 0 or piece != u" ":
+      tokens.append(piece)
+    start = pos
+  tokens.append(text[start:])
+  return tokens
+
+
+def _join_tokens_to_string(tokens, master_char_set):
+  """Join a list of string tokens into a single string.
+
+  A single space is inserted between two adjacent tokens when the first
+  character of both belongs to master_char_set.
+
+  Args:
+    tokens: list of non-empty string tokens.
+    master_char_set: set of characters that form one class of tokens.
+
+  Returns:
+    The joined string.
+  """
+  starts_in_master = [tok[0] in master_char_set for tok in tokens]
+  pieces = []
+  previous = False
+  for tok, current in zip(tokens, starts_in_master):
+    if previous and current:
+      pieces.append(u" ")
+    pieces.append(tok)
+    previous = current
+  return "".join(pieces)
+
+
+def _escape_token(token, alphabet):
+  r"""Escapes a token so it only uses alphabet characters and ends in "_".
+
+  Three transformations are applied, in order:
+    1. Backslash "\" becomes "\\" and underline "_" becomes "\u".
+    2. Every character that is outside of the alphabet, and every newline,
+       becomes "\###;" where ### is the character's Unicode code point.
+    3. "_" is appended to mark the end of the token.
+
+  Args:
+    token: unicode string to be escaped
+    alphabet: list of all known characters
+
+  Returns:
+    escaped string
+  """
+  escaped = token.replace(u"\\", u"\\\\")
+  escaped = escaped.replace(u"_", u"\\u")
+  pieces = []
+  for ch in escaped:
+    if ch != u"\n" and ch in alphabet:
+      pieces.append(ch)
+    else:
+      pieces.append(r"\%d;" % ord(ch))
+  return u"".join(pieces) + "_"
+
+
+def _unescape_token(token):
+  r"""Replaces escaped characters in the token with their unescaped versions.
+
+  Applies the inverse transformations of _escape_token():
+    1. Replace "\u" with "_", and "\\" with "\".
+    2. Replace "\###;" with the unicode character the ### refers to.
+
+  Args:
+    token: escaped string
+
+  Returns:
+    unescaped string
+  """
+
+  def replacement(m):
+    r"""Returns replacement string for matched object.
+
+    The match comes from the regex pattern:
+      r"\\u|\\\\|\\([0-9]+);"
+    so the matched string is '\u', '\\', or '\###;' (### is any digit number).
+
+    m.group(0) is the entire matched string. m.group(1) is the parenthesized
+    subgroup '###' and is None unless the matched string is '\###;'.
+
+    Note that in python, a single backslash is written as '\\', and double
+    backslash as '\\\\'.
+
+    Args:
+      m: match object
+
+    Returns:
+      String to replace matched object with.
+    """
+    code_point = m.group(1)
+    if code_point is not None:
+      # Matched '\###;': convert the digits to the character they encode.
+      try:
+        return six.unichr(int(code_point))
+      except (ValueError, OverflowError):
+        return _UNDEFINED_UNICODE
+
+    # Otherwise the match is '\u' or '\\'.
+    if m.group(0) == u"\\u":
+      return u"_"
+    return u"\\"
+
+  # Use the replacement function on every escaped substring in the token.
+  return _UNESCAPE_REGEX.sub(replacement, token)
+
+
+def _count_tokens(files,
+                  file_byte_limit=1e6,
+                  correct_strip=True,
+                  master_char_set=None):
+  """Return token counts of words in the files.
+
+  Samples file_byte_limit bytes from each file, and counts the words that appear
+  in the samples. The samples are semi-evenly distributed across the file: a
+  fixed number of lines is skipped between every sampled line.
+
+  Args:
+    files: List of filepaths
+    file_byte_limit: Max number of bytes that will be read from each file.
+    correct_strip: Whether to convert text to unicode before strip. This affects
+      vocabulary generation for PY2. Sets correct_strip to False in PY2 to
+      reproduce previous common public result. Sets correct_strip to True will
+      let PY2 and PY3 get a consistent vocabulary.
+    master_char_set: the char set.
+
+  Returns:
+    Dictionary mapping tokens to the number of times they appear in the sampled
+    lines from the files.
+  """
+  if master_char_set is None:
+    master_char_set = _ALPHANUMERIC_CHAR_SET
+
+  token_counts = collections.defaultdict(int)
+
+  for filepath in files:
+    with tf.io.gfile.GFile(filepath, mode="r") as reader:
+      remaining_budget = file_byte_limit
+      lines_to_skip = int(reader.size() / (remaining_budget * 2))
+      skipped = 0
+      for line in reader:
+        if skipped < lines_to_skip:
+          skipped += 1
+          continue
+        if remaining_budget < 0:
+          break
+
+        if correct_strip:
+          line = native_to_unicode(line)
+        line = line.strip()
+        remaining_budget -= len(line)
+        skipped = 0
+
+        # Add words to token counts
+        sampled_tokens = _split_string_to_tokens(
+            native_to_unicode(line), master_char_set)
+        for token in sampled_tokens:
+          token_counts[token] += 1
+  return token_counts
+
+
+def _list_to_index_dict(lst):
+  """Create dictionary mapping list items to their indices in the list.
+
+  When an item occurs more than once, its last index is kept.
+  """
+  index_by_item = {}
+  for index, item in enumerate(lst):
+    index_by_item[item] = index
+  return index_by_item
+
+
+def _split_token_to_subtokens(token, subtoken_dict, max_subtoken_length):
+  """Splits a token into subtokens defined in the subtoken dict.
+
+  The split is greedy: at every position the longest subtoken found in
+  subtoken_dict, of at most max_subtoken_length characters, is taken.
+
+  Args:
+    token: escaped string token to split.
+    subtoken_dict: container of known subtokens, tested with `in`.
+    max_subtoken_length: maximum length of any subtoken to try.
+
+  Returns:
+    List of subtoken strings that concatenate to token.
+
+  Raises:
+    ValueError: if no known subtoken starts at some position of the token.
+  """
+  pieces = []
+  pos = 0
+  token_len = len(token)
+  while pos < token_len:
+    # Start from the longest allowed candidate and shrink until one is known.
+    end = min(token_len, pos + max_subtoken_length)
+    while end > pos and token[pos:end] not in subtoken_dict:
+      end -= 1
+    if end <= pos:
+      # If there is no possible encoding of the escaped token then one of the
+      # characters in the token is not in the alphabet. This should be
+      # impossible and would be indicative of a bug.
+      raise ValueError("Was unable to split token \"%s\" into subtokens." %
+                       token)
+    pieces.append(token[pos:end])
+    pos = end
+  return pieces
+
+
+def _generate_subtokens_with_target_vocab_size(token_counts,
+                                               alphabet,
+                                               target_size,
+                                               threshold,
+                                               min_count=None,
+                                               reserved_tokens=None):
+  """Generate subtoken vocabulary close to the target size.
+
+  When min_count is given it is used directly. Otherwise min_count is binary
+  searched between _MIN_MIN_COUNT and _MAX_MIN_COUNT, and the vocabulary
+  whose size is closest to target_size is returned.
+
+  Args:
+    token_counts: dict mapping tokens to the number of times they appear.
+    alphabet: set of characters.
+    target_size: int target vocabulary size.
+    threshold: int; a vocabulary whose size differs from target_size by less
+      than this is accepted immediately.
+    min_count: optional int minimum subtoken count.
+    reserved_tokens: list of tokens placed at the beginning of the vocabulary.
+      Defaults to RESERVED_TOKENS.
+
+  Returns:
+    List of subtokens.
+  """
+  if reserved_tokens is None:
+    reserved_tokens = RESERVED_TOKENS
+
+  if min_count is not None:
+    logging.info("Using min_count=%d to generate vocab with target size %d",
+                 min_count, target_size)
+    return _generate_subtokens(
+        token_counts, alphabet, min_count, reserved_tokens=reserved_tokens)
+
+  def search(low, high):
+    """Recursive function to binary search for subtoken vocabulary."""
+    midpoint = (low + high) // 2
+    logging.info("Binary search: trying min_count=%d (%d %d)", midpoint,
+                 low, high)
+    candidate = _generate_subtokens(
+        token_counts, alphabet, midpoint, reserved_tokens=reserved_tokens)
+
+    size = len(candidate)
+    logging.info("Binary search: min_count=%d resulted in %d tokens", midpoint,
+                 size)
+
+    error = abs(size - target_size)
+    search_exhausted = low >= high or midpoint < 2
+    if error < threshold or search_exhausted:
+      return candidate
+
+    # A larger min_count yields a smaller vocabulary, and vice versa.
+    if size > target_size:
+      other = search(midpoint + 1, high)
+    else:
+      other = search(low, midpoint - 1)
+
+    # Return vocabulary dictionary with the closest number of tokens.
+    if abs(len(other) - target_size) < error:
+      return other
+    return candidate
+
+  logging.info("Finding best min_count to get target size of %d", target_size)
+  return search(_MIN_MIN_COUNT, _MAX_MIN_COUNT)
+
+
+def _generate_alphabet_dict(iterable, reserved_tokens=None):
+  """Create set of characters that appear in any element in the iterable.
+
+  The characters of the reserved tokens and the escape characters are always
+  part of the result.
+
+  Args:
+    iterable: iterable of string tokens.
+    reserved_tokens: list of tokens whose characters are added to the set.
+      Defaults to RESERVED_TOKENS.
+
+  Returns:
+    Set of characters.
+  """
+  if reserved_tokens is None:
+    reserved_tokens = RESERVED_TOKENS
+  alphabet = set()
+  for token in iterable:
+    alphabet.update(token)
+  for token in reserved_tokens:
+    alphabet.update(token)
+  alphabet.update(_ESCAPE_CHARS)  # Add escape characters to alphabet set.
+  return alphabet
+
+
+def _count_and_gen_subtokens(token_counts, alphabet, subtoken_dict,
+                             max_subtoken_length):
+  """Count number of times subtokens appear, and generate new subtokens.
+
+  Every token is escaped and split with the current subtoken_dict. For each
+  position where a subtoken of that split starts, every substring from that
+  position to any later position of the token is counted.
+
+  Args:
+    token_counts: dict mapping tokens to the number of times they appear in the
+      original files.
+    alphabet: list of allowed characters. Used to escape the tokens, which
+      guarantees that all tokens can be split into subtokens.
+    subtoken_dict: dict mapping subtokens to ids.
+    max_subtoken_length: maximum length of subtoken in subtoken_dict.
+
+  Returns:
+    A defaultdict mapping subtokens to the number of times they appear in the
+    tokens. The dict may contain new subtokens.
+  """
+  counts = collections.defaultdict(int)
+  for raw_token, occurrences in six.iteritems(token_counts):
+    escaped = _escape_token(raw_token, alphabet)
+    pieces = _split_token_to_subtokens(escaped, subtoken_dict,
+                                       max_subtoken_length)
+    escaped_len = len(escaped)
+
+    # Generate new subtokens by taking substrings from token.
+    offset = 0
+    for piece in pieces:
+      for stop in xrange(offset + 1, escaped_len + 1):
+        counts[escaped[offset:stop]] += occurrences
+      offset += len(piece)
+
+  return counts
+
+
+def _filter_and_bucket_subtokens(subtoken_counts, min_count):
+  """Return a bucketed list of subtokens that are filtered by count.
+
+  Args:
+    subtoken_counts: defaultdict mapping subtokens to their counts
+    min_count: int count used to filter subtokens
+
+  Returns:
+    List of subtoken sets, where subtokens in set i have the same length=i.
+    Only subtokens whose count is at least min_count are included.
+  """
+  buckets = []
+  for subtoken, count in six.iteritems(subtoken_counts):
+    if count >= min_count:
+      length = len(subtoken)
+      # Grow the list until bucket `length` exists.
+      missing = length + 1 - len(buckets)
+      if missing > 0:
+        buckets.extend(set() for _ in range(missing))
+      buckets[length].add(subtoken)
+  return buckets
+
+
+def _gen_new_subtoken_list(subtoken_counts,
+                           min_count,
+                           alphabet,
+                           reserved_tokens=None):
+  """Generate candidate subtokens ordered by count, and new max subtoken length.
+
+  Add subtokens to the candidate list in order of length (longest subtokens
+  first). When a subtoken is added, the counts of each of its prefixes are
+  decreased. Prefixes that don't appear much outside the subtoken are not added
+  to the candidate list.
+
+  For example:
+    subtoken being added to candidate list: 'translate'
+    subtoken_counts: {'translate':10, 't':40, 'tr':16, 'tra':12, ...}
+    min_count: 5
+
+  When 'translate' is added, its prefixes are updated to:
+    {'t':30, 'tr':6, 'tra': 2, ...}
+
+  The subtoken 'tra' will not be added to the candidate list, because it appears
+  twice (less than min_count) outside of 'translate'.
+
+  Note that subtoken_counts is modified in place.
+
+  Args:
+    subtoken_counts: defaultdict mapping str subtokens to int counts
+    min_count: int minimum count requirement for subtokens
+    alphabet: set of characters. Each character is added to the subtoken list to
+      guarantee that all tokens can be encoded.
+    reserved_tokens: list of tokens that will be added to the beginning of the
+      returned subtoken list.
+
+  Returns:
+    List of candidate subtokens in decreasing count order, and maximum subtoken
+    length
+  """
+  if reserved_tokens is None:
+    reserved_tokens = RESERVED_TOKENS
+
+  # buckets[i] = set(subtokens), where each subtoken has length i.
+  buckets = _filter_and_bucket_subtokens(subtoken_counts, min_count)
+  max_subtoken_length = len(buckets) - 1
+
+  # (count, subtoken) pairs for each candidate subtoken.
+  candidates = []
+
+  # Walk the buckets from longest to shortest subtokens.
+  length = max_subtoken_length
+  while length > 0:
+    for subtoken in buckets[length]:
+      count = subtoken_counts[subtoken]
+
+      # The count may have dropped below min_count since bucketing, because
+      # this subtoken is a prefix of a longer one handled earlier.
+      if count >= min_count:
+        # Alphabet/reserved tokens are added manually later.
+        added_manually = subtoken in alphabet or subtoken in reserved_tokens
+        if not added_manually:
+          candidates.append((count, subtoken))
+
+        # Decrement count of the subtoken's prefixes (if a longer subtoken is
+        # added, its prefixes lose priority to be added).
+        for prefix_len in xrange(1, length):
+          subtoken_counts[subtoken[:prefix_len]] -= count
+    length -= 1
+
+  # Add alphabet subtokens (guarantees that all strings are encodable).
+  candidates.extend((subtoken_counts.get(a, 0), a) for a in alphabet)
+
+  # Order subtoken candidates by decreasing count.
+  candidates.sort(reverse=True)
+
+  # Add reserved tokens to beginning of the list.
+  subtoken_list = reserved_tokens + [t for _, t in candidates]
+  return subtoken_list, max_subtoken_length
+
+
+def _generate_subtokens(token_counts,
+                        alphabet,
+                        min_count,
+                        num_iterations=4,
+                        reserved_tokens=None):
+  """Create a list of subtokens in decreasing order of frequency.
+
+  Starts from the reserved tokens plus the alphabet and refines the list for
+  num_iterations rounds.
+
+  Args:
+    token_counts: dict mapping str tokens -> int count
+    alphabet: set of characters
+    min_count: int minimum number of times a subtoken must appear before it is
+      added to the vocabulary.
+    num_iterations: int number of iterations to generate new tokens.
+    reserved_tokens: list of tokens that will be added to the beginning to the
+      returned subtoken list.
+
+  Returns:
+    Sorted list of subtokens (most frequent first)
+  """
+  if reserved_tokens is None:
+    reserved_tokens = RESERVED_TOKENS
+
+  # Use alphabet set to create initial list of subtokens
+  vocab = reserved_tokens + list(alphabet)
+  longest = 1
+
+  # On each iteration, segment all words using the current subtokens, count
+  # how often the resulting subtokens appear, and rebuild the list from the
+  # subtokens w/ high enough counts.
+  for iteration in xrange(num_iterations):
+    logging.info("\tGenerating subtokens: iteration %d", iteration)
+    # subtoken->count, with additional subtokens created from substrings
+    # taken from the tokens.
+    counts = _count_and_gen_subtokens(
+        token_counts, alphabet, _list_to_index_dict(vocab), longest)
+
+    # New list of subtokens sorted by subtoken count.
+    vocab, longest = _gen_new_subtoken_list(counts, min_count, alphabet,
+                                            reserved_tokens)
+
+    logging.info("\tVocab size: %d", len(vocab))
+  return vocab
--- a/official/legacy/transformer/utils/tokenizer_test.py
+++ b/official/legacy/transformer/utils/tokenizer_test.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Test Subtokenizer and string helper methods."""
+
+import collections
+import tempfile
+
+import tensorflow as tf
+
+from official.legacy.transformer.utils import tokenizer
+
+
+class SubtokenizerTest(tf.test.TestCase):
+  """Tests encoding and decoding with a small hand-written vocabulary."""
+
+  def _init_subtokenizer(self, vocab_list):
+    """Writes vocab_list to a temporary vocab file and loads a Subtokenizer."""
+    # Only the unique file name is needed, so close the handle right away and
+    # remove the file once the test has finished.
+    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
+      vocab_path = temp_file.name
+    self.addCleanup(tf.io.gfile.remove, vocab_path)
+
+    with tf.io.gfile.GFile(vocab_path, "w") as w:
+      for subtoken in vocab_list:
+        w.write("'%s'" % subtoken)
+        w.write("\n")
+    return tokenizer.Subtokenizer(vocab_path, reserved_tokens=[])
+
+  def test_encode(self):
+    vocab_list = ["123_", "test", "ing_"]
+    subtokenizer = self._init_subtokenizer(vocab_list)
+    s = "testing 123"
+    encoded_list = subtokenizer.encode(s)
+    self.assertEqual([1, 2, 0], encoded_list)
+
+  def test_decode(self):
+    vocab_list = ["123_", "test", "ing_"]
+    subtokenizer = self._init_subtokenizer(vocab_list)
+    encoded_list = [1, 2, 0]  # testing 123
+    decoded_str = subtokenizer.decode(encoded_list)
+    self.assertEqual("testing 123", decoded_str)
+
+  def test_subtoken_ids_to_tokens(self):
+    vocab_list = ["123_", "test", "ing_"]
+    subtokenizer = self._init_subtokenizer(vocab_list)
+    encoded_list = [1, 2, 0]  # testing 123
+    token_list = subtokenizer._subtoken_ids_to_tokens(encoded_list)
+    self.assertEqual([u"testing", u"123"], token_list)
+
+
+class StringHelperTest(tf.test.TestCase):
+
+  def test_split_string_to_tokens(self):
+    text = "test? testing 123."
+
+    tokens = tokenizer._split_string_to_tokens(text,
+                                               tokenizer._ALPHANUMERIC_CHAR_SET)
+    self.assertEqual(["test", "? ", "testing", "123", "."], tokens)
+
+  def test_join_tokens_to_string(self):
+    tokens = ["test", "? ", "testing", "123", "."]
+
+    s = tokenizer._join_tokens_to_string(tokens,
+                                         tokenizer._ALPHANUMERIC_CHAR_SET)
+    self.assertEqual("test? testing 123.", s)
+
+  def test_escape_token(self):
+    token = u"abc_\\4"
+    alphabet = set("abc_\\u;")
+
+    escaped_token = tokenizer._escape_token(token, alphabet)
+    self.assertEqual("abc\\u\\\\\\52;_", escaped_token)
+
+  def test_unescape_token(self):
+    escaped_token = u"Underline: \\u, Backslash: \\\\, Unicode: \\52;"
+
+    unescaped_token = tokenizer._unescape_token(escaped_token)
+    self.assertEqual("Underline: _, Backslash: \\, Unicode: 4", unescaped_token)
+
+  def test_list_to_index_dict(self):
+    lst = ["test", "strings"]
+
+    d = tokenizer._list_to_index_dict(lst)
+    self.assertDictEqual({"test": 0, "strings": 1}, d)
+
+  def test_split_token_to_subtokens(self):
+    token = "abc"
+    subtoken_dict = {"a": 0, "b": 1, "c": 2, "ab": 3}
+    max_subtoken_length = 2
+
+    subtokens = tokenizer._split_token_to_subtokens(token, subtoken_dict,
+                                                    max_subtoken_length)
+    self.assertEqual(["ab", "c"], subtokens)
+
+  def test_generate_alphabet_dict(self):
+    s = ["testing", "123"]
+    reserved_tokens = ["???"]
+
+    alphabet = tokenizer._generate_alphabet_dict(s, reserved_tokens)
+    self.assertIn("?", alphabet)
+    self.assertIn("t", alphabet)
+    self.assertIn("e", alphabet)
+    self.assertIn("s", alphabet)
+    self.assertIn("i", alphabet)
+    self.assertIn("n", alphabet)
+    self.assertIn("g", alphabet)
+    self.assertIn("1", alphabet)
+    self.assertIn("2", alphabet)
+    self.assertIn("3", alphabet)
+
+  def test_count_and_gen_subtokens(self):
+    token_counts = {"abc": 5}
+    alphabet = set("abc_")
+    subtoken_dict = {"a": 0, "b": 1, "c": 2, "_": 3}
+    max_subtoken_length = 2
+
+    subtoken_counts = tokenizer._count_and_gen_subtokens(
+        token_counts, alphabet, subtoken_dict, max_subtoken_length)
+
+    self.assertIsInstance(subtoken_counts, collections.defaultdict)
+    self.assertDictEqual(
+        {
+            "a": 5,
+            "b": 5,
+            "c": 5,
+            "_": 5,
+            "ab": 5,
+            "bc": 5,
+            "c_": 5,
+            "abc": 5,
+            "bc_": 5,
+            "abc_": 5
+        }, subtoken_counts)
+
+  def test_filter_and_bucket_subtokens(self):
+    subtoken_counts = collections.defaultdict(int, {
+        "a": 2,
+        "b": 4,
+        "c": 1,
+        "ab": 6,
+        "ac": 3,
+        "abbc": 5
+    })
+    min_count = 3
+
+    subtoken_buckets = tokenizer._filter_and_bucket_subtokens(
+        subtoken_counts, min_count)
+
+    self.assertEqual(len(subtoken_buckets[0]), 0)
+    self.assertEqual(set("b"), subtoken_buckets[1])
+    self.assertEqual(set(["ab", "ac"]), subtoken_buckets[2])
+    self.assertEqual(len(subtoken_buckets[3]), 0)
+    self.assertEqual(set(["abbc"]), subtoken_buckets[4])
+
+  def test_gen_new_subtoken_list(self):
+    subtoken_counts = collections.defaultdict(int, {
+        "translate": 10,
+        "t": 40,
+        "tr": 16,
+        "tra": 12
+    })
+    min_count = 5
+    alphabet = set("translate")
+    reserved_tokens = ["reserved", "tokens"]
+
+    subtoken_list, max_token_length = tokenizer._gen_new_subtoken_list(
+        subtoken_counts, min_count, alphabet, reserved_tokens)
+
+    # Check that "tra" isn"t in the list (its count should be decremented to 2,
+    # so it should not be added to the canddiate list).
+    self.assertNotIn("tra", subtoken_list)
+
+    self.assertIn("tr", subtoken_list)
+    self.assertIn("t", subtoken_list)
+
+    self.assertEqual(len("translate"), max_token_length)
+
+  def test_generate_subtokens(self):
+    token_counts = {"ab": 1, "bc": 3, "abc": 5}
+    alphabet = set("abc_")
+    min_count = 100
+    num_iterations = 1
+    reserved_tokens = ["reserved", "tokens"]
+
+    vocab_list = tokenizer._generate_subtokens(token_counts, alphabet,
+                                               min_count, num_iterations,
+                                               reserved_tokens)
+
+    # Check that reserved tokens are at the front of the list
+    self.assertEqual(vocab_list[:2], reserved_tokens)
+
+    # Check that each character in alphabet is in the vocab list
+    for c in alphabet:
+      self.assertIn(c, vocab_list)
+
+
+# Hand control to the TensorFlow test runner when executed as a script.
+if __name__ == "__main__":
+  tf.test.main()