[tfnlp] Remove nlp/transformer as we moved all usage to legacy/transformer

Please use Seq2SeqTransformer in tfnlp instead. PiperOrigin-RevId: 419604366

[tfnlp] Remove nlp/transformer as we moved all usage to legacy/transformer
Please use Seq2SeqTransformer in tfnlp instead. PiperOrigin-RevId: 419604366
6ce292df · Frederick Liu · A. Unique TensorFlower · d78ec6ea · d78ec6ea · d78ec6ea
Commit 6ce292df authored Jan 04, 2022 by Frederick Liu Committed by A. Unique TensorFlower Jan 04, 2022
7 changed files
--- a/official/nlp/transformer/transformer_main_test.py
+++ b/official/nlp/transformer/transformer_main_test.py
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Test Transformer model."""
-
-import os
-import re
-import sys
-import unittest
-
-from absl import flags
-from absl.testing import flagsaver
-import tensorflow as tf
-from tensorflow.python.eager import context  # pylint: disable=ungrouped-imports
-from official.nlp.transformer import misc
-from official.nlp.transformer import transformer_main
-
-# Handle to the global absl flag registry; the tests below assign flag values
-# on it directly.
-FLAGS = flags.FLAGS
-# Name of the sub-directory (under the test temp dir) used as `model_dir`.
-FIXED_TIMESTAMP = 'my_time_stamp'
-# Matches file names of the form `weights-epoch-<...>.hdf5`.
-# NOTE(review): not referenced anywhere else in this test file.
-WEIGHT_PATTERN = re.compile(r'weights-epoch-.+\.hdf5')
-
-
-def _generate_file(filepath, lines):
-  """Writes every item of `lines` to `filepath`, one item per line.
-
-  Args:
-    filepath: Path of the file to create (an existing file is overwritten).
-    lines: Iterable of values; each is formatted with `str.format` and
-      terminated with a newline.
-  """
-  with open(filepath, 'w') as out_file:
-    out_file.writelines('{}\n'.format(entry) for entry in lines)
-
-
-class TransformerTaskTest(tf.test.TestCase):
-  """Smoke tests that build a `TransformerTask` from flags and run it.
-
-  Every test starts from the small synthetic-data flag configuration applied
-  in `setUp`, tweaks a few flags, and then calls `train()`, `predict()` or
-  `eval()` on the task.
-  """
-  # Snapshot of the flag values taken the first time `setUp` runs. It is
-  # restored at the start of every later test so that flag changes made by one
-  # test do not leak into the next.
-  local_flags = None
-
-  def setUp(self):  # pylint: disable=g-missing-super-call
-    """Defines (first run) or restores the flags, then applies test values."""
-    temp_dir = self.get_temp_dir()
-    if TransformerTaskTest.local_flags is None:
-      misc.define_transformer_flags()
-      # Loads flags, array cannot be blank.
-      flags.FLAGS(['foo'])
-      TransformerTaskTest.local_flags = flagsaver.save_flag_values()
-    else:
-      flagsaver.restore_flag_values(TransformerTaskTest.local_flags)
-    # Smallest possible run: one train step, one validation step, synthetic
-    # data, no distribution strategy.
-    FLAGS.model_dir = os.path.join(temp_dir, FIXED_TIMESTAMP)
-    FLAGS.param_set = 'tiny'
-    FLAGS.use_synthetic_data = True
-    FLAGS.steps_between_evals = 1
-    FLAGS.train_steps = 1
-    FLAGS.validation_steps = 1
-    FLAGS.batch_size = 4
-    FLAGS.max_length = 1
-    FLAGS.num_gpus = 1
-    FLAGS.distribution_strategy = 'off'
-    FLAGS.dtype = 'fp32'
-    self.model_dir = FLAGS.model_dir
-    self.temp_dir = temp_dir
-    self.vocab_file = os.path.join(temp_dir, 'vocab')
-    self.vocab_size = misc.get_model_params(FLAGS.param_set, 0)['vocab_size']
-    self.bleu_source = os.path.join(temp_dir, 'bleu_source')
-    self.bleu_ref = os.path.join(temp_dir, 'bleu_ref')
-    # Remember the global mixed-precision policy so tearDown can put it back.
-    self.orig_policy = (
-        tf.compat.v2.keras.mixed_precision.global_policy())
-
-  def tearDown(self):  # pylint: disable=g-missing-super-call
-    """Restores the mixed-precision policy that was active before the test."""
-    tf.compat.v2.keras.mixed_precision.set_global_policy(self.orig_policy)
-
-  def _assert_exists(self, filepath):
-    """Asserts that `filepath` exists on disk."""
-    # NOTE(review): not called by any test in this class.
-    self.assertTrue(os.path.exists(filepath))
-
-  def test_train_no_dist_strat(self):
-    """Trains with the default flags from `setUp` (no distribution strategy)."""
-    if context.num_gpus() >= 2:
-      self.skipTest('No need to test 2+ GPUs without a distribution strategy.')
-    t = transformer_main.TransformerTask(FLAGS)
-    t.train()
-
-  def test_train_save_full_model(self):
-    """Trains with `save_weights_only` switched off."""
-    if context.num_gpus() >= 2:
-      self.skipTest('No need to test 2+ GPUs without a distribution strategy.')
-    FLAGS.save_weights_only = False
-    t = transformer_main.TransformerTask(FLAGS)
-    t.train()
-
-  def test_train_static_batch(self):
-    """Trains with `static_batch` enabled under the one-device strategy."""
-    if context.num_gpus() >= 2:
-      self.skipTest('No need to test 2+ GPUs without a distribution strategy.')
-    FLAGS.distribution_strategy = 'one_device'
-    # Request a GPU only when TensorFlow was built with CUDA support.
-    if tf.test.is_built_with_cuda():
-      FLAGS.num_gpus = 1
-    else:
-      FLAGS.num_gpus = 0
-    FLAGS.static_batch = True
-    t = transformer_main.TransformerTask(FLAGS)
-    t.train()
-
-  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
-  def test_train_1_gpu_with_dist_strat(self):
-    """Trains under the one-device strategy (CUDA builds only)."""
-    FLAGS.distribution_strategy = 'one_device'
-    t = transformer_main.TransformerTask(FLAGS)
-    t.train()
-
-  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
-  def test_train_fp16(self):
-    """Trains under the one-device strategy with dtype fp16."""
-    FLAGS.distribution_strategy = 'one_device'
-    FLAGS.dtype = 'fp16'
-    t = transformer_main.TransformerTask(FLAGS)
-    t.train()
-
-  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
-  def test_train_2_gpu(self):
-    """Trains the 'base' param set with the mirrored strategy on 2 GPUs."""
-    if context.num_gpus() < 2:
-      self.skipTest(
-          '{} GPUs are not available for this test. {} GPUs are available'
-          .format(2, context.num_gpus()))
-    FLAGS.distribution_strategy = 'mirrored'
-    FLAGS.num_gpus = 2
-    FLAGS.param_set = 'base'
-    t = transformer_main.TransformerTask(FLAGS)
-    t.train()
-
-  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
-  def test_train_2_gpu_fp16(self):
-    """Same as `test_train_2_gpu`, but with dtype fp16."""
-    if context.num_gpus() < 2:
-      self.skipTest(
-          '{} GPUs are not available for this test. {} GPUs are available'
-          .format(2, context.num_gpus()))
-    FLAGS.distribution_strategy = 'mirrored'
-    FLAGS.num_gpus = 2
-    FLAGS.param_set = 'base'
-    FLAGS.dtype = 'fp16'
-    t = transformer_main.TransformerTask(FLAGS)
-    t.train()
-
-  def _prepare_files_and_flags(self, *extra_flags):
-    """Writes fake vocab/BLEU files and re-parses flags pointing at them.
-
-    Args:
-      *extra_flags: Additional command-line style flag strings (for example
-        '--dtype=fp16') appended to the argument list that is parsed.
-    """
-    # Make log dir.
-    if not os.path.exists(self.temp_dir):
-      os.makedirs(self.temp_dir)
-
-    # Fake vocab, bleu_source and bleu_ref.
-    tokens = [
-        "'<pad>'", "'<EOS>'", "'_'", "'a'", "'b'", "'c'", "'d'", "'a_'", "'b_'",
-        "'c_'", "'d_'"
-    ]
-    # Fill the vocab with numbered dummy tokens up to `self.vocab_size` entries.
-    tokens += ["'{}'".format(i) for i in range(self.vocab_size - len(tokens))]
-    _generate_file(self.vocab_file, tokens)
-    _generate_file(self.bleu_source, ['a b', 'c d'])
-    _generate_file(self.bleu_ref, ['a b', 'd c'])
-
-    # Update flags.
-    update_flags = [
-        'ignored_program_name',
-        '--vocab_file={}'.format(self.vocab_file),
-        '--bleu_source={}'.format(self.bleu_source),
-        '--bleu_ref={}'.format(self.bleu_ref),
-    ]
-    if extra_flags:
-      update_flags.extend(extra_flags)
-    FLAGS(update_flags)
-
-  def test_predict(self):
-    """Runs `predict()` against the fake vocab and BLEU files."""
-    if context.num_gpus() >= 2:
-      self.skipTest('No need to test 2+ GPUs without a distribution strategy.')
-    self._prepare_files_and_flags()
-    t = transformer_main.TransformerTask(FLAGS)
-    t.predict()
-
-  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
-  def test_predict_fp16(self):
-    """Runs `predict()` with `--dtype=fp16` (CUDA builds only)."""
-    if context.num_gpus() >= 2:
-      self.skipTest('No need to test 2+ GPUs without a distribution strategy.')
-    self._prepare_files_and_flags('--dtype=fp16')
-    t = transformer_main.TransformerTask(FLAGS)
-    t.predict()
-
-  def test_eval(self):
-    """Runs `eval()`; skipped when the test binary name contains 'test_xla'."""
-    if context.num_gpus() >= 2:
-      self.skipTest('No need to test 2+ GPUs without a distribution strategy.')
-    if 'test_xla' in sys.argv[0]:
-      self.skipTest('TODO(xla): Make this test faster under XLA.')
-    self._prepare_files_and_flags()
-    t = transformer_main.TransformerTask(FLAGS)
-    t.eval()
-
-
-# Run the test cases through TensorFlow's test runner when executed directly.
-if __name__ == '__main__':
-  tf.test.main()
--- a/official/nlp/transformer/transformer_test.py
+++ b/official/nlp/transformer/transformer_test.py
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Test Transformer model."""
-
-import tensorflow as tf
-
-from official.nlp.transformer import model_params
-from official.nlp.transformer import transformer
-
-
-class TransformerV2Test(tf.test.TestCase):
-  """Checks model construction and SavedModel export of the Transformer."""
-
-  def setUp(self):
-    """Builds a small parameter set shared by all tests in this class."""
-    super().setUp()
-    # Work on a copy: assigning into `model_params.TINY_PARAMS` itself would
-    # change the module-level defaults for every other user in this process.
-    self.params = params = model_params.TINY_PARAMS.copy()
-    params["batch_size"] = params["default_batch_size"] = 16
-    params["use_synthetic_data"] = True
-    params["hidden_size"] = 12
-    params["num_hidden_layers"] = 2
-    params["filter_size"] = 14
-    params["num_heads"] = 2
-    params["vocab_size"] = 41
-    params["extra_decode_length"] = 2
-    params["beam_size"] = 3
-    params["dtype"] = tf.float32
-
-  def test_create_model_train(self):
-    """The training model has two int64 inputs and one float32 logits output."""
-    model = transformer.create_model(self.params, True)
-    inputs, outputs = model.inputs, model.outputs
-    self.assertEqual(len(inputs), 2)
-    self.assertEqual(len(outputs), 1)
-    self.assertEqual(inputs[0].shape.as_list(), [None, None])
-    self.assertEqual(inputs[0].dtype, tf.int64)
-    self.assertEqual(inputs[1].shape.as_list(), [None, None])
-    self.assertEqual(inputs[1].dtype, tf.int64)
-    # Last output dimension equals params["vocab_size"] (41).
-    self.assertEqual(outputs[0].shape.as_list(), [None, None, 41])
-    self.assertEqual(outputs[0].dtype, tf.float32)
-
-  def test_create_model_not_train(self):
-    """The inference model has one int64 input and two outputs."""
-    model = transformer.create_model(self.params, False)
-    inputs, outputs = model.inputs, model.outputs
-    self.assertEqual(len(inputs), 1)
-    self.assertEqual(len(outputs), 2)
-    self.assertEqual(inputs[0].shape.as_list(), [None, None])
-    self.assertEqual(inputs[0].dtype, tf.int64)
-    self.assertEqual(outputs[0].shape.as_list(), [None, None])
-    self.assertEqual(outputs[0].dtype, tf.int32)
-    self.assertEqual(outputs[1].shape.as_list(), [None])
-    self.assertEqual(outputs[1].dtype, tf.float32)
-
-  def test_export(self):
-    """Saves the model as a SavedModel, reloads it and checks output shape."""
-    model = transformer.Transformer(self.params, name="transformer_v2")
-    export_dir = self.get_temp_dir()
-    batch_size = 5
-    max_length = 6
-
-    class SaveModule(tf.Module):
-      """Wraps the model so `serve` can be exported as a signature."""
-
-      def __init__(self, model):
-        super().__init__()
-        self.model = model
-
-      @tf.function
-      def serve(self, x):
-        return self.model.call([x], training=False)
-
-    save_module = SaveModule(model)
-    tensor_shape = (None, None)
-    sample_input = tf.zeros((batch_size, max_length), dtype=tf.int64)
-    # Call once before exporting so the model is run on a concrete input.
-    _ = save_module.serve(sample_input)
-    signatures = dict(
-        serving_default=save_module.serve.get_concrete_function(
-            tf.TensorSpec(shape=tensor_shape, dtype=tf.int64, name="x")))
-    tf.saved_model.save(save_module, export_dir, signatures=signatures)
-    imported = tf.saved_model.load(export_dir)
-    serving_fn = imported.signatures["serving_default"]
-    all_outputs = serving_fn(sample_input)
-    output = all_outputs["outputs"]
-    output_shapes = output.shape.as_list()
-    self.assertEqual(output_shapes[0], batch_size)
-    # Expected decoded length is the input length plus extra_decode_length.
-    self.assertEqual(output_shapes[1],
-                     max_length + model.params["extra_decode_length"])
-
-
-# Run the test cases through TensorFlow's test runner when executed directly.
-if __name__ == "__main__":
-  tf.test.main()
--- a/official/nlp/transformer/translate.py
+++ b/official/nlp/transformer/translate.py
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Translate text or files using trained transformer model."""
-
-# Import libraries
-from absl import logging
-import numpy as np
-import tensorflow as tf
-
-from official.nlp.transformer.utils import tokenizer
-
-# Module-level decoding constants.
-# NOTE(review): none of these three names is referenced in this module;
-# `translate_file` reads its settings from the `params` dictionary instead.
-_EXTRA_DECODE_LENGTH = 100
-_BEAM_SIZE = 4
-_ALPHA = 0.6
-
-
-def _get_sorted_inputs(filename):
-  """Reads the lines of a file and orders them by decreasing token count.
-
-  Args:
-    filename: String name of file to read inputs from.
-  Returns:
-    A pair `(sorted_inputs, sorted_keys)`. `sorted_inputs` is the list of
-    stripped lines, longest (by whitespace-separated token count) first.
-    `sorted_keys` is a list in which entry `k` is the position inside
-    `sorted_inputs` of the line that was at index `k` in the file.
-  """
-  with tf.io.gfile.GFile(filename) as f:
-    inputs = [record.strip() for record in f.read().split("\n")]
-    # Drop the empty entry produced by a trailing newline.
-    if not inputs[-1]:
-      inputs.pop()
-
-  # `sorted` is stable, so lines with equal token counts keep file order.
-  order = sorted(
-      range(len(inputs)),
-      key=lambda idx: len(inputs[idx].split()),
-      reverse=True)
-
-  sorted_inputs = [inputs[idx] for idx in order]
-  sorted_keys = [0] * len(order)
-  for new_position, original_index in enumerate(order):
-    sorted_keys[original_index] = new_position
-  return sorted_inputs, sorted_keys
-
-
-def _encode_and_add_eos(line, subtokenizer):
-  """Returns `subtokenizer.encode(line)` with `tokenizer.EOS_ID` appended."""
-  token_ids = subtokenizer.encode(line)
-  return token_ids + [tokenizer.EOS_ID]
-
-
-def _trim_and_decode(ids, subtokenizer):
-  """Decodes `ids` up to (not including) the first EOS id.
-
-  If `ids` contains no EOS id, the whole sequence is decoded.
-  """
-  try:
-    eos_position = list(ids).index(tokenizer.EOS_ID)
-    trimmed_ids = ids[:eos_position]
-    return subtokenizer.decode(trimmed_ids)
-  except ValueError:  # No EOS found in sequence
-    return subtokenizer.decode(ids)
-
-
-def translate_file(model,
-                   params,
-                   subtokenizer,
-                   input_file,
-                   output_file=None,
-                   print_all_translations=True,
-                   distribution_strategy=None):
-  """Translate lines in file, and save to output file if specified.
-
-  Args:
-    model: A Keras model, used to generate the translations.
-    params: A dictionary, containing the translation related parameters. This
-      function reads the keys "decode_batch_size" and "decode_max_length".
-    subtokenizer: A subtokenizer object, used for encoding and decoding source
-      and translated lines.
-    input_file: A file containing lines to translate.
-    output_file: A file that stores the generated translations.
-    print_all_translations: A bool. If true, every input/translation pair is
-      logged with `logging.info`.
-    distribution_strategy: A distribution strategy, used to perform inference
-      directly with tf.function instead of Keras model.predict().
-
-  Raises:
-    ValueError: if output file is invalid.
-  """
-  batch_size = params["decode_batch_size"]
-
-  # Read and sort inputs by length. Keep a list (original index-->new index
-  # in sorted list) to write translations in the original order.
-  sorted_inputs, sorted_keys = _get_sorted_inputs(input_file)
-  total_samples = len(sorted_inputs)
-  # Ceiling division: number of batches needed to cover every sample.
-  num_decode_batches = (total_samples - 1) // batch_size + 1
-
-  def input_generator():
-    """Yield encoded strings from sorted_inputs."""
-    for i in range(num_decode_batches):
-      # The final batch may contain fewer than `batch_size` lines.
-      lines = [
-          sorted_inputs[j + i * batch_size]
-          for j in range(batch_size)
-          if j + i * batch_size < total_samples
-      ]
-      lines = [_encode_and_add_eos(l, subtokenizer) for l in lines]
-      if distribution_strategy:
-        # Fill a short final batch with EOS-only rows so it always has
-        # `batch_size` rows; the reshape in the main loop below splits the
-        # batch evenly across replicas.
-        for j in range(batch_size - len(lines)):
-          lines.append([tokenizer.EOS_ID])
-      batch = tf.keras.preprocessing.sequence.pad_sequences(
-          lines,
-          maxlen=params["decode_max_length"],
-          dtype="int32",
-          padding="post")
-      # NOTE(review): `i` is zero-based, so the last batch is logged as
-      # "N-1 out of N".
-      logging.info("Decoding batch %d out of %d.", i, num_decode_batches)
-      yield batch
-
-  # Only invoked on the `distribution_strategy` path below.
-  @tf.function
-  def predict_step(inputs):
-    """Decoding step function for TPU runs."""
-
-    def _step_fn(inputs):
-      """Per replica step function."""
-      # `inputs` is the (replica_id, text) pair built by text_as_per_replica.
-      tag = inputs[0]
-      val_inputs = inputs[1]
-      val_outputs, _ = model([val_inputs], training=False)
-      return tag, val_outputs
-
-    return distribution_strategy.run(_step_fn, args=(inputs,))
-
-  translations = []
-  if distribution_strategy:
-    num_replicas = distribution_strategy.num_replicas_in_sync
-    local_batch_size = params["decode_batch_size"] // num_replicas
-  for i, text in enumerate(input_generator()):
-    if distribution_strategy:
-      text = np.reshape(text, [num_replicas, local_batch_size, -1])
-      # Add tag to the input of each replica with the reordering logic after
-      # outputs, to ensure the output order matches the input order.
-      text = tf.constant(text)
-
-      # Defined inside the loop because it closes over this iteration's
-      # `text` tensor.
-      @tf.function
-      def text_as_per_replica():
-        replica_context = tf.distribute.get_replica_context()
-        replica_id = replica_context.replica_id_in_sync_group
-        return replica_id, text[replica_id]  # pylint: disable=cell-var-from-loop
-
-      text = distribution_strategy.run(text_as_per_replica)
-      outputs = distribution_strategy.experimental_local_results(
-          predict_step(text))
-      # Drop the replica tag and keep only the decoded ids.
-      val_outputs = [output for _, output in outputs]
-
-      val_outputs = np.reshape(val_outputs, [params["decode_batch_size"], -1])
-    else:
-      val_outputs, _ = model.predict(text)
-
-    length = len(val_outputs)
-    for j in range(length):
-      # Skip the filler rows that were added to complete the final batch.
-      if j + i * batch_size < total_samples:
-        translation = _trim_and_decode(val_outputs[j], subtokenizer)
-        translations.append(translation)
-        if print_all_translations:
-          logging.info("Translating:\n\tInput: %s\n\tOutput: %s",
-                       sorted_inputs[j + i * batch_size], translation)
-
-  # Write translations in the order they appeared in the original file.
-  if output_file is not None:
-    if tf.io.gfile.isdir(output_file):
-      raise ValueError("File output is a directory, will not save outputs to "
-                       "file.")
-    logging.info("Writing to file %s", output_file)
-    with tf.io.gfile.GFile(output_file, "w") as f:
-      # sorted_keys[k] is the index in `translations` of original line k.
-      for i in sorted_keys:
-        f.write("%s\n" % translations[i])
-
-
-def translate_from_text(model, subtokenizer, txt):
-  """Translates the string `txt` and logs the original and the translation.
-
-  Args:
-    model: Object whose `predict` result is indexed with the key "outputs".
-    subtokenizer: Subtokenizer used to encode `txt` and decode the result.
-    txt: The text to translate.
-  """
-  token_ids = _encode_and_add_eos(txt, subtokenizer)
-  outputs = model.predict(token_ids)["outputs"]
-  logging.info("Original: \"%s\"", txt)
-  translate_from_input(outputs, subtokenizer)
-
-
-def translate_from_input(outputs, subtokenizer):
-  """Decodes `outputs` (trimmed at the first EOS) and logs the translation."""
-  logging.info("Translation: \"%s\"", _trim_and_decode(outputs, subtokenizer))
--- a/official/nlp/transformer/utils/__init__.py
+++ b/official/nlp/transformer/utils/__init__.py
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
--- a/official/nlp/transformer/utils/metrics.py
+++ b/official/nlp/transformer/utils/metrics.py
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Functions for calculating loss, accuracy, and other model metrics.
-
-Metrics:
- - Padded loss, accuracy, and negative log perplexity. Source:
-     https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/metrics.py
- - BLEU approximation. Source:
-     https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/bleu_hook.py
- - ROUGE score. Source:
-     https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/rouge.py
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import collections
-import math
-
-import numpy as np
-import six
-from six.moves import xrange  # pylint: disable=redefined-builtin
-import tensorflow.compat.v1 as tf
-
-
-def _pad_tensors_to_same_length(x, y):
-  """Zero-pads x and y along dimension 1 so both reach the longer length.
-
-  `x` is padded as a rank-3 tensor and `y` as a rank-2 tensor, as fixed by the
-  padding specs passed to `tf.pad` below.
-  """
-  with tf.name_scope("pad_to_same_length"):
-    x_length = tf.shape(x)[1]
-    y_length = tf.shape(y)[1]
-
-    target_length = tf.maximum(x_length, y_length)
-
-    x_paddings = [[0, 0], [0, target_length - x_length], [0, 0]]
-    y_paddings = [[0, 0], [0, target_length - y_length]]
-    return tf.pad(x, x_paddings), tf.pad(y, y_paddings)
-
-
-def padded_cross_entropy_loss(logits, labels, smoothing, vocab_size):
-  """Calculate cross entropy loss while ignoring padding.
-
-  Positions whose label is 0 are treated as padding and receive weight 0.
-
-  Args:
-    logits: Tensor of size [batch_size, length_logits, vocab_size]
-    labels: Tensor of size [batch_size, length_labels]
-    smoothing: Label smoothing constant, used to determine the on and off values
-    vocab_size: int size of the vocabulary
-  Returns:
-    Returns the cross entropy loss and weight tensors: float32 tensors with
-      shape [batch_size, max(length_logits, length_labels)]
-  """
-  with tf.name_scope("loss", values=[logits, labels]):
-    logits, labels = _pad_tensors_to_same_length(logits, labels)
-
-    # Calculate smoothing cross entropy
-    with tf.name_scope("smoothing_cross_entropy", values=[logits, labels]):
-      # The true class gets `confidence`; the remaining `smoothing` mass is
-      # spread evenly over the other vocab_size - 1 classes.
-      confidence = 1.0 - smoothing
-      low_confidence = (1.0 - confidence) / tf.cast(vocab_size - 1, tf.float32)
-      soft_targets = tf.one_hot(
-          tf.cast(labels, tf.int32),
-          depth=vocab_size,
-          on_value=confidence,
-          off_value=low_confidence)
-      xentropy = tf.nn.softmax_cross_entropy_with_logits_v2(
-          logits=logits, labels=soft_targets)
-
-      # Calculate the best (lowest) possible value of cross entropy, and
-      # subtract from the cross entropy loss.
-      # The 1e-20 keeps tf.log finite when low_confidence is 0 (smoothing == 0).
-      normalizing_constant = -(
-          confidence * tf.log(confidence) + tf.cast(vocab_size - 1, tf.float32)
-          * low_confidence * tf.log(low_confidence + 1e-20))
-      xentropy -= normalizing_constant
-
-    # 1.0 for real tokens, 0.0 for padding (label id 0).
-    weights = tf.cast(tf.not_equal(labels, 0), tf.float32)
-    return xentropy * weights, weights
-
-
-def _convert_to_eval_metric(metric_fn):
-  """Turns a per-batch (scores, weights) function into an eval metric function.
-
-  `metric_fn` only reports values for the batch it is given. The returned
-  wrapper feeds those values into `tf.metrics.mean`, which aggregates them
-  over all evaluated batches.
-
-  Args:
-    metric_fn: function that returns scores and weights for the current batch's
-      logits and predicted labels.
-
-  Returns:
-    function that aggregates the scores and weights from metric_fn.
-  """
-
-  def problem_metric_fn(*args):
-    """Returns the weighted running mean of metric_fn's batch values."""
-    batch_scores, batch_weights = metric_fn(*args)
-    return tf.metrics.mean(batch_scores, batch_weights)
-
-  return problem_metric_fn
-
-
-def get_eval_metrics(logits, labels, params):
-  """Builds the dictionary of evaluation metrics for the model.
-
-  Args:
-    logits: Model logits passed to every metric function.
-    labels: Target labels passed to every metric function.
-    params: Dictionary; the keys "vocab_size" and "use_tpu" are read.
-
-  Returns:
-    Dictionary mapping "metrics/<name>" to the corresponding eval metric.
-  """
-  metrics = {}
-  metrics["accuracy"] = _convert_to_eval_metric(padded_accuracy)(
-      logits, labels)
-  metrics["accuracy_top5"] = _convert_to_eval_metric(padded_accuracy_top5)(
-      logits, labels)
-  metrics["accuracy_per_sequence"] = _convert_to_eval_metric(
-      padded_sequence_accuracy)(logits, labels)
-  metrics["neg_log_perplexity"] = _convert_to_eval_metric(
-      padded_neg_log_perplexity)(logits, labels, params["vocab_size"])
-
-  if not params["use_tpu"]:
-    # TPU does not support tf.py_func
-    py_func_metrics = (
-        ("approx_bleu_score", bleu_score),
-        ("rouge_2_fscore", rouge_2_fscore),
-        ("rouge_L_fscore", rouge_l_fscore),
-    )
-    for metric_name, metric_fn in py_func_metrics:
-      metrics[metric_name] = _convert_to_eval_metric(metric_fn)(logits, labels)
-
-  # The "metrics/" prefix groups these graphs under the "metrics" category in
-  # TensorBoard.
-  return {"metrics/%s" % name: value
-          for name, value in six.iteritems(metrics)}
-
-
-def padded_accuracy(logits, labels):
-  """Per-position correctness of the argmax prediction, ignoring label 0.
-
-  Returns:
-    A pair (correct, weights): `correct` is 1.0 where the argmax of `logits`
-    equals the label, and `weights` is 1.0 where the label is non-zero.
-  """
-  with tf.variable_scope("padded_accuracy", values=[logits, labels]):
-    logits, labels = _pad_tensors_to_same_length(logits, labels)
-    non_pad_mask = tf.cast(tf.not_equal(labels, 0), tf.float32)
-    predicted_ids = tf.cast(tf.argmax(logits, axis=-1), tf.int32)
-    label_ids = tf.cast(labels, tf.int32)
-    correct = tf.cast(tf.equal(predicted_ids, label_ids), tf.float32)
-    return correct, non_pad_mask
-
-
-def padded_accuracy_topk(logits, labels, k):
-  """Per-position check that the label is among the top-k logits.
-
-  `k` is capped at the size of the last dimension of `logits`.
-
-  Returns:
-    A pair (in_topk, weights): `in_topk` counts the top-k candidates equal to
-    the label at each position, and `weights` is 1.0 where the label is
-    non-zero.
-  """
-  with tf.variable_scope("padded_accuracy_topk", values=[logits, labels]):
-    logits, labels = _pad_tensors_to_same_length(logits, labels)
-    non_pad_mask = tf.cast(tf.not_equal(labels, 0), tf.float32)
-    num_candidates = tf.minimum(k, tf.shape(logits)[-1])
-    top_ids = tf.cast(tf.nn.top_k(logits, k=num_candidates)[1], tf.int32)
-    label_ids = tf.expand_dims(tf.cast(labels, tf.int32), axis=-1)
-    # Broadcast the labels against the candidate axis.
-    label_ids = label_ids + tf.zeros_like(top_ids)
-    hits = tf.cast(tf.equal(top_ids, label_ids), tf.float32)
-    return tf.reduce_sum(hits, axis=-1), non_pad_mask
-
-
-def padded_accuracy_top5(logits, labels):
-  """Top-5 variant of `padded_accuracy_topk`."""
-  return padded_accuracy_topk(logits, labels, k=5)
-
-
-def padded_sequence_accuracy(logits, labels):
-  """Per-sequence correctness: 1.0 only if every non-0 position matches.
-
-  Returns:
-    A pair (correct_sequences, weight) where `weight` is the constant 1.0.
-  """
-  with tf.variable_scope("padded_sequence_accuracy", values=[logits, labels]):
-    logits, labels = _pad_tensors_to_same_length(logits, labels)
-    non_pad_mask = tf.cast(tf.not_equal(labels, 0), tf.float32)
-    predicted_ids = tf.cast(tf.argmax(logits, axis=-1), tf.int32)
-    label_ids = tf.cast(labels, tf.int32)
-    mismatches = tf.cast(tf.not_equal(predicted_ids, label_ids), tf.float32)
-    mismatches = mismatches * non_pad_mask
-    # Sum the errors over every axis except the batch axis.
-    non_batch_axes = list(range(1, len(predicted_ids.get_shape())))
-    errors_per_sequence = tf.reduce_sum(mismatches, axis=non_batch_axes)
-    correct_sequences = 1.0 - tf.minimum(1.0, errors_per_sequence)
-    return correct_sequences, tf.constant(1.0)
-
-
-def padded_neg_log_perplexity(logits, labels, vocab_size):
-  """Negated padded cross entropy (smoothing 0) together with its weights."""
-  xentropy, weights = padded_cross_entropy_loss(logits, labels, 0, vocab_size)
-  return -xentropy, weights
-
-
-def bleu_score(logits, labels):
-  """Approximate BLEU score between the labels and the argmax predictions.
-
-  The score is approximate because word pieces are not glued back together
-  and the ids are not decoded and re-tokenized. `compute_bleu` is called with
-  its defaults (n-gram order 4, brevity penalty on). Predictions are a plain
-  argmax over the logits; no beam search is involved.
-
-  Args:
-    logits: Tensor of size [batch_size, length_logits, vocab_size]
-    labels: Tensor of size [batch-size, length_labels]
-
-  Returns:
-    A pair (bleu, weight): `bleu` is the float32 approximate BLEU score and
-    `weight` is the constant 1.0.
-  """
-  predicted_ids = tf.cast(tf.argmax(logits, axis=-1), tf.int32)
-  # TODO: Look into removing use of py_func  # pylint: disable=g-bad-todo
-  score = tf.py_func(compute_bleu, (labels, predicted_ids), tf.float32)
-  return score, tf.constant(1.0)
-
-
-def _get_ngrams_with_counter(segment, max_order):
-  """Extracts all n-grams up to a given maximum order from an input segment.
-
-  Args:
-    segment: text segment from which n-grams will be extracted.
-    max_order: maximum length in tokens of the n-grams returned by this
-        method.
-
-  Returns:
-    The Counter containing all n-grams up to max_order in segment
-    with a count of how many times each n-gram occurred.
-  """
-  ngram_counts = collections.Counter()
-  for order in range(1, max_order + 1):
-    ngram_counts.update(
-        tuple(segment[start:start + order])
-        for start in range(len(segment) - order + 1))
-  return ngram_counts
-
-
-def compute_bleu(reference_corpus, translation_corpus, max_order=4,
-                 use_bp=True):
-  """Computes BLEU score of translated segments against one or more references.
-
-  Args:
-    reference_corpus: list of references for each translation. Each
-        reference should be tokenized into a list of tokens.
-    translation_corpus: list of translations to score. Each translation
-        should be tokenized into a list of tokens.
-    max_order: Maximum n-gram order to use when computing BLEU score.
-    use_bp: boolean, whether to apply brevity penalty.
-
-  Returns:
-    BLEU score as a `np.float32`. Empty corpora and empty translations score
-    0.0 instead of raising ZeroDivisionError.
-  """
-  reference_length = 0
-  translation_length = 0
-  matches_by_order = [0] * max_order
-  possible_matches_by_order = [0] * max_order
-
-  for references, translations in zip(reference_corpus, translation_corpus):
-    reference_length += len(references)
-    translation_length += len(translations)
-    ref_ngram_counts = _get_ngrams_with_counter(references, max_order)
-    translation_ngram_counts = _get_ngrams_with_counter(translations, max_order)
-
-    # Counter intersection keeps, per n-gram, the smaller of the two counts.
-    overlap = ref_ngram_counts & translation_ngram_counts
-    for ngram, count in overlap.items():
-      matches_by_order[len(ngram) - 1] += count
-    for ngram, count in translation_ngram_counts.items():
-      possible_matches_by_order[len(ngram) - 1] += count
-
-  precisions = [0.0] * max_order
-  smooth = 1.0
-  for i in range(max_order):
-    if possible_matches_by_order[i] > 0:
-      if matches_by_order[i] > 0:
-        precisions[i] = float(matches_by_order[i]) / possible_matches_by_order[i]
-      else:
-        # No match at this order: substitute a small value that is halved for
-        # each further order without matches.
-        smooth *= 2
-        precisions[i] = 1.0 / (smooth * possible_matches_by_order[i])
-
-  geo_mean = 0.0
-  if any(p > 0 for p in precisions):
-    # Orders with zero precision are left out of the log sum, but the mean is
-    # still taken over max_order terms.
-    p_log_sum = sum(math.log(p) for p in precisions if p)
-    geo_mean = math.exp(p_log_sum / max_order)
-
-  bp = 1.0
-  # With no reference tokens the length ratio is undefined; leave bp at 1.0.
-  if use_bp and reference_length:
-    ratio = float(translation_length) / reference_length
-    if ratio <= 0.0:
-      # Empty translation: the penalty is total and 1 / ratio is undefined.
-      bp = 0.0
-    elif ratio < 1.0:
-      bp = math.exp(1 - 1. / ratio)
-  bleu = geo_mean * bp
-  return np.float32(bleu)
-
-
-def rouge_2_fscore(logits, labels):
-  """Approximate ROUGE-2 F1 score between the labels and argmax predictions.
-
-  The score is approximate because word pieces are not glued back together
-  and the ids are not decoded and re-tokenized.
-
-  Args:
-    logits: tensor, model predictions
-    labels: tensor, gold output.
-
-  Returns:
-    A pair (rouge2_fscore, weight): the approximate ROUGE-2 F1 score and the
-    constant weight 1.0.
-  """
-  predicted_ids = tf.cast(tf.argmax(logits, axis=-1), tf.int32)
-  # TODO: Look into removing use of py_func  # pylint: disable=g-bad-todo
-  score = tf.py_func(rouge_n, (predicted_ids, labels), tf.float32)
-  return score, tf.constant(1.0)
-
-
-def _get_ngrams(n, text):
-  """Collects the distinct n-grams of a token sequence.
-
-  Args:
-    n: which n-grams to calculate
-    text: An array of tokens
-
-  Returns:
-    A set of n-grams, each represented as a tuple of tokens.
-  """
-  return {tuple(text[start:start + n]) for start in range(len(text) - n + 1)}
-
-
-def rouge_n(eval_sentences, ref_sentences, n=2):
-  """Computes ROUGE-N f1 score of two text collections of sentences.
-
-  Source: https://www.microsoft.com/en-us/research/publication/
-  rouge-a-package-for-automatic-evaluation-of-summaries/
-
-  Args:
-    eval_sentences: Predicted sentences.
-    ref_sentences: Sentences from the reference set
-    n: Size of ngram.  Defaults to 2.
-
-  Returns:
-    Mean over all sentence pairs of the ROUGE-N f1 score, as a float32.
-  """
-
-  def _sentence_f1(eval_sentence, ref_sentence):
-    """F1 over the distinct n-grams of one prediction/reference pair."""
-    eval_ngrams = _get_ngrams(n, eval_sentence)
-    ref_ngrams = _get_ngrams(n, ref_sentence)
-    overlapping_count = len(eval_ngrams & ref_ngrams)
-
-    # A sentence with no n-grams gets precision/recall 0.0. This isn't
-    # mathematically correct, but it's good enough.
-    precision = (
-        float(overlapping_count) / len(eval_ngrams) if eval_ngrams else 0.0)
-    recall = float(overlapping_count) / len(ref_ngrams) if ref_ngrams else 0.0
-    # The 1e-8 keeps the division defined when precision + recall is 0.
-    return 2.0 * ((precision * recall) / (precision + recall + 1e-8))
-
-  f1_scores = [
-      _sentence_f1(eval_sentence, ref_sentence)
-      for eval_sentence, ref_sentence in zip(eval_sentences, ref_sentences)
-  ]
-  return np.mean(f1_scores, dtype=np.float32)
-
-
-def rouge_l_fscore(predictions, labels):
-  """Approximate ROUGE-L F1 between the argmax of `predictions` and `labels`.
-
-  The score is approximate: ids are compared as-is, without gluing word pieces
-  back together or decoding and re-tokenizing the output.
-
-  Args:
-    predictions: tensor, model predictions
-    labels: tensor, gold output.
-
-  Returns:
-    A pair: the approx rouge-l f1 score (float32, computed by
-    `rouge_l_sentence_level`) and a constant 1.0 tensor.
-  """
-  predicted_ids = tf.argmax(predictions, axis=-1)
-  predicted_ids = tf.cast(predicted_ids, tf.int32)
-  score = tf.py_func(
-      rouge_l_sentence_level, (predicted_ids, labels), tf.float32)
-  unit = tf.constant(1.0)
-  return score, unit
-
-
-def rouge_l_sentence_level(eval_sentences, ref_sentences):
-  """Averages sentence-level ROUGE-L over paired sentences.
-
-  Source: https://www.microsoft.com/en-us/research/publication/
-  rouge-a-package-for-automatic-evaluation-of-summaries/
-
-  Calculated according to:
-  R_lcs = LCS(X,Y)/m
-  P_lcs = LCS(X,Y)/n
-  F_lcs = ((1 + beta^2)*R_lcs*P_lcs) / (R_lcs + (beta^2) * P_lcs)
-
-  where:
-  X = reference summary
-  Y = Candidate summary
-  m = length of reference summary
-  n = length of candidate summary
-
-  Args:
-    eval_sentences: The sentences that have been picked by the summarizer
-    ref_sentences: The sentences from the reference set
-
-  Returns:
-    A float: F_lcs, as the float32 mean over all sentence pairs.
-  """
-  f1_scores = [
-      _f_lcs(
-          _len_lcs(candidate, reference),
-          float(len(reference)),  # m: reference length
-          float(len(candidate)))  # n: candidate length
-      for candidate, reference in zip(eval_sentences, ref_sentences)
-  ]
-  return np.mean(f1_scores, dtype=np.float32)
-
-
-def _len_lcs(x, y):
-  """Returns the length of the Longest Common Subsequence between two seqs.
-
-  Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence
-
-  Args:
-    x: sequence of words
-    y: sequence of words
-
-  Returns:
-    integer: Length of LCS between x and y
-  """
-  # The bottom-right cell of the DP table covers all of x and all of y.
-  return _lcs(x, y)[len(x), len(y)]
-
-
-def _lcs(x, y):
-  """Builds the dynamic-programming table of LCS lengths for two seqs.
-
-  Fills one cell per (prefix of x, prefix of y) pair, so it runs in O(nm) time
-  where n = len(x) and m = len(y).
-  Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence
-
-  Args:
-    x: collection of words
-    y: collection of words
-
-  Returns:
-    Dict mapping (i, j) to the LCS length of x[:i] and y[:j], for every
-    0 <= i <= len(x) and 0 <= j <= len(y).
-  """
-  table = {}
-  for i in range(len(x) + 1):
-    for j in range(len(y) + 1):
-      if i and j:
-        if x[i - 1] == y[j - 1]:
-          # Matching last elements extend the LCS of the shorter prefixes.
-          cell = table[i - 1, j - 1] + 1
-        else:
-          cell = max(table[i - 1, j], table[i, j - 1])
-      else:
-        # An empty prefix has nothing in common with anything.
-        cell = 0
-      table[i, j] = cell
-  return table
-
-
-def _f_lcs(llcs, m, n):
-  """Computes the LCS-based F-measure score.
-
-  Source: http://research.microsoft.com/en-us/um/people/cyl/download/papers/
-  rouge-working-note-v1.3.1.pdf
-
-  Args:
-    llcs: Length of LCS
-    m: number of words in reference summary
-    n: number of words in candidate summary
-
-  Returns:
-    Float. LCS-based F-measure score. 0.0 when either summary is empty.
-  """
-  # An empty summary has no common subsequence with anything; return the same
-  # 0.0 a zero-length LCS yields instead of dividing by zero below.
-  if not m or not n:
-    return 0.0
-  r_lcs = llcs / float(m)
-  p_lcs = llcs / float(n)
-  # beta is set to P/R; the epsilons keep both divisions defined when the LCS
-  # is empty (r_lcs == p_lcs == 0).
-  beta = p_lcs / (r_lcs + 1e-12)
-  num = (1 + (beta ** 2)) * r_lcs * p_lcs
-  denom = r_lcs + ((beta ** 2) * p_lcs)
-  f_lcs = num / (denom + 1e-12)
-  return f_lcs
--- a/official/nlp/transformer/utils/tokenizer.py
+++ b/official/nlp/transformer/utils/tokenizer.py
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Defines Subtokenizer class to encode and decode strings."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import collections
-import re
-import sys
-import unicodedata
-
-from absl import logging
-
-import numpy as np
-import six
-from six.moves import xrange  # pylint: disable=redefined-builtin
-import tensorflow as tf
-
-# pylint: disable=g-complex-comprehension
-# Reserved tokens. PAD_ID and EOS_ID equal the positions of PAD and EOS in
-# RESERVED_TOKENS.
-PAD = "<pad>"
-PAD_ID = 0
-EOS = "<EOS>"
-EOS_ID = 1
-RESERVED_TOKENS = [PAD, EOS]
-
-# Set of characters that will be used in the function _escape_token() (see func
-# docstring for more details).
-# This set is added to the alphabet list to ensure that all escaped tokens can
-# be encoded.
-_ESCAPE_CHARS = set(u"\\_u;0123456789")
-# Regex for the function _unescape_token(), the inverse of _escape_token().
-# This is used to find "\u", "\\", and "\###;" substrings in the token.
-_UNESCAPE_REGEX = re.compile(r"\\u|\\\\|\\([0-9]+);")
-
-# Character substituted by _unescape_token() when a "\###;" escape cannot be
-# converted back into a unicode character.
-_UNDEFINED_UNICODE = u"\u3013"
-
-
-def alphanumeric_char_set():
-  """Builds the set of all Unicode letter and number characters.
-
-  Returns:
-    Set of the characters, over code points 0 up to (but not including)
-    sys.maxunicode, whose Unicode general category starts with "L" or "N".
-  """
-  chars = set()
-  for code_point in xrange(sys.maxunicode):
-    char = six.unichr(code_point)
-    if unicodedata.category(char).startswith(("L", "N")):
-      chars.add(char)
-  return chars
-
-
-# Set contains all letter and number characters. Built once, when the module
-# is imported.
-_ALPHANUMERIC_CHAR_SET = alphanumeric_char_set()
-
-# min_count is the minimum number of times a subtoken must appear in the data
-# before it is added to the vocabulary. The value is found using binary
-# search to obtain the target vocabulary size.
-_MIN_MIN_COUNT = 1  # min value to use when binary searching for min_count
-_MAX_MIN_COUNT = 1000  # max value to use when binary searching for min_count
-
-
-class Subtokenizer(object):
-  """Encodes and decodes strings to/from integer IDs."""
-
-  def __init__(self, vocab_file, reserved_tokens=None, master_char_set=None):
-    """Initializes the Subtokenizer from an existing vocab file.
-
-    Args:
-      vocab_file: String name of the vocab file to load subtokens from.
-      reserved_tokens: List of string tokens placed at the beginning of the
-        subtoken list. Defaults to RESERVED_TOKENS.
-      master_char_set: Set of characters used to split text into tokens.
-        Defaults to _ALPHANUMERIC_CHAR_SET.
-    """
-    logging.info("Initializing Subtokenizer from file %s.", vocab_file)
-
-    if master_char_set is None:
-      master_char_set = _ALPHANUMERIC_CHAR_SET
-
-    if reserved_tokens is None:
-      reserved_tokens = RESERVED_TOKENS
-
-    self.subtoken_list = _load_vocab_file(vocab_file, reserved_tokens)
-    self.alphabet = _generate_alphabet_dict(self.subtoken_list)
-    self.subtoken_to_id_dict = _list_to_index_dict(self.subtoken_list)
-
-    self.max_subtoken_length = 0
-    for subtoken in self.subtoken_list:
-      self.max_subtoken_length = max(self.max_subtoken_length, len(subtoken))
-
-    # Create cache to speed up subtokenization. Each slot holds one
-    # (token, subtoken ids) pair, indexed by hash(token) % cache size.
-    self._cache_size = 2**20
-    self._cache = [(None, None)] * self._cache_size
-    self._master_char_set = master_char_set
-
-  @staticmethod
-  def init_from_files(vocab_file,
-                      files,
-                      target_vocab_size,
-                      threshold,
-                      min_count=None,
-                      file_byte_limit=1e6,
-                      reserved_tokens=None,
-                      correct_strip=True,
-                      master_char_set=None):
-    """Create subtoken vocabulary based on files, and save vocab to file.
-
-    If `vocab_file` already exists it is loaded as-is and `files` are not read.
-
-    Args:
-      vocab_file: String name of vocab file to store subtoken vocabulary.
-      files: List of file paths that will be used to generate vocabulary.
-      target_vocab_size: target vocabulary size to generate.
-      threshold: int threshold of vocabulary size to accept.
-      min_count: int minimum count to use for generating the vocabulary. The min
-        count is the minimum number of times a subtoken should appear in the
-        files before it is added to the vocabulary. If set to none, this value
-        is found using binary search.
-      file_byte_limit: (Default 1e6) Maximum number of bytes of sample text that
-        will be drawn from the files.
-      reserved_tokens: List of string tokens that are guaranteed to be at the
-        beginning of the subtoken vocabulary list.
-      correct_strip: Whether to convert text to unicode before strip.
-      master_char_set: the char set.
-
-    Returns:
-      Subtokenizer object
-    """
-    if master_char_set is None:
-      master_char_set = _ALPHANUMERIC_CHAR_SET
-    if reserved_tokens is None:
-      reserved_tokens = RESERVED_TOKENS
-
-    if tf.io.gfile.exists(vocab_file):
-      logging.info("Vocab file already exists (%s)", vocab_file)
-    else:
-      logging.info("Begin steps to create subtoken vocabulary...")
-      token_counts = _count_tokens(files, file_byte_limit, correct_strip,
-                                   master_char_set)
-      alphabet = _generate_alphabet_dict(token_counts)
-      subtoken_list = _generate_subtokens_with_target_vocab_size(
-          token_counts, alphabet, target_vocab_size, threshold, min_count,
-          reserved_tokens)
-      logging.info("Generated vocabulary with %d subtokens.",
-                   len(subtoken_list))
-      _save_vocab_file(vocab_file, subtoken_list)
-    # Load with the same reserved tokens the vocabulary was generated with, so
-    # the ids match the order written to the vocab file.
-    return Subtokenizer(
-        vocab_file,
-        reserved_tokens=reserved_tokens,
-        master_char_set=master_char_set)
-
-  def encode(self, raw_string, add_eos=False):
-    """Encodes a string into a list of int subtoken ids.
-
-    Args:
-      raw_string: String to encode.
-      add_eos: Whether to append EOS_ID to the returned ids.
-
-    Returns:
-      List of int subtoken ids.
-    """
-    ret = []
-    tokens = _split_string_to_tokens(
-        native_to_unicode(raw_string), self._master_char_set)
-    for token in tokens:
-      ret.extend(self._token_to_subtoken_ids(token))
-    if add_eos:
-      assert EOS in self.subtoken_list, \
-          "Can't append 'EOS' because it is not in list of known subtokens."
-      ret.append(EOS_ID)
-    return ret
-
-  def _token_to_subtoken_ids(self, token):
-    """Encode a single token into a list of subtoken ids."""
-    cache_location = hash(token) % self._cache_size
-    cache_key, cache_value = self._cache[cache_location]
-    if cache_key == token:
-      return cache_value
-
-    ret = _split_token_to_subtokens(
-        _escape_token(token, self.alphabet), self.subtoken_to_id_dict,
-        self.max_subtoken_length)
-    ret = [self.subtoken_to_id_dict[subtoken_id] for subtoken_id in ret]
-
-    # Overwrites whichever token previously occupied this slot.
-    self._cache[cache_location] = (token, ret)
-    return ret
-
-  def decode(self, subtokens):
-    """Converts list of int subtokens ids into a string.
-
-    Args:
-      subtokens: List of ints, or a numpy array that is converted to one.
-
-    Returns:
-      The decoded string; "" when `subtokens` is empty.
-    """
-    if isinstance(subtokens, np.ndarray):
-      # Note that list(subtokens) converts subtokens to a python list, but the
-      # items remain as np.int32. This converts both the array and its items.
-      subtokens = subtokens.tolist()
-
-    if not subtokens:
-      return ""
-
-    assert isinstance(subtokens, list) and isinstance(subtokens[0], int), (
-        "Subtokens argument passed into decode() must be a list of integers.")
-
-    return _unicode_to_native(
-        _join_tokens_to_string(
-            self._subtoken_ids_to_tokens(subtokens), self._master_char_set))
-
-  def _subtoken_ids_to_tokens(self, subtokens):
-    """Convert list of int subtoken ids to a list of string tokens."""
-    # Ids beyond the end of the vocabulary are silently dropped.
-    escaped_tokens = "".join([
-        self.subtoken_list[s] for s in subtokens if s < len(self.subtoken_list)
-    ])
-    # Every escaped token ends with "_", so "_" separates the tokens.
-    escaped_tokens = escaped_tokens.split("_")
-
-    # All tokens in the vocabulary list have been escaped (see _escape_token())
-    # so each token must be unescaped when decoding.
-    ret = []
-    for token in escaped_tokens:
-      if token:
-        ret.append(_unescape_token(token))
-    return ret
-
-
-def _save_vocab_file(vocab_file, subtoken_list):
-  """Writes each subtoken to `vocab_file` on its own single-quoted line."""
-  with tf.io.gfile.GFile(vocab_file, mode="w") as writer:
-    for entry in subtoken_list:
-      native_entry = _unicode_to_native(entry)
-      writer.write("'%s'\n" % native_entry)
-
-
-def _load_vocab_file(vocab_file, reserved_tokens=None):
-  """Reads the vocabulary, placing the reserved tokens ahead of the rest.
-
-  Args:
-    vocab_file: String name of a vocab file with one single-quoted subtoken
-      per line.
-    reserved_tokens: List of tokens to put first. Defaults to RESERVED_TOKENS.
-
-  Returns:
-    List of the reserved tokens followed by every other subtoken in file order.
-  """
-  reserved = RESERVED_TOKENS if reserved_tokens is None else reserved_tokens
-
-  loaded = []
-  with tf.io.gfile.GFile(vocab_file, mode="r") as reader:
-    for raw_line in reader:
-      quoted = native_to_unicode(raw_line.strip())
-      unquoted = quoted[1:-1]  # Remove surrounding single-quotes
-      # Reserved tokens are prepended below, so skip their copies in the file.
-      if unquoted not in reserved:
-        loaded.append(native_to_unicode(unquoted))
-  return reserved + loaded
-
-
-def native_to_unicode(s):
-  """Convert string to unicode (required in Python 2)."""
-  try:
-    text_type = unicode  # pylint: disable=undefined-variable
-  except NameError:  # Python 3 has no `unicode`; the input is returned as-is.
-    return s
-  # Python 2
-  if isinstance(s, text_type):
-    return s
-  return s.decode("utf-8")
-
-
-def _unicode_to_native(s):
-  """Convert string from unicode to native format (required in Python 2)."""
-  try:
-    text_type = unicode  # pylint: disable=undefined-variable
-  except NameError:  # Python 3 has no `unicode`; the input is returned as-is.
-    return s
-  # Python 2
-  if isinstance(s, text_type):
-    return s.encode("utf-8")
-  return s
-
-
-def _split_string_to_tokens(text, master_char_set):
-  """Splits text to a list of string tokens.
-
-  A new token starts wherever consecutive characters differ in whether they
-  belong to `master_char_set`.
-
-  Args:
-    text: String to split.
-    master_char_set: Set of characters that form one class of tokens; all
-      other characters form the other class.
-
-  Returns:
-    List of string tokens; [] for empty text.
-  """
-  if not text:
-    return []
-
-  membership = [char in master_char_set for char in text]
-  boundaries = [
-      pos for pos in xrange(1, len(text))
-      if membership[pos] != membership[pos - 1]
-  ]
-
-  tokens = []
-  begin = 0
-  for end in boundaries:
-    piece = text[begin:end]
-    # A lone space is dropped unless it is the very first token.
-    if begin == 0 or piece != u" ":
-      tokens.append(piece)
-    begin = end
-  # The remainder after the last boundary is always kept.
-  tokens.append(text[begin:])
-  return tokens
-
-
-def _join_tokens_to_string(tokens, master_char_set):
-  """Join a list of string tokens into a single string.
-
-  Args:
-    tokens: List of non-empty string tokens.
-    master_char_set: Set of characters; a token counts as "master" when its
-      first character is in this set.
-
-  Returns:
-    The concatenated tokens, with a single space inserted between every two
-    adjacent master tokens.
-  """
-  pieces = []
-  previous_is_master = False
-  for token in tokens:
-    current_is_master = token[0] in master_char_set
-    if previous_is_master and current_is_master:
-      pieces.append(u" ")
-    pieces.append(token)
-    previous_is_master = current_is_master
-  return "".join(pieces)
-
-
-def _escape_token(token, alphabet):
-  r"""Escapes a token so it only uses alphabet characters and ends with "_".
-
-  Three transformations are applied to the token:
-    1. Backslash "\" becomes "\\" and the underline character "_" becomes "\u".
-    2. Every character outside of the alphabet (and every newline) becomes
-       "\###;", where ### is the character's Unicode code point.
-    3. "_" is appended to mark the end of the token.
-
-  Args:
-    token: unicode string to be escaped
-    alphabet: list of all known characters
-
-  Returns:
-    escaped string
-  """
-  escaped = token.replace(u"\\", u"\\\\").replace(u"_", u"\\u")
-  pieces = []
-  for char in escaped:
-    if char in alphabet and char != u"\n":
-      pieces.append(char)
-    else:
-      pieces.append(r"\%d;" % ord(char))
-  return u"".join(pieces) + "_"
-
-
-def _unescape_token(token):
-  r"""Replaces escaped characters in the token with their unescaped versions.
-
-  Applies the inverse of the transformations in _escape_token():
-    1. Replace "\u" with "_", and "\\" with "\".
-    2. Replace "\###;" with the unicode character the ### refers to.
-
-  Args:
-    token: escaped string
-
-  Returns:
-    unescaped string
-  """
-
-  def _replacement(found):
-    r"""Returns the replacement string for one regex match.
-
-    The match is one of the strings accepted by the pattern
-      r"\\u|\\\\|\\([0-9]+);"
-    that is '\u', '\\', or '\###;' (### is any digit number).
-
-    found.group(0) is the entire matched string and exists for every match.
-    found.group(1) is the parenthesized subgroup ('###') and is only set for
-    the '\###;' form.
-
-    When found.group(1) is set, its integer value is converted to a unicode
-    character. Otherwise the match is '\u' or '\\', and '_' or '\' is returned
-    respectively. Note that in python, a single backslash is written as '\\',
-    and double backslash as '\\\\'.
-
-    Args:
-      found: match object
-
-    Returns:
-      String to replace matched object with.
-    """
-    code_point = found.group(1)
-    if code_point is not None:
-      # '\###;' form: try and return the unicode character.
-      try:
-        return six.unichr(int(code_point))
-      except (ValueError, OverflowError):
-        return _UNDEFINED_UNICODE
-
-    # Otherwise the matched string is '\u' or '\\'.
-    if found.group(0) == u"\\u":
-      return u"_"
-    return u"\\"
-
-  # Replace every escaped substring in the token.
-  return _UNESCAPE_REGEX.sub(_replacement, token)
-
-
-def _count_tokens(files,
-                  file_byte_limit=1e6,
-                  correct_strip=True,
-                  master_char_set=None):
-  """Return token counts of words in the files.
-
-  Samples file_byte_limit bytes from each file, and counts the words that appear
-  in the samples. The samples are semi-evenly distributed across the file.
-
-  Args:
-    files: List of filepaths
-    file_byte_limit: Max number of bytes that will be read from each file.
-    correct_strip: Whether to convert text to unicode before strip. This affects
-      vocabulary generation for PY2. Sets correct_strip to False in PY2 to
-      reproduce previous common public result. Sets correct_strip to True will
-      let PY2 and PY3 get a consistent vocabulary.
-    master_char_set: the char set.
-
-  Returns:
-    Dictionary mapping tokens to the number of times they appear in the sampled
-    lines from the files.
-  """
-  if master_char_set is None:
-    master_char_set = _ALPHANUMERIC_CHAR_SET
-
-  token_counts = collections.defaultdict(int)
-
-  for filepath in files:
-    with tf.io.gfile.GFile(filepath, mode="r") as reader:
-      remaining_budget = file_byte_limit
-      # Number of lines passed over before each sampled line.
-      stride = int(reader.size() / (remaining_budget * 2))
-      skipped = 0
-      for line in reader:
-        if skipped < stride:
-          skipped += 1
-          continue
-        if remaining_budget < 0:
-          break
-
-        if correct_strip:
-          line = native_to_unicode(line)
-        line = line.strip()
-        remaining_budget -= len(line)
-        skipped = 0
-
-        # Add words to token counts
-        sampled_tokens = _split_string_to_tokens(
-            native_to_unicode(line), master_char_set)
-        for token in sampled_tokens:
-          token_counts[token] += 1
-  return token_counts
-
-
-def _list_to_index_dict(lst):
-  """Maps every list item to its index; a repeated item keeps its last index."""
-  index_of = {}
-  for position, item in enumerate(lst):
-    index_of[item] = position
-  return index_of
-
-
-def _split_token_to_subtokens(token, subtoken_dict, max_subtoken_length):
-  """Greedily splits a token into the longest subtokens found in the dict.
-
-  Args:
-    token: String to split.
-    subtoken_dict: Container of known subtokens, tested with `in`.
-    max_subtoken_length: Length of the longest candidate substring to try.
-
-  Returns:
-    List of subtokens that concatenate back to `token`.
-
-  Raises:
-    ValueError: if no known subtoken starts at some position of the token.
-  """
-  pieces = []
-  cursor = 0
-  total = len(token)
-  while cursor < total:
-    # Try the longest candidate first and shrink until one is known.
-    longest_end = min(total, cursor + max_subtoken_length)
-    match_end = None
-    for end in xrange(longest_end, cursor, -1):
-      if token[cursor:end] in subtoken_dict:
-        match_end = end
-        break
-    if match_end is None:
-      # If there is no possible encoding of the escaped token then one of the
-      # characters in the token is not in the alphabet. This should be
-      # impossible and would be indicative of a bug.
-      raise ValueError("Was unable to split token \"%s\" into subtokens." %
-                       token)
-    pieces.append(token[cursor:match_end])
-    cursor = match_end
-  return pieces
-
-
-def _generate_subtokens_with_target_vocab_size(token_counts,
-                                               alphabet,
-                                               target_size,
-                                               threshold,
-                                               min_count=None,
-                                               reserved_tokens=None):
-  """Generate subtoken vocabulary close to the target size.
-
-  Args:
-    token_counts: dict mapping tokens to the number of times they appear.
-    alphabet: set of characters.
-    target_size: vocabulary size to aim for.
-    threshold: a vocabulary is accepted as soon as its size is strictly within
-      this distance of `target_size`.
-    min_count: when given, it is used directly and no search is performed.
-      Otherwise a value between _MIN_MIN_COUNT and _MAX_MIN_COUNT is searched.
-    reserved_tokens: list of tokens put at the beginning of the vocabulary.
-      Defaults to RESERVED_TOKENS.
-
-  Returns:
-    List of subtokens.
-  """
-  if reserved_tokens is None:
-    reserved_tokens = RESERVED_TOKENS
-
-  if min_count is not None:
-    logging.info("Using min_count=%d to generate vocab with target size %d",
-                 min_count, target_size)
-    return _generate_subtokens(
-        token_counts, alphabet, min_count, reserved_tokens=reserved_tokens)
-
-  def bisect(min_val, max_val):
-    """Recursive function to binary search for subtoken vocabulary."""
-    midpoint = (min_val + max_val) // 2
-    logging.info("Binary search: trying min_count=%d (%d %d)", midpoint,
-                 min_val, max_val)
-    candidate = _generate_subtokens(
-        token_counts, alphabet, midpoint, reserved_tokens=reserved_tokens)
-
-    size = len(candidate)
-    logging.info("Binary search: min_count=%d resulted in %d tokens", midpoint,
-                 size)
-
-    distance = abs(size - target_size)
-    search_exhausted = min_val >= max_val or midpoint < 2
-    if distance < threshold or search_exhausted:
-      return candidate
-
-    # Too many subtokens: raise min_count. Otherwise lower it.
-    if size > target_size:
-      lower, upper = midpoint + 1, max_val
-    else:
-      lower, upper = min_val, midpoint - 1
-    alternative = bisect(lower, upper)
-
-    # Return vocabulary dictionary with the closest number of tokens.
-    if abs(len(alternative) - target_size) < distance:
-      return alternative
-    return candidate
-
-  logging.info("Finding best min_count to get target size of %d", target_size)
-  return bisect(_MIN_MIN_COUNT, _MAX_MIN_COUNT)
-
-
-def _generate_alphabet_dict(iterable, reserved_tokens=None):
-  """Create set of characters that appear in any element in the iterable.
-
-  Args:
-    iterable: Iterable of strings.
-    reserved_tokens: List of tokens whose characters are also included.
-      Defaults to RESERVED_TOKENS.
-
-  Returns:
-    Set of every character found, plus the characters in _ESCAPE_CHARS.
-  """
-  reserved = RESERVED_TOKENS if reserved_tokens is None else reserved_tokens
-  alphabet = set()
-  for token in iterable:
-    alphabet.update(token)
-  for token in reserved:
-    alphabet.update(token)
-  alphabet.update(_ESCAPE_CHARS)  # Add escape characters to alphabet set.
-  return alphabet
-
-
-def _count_and_gen_subtokens(token_counts, alphabet, subtoken_dict,
-                             max_subtoken_length):
-  """Count number of times subtokens appear, and generate new subtokens.
-
-  Args:
-    token_counts: dict mapping tokens to the number of times they appear in the
-      original files.
-    alphabet: list of allowed characters. Used to escape the tokens, which
-      guarantees that all tokens can be split into subtokens.
-    subtoken_dict: dict mapping subtokens to ids.
-    max_subtoken_length: maximum length of subtoken in subtoken_dict.
-
-  Returns:
-    A defaultdict mapping subtokens to the number of times they appear in the
-    tokens. The dict may contain new subtokens.
-  """
-  subtoken_counts = collections.defaultdict(int)
-  for raw_token, count in six.iteritems(token_counts):
-    escaped = _escape_token(raw_token, alphabet)
-    current_split = _split_token_to_subtokens(escaped, subtoken_dict,
-                                              max_subtoken_length)
-
-    # Every substring that begins where a current subtoken begins is a
-    # candidate subtoken, credited with the token's count.
-    offset = 0
-    escaped_length = len(escaped)
-    for piece in current_split:
-      for stop in xrange(offset + 1, escaped_length + 1):
-        subtoken_counts[escaped[offset:stop]] += count
-      offset += len(piece)
-
-  return subtoken_counts
-
-
-def _filter_and_bucket_subtokens(subtoken_counts, min_count):
-  """Return a bucketed list of subtokens that are filtered by count.
-
-  Args:
-    subtoken_counts: defaultdict mapping subtokens to their counts
-    min_count: int count used to filter subtokens
-
-  Returns:
-    List of subtoken sets, where subtokens in set i have the same length=i.
-  """
-  buckets = []
-  for subtoken, count in six.iteritems(subtoken_counts):
-    if count >= min_count:  # Keep only subtokens that appear often enough
-      length = len(subtoken)
-      # Grow the list until index `length` exists.
-      missing = length + 1 - len(buckets)
-      if missing > 0:
-        buckets.extend(set() for _ in xrange(missing))
-      buckets[length].add(subtoken)
-  return buckets
-
-
-def _gen_new_subtoken_list(subtoken_counts,
-                           min_count,
-                           alphabet,
-                           reserved_tokens=None):
-  """Generate candidate subtokens ordered by count, and new max subtoken length.
-
-  Add subtokens to the candidate list in order of length (longest subtokens
-  first). When a subtoken is added, the counts of each of its prefixes are
-  decreased. Prefixes that don't appear much outside the subtoken are not added
-  to the candidate list.
-
-  For example:
-    subtoken being added to candidate list: 'translate'
-    subtoken_counts: {'translate':10, 't':40, 'tr':16, 'tra':12, ...}
-    min_count: 5
-
-  When 'translate' is added, subtoken_counts is updated to:
-    {'translate':0, 't':30, 'tr':6, 'tra': 2, ...}
-
-  The subtoken 'tra' will not be added to the candidate list, because it appears
-  twice (less than min_count) outside of 'translate'.
-
-  Note that `subtoken_counts` is modified in place.
-
-  Args:
-    subtoken_counts: defaultdict mapping str subtokens to int counts
-    min_count: int minimum count requirement for subtokens
-    alphabet: set of characters. Each character is added to the subtoken list to
-      guarantee that all tokens can be encoded.
-    reserved_tokens: list of tokens that will be added to the beginning of the
-      returned subtoken list.
-
-  Returns:
-    List of candidate subtokens in decreasing count order, and maximum subtoken
-    length
-  """
-  if reserved_tokens is None:
-    reserved_tokens = RESERVED_TOKENS
-
-  # Create a list of (count, subtoken) for each candidate subtoken.
-  subtoken_candidates = []
-
-  # Use bucketed list to iterate through subtokens in order of length.
-  # subtoken_buckets[i] = set(subtokens), where each subtoken has length i.
-  subtoken_buckets = _filter_and_bucket_subtokens(subtoken_counts, min_count)
-  max_subtoken_length = len(subtoken_buckets) - 1
-
-  # Go through the list in reverse order to consider longer subtokens first.
-  for subtoken_len in xrange(max_subtoken_length, 0, -1):
-    for subtoken in subtoken_buckets[subtoken_len]:
-      # Re-read the count: it may have been decremented since bucketing.
-      count = subtoken_counts[subtoken]
-
-      # Possible if this subtoken is a prefix of another token.
-      if count < min_count:
-        continue
-
-      # Ignore alphabet/reserved tokens, which will be added manually later.
-      if subtoken not in alphabet and subtoken not in reserved_tokens:
-        subtoken_candidates.append((count, subtoken))
-
-      # Decrement count of the subtoken's prefixes (if a longer subtoken is
-      # added, its prefixes lose priority to be added).
-      for end in xrange(1, subtoken_len):
-        subtoken_counts[subtoken[:end]] -= count
-
-  # Add alphabet subtokens (guarantees that all strings are encodable).
-  subtoken_candidates.extend((subtoken_counts.get(a, 0), a) for a in alphabet)
-
-  # Order subtoken candidates by decreasing count.
-  subtoken_list = [t for _, t in sorted(subtoken_candidates, reverse=True)]
-
-  # Add reserved tokens to beginning of the list.
-  subtoken_list = reserved_tokens + subtoken_list
-  return subtoken_list, max_subtoken_length
-
-
-def _generate_subtokens(token_counts,
-                        alphabet,
-                        min_count,
-                        num_iterations=4,
-                        reserved_tokens=None):
-  """Iteratively builds a subtoken list in decreasing order of frequency.
-
-  Args:
-    token_counts: dict mapping str tokens -> int count
-    alphabet: set of characters
-    min_count: int minimum number of times a subtoken must appear before it is
-      added to the vocabulary.
-    num_iterations: int number of iterations to generate new tokens.
-    reserved_tokens: list of tokens that will be added to the beginning to the
-      returned subtoken list.
-
-  Returns:
-    Sorted list of subtokens (most frequent first)
-  """
-  if reserved_tokens is None:
-    reserved_tokens = RESERVED_TOKENS
-
-  # Start from the reserved tokens plus one subtoken per alphabet character.
-  vocabulary = reserved_tokens + list(alphabet)
-  longest = 1
-
-  # Each round segments all words with the current vocabulary, counts the
-  # resulting subtokens and candidate substrings, then keeps the ones whose
-  # counts are high enough as the next vocabulary.
-  for iteration in xrange(num_iterations):
-    logging.info("\tGenerating subtokens: iteration %d", iteration)
-    # Generate new subtoken->id dictionary using the new subtoken list.
-    subtoken_ids = _list_to_index_dict(vocabulary)
-
-    # Create dict mapping subtoken->count, with additional subtokens created
-    # from substrings taken from the tokens.
-    counts = _count_and_gen_subtokens(token_counts, alphabet, subtoken_ids,
-                                      longest)
-
-    # Generate new list of subtokens sorted by subtoken count.
-    vocabulary, longest = _gen_new_subtoken_list(counts, min_count, alphabet,
-                                                 reserved_tokens)
-
-    logging.info("\tVocab size: %d", len(vocabulary))
-  return vocabulary
--- a/official/nlp/transformer/utils/tokenizer_test.py
+++ b/official/nlp/transformer/utils/tokenizer_test.py
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Test Subtokenizer and string helper methods."""
-
-import collections
-import tempfile
-
-import tensorflow as tf
-
-from official.nlp.transformer.utils import tokenizer
-
-
-class SubtokenizerTest(tf.test.TestCase):
-  """Tests Subtokenizer encoding and decoding with a small fixed vocab."""
-
-  def _init_subtokenizer(self, vocab_list):
-    """Writes `vocab_list` to a temp vocab file and loads a Subtokenizer.
-
-    Args:
-      vocab_list: List of subtoken strings, written one per line in the
-        single-quoted format the vocab loader expects.
-
-    Returns:
-      A Subtokenizer built from that file with no reserved tokens.
-    """
-    # Only the unique name is needed, so close the handle immediately instead
-    # of leaking it. The file lives in the test's temp dir rather than the
-    # system temp dir.
-    with tempfile.NamedTemporaryFile(
-        delete=False, dir=self.get_temp_dir()) as temp_file:
-      vocab_path = temp_file.name
-    with tf.io.gfile.GFile(vocab_path, "w") as w:
-      for subtoken in vocab_list:
-        w.write("'%s'" % subtoken)
-        w.write("\n")
-    return tokenizer.Subtokenizer(vocab_path, reserved_tokens=[])
-
-  def test_encode(self):
-    vocab_list = ["123_", "test", "ing_"]
-    subtokenizer = self._init_subtokenizer(vocab_list)
-    s = "testing 123"
-    encoded_list = subtokenizer.encode(s)
-    self.assertEqual([1, 2, 0], encoded_list)
-
-  def test_decode(self):
-    vocab_list = ["123_", "test", "ing_"]
-    subtokenizer = self._init_subtokenizer(vocab_list)
-    encoded_list = [1, 2, 0]  # testing 123
-    decoded_str = subtokenizer.decode(encoded_list)
-    self.assertEqual("testing 123", decoded_str)
-
-  def test_subtoken_ids_to_tokens(self):
-    vocab_list = ["123_", "test", "ing_"]
-    subtokenizer = self._init_subtokenizer(vocab_list)
-    encoded_list = [1, 2, 0]  # testing 123
-    token_list = subtokenizer._subtoken_ids_to_tokens(encoded_list)
-    self.assertEqual([u"testing", u"123"], token_list)
-
-
-class StringHelperTest(tf.test.TestCase):
-
-  def test_split_string_to_tokens(self):
-    text = "test? testing 123."
-
-    tokens = tokenizer._split_string_to_tokens(text,
-                                               tokenizer._ALPHANUMERIC_CHAR_SET)
-    self.assertEqual(["test", "? ", "testing", "123", "."], tokens)
-
-  def test_join_tokens_to_string(self):
-    tokens = ["test", "? ", "testing", "123", "."]
-
-    s = tokenizer._join_tokens_to_string(tokens,
-                                         tokenizer._ALPHANUMERIC_CHAR_SET)
-    self.assertEqual("test? testing 123.", s)
-
-  def test_escape_token(self):
-    token = u"abc_\\4"
-    alphabet = set("abc_\\u;")
-
-    escaped_token = tokenizer._escape_token(token, alphabet)
-    self.assertEqual("abc\\u\\\\\\52;_", escaped_token)
-
-  def test_unescape_token(self):
-    escaped_token = u"Underline: \\u, Backslash: \\\\, Unicode: \\52;"
-
-    unescaped_token = tokenizer._unescape_token(escaped_token)
-    self.assertEqual("Underline: _, Backslash: \\, Unicode: 4", unescaped_token)
-
-  def test_list_to_index_dict(self):
-    lst = ["test", "strings"]
-
-    d = tokenizer._list_to_index_dict(lst)
-    self.assertDictEqual({"test": 0, "strings": 1}, d)
-
-  def test_split_token_to_subtokens(self):
-    token = "abc"
-    subtoken_dict = {"a": 0, "b": 1, "c": 2, "ab": 3}
-    max_subtoken_length = 2
-
-    subtokens = tokenizer._split_token_to_subtokens(token, subtoken_dict,
-                                                    max_subtoken_length)
-    self.assertEqual(["ab", "c"], subtokens)
-
-  def test_generate_alphabet_dict(self):
-    s = ["testing", "123"]
-    reserved_tokens = ["???"]
-
-    alphabet = tokenizer._generate_alphabet_dict(s, reserved_tokens)
-    self.assertIn("?", alphabet)
-    self.assertIn("t", alphabet)
-    self.assertIn("e", alphabet)
-    self.assertIn("s", alphabet)
-    self.assertIn("i", alphabet)
-    self.assertIn("n", alphabet)
-    self.assertIn("g", alphabet)
-    self.assertIn("1", alphabet)
-    self.assertIn("2", alphabet)
-    self.assertIn("3", alphabet)
-
-  def test_count_and_gen_subtokens(self):
-    token_counts = {"abc": 5}
-    alphabet = set("abc_")
-    subtoken_dict = {"a": 0, "b": 1, "c": 2, "_": 3}
-    max_subtoken_length = 2
-
-    subtoken_counts = tokenizer._count_and_gen_subtokens(
-        token_counts, alphabet, subtoken_dict, max_subtoken_length)
-
-    self.assertIsInstance(subtoken_counts, collections.defaultdict)
-    self.assertDictEqual(
-        {
-            "a": 5,
-            "b": 5,
-            "c": 5,
-            "_": 5,
-            "ab": 5,
-            "bc": 5,
-            "c_": 5,
-            "abc": 5,
-            "bc_": 5,
-            "abc_": 5
-        }, subtoken_counts)
-
-  def test_filter_and_bucket_subtokens(self):
-    subtoken_counts = collections.defaultdict(int, {
-        "a": 2,
-        "b": 4,
-        "c": 1,
-        "ab": 6,
-        "ac": 3,
-        "abbc": 5
-    })
-    min_count = 3
-
-    subtoken_buckets = tokenizer._filter_and_bucket_subtokens(
-        subtoken_counts, min_count)
-
-    self.assertEqual(len(subtoken_buckets[0]), 0)
-    self.assertEqual(set("b"), subtoken_buckets[1])
-    self.assertEqual(set(["ab", "ac"]), subtoken_buckets[2])
-    self.assertEqual(len(subtoken_buckets[3]), 0)
-    self.assertEqual(set(["abbc"]), subtoken_buckets[4])
-
-  def test_gen_new_subtoken_list(self):
-    subtoken_counts = collections.defaultdict(int, {
-        "translate": 10,
-        "t": 40,
-        "tr": 16,
-        "tra": 12
-    })
-    min_count = 5
-    alphabet = set("translate")
-    reserved_tokens = ["reserved", "tokens"]
-
-    subtoken_list, max_token_length = tokenizer._gen_new_subtoken_list(
-        subtoken_counts, min_count, alphabet, reserved_tokens)
-
-    # Check that "tra" isn"t in the list (its count should be decremented to 2,
-    # so it should not be added to the canddiate list).
-    self.assertNotIn("tra", subtoken_list)
-
-    self.assertIn("tr", subtoken_list)
-    self.assertIn("t", subtoken_list)
-
-    self.assertEqual(len("translate"), max_token_length)
-
-  def test_generate_subtokens(self):
-    token_counts = {"ab": 1, "bc": 3, "abc": 5}
-    alphabet = set("abc_")
-    min_count = 100
-    num_iterations = 1
-    reserved_tokens = ["reserved", "tokens"]
-
-    vocab_list = tokenizer._generate_subtokens(token_counts, alphabet,
-                                               min_count, num_iterations,
-                                               reserved_tokens)
-
-    # Check that reserved tokens are at the front of the list
-    self.assertEqual(vocab_list[:2], reserved_tokens)
-
-    # Check that each character in alphabet is in the vocab list
-    for c in alphabet:
-      self.assertIn(c, vocab_list)
-
-
-if __name__ == "__main__":
-  tf.test.main()