ModelZoo / ResNet50_tensorflow, commit 90dd6310

Authored Dec 21, 2021 by Frederick Liu; committed by A. Unique TensorFlower, Dec 21, 2021

Internal change

PiperOrigin-RevId: 417673004
Parent: ddaca60a
Changes: 27 files in the commit. Showing 7 changed files with 1850 additions and 0 deletions (+1850, -0).
official/legacy/transformer/transformer_main_test.py (+193, -0)
official/legacy/transformer/transformer_test.py (+98, -0)
official/legacy/transformer/translate.py (+190, -0)
official/legacy/transformer/utils/__init__.py (+14, -0)
official/legacy/transformer/utils/metrics.py (+491, -0)
official/legacy/transformer/utils/tokenizer.py (+660, -0)
official/legacy/transformer/utils/tokenizer_test.py (+204, -0)
official/legacy/transformer/transformer_main_test.py (new file, mode 100644)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Test Transformer model."""
import os
import re
import sys
import unittest

from absl import flags
from absl.testing import flagsaver
import tensorflow as tf
from tensorflow.python.eager import context  # pylint: disable=ungrouped-imports

from official.legacy.transformer import misc
from official.legacy.transformer import transformer_main

FLAGS = flags.FLAGS

FIXED_TIMESTAMP = 'my_time_stamp'
WEIGHT_PATTERN = re.compile(r'weights-epoch-.+\.hdf5')


def _generate_file(filepath, lines):
  with open(filepath, 'w') as f:
    for l in lines:
      f.write('{}\n'.format(l))


class TransformerTaskTest(tf.test.TestCase):
  local_flags = None

  def setUp(self):  # pylint: disable=g-missing-super-call
    temp_dir = self.get_temp_dir()
    if TransformerTaskTest.local_flags is None:
      misc.define_transformer_flags()
      # Loads flags, array cannot be blank.
      flags.FLAGS(['foo'])
      TransformerTaskTest.local_flags = flagsaver.save_flag_values()
    else:
      flagsaver.restore_flag_values(TransformerTaskTest.local_flags)
    FLAGS.model_dir = os.path.join(temp_dir, FIXED_TIMESTAMP)
    FLAGS.param_set = 'tiny'
    FLAGS.use_synthetic_data = True
    FLAGS.steps_between_evals = 1
    FLAGS.train_steps = 1
    FLAGS.validation_steps = 1
    FLAGS.batch_size = 4
    FLAGS.max_length = 1
    FLAGS.num_gpus = 1
    FLAGS.distribution_strategy = 'off'
    FLAGS.dtype = 'fp32'
    self.model_dir = FLAGS.model_dir
    self.temp_dir = temp_dir
    self.vocab_file = os.path.join(temp_dir, 'vocab')
    self.vocab_size = misc.get_model_params(FLAGS.param_set, 0)['vocab_size']
    self.bleu_source = os.path.join(temp_dir, 'bleu_source')
    self.bleu_ref = os.path.join(temp_dir, 'bleu_ref')
    self.orig_policy = (
        tf.compat.v2.keras.mixed_precision.global_policy())

  def tearDown(self):  # pylint: disable=g-missing-super-call
    tf.compat.v2.keras.mixed_precision.set_global_policy(self.orig_policy)

  def _assert_exists(self, filepath):
    self.assertTrue(os.path.exists(filepath))

  def test_train_no_dist_strat(self):
    if context.num_gpus() >= 2:
      self.skipTest('No need to test 2+ GPUs without a distribution strategy.')
    t = transformer_main.TransformerTask(FLAGS)
    t.train()

  def test_train_save_full_model(self):
    if context.num_gpus() >= 2:
      self.skipTest('No need to test 2+ GPUs without a distribution strategy.')
    FLAGS.save_weights_only = False
    t = transformer_main.TransformerTask(FLAGS)
    t.train()

  def test_train_static_batch(self):
    if context.num_gpus() >= 2:
      self.skipTest('No need to test 2+ GPUs without a distribution strategy.')
    FLAGS.distribution_strategy = 'one_device'
    if tf.test.is_built_with_cuda():
      FLAGS.num_gpus = 1
    else:
      FLAGS.num_gpus = 0
    FLAGS.static_batch = True
    t = transformer_main.TransformerTask(FLAGS)
    t.train()

  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
  def test_train_1_gpu_with_dist_strat(self):
    FLAGS.distribution_strategy = 'one_device'
    t = transformer_main.TransformerTask(FLAGS)
    t.train()

  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
  def test_train_fp16(self):
    FLAGS.distribution_strategy = 'one_device'
    FLAGS.dtype = 'fp16'
    t = transformer_main.TransformerTask(FLAGS)
    t.train()

  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
  def test_train_2_gpu(self):
    if context.num_gpus() < 2:
      self.skipTest(
          '{} GPUs are not available for this test. {} GPUs are available'
          .format(2, context.num_gpus()))
    FLAGS.distribution_strategy = 'mirrored'
    FLAGS.num_gpus = 2
    FLAGS.param_set = 'base'
    t = transformer_main.TransformerTask(FLAGS)
    t.train()

  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
  def test_train_2_gpu_fp16(self):
    if context.num_gpus() < 2:
      self.skipTest(
          '{} GPUs are not available for this test. {} GPUs are available'
          .format(2, context.num_gpus()))
    FLAGS.distribution_strategy = 'mirrored'
    FLAGS.num_gpus = 2
    FLAGS.param_set = 'base'
    FLAGS.dtype = 'fp16'
    t = transformer_main.TransformerTask(FLAGS)
    t.train()

  def _prepare_files_and_flags(self, *extra_flags):
    # Make log dir.
    if not os.path.exists(self.temp_dir):
      os.makedirs(self.temp_dir)

    # Fake vocab, bleu_source and bleu_ref.
    tokens = [
        "'<pad>'", "'<EOS>'", "'_'", "'a'", "'b'", "'c'", "'d'", "'a_'",
        "'b_'", "'c_'", "'d_'"
    ]
    tokens += ["'{}'".format(i) for i in range(self.vocab_size - len(tokens))]
    _generate_file(self.vocab_file, tokens)
    _generate_file(self.bleu_source, ['a b', 'c d'])
    _generate_file(self.bleu_ref, ['a b', 'd c'])

    # Update flags.
    update_flags = [
        'ignored_program_name',
        '--vocab_file={}'.format(self.vocab_file),
        '--bleu_source={}'.format(self.bleu_source),
        '--bleu_ref={}'.format(self.bleu_ref),
    ]
    if extra_flags:
      update_flags.extend(extra_flags)
    FLAGS(update_flags)

  def test_predict(self):
    if context.num_gpus() >= 2:
      self.skipTest('No need to test 2+ GPUs without a distribution strategy.')
    self._prepare_files_and_flags()
    t = transformer_main.TransformerTask(FLAGS)
    t.predict()

  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
  def test_predict_fp16(self):
    if context.num_gpus() >= 2:
      self.skipTest('No need to test 2+ GPUs without a distribution strategy.')
    self._prepare_files_and_flags('--dtype=fp16')
    t = transformer_main.TransformerTask(FLAGS)
    t.predict()

  def test_eval(self):
    if context.num_gpus() >= 2:
      self.skipTest('No need to test 2+ GPUs without a distribution strategy.')
    if 'test_xla' in sys.argv[0]:
      self.skipTest('TODO(xla): Make this test faster under XLA.')
    self._prepare_files_and_flags()
    t = transformer_main.TransformerTask(FLAGS)
    t.eval()


if __name__ == '__main__':
  tf.test.main()
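Note: the flag handling in setUp() above is the standard absl flagsaver pattern: parse the flags once with a dummy argv, snapshot the parsed values, then restore that snapshot before every test so flag mutations cannot leak between tests. A minimal self-contained sketch of just that pattern (the flag name my_flag is hypothetical, not part of this commit):

from absl import flags
from absl.testing import flagsaver

flags.DEFINE_string('my_flag', 'default', 'A hypothetical example flag.')
flags.FLAGS(['program_name'])  # Parse once; the argv array cannot be blank.

saved = flagsaver.save_flag_values()    # Snapshot all current flag values.
flags.FLAGS.my_flag = 'mutated-by-a-test'
flagsaver.restore_flag_values(saved)    # Roll back, as setUp() does above.
assert flags.FLAGS.my_flag == 'default'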
official/legacy/transformer/transformer_test.py (new file, mode 100644)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Test Transformer model."""
import tensorflow as tf

from official.legacy.transformer import model_params
from official.legacy.transformer import transformer


class TransformerV2Test(tf.test.TestCase):

  def setUp(self):
    super().setUp()
    self.params = params = model_params.TINY_PARAMS
    params["batch_size"] = params["default_batch_size"] = 16
    params["use_synthetic_data"] = True
    params["hidden_size"] = 12
    params["num_hidden_layers"] = 2
    params["filter_size"] = 14
    params["num_heads"] = 2
    params["vocab_size"] = 41
    params["extra_decode_length"] = 2
    params["beam_size"] = 3
    params["dtype"] = tf.float32

  def test_create_model_train(self):
    model = transformer.create_model(self.params, True)
    inputs, outputs = model.inputs, model.outputs
    self.assertEqual(len(inputs), 2)
    self.assertEqual(len(outputs), 1)
    self.assertEqual(inputs[0].shape.as_list(), [None, None])
    self.assertEqual(inputs[0].dtype, tf.int64)
    self.assertEqual(inputs[1].shape.as_list(), [None, None])
    self.assertEqual(inputs[1].dtype, tf.int64)
    self.assertEqual(outputs[0].shape.as_list(), [None, None, 41])
    self.assertEqual(outputs[0].dtype, tf.float32)

  def test_create_model_not_train(self):
    model = transformer.create_model(self.params, False)
    inputs, outputs = model.inputs, model.outputs
    self.assertEqual(len(inputs), 1)
    self.assertEqual(len(outputs), 2)
    self.assertEqual(inputs[0].shape.as_list(), [None, None])
    self.assertEqual(inputs[0].dtype, tf.int64)
    self.assertEqual(outputs[0].shape.as_list(), [None, None])
    self.assertEqual(outputs[0].dtype, tf.int32)
    self.assertEqual(outputs[1].shape.as_list(), [None])
    self.assertEqual(outputs[1].dtype, tf.float32)

  def test_export(self):
    model = transformer.Transformer(self.params, name="transformer_v2")
    export_dir = self.get_temp_dir()
    batch_size = 5
    max_length = 6

    class SaveModule(tf.Module):

      def __init__(self, model):
        super(SaveModule, self).__init__()
        self.model = model

      @tf.function
      def serve(self, x):
        return self.model.call([x], training=False)

    save_module = SaveModule(model)
    tensor_shape = (None, None)
    sample_input = tf.zeros((batch_size, max_length), dtype=tf.int64)
    _ = save_module.serve(sample_input)
    signatures = dict(
        serving_default=save_module.serve.get_concrete_function(
            tf.TensorSpec(shape=tensor_shape, dtype=tf.int64, name="x")))
    tf.saved_model.save(save_module, export_dir, signatures=signatures)

    imported = tf.saved_model.load(export_dir)
    serving_fn = imported.signatures["serving_default"]
    all_outputs = serving_fn(sample_input)
    output = all_outputs["outputs"]
    output_shapes = output.shape.as_list()
    self.assertEqual(output_shapes[0], batch_size)
    self.assertEqual(output_shapes[1],
                     max_length + model.params["extra_decode_length"])


if __name__ == "__main__":
  tf.test.main()
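For readers unfamiliar with the export path that test_export exercises, the sequence is: wrap the model in a tf.Module, trace a tf.function serving entry point, export with tf.saved_model.save(), and reload through the "serving_default" signature. A stripped-down sketch with a toy module standing in for the Transformer (the doubling op and the export path are illustrative assumptions, not part of this commit):

import tensorflow as tf


class ToyModule(tf.Module):

  @tf.function(input_signature=[
      tf.TensorSpec(shape=[None, None], dtype=tf.int64, name="x")
  ])
  def serve(self, x):
    # Stand-in for model.call([x], training=False).
    return {"outputs": x * 2}


module = ToyModule()
export_dir = "/tmp/toy_export"  # Hypothetical path.
tf.saved_model.save(
    module, export_dir, signatures={"serving_default": module.serve})

imported = tf.saved_model.load(export_dir)
serving_fn = imported.signatures["serving_default"]
print(serving_fn(x=tf.zeros((5, 6), dtype=tf.int64))["outputs"].shape)  # (5, 6)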
official/legacy/transformer/translate.py (new file, mode 100644)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Translate text or files using trained transformer model."""
# Import libraries
from absl import logging
import numpy as np
import tensorflow as tf

from official.legacy.transformer.utils import tokenizer

_EXTRA_DECODE_LENGTH = 100
_BEAM_SIZE = 4
_ALPHA = 0.6


def _get_sorted_inputs(filename):
  """Read and sort lines from the file sorted by decreasing length.

  Args:
    filename: String name of file to read inputs from.

  Returns:
    Sorted list of inputs, and dictionary mapping original index->sorted index
    of each element.
  """
  with tf.io.gfile.GFile(filename) as f:
    records = f.read().split("\n")
    inputs = [record.strip() for record in records]
    if not inputs[-1]:
      inputs.pop()

  input_lens = [(i, len(line.split())) for i, line in enumerate(inputs)]
  sorted_input_lens = sorted(input_lens, key=lambda x: x[1], reverse=True)

  sorted_inputs = [None] * len(sorted_input_lens)
  sorted_keys = [0] * len(sorted_input_lens)
  for i, (index, _) in enumerate(sorted_input_lens):
    sorted_inputs[i] = inputs[index]
    sorted_keys[index] = i
  return sorted_inputs, sorted_keys


def _encode_and_add_eos(line, subtokenizer):
  """Encode line with subtokenizer, and add EOS id to the end."""
  return subtokenizer.encode(line) + [tokenizer.EOS_ID]


def _trim_and_decode(ids, subtokenizer):
  """Trim EOS and PAD tokens from ids, and decode to return a string."""
  try:
    index = list(ids).index(tokenizer.EOS_ID)
    return subtokenizer.decode(ids[:index])
  except ValueError:  # No EOS found in sequence
    return subtokenizer.decode(ids)


def translate_file(model,
                   params,
                   subtokenizer,
                   input_file,
                   output_file=None,
                   print_all_translations=True,
                   distribution_strategy=None):
  """Translate lines in file, and save to output file if specified.

  Args:
    model: A Keras model, used to generate the translations.
    params: A dictionary, containing the translation related parameters.
    subtokenizer: A subtokenizer object, used for encoding and decoding source
      and translated lines.
    input_file: A file containing lines to translate.
    output_file: A file that stores the generated translations.
    print_all_translations: A bool. If true, all translations are printed to
      stdout.
    distribution_strategy: A distribution strategy, used to perform inference
      directly with tf.function instead of Keras model.predict().

  Raises:
    ValueError: if output file is invalid.
  """
  batch_size = params["decode_batch_size"]

  # Read and sort inputs by length. Keep dictionary (original index-->new index
  # in sorted list) to write translations in the original order.
  sorted_inputs, sorted_keys = _get_sorted_inputs(input_file)
  total_samples = len(sorted_inputs)
  num_decode_batches = (total_samples - 1) // batch_size + 1

  def input_generator():
    """Yield encoded strings from sorted_inputs."""
    for i in range(num_decode_batches):
      lines = [
          sorted_inputs[j + i * batch_size]
          for j in range(batch_size)
          if j + i * batch_size < total_samples
      ]
      lines = [_encode_and_add_eos(l, subtokenizer) for l in lines]
      if distribution_strategy:
        for j in range(batch_size - len(lines)):
          lines.append([tokenizer.EOS_ID])
      batch = tf.keras.preprocessing.sequence.pad_sequences(
          lines,
          maxlen=params["decode_max_length"],
          dtype="int32",
          padding="post")
      logging.info("Decoding batch %d out of %d.", i, num_decode_batches)
      yield batch

  @tf.function
  def predict_step(inputs):
    """Decoding step function for TPU runs."""

    def _step_fn(inputs):
      """Per replica step function."""
      tag = inputs[0]
      val_inputs = inputs[1]
      val_outputs, _ = model([val_inputs], training=False)
      return tag, val_outputs

    return distribution_strategy.run(_step_fn, args=(inputs,))

  translations = []
  if distribution_strategy:
    num_replicas = distribution_strategy.num_replicas_in_sync
    local_batch_size = params["decode_batch_size"] // num_replicas
  for i, text in enumerate(input_generator()):
    if distribution_strategy:
      text = np.reshape(text, [num_replicas, local_batch_size, -1])
      # Add tag to the input of each replica with the reordering logic after
      # outputs, to ensure the output order matches the input order.
      text = tf.constant(text)

      @tf.function
      def text_as_per_replica():
        replica_context = tf.distribute.get_replica_context()
        replica_id = replica_context.replica_id_in_sync_group
        return replica_id, text[replica_id]  # pylint: disable=cell-var-from-loop

      text = distribution_strategy.run(text_as_per_replica)
      outputs = distribution_strategy.experimental_local_results(
          predict_step(text))
      val_outputs = [output for _, output in outputs]
      val_outputs = np.reshape(val_outputs, [params["decode_batch_size"], -1])
    else:
      val_outputs, _ = model.predict(text)

    length = len(val_outputs)
    for j in range(length):
      if j + i * batch_size < total_samples:
        translation = _trim_and_decode(val_outputs[j], subtokenizer)
        translations.append(translation)
        if print_all_translations:
          logging.info("Translating:\n\tInput: %s\n\tOutput: %s",
                       sorted_inputs[j + i * batch_size], translation)

  # Write translations in the order they appeared in the original file.
  if output_file is not None:
    if tf.io.gfile.isdir(output_file):
      raise ValueError("File output is a directory, will not save outputs to "
                       "file.")
    logging.info("Writing to file %s", output_file)
    with tf.io.gfile.GFile(output_file, "w") as f:
      for i in sorted_keys:
        f.write("%s\n" % translations[i])


def translate_from_text(model, subtokenizer, txt):
  encoded_txt = _encode_and_add_eos(txt, subtokenizer)
  result = model.predict(encoded_txt)
  outputs = result["outputs"]
  logging.info("Original: \"%s\"", txt)
  translate_from_input(outputs, subtokenizer)


def translate_from_input(outputs, subtokenizer):
  translation = _trim_and_decode(outputs, subtokenizer)
  logging.info("Translation: \"%s\"", translation)
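To make the reorder bookkeeping in _get_sorted_inputs and translate_file concrete, here is the same logic run by hand on a three-line toy input; plain Python, nothing from this commit required:

inputs = ["a", "c d e", "b f"]  # Original file order.
input_lens = [(i, len(line.split())) for i, line in enumerate(inputs)]
sorted_input_lens = sorted(input_lens, key=lambda x: x[1], reverse=True)
# sorted_input_lens == [(1, 3), (2, 2), (0, 1)]

sorted_inputs = [None] * len(inputs)
sorted_keys = [0] * len(inputs)
for i, (index, _) in enumerate(sorted_input_lens):
  sorted_inputs[i] = inputs[index]
  sorted_keys[index] = i

print(sorted_inputs)  # ['c d e', 'b f', 'a'], longest first for tight batches.
print(sorted_keys)    # [2, 0, 1]; translations[sorted_keys[k]] is line k's output.

Decoding longest-first keeps each padded batch tightly packed, and sorted_keys lets translate_file write results back in the original line order.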
official/legacy/transformer/utils/__init__.py (new file, mode 100644)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
official/legacy/transformer/utils/metrics.py (new file, mode 100644)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Functions for calculating loss, accuracy, and other model metrics.
Metrics:
- Padded loss, accuracy, and negative log perplexity. Source:
https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/metrics.py
- BLEU approximation. Source:
https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/bleu_hook.py
- ROUGE score. Source:
https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/rouge.py
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import math

import numpy as np
import six
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow.compat.v1 as tf


def _pad_tensors_to_same_length(x, y):
  """Pad x and y so that the results have the same length (second dimension)."""
  with tf.name_scope("pad_to_same_length"):
    x_length = tf.shape(x)[1]
    y_length = tf.shape(y)[1]

    max_length = tf.maximum(x_length, y_length)

    x = tf.pad(x, [[0, 0], [0, max_length - x_length], [0, 0]])
    y = tf.pad(y, [[0, 0], [0, max_length - y_length]])
    return x, y


def padded_cross_entropy_loss(logits, labels, smoothing, vocab_size):
  """Calculate cross entropy loss while ignoring padding.

  Args:
    logits: Tensor of size [batch_size, length_logits, vocab_size]
    labels: Tensor of size [batch_size, length_labels]
    smoothing: Label smoothing constant, used to determine the on and off values
    vocab_size: int size of the vocabulary

  Returns:
    Returns the cross entropy loss and weight tensors: float32 tensors with
      shape [batch_size, max(length_logits, length_labels)]
  """
  with tf.name_scope("loss", values=[logits, labels]):
    logits, labels = _pad_tensors_to_same_length(logits, labels)

    # Calculate smoothing cross entropy
    with tf.name_scope("smoothing_cross_entropy", values=[logits, labels]):
      confidence = 1.0 - smoothing
      low_confidence = (1.0 - confidence) / tf.cast(vocab_size - 1, tf.float32)
      soft_targets = tf.one_hot(
          tf.cast(labels, tf.int32),
          depth=vocab_size,
          on_value=confidence,
          off_value=low_confidence)
      xentropy = tf.nn.softmax_cross_entropy_with_logits_v2(
          logits=logits, labels=soft_targets)

      # Calculate the best (lowest) possible value of cross entropy, and
      # subtract from the cross entropy loss.
      normalizing_constant = -(
          confidence * tf.log(confidence) +
          tf.cast(vocab_size - 1, tf.float32) * low_confidence *
          tf.log(low_confidence + 1e-20))
      xentropy -= normalizing_constant

    weights = tf.cast(tf.not_equal(labels, 0), tf.float32)
    return xentropy * weights, weights


def _convert_to_eval_metric(metric_fn):
  """Wrap a metric fn that returns scores and weights as an eval metric fn.

  The input metric_fn returns values for the current batch. The wrapper
  aggregates the return values collected over all of the batches evaluated.

  Args:
    metric_fn: function that returns scores and weights for the current batch's
      logits and predicted labels.

  Returns:
    function that aggregates the scores and weights from metric_fn.
  """

  def problem_metric_fn(*args):
    """Returns an aggregation of the metric_fn's returned values."""
    (scores, weights) = metric_fn(*args)

    # The tf.metrics.mean function assures correct aggregation.
    return tf.metrics.mean(scores, weights)

  return problem_metric_fn


def get_eval_metrics(logits, labels, params):
  """Return dictionary of model evaluation metrics."""
  metrics = {
      "accuracy":
          _convert_to_eval_metric(padded_accuracy)(logits, labels),
      "accuracy_top5":
          _convert_to_eval_metric(padded_accuracy_top5)(logits, labels),
      "accuracy_per_sequence":
          _convert_to_eval_metric(padded_sequence_accuracy)(logits, labels),
      "neg_log_perplexity":
          _convert_to_eval_metric(padded_neg_log_perplexity)(
              logits, labels, params["vocab_size"]),
  }

  if not params["use_tpu"]:
    # TPU does not support tf.py_func
    metrics.update({
        "approx_bleu_score":
            _convert_to_eval_metric(bleu_score)(logits, labels),
        "rouge_2_fscore":
            _convert_to_eval_metric(rouge_2_fscore)(logits, labels),
        "rouge_L_fscore":
            _convert_to_eval_metric(rouge_l_fscore)(logits, labels),
    })

  # Prefix each of the metric names with "metrics/". This allows the metric
  # graphs to display under the "metrics" category in TensorBoard.
  metrics = {"metrics/%s" % k: v for k, v in six.iteritems(metrics)}
  return metrics


def padded_accuracy(logits, labels):
  """Percentage of times that predictions matches labels on non-0s."""
  with tf.variable_scope("padded_accuracy", values=[logits, labels]):
    logits, labels = _pad_tensors_to_same_length(logits, labels)
    weights = tf.cast(tf.not_equal(labels, 0), tf.float32)
    outputs = tf.cast(tf.argmax(logits, axis=-1), tf.int32)
    padded_labels = tf.cast(labels, tf.int32)
    return tf.cast(tf.equal(outputs, padded_labels), tf.float32), weights


def padded_accuracy_topk(logits, labels, k):
  """Percentage of times that top-k predictions matches labels on non-0s."""
  with tf.variable_scope("padded_accuracy_topk", values=[logits, labels]):
    logits, labels = _pad_tensors_to_same_length(logits, labels)
    weights = tf.cast(tf.not_equal(labels, 0), tf.float32)
    effective_k = tf.minimum(k, tf.shape(logits)[-1])
    _, outputs = tf.nn.top_k(logits, k=effective_k)
    outputs = tf.cast(outputs, tf.int32)
    padded_labels = tf.cast(labels, tf.int32)
    padded_labels = tf.expand_dims(padded_labels, axis=-1)
    padded_labels += tf.zeros_like(outputs)  # Pad to same shape.
    same = tf.cast(tf.equal(outputs, padded_labels), tf.float32)
    same_topk = tf.reduce_sum(same, axis=-1)
    return same_topk, weights


def padded_accuracy_top5(logits, labels):
  return padded_accuracy_topk(logits, labels, 5)


def padded_sequence_accuracy(logits, labels):
  """Percentage of times that predictions matches labels everywhere (non-0)."""
  with tf.variable_scope("padded_sequence_accuracy", values=[logits, labels]):
    logits, labels = _pad_tensors_to_same_length(logits, labels)
    weights = tf.cast(tf.not_equal(labels, 0), tf.float32)
    outputs = tf.cast(tf.argmax(logits, axis=-1), tf.int32)
    padded_labels = tf.cast(labels, tf.int32)
    not_correct = (
        tf.cast(tf.not_equal(outputs, padded_labels), tf.float32) * weights)
    axis = list(range(1, len(outputs.get_shape())))
    correct_seq = 1.0 - tf.minimum(1.0, tf.reduce_sum(not_correct, axis=axis))
    return correct_seq, tf.constant(1.0)


def padded_neg_log_perplexity(logits, labels, vocab_size):
  """Average log-perplexity excluding padding 0s. No smoothing."""
  num, den = padded_cross_entropy_loss(logits, labels, 0, vocab_size)
  return -num, den


def bleu_score(logits, labels):
  """Approximate BLEU score computation between labels and predictions.

  An approximate BLEU scoring method since we do not glue word pieces or
  decode the ids and tokenize the output. By default, we use ngram order of 4
  and use brevity penalty. Also, this does not have beam search.

  Args:
    logits: Tensor of size [batch_size, length_logits, vocab_size]
    labels: Tensor of size [batch_size, length_labels]

  Returns:
    bleu: float32, approx bleu score
  """
  predictions = tf.cast(tf.argmax(logits, axis=-1), tf.int32)
  # TODO: Look into removing use of py_func  # pylint: disable=g-bad-todo
  bleu = tf.py_func(compute_bleu, (labels, predictions), tf.float32)
  return bleu, tf.constant(1.0)


def _get_ngrams_with_counter(segment, max_order):
  """Extracts all n-grams up to a given maximum order from an input segment.

  Args:
    segment: text segment from which n-grams will be extracted.
    max_order: maximum length in tokens of the n-grams returned by this
      method.

  Returns:
    The Counter containing all n-grams up to max_order in segment
    with a count of how many times each n-gram occurred.
  """
  ngram_counts = collections.Counter()
  for order in xrange(1, max_order + 1):
    for i in xrange(0, len(segment) - order + 1):
      ngram = tuple(segment[i:i + order])
      ngram_counts[ngram] += 1
  return ngram_counts


def compute_bleu(reference_corpus, translation_corpus, max_order=4,
                 use_bp=True):
  """Computes BLEU score of translated segments against one or more references.

  Args:
    reference_corpus: list of references for each translation. Each reference
      should be tokenized into a list of tokens.
    translation_corpus: list of translations to score. Each translation
      should be tokenized into a list of tokens.
    max_order: Maximum n-gram order to use when computing BLEU score.
    use_bp: boolean, whether to apply brevity penalty.

  Returns:
    BLEU score.
  """
  reference_length = 0
  translation_length = 0
  bp = 1.0
  geo_mean = 0

  matches_by_order = [0] * max_order
  possible_matches_by_order = [0] * max_order
  precisions = []

  for (references, translations) in zip(reference_corpus, translation_corpus):
    reference_length += len(references)
    translation_length += len(translations)
    ref_ngram_counts = _get_ngrams_with_counter(references, max_order)
    translation_ngram_counts = _get_ngrams_with_counter(
        translations, max_order)

    overlap = dict((ngram, min(count, translation_ngram_counts[ngram]))
                   for ngram, count in ref_ngram_counts.items())

    for ngram in overlap:
      matches_by_order[len(ngram) - 1] += overlap[ngram]
    for ngram in translation_ngram_counts:
      possible_matches_by_order[len(ngram) - 1] += translation_ngram_counts[
          ngram]

  precisions = [0] * max_order
  smooth = 1.0

  for i in xrange(0, max_order):
    if possible_matches_by_order[i] > 0:
      precisions[i] = float(matches_by_order[i]) / possible_matches_by_order[i]
      if matches_by_order[i] > 0:
        precisions[i] = float(
            matches_by_order[i]) / possible_matches_by_order[i]
      else:
        smooth *= 2
        precisions[i] = 1.0 / (smooth * possible_matches_by_order[i])
    else:
      precisions[i] = 0.0

  if max(precisions) > 0:
    p_log_sum = sum(math.log(p) for p in precisions if p)
    geo_mean = math.exp(p_log_sum / max_order)

  if use_bp:
    ratio = translation_length / reference_length
    bp = math.exp(1 - 1. / ratio) if ratio < 1.0 else 1.0
  bleu = geo_mean * bp
  return np.float32(bleu)


def rouge_2_fscore(logits, labels):
  """ROUGE-2 F1 score computation between labels and predictions.

  This is an approximate ROUGE scoring method since we do not glue word pieces
  or decode the ids and tokenize the output.

  Args:
    logits: tensor, model predictions
    labels: tensor, gold output.

  Returns:
    rouge2_fscore: approx rouge-2 f1 score.
  """
  predictions = tf.cast(tf.argmax(logits, axis=-1), tf.int32)
  # TODO: Look into removing use of py_func  # pylint: disable=g-bad-todo
  rouge_2_f_score = tf.py_func(rouge_n, (predictions, labels), tf.float32)
  return rouge_2_f_score, tf.constant(1.0)


def _get_ngrams(n, text):
  """Calculates n-grams.

  Args:
    n: which n-grams to calculate
    text: An array of tokens

  Returns:
    A set of n-grams
  """
  ngram_set = set()
  text_length = len(text)
  max_index_ngram_start = text_length - n
  for i in range(max_index_ngram_start + 1):
    ngram_set.add(tuple(text[i:i + n]))
  return ngram_set


def rouge_n(eval_sentences, ref_sentences, n=2):
  """Computes ROUGE-N f1 score of two text collections of sentences.

  Source: https://www.microsoft.com/en-us/research/publication/
  rouge-a-package-for-automatic-evaluation-of-summaries/

  Args:
    eval_sentences: Predicted sentences.
    ref_sentences: Sentences from the reference set
    n: Size of ngram. Defaults to 2.

  Returns:
    f1 score for ROUGE-N
  """
  f1_scores = []
  for eval_sentence, ref_sentence in zip(eval_sentences, ref_sentences):
    eval_ngrams = _get_ngrams(n, eval_sentence)
    ref_ngrams = _get_ngrams(n, ref_sentence)
    ref_count = len(ref_ngrams)
    eval_count = len(eval_ngrams)

    # Count the overlapping ngrams between evaluated and reference
    overlapping_ngrams = eval_ngrams.intersection(ref_ngrams)
    overlapping_count = len(overlapping_ngrams)

    # Handle edge case. This isn't mathematically correct, but it's good enough
    if eval_count == 0:
      precision = 0.0
    else:
      precision = float(overlapping_count) / eval_count
    if ref_count == 0:
      recall = 0.0
    else:
      recall = float(overlapping_count) / ref_count
    f1_scores.append(2.0 * ((precision * recall) /
                            (precision + recall + 1e-8)))

  # return overlapping_count / reference_count
  return np.mean(f1_scores, dtype=np.float32)


def rouge_l_fscore(predictions, labels):
  """ROUGE scores computation between labels and predictions.

  This is an approximate ROUGE scoring method since we do not glue word pieces
  or decode the ids and tokenize the output.

  Args:
    predictions: tensor, model predictions
    labels: tensor, gold output.

  Returns:
    rouge_l_fscore: approx rouge-l f1 score.
  """
  outputs = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)
  rouge_l_f_score = tf.py_func(rouge_l_sentence_level, (outputs, labels),
                               tf.float32)
  return rouge_l_f_score, tf.constant(1.0)


def rouge_l_sentence_level(eval_sentences, ref_sentences):
  """Computes ROUGE-L (sentence level) of two collections of sentences.

  Source: https://www.microsoft.com/en-us/research/publication/
  rouge-a-package-for-automatic-evaluation-of-summaries/

  Calculated according to:
    R_lcs = LCS(X, Y) / m
    P_lcs = LCS(X, Y) / n
    F_lcs = ((1 + beta^2) * R_lcs * P_lcs) / (R_lcs + (beta^2) * P_lcs)

  where:
    X = reference summary
    Y = candidate summary
    m = length of reference summary
    n = length of candidate summary

  Args:
    eval_sentences: The sentences that have been picked by the summarizer
    ref_sentences: The sentences from the reference set

  Returns:
    A float: F_lcs
  """
  f1_scores = []
  for eval_sentence, ref_sentence in zip(eval_sentences, ref_sentences):
    m = float(len(ref_sentence))
    n = float(len(eval_sentence))
    lcs = _len_lcs(eval_sentence, ref_sentence)
    f1_scores.append(_f_lcs(lcs, m, n))
  return np.mean(f1_scores, dtype=np.float32)


def _len_lcs(x, y):
  """Returns the length of the Longest Common Subsequence between two seqs.

  Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence

  Args:
    x: sequence of words
    y: sequence of words

  Returns:
    integer: Length of LCS between x and y
  """
  table = _lcs(x, y)
  n, m = len(x), len(y)
  return table[n, m]


def _lcs(x, y):
  """Computes the length of the LCS between two seqs.

  The implementation below uses a dynamic-programming algorithm and runs
  in O(nm) time where n = len(x) and m = len(y).

  Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence

  Args:
    x: collection of words
    y: collection of words

  Returns:
    Table (dictionary) mapping each coordinate pair to its LCS length.
  """
  n, m = len(x), len(y)
  table = dict()
  for i in range(n + 1):
    for j in range(m + 1):
      if i == 0 or j == 0:
        table[i, j] = 0
      elif x[i - 1] == y[j - 1]:
        table[i, j] = table[i - 1, j - 1] + 1
      else:
        table[i, j] = max(table[i - 1, j], table[i, j - 1])
  return table


def _f_lcs(llcs, m, n):
  """Computes the LCS-based F-measure score.

  Source: http://research.microsoft.com/en-us/um/people/cyl/download/papers/
  rouge-working-note-v1.3.1.pdf

  Args:
    llcs: Length of LCS
    m: number of words in reference summary
    n: number of words in candidate summary

  Returns:
    Float. LCS-based F-measure score
  """
  r_lcs = llcs / m
  p_lcs = llcs / n
  beta = p_lcs / (r_lcs + 1e-12)
  num = (1 + (beta**2)) * r_lcs * p_lcs
  denom = r_lcs + ((beta**2) * p_lcs)
  f_lcs = num / (denom + 1e-12)
  return f_lcs
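As a quick sanity check of compute_bleu above: identical reference and hypothesis token lists score exactly 1.0 (every n-gram precision is 1 and the brevity penalty is 1), while any mismatch drops below 1.0 through the smoothed higher-order precisions. A sketch, assuming this file is importable as official.legacy.transformer.utils.metrics:

from official.legacy.transformer.utils import metrics

ref = [["the", "cat", "sat"]]

print(metrics.compute_bleu(ref, [["the", "cat", "sat"]]))  # 1.0
# Unigram precision 2/3, bigram 1/2, smoothed trigram 1/2 enter the
# geometric mean, so this prints a value below 1.0.
print(metrics.compute_bleu(ref, [["the", "cat", "ran"]]))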
official/legacy/transformer/utils/tokenizer.py (new file, mode 100644)

(Diff collapsed on the page; contents not shown.)
official/legacy/transformer/utils/tokenizer_test.py (new file, mode 100644)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Test Subtokenizer and string helper methods."""
import collections
import tempfile

import tensorflow as tf

from official.legacy.transformer.utils import tokenizer


class SubtokenizerTest(tf.test.TestCase):

  def _init_subtokenizer(self, vocab_list):
    temp_file = tempfile.NamedTemporaryFile(delete=False)
    with tf.io.gfile.GFile(temp_file.name, "w") as w:
      for subtoken in vocab_list:
        w.write("'%s'" % subtoken)
        w.write("\n")
    return tokenizer.Subtokenizer(temp_file.name, reserved_tokens=[])

  def test_encode(self):
    vocab_list = ["123_", "test", "ing_"]
    subtokenizer = self._init_subtokenizer(vocab_list)
    s = "testing 123"
    encoded_list = subtokenizer.encode(s)
    self.assertEqual([1, 2, 0], encoded_list)

  def test_decode(self):
    vocab_list = ["123_", "test", "ing_"]
    subtokenizer = self._init_subtokenizer(vocab_list)
    encoded_list = [1, 2, 0]  # testing 123
    decoded_str = subtokenizer.decode(encoded_list)
    self.assertEqual("testing 123", decoded_str)

  def test_subtoken_ids_to_tokens(self):
    vocab_list = ["123_", "test", "ing_"]
    subtokenizer = self._init_subtokenizer(vocab_list)
    encoded_list = [1, 2, 0]  # testing 123
    token_list = subtokenizer._subtoken_ids_to_tokens(encoded_list)
    self.assertEqual([u"testing", u"123"], token_list)


class StringHelperTest(tf.test.TestCase):

  def test_split_string_to_tokens(self):
    text = "test? testing 123."
    tokens = tokenizer._split_string_to_tokens(
        text, tokenizer._ALPHANUMERIC_CHAR_SET)
    self.assertEqual(["test", "? ", "testing", "123", "."], tokens)

  def test_join_tokens_to_string(self):
    tokens = ["test", "? ", "testing", "123", "."]
    s = tokenizer._join_tokens_to_string(
        tokens, tokenizer._ALPHANUMERIC_CHAR_SET)
    self.assertEqual("test? testing 123.", s)

  def test_escape_token(self):
    token = u"abc_\\4"
    alphabet = set("abc_\\u;")
    escaped_token = tokenizer._escape_token(token, alphabet)
    self.assertEqual("abc\\u\\\\\\52;_", escaped_token)

  def test_unescape_token(self):
    escaped_token = u"Underline: \\u, Backslash: \\\\, Unicode: \\52;"
    unescaped_token = tokenizer._unescape_token(escaped_token)
    self.assertEqual("Underline: _, Backslash: \\, Unicode: 4",
                     unescaped_token)

  def test_list_to_index_dict(self):
    lst = ["test", "strings"]
    d = tokenizer._list_to_index_dict(lst)
    self.assertDictEqual({"test": 0, "strings": 1}, d)

  def test_split_token_to_subtokens(self):
    token = "abc"
    subtoken_dict = {"a": 0, "b": 1, "c": 2, "ab": 3}
    max_subtoken_length = 2
    subtokens = tokenizer._split_token_to_subtokens(token, subtoken_dict,
                                                    max_subtoken_length)
    self.assertEqual(["ab", "c"], subtokens)

  def test_generate_alphabet_dict(self):
    s = ["testing", "123"]
    reserved_tokens = ["???"]
    alphabet = tokenizer._generate_alphabet_dict(s, reserved_tokens)
    self.assertIn("?", alphabet)
    self.assertIn("t", alphabet)
    self.assertIn("e", alphabet)
    self.assertIn("s", alphabet)
    self.assertIn("i", alphabet)
    self.assertIn("n", alphabet)
    self.assertIn("g", alphabet)
    self.assertIn("1", alphabet)
    self.assertIn("2", alphabet)
    self.assertIn("3", alphabet)

  def test_count_and_gen_subtokens(self):
    token_counts = {"abc": 5}
    alphabet = set("abc_")
    subtoken_dict = {"a": 0, "b": 1, "c": 2, "_": 3}
    max_subtoken_length = 2

    subtoken_counts = tokenizer._count_and_gen_subtokens(
        token_counts, alphabet, subtoken_dict, max_subtoken_length)

    self.assertIsInstance(subtoken_counts, collections.defaultdict)
    self.assertDictEqual(
        {
            "a": 5, "b": 5, "c": 5, "_": 5, "ab": 5, "bc": 5, "c_": 5,
            "abc": 5, "bc_": 5, "abc_": 5
        }, subtoken_counts)

  def test_filter_and_bucket_subtokens(self):
    subtoken_counts = collections.defaultdict(
        int, {"a": 2, "b": 4, "c": 1, "ab": 6, "ac": 3, "abbc": 5})
    min_count = 3

    subtoken_buckets = tokenizer._filter_and_bucket_subtokens(
        subtoken_counts, min_count)

    self.assertEqual(len(subtoken_buckets[0]), 0)
    self.assertEqual(set("b"), subtoken_buckets[1])
    self.assertEqual(set(["ab", "ac"]), subtoken_buckets[2])
    self.assertEqual(len(subtoken_buckets[3]), 0)
    self.assertEqual(set(["abbc"]), subtoken_buckets[4])

  def test_gen_new_subtoken_list(self):
    subtoken_counts = collections.defaultdict(
        int, {"translate": 10, "t": 40, "tr": 16, "tra": 12})
    min_count = 5
    alphabet = set("translate")
    reserved_tokens = ["reserved", "tokens"]

    subtoken_list, max_token_length = tokenizer._gen_new_subtoken_list(
        subtoken_counts, min_count, alphabet, reserved_tokens)

    # Check that "tra" isn't in the list (its count should be decremented to 2,
    # so it should not be added to the candidate list).
    self.assertNotIn("tra", subtoken_list)
    self.assertIn("tr", subtoken_list)
    self.assertIn("t", subtoken_list)
    self.assertEqual(len("translate"), max_token_length)

  def test_generate_subtokens(self):
    token_counts = {"ab": 1, "bc": 3, "abc": 5}
    alphabet = set("abc_")
    min_count = 100
    num_iterations = 1
    reserved_tokens = ["reserved", "tokens"]

    vocab_list = tokenizer._generate_subtokens(token_counts, alphabet,
                                               min_count, num_iterations,
                                               reserved_tokens)

    # Check that reserved tokens are at the front of the list.
    self.assertEqual(vocab_list[:2], reserved_tokens)
    # Check that each character in alphabet is in the vocab list.
    for c in alphabet:
      self.assertIn(c, vocab_list)


if __name__ == "__main__":
  tf.test.main()
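The greedy longest-match behavior pinned down by test_split_token_to_subtokens can be reproduced in a few lines. The following is a hedged re-implementation that mirrors the behavior the test expects, not necessarily the exact code in tokenizer.py (whose diff is collapsed above); greedy_split and its error message are illustrative only:

def greedy_split(token, subtoken_dict, max_subtoken_length):
  """Greedily split token into the longest subtokens found in the vocab."""
  subtokens = []
  start = 0
  while start < len(token):
    # Try the longest candidate first, shrinking until one is in the vocab.
    for end in range(min(len(token), start + max_subtoken_length), start, -1):
      candidate = token[start:end]
      if candidate in subtoken_dict:
        subtokens.append(candidate)
        start = end
        break
    else:  # No candidate matched, even at length 1.
      raise ValueError("Token substring not found in subtoken vocabulary.")
  return subtokens

assert greedy_split("abc", {"a": 0, "b": 1, "c": 2, "ab": 3}, 2) == ["ab", "c"]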