Add transformer model (#4148)

3fca8afe · Katherine Wu · GitHub · dea7ecf6 · 3fca8afe · 3fca8afe
Unverified Commit 3fca8afe authored May 02, 2018 by Katherine Wu Committed by GitHub May 02, 2018
4 changed files
--- a/official/transformer/utils/dataset.py
+++ b/official/transformer/utils/dataset.py
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Input pipeline for the transformer model to read, filter, and batch examples.
+
+Two things to note in the pipeline:
+
+1. Batching scheme
+
+   The examples encoded in the TFRecord files contain data in the format:
+     {"inputs": [variable length array of integers],
+      "targets": [variable length array of integers]}
+   Where integers in the arrays refer to tokens in the English and German vocab
+   file (named `vocab.ende.32768`).
+
+   Prior to batching, elements in the dataset are grouped by length (max between
+   "inputs" and "targets" length). Each group is then batched such that:
+     group_batch_size * length <= batch_size.
+
+   Another way to view batch_size is the maximum number of tokens in each batch.
+
+   Once batched, each element in the dataset will have the shape:
+     {"inputs": [group_batch_size, padded_input_length],
+      "targets": [group_batch_size, padded_target_length]}
+   Lengths are padded to the longest "inputs" or "targets" sequence in the batch
+   (padded_input_length and padded_target_length can be different).
+
+   This batching scheme decreases the fraction of padding tokens per training
+   batch, thus improving the training speed significantly.
+
+2. Shuffling
+
+   While training, the dataset is shuffled in two places in the code. The first
+   is the list of training files. Second, while reading records using
+   `parallel_interleave`, the `sloppy` argument is used to generate randomness
+   in the order of the examples.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+import tensorflow as tf
+
+# Use the number of training files as the shuffle buffer.
+_FILE_SHUFFLE_BUFFER = 100
+# Buffer size for reading records from a TFRecord file. Each training file is
+# 7.2 MB, so 8 MB allows an entire file to be kept in memory.
+_READ_RECORD_BUFFER = 8 * 1000 * 1000
+
+# Example grouping constants. Defines length boundaries for each group.
+# These values are the defaults used in Tensor2Tensor.
+_MIN_BOUNDARY = 8
+_BOUNDARY_SCALE = 1.1
+
+
+def _load_records(filename):
+  """Read file and return a dataset of tf.Examples."""
+  return tf.data.TFRecordDataset(filename, buffer_size=_READ_RECORD_BUFFER)
+
+
+def _parse_example(serialized_example):
+  """Return inputs and targets Tensors from a serialized tf.Example."""
+  data_fields = {
+      "inputs": tf.VarLenFeature(tf.int64),
+      "targets": tf.VarLenFeature(tf.int64)
+  }
+  parsed = tf.parse_single_example(serialized_example, data_fields)
+  inputs = tf.sparse_tensor_to_dense(parsed["inputs"])
+  targets = tf.sparse_tensor_to_dense(parsed["targets"])
+  return inputs, targets
+
+
+def _filter_max_length(example, max_length=256):
+  """Indicates whether the example's length is lower than the maximum length."""
+  return tf.logical_and(tf.size(example[0]) <= max_length,
+                        tf.size(example[1]) <= max_length)
+
+
+def _get_example_length(example):
+  """Returns the maximum length between the example inputs and targets."""
+  length = tf.maximum(tf.shape(example[0])[0], tf.shape(example[1])[0])
+  return length
+
+
+def _create_min_max_boundaries(
+    max_length, min_boundary=_MIN_BOUNDARY, boundary_scale=_BOUNDARY_SCALE):
+  """Create min and max boundary lists up to max_length.
+
+  For example, when max_length=24, min_boundary=4 and boundary_scale=2, the
+  returned values will be:
+    buckets_min = [0, 4, 8, 16, 24]
+    buckets_max = [4, 8, 16, 24, 25]
+
+  Args:
+    max_length: The maximum length of example in dataset.
+    min_boundary: Minimum length in boundary.
+    boundary_scale: Amount to scale consecutive boundaries in the list.
+
+  Returns:
+    min and max boundary lists
+
+  """
+  # Create bucket boundaries list by scaling the previous boundary or adding 1
+  # (to ensure increasing boundary sizes).
+  bucket_boundaries = []
+  x = min_boundary
+  while x < max_length:
+    bucket_boundaries.append(x)
+    x = max(x + 1, int(x * boundary_scale))
+
+  # Create min and max boundary lists from the initial list.
+  buckets_min = [0] + bucket_boundaries
+  buckets_max = bucket_boundaries + [max_length + 1]
+  return buckets_min, buckets_max
+
+
+def _batch_examples(dataset, batch_size, max_length):
+  """Group examples by similar lengths, and return batched dataset.
+
+  Each batch of similar-length examples are padded to the same length, and may
+  have different number of elements in each batch, such that:
+    group_batch_size * padded_length <= batch_size.
+
+  This decreases the number of padding tokens per batch, which improves the
+  training speed.
+
+  Args:
+    dataset: Dataset of unbatched examples.
+    batch_size: Max number of tokens per batch of examples.
+    max_length: Max number of tokens in an example input or target sequence.
+
+  Returns:
+    Dataset of batched examples with similar lengths.
+  """
+  # Get min and max boundary lists for each example. These are used to calculate
+  # the `bucket_id`, which is the index at which:
+  # buckets_min[bucket_id] <= len(example) < buckets_max[bucket_id]
+  # Note that using both min and max lists improves the performance.
+  buckets_min, buckets_max = _create_min_max_boundaries(max_length)
+
+  # Create list of batch sizes for each bucket_id, so that
+  # bucket_batch_size[bucket_id] * buckets_max[bucket_id] <= batch_size
+  bucket_batch_sizes = [batch_size // x for x in buckets_max]
+  # bucket_id will be a tensor, so convert this list to a tensor as well.
+  bucket_batch_sizes = tf.constant(bucket_batch_sizes, dtype=tf.int64)
+
+  def example_to_bucket_id(example_input, example_target):
+    """Return int64 bucket id for this example, calculated based on length."""
+    seq_length = _get_example_length((example_input, example_target))
+
+    # TODO: investigate whether removing code branching improves performance.
+    conditions_c = tf.logical_and(
+        tf.less_equal(buckets_min, seq_length),
+        tf.less(seq_length, buckets_max))
+    bucket_id = tf.reduce_min(tf.where(conditions_c))
+    return bucket_id
+
+  def window_size_fn(bucket_id):
+    """Return number of examples to be grouped when given a bucket id."""
+    return bucket_batch_sizes[bucket_id]
+
+  def batching_fn(bucket_id, grouped_dataset):
+    """Batch and add padding to a dataset of elements with similar lengths."""
+    bucket_batch_size = window_size_fn(bucket_id)
+
+    # Batch the dataset and add padding so that all input sequences in the
+    # examples have the same length, and all target sequences have the same
+    # lengths as well. Resulting lengths of inputs and targets can differ.
+    return grouped_dataset.padded_batch(bucket_batch_size, ([None], [None]))
+
+  return dataset.apply(tf.contrib.data.group_by_window(
+      key_func=example_to_bucket_id,
+      reduce_func=batching_fn,
+      window_size=None,
+      window_size_func=window_size_fn))
+
+
+def _read_and_batch_from_files(
+    file_pattern, batch_size, max_length, num_cpu_cores, shuffle, repeat):
+  """Create dataset where each item is a dict of "inputs" and "targets".
+
+  Args:
+    file_pattern: String used to match the input TFRecord files.
+    batch_size: Maximum number of tokens per batch of examples
+    max_length: Maximum number of tokens per example
+    num_cpu_cores: Number of cpu cores for parallel input processing.
+    shuffle: If true, randomizes order of elements.
+    repeat: Number of times to repeat the dataset. If None, the dataset is
+      repeated forever.
+
+  Returns:
+    tf.data.Dataset object containing examples loaded from the files.
+  """
+  dataset = tf.data.Dataset.list_files(file_pattern)
+
+  if shuffle:
+    # Shuffle filenames
+    dataset = dataset.shuffle(buffer_size=_FILE_SHUFFLE_BUFFER)
+
+  # Read files and interleave results. When training, the order of the examples
+  # will be non-deterministic.
+  dataset = dataset.apply(
+      tf.contrib.data.parallel_interleave(
+          _load_records, sloppy=shuffle, cycle_length=num_cpu_cores))
+
+  # Parse each tf.Example into a dictionary
+  # TODO: Look into prefetch_input_elements for performance optimization.
+  dataset = dataset.map(_parse_example,
+                        num_parallel_calls=num_cpu_cores)
+
+  # Remove examples where the input or target length exceeds the maximum length,
+  dataset = dataset.filter(lambda x, y: _filter_max_length((x, y), max_length))
+
+  # Batch such that each batch has examples of similar length.
+  dataset = _batch_examples(dataset, batch_size, max_length)
+  dataset = dataset.repeat(repeat)
+
+  # Prefetch the next element to improve speed of input pipeline.
+  dataset = dataset.prefetch(1)
+  return dataset
+
+
+def train_input_fn(params):
+  """Load and return dataset of batched examples for use during training."""
+  file_pattern = os.path.join(getattr(params, "data_dir", ""), "*train*")
+  return _read_and_batch_from_files(
+      file_pattern, params.batch_size, params.max_length, params.num_cpu_cores,
+      shuffle=True, repeat=params.repeat_dataset)
+
+
+def eval_input_fn(params):
+  """Load and return dataset of batched examples for use during evaluation."""
+  file_pattern = os.path.join(getattr(params, "data_dir", ""), "*dev*")
+  return _read_and_batch_from_files(
+      file_pattern, params.batch_size, params.max_length, params.num_cpu_cores,
+      shuffle=False, repeat=1)
--- a/official/transformer/utils/metrics.py
+++ b/official/transformer/utils/metrics.py
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functions for calculating loss, accuracy, and other model metrics.
+
+Metrics:
+ - Padded loss, accuracy, and negative log perplexity. Source:
+     https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/metrics.py
+ - BLEU approximation. Source:
+     https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/bleu_hook.py
+ - ROUGE score. Source:
+     https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/rouge.py
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import math
+
+import numpy as np
+import six
+from six.moves import xrange  # pylint: disable=redefined-builtin
+import tensorflow as tf
+
+
+def _pad_tensors_to_same_length(x, y):
+  """Pad x and y so that the results have the same length (second dimension)."""
+  with tf.name_scope("pad_to_same_length"):
+    x_length = tf.shape(x)[1]
+    y_length = tf.shape(y)[1]
+
+    max_length = tf.maximum(x_length, y_length)
+
+    x = tf.pad(x, [[0, 0], [0, max_length - x_length], [0, 0]])
+    y = tf.pad(y, [[0, 0], [0, max_length - y_length]])
+    return x, y
+
+
+def padded_cross_entropy_loss(logits, labels, smoothing, vocab_size):
+  """Calculate cross entropy loss while ignoring padding.
+
+  Args:
+    logits: Tensor of size [batch_size, length_logits, vocab_size]
+    labels: Tensor of size [batch_size, length_labels]
+    smoothing: Label smoothing constant, used to determine the on and off values
+    vocab_size: int size of the vocabulary
+  Returns:
+    Returns a float32 tensor with shape
+      [batch_size, max(length_logits, length_labels)]
+  """
+  with tf.name_scope("loss", [logits, labels]):
+    logits, labels = _pad_tensors_to_same_length(logits, labels)
+
+    # Calculate smoothing cross entropy
+    with tf.name_scope("smoothing_cross_entropy", [logits, labels]):
+      confidence = 1.0 - smoothing
+      low_confidence = (1.0 - confidence) / tf.to_float(vocab_size - 1)
+      soft_targets = tf.one_hot(
+          tf.cast(labels, tf.int32),
+          depth=vocab_size,
+          on_value=confidence,
+          off_value=low_confidence)
+      xentropy = tf.nn.softmax_cross_entropy_with_logits_v2(
+          logits=logits, labels=soft_targets)
+
+      # Calculate the best (lowest) possible value of cross entropy, and
+      # subtract from the cross entropy loss.
+      normalizing_constant = -(
+          confidence * tf.log(confidence) + tf.to_float(vocab_size - 1) *
+          low_confidence * tf.log(low_confidence + 1e-20))
+      xentropy -= normalizing_constant
+
+    weights = tf.to_float(tf.not_equal(labels, 0))
+    return xentropy * weights, weights
+
+
+def _convert_to_eval_metric(metric_fn):
+  """Wrap a metric fn that returns scores and weights as an eval metric fn.
+
+  The input metric_fn returns values for the current batch. The wrapper
+  aggregates the return values collected over all of the batches evaluated.
+
+  Args:
+    metric_fn: function that returns scores and weights for the current batch's
+      logits and predicted labels.
+
+  Returns:
+    function that aggregates the scores and weights from metric_fn.
+  """
+  def problem_metric_fn(*args):
+    """Returns an aggregation of the metric_fn's returned values."""
+    (scores, weights) = metric_fn(*args)
+
+    # The tf.metrics.mean function assures correct aggregation.
+    return tf.metrics.mean(scores, weights)
+  return problem_metric_fn
+
+
+def get_eval_metrics(logits, labels, params):
+  """Return dictionary of model evaluation metrics."""
+  metrics = {
+      "accuracy": _convert_to_eval_metric(padded_accuracy)(logits, labels),
+      "accuracy_top5": _convert_to_eval_metric(padded_accuracy_top5)(
+          logits, labels),
+      "accuracy_per_sequence": _convert_to_eval_metric(
+          padded_sequence_accuracy)(logits, labels),
+      "neg_log_perplexity": _convert_to_eval_metric(padded_neg_log_perplexity)(
+          logits, labels, params.vocab_size),
+      "approx_bleu_score": _convert_to_eval_metric(bleu_score)(logits, labels),
+      "rouge_2_fscore": _convert_to_eval_metric(rouge_2_fscore)(logits, labels),
+      "rouge_L_fscore": _convert_to_eval_metric(rouge_l_fscore)(logits, labels),
+  }
+
+  # Prefix each of the metric names with "metrics/". This allows the metric
+  # graphs to display under the "metrics" category in TensorBoard.
+  metrics = {"metrics/%s" % k: v for k, v in six.iteritems(metrics)}
+  return metrics
+
+
+def padded_accuracy(logits, labels):
+  """Percentage of times that predictions matches labels on non-0s."""
+  with tf.variable_scope("padded_accuracy", values=[logits, labels]):
+    logits, labels = _pad_tensors_to_same_length(logits, labels)
+    weights = tf.to_float(tf.not_equal(labels, 0))
+    outputs = tf.to_int32(tf.argmax(logits, axis=-1))
+    padded_labels = tf.to_int32(labels)
+    return tf.to_float(tf.equal(outputs, padded_labels)), weights
+
+
+def padded_accuracy_topk(logits, labels, k):
+  """Percentage of times that top-k predictions matches labels on non-0s."""
+  with tf.variable_scope("padded_accuracy_topk", values=[logits, labels]):
+    logits, labels = _pad_tensors_to_same_length(logits, labels)
+    weights = tf.to_float(tf.not_equal(labels, 0))
+    effective_k = tf.minimum(k, tf.shape(logits)[-1])
+    _, outputs = tf.nn.top_k(logits, k=effective_k)
+    outputs = tf.to_int32(outputs)
+    padded_labels = tf.to_int32(labels)
+    padded_labels = tf.expand_dims(padded_labels, axis=-1)
+    padded_labels += tf.zeros_like(outputs)  # Pad to same shape.
+    same = tf.to_float(tf.equal(outputs, padded_labels))
+    same_topk = tf.reduce_sum(same, axis=-1)
+    return same_topk, weights
+
+
+def padded_accuracy_top5(logits, labels):
+  return padded_accuracy_topk(logits, labels, 5)
+
+
+def padded_sequence_accuracy(logits, labels):
+  """Percentage of times that predictions matches labels everywhere (non-0)."""
+  with tf.variable_scope("padded_sequence_accuracy", values=[logits, labels]):
+    logits, labels = _pad_tensors_to_same_length(logits, labels)
+    weights = tf.to_float(tf.not_equal(labels, 0))
+    outputs = tf.to_int32(tf.argmax(logits, axis=-1))
+    padded_labels = tf.to_int32(labels)
+    not_correct = tf.to_float(tf.not_equal(outputs, padded_labels)) * weights
+    axis = list(range(1, len(outputs.get_shape())))
+    correct_seq = 1.0 - tf.minimum(1.0, tf.reduce_sum(not_correct, axis=axis))
+    return correct_seq, tf.constant(1.0)
+
+
+def padded_neg_log_perplexity(logits, labels, vocab_size):
+  """Average log-perplexity excluding padding 0s. No smoothing."""
+  num, den = padded_cross_entropy_loss(logits, labels, 0, vocab_size)
+  return -num, den
+
+
+def bleu_score(logits, labels):
+  """Approximate BLEU score computation between labels and predictions.
+
+  An approximate BLEU scoring method since we do not glue word pieces or
+  decode the ids and tokenize the output. By default, we use ngram order of 4
+  and use brevity penalty. Also, this does not have beam search.
+
+  Args:
+    logits: Tensor of size [batch_size, length_logits, vocab_size]
+    labels: Tensor of size [batch-size, length_labels]
+
+  Returns:
+    bleu: int, approx bleu score
+  """
+  predictions = tf.to_int32(tf.argmax(logits, axis=-1))
+  # TODO: Look into removing use of py_func
+  bleu = tf.py_func(compute_bleu, (labels, predictions), tf.float32)
+  return bleu, tf.constant(1.0)
+
+
+def _get_ngrams_with_counter(segment, max_order):
+  """Extracts all n-grams up to a given maximum order from an input segment.
+
+  Args:
+    segment: text segment from which n-grams will be extracted.
+    max_order: maximum length in tokens of the n-grams returned by this
+        methods.
+
+  Returns:
+    The Counter containing all n-grams upto max_order in segment
+    with a count of how many times each n-gram occurred.
+  """
+  ngram_counts = collections.Counter()
+  for order in xrange(1, max_order + 1):
+    for i in xrange(0, len(segment) - order + 1):
+      ngram = tuple(segment[i:i + order])
+      ngram_counts[ngram] += 1
+  return ngram_counts
+
+
+def compute_bleu(reference_corpus, translation_corpus, max_order=4,
+                 use_bp=True):
+  """Computes BLEU score of translated segments against one or more references.
+
+  Args:
+    reference_corpus: list of references for each translation. Each
+        reference should be tokenized into a list of tokens.
+    translation_corpus: list of translations to score. Each translation
+        should be tokenized into a list of tokens.
+    max_order: Maximum n-gram order to use when computing BLEU score.
+    use_bp: boolean, whether to apply brevity penalty.
+
+  Returns:
+    BLEU score.
+  """
+  reference_length = 0
+  translation_length = 0
+  bp = 1.0
+  geo_mean = 0
+
+  matches_by_order = [0] * max_order
+  possible_matches_by_order = [0] * max_order
+  precisions = []
+
+  for (references, translations) in zip(reference_corpus, translation_corpus):
+    reference_length += len(references)
+    translation_length += len(translations)
+    ref_ngram_counts = _get_ngrams_with_counter(references, max_order)
+    translation_ngram_counts = _get_ngrams_with_counter(translations, max_order)
+
+    overlap = dict((ngram,
+                    min(count, translation_ngram_counts[ngram]))
+                   for ngram, count in ref_ngram_counts.items())
+
+    for ngram in overlap:
+      matches_by_order[len(ngram) - 1] += overlap[ngram]
+    for ngram in translation_ngram_counts:
+      possible_matches_by_order[len(ngram) - 1] += translation_ngram_counts[
+          ngram]
+
+  precisions = [0] * max_order
+  smooth = 1.0
+
+  for i in xrange(0, max_order):
+    if possible_matches_by_order[i] > 0:
+      precisions[i] = float(matches_by_order[i]) / possible_matches_by_order[i]
+      if matches_by_order[i] > 0:
+        precisions[i] = float(matches_by_order[i]) / possible_matches_by_order[
+            i]
+      else:
+        smooth *= 2
+        precisions[i] = 1.0 / (smooth * possible_matches_by_order[i])
+    else:
+      precisions[i] = 0.0
+
+  if max(precisions) > 0:
+    p_log_sum = sum(math.log(p) for p in precisions if p)
+    geo_mean = math.exp(p_log_sum / max_order)
+
+  if use_bp:
+    ratio = translation_length / reference_length
+    bp = math.exp(1 - 1. / ratio) if ratio < 1.0 else 1.0
+  bleu = geo_mean * bp
+  return np.float32(bleu)
+
+
+def rouge_2_fscore(logits, labels):
+  """ROUGE-2 F1 score computation between labels and predictions.
+
+  This is an approximate ROUGE scoring method since we do not glue word pieces
+  or decode the ids and tokenize the output.
+
+  Args:
+    logits: tensor, model predictions
+    labels: tensor, gold output.
+
+  Returns:
+    rouge2_fscore: approx rouge-2 f1 score.
+  """
+  predictions = tf.to_int32(tf.argmax(logits, axis=-1))
+  # TODO: Look into removing use of py_func
+  rouge_2_f_score = tf.py_func(rouge_n, (predictions, labels), tf.float32)
+  return rouge_2_f_score, tf.constant(1.0)
+
+
+def _get_ngrams(n, text):
+  """Calculates n-grams.
+
+  Args:
+    n: which n-grams to calculate
+    text: An array of tokens
+
+  Returns:
+    A set of n-grams
+  """
+  ngram_set = set()
+  text_length = len(text)
+  max_index_ngram_start = text_length - n
+  for i in range(max_index_ngram_start + 1):
+    ngram_set.add(tuple(text[i:i + n]))
+  return ngram_set
+
+
+def rouge_n(eval_sentences, ref_sentences, n=2):
+  """Computes ROUGE-N f1 score of two text collections of sentences.
+
+  Source: https://www.microsoft.com/en-us/research/publication/
+  rouge-a-package-for-automatic-evaluation-of-summaries/
+
+  Args:
+    eval_sentences: Predicted sentences.
+    ref_sentences: Sentences from the reference set
+    n: Size of ngram.  Defaults to 2.
+
+  Returns:
+    f1 score for ROUGE-N
+  """
+  f1_scores = []
+  for eval_sentence, ref_sentence in zip(eval_sentences, ref_sentences):
+    eval_ngrams = _get_ngrams(n, eval_sentence)
+    ref_ngrams = _get_ngrams(n, ref_sentence)
+    ref_count = len(ref_ngrams)
+    eval_count = len(eval_ngrams)
+
+    # Count the overlapping ngrams between evaluated and reference
+    overlapping_ngrams = eval_ngrams.intersection(ref_ngrams)
+    overlapping_count = len(overlapping_ngrams)
+
+    # Handle edge case. This isn't mathematically correct, but it's good enough
+    if eval_count == 0:
+      precision = 0.0
+    else:
+      precision = float(overlapping_count) / eval_count
+    if ref_count == 0:
+      recall = 0.0
+    else:
+      recall = float(overlapping_count) / ref_count
+    f1_scores.append(2.0 * ((precision * recall) / (precision + recall + 1e-8)))
+
+  # return overlapping_count / reference_count
+  return np.mean(f1_scores, dtype=np.float32)
+
+
+def rouge_l_fscore(predictions, labels):
+  """ROUGE scores computation between labels and predictions.
+
+  This is an approximate ROUGE scoring method since we do not glue word pieces
+  or decode the ids and tokenize the output.
+
+  Args:
+    predictions: tensor, model predictions
+    labels: tensor, gold output.
+
+  Returns:
+    rouge_l_fscore: approx rouge-l f1 score.
+  """
+  outputs = tf.to_int32(tf.argmax(predictions, axis=-1))
+  rouge_l_f_score = tf.py_func(rouge_l_sentence_level, (outputs, labels),
+                               tf.float32)
+  return rouge_l_f_score, tf.constant(1.0)
+
+
+def rouge_l_sentence_level(eval_sentences, ref_sentences):
+  """Computes ROUGE-L (sentence level) of two collections of sentences.
+
+  Source: https://www.microsoft.com/en-us/research/publication/
+  rouge-a-package-for-automatic-evaluation-of-summaries/
+
+  Calculated according to:
+  R_lcs = LCS(X,Y)/m
+  P_lcs = LCS(X,Y)/n
+  F_lcs = ((1 + beta^2)*R_lcs*P_lcs) / (R_lcs + (beta^2) * P_lcs)
+
+  where:
+  X = reference summary
+  Y = Candidate summary
+  m = length of reference summary
+  n = length of candidate summary
+
+  Args:
+    eval_sentences: The sentences that have been picked by the summarizer
+    ref_sentences: The sentences from the reference set
+
+  Returns:
+    A float: F_lcs
+  """
+
+  f1_scores = []
+  for eval_sentence, ref_sentence in zip(eval_sentences, ref_sentences):
+    m = float(len(ref_sentence))
+    n = float(len(eval_sentence))
+    lcs = _len_lcs(eval_sentence, ref_sentence)
+    f1_scores.append(_f_lcs(lcs, m, n))
+  return np.mean(f1_scores, dtype=np.float32)
+
+
+def _len_lcs(x, y):
+  """Returns the length of the Longest Common Subsequence between two seqs.
+
+  Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence
+
+  Args:
+    x: sequence of words
+    y: sequence of words
+
+  Returns
+    integer: Length of LCS between x and y
+  """
+  table = _lcs(x, y)
+  n, m = len(x), len(y)
+  return table[n, m]
+
+
+def _lcs(x, y):
+  """Computes the length of the LCS between two seqs.
+
+  The implementation below uses a DP programming algorithm and runs
+  in O(nm) time where n = len(x) and m = len(y).
+  Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence
+
+  Args:
+    x: collection of words
+    y: collection of words
+
+  Returns:
+    Table of dictionary of coord and len lcs
+  """
+  n, m = len(x), len(y)
+  table = dict()
+  for i in range(n + 1):
+    for j in range(m + 1):
+      if i == 0 or j == 0:
+        table[i, j] = 0
+      elif x[i - 1] == y[j - 1]:
+        table[i, j] = table[i - 1, j - 1] + 1
+      else:
+        table[i, j] = max(table[i - 1, j], table[i, j - 1])
+  return table
+
+
+def _f_lcs(llcs, m, n):
+  """Computes the LCS-based F-measure score.
+
+  Source: http://research.microsoft.com/en-us/um/people/cyl/download/papers/
+  rouge-working-note-v1.3.1.pdf
+
+  Args:
+    llcs: Length of LCS
+    m: number of words in reference summary
+    n: number of words in candidate summary
+
+  Returns:
+    Float. LCS-based F-measure score
+  """
+  r_lcs = llcs / m
+  p_lcs = llcs / n
+  beta = p_lcs / (r_lcs + 1e-12)
+  num = (1 + (beta ** 2)) * r_lcs * p_lcs
+  denom = r_lcs + ((beta ** 2) * p_lcs)
+  f_lcs = num / (denom + 1e-12)
+  return f_lcs
--- a/official/transformer/utils/tokenizer.py
+++ b/official/transformer/utils/tokenizer.py
--- a/official/transformer/utils/tokenizer_test.py
+++ b/official/transformer/utils/tokenizer_test.py
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Test Subtokenizer and string helper methods."""
+
+import collections
+import tempfile
+import unittest
+
+import tensorflow as tf  # pylint: disable=g-bad-import-order
+
+from official.transformer.utils import tokenizer
+
+
+class SubtokenizerTest(unittest.TestCase):
+
+  def _init_subtokenizer(self, vocab_list):
+    temp_file = tempfile.NamedTemporaryFile(delete=False)
+    with tf.gfile.Open(temp_file.name, 'w') as w:
+      for subtoken in vocab_list:
+        w.write("'%s'" % subtoken)
+        w.write("\n")
+    return tokenizer.Subtokenizer(temp_file.name, reserved_tokens=[])
+
+  def test_encode(self):
+    vocab_list = ["123_", "test", "ing_"]
+    subtokenizer = self._init_subtokenizer(vocab_list)
+    s = "testing 123"
+    encoded_list = subtokenizer.encode(s)
+    self.assertEqual([1, 2, 0], encoded_list)
+
+  def test_decode(self):
+    vocab_list = ["123_", "test", "ing_"]
+    subtokenizer = self._init_subtokenizer(vocab_list)
+    encoded_list = [1, 2, 0]  # testing 123
+    decoded_str = subtokenizer.decode(encoded_list)
+    self.assertEqual("testing 123", decoded_str)
+
+  def test_subtoken_ids_to_tokens(self):
+    vocab_list = ["123_", "test", "ing_"]
+    subtokenizer = self._init_subtokenizer(vocab_list)
+    encoded_list = [1, 2, 0]  # testing 123
+    token_list = subtokenizer._subtoken_ids_to_tokens(encoded_list)
+    self.assertEqual([u"testing", u"123"], token_list)
+
+
+class StringHelperTest(unittest.TestCase):
+
+  def test_split_string_to_tokens(self):
+    text = "test? testing 123."
+
+    tokens = tokenizer._split_string_to_tokens(text)
+    self.assertEqual(["test", "? ", "testing", "123", "."], tokens)
+
+  def test_join_tokens_to_string(self):
+    tokens = ["test", "? ", "testing", "123", "."]
+
+    s = tokenizer._join_tokens_to_string(tokens)
+    self.assertEqual("test? testing 123.", s)
+
+  def test_escape_token(self):
+    token = u"abc_\\4"
+    alphabet = set("abc_\\u;")
+
+    escaped_token = tokenizer._escape_token(token, alphabet)
+    self.assertEqual("abc\\u\\\\\\52;_", escaped_token)
+
+  def test_unescape_token(self):
+    escaped_token = u"Underline: \\u, Backslash: \\\\, Unicode: \\52;"
+
+    unescaped_token = tokenizer._unescape_token(escaped_token)
+    self.assertEqual(
+        "Underline: _, Backslash: \\, Unicode: 4", unescaped_token)
+
+  def test_list_to_index_dict(self):
+    lst = ["test", "strings"]
+
+    d = tokenizer._list_to_index_dict(lst)
+    self.assertDictEqual({"test": 0, "strings": 1}, d)
+
+  def test_split_token_to_subtokens(self):
+    token = "abc"
+    subtoken_dict = {"a": 0, "b": 1, "c": 2, "ab": 3}
+    max_subtoken_length = 2
+
+    subtokens = tokenizer._split_token_to_subtokens(
+        token, subtoken_dict, max_subtoken_length)
+    self.assertEqual(["ab", "c"], subtokens)
+
+  def test_generate_alphabet_dict(self):
+    s = ["testing", "123"]
+    reserved_tokens = ["???"]
+
+    alphabet = tokenizer._generate_alphabet_dict(s, reserved_tokens)
+    self.assertIn("?", alphabet)
+    self.assertIn("t", alphabet)
+    self.assertIn("e", alphabet)
+    self.assertIn("s", alphabet)
+    self.assertIn("i", alphabet)
+    self.assertIn("n", alphabet)
+    self.assertIn("g", alphabet)
+    self.assertIn("1", alphabet)
+    self.assertIn("2", alphabet)
+    self.assertIn("3", alphabet)
+
+  def test_count_and_gen_subtokens(self):
+    token_counts = {"abc": 5}
+    alphabet = set("abc_")
+    subtoken_dict = {"a": 0, "b": 1, "c": 2, "_": 3}
+    max_subtoken_length = 2
+
+    subtoken_counts = tokenizer._count_and_gen_subtokens(
+        token_counts, alphabet, subtoken_dict, max_subtoken_length)
+
+    self.assertIsInstance(subtoken_counts, collections.defaultdict)
+    self.assertDictEqual(
+        {"a": 5, "b": 5, "c": 5, "_": 5, "ab": 5, "bc": 5, "c_": 5,
+         "abc": 5, "bc_": 5, "abc_": 5}, subtoken_counts)
+
+  def test_filter_and_bucket_subtokens(self):
+    subtoken_counts = collections.defaultdict(
+        int, {"a": 2, "b": 4, "c": 1, "ab": 6, "ac": 3, "abbc": 5})
+    min_count = 3
+
+    subtoken_buckets = tokenizer._filter_and_bucket_subtokens(
+        subtoken_counts, min_count)
+
+    self.assertEqual(len(subtoken_buckets[0]), 0)
+    self.assertEqual(set("b"), subtoken_buckets[1])
+    self.assertEqual(set(["ab", "ac"]), subtoken_buckets[2])
+    self.assertEqual(len(subtoken_buckets[3]), 0)
+    self.assertEqual(set(["abbc"]), subtoken_buckets[4])
+
+  def test_gen_new_subtoken_list(self):
+    subtoken_counts = collections.defaultdict(
+        int, {"translate": 10, "t": 40, "tr": 16, "tra": 12})
+    min_count = 5
+    alphabet = set("translate")
+    reserved_tokens = ["reserved", "tokens"]
+
+    subtoken_list, max_token_length = tokenizer._gen_new_subtoken_list(
+        subtoken_counts, min_count, alphabet, reserved_tokens)
+
+    # Check that "tra" isn"t in the list (its count should be decremented to 2,
+    # so it should not be added to the canddiate list).
+    self.assertNotIn("tra", subtoken_list)
+
+    self.assertIn("tr", subtoken_list)
+    self.assertIn("t", subtoken_list)
+
+    self.assertEqual(len("translate"), max_token_length)
+
+  def test_generate_subtokens(self):
+    token_counts = {"ab": 1, "bc": 3, "abc": 5}
+    alphabet = set("abc_")
+    min_count = 100
+    num_iterations = 1
+    reserved_tokens = ["reserved", "tokens"]
+
+    vocab_list = tokenizer._generate_subtokens(
+        token_counts, alphabet, min_count, num_iterations, reserved_tokens)
+
+    # Check that reserved tokens are at the front of the list
+    self.assertEqual(vocab_list[:2], reserved_tokens)
+
+    # Check that each character in alphabet is in the vocab list
+    for c in alphabet:
+      self.assertIn(c, vocab_list)
+
+
+if __name__ == "__main__":
+  unittest.main()