Add transformer model (#4148)

3fca8afe · Katherine Wu · GitHub · dea7ecf6 · 3fca8afe · 3fca8afe
Unverified Commit 3fca8afe authored May 02, 2018 by Katherine Wu Committed by GitHub May 02, 2018
4 changed files
--- a/official/transformer/utils/dataset.py
+++ b/official/transformer/utils/dataset.py
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Input pipeline for the transformer model to read, filter, and batch examples.
+Two things to note in the pipeline:
+1. Batching scheme
+   The examples encoded in the TFRecord files contain data in the format:
+     {"inputs": [variable length array of integers],
+      "targets": [variable length array of integers]}
+   Where integers in the arrays refer to tokens in the English and German vocab
+   file (named `vocab.ende.32768`).
+   Prior to batching, elements in the dataset are grouped by length (max between
+   "inputs" and "targets" length). Each group is then batched such that:
+     group_batch_size * length <= batch_size.
+   Another way to view batch_size is the maximum number of tokens in each batch.
+   Once batched, each element in the dataset will have the shape:
+     {"inputs": [group_batch_size, padded_input_length],
+      "targets": [group_batch_size, padded_target_length]}
+   Lengths are padded to the longest "inputs" or "targets" sequence in the batch
+   (padded_input_length and padded_target_length can be different).
+   This batching scheme decreases the fraction of padding tokens per training
+   batch, thus improving the training speed significantly.
+2. Shuffling
+   While training, the dataset is shuffled in two places in the code. The first
+   is the list of training files. Second, while reading records using
+   `parallel_interleave`, the `sloppy` argument is used to generate randomness
+   in the order of the examples.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import os
+import tensorflow as tf
+# Buffer size used when shuffling the list of input file names. Use the number
+# of training files as the shuffle buffer.
+_FILE_SHUFFLE_BUFFER = 100
+# Buffer size for reading records from a TFRecord file. Each training file is
+# 7.2 MB, so 8 MB allows an entire file to be kept in memory.
+_READ_RECORD_BUFFER = 8 * 1000 * 1000
+# Example grouping constants. Defines length boundaries for each group.
+# These values are the defaults used in Tensor2Tensor.
+# _MIN_BOUNDARY is the first non-zero bucket boundary and _BOUNDARY_SCALE is
+# the factor used to derive each following boundary from the previous one.
+_MIN_BOUNDARY = 8
+_BOUNDARY_SCALE = 1.1
+def _load_records(filename):
+  """Open a TFRecord file as a dataset of serialized tf.Example records."""
+  records = tf.data.TFRecordDataset(
+      filename, buffer_size=_READ_RECORD_BUFFER)
+  return records
+def _parse_example(serialized_example):
+  """Decode one serialized tf.Example into dense inputs and targets Tensors.
+  Args:
+    serialized_example: Serialized tf.Example holding variable-length int64
+      "inputs" and "targets" features.
+  Returns:
+    Tuple (inputs, targets) of dense Tensors.
+  """
+  feature_spec = {
+      "inputs": tf.VarLenFeature(tf.int64),
+      "targets": tf.VarLenFeature(tf.int64)
+  }
+  features = tf.parse_single_example(serialized_example, feature_spec)
+  # The features are parsed as sparse tensors; densify them before returning.
+  dense_inputs = tf.sparse_tensor_to_dense(features["inputs"])
+  dense_targets = tf.sparse_tensor_to_dense(features["targets"])
+  return dense_inputs, dense_targets
+def _filter_max_length(example, max_length=256):
+  """Indicates whether both sequences of the example fit within max_length.
+  Args:
+    example: Pair whose first item is the inputs Tensor and whose second item
+      is the targets Tensor.
+    max_length: Largest number of elements allowed in either Tensor
+      (inclusive).
+  Returns:
+    Boolean scalar Tensor that is true when neither Tensor has more than
+    max_length elements.
+  """
+  inputs, targets = example[0], example[1]
+  inputs_fit = tf.size(inputs) <= max_length
+  targets_fit = tf.size(targets) <= max_length
+  return tf.logical_and(inputs_fit, targets_fit)
+def _get_example_length(example):
+  """Returns the larger of the first-dimension sizes of inputs and targets."""
+  inputs_length = tf.shape(example[0])[0]
+  targets_length = tf.shape(example[1])[0]
+  return tf.maximum(inputs_length, targets_length)
+def _create_min_max_boundaries(
+    max_length, min_boundary=_MIN_BOUNDARY, boundary_scale=_BOUNDARY_SCALE):
+  """Create min and max boundary lists up to max_length.
+  Boundaries start at min_boundary and are scaled by boundary_scale (growing by
+  at least 1 per step) for as long as they stay strictly below max_length.
+  For example, when max_length=24, min_boundary=4 and boundary_scale=2, the
+  returned values will be:
+    buckets_min = [0, 4, 8, 16]
+    buckets_max = [4, 8, 16, 25]
+  so bucket i covers the lengths buckets_min[i] <= length < buckets_max[i], and
+  the last bucket includes max_length itself.
+  When max_length <= min_boundary there is a single bucket:
+    buckets_min = [0], buckets_max = [max_length + 1].
+  Args:
+    max_length: The maximum length of example in dataset.
+    min_boundary: Minimum length in boundary.
+    boundary_scale: Amount to scale consecutive boundaries in the list.
+  Returns:
+    min and max boundary lists
+  """
+  # Create bucket boundaries list by scaling the previous boundary or adding 1
+  # (to ensure increasing boundary sizes).
+  bucket_boundaries = []
+  x = min_boundary
+  while x < max_length:
+    bucket_boundaries.append(x)
+    x = max(x + 1, int(x * boundary_scale))
+  # Create min and max boundary lists from the initial list. The final upper
+  # bound is max_length + 1 because upper bounds are exclusive.
+  buckets_min = [0] + bucket_boundaries
+  buckets_max = bucket_boundaries + [max_length + 1]
+  return buckets_min, buckets_max
+def _batch_examples(dataset, batch_size, max_length):
+  """Bucket examples by length and emit padded batches per bucket.
+  Examples whose lengths fall in the same bucket are batched together and
+  padded to a common length. The number of examples per batch depends on the
+  bucket, chosen so that:
+    group_batch_size * padded_length <= batch_size.
+  Grouping similar lengths keeps the amount of padding per batch small.
+  Args:
+    dataset: Dataset of unbatched (inputs, targets) examples.
+    batch_size: Max number of tokens per batch of examples.
+    max_length: Max number of tokens in an example input or target sequence.
+  Returns:
+    Dataset of batched examples with similar lengths.
+  """
+  # Bucket i holds examples with lower_bounds[i] <= length < upper_bounds[i].
+  lower_bounds, upper_bounds = _create_min_max_boundaries(max_length)
+  # Examples per batch for every bucket, stored as a tensor so that it can be
+  # indexed with the (tensor) bucket id:
+  # sizes_per_bucket[i] * upper_bounds[i] <= batch_size
+  sizes_per_bucket = tf.constant(
+      [batch_size // upper for upper in upper_bounds], dtype=tf.int64)
+  def example_to_bucket_id(example_input, example_target):
+    """Return int64 id of the bucket that contains this example's length."""
+    seq_length = _get_example_length((example_input, example_target))
+    # TODO: investigate whether removing code branching improves performance.
+    at_or_above_min = tf.less_equal(lower_bounds, seq_length)
+    below_max = tf.less(seq_length, upper_bounds)
+    in_bucket = tf.logical_and(at_or_above_min, below_max)
+    return tf.reduce_min(tf.where(in_bucket))
+  def window_size_fn(bucket_id):
+    """Return how many examples form one batch for the given bucket id."""
+    return sizes_per_bucket[bucket_id]
+  def batching_fn(bucket_id, grouped_dataset):
+    """Pad and batch a window of examples that share a bucket."""
+    # Inputs are padded to a common length and so are targets; the two padded
+    # lengths are independent of each other.
+    padded_shapes = ([None], [None])
+    return grouped_dataset.padded_batch(
+        window_size_fn(bucket_id), padded_shapes)
+  group_by_length = tf.contrib.data.group_by_window(
+      key_func=example_to_bucket_id,
+      reduce_func=batching_fn,
+      window_size=None,
+      window_size_func=window_size_fn)
+  return dataset.apply(group_by_length)
+def _read_and_batch_from_files(
+    file_pattern, batch_size, max_length, num_cpu_cores, shuffle, repeat):
+  """Create dataset of padded (inputs, targets) batches read from TFRecords.
+  Args:
+    file_pattern: String used to match the input TFRecord files.
+    batch_size: Maximum number of tokens per batch of examples
+    max_length: Maximum number of tokens per example
+    num_cpu_cores: Number of cpu cores for parallel input processing.
+    shuffle: If true, randomizes order of elements.
+    repeat: Number of times to repeat the dataset. If None, the dataset is
+      repeated forever.
+  Returns:
+    tf.data.Dataset object containing examples loaded from the files.
+  """
+  def within_max_length(inputs, targets):
+    """Keep only examples whose inputs and targets fit in max_length."""
+    return _filter_max_length((inputs, targets), max_length)
+  filenames = tf.data.Dataset.list_files(file_pattern)
+  if shuffle:
+    # Randomize the order in which files are visited.
+    filenames = filenames.shuffle(buffer_size=_FILE_SHUFFLE_BUFFER)
+  # Read several files at once and interleave their records. With sloppy
+  # interleaving (enabled when shuffling) the record order is
+  # non-deterministic.
+  interleave_files = tf.contrib.data.parallel_interleave(
+      _load_records, sloppy=shuffle, cycle_length=num_cpu_cores)
+  records = filenames.apply(interleave_files)
+  # Parse each tf.Example into an (inputs, targets) pair.
+  # TODO: Look into prefetch_input_elements for performance optimization.
+  examples = records.map(_parse_example, num_parallel_calls=num_cpu_cores)
+  # Drop examples where the input or target length exceeds the maximum length.
+  examples = examples.filter(within_max_length)
+  # Batch such that each batch has examples of similar length.
+  batches = _batch_examples(examples, batch_size, max_length)
+  batches = batches.repeat(repeat)
+  # Prefetch the next element to improve speed of input pipeline.
+  return batches.prefetch(1)
+def train_input_fn(params):
+  """Build the shuffled training dataset from the "*train*" files.
+  Args:
+    params: Object providing batch_size, max_length, num_cpu_cores and
+      repeat_dataset attributes, and optionally data_dir (defaults to "").
+  Returns:
+    Dataset of batched training examples.
+  """
+  data_dir = getattr(params, "data_dir", "")
+  train_files = os.path.join(data_dir, "*train*")
+  return _read_and_batch_from_files(
+      train_files,
+      params.batch_size,
+      params.max_length,
+      params.num_cpu_cores,
+      shuffle=True,
+      repeat=params.repeat_dataset)
+def eval_input_fn(params):
+  """Build the unshuffled, single-pass evaluation dataset from "*dev*" files.
+  Args:
+    params: Object providing batch_size, max_length and num_cpu_cores
+      attributes, and optionally data_dir (defaults to "").
+  Returns:
+    Dataset of batched evaluation examples.
+  """
+  data_dir = getattr(params, "data_dir", "")
+  dev_files = os.path.join(data_dir, "*dev*")
+  return _read_and_batch_from_files(
+      dev_files,
+      params.batch_size,
+      params.max_length,
+      params.num_cpu_cores,
+      shuffle=False,
+      repeat=1)
--- a/official/transformer/utils/metrics.py
+++ b/official/transformer/utils/metrics.py
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functions for calculating loss, accuracy, and other model metrics.
+Metrics:
+ - Padded loss, accuracy, and negative log perplexity. Source:
+     https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/metrics.py
+ - BLEU approximation. Source:
+     https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/bleu_hook.py
+ - ROUGE score. Source:
+     https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/rouge.py
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import collections
+import math
+import numpy as np
+import six
+from six.moves import xrange  # pylint: disable=redefined-builtin
+import tensorflow as tf
+def _pad_tensors_to_same_length(x, y):
+  """Zero-pad x and y along the second dimension up to a common length.
+  Args:
+    x: Tensor padded with a three-dimension paddings spec (rank 3).
+    y: Tensor padded with a two-dimension paddings spec (rank 2).
+  Returns:
+    Tuple (x, y) whose second dimensions both equal the larger of the two
+    original lengths.
+  """
+  with tf.name_scope("pad_to_same_length"):
+    x_length = tf.shape(x)[1]
+    y_length = tf.shape(y)[1]
+    target_length = tf.maximum(x_length, y_length)
+    # Padding is only appended at the end of the length dimension.
+    x_paddings = [[0, 0], [0, target_length - x_length], [0, 0]]
+    padded_x = tf.pad(x, x_paddings)
+    y_paddings = [[0, 0], [0, target_length - y_length]]
+    padded_y = tf.pad(y, y_paddings)
+    return padded_x, padded_y
+def padded_cross_entropy_loss(logits, labels, smoothing, vocab_size):
+  """Calculate cross entropy loss while ignoring padding.
+  Args:
+    logits: Tensor of size [batch_size, length_logits, vocab_size]
+    labels: Tensor of size [batch_size, length_labels]
+    smoothing: Label smoothing constant, used to determine the on and off values
+    vocab_size: int size of the vocabulary
+  Returns:
+    Tuple (weighted_xentropy, weights) of float32 tensors, each with shape
+      [batch_size, max(length_logits, length_labels)].
+    weights is 1.0 where the (padded) label is non-zero and 0.0 elsewhere, and
+    weighted_xentropy is the smoothed cross entropy multiplied by weights.
+  """
+  # The tensors must be passed through the `values` keyword: the second
+  # positional argument of tf.name_scope is `default_name`, not `values`.
+  with tf.name_scope("loss", values=[logits, labels]):
+    logits, labels = _pad_tensors_to_same_length(logits, labels)
+    # Calculate smoothing cross entropy
+    with tf.name_scope("smoothing_cross_entropy", values=[logits, labels]):
+      confidence = 1.0 - smoothing
+      # The remaining probability mass is spread evenly over the other
+      # vocab_size - 1 classes.
+      low_confidence = (1.0 - confidence) / tf.to_float(vocab_size - 1)
+      soft_targets = tf.one_hot(
+          tf.cast(labels, tf.int32),
+          depth=vocab_size,
+          on_value=confidence,
+          off_value=low_confidence)
+      xentropy = tf.nn.softmax_cross_entropy_with_logits_v2(
+          logits=logits, labels=soft_targets)
+      # Calculate the best (lowest) possible value of cross entropy, and
+      # subtract from the cross entropy loss. The 1e-20 keeps the log finite
+      # when low_confidence is 0 (no smoothing).
+      normalizing_constant = -(
+          confidence * tf.log(confidence) + tf.to_float(vocab_size - 1) *
+          low_confidence * tf.log(low_confidence + 1e-20))
+      xentropy -= normalizing_constant
+    # Label id 0 marks padding; those positions get zero weight.
+    weights = tf.to_float(tf.not_equal(labels, 0))
+    return xentropy * weights, weights
+def _convert_to_eval_metric(metric_fn):
+  """Turn a per-batch (scores, weights) metric fn into an eval metric fn.
+  metric_fn only describes the current batch. The returned wrapper feeds its
+  result to tf.metrics.mean so that values are aggregated over every batch
+  that is evaluated.
+  Args:
+    metric_fn: function that returns scores and weights for the current batch's
+      logits and predicted labels.
+  Returns:
+    function that aggregates the scores and weights from metric_fn.
+  """
+  def problem_metric_fn(*args):
+    """Returns the weighted mean of metric_fn's scores across batches."""
+    batch_scores, batch_weights = metric_fn(*args)
+    # tf.metrics.mean takes care of the aggregation across batches.
+    aggregated = tf.metrics.mean(batch_scores, batch_weights)
+    return aggregated
+  return problem_metric_fn
+def get_eval_metrics(logits, labels, params):
+  """Return dictionary of model evaluation metrics.
+  Args:
+    logits: Tensor of model outputs handed to every metric function.
+    labels: Tensor of target ids handed to every metric function.
+    params: Object whose vocab_size attribute is used for the perplexity.
+  Returns:
+    Dict that maps "metrics/<name>" to the aggregated eval metric.
+  """
+  def aggregate(metric_fn, *extra_args):
+    """Evaluate metric_fn on (logits, labels) as an aggregated eval metric."""
+    return _convert_to_eval_metric(metric_fn)(logits, labels, *extra_args)
+  named_metrics = [
+      ("accuracy", aggregate(padded_accuracy)),
+      ("accuracy_top5", aggregate(padded_accuracy_top5)),
+      ("accuracy_per_sequence", aggregate(padded_sequence_accuracy)),
+      ("neg_log_perplexity",
+       aggregate(padded_neg_log_perplexity, params.vocab_size)),
+      ("approx_bleu_score", aggregate(bleu_score)),
+      ("rouge_2_fscore", aggregate(rouge_2_fscore)),
+      ("rouge_L_fscore", aggregate(rouge_l_fscore)),
+  ]
+  # Prefix each of the metric names with "metrics/". This allows the metric
+  # graphs to display under the "metrics" category in TensorBoard.
+  return {"metrics/%s" % name: value for name, value in named_metrics}
+def padded_accuracy(logits, labels):
+  """Per-position correctness of the argmax prediction, ignoring 0 labels.
+  Returns:
+    Tuple (correct, weights): correct is 1.0 where the argmax of logits equals
+    the label, and weights is 1.0 where the label is non-zero.
+  """
+  with tf.variable_scope("padded_accuracy", values=[logits, labels]):
+    logits, labels = _pad_tensors_to_same_length(logits, labels)
+    non_padding = tf.to_float(tf.not_equal(labels, 0))
+    predicted_ids = tf.to_int32(tf.argmax(logits, axis=-1))
+    label_ids = tf.to_int32(labels)
+    is_correct = tf.to_float(tf.equal(predicted_ids, label_ids))
+    return is_correct, non_padding
+def padded_accuracy_topk(logits, labels, k):
+  """Per-position check that the label is among the top-k predictions.
+  Args:
+    logits: Tensor of model outputs; the last dimension is ranked.
+    labels: Tensor of target ids, where 0 marks padding.
+    k: Number of highest-scoring predictions to consider. It is capped at the
+      size of the last dimension of logits.
+  Returns:
+    Tuple (in_topk, weights): in_topk is 1.0 where the label is one of the
+    top-k ids, and weights is 1.0 where the label is non-zero.
+  """
+  with tf.variable_scope("padded_accuracy_topk", values=[logits, labels]):
+    logits, labels = _pad_tensors_to_same_length(logits, labels)
+    non_padding = tf.to_float(tf.not_equal(labels, 0))
+    effective_k = tf.minimum(k, tf.shape(logits)[-1])
+    _, topk_ids = tf.nn.top_k(logits, k=effective_k)
+    topk_ids = tf.to_int32(topk_ids)
+    label_ids = tf.expand_dims(tf.to_int32(labels), axis=-1)
+    # Broadcast the labels to the shape of the top-k ids.
+    label_ids = label_ids + tf.zeros_like(topk_ids)
+    matches = tf.to_float(tf.equal(topk_ids, label_ids))
+    in_topk = tf.reduce_sum(matches, axis=-1)
+    return in_topk, non_padding
+def padded_accuracy_top5(logits, labels):
+  """Top-k padded accuracy with k fixed to 5."""
+  return padded_accuracy_topk(logits, labels, k=5)
+def padded_sequence_accuracy(logits, labels):
+  """Per-sequence correctness: every non-0 position must be predicted right.
+  Returns:
+    Tuple (correct_seq, weight): correct_seq is 1.0 for a sequence without any
+    wrong prediction at a non-padding position and 0.0 otherwise; weight is
+    the constant 1.0.
+  """
+  with tf.variable_scope("padded_sequence_accuracy", values=[logits, labels]):
+    logits, labels = _pad_tensors_to_same_length(logits, labels)
+    non_padding = tf.to_float(tf.not_equal(labels, 0))
+    predicted_ids = tf.to_int32(tf.argmax(logits, axis=-1))
+    label_ids = tf.to_int32(labels)
+    # Mistakes at padding positions are masked out by the weights.
+    mistakes = tf.to_float(tf.not_equal(predicted_ids, label_ids)) * non_padding
+    # Reduce over every dimension except the first (batch) one.
+    non_batch_axes = list(range(1, len(predicted_ids.get_shape())))
+    mistakes_per_seq = tf.reduce_sum(mistakes, axis=non_batch_axes)
+    correct_seq = 1.0 - tf.minimum(1.0, mistakes_per_seq)
+    return correct_seq, tf.constant(1.0)
+def padded_neg_log_perplexity(logits, labels, vocab_size):
+  """Negated unsmoothed cross entropy with its padding weights.
+  Returns:
+    Tuple (negative weighted cross entropy, weights), where the cross entropy
+    is computed with a smoothing constant of 0.
+  """
+  weighted_xentropy, weights = padded_cross_entropy_loss(
+      logits, labels, 0, vocab_size)
+  return -weighted_xentropy, weights
+def bleu_score(logits, labels):
+  """Approximate BLEU score computation between labels and predictions.
+  The score is approximate because word pieces are not glued back together
+  and the ids are not decoded and re-tokenized. compute_bleu is called with
+  its defaults (ngram order of 4, brevity penalty enabled), and predictions
+  are the argmax of the logits (no beam search).
+  Args:
+    logits: Tensor of size [batch_size, length_logits, vocab_size]
+    labels: Tensor of size [batch-size, length_labels]
+  Returns:
+    Tuple (bleu, weight): bleu is a float32 tensor with the approx bleu score
+    and weight is the constant 1.0.
+  """
+  predicted_ids = tf.to_int32(tf.argmax(logits, axis=-1))
+  # TODO: Look into removing use of py_func
+  score = tf.py_func(compute_bleu, (labels, predicted_ids), tf.float32)
+  return score, tf.constant(1.0)
+def _get_ngrams_with_counter(segment, max_order):
+  """Counts every n-gram of order 1..max_order found in a segment.
+  Args:
+    segment: sequence of tokens from which n-grams will be extracted.
+    max_order: maximum length in tokens of the n-grams that are counted.
+  Returns:
+    A collections.Counter that maps each n-gram (as a tuple of tokens) to the
+    number of times it occurs in segment.
+  """
+  counts = collections.Counter()
+  segment_length = len(segment)
+  for order in xrange(1, max_order + 1):
+    # There are no n-grams of this order when the segment is too short; the
+    # range is then empty.
+    counts.update(
+        tuple(segment[start:start + order])
+        for start in xrange(segment_length - order + 1))
+  return counts
+def compute_bleu(reference_corpus, translation_corpus, max_order=4,
+                 use_bp=True):
+  """Computes BLEU score of translated segments against their references.
+  Corpus-level n-gram matches are accumulated over all (reference,
+  translation) pairs. An order without any match gets a smoothed precision, and
+  the score is the geometric mean of the non-zero precisions multiplied by the
+  brevity penalty.
+  Args:
+    reference_corpus: sequence with one reference per translation. Each
+        reference should be tokenized into a sequence of tokens.
+    translation_corpus: sequence of translations to score. Each translation
+        should be tokenized into a sequence of tokens.
+    max_order: Maximum n-gram order to use when computing BLEU score.
+    use_bp: boolean, whether to apply brevity penalty.
+  Returns:
+    BLEU score as np.float32. The score is 0 when nothing can be matched, for
+    example when the translations are empty.
+  """
+  reference_length = 0
+  translation_length = 0
+  matches_by_order = [0] * max_order
+  possible_matches_by_order = [0] * max_order
+  for (references, translations) in zip(reference_corpus, translation_corpus):
+    reference_length += len(references)
+    translation_length += len(translations)
+    ref_ngram_counts = _get_ngrams_with_counter(references, max_order)
+    translation_ngram_counts = _get_ngrams_with_counter(translations, max_order)
+    # Clipped matches: an n-gram counts at most as often as it occurs in both
+    # the reference and the translation.
+    for ngram, count in ref_ngram_counts.items():
+      matches_by_order[len(ngram) - 1] += min(
+          count, translation_ngram_counts[ngram])
+    for ngram, count in translation_ngram_counts.items():
+      possible_matches_by_order[len(ngram) - 1] += count
+  precisions = [0.0] * max_order
+  smooth = 1.0
+  for i in xrange(0, max_order):
+    if possible_matches_by_order[i] > 0:
+      if matches_by_order[i] > 0:
+        precisions[i] = float(matches_by_order[i]) / possible_matches_by_order[
+            i]
+      else:
+        # No match for this order: use a precision that halves with every
+        # further order without matches instead of zeroing the whole score.
+        smooth *= 2
+        precisions[i] = 1.0 / (smooth * possible_matches_by_order[i])
+  geo_mean = 0.0
+  if max(precisions) > 0:
+    p_log_sum = sum(math.log(p) for p in precisions if p)
+    geo_mean = math.exp(p_log_sum / max_order)
+  bp = 1.0
+  # Without any reference token the length ratio is undefined, so no penalty
+  # is applied instead of dividing by zero.
+  if use_bp and reference_length > 0:
+    ratio = float(translation_length) / reference_length
+    if ratio <= 0.0:
+      # Empty translations: exp(1 - 1 / ratio) tends to 0 and 1 / ratio would
+      # raise ZeroDivisionError.
+      bp = 0.0
+    elif ratio < 1.0:
+      bp = math.exp(1 - 1. / ratio)
+  bleu = geo_mean * bp
+  return np.float32(bleu)
+def rouge_2_fscore(logits, labels):
+  """ROUGE-2 F1 score computation between labels and predictions.
+  The score is approximate because word pieces are not glued back together
+  and the ids are not decoded and re-tokenized.
+  Args:
+    logits: tensor, model predictions
+    labels: tensor, gold output.
+  Returns:
+    Tuple (rouge2_fscore, weight): the approx rouge-2 f1 score as a float32
+    tensor, and the constant weight 1.0.
+  """
+  predicted_ids = tf.to_int32(tf.argmax(logits, axis=-1))
+  # TODO: Look into removing use of py_func
+  score = tf.py_func(rouge_n, (predicted_ids, labels), tf.float32)
+  return score, tf.constant(1.0)
+def _get_ngrams(n, text):
+  """Collects the distinct n-grams of a token sequence.
+  Args:
+    n: which n-grams to calculate
+    text: An array of tokens
+  Returns:
+    A set of n-grams, each one a tuple of n tokens. The set is empty when text
+    has fewer than n tokens.
+  """
+  ngram_count = len(text) - n + 1
+  return {tuple(text[start:start + n]) for start in range(ngram_count)}
+def rouge_n(eval_sentences, ref_sentences, n=2):
+  """Computes ROUGE-N f1 score of two text collections of sentences.
+  Source: https://www.microsoft.com/en-us/research/publication/
+  rouge-a-package-for-automatic-evaluation-of-summaries/
+  Sentences are compared pairwise on their sets of distinct n-grams, and the
+  per-pair f1 scores are averaged.
+  Args:
+    eval_sentences: Predicted sentences.
+    ref_sentences: Sentences from the reference set
+    n: Size of ngram.  Defaults to 2.
+  Returns:
+    f1 score for ROUGE-N
+  """
+  f1_scores = []
+  for eval_sentence, ref_sentence in zip(eval_sentences, ref_sentences):
+    eval_ngrams = _get_ngrams(n, eval_sentence)
+    ref_ngrams = _get_ngrams(n, ref_sentence)
+    # Count the overlapping ngrams between evaluated and reference
+    overlap = len(eval_ngrams.intersection(ref_ngrams))
+    # Handle edge case. This isn't mathematically correct, but it's good enough
+    precision = float(overlap) / len(eval_ngrams) if eval_ngrams else 0.0
+    recall = float(overlap) / len(ref_ngrams) if ref_ngrams else 0.0
+    # The small constant avoids a division by zero when both values are 0.
+    harmonic = (precision * recall) / (precision + recall + 1e-8)
+    f1_scores.append(2.0 * harmonic)
+  return np.mean(f1_scores, dtype=np.float32)
+def rouge_l_fscore(predictions, labels):
+  """ROUGE-L F1 score computation between labels and predictions.
+  The score is approximate because word pieces are not glued back together
+  and the ids are not decoded and re-tokenized.
+  Args:
+    predictions: tensor, model predictions. The predicted ids are taken as
+      the argmax over the last axis.
+    labels: tensor, gold output.
+  Returns:
+    Tuple (rouge_l_fscore, weight): the approx rouge-l f1 score as a float32
+    tensor, and the constant weight 1.0.
+  """
+  predicted_ids = tf.to_int32(tf.argmax(predictions, axis=-1))
+  score = tf.py_func(
+      rouge_l_sentence_level, (predicted_ids, labels), tf.float32)
+  return score, tf.constant(1.0)
+def rouge_l_sentence_level(eval_sentences, ref_sentences):
+  """Computes ROUGE-L (sentence level) of two collections of sentences.
+  Source: https://www.microsoft.com/en-us/research/publication/
+  rouge-a-package-for-automatic-evaluation-of-summaries/
+  Calculated according to:
+  R_lcs = LCS(X,Y)/m
+  P_lcs = LCS(X,Y)/n
+  F_lcs = ((1 + beta^2)*R_lcs*P_lcs) / (R_lcs + (beta^2) * P_lcs)
+  where:
+  X = reference summary
+  Y = Candidate summary
+  m = length of reference summary
+  n = length of candidate summary
+  Args:
+    eval_sentences: The sentences that have been picked by the summarizer
+    ref_sentences: The sentences from the reference set
+  Returns:
+    A float: mean F_lcs over the sentence pairs, as np.float32
+  """
+  f1_scores = [
+      _f_lcs(_len_lcs(eval_sentence, ref_sentence),
+             float(len(ref_sentence)),
+             float(len(eval_sentence)))
+      for eval_sentence, ref_sentence in zip(eval_sentences, ref_sentences)]
+  return np.mean(f1_scores, dtype=np.float32)
+def _len_lcs(x, y):
+  """Returns the length of the Longest Common Subsequence between two seqs.
+  Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence
+  Args:
+    x: sequence of words
+    y: sequence of words
+  Returns
+    integer: Length of LCS between x and y
+  """
+  # The table entry for the full lengths of both sequences is the answer.
+  return _lcs(x, y)[len(x), len(y)]
+def _lcs(x, y):
+  """Builds the dynamic programming table of LCS lengths for two seqs.
+  Entry (i, j) of the table is the LCS length of the first i items of x and
+  the first j items of y. The table is filled in O(nm) time where n = len(x)
+  and m = len(y).
+  Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence
+  Args:
+    x: collection of words
+    y: collection of words
+  Returns:
+    Dictionary that maps every (i, j) coordinate to the LCS length
+  """
+  rows, cols = len(x), len(y)
+  lengths = {}
+  for i in range(rows + 1):
+    for j in range(cols + 1):
+      if not i or not j:
+        # An empty prefix has nothing in common with anything.
+        lengths[i, j] = 0
+        continue
+      if x[i - 1] == y[j - 1]:
+        lengths[i, j] = lengths[i - 1, j - 1] + 1
+      else:
+        lengths[i, j] = max(lengths[i - 1, j], lengths[i, j - 1])
+  return lengths
+def _f_lcs(llcs, m, n):
+  """Computes the LCS-based F-measure score.
+  Source: http://research.microsoft.com/en-us/um/people/cyl/download/papers/
+  rouge-working-note-v1.3.1.pdf
+  Args:
+    llcs: Length of LCS
+    m: number of words in reference summary
+    n: number of words in candidate summary
+  Returns:
+    Float. LCS-based F-measure score. The score is 0.0 when the reference or
+    the candidate is empty.
+  """
+  # Recall and precision are undefined for an empty summary; score such a pair
+  # as 0.0 instead of dividing by zero.
+  if not m or not n:
+    return 0.0
+  r_lcs = llcs / m
+  p_lcs = llcs / n
+  # beta is the ratio of precision to recall; the small constants guard
+  # against a division by zero when there is no common subsequence.
+  beta = p_lcs / (r_lcs + 1e-12)
+  num = (1 + (beta ** 2)) * r_lcs * p_lcs
+  denom = r_lcs + ((beta ** 2) * p_lcs)
+  f_lcs = num / (denom + 1e-12)
+  return f_lcs
--- a/official/transformer/utils/tokenizer.py
+++ b/official/transformer/utils/tokenizer.py
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Defines Subtokenizer class to encode and decode strings."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import collections
+import re
+import sys
+import unicodedata
+import numpy as np
+import six
+from six.moves import xrange  # pylint: disable=redefined-builtin
+import tensorflow as tf
+# Reserved tokens. PAD_ID and EOS_ID match the positions of PAD and EOS in
+# RESERVED_TOKENS.
+PAD = "<pad>"
+PAD_ID = 0
+EOS = "<EOS>"
+EOS_ID = 1
+RESERVED_TOKENS = [PAD, EOS]
+# Set of characters that will be used in the function _escape_token() (see func
+# docstring for more details).
+# This set is added to the alphabet list to ensure that all escaped tokens can
+# be encoded.
+_ESCAPE_CHARS = set(u"\\_u;0123456789")
+# Regex for the function _unescape_token(), the inverse of _escape_token().
+# This is used to find "\u", "\\", and "\###;" substrings in the token.
+_UNESCAPE_REGEX = re.compile(r"\\u|\\\\|\\([0-9]+);")
+_UNDEFINED_UNICODE = u"\u3013"
+# Set contains all letter and number characters, i.e. every character whose
+# unicode category starts with "L" or "N".
+_ALPHANUMERIC_CHAR_SET = set(
+    six.unichr(i) for i in xrange(sys.maxunicode)
+    if (unicodedata.category(six.unichr(i)).startswith("L") or
+        unicodedata.category(six.unichr(i)).startswith("N")))
+# min_count is the minimum number of times a subtoken must appear in the data
+# before it is added to the vocabulary. The value is found using binary
+# search to obtain the target vocabulary size.
+_MIN_MIN_COUNT = 1     # min value to use when binary searching for min_count
+_MAX_MIN_COUNT = 1000  # max value to use when binary searching for min_count
+class Subtokenizer(object):
+  """Encodes and decodes strings to/from integer IDs."""
+  def __init__(self, vocab_file, reserved_tokens=None):
+    """Initializes the subtokenizer from an existing vocab file.
+    Args:
+      vocab_file: Name of the file from which the subtoken vocabulary is
+        loaded.
+      reserved_tokens: List of reserved tokens handed to the vocab loader.
+        Defaults to RESERVED_TOKENS.
+    """
+    tf.logging.info("Initializing Subtokenizer from file %s." % vocab_file)
+    tokens_to_reserve = (
+        RESERVED_TOKENS if reserved_tokens is None else reserved_tokens)
+    self.subtoken_list = _load_vocab_file(vocab_file, tokens_to_reserve)
+    self.alphabet = _generate_alphabet_dict(self.subtoken_list)
+    self.subtoken_to_id_dict = _list_to_index_dict(self.subtoken_list)
+    # Length of the longest subtoken (0 for an empty vocabulary).
+    self.max_subtoken_length = max(
+        [0] + [len(subtoken) for subtoken in self.subtoken_list])
+    # Fixed-size cache of (token, subtoken ids) entries that speeds up
+    # subtokenization.
+    self._cache_size = 2 ** 20
+    self._cache = [(None, None)] * self._cache_size
+  @staticmethod
+  def init_from_files(
+      vocab_file, files, target_vocab_size, threshold, min_count=None,
+      file_byte_limit=1e6, reserved_tokens=None):
+    """Create subtoken vocabulary based on files, and save vocab to file.
+    The vocabulary is only generated when vocab_file does not exist yet; an
+    existing vocab file is loaded as-is.
+    Args:
+      vocab_file: String name of vocab file to store subtoken vocabulary.
+      files: List of file paths that will be used to generate vocabulary.
+      target_vocab_size: target vocabulary size to generate.
+      threshold: int threshold of vocabulary size to accept.
+      min_count: int minimum count to use for generating the vocabulary. The min
+        count is the minimum number of times a subtoken should appear in the
+        files before it is added to the vocabulary. If set to none, this value
+        is found using binary search.
+      file_byte_limit: (Default 1e6) Maximum number of bytes of sample text that
+        will be drawn from the files.
+      reserved_tokens: List of string tokens that are guaranteed to be at the
+        beginning of the subtoken vocabulary list.
+    Returns:
+      Subtokenizer object
+    """
+    if reserved_tokens is None:
+      reserved_tokens = RESERVED_TOKENS
+    if tf.gfile.Exists(vocab_file):
+      tf.logging.info("Vocab file already exists (%s)" % vocab_file)
+    else:
+      tf.logging.info("Begin steps to create subtoken vocabulary...")
+      token_counts = _count_tokens(files, file_byte_limit)
+      alphabet = _generate_alphabet_dict(token_counts)
+      subtoken_list = _generate_subtokens_with_target_vocab_size(
+          token_counts, alphabet, target_vocab_size, threshold, min_count,
+          reserved_tokens)
+      tf.logging.info("Generated vocabulary with %d subtokens." %
+                      len(subtoken_list))
+      _save_vocab_file(vocab_file, subtoken_list)
+    # Load the vocabulary with the same reserved tokens that were used to
+    # generate it, so that custom reserved tokens are not replaced by the
+    # defaults.
+    return Subtokenizer(vocab_file, reserved_tokens)
+  def encode(self, raw_string, add_eos=False):
+    """Encodes a string into a list of int subtoken ids."""
+    ret = []
+    tokens = _split_string_to_tokens(_native_to_unicode(raw_string))
+    for token in tokens:
+      ret.extend(self._token_to_subtoken_ids(token))
+    if add_eos:
+      ret.append(EOS_ID)
+    return ret
+  def _token_to_subtoken_ids(self, token):
+    """Encode a single token into a list of subtoken ids."""
+    cache_location = hash(token) % self._cache_size
+    cache_key, cache_value = self._cache[cache_location]
+    if cache_key == token:
+      return cache_value
+    ret = _split_token_to_subtokens(
+        _escape_token(token, self.alphabet), self.subtoken_to_id_dict,
+        self.max_subtoken_length)
+    ret = [self.subtoken_to_id_dict[subtoken_id] for subtoken_id in ret]
+    self._cache[cache_location] = (token, ret)
+    return ret
+  def decode(self, subtokens):
+    """Converts list of int subtokens ids into a string."""
+    if isinstance(subtokens, np.ndarray):
+      # Note that list(subtokens) converts subtokens to a python list, but the
+      # items remain as np.int32. This converts both the array and its items.
+      subtokens = subtokens.tolist()
+    if not subtokens:
+      return ""
+    assert isinstance(subtokens, list) and isinstance(subtokens[0], int), (
+        "Subtokens argument passed into decode() must be a list of integers.")
+    return _unicode_to_native(
+        _join_tokens_to_string(self._subtoken_ids_to_tokens(subtokens)))
+  def _subtoken_ids_to_tokens(self, subtokens):
+    """Convert list of int subtoken ids to a list of string tokens."""
+    escaped_tokens = "".join([
+        self.subtoken_list[s] for s in subtokens
+        if s < len(self.subtoken_list)])
+    escaped_tokens = escaped_tokens.split("_")
+    # All tokens in the vocabulary list have been escaped (see _escape_token())
+    # so each token must be unescaped when decoding.
+    ret = []
+    for token in escaped_tokens:
+      if token:
+        ret.append(_unescape_token(token))
+    return ret
+def _save_vocab_file(vocab_file, subtoken_list):
+  """Save subtokens to file."""
+  with tf.gfile.Open(vocab_file, mode="w") as f:
+    for subtoken in subtoken_list:
+      f.write("'%s'\n" % _unicode_to_native(subtoken))
+def _load_vocab_file(vocab_file, reserved_tokens=None):
+  """Load vocabulary while ensuring reserved tokens are at the top."""
+  if reserved_tokens is None:
+    reserved_tokens = RESERVED_TOKENS
+  subtoken_list = []
+  with tf.gfile.Open(vocab_file, mode="r") as f:
+    for line in f:
+      subtoken = _native_to_unicode(line.strip())
+      subtoken = subtoken[1:-1]  # Remove surrounding single-quotes
+      if subtoken in reserved_tokens:
+        continue
+      subtoken_list.append(_native_to_unicode(subtoken))
+  return reserved_tokens + subtoken_list
+def _native_to_unicode(s):
+  """Convert string to unicode (required in Python 2)."""
+  if six.PY2:
+    return s if isinstance(s, unicode) else s.decode("utf-8")
+  else:
+    return s
+def _unicode_to_native(s):
+  """Convert string from unicode to native format (required in Python 2)."""
+  if six.PY2:
+    return s.encode("utf-8") if isinstance(s, unicode) else s
+  else:
+    return s
+def _split_string_to_tokens(text):
+  """Splits text to a list of string tokens."""
+  if not text:
+    return []
+  ret = []
+  token_start = 0
+  # Classify each character in the input string
+  is_alnum = [c in _ALPHANUMERIC_CHAR_SET for c in text]
+  for pos in xrange(1, len(text)):
+    if is_alnum[pos] != is_alnum[pos - 1]:
+      token = text[token_start:pos]
+      if token != u" " or token_start == 0:
+        ret.append(token)
+      token_start = pos
+  final_token = text[token_start:]
+  ret.append(final_token)
+  return ret
+def _join_tokens_to_string(tokens):
+  """Join a list of string tokens into a single string."""
+  token_is_alnum = [t[0] in _ALPHANUMERIC_CHAR_SET for t in tokens]
+  ret = []
+  for i, token in enumerate(tokens):
+    if i > 0 and token_is_alnum[i - 1] and token_is_alnum[i]:
+      ret.append(u" ")
+    ret.append(token)
+  return "".join(ret)
+def _escape_token(token, alphabet):
+  r"""Replace characters that aren't in the alphabet and append "_" to token.
+  Apply three transformations to the token:
+    1. Replace underline character "_" with "\u", and backslash "\" with "\\".
+    2. Replace characters outside of the alphabet with "\###;", where ### is the
+       character's Unicode code point.
+    3. Appends "_" to mark the end of a token.
+  Args:
+    token: unicode string to be escaped
+    alphabet: list of all known characters
+  Returns:
+    escaped string
+  """
+  token = token.replace(u"\\", u"\\\\").replace(u"_", u"\\u")
+  ret = [c if c in alphabet and c != u"\n" else r"\%d;" % ord(c) for c in token]
+  return u"".join(ret) + "_"
+def _unescape_token(token):
+  r"""Replaces escaped characters in the token with their unescaped versions.
+  Applies inverse transformations as _escape_token():
+    1. Replace "\u" with "_", and "\\" with "\".
+    2. Replace "\###;" with the unicode character the ### refers to.
+  Args:
+    token: escaped string
+  Returns:
+    unescaped string
+  """
+  def match(m):
+    r"""Returns replacement string for matched object.
+    Matched objects contain one of the strings that matches the regex pattern:
+      r"\\u|\\\\|\\([0-9]+);"
+    The strings can be '\u', '\\', or '\###;' (### is any digit number).
+    m.group(0) refers to the entire matched string ('\u', '\\', or '\###;').
+    m.group(1) refers to the first parenthesized subgroup ('###').
+    m.group(0) exists for all match objects, while m.group(1) exists only for
+    the string '\###;'.
+    This function looks to see if m.group(1) exists. If it doesn't, then the
+    matched string must be '\u' or '\\' . In this case, the corresponding
+    replacement ('_' and '\') are returned. Note that in python, a single
+    backslash is written as '\\', and double backslash as '\\\\'.
+    If m.goup(1) exists, then use the integer in m.group(1) to return a
+    unicode character.
+    Args:
+      m: match object
+    Returns:
+      String to replace matched object with.
+    """
+    # Check if the matched strings are '\u' or '\\'.
+    if m.group(1) is None:
+      return u"_" if m.group(0) == u"\\u" else u"\\"
+    # If m.group(1) exists, try and return unicode character.
+    try:
+      return six.unichr(int(m.group(1)))
+    except (ValueError, OverflowError) as _:
+      return _UNDEFINED_UNICODE
+  # Use match function to replace escaped substrings in the token.
+  return _UNESCAPE_REGEX.sub(match, token)
+def _count_tokens(files, file_byte_limit=1e6):
+  """Return token counts of words in the files.
+  Samples file_byte_limit bytes from each file, and counts the words that appear
+  in the samples. The samples are semi-evenly distributed across the file.
+  Args:
+    files: List of filepaths
+    file_byte_limit: Max number of bytes that will be read from each file.
+  Returns:
+    Dictionary mapping tokens to the number of times they appear in the sampled
+    lines from the files.
+  """
+  token_counts = collections.defaultdict(int)
+  for filepath in files:
+    with tf.gfile.Open(filepath, mode="r") as reader:
+      file_byte_budget = file_byte_limit
+      counter = 0
+      lines_to_skip = int(reader.size() / (file_byte_budget * 2))
+      for line in reader:
+        if counter < lines_to_skip:
+          counter += 1
+        else:
+          if file_byte_budget < 0:
+            break
+          line = line.strip()
+          file_byte_budget -= len(line)
+          counter = 0
+          # Add words to token counts
+          for token in _split_string_to_tokens(_native_to_unicode(line)):
+            token_counts[token] += 1
+  return token_counts
+def _list_to_index_dict(lst):
+  """Create dictionary mapping list items to their indices in the list."""
+  return {item: n for n, item in enumerate(lst)}
+def _split_token_to_subtokens(token, subtoken_dict, max_subtoken_length):
+  """Splits a token into subtokens defined in the subtoken dict."""
+  ret = []
+  start = 0
+  token_len = len(token)
+  while start < token_len:
+    # Find the longest subtoken, so iterate backwards.
+    for end in xrange(min(token_len, start + max_subtoken_length), start, -1):
+      subtoken = token[start:end]
+      if subtoken in subtoken_dict:
+        ret.append(subtoken)
+        start = end
+        break
+    else:  # Did not break
+      # If there is no possible encoding of the escaped token then one of the
+      # characters in the token is not in the alphabet. This should be
+      # impossible and would be indicative of a bug.
+      raise ValueError("Was unable to split token \"%s\" into subtokens." %
+                       token)
+  return ret
+def _generate_subtokens_with_target_vocab_size(
+    token_counts, alphabet, target_size, threshold, min_count=None,
+    reserved_tokens=None):
+  """Generate subtoken vocabulary close to the target size."""
+  if reserved_tokens is None:
+    reserved_tokens = RESERVED_TOKENS
+  if min_count is not None:
+    tf.logging.info("Using min_count=%d to generate vocab with target size %d" %
+                    (min_count, target_size))
+    return _generate_subtokens(
+        token_counts, alphabet, min_count, reserved_tokens=reserved_tokens)
+  def bisect(min_val, max_val):
+    """Recursive function to binary search for subtoken vocabulary."""
+    cur_count = (min_val + max_val) // 2
+    tf.logging.info("Binary search: trying min_count=%d (%d %d)" %
+                    (cur_count, min_val, max_val))
+    subtoken_list = _generate_subtokens(
+        token_counts, alphabet, cur_count, reserved_tokens=reserved_tokens)
+    val = len(subtoken_list)
+    tf.logging.info("Binary search: min_count=%d resulted in %d tokens" %
+                    (cur_count, val))
+    within_threshold = abs(val - target_size) < threshold
+    if within_threshold or min_val >= max_val or cur_count < 2:
+      return subtoken_list
+    if val > target_size:
+      other_subtoken_list = bisect(cur_count + 1, max_val)
+    else:
+      other_subtoken_list = bisect(min_val, cur_count - 1)
+    # Return vocabulary dictionary with the closest number of tokens.
+    other_val = len(other_subtoken_list)
+    if abs(other_val - target_size) < abs(val - target_size):
+      return other_subtoken_list
+    return subtoken_list
+  tf.logging.info("Finding best min_count to get target size of %d" %
+                  target_size)
+  return bisect(_MIN_MIN_COUNT, _MAX_MIN_COUNT)
+def _generate_alphabet_dict(iterable, reserved_tokens=None):
+  """Create set of characters that appear in any element in the iterable."""
+  if reserved_tokens is None:
+    reserved_tokens = RESERVED_TOKENS
+  alphabet = {c for token in iterable for c in token}
+  alphabet |= {c for token in reserved_tokens for c in token}
+  alphabet |= _ESCAPE_CHARS  # Add escape characters to alphabet set.
+  return alphabet
+def _count_and_gen_subtokens(
+    token_counts, alphabet, subtoken_dict, max_subtoken_length):
+  """Count number of times subtokens appear, and generate new subtokens.
+  Args:
+    token_counts: dict mapping tokens to the number of times they appear in the
+      original files.
+    alphabet: list of allowed characters. Used to escape the tokens, which
+      guarantees that all tokens can be split into subtokens.
+    subtoken_dict: dict mapping subtokens to ids.
+    max_subtoken_length: maximum length of subtoken in subtoken_dict.
+  Returns:
+    A defaultdict mapping subtokens to the number of times they appear in the
+    tokens. The dict may contain new subtokens.
+  """
+  subtoken_counts = collections.defaultdict(int)
+  for token, count in six.iteritems(token_counts):
+    token = _escape_token(token, alphabet)
+    subtokens = _split_token_to_subtokens(
+        token, subtoken_dict, max_subtoken_length)
+    # Generate new subtokens by taking substrings from token.
+    start = 0
+    for subtoken in subtokens:
+      for end in xrange(start + 1, len(token) + 1):
+        new_subtoken = token[start:end]
+        subtoken_counts[new_subtoken] += count
+      start += len(subtoken)
+  return subtoken_counts
+def _filter_and_bucket_subtokens(subtoken_counts, min_count):
+  """Return a bucketed list of subtokens that are filtered by count.
+  Args:
+    subtoken_counts: defaultdict mapping subtokens to their counts
+    min_count: int count used to filter subtokens
+  Returns:
+    List of subtoken sets, where subtokens in set i have the same length=i.
+  """
+  # Create list of buckets, where subtokens in bucket i have length i.
+  subtoken_buckets = []
+  for subtoken, count in six.iteritems(subtoken_counts):
+    if count < min_count:  # Filter out subtokens that don't appear enough
+      continue
+    while len(subtoken_buckets) <= len(subtoken):
+      subtoken_buckets.append(set())
+    subtoken_buckets[len(subtoken)].add(subtoken)
+  return subtoken_buckets
+def _gen_new_subtoken_list(
+    subtoken_counts, min_count, alphabet, reserved_tokens=None):
+  """Generate candidate subtokens ordered by count, and new max subtoken length.
+  Add subtokens to the candiate list in order of length (longest subtokens
+  first). When a subtoken is added, the counts of each of its prefixes are
+  decreased. Prefixes that don't appear much outside the subtoken are not added
+  to the candidate list.
+  For example:
+    subtoken being added to candidate list: 'translate'
+    subtoken_counts: {'translate':10, 't':40, 'tr':16, 'tra':12, ...}
+    min_count: 5
+  When 'translate' is added, subtoken_counts is updated to:
+    {'translate':0, 't':30, 'tr':6, 'tra': 2, ...}
+  The subtoken 'tra' will not be added to the candidate list, because it appears
+  twice (less than min_count) outside of 'translate'.
+  Args:
+    subtoken_counts: defaultdict mapping str subtokens to int counts
+    min_count: int minumum count requirement for subtokens
+    alphabet: set of characters. Each character is added to the subtoken list to
+      guarantee that all tokens can be encoded.
+    reserved_tokens: list of tokens that will be added to the beginning of the
+      returned subtoken list.
+  Returns:
+    List of candidate subtokens in decreasing count order, and maximum subtoken
+    length
+  """
+  if reserved_tokens is None:
+    reserved_tokens = RESERVED_TOKENS
+  # Create a list of (count, subtoken) for each candidate subtoken.
+  subtoken_candidates = []
+  # Use bucketted list to iterate through subtokens in order of length.
+  # subtoken_buckets[i] = set(subtokens), where each subtoken has length i.
+  subtoken_buckets = _filter_and_bucket_subtokens(subtoken_counts, min_count)
+  max_subtoken_length = len(subtoken_buckets) - 1
+  # Go through the list in reverse order to consider longer subtokens first.
+  for subtoken_len in xrange(max_subtoken_length, 0, -1):
+    for subtoken in subtoken_buckets[subtoken_len]:
+      count = subtoken_counts[subtoken]
+      # Possible if this subtoken is a prefix of another token.
+      if count < min_count:
+        continue
+      # Ignore alphabet/reserved tokens, which will be added manually later.
+      if subtoken not in alphabet and subtoken not in reserved_tokens:
+        subtoken_candidates.append((count, subtoken))
+      # Decrement count of the subtoken's prefixes (if a longer subtoken is
+      # added, its prefixes lose priority to be added).
+      for end in xrange(1, subtoken_len):
+        subtoken_counts[subtoken[:end]] -= count
+  # Add alphabet subtokens (guarantees that all strings are encodable).
+  subtoken_candidates.extend((subtoken_counts.get(a, 0), a) for a in alphabet)
+  # Order subtoken candidates by decreasing count.
+  subtoken_list = [t for _, t in sorted(subtoken_candidates, reverse=True)]
+  # Add reserved tokens to beginning of the list.
+  subtoken_list = reserved_tokens + subtoken_list
+  return subtoken_list, max_subtoken_length
+def _generate_subtokens(
+    token_counts, alphabet, min_count, num_iterations=4,
+    reserved_tokens=None):
+  """Create a list of subtokens in decreasing order of frequency.
+  Args:
+    token_counts: dict mapping str tokens -> int count
+    alphabet: set of characters
+    min_count: int minimum number of times a subtoken must appear before it is
+      added to the vocabulary.
+    num_iterations: int number of iterations to generate new tokens.
+    reserved_tokens: list of tokens that will be added to the beginning to the
+      returned subtoken list.
+  Returns:
+    Sorted list of subtokens (most frequent first)
+  """
+  if reserved_tokens is None:
+    reserved_tokens = RESERVED_TOKENS
+  # Use alphabet set to create initial list of subtokens
+  subtoken_list = reserved_tokens + list(alphabet)
+  max_subtoken_length = 1
+  # On each iteration, segment all words using the subtokens defined in
+  # subtoken_dict, count how often the resulting subtokens appear, and update
+  # the dictionary with subtokens w/ high enough counts.
+  for i in xrange(num_iterations):
+    tf.logging.info("\tGenerating subtokens: iteration %d" % i)
+    # Generate new subtoken->id dictionary using the new subtoken list.
+    subtoken_dict = _list_to_index_dict(subtoken_list)
+    # Create dict mapping subtoken->count, with additional subtokens created
+    # from substrings taken from the tokens.
+    subtoken_counts = _count_and_gen_subtokens(
+        token_counts, alphabet, subtoken_dict, max_subtoken_length)
+    # Generate new list of subtokens sorted by subtoken count.
+    subtoken_list, max_subtoken_length = _gen_new_subtoken_list(
+        subtoken_counts, min_count, alphabet, reserved_tokens)
+    tf.logging.info("\tVocab size: %d" % len(subtoken_list))
+  return subtoken_list
--- a/official/transformer/utils/tokenizer_test.py
+++ b/official/transformer/utils/tokenizer_test.py
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Test Subtokenizer and string helper methods."""
+import collections
+import tempfile
+import unittest
+import tensorflow as tf  # pylint: disable=g-bad-import-order
+from official.transformer.utils import tokenizer
+class SubtokenizerTest(unittest.TestCase):
+  def _init_subtokenizer(self, vocab_list):
+    temp_file = tempfile.NamedTemporaryFile(delete=False)
+    with tf.gfile.Open(temp_file.name, 'w') as w:
+      for subtoken in vocab_list:
+        w.write("'%s'" % subtoken)
+        w.write("\n")
+    return tokenizer.Subtokenizer(temp_file.name, reserved_tokens=[])
+  def test_encode(self):
+    vocab_list = ["123_", "test", "ing_"]
+    subtokenizer = self._init_subtokenizer(vocab_list)
+    s = "testing 123"
+    encoded_list = subtokenizer.encode(s)
+    self.assertEqual([1, 2, 0], encoded_list)
+  def test_decode(self):
+    vocab_list = ["123_", "test", "ing_"]
+    subtokenizer = self._init_subtokenizer(vocab_list)
+    encoded_list = [1, 2, 0]  # testing 123
+    decoded_str = subtokenizer.decode(encoded_list)
+    self.assertEqual("testing 123", decoded_str)
+  def test_subtoken_ids_to_tokens(self):
+    vocab_list = ["123_", "test", "ing_"]
+    subtokenizer = self._init_subtokenizer(vocab_list)
+    encoded_list = [1, 2, 0]  # testing 123
+    token_list = subtokenizer._subtoken_ids_to_tokens(encoded_list)
+    self.assertEqual([u"testing", u"123"], token_list)
+class StringHelperTest(unittest.TestCase):
+  """Tests the module-level string and vocabulary helpers in tokenizer."""
+  def test_split_string_to_tokens(self):
+    # The single space between "testing" and "123" is expected to be dropped,
+    # while "? " is kept as one token.
+    text = "test? testing 123."
+    tokens = tokenizer._split_string_to_tokens(text)
+    self.assertEqual(["test", "? ", "testing", "123", "."], tokens)
+  def test_join_tokens_to_string(self):
+    # A space is expected between the adjacent tokens "testing" and "123".
+    tokens = ["test", "? ", "testing", "123", "."]
+    s = tokenizer._join_tokens_to_string(tokens)
+    self.assertEqual("test? testing 123.", s)
+  def test_escape_token(self):
+    # "4" is not in the alphabet, so it is expected to be escaped as its
+    # code point (52).
+    token = u"abc_\\4"
+    alphabet = set("abc_\\u;")
+    escaped_token = tokenizer._escape_token(token, alphabet)
+    self.assertEqual("abc\\u\\\\\\52;_", escaped_token)
+  def test_unescape_token(self):
+    # Covers all three escape forms: "\u", "\\" and "\###;".
+    escaped_token = u"Underline: \\u, Backslash: \\\\, Unicode: \\52;"
+    unescaped_token = tokenizer._unescape_token(escaped_token)
+    self.assertEqual(
+        "Underline: _, Backslash: \\, Unicode: 4", unescaped_token)
+  def test_list_to_index_dict(self):
+    lst = ["test", "strings"]
+    d = tokenizer._list_to_index_dict(lst)
+    self.assertDictEqual({"test": 0, "strings": 1}, d)
+  def test_split_token_to_subtokens(self):
+    # The longest matching subtoken ("ab") is expected to win over "a".
+    token = "abc"
+    subtoken_dict = {"a": 0, "b": 1, "c": 2, "ab": 3}
+    max_subtoken_length = 2
+    subtokens = tokenizer._split_token_to_subtokens(
+        token, subtoken_dict, max_subtoken_length)
+    self.assertEqual(["ab", "c"], subtokens)
+  def test_generate_alphabet_dict(self):
+    # Characters from both the tokens and the reserved tokens are expected.
+    s = ["testing", "123"]
+    reserved_tokens = ["???"]
+    alphabet = tokenizer._generate_alphabet_dict(s, reserved_tokens)
+    self.assertIn("?", alphabet)
+    self.assertIn("t", alphabet)
+    self.assertIn("e", alphabet)
+    self.assertIn("s", alphabet)
+    self.assertIn("i", alphabet)
+    self.assertIn("n", alphabet)
+    self.assertIn("g", alphabet)
+    self.assertIn("1", alphabet)
+    self.assertIn("2", alphabet)
+    self.assertIn("3", alphabet)
+  def test_count_and_gen_subtokens(self):
+    # The token "abc" is escaped to "abc_"; every substring of it is expected
+    # to be counted with the token's count (5).
+    token_counts = {"abc": 5}
+    alphabet = set("abc_")
+    subtoken_dict = {"a": 0, "b": 1, "c": 2, "_": 3}
+    max_subtoken_length = 2
+    subtoken_counts = tokenizer._count_and_gen_subtokens(
+        token_counts, alphabet, subtoken_dict, max_subtoken_length)
+    self.assertIsInstance(subtoken_counts, collections.defaultdict)
+    self.assertDictEqual(
+        {"a": 5, "b": 5, "c": 5, "_": 5, "ab": 5, "bc": 5, "c_": 5,
+         "abc": 5, "bc_": 5, "abc_": 5}, subtoken_counts)
+  def test_filter_and_bucket_subtokens(self):
+    # "a" (2) and "c" (1) are below min_count=3 and are expected to be dropped.
+    subtoken_counts = collections.defaultdict(
+        int, {"a": 2, "b": 4, "c": 1, "ab": 6, "ac": 3, "abbc": 5})
+    min_count = 3
+    subtoken_buckets = tokenizer._filter_and_bucket_subtokens(
+        subtoken_counts, min_count)
+    self.assertEqual(len(subtoken_buckets[0]), 0)
+    self.assertEqual(set("b"), subtoken_buckets[1])
+    self.assertEqual(set(["ab", "ac"]), subtoken_buckets[2])
+    self.assertEqual(len(subtoken_buckets[3]), 0)
+    self.assertEqual(set(["abbc"]), subtoken_buckets[4])
+  def test_gen_new_subtoken_list(self):
+    subtoken_counts = collections.defaultdict(
+        int, {"translate": 10, "t": 40, "tr": 16, "tra": 12})
+    min_count = 5
+    alphabet = set("translate")
+    reserved_tokens = ["reserved", "tokens"]
+    subtoken_list, max_token_length = tokenizer._gen_new_subtoken_list(
+        subtoken_counts, min_count, alphabet, reserved_tokens)
+    # Check that "tra" isn't in the list (its count should be decremented to 2,
+    # so it should not be added to the candidate list).
+    self.assertNotIn("tra", subtoken_list)
+    self.assertIn("tr", subtoken_list)
+    self.assertIn("t", subtoken_list)
+    self.assertEqual(len("translate"), max_token_length)
+  def test_generate_subtokens(self):
+    # min_count is far above every token count used here.
+    token_counts = {"ab": 1, "bc": 3, "abc": 5}
+    alphabet = set("abc_")
+    min_count = 100
+    num_iterations = 1
+    reserved_tokens = ["reserved", "tokens"]
+    vocab_list = tokenizer._generate_subtokens(
+        token_counts, alphabet, min_count, num_iterations, reserved_tokens)
+    # Check that reserved tokens are at the front of the list
+    self.assertEqual(vocab_list[:2], reserved_tokens)
+    # Check that each character in alphabet is in the vocab list
+    for c in alphabet:
+      self.assertIn(c, vocab_list)
+# Run the unit tests when this file is executed as a script.
+if __name__ == "__main__":
+  unittest.main()