Merge branch 'master' of https://github.com/ilyamironov/models

3b158095 · Ilya Mironov · a90db800 · be659c2f · 3b158095 · 3b158095
Commit 3b158095 authored May 07, 2018 by Ilya Mironov
20 changed files
--- a/official/transformer/model/beam_search.py
+++ b/official/transformer/model/beam_search.py
--- a/official/transformer/model/beam_search_test.py
+++ b/official/transformer/model/beam_search_test.py
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Test beam search helper methods."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import tensorflow as tf  # pylint: disable=g-bad-import-order
+from official.transformer.model import beam_search
+class BeamSearchHelperTests(tf.test.TestCase):
+  def test_expand_to_beam_size(self):
+    x = tf.ones([7, 4, 2, 5])
+    x = beam_search._expand_to_beam_size(x, 3)
+    with self.test_session() as sess:
+      shape = sess.run(tf.shape(x))
+    self.assertAllEqual([7, 3, 4, 2, 5], shape)
+  def test_shape_list(self):
+    y = tf.constant(4.0)
+    x = tf.ones([7, tf.to_int32(tf.sqrt(y)), 2, 5])
+    shape = beam_search._shape_list(x)
+    self.assertIsInstance(shape[0], int)
+    self.assertIsInstance(shape[1], tf.Tensor)
+    self.assertIsInstance(shape[2], int)
+    self.assertIsInstance(shape[3], int)
+  def test_get_shape_keep_last_dim(self):
+    y = tf.constant(4.0)
+    x = tf.ones([7, tf.to_int32(tf.sqrt(y)), 2, 5])
+    shape = beam_search._get_shape_keep_last_dim(x)
+    self.assertAllEqual([None, None, None, 5],
+                        shape.as_list())
+  def test_flatten_beam_dim(self):
+    x = tf.ones([7, 4, 2, 5])
+    x = beam_search._flatten_beam_dim(x)
+    with self.test_session() as sess:
+      shape = sess.run(tf.shape(x))
+    self.assertAllEqual([28, 2, 5], shape)
+  def test_unflatten_beam_dim(self):
+    x = tf.ones([28, 2, 5])
+    x = beam_search._unflatten_beam_dim(x, 7, 4)
+    with self.test_session() as sess:
+      shape = sess.run(tf.shape(x))
+    self.assertAllEqual([7, 4, 2, 5], shape)
+  def test_gather_beams(self):
+    x = tf.reshape(tf.range(24), [2, 3, 4])
+    # x looks like:  [[[ 0  1  2  3]
+    #                  [ 4  5  6  7]
+    #                  [ 8  9 10 11]]
+    #
+    #                 [[12 13 14 15]
+    #                  [16 17 18 19]
+    #                  [20 21 22 23]]]
+    y = beam_search._gather_beams(x, [[1, 2], [0, 2]], 2, 2)
+    with self.test_session() as sess:
+      y = sess.run(y)
+    self.assertAllEqual([[[4, 5, 6, 7],
+                          [8, 9, 10, 11]],
+                         [[12, 13, 14, 15],
+                          [20, 21, 22, 23]]],
+                        y)
+  def test_gather_topk_beams(self):
+    x = tf.reshape(tf.range(24), [2, 3, 4])
+    x_scores = [[0, 1, 1], [1, 0, 1]]
+    y = beam_search._gather_topk_beams(x, x_scores, 2, 2)
+    with self.test_session() as sess:
+      y = sess.run(y)
+    self.assertAllEqual([[[4, 5, 6, 7],
+                          [8, 9, 10, 11]],
+                         [[12, 13, 14, 15],
+                          [20, 21, 22, 23]]],
+                        y)
+if __name__ == "__main__":
+  tf.test.main()
--- a/official/transformer/model/embedding_layer.py
+++ b/official/transformer/model/embedding_layer.py
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Implementation of embedding layer with shared weights."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import tensorflow as tf  # pylint: disable=g-bad-import-order
+from official.transformer.model import model_utils
+class EmbeddingSharedWeights(tf.layers.Layer):
+  """Calculates input embeddings and pre-softmax linear with shared weights."""
+  def __init__(self, vocab_size, hidden_size):
+    super(EmbeddingSharedWeights, self).__init__()
+    self.vocab_size = vocab_size
+    self.hidden_size = hidden_size
+  def build(self, _):
+    with tf.variable_scope("embedding_and_softmax", reuse=tf.AUTO_REUSE):
+      # Create and initialize weights. The random normal initializer was chosen
+      # arbitrarily, and works well.
+      self.shared_weights = tf.get_variable(
+          "weights", [self.vocab_size, self.hidden_size],
+          initializer=tf.random_normal_initializer(
+              0., self.hidden_size ** -0.5))
+    self.built = True
+  def call(self, x):
+    """Get token embeddings of x.
+    Args:
+      x: An int64 tensor with shape [batch_size, length]
+    Returns:
+      embeddings: float32 tensor with shape [batch_size, length, embedding_size]
+        in which the embeddings at the padding positions of x are set to 0.
+    """
+    with tf.name_scope("embedding"):
+      embeddings = tf.gather(self.shared_weights, x)
+      # Scale embedding by the sqrt of the hidden size
+      embeddings *= self.hidden_size ** 0.5
+      # Create binary array of size [batch_size, length]
+      # where 1 = padding, 0 = not padding
+      padding = model_utils.get_padding(x)
+      # Set all padding embedding values to 0
+      embeddings *= tf.expand_dims(1 - padding, -1)
+      return embeddings
+  def linear(self, x):
+    """Computes logits by running x through a linear layer.
+    Args:
+      x: A float32 tensor with shape [batch_size, length, hidden_size]
+    Returns:
+      float32 tensor with shape [batch_size, length, vocab_size].
+    """
+    with tf.name_scope("presoftmax_linear"):
+      batch_size = tf.shape(x)[0]
+      length = tf.shape(x)[1]
+      x = tf.reshape(x, [-1, self.hidden_size])
+      logits = tf.matmul(x, self.shared_weights, transpose_b=True)
+      return tf.reshape(logits, [batch_size, length, self.vocab_size])
--- a/official/transformer/model/ffn_layer.py
+++ b/official/transformer/model/ffn_layer.py
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Implementation of fully connected network."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import tensorflow as tf
+class FeedFowardNetwork(tf.layers.Layer):
+  """Fully connected feedforward network."""
+  def __init__(self, hidden_size, filter_size, relu_dropout, train):
+    super(FeedFowardNetwork, self).__init__()
+    self.hidden_size = hidden_size
+    self.filter_size = filter_size
+    self.relu_dropout = relu_dropout
+    self.train = train
+    self.filter_dense_layer = tf.layers.Dense(
+        filter_size, use_bias=True, activation=tf.nn.relu, name="filter_layer")
+    self.output_dense_layer = tf.layers.Dense(
+        hidden_size, use_bias=True, name="output_layer")
+  def call(self, x, padding=None):
+    """Return outputs of the feedforward network.
+    Args:
+      x: tensor with shape [batch_size, length, hidden_size]
+      padding: (optional) If set, the padding values are temporarily removed
+        from x. The padding values are placed back in the output tensor in the
+        same locations. shape [batch_size, length]
+    Returns:
+      Output of the feedforward network.
+      tensor with shape [batch_size, length, hidden_size]
+    """
+    # Retrieve dynamically known shapes
+    batch_size = tf.shape(x)[0]
+    length = tf.shape(x)[1]
+    if padding is not None:
+      with tf.name_scope("remove_padding"):
+        # Flatten padding to [batch_size*length]
+        pad_mask = tf.reshape(padding, [-1])
+        nonpad_ids = tf.to_int32(tf.where(pad_mask < 1e-9))
+        # Reshape x to [batch_size*length, hidden_size] to remove padding
+        x = tf.reshape(x, [-1, self.hidden_size])
+        x = tf.gather_nd(x, indices=nonpad_ids)
+        # Reshape x from 2 dimensions to 3 dimensions.
+        x.set_shape([None, self.hidden_size])
+        x = tf.expand_dims(x, axis=0)
+    output = self.filter_dense_layer(x)
+    if self.train:
+      output = tf.nn.dropout(output, 1.0 - self.relu_dropout)
+    output = self.output_dense_layer(output)
+    if padding is not None:
+      with tf.name_scope("re_add_padding"):
+        output = tf.squeeze(output, axis=0)
+        output = tf.scatter_nd(
+            indices=nonpad_ids,
+            updates=output,
+            shape=[batch_size * length, self.hidden_size]
+        )
+        output = tf.reshape(output, [batch_size, length, self.hidden_size])
+    return output
--- a/official/transformer/model/model_params.py
+++ b/official/transformer/model/model_params.py
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Defines Transformer model parameters."""
+class TransformerBaseParams(object):
+  """Parameters for the base Transformer model."""
+  # Input params
+  batch_size = 2048  # Maximum number of tokens per batch of examples.
+  max_length = 256  # Maximum number of tokens per example.
+  # Model params
+  initializer_gain = 1.0  # Used in trainable variable initialization.
+  vocab_size = 33708  # Number of tokens defined in the vocabulary file.
+  hidden_size = 512  # Model dimension in the hidden layers.
+  num_hidden_layers = 6  # Number of layers in the encoder and decoder stacks.
+  num_heads = 8  # Number of heads to use in multi-headed attention.
+  filter_size = 2048  # Inner layer dimensionality in the feedforward network.
+  # Dropout values (only used when training)
+  layer_postprocess_dropout = 0.1
+  attention_dropout = 0.1
+  relu_dropout = 0.1
+  # Training params
+  label_smoothing = 0.1
+  learning_rate = 2.0
+  learning_rate_decay_rate = 1.0
+  learning_rate_warmup_steps = 16000
+  # Optimizer params
+  optimizer_adam_beta1 = 0.9
+  optimizer_adam_beta2 = 0.997
+  optimizer_adam_epsilon = 1e-09
+  # Default prediction params
+  extra_decode_length = 50
+  beam_size = 4
+  alpha = 0.6  # used to calculate length normalization in beam search
+class TransformerBigParams(TransformerBaseParams):
+  """Parameters for the big Transformer model."""
+  batch_size = 4096
+  hidden_size = 1024
+  filter_size = 4096
+  num_heads = 16
--- a/official/transformer/model/model_utils.py
+++ b/official/transformer/model/model_utils.py
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Transformer model helper methods."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import math
+import tensorflow as tf
+_NEG_INF = -1e9
+def get_position_encoding(
+    length, hidden_size, min_timescale=1.0, max_timescale=1.0e4):
+  """Return positional encoding.
+  Calculates the position encoding as a mix of sine and cosine functions with
+  geometrically increasing wavelengths.
+  Defined and formalized in "Attention Is All You Need", section 3.5.
+  Args:
+    length: Sequence length.
+    hidden_size: Size of the last dimension of the returned encoding.
+    min_timescale: Minimum scale that will be applied at each position
+    max_timescale: Maximum scale that will be applied at each position
+  Returns:
+    Tensor with shape [length, hidden_size]
+  """
+  position = tf.to_float(tf.range(length))
+  num_timescales = hidden_size // 2
+  log_timescale_increment = (
+      math.log(float(max_timescale) / float(min_timescale)) /
+      (tf.to_float(num_timescales) - 1))
+  inv_timescales = min_timescale * tf.exp(
+      tf.to_float(tf.range(num_timescales)) * -log_timescale_increment)
+  scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0)
+  signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)
+  return signal
+def get_decoder_self_attention_bias(length):
+  """Calculate bias for decoder that maintains model's autoregressive property.
+  Creates a tensor that masks out locations that correspond to illegal
+  connections, so prediction at position i cannot draw information from future
+  positions.
+  Args:
+    length: int length of sequences in batch.
+  Returns:
+    float tensor of shape [1, 1, length, length]
+  """
+  with tf.name_scope("decoder_self_attention_bias"):
+    valid_locs = tf.matrix_band_part(tf.ones([length, length]), -1, 0)
+    valid_locs = tf.reshape(valid_locs, [1, 1, length, length])
+    decoder_bias = _NEG_INF * (1.0 - valid_locs)
+  return decoder_bias
+def get_padding(x, padding_value=0):
+  """Return float tensor representing the padding values in x.
+  Args:
+    x: int tensor with any shape
+    padding_value: int value that marks the padding positions in x.
+  Returns:
+    float tensor with same shape as x containing values 0 or 1.
+      0 -> non-padding, 1 -> padding
+  """
+  with tf.name_scope("padding"):
+    return tf.to_float(tf.equal(x, padding_value))
+def get_padding_bias(x):
+  """Calculate bias tensor from padding values in tensor.
+  Bias tensor that is added to the pre-softmax multi-headed attention logits,
+  which has shape [batch_size, num_heads, length, length]. The tensor is zero at
+  non-padding locations, and -1e9 (negative infinity) at padding locations.
+  Args:
+    x: int tensor with shape [batch_size, length]
+  Returns:
+    Attention bias tensor of shape [batch_size, 1, 1, length].
+  """
+  with tf.name_scope("attention_bias"):
+    padding = get_padding(x)
+    attention_bias = padding * _NEG_INF
+    attention_bias = tf.expand_dims(
+        tf.expand_dims(attention_bias, axis=1), axis=1)
+  return attention_bias
--- a/official/transformer/model/model_utils_test.py
+++ b/official/transformer/model/model_utils_test.py
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Test Transformer model helper methods."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import tensorflow as tf  # pylint: disable=g-bad-import-order
+from official.transformer.model import model_utils
+NEG_INF = -1e9
+class ModelUtilsTest(tf.test.TestCase):
+  def test_get_padding(self):
+    x = tf.constant([[1, 0, 0, 0, 2], [3, 4, 0, 0, 0], [0, 5, 6, 0, 7]])
+    padding = model_utils.get_padding(x, padding_value=0)
+    with self.test_session() as sess:
+      padding = sess.run(padding)
+    self.assertAllEqual([[0, 1, 1, 1, 0], [0, 0, 1, 1, 1], [1, 0, 0, 1, 0]],
+                        padding)
+  def test_get_padding_bias(self):
+    x = tf.constant([[1, 0, 0, 0, 2], [3, 4, 0, 0, 0], [0, 5, 6, 0, 7]])
+    bias = model_utils.get_padding_bias(x)
+    bias_shape = tf.shape(bias)
+    flattened_bias = tf.reshape(bias, [3, 5])
+    with self.test_session() as sess:
+      flattened_bias, bias_shape = sess.run((flattened_bias, bias_shape))
+    self.assertAllEqual([[0, NEG_INF, NEG_INF, NEG_INF, 0],
+                         [0, 0, NEG_INF, NEG_INF, NEG_INF],
+                         [NEG_INF, 0, 0, NEG_INF, 0]],
+                        flattened_bias)
+    self.assertAllEqual([3, 1, 1, 5], bias_shape)
+  def test_get_decoder_self_attention_bias(self):
+    length = 5
+    bias = model_utils.get_decoder_self_attention_bias(length)
+    with self.test_session() as sess:
+      bias = sess.run(bias)
+    self.assertAllEqual([[[[0, NEG_INF, NEG_INF, NEG_INF, NEG_INF],
+                           [0, 0, NEG_INF, NEG_INF, NEG_INF],
+                           [0, 0, 0, NEG_INF, NEG_INF],
+                           [0, 0, 0, 0, NEG_INF],
+                           [0, 0, 0, 0, 0]]]],
+                        bias)
+if __name__ == "__main__":
+  tf.test.main()
--- a/official/transformer/model/transformer.py
+++ b/official/transformer/model/transformer.py
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Defines the Transformer model, and its encoder and decoder stacks.
+Model paper: https://arxiv.org/pdf/1706.03762.pdf
+Transformer model code source: https://github.com/tensorflow/tensor2tensor
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import tensorflow as tf  # pylint: disable=g-bad-import-order
+from official.transformer.model import attention_layer
+from official.transformer.model import beam_search
+from official.transformer.model import embedding_layer
+from official.transformer.model import ffn_layer
+from official.transformer.model import model_utils
+from official.transformer.utils.tokenizer import EOS_ID
+_NEG_INF = -1e9
+class Transformer(object):
+  """Transformer model for sequence to sequence data.
+  Implemented as described in: https://arxiv.org/pdf/1706.03762.pdf
+  The Transformer model consists of an encoder and decoder. The input is an int
+  sequence (or a batch of sequences). The encoder produces a continuous
+  representation, and the decoder uses the encoder output to generate
+  probabilities for the output sequence.
+  """
+  def __init__(self, params, train):
+    """Initialize layers to build Transformer model.
+    Args:
+      params: hyperparameter object defining layer sizes, dropout values, etc.
+      train: boolean indicating whether the model is in training mode. Used to
+        determine if dropout layers should be added.
+    """
+    self.train = train
+    self.params = params
+    self.embedding_softmax_layer = embedding_layer.EmbeddingSharedWeights(
+        params.vocab_size, params.hidden_size)
+    self.encoder_stack = EncoderStack(params, train)
+    self.decoder_stack = DecoderStack(params, train)
+  def __call__(self, inputs, targets=None):
+    """Calculate target logits or inferred target sequences.
+    Args:
+      inputs: int tensor with shape [batch_size, input_length].
+      targets: None or int tensor with shape [batch_size, target_length].
+    Returns:
+      If targets is defined, then return logits for each word in the target
+      sequence. float tensor with shape [batch_size, target_length, vocab_size]
+      If targets is None, then generate the output sequence one token at a time.
+        Returns a dictionary {
+          outputs: [batch_size, decoded length]
+          scores: [batch_size], float}
+    """
+    # Variance scaling is used here because it seems to work in many problems.
+    # Other reasonable initializers may also work just as well.
+    initializer = tf.variance_scaling_initializer(
+        self.params.initializer_gain, mode="fan_avg", distribution="uniform")
+    with tf.variable_scope("Transformer", initializer=initializer):
+      # Calculate attention bias for encoder self-attention and decoder
+      # multi-headed attention layers.
+      attention_bias = model_utils.get_padding_bias(inputs)
+      # Run the inputs through the encoder layer to map the symbol
+      # representations to continuous representations.
+      encoder_outputs = self.encode(inputs, attention_bias)
+      # Generate output sequence if targets is None, or return logits if target
+      # sequence is known.
+      if targets is None:
+        return self.predict(encoder_outputs, attention_bias)
+      else:
+        logits = self.decode(targets, encoder_outputs, attention_bias)
+        return logits
+  def encode(self, inputs, attention_bias):
+    """Generate continuous representation for inputs.
+    Args:
+      inputs: int tensor with shape [batch_size, input_length].
+      attention_bias: float tensor with shape [batch_size, 1, 1, input_length]
+    Returns:
+      float tensor with shape [batch_size, input_length, hidden_size]
+    """
+    with tf.name_scope("encode"):
+      # Prepare inputs to the layer stack by adding positional encodings and
+      # applying dropout.
+      embedded_inputs = self.embedding_softmax_layer(inputs)
+      inputs_padding = model_utils.get_padding(inputs)
+      with tf.name_scope("add_pos_encoding"):
+        length = tf.shape(embedded_inputs)[1]
+        pos_encoding = model_utils.get_position_encoding(
+            length, self.params.hidden_size)
+        encoder_inputs = embedded_inputs + pos_encoding
+      if self.train:
+        encoder_inputs = tf.nn.dropout(
+            encoder_inputs, 1 - self.params.layer_postprocess_dropout)
+      return self.encoder_stack(encoder_inputs, attention_bias, inputs_padding)
+  def decode(self, targets, encoder_outputs, attention_bias):
+    """Generate logits for each value in the target sequence.
+    Args:
+      targets: target values for the output sequence.
+        int tensor with shape [batch_size, target_length]
+      encoder_outputs: continuous representation of input sequence.
+        float tensor with shape [batch_size, input_length, hidden_size]
+      attention_bias: float tensor with shape [batch_size, 1, 1, input_length]
+    Returns:
+      float32 tensor with shape [batch_size, target_length, vocab_size]
+    """
+    with tf.name_scope("decode"):
+      # Prepare inputs to decoder layers by shifting targets, adding positional
+      # encoding and applying dropout.
+      decoder_inputs = self.embedding_softmax_layer(targets)
+      with tf.name_scope("shift_targets"):
+        # Shift targets to the right, and remove the last element
+        decoder_inputs = tf.pad(
+            decoder_inputs, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
+      with tf.name_scope("add_pos_encoding"):
+        length = tf.shape(decoder_inputs)[1]
+        decoder_inputs += model_utils.get_position_encoding(
+            length, self.params.hidden_size)
+      if self.train:
+        decoder_inputs = tf.nn.dropout(
+            decoder_inputs, 1 - self.params.layer_postprocess_dropout)
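+      # Run the decoder inputs through the decoder stack, then project the
+      # outputs to logits with the shared embedding weights.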
+      decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
+          length)
+      outputs = self.decoder_stack(
+          decoder_inputs, encoder_outputs, decoder_self_attention_bias,
+          attention_bias)
+      logits = self.embedding_softmax_layer.linear(outputs)
+      return logits
+  def _get_symbols_to_logits_fn(self, max_decode_length):
+    """Returns a decoding function that calculates logits of the next tokens."""
+    timing_signal = model_utils.get_position_encoding(
+        max_decode_length + 1, self.params.hidden_size)
+    decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
+        max_decode_length)
+    def symbols_to_logits_fn(ids, i, cache):
+      """Generate logits for next potential IDs.
+      Args:
+        ids: Current decoded sequences.
+          int tensor with shape [batch_size * beam_size, i + 1]
+        i: Loop index
+        cache: dictionary of values storing the encoder output, encoder-decoder
+          attention bias, and previous decoder attention values.
+      Returns:
+        Tuple of
+          (logits with shape [batch_size * beam_size, vocab_size],
+           updated cache values)
+      """
+      # Set decoder input to the last generated IDs
+      decoder_input = ids[:, -1:]
+      # Preprocess decoder input by getting embeddings and adding timing signal.
+      decoder_input = self.embedding_softmax_layer(decoder_input)
+      decoder_input += timing_signal[i:i + 1]
+      self_attention_bias = decoder_self_attention_bias[:, :, i:i + 1, :i + 1]
+      decoder_outputs = self.decoder_stack(
+          decoder_input, cache.get("encoder_outputs"), self_attention_bias,
+          cache.get("encoder_decoder_attention_bias"), cache)
+      logits = self.embedding_softmax_layer.linear(decoder_outputs)
+      logits = tf.squeeze(logits, axis=[1])
+      return logits, cache
+    return symbols_to_logits_fn
+  def predict(self, encoder_outputs, encoder_decoder_attention_bias):
+    """Return predicted sequence."""
+    batch_size = tf.shape(encoder_outputs)[0]
+    input_length = tf.shape(encoder_outputs)[1]
+    max_decode_length = input_length + self.params.extra_decode_length
+    symbols_to_logits_fn = self._get_symbols_to_logits_fn(max_decode_length)
+    # Create initial set of IDs that will be passed into symbols_to_logits_fn.
+    initial_ids = tf.zeros([batch_size], dtype=tf.int32)
+    # Create cache storing decoder attention values for each layer.
+    cache = {
+        "layer_%d" % layer: {
+            "k": tf.zeros([batch_size, 0, self.params.hidden_size]),
+            "v": tf.zeros([batch_size, 0, self.params.hidden_size]),
+        } for layer in range(self.params.num_hidden_layers)}
+    # Add encoder output and attention bias to the cache.
+    cache["encoder_outputs"] = encoder_outputs
+    cache["encoder_decoder_attention_bias"] = encoder_decoder_attention_bias
+    # Use beam search to find the top beam_size sequences and scores.
+    decoded_ids, scores = beam_search.sequence_beam_search(
+        symbols_to_logits_fn=symbols_to_logits_fn,
+        initial_ids=initial_ids,
+        initial_cache=cache,
+        vocab_size=self.params.vocab_size,
+        beam_size=self.params.beam_size,
+        alpha=self.params.alpha,
+        max_decode_length=max_decode_length,
+        eos_id=EOS_ID)
+    # Get the top sequence for each batch element
+    top_decoded_ids = decoded_ids[:, 0, 1:]
+    top_scores = scores[:, 0]
+    return {"outputs": top_decoded_ids, "scores": top_scores}
+class LayerNormalization(tf.layers.Layer):
+  """Applies layer normalization."""
+  def __init__(self, hidden_size):
+    super(LayerNormalization, self).__init__()
+    self.hidden_size = hidden_size
+  def build(self, _):
+    self.scale = tf.get_variable("layer_norm_scale", [self.hidden_size],
+                                 initializer=tf.ones_initializer())
+    self.bias = tf.get_variable("layer_norm_bias", [self.hidden_size],
+                                initializer=tf.zeros_initializer())
+    self.built = True
+  def call(self, x, epsilon=1e-6):
+    mean = tf.reduce_mean(x, axis=[-1], keepdims=True)
+    variance = tf.reduce_mean(tf.square(x - mean), axis=[-1], keepdims=True)
+    norm_x = (x - mean) * tf.rsqrt(variance + epsilon)
+    return norm_x * self.scale + self.bias
+class PrePostProcessingWrapper(object):
+  """Wrapper class that applies layer pre-processing and post-processing."""
+  def __init__(self, layer, params, train):
+    self.layer = layer
+    self.postprocess_dropout = params.layer_postprocess_dropout
+    self.train = train
+    # Create normalization layer
+    self.layer_norm = LayerNormalization(params.hidden_size)
+  def __call__(self, x, *args, **kwargs):
+    # Preprocessing: apply layer normalization
+    y = self.layer_norm(x)
+    # Get layer output
+    y = self.layer(y, *args, **kwargs)
+    # Postprocessing: apply dropout and residual connection
+    if self.train:
+      y = tf.nn.dropout(y, 1 - self.postprocess_dropout)
+    return x + y
+class EncoderStack(tf.layers.Layer):
+  """Transformer encoder stack.
+  The encoder stack is made up of N identical layers. Each layer is composed
+  of the sublayers:
+    1. Self-attention layer
+    2. Feedforward network (which is 2 fully-connected layers)
+  """
+  def __init__(self, params, train):
+    super(EncoderStack, self).__init__()
+    self.layers = []
+    for _ in range(params.num_hidden_layers):
+      # Create sublayers for each layer.
+      self_attention_layer = attention_layer.SelfAttention(
+          params.hidden_size, params.num_heads, params.attention_dropout, train)
+      feed_forward_network = ffn_layer.FeedFowardNetwork(
+          params.hidden_size, params.filter_size, params.relu_dropout, train)
+      self.layers.append([
+          PrePostProcessingWrapper(self_attention_layer, params, train),
+          PrePostProcessingWrapper(feed_forward_network, params, train)])
+    # Create final layer normalization layer.
+    self.output_normalization = LayerNormalization(params.hidden_size)
+  def call(self, encoder_inputs, attention_bias, inputs_padding):
+    """Return the output of the encoder layer stacks.
+    Args:
+      encoder_inputs: tensor with shape [batch_size, input_length, hidden_size]
+      attention_bias: bias for the encoder self-attention layer.
+        [batch_size, 1, 1, input_length]
+      inputs_padding: P
+    Returns:
+      Output of encoder layer stack.
+      float32 tensor with shape [batch_size, input_length, hidden_size]
+    """
+    for n, layer in enumerate(self.layers):
+      # Run inputs through the sublayers.
+      self_attention_layer = layer[0]
+      feed_forward_network = layer[1]
+      with tf.variable_scope("layer_%d" % n):
+        with tf.variable_scope("self_attention"):
+          encoder_inputs = self_attention_layer(encoder_inputs, attention_bias)
+        with tf.variable_scope("ffn"):
+          encoder_inputs = feed_forward_network(encoder_inputs, inputs_padding)
+    return self.output_normalization(encoder_inputs)
+class DecoderStack(tf.layers.Layer):
+  """Transformer decoder stack.
+  Like the encoder stack, the decoder stack is made up of N identical layers.
+  Each layer is composed of the sublayers:
+    1. Self-attention layer
+    2. Multi-headed attention layer combining encoder outputs with results from
+       the previous self-attention layer.
+    3. Feedforward network (2 fully-connected layers)
+  """
+  def __init__(self, params, train):
+    super(DecoderStack, self).__init__()
+    self.layers = []
+    for _ in range(params.num_hidden_layers):
+      self_attention_layer = attention_layer.SelfAttention(
+          params.hidden_size, params.num_heads, params.attention_dropout, train)
+      enc_dec_attention_layer = attention_layer.Attention(
+          params.hidden_size, params.num_heads, params.attention_dropout, train)
+      feed_forward_network = ffn_layer.FeedFowardNetwork(
+          params.hidden_size, params.filter_size, params.relu_dropout, train)
+      self.layers.append([
+          PrePostProcessingWrapper(self_attention_layer, params, train),
+          PrePostProcessingWrapper(enc_dec_attention_layer, params, train),
+          PrePostProcessingWrapper(feed_forward_network, params, train)])
+    self.output_normalization = LayerNormalization(params.hidden_size)
+  def call(self, decoder_inputs, encoder_outputs, decoder_self_attention_bias,
+           attention_bias, cache=None):
+    """Return the output of the decoder layer stacks.
+    Args:
+      decoder_inputs: tensor with shape [batch_size, target_length, hidden_size]
+      encoder_outputs: tensor with shape [batch_size, input_length, hidden_size]
+      decoder_self_attention_bias: bias for decoder self-attention layer.
+        [1, 1, target_len, target_length]
+      attention_bias: bias for encoder-decoder attention layer.
+        [batch_size, 1, 1, input_length]
+      cache: (Used for fast decoding) A nested dictionary storing previous
+        decoder self-attention values. The items are:
+          {layer_n: {"k": tensor with shape [batch_size, i, key_channels],
+                     "v": tensor with shape [batch_size, i, value_channels]},
+           ...}
+    Returns:
+      Output of decoder layer stack.
+      float32 tensor with shape [batch_size, target_length, hidden_size]
+    """
+    for n, layer in enumerate(self.layers):
+      self_attention_layer = layer[0]
+      enc_dec_attention_layer = layer[1]
+      feed_forward_network = layer[2]
+      # Run inputs through the sublayers.
+      layer_name = "layer_%d" % n
+      layer_cache = cache[layer_name] if cache is not None else None
+      with tf.variable_scope(layer_name):
+        with tf.variable_scope("self_attention"):
+          decoder_inputs = self_attention_layer(
+              decoder_inputs, decoder_self_attention_bias, cache=layer_cache)
+        with tf.variable_scope("encdec_attention"):
+          decoder_inputs = enc_dec_attention_layer(
+              decoder_inputs, encoder_outputs, attention_bias)
+        with tf.variable_scope("ffn"):
+          decoder_inputs = feed_forward_network(decoder_inputs)
+    return self.output_normalization(decoder_inputs)
--- a/official/transformer/test_data/newstest2014.de
+++ b/official/transformer/test_data/newstest2014.de
--- a/official/transformer/test_data/newstest2014.en
+++ b/official/transformer/test_data/newstest2014.en
--- a/official/transformer/transformer_main.py
+++ b/official/transformer/transformer_main.py
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Creates an estimator to train the Transformer model."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import argparse
+import os
+import sys
+import tempfile
+# pylint: disable=g-bad-import-order
+from six.moves import xrange  # pylint: disable=redefined-builtin
+import tensorflow as tf
+# pylint: enable=g-bad-import-order
+from official.transformer import compute_bleu
+from official.transformer import translate
+from official.transformer.data_download import VOCAB_FILE
+from official.transformer.model import model_params
+from official.transformer.model import transformer
+from official.transformer.utils import dataset
+from official.transformer.utils import metrics
+from official.transformer.utils import tokenizer
+# Number of epochs to train when neither --train_steps nor --train_epochs is
+# set.
+DEFAULT_TRAIN_EPOCHS = 10
+# Subdirectory of the estimator's model_dir that receives the BLEU summaries.
+BLEU_DIR = "bleu"
+# Iteration count used by train_schedule when training should continue until
+# a BLEU threshold is passed rather than for a fixed number of iterations.
+INF = int(1e9)
+def model_fn(features, labels, mode, params):
+  """Defines how to train, evaluate and predict from the transformer model."""
+  with tf.variable_scope("model"):
+    inputs, targets = features, labels
+    # Create model and get output logits.
+    model = transformer.Transformer(params, mode == tf.estimator.ModeKeys.TRAIN)
+    output = model(inputs, targets)
+    # When in prediction mode, the labels/targets is None. The model output
+    # is the prediction
+    if mode == tf.estimator.ModeKeys.PREDICT:
+      return tf.estimator.EstimatorSpec(
+          tf.estimator.ModeKeys.PREDICT,
+          predictions=output)
+    logits = output
+    # Calculate model loss.
+    xentropy, weights = metrics.padded_cross_entropy_loss(
+        logits, targets, params.label_smoothing, params.vocab_size)
+    loss = tf.reduce_sum(xentropy * weights) / tf.reduce_sum(weights)
+    if mode == tf.estimator.ModeKeys.EVAL:
+      return tf.estimator.EstimatorSpec(
+          mode=mode, loss=loss, predictions={"predictions": logits},
+          eval_metric_ops=metrics.get_eval_metrics(logits, labels, params))
+    else:
+      train_op = get_train_op(loss, params)
+      return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
+def get_learning_rate(learning_rate, hidden_size, learning_rate_warmup_steps):
+  """Calculate learning rate with linear warmup and rsqrt decay."""
+  with tf.name_scope("learning_rate"):
+    warmup_steps = tf.to_float(learning_rate_warmup_steps)
+    step = tf.to_float(tf.train.get_or_create_global_step())
+    learning_rate *= (hidden_size ** -0.5)
+    # Apply linear warmup
+    learning_rate *= tf.minimum(1.0, step / warmup_steps)
+    # Apply rsqrt decay
+    learning_rate *= tf.rsqrt(tf.maximum(step, warmup_steps))
+    # Save learning rate value to TensorBoard summary.
+    tf.summary.scalar("learning_rate", learning_rate)
+    return learning_rate
+def get_train_op(loss, params):
+  """Generate training operation that updates variables based on loss."""
+  with tf.variable_scope("get_train_op"):
+    learning_rate = get_learning_rate(
+        params.learning_rate, params.hidden_size,
+        params.learning_rate_warmup_steps)
+    # Create optimizer. Use LazyAdamOptimizer from TF contrib, which is faster
+    # than the TF core Adam optimizer.
+    optimizer = tf.contrib.opt.LazyAdamOptimizer(
+        learning_rate,
+        beta1=params.optimizer_adam_beta1,
+        beta2=params.optimizer_adam_beta2,
+        epsilon=params.optimizer_adam_epsilon)
+    # Calculate and apply gradients using LazyAdamOptimizer.
+    global_step = tf.train.get_global_step()
+    tvars = tf.trainable_variables()
+    gradients = optimizer.compute_gradients(
+        loss, tvars, colocate_gradients_with_ops=True)
+    train_op = optimizer.apply_gradients(
+        gradients, global_step=global_step, name="train")
+    # Save gradient norm to Tensorboard
+    tf.summary.scalar("global_norm/gradient_norm",
+                      tf.global_norm(list(zip(*gradients))[0]))
+    return train_op
+def translate_and_compute_bleu(estimator, subtokenizer, bleu_source, bleu_ref):
+  """Translate file and report the cased and uncased bleu scores."""
+  # Create temporary file to store translation.
+  tmp = tempfile.NamedTemporaryFile(delete=False)
+  tmp_filename = tmp.name
+  translate.translate_file(
+      estimator, subtokenizer, bleu_source, output_file=tmp_filename,
+      print_all_translations=False)
+  # Compute uncased and cased bleu scores.
+  uncased_score = compute_bleu.bleu_wrapper(bleu_ref, tmp_filename, False)
+  cased_score = compute_bleu.bleu_wrapper(bleu_ref, tmp_filename, True)
+  os.remove(tmp_filename)
+  return uncased_score, cased_score
+def get_global_step(estimator):
+  """Return estimator's last checkpoint."""
+  return int(estimator.latest_checkpoint().split("-")[-1])
+def evaluate_and_log_bleu(estimator, bleu_writer, bleu_source, bleu_ref):
+  """Calculate and record the BLEU score."""
+  subtokenizer = tokenizer.Subtokenizer(
+      os.path.join(FLAGS.data_dir, FLAGS.vocab_file))
+  uncased_score, cased_score = translate_and_compute_bleu(
+      estimator, subtokenizer, bleu_source, bleu_ref)
+  print("Bleu score (uncased):", uncased_score)
+  print("Bleu score (cased):", cased_score)
+  summary = tf.Summary(value=[
+      tf.Summary.Value(tag="bleu/uncased", simple_value=uncased_score),
+      tf.Summary.Value(tag="bleu/cased", simple_value=cased_score),
+  ])
+  bleu_writer.add_summary(summary, get_global_step(estimator))
+  bleu_writer.flush()
+  return uncased_score, cased_score
+def train_schedule(
+    estimator, train_eval_iterations, single_iteration_train_steps=None,
+    single_iteration_train_epochs=None, bleu_source=None, bleu_ref=None,
+    bleu_threshold=None):
+  """Train and evaluate model, and optionally compute model's BLEU score.
+  **Step vs. Epoch vs. Iteration**
+  Steps and epochs are canonical terms used in TensorFlow and general machine
+  learning. They are used to describe running a single process (train/eval):
+    - Step refers to running the process through a single or batch of examples.
+    - Epoch refers to running the process through an entire dataset.
+  E.g. training a dataset with 100 examples. The dataset is
+  divided into 20 batches with 5 examples per batch. A single training step
+  trains the model on one batch. After 20 training steps, the model will have
+  trained on every batch in the dataset, or, in other words, one epoch.
+  Meanwhile, iteration is used in this implementation to describe running
+  multiple processes (training and eval).
+    - A single iteration:
+      1. trains the model for a specific number of steps or epochs.
+      2. evaluates the model.
+      3. (if source and ref files are provided) compute BLEU score.
+  This function runs through multiple train+eval+bleu iterations.
+  Args:
+    estimator: tf.Estimator containing model to train.
+    train_eval_iterations: Number of times to repeat the train+eval iteration.
+    single_iteration_train_steps: Number of steps to train in one iteration.
+    single_iteration_train_epochs: Number of epochs to train in one iteration.
+    bleu_source: File containing text to be translated for BLEU calculation.
+    bleu_ref: File containing reference translations for BLEU calculation.
+    bleu_threshold: minimum BLEU score before training is stopped.
+  Raises:
+    ValueError: if both or none of single_iteration_train_steps and
+      single_iteration_train_epochs were defined.
+  """
+  # Ensure that exactly one of single_iteration_train_steps and
+  # single_iteration_train_epochs is defined.
+  if single_iteration_train_steps is None:
+    if single_iteration_train_epochs is None:
+      raise ValueError(
+          "Exactly one of single_iteration_train_steps or "
+          "single_iteration_train_epochs must be defined. Both were none.")
+  else:
+    if single_iteration_train_epochs is not None:
+      raise ValueError(
+          "Exactly one of single_iteration_train_steps or "
+          "single_iteration_train_epochs must be defined. Both were defined.")
+  evaluate_bleu = bleu_source is not None and bleu_ref is not None
+  # Print out training schedule
+  print("Training schedule:")
+  if single_iteration_train_epochs is not None:
+    print("\t1. Train for %d epochs." % single_iteration_train_epochs)
+  else:
+    print("\t1. Train for %d steps." % single_iteration_train_steps)
+  print("\t2. Evaluate model.")
+  if evaluate_bleu:
+    print("\t3. Compute BLEU score.")
+    if bleu_threshold is not None:
+      print("Repeat above steps until the BLEU score reaches", bleu_threshold)
+  if not evaluate_bleu or bleu_threshold is None:
+    print("Repeat above steps %d times." % train_eval_iterations)
+  if evaluate_bleu:
+    # Set summary writer to log bleu score.
+    bleu_writer = tf.summary.FileWriter(
+        os.path.join(estimator.model_dir, BLEU_DIR))
+    if bleu_threshold is not None:
+      # Change loop stopping condition if bleu_threshold is defined.
+      train_eval_iterations = INF
+  # Loop training/evaluation/bleu cycles
+  for i in xrange(train_eval_iterations):
+    print("Starting iteration", i + 1)
+    # Train the model for single_iteration_train_steps or until the input fn
+    # runs out of examples (if single_iteration_train_steps is None).
+    estimator.train(dataset.train_input_fn, steps=single_iteration_train_steps)
+    eval_results = estimator.evaluate(dataset.eval_input_fn)
+    print("Evaluation results (iter %d/%d):" % (i + 1, train_eval_iterations),
+          eval_results)
+    if evaluate_bleu:
+      uncased_score, _ = evaluate_and_log_bleu(
+          estimator, bleu_writer, bleu_source, bleu_ref)
+      if bleu_threshold is not None and uncased_score > bleu_threshold:
+        bleu_writer.close()
+        break
+def main(_):
+  # Set logging level to INFO to display training progress (logged by the
+  # estimator)
+  tf.logging.set_verbosity(tf.logging.INFO)
+  if FLAGS.params == "base":
+    params = model_params.TransformerBaseParams
+  elif FLAGS.params == "big":
+    params = model_params.TransformerBigParams
+  else:
+    raise ValueError("Invalid parameter set defined: %s."
+                     "Expected 'base' or 'big.'" % FLAGS.params)
+  # Determine training schedule based on flags.
+  if FLAGS.train_steps is not None and FLAGS.train_epochs is not None:
+    raise ValueError("Both --train_steps and --train_epochs were set. Only one "
+                     "may be defined.")
+  if FLAGS.train_steps is not None:
+    train_eval_iterations = FLAGS.train_steps // FLAGS.steps_between_eval
+    single_iteration_train_steps = FLAGS.steps_between_eval
+    single_iteration_train_epochs = None
+  else:
+    if FLAGS.train_epochs is None:
+      FLAGS.train_epochs = DEFAULT_TRAIN_EPOCHS
+    train_eval_iterations = FLAGS.train_epochs // FLAGS.epochs_between_eval
+    single_iteration_train_steps = None
+    single_iteration_train_epochs = FLAGS.epochs_between_eval
+  # Make sure that the BLEU source and ref files if set
+  if FLAGS.bleu_source is not None and FLAGS.bleu_ref is not None:
+    if not tf.gfile.Exists(FLAGS.bleu_source):
+      raise ValueError("BLEU source file %s does not exist" % FLAGS.bleu_source)
+    if not tf.gfile.Exists(FLAGS.bleu_ref):
+      raise ValueError("BLEU source file %s does not exist" % FLAGS.bleu_ref)
+  # Add flag-defined parameters to params object
+  params.data_dir = FLAGS.data_dir
+  params.num_cpu_cores = FLAGS.num_cpu_cores
+  params.epochs_between_eval = FLAGS.epochs_between_eval
+  params.repeat_dataset = single_iteration_train_epochs
+  estimator = tf.estimator.Estimator(
+      model_fn=model_fn, model_dir=FLAGS.model_dir, params=params)
+  train_schedule(
+      estimator, train_eval_iterations, single_iteration_train_steps,
+      single_iteration_train_epochs, FLAGS.bleu_source, FLAGS.bleu_ref,
+      FLAGS.bleu_threshold)
+if __name__ == "__main__":
+  parser = argparse.ArgumentParser()
+  parser.add_argument(
+      "--data_dir", "-dd", type=str, default="/tmp/translate_ende",
+      help="[default: %(default)s] Directory containing training and "
+           "evaluation data, and vocab file used for encoding.",
+      metavar="<DD>")
+  parser.add_argument(
+      "--vocab_file", "-vf", type=str, default=VOCAB_FILE,
+      help="[default: %(default)s] Name of vocabulary file.",
+      metavar="<vf>")
+  parser.add_argument(
+      "--model_dir", "-md", type=str, default="/tmp/transformer_model",
+      help="[default: %(default)s] Directory to save Transformer model "
+           "training checkpoints",
+      metavar="<MD>")
+  parser.add_argument(
+      "--params", "-p", type=str, default="big", choices=["base", "big"],
+      help="[default: %(default)s] Parameter set to use when creating and "
+           "training the model.",
+      metavar="<P>")
+  parser.add_argument(
+      "--num_cpu_cores", "-nc", type=int, default=4,
+      help="[default: %(default)s] Number of CPU cores to use in the input "
+           "pipeline.",
+      metavar="<NC>")
+  # Flags for training with epochs. (default)
+  parser.add_argument(
+      "--train_epochs", "-te", type=int, default=None,
+      help="The number of epochs used to train. If both --train_epochs and "
+           "--train_steps are not set, the model will train for %d epochs." %
+      DEFAULT_TRAIN_EPOCHS,
+      metavar="<TE>")
+  parser.add_argument(
+      "--epochs_between_eval", "-ebe", type=int, default=1,
+      help="[default: %(default)s] The number of training epochs to run "
+           "between evaluations.",
+      metavar="<TE>")
+  # Flags for training with steps (may be used for debugging)
+  parser.add_argument(
+      "--train_steps", "-ts", type=int, default=None,
+      help="Total number of training steps. If both --train_epochs and "
+           "--train_steps are not set, the model will train for %d epochs." %
+      DEFAULT_TRAIN_EPOCHS,
+      metavar="<TS>")
+  parser.add_argument(
+      "--steps_between_eval", "-sbe", type=int, default=1000,
+      help="[default: %(default)s] Number of training steps to run between "
+           "evaluations.",
+      metavar="<SBE>")
+  # BLEU score computation
+  parser.add_argument(
+      "--bleu_source", "-bs", type=str, default=None,
+      help="Path to source file containing text translate when calculating the "
+           "official BLEU score. Both --bleu_source and --bleu_ref must be "
+           "set. The BLEU score will be calculated during model evaluation.",
+      metavar="<BS>")
+  parser.add_argument(
+      "--bleu_ref", "-br", type=str, default=None,
+      help="Path to file containing the reference translation for calculating "
+           "the official BLEU score. Both --bleu_source and --bleu_ref must be "
+           "set. The BLEU score will be calculated during model evaluation.",
+      metavar="<BR>")
+  parser.add_argument(
+      "--bleu_threshold", "-bt", type=float, default=None,
+      help="Stop training when the uncased BLEU score reaches this value. "
+           "Setting this overrides the total number of steps or epochs set by "
+           "--train_steps or --train_epochs.",
+      metavar="<BT>")
+  FLAGS, unparsed = parser.parse_known_args()
+  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
--- a/official/transformer/translate.py
+++ b/official/transformer/translate.py
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Translate text or files using trained transformer model."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import argparse
+import os
+import sys
+# pylint: disable=g-bad-import-order
+from six.moves import xrange  # pylint: disable=redefined-builtin
+import tensorflow as tf
+# pylint: enable=g-bad-import-order
+from official.transformer.data_download import VOCAB_FILE
+from official.transformer.model import model_params
+from official.transformer.utils import tokenizer
+# Number of encoded inputs per batch when decoding; also stored as
+# params.batch_size in main().
+_DECODE_BATCH_SIZE = 32
+# Stored as params.extra_decode_length in main().
+_EXTRA_DECODE_LENGTH = 100
+# Stored as params.beam_size in main().
+_BEAM_SIZE = 4
+# Stored as params.alpha in main().
+# NOTE(review): presumably the beam search length-normalization strength;
+# confirm against the model code.
+_ALPHA = 0.6
+def _get_sorted_inputs(filename):
+  """Read and sort lines from the file sorted by decreasing length.
+  Args:
+    filename: String name of file to read inputs from.
+  Returns:
+    Sorted list of inputs, and dictionary mapping original index->sorted index
+    of each element.
+  """
+  with tf.gfile.Open(filename) as f:
+    records = f.read().split("\n")
+    inputs = [record.strip() for record in records]
+    if not inputs[-1]:
+      inputs.pop()
+  input_lens = [(i, len(line.split())) for i, line in enumerate(inputs)]
+  sorted_input_lens = sorted(input_lens, key=lambda x: x[1], reverse=True)
+  sorted_inputs = []
+  sorted_keys = {}
+  for i, (index, _) in enumerate(sorted_input_lens):
+    sorted_inputs.append(inputs[index])
+    sorted_keys[index] = i
+  return sorted_inputs, sorted_keys
+def _encode_and_add_eos(line, subtokenizer):
+  """Encode line with subtokenizer, and add EOS id to the end."""
+  return subtokenizer.encode(line) + [tokenizer.EOS_ID]
+def _trim_and_decode(ids, subtokenizer):
+  """Trim EOS and PAD tokens from ids, and decode to return a string."""
+  try:
+    index = list(ids).index(tokenizer.EOS_ID)
+    return subtokenizer.decode(ids[:index])
+  except ValueError:  # No EOS found in sequence
+    return subtokenizer.decode(ids)
+def translate_file(
+    estimator, subtokenizer, input_file, output_file=None,
+    print_all_translations=True):
+  """Translate lines in file, and save to output file if specified.
+  Args:
+    estimator: tf.Estimator used to generate the translations.
+    subtokenizer: Subtokenizer object for encoding and decoding source and
+       translated lines.
+    input_file: file containing lines to translate
+    output_file: file that stores the generated translations.
+    print_all_translations: If true, all translations are printed to stdout.
+  Raises:
+    ValueError: if output file is invalid.
+  """
+  batch_size = _DECODE_BATCH_SIZE
+  # Read and sort inputs by length. Keep dictionary (original index-->new index
+  # in sorted list) to write translations in the original order.
+  sorted_inputs, sorted_keys = _get_sorted_inputs(input_file)
+  num_decode_batches = (len(sorted_inputs) - 1) // batch_size + 1
+  def input_generator():
+    """Yield encoded strings from sorted_inputs."""
+    for i, line in enumerate(sorted_inputs):
+      if i % batch_size == 0:
+        batch_num = (i // batch_size) + 1
+        print("Decoding batch %d out of %d." % (batch_num, num_decode_batches))
+      yield _encode_and_add_eos(line, subtokenizer)
+  def input_fn():
+    """Created batched dataset of encoded inputs."""
+    ds = tf.data.Dataset.from_generator(
+        input_generator, tf.int64, tf.TensorShape([None]))
+    ds = ds.padded_batch(batch_size, [None])
+    return ds
+  translations = []
+  for i, prediction in enumerate(estimator.predict(input_fn)):
+    translation = _trim_and_decode(prediction["outputs"], subtokenizer)
+    translations.append(translation)
+    if print_all_translations:
+      print("Translating:")
+      print("\tInput: %s" % sorted_inputs[i])
+      print("\tOutput: %s\n" % translation)
+      print("=" * 100)
+  # Write translations in the order they appeared in the original file.
+  if output_file is not None:
+    if tf.gfile.IsDirectory(output_file):
+      raise ValueError("File output is a directory, will not save outputs to "
+                       "file.")
+    tf.logging.info("Writing to file %s" % output_file)
+    with tf.gfile.Open(output_file, "w") as f:
+      for index in xrange(len(sorted_keys)):
+        f.write("%s\n" % translations[sorted_keys[index]])
+def translate_text(estimator, subtokenizer, txt):
+  """Translate a single string."""
+  encoded_txt = _encode_and_add_eos(txt, subtokenizer)
+  def input_fn():
+    ds = tf.data.Dataset.from_tensors(encoded_txt)
+    ds = ds.batch(_DECODE_BATCH_SIZE)
+    return ds
+  predictions = estimator.predict(input_fn)
+  translation = next(predictions)["outputs"]
+  translation = _trim_and_decode(translation, subtokenizer)
+  print("Translation of \"%s\": \"%s\"" % (txt, translation))
+def main(unused_argv):
+  from official.transformer import transformer_main
+  tf.logging.set_verbosity(tf.logging.INFO)
+  if FLAGS.text is None and FLAGS.file is None:
+    tf.logging.warn("Nothing to translate. Make sure to call this script using "
+                    "flags --text or --file.")
+    return
+  subtokenizer = tokenizer.Subtokenizer(
+      os.path.join(FLAGS.data_dir, FLAGS.vocab_file))
+  if FLAGS.params == "base":
+    params = model_params.TransformerBaseParams
+  elif FLAGS.params == "big":
+    params = model_params.TransformerBigParams
+  else:
+    raise ValueError("Invalid parameter set defined: %s."
+                     "Expected 'base' or 'big.'" % FLAGS.params)
+  # Set up estimator and params
+  params.beam_size = _BEAM_SIZE
+  params.alpha = _ALPHA
+  params.extra_decode_length = _EXTRA_DECODE_LENGTH
+  params.batch_size = _DECODE_BATCH_SIZE
+  estimator = tf.estimator.Estimator(
+      model_fn=transformer_main.model_fn, model_dir=FLAGS.model_dir,
+      params=params)
+  if FLAGS.text is not None:
+    tf.logging.info("Translating text: %s" % FLAGS.text)
+    translate_text(estimator, subtokenizer, FLAGS.text)
+  if FLAGS.file is not None:
+    input_file = os.path.abspath(FLAGS.file)
+    tf.logging.info("Translating file: %s" % input_file)
+    if not tf.gfile.Exists(FLAGS.file):
+      raise ValueError("File does not exist: %s" % input_file)
+    output_file = None
+    if FLAGS.file_out is not None:
+      output_file = os.path.abspath(FLAGS.file_out)
+      tf.logging.info("File output specified: %s" % output_file)
+    translate_file(estimator, subtokenizer, input_file, output_file)
+if __name__ == "__main__":
+  parser = argparse.ArgumentParser()
+  # Model arguments
+  parser.add_argument(
+      "--data_dir", "-dd", type=str, default="/tmp/data/translate_ende",
+      help="[default: %(default)s] Directory where vocab file is stored.",
+      metavar="<DD>")
+  parser.add_argument(
+      "--vocab_file", "-vf", type=str, default=VOCAB_FILE,
+      help="[default: %(default)s] Name of vocabulary file.",
+      metavar="<vf>")
+  parser.add_argument(
+      "--model_dir", "-md", type=str, default="/tmp/transformer_model",
+      help="[default: %(default)s] Directory containing Transformer model "
+           "checkpoints.",
+      metavar="<MD>")
+  parser.add_argument(
+      "--params", "-p", type=str, default="big", choices=["base", "big"],
+      help="[default: %(default)s] Parameter used for trained model.",
+      metavar="<P>")
+  # Flags for specifying text/file to be translated.
+  parser.add_argument(
+      "--text", "-t", type=str, default=None,
+      help="[default: %(default)s] Text to translate. Output will be printed "
+           "to console.",
+      metavar="<T>")
+  parser.add_argument(
+      "--file", "-f", type=str, default=None,
+      help="[default: %(default)s] File containing text to translate. "
+           "Translation will be printed to console and, if --file_out is "
+           "provided, saved to an output file.",
+      metavar="<F>")
+  parser.add_argument(
+      "--file_out", "-fo", type=str, default=None,
+      help="[default: %(default)s] If --file flag is specified, save "
+           "translation to this file.",
+      metavar="<FO>")
+  FLAGS, unparsed = parser.parse_known_args()
+  main(sys.argv)
--- a/official/transformer/utils/__init__.py
+++ b/official/transformer/utils/__init__.py
--- a/official/transformer/utils/dataset.py
+++ b/official/transformer/utils/dataset.py
--- a/official/transformer/utils/metrics.py
+++ b/official/transformer/utils/metrics.py
--- a/official/transformer/utils/tokenizer.py
+++ b/official/transformer/utils/tokenizer.py
--- a/official/transformer/utils/tokenizer_test.py
+++ b/official/transformer/utils/tokenizer_test.py
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Test Subtokenizer and string helper methods."""
+import collections
+import tempfile
+import unittest
+import tensorflow as tf  # pylint: disable=g-bad-import-order
+from official.transformer.utils import tokenizer
+class SubtokenizerTest(unittest.TestCase):
+  def _init_subtokenizer(self, vocab_list):
+    temp_file = tempfile.NamedTemporaryFile(delete=False)
+    with tf.gfile.Open(temp_file.name, 'w') as w:
+      for subtoken in vocab_list:
+        w.write("'%s'" % subtoken)
+        w.write("\n")
+    return tokenizer.Subtokenizer(temp_file.name, reserved_tokens=[])
+  def test_encode(self):
+    vocab_list = ["123_", "test", "ing_"]
+    subtokenizer = self._init_subtokenizer(vocab_list)
+    s = "testing 123"
+    encoded_list = subtokenizer.encode(s)
+    self.assertEqual([1, 2, 0], encoded_list)
+  def test_decode(self):
+    vocab_list = ["123_", "test", "ing_"]
+    subtokenizer = self._init_subtokenizer(vocab_list)
+    encoded_list = [1, 2, 0]  # testing 123
+    decoded_str = subtokenizer.decode(encoded_list)
+    self.assertEqual("testing 123", decoded_str)
+  def test_subtoken_ids_to_tokens(self):
+    vocab_list = ["123_", "test", "ing_"]
+    subtokenizer = self._init_subtokenizer(vocab_list)
+    encoded_list = [1, 2, 0]  # testing 123
+    token_list = subtokenizer._subtoken_ids_to_tokens(encoded_list)
+    self.assertEqual([u"testing", u"123"], token_list)
+class StringHelperTest(unittest.TestCase):
+  def test_split_string_to_tokens(self):
+    text = "test? testing 123."
+    tokens = tokenizer._split_string_to_tokens(text)
+    self.assertEqual(["test", "? ", "testing", "123", "."], tokens)
+  def test_join_tokens_to_string(self):
+    tokens = ["test", "? ", "testing", "123", "."]
+    s = tokenizer._join_tokens_to_string(tokens)
+    self.assertEqual("test? testing 123.", s)
+  def test_escape_token(self):
+    token = u"abc_\\4"
+    alphabet = set("abc_\\u;")
+    escaped_token = tokenizer._escape_token(token, alphabet)
+    self.assertEqual("abc\\u\\\\\\52;_", escaped_token)
+  def test_unescape_token(self):
+    escaped_token = u"Underline: \\u, Backslash: \\\\, Unicode: \\52;"
+    unescaped_token = tokenizer._unescape_token(escaped_token)
+    self.assertEqual(
+        "Underline: _, Backslash: \\, Unicode: 4", unescaped_token)
+  def test_list_to_index_dict(self):
+    lst = ["test", "strings"]
+    d = tokenizer._list_to_index_dict(lst)
+    self.assertDictEqual({"test": 0, "strings": 1}, d)
+  def test_split_token_to_subtokens(self):
+    token = "abc"
+    subtoken_dict = {"a": 0, "b": 1, "c": 2, "ab": 3}
+    max_subtoken_length = 2
+    subtokens = tokenizer._split_token_to_subtokens(
+        token, subtoken_dict, max_subtoken_length)
+    self.assertEqual(["ab", "c"], subtokens)
+  def test_generate_alphabet_dict(self):
+    s = ["testing", "123"]
+    reserved_tokens = ["???"]
+    alphabet = tokenizer._generate_alphabet_dict(s, reserved_tokens)
+    self.assertIn("?", alphabet)
+    self.assertIn("t", alphabet)
+    self.assertIn("e", alphabet)
+    self.assertIn("s", alphabet)
+    self.assertIn("i", alphabet)
+    self.assertIn("n", alphabet)
+    self.assertIn("g", alphabet)
+    self.assertIn("1", alphabet)
+    self.assertIn("2", alphabet)
+    self.assertIn("3", alphabet)
+  def test_count_and_gen_subtokens(self):
+    token_counts = {"abc": 5}
+    alphabet = set("abc_")
+    subtoken_dict = {"a": 0, "b": 1, "c": 2, "_": 3}
+    max_subtoken_length = 2
+    subtoken_counts = tokenizer._count_and_gen_subtokens(
+        token_counts, alphabet, subtoken_dict, max_subtoken_length)
+    self.assertIsInstance(subtoken_counts, collections.defaultdict)
+    self.assertDictEqual(
+        {"a": 5, "b": 5, "c": 5, "_": 5, "ab": 5, "bc": 5, "c_": 5,
+         "abc": 5, "bc_": 5, "abc_": 5}, subtoken_counts)
+  def test_filter_and_bucket_subtokens(self):
+    subtoken_counts = collections.defaultdict(
+        int, {"a": 2, "b": 4, "c": 1, "ab": 6, "ac": 3, "abbc": 5})
+    min_count = 3
+    subtoken_buckets = tokenizer._filter_and_bucket_subtokens(
+        subtoken_counts, min_count)
+    self.assertEqual(len(subtoken_buckets[0]), 0)
+    self.assertEqual(set("b"), subtoken_buckets[1])
+    self.assertEqual(set(["ab", "ac"]), subtoken_buckets[2])
+    self.assertEqual(len(subtoken_buckets[3]), 0)
+    self.assertEqual(set(["abbc"]), subtoken_buckets[4])
+  def test_gen_new_subtoken_list(self):
+    subtoken_counts = collections.defaultdict(
+        int, {"translate": 10, "t": 40, "tr": 16, "tra": 12})
+    min_count = 5
+    alphabet = set("translate")
+    reserved_tokens = ["reserved", "tokens"]
+    subtoken_list, max_token_length = tokenizer._gen_new_subtoken_list(
+        subtoken_counts, min_count, alphabet, reserved_tokens)
+    # Check that "tra" isn"t in the list (its count should be decremented to 2,
+    # so it should not be added to the canddiate list).
+    self.assertNotIn("tra", subtoken_list)
+    self.assertIn("tr", subtoken_list)
+    self.assertIn("t", subtoken_list)
+    self.assertEqual(len("translate"), max_token_length)
+  def test_generate_subtokens(self):
+    token_counts = {"ab": 1, "bc": 3, "abc": 5}
+    alphabet = set("abc_")
+    min_count = 100
+    num_iterations = 1
+    reserved_tokens = ["reserved", "tokens"]
+    vocab_list = tokenizer._generate_subtokens(
+        token_counts, alphabet, min_count, num_iterations, reserved_tokens)
+    # Check that reserved tokens are at the front of the list
+    self.assertEqual(vocab_list[:2], reserved_tokens)
+    # Check that each character in alphabet is in the vocab list
+    for c in alphabet:
+      self.assertIn(c, vocab_list)
+if __name__ == "__main__":  # Run the tests only when executed as a script.
+  unittest.main()  # Loads and runs the TestCase classes defined in this module.
--- a/official/utils/arg_parsers/parsers.py
+++ b/official/utils/arg_parsers/parsers.py
--- a/official/utils/flags/README.md
+++ b/official/utils/flags/README.md
--- a/official/utils/flags/__init__.py
+++ b/official/utils/flags/__init__.py