Unverified Commit 09d9656f authored by Srihari Humbarwadi's avatar Srihari Humbarwadi Committed by GitHub
Browse files

Merge branch 'panoptic-segmentation' into panoptic-deeplab-modeling

parents ac671306 49a5706c
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Keras-based gated feedforward layer."""
# pylint: disable=g-classes-have-attributes
from typing import Optional
import tensorflow as tf
class BlockDiagFeedforward(tf.keras.layers.Layer):
  """Block diagonal feedforward layer.

  This layer replaces the weight matrix of the output_dense layer with a block
  diagonal matrix to save layer parameters and FLOPs. A linear mixing layer can
  be added optionally to improve layer expressibility.

  Args:
    intermediate_size: Size of the intermediate layer. Must be a multiple of
      `num_blocks`.
    intermediate_activation: Activation for the intermediate layer.
    dropout: Dropout probability for the output dropout.
    num_blocks: The number of blocks for the block diagonal matrix of the
      output_dense layer.
    apply_mixing: Apply linear mixing if True.
    kernel_initializer: Initializer for dense layer kernels.
    bias_initializer: Initializer for dense layer biases.
    kernel_regularizer: Regularizer for dense layer kernels.
    bias_regularizer: Regularizer for dense layer biases.
    activity_regularizer: Regularizer for dense layer activity.
    kernel_constraint: Constraint for dense layer kernels.
    bias_constraint: Constraint for dense layer biases.
  """

  def __init__(
      self,
      intermediate_size: int,
      intermediate_activation: str,
      dropout: float,
      num_blocks: int = 1,
      apply_mixing: bool = True,
      kernel_initializer: str = "glorot_uniform",
      bias_initializer: str = "zeros",
      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
      bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
      activity_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
      kernel_constraint: Optional[tf.keras.constraints.Constraint] = None,
      bias_constraint: Optional[tf.keras.constraints.Constraint] = None,
      **kwargs):  # pylint: disable=g-doc-args
    super(BlockDiagFeedforward, self).__init__(**kwargs)
    self._intermediate_size = intermediate_size
    self._intermediate_activation = intermediate_activation
    self._dropout = dropout
    self._num_blocks = num_blocks
    self._apply_mixing = apply_mixing

    # The block-diagonal factorization requires the intermediate dimension to
    # split evenly across blocks.
    if intermediate_size % num_blocks != 0:
      raise ValueError("Intermediate_size (%d) isn't a multiple of num_blocks "
                       "(%d)." % (intermediate_size, num_blocks))

    self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
    self._bias_initializer = tf.keras.initializers.get(bias_initializer)
    self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
    self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer)
    self._activity_regularizer = tf.keras.regularizers.get(activity_regularizer)
    self._kernel_constraint = tf.keras.constraints.get(kernel_constraint)
    self._bias_constraint = tf.keras.constraints.get(bias_constraint)

  def build(self, input_shape):
    # NOTE(review): `hidden_size` (the input's last dim) is assumed to be a
    # multiple of `num_blocks`; this is not validated here — TODO confirm.
    hidden_size = input_shape.as_list()[-1]

    common_kwargs = dict(
        kernel_initializer=self._kernel_initializer,
        bias_initializer=self._bias_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer,
        activity_regularizer=self._activity_regularizer,
        kernel_constraint=self._kernel_constraint,
        bias_constraint=self._bias_constraint)

    # Projects (batch, seq, hidden) -> (batch, seq, blocks, interm/blocks),
    # i.e. each block gets its own slice of the intermediate dimension.
    self._intermediate_dense = tf.keras.layers.experimental.EinsumDense(
        "abc,cde->abde",
        output_shape=(None, self._num_blocks,
                      self._intermediate_size // self._num_blocks),
        bias_axes="de",
        name="intermediate",
        **common_kwargs)

    policy = tf.keras.mixed_precision.global_policy()
    if policy.name == "mixed_bfloat16":
      # bfloat16 causes BERT with the LAMB optimizer to not converge
      # as well, so we use float32.
      policy = tf.float32
    self._intermediate_activation_layer = tf.keras.layers.Activation(
        self._intermediate_activation, dtype=policy)

    # Per-block output projection: only the `de,deo` contraction is done per
    # block, which is what makes the overall weight matrix block diagonal.
    self._output_dense = tf.keras.layers.experimental.EinsumDense(
        "abde,deo->abdo",
        output_shape=(None, self._num_blocks,
                      hidden_size // self._num_blocks),
        bias_axes="do",
        name="output",
        **common_kwargs)

    if self._apply_mixing:
      # Optional linear mixing across blocks (no bias) to recover some of the
      # expressibility lost to the block-diagonal constraint.
      self._output_mixing = tf.keras.layers.experimental.EinsumDense(
          "abdo,de->abeo",
          output_shape=(None, self._num_blocks,
                        hidden_size // self._num_blocks),
          name="output_mixing",
          **common_kwargs)
    # Collapses (blocks, hidden/blocks) back into a flat hidden dimension.
    self._output_reshape = tf.keras.layers.Reshape((-1, hidden_size))
    self._output_dropout = tf.keras.layers.Dropout(rate=self._dropout)

  def get_config(self):
    """Returns the config of the layer for serialization."""
    config = {
        "intermediate_size":
            self._intermediate_size,
        "intermediate_activation":
            self._intermediate_activation,
        "dropout":
            self._dropout,
        "num_blocks":
            self._num_blocks,
        "apply_mixing":
            self._apply_mixing,
        "kernel_initializer":
            tf.keras.initializers.serialize(self._kernel_initializer),
        "bias_initializer":
            tf.keras.initializers.serialize(self._bias_initializer),
        "kernel_regularizer":
            tf.keras.regularizers.serialize(self._kernel_regularizer),
        "bias_regularizer":
            tf.keras.regularizers.serialize(self._bias_regularizer),
        "activity_regularizer":
            tf.keras.regularizers.serialize(self._activity_regularizer),
        "kernel_constraint":
            tf.keras.constraints.serialize(self._kernel_constraint),
        "bias_constraint":
            tf.keras.constraints.serialize(self._bias_constraint)
    }
    base_config = super(BlockDiagFeedforward, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def call(self, inputs):
    """Applies intermediate dense, block-diagonal output dense, and dropout."""
    intermediate_output = self._intermediate_dense(inputs)
    intermediate_output = self._intermediate_activation_layer(
        intermediate_output)
    layer_output = self._output_dense(intermediate_output)
    if self._apply_mixing:
      layer_output = self._output_mixing(layer_output)
    layer_output = self._output_reshape(layer_output)
    layer_output = self._output_dropout(layer_output)
    return layer_output
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for Keras-based gated feedforward layer."""
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import
from official.nlp.modeling.layers import block_diag_feedforward
# This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
# guarantees forward compatibility of this code for the V2 switchover.
@keras_parameterized.run_all_keras_modes
class BlockDiagFeedforwardTest(keras_parameterized.TestCase):
  """Exercises BlockDiagFeedforward across block counts and dtype policies."""

  def tearDown(self):
    # Restore the default precision policy so state does not leak into other
    # tests.
    super().tearDown()
    tf.keras.mixed_precision.set_global_policy("float32")

  @parameterized.parameters(*(
      (blocks, mixing, policy)
      for blocks in (1, 2)
      for mixing in (True, False)
      for policy in ("float32", "mixed_float16")))
  def test_layer_creation(self, num_blocks, apply_mixing, dtype):
    tf.keras.mixed_precision.set_global_policy(dtype)
    layer = block_diag_feedforward.BlockDiagFeedforward(
        intermediate_size=128,
        intermediate_activation="relu",
        dropout=0.1,
        num_blocks=num_blocks,
        apply_mixing=apply_mixing,
        kernel_initializer="glorot_uniform",
        bias_initializer="zeros")

    seq_len = 64
    width = 128
    # Build a 3D symbolic input; the batch dimension is implicit.
    inputs = tf.keras.Input(shape=(seq_len, width))
    outputs = layer(inputs)
    # A feedforward block must preserve the input shape.
    self.assertEqual(inputs.shape.as_list(), outputs.shape.as_list())

  @parameterized.parameters(*(
      (blocks, mixing, policy)
      for blocks in (1, 2)
      for mixing in (True, False)
      for policy in ("float32", "mixed_float16")))
  def test_layer_invocation(self, num_blocks, apply_mixing, dtype):
    tf.keras.mixed_precision.set_global_policy(dtype)
    layer = block_diag_feedforward.BlockDiagFeedforward(
        intermediate_size=16,
        intermediate_activation="relu",
        dropout=0.1,
        num_blocks=num_blocks,
        apply_mixing=apply_mixing,
        kernel_initializer="glorot_uniform",
        bias_initializer="zeros")

    seq_len = 16
    width = 32
    # Build a 3D symbolic input; the batch dimension is implicit.
    inputs = tf.keras.Input(shape=(seq_len, width))
    outputs = layer(inputs)

    # Wrap the layer in a model and run real data through it.
    model = tf.keras.Model(inputs, outputs)
    batch_size = 6
    feed = 10 * np.random.random_sample((batch_size, seq_len, width))
    predictions = model.predict(feed)
    self.assertEqual(predictions.shape, (batch_size, seq_len, width))

  def test_get_config(self):
    # A layer rebuilt from its own config must produce the same config.
    layer = block_diag_feedforward.BlockDiagFeedforward(
        intermediate_size=16,
        intermediate_activation="relu",
        dropout=0.1,
        num_blocks=2,
        apply_mixing=True,
        kernel_initializer="glorot_uniform",
        bias_initializer="zeros")
    rebuilt = block_diag_feedforward.BlockDiagFeedforward.from_config(
        layer.get_config())
    self.assertAllEqual(layer.get_config(), rebuilt.get_config())
# Run the test suite when this file is executed as a script.
if __name__ == "__main__":
  tf.test.main()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Keras-based einsum layer."""
# pylint: disable=g-classes-have-attributes
import tensorflow as tf
from tensorflow.python.util import deprecation
# Dimension labels available for generated einsum equations; bounds the total
# number of free + bound + output dimensions to 13.
_CHR_IDX = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m"]


@tf.keras.utils.register_keras_serializable(package="Text")
class DenseEinsum(tf.keras.layers.Layer):
  """A densely connected layer that uses `tf.einsum` as the backing computation.

  This layer can perform einsum calculations of arbitrary dimensionality.

  Args:
    output_shape: Positive integer or tuple, dimensionality of the output space.
    num_summed_dimensions: The number of dimensions to sum over. Standard 2D
      matmul should use 1, 3D matmul should use 2, and so forth.
    activation: Activation function to use. If you don't specify anything, no
      activation is applied
      (ie. "linear" activation: `a(x) = x`).
    use_bias: Boolean, whether the layer uses a bias vector.
    kernel_initializer: Initializer for the `kernel` weights matrix.
    bias_initializer: Initializer for the bias vector.
    kernel_regularizer: Regularizer function applied to the `kernel` weights
      matrix.
    bias_regularizer: Regularizer function applied to the bias vector.
    activity_regularizer: Regularizer function applied to the output of the
      layer (its "activation").
    kernel_constraint: Constraint function applied to the `kernel` weights
      matrix.
    bias_constraint: Constraint function applied to the bias vector.

  Input shape:
    N-D tensor with shape: `(batch_size, ..., input_dim)`. The most common
    situation would be a 2D input with shape `(batch_size, input_dim)`.

  Output shape:
    N-D tensor with shape: `(batch_size, ..., units)`. For instance, for a 2D
    input with shape `(batch_size, input_dim)`, the output would have shape
    `(batch_size, units)`.
  """

  @deprecation.deprecated(None, "DenseEinsum is deprecated. Please use "
                          "tf.keras.experimental.EinsumDense layer instead.")
  def __init__(self,
               output_shape,
               num_summed_dimensions=1,
               activation=None,
               use_bias=True,
               kernel_initializer="glorot_uniform",
               bias_initializer="zeros",
               kernel_regularizer=None,
               bias_regularizer=None,
               activity_regularizer=None,
               kernel_constraint=None,
               bias_constraint=None,
               **kwargs):
    # Bug fix: forward `activity_regularizer` to the base Layer. Previously
    # the argument was accepted but neither stored nor forwarded, so it was
    # silently ignored and `get_config` always serialized the base-class
    # default (None).
    super(DenseEinsum, self).__init__(
        activity_regularizer=tf.keras.regularizers.get(activity_regularizer),
        **kwargs)
    # Normalize scalar output shapes to a 1-tuple so the rest of the layer can
    # treat `output_shape` uniformly as a sequence.
    self._output_shape = output_shape if isinstance(
        output_shape, (list, tuple)) else (output_shape,)
    self._activation = tf.keras.activations.get(activation)
    self._use_bias = use_bias
    self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
    self._bias_initializer = tf.keras.initializers.get(bias_initializer)
    self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
    self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer)
    self._kernel_constraint = tf.keras.constraints.get(kernel_constraint)
    self._bias_constraint = tf.keras.constraints.get(bias_constraint)
    self._num_summed_dimensions = num_summed_dimensions
    # Computed in build() once the input rank is known.
    self._einsum_string = None

  def _build_einsum_string(self, free_input_dims, bound_dims, output_dims):
    """Builds an einsum equation like "abc,cde->abde".

    Free input dims appear in both the input and output terms, bound dims are
    shared between the input and kernel (and summed over), and output dims are
    shared between the kernel and output.
    """
    input_str = ""
    kernel_str = ""
    output_str = ""
    letter_offset = 0
    for i in range(free_input_dims):
      char = _CHR_IDX[i + letter_offset]
      input_str += char
      output_str += char

    letter_offset += free_input_dims
    for i in range(bound_dims):
      char = _CHR_IDX[i + letter_offset]
      input_str += char
      kernel_str += char

    letter_offset += bound_dims
    for i in range(output_dims):
      char = _CHR_IDX[i + letter_offset]
      kernel_str += char
      output_str += char

    return input_str + "," + kernel_str + "->" + output_str

  def build(self, input_shape):
    """Creates the kernel (and optional bias) from the concrete input shape."""
    input_shape = tf.TensorShape(input_shape)
    input_rank = input_shape.rank
    free_input_dims = input_rank - self._num_summed_dimensions
    output_dims = len(self._output_shape)

    self._einsum_string = self._build_einsum_string(free_input_dims,
                                                    self._num_summed_dimensions,
                                                    output_dims)

    # This is only saved for testing purposes.
    self._kernel_shape = (
        input_shape[free_input_dims:].concatenate(self._output_shape))

    self._kernel = self.add_weight(
        "kernel",
        shape=self._kernel_shape,
        initializer=self._kernel_initializer,
        regularizer=self._kernel_regularizer,
        constraint=self._kernel_constraint,
        dtype=self.dtype,
        trainable=True)
    if self._use_bias:
      self._bias = self.add_weight(
          "bias",
          shape=self._output_shape,
          initializer=self._bias_initializer,
          regularizer=self._bias_regularizer,
          constraint=self._bias_constraint,
          dtype=self.dtype,
          trainable=True)
    else:
      self._bias = None
    super(DenseEinsum, self).build(input_shape)

  def get_config(self):
    """Returns the config of the layer for serialization."""
    config = {
        "output_shape":
            self._output_shape,
        "num_summed_dimensions":
            self._num_summed_dimensions,
        "activation":
            tf.keras.activations.serialize(self._activation),
        "use_bias":
            self._use_bias,
        "kernel_initializer":
            tf.keras.initializers.serialize(self._kernel_initializer),
        "bias_initializer":
            tf.keras.initializers.serialize(self._bias_initializer),
        "kernel_regularizer":
            tf.keras.regularizers.serialize(self._kernel_regularizer),
        "bias_regularizer":
            tf.keras.regularizers.serialize(self._bias_regularizer),
        # `_activity_regularizer` is maintained by the base Layer; it now
        # reflects the constructor argument (see __init__).
        "activity_regularizer":
            tf.keras.regularizers.serialize(self._activity_regularizer),
        "kernel_constraint":
            tf.keras.constraints.serialize(self._kernel_constraint),
        "bias_constraint":
            tf.keras.constraints.serialize(self._bias_constraint)
    }
    base_config = super(DenseEinsum, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def call(self, inputs):
    """Applies the einsum contraction, then optional bias and activation."""
    ret = tf.einsum(self._einsum_string, inputs, self._kernel)
    if self._use_bias:
      ret += self._bias
    if self._activation is not None:
      ret = self._activation(ret)
    return ret
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for Keras-based einsum layer."""
import numpy as np
import tensorflow as tf
from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import
from official.nlp.modeling.layers import dense_einsum
# This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
# guarantees forward compatibility of this code for the V2 switchover.
@keras_parameterized.run_all_keras_modes
class DenseEinsumLayer(keras_parameterized.TestCase):
  """Tests for the deprecated DenseEinsum layer."""

  def test_3D_einsum_with_two_bound_dimensions(self):
    """Summing over two trailing dims produces an "abcd,cde->abe" equation."""
    test_layer = dense_einsum.DenseEinsum(
        output_shape=(64,), num_summed_dimensions=2)
    # Create a 4-dimensional input (the first dimension is implicit).
    input_tensor = tf.keras.Input(shape=(None, 40, 80))
    _ = test_layer(input_tensor)
    self.assertEqual(test_layer._einsum_string, "abcd,cde->abe")
    self.assertEqual(test_layer._kernel_shape, (40, 80, 64))

  def test_3D_einsum_with_one_bound_dimensions(self):
    """A multi-dim output shape expands the kernel/output einsum terms."""
    test_layer = dense_einsum.DenseEinsum(
        output_shape=(64, 32), num_summed_dimensions=1)
    # Create a 3-dimensional input (the first dimension is implicit).
    input_tensor = tf.keras.Input(shape=(None, 80))
    _ = test_layer(input_tensor)
    self.assertEqual(test_layer._einsum_string, "abc,cde->abde")
    self.assertEqual(test_layer._kernel_shape, (80, 64, 32))

  def test_2D_einsum_with_one_bound_dimensions(self):
    """Plain matmul case: one bound dim, one output dim."""
    test_layer = dense_einsum.DenseEinsum(
        output_shape=(64,), num_summed_dimensions=1)
    # Create a 3-dimensional input (the first dimension is implicit).
    input_tensor = tf.keras.Input(shape=(None, 80))
    _ = test_layer(input_tensor)
    self.assertEqual(test_layer._einsum_string, "abc,cd->abd")
    self.assertEqual(test_layer._kernel_shape, (80, 64))

  def test_bias_term_can_be_disabled(self):
    """`use_bias=False` removes the bias variable from the layer."""
    # A layer created using the bias should have two weights.
    test_layer = dense_einsum.DenseEinsum(
        output_shape=64, num_summed_dimensions=1, use_bias=True)
    input_tensor = tf.keras.Input(shape=(None, 80))
    _ = test_layer(input_tensor)
    self.assertEqual(2, len(test_layer.get_weights()))

    # A layer created without the bias should have only one weight.
    test_layer = dense_einsum.DenseEinsum(
        output_shape=64, num_summed_dimensions=1, use_bias=False)
    input_tensor = tf.keras.Input(shape=(None, 80))
    _ = test_layer(input_tensor)
    self.assertEqual(1, len(test_layer.get_weights()))

  def test_activation(self):
    """The activation argument must change the layer's output."""
    # Create a model that does not use an activation.
    no_activation_layer = dense_einsum.DenseEinsum(
        output_shape=64, num_summed_dimensions=1, activation=None)
    input_tensor = tf.keras.Input(shape=(None, 80))
    output_tensor = no_activation_layer(input_tensor)
    no_activation_model = tf.keras.Model(input_tensor, output_tensor)

    # Create a model that uses a softmax activation.
    activation_layer = dense_einsum.DenseEinsum(
        output_shape=64, num_summed_dimensions=1, activation="softmax")
    input_tensor = tf.keras.Input(shape=(None, 80))
    output_tensor = activation_layer(input_tensor)
    activation_model = tf.keras.Model(input_tensor, output_tensor)

    # Make sure the models' weights are identical.
    activation_model.set_weights(no_activation_model.get_weights())

    # Predict using each model on the same input data. The output should be
    # different, since one is using a softmax - even though the models' weights
    # are the same.
    input_values = 10 * np.random.random_sample((10, 4, 80))
    non_activated_data = no_activation_model.predict(input_values)
    activated_data = activation_model.predict(input_values)
    self.assertNotAllClose(activated_data, non_activated_data)

  def test_non_iterable_output_shape(self):
    """A bare int output_shape is normalized to a 1-tuple internally."""
    test_layer = dense_einsum.DenseEinsum(
        output_shape=64, num_summed_dimensions=1)
    # Create a 3-dimensional input (the first dimension is implicit).
    input_tensor = tf.keras.Input(shape=(None, 80))
    _ = test_layer(input_tensor)
    self.assertEqual(test_layer._einsum_string, "abc,cd->abd")
    self.assertEqual(test_layer._kernel_shape, (80, 64))

  def test_with_explicit_initializer(self):
    """An explicit initializer object is accepted in place of a string name."""
    test_layer = dense_einsum.DenseEinsum(
        output_shape=(64,),
        num_summed_dimensions=2,
        kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02))
    # Create a 4-dimensional input (the first dimension is implicit).
    input_tensor = tf.keras.Input(shape=(None, 40, 80))
    _ = test_layer(input_tensor)
    self.assertEqual(test_layer._einsum_string, "abcd,cde->abe")
    self.assertEqual(test_layer._kernel_shape, (40, 80, 64))
# Run the test suite when this file is executed as a script.
if __name__ == "__main__":
  tf.test.main()
...@@ -68,7 +68,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase): ...@@ -68,7 +68,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase):
# Invoke the model on test data. We can't validate the output data itself # Invoke the model on test data. We can't validate the output data itself
# (the NN is too complex) but this will rule out structural runtime errors. # (the NN is too complex) but this will rule out structural runtime errors.
batch_size = 6 batch_size = 6
input_data = 10 * np.random.random_sample( input_data = np.random.random_sample(
(batch_size, sequence_length, width)) (batch_size, sequence_length, width))
_ = model.predict(input_data) _ = model.predict(input_data)
...@@ -89,7 +89,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase): ...@@ -89,7 +89,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase):
# Invoke the model on test data. We can't validate the output data itself # Invoke the model on test data. We can't validate the output data itself
# (the NN is too complex) but this will rule out structural runtime errors. # (the NN is too complex) but this will rule out structural runtime errors.
batch_size = 6 batch_size = 6
input_data = 10 * np.random.random_sample( input_data = np.random.random_sample(
(batch_size, sequence_length, width)) (batch_size, sequence_length, width))
# The attention mask should be of shape (batch, from_seq_len, to_seq_len), # The attention mask should be of shape (batch, from_seq_len, to_seq_len),
# which here is (batch, sequence_length, sequence_length) # which here is (batch, sequence_length, sequence_length)
...@@ -104,7 +104,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase): ...@@ -104,7 +104,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase):
width = 80 width = 80
batch_size = 6 batch_size = 6
input_data = 10 * np.random.random_sample( input_data = np.random.random_sample(
(batch_size, sequence_length, width)) (batch_size, sequence_length, width))
mask_data = np.random.randint( mask_data = np.random.randint(
2, size=(batch_size, sequence_length, sequence_length)) 2, size=(batch_size, sequence_length, sequence_length))
...@@ -121,7 +121,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase): ...@@ -121,7 +121,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase):
new_layer.set_weights(test_layer.get_weights()) new_layer.set_weights(test_layer.get_weights())
new_output_tensor, _ = new_layer([input_data, mask_data]) new_output_tensor, _ = new_layer([input_data, mask_data])
self.assertAllClose( self.assertAllClose(
new_output_tensor, output_tensor[:, 0:1, :], atol=5e-5, rtol=0.003) new_output_tensor, output_tensor[:, 0:1, :], atol=0.002, rtol=0.01)
def test_layer_output_range_with_relative_pe(self, transformer_cls): def test_layer_output_range_with_relative_pe(self, transformer_cls):
test_layer = transformer_cls( test_layer = transformer_cls(
...@@ -131,7 +131,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase): ...@@ -131,7 +131,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase):
width = 80 width = 80
batch_size = 6 batch_size = 6
input_data = 10 * np.random.random_sample( input_data = np.random.random_sample(
(batch_size, sequence_length, width)) (batch_size, sequence_length, width))
mask_data = np.random.randint( mask_data = np.random.randint(
2, size=(batch_size, sequence_length, sequence_length)) 2, size=(batch_size, sequence_length, sequence_length))
...@@ -149,7 +149,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase): ...@@ -149,7 +149,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase):
new_layer.set_weights(test_layer.get_weights()) new_layer.set_weights(test_layer.get_weights())
new_output_tensor, _ = new_layer([input_data, mask_data]) new_output_tensor, _ = new_layer([input_data, mask_data])
self.assertAllClose( self.assertAllClose(
new_output_tensor, output_tensor[:, 0:1, :], atol=5e-5, rtol=0.003) new_output_tensor, output_tensor[:, 0:1, :], atol=0.002, rtol=0.01)
def test_layer_output_range_without_mask(self, transformer_cls): def test_layer_output_range_without_mask(self, transformer_cls):
test_layer = transformer_cls( test_layer = transformer_cls(
...@@ -159,7 +159,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase): ...@@ -159,7 +159,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase):
width = 80 width = 80
batch_size = 6 batch_size = 6
input_data = 10 * np.random.random_sample( input_data = np.random.random_sample(
(batch_size, sequence_length, width)) (batch_size, sequence_length, width))
output_tensor, _ = test_layer(input_data) output_tensor, _ = test_layer(input_data)
...@@ -175,7 +175,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase): ...@@ -175,7 +175,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase):
new_layer.set_weights(test_layer.get_weights()) new_layer.set_weights(test_layer.get_weights())
new_output_tensor, _ = new_layer(input_data) new_output_tensor, _ = new_layer(input_data)
self.assertAllClose( self.assertAllClose(
new_output_tensor, output_tensor[:, 0:1, :], atol=5e-5, rtol=0.003) new_output_tensor, output_tensor[:, 0:1, :], atol=0.002, rtol=0.01)
def test_layer_output_range_with_pre_norm(self, transformer_cls): def test_layer_output_range_with_pre_norm(self, transformer_cls):
test_layer = transformer_cls( test_layer = transformer_cls(
...@@ -185,7 +185,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase): ...@@ -185,7 +185,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase):
width = 80 width = 80
batch_size = 6 batch_size = 6
input_data = 10 * np.random.random_sample( input_data = np.random.random_sample(
(batch_size, sequence_length, width)) (batch_size, sequence_length, width))
mask_data = np.random.randint( mask_data = np.random.randint(
2, size=(batch_size, sequence_length, sequence_length)) 2, size=(batch_size, sequence_length, sequence_length))
...@@ -203,7 +203,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase): ...@@ -203,7 +203,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase):
new_layer.set_weights(test_layer.get_weights()) new_layer.set_weights(test_layer.get_weights())
new_output_tensor, _ = new_layer([input_data, mask_data]) new_output_tensor, _ = new_layer([input_data, mask_data])
self.assertAllClose( self.assertAllClose(
new_output_tensor, output_tensor[:, 0:1, :], atol=5e-5, rtol=0.003) new_output_tensor, output_tensor[:, 0:1, :], atol=0.002, rtol=0.01)
def test_layer_invocation_with_float16_dtype(self, transformer_cls): def test_layer_invocation_with_float16_dtype(self, transformer_cls):
tf.keras.mixed_precision.set_global_policy('mixed_float16') tf.keras.mixed_precision.set_global_policy('mixed_float16')
...@@ -223,7 +223,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase): ...@@ -223,7 +223,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase):
# Invoke the model on test data. We can't validate the output data itself # Invoke the model on test data. We can't validate the output data itself
# (the NN is too complex) but this will rule out structural runtime errors. # (the NN is too complex) but this will rule out structural runtime errors.
batch_size = 6 batch_size = 6
input_data = (10 * np.random.random_sample( input_data = (np.random.random_sample(
(batch_size, sequence_length, width))) (batch_size, sequence_length, width)))
# The attention mask should be of shape (batch, from_seq_len, to_seq_len), # The attention mask should be of shape (batch, from_seq_len, to_seq_len),
# which here is (batch, sequence_length, sequence_length) # which here is (batch, sequence_length, sequence_length)
...@@ -368,7 +368,7 @@ class ReuseTransformerArgumentTest(tf.test.TestCase, parameterized.TestCase): ...@@ -368,7 +368,7 @@ class ReuseTransformerArgumentTest(tf.test.TestCase, parameterized.TestCase):
# Invoke the model on test data. We can't validate the output data itself # Invoke the model on test data. We can't validate the output data itself
# (the NN is too complex) but this will rule out structural runtime errors. # (the NN is too complex) but this will rule out structural runtime errors.
batch_size = 6 batch_size = 6
input_data = 10 * np.random.random_sample( input_data = np.random.random_sample(
(batch_size, sequence_length, width)) (batch_size, sequence_length, width))
# The attention mask should be of shape (batch, from_seq_len, to_seq_len), # The attention mask should be of shape (batch, from_seq_len, to_seq_len),
# which here is (batch, sequence_length, sequence_length) # which here is (batch, sequence_length, sequence_length)
...@@ -404,7 +404,7 @@ class ReuseTransformerArgumentTest(tf.test.TestCase, parameterized.TestCase): ...@@ -404,7 +404,7 @@ class ReuseTransformerArgumentTest(tf.test.TestCase, parameterized.TestCase):
# Invoke the model on test data. We can't validate the output data itself # Invoke the model on test data. We can't validate the output data itself
# (the NN is too complex) but this will rule out structural runtime errors. # (the NN is too complex) but this will rule out structural runtime errors.
batch_size = 6 batch_size = 6
input_data = (10 * np.random.random_sample( input_data = (np.random.random_sample(
(batch_size, sequence_length, width))) (batch_size, sequence_length, width)))
# The attention mask should be of shape (batch, from_seq_len, to_seq_len), # The attention mask should be of shape (batch, from_seq_len, to_seq_len),
# which here is (batch, sequence_length, sequence_length) # which here is (batch, sequence_length, sequence_length)
......
...@@ -18,6 +18,8 @@ ...@@ -18,6 +18,8 @@
import gin import gin
import tensorflow as tf import tensorflow as tf
from official.nlp.modeling.layers import util
@tf.keras.utils.register_keras_serializable(package="Text") @tf.keras.utils.register_keras_serializable(package="Text")
@gin.configurable @gin.configurable
...@@ -45,6 +47,7 @@ class ReZeroTransformer(tf.keras.layers.Layer): ...@@ -45,6 +47,7 @@ class ReZeroTransformer(tf.keras.layers.Layer):
kernel_constraint: Constraint for dense layer kernels. kernel_constraint: Constraint for dense layer kernels.
bias_constraint: Constraint for dense layer kernels. bias_constraint: Constraint for dense layer kernels.
use_layer_norm: If add layer_norm on top of the ReZero. use_layer_norm: If add layer_norm on top of the ReZero.
share_rezero: If attention layer and FFN layer share the same alpha.
""" """
def __init__(self, def __init__(self,
...@@ -62,7 +65,14 @@ class ReZeroTransformer(tf.keras.layers.Layer): ...@@ -62,7 +65,14 @@ class ReZeroTransformer(tf.keras.layers.Layer):
kernel_constraint=None, kernel_constraint=None,
bias_constraint=None, bias_constraint=None,
use_layer_norm=False, use_layer_norm=False,
share_rezero=True,
**kwargs): **kwargs):
# attention_dropout will override attention_dropout_rate.
# This is to unify the input params with TransformerEncoderBlock.
attention_dropout_rate = kwargs.pop("attention_dropout",
attention_dropout_rate)
dropout_rate = kwargs.pop("output_dropout", dropout_rate)
util.filter_kwargs(kwargs)
super(ReZeroTransformer, self).__init__(**kwargs) super(ReZeroTransformer, self).__init__(**kwargs)
self._num_heads = num_attention_heads self._num_heads = num_attention_heads
...@@ -78,10 +88,18 @@ class ReZeroTransformer(tf.keras.layers.Layer): ...@@ -78,10 +88,18 @@ class ReZeroTransformer(tf.keras.layers.Layer):
self._kernel_constraint = tf.keras.constraints.get(kernel_constraint) self._kernel_constraint = tf.keras.constraints.get(kernel_constraint)
self._bias_constraint = tf.keras.constraints.get(bias_constraint) self._bias_constraint = tf.keras.constraints.get(bias_constraint)
self._use_layer_norm = use_layer_norm self._use_layer_norm = use_layer_norm
self._share_rezero = share_rezero
def build(self, input_shape): def build(self, input_shape):
input_tensor = input_shape[0] if len(input_shape) == 2 else input_shape if isinstance(input_shape, tf.TensorShape):
input_tensor_shape = tf.TensorShape(input_tensor) input_tensor_shape = input_shape
elif isinstance(input_shape, (list, tuple)):
input_tensor_shape = tf.TensorShape(input_shape[0])
else:
raise ValueError(
"The type of input shape argument is not supported, got: %s" %
type(input_shape))
if len(input_tensor_shape.as_list()) != 3: if len(input_tensor_shape.as_list()) != 3:
raise ValueError("TransformerLayer expects a three-dimensional input of " raise ValueError("TransformerLayer expects a three-dimensional input of "
"shape [batch, sequence, width].") "shape [batch, sequence, width].")
...@@ -158,6 +176,15 @@ class ReZeroTransformer(tf.keras.layers.Layer): ...@@ -158,6 +176,15 @@ class ReZeroTransformer(tf.keras.layers.Layer):
trainable=True, trainable=True,
dtype=tf.float32) dtype=tf.float32)
if self._share_rezero:
self._rezero_a_ffn = self._rezero_a
else:
self._rezero_a_ffn = self.add_weight(
name="rezero_alpha_ffn",
initializer=tf.keras.initializers.Zeros(),
trainable=True,
dtype=tf.float32)
super(ReZeroTransformer, self).build(input_shape) super(ReZeroTransformer, self).build(input_shape)
def get_config(self): def get_config(self):
...@@ -176,6 +203,8 @@ class ReZeroTransformer(tf.keras.layers.Layer): ...@@ -176,6 +203,8 @@ class ReZeroTransformer(tf.keras.layers.Layer):
self._output_range, self._output_range,
"use_layer_norm": "use_layer_norm":
self._use_layer_norm, self._use_layer_norm,
"share_rezero":
self._share_rezero,
"kernel_initializer": "kernel_initializer":
tf.keras.initializers.serialize(self._kernel_initializer), tf.keras.initializers.serialize(self._kernel_initializer),
"bias_initializer": "bias_initializer":
...@@ -196,21 +225,34 @@ class ReZeroTransformer(tf.keras.layers.Layer): ...@@ -196,21 +225,34 @@ class ReZeroTransformer(tf.keras.layers.Layer):
def reset_rezero(self): def reset_rezero(self):
self._rezero_a.assign(0.) self._rezero_a.assign(0.)
if not self._share_rezero:
self._rezero_a_ffn.assign(0.)
def call(self, inputs): def call(self, inputs):
if isinstance(inputs, (list, tuple)) and len(inputs) == 2: if isinstance(inputs, (list, tuple)):
input_tensor, attention_mask = inputs if len(inputs) == 2:
input_tensor, attention_mask = inputs
key_value = None
elif len(inputs) == 3:
input_tensor, key_value, attention_mask = inputs
else:
raise ValueError("Unexpected inputs to %s with length at %d" %
(self.__class__, len(inputs)))
else: else:
input_tensor, attention_mask = (inputs, None) input_tensor, key_value, attention_mask = (inputs, None, None)
if self._output_range: if self._output_range:
target_tensor = input_tensor[:, 0:self._output_range, :] target_tensor = input_tensor[:, 0:self._output_range, :]
attention_mask = attention_mask[:, 0:self._output_range, :] if attention_mask is not None:
attention_mask = attention_mask[:, 0:self._output_range, :]
else: else:
target_tensor = input_tensor target_tensor = input_tensor
if key_value is None:
key_value = input_tensor
attention_output = self._attention_layer( attention_output = self._attention_layer(
query=target_tensor, value=input_tensor, attention_mask=attention_mask) query=target_tensor, value=key_value, attention_mask=attention_mask)
attention_output = self._attention_dropout(attention_output) attention_output = self._attention_dropout(attention_output)
attention_output = target_tensor + self._rezero_a * attention_output attention_output = target_tensor + self._rezero_a * attention_output
if self._use_layer_norm: if self._use_layer_norm:
...@@ -225,7 +267,7 @@ class ReZeroTransformer(tf.keras.layers.Layer): ...@@ -225,7 +267,7 @@ class ReZeroTransformer(tf.keras.layers.Layer):
layer_output = self._output_dropout(layer_output) layer_output = self._output_dropout(layer_output)
# During mixed precision training, attention_output is from layer norm and # During mixed precision training, attention_output is from layer norm and
# is always fp32 for now. Cast layer_output to fp32 for the subsequent add. # is always fp32 for now. Cast layer_output to fp32 for the subsequent add.
layer_output = attention_output + tf.cast(self._rezero_a * layer_output, layer_output = attention_output + tf.cast(self._rezero_a_ffn * layer_output,
tf.float32) tf.float32)
if self._use_layer_norm: if self._use_layer_norm:
layer_output = self._output_layer_norm(layer_output) layer_output = self._output_layer_norm(layer_output)
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
"""Tests for Keras-based rezero-transformer block layer.""" """Tests for Keras-based rezero-transformer block layer."""
from absl.testing import parameterized
import numpy as np import numpy as np
import tensorflow as tf import tensorflow as tf
...@@ -30,12 +31,15 @@ class TransformerWithReZeroLayerTest(keras_parameterized.TestCase): ...@@ -30,12 +31,15 @@ class TransformerWithReZeroLayerTest(keras_parameterized.TestCase):
super(TransformerWithReZeroLayerTest, self).tearDown() super(TransformerWithReZeroLayerTest, self).tearDown()
tf.keras.mixed_precision.set_global_policy('float32') tf.keras.mixed_precision.set_global_policy('float32')
def test_layer_invocation_with_float16_dtype(self): @parameterized.named_parameters(('no_share_attn_ffn', False),
('share_attn_ffn', True))
def test_layer_invocation_with_float16_dtype(self, share_rezero):
tf.keras.mixed_precision.set_global_policy('mixed_float16') tf.keras.mixed_precision.set_global_policy('mixed_float16')
test_layer = rezero_transformer.ReZeroTransformer( test_layer = rezero_transformer.ReZeroTransformer(
num_attention_heads=10, num_attention_heads=10,
intermediate_size=2048, intermediate_size=2048,
intermediate_activation='relu') intermediate_activation='relu',
share_rezero=share_rezero)
sequence_length = 21 sequence_length = 21
width = 80 width = 80
# Create a 3-dimensional input (the first dimension is implicit). # Create a 3-dimensional input (the first dimension is implicit).
...@@ -124,6 +128,20 @@ class TransformerWithReZeroLayerTest(keras_parameterized.TestCase): ...@@ -124,6 +128,20 @@ class TransformerWithReZeroLayerTest(keras_parameterized.TestCase):
new_output_tensor = new_layer([input_data, mask_data]) new_output_tensor = new_layer([input_data, mask_data])
self.assertAllClose(new_output_tensor, output_tensor[:, 0:1, :]) self.assertAllClose(new_output_tensor, output_tensor[:, 0:1, :])
  def test_separate_qkv(self):
    """Checks that a 3-tuple input [query, key/value, mask] is accepted."""
    test_layer = rezero_transformer.ReZeroTransformer(
        num_attention_heads=2,
        intermediate_size=128,
        intermediate_activation='relu',
        kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02))
    # Forward path. Query and key/value deliberately have different sequence
    # lengths (4 vs. 8); the output must follow the query's shape.
    q_tensor = tf.zeros([2, 4, 16], dtype=tf.float32)
    kv_tensor = tf.zeros([2, 8, 16], dtype=tf.float32)
    dummy_mask = tf.zeros([2, 4, 8], dtype=tf.float32)
    inputs = [q_tensor, kv_tensor, dummy_mask]
    output = test_layer(inputs)
    self.assertEqual(output.shape, q_tensor.shape)
if __name__ == '__main__': if __name__ == '__main__':
tf.test.main() tf.test.main()
...@@ -13,18 +13,22 @@ ...@@ -13,18 +13,22 @@
# limitations under the License. # limitations under the License.
"""Keras Layers for BERT-specific preprocessing.""" """Keras Layers for BERT-specific preprocessing."""
# pylint: disable=g-import-not-at-top
from typing import Any, Dict, List, Optional, Union from typing import Any, Dict, List, Optional, Union
from absl import logging from absl import logging
import tensorflow as tf import tensorflow as tf
try: try:
import tensorflow_text as text # pylint: disable=g-import-not-at-top import tensorflow_text as text
from tensorflow_text.python.ops import bert_tokenizer
except ImportError: except ImportError:
text = None text = None
bert_tokenizer = None
except tf.errors.NotFoundError as e: except tf.errors.NotFoundError as e:
logging.warn("Encountered error when importing tensorflow_text: %s", e) logging.warn("Encountered error when importing tensorflow_text: %s", e)
text = None text = None
bert_tokenizer = None
def _check_if_tf_text_installed(): def _check_if_tf_text_installed():
...@@ -587,3 +591,139 @@ class BertPackInputs(tf.keras.layers.Layer): ...@@ -587,3 +591,139 @@ class BertPackInputs(tf.keras.layers.Layer):
return dict(input_word_ids=_reshape(input_word_ids), return dict(input_word_ids=_reshape(input_word_ids),
input_mask=_reshape(input_mask), input_mask=_reshape(input_mask),
input_type_ids=_reshape(input_type_ids)) input_type_ids=_reshape(input_type_ids))
class FastWordpieceBertTokenizer(tf.keras.layers.Layer):
  """A bert tokenizer keras layer using text.FastWordpieceTokenizer.

  See details: "Fast WordPiece Tokenization" (https://arxiv.org/abs/2012.15524)
  """

  def __init__(self,
               *,
               vocab_file: str,
               lower_case: bool,
               tokenize_with_offsets: bool = False,
               **kwargs):
    """Initializes a FastWordpieceBertTokenizer layer.

    Args:
      vocab_file: A Python string with the path of the vocabulary file. This is
        a text file with newline-separated wordpiece tokens. This layer loads
        a list of tokens from it to create text.FastWordpieceTokenizer.
      lower_case: A Python boolean forwarded to text.BasicTokenizer. If true,
        input text is converted to lower case (where applicable) before
        tokenization. This must be set to match the way in which the vocab_file
        was created.
      tokenize_with_offsets: A Python boolean. If true, this layer calls
        FastWordpieceTokenizer.tokenize_with_offsets() instead of plain
        .tokenize() and outputs a triple of (tokens, start_offsets,
        limit_offsets) instead of just tokens.
      **kwargs: standard arguments to Layer().
    """
    super().__init__(**kwargs)
    logging.info("Initialize a FastWordpieceBertTokenizer.")
    self.tokenize_with_offsets = tokenize_with_offsets
    # Basic tokenization (whitespace/punctuation split) runs before wordpiece.
    self._basic_tokenizer = bert_tokenizer.BasicTokenizer(lower_case=lower_case)
    # Read the vocab file into a list of tokens to create `fast_wp_tokenizer`.
    self._vocab = [line.rstrip() for line in tf.io.gfile.GFile(vocab_file)]
    self._fast_wp_tokenizer = text.FastWordpieceTokenizer(
        vocab=self._vocab, token_out_type=tf.int32, no_pretokenization=True)
    self._special_tokens_dict = self._create_special_tokens_dict()

  @property
  def vocab_size(self):
    # One wordpiece per vocab-file line, so the list length is the vocab size.
    return len(self._vocab)

  def get_config(self):
    # Skip in tf.saved_model.save(); fail if called directly.
    # We cannot just put the original, user-supplied vocab file name into
    # the config, because the path has to change as the SavedModel is copied
    # around.
    raise NotImplementedError("Not implemented yet.")

  def get_special_tokens_dict(self):
    """Returns dict of token ids, keyed by standard names for their purpose.

    Returns:
      A dict from Python strings to Python integers. Each key is a standard
      name for a special token describing its use. (For example, "padding_id"
      is what BERT traditionally calls "[PAD]" but others may call "<pad>".)
      The corresponding value is the integer token id. If a special token
      is not found, its entry is omitted from the dict.

      The supported keys and tokens are:
        * start_of_sequence_id: looked up from "[CLS]"
        * end_of_segment_id: looked up from "[SEP]"
        * padding_id: looked up from "[PAD]"
        * mask_id: looked up from "[MASK]"
        * vocab_size: one past the largest token id used
    """
    return self._special_tokens_dict

  def _create_special_tokens_dict(self):
    """Creates dict of token ids, keyed by standard names for their purpose."""
    special_tokens = {"vocab_size": self.vocab_size}

    def add_special_token(key, token):
      # The token id is simply the token's position in the vocab list.
      try:
        token_id = self._vocab.index(token)
        special_tokens[key] = token_id
      except ValueError:
        # Similar as nlp.modeling.layers.BertTokenizer, if a special token
        # is not found, its entry is omitted from the dict.
        logging.warning("Could not find %s as token \"%s\" in vocab file", key,
                        token)

    add_special_token("start_of_sequence_id", "[CLS]")
    add_special_token("end_of_segment_id", "[SEP]")
    add_special_token("padding_id", "[PAD]")
    add_special_token("mask_id", "[MASK]")
    return special_tokens

  def _tokenize_with_offsets(self, text_input: tf.Tensor):
    # Wordpiece offsets are relative to each basic-tokenized word, so add the
    # word's begin offset to map wordpiece offsets back into the input string.
    tokens, begin, _ = self._basic_tokenizer.tokenize_with_offsets(text_input)
    wordpieces, wp_begin, wp_end = (
        self._fast_wp_tokenizer.tokenize_with_offsets(tokens))
    begin_expanded = tf.expand_dims(begin, axis=2)
    final_begin = begin_expanded + wp_begin
    final_end = begin_expanded + wp_end
    return wordpieces, final_begin, final_end

  def _tokenize(self, text_input: tf.Tensor):
    # Basic tokenization into words, then wordpiece on each word.
    tokens = self._basic_tokenizer.tokenize(text_input)
    return self._fast_wp_tokenizer.tokenize(tokens)

  def call(self, inputs: tf.Tensor):
    """Calls text.BertTokenizer on inputs.

    Args:
      inputs: A string Tensor of shape [batch_size].

    Returns:
      One or three of RaggedTensors if tokenize_with_offsets is False or True,
      respectively. These are
      tokens: A RaggedTensor of shape [batch_size, (words), (pieces_per_word)]
        and type int32. tokens[i,j,k] contains the k-th wordpiece of the
        j-th word in the i-th input.
      start_offsets, limit_offsets: If tokenize_with_offsets is True,
        RaggedTensors of type int64 with the same indices as tokens.
        Element [i,j,k] contains the byte offset at the start, or past the
        end, resp., for the k-th wordpiece of the j-th word in the i-th input.
    """
    # Prepare to reshape the result to work around broken shape inference.
    batch_size = tf.shape(inputs)[0]

    def _reshape(rt):
      # Rebuild the RaggedTensor with a statically reshaped row_splits so
      # downstream Keras layers see a known outer dimension.
      values = rt.values
      row_splits = rt.row_splits
      row_splits = tf.reshape(row_splits, [batch_size + 1])
      return tf.RaggedTensor.from_row_splits(values, row_splits)

    if self.tokenize_with_offsets:
      tokens, start_offsets, limit_offsets = self._tokenize_with_offsets(inputs)
      return _reshape(tokens), _reshape(start_offsets), _reshape(limit_offsets)
    else:
      tokens = self._tokenize(inputs)
      return _reshape(tokens)
...@@ -442,5 +442,109 @@ class BertPackInputsTest(tf.test.TestCase): ...@@ -442,5 +442,109 @@ class BertPackInputsTest(tf.test.TestCase):
[1001, 21, 22, 23, 24, 25, 26, 27, 28, 1002]])) [1001, 21, 22, 23, 24, 25, 26, 27, 28, 1002]]))
# This test covers the in-process behavior of FastWordpieceBertTokenizer layer.
class FastWordPieceBertTokenizerTest(tf.test.TestCase):

  def _make_vocab_file(self, vocab, filename="vocab.txt"):
    """Writes `vocab` (one token per line) to a fresh temp file; returns path."""
    path = os.path.join(
        tempfile.mkdtemp(dir=self.get_temp_dir()),  # New subdir each time.
        filename)
    with tf.io.gfile.GFile(path, "w") as f:
      f.write("\n".join(vocab + [""]))
    return path

  def test_uncased(self):
    vocab_file = self._make_vocab_file(
        ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "d", "##ef", "abc", "xy"])
    bert_tokenize = text_layers.FastWordpieceBertTokenizer(
        vocab_file=vocab_file, lower_case=True)
    inputs = tf.constant(["abc def", "ABC DEF d"])
    token_ids = bert_tokenize(inputs)
    self.assertAllEqual(token_ids, tf.ragged.constant([[[6], [4, 5]],
                                                       [[6], [4, 5], [4]]]))
    # Flipping tokenize_with_offsets after construction also switches call().
    bert_tokenize.tokenize_with_offsets = True
    token_ids_2, start_offsets, limit_offsets = bert_tokenize(inputs)
    self.assertAllEqual(token_ids, token_ids_2)
    self.assertAllEqual(start_offsets, tf.ragged.constant([[[0], [4, 5]],
                                                           [[0], [4, 5], [8]]]))
    self.assertAllEqual(limit_offsets, tf.ragged.constant([[[3], [5, 7]],
                                                           [[3], [5, 7], [9]]]))
    self.assertEqual(bert_tokenize.vocab_size, 8)

  # Repeat the above and test that case matters with lower_case=False.
  def test_cased(self):
    vocab_file = self._make_vocab_file(
        ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "d", "##ef", "abc", "ABC"])
    bert_tokenize = text_layers.FastWordpieceBertTokenizer(
        vocab_file=vocab_file, lower_case=False, tokenize_with_offsets=True)
    inputs = tf.constant(["abc def", "ABC DEF"])
    token_ids, start_offsets, limit_offsets = bert_tokenize(inputs)
    self.assertAllEqual(token_ids, tf.ragged.constant([[[6], [4, 5]],
                                                       [[7], [1]]]))
    self.assertAllEqual(start_offsets, tf.ragged.constant([[[0], [4, 5]],
                                                           [[0], [4]]]))
    self.assertAllEqual(limit_offsets, tf.ragged.constant([[[3], [5, 7]],
                                                           [[3], [7]]]))

  def test_special_tokens_complete(self):
    vocab_file = self._make_vocab_file(
        ["foo", "[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "xy"])
    bert_tokenize = text_layers.FastWordpieceBertTokenizer(
        vocab_file=vocab_file, lower_case=True)
    self.assertDictEqual(bert_tokenize.get_special_tokens_dict(),
                         dict(padding_id=1,
                              start_of_sequence_id=3,
                              end_of_segment_id=4,
                              mask_id=5,
                              vocab_size=7))

  def test_special_tokens_partial(self):
    # [UNK] token is required by fast wordpiece tokenizer.
    vocab_file = self._make_vocab_file(
        ["[PAD]", "[CLS]", "[SEP]", "[UNK]"])
    bert_tokenize = text_layers.FastWordpieceBertTokenizer(
        vocab_file=vocab_file, lower_case=True)
    self.assertDictEqual(bert_tokenize.get_special_tokens_dict(),
                         dict(padding_id=0,
                              start_of_sequence_id=1,
                              end_of_segment_id=2,
                              vocab_size=4))  # No mask_id.

  def test_special_tokens_in_estimator(self):
    """Tests getting special tokens without an Eager init context."""
    vocab_file = self._make_vocab_file(
        ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "d", "##ef", "abc", "xy"])

    def input_fn():
      # Estimator input_fns run in graph mode; verify that here.
      with tf.init_scope():
        self.assertFalse(tf.executing_eagerly())
      # Build a preprocessing Model.
      sentences = tf.keras.layers.Input(shape=[], dtype=tf.string)
      bert_tokenizer = text_layers.FastWordpieceBertTokenizer(
          vocab_file=vocab_file, lower_case=True)
      special_tokens_dict = bert_tokenizer.get_special_tokens_dict()
      for k, v in special_tokens_dict.items():
        self.assertIsInstance(v, int, "Unexpected type for {}".format(k))
      tokens = bert_tokenizer(sentences)
      packed_inputs = text_layers.BertPackInputs(
          4, special_tokens_dict=special_tokens_dict)(tokens)
      preprocessing = tf.keras.Model(sentences, packed_inputs)
      # Map the dataset.
      ds = tf.data.Dataset.from_tensors(
          (tf.constant(["abc", "DEF"]), tf.constant([0, 1])))
      ds = ds.map(lambda features, labels: (preprocessing(features), labels))
      return ds

    def model_fn(features, labels, mode):
      del labels  # Unused.
      return tf.estimator.EstimatorSpec(mode=mode,
                                        predictions=features["input_word_ids"])

    estimator = tf.estimator.Estimator(model_fn=model_fn)
    outputs = list(estimator.predict(input_fn))
    self.assertAllEqual(outputs, np.array([[2, 6, 3, 0],
                                           [2, 4, 5, 3]]))
if __name__ == "__main__": if __name__ == "__main__":
tf.test.main() tf.test.main()
...@@ -16,6 +16,8 @@ ...@@ -16,6 +16,8 @@
import tensorflow as tf import tensorflow as tf
from official.nlp.modeling.layers import util
@tf.keras.utils.register_keras_serializable(package="Text") @tf.keras.utils.register_keras_serializable(package="Text")
class TransformerEncoderBlock(tf.keras.layers.Layer): class TransformerEncoderBlock(tf.keras.layers.Layer):
...@@ -86,8 +88,9 @@ class TransformerEncoderBlock(tf.keras.layers.Layer): ...@@ -86,8 +88,9 @@ class TransformerEncoderBlock(tf.keras.layers.Layer):
kernel. kernel.
attention_axes: axes over which the attention is applied. `None` means attention_axes: axes over which the attention is applied. `None` means
attention over all axes, but batch, heads, and features. attention over all axes, but batch, heads, and features.
**kwargs: keyword arguments/ **kwargs: keyword arguments.
""" """
util.filter_kwargs(kwargs)
super().__init__(**kwargs) super().__init__(**kwargs)
self._num_heads = num_attention_heads self._num_heads = num_attention_heads
......
...@@ -30,13 +30,13 @@ class TfFunctionIfEagerDecorator(object): ...@@ -30,13 +30,13 @@ class TfFunctionIfEagerDecorator(object):
@functools.wraps(func) @functools.wraps(func)
def wrapped_func(*args): def wrapped_func(*args):
# TODO(b/150147476, b/150024785): Fix tf.function in TF1 crash. # TODO(b/150147476, b/150024785): Fix tf.function in TF1 crash.
if not hasattr(tf.compat.v1, "executing_eagerly_outside_functions" if not hasattr(tf.compat.v1, 'executing_eagerly_outside_functions'
) or tf.compat.v1.executing_eagerly_outside_functions(): ) or tf.compat.v1.executing_eagerly_outside_functions():
return tf.function(func=func, **self.func_kwargs)(*args) return tf.function(func=func, **self.func_kwargs)(*args)
return func(*args) return func(*args)
# Cache the created function in self._call_impl. # Cache the created function in self._call_impl.
if not hasattr(self, "_call_impl"): if not hasattr(self, '_call_impl'):
self._call_impl = wrapped_func self._call_impl = wrapped_func
return self._call_impl return self._call_impl
...@@ -44,3 +44,29 @@ class TfFunctionIfEagerDecorator(object): ...@@ -44,3 +44,29 @@ class TfFunctionIfEagerDecorator(object):
def tf_function_if_eager(**kwargs): def tf_function_if_eager(**kwargs):
"""Applies the @tf.function decorator only if running in eager mode.""" """Applies the @tf.function decorator only if running in eager mode."""
return TfFunctionIfEagerDecorator(**kwargs) return TfFunctionIfEagerDecorator(**kwargs)
def filter_kwargs(kwargs):
  """Removes, in place, construction-signature options from `kwargs`.

  Strips constructor arguments such as `num_attention_heads` (from
  TransformerEncoderBlock) out of `kwargs` before they reach the Keras
  base layer, which would otherwise complain about unknown options.

  Args:
    kwargs: keyword arguments to be filtered (mutated in place).
  """
  # This is the union of the constructor signatures of TransformerEncoderBlock
  # and ReZeroTransformer. Every Transformer block with a compatible signature
  # should call this function before super().__init__(**kwargs).
  unsupported = frozenset((
      'num_attention_heads', 'intermediate_size', 'intermediate_activation',
      'inner_dim', 'inner_activation', 'output_range', 'kernel_initializer',
      'bias_initializer', 'kernel_regularizer', 'bias_regularizer',
      'activity_regularizer', 'kernel_constraint', 'bias_constraint',
      'use_bias', 'norm_first', 'norm_epsilon', 'output_dropout',
      'attention_dropout', 'inner_dropout', 'attention_initializer',
      'attention_axes', 'share_rezero'))
  for key in list(kwargs):
    if key in unsupported:
      del kwargs[key]
...@@ -260,11 +260,9 @@ class Seq2SeqTransformer(tf.keras.Model): ...@@ -260,11 +260,9 @@ class Seq2SeqTransformer(tf.keras.Model):
return {"outputs": top_decoded_ids, "scores": top_scores} return {"outputs": top_decoded_ids, "scores": top_scores}
decoder_inputs = self.embedding_lookup(targets)
embedding_mask = tf.cast(tf.not_equal(targets, 0), decoder_inputs.dtype)
decoder_inputs *= tf.expand_dims(embedding_mask, -1)
# Shift targets to the right, and remove the last element # Shift targets to the right, and remove the last element
decoder_inputs = tf.pad(decoder_inputs, [[0, 0], [1, 0], [0, 0]])[:, :-1, :] targets = tf.pad(targets, [[0, 0], [1, 0]])[:, :-1]
decoder_inputs = self.embedding_lookup(targets)
length = tf.shape(decoder_inputs)[1] length = tf.shape(decoder_inputs)[1]
pos_encoding = self.position_embedding(decoder_inputs) pos_encoding = self.position_embedding(decoder_inputs)
pos_encoding = tf.cast(pos_encoding, embedded_inputs.dtype) pos_encoding = tf.cast(pos_encoding, embedded_inputs.dtype)
...@@ -325,12 +323,7 @@ class Seq2SeqTransformer(tf.keras.Model): ...@@ -325,12 +323,7 @@ class Seq2SeqTransformer(tf.keras.Model):
decoder_input = ids[:, -1:] decoder_input = ids[:, -1:]
# Preprocess decoder input by getting embeddings and adding timing signal. # Preprocess decoder input by getting embeddings and adding timing signal.
# decoder_input = self.embedding_softmax_layer(decoder_input)
source_decoder_input = decoder_input
decoder_input = self.embedding_lookup(decoder_input) decoder_input = self.embedding_lookup(decoder_input)
embedding_mask = tf.cast(
tf.not_equal(source_decoder_input, 0), decoder_input.dtype)
decoder_input *= tf.expand_dims(embedding_mask, -1)
decoder_input += timing_signal[i] decoder_input += timing_signal[i]
if self._padded_decode: if self._padded_decode:
# indexing does not work on TPU. # indexing does not work on TPU.
......
...@@ -20,29 +20,30 @@ import numpy as np ...@@ -20,29 +20,30 @@ import numpy as np
import tensorflow as tf import tensorflow as tf
from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import
from official.nlp.modeling.networks import bert_dense_encoder from official.nlp.modeling.networks import bert_encoder
# This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It # This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
# guarantees forward compatibility of this code for the V2 switchover. # guarantees forward compatibility of this code for the V2 switchover.
@keras_parameterized.run_all_keras_modes @keras_parameterized.run_all_keras_modes
class BertDenseEncoderTest(keras_parameterized.TestCase): class BertEncoderV2Test(keras_parameterized.TestCase):
def tearDown(self): def tearDown(self):
super(BertDenseEncoderTest, self).tearDown() super(BertEncoderV2Test, self).tearDown()
tf.keras.mixed_precision.set_global_policy("float32") tf.keras.mixed_precision.set_global_policy("float32")
def test_dict_outputs_network_creation(self): def test_dict_outputs_network_creation(self):
hidden_size = 32 hidden_size = 32
sequence_length = 21 sequence_length = 21
dense_sequence_length = 20 dense_sequence_length = 20
# Create a small dense BertDenseEncoder for testing. # Create a small dense BertEncoderV2 for testing.
kwargs = {} kwargs = {}
test_network = bert_dense_encoder.BertDenseEncoder( test_network = bert_encoder.BertEncoderV2(
vocab_size=100, vocab_size=100,
hidden_size=hidden_size, hidden_size=hidden_size,
num_attention_heads=2, num_attention_heads=2,
num_layers=3, num_layers=3,
with_dense_inputs=True,
**kwargs) **kwargs)
# Create the inputs (note that the first dimension is implicit). # Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
...@@ -86,12 +87,13 @@ class BertDenseEncoderTest(keras_parameterized.TestCase): ...@@ -86,12 +87,13 @@ class BertDenseEncoderTest(keras_parameterized.TestCase):
sequence_length = 21 sequence_length = 21
dense_sequence_length = 20 dense_sequence_length = 20
# Create a small BertEncoder for testing. # Create a small BertEncoder for testing.
test_network = bert_dense_encoder.BertDenseEncoder( test_network = bert_encoder.BertEncoderV2(
vocab_size=100, vocab_size=100,
hidden_size=hidden_size, hidden_size=hidden_size,
num_attention_heads=2, num_attention_heads=2,
num_layers=3, num_layers=3,
dict_outputs=True) dict_outputs=True,
with_dense_inputs=True)
# Create the inputs (note that the first dimension is implicit). # Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
...@@ -134,12 +136,13 @@ class BertDenseEncoderTest(keras_parameterized.TestCase): ...@@ -134,12 +136,13 @@ class BertDenseEncoderTest(keras_parameterized.TestCase):
dense_sequence_length = 20 dense_sequence_length = 20
tf.keras.mixed_precision.set_global_policy("mixed_float16") tf.keras.mixed_precision.set_global_policy("mixed_float16")
# Create a small BertEncoder for testing. # Create a small BertEncoder for testing.
test_network = bert_dense_encoder.BertDenseEncoder( test_network = bert_encoder.BertEncoderV2(
vocab_size=100, vocab_size=100,
hidden_size=hidden_size, hidden_size=hidden_size,
num_attention_heads=2, num_attention_heads=2,
num_layers=3, num_layers=3,
dict_outputs=True) dict_outputs=True,
with_dense_inputs=True)
# Create the inputs (note that the first dimension is implicit). # Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
...@@ -176,9 +179,8 @@ class BertDenseEncoderTest(keras_parameterized.TestCase): ...@@ -176,9 +179,8 @@ class BertDenseEncoderTest(keras_parameterized.TestCase):
self.assertAllEqual(tf.float16, pooled.dtype) self.assertAllEqual(tf.float16, pooled.dtype)
@parameterized.named_parameters( @parameterized.named_parameters(
("all_sequence_encoder_v2", bert_dense_encoder.BertDenseEncoder, None, ("all_sequence_encoder_v2", bert_encoder.BertEncoderV2, None, 41),
41), ("output_range_encoder_v2", bert_encoder.BertEncoderV2, 1, 1),
("output_range_encoder_v2", bert_dense_encoder.BertDenseEncoder, 1, 1),
) )
def test_dict_outputs_network_invocation( def test_dict_outputs_network_invocation(
self, encoder_cls, output_range, out_seq_len): self, encoder_cls, output_range, out_seq_len):
...@@ -195,7 +197,8 @@ class BertDenseEncoderTest(keras_parameterized.TestCase): ...@@ -195,7 +197,8 @@ class BertDenseEncoderTest(keras_parameterized.TestCase):
num_layers=3, num_layers=3,
type_vocab_size=num_types, type_vocab_size=num_types,
output_range=output_range, output_range=output_range,
dict_outputs=True) dict_outputs=True,
with_dense_inputs=True)
# Create the inputs (note that the first dimension is implicit). # Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
...@@ -276,7 +279,7 @@ class BertDenseEncoderTest(keras_parameterized.TestCase): ...@@ -276,7 +279,7 @@ class BertDenseEncoderTest(keras_parameterized.TestCase):
# Creates a BertEncoder with embedding_width != hidden_size # Creates a BertEncoder with embedding_width != hidden_size
embedding_width = 16 embedding_width = 16
test_network = bert_dense_encoder.BertDenseEncoder( test_network = bert_encoder.BertEncoderV2(
vocab_size=vocab_size, vocab_size=vocab_size,
hidden_size=hidden_size, hidden_size=hidden_size,
max_sequence_length=max_sequence_length, max_sequence_length=max_sequence_length,
...@@ -316,11 +319,12 @@ class BertDenseEncoderTest(keras_parameterized.TestCase): ...@@ -316,11 +319,12 @@ class BertDenseEncoderTest(keras_parameterized.TestCase):
sequence_length = 21 sequence_length = 21
dense_sequence_length = 20 dense_sequence_length = 20
# Create a small BertEncoder for testing. # Create a small BertEncoder for testing.
test_network = bert_dense_encoder.BertDenseEncoder( test_network = bert_encoder.BertEncoderV2(
vocab_size=100, vocab_size=100,
hidden_size=hidden_size, hidden_size=hidden_size,
num_attention_heads=2, num_attention_heads=2,
num_layers=3) num_layers=3,
with_dense_inputs=True)
# Create the inputs (note that the first dimension is implicit). # Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length), dtype=tf.int32) word_ids = tf.keras.Input(shape=(sequence_length), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
......
...@@ -23,6 +23,8 @@ from official.nlp.modeling import layers ...@@ -23,6 +23,8 @@ from official.nlp.modeling import layers
_Initializer = Union[str, tf.keras.initializers.Initializer] _Initializer = Union[str, tf.keras.initializers.Initializer]
_Activation = Union[str, Callable[..., Any]]
_approx_gelu = lambda x: tf.keras.activations.gelu(x, approximate=True) _approx_gelu = lambda x: tf.keras.activations.gelu(x, approximate=True)
...@@ -72,6 +74,7 @@ class BertEncoderV2(tf.keras.layers.Layer): ...@@ -72,6 +74,7 @@ class BertEncoderV2(tf.keras.layers.Layer):
norm_first: Whether to normalize inputs to attention and intermediate dense norm_first: Whether to normalize inputs to attention and intermediate dense
layers. If set False, output of attention and intermediate dense layers is layers. If set False, output of attention and intermediate dense layers is
normalized. normalized.
with_dense_inputs: Whether to accept dense embeddings as the input.
""" """
def __init__( def __init__(
...@@ -83,7 +86,7 @@ class BertEncoderV2(tf.keras.layers.Layer): ...@@ -83,7 +86,7 @@ class BertEncoderV2(tf.keras.layers.Layer):
max_sequence_length: int = 512, max_sequence_length: int = 512,
type_vocab_size: int = 16, type_vocab_size: int = 16,
inner_dim: int = 3072, inner_dim: int = 3072,
inner_activation: Callable[..., Any] = _approx_gelu, inner_activation: _Activation = _approx_gelu,
output_dropout: float = 0.1, output_dropout: float = 0.1,
attention_dropout: float = 0.1, attention_dropout: float = 0.1,
initializer: _Initializer = tf.keras.initializers.TruncatedNormal( initializer: _Initializer = tf.keras.initializers.TruncatedNormal(
...@@ -92,6 +95,7 @@ class BertEncoderV2(tf.keras.layers.Layer): ...@@ -92,6 +95,7 @@ class BertEncoderV2(tf.keras.layers.Layer):
embedding_width: Optional[int] = None, embedding_width: Optional[int] = None,
embedding_layer: Optional[tf.keras.layers.Layer] = None, embedding_layer: Optional[tf.keras.layers.Layer] = None,
norm_first: bool = False, norm_first: bool = False,
with_dense_inputs: bool = False,
**kwargs): **kwargs):
# Pops kwargs that are used in V1 implementation. # Pops kwargs that are used in V1 implementation.
if 'dict_outputs' in kwargs: if 'dict_outputs' in kwargs:
...@@ -190,11 +194,23 @@ class BertEncoderV2(tf.keras.layers.Layer): ...@@ -190,11 +194,23 @@ class BertEncoderV2(tf.keras.layers.Layer):
'embedding_width': embedding_width, 'embedding_width': embedding_width,
'embedding_layer': embedding_layer, 'embedding_layer': embedding_layer,
'norm_first': norm_first, 'norm_first': norm_first,
'with_dense_inputs': with_dense_inputs,
} }
self.inputs = dict( if with_dense_inputs:
input_word_ids=tf.keras.Input(shape=(None,), dtype=tf.int32), self.inputs = dict(
input_mask=tf.keras.Input(shape=(None,), dtype=tf.int32), input_word_ids=tf.keras.Input(shape=(None,), dtype=tf.int32),
input_type_ids=tf.keras.Input(shape=(None,), dtype=tf.int32)) input_mask=tf.keras.Input(shape=(None,), dtype=tf.int32),
input_type_ids=tf.keras.Input(shape=(None,), dtype=tf.int32),
dense_inputs=tf.keras.Input(
shape=(None, embedding_width), dtype=tf.float32),
dense_mask=tf.keras.Input(shape=(None,), dtype=tf.int32),
dense_type_ids=tf.keras.Input(shape=(None,), dtype=tf.int32),
)
else:
self.inputs = dict(
input_word_ids=tf.keras.Input(shape=(None,), dtype=tf.int32),
input_mask=tf.keras.Input(shape=(None,), dtype=tf.int32),
input_type_ids=tf.keras.Input(shape=(None,), dtype=tf.int32))
def call(self, inputs): def call(self, inputs):
word_embeddings = None word_embeddings = None
...@@ -203,11 +219,22 @@ class BertEncoderV2(tf.keras.layers.Layer): ...@@ -203,11 +219,22 @@ class BertEncoderV2(tf.keras.layers.Layer):
mask = inputs.get('input_mask') mask = inputs.get('input_mask')
type_ids = inputs.get('input_type_ids') type_ids = inputs.get('input_type_ids')
word_embeddings = inputs.get('input_word_embeddings', None) word_embeddings = inputs.get('input_word_embeddings', None)
dense_inputs = inputs.get('dense_inputs', None)
dense_mask = inputs.get('dense_mask', None)
dense_type_ids = inputs.get('dense_type_ids', None)
else: else:
raise ValueError('Unexpected inputs type to %s.' % self.__class__) raise ValueError('Unexpected inputs type to %s.' % self.__class__)
if word_embeddings is None: if word_embeddings is None:
word_embeddings = self._embedding_layer(word_ids) word_embeddings = self._embedding_layer(word_ids)
if dense_inputs is not None:
# Concat the dense embeddings at sequence end.
word_embeddings = tf.concat([word_embeddings, dense_inputs], axis=1)
type_ids = tf.concat([type_ids, dense_type_ids], axis=1)
mask = tf.concat([mask, dense_mask], axis=1)
# absolute position embeddings. # absolute position embeddings.
position_embeddings = self._position_embedding_layer(word_embeddings) position_embeddings = self._position_embedding_layer(word_embeddings)
type_embeddings = self._type_embedding_layer(type_ids) type_embeddings = self._type_embedding_layer(type_ids)
......
...@@ -15,17 +15,32 @@ ...@@ -15,17 +15,32 @@
"""Funnel Transformer network.""" """Funnel Transformer network."""
# pylint: disable=g-classes-have-attributes # pylint: disable=g-classes-have-attributes
from typing import Union, Sequence from typing import Any, Callable, Optional, Union, Sequence
from absl import logging from absl import logging
import numpy as np import numpy as np
import tensorflow as tf import tensorflow as tf
from official.nlp.modeling import layers from official.nlp.modeling import layers
_Initializer = Union[str, tf.keras.initializers.Initializer]
_Activation = Union[str, Callable[..., Any]]
_MAX = 'max' _MAX = 'max'
_AVG = 'avg' _AVG = 'avg'
_TRUNCATED_AVG = 'truncated_avg' _TRUNCATED_AVG = 'truncated_avg'
_transformer_cls2str = {
layers.TransformerEncoderBlock: 'TransformerEncoderBlock',
layers.ReZeroTransformer: 'ReZeroTransformer'
}
_str2transformer_cls = {
'TransformerEncoderBlock': layers.TransformerEncoderBlock,
'ReZeroTransformer': layers.ReZeroTransformer
}
_approx_gelu = lambda x: tf.keras.activations.gelu(x, approximate=True)
def _get_policy_dtype(): def _get_policy_dtype():
try: try:
...@@ -206,29 +221,37 @@ class FunnelTransformerEncoder(tf.keras.layers.Layer): ...@@ -206,29 +221,37 @@ class FunnelTransformerEncoder(tf.keras.layers.Layer):
embeddings for the input word IDs. embeddings for the input word IDs.
norm_first: Whether to normalize inputs to attention and intermediate dense norm_first: Whether to normalize inputs to attention and intermediate dense
layers. If set False, output of attention and intermediate dense layers is layers. If set False, output of attention and intermediate dense layers is
normalized. normalized. This does not apply to ReZero.
transformer_cls: str or a keras Layer. This is the base TransformerBlock the
funnel encoder relies on.
share_rezero: bool. Whether to share ReZero alpha between the attention
layer and the ffn layer. This option is specific to ReZero.
""" """
def __init__( def __init__(
self, self,
vocab_size, vocab_size: int,
hidden_size=768, hidden_size: int = 768,
num_layers=12, num_layers: int = 12,
num_attention_heads=12, num_attention_heads: int = 12,
max_sequence_length=512, max_sequence_length: int = 512,
type_vocab_size=16, type_vocab_size: int = 16,
inner_dim=3072, inner_dim: int = 3072,
inner_activation=lambda x: tf.keras.activations.gelu(x, approximate=True), inner_activation: _Activation = _approx_gelu,
output_dropout=0.1, output_dropout: float = 0.1,
attention_dropout=0.1, attention_dropout: float = 0.1,
pool_type=_MAX, pool_type: str = _MAX,
pool_stride=2, pool_stride: int = 2,
unpool_length=0, unpool_length: int = 0,
initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02), initializer: _Initializer = tf.keras.initializers.TruncatedNormal(
output_range=None, stddev=0.02),
embedding_width=None, output_range: Optional[int] = None,
embedding_layer=None, embedding_width: Optional[int] = None,
norm_first=False, embedding_layer: Optional[tf.keras.layers.Layer] = None,
norm_first: bool = False,
transformer_cls: Union[
str, tf.keras.layers.Layer] = layers.TransformerEncoderBlock,
share_rezero: bool = True,
**kwargs): **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
activation = tf.keras.activations.get(inner_activation) activation = tf.keras.activations.get(inner_activation)
...@@ -278,16 +301,22 @@ class FunnelTransformerEncoder(tf.keras.layers.Layer): ...@@ -278,16 +301,22 @@ class FunnelTransformerEncoder(tf.keras.layers.Layer):
self._transformer_layers = [] self._transformer_layers = []
self._attention_mask_layer = layers.SelfAttentionMask( self._attention_mask_layer = layers.SelfAttentionMask(
name='self_attention_mask') name='self_attention_mask')
# Will raise an error if the string is not supported.
if isinstance(transformer_cls, str):
transformer_cls = _str2transformer_cls[transformer_cls]
for i in range(num_layers): for i in range(num_layers):
layer = layers.TransformerEncoderBlock( layer = transformer_cls(
num_attention_heads=num_attention_heads, num_attention_heads=num_attention_heads,
intermediate_size=inner_dim,
inner_dim=inner_dim, inner_dim=inner_dim,
intermediate_activation=inner_activation,
inner_activation=inner_activation, inner_activation=inner_activation,
output_dropout=output_dropout, output_dropout=output_dropout,
attention_dropout=attention_dropout, attention_dropout=attention_dropout,
norm_first=norm_first, norm_first=norm_first,
output_range=output_range if i == num_layers - 1 else None, output_range=output_range if i == num_layers - 1 else None,
kernel_initializer=initializer, kernel_initializer=initializer,
share_rezero=share_rezero,
name='transformer/layer_%d' % i) name='transformer/layer_%d' % i)
self._transformer_layers.append(layer) self._transformer_layers.append(layer)
...@@ -333,24 +362,44 @@ class FunnelTransformerEncoder(tf.keras.layers.Layer): ...@@ -333,24 +362,44 @@ class FunnelTransformerEncoder(tf.keras.layers.Layer):
self._pool_type = pool_type self._pool_type = pool_type
self._config = { self._config = {
'vocab_size': vocab_size, 'vocab_size':
'hidden_size': hidden_size, vocab_size,
'num_layers': num_layers, 'hidden_size':
'num_attention_heads': num_attention_heads, hidden_size,
'max_sequence_length': max_sequence_length, 'num_layers':
'type_vocab_size': type_vocab_size, num_layers,
'inner_dim': inner_dim, 'num_attention_heads':
'inner_activation': tf.keras.activations.serialize(activation), num_attention_heads,
'output_dropout': output_dropout, 'max_sequence_length':
'attention_dropout': attention_dropout, max_sequence_length,
'initializer': tf.keras.initializers.serialize(initializer), 'type_vocab_size':
'output_range': output_range, type_vocab_size,
'embedding_width': embedding_width, 'inner_dim':
'embedding_layer': embedding_layer, inner_dim,
'norm_first': norm_first, 'inner_activation':
'pool_type': pool_type, tf.keras.activations.serialize(activation),
'pool_stride': pool_stride, 'output_dropout':
'unpool_length': unpool_length, output_dropout,
'attention_dropout':
attention_dropout,
'initializer':
tf.keras.initializers.serialize(initializer),
'output_range':
output_range,
'embedding_width':
embedding_width,
'embedding_layer':
embedding_layer,
'norm_first':
norm_first,
'pool_type':
pool_type,
'pool_stride':
pool_stride,
'unpool_length':
unpool_length,
'transformer_cls':
_transformer_cls2str.get(transformer_cls, str(transformer_cls))
} }
def call(self, inputs): def call(self, inputs):
......
...@@ -38,13 +38,20 @@ class FunnelTransformerEncoderTest(parameterized.TestCase, tf.test.TestCase): ...@@ -38,13 +38,20 @@ class FunnelTransformerEncoderTest(parameterized.TestCase, tf.test.TestCase):
tf.keras.mixed_precision.set_global_policy("float32") tf.keras.mixed_precision.set_global_policy("float32")
@parameterized.named_parameters( @parameterized.named_parameters(
("mix_truncated_avg", "mixed_float16", tf.float16, "truncated_avg"), ("mix_truncated_avg_rezero", "mixed_float16", tf.float16, "truncated_avg",
("float32_truncated_avg", "float32", tf.float32, "truncated_avg"), "ReZeroTransformer"), ("float32_truncated_avg_rezero", "float32",
("mix_max", "mixed_float16", tf.float16, "max"), tf.float32, "truncated_avg", "ReZeroTransformer"),
("float32_max", "float32", tf.float32, "max"), ("mix_truncated_avg", "mixed_float16", tf.float16, "truncated_avg",
("mix_avg", "mixed_float16", tf.float16, "avg"), "TransformerEncoderBlock"),
("float32_avg", "float32", tf.float32, "avg")) ("float32_truncated_avg", "float32", tf.float32, "truncated_avg",
def test_network_creation(self, policy, pooled_dtype, pool_type): "TransformerEncoderBlock"), ("mix_max", "mixed_float16", tf.float16,
"max", "TransformerEncoderBlock"),
("float32_max", "float32", tf.float32, "max", "TransformerEncoderBlock"),
("mix_avg", "mixed_float16", tf.float16, "avg",
"TransformerEncoderBlock"),
("float32_avg", "float32", tf.float32, "avg", "TransformerEncoderBlock"))
def test_network_creation(self, policy, pooled_dtype, pool_type,
transformer_cls):
tf.keras.mixed_precision.set_global_policy(policy) tf.keras.mixed_precision.set_global_policy(policy)
hidden_size = 32 hidden_size = 32
...@@ -60,7 +67,8 @@ class FunnelTransformerEncoderTest(parameterized.TestCase, tf.test.TestCase): ...@@ -60,7 +67,8 @@ class FunnelTransformerEncoderTest(parameterized.TestCase, tf.test.TestCase):
pool_stride=pool_stride, pool_stride=pool_stride,
pool_type=pool_type, pool_type=pool_type,
max_sequence_length=sequence_length, max_sequence_length=sequence_length,
unpool_length=0) unpool_length=0,
transformer_cls=transformer_cls)
# Create the inputs (note that the first dimension is implicit). # Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
...@@ -253,7 +261,8 @@ class FunnelTransformerEncoderTest(parameterized.TestCase, tf.test.TestCase): ...@@ -253,7 +261,8 @@ class FunnelTransformerEncoderTest(parameterized.TestCase, tf.test.TestCase):
norm_first=False, norm_first=False,
pool_type="max", pool_type="max",
pool_stride=2, pool_stride=2,
unpool_length=0) unpool_length=0,
transformer_cls="TransformerEncoderBlock")
network = funnel_transformer.FunnelTransformerEncoder(**kwargs) network = funnel_transformer.FunnelTransformerEncoder(**kwargs)
expected_config = dict(kwargs) expected_config = dict(kwargs)
expected_config["inner_activation"] = tf.keras.activations.serialize( expected_config["inner_activation"] = tf.keras.activations.serialize(
......
...@@ -13,12 +13,14 @@ ...@@ -13,12 +13,14 @@
# limitations under the License. # limitations under the License.
"""A binary/library to export TF-NLP serving `SavedModel`.""" """A binary/library to export TF-NLP serving `SavedModel`."""
import dataclasses
import os import os
from typing import Any, Dict, Text from typing import Any, Dict, Text
from absl import app from absl import app
from absl import flags from absl import flags
import dataclasses
import yaml import yaml
from official.core import base_task from official.core import base_task
from official.core import task_factory from official.core import task_factory
from official.modeling import hyperparams from official.modeling import hyperparams
...@@ -29,6 +31,7 @@ from official.nlp.tasks import masked_lm ...@@ -29,6 +31,7 @@ from official.nlp.tasks import masked_lm
from official.nlp.tasks import question_answering from official.nlp.tasks import question_answering
from official.nlp.tasks import sentence_prediction from official.nlp.tasks import sentence_prediction
from official.nlp.tasks import tagging from official.nlp.tasks import tagging
from official.nlp.tasks import translation
FLAGS = flags.FLAGS FLAGS = flags.FLAGS
...@@ -40,7 +43,9 @@ SERVING_MODULES = { ...@@ -40,7 +43,9 @@ SERVING_MODULES = {
question_answering.QuestionAnsweringTask: question_answering.QuestionAnsweringTask:
serving_modules.QuestionAnswering, serving_modules.QuestionAnswering,
tagging.TaggingTask: tagging.TaggingTask:
serving_modules.Tagging serving_modules.Tagging,
translation.TranslationTask:
serving_modules.Translation
} }
...@@ -60,6 +65,10 @@ def define_flags(): ...@@ -60,6 +65,10 @@ def define_flags():
flags.DEFINE_string( flags.DEFINE_string(
"function_keys", None, "function_keys", None,
"A string key to retrieve pre-defined serving signatures.") "A string key to retrieve pre-defined serving signatures.")
flags.DEFINE_string(
"module_key", None,
"For multi-task case, load the export module weights from a specific "
"checkpoint item.")
flags.DEFINE_bool("convert_tpu", False, "") flags.DEFINE_bool("convert_tpu", False, "")
flags.DEFINE_multi_integer("allowed_batch_size", None, flags.DEFINE_multi_integer("allowed_batch_size", None,
"Allowed batch sizes for batching ops.") "Allowed batch sizes for batching ops.")
...@@ -116,7 +125,8 @@ def main(_): ...@@ -116,7 +125,8 @@ def main(_):
export_module, export_module,
function_keys=[FLAGS.function_keys], function_keys=[FLAGS.function_keys],
checkpoint_path=FLAGS.checkpoint_path, checkpoint_path=FLAGS.checkpoint_path,
export_savedmodel_dir=FLAGS.export_savedmodel_dir) export_savedmodel_dir=FLAGS.export_savedmodel_dir,
module_key=FLAGS.module_key)
if FLAGS.convert_tpu: if FLAGS.convert_tpu:
# pylint: disable=g-import-not-at-top # pylint: disable=g-import-not-at-top
......
...@@ -13,24 +13,21 @@ ...@@ -13,24 +13,21 @@
# limitations under the License. # limitations under the License.
"""Common library to export a SavedModel from the export module.""" """Common library to export a SavedModel from the export module."""
import os
import time
from typing import Dict, List, Optional, Text, Union from typing import Dict, List, Optional, Text, Union
from absl import logging
import tensorflow as tf import tensorflow as tf
from official.core import export_base from official.core import export_base
get_timestamped_export_dir = export_base.get_timestamped_export_dir
MAX_DIRECTORY_CREATION_ATTEMPTS = 10
def export(export_module: export_base.ExportModule, def export(export_module: export_base.ExportModule,
function_keys: Union[List[Text], Dict[Text, Text]], function_keys: Union[List[Text], Dict[Text, Text]],
export_savedmodel_dir: Text, export_savedmodel_dir: Text,
checkpoint_path: Optional[Text] = None, checkpoint_path: Optional[Text] = None,
timestamped: bool = True) -> Text: timestamped: bool = True,
module_key: Optional[Text] = None) -> Text:
"""Exports to SavedModel format. """Exports to SavedModel format.
Args: Args:
...@@ -41,6 +38,8 @@ def export(export_module: export_base.ExportModule, ...@@ -41,6 +38,8 @@ def export(export_module: export_base.ExportModule,
export_savedmodel_dir: Output saved model directory. export_savedmodel_dir: Output saved model directory.
checkpoint_path: Object-based checkpoint path or directory. checkpoint_path: Object-based checkpoint path or directory.
timestamped: Whether to export the savedmodel to a timestamped directory. timestamped: Whether to export the savedmodel to a timestamped directory.
module_key: Optional string to identify a checkpoint object to load for the
model in the export module.
Returns: Returns:
The savedmodel directory path. The savedmodel directory path.
...@@ -48,37 +47,16 @@ def export(export_module: export_base.ExportModule, ...@@ -48,37 +47,16 @@ def export(export_module: export_base.ExportModule,
save_options = tf.saved_model.SaveOptions(function_aliases={ save_options = tf.saved_model.SaveOptions(function_aliases={
'tpu_candidate': export_module.serve, 'tpu_candidate': export_module.serve,
}) })
return export_base.export(export_module, function_keys, export_savedmodel_dir, if module_key:
checkpoint_path, timestamped, save_options) kwargs = {module_key: export_module.model}
checkpoint = tf.train.Checkpoint(**kwargs)
else:
def get_timestamped_export_dir(export_dir_base): checkpoint = None
"""Builds a path to a new subdirectory within the base directory. return export_base.export(
export_module,
Args: function_keys,
export_dir_base: A string containing a directory to write the exported graph export_savedmodel_dir,
and checkpoints. checkpoint_path,
timestamped,
Returns: save_options,
The full path of the new subdirectory (which is not actually created yet). checkpoint=checkpoint)
Raises:
RuntimeError: if repeated attempts fail to obtain a unique timestamped
directory name.
"""
attempts = 0
while attempts < MAX_DIRECTORY_CREATION_ATTEMPTS:
timestamp = int(time.time())
result_dir = os.path.join(export_dir_base, str(timestamp))
if not tf.io.gfile.exists(result_dir):
# Collisions are still possible (though extremely unlikely): this
# directory is not actually created yet, but it will be almost
# instantly on return from this function.
return result_dir
time.sleep(1)
attempts += 1
logging.warning('Directory %s already exists; retrying (attempt %s/%s)',
str(result_dir), attempts, MAX_DIRECTORY_CREATION_ATTEMPTS)
raise RuntimeError('Failed to obtain a unique export directory name after '
f'{MAX_DIRECTORY_CREATION_ATTEMPTS} attempts.')
...@@ -14,10 +14,12 @@ ...@@ -14,10 +14,12 @@
"""Serving export modules for TF Model Garden NLP models.""" """Serving export modules for TF Model Garden NLP models."""
# pylint:disable=missing-class-docstring # pylint:disable=missing-class-docstring
import dataclasses
from typing import Dict, List, Optional, Text from typing import Dict, List, Optional, Text
import dataclasses
import tensorflow as tf import tensorflow as tf
import tensorflow_text as tf_text
from official.core import export_base from official.core import export_base
from official.modeling.hyperparams import base_config from official.modeling.hyperparams import base_config
from official.nlp.data import sentence_prediction_dataloader from official.nlp.data import sentence_prediction_dataloader
...@@ -407,3 +409,48 @@ class Tagging(export_base.ExportModule): ...@@ -407,3 +409,48 @@ class Tagging(export_base.ExportModule):
signatures[signature_key] = self.serve_examples.get_concrete_function( signatures[signature_key] = self.serve_examples.get_concrete_function(
tf.TensorSpec(shape=[None], dtype=tf.string, name="examples")) tf.TensorSpec(shape=[None], dtype=tf.string, name="examples"))
return signatures return signatures
class Translation(export_base.ExportModule):
"""The export module for the translation task."""
@dataclasses.dataclass
class Params(base_config.Config):
sentencepiece_model_path: str = ""
def __init__(self, params, model: tf.keras.Model, inference_step=None):
super().__init__(params, model, inference_step)
self._sp_tokenizer = tf_text.SentencepieceTokenizer(
model=tf.io.gfile.GFile(params.sentencepiece_model_path, "rb").read(),
add_eos=True)
try:
empty_str_tokenized = self._sp_tokenizer.tokenize("").numpy()
except tf.errors.InternalError:
raise ValueError(
"EOS token not in tokenizer vocab."
"Please make sure the tokenizer generates a single token for an "
"empty string.")
self._eos_id = empty_str_tokenized.item()
@tf.function
def serve(self, inputs) -> Dict[str, tf.Tensor]:
return self.inference_step(inputs)
@tf.function
def serve_text(self, text: tf.Tensor) -> Dict[str, tf.Tensor]:
tokenized = self._sp_tokenizer.tokenize(text).to_tensor(0)
return self._sp_tokenizer.detokenize(
self.serve({"inputs": tokenized})["outputs"])
def get_inference_signatures(self, function_keys: Dict[Text, Text]):
signatures = {}
valid_keys = ("serve_text")
for func_key, signature_key in function_keys.items():
if func_key not in valid_keys:
raise ValueError("Invalid function key for the module: %s with key %s. "
"Valid keys are: %s" %
(self.__class__, func_key, valid_keys))
if func_key == "serve_text":
signatures[signature_key] = self.serve_text.get_concrete_function(
tf.TensorSpec(shape=[None], dtype=tf.string, name="text"))
return signatures
...@@ -15,8 +15,11 @@ ...@@ -15,8 +15,11 @@
"""Tests for nlp.serving.serving_modules.""" """Tests for nlp.serving.serving_modules."""
import os import os
from absl.testing import parameterized from absl.testing import parameterized
import tensorflow as tf import tensorflow as tf
from sentencepiece import SentencePieceTrainer
from official.nlp.configs import bert from official.nlp.configs import bert
from official.nlp.configs import encoders from official.nlp.configs import encoders
from official.nlp.serving import serving_modules from official.nlp.serving import serving_modules
...@@ -24,6 +27,7 @@ from official.nlp.tasks import masked_lm ...@@ -24,6 +27,7 @@ from official.nlp.tasks import masked_lm
from official.nlp.tasks import question_answering from official.nlp.tasks import question_answering
from official.nlp.tasks import sentence_prediction from official.nlp.tasks import sentence_prediction
from official.nlp.tasks import tagging from official.nlp.tasks import tagging
from official.nlp.tasks import translation
def _create_fake_serialized_examples(features_dict): def _create_fake_serialized_examples(features_dict):
...@@ -59,6 +63,33 @@ def _create_fake_vocab_file(vocab_file_path): ...@@ -59,6 +63,33 @@ def _create_fake_vocab_file(vocab_file_path):
outfile.write("\n".join(tokens)) outfile.write("\n".join(tokens))
def _train_sentencepiece(input_path, vocab_size, model_path, eos_id=1):
argstr = " ".join([
f"--input={input_path}", f"--vocab_size={vocab_size}",
"--character_coverage=0.995",
f"--model_prefix={model_path}", "--model_type=bpe",
"--bos_id=-1", "--pad_id=0", f"--eos_id={eos_id}", "--unk_id=2"
])
SentencePieceTrainer.Train(argstr)
def _generate_line_file(filepath, lines):
with tf.io.gfile.GFile(filepath, "w") as f:
for l in lines:
f.write("{}\n".format(l))
def _make_sentencepeice(output_dir):
src_lines = ["abc ede fg", "bbcd ef a g", "de f a a g"]
tgt_lines = ["dd cc a ef g", "bcd ef a g", "gef cd ba"]
sentencepeice_input_path = os.path.join(output_dir, "inputs.txt")
_generate_line_file(sentencepeice_input_path, src_lines + tgt_lines)
sentencepeice_model_prefix = os.path.join(output_dir, "sp")
_train_sentencepiece(sentencepeice_input_path, 11, sentencepeice_model_prefix)
sentencepeice_model_path = "{}.model".format(sentencepeice_model_prefix)
return sentencepeice_model_path
class ServingModulesTest(tf.test.TestCase, parameterized.TestCase): class ServingModulesTest(tf.test.TestCase, parameterized.TestCase):
@parameterized.parameters( @parameterized.parameters(
...@@ -312,6 +343,31 @@ class ServingModulesTest(tf.test.TestCase, parameterized.TestCase): ...@@ -312,6 +343,31 @@ class ServingModulesTest(tf.test.TestCase, parameterized.TestCase):
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
_ = export_module.get_inference_signatures({"foo": None}) _ = export_module.get_inference_signatures({"foo": None})
def test_translation(self):
sp_path = _make_sentencepeice(self.get_temp_dir())
encdecoder = translation.EncDecoder(
num_attention_heads=4, intermediate_size=256)
config = translation.TranslationConfig(
model=translation.ModelConfig(
encoder=encdecoder,
decoder=encdecoder,
embedding_width=256,
padded_decode=False,
decode_max_length=100),
sentencepiece_model_path=sp_path,
)
task = translation.TranslationTask(config)
model = task.build_model()
params = serving_modules.Translation.Params(
sentencepiece_model_path=sp_path)
export_module = serving_modules.Translation(params=params, model=model)
functions = export_module.get_inference_signatures({
"serve_text": "serving_default"
})
outputs = functions["serving_default"](tf.constant(["abcd", "ef gh"]))
self.assertEqual(outputs.shape, (2,))
self.assertEqual(outputs.dtype, tf.string)
if __name__ == "__main__": if __name__ == "__main__":
tf.test.main() tf.test.main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment