Unverified Commit f16a7b5b authored by vedanshu, committed by GitHub

Merge pull request #1 from tensorflow/master

new pull
parents 8e9296ff 8f58f396
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for Keras-based positional embedding layer."""
import numpy as np
import tensorflow as tf
from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import
from official.nlp.keras_nlp.layers import position_embedding
# This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
# guarantees forward compatibility of this code for the V2 switchover.
@keras_parameterized.run_all_keras_modes
class PositionEmbeddingLayerTest(keras_parameterized.TestCase):
def test_static_layer_output_shape(self):
# Create a 3-dimensional input (the first dimension is implicit).
sequence_length = 21
test_layer = position_embedding.PositionEmbedding(
max_length=sequence_length)
width = 30
input_tensor = tf.keras.Input(shape=(sequence_length, width))
output_tensor = test_layer(input_tensor)
# When using static positional embedding shapes, the output is expected
# to be the same as the input shape in all dimensions save batch.
expected_output_shape = [None, sequence_length, width]
self.assertEqual(expected_output_shape, output_tensor.shape.as_list())
# The default output dtype for this layer should be tf.float32.
self.assertEqual(tf.float32, output_tensor.dtype)
def test_float16_dtype(self):
# Create a 3-dimensional input (the first dimension is implicit).
sequence_length = 21
test_layer = position_embedding.PositionEmbedding(
max_length=sequence_length, dtype="float16")
width = 30
input_tensor = tf.keras.Input(shape=(sequence_length, width))
output_tensor = test_layer(input_tensor)
# When using static positional embedding shapes, the output is expected
# to be the same as the input shape in all dimensions save batch.
expected_output_shape = [None, sequence_length, width]
self.assertEqual(expected_output_shape, output_tensor.shape.as_list())
    # When the layer is constructed with dtype="float16", the output dtype
    # should be tf.float16.
self.assertEqual(tf.float16, output_tensor.dtype)
def test_dynamic_layer_output_shape(self):
max_sequence_length = 40
test_layer = position_embedding.PositionEmbedding(
max_length=max_sequence_length)
# Create a 3-dimensional input (the first dimension is implicit).
width = 30
input_tensor = tf.keras.Input(shape=(None, width))
output_tensor = test_layer(input_tensor)
# When using dynamic positional embedding shapes, the output is expected
# to be the same as the input shape in all dimensions - but may be None if
# the input shape is None there.
expected_output_shape = [None, None, width]
self.assertEqual(expected_output_shape, output_tensor.shape.as_list())
def test_dynamic_layer_slicing(self):
max_sequence_length = 40
test_layer = position_embedding.PositionEmbedding(
max_length=max_sequence_length)
# Create a 3-dimensional input (the first dimension is implicit).
width = 30
input_tensor = tf.keras.Input(shape=(None, width))
output_tensor = test_layer(input_tensor)
model = tf.keras.Model(input_tensor, output_tensor)
# Create input data that is shorter than max_sequence_length, which should
# trigger a down-slice.
input_length = 17
# Note: This test explicitly uses a batch size of 1. This is to get around
# Keras' restriction on Model invocations: inputs are expected to have the
# same batch cardinality as outputs. In practice, this layer should be used
# inside a model, where it can be projected when added to another tensor.
input_data = np.ones((1, input_length, width))
output_data = model.predict(input_data)
self.assertAllEqual([1, input_length, width], output_data.shape)
if __name__ == "__main__":
tf.test.main()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Keras layer that creates a self-attention mask."""
import tensorflow as tf
@tf.keras.utils.register_keras_serializable(package='keras_nlp')
class SelfAttentionMask(tf.keras.layers.Layer):
"""Create 3D attention mask from a 2D tensor mask.
inputs[0]: from_tensor: 2D or 3D Tensor of shape
[batch_size, from_seq_length, ...].
inputs[1]: to_mask: int32 Tensor of shape [batch_size, to_seq_length].
Returns:
float Tensor of shape [batch_size, from_seq_length, to_seq_length].
"""
def call(self, inputs, to_mask):
from_shape = tf.shape(inputs)
batch_size = from_shape[0]
from_seq_length = from_shape[1]
to_shape = tf.shape(to_mask)
to_seq_length = to_shape[1]
to_mask = tf.cast(
tf.reshape(to_mask, [batch_size, 1, to_seq_length]),
dtype=inputs.dtype)
    # We don't assume that `from_tensor` is a mask (although it could be). We
    # don't actually care if we attend *from* padding tokens (only *to* padding
    # tokens), so we create a tensor of all ones.
#
# `broadcast_ones` = [batch_size, from_seq_length, 1]
broadcast_ones = tf.ones(
shape=[batch_size, from_seq_length, 1], dtype=inputs.dtype)
# Here we broadcast along two dimensions to create the mask.
mask = broadcast_ones * to_mask
return mask
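# Illustrative usage sketch (not part of the original file): a minimal example,
# using made-up placeholder tensors, of how the layer broadcasts a
# [batch_size, to_seq_length] padding mask into the
# [batch_size, from_seq_length, to_seq_length] mask expected by attention layers.
if __name__ == "__main__":
  embeddings = tf.zeros([2, 5, 8])                  # [batch, from_seq_length, width]
  padding_mask = tf.constant([[1, 1, 1, 0, 0],
                              [1, 1, 0, 0, 0]])     # [batch, to_seq_length]
  attention_mask = SelfAttentionMask()(embeddings, padding_mask)
  print(attention_mask.shape)                       # (2, 5, 5)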
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Keras-based TransformerEncoder block layer."""
import tensorflow as tf
@tf.keras.utils.register_keras_serializable(package="keras_nlp")
class TransformerEncoderBlock(tf.keras.layers.Layer):
"""TransformerEncoderBlock layer.
  This layer implements the Transformer Encoder from
  "Attention Is All You Need" (https://arxiv.org/abs/1706.03762),
which combines a `tf.keras.layers.MultiHeadAttention` layer with a
two-layer feedforward network.
References:
[Attention Is All You Need](https://arxiv.org/abs/1706.03762)
[BERT: Pre-training of Deep Bidirectional Transformers for Language
Understanding](https://arxiv.org/abs/1810.04805)
"""
def __init__(self,
num_attention_heads,
inner_dim,
inner_activation,
output_range=None,
kernel_initializer="glorot_uniform",
bias_initializer="zeros",
kernel_regularizer=None,
bias_regularizer=None,
activity_regularizer=None,
kernel_constraint=None,
bias_constraint=None,
use_bias=True,
norm_first=False,
norm_epsilon=1e-12,
output_dropout=0.0,
attention_dropout=0.0,
inner_dropout=0.0,
attention_initializer=None,
**kwargs):
"""Initializes `TransformerEncoderBlock`.
Args:
num_attention_heads: Number of attention heads.
inner_dim: The output dimension of the first Dense layer in a two-layer
feedforward network.
inner_activation: The activation for the first Dense layer in a two-layer
feedforward network.
output_range: the sequence output range, [0, output_range) for slicing the
target sequence. `None` means the target sequence is not sliced.
kernel_initializer: Initializer for dense layer kernels.
bias_initializer: Initializer for dense layer biases.
kernel_regularizer: Regularizer for dense layer kernels.
bias_regularizer: Regularizer for dense layer biases.
activity_regularizer: Regularizer for dense layer activity.
kernel_constraint: Constraint for dense layer kernels.
      bias_constraint: Constraint for dense layer biases.
use_bias: Whether to enable use_bias in attention layer. If set False,
use_bias in attention layer is disabled.
norm_first: Whether to normalize inputs to attention and intermediate
dense layers. If set False, output of attention and intermediate dense
layers is normalized.
norm_epsilon: Epsilon value to initialize normalization layers.
output_dropout: Dropout probability for the post-attention and output
dropout.
attention_dropout: Dropout probability for within the attention layer.
inner_dropout: Dropout probability for the first Dense layer in a
two-layer feedforward network.
attention_initializer: Initializer for kernels of attention layers. If set
`None`, attention layers use kernel_initializer as initializer for
kernel.
      **kwargs: keyword arguments.
"""
super().__init__(**kwargs)
self._num_heads = num_attention_heads
self._inner_dim = inner_dim
self._inner_activation = inner_activation
self._attention_dropout = attention_dropout
self._output_dropout = output_dropout
self._output_range = output_range
self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
self._bias_initializer = tf.keras.initializers.get(bias_initializer)
self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer)
self._activity_regularizer = tf.keras.regularizers.get(activity_regularizer)
self._kernel_constraint = tf.keras.constraints.get(kernel_constraint)
self._bias_constraint = tf.keras.constraints.get(bias_constraint)
self._use_bias = use_bias
self._norm_first = norm_first
self._norm_epsilon = norm_epsilon
self._inner_dropout = inner_dropout
if attention_initializer:
self._attention_initializer = tf.keras.initializers.get(
attention_initializer)
else:
self._attention_initializer = self._kernel_initializer
def build(self, input_shape):
if isinstance(input_shape, tf.TensorShape):
input_tensor_shape = input_shape
elif isinstance(input_shape, (list, tuple)):
input_tensor_shape = tf.TensorShape(input_shape[0])
else:
raise ValueError(
"The type of input shape argument is not supported, got: %s" %
type(input_shape))
if len(input_tensor_shape.as_list()) != 3:
raise ValueError("TransformerEncoderBlock expects a three-dimensional "
"input of shape [batch, sequence, width].")
hidden_size = input_tensor_shape[-1]
if hidden_size % self._num_heads != 0:
raise ValueError(
"The input size (%d) is not a multiple of the number of attention "
"heads (%d)" % (hidden_size, self._num_heads))
self._attention_head_size = int(hidden_size // self._num_heads)
common_kwargs = dict(
bias_initializer=self._bias_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer,
activity_regularizer=self._activity_regularizer,
kernel_constraint=self._kernel_constraint,
bias_constraint=self._bias_constraint)
self._attention_layer = tf.keras.layers.MultiHeadAttention(
num_heads=self._num_heads,
key_dim=self._attention_head_size,
dropout=self._attention_dropout,
use_bias=self._use_bias,
kernel_initializer=self._attention_initializer,
name="self_attention",
**common_kwargs)
self._attention_dropout = tf.keras.layers.Dropout(rate=self._output_dropout)
# Use float32 in layernorm for numeric stability.
# It is probably safe in mixed_float16, but we haven't validated this yet.
self._attention_layer_norm = (
tf.keras.layers.LayerNormalization(
name="self_attention_layer_norm",
axis=-1,
epsilon=self._norm_epsilon,
dtype=tf.float32))
self._intermediate_dense = tf.keras.layers.experimental.EinsumDense(
"abc,cd->abd",
output_shape=(None, self._inner_dim),
bias_axes="d",
kernel_initializer=self._kernel_initializer,
name="intermediate",
**common_kwargs)
policy = tf.keras.mixed_precision.global_policy()
if policy.name == "mixed_bfloat16":
# bfloat16 causes BERT with the LAMB optimizer to not converge
# as well, so we use float32.
# TODO(b/154538392): Investigate this.
policy = tf.float32
self._intermediate_activation_layer = tf.keras.layers.Activation(
self._inner_activation, dtype=policy)
self._inner_dropout_layer = tf.keras.layers.Dropout(
rate=self._inner_dropout)
self._output_dense = tf.keras.layers.experimental.EinsumDense(
"abc,cd->abd",
output_shape=(None, hidden_size),
bias_axes="d",
name="output",
kernel_initializer=self._kernel_initializer,
**common_kwargs)
self._output_dropout = tf.keras.layers.Dropout(rate=self._output_dropout)
# Use float32 in layernorm for numeric stability.
self._output_layer_norm = tf.keras.layers.LayerNormalization(
name="output_layer_norm",
axis=-1,
epsilon=self._norm_epsilon,
dtype=tf.float32)
super(TransformerEncoderBlock, self).build(input_shape)
def get_config(self):
config = {
"num_attention_heads":
self._num_heads,
"inner_dim":
self._inner_dim,
"inner_activation":
self._inner_activation,
"output_dropout":
self._output_dropout,
"attention_dropout":
self._attention_dropout,
"output_range":
self._output_range,
"kernel_initializer":
tf.keras.initializers.serialize(self._kernel_initializer),
"bias_initializer":
tf.keras.initializers.serialize(self._bias_initializer),
"kernel_regularizer":
tf.keras.regularizers.serialize(self._kernel_regularizer),
"bias_regularizer":
tf.keras.regularizers.serialize(self._bias_regularizer),
"activity_regularizer":
tf.keras.regularizers.serialize(self._activity_regularizer),
"kernel_constraint":
tf.keras.constraints.serialize(self._kernel_constraint),
"bias_constraint":
tf.keras.constraints.serialize(self._bias_constraint),
"use_bias":
self._use_bias,
"norm_first":
self._norm_first,
"norm_epsilon":
self._norm_epsilon,
"inner_dropout":
self._inner_dropout,
"attention_initializer":
tf.keras.initializers.serialize(self._attention_initializer)
}
base_config = super(TransformerEncoderBlock, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, inputs):
"""Transformer self-attention encoder block call.
Args:
inputs: a single tensor or a list of tensors.
`input tensor` as the single sequence of embeddings.
[`input tensor`, `attention mask`] to have the additional attention
mask.
[`query tensor`, `key value tensor`, `attention mask`] to have separate
input streams for the query, and key/value to the multi-head
attention.
Returns:
      An output tensor with the same dimensions as input/query tensor.
"""
if isinstance(inputs, (list, tuple)):
if len(inputs) == 2:
input_tensor, attention_mask = inputs
key_value = None
elif len(inputs) == 3:
input_tensor, key_value, attention_mask = inputs
else:
raise ValueError("Unexpected inputs to %s with length at %d" %
(self.__class__, len(inputs)))
else:
input_tensor, key_value, attention_mask = (inputs, None, None)
if self._output_range:
if self._norm_first:
source_tensor = input_tensor[:, 0:self._output_range, :]
input_tensor = self._attention_layer_norm(input_tensor)
if key_value is not None:
key_value = self._attention_layer_norm(key_value)
target_tensor = input_tensor[:, 0:self._output_range, :]
if attention_mask is not None:
attention_mask = attention_mask[:, 0:self._output_range, :]
else:
if self._norm_first:
source_tensor = input_tensor
input_tensor = self._attention_layer_norm(input_tensor)
if key_value is not None:
key_value = self._attention_layer_norm(key_value)
target_tensor = input_tensor
if key_value is None:
key_value = input_tensor
attention_output = self._attention_layer(
query=target_tensor, value=key_value, attention_mask=attention_mask)
attention_output = self._attention_dropout(attention_output)
if self._norm_first:
attention_output = source_tensor + attention_output
else:
attention_output = self._attention_layer_norm(target_tensor +
attention_output)
if self._norm_first:
source_attention_output = attention_output
attention_output = self._output_layer_norm(attention_output)
inner_output = self._intermediate_dense(attention_output)
inner_output = self._intermediate_activation_layer(inner_output)
inner_output = self._inner_dropout_layer(inner_output)
layer_output = self._output_dense(inner_output)
layer_output = self._output_dropout(layer_output)
if self._norm_first:
return source_attention_output + layer_output
# During mixed precision training, layer norm output is always fp32 for now.
    # Cast layer_output to fp32 for the subsequent add.
layer_output = tf.cast(layer_output, tf.float32)
return self._output_layer_norm(layer_output + attention_output)
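# Illustrative usage sketch (not part of the original file): a minimal example,
# using dummy tensors, of calling the block on a single sequence and on
# separate query and key/value streams with an attention mask.
if __name__ == "__main__":
  block = TransformerEncoderBlock(
      num_attention_heads=2, inner_dim=32, inner_activation="relu")
  sequence = tf.zeros([2, 4, 16])                 # [batch, seq_length, width]
  print(block(sequence).shape)                    # (2, 4, 16)
  query = tf.zeros([2, 3, 16])                    # [batch, query_length, width]
  key_value = tf.zeros([2, 4, 16])                # [batch, kv_length, width]
  mask = tf.ones([2, 3, 4])                       # [batch, query_length, kv_length]
  print(block([query, key_value, mask]).shape)    # (2, 3, 16)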
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for Keras-based transformer block layer."""
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import
from official.nlp.keras_nlp.layers.transformer_encoder_block import TransformerEncoderBlock
@keras_parameterized.run_all_keras_modes
@parameterized.named_parameters(
('base', TransformerEncoderBlock))
class TransformerEncoderBlockLayerTest(keras_parameterized.TestCase):
def tearDown(self):
super(TransformerEncoderBlockLayerTest, self).tearDown()
tf.keras.mixed_precision.set_global_policy('float32')
def test_layer_creation(self, transformer_cls):
test_layer = transformer_cls(
num_attention_heads=10, inner_dim=2048, inner_activation='relu')
sequence_length = 21
width = 80
# Create a 3-dimensional input (the first dimension is implicit).
data_tensor = tf.keras.Input(shape=(sequence_length, width))
output_tensor = test_layer(data_tensor)
# The default output of a transformer layer should be the same as the input.
self.assertEqual(data_tensor.shape.as_list(), output_tensor.shape.as_list())
def test_layer_creation_with_mask(self, transformer_cls):
test_layer = transformer_cls(
num_attention_heads=10, inner_dim=2048, inner_activation='relu')
sequence_length = 21
width = 80
# Create a 3-dimensional input (the first dimension is implicit).
data_tensor = tf.keras.Input(shape=(sequence_length, width))
# Create a 2-dimensional input (the first dimension is implicit).
mask_tensor = tf.keras.Input(shape=(sequence_length, sequence_length))
output_tensor = test_layer([data_tensor, mask_tensor])
# The default output of a transformer layer should be the same as the input.
self.assertEqual(data_tensor.shape.as_list(), output_tensor.shape.as_list())
def test_layer_invocation(self, transformer_cls):
test_layer = transformer_cls(
num_attention_heads=10, inner_dim=2048, inner_activation='relu')
sequence_length = 21
width = 80
# Create a 3-dimensional input (the first dimension is implicit).
data_tensor = tf.keras.Input(shape=(sequence_length, width))
output_tensor = test_layer(data_tensor)
# Create a model from the test layer.
model = tf.keras.Model(data_tensor, output_tensor)
# Invoke the model on test data. We can't validate the output data itself
# (the NN is too complex) but this will rule out structural runtime errors.
batch_size = 6
input_data = 10 * np.random.random_sample(
(batch_size, sequence_length, width))
_ = model.predict(input_data)
def test_layer_invocation_with_mask(self, transformer_cls):
test_layer = transformer_cls(
num_attention_heads=10, inner_dim=2048, inner_activation='relu')
sequence_length = 21
width = 80
# Create a 3-dimensional input (the first dimension is implicit).
data_tensor = tf.keras.Input(shape=(sequence_length, width))
# Create a 2-dimensional input (the first dimension is implicit).
mask_tensor = tf.keras.Input(shape=(sequence_length, sequence_length))
output_tensor = test_layer([data_tensor, mask_tensor])
# Create a model from the test layer.
model = tf.keras.Model([data_tensor, mask_tensor], output_tensor)
# Invoke the model on test data. We can't validate the output data itself
# (the NN is too complex) but this will rule out structural runtime errors.
batch_size = 6
input_data = 10 * np.random.random_sample(
(batch_size, sequence_length, width))
# The attention mask should be of shape (batch, from_seq_len, to_seq_len),
# which here is (batch, sequence_length, sequence_length)
mask_data = np.random.randint(
2, size=(batch_size, sequence_length, sequence_length))
_ = model.predict([input_data, mask_data])
def test_layer_output_range(self, transformer_cls):
test_layer = transformer_cls(
num_attention_heads=10, inner_dim=2048, inner_activation='relu')
sequence_length = 21
width = 80
batch_size = 6
input_data = 10 * np.random.random_sample(
(batch_size, sequence_length, width))
mask_data = np.random.randint(
2, size=(batch_size, sequence_length, sequence_length))
output_tensor = test_layer([input_data, mask_data])
# The layer only attends to the first token and outputs the first token
# embedding.
new_layer = transformer_cls(
num_attention_heads=10,
inner_dim=2048,
inner_activation='relu',
output_range=1)
_ = new_layer([input_data, mask_data])
new_layer.set_weights(test_layer.get_weights())
new_output_tensor = new_layer([input_data, mask_data])
self.assertAllClose(
new_output_tensor, output_tensor[:, 0:1, :], atol=5e-5, rtol=0.003)
def test_layer_output_range_without_mask(self, transformer_cls):
test_layer = transformer_cls(
num_attention_heads=10, inner_dim=2048,
inner_activation='relu', norm_first=True)
sequence_length = 21
width = 80
batch_size = 6
input_data = 10 * np.random.random_sample(
(batch_size, sequence_length, width))
output_tensor = test_layer(input_data)
# The layer only attends to the first token and outputs the first token
# embedding.
new_layer = transformer_cls(
num_attention_heads=10,
inner_dim=2048,
inner_activation='relu',
output_range=1,
norm_first=True)
_ = new_layer(input_data)
new_layer.set_weights(test_layer.get_weights())
new_output_tensor = new_layer(input_data)
self.assertAllClose(
new_output_tensor, output_tensor[:, 0:1, :], atol=5e-5, rtol=0.003)
def test_layer_output_range_with_pre_norm(self, transformer_cls):
test_layer = transformer_cls(
num_attention_heads=10, inner_dim=2048,
inner_activation='relu', norm_first=True)
sequence_length = 21
width = 80
batch_size = 6
input_data = 10 * np.random.random_sample(
(batch_size, sequence_length, width))
mask_data = np.random.randint(
2, size=(batch_size, sequence_length, sequence_length))
output_tensor = test_layer([input_data, mask_data])
# The layer only attends to the first token and outputs the first token
# embedding.
new_layer = transformer_cls(
num_attention_heads=10,
inner_dim=2048,
inner_activation='relu',
output_range=1,
norm_first=True)
_ = new_layer([input_data, mask_data])
new_layer.set_weights(test_layer.get_weights())
new_output_tensor = new_layer([input_data, mask_data])
self.assertAllClose(
new_output_tensor, output_tensor[:, 0:1, :], atol=5e-5, rtol=0.003)
def test_layer_invocation_with_float16_dtype(self, transformer_cls):
tf.keras.mixed_precision.set_global_policy('mixed_float16')
test_layer = transformer_cls(
num_attention_heads=10, inner_dim=2048, inner_activation='relu')
sequence_length = 21
width = 80
# Create a 3-dimensional input (the first dimension is implicit).
data_tensor = tf.keras.Input(shape=(sequence_length, width))
# Create a 2-dimensional input (the first dimension is implicit).
mask_tensor = tf.keras.Input(shape=(sequence_length, sequence_length))
output_tensor = test_layer([data_tensor, mask_tensor])
# Create a model from the test layer.
model = tf.keras.Model([data_tensor, mask_tensor], output_tensor)
# Invoke the model on test data. We can't validate the output data itself
# (the NN is too complex) but this will rule out structural runtime errors.
batch_size = 6
input_data = (10 * np.random.random_sample(
(batch_size, sequence_length, width)))
# The attention mask should be of shape (batch, from_seq_len, to_seq_len),
# which here is (batch, sequence_length, sequence_length)
mask_data = np.random.randint(
2, size=(batch_size, sequence_length, sequence_length))
_ = model.predict([input_data, mask_data])
def test_transform_with_initializer(self, transformer_cls):
test_layer = transformer_cls(
num_attention_heads=10,
inner_dim=2048,
inner_activation='relu',
kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02))
sequence_length = 21
width = 80
# Create a 3-dimensional input (the first dimension is implicit).
data_tensor = tf.keras.Input(shape=(sequence_length, width))
output = test_layer(data_tensor)
# The default output of a transformer layer should be the same as the input.
self.assertEqual(data_tensor.shape.as_list(), output.shape.as_list())
def test_dynamic_layer_sequence(self, transformer_cls):
test_layer = transformer_cls(
num_attention_heads=10,
inner_dim=2048,
inner_activation='relu',
kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02))
# Create a 3-dimensional input (the first dimension is implicit).
width = 30
input_tensor = tf.keras.Input(shape=(None, width))
output_tensor = test_layer(input_tensor)
model = tf.keras.Model(input_tensor, output_tensor)
input_length = 17
input_data = np.ones((1, input_length, width))
output_data = model.predict(input_data)
self.assertAllEqual([1, input_length, width], output_data.shape)
def test_separate_qkv(self, transformer_cls):
test_layer = transformer_cls(
num_attention_heads=2,
inner_dim=128,
inner_activation='relu',
kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02))
# Forward path.
q_tensor = tf.zeros([2, 4, 16], dtype=tf.float32)
kv_tensor = tf.zeros([2, 8, 16], dtype=tf.float32)
dummy_mask = tf.zeros([2, 4, 8], dtype=tf.float32)
inputs = [q_tensor, kv_tensor, dummy_mask]
output = test_layer(inputs)
self.assertEqual(output.shape, q_tensor.shape)
@keras_parameterized.run_all_keras_modes
class TransformerArgumentTest(keras_parameterized.TestCase):
def test_use_bias_norm_first(self):
num_attention_heads = 2
hidden_size = 16
encoder_block = TransformerEncoderBlock(
num_attention_heads=num_attention_heads,
inner_dim=32,
inner_activation='relu',
output_dropout=0.1,
attention_dropout=0.1,
use_bias=False,
norm_first=True,
norm_epsilon=1e-6,
inner_dropout=0.1,
attention_initializer=tf.keras.initializers.RandomUniform(
minval=0., maxval=1.))
# Forward path.
dummy_tensor = tf.zeros([2, 4, 16], dtype=tf.float32)
dummy_mask = tf.zeros([2, 4, 4], dtype=tf.float32)
inputs = [dummy_tensor, dummy_mask]
output = encoder_block(inputs)
self.assertEqual(output.shape, (2, 4, hidden_size))
def test_get_config(self):
num_attention_heads = 2
encoder_block = TransformerEncoderBlock(
num_attention_heads=num_attention_heads,
inner_dim=32,
inner_activation='relu',
output_dropout=0.1,
attention_dropout=0.1,
use_bias=False,
norm_first=True,
norm_epsilon=1e-6,
inner_dropout=0.1,
attention_initializer=tf.keras.initializers.RandomUniform(
minval=0., maxval=1.))
encoder_block_config = encoder_block.get_config()
new_encoder_block = TransformerEncoderBlock.from_config(
encoder_block_config)
self.assertEqual(encoder_block_config, new_encoder_block.get_config())
if __name__ == '__main__':
tf.test.main()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Setup script."""
import os
from setuptools import find_packages
from setuptools import setup
version = '0.0.1'
def _get_requirements():
"""Parses requirements.txt file."""
install_requires_tmp = []
dependency_links_tmp = []
with open(
os.path.join(os.path.dirname(__file__), './requirements.txt'), 'r') as f:
for line in f:
package_name = line.strip()
# Skip empty line or comments starting with "#".
if not package_name or package_name[0] == '#':
continue
if package_name.startswith('-e '):
dependency_links_tmp.append(package_name[3:].strip())
else:
install_requires_tmp.append(package_name)
return install_requires_tmp, dependency_links_tmp
install_requires, dependency_links = _get_requirements()
install_requires.append('tf-nightly')
setup(
name='keras-nlp',
version=version,
description='Keras Natural Language Processing Library',
url='https://github.com/keras-team/keras-nlp',
author='The Keras authors',
author_email='keras-team@google.com',
license='Apache License 2.0',
install_requires=install_requires,
classifiers=[
'Programming Language :: Python',
'Programming Language :: Python :: 3.6',
'Operating System :: Unix',
'Operating System :: Microsoft :: Windows',
'Operating System :: MacOS',
'Intended Audience :: Science/Research',
'Topic :: Scientific/Engineering',
'Topic :: Software Development'
],
packages=find_packages(exclude=('tests',)),
exclude_package_data={'': ['*_test.py',],},
dependency_links=dependency_links,
python_requires='>=3.6',
)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Script to compute official BLEU score.
Source:
https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/bleu_hook.py
"""
import collections
import math
import re
import sys
import unicodedata
import numpy as np
import tensorflow as tf
class UnicodeRegex(object):
"""Ad-hoc hack to recognize all punctuation and symbols."""
def __init__(self):
punctuation = self.property_chars("P")
self.nondigit_punct_re = re.compile(r"([^\d])([" + punctuation + r"])")
self.punct_nondigit_re = re.compile(r"([" + punctuation + r"])([^\d])")
self.symbol_re = re.compile("([" + self.property_chars("S") + "])")
def property_chars(self, prefix):
return "".join(
chr(x)
for x in range(sys.maxunicode)
if unicodedata.category(chr(x)).startswith(prefix))
uregex = UnicodeRegex()
def bleu_tokenize(string):
r"""Tokenize a string following the official BLEU implementation.
  See https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/mteval-v14.pl#L954-L983
In our case, the input string is expected to be just one line
and no HTML entities de-escaping is needed.
So we just tokenize on punctuation and symbols,
  except when punctuation is preceded and followed by a digit
  (e.g. a comma/dot as a thousand/decimal separator).
  Note that a number (e.g. a year) followed by a dot at the end of a sentence
is NOT tokenized,
i.e. the dot stays with the number because `s/(\p{P})(\P{N})/ $1 $2/g`
does not match this case (unless we add a space after each sentence).
However, this error is already in the original mteval-v14.pl
and we want to be consistent with it.
Args:
string: the input string
Returns:
a list of tokens
"""
string = uregex.nondigit_punct_re.sub(r"\1 \2 ", string)
string = uregex.punct_nondigit_re.sub(r" \1 \2", string)
string = uregex.symbol_re.sub(r" \1 ", string)
return string.split()
def bleu_wrapper(ref_filename, hyp_filename, case_sensitive=False):
"""Compute BLEU for two files (reference and hypothesis translation)."""
ref_lines = tf.io.gfile.GFile(ref_filename).read().strip().splitlines()
hyp_lines = tf.io.gfile.GFile(hyp_filename).read().strip().splitlines()
return bleu_on_list(ref_lines, hyp_lines, case_sensitive)
def _get_ngrams_with_counter(segment, max_order):
"""Extracts all n-grams up to a given maximum order from an input segment.
Args:
segment: text segment from which n-grams will be extracted.
    max_order: maximum length in tokens of the n-grams returned by this
      method.
Returns:
    The Counter containing all n-grams up to max_order in segment
with a count of how many times each n-gram occurred.
"""
ngram_counts = collections.Counter()
for order in range(1, max_order + 1):
for i in range(0, len(segment) - order + 1):
ngram = tuple(segment[i:i + order])
ngram_counts[ngram] += 1
return ngram_counts
def compute_bleu(reference_corpus, translation_corpus, max_order=4,
use_bp=True):
"""Computes BLEU score of translated segments against one or more references.
Args:
reference_corpus: list of references for each translation. Each
reference should be tokenized into a list of tokens.
translation_corpus: list of translations to score. Each translation
should be tokenized into a list of tokens.
max_order: Maximum n-gram order to use when computing BLEU score.
use_bp: boolean, whether to apply brevity penalty.
Returns:
BLEU score.
"""
reference_length = 0
translation_length = 0
bp = 1.0
geo_mean = 0
matches_by_order = [0] * max_order
possible_matches_by_order = [0] * max_order
precisions = []
for (references, translations) in zip(reference_corpus, translation_corpus):
reference_length += len(references)
translation_length += len(translations)
ref_ngram_counts = _get_ngrams_with_counter(references, max_order)
translation_ngram_counts = _get_ngrams_with_counter(translations, max_order)
overlap = dict((ngram,
min(count, translation_ngram_counts[ngram]))
for ngram, count in ref_ngram_counts.items())
for ngram in overlap:
matches_by_order[len(ngram) - 1] += overlap[ngram]
for ngram in translation_ngram_counts:
possible_matches_by_order[len(ngram) - 1] += translation_ngram_counts[
ngram]
precisions = [0] * max_order
smooth = 1.0
for i in range(0, max_order):
if possible_matches_by_order[i] > 0:
precisions[i] = float(matches_by_order[i]) / possible_matches_by_order[i]
if matches_by_order[i] > 0:
precisions[i] = float(matches_by_order[i]) / possible_matches_by_order[
i]
else:
smooth *= 2
precisions[i] = 1.0 / (smooth * possible_matches_by_order[i])
else:
precisions[i] = 0.0
if max(precisions) > 0:
p_log_sum = sum(math.log(p) for p in precisions if p)
geo_mean = math.exp(p_log_sum / max_order)
if use_bp:
ratio = translation_length / reference_length
bp = math.exp(1 - 1. / ratio) if ratio < 1.0 else 1.0
bleu = geo_mean * bp
return np.float32(bleu)
def bleu_on_list(ref_lines, hyp_lines, case_sensitive=False):
"""Compute BLEU for two list of strings (reference and hypothesis)."""
if len(ref_lines) != len(hyp_lines):
raise ValueError(
"Reference and translation files have different number of "
"lines (%d VS %d). If training only a few steps (100-200), the "
"translation may be empty." % (len(ref_lines), len(hyp_lines)))
if not case_sensitive:
ref_lines = [x.lower() for x in ref_lines]
hyp_lines = [x.lower() for x in hyp_lines]
ref_tokens = [bleu_tokenize(x) for x in ref_lines]
hyp_tokens = [bleu_tokenize(x) for x in hyp_lines]
return compute_bleu(ref_tokens, hyp_tokens) * 100
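# Illustrative usage sketch (not part of the original file): a minimal example,
# using toy reference/hypothesis sentences, of the tokenizer and the
# corpus-level BLEU helpers above.
if __name__ == "__main__":
  print(bleu_tokenize("Test0, 1 two, 3"))  # ['Test0', ',', '1', 'two', ',', '3']
  refs = ["the cat sat on the mat .", "a quick brown fox ."]
  hyps = ["the cat sat on the mat .", "a quick brown fox ."]
  print(bleu_on_list(refs, hyps))          # 100.0 for identical corpora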
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Test functions in compute_blue.py."""
import tempfile
import tensorflow as tf
from official.nlp.metrics import bleu
class ComputeBleuTest(tf.test.TestCase):
def _create_temp_file(self, text):
temp_file = tempfile.NamedTemporaryFile(delete=False)
with tf.io.gfile.GFile(temp_file.name, "w") as w:
w.write(text)
return temp_file.name
def test_bleu_same(self):
ref = self._create_temp_file("test 1 two 3\nmore tests!")
hyp = self._create_temp_file("test 1 two 3\nmore tests!")
uncased_score = bleu.bleu_wrapper(ref, hyp, False)
cased_score = bleu.bleu_wrapper(ref, hyp, True)
self.assertEqual(100, uncased_score)
self.assertEqual(100, cased_score)
def test_bleu_same_different_case(self):
ref = self._create_temp_file("Test 1 two 3\nmore tests!")
hyp = self._create_temp_file("test 1 two 3\nMore tests!")
uncased_score = bleu.bleu_wrapper(ref, hyp, False)
cased_score = bleu.bleu_wrapper(ref, hyp, True)
self.assertEqual(100, uncased_score)
self.assertLess(cased_score, 100)
def test_bleu_different(self):
ref = self._create_temp_file("Testing\nmore tests!")
hyp = self._create_temp_file("Dog\nCat")
uncased_score = bleu.bleu_wrapper(ref, hyp, False)
cased_score = bleu.bleu_wrapper(ref, hyp, True)
self.assertLess(uncased_score, 100)
self.assertLess(cased_score, 100)
def test_bleu_tokenize(self):
s = "Test0, 1 two, 3"
tokenized = bleu.bleu_tokenize(s)
self.assertEqual(["Test0", ",", "1", "two", ",", "3"], tokenized)
def test_bleu_list(self):
ref = ["test 1 two 3", "more tests!"]
hyp = ["test 1 two 3", "More tests!"]
uncased_score = bleu.bleu_on_list(ref, hyp, False)
cased_score = bleu.bleu_on_list(ref, hyp, True)
self.assertEqual(uncased_score, 100)
self.assertLess(cased_score, 100)
if __name__ == "__main__":
tf.test.main()
# NLP Modeling Library
This library provides a set of Keras primitives (`tf.keras.Layer` and
`tf.keras.Model`) that can be assembled into transformer-based models.
They are flexible, validated, interoperable, and both TF1 and TF2 compatible.
* [`layers`](layers) are the fundamental building blocks for NLP models.
They can be used to assemble new `tf.keras` layers or models.
* [`networks`](networks) are combinations of `tf.keras` layers (and possibly
other networks). They are `tf.keras` models that would not be trained alone.
They encapsulate common network structures like a transformer encoder into an
easily handled object with a standardized configuration.
* [`models`](models) are combinations of `tf.keras` layers and models that can
be trained. Several pre-built canned models are provided to train encoder
networks. These models are intended as both convenience functions and canonical
examples.
* [`losses`](losses) contains common loss computation used in NLP tasks.
Please see the colab
for how to build transformer-based NLP models using the above primitives.
Besides the pre-defined primitives, it also provides scaffold classes to allow
easy experimentation with novel architectures, e.g., you don’t need to fork a
whole Transformer object to try a different kind of attention primitive,
for instance.
* [`TransformerScaffold`](layers/transformer_scaffold.py) implements the
Transformer from ["Attention Is All You Need"]
Please see the colab
(https://colab.sandbox.google.com/github/tensorflow/models/blob/master/official/colab/nlp/customize_encoder.ipynb)
for how to use scaffold classes to build novel architectures.
BERT and ALBERT models in this repo are implemented using this library.
Code examples can be found in the corresponding model folder.
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""NLP Modeling Library.
This library provides a set of Keras primitives (`tf.keras.Layer` and
`tf.keras.Model`) that can be assembled into transformer-based models.
They are flexible, validated, interoperable, and both TF1 and TF2 compatible.
"""
from official.nlp.modeling import layers
from official.nlp.modeling import losses
from official.nlp.modeling import models
from official.nlp.modeling import networks
# Layers
Layers are the fundamental building blocks for NLP models. They can be used to
assemble new `tf.keras` layers or models; a short usage sketch follows the
list below.
* [MultiHeadAttention](attention.py) implements an optionally masked attention
between query, key, value tensors as described in
* [CachedAttention](attention.py) implements an attention layer with cache
  used for auto-regressive decoding.
* [MatMulWithMargin](mat_mul_with_margin.py) implements a matrix
multiplication with margin layer used for training retrieval / ranking
tasks, as described in ["Improving Multilingual Sentence Embedding using
Bi-directional Dual Encoder with Additive Margin
Softmax"](https://www.ijcai.org/Proceedings/2019/0746.pdf).
* [MultiChannelAttention](multi_channel_attention.py) implements a variant of
multi-head attention which can be used to merge multiple streams for
cross-attentions.
described in
["Attention Is All You Need"](https://arxiv.org/abs/1706.03762).
* [TransformerDecoderBlock](transformer.py) TransformerDecoderBlock is made up
of self multi-head attention, cross multi-head attention and feedforward
network.
* [RandomFeatureGaussianProcess](gaussian_process.py) implements random
feature-based Gaussian process described in ["Random Features for
Large-Scale Kernel Machines"](https://people.eecs.berkeley.edu/~brecht/papers/07.rah.rec.nips.pdf).
* [ReZeroTransformer](rezero_transformer.py) implements Transformer with
ReZero described in
* [SelfAttentionMask](self_attention_mask.py) creates a 3D attention mask from
a 2D tensor mask.
* [SpectralNormalization](spectral_normalization.py) implements a `tf.keras.layers.Wrapper`
that applies spectral normalization regularization to a given layer. See
[Spectral Norm Regularization for Improving the Generalizability of
Deep Learning](https://arxiv.org/abs/1705.10941)
* [MaskedSoftmax](masked_softmax.py) implements a softmax with an optional
masking input. If no mask is provided to this layer, it performs a standard
softmax; however, if a mask tensor is applied (which should be 1 in
* [ClassificationHead](cls_head.py) A pooling head over a sequence of
embeddings, commonly used by classification tasks.
* [GaussianProcessClassificationHead](cls_head.py) A spectral-normalized
neural Gaussian process (SNGP)-based classification head as described in
["Simple and Principled Uncertainty Estimation with Deterministic Deep
Learning via Distance Awareness"](https://arxiv.org/abs/2006.10108).
* [GatedFeedforward](gated_feedforward.py) implements the gated linear layer
feedforward as described in
["GLU Variants Improve Transformer"](https://arxiv.org/abs/2002.05202).
* [MultiHeadRelativeAttention](relative_attention.py) implements a variant
of multi-head attention with support for relative position encodings as
described in "Transformer-XL: Attentive Language Models Beyond a
Fixed-Length Context"(https://arxiv.org/abs/1901.02860). This also has
extended support for segment-based attention, a re-parameterization
introduced in "XLNet: Generalized Autoregressive Pretraining for Language
Understanding" (https://arxiv.org/abs/1906.08237).
* [TwoStreamRelativeAttention](relative_attention.py) implements a variant
of multi-head relative attention as described in "XLNet: Generalized
Autoregressive Pretraining for Language Understanding"
(https://arxiv.org/abs/1906.08237). This takes in a query and content
stream and applies self attention.
* [TransformerXL](transformer_xl.py) implements Transformer XL introduced in
"Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context"
(https://arxiv.org/abs/1901.02860). This contains `TransformerXLBlock`, a
block containing either one or two stream relative self-attention as well as
subsequent feedforward networks. It also contains `TransformerXL`, which
contains attention biases as well as multiple `TransformerXLBlocks`.
* [MobileBertEmbedding](mobile_bert_layers.py) and
  [MobileBertTransformer](mobile_bert_layers.py) implement the embedding layer
  and the transformer layer proposed in the
[MobileBERT paper](https://arxiv.org/pdf/2004.02984.pdf).
* [BertPackInputs](text_layers.py),
  [BertTokenizer](text_layers.py), and [SentencepieceTokenizer](text_layers.py)
  implement layers to tokenize raw text and pack it into the inputs for
  BERT models.
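
A minimal, illustrative usage sketch of one of the layers catalogued above. The
shapes are arbitrary, and this assumes `SelfAttentionMask` is exported from
`official.nlp.modeling.layers` with the same `(inputs, to_mask)` call signature
as the `keras_nlp` version shown earlier in this commit:

```python
import tensorflow as tf
from official.nlp.modeling import layers

# Broadcast a 2D padding mask into the 3D attention mask consumed by the
# attention layers listed above.
embeddings = tf.keras.Input(shape=(8, 16))                 # [batch, seq, width]
padding_mask = tf.keras.Input(shape=(8,), dtype=tf.int32)  # [batch, seq]
attention_mask = layers.SelfAttentionMask()(embeddings, padding_mask)
model = tf.keras.Model([embeddings, padding_mask], attention_mask)
model.summary()
```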
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Layers are the fundamental building blocks for NLP models.
They can be used to assemble new `tf.keras` layers or models.
"""
# pylint: disable=wildcard-import
from official.nlp.modeling.layers.attention import *
from official.nlp.modeling.layers.cls_head import *
from official.nlp.modeling.layers.dense_einsum import DenseEinsum
from official.nlp.modeling.layers.gated_feedforward import GatedFeedforward
from official.nlp.modeling.layers.gaussian_process import RandomFeatureGaussianProcess
from official.nlp.modeling.layers.masked_lm import MaskedLM
from official.nlp.modeling.layers.masked_softmax import MaskedSoftmax
from official.nlp.modeling.layers.mat_mul_with_margin import MatMulWithMargin
from official.nlp.modeling.layers.mobile_bert_layers import MobileBertEmbedding
from official.nlp.modeling.layers.mobile_bert_layers import MobileBertMaskedLM
from official.nlp.modeling.layers.mobile_bert_layers import MobileBertTransformer
from official.nlp.modeling.layers.multi_channel_attention import *
from official.nlp.modeling.layers.on_device_embedding import OnDeviceEmbedding
from official.nlp.modeling.layers.position_embedding import PositionEmbedding
from official.nlp.modeling.layers.position_embedding import RelativePositionBias
from official.nlp.modeling.layers.position_embedding import RelativePositionEmbedding
from official.nlp.modeling.layers.relative_attention import MultiHeadRelativeAttention
from official.nlp.modeling.layers.relative_attention import TwoStreamRelativeAttention
from official.nlp.modeling.layers.rezero_transformer import ReZeroTransformer
from official.nlp.modeling.layers.self_attention_mask import SelfAttentionMask
from official.nlp.modeling.layers.spectral_normalization import *
from official.nlp.modeling.layers.talking_heads_attention import TalkingHeadsAttention
from official.nlp.modeling.layers.text_layers import BertPackInputs
from official.nlp.modeling.layers.text_layers import BertTokenizer
from official.nlp.modeling.layers.text_layers import SentencepieceTokenizer
from official.nlp.modeling.layers.tn_transformer_expand_condense import TNTransformerExpandCondense
from official.nlp.modeling.layers.transformer import *
from official.nlp.modeling.layers.transformer_scaffold import TransformerScaffold
from official.nlp.modeling.layers.transformer_xl import TransformerXL
from official.nlp.modeling.layers.transformer_xl import TransformerXLBlock
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Keras-based attention layer."""
# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import collections
import math
import string
import numpy as np
import tensorflow as tf
from official.nlp.modeling.layers import masked_softmax
EinsumDense = tf.keras.layers.experimental.EinsumDense
_CHR_IDX = string.ascii_lowercase
def _build_attention_equation(qkv_rank, attn_axes):
"""Builds einsum equations for the attention computation.
  Query, key, and value inputs after projection are expected to have the shape
  (bs, <non-attention dims>, <attention dims>, num_heads, channels).
  bs and <non-attention dims> are treated as <batch dims>.
The attention operations can be generalized:
(1) Query-key dot product:
(<batch dims>, <query attention dims>, num_heads, channels), (<batch dims>,
<key attention dims>, num_heads, channels) -> (<batch dims>,
num_heads, <query attention dims>, <key attention dims>)
(2) Combination:
(<batch dims>, num_heads, <query attention dims>, <key attention dims>),
(<batch dims>, <value attention dims>, num_heads, channels) -> (<batch dims>,
<query attention dims>, num_heads, channels)
Args:
qkv_rank: the rank of query, key, value tensors.
attn_axes: a list/tuple of axes, [1, rank), that will do attention.
Returns:
Einsum equations.
"""
target_notation = _CHR_IDX[:qkv_rank]
# `batch_dims` includes the head dim.
batch_dims = tuple(np.delete(range(qkv_rank), attn_axes + (qkv_rank - 1,)))
letter_offset = qkv_rank
source_notation = ""
for i in range(qkv_rank):
if i in batch_dims or i == qkv_rank - 1:
source_notation += target_notation[i]
else:
source_notation += _CHR_IDX[letter_offset]
letter_offset += 1
product_notation = "".join([target_notation[i] for i in batch_dims] +
[target_notation[i] for i in attn_axes] +
[source_notation[i] for i in attn_axes])
dot_product_equation = "%s,%s->%s" % (source_notation, target_notation,
product_notation)
attn_scores_rank = len(product_notation)
combine_equation = "%s,%s->%s" % (product_notation, source_notation,
target_notation)
return dot_product_equation, combine_equation, attn_scores_rank
def _build_proj_equation(free_dims, bound_dims, output_dims):
"""Builds an einsum equation for projections inside multi-head attention."""
input_str = ""
kernel_str = ""
output_str = ""
bias_axes = ""
letter_offset = 0
for i in range(free_dims):
char = _CHR_IDX[i + letter_offset]
input_str += char
output_str += char
letter_offset += free_dims
for i in range(bound_dims):
char = _CHR_IDX[i + letter_offset]
input_str += char
kernel_str += char
letter_offset += bound_dims
for i in range(output_dims):
char = _CHR_IDX[i + letter_offset]
kernel_str += char
output_str += char
bias_axes += char
equation = "%s,%s->%s" % (input_str, kernel_str, output_str)
return equation, bias_axes, len(output_str)
def _get_output_shape(output_rank, known_last_dims):
return [None] * (output_rank - len(known_last_dims)) + list(known_last_dims)
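# Illustrative note (not part of the original file): for the common case of a
# rank-4 projected query/key/value of shape (batch, seq, num_heads, channels)
# attending over axis 1, the helpers above produce
#   _build_attention_equation(4, (1,)) -> ("aecd,abcd->acbe", "acbe,aecd->abcd", 4)
#   _build_proj_equation(2, 1, 2)      -> ("abc,cde->abde", "de", 4)
#   _get_output_shape(3, [2, 8])       -> [None, 2, 8]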
@tf.keras.utils.register_keras_serializable(package="Text")
class MultiHeadAttention(tf.keras.layers.Layer):
"""MultiHeadAttention layer.
This is an implementation of multi-headed attention based on "Attention
is all you Need". If `query`, `key,` `value` are the same, then
this is self-attention. Each timestep in `query` attends to the
corresponding sequence in `key`, and returns a fixed-width vector.
This layer first projects `query`, `key` and `value`. These are
(effectively) a list of tensors of length `num_attention_heads`, where the
corresponding shapes are [batch_size, <query dimensions>, key_size],
[batch_size, <key/value dimensions>, key_size],
[batch_size, <key/value dimensions>, value_size].
Then, the query and key tensors are dot-producted and scaled. These are
softmaxed to obtain attention probabilities. The value tensors are then
interpolated by these probabilities, then concatenated back to a single
tensor.
  Finally, the result tensor with the last dimension as value_size can take a
  linear projection and be returned.
Examples:
Performs 1D cross-attention over two sequence inputs with an attention mask.
Returns the additional attention weights over heads.
>>> layer = MultiHeadAttention(num_heads=2, key_size=2,
... return_attention_scores=True)
>>> target = tf.keras.Input(shape=[8, 16])
>>> source = tf.keras.Input(shape=[4, 16])
>>> mask_tensor = tf.keras.Input(shape=[8, 4])
>>> output_tensor, weights = layer([target, source])
>>> print(output_tensor.shape), print(weights.shape)
(None, 8, 16) (None, 2, 8, 4)
Performs 2D self-attention over a 5D input tensor on axes 2 and 3.
>>> layer = MultiHeadAttention(num_heads=2, key_size=2, attention_axes=(2, 3))
>>> input_tensor = tf.keras.Input(shape=[5, 3, 4, 16])
>>> output_tensor = layer([input_tensor, input_tensor])
>>> print(output_tensor.shape)
(None, 5, 3, 4, 16)
Arguments:
num_heads: Number of attention heads.
key_size: Size of each attention head for query and key.
value_size: Size of each attention head for value.
dropout: Dropout probability.
use_bias: Boolean, whether the dense layers use bias vectors/matrices.
output_shape: The expected shape of an output tensor, besides the batch and
sequence dims. If not specified, projects back to the key feature dim.
    attention_axes: axes over which the attention is applied. `None` means
      attention over all axes except batch, heads, and features.
return_attention_scores: bool, if `True`, returns the multi-head
attention scores as an additional output argument.
kernel_initializer: Initializer for dense layer kernels.
bias_initializer: Initializer for dense layer biases.
kernel_regularizer: Regularizer for dense layer kernels.
bias_regularizer: Regularizer for dense layer biases.
activity_regularizer: Regularizer for dense layer activity.
kernel_constraint: Constraint for dense layer kernels.
    bias_constraint: Constraint for dense layer biases.
"""
def __init__(self,
num_heads,
key_size,
value_size=None,
dropout=0.0,
use_bias=True,
output_shape=None,
attention_axes=None,
return_attention_scores=False,
kernel_initializer="glorot_uniform",
bias_initializer="zeros",
kernel_regularizer=None,
bias_regularizer=None,
activity_regularizer=None,
kernel_constraint=None,
bias_constraint=None,
**kwargs):
super(MultiHeadAttention, self).__init__(**kwargs)
self._num_heads = num_heads
self._key_size = key_size
self._value_size = value_size if value_size else key_size
self._dropout = dropout
self._use_bias = use_bias
self._output_shape = output_shape
self._return_attention_scores = return_attention_scores
self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
self._bias_initializer = tf.keras.initializers.get(bias_initializer)
self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer)
self._kernel_constraint = tf.keras.constraints.get(kernel_constraint)
self._bias_constraint = tf.keras.constraints.get(bias_constraint)
if attention_axes is not None and not isinstance(attention_axes,
collections.abc.Sized):
self._attention_axes = (attention_axes,)
else:
self._attention_axes = attention_axes
def get_config(self):
config = {
"num_heads":
self._num_heads,
"key_size":
self._key_size,
"value_size":
self._value_size,
"dropout":
self._dropout,
"use_bias":
self._use_bias,
"output_shape":
self._output_shape,
"attention_axes":
self._attention_axes,
"return_attention_scores":
self._return_attention_scores,
"kernel_initializer":
tf.keras.initializers.serialize(self._kernel_initializer),
"bias_initializer":
tf.keras.initializers.serialize(self._bias_initializer),
"kernel_regularizer":
tf.keras.regularizers.serialize(self._kernel_regularizer),
"bias_regularizer":
tf.keras.regularizers.serialize(self._bias_regularizer),
"activity_regularizer":
tf.keras.regularizers.serialize(self._activity_regularizer),
"kernel_constraint":
tf.keras.constraints.serialize(self._kernel_constraint),
"bias_constraint":
tf.keras.constraints.serialize(self._bias_constraint)
}
base_config = super(MultiHeadAttention, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def build(self, input_shape):
inputs_len = len(input_shape)
if inputs_len > 3 or inputs_len < 2:
raise ValueError(
"Expects inputs list of length 2 or 3, namely [query, value] or "
"[query, value, key]. "
"Given length: %d" % inputs_len)
tensor_shapes = tf.nest.map_structure(tf.TensorShape, input_shape)
query_shape = tensor_shapes[0]
value_shape = tensor_shapes[1]
key_shape = tensor_shapes[2] if inputs_len == 3 else value_shape
common_kwargs = dict(
kernel_initializer=self._kernel_initializer,
bias_initializer=self._bias_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer,
activity_regularizer=self._activity_regularizer,
kernel_constraint=self._kernel_constraint,
bias_constraint=self._bias_constraint)
free_dims = query_shape.rank - 1
einsum_equation, bias_axes, output_rank = _build_proj_equation(
free_dims, bound_dims=1, output_dims=2)
self._query_dense = EinsumDense(
einsum_equation,
output_shape=_get_output_shape(output_rank - 1,
[self._num_heads, self._key_size]),
bias_axes=bias_axes if self._use_bias else None,
name="query",
**common_kwargs)
einsum_equation, bias_axes, output_rank = _build_proj_equation(
key_shape.rank - 1, bound_dims=1, output_dims=2)
self._key_dense = EinsumDense(
einsum_equation,
output_shape=_get_output_shape(output_rank - 1,
[self._num_heads, self._key_size]),
bias_axes=bias_axes if self._use_bias else None,
name="key",
**common_kwargs)
einsum_equation, bias_axes, output_rank = _build_proj_equation(
value_shape.rank - 1, bound_dims=1, output_dims=2)
self._value_dense = EinsumDense(
einsum_equation,
output_shape=_get_output_shape(output_rank - 1,
[self._num_heads, self._value_size]),
bias_axes=bias_axes if self._use_bias else None,
name="value",
**common_kwargs)
# Builds the attention computations for multi-head dot product attention.
# These computations could be wrapped into the keras attention layer once it
# supports multi-head einsum computations.
self._build_attention(output_rank)
if self._output_shape:
if not isinstance(self._output_shape, collections.abc.Sized):
output_shape = [self._output_shape]
else:
output_shape = self._output_shape
else:
output_shape = [query_shape[-1]]
einsum_equation, bias_axes, output_rank = _build_proj_equation(
free_dims, bound_dims=2, output_dims=len(output_shape))
self._output_dense = EinsumDense(
einsum_equation,
output_shape=_get_output_shape(output_rank - 1, output_shape),
bias_axes=bias_axes if self._use_bias else None,
name="attention_output",
**common_kwargs)
super(MultiHeadAttention, self).build(input_shape)
def _build_attention(self, qkv_rank):
"""Builds multi-head dot-product attention computations.
This function builds attributes necessary for `_compute_attention` to
customize the attention computation and replace the default dot-product
attention.
Args:
qkv_rank: the rank of query, key, value tensors.
"""
if self._attention_axes is None:
self._attention_axes = tuple(range(1, qkv_rank - 2))
else:
self._attention_axes = tuple(self._attention_axes)
self._dot_product_equation, self._combine_equation, attn_scores_rank = (
_build_attention_equation(qkv_rank, attn_axes=self._attention_axes))
norm_axes = tuple(
range(attn_scores_rank - len(self._attention_axes), attn_scores_rank))
self._masked_softmax = masked_softmax.MaskedSoftmax(
mask_expansion_axes=[1], normalization_axes=norm_axes)
self._dropout_layer = tf.keras.layers.Dropout(rate=self._dropout)
def _compute_attention(self,
query_tensor,
key_tensor,
value_tensor,
attention_mask=None):
"""Applies Dot-product attention with query, key, value tensors.
This function defines the computation inside `call` with projected
multi-head Q, K, V inputs. Users can override this function for customized
attention implementation.
Args:
query_tensor: Projected query `Tensor` of shape `[B, T, N, key_size]`.
key_tensor: Projected key `Tensor` of shape `[B, S, N, key_size]`.
value_tensor: Projected value `Tensor` of shape `[B, S, N, value_size]`.
attention_mask: a boolean mask of shape `[B, T, S]`, that prevents
attention to certain positions.
Returns:
attention_output: Multi-headed outputs of attention computation.
attention_scores: Multi-headed attention weights.
"""
# Take the dot product between "query" and "key" to get the raw
# attention scores.
attention_scores = tf.einsum(self._dot_product_equation, key_tensor,
query_tensor)
attention_scores = tf.multiply(attention_scores,
1.0 / math.sqrt(float(self._key_size)))
# Normalize the attention scores to probabilities.
# `attention_scores` = [B, N, T, S]
attention_scores = self._masked_softmax(attention_scores, attention_mask)
# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
attention_scores_dropout = self._dropout_layer(attention_scores)
# `context_layer` = [B, T, N, H]
attention_output = tf.einsum(self._combine_equation,
attention_scores_dropout, value_tensor)
return attention_output, attention_scores
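# Editor's note (illustration, not in the original source): for rank-4
# projected tensors [B, T, N, H] with attention over axis 1 (the default for
# 3D inputs), `_build_attention_equation` yields, under these assumptions,
# `dot_product_equation == "aecd,abcd->acbe"` (key, query -> scores [B, N, T, S])
# and `combine_equation == "acbe,aecd->abcd"` (scores, value -> [B, T, N, H]),
# so the two einsums above implement standard scaled dot-product attention.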
def call(self, inputs, attention_mask=None):
"""Implements the forward pass.
Size glossary:
* Number of heads (H): the number of attention heads.
* Value size (V): the size of each value embedding per head.
* Key size (K): the size of each key embedding per head. Equally, the size
of each query embedding per head. Typically K <= V.
* Batch dimensions (B).
* Query (target) attention axes shape (T).
* Value (source) attention axes shape (S), the rank must match the target.
Args:
inputs: List of the following tensors:
* query: Query `Tensor` of shape `[B, T, dim]`.
* value: Value `Tensor` of shape `[B, S, dim]`.
* key: Optional key `Tensor` of shape `[B, S, dim]`. If not given, will
use `value` for both `key` and `value`, which is the most common case.
attention_mask: a boolean mask of shape `[B, T, S]`, that prevents
attention to certain positions.
Returns:
attention_output: The result of the computation, of shape [B, T, E],
where `T` is for target sequence shapes and `E` is the query input last
dimension if `output_shape` is `None`. Otherwise, the multi-head outputs
are projected to the shape specified by `output_shape`.
attention_scores: [Optional] multi-head attention coefficients over the
attention axes.
"""
inputs_len = len(inputs)
if inputs_len > 3 or inputs_len < 2:
raise ValueError(
"Expects inputs list of length 2 or 3, namely [query, value] or "
"[query, value, key]. "
"Given length: %d" % inputs_len)
query = inputs[0]
value = inputs[1]
key = inputs[2] if inputs_len == 3 else value
# N = `num_attention_heads`
# H = `size_per_head`
# `query_tensor` = [B, T, N ,H]
query_tensor = self._query_dense(query)
# `key_tensor` = [B, S, N, H]
key_tensor = self._key_dense(key)
# `value_tensor` = [B, S, N, H]
value_tensor = self._value_dense(value)
attention_output, attention_scores = self._compute_attention(
query_tensor, key_tensor, value_tensor, attention_mask)
attention_output = self._output_dense(attention_output)
if self._return_attention_scores:
return attention_output, attention_scores
return attention_output
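# Editor's usage sketch (assumption: the list-style `call` API defined above):
# functional-API cross-attention with a padding mask, mirroring the docstring
# example and the unit tests further below.
_sketch_layer = MultiHeadAttention(num_heads=2, key_size=2)
_sketch_query = tf.keras.Input(shape=(8, 16))
_sketch_value = tf.keras.Input(shape=(4, 16))
_sketch_mask = tf.keras.Input(shape=(8, 4))
_sketch_output = _sketch_layer([_sketch_query, _sketch_value], _sketch_mask)
# _sketch_output.shape == (None, 8, 16)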
MultiHeadAttention = tf.keras.layers.MultiHeadAttention
@tf.keras.utils.register_keras_serializable(package="Text")
class CachedAttention(tf.keras.layers.MultiHeadAttention):
"""Attention layer with cache used for auto-regressive decoding.
Arguments are the same as `tf.keras.layers.MultiHeadAttention` layer.
"""
def _update_cache(self, key, value, cache, decode_loop_step):
"""Updates cache states and gets full-length key/value tensors."""
# Combines cached keys and values with new keys and values.
if decode_loop_step is not None:
# TPU special case.
key_seq_dim = cache["key"].shape.as_list()[1]
indices = tf.reshape(
tf.one_hot(decode_loop_step, key_seq_dim, dtype=key.dtype),
[1, key_seq_dim, 1, 1])
key = cache["key"] + key * indices
value_seq_dim = cache["value"].shape.as_list()[1]
indices = tf.reshape(
tf.one_hot(decode_loop_step, value_seq_dim, dtype=value.dtype),
[1, value_seq_dim, 1, 1])
value = cache["value"] + value * indices
else:
key = tf.concat([tf.cast(cache["key"], key.dtype), key], axis=1)
value = tf.concat([tf.cast(cache["value"], value.dtype), value], axis=1)
# Update cache
cache["key"] = key
cache["value"] = value
return key, value
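# Editor's note (not in the original source): `cache` is expected to be a dict
# with "key" and "value" entries of shape [batch, decode_length, num_heads,
# head_size]; see `_create_cache` in the test file further below for one way to
# build it. With `decode_loop_step` set (the TPU path above), the tensors are
# updated in place at that step; otherwise the new step is concatenated on axis 1.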
def call(self,
query,
value,
key=None,
attention_mask=None,
cache=None,
decode_loop_step=None,
return_attention_scores=False):
if not self._built_from_signature:
self._build_from_signature(query=query, value=value, key=key)
if key is None:
key = value
# Scalar dimensions referenced here:
# B = batch size (number of sequences)
......@@ -494,25 +73,23 @@ class CachedAttention(MultiHeadAttention):
# T = `to_tensor` sequence length
# N = `num_attention_heads`
# H = `size_per_head`
# `query` = [B, F, N, H]
query = self._query_dense(query)
# `key` = [B, T, N, H]
key = self._key_dense(key)
# `value` = [B, T, N, H]
value = self._value_dense(value)
if cache:
key, value = self._update_cache(key, value, cache, decode_loop_step)
query = tf.multiply(query, 1.0 / math.sqrt(float(self._key_dim)))
# Take the dot product between "query" and "key" to get the raw
# attention scores.
attention_scores = tf.einsum(self._dot_product_equation, key, query)
# Normalize the attention scores to probabilities.
# `attention_scores` = [B, N, F, T]
......@@ -523,8 +100,8 @@ class CachedAttention(MultiHeadAttention):
attention_scores = self._dropout_layer(attention_scores)
# `context_layer` = [B, F, N, H]
attention_output = tf.einsum(self._combine_equation, attention_scores,
value)
attention_output = self._output_dense(attention_output)
if return_attention_scores:
return attention_output, attention_scores, cache
return attention_output, cache
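# Editor's usage sketch (assumptions: the keyword-argument `call` API above and
# an initially empty cache that grows along axis 1 on each decoding step).
_num_heads, _head_size = 2, 2
_cache = {
"key": tf.zeros((3, 0, _num_heads, _head_size), dtype=tf.float32),
"value": tf.zeros((3, 0, _num_heads, _head_size), dtype=tf.float32),
}
_step_input = tf.zeros((3, 1, 8), dtype=tf.float32)
_cached_layer = CachedAttention(num_heads=_num_heads, key_dim=_head_size)
_step_output, _cache = _cached_layer(
query=_step_input, value=_step_input, cache=_cache)
# _step_output.shape == (3, 1, 8) and _cache["key"].shape == (3, 1, 2, 2).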
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -11,14 +11,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for the attention layer."""
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
......@@ -26,164 +21,6 @@ from tensorflow.python.keras import keras_parameterized # pylint: disable=g-dir
from official.nlp.modeling.layers import attention
# This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
# guarantees forward compatibility of this code for the V2 switchover.
@keras_parameterized.run_all_keras_modes
class MultiHeadAttentionTest(keras_parameterized.TestCase):
@parameterized.named_parameters(
("key_value_same_proj", None, None, [40, 80]),
("key_value_different_proj", 32, 60, [40, 60]),
)
def test_non_masked_attention(self, value_size, output_shape, output_dims):
"""Test that the attention layer can be created without a mask tensor."""
test_layer = attention.MultiHeadAttention(
num_heads=12,
key_size=64,
value_size=value_size,
output_shape=output_shape)
# Create a 3-dimensional input (the first dimension is implicit).
query = tf.keras.Input(shape=(40, 80))
value = tf.keras.Input(shape=(20, 80))
output = test_layer([query, value])
self.assertEqual(output.shape.as_list(), [None] + output_dims)
def test_non_masked_self_attention(self):
"""Test with one input (self-attenntion) and no mask tensor."""
test_layer = attention.MultiHeadAttention(num_heads=12, key_size=64)
# Create a 3-dimensional input (the first dimension is implicit).
query = tf.keras.Input(shape=(40, 80))
output = test_layer([query, query])
self.assertEqual(output.shape.as_list(), [None, 40, 80])
def test_attention_scores(self):
"""Test attention outputs with coefficients."""
test_layer = attention.MultiHeadAttention(
num_heads=12, key_size=64, return_attention_scores=True)
# Create a 3-dimensional input (the first dimension is implicit).
query = tf.keras.Input(shape=(40, 80))
output, coef = test_layer([query, query])
self.assertEqual(output.shape.as_list(), [None, 40, 80])
self.assertEqual(coef.shape.as_list(), [None, 12, 40, 40])
@parameterized.named_parameters(("with_bias", True), ("no_bias", False))
def test_masked_attention(self, use_bias):
"""Test with a mask tensor."""
test_layer = attention.MultiHeadAttention(
num_heads=2, key_size=2, use_bias=use_bias)
# Create a 3-dimensional input (the first dimension is implicit).
batch_size = 3
query = tf.keras.Input(shape=(4, 8))
value = tf.keras.Input(shape=(2, 8))
mask_tensor = tf.keras.Input(shape=(4, 2))
output = test_layer([query, value], mask_tensor)
# Create a model containing the test layer.
model = tf.keras.Model([query, value, mask_tensor], output)
# Generate data for the input (non-mask) tensors.
from_data = 10 * np.random.random_sample((batch_size, 4, 8))
to_data = 10 * np.random.random_sample((batch_size, 2, 8))
# Invoke the data with a random set of mask data. This should mask at least
# one element.
mask_data = np.random.randint(2, size=(batch_size, 4, 2))
masked_output_data = model.predict([from_data, to_data, mask_data])
# Invoke the same data, but with a null mask (where no elements are masked).
null_mask_data = np.ones((batch_size, 4, 2))
unmasked_output_data = model.predict([from_data, to_data, null_mask_data])
# Because one data is masked and one is not, the outputs should not be the
# same.
self.assertNotAllClose(masked_output_data, unmasked_output_data)
# Tests the layer with three inputs: Q, K, V.
key = tf.keras.Input(shape=(2, 8))
output = test_layer([query, value, key], mask_tensor)
model = tf.keras.Model([query, value, key, mask_tensor], output)
masked_output_data = model.predict([from_data, to_data, to_data, mask_data])
unmasked_output_data = model.predict(
[from_data, to_data, to_data, null_mask_data])
# Because one data is masked and one is not, the outputs should not be the
# same.
self.assertNotAllClose(masked_output_data, unmasked_output_data)
if use_bias:
self.assertLen(test_layer._query_dense.trainable_variables, 2)
self.assertLen(test_layer._output_dense.trainable_variables, 2)
else:
self.assertLen(test_layer._query_dense.trainable_variables, 1)
self.assertLen(test_layer._output_dense.trainable_variables, 1)
def test_initializer(self):
"""Test with a specified initializer."""
test_layer = attention.MultiHeadAttention(
num_heads=12,
key_size=64,
kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02))
# Create a 3-dimensional input (the first dimension is implicit).
query = tf.keras.Input(shape=(40, 80))
output = test_layer([query, query])
self.assertEqual(output.shape.as_list(), [None, 40, 80])
@parameterized.named_parameters(
("4d_inputs_one_free_batch", [3, 4], [3, 2], [4, 2], (2,)),
("4D_inputs_2D_attention", [3, 4], [3, 2], [3, 4, 3, 2], (1, 2)),
("5D_inputs_2D_attention", [5, 3, 4], [5, 3, 2], [3, 4, 3, 2], (2, 3)))
def test_high_dim_attention(self, q_dims, v_dims, mask_dims, attention_axes):
"""Test with a mask tensor."""
test_layer = attention.MultiHeadAttention(
num_heads=2, key_size=2, attention_axes=attention_axes)
batch_size, hidden_size = 3, 8
# Generate data for the input (non-mask) tensors.
query_shape = [batch_size] + q_dims + [hidden_size]
value_shape = [batch_size] + v_dims + [hidden_size]
mask_shape = [batch_size] + mask_dims
query = 10 * np.random.random_sample(query_shape)
value = 10 * np.random.random_sample(value_shape)
# Invoke the data with a random set of mask data. This should mask at least
# one element.
mask_data = np.random.randint(2, size=mask_shape).astype("bool")
output = test_layer([query, value], mask_data)
# Invoke the same data, but with a null mask (where no elements are masked).
null_mask_data = np.ones(mask_shape)
unmasked_output = test_layer([query, value], null_mask_data)
# Because one data is masked and one is not, the outputs should not be the
# same.
self.assertNotAllClose(output, unmasked_output)
class SubclassAttention(attention.MultiHeadAttention):
def _build_attention(self, qkv_rank):
pass
def _compute_attention(self,
query_tensor,
key_tensor,
value_tensor,
attention_mask=None):
return value_tensor, None
@keras_parameterized.run_all_keras_modes
class AttentionSubclassTest(keras_parameterized.TestCase):
def test_initializer(self):
"""Test with a specified initializer."""
test_layer = SubclassAttention(
num_heads=12,
key_size=64)
# Create a 3-dimensional input (the first dimension is implicit).
query = tf.keras.Input(shape=(40, 80))
output = test_layer([query, query])
self.assertEqual(output.shape.as_list(), [None, 40, 80])
def _create_cache(batch_size, init_decode_length, num_heads, head_size):
return {
"key":
......@@ -208,7 +45,7 @@ class CachedAttentionTest(keras_parameterized.TestCase):
init_decode_length = 0
# Directly tests the keras layer.
cache = _create_cache(batch_size, init_decode_length, num_heads, head_size)
layer = attention.CachedAttention(num_heads=num_heads, key_dim=head_size)
# Generate data for the input (non-mask) tensors.
from_data = tf.zeros((batch_size, from_seq_length, 8), dtype=np.float32)
......@@ -216,12 +53,14 @@ class CachedAttentionTest(keras_parameterized.TestCase):
# one element.
mask_data = np.random.randint(
2, size=(batch_size, from_seq_length, from_seq_length))
masked_output_data, cache = layer(
query=from_data, value=from_data, attention_mask=mask_data, cache=cache)
self.assertEqual(masked_output_data.shape, (3, 4, 8))
self.assertEqual(cache["value"].shape, (3, 4, 2, 2))
# Tests inputs without cache.
masked_output_data, cache = layer(
query=from_data, value=from_data, attention_mask=mask_data)
self.assertEqual(masked_output_data.shape, (3, 4, 8))
self.assertIsNone(cache)
......@@ -235,7 +74,7 @@ class CachedAttentionTest(keras_parameterized.TestCase):
# Directly tests the keras layer.
cache = _create_cache(batch_size, init_decode_length, num_heads, head_size)
layer = attention.CachedAttention(num_heads=num_heads, key_dim=head_size)
# Generate data for the input (non-mask) tensors.
from_data = tf.zeros((batch_size, from_seq_length, 8), dtype=np.float32)
......@@ -243,10 +82,12 @@ class CachedAttentionTest(keras_parameterized.TestCase):
mask_data = np.random.randint(
2, size=(batch_size, from_seq_length, from_seq_length), dtype=np.int32)
# Testing the invocation directly as Keras cannot consume inputs correctly.
masked_output_data, cache = layer(
query=from_data,
value=from_data,
attention_mask=mask_data,
cache=cache,
decode_loop_step=decode_loop_step)
self.assertEqual(masked_output_data.shape, (3, 4, 8))
self.assertEqual(cache["value"].shape, (3, 4, 2, 2))
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -12,17 +11,16 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""A classification head layer which is commonly used with sequence encoders."""
import tensorflow as tf
from official.modeling import tf_utils
from official.nlp.modeling.layers import gaussian_process
from official.nlp.modeling.layers import spectral_normalization
class ClassificationHead(tf.keras.layers.Layer):
"""Pooling head for sentence-level classification tasks."""
......@@ -38,7 +36,8 @@ class ClassificationHead(tf.keras.layers.Layer):
"""Initializes the `ClassificationHead`.
Args:
inner_dim: The dimensionality of inner projection layer. If 0 or `None`
then only the output projection layer is created.
num_classes: Number of output classes.
cls_token_idx: The index inside the sequence to pool.
activation: Dense layer activation.
......@@ -46,7 +45,7 @@ class ClassificationHead(tf.keras.layers.Layer):
initializer: Initializer for dense layer kernels.
**kwargs: Keyword arguments.
"""
super().__init__(**kwargs)
self.dropout_rate = dropout_rate
self.inner_dim = inner_dim
self.num_classes = num_classes
......@@ -54,24 +53,31 @@ class ClassificationHead(tf.keras.layers.Layer):
self.initializer = tf.keras.initializers.get(initializer)
self.cls_token_idx = cls_token_idx
if self.inner_dim:
self.dense = tf.keras.layers.Dense(
units=self.inner_dim,
activation=self.activation,
kernel_initializer=self.initializer,
name="pooler_dense")
self.dropout = tf.keras.layers.Dropout(rate=self.dropout_rate)
self.out_proj = tf.keras.layers.Dense(
units=num_classes, kernel_initializer=self.initializer, name="logits")
def call(self, features):
if not self.inner_dim:
x = features
else:
x = features[:, self.cls_token_idx, :]  # take <CLS> token.
x = self.dense(x)
x = self.dropout(x)
x = self.out_proj(x)
return x
def get_config(self):
config = {
"cls_token_idx": self.cls_token_idx,
"dropout_rate": self.dropout_rate,
"num_classes": self.num_classes,
"inner_dim": self.inner_dim,
......@@ -88,3 +94,241 @@ class ClassificationHead(tf.keras.layers.Layer):
@property
def checkpoint_items(self):
return {self.dense.name: self.dense}
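# Editor's usage sketch (not in the original file): pooling the <CLS> token of a
# [batch, seq_len, hidden] encoder output into 3-way logits.
_cls_head = ClassificationHead(inner_dim=64, num_classes=3, dropout_rate=0.1)
_cls_logits = _cls_head(tf.zeros((2, 10, 16), dtype=tf.float32))
# _cls_logits.shape == (2, 3); with inner_dim=0 the pooler dense layer is
# skipped and `features` is fed to the output projection directly.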
class MultiClsHeads(tf.keras.layers.Layer):
"""Pooling heads sharing the same pooling stem."""
def __init__(self,
inner_dim,
cls_list,
cls_token_idx=0,
activation="tanh",
dropout_rate=0.0,
initializer="glorot_uniform",
**kwargs):
"""Initializes the `MultiClsHeads`.
Args:
inner_dim: The dimensionality of inner projection layer. If 0 or `None`
then only the output projection layer is created.
cls_list: a list of pairs of (classification problem name, number of
classes).
cls_token_idx: The index inside the sequence to pool.
activation: Dense layer activation.
dropout_rate: Dropout probability.
initializer: Initializer for dense layer kernels.
**kwargs: Keyword arguments.
"""
super().__init__(**kwargs)
self.dropout_rate = dropout_rate
self.inner_dim = inner_dim
self.cls_list = cls_list
self.activation = tf_utils.get_activation(activation)
self.initializer = tf.keras.initializers.get(initializer)
self.cls_token_idx = cls_token_idx
if self.inner_dim:
self.dense = tf.keras.layers.Dense(
units=inner_dim,
activation=self.activation,
kernel_initializer=self.initializer,
name="pooler_dense")
self.dropout = tf.keras.layers.Dropout(rate=self.dropout_rate)
self.out_projs = []
for name, num_classes in cls_list:
self.out_projs.append(
tf.keras.layers.Dense(
units=num_classes, kernel_initializer=self.initializer,
name=name))
def call(self, features):
if not self.inner_dim:
x = features
else:
x = features[:, self.cls_token_idx, :] # take <CLS> token.
x = self.dense(x)
x = self.dropout(x)
outputs = {}
for proj_layer in self.out_projs:
outputs[proj_layer.name] = proj_layer(x)
return outputs
def get_config(self):
config = {
"dropout_rate": self.dropout_rate,
"cls_token_idx": self.cls_token_idx,
"cls_list": self.cls_list,
"inner_dim": self.inner_dim,
"activation": tf.keras.activations.serialize(self.activation),
"initializer": tf.keras.initializers.serialize(self.initializer),
}
config.update(super().get_config())
return config
@classmethod
def from_config(cls, config, custom_objects=None):
return cls(**config)
@property
def checkpoint_items(self):
items = {self.dense.name: self.dense}
items.update({v.name: v for v in self.out_projs})
return items
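# Editor's usage sketch: two classification problems sharing one pooling stem.
_multi_heads = MultiClsHeads(inner_dim=64, cls_list=[("foo", 2), ("bar", 3)])
_multi_outputs = _multi_heads(tf.zeros((2, 10, 16), dtype=tf.float32))
# _multi_outputs["foo"].shape == (2, 2) and _multi_outputs["bar"].shape == (2, 3).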
class GaussianProcessClassificationHead(ClassificationHead):
"""Gaussian process-based pooling head for sentence classification.
This class implements a classifier head for the BERT encoder that is based on
the spectral-normalized neural Gaussian process (SNGP) [1]. SNGP is a simple
method to improve a neural network's uncertainty quantification ability
without sacrificing accuracy or latency. It applies spectral normalization to
the hidden pooler layer, and then replaces the dense output layer with a
Gaussian process.
[1]: Jeremiah Liu et al. Simple and Principled Uncertainty Estimation with
Deterministic Deep Learning via Distance Awareness.
In _Neural Information Processing Systems_, 2020.
https://arxiv.org/abs/2006.10108
"""
def __init__(self,
inner_dim,
num_classes,
cls_token_idx=0,
activation="tanh",
dropout_rate=0.0,
initializer="glorot_uniform",
use_spec_norm=True,
use_gp_layer=True,
temperature=None,
**kwargs):
"""Initializes the `GaussianProcessClassificationHead`.
Args:
inner_dim: The dimensionality of inner projection layer. If 0 or `None`
then only the output projection layer is created.
num_classes: Number of output classes.
cls_token_idx: The index inside the sequence to pool.
activation: Dense layer activation.
dropout_rate: Dropout probability.
initializer: Initializer for dense layer kernels.
use_spec_norm: Whether to apply spectral normalization to pooler layer.
use_gp_layer: Whether to use Gaussian process as the output layer.
temperature: The temperature parameter to be used for mean-field
approximation during inference. If None then no mean-field adjustment is
applied.
**kwargs: Additional keyword arguments.
"""
# Collects spectral normalization and Gaussian process args from kwargs.
self.use_spec_norm = use_spec_norm
self.use_gp_layer = use_gp_layer
self.spec_norm_kwargs = extract_spec_norm_kwargs(kwargs)
self.gp_layer_kwargs = extract_gp_layer_kwargs(kwargs)
self.temperature = temperature
super().__init__(
inner_dim=inner_dim,
num_classes=num_classes,
cls_token_idx=cls_token_idx,
activation=activation,
dropout_rate=dropout_rate,
initializer=initializer,
**kwargs)
# Applies spectral normalization to the dense pooler layer.
if self.use_spec_norm and hasattr(self, "dense"):
self.dense = spectral_normalization.SpectralNormalization(
self.dense, inhere_layer_name=True, **self.spec_norm_kwargs)
# Replace Dense output layer with the Gaussian process layer.
if use_gp_layer:
self.out_proj = gaussian_process.RandomFeatureGaussianProcess(
self.num_classes,
kernel_initializer=self.initializer,
name="logits",
**self.gp_layer_kwargs)
def call(self, features, training=False, return_covmat=False):
"""Returns model output.
During training, the model returns raw logits. During evaluation, the model
returns uncertainty-adjusted logits, and (optionally) the covariance matrix.
Arguments:
features: A tensor of input features, shape (batch_size, feature_dim).
training: Whether the model is in training mode.
return_covmat: Whether the model should also return covariance matrix if
`use_gp_layer=True`. During training, it is recommended to set
`return_covmat=False` to be compatible with the standard Keras pipelines
(e.g., `model.fit()`).
Returns:
logits: Uncertainty-adjusted predictive logits, shape
(batch_size, num_classes).
covmat: (Optional) Covariance matrix, shape (batch_size, batch_size).
Returned only when return_covmat=True.
"""
logits = super().call(features)
# Extracts logits and covariance matrix from model output.
if self.use_gp_layer:
logits, covmat = logits
else:
covmat = None
# Computes the uncertainty-adjusted logits during evaluation.
if not training:
logits = gaussian_process.mean_field_logits(
logits, covmat, mean_field_factor=self.temperature)
if return_covmat and covmat is not None:
return logits, covmat
return logits
def reset_covariance_matrix(self):
"""Resets covariance matrix of the Gaussian process layer."""
if hasattr(self.out_proj, "reset_covariance_matrix"):
self.out_proj.reset_covariance_matrix()
def get_config(self):
config = dict(
use_spec_norm=self.use_spec_norm, use_gp_layer=self.use_gp_layer)
config.update(self.spec_norm_kwargs)
config.update(self.gp_layer_kwargs)
config["temperature"] = self.temperature
config.update(super(GaussianProcessClassificationHead, self).get_config())
return config
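# Editor's usage sketch (assumes the SNGP defaults above): raw logits during
# training, mean-field adjusted logits (and optionally the covariance matrix)
# during evaluation; covariance statistics are typically reset between epochs.
_gp_head = GaussianProcessClassificationHead(
inner_dim=64, num_classes=2, temperature=1.5)
_gp_features = tf.zeros((4, 10, 16), dtype=tf.float32)
_train_logits = _gp_head(_gp_features, training=True)  # raw logits, shape (4, 2)
_gp_head.reset_covariance_matrix()  # e.g. at the start of a new epoch
_eval_logits, _covmat = _gp_head(_gp_features, return_covmat=True)  # (4, 2), (4, 4)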
def extract_gp_layer_kwargs(kwargs):
"""Extracts Gaussian process layer configs from a given kwarg."""
return dict(
num_inducing=kwargs.pop("num_inducing", 1024),
normalize_input=kwargs.pop("normalize_input", True),
gp_cov_momentum=kwargs.pop("gp_cov_momentum", 0.999),
gp_cov_ridge_penalty=kwargs.pop("gp_cov_ridge_penalty", 1.),
scale_random_features=kwargs.pop("scale_random_features", False),
l2_regularization=kwargs.pop("l2_regularization", 1e-6),
gp_cov_likelihood=kwargs.pop("gp_cov_likelihood", "gaussian"),
return_gp_cov=kwargs.pop("return_gp_cov", True),
return_random_features=kwargs.pop("return_random_features", False),
use_custom_random_features=kwargs.pop("use_custom_random_features", True),
custom_random_features_initializer=kwargs.pop(
"custom_random_features_initializer", "random_normal"),
custom_random_features_activation=kwargs.pop(
"custom_random_features_activation", None))
def extract_spec_norm_kwargs(kwargs):
"""Extracts spectral normalization configs from a given kwarg."""
return dict(
iteration=kwargs.pop("iteration", 1),
norm_multiplier=kwargs.pop("norm_multiplier", .99))
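# Editor's note: both extractors above `pop` their options out of the head's
# **kwargs before the remainder is forwarded to `ClassificationHead.__init__`,
# which is why options such as `norm_multiplier` and `num_inducing` can be
# passed directly to `GaussianProcessClassificationHead(...)`.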
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -12,15 +11,26 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for cls_head."""
from absl.testing import parameterized
import tensorflow as tf
from official.nlp.modeling.layers import cls_head
class ClassificationHeadTest(tf.test.TestCase, parameterized.TestCase):
@parameterized.named_parameters(("no_pooler_layer", 0, 2),
("has_pooler_layer", 5, 4))
def test_pooler_layer(self, inner_dim, num_weights_expected):
test_layer = cls_head.ClassificationHead(inner_dim=inner_dim, num_classes=2)
features = tf.zeros(shape=(2, 10, 10), dtype=tf.float32)
_ = test_layer(features)
num_weights_observed = len(test_layer.get_weights())
self.assertEqual(num_weights_observed, num_weights_expected)
def test_layer_invocation(self):
test_layer = cls_head.ClassificationHead(inner_dim=5, num_classes=2)
......@@ -38,5 +48,151 @@ class ClassificationHead(tf.test.TestCase):
self.assertAllEqual(layer.get_config(), new_layer.get_config())
class MultiClsHeadsTest(tf.test.TestCase, parameterized.TestCase):
@parameterized.named_parameters(("no_pooler_layer", 0, 4),
("has_pooler_layer", 5, 6))
def test_pooler_layer(self, inner_dim, num_weights_expected):
cls_list = [("foo", 2), ("bar", 3)]
test_layer = cls_head.MultiClsHeads(inner_dim=inner_dim, cls_list=cls_list)
features = tf.zeros(shape=(2, 10, 10), dtype=tf.float32)
_ = test_layer(features)
num_weights_observed = len(test_layer.get_weights())
self.assertEqual(num_weights_observed, num_weights_expected)
def test_layer_invocation(self):
cls_list = [("foo", 2), ("bar", 3)]
test_layer = cls_head.MultiClsHeads(inner_dim=5, cls_list=cls_list)
features = tf.zeros(shape=(2, 10, 10), dtype=tf.float32)
outputs = test_layer(features)
self.assertAllClose(outputs["foo"], [[0., 0.], [0., 0.]])
self.assertAllClose(outputs["bar"], [[0., 0., 0.], [0., 0., 0.]])
self.assertSameElements(test_layer.checkpoint_items.keys(),
["pooler_dense", "foo", "bar"])
def test_layer_serialization(self):
cls_list = [("foo", 2), ("bar", 3)]
test_layer = cls_head.MultiClsHeads(inner_dim=5, cls_list=cls_list)
new_layer = cls_head.MultiClsHeads.from_config(test_layer.get_config())
# If the serialization was successful, the new config should match the old.
self.assertAllEqual(test_layer.get_config(), new_layer.get_config())
class GaussianProcessClassificationHead(tf.test.TestCase,
parameterized.TestCase):
def setUp(self):
super().setUp()
self.spec_norm_kwargs = dict(norm_multiplier=1.,)
self.gp_layer_kwargs = dict(num_inducing=512)
@parameterized.named_parameters(("no_pooler_layer", 0, 7),
("has_pooler_layer", 5, 11))
def test_pooler_layer(self, inner_dim, num_weights_expected):
test_layer = cls_head.GaussianProcessClassificationHead(
inner_dim=inner_dim,
num_classes=2,
use_spec_norm=True,
use_gp_layer=True,
initializer="zeros",
**self.spec_norm_kwargs,
**self.gp_layer_kwargs)
features = tf.zeros(shape=(2, 10, 10), dtype=tf.float32)
_ = test_layer(features)
num_weights_observed = len(test_layer.get_weights())
self.assertEqual(num_weights_observed, num_weights_expected)
def test_layer_invocation(self):
test_layer = cls_head.GaussianProcessClassificationHead(
inner_dim=5,
num_classes=2,
use_spec_norm=True,
use_gp_layer=True,
initializer="zeros",
**self.spec_norm_kwargs,
**self.gp_layer_kwargs)
features = tf.zeros(shape=(2, 10, 10), dtype=tf.float32)
output = test_layer(features)
self.assertAllClose(output, [[0., 0.], [0., 0.]])
self.assertSameElements(test_layer.checkpoint_items.keys(),
["pooler_dense"])
@parameterized.named_parameters(
("gp_layer_with_covmat", True, True),
("gp_layer_no_covmat", True, False),
("dense_layer_with_covmat", False, True),
("dense_layer_no_covmat", False, False))
def test_sngp_output_shape(self, use_gp_layer, return_covmat):
batch_size = 32
num_classes = 2
test_layer = cls_head.GaussianProcessClassificationHead(
inner_dim=5,
num_classes=num_classes,
use_spec_norm=True,
use_gp_layer=use_gp_layer,
**self.spec_norm_kwargs,
**self.gp_layer_kwargs)
features = tf.zeros(shape=(batch_size, 10, 10), dtype=tf.float32)
outputs = test_layer(features, return_covmat=return_covmat)
if use_gp_layer and return_covmat:
self.assertIsInstance(outputs, tuple)
self.assertEqual(outputs[0].shape, (batch_size, num_classes))
self.assertEqual(outputs[1].shape, (batch_size, batch_size))
else:
self.assertIsInstance(outputs, tf.Tensor)
self.assertEqual(outputs.shape, (batch_size, num_classes))
def test_sngp_train_logits(self):
"""Checks if temperature scaling is disabled during training."""
features = tf.zeros(shape=(5, 10, 10), dtype=tf.float32)
gp_layer = cls_head.GaussianProcessClassificationHead(
inner_dim=5, num_classes=2)
# Without temperature.
gp_layer.temperature = None
outputs_no_temp = gp_layer(features, training=True)
# With temperature.
gp_layer.temperature = 10.
outputs_with_temp = gp_layer(features, training=True)
self.assertAllEqual(outputs_no_temp, outputs_with_temp)
def test_layer_serialization(self):
layer = cls_head.GaussianProcessClassificationHead(
inner_dim=5,
num_classes=2,
use_spec_norm=True,
use_gp_layer=True,
**self.spec_norm_kwargs,
**self.gp_layer_kwargs)
new_layer = cls_head.GaussianProcessClassificationHead.from_config(
layer.get_config())
# If the serialization was successful, the new config should match the old.
self.assertAllEqual(layer.get_config(), new_layer.get_config())
def test_sngp_kwargs_serialization(self):
"""Tests if SNGP-specific kwargs are added during serialization."""
layer = cls_head.GaussianProcessClassificationHead(
inner_dim=5,
num_classes=2,
use_spec_norm=True,
use_gp_layer=True,
**self.spec_norm_kwargs,
**self.gp_layer_kwargs)
layer_config = layer.get_config()
# The config values should equal those defined in setUp().
self.assertEqual(layer_config["norm_multiplier"], 1.)
self.assertEqual(layer_config["num_inducing"], 512)
if __name__ == "__main__":
tf.test.main()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -11,13 +11,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Keras-based einsum layer."""
# pylint: disable=g-classes-have-attributes
import tensorflow as tf
......@@ -28,11 +24,11 @@ _CHR_IDX = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m"]
@tf.keras.utils.register_keras_serializable(package="Text")
class DenseEinsum(tf.keras.layers.Layer):
"""A densely connected layer that uses tf.einsum as the backing computation.
"""A densely connected layer that uses `tf.einsum` as the backing computation.
This layer can perform einsum calculations of arbitrary dimensionality.
Args:
output_shape: Positive integer or tuple, dimensionality of the output space.
num_summed_dimensions: The number of dimensions to sum over. Standard 2D
matmul should use 1, 3D matmul should use 2, and so forth.
......@@ -59,9 +55,8 @@ class DenseEinsum(tf.keras.layers.Layer):
`(batch_size, units)`.
"""
@deprecation.deprecated(None, "DenseEinsum is deprecated. Please use "
"tf.keras.experimental.EinsumDense layer instead.")
def __init__(self,
output_shape,
num_summed_dimensions=1,
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -11,12 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for Keras-based einsum layer."""
import numpy as np
import tensorflow as tf
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -11,13 +11,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Keras-based gated feedforward layer."""
# pylint: disable=g-classes-have-attributes
import gin
import tensorflow as tf
......@@ -32,22 +28,22 @@ class GatedFeedforward(tf.keras.layers.Layer):
(https://arxiv.org/abs/2002.05202). In addition, it allows stacking
multiple feedforward blocks and specifying the position of the dropout layer.
Args:
intermediate_size: Size of the intermediate layer.
intermediate_activation: Activation for the intermediate layer.
dropout: Dropout probability for the output dropout.
use_gate: Whether to use gated linear units. If True, assuming `GELU` as the
activation and omitting bias, will apply
`GEGLU(x, W, V, W_2) = (GEGLU(xW) * xV)W2`; if False, will follow
"Attention Is All You Need" (https://arxiv.org/abs/1706.03762) paper and
apply `FFN(x, W, W_2) = GELU(xW_1)W_2.`
num_blocks: The number of feedforward blocks to stack. Each block contains a
(gated) linear layer and a fully connected layer followed by dropout,
layer norm and residual.
dropout_position: Where to apply the dropout, the value can be either
`before_residual` or `after_residual`. If `before_residual`, will apply
`layer_output = layer_norm(dropout(layer_output) + layer_input)`; if
`after_residual`, will apply
`layer_output = dropout(layer_norm(layer_output + layer_input))`.
kernel_initializer: Initializer for dense layer kernels.
bias_initializer: Initializer for dense layer biases.
......@@ -63,6 +59,7 @@ class GatedFeedforward(tf.keras.layers.Layer):
intermediate_activation,
dropout,
use_gate=True,
apply_output_layer_norm=True,
num_blocks=1,
dropout_position="before_residual",
kernel_initializer="glorot_uniform",
......@@ -79,6 +76,7 @@ class GatedFeedforward(tf.keras.layers.Layer):
self._dropout = dropout
self._use_gate = use_gate
self._num_blocks = num_blocks
self._apply_output_layer_norm = apply_output_layer_norm
self._dropout_position = dropout_position
if self._dropout_position not in ("before_residual", "after_residual"):
raise ValueError(
......@@ -110,7 +108,7 @@ class GatedFeedforward(tf.keras.layers.Layer):
self._output_dense = []
self._output_dropout = []
self._output_layer_norm = []
activation_policy = tf.keras.mixed_precision.global_policy()
if activation_policy.name == "mixed_bfloat16":
# bfloat16 causes BERT with the LAMB optimizer to not converge
# as well, so we use float32.
......@@ -124,8 +122,9 @@ class GatedFeedforward(tf.keras.layers.Layer):
bias_axes="d",
name="intermediate_%d" % i,
**common_kwargs))
self._intermediate_activation_layers.append(
tf.keras.layers.Activation(
self._intermediate_activation, dtype=activation_policy))
if self._use_gate:
self._gate_dense.append(
tf.keras.layers.experimental.EinsumDense(
......@@ -141,15 +140,15 @@ class GatedFeedforward(tf.keras.layers.Layer):
bias_axes="d",
name="output_%d" % i,
**common_kwargs))
self._output_dropout.append(tf.keras.layers.Dropout(rate=self._dropout))
# Use float32 in layernorm for numeric stability.
if self._apply_output_layer_norm:
self._output_layer_norm.append(
tf.keras.layers.LayerNormalization(
name="output_layer_norm_%d" % i,
axis=-1,
epsilon=1e-12,
dtype=tf.float32))
def get_config(self):
config = {
......@@ -203,7 +202,8 @@ class GatedFeedforward(tf.keras.layers.Layer):
# add.
if layer_input.dtype == tf.float32:
layer_output = tf.cast(layer_output, tf.float32)
if self._apply_output_layer_norm:
layer_output = self._output_layer_norm[i](layer_output + layer_input)
if self._dropout_position == "after_residual":
layer_output = self._output_dropout[i](layer_output)
......
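# Editor's usage sketch (assumptions: the constructor arguments documented above
# and that the block projects back to the input's hidden size before the
# residual connection).
_ffn = GatedFeedforward(
intermediate_size=128, intermediate_activation="gelu", dropout=0.1,
use_gate=True, num_blocks=2)
_ffn_output = _ffn(tf.zeros((2, 6, 32), dtype=tf.float32))
# _ffn_output.shape == (2, 6, 32)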