Unverified Commit 09d9656f authored by Srihari Humbarwadi, committed by GitHub

Merge branch 'panoptic-segmentation' into panoptic-deeplab-modeling

parents ac671306 49a5706c
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Roformer attention layer."""
# pylint: disable=g-classes-have-attributes
import tensorflow as tf
EinsumDense = tf.keras.layers.experimental.EinsumDense
MultiHeadAttention = tf.keras.layers.MultiHeadAttention
def _build_trig_vector(length, key_dim):
"""Builds the trig vector."""
tf_dtype = tf.keras.mixed_precision.global_policy().compute_dtype
position_ids = tf.cast(tf.range(length), dtype=tf_dtype)
position_ids = tf.expand_dims(position_ids, axis=0)
steps = key_dim // 2
indices = tf.cast(tf.range(steps), dtype=tf_dtype)
indices = tf.pow(tf.constant(10000.0, dtype=tf_dtype), -2 * indices / steps)
vec = tf.einsum('bl,d->bld', position_ids, indices)
sin_vec = tf.repeat(tf.sin(vec), repeats=2, axis=-1)
cos_vec = tf.repeat(tf.cos(vec), repeats=2, axis=-1)
sin_vec, cos_vec = tf.expand_dims(sin_vec, 2), tf.expand_dims(cos_vec, 2)
return sin_vec, cos_vec
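# For reference: position m and pair index i get the angle
# theta = m * 10000**(-2 * i / steps); the sin/cos values are repeated twice
# along the feature axis so they can be applied elementwise to the interleaved
# query/key features in RoformerAttention.roformer_recompute_qkv below.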
@tf.keras.utils.register_keras_serializable(package='Text')
class RoformerAttention(tf.keras.layers.MultiHeadAttention):
"""Roformer Attention."""
def __init__(self,
q_max_sequence_length,
kv_max_sequence_length,
output_range=None,
**kwargs):
"""Instantiates a roformer attention layer.
Roformer paper: https://arxiv.org/abs/2104.09864
Args:
q_max_sequence_length: maximum length in input for the query
kv_max_sequence_length: maximum length in input for key and value, can be
different from q_max_sequence_length
output_range: length of the query tensor to consider.
**kwargs: other keyword arguments.
"""
super().__init__(**kwargs)
self._q_max_sequence_length = q_max_sequence_length
self._kv_max_sequence_length = kv_max_sequence_length
assert self._key_dim % 2 == 0
q_sin_vec, q_cos_vec = _build_trig_vector(self._q_max_sequence_length,
self._key_dim)
k_sin_vec, k_cos_vec = _build_trig_vector(self._kv_max_sequence_length,
self._key_dim)
# pylint:disable=g-long-ternary
self.q_sin_vec, self.q_cos_vec = (q_sin_vec,
q_cos_vec) if output_range is None else (
q_sin_vec[:, 0:output_range, ...],
q_cos_vec[:, 0:output_range, ...])
# pylint:enable=g-long-ternary
self.k_sin_vec, self.k_cos_vec = (k_sin_vec, k_cos_vec)
def roformer_recompute_qkv(self, q, k, v):
"""Applies rotary position embeddings to the query and key projections."""
q_shape = tf.shape(q)
q_len = q_shape[1]
k_shape = tf.shape(k)
k_len = k_shape[1]
q2 = tf.stack([-q[..., 1::2], q[..., ::2]], axis=4)
q2 = tf.reshape(q2, q_shape)
k2 = tf.stack([-k[..., 1::2], k[..., ::2]], axis=4)
k2 = tf.reshape(k2, k_shape)
ret_q = q * self.q_cos_vec[:, 0:q_len, ...] + q2 * self.q_sin_vec[:, 0:q_len, ...]
ret_k = k * self.k_cos_vec[:, 0:k_len, ...] + k2 * self.k_sin_vec[:, 0:k_len, ...]
return ret_q, ret_k, v
def call(self,
query,
value,
key=None,
attention_mask=None,
return_attention_scores=False,
training=None):
if not self._built_from_signature:
self._build_from_signature(query=query, value=value, key=key)
if key is None:
key = value
query = self._query_dense(query)
key = self._key_dense(key)
value = self._value_dense(value)
query, key, value = self.roformer_recompute_qkv(query, key, value)
attention_output, attention_scores = self._compute_attention(
query, key, value, attention_mask, training)
attention_output = self._output_dense(attention_output)
if return_attention_scores:
return attention_output, attention_scores
return attention_output
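Below is a minimal usage sketch of the layer defined above (illustrative shapes and head counts, not part of the commit):

import tensorflow as tf
from official.projects.roformer import roformer_attention

# Illustrative sizes; key_dim must be even because the rotary embedding
# rotates feature pairs.
layer = roformer_attention.RoformerAttention(
    q_max_sequence_length=16,
    kv_max_sequence_length=16,
    num_heads=4,
    key_dim=32)
query = tf.random.normal([2, 16, 128])
value = tf.random.normal([2, 16, 128])
output = layer(query=query, value=value)
print(output.shape)  # (2, 16, 128)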
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for the attention layer."""
import numpy as np
import tensorflow as tf
from tensorflow.python.distribute import combinations
from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import
from official.projects.roformer import roformer_attention
def _create_mock_attention_data(num_heads,
key_dim,
value_dim,
q_seq_length,
kv_seq_length,
batch_size,
include_mask=False):
"""Creates mock testing data.
Args:
num_heads: `int`, Number of attention heads.
key_dim: `int`, Size of query head.
value_dim: `int`, Size of key, value dim.
q_seq_length: query sequence length.
kv_seq_length: key/value sequence length.
batch_size: `int`, the batch size.
include_mask: optional `bool`, whether or not to include mask data.
Returns:
A dictionary with `str` as keys and `Tensor` as values.
"""
query_shape = (batch_size, q_seq_length, key_dim)
value_shape = (batch_size, kv_seq_length, value_dim)
data = dict(
query=tf.random.normal(shape=query_shape),
value=tf.random.normal(shape=value_shape),
key=tf.random.normal(shape=value_shape))
total_seq_length = kv_seq_length
if include_mask:
mask_shape = (batch_size, num_heads, q_seq_length, total_seq_length)
mask_data = np.random.randint(2, size=mask_shape).astype("float32")
mask_data = dict(attention_mask=mask_data)
data.update(mask_data)
return data
@keras_parameterized.run_all_keras_modes
class RoformerAttentionTest(keras_parameterized.TestCase):
def setUp(self):
super(RoformerAttentionTest, self).setUp()
np.random.seed(0)
tf.random.set_seed(0)
@combinations.generate(
combinations.combine(length=[8, 50], key_dim=[64, 128]))
def test_trig_vector(self, length, key_dim):
sin_emb, cos_emb = roformer_attention._build_trig_vector(length, key_dim)
length = tf.shape(sin_emb)[1]
key_dim = tf.shape(sin_emb)[3]
for m in range(0, length):
half_d = key_dim // 2
std_emb = tf.range(half_d, dtype=tf.float32)
std_emb = tf.pow(10000.0, -2 * std_emb / float(half_d))
std_emb = m * std_emb
std_sin_emb = tf.sin(std_emb)
std_cos_emb = tf.cos(std_emb)
tf.assert_equal(sin_emb[:, m, :, 0::2], std_sin_emb)
tf.assert_equal(sin_emb[:, m, :, 1::2], std_sin_emb)
tf.assert_equal(cos_emb[:, m, :, 0::2], std_cos_emb)
tf.assert_equal(cos_emb[:, m, :, 1::2], std_cos_emb)
@combinations.generate(
combinations.combine(value_dim=[32, 64], mask=[True, False]))
def test_attention_scores(self, value_dim, mask):
"""Tests combinations of attention score calculations."""
batch_size, num_heads, key_dim, seq_length = 2, 12, 64, 8
test_layer = roformer_attention.RoformerAttention(
q_max_sequence_length=seq_length,
kv_max_sequence_length=seq_length,
num_heads=num_heads,
key_dim=key_dim,
value_dim=value_dim)
data = _create_mock_attention_data(
num_heads=num_heads,
key_dim=key_dim,
value_dim=value_dim,
q_seq_length=seq_length,
kv_seq_length=seq_length,
batch_size=batch_size,
include_mask=mask)
output = test_layer(**data)
self.assertEqual(output.shape, [batch_size, seq_length, key_dim])
if __name__ == "__main__":
tf.test.main()
@@ -12,25 +12,25 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Roformer encoder network."""
# pylint: disable=g-classes-have-attributes
import collections
from absl import logging
import tensorflow as tf

from official.nlp.modeling import layers
from official.projects.roformer import roformer_encoder_block


@tf.keras.utils.register_keras_serializable(package='Text')
class RoformerEncoder(tf.keras.Model):
  """Bi-directional Transformer-based encoder network with Roformer.

  Roformer paper: https://arxiv.org/abs/2104.09864

  *Note* that the network is constructed by
  [Keras Functional API](https://keras.io/guides/functional_api/).

  Args:
    vocab_size: The size of the token vocabulary.
@@ -69,105 +69,142 @@
  def __init__(
      self,
      vocab_size,
      hidden_size=768,  # FIXME: hidden_size per head should be even!
      num_layers=12,
      num_attention_heads=12,
      max_sequence_length=512,
      type_vocab_size=16,
      inner_dim=3072,
      inner_activation=lambda x: tf.keras.activations.gelu(x, approximate=True),
      output_dropout=0.1,
      attention_dropout=0.1,
      initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
      output_range=None,
      embedding_width=None,
      embedding_layer=None,
      norm_first=False,
      **kwargs):
    if 'intermediate_size' in kwargs:
      inner_dim = kwargs['intermediate_size']
      del kwargs['intermediate_size']
    if 'activation' in kwargs:
      inner_activation = kwargs['activation']
      del kwargs['activation']
    if 'dropout_rate' in kwargs:
      output_dropout = kwargs['dropout_rate']
      del kwargs['dropout_rate']
    if 'attention_dropout_rate' in kwargs:
      attention_dropout = kwargs['attention_dropout_rate']
      del kwargs['attention_dropout_rate']
    activation = tf.keras.activations.get(inner_activation)
    initializer = tf.keras.initializers.get(initializer)

    word_ids = tf.keras.layers.Input(
        shape=(None,), dtype=tf.int32, name='input_word_ids')
    mask = tf.keras.layers.Input(
        shape=(None,), dtype=tf.int32, name='input_mask')
    type_ids = tf.keras.layers.Input(
        shape=(None,), dtype=tf.int32, name='input_type_ids')

    if embedding_width is None:
      embedding_width = hidden_size

    if embedding_layer is None:
      embedding_layer_inst = layers.on_device_embedding.OnDeviceEmbedding(
          vocab_size=vocab_size,
          embedding_width=embedding_width,
          initializer=initializer,
          name='word_embeddings')
    else:
      embedding_layer_inst = embedding_layer
    word_embeddings = embedding_layer_inst(word_ids)

    # Roformer does not need a position embedding layer
    type_embedding_layer = layers.on_device_embedding.OnDeviceEmbedding(
        vocab_size=type_vocab_size,
        embedding_width=embedding_width,
        initializer=initializer,
        use_one_hot=True,
        name='type_embeddings')
    type_embeddings = type_embedding_layer(type_ids)

    # Roformer does not have absolute position embedding
    embeddings = tf.keras.layers.Add()([word_embeddings, type_embeddings])

    embedding_norm_layer = tf.keras.layers.LayerNormalization(
        name='embeddings/layer_norm', axis=-1, epsilon=1e-12, dtype=tf.float32)
    embeddings = embedding_norm_layer(embeddings)
    embeddings = (tf.keras.layers.Dropout(rate=output_dropout)(embeddings))

    # We project the 'embedding' output to 'hidden_size' if it is not already
    # 'hidden_size'.
    if embedding_width != hidden_size:
      embedding_projection = tf.keras.layers.experimental.EinsumDense(
          '...x,xy->...y',
          output_shape=hidden_size,
          bias_axes='y',
          kernel_initializer=initializer,
          name='embedding_projection')
      embeddings = embedding_projection(embeddings)
    else:
      embedding_projection = None

    transformer_layers = []
    data = embeddings
    attention_mask = layers.SelfAttentionMask()(data, mask)
    encoder_outputs = []
    for i in range(num_layers):
      if i == num_layers - 1 and output_range is not None:
        transformer_output_range = output_range
      else:
        transformer_output_range = None
      layer = roformer_encoder_block.RoformerEncoderBlock(
          num_attention_heads=num_attention_heads,
          inner_dim=inner_dim,
          inner_activation=inner_activation,
          q_max_sequence_length=max_sequence_length,
          kv_max_sequence_length=max_sequence_length,
          output_dropout=output_dropout,
          attention_dropout=attention_dropout,
          norm_first=norm_first,
          output_range=transformer_output_range,
          kernel_initializer=initializer,
          name='roformer/layer_%d' % i)
      transformer_layers.append(layer)
      data = layer([data, attention_mask])
      encoder_outputs.append(data)

    last_encoder_output = encoder_outputs[-1]
    # Applying a tf.slice op (through subscript notation) to a Keras tensor
    # like this will create a SliceOpLambda layer. This is better than a Lambda
    # layer with Python code, because that is fundamentally less portable.
    first_token_tensor = last_encoder_output[:, 0, :]
    pooler_layer = tf.keras.layers.Dense(
        units=hidden_size,
        activation='tanh',
        kernel_initializer=initializer,
        name='pooler_transform')
    cls_output = pooler_layer(first_token_tensor)

    outputs = dict(
        sequence_output=encoder_outputs[-1],
        pooled_output=cls_output,
        encoder_outputs=encoder_outputs,
    )

    # Once we've created the network using the Functional API, we call
    # super().__init__ as though we were invoking the Functional API Model
    # constructor, resulting in this object having all the properties of a model
    # created using the Functional API. Once super().__init__ is called, we
    # can assign attributes to `self` - note that all `self` assignments are
    # below this line.
    super(RoformerEncoder, self).__init__(
        inputs=[word_ids, mask, type_ids], outputs=outputs, **kwargs)

    config_dict = {
        'vocab_size': vocab_size,
        'hidden_size': hidden_size,
        'num_layers': num_layers,
@@ -184,64 +221,23 @@
        'embedding_layer': embedding_layer,
        'norm_first': norm_first,
    }

    # We are storing the config dict as a namedtuple here to ensure checkpoint
    # compatibility with an earlier version of this model which did not track
    # the config dict attribute. TF does not track immutable attrs which
    # do not contain Trackables, so by creating a config namedtuple instead of
    # a dict we avoid tracking it.
    config_cls = collections.namedtuple('Config', config_dict.keys())
    self._config = config_cls(**config_dict)
    self._pooler_layer = pooler_layer
    self._transformer_layers = transformer_layers
    self._embedding_norm_layer = embedding_norm_layer
    self._embedding_layer = embedding_layer_inst
    # self._position_embedding_layer = position_embedding_layer
    self._position_embedding_layer = None
    self._type_embedding_layer = type_embedding_layer
    if embedding_projection is not None:
      self._embedding_projection = embedding_projection

  def get_embedding_table(self):
    return self._embedding_layer.embeddings
@@ -250,7 +246,7 @@
    return self._embedding_layer

  def get_config(self):
    return dict(self._config._asdict())

  @property
  def transformer_layers(self):
...
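For orientation, a hedged sketch of building and calling the encoder assembled above; vocabulary size, depth, and widths are illustrative assumptions:

import numpy as np
from official.projects.roformer import roformer_encoder

encoder = roformer_encoder.RoformerEncoder(
    vocab_size=100,
    hidden_size=32,  # 32 / 2 heads = 16 per head, which is even.
    num_attention_heads=2,
    num_layers=2)
batch, seq_len = 3, 12
outputs = encoder([
    np.random.randint(100, size=(batch, seq_len)).astype('int32'),  # word ids
    np.ones((batch, seq_len), dtype='int32'),  # input mask
    np.zeros((batch, seq_len), dtype='int32'),  # type ids
])
print(outputs['sequence_output'].shape)  # (3, 12, 32)
print(outputs['pooled_output'].shape)  # (3, 32)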
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Roformer TransformerEncoder block layer."""
import tensorflow as tf
from official.projects.roformer import roformer_attention
@tf.keras.utils.register_keras_serializable(package="Text")
class RoformerEncoderBlock(tf.keras.layers.Layer):
"""RoformerEncoderBlock layer."""
def __init__(self,
num_attention_heads,
inner_dim,
inner_activation,
q_max_sequence_length=512,
kv_max_sequence_length=512,
output_range=None,
kernel_initializer="glorot_uniform",
bias_initializer="zeros",
kernel_regularizer=None,
bias_regularizer=None,
activity_regularizer=None,
kernel_constraint=None,
bias_constraint=None,
use_bias=True,
norm_first=False,
norm_epsilon=1e-12,
output_dropout=0.0,
attention_dropout=0.0,
inner_dropout=0.0,
attention_initializer=None,
attention_axes=None,
**kwargs):
"""Initializes `RoformerEncoderBlock`.
Args:
num_attention_heads: Number of attention heads.
inner_dim: The output dimension of the first Dense layer in a two-layer
feedforward network.
inner_activation: The activation for the first Dense layer in a two-layer
feedforward network.
q_max_sequence_length: The maximum sequence length of queries.
kv_max_sequence_length: The maximum sequence length of keys and values.
output_range: the sequence output range, [0, output_range) for slicing the
target sequence. `None` means the target sequence is not sliced.
kernel_initializer: Initializer for dense layer kernels.
bias_initializer: Initializer for dense layer biases.
kernel_regularizer: Regularizer for dense layer kernels.
bias_regularizer: Regularizer for dense layer biases.
activity_regularizer: Regularizer for dense layer activity.
kernel_constraint: Constraint for dense layer kernels.
bias_constraint: Constraint for dense layer biases.
use_bias: Whether to enable use_bias in attention layer. If set False,
use_bias in attention layer is disabled.
norm_first: Whether to normalize inputs to attention and intermediate
dense layers. If set False, output of attention and intermediate dense
layers is normalized.
norm_epsilon: Epsilon value to initialize normalization layers.
output_dropout: Dropout probability for the post-attention and output
dropout.
attention_dropout: Dropout probability for within the attention layer.
inner_dropout: Dropout probability for the first Dense layer in a
two-layer feedforward network.
attention_initializer: Initializer for kernels of attention layers. If set
`None`, attention layers use kernel_initializer as initializer for
kernel.
attention_axes: axes over which the attention is applied. `None` means
attention over all axes, but batch, heads, and features.
**kwargs: keyword arguments.
"""
super().__init__(**kwargs)
if inner_dim % 2 != 0:
raise ValueError(f"The inner_dim of f{self.__class__} must be an even "
f"integer. However, inner_dim is f{inner_dim}")
self._num_heads = num_attention_heads
self._inner_dim = inner_dim
self._inner_activation = inner_activation
self._attention_dropout = attention_dropout
self._attention_dropout_rate = attention_dropout
self._output_dropout = output_dropout
self._output_dropout_rate = output_dropout
self._output_range = output_range
self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
self._bias_initializer = tf.keras.initializers.get(bias_initializer)
self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer)
self._activity_regularizer = tf.keras.regularizers.get(activity_regularizer)
self._kernel_constraint = tf.keras.constraints.get(kernel_constraint)
self._bias_constraint = tf.keras.constraints.get(bias_constraint)
self._use_bias = use_bias
self._norm_first = norm_first
self._norm_epsilon = norm_epsilon
self._inner_dropout = inner_dropout
self._q_max_sequence_length = q_max_sequence_length
self._kv_max_sequence_length = kv_max_sequence_length
if attention_initializer:
self._attention_initializer = tf.keras.initializers.get(
attention_initializer)
else:
self._attention_initializer = self._kernel_initializer
self._attention_axes = attention_axes
def build(self, input_shape):
if isinstance(input_shape, tf.TensorShape):
input_tensor_shape = input_shape
elif isinstance(input_shape, (list, tuple)):
input_tensor_shape = tf.TensorShape(input_shape[0])
else:
raise ValueError(
"The type of input shape argument is not supported, got: %s" %
type(input_shape))
einsum_equation = "abc,cd->abd"
if len(input_tensor_shape.as_list()) > 3:
einsum_equation = "...bc,cd->...bd"
hidden_size = input_tensor_shape[-1]
if hidden_size % self._num_heads != 0:
raise ValueError(
"The input size (%d) is not a multiple of the number of attention "
"heads (%d)" % (hidden_size, self._num_heads))
self._attention_head_size = int(hidden_size // self._num_heads)
common_kwargs = dict(
bias_initializer=self._bias_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer,
activity_regularizer=self._activity_regularizer,
kernel_constraint=self._kernel_constraint,
bias_constraint=self._bias_constraint)
self._attention_layer = roformer_attention.RoformerAttention(
q_max_sequence_length=self._q_max_sequence_length,
kv_max_sequence_length=self._kv_max_sequence_length,
output_range=self._output_range,
num_heads=self._num_heads,
key_dim=self._attention_head_size,
dropout=self._attention_dropout,
use_bias=self._use_bias,
kernel_initializer=self._attention_initializer,
attention_axes=self._attention_axes,
name="self_attention",
**common_kwargs)
self._attention_dropout = tf.keras.layers.Dropout(rate=self._output_dropout)
# Use float32 in layernorm for numeric stability.
# It is probably safe in mixed_float16, but we haven't validated this yet.
self._attention_layer_norm = (
tf.keras.layers.LayerNormalization(
name="self_attention_layer_norm",
axis=-1,
epsilon=self._norm_epsilon,
dtype=tf.float32))
self._intermediate_dense = tf.keras.layers.experimental.EinsumDense(
einsum_equation,
output_shape=(None, self._inner_dim),
bias_axes="d",
kernel_initializer=self._kernel_initializer,
name="intermediate",
**common_kwargs)
policy = tf.keras.mixed_precision.global_policy()
if policy.name == "mixed_bfloat16":
# bfloat16 causes BERT with the LAMB optimizer to not converge
# as well, so we use float32.
# TODO(b/154538392): Investigate this.
policy = tf.float32
self._intermediate_activation_layer = tf.keras.layers.Activation(
self._inner_activation, dtype=policy)
self._inner_dropout_layer = tf.keras.layers.Dropout(
rate=self._inner_dropout)
self._output_dense = tf.keras.layers.experimental.EinsumDense(
einsum_equation,
output_shape=(None, hidden_size),
bias_axes="d",
name="output",
kernel_initializer=self._kernel_initializer,
**common_kwargs)
self._output_dropout = tf.keras.layers.Dropout(rate=self._output_dropout)
# Use float32 in layernorm for numeric stability.
self._output_layer_norm = tf.keras.layers.LayerNormalization(
name="output_layer_norm",
axis=-1,
epsilon=self._norm_epsilon,
dtype=tf.float32)
super(RoformerEncoderBlock, self).build(input_shape)
def get_config(self):
config = {
"num_attention_heads":
self._num_heads,
"inner_dim":
self._inner_dim,
"inner_activation":
self._inner_activation,
"output_dropout":
self._output_dropout_rate,
"attention_dropout":
self._attention_dropout_rate,
"output_range":
self._output_range,
"kernel_initializer":
tf.keras.initializers.serialize(self._kernel_initializer),
"bias_initializer":
tf.keras.initializers.serialize(self._bias_initializer),
"kernel_regularizer":
tf.keras.regularizers.serialize(self._kernel_regularizer),
"bias_regularizer":
tf.keras.regularizers.serialize(self._bias_regularizer),
"activity_regularizer":
tf.keras.regularizers.serialize(self._activity_regularizer),
"kernel_constraint":
tf.keras.constraints.serialize(self._kernel_constraint),
"bias_constraint":
tf.keras.constraints.serialize(self._bias_constraint),
"use_bias":
self._use_bias,
"norm_first":
self._norm_first,
"norm_epsilon":
self._norm_epsilon,
"inner_dropout":
self._inner_dropout,
"attention_initializer":
tf.keras.initializers.serialize(self._attention_initializer),
"attention_axes":
self._attention_axes,
}
base_config = super(RoformerEncoderBlock, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, inputs):
"""Transformer self-attention encoder block call.
Args:
inputs: a single tensor or a list of tensors. `input tensor` as the single
sequence of embeddings. [`input tensor`, `attention mask`] to have the
additional attention mask. [`query tensor`, `key value tensor`,
`attention mask`] to have separate input streams for the query, and
key/value to the multi-head attention.
Returns:
An output tensor with the same dimensions as input/query tensor.
"""
if isinstance(inputs, (list, tuple)):
if len(inputs) == 2:
input_tensor, attention_mask = inputs
key_value = None
elif len(inputs) == 3:
input_tensor, key_value, attention_mask = inputs
else:
raise ValueError("Unexpected inputs to %s with length at %d" %
(self.__class__, len(inputs)))
else:
input_tensor, key_value, attention_mask = (inputs, None, None)
if self._output_range:
if self._norm_first:
source_tensor = input_tensor[:, 0:self._output_range, :]
input_tensor = self._attention_layer_norm(input_tensor)
if key_value is not None:
key_value = self._attention_layer_norm(key_value)
target_tensor = input_tensor[:, 0:self._output_range, :]
if attention_mask is not None:
attention_mask = attention_mask[:, 0:self._output_range, :]
else:
if self._norm_first:
source_tensor = input_tensor
input_tensor = self._attention_layer_norm(input_tensor)
if key_value is not None:
key_value = self._attention_layer_norm(key_value)
target_tensor = input_tensor
if key_value is None:
key_value = input_tensor
attention_output = self._attention_layer(
query=target_tensor, value=key_value, attention_mask=attention_mask)
attention_output = self._attention_dropout(attention_output)
if self._norm_first:
attention_output = source_tensor + attention_output
else:
attention_output = self._attention_layer_norm(target_tensor +
attention_output)
if self._norm_first:
source_attention_output = attention_output
attention_output = self._output_layer_norm(attention_output)
inner_output = self._intermediate_dense(attention_output)
inner_output = self._intermediate_activation_layer(inner_output)
inner_output = self._inner_dropout_layer(inner_output)
layer_output = self._output_dense(inner_output)
layer_output = self._output_dropout(layer_output)
if self._norm_first:
return source_attention_output + layer_output
# During mixed precision training, layer norm output is always fp32 for now.
# Casts fp32 for the subsequent add.
layer_output = tf.cast(layer_output, tf.float32)
return self._output_layer_norm(layer_output + attention_output)
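A minimal invocation sketch for the block above; the tensor sizes are illustrative and chosen so the hidden size splits evenly across heads:

import tensorflow as tf
from official.projects.roformer import roformer_encoder_block

block = roformer_encoder_block.RoformerEncoderBlock(
    num_attention_heads=4,
    inner_dim=64,  # must be even
    inner_activation="relu")
hidden = tf.random.normal([2, 8, 32])  # (batch, seq_len, hidden); 32 / 4 heads = 8.
mask = tf.ones([2, 8, 8], dtype=tf.float32)  # (batch, from_seq_len, to_seq_len)
output = block([hidden, mask])
print(output.shape)  # (2, 8, 32), same as the input tensor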
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for Keras-based transformer block layer."""
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import
from official.projects.roformer import roformer_encoder_block
@keras_parameterized.run_all_keras_modes
@parameterized.named_parameters(
('base', roformer_encoder_block.RoformerEncoderBlock))
class RoformerEncoderBlockTest(keras_parameterized.TestCase):
def tearDown(self):
super(RoformerEncoderBlockTest, self).tearDown()
tf.keras.mixed_precision.set_global_policy('float32')
def test_layer_creation(self, transformer_cls):
test_layer = transformer_cls(
num_attention_heads=10, inner_dim=2048, inner_activation='relu')
sequence_length = 21
width = 80
# Create a 3-dimensional input (the first dimension is implicit).
data_tensor = tf.keras.Input(shape=(sequence_length, width))
output_tensor = test_layer(data_tensor)
# The default output of a transformer layer should be the same as the input.
self.assertEqual(data_tensor.shape.as_list(), output_tensor.shape.as_list())
def test_layer_creation_with_mask(self, transformer_cls):
test_layer = transformer_cls(
num_attention_heads=10, inner_dim=2048, inner_activation='relu')
sequence_length = 21
width = 80
# Create a 3-dimensional input (the first dimension is implicit).
data_tensor = tf.keras.Input(shape=(sequence_length, width))
# Create a 2-dimensional input (the first dimension is implicit).
mask_tensor = tf.keras.Input(shape=(sequence_length, sequence_length))
output_tensor = test_layer([data_tensor, mask_tensor])
# The default output of a transformer layer should be the same as the input.
self.assertEqual(data_tensor.shape.as_list(), output_tensor.shape.as_list())
def test_layer_invocation(self, transformer_cls):
test_layer = transformer_cls(
num_attention_heads=10, inner_dim=2048, inner_activation='relu')
sequence_length = 21
width = 80
# Create a 3-dimensional input (the first dimension is implicit).
data_tensor = tf.keras.Input(shape=(sequence_length, width))
output_tensor = test_layer(data_tensor)
# Create a model from the test layer.
model = tf.keras.Model(data_tensor, output_tensor)
# Invoke the model on test data. We can't validate the output data itself
# (the NN is too complex) but this will rule out structural runtime errors.
batch_size = 6
input_data = 10 * np.random.random_sample(
(batch_size, sequence_length, width))
_ = model.predict(input_data)
def test_layer_invocation_with_mask(self, transformer_cls):
test_layer = transformer_cls(
num_attention_heads=10, inner_dim=2048, inner_activation='relu')
sequence_length = 21
width = 80
# Create a 3-dimensional input (the first dimension is implicit).
data_tensor = tf.keras.Input(shape=(sequence_length, width))
# Create a 2-dimensional input (the first dimension is implicit).
mask_tensor = tf.keras.Input(shape=(sequence_length, sequence_length))
output_tensor = test_layer([data_tensor, mask_tensor])
# Create a model from the test layer.
model = tf.keras.Model([data_tensor, mask_tensor], output_tensor)
# Invoke the model on test data. We can't validate the output data itself
# (the NN is too complex) but this will rule out structural runtime errors.
batch_size = 6
input_data = 10 * np.random.random_sample(
(batch_size, sequence_length, width))
# The attention mask should be of shape (batch, from_seq_len, to_seq_len),
# which here is (batch, sequence_length, sequence_length)
mask_data = np.random.randint(
2, size=(batch_size, sequence_length, sequence_length))
_ = model.predict([input_data, mask_data])
def test_layer_output_range(self, transformer_cls):
test_layer = transformer_cls(
num_attention_heads=10, inner_dim=2048, inner_activation='relu')
sequence_length = 21
width = 80
batch_size = 6
input_data = 10 * np.random.random_sample(
(batch_size, sequence_length, width))
mask_data = np.random.randint(
2, size=(batch_size, sequence_length, sequence_length))
output_tensor = test_layer([input_data, mask_data])
# The layer only attends to the first token and outputs the first token
# embedding.
new_layer = transformer_cls(
num_attention_heads=10,
inner_dim=2048,
inner_activation='relu',
output_range=1)
_ = new_layer([input_data, mask_data])
new_layer.set_weights(test_layer.get_weights())
new_output_tensor = new_layer([input_data, mask_data])
self.assertAllClose(
new_output_tensor, output_tensor[:, 0:1, :], atol=5e-5, rtol=0.003)
def test_layer_output_range_without_mask(self, transformer_cls):
test_layer = transformer_cls(
num_attention_heads=10,
inner_dim=2048,
inner_activation='relu',
norm_first=True)
sequence_length = 21
width = 80
batch_size = 6
input_data = 10 * np.random.random_sample(
(batch_size, sequence_length, width))
output_tensor = test_layer(input_data)
# The layer only attends to the first token and outputs the first token
# embedding.
new_layer = transformer_cls(
num_attention_heads=10,
inner_dim=2048,
inner_activation='relu',
output_range=1,
norm_first=True)
_ = new_layer(input_data)
new_layer.set_weights(test_layer.get_weights())
new_output_tensor = new_layer(input_data)
self.assertAllClose(
new_output_tensor, output_tensor[:, 0:1, :], atol=5e-5, rtol=0.003)
def test_layer_output_range_with_pre_norm(self, transformer_cls):
test_layer = transformer_cls(
num_attention_heads=10,
inner_dim=2048,
inner_activation='relu',
norm_first=True)
sequence_length = 21
width = 80
batch_size = 6
input_data = 10 * np.random.random_sample(
(batch_size, sequence_length, width))
mask_data = np.random.randint(
2, size=(batch_size, sequence_length, sequence_length))
output_tensor = test_layer([input_data, mask_data])
# The layer only attends to the first token and outputs the first token
# embedding.
new_layer = transformer_cls(
num_attention_heads=10,
inner_dim=2048,
inner_activation='relu',
output_range=1,
norm_first=True)
_ = new_layer([input_data, mask_data])
new_layer.set_weights(test_layer.get_weights())
new_output_tensor = new_layer([input_data, mask_data])
self.assertAllClose(
new_output_tensor, output_tensor[:, 0:1, :], atol=5e-5, rtol=0.003)
def test_layer_invocation_with_float16_dtype(self, transformer_cls):
tf.keras.mixed_precision.set_global_policy('mixed_float16')
test_layer = transformer_cls(
num_attention_heads=10, inner_dim=2048, inner_activation='relu')
sequence_length = 21
width = 80
# Create a 3-dimensional input (the first dimension is implicit).
data_tensor = tf.keras.Input(shape=(sequence_length, width))
# Create a 2-dimensional input (the first dimension is implicit).
mask_tensor = tf.keras.Input(shape=(sequence_length, sequence_length))
output_tensor = test_layer([data_tensor, mask_tensor])
# Create a model from the test layer.
model = tf.keras.Model([data_tensor, mask_tensor], output_tensor)
# Invoke the model on test data. We can't validate the output data itself
# (the NN is too complex) but this will rule out structural runtime errors.
batch_size = 6
input_data = (10 * np.random.random_sample(
(batch_size, sequence_length, width)))
# The attention mask should be of shape (batch, from_seq_len, to_seq_len),
# which here is (batch, sequence_length, sequence_length)
mask_data = np.random.randint(
2, size=(batch_size, sequence_length, sequence_length))
_ = model.predict([input_data, mask_data])
def test_transform_with_initializer(self, transformer_cls):
test_layer = transformer_cls(
num_attention_heads=10,
inner_dim=2048,
inner_activation='relu',
kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02))
sequence_length = 21
width = 80
# Create a 3-dimensional input (the first dimension is implicit).
data_tensor = tf.keras.Input(shape=(sequence_length, width))
output = test_layer(data_tensor)
# The default output of a transformer layer should be the same as the input.
self.assertEqual(data_tensor.shape.as_list(), output.shape.as_list())
def test_separate_qkv(self, transformer_cls):
test_layer = transformer_cls(
num_attention_heads=2,
inner_dim=128,
inner_activation='relu',
kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02))
# Forward path.
q_tensor = tf.zeros([2, 4, 16], dtype=tf.float32)
kv_tensor = tf.zeros([2, 8, 16], dtype=tf.float32)
dummy_mask = tf.zeros([2, 4, 8], dtype=tf.float32)
inputs = [q_tensor, kv_tensor, dummy_mask]
output = test_layer(inputs)
self.assertEqual(output.shape, q_tensor.shape)
@keras_parameterized.run_all_keras_modes
class RoformerArgumentTest(keras_parameterized.TestCase):
def test_raises(self):
num_attention_heads = 2
with self.assertRaisesRegex(ValueError, 'The inner_dim of.*'):
_ = roformer_encoder_block.RoformerEncoderBlock(
num_attention_heads=num_attention_heads,
inner_dim=31,
inner_activation='relu',
output_dropout=0.1,
attention_dropout=0.1,
use_bias=False,
norm_first=True,
norm_epsilon=1e-6,
inner_dropout=0.1,
attention_initializer=tf.keras.initializers.RandomUniform(
minval=0., maxval=1.))
def test_use_bias_norm_first(self):
num_attention_heads = 2
hidden_size = 16
encoder_block = roformer_encoder_block.RoformerEncoderBlock(
num_attention_heads=num_attention_heads,
inner_dim=32,
inner_activation='relu',
output_dropout=0.1,
attention_dropout=0.1,
use_bias=False,
norm_first=True,
norm_epsilon=1e-6,
inner_dropout=0.1,
attention_initializer=tf.keras.initializers.RandomUniform(
minval=0., maxval=1.))
# Forward path.
dummy_tensor = tf.zeros([2, 4, 16], dtype=tf.float32)
dummy_mask = tf.zeros([2, 4, 4], dtype=tf.float32)
inputs = [dummy_tensor, dummy_mask]
output = encoder_block(inputs)
self.assertEqual(output.shape, (2, 4, hidden_size))
def test_get_config(self):
num_attention_heads = 2
encoder_block = roformer_encoder_block.RoformerEncoderBlock(
num_attention_heads=num_attention_heads,
inner_dim=32,
inner_activation='relu',
output_dropout=0.1,
attention_dropout=0.1,
use_bias=False,
norm_first=True,
norm_epsilon=1e-6,
inner_dropout=0.1,
attention_initializer=tf.keras.initializers.RandomUniform(
minval=0., maxval=1.))
encoder_block_config = encoder_block.get_config()
new_encoder_block = roformer_encoder_block.RoformerEncoderBlock.from_config(
encoder_block_config)
self.assertEqual(encoder_block_config, new_encoder_block.get_config())
@parameterized.parameters({'attention_axes': None}, {'attention_axes': [1]},
{'attention_axes': [2]}, {'attention_axes': [1, 2]})
def test_several_attention_axes(self, attention_axes):
test_layer = roformer_encoder_block.RoformerEncoderBlock(
inner_dim=32,
inner_activation='relu',
output_dropout=0.1,
attention_dropout=0.1,
use_bias=False,
norm_first=True,
norm_epsilon=1e-6,
inner_dropout=0.1,
num_attention_heads=10,
attention_axes=attention_axes)
seq_len = 21
dimensions = 80
# Create a 3-dimensional input (the first dimension is implicit).
data_tensor = tf.keras.Input(shape=(seq_len, dimensions))
output_tensor = test_layer(data_tensor)
# The default output of a transformer layer should be the same as the input.
self.assertEqual(data_tensor.shape.as_list(), output_tensor.shape.as_list())
if __name__ == '__main__':
tf.test.main()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for transformer-based bert encoder network."""
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import
from official.projects.roformer import roformer_encoder
# This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
# guarantees forward compatibility of this code for the V2 switchover.
@keras_parameterized.run_all_keras_modes
class RoformerEncoderTest(keras_parameterized.TestCase):
def tearDown(self):
super(RoformerEncoderTest, self).tearDown()
tf.keras.mixed_precision.set_global_policy("float32")
def test_network_creation(self):
hidden_size = 32
sequence_length = 21
# Create a small RoformerEncoder for testing.
test_network = roformer_encoder.RoformerEncoder(
vocab_size=100,
hidden_size=hidden_size,
num_attention_heads=2,
num_layers=3)
# Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
dict_outputs = test_network([word_ids, mask, type_ids])
data = dict_outputs["sequence_output"]
pooled = dict_outputs["pooled_output"]
self.assertIsInstance(test_network.transformer_layers, list)
self.assertLen(test_network.transformer_layers, 3)
self.assertIsInstance(test_network.pooler_layer, tf.keras.layers.Dense)
expected_data_shape = [None, sequence_length, hidden_size]
expected_pooled_shape = [None, hidden_size]
self.assertAllEqual(expected_data_shape, data.shape.as_list())
self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list())
# The default output dtype is float32.
self.assertAllEqual(tf.float32, data.dtype)
self.assertAllEqual(tf.float32, pooled.dtype)
def test_all_encoder_outputs_network_creation(self):
hidden_size = 32
sequence_length = 21
# Create a small RoformerEncoder for testing.
test_network = roformer_encoder.RoformerEncoder(
vocab_size=100,
hidden_size=hidden_size,
num_attention_heads=2,
num_layers=3)
# Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
dict_outputs = test_network([word_ids, mask, type_ids])
all_encoder_outputs = dict_outputs["encoder_outputs"]
pooled = dict_outputs["pooled_output"]
expected_data_shape = [None, sequence_length, hidden_size]
expected_pooled_shape = [None, hidden_size]
self.assertLen(all_encoder_outputs, 3)
for data in all_encoder_outputs:
self.assertAllEqual(expected_data_shape, data.shape.as_list())
self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list())
# The default output dtype is float32.
self.assertAllEqual(tf.float32, all_encoder_outputs[-1].dtype)
self.assertAllEqual(tf.float32, pooled.dtype)
def test_network_creation_with_float16_dtype(self):
hidden_size = 32
sequence_length = 21
tf.keras.mixed_precision.set_global_policy("mixed_float16")
# Create a small RoformerEncoder for testing.
test_network = roformer_encoder.RoformerEncoder(
vocab_size=100,
hidden_size=hidden_size,
num_attention_heads=2,
num_layers=3)
# Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
dict_outputs = test_network([word_ids, mask, type_ids])
data = dict_outputs["sequence_output"]
pooled = dict_outputs["pooled_output"]
expected_data_shape = [None, sequence_length, hidden_size]
expected_pooled_shape = [None, hidden_size]
self.assertAllEqual(expected_data_shape, data.shape.as_list())
self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list())
# If float_dtype is set to float16, the data output is float32 (from a layer
# norm) and pool output should be float16.
self.assertAllEqual(tf.float32, data.dtype)
self.assertAllEqual(tf.float16, pooled.dtype)
@parameterized.named_parameters(
("all_sequence", None, 21),
("output_range", 1, 1),
)
def test_network_invocation(self, output_range, out_seq_len):
hidden_size = 32
sequence_length = 21
vocab_size = 57
num_types = 7
# Create a small RoformerEncoder for testing.
test_network = roformer_encoder.RoformerEncoder(
vocab_size=vocab_size,
hidden_size=hidden_size,
num_attention_heads=2,
num_layers=3,
type_vocab_size=num_types,
output_range=output_range)
# Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
dict_outputs = test_network([word_ids, mask, type_ids])
data = dict_outputs["sequence_output"]
pooled = dict_outputs["pooled_output"]
# Create a model based off of this network:
model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])
# Invoke the model. We can't validate the output data here (the model is too
# complex) but this will catch structural runtime errors.
batch_size = 3
word_id_data = np.random.randint(
vocab_size, size=(batch_size, sequence_length))
mask_data = np.random.randint(2, size=(batch_size, sequence_length))
type_id_data = np.random.randint(
num_types, size=(batch_size, sequence_length))
outputs = model.predict([word_id_data, mask_data, type_id_data])
self.assertEqual(outputs[0].shape[1], out_seq_len)
# Creates a RoformerEncoder with max_sequence_length != sequence_length
max_sequence_length = 128
test_network = roformer_encoder.RoformerEncoder(
vocab_size=vocab_size,
hidden_size=hidden_size,
max_sequence_length=max_sequence_length,
num_attention_heads=2,
num_layers=3,
type_vocab_size=num_types)
dict_outputs = test_network([word_ids, mask, type_ids])
data = dict_outputs["sequence_output"]
pooled = dict_outputs["pooled_output"]
model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])
outputs = model.predict([word_id_data, mask_data, type_id_data])
self.assertEqual(outputs[0].shape[1], sequence_length)
# Creates a RoformerEncoder with embedding_width != hidden_size
test_network = roformer_encoder.RoformerEncoder(
vocab_size=vocab_size,
hidden_size=hidden_size,
max_sequence_length=max_sequence_length,
num_attention_heads=2,
num_layers=3,
type_vocab_size=num_types,
embedding_width=16)
dict_outputs = test_network([word_ids, mask, type_ids])
data = dict_outputs["sequence_output"]
pooled = dict_outputs["pooled_output"]
model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])
outputs = model.predict([word_id_data, mask_data, type_id_data])
self.assertEqual(outputs[0].shape[-1], hidden_size)
self.assertTrue(hasattr(test_network, "_embedding_projection"))
def test_serialize_deserialize(self):
# Create a network object that sets all of its config options.
kwargs = dict(
vocab_size=100,
hidden_size=32,
num_layers=3,
num_attention_heads=2,
max_sequence_length=21,
type_vocab_size=12,
inner_dim=512,
inner_activation="relu",
output_dropout=0.05,
attention_dropout=0.22,
initializer="glorot_uniform",
output_range=-1,
embedding_width=16,
embedding_layer=None,
norm_first=False)
network = roformer_encoder.RoformerEncoder(**kwargs)
expected_config = dict(kwargs)
expected_config["inner_activation"] = tf.keras.activations.serialize(
tf.keras.activations.get(expected_config["inner_activation"]))
expected_config["initializer"] = tf.keras.initializers.serialize(
tf.keras.initializers.get(expected_config["initializer"]))
self.assertEqual(network.get_config(), expected_config)
# Create another network object from the first object's config.
new_network = roformer_encoder.RoformerEncoder.from_config(
network.get_config())
# Validate that the config can be forced to JSON.
_ = network.to_json()
# If the serialization was successful, the new config should match the old.
self.assertAllEqual(network.get_config(), new_network.get_config())
# Tests model saving/loading.
model_path = self.get_temp_dir() + "/model"
network.save(model_path)
_ = tf.keras.models.load_model(model_path)
if __name__ == "__main__":
tf.test.main()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Roformer experiment configurations."""
# pylint: disable=g-doc-return-or-yield,line-too-long
import dataclasses
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.modeling import optimization
from official.nlp.configs import bert
from official.nlp.configs import encoders
from official.nlp.data import pretrain_dataloader
from official.nlp.data import sentence_prediction_dataloader
from official.nlp.tasks import masked_lm
from official.nlp.tasks import sentence_prediction
from official.projects.roformer import roformer
AdamWeightDecay = optimization.AdamWeightDecayConfig
PolynomialLr = optimization.PolynomialLrConfig
PolynomialWarmupConfig = optimization.PolynomialWarmupConfig
@dataclasses.dataclass
class RoformerOptimizationConfig(optimization.OptimizationConfig):
"""TEAMS optimization config."""
optimizer: optimization.OptimizerConfig = optimization.OptimizerConfig(
type='adamw',
adamw=AdamWeightDecay(
weight_decay_rate=0.01,
exclude_from_weight_decay=['LayerNorm', 'layer_norm', 'bias'],
epsilon=1e-6))
learning_rate: optimization.LrConfig = optimization.LrConfig(
type='polynomial',
polynomial=PolynomialLr(
initial_learning_rate=1e-4,
decay_steps=1000000,
end_learning_rate=0.0))
warmup: optimization.WarmupConfig = optimization.WarmupConfig(
type='polynomial', polynomial=PolynomialWarmupConfig(warmup_steps=10000))
@exp_factory.register_config_factory('roformer/pretraining')
def roformer_pretraining() -> cfg.ExperimentConfig:
"""BERT pretraining experiment."""
config = cfg.ExperimentConfig(
runtime=cfg.RuntimeConfig(enable_xla=True),
task=masked_lm.MaskedLMConfig(
model=bert.PretrainerConfig(
encoder=encoders.EncoderConfig(
type='any', any=roformer.RoformerEncoderConfig()),
cls_heads=[
bert.ClsHeadConfig(
inner_dim=768,
num_classes=2,
dropout_rate=0.1,
name='next_sentence')
]),
train_data=pretrain_dataloader.BertPretrainDataConfig(
use_v2_feature_names=True),
validation_data=pretrain_dataloader.BertPretrainDataConfig(
use_v2_feature_names=True, is_training=False)),
trainer=cfg.TrainerConfig(
optimizer_config=RoformerOptimizationConfig(), train_steps=1000000),
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None'
])
return config
@exp_factory.register_config_factory('roformer/glue')
def roformer_glue() -> cfg.ExperimentConfig:
r"""BigBird GLUE."""
config = cfg.ExperimentConfig(
task=sentence_prediction.SentencePredictionConfig(
model=sentence_prediction.ModelConfig(
encoder=encoders.EncoderConfig(
type='any', any=roformer.RoformerEncoderConfig())),
train_data=sentence_prediction_dataloader
.SentencePredictionDataConfig(),
validation_data=sentence_prediction_dataloader
.SentencePredictionDataConfig(
is_training=False, drop_remainder=False)),
trainer=cfg.TrainerConfig(
optimizer_config=optimization.OptimizationConfig({
'optimizer': {
'type': 'adamw',
'adamw': {
'weight_decay_rate':
0.01,
'exclude_from_weight_decay':
['LayerNorm', 'layer_norm', 'bias'],
}
},
'learning_rate': {
'type': 'polynomial',
'polynomial': {
'initial_learning_rate': 3e-5,
'end_learning_rate': 0.0,
}
},
'warmup': {
'type': 'polynomial'
}
})),
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None'
])
return config
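A hedged sketch of retrieving the experiment configurations registered above through the Model Garden config factory; the overrides are illustrative, and it assumes this module is importable as official.projects.roformer.roformer_experiments (the name used by the trainer below):

from official.core import exp_factory
# Importing the module registers 'roformer/pretraining' and 'roformer/glue'.
from official.projects.roformer import roformer_experiments  # pylint: disable=unused-import

config = exp_factory.get_exp_config('roformer/pretraining')
config.trainer.train_steps = 1000  # illustrative override for a smoke test
config.task.train_data.global_batch_size = 32  # illustrative override
print(config.task.model.encoder.type)  # 'any' (the Roformer encoder wrapper)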
@@ -12,27 +12,19 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""A customized training library for the specific task."""

from absl import app
from absl import flags
import gin

from official.common import distribute_utils
from official.common import flags as tfm_flags
from official.core import task_factory
from official.core import train_lib
from official.core import train_utils
from official.modeling import performance
from official.projects.roformer import roformer_experiments  # pylint: disable=unused-import

FLAGS = flags.FLAGS
@@ -46,26 +38,6 @@ def main(_):
  # may race against the train job for writing the same file.
  train_utils.serialize_config(params, model_dir)

  # Sets mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16'
  # can have significant impact on model speeds by utilizing float16 in case of
  # GPUs, and bfloat16 in the case of TPUs. loss_scale takes effect only when
@@ -76,7 +48,9 @@ def main(_):
      distribution_strategy=params.runtime.distribution_strategy,
      all_reduce_alg=params.runtime.all_reduce_alg,
      num_gpus=params.runtime.num_gpus,
      tpu_address=params.runtime.tpu,
      **params.runtime.model_parallelism())

  with distribution_strategy.scope():
    task = task_factory.get_task(params.task, logging_dir=model_dir)
@@ -89,6 +63,7 @@ def main(_):
  train_utils.save_gin_config(FLAGS.mode, model_dir)


if __name__ == '__main__':
  tfm_flags.define_flags()
  app.run(main)
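The driver above is flag-based; as a hedged alternative sketch, the same experiment can be launched programmatically with the libraries it already imports. The experiment name, mode, and model directory are illustrative, and a real run still needs data paths set on the config:

from official.common import distribute_utils
from official.core import exp_factory
from official.core import task_factory
from official.core import train_lib
from official.core import train_utils
from official.projects.roformer import roformer_experiments  # pylint: disable=unused-import

params = exp_factory.get_exp_config('roformer/glue')  # illustrative experiment
model_dir = '/tmp/roformer_glue'  # illustrative path
train_utils.serialize_config(params, model_dir)
strategy = distribute_utils.get_distribution_strategy(
    distribution_strategy='mirrored', num_gpus=1)
with strategy.scope():
  task = task_factory.get_task(params.task, logging_dir=model_dir)
train_lib.run_experiment(
    distribution_strategy=strategy,
    task=task,
    mode='train_and_eval',
    params=params,
    model_dir=model_dir)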
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
task:
  model:
    encoder:
      any:  # Roformer encoder.
        attention_dropout_rate: 0.1
        dropout_rate: 0.1
        embedding_size: 768
@@ -14,4 +14,4 @@ task:
        num_layers: 12
        type_vocab_size: 2
        vocab_size: 30522
      type: any
task:
  model:
    encoder:
      any:  # Roformer encoder.
        attention_dropout_rate: 0.1
        dropout_rate: 0.1
        embedding_size: 128
@@ -14,4 +14,4 @@ task:
        num_layers: 12
        type_vocab_size: 2
        vocab_size: 30522
      type: any