ModelZoo / ResNet50_tensorflow · Commits

Commit 002b4240, authored Oct 07, 2021 by Frederick Liu, committed by A. Unique TensorFlower on Oct 07, 2021.

[keras_nlp] Merge keras_nlp into tf_nlp.

PiperOrigin-RevId: 401593694
Parent: 03c096ab

Changes: 27 — showing 20 changed files with 1276 additions and 622 deletions.
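The pattern repeated across the diffs below: each layer implementation moves under `official.nlp.modeling.layers`, and the corresponding `official.nlp.keras_nlp.layers` module keeps only an alias to it, so existing imports keep working. A minimal sketch of what that means for calling code (hypothetical usage, not part of the diff; it assumes the keras_nlp `layers` package re-exports the module-level aliases added in this commit):

```python
# Hypothetical usage sketch. After this commit the keras_nlp modules
# re-export the implementations that now live under
# official.nlp.modeling.layers, so both import paths should name one class.
from official.nlp.keras_nlp import layers as keras_nlp_layers
from official.nlp.modeling import layers as modeling_layers

block = modeling_layers.TransformerEncoderBlock(
    num_attention_heads=8, inner_dim=2048, inner_activation="relu")

# Assumes the keras_nlp layers package re-exports the alias shown below.
assert (keras_nlp_layers.TransformerEncoderBlock
        is modeling_layers.TransformerEncoderBlock)
```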
Changed files:

- official/nlp/keras_nlp/layers/masked_lm.py (+2 −106)
- official/nlp/keras_nlp/layers/on_device_embedding.py (+2 −89)
- official/nlp/keras_nlp/layers/position_embedding.py (+2 −70)
- official/nlp/keras_nlp/layers/self_attention_mask.py (+2 −37)
- official/nlp/keras_nlp/layers/transformer_encoder_block.py (+2 −290)
- official/nlp/modeling/layers/README.md (+4 −0)
- official/nlp/modeling/layers/__init__.py (+3 −0)
- official/nlp/modeling/layers/masked_lm.py (+105 −2)
- official/nlp/modeling/layers/mobile_bert_layers.py (+5 −4)
- official/nlp/modeling/layers/on_device_embedding.py (+87 −2)
- official/nlp/modeling/layers/on_device_embedding_test.py (+213 −0)
- official/nlp/modeling/layers/position_embedding.py (+70 −0)
- official/nlp/modeling/layers/position_embedding_test.py (+107 −0)
- official/nlp/modeling/layers/self_attention_mask.py (+32 −13)
- official/nlp/modeling/layers/transformer.py (+2 −2)
- official/nlp/modeling/layers/transformer_encoder_block.py (+308 −0)
- official/nlp/modeling/layers/transformer_encoder_block_test.py (+324 −0)
- official/nlp/modeling/models/seq2seq_transformer.py (+2 −3)
- official/nlp/modeling/networks/__init__.py (+1 −0)
- official/nlp/modeling/networks/albert_encoder.py (+3 −4)
official/nlp/keras_nlp/layers/masked_lm.py (+2 −106), hunk @@ -13,111 +13,7 @@

The module is reduced to a re-export of the implementation that now lives under official.nlp.modeling.layers:

```python
"""Masked language model network."""
from official.nlp.modeling import layers

MaskedLM = layers.MaskedLM
```

Removed: the `# pylint: disable=g-classes-have-attributes` directive, `import tensorflow as tf`, the `@tf.keras.utils.register_keras_serializable(package='keras_nlp')` decorator, and the full `MaskedLM(tf.keras.layers.Layer)` implementation. The removed class body is identical to the one added to official/nlp/modeling/layers/masked_lm.py later in this diff.
official/nlp/keras_nlp/layers/on_device_embedding.py (+2 −89), hunk @@ -13,94 +13,7 @@

The module is reduced to a re-export:

```python
"""Keras-based one-hot embedding layer."""
from official.nlp.modeling import layers

OnDeviceEmbedding = layers.OnDeviceEmbedding
```

Removed: the pylint directive, `import tensorflow as tf`, the `@tf.keras.utils.register_keras_serializable(package="keras_nlp")` decorator, and the full `OnDeviceEmbedding(tf.keras.layers.Layer)` implementation, identical to the class body added to official/nlp/modeling/layers/on_device_embedding.py later in this diff.
official/nlp/keras_nlp/layers/position_embedding.py (+2 −70), hunk @@ -13,75 +13,7 @@

The module is reduced to a re-export:

```python
"""Keras-based positional embedding layer."""
from official.nlp.modeling import layers

PositionEmbedding = layers.PositionEmbedding
```

Removed: the pylint directive, `import tensorflow as tf`, the `@tf.keras.utils.register_keras_serializable(package="keras_nlp")` decorator, and the full `PositionEmbedding(tf.keras.layers.Layer)` implementation, identical to the class added to official/nlp/modeling/layers/position_embedding.py later in this diff.
official/nlp/keras_nlp/layers/self_attention_mask.py (+2 −37), hunk @@ -14,42 +14,7 @@

The module is reduced to a re-export:

```python
"""Keras layer that creates a self-attention mask."""
from official.nlp.modeling import layers

SelfAttentionMask = layers.SelfAttentionMask
```

Removed: `import tensorflow as tf`, the `@tf.keras.utils.register_keras_serializable(package='keras_nlp')` decorator, and the `SelfAttentionMask(tf.keras.layers.Layer)` implementation; its mask-building code (with a `call(self, inputs, to_mask)` signature) reappears in official/nlp/modeling/layers/self_attention_mask.py later in this diff.
official/nlp/keras_nlp/layers/transformer_encoder_block.py (+2 −290), hunk @@ -14,295 +14,7 @@

The module is reduced to a re-export:

```python
"""Keras-based TransformerEncoder block layer."""
from official.nlp.modeling import layers

TransformerEncoderBlock = layers.TransformerEncoderBlock
```

Removed implementation (this code moves to official/nlp/modeling/layers/transformer_encoder_block.py, whose +308-line diff is not expanded on this page):

```python
import tensorflow as tf


@tf.keras.utils.register_keras_serializable(package="keras_nlp")
class TransformerEncoderBlock(tf.keras.layers.Layer):
  """TransformerEncoderBlock layer.

  This layer implements the Transformer Encoder from
  "Attention Is All You Need" (https://arxiv.org/abs/1706.03762),
  which combines a `tf.keras.layers.MultiHeadAttention` layer with a
  two-layer feedforward network.

  References:
    [Attention Is All You Need](https://arxiv.org/abs/1706.03762)
    [BERT: Pre-training of Deep Bidirectional Transformers for Language
    Understanding](https://arxiv.org/abs/1810.04805)
  """

  def __init__(self,
               num_attention_heads,
               inner_dim,
               inner_activation,
               output_range=None,
               kernel_initializer="glorot_uniform",
               bias_initializer="zeros",
               kernel_regularizer=None,
               bias_regularizer=None,
               activity_regularizer=None,
               kernel_constraint=None,
               bias_constraint=None,
               use_bias=True,
               norm_first=False,
               norm_epsilon=1e-12,
               output_dropout=0.0,
               attention_dropout=0.0,
               inner_dropout=0.0,
               attention_initializer=None,
               attention_axes=None,
               **kwargs):
    """Initializes `TransformerEncoderBlock`.

    Args:
      num_attention_heads: Number of attention heads.
      inner_dim: The output dimension of the first Dense layer in a two-layer
        feedforward network.
      inner_activation: The activation for the first Dense layer in a two-layer
        feedforward network.
      output_range: the sequence output range, [0, output_range) for slicing
        the target sequence. `None` means the target sequence is not sliced.
      kernel_initializer: Initializer for dense layer kernels.
      bias_initializer: Initializer for dense layer biases.
      kernel_regularizer: Regularizer for dense layer kernels.
      bias_regularizer: Regularizer for dense layer biases.
      activity_regularizer: Regularizer for dense layer activity.
      kernel_constraint: Constraint for dense layer kernels.
      bias_constraint: Constraint for dense layer biases.
      use_bias: Whether to enable use_bias in attention layer. If set False,
        use_bias in attention layer is disabled.
      norm_first: Whether to normalize inputs to attention and intermediate
        dense layers. If set False, output of attention and intermediate dense
        layers is normalized.
      norm_epsilon: Epsilon value to initialize normalization layers.
      output_dropout: Dropout probability for the post-attention and output
        dropout.
      attention_dropout: Dropout probability for within the attention layer.
      inner_dropout: Dropout probability for the first Dense layer in a
        two-layer feedforward network.
      attention_initializer: Initializer for kernels of attention layers. If
        set `None`, attention layers use kernel_initializer as initializer for
        kernel.
      attention_axes: axes over which the attention is applied. `None` means
        attention over all axes, but batch, heads, and features.
      **kwargs: keyword arguments.
    """
    super().__init__(**kwargs)

    self._num_heads = num_attention_heads
    self._inner_dim = inner_dim
    self._inner_activation = inner_activation
    self._attention_dropout = attention_dropout
    self._attention_dropout_rate = attention_dropout
    self._output_dropout = output_dropout
    self._output_dropout_rate = output_dropout
    self._output_range = output_range
    self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
    self._bias_initializer = tf.keras.initializers.get(bias_initializer)
    self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
    self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer)
    self._activity_regularizer = tf.keras.regularizers.get(activity_regularizer)
    self._kernel_constraint = tf.keras.constraints.get(kernel_constraint)
    self._bias_constraint = tf.keras.constraints.get(bias_constraint)
    self._use_bias = use_bias
    self._norm_first = norm_first
    self._norm_epsilon = norm_epsilon
    self._inner_dropout = inner_dropout
    if attention_initializer:
      self._attention_initializer = tf.keras.initializers.get(
          attention_initializer)
    else:
      self._attention_initializer = self._kernel_initializer
    self._attention_axes = attention_axes

  def build(self, input_shape):
    if isinstance(input_shape, tf.TensorShape):
      input_tensor_shape = input_shape
    elif isinstance(input_shape, (list, tuple)):
      input_tensor_shape = tf.TensorShape(input_shape[0])
    else:
      raise ValueError(
          "The type of input shape argument is not supported, got: %s" %
          type(input_shape))
    einsum_equation = "abc,cd->abd"
    if len(input_tensor_shape.as_list()) > 3:
      einsum_equation = "...bc,cd->...bd"
    hidden_size = input_tensor_shape[-1]
    if hidden_size % self._num_heads != 0:
      raise ValueError(
          "The input size (%d) is not a multiple of the number of attention "
          "heads (%d)" % (hidden_size, self._num_heads))
    self._attention_head_size = int(hidden_size // self._num_heads)
    common_kwargs = dict(
        bias_initializer=self._bias_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer,
        activity_regularizer=self._activity_regularizer,
        kernel_constraint=self._kernel_constraint,
        bias_constraint=self._bias_constraint)
    self._attention_layer = tf.keras.layers.MultiHeadAttention(
        num_heads=self._num_heads,
        key_dim=self._attention_head_size,
        dropout=self._attention_dropout,
        use_bias=self._use_bias,
        kernel_initializer=self._attention_initializer,
        attention_axes=self._attention_axes,
        name="self_attention",
        **common_kwargs)
    self._attention_dropout = tf.keras.layers.Dropout(rate=self._output_dropout)
    # Use float32 in layernorm for numeric stability.
    # It is probably safe in mixed_float16, but we haven't validated this yet.
    self._attention_layer_norm = (
        tf.keras.layers.LayerNormalization(
            name="self_attention_layer_norm",
            axis=-1,
            epsilon=self._norm_epsilon,
            dtype=tf.float32))
    self._intermediate_dense = tf.keras.layers.experimental.EinsumDense(
        einsum_equation,
        output_shape=(None, self._inner_dim),
        bias_axes="d",
        kernel_initializer=self._kernel_initializer,
        name="intermediate",
        **common_kwargs)
    policy = tf.keras.mixed_precision.global_policy()
    if policy.name == "mixed_bfloat16":
      # bfloat16 causes BERT with the LAMB optimizer to not converge
      # as well, so we use float32.
      # TODO(b/154538392): Investigate this.
      policy = tf.float32
    self._intermediate_activation_layer = tf.keras.layers.Activation(
        self._inner_activation, dtype=policy)
    self._inner_dropout_layer = tf.keras.layers.Dropout(
        rate=self._inner_dropout)
    self._output_dense = tf.keras.layers.experimental.EinsumDense(
        einsum_equation,
        output_shape=(None, hidden_size),
        bias_axes="d",
        name="output",
        kernel_initializer=self._kernel_initializer,
        **common_kwargs)
    self._output_dropout = tf.keras.layers.Dropout(rate=self._output_dropout)
    # Use float32 in layernorm for numeric stability.
    self._output_layer_norm = tf.keras.layers.LayerNormalization(
        name="output_layer_norm",
        axis=-1,
        epsilon=self._norm_epsilon,
        dtype=tf.float32)

    super(TransformerEncoderBlock, self).build(input_shape)

  def get_config(self):
    config = {
        "num_attention_heads": self._num_heads,
        "inner_dim": self._inner_dim,
        "inner_activation": self._inner_activation,
        "output_dropout": self._output_dropout_rate,
        "attention_dropout": self._attention_dropout_rate,
        "output_range": self._output_range,
        "kernel_initializer":
            tf.keras.initializers.serialize(self._kernel_initializer),
        "bias_initializer":
            tf.keras.initializers.serialize(self._bias_initializer),
        "kernel_regularizer":
            tf.keras.regularizers.serialize(self._kernel_regularizer),
        "bias_regularizer":
            tf.keras.regularizers.serialize(self._bias_regularizer),
        "activity_regularizer":
            tf.keras.regularizers.serialize(self._activity_regularizer),
        "kernel_constraint":
            tf.keras.constraints.serialize(self._kernel_constraint),
        "bias_constraint":
            tf.keras.constraints.serialize(self._bias_constraint),
        "use_bias": self._use_bias,
        "norm_first": self._norm_first,
        "norm_epsilon": self._norm_epsilon,
        "inner_dropout": self._inner_dropout,
        "attention_initializer":
            tf.keras.initializers.serialize(self._attention_initializer),
        "attention_axes": self._attention_axes,
    }
    base_config = super(TransformerEncoderBlock, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def call(self, inputs):
    """Transformer self-attention encoder block call.

    Args:
      inputs: a single tensor or a list of tensors.
        `input tensor` as the single sequence of embeddings.
        [`input tensor`, `attention mask`] to have the additional attention
          mask.
        [`query tensor`, `key value tensor`, `attention mask`] to have separate
          input streams for the query, and key/value to the multi-head
          attention.

    Returns:
      An output tensor with the same dimensions as input/query tensor.
    """
    if isinstance(inputs, (list, tuple)):
      if len(inputs) == 2:
        input_tensor, attention_mask = inputs
        key_value = None
      elif len(inputs) == 3:
        input_tensor, key_value, attention_mask = inputs
      else:
        raise ValueError("Unexpected inputs to %s with length at %d" %
                         (self.__class__, len(inputs)))
    else:
      input_tensor, key_value, attention_mask = (inputs, None, None)

    if self._output_range:
      if self._norm_first:
        source_tensor = input_tensor[:, 0:self._output_range, :]
        input_tensor = self._attention_layer_norm(input_tensor)
        if key_value is not None:
          key_value = self._attention_layer_norm(key_value)
      target_tensor = input_tensor[:, 0:self._output_range, :]
      if attention_mask is not None:
        attention_mask = attention_mask[:, 0:self._output_range, :]
    else:
      if self._norm_first:
        source_tensor = input_tensor
        input_tensor = self._attention_layer_norm(input_tensor)
        if key_value is not None:
          key_value = self._attention_layer_norm(key_value)
      target_tensor = input_tensor

    if key_value is None:
      key_value = input_tensor
    attention_output = self._attention_layer(
        query=target_tensor, value=key_value, attention_mask=attention_mask)
    attention_output = self._attention_dropout(attention_output)
    if self._norm_first:
      attention_output = source_tensor + attention_output
    else:
      attention_output = self._attention_layer_norm(target_tensor +
                                                    attention_output)
    if self._norm_first:
      source_attention_output = attention_output
      attention_output = self._output_layer_norm(attention_output)
    inner_output = self._intermediate_dense(attention_output)
    inner_output = self._intermediate_activation_layer(inner_output)
    inner_output = self._inner_dropout_layer(inner_output)
    layer_output = self._output_dense(inner_output)
    layer_output = self._output_dropout(layer_output)

    if self._norm_first:
      return source_attention_output + layer_output

    # During mixed precision training, layer norm output is always fp32 for
    # now. Casts fp32 for the subsequent add.
    layer_output = tf.cast(layer_output, tf.float32)
    return self._output_layer_norm(layer_output + attention_output)
```
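The docstring of `call` above describes the three input conventions the block accepts. A minimal usage sketch (hypothetical shapes and hyperparameters, not part of the diff; it assumes `official.nlp.modeling.layers` exports the class, as the `__init__.py` change later in this diff adds):

```python
import tensorflow as tf
from official.nlp.modeling import layers

# Hidden size 64 must be divisible by the number of heads; build() checks this.
block = layers.TransformerEncoderBlock(
    num_attention_heads=4, inner_dim=256, inner_activation="relu")

embeddings = tf.random.uniform([2, 16, 64])   # [batch, seq_len, hidden]
mask = tf.ones([2, 16, 16])                   # [batch, from_seq, to_seq]

out_plain = block(embeddings)                 # single-tensor input
out_masked = block([embeddings, mask])        # [input tensor, attention mask]

# The block keeps the input/query dimensions.
assert out_plain.shape == embeddings.shape
assert out_masked.shape == embeddings.shape
```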
official/nlp/modeling/layers/README.md (+4 −0), hunk @@ -121,3 +121,7 @@

Added after the existing entry "[BertTokenizer](text_layers.py) and [SentencepieceTokenizer](text_layers.py) implements the layer to tokenize raw text and pack them into the inputs for BERT models.":

* [TransformerEncoderBlock](transformer_encoder_block.py) implements
  an optionally masked transformer as described in
  ["Attention Is All You Need"](https://arxiv.org/abs/1706.03762).
official/nlp/modeling/layers/__init__.py (+3 −0)

Three imports are added to the package's public surface:

```diff
@@ -22,6 +22,7 @@ from official.nlp.modeling.layers.bigbird_attention import BigBirdAttention
 from official.nlp.modeling.layers.bigbird_attention import BigBirdMasks
 from official.nlp.modeling.layers.cls_head import *
 from official.nlp.modeling.layers.dense_einsum import DenseEinsum
+from official.nlp.modeling.layers.exbert_layers import *
 from official.nlp.modeling.layers.gated_feedforward import GatedFeedforward
 from official.nlp.modeling.layers.gaussian_process import RandomFeatureGaussianProcess
 from official.nlp.modeling.layers.kernel_attention import KernelAttention
@@ -34,6 +35,7 @@ from official.nlp.modeling.layers.mobile_bert_layers import MobileBertMaskedLM
 from official.nlp.modeling.layers.mobile_bert_layers import MobileBertTransformer
 from official.nlp.modeling.layers.multi_channel_attention import *
 from official.nlp.modeling.layers.on_device_embedding import OnDeviceEmbedding
+from official.nlp.modeling.layers.position_embedding import PositionEmbedding
 from official.nlp.modeling.layers.position_embedding import RelativePositionBias
 from official.nlp.modeling.layers.position_embedding import RelativePositionEmbedding
 from official.nlp.modeling.layers.relative_attention import MultiHeadRelativeAttention
@@ -47,6 +49,7 @@ from official.nlp.modeling.layers.text_layers import BertTokenizer
 from official.nlp.modeling.layers.text_layers import SentencepieceTokenizer
 from official.nlp.modeling.layers.tn_transformer_expand_condense import TNTransformerExpandCondense
 from official.nlp.modeling.layers.transformer import *
+from official.nlp.modeling.layers.transformer_encoder_block import TransformerEncoderBlock
 from official.nlp.modeling.layers.transformer_scaffold import TransformerScaffold
 from official.nlp.modeling.layers.transformer_xl import TransformerXL
 from official.nlp.modeling.layers.transformer_xl import TransformerXLBlock
```
official/nlp/modeling/layers/masked_lm.py (+105 −2), hunk @@ -14,7 +14,110 @@

Removed the alias to keras_nlp:

```python
from official.nlp import keras_nlp

MaskedLM = keras_nlp.layers.MaskedLM
```

Added the implementation (previously in official/nlp/keras_nlp/layers/masked_lm.py, now registered under the 'Text' package):

````python
import tensorflow as tf


@tf.keras.utils.register_keras_serializable(package='Text')
class MaskedLM(tf.keras.layers.Layer):
  """Masked language model network head for BERT modeling.

  This layer implements a masked language model based on the provided
  transformer based encoder. It assumes that the encoder network being passed
  has a "get_embedding_table()" method.

  Example:
  ```python
  encoder=keras_nlp.BertEncoder(...)
  lm_layer=MaskedLM(embedding_table=encoder.get_embedding_table())
  ```

  Args:
    embedding_table: The embedding table from encoder network.
    activation: The activation, if any, for the dense layer.
    initializer: The initializer for the dense layer. Defaults to a Glorot
      uniform initializer.
    output: The output style for this layer. Can be either 'logits' or
      'predictions'.
  """

  def __init__(self,
               embedding_table,
               activation=None,
               initializer='glorot_uniform',
               output='logits',
               name=None,
               **kwargs):
    super(MaskedLM, self).__init__(name=name, **kwargs)
    self.embedding_table = embedding_table
    self.activation = activation
    self.initializer = tf.keras.initializers.get(initializer)

    if output not in ('predictions', 'logits'):
      raise ValueError(
          ('Unknown `output` value "%s". `output` can be either "logits" or '
           '"predictions"') % output)
    self._output_type = output

  def build(self, input_shape):
    self._vocab_size, hidden_size = self.embedding_table.shape
    self.dense = tf.keras.layers.Dense(
        hidden_size,
        activation=self.activation,
        kernel_initializer=self.initializer,
        name='transform/dense')
    self.layer_norm = tf.keras.layers.LayerNormalization(
        axis=-1, epsilon=1e-12, name='transform/LayerNorm')
    self.bias = self.add_weight(
        'output_bias/bias',
        shape=(self._vocab_size,),
        initializer='zeros',
        trainable=True)

    super(MaskedLM, self).build(input_shape)

  def call(self, sequence_data, masked_positions):
    masked_lm_input = self._gather_indexes(sequence_data, masked_positions)
    lm_data = self.dense(masked_lm_input)
    lm_data = self.layer_norm(lm_data)
    lm_data = tf.matmul(lm_data, self.embedding_table, transpose_b=True)
    logits = tf.nn.bias_add(lm_data, self.bias)
    masked_positions_length = masked_positions.shape.as_list()[1] or tf.shape(
        masked_positions)[1]
    logits = tf.reshape(logits,
                        [-1, masked_positions_length, self._vocab_size])
    if self._output_type == 'logits':
      return logits
    return tf.nn.log_softmax(logits)

  def get_config(self):
    raise NotImplementedError('MaskedLM cannot be directly serialized because '
                              'it has variable sharing logic.')

  def _gather_indexes(self, sequence_tensor, positions):
    """Gathers the vectors at the specific positions, for performance.

    Args:
      sequence_tensor: Sequence output of shape
        (`batch_size`, `seq_length`, num_hidden) where num_hidden is number of
        hidden units.
      positions: Positions ids of tokens in sequence to mask for pretraining
        of with dimension (batch_size, num_predictions) where
        `num_predictions` is maximum number of tokens to mask out and predict
        per each sequence.

    Returns:
      Masked out sequence tensor of shape (batch_size * num_predictions,
      num_hidden).
    """
    sequence_shape = tf.shape(sequence_tensor)
    batch_size, seq_length = sequence_shape[0], sequence_shape[1]
    width = sequence_tensor.shape.as_list()[2] or sequence_shape[2]

    flat_offsets = tf.reshape(
        tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1])
    flat_positions = tf.reshape(positions + flat_offsets, [-1])
    flat_sequence_tensor = tf.reshape(sequence_tensor,
                                      [batch_size * seq_length, width])
    output_tensor = tf.gather(flat_sequence_tensor, flat_positions)

    return output_tensor
````
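The class docstring shows only construction against an encoder's embedding table; here is a slightly fuller call sketch, with hypothetical shapes and a stand-in variable in place of `encoder.get_embedding_table()` (not part of the diff):

```python
import tensorflow as tf
from official.nlp.modeling.layers import masked_lm

vocab_size, hidden_size = 100, 32
# Stand-in for encoder.get_embedding_table(): any [vocab, hidden] variable.
embedding_table = tf.Variable(tf.random.normal([vocab_size, hidden_size]))

lm_head = masked_lm.MaskedLM(embedding_table=embedding_table, output='logits')

sequence_output = tf.random.uniform([2, 16, hidden_size])  # encoder output
masked_positions = tf.constant([[1, 4, 7], [2, 3, 9]])     # [batch, num_predictions]

logits = lm_head(sequence_output, masked_positions)
assert logits.shape == (2, 3, vocab_size)  # [batch, num_predictions, vocab]
```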
official/nlp/modeling/layers/mobile_bert_layers.py (+5 −4)

MobileBERT stops importing through the keras_nlp facade and uses the modeling-layer modules directly:

```diff
@@ -15,7 +15,8 @@
 """MobileBERT embedding and transformer layers."""
 import tensorflow as tf
 
-from official.nlp import keras_nlp
+from official.nlp.modeling.layers import on_device_embedding
+from official.nlp.modeling.layers import position_embedding
 
 
 @tf.keras.utils.register_keras_serializable(package='Text')
@@ -105,17 +106,17 @@ class MobileBertEmbedding(tf.keras.layers.Layer):
     self.initializer = tf.keras.initializers.get(initializer)
     self.dropout_rate = dropout_rate
 
-    self.word_embedding = keras_nlp.layers.OnDeviceEmbedding(
+    self.word_embedding = on_device_embedding.OnDeviceEmbedding(
         self.word_vocab_size,
         self.word_embed_size,
         initializer=initializer,
         name='word_embedding')
-    self.type_embedding = keras_nlp.layers.OnDeviceEmbedding(
+    self.type_embedding = on_device_embedding.OnDeviceEmbedding(
         self.type_vocab_size,
         self.output_embed_size,
         initializer=initializer,
         name='type_embedding')
-    self.pos_embedding = keras_nlp.layers.PositionEmbedding(
+    self.pos_embedding = position_embedding.PositionEmbedding(
         max_length=max_sequence_length,
         initializer=initializer,
         name='position_embedding')
```
official/nlp/modeling/layers/on_device_embedding.py (+87 −2), hunk @@ -15,7 +15,92 @@

Removed the alias to keras_nlp:

```python
from official.nlp import keras_nlp

OnDeviceEmbedding = keras_nlp.layers.OnDeviceEmbedding
```

Added the implementation, registered under the 'Text' package:

```python
import tensorflow as tf


@tf.keras.utils.register_keras_serializable(package="Text")
class OnDeviceEmbedding(tf.keras.layers.Layer):
  """Performs an embedding lookup suitable for accelerator devices.

  This layer uses either tf.gather or tf.one_hot to translate integer indices
  to float embeddings.

  Args:
    vocab_size: Number of elements in the vocabulary.
    embedding_width: Output size of the embedding layer.
    initializer: The initializer to use for the embedding weights. Defaults to
      "glorot_uniform".
    use_one_hot: Whether to use tf.one_hot over tf.gather for the embedding
      lookup. Defaults to False (that is, using tf.gather). Setting this
      option to True may improve performance, especially on small vocabulary
      sizes, but will generally require more memory.
    scale_factor: Whether to scale the output embeddings. Defaults to None
      (that is, no scaling). Setting this option to a float multiplies the
      output embeddings by scale_factor.
  """

  def __init__(self,
               vocab_size,
               embedding_width,
               initializer="glorot_uniform",
               use_one_hot=False,
               scale_factor=None,
               **kwargs):
    super(OnDeviceEmbedding, self).__init__(**kwargs)
    self._vocab_size = vocab_size
    self._embedding_width = embedding_width
    self._initializer = initializer
    self._use_one_hot = use_one_hot
    self._scale_factor = scale_factor

  def get_config(self):
    config = {
        "vocab_size": self._vocab_size,
        "embedding_width": self._embedding_width,
        "initializer": self._initializer,
        "use_one_hot": self._use_one_hot,
        "scale_factor": self._scale_factor,
    }
    base_config = super(OnDeviceEmbedding, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def build(self, input_shape):
    self.embeddings = self.add_weight(
        "embeddings",
        shape=[self._vocab_size, self._embedding_width],
        initializer=self._initializer,
        dtype=tf.float32)

    super(OnDeviceEmbedding, self).build(input_shape)

  def call(self, inputs):
    flat_inputs = tf.reshape(inputs, [-1])
    if self._use_one_hot:
      dtype = self._compute_dtype
      if not tf.dtypes.as_dtype(dtype).is_floating:
        # TensorFlow 1 compatibility. In TF1, self._compute_dtype is int32
        # instead of a floating-point dtype, as the dtype is inferred from the
        # dtype of the inputs.
        dtype = tf.float32
      one_hot_data = tf.one_hot(
          flat_inputs, depth=self._vocab_size, dtype=dtype)
      embeddings = tf.matmul(one_hot_data, self.embeddings)
    else:
      embeddings = tf.gather(self.embeddings, flat_inputs)
    embeddings = tf.reshape(
        embeddings,
        # Work around b/142213824: prefer concat to shape over a Python list.
        tf.concat([tf.shape(inputs), [self._embedding_width]], axis=0))
    embeddings.set_shape(inputs.shape.as_list() + [self._embedding_width])
    if self._scale_factor:
      embeddings *= self._scale_factor
    return embeddings

  @property
  def vocab_size(self):
    return self._vocab_size

  @property
  def embedding_width(self):
    return self._embedding_width
```
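As the docstring above notes, `use_one_hot=True` swaps the `tf.gather` lookup for a one-hot matmul that can be faster on accelerators at the cost of memory. A minimal sketch with hypothetical sizes showing the two paths agree once their weights match (not part of the diff):

```python
import tensorflow as tf
from official.nlp.modeling.layers import on_device_embedding

ids = tf.constant([[1, 2, 3], [4, 5, 0]])  # [batch, seq_len] integer ids

gather_layer = on_device_embedding.OnDeviceEmbedding(
    vocab_size=8, embedding_width=4)
one_hot_layer = on_device_embedding.OnDeviceEmbedding(
    vocab_size=8, embedding_width=4, use_one_hot=True)

out = gather_layer(ids)   # shape [2, 3, 4], via tf.gather
_ = one_hot_layer(ids)    # same shape, via one-hot matmul

# With identical weights the two lookup paths agree (up to float rounding).
one_hot_layer.set_weights(gather_layer.get_weights())
tf.debugging.assert_near(gather_layer(ids), one_hot_layer(ids))
```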
official/nlp/modeling/layers/on_device_embedding_test.py (+213 −0, new file)

```python
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for Keras-based one-hot embedding layer."""

import numpy as np
import tensorflow as tf

from tensorflow.python.keras import keras_parameterized  # pylint: disable=g-direct-tensorflow-import
from official.nlp.modeling.layers import on_device_embedding


# This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
# guarantees forward compatibility of this code for the V2 switchover.
@keras_parameterized.run_all_keras_modes
class OnDeviceEmbeddingTest(keras_parameterized.TestCase):

  def test_layer_creation(self):
    vocab_size = 31
    embedding_width = 27
    test_layer = on_device_embedding.OnDeviceEmbedding(
        vocab_size=vocab_size, embedding_width=embedding_width)
    # Create a 2-dimensional input (the first dimension is implicit).
    sequence_length = 23
    input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32)
    output_tensor = test_layer(input_tensor)

    # The output should be the same as the input, save that it has an extra
    # embedding_width dimension on the end.
    expected_output_shape = [None, sequence_length, embedding_width]
    self.assertEqual(expected_output_shape, output_tensor.shape.as_list())
    self.assertEqual(output_tensor.dtype, tf.float32)

  def test_layer_creation_with_mixed_precision(self):
    vocab_size = 31
    embedding_width = 27
    test_layer = on_device_embedding.OnDeviceEmbedding(
        vocab_size=vocab_size, embedding_width=embedding_width,
        dtype="mixed_float16")
    # Create a 2-dimensional input (the first dimension is implicit).
    sequence_length = 23
    input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32)
    output_tensor = test_layer(input_tensor)

    # The output should be the same as the input, save that it has an extra
    # embedding_width dimension on the end.
    expected_output_shape = [None, sequence_length, embedding_width]
    self.assertEqual(expected_output_shape, output_tensor.shape.as_list())
    self.assertEqual(output_tensor.dtype, tf.float16)

  def test_layer_invocation(self):
    vocab_size = 31
    embedding_width = 27
    test_layer = on_device_embedding.OnDeviceEmbedding(
        vocab_size=vocab_size, embedding_width=embedding_width)
    # Create a 2-dimensional input (the first dimension is implicit).
    sequence_length = 23
    input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32)
    output_tensor = test_layer(input_tensor)

    # Create a model from the test layer.
    model = tf.keras.Model(input_tensor, output_tensor)

    # Invoke the model on test data. We can't validate the output data itself
    # (the NN is too complex) but this will rule out structural runtime errors.
    batch_size = 3
    input_data = np.random.randint(
        vocab_size, size=(batch_size, sequence_length))
    output = model.predict(input_data)
    self.assertEqual(tf.float32, output.dtype)

  def test_layer_invocation_with_mixed_precision(self):
    vocab_size = 31
    embedding_width = 27
    test_layer = on_device_embedding.OnDeviceEmbedding(
        vocab_size=vocab_size, embedding_width=embedding_width,
        dtype="mixed_float16")
    # Create a 2-dimensional input (the first dimension is implicit).
    sequence_length = 23
    input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32)
    output_tensor = test_layer(input_tensor)

    # Create a model from the test layer.
    model = tf.keras.Model(input_tensor, output_tensor)

    # Invoke the model on test data. We can't validate the output data itself
    # (the NN is too complex) but this will rule out structural runtime errors.
    batch_size = 3
    input_data = np.random.randint(
        vocab_size, size=(batch_size, sequence_length))
    output = model.predict(input_data)
    self.assertEqual(tf.float16, output.dtype)

  def test_one_hot_layer_creation(self):
    vocab_size = 31
    embedding_width = 27
    test_layer = on_device_embedding.OnDeviceEmbedding(
        vocab_size=vocab_size, embedding_width=embedding_width,
        use_one_hot=True)
    # Create a 2-dimensional input (the first dimension is implicit).
    sequence_length = 23
    input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32)
    output_tensor = test_layer(input_tensor)

    # The output should be the same as the input, save that it has an extra
    # embedding_width dimension on the end.
    expected_output_shape = [None, sequence_length, embedding_width]
    self.assertEqual(expected_output_shape, output_tensor.shape.as_list())
    self.assertEqual(output_tensor.dtype, tf.float32)

  def test_one_hot_layer_creation_with_mixed_precision(self):
    vocab_size = 31
    embedding_width = 27
    test_layer = on_device_embedding.OnDeviceEmbedding(
        vocab_size=vocab_size, embedding_width=embedding_width,
        dtype="mixed_float16", use_one_hot=True)
    # Create a 2-dimensional input (the first dimension is implicit).
    sequence_length = 23
    input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32)
    output_tensor = test_layer(input_tensor)

    # The output should be the same as the input, save that it has an extra
    # embedding_width dimension on the end.
    expected_output_shape = [None, sequence_length, embedding_width]
    self.assertEqual(expected_output_shape, output_tensor.shape.as_list())
    self.assertEqual(output_tensor.dtype, tf.float16)

  def test_one_hot_layer_invocation(self):
    vocab_size = 31
    embedding_width = 27
    test_layer = on_device_embedding.OnDeviceEmbedding(
        vocab_size=vocab_size, embedding_width=embedding_width,
        use_one_hot=True)
    # Create a 2-dimensional input (the first dimension is implicit).
    sequence_length = 23
    input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32)
    output_tensor = test_layer(input_tensor)

    # Create a model from the test layer.
    model = tf.keras.Model(input_tensor, output_tensor)

    # Invoke the model on test data. We can't validate the output data itself
    # (the NN is too complex) but this will rule out structural runtime errors.
    batch_size = 3
    input_data = np.random.randint(
        vocab_size, size=(batch_size, sequence_length))
    output = model.predict(input_data)
    self.assertEqual(tf.float32, output.dtype)

  def test_one_hot_layer_invocation_with_mixed_precision(self):
    vocab_size = 31
    embedding_width = 27
    test_layer = on_device_embedding.OnDeviceEmbedding(
        vocab_size=vocab_size, embedding_width=embedding_width,
        dtype="mixed_float16", use_one_hot=True)
    # Create a 2-dimensional input (the first dimension is implicit).
    sequence_length = 23
    input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32)
    output_tensor = test_layer(input_tensor)

    # Create a model from the test layer.
    model = tf.keras.Model(input_tensor, output_tensor)

    # Invoke the model on test data. We can't validate the output data itself
    # (the NN is too complex) but this will rule out structural runtime errors.
    batch_size = 3
    input_data = np.random.randint(
        vocab_size, size=(batch_size, sequence_length))
    output = model.predict(input_data)
    self.assertEqual(tf.float16, output.dtype)

  def test_use_scale_layer_invocation(self):
    vocab_size = 31
    embedding_width = 27
    test_layer = on_device_embedding.OnDeviceEmbedding(
        vocab_size=vocab_size, embedding_width=embedding_width,
        scale_factor=embedding_width**0.5)
    # Create a 2-dimensional input (the first dimension is implicit).
    sequence_length = 23
    input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32)
    output_tensor = test_layer(input_tensor)

    # Create a model from the test layer.
    model = tf.keras.Model(input_tensor, output_tensor)

    # Invoke the model on test data. We can't validate the output data itself
    # (the NN is too complex) but this will rule out structural runtime errors.
    batch_size = 3
    input_data = np.random.randint(
        vocab_size, size=(batch_size, sequence_length))
    output = model.predict(input_data)
    self.assertEqual(tf.float32, output.dtype)


if __name__ == "__main__":
  tf.test.main()
```
official/nlp/modeling/layers/position_embedding.py (+70 −0), hunk @@ -24,6 +24,76 @@

The `PositionEmbedding` class is added after the existing `Initializer = tf.keras.initializers.Initializer` alias and ahead of the existing `RelativePositionEmbedding` class:

````python
@tf.keras.utils.register_keras_serializable(package="Text")
class PositionEmbedding(tf.keras.layers.Layer):
  """Creates a positional embedding.

  Example:
  ```python
  position_embedding = PositionEmbedding(max_length=100)
  inputs = tf.keras.Input((100, 32), dtype=tf.float32)
  outputs = position_embedding(inputs)
  ```

  Args:
    max_length: The maximum size of the dynamic sequence.
    initializer: The initializer to use for the embedding weights. Defaults to
      "glorot_uniform".
    seq_axis: The axis of the input tensor where we add the embeddings.

  Reference: This layer creates a positional embedding as described in
  [BERT: Pre-training of Deep Bidirectional Transformers for Language
  Understanding](https://arxiv.org/abs/1810.04805).
  """

  def __init__(self,
               max_length,
               initializer="glorot_uniform",
               seq_axis=1,
               **kwargs):

    super(PositionEmbedding, self).__init__(**kwargs)
    if max_length is None:
      raise ValueError(
          "`max_length` must be an Integer, not `None`.")
    self._max_length = max_length
    self._initializer = tf.keras.initializers.get(initializer)
    self._seq_axis = seq_axis

  def get_config(self):
    config = {
        "max_length": self._max_length,
        "initializer": tf.keras.initializers.serialize(self._initializer),
        "seq_axis": self._seq_axis,
    }
    base_config = super(PositionEmbedding, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def build(self, input_shape):
    dimension_list = input_shape.as_list()
    width = dimension_list[-1]
    weight_sequence_length = self._max_length

    self._position_embeddings = self.add_weight(
        "embeddings",
        shape=[weight_sequence_length, width],
        initializer=self._initializer)

    super(PositionEmbedding, self).build(input_shape)

  def call(self, inputs):
    input_shape = tf.shape(inputs)
    actual_seq_len = input_shape[self._seq_axis]
    position_embeddings = self._position_embeddings[:actual_seq_len, :]
    new_shape = [1 for _ in inputs.get_shape().as_list()]
    new_shape[self._seq_axis] = actual_seq_len
    new_shape[-1] = position_embeddings.get_shape().as_list()[-1]
    position_embeddings = tf.reshape(position_embeddings, new_shape)
    return tf.broadcast_to(position_embeddings, input_shape)
````
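A minimal sketch of the slicing behavior `call` implements above: the `[max_length, width]` table is cut down to the actual sequence length and broadcast against the input (hypothetical sizes, not part of the diff):

```python
import tensorflow as tf
from official.nlp.modeling.layers import position_embedding

layer = position_embedding.PositionEmbedding(max_length=50)

short_inputs = tf.random.uniform([2, 17, 32])  # seq_len 17 < max_length 50
pos = layer(short_inputs)

# The [50, 32] embedding table is sliced to its first 17 rows and broadcast
# across the batch dimension, so the output matches the input's shape.
assert pos.shape == short_inputs.shape
```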
official/nlp/modeling/layers/position_embedding_test.py
View file @
002b4240
...
@@ -22,6 +22,113 @@ from tensorflow.python.keras import keras_parameterized # pylint: disable=g-dir
...
@@ -22,6 +22,113 @@ from tensorflow.python.keras import keras_parameterized # pylint: disable=g-dir
from
official.nlp.modeling.layers
import
position_embedding
from
official.nlp.modeling.layers
import
position_embedding
# This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
# guarantees forward compatibility of this code for the V2 switchover.
@
keras_parameterized
.
run_all_keras_modes
class
PositionEmbeddingLayerTest
(
keras_parameterized
.
TestCase
):
def
test_static_layer_output_shape
(
self
):
# Create a 3-dimensional input (the first dimension is implicit).
sequence_length
=
21
test_layer
=
position_embedding
.
PositionEmbedding
(
max_length
=
sequence_length
)
width
=
30
input_tensor
=
tf
.
keras
.
Input
(
shape
=
(
sequence_length
,
width
))
output_tensor
=
test_layer
(
input_tensor
)
# When using static positional embedding shapes, the output is expected
# to be the same as the input shape in all dimensions save batch.
expected_output_shape
=
[
None
,
sequence_length
,
width
]
self
.
assertEqual
(
expected_output_shape
,
output_tensor
.
shape
.
as_list
())
# The default output dtype for this layer should be tf.float32.
self
.
assertEqual
(
tf
.
float32
,
output_tensor
.
dtype
)
def
test_non_default_axis_static
(
self
):
# Create a 3-dimensional input (the first dimension is implicit).
sequence_length
=
21
test_layer
=
position_embedding
.
PositionEmbedding
(
max_length
=
sequence_length
,
seq_axis
=
2
)
width
=
30
input_tensor
=
tf
.
keras
.
Input
(
shape
=
(
width
,
sequence_length
,
width
))
output_tensor
=
test_layer
(
input_tensor
)
# When using static positional embedding shapes, the output is expected
# to be the same as the input shape in all dimensions save batch.
expected_output_shape
=
[
None
,
width
,
sequence_length
,
width
]
self
.
assertEqual
(
expected_output_shape
,
output_tensor
.
shape
.
as_list
())
# The default output dtype for this layer should be tf.float32.
self
.
assertEqual
(
tf
.
float32
,
output_tensor
.
dtype
)
def
test_float16_dtype
(
self
):
# Create a 3-dimensional input (the first dimension is implicit).
sequence_length
=
21
test_layer
=
position_embedding
.
PositionEmbedding
(
max_length
=
sequence_length
,
dtype
=
"float16"
)
width
=
30
input_tensor
=
tf
.
keras
.
Input
(
shape
=
(
sequence_length
,
width
))
output_tensor
=
test_layer
(
input_tensor
)
# When using static positional embedding shapes, the output is expected
# to be the same as the input shape in all dimensions save batch.
expected_output_shape
=
[
None
,
sequence_length
,
width
]
self
.
assertEqual
(
expected_output_shape
,
output_tensor
.
shape
.
as_list
())
# The default output dtype for this layer should be tf.float32.
self
.
assertEqual
(
tf
.
float16
,
output_tensor
.
dtype
)
def
test_dynamic_layer_output_shape
(
self
):
max_sequence_length
=
40
test_layer
=
position_embedding
.
PositionEmbedding
(
max_length
=
max_sequence_length
)
# Create a 3-dimensional input (the first dimension is implicit).
width
=
30
input_tensor
=
tf
.
keras
.
Input
(
shape
=
(
None
,
width
))
output_tensor
=
test_layer
(
input_tensor
)
# When using dynamic positional embedding shapes, the output is expected
# to be the same as the input shape in all dimensions - but may be None if
# the input shape is None there.
expected_output_shape
=
[
None
,
None
,
width
]
self
.
assertEqual
(
expected_output_shape
,
output_tensor
.
shape
.
as_list
())
def
test_non_default_axis_dynamic
(
self
):
max_sequence_length
=
60
test_layer
=
position_embedding
.
PositionEmbedding
(
max_length
=
max_sequence_length
,
seq_axis
=
2
)
# Create a 3-dimensional input (the first dimension is implicit).
width
=
30
input_tensor
=
tf
.
keras
.
Input
(
shape
=
(
None
,
None
,
width
))
output_tensor
=
test_layer
(
input_tensor
)
# When using dynamic positional embedding shapes, the output is expected
# to be the same as the input shape in all dimensions - but may be None if
# the input shape is None there.
expected_output_shape
=
[
None
,
None
,
None
,
width
]
self
.
assertEqual
(
expected_output_shape
,
output_tensor
.
shape
.
as_list
())
  def test_dynamic_layer_slicing(self):
    max_sequence_length = 40
    test_layer = position_embedding.PositionEmbedding(
        max_length=max_sequence_length)
    # Create a 3-dimensional input (the first dimension is implicit).
    width = 30
    input_tensor = tf.keras.Input(shape=(None, width))
    output_tensor = test_layer(input_tensor)

    model = tf.keras.Model(input_tensor, output_tensor)

    # Create input data that is shorter than max_sequence_length, which
    # should trigger a down-slice.
    input_length = 17
    # Note: This test explicitly uses a batch size of 1. This is to get
    # around Keras' restriction on Model invocations: inputs are expected to
    # have the same batch cardinality as outputs. In practice, this layer
    # should be used inside a model, where it can be projected when added to
    # another tensor.
    input_data = np.ones((1, input_length, width))
    output_data = model.predict(input_data)

    self.assertAllEqual([1, input_length, width], output_data.shape)
  # This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
  # guarantees forward compatibility of this code for the V2 switchover.
  @keras_parameterized.run_all_keras_modes
...
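For context on the tests above, the following is a minimal usage sketch (not part of this commit) of how a position embedding is typically combined with word embeddings; the vocabulary size, hidden width, and sequence length are illustrative assumptions, and the layer names come from `official.nlp.modeling.layers`:

import tensorflow as tf
from official.nlp.modeling import layers

# Illustrative sizes, not taken from the commit.
vocab_size, hidden_size, max_length = 100, 32, 16

word_ids = tf.keras.Input(shape=(max_length,), dtype=tf.int32)
word_embeddings = layers.OnDeviceEmbedding(
    vocab_size=vocab_size, embedding_width=hidden_size)(word_ids)
# PositionEmbedding learns a (max_length, hidden_size) table and slices it
# down to the sequence length of its input, so the sum below broadcasts.
position_embeddings = layers.PositionEmbedding(max_length=max_length)(
    word_embeddings)
embeddings = word_embeddings + position_embeddings
model = tf.keras.Model(word_ids, embeddings)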
official/nlp/modeling/layers/self_attention_mask.py
View file @
002b4240
...
@@ -16,24 +16,43 @@
 import tensorflow as tf
 
-from official.nlp.keras_nlp import layers
 
 @tf.keras.utils.register_keras_serializable(package='Text')
-class SelfAttentionMask(layers.SelfAttentionMask):
-  """Creates 3D attention mask from a 2D tensor mask.
-
-  **Warning: Please use the `keras_nlp.layers.SelfAttentionMask`.**
+class SelfAttentionMask(tf.keras.layers.Layer):
+  """Create 3D attention mask from a 2D tensor mask.
 
   inputs[0]: from_tensor: 2D or 3D Tensor of shape
-    `(batch_size, from_seq_length, ...)`.
-  inputs[1]: to_mask: int32 Tensor of shape `(batch_size, to_seq_length)`.
+    [batch_size, from_seq_length, ...].
+  inputs[1]: to_mask: int32 Tensor of shape [batch_size, to_seq_length].
 
   Returns:
-    Float Tensor of shape `(batch_size, from_seq_length, to_seq_length)`.
+    float Tensor of shape [batch_size, from_seq_length, to_seq_length].
   """
 
-  def call(self, inputs):
-    if isinstance(inputs, list):
-      return super().call(inputs[0], inputs[1])
-    else:
-      return super().call(inputs)
+  def call(self, inputs, to_mask=None):
+    if isinstance(inputs, list) and to_mask is None:
+      to_mask = inputs[1]
+      inputs = inputs[0]
+    from_shape = tf.shape(inputs)
+    batch_size = from_shape[0]
+    from_seq_length = from_shape[1]
+
+    to_shape = tf.shape(to_mask)
+    to_seq_length = to_shape[1]
+
+    to_mask = tf.cast(
+        tf.reshape(to_mask, [batch_size, 1, to_seq_length]),
+        dtype=inputs.dtype)
+
+    # We don't assume that `from_tensor` is a mask (although it could be). We
+    # don't actually care if we attend *from* padding tokens (only *to*
+    # padding) tokens so we create a tensor of all ones.
+    #
+    # `broadcast_ones` = [batch_size, from_seq_length, 1]
+    broadcast_ones = tf.ones(
+        shape=[batch_size, from_seq_length, 1], dtype=inputs.dtype)
+
+    # Here we broadcast along two dimensions to create the mask.
+    mask = broadcast_ones * to_mask
+
+    return mask
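As a quick illustration of the layer above (a sketch with made-up tensors, not part of the diff): given a batch of embeddings and a 2D padding mask, `SelfAttentionMask` broadcasts the mask into the 3D attention mask consumed by the transformer blocks.

import tensorflow as tf
from official.nlp.modeling import layers

batch_size, seq_length, hidden_size = 2, 4, 8
embeddings = tf.random.uniform([batch_size, seq_length, hidden_size])
# 1 marks a real token, 0 marks padding.
padding_mask = tf.constant([[1, 1, 1, 0],
                            [1, 1, 0, 0]], dtype=tf.int32)

attention_mask = layers.SelfAttentionMask()(embeddings, padding_mask)
# attention_mask has shape [batch_size, seq_length, seq_length]; entry
# [b, from, to] is 1 exactly when the `to` position of example b is a real
# token, so padding is never attended *to*.
print(attention_mask.shape)  # (2, 4, 4)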
official/nlp/modeling/layers/transformer.py
View file @
002b4240
...
@@ -18,14 +18,14 @@
 import gin
 import tensorflow as tf
 
-from official.nlp import keras_nlp
 from official.nlp.modeling.layers import attention
 from official.nlp.modeling.layers import multi_channel_attention
+from official.nlp.modeling.layers import transformer_encoder_block
 from official.nlp.modeling.layers.util import tf_function_if_eager
 
 
 @tf.keras.utils.register_keras_serializable(package="Text")
-class Transformer(keras_nlp.layers.TransformerEncoderBlock):
+class Transformer(transformer_encoder_block.TransformerEncoderBlock):
   """Transformer layer.
 
   This layer implements the Transformer from "Attention Is All You Need".
...
official/nlp/modeling/layers/transformer_encoder_block.py
0 → 100644
View file @
002b4240
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Keras-based TransformerEncoder block layer."""

import tensorflow as tf


@tf.keras.utils.register_keras_serializable(package="Text")
class TransformerEncoderBlock(tf.keras.layers.Layer):
  """TransformerEncoderBlock layer.

  This layer implements the Transformer Encoder from
  "Attention Is All You Need" (https://arxiv.org/abs/1706.03762),
  which combines a `tf.keras.layers.MultiHeadAttention` layer with a
  two-layer feedforward network.

  References:
    [Attention Is All You Need](https://arxiv.org/abs/1706.03762)
    [BERT: Pre-training of Deep Bidirectional Transformers for Language
     Understanding](https://arxiv.org/abs/1810.04805)
  """

  def __init__(self,
               num_attention_heads,
               inner_dim,
               inner_activation,
               output_range=None,
               kernel_initializer="glorot_uniform",
               bias_initializer="zeros",
               kernel_regularizer=None,
               bias_regularizer=None,
               activity_regularizer=None,
               kernel_constraint=None,
               bias_constraint=None,
               use_bias=True,
               norm_first=False,
               norm_epsilon=1e-12,
               output_dropout=0.0,
               attention_dropout=0.0,
               inner_dropout=0.0,
               attention_initializer=None,
               attention_axes=None,
               **kwargs):
    """Initializes `TransformerEncoderBlock`.

    Args:
      num_attention_heads: Number of attention heads.
      inner_dim: The output dimension of the first Dense layer in a two-layer
        feedforward network.
      inner_activation: The activation for the first Dense layer in a
        two-layer feedforward network.
      output_range: the sequence output range, [0, output_range), for slicing
        the target sequence. `None` means the target sequence is not sliced.
      kernel_initializer: Initializer for dense layer kernels.
      bias_initializer: Initializer for dense layer biases.
      kernel_regularizer: Regularizer for dense layer kernels.
      bias_regularizer: Regularizer for dense layer biases.
      activity_regularizer: Regularizer for dense layer activity.
      kernel_constraint: Constraint for dense layer kernels.
      bias_constraint: Constraint for dense layer biases.
      use_bias: Whether to enable use_bias in the attention layer. If set
        False, use_bias in the attention layer is disabled.
      norm_first: Whether to normalize inputs to the attention and
        intermediate dense layers. If set False, the output of the attention
        and intermediate dense layers is normalized instead.
      norm_epsilon: Epsilon value to initialize normalization layers.
      output_dropout: Dropout probability for the post-attention and output
        dropout.
      attention_dropout: Dropout probability within the attention layer.
      inner_dropout: Dropout probability for the first Dense layer in a
        two-layer feedforward network.
      attention_initializer: Initializer for kernels of attention layers. If
        set `None`, attention layers use kernel_initializer as initializer
        for kernel.
      attention_axes: axes over which the attention is applied. `None` means
        attention over all axes, but batch, heads, and features.
      **kwargs: keyword arguments.
    """
    super().__init__(**kwargs)

    self._num_heads = num_attention_heads
    self._inner_dim = inner_dim
    self._inner_activation = inner_activation
    self._attention_dropout = attention_dropout
    self._attention_dropout_rate = attention_dropout
    self._output_dropout = output_dropout
    self._output_dropout_rate = output_dropout
    self._output_range = output_range
    self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
    self._bias_initializer = tf.keras.initializers.get(bias_initializer)
    self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
    self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer)
    self._activity_regularizer = tf.keras.regularizers.get(
        activity_regularizer)
    self._kernel_constraint = tf.keras.constraints.get(kernel_constraint)
    self._bias_constraint = tf.keras.constraints.get(bias_constraint)
    self._use_bias = use_bias
    self._norm_first = norm_first
    self._norm_epsilon = norm_epsilon
    self._inner_dropout = inner_dropout
    if attention_initializer:
      self._attention_initializer = tf.keras.initializers.get(
          attention_initializer)
    else:
      self._attention_initializer = self._kernel_initializer
    self._attention_axes = attention_axes
  def build(self, input_shape):
    if isinstance(input_shape, tf.TensorShape):
      input_tensor_shape = input_shape
    elif isinstance(input_shape, (list, tuple)):
      input_tensor_shape = tf.TensorShape(input_shape[0])
    else:
      raise ValueError(
          "The type of input shape argument is not supported, got: %s" %
          type(input_shape))

    einsum_equation = "abc,cd->abd"
    if len(input_tensor_shape.as_list()) > 3:
      einsum_equation = "...bc,cd->...bd"
    hidden_size = input_tensor_shape[-1]
    if hidden_size % self._num_heads != 0:
      raise ValueError(
          "The input size (%d) is not a multiple of the number of attention "
          "heads (%d)" % (hidden_size, self._num_heads))
    self._attention_head_size = int(hidden_size // self._num_heads)
    common_kwargs = dict(
        bias_initializer=self._bias_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer,
        activity_regularizer=self._activity_regularizer,
        kernel_constraint=self._kernel_constraint,
        bias_constraint=self._bias_constraint)
    self._attention_layer = tf.keras.layers.MultiHeadAttention(
        num_heads=self._num_heads,
        key_dim=self._attention_head_size,
        dropout=self._attention_dropout,
        use_bias=self._use_bias,
        kernel_initializer=self._attention_initializer,
        attention_axes=self._attention_axes,
        name="self_attention",
        **common_kwargs)
    self._attention_dropout = tf.keras.layers.Dropout(
        rate=self._output_dropout)
    # Use float32 in layernorm for numeric stability.
    # It is probably safe in mixed_float16, but we haven't validated this yet.
    self._attention_layer_norm = (
        tf.keras.layers.LayerNormalization(
            name="self_attention_layer_norm",
            axis=-1,
            epsilon=self._norm_epsilon,
            dtype=tf.float32))
    self._intermediate_dense = tf.keras.layers.experimental.EinsumDense(
        einsum_equation,
        output_shape=(None, self._inner_dim),
        bias_axes="d",
        kernel_initializer=self._kernel_initializer,
        name="intermediate",
        **common_kwargs)
    policy = tf.keras.mixed_precision.global_policy()
    if policy.name == "mixed_bfloat16":
      # bfloat16 causes BERT with the LAMB optimizer to not converge
      # as well, so we use float32.
      # TODO(b/154538392): Investigate this.
      policy = tf.float32
    self._intermediate_activation_layer = tf.keras.layers.Activation(
        self._inner_activation, dtype=policy)
    self._inner_dropout_layer = tf.keras.layers.Dropout(
        rate=self._inner_dropout)
    self._output_dense = tf.keras.layers.experimental.EinsumDense(
        einsum_equation,
        output_shape=(None, hidden_size),
        bias_axes="d",
        name="output",
        kernel_initializer=self._kernel_initializer,
        **common_kwargs)
    self._output_dropout = tf.keras.layers.Dropout(rate=self._output_dropout)
    # Use float32 in layernorm for numeric stability.
    self._output_layer_norm = tf.keras.layers.LayerNormalization(
        name="output_layer_norm",
        axis=-1,
        epsilon=self._norm_epsilon,
        dtype=tf.float32)

    super(TransformerEncoderBlock, self).build(input_shape)
  def get_config(self):
    config = {
        "num_attention_heads": self._num_heads,
        "inner_dim": self._inner_dim,
        "inner_activation": self._inner_activation,
        "output_dropout": self._output_dropout_rate,
        "attention_dropout": self._attention_dropout_rate,
        "output_range": self._output_range,
        "kernel_initializer":
            tf.keras.initializers.serialize(self._kernel_initializer),
        "bias_initializer":
            tf.keras.initializers.serialize(self._bias_initializer),
        "kernel_regularizer":
            tf.keras.regularizers.serialize(self._kernel_regularizer),
        "bias_regularizer":
            tf.keras.regularizers.serialize(self._bias_regularizer),
        "activity_regularizer":
            tf.keras.regularizers.serialize(self._activity_regularizer),
        "kernel_constraint":
            tf.keras.constraints.serialize(self._kernel_constraint),
        "bias_constraint":
            tf.keras.constraints.serialize(self._bias_constraint),
        "use_bias": self._use_bias,
        "norm_first": self._norm_first,
        "norm_epsilon": self._norm_epsilon,
        "inner_dropout": self._inner_dropout,
        "attention_initializer":
            tf.keras.initializers.serialize(self._attention_initializer),
        "attention_axes": self._attention_axes,
    }
    base_config = super(TransformerEncoderBlock, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))
  def call(self, inputs):
    """Transformer self-attention encoder block call.

    Args:
      inputs: a single tensor or a list of tensors.
        `input tensor` as the single sequence of embeddings.
        [`input tensor`, `attention mask`] to have the additional attention
          mask.
        [`query tensor`, `key value tensor`, `attention mask`] to have
          separate input streams for the query, and key/value to the
          multi-head attention.

    Returns:
      An output tensor with the same dimensions as input/query tensor.
    """
    if isinstance(inputs, (list, tuple)):
      if len(inputs) == 2:
        input_tensor, attention_mask = inputs
        key_value = None
      elif len(inputs) == 3:
        input_tensor, key_value, attention_mask = inputs
      else:
        raise ValueError("Unexpected inputs to %s with length at %d" %
                         (self.__class__, len(inputs)))
    else:
      input_tensor, key_value, attention_mask = (inputs, None, None)

    if self._output_range:
      if self._norm_first:
        source_tensor = input_tensor[:, 0:self._output_range, :]
        input_tensor = self._attention_layer_norm(input_tensor)
        if key_value is not None:
          key_value = self._attention_layer_norm(key_value)
      target_tensor = input_tensor[:, 0:self._output_range, :]
      if attention_mask is not None:
        attention_mask = attention_mask[:, 0:self._output_range, :]
    else:
      if self._norm_first:
        source_tensor = input_tensor
        input_tensor = self._attention_layer_norm(input_tensor)
        if key_value is not None:
          key_value = self._attention_layer_norm(key_value)
      target_tensor = input_tensor

    if key_value is None:
      key_value = input_tensor
    attention_output = self._attention_layer(
        query=target_tensor, value=key_value, attention_mask=attention_mask)
    attention_output = self._attention_dropout(attention_output)
    if self._norm_first:
      attention_output = source_tensor + attention_output
    else:
      attention_output = self._attention_layer_norm(target_tensor +
                                                    attention_output)
    if self._norm_first:
      source_attention_output = attention_output
      attention_output = self._output_layer_norm(attention_output)
    inner_output = self._intermediate_dense(attention_output)
    inner_output = self._intermediate_activation_layer(inner_output)
    inner_output = self._inner_dropout_layer(inner_output)
    layer_output = self._output_dense(inner_output)
    layer_output = self._output_dropout(layer_output)

    if self._norm_first:
      return source_attention_output + layer_output

    # During mixed precision training, layer norm output is always fp32 for
    # now. Casts fp32 for the subsequent add.
    layer_output = tf.cast(layer_output, tf.float32)
    return self._output_layer_norm(layer_output + attention_output)
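To make the new block's calling conventions concrete, here is a brief usage sketch (editor-added, with made-up shapes and hyperparameters): a single encoder block applied to a batch of embeddings, first without and then with an attention mask.

import tensorflow as tf
from official.nlp.modeling import layers

block = layers.TransformerEncoderBlock(
    num_attention_heads=4,
    inner_dim=64,
    inner_activation='relu')

batch_size, seq_length, hidden_size = 2, 6, 16  # hidden_size % heads == 0
embeddings = tf.random.uniform([batch_size, seq_length, hidden_size])
attention_mask = tf.ones([batch_size, seq_length, seq_length])

# Single-tensor input: unmasked self-attention over the full sequence.
out = block(embeddings)
# [input, mask] input: masked self-attention; the output shape is unchanged.
out_masked = block([embeddings, attention_mask])
print(out.shape, out_masked.shape)  # (2, 6, 16) (2, 6, 16)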
official/nlp/modeling/layers/transformer_encoder_block_test.py
0 → 100644
View file @
002b4240
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for Keras-based transformer block layer."""

from absl.testing import parameterized
import numpy as np
import tensorflow as tf

from tensorflow.python.keras import keras_parameterized  # pylint: disable=g-direct-tensorflow-import
from official.nlp.modeling.layers.transformer_encoder_block import TransformerEncoderBlock


@keras_parameterized.run_all_keras_modes
@parameterized.named_parameters(('base', TransformerEncoderBlock))
class TransformerEncoderBlockLayerTest(keras_parameterized.TestCase):

  def tearDown(self):
    super(TransformerEncoderBlockLayerTest, self).tearDown()
    tf.keras.mixed_precision.set_global_policy('float32')
  def test_layer_creation(self, transformer_cls):
    test_layer = transformer_cls(
        num_attention_heads=10, inner_dim=2048, inner_activation='relu')
    sequence_length = 21
    width = 80
    # Create a 3-dimensional input (the first dimension is implicit).
    data_tensor = tf.keras.Input(shape=(sequence_length, width))
    output_tensor = test_layer(data_tensor)
    # The default output of a transformer layer should be the same as the
    # input.
    self.assertEqual(data_tensor.shape.as_list(),
                     output_tensor.shape.as_list())

  def test_layer_creation_with_mask(self, transformer_cls):
    test_layer = transformer_cls(
        num_attention_heads=10, inner_dim=2048, inner_activation='relu')
    sequence_length = 21
    width = 80
    # Create a 3-dimensional input (the first dimension is implicit).
    data_tensor = tf.keras.Input(shape=(sequence_length, width))
    # Create a 2-dimensional input (the first dimension is implicit).
    mask_tensor = tf.keras.Input(shape=(sequence_length, sequence_length))
    output_tensor = test_layer([data_tensor, mask_tensor])
    # The default output of a transformer layer should be the same as the
    # input.
    self.assertEqual(data_tensor.shape.as_list(),
                     output_tensor.shape.as_list())

  def test_layer_invocation(self, transformer_cls):
    test_layer = transformer_cls(
        num_attention_heads=10, inner_dim=2048, inner_activation='relu')
    sequence_length = 21
    width = 80
    # Create a 3-dimensional input (the first dimension is implicit).
    data_tensor = tf.keras.Input(shape=(sequence_length, width))
    output_tensor = test_layer(data_tensor)

    # Create a model from the test layer.
    model = tf.keras.Model(data_tensor, output_tensor)

    # Invoke the model on test data. We can't validate the output data itself
    # (the NN is too complex) but this will rule out structural runtime
    # errors.
    batch_size = 6
    input_data = 10 * np.random.random_sample(
        (batch_size, sequence_length, width))
    _ = model.predict(input_data)

  def test_layer_invocation_with_mask(self, transformer_cls):
    test_layer = transformer_cls(
        num_attention_heads=10, inner_dim=2048, inner_activation='relu')
    sequence_length = 21
    width = 80
    # Create a 3-dimensional input (the first dimension is implicit).
    data_tensor = tf.keras.Input(shape=(sequence_length, width))
    # Create a 2-dimensional input (the first dimension is implicit).
    mask_tensor = tf.keras.Input(shape=(sequence_length, sequence_length))
    output_tensor = test_layer([data_tensor, mask_tensor])

    # Create a model from the test layer.
    model = tf.keras.Model([data_tensor, mask_tensor], output_tensor)

    # Invoke the model on test data. We can't validate the output data itself
    # (the NN is too complex) but this will rule out structural runtime
    # errors.
    batch_size = 6
    input_data = 10 * np.random.random_sample(
        (batch_size, sequence_length, width))
    # The attention mask should be of shape (batch, from_seq_len, to_seq_len),
    # which here is (batch, sequence_length, sequence_length).
    mask_data = np.random.randint(
        2, size=(batch_size, sequence_length, sequence_length))
    _ = model.predict([input_data, mask_data])
  def test_layer_output_range(self, transformer_cls):
    test_layer = transformer_cls(
        num_attention_heads=10, inner_dim=2048, inner_activation='relu')
    sequence_length = 21
    width = 80

    batch_size = 6
    input_data = 10 * np.random.random_sample(
        (batch_size, sequence_length, width))
    mask_data = np.random.randint(
        2, size=(batch_size, sequence_length, sequence_length))
    output_tensor = test_layer([input_data, mask_data])

    # The layer only attends to the first token and outputs the first token
    # embedding.
    new_layer = transformer_cls(
        num_attention_heads=10,
        inner_dim=2048,
        inner_activation='relu',
        output_range=1)
    _ = new_layer([input_data, mask_data])
    new_layer.set_weights(test_layer.get_weights())
    new_output_tensor = new_layer([input_data, mask_data])
    self.assertAllClose(
        new_output_tensor, output_tensor[:, 0:1, :], atol=5e-5, rtol=0.003)

  def test_layer_output_range_without_mask(self, transformer_cls):
    test_layer = transformer_cls(
        num_attention_heads=10,
        inner_dim=2048,
        inner_activation='relu',
        norm_first=True)
    sequence_length = 21
    width = 80

    batch_size = 6
    input_data = 10 * np.random.random_sample(
        (batch_size, sequence_length, width))
    output_tensor = test_layer(input_data)

    # The layer only attends to the first token and outputs the first token
    # embedding.
    new_layer = transformer_cls(
        num_attention_heads=10,
        inner_dim=2048,
        inner_activation='relu',
        output_range=1,
        norm_first=True)
    _ = new_layer(input_data)
    new_layer.set_weights(test_layer.get_weights())
    new_output_tensor = new_layer(input_data)
    self.assertAllClose(
        new_output_tensor, output_tensor[:, 0:1, :], atol=5e-5, rtol=0.003)

  def test_layer_output_range_with_pre_norm(self, transformer_cls):
    test_layer = transformer_cls(
        num_attention_heads=10,
        inner_dim=2048,
        inner_activation='relu',
        norm_first=True)
    sequence_length = 21
    width = 80

    batch_size = 6
    input_data = 10 * np.random.random_sample(
        (batch_size, sequence_length, width))
    mask_data = np.random.randint(
        2, size=(batch_size, sequence_length, sequence_length))
    output_tensor = test_layer([input_data, mask_data])

    # The layer only attends to the first token and outputs the first token
    # embedding.
    new_layer = transformer_cls(
        num_attention_heads=10,
        inner_dim=2048,
        inner_activation='relu',
        output_range=1,
        norm_first=True)
    _ = new_layer([input_data, mask_data])
    new_layer.set_weights(test_layer.get_weights())
    new_output_tensor = new_layer([input_data, mask_data])
    self.assertAllClose(
        new_output_tensor, output_tensor[:, 0:1, :], atol=5e-5, rtol=0.003)
  def test_layer_invocation_with_float16_dtype(self, transformer_cls):
    tf.keras.mixed_precision.set_global_policy('mixed_float16')
    test_layer = transformer_cls(
        num_attention_heads=10, inner_dim=2048, inner_activation='relu')
    sequence_length = 21
    width = 80
    # Create a 3-dimensional input (the first dimension is implicit).
    data_tensor = tf.keras.Input(shape=(sequence_length, width))
    # Create a 2-dimensional input (the first dimension is implicit).
    mask_tensor = tf.keras.Input(shape=(sequence_length, sequence_length))
    output_tensor = test_layer([data_tensor, mask_tensor])

    # Create a model from the test layer.
    model = tf.keras.Model([data_tensor, mask_tensor], output_tensor)

    # Invoke the model on test data. We can't validate the output data itself
    # (the NN is too complex) but this will rule out structural runtime
    # errors.
    batch_size = 6
    input_data = (
        10 * np.random.random_sample((batch_size, sequence_length, width)))
    # The attention mask should be of shape (batch, from_seq_len, to_seq_len),
    # which here is (batch, sequence_length, sequence_length).
    mask_data = np.random.randint(
        2, size=(batch_size, sequence_length, sequence_length))
    _ = model.predict([input_data, mask_data])

  def test_transform_with_initializer(self, transformer_cls):
    test_layer = transformer_cls(
        num_attention_heads=10,
        inner_dim=2048,
        inner_activation='relu',
        kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02))
    sequence_length = 21
    width = 80
    # Create a 3-dimensional input (the first dimension is implicit).
    data_tensor = tf.keras.Input(shape=(sequence_length, width))
    output = test_layer(data_tensor)
    # The default output of a transformer layer should be the same as the
    # input.
    self.assertEqual(data_tensor.shape.as_list(), output.shape.as_list())

  def test_dynamic_layer_sequence(self, transformer_cls):
    test_layer = transformer_cls(
        num_attention_heads=10,
        inner_dim=2048,
        inner_activation='relu',
        kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02))
    # Create a 3-dimensional input (the first dimension is implicit).
    width = 30
    input_tensor = tf.keras.Input(shape=(None, width))
    output_tensor = test_layer(input_tensor)
    model = tf.keras.Model(input_tensor, output_tensor)

    input_length = 17
    input_data = np.ones((1, input_length, width))
    output_data = model.predict(input_data)

    self.assertAllEqual([1, input_length, width], output_data.shape)

  def test_separate_qkv(self, transformer_cls):
    test_layer = transformer_cls(
        num_attention_heads=2,
        inner_dim=128,
        inner_activation='relu',
        kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02))
    # Forward path.
    q_tensor = tf.zeros([2, 4, 16], dtype=tf.float32)
    kv_tensor = tf.zeros([2, 8, 16], dtype=tf.float32)
    dummy_mask = tf.zeros([2, 4, 8], dtype=tf.float32)
    inputs = [q_tensor, kv_tensor, dummy_mask]
    output = test_layer(inputs)
    self.assertEqual(output.shape, q_tensor.shape)
@keras_parameterized.run_all_keras_modes
class TransformerArgumentTest(keras_parameterized.TestCase):

  def test_use_bias_norm_first(self):
    num_attention_heads = 2
    hidden_size = 16
    encoder_block = TransformerEncoderBlock(
        num_attention_heads=num_attention_heads,
        inner_dim=32,
        inner_activation='relu',
        output_dropout=0.1,
        attention_dropout=0.1,
        use_bias=False,
        norm_first=True,
        norm_epsilon=1e-6,
        inner_dropout=0.1,
        attention_initializer=tf.keras.initializers.RandomUniform(
            minval=0., maxval=1.))
    # Forward path.
    dummy_tensor = tf.zeros([2, 4, 16], dtype=tf.float32)
    dummy_mask = tf.zeros([2, 4, 4], dtype=tf.float32)
    inputs = [dummy_tensor, dummy_mask]
    output = encoder_block(inputs)
    self.assertEqual(output.shape, (2, 4, hidden_size))

  def test_get_config(self):
    num_attention_heads = 2
    encoder_block = TransformerEncoderBlock(
        num_attention_heads=num_attention_heads,
        inner_dim=32,
        inner_activation='relu',
        output_dropout=0.1,
        attention_dropout=0.1,
        use_bias=False,
        norm_first=True,
        norm_epsilon=1e-6,
        inner_dropout=0.1,
        attention_initializer=tf.keras.initializers.RandomUniform(
            minval=0., maxval=1.))
    encoder_block_config = encoder_block.get_config()
    new_encoder_block = TransformerEncoderBlock.from_config(
        encoder_block_config)
    self.assertEqual(encoder_block_config, new_encoder_block.get_config())

  @parameterized.parameters({'attention_axes': None}, {'attention_axes': [1]},
                            {'attention_axes': [2]},
                            {'attention_axes': [1, 2]})
  def test_several_attention_axes(self, attention_axes):
    test_layer = TransformerEncoderBlock(
        inner_dim=32,
        inner_activation='relu',
        output_dropout=0.1,
        attention_dropout=0.1,
        use_bias=False,
        norm_first=True,
        norm_epsilon=1e-6,
        inner_dropout=0.1,
        num_attention_heads=10,
        attention_axes=attention_axes)
    num_rows = 21
    num_cols = 13
    width = 80
    # Create a 4-dimensional input (the first dimension is implicit).
    data_tensor = tf.keras.Input(shape=(num_rows, num_cols, width))
    output_tensor = test_layer(data_tensor)
    # The default output of a transformer layer should be the same as the
    # input.
    self.assertEqual(data_tensor.shape.as_list(),
                     output_tensor.shape.as_list())


if __name__ == '__main__':
  tf.test.main()
official/nlp/modeling/models/seq2seq_transformer.py
View file @
002b4240
...
@@ -20,7 +20,6 @@ import math
 import tensorflow as tf
 
 from official.modeling import tf_utils
-from official.nlp import keras_nlp
 from official.nlp.modeling import layers
 from official.nlp.modeling.ops import beam_search
...
@@ -79,7 +78,7 @@ class Seq2SeqTransformer(tf.keras.Model):
     self._beam_size = beam_size
     self._alpha = alpha
     self._eos_id = eos_id
-    self.embedding_lookup = keras_nlp.layers.OnDeviceEmbedding(
+    self.embedding_lookup = layers.OnDeviceEmbedding(
         vocab_size=self._vocab_size,
         embedding_width=self._embedding_width,
         initializer=tf.random_normal_initializer(
...
@@ -393,7 +392,7 @@ class TransformerEncoder(tf.keras.layers.Layer):
     self.encoder_layers = []
     for i in range(self.num_layers):
       self.encoder_layers.append(
-          keras_nlp.layers.TransformerEncoderBlock(
+          layers.TransformerEncoderBlock(
              num_attention_heads=self.num_attention_heads,
              inner_dim=self._intermediate_size,
              inner_activation=self._activation,
...
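The only substantive change in this file is the import path: the embedding and encoder layers now come from `official.nlp.modeling.layers` rather than `keras_nlp`. As a hedged illustration of the embedding layer being referenced (the vocabulary size, width, and initializer values below are arbitrary, not taken from the model config):

import tensorflow as tf
from official.nlp.modeling import layers

embedding_lookup = layers.OnDeviceEmbedding(
    vocab_size=1000,
    embedding_width=64,
    initializer=tf.random_normal_initializer(mean=0., stddev=64**-0.5))

token_ids = tf.constant([[5, 9, 2], [7, 0, 0]])
embedded = embedding_lookup(token_ids)
print(embedded.shape)  # (2, 3, 64)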
official/nlp/modeling/networks/__init__.py
View file @
002b4240
...
@@ -20,6 +20,7 @@ handled object with a standardized configuration.
 """
 from official.nlp.modeling.networks.albert_encoder import AlbertEncoder
 from official.nlp.modeling.networks.bert_encoder import BertEncoder
+from official.nlp.modeling.networks.bert_encoder import BertEncoderV2
 from official.nlp.modeling.networks.classification import Classification
 from official.nlp.modeling.networks.encoder_scaffold import EncoderScaffold
 from official.nlp.modeling.networks.funnel_transformer import FunnelTransformerEncoder
...
official/nlp/modeling/networks/albert_encoder.py
View file @
002b4240
...
@@ -18,7 +18,6 @@ import collections
 import tensorflow as tf
 
 from official.modeling import activations
-from official.nlp import keras_nlp
 from official.nlp.modeling import layers
...
@@ -98,7 +97,7 @@ class AlbertEncoder(tf.keras.Model):
     word_embeddings = embedding_layer(word_ids)
     # Always uses dynamic slicing for simplicity.
-    position_embedding_layer = keras_nlp.layers.PositionEmbedding(
+    position_embedding_layer = layers.PositionEmbedding(
         initializer=initializer,
         max_length=max_sequence_length,
         name='position_embedding')
...
@@ -133,8 +132,8 @@ class AlbertEncoder(tf.keras.Model):
         embeddings)
     data = embeddings
-    attention_mask = keras_nlp.layers.SelfAttentionMask()(data, mask)
-    shared_layer = keras_nlp.layers.TransformerEncoderBlock(
+    attention_mask = layers.SelfAttentionMask()(data, mask)
+    shared_layer = layers.TransformerEncoderBlock(
        num_attention_heads=num_attention_heads,
        inner_dim=intermediate_size,
        inner_activation=activation,
...
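The ALBERT encoder change above swaps the `keras_nlp` layers for their `official.nlp.modeling.layers` counterparts; the single `shared_layer` it builds is then reused for every transformer pass (ALBERT's cross-layer parameter sharing). A minimal sketch of that reuse pattern, with illustrative sizes and a made-up number of layers, assuming eager execution:

import tensorflow as tf
from official.nlp.modeling import layers

num_layers, batch, seq_len, hidden = 3, 2, 8, 16
data = tf.random.uniform([batch, seq_len, hidden])
mask = tf.ones([batch, seq_len], dtype=tf.int32)

attention_mask = layers.SelfAttentionMask()(data, mask)
shared_layer = layers.TransformerEncoderBlock(
    num_attention_heads=4, inner_dim=32, inner_activation='relu')

# One set of weights, applied num_layers times.
for _ in range(num_layers):
  data = shared_layer([data, attention_mask])
print(data.shape)  # (2, 8, 16)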