Unverified Commit f16a7b5b authored by vedanshu's avatar vedanshu Committed by GitHub
Browse files

Merge pull request #1 from tensorflow/master

new pull
parents 8e9296ff 8f58f396
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -11,12 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for Keras-based gated feedforward layer."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
"""Tests for Keras-based gated feedforward layer."""
from absl.testing import parameterized
import numpy as np
......@@ -33,7 +29,7 @@ class GatedFeedforwardTest(keras_parameterized.TestCase):
def tearDown(self):
super(GatedFeedforwardTest, self).tearDown()
tf.keras.mixed_precision.experimental.set_policy("float32")
tf.keras.mixed_precision.set_global_policy("float32")
@parameterized.parameters(
(True, 1, "after_residual", "float32"),
......@@ -46,7 +42,7 @@ class GatedFeedforwardTest(keras_parameterized.TestCase):
(False, 1, "before_residual", "mixed_float16"),
)
def test_layer_creation(self, use_gate, num_blocks, dropout_position, dtype):
tf.keras.mixed_precision.experimental.set_policy(dtype)
tf.keras.mixed_precision.set_global_policy(dtype)
kwargs = dict(
intermediate_size=128,
intermediate_activation="relu",
......@@ -78,7 +74,7 @@ class GatedFeedforwardTest(keras_parameterized.TestCase):
)
def test_layer_invocation(self, use_gate, num_blocks, dropout_position,
dtype):
tf.keras.mixed_precision.experimental.set_policy(dtype)
tf.keras.mixed_precision.set_global_policy(dtype)
kwargs = dict(
intermediate_size=16,
intermediate_activation="relu",
......@@ -123,5 +119,6 @@ class GatedFeedforwardTest(keras_parameterized.TestCase):
# If the serialization was successful, the new config should match the old.
self.assertAllEqual(test_layer.get_config(), new_layer.get_config())
if __name__ == "__main__":
tf.test.main()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Definitions for random feature Gaussian process layer."""
import math
import tensorflow as tf
# Likelihood families accepted by `LaplaceRandomFeatureCovariance` when
# computing the Laplace approximation to the posterior covariance.
_SUPPORTED_LIKELIHOOD = ('binary_logistic', 'poisson', 'gaussian')
class RandomFeatureGaussianProcess(tf.keras.layers.Layer):
  """Gaussian process layer with random feature approximation [1].

  During training, the model updates the maximum a posteriori (MAP) logits
  estimates and posterior precision matrix using minibatch statistics. During
  inference, the model divides the MAP logit estimates by the predictive
  standard deviation, which is equivalent to approximating the posterior mean
  of the predictive probability via the mean-field approximation.

  User can specify different types of random features by setting
  `use_custom_random_features=True`, and change the initializer and activations
  of the custom random features. For example:

    MLP Kernel: initializer='random_normal', activation=tf.nn.relu
    RBF Kernel: initializer='random_normal', activation=tf.math.cos

  A linear kernel can also be specified by setting gp_kernel_type='linear' and
  `use_custom_random_features=True`.

  [1]: Ali Rahimi and Benjamin Recht. Random Features for Large-Scale Kernel
       Machines. In _Neural Information Processing Systems_, 2007.
       https://people.eecs.berkeley.edu/~brecht/papers/07.rah.rec.nips.pdf

  Attributes:
    units: (int) The dimensionality of layer.
    num_inducing: (int) The number of random features for the approximation.
    is_training: (tf.bool) Whether the layer is set in training mode. If so the
      layer updates the Gaussian process' variance estimate using statistics
      computed from the incoming minibatches.
  """

  def __init__(self,
               units,
               num_inducing=1024,
               gp_kernel_type='gaussian',
               gp_kernel_scale=1.,
               gp_output_bias=0.,
               normalize_input=False,
               gp_kernel_scale_trainable=False,
               gp_output_bias_trainable=False,
               gp_cov_momentum=0.999,
               gp_cov_ridge_penalty=1.,
               scale_random_features=True,
               use_custom_random_features=True,
               custom_random_features_initializer=None,
               custom_random_features_activation=None,
               l2_regularization=1e-6,
               gp_cov_likelihood='gaussian',
               return_gp_cov=True,
               return_random_features=False,
               dtype=None,
               name='random_feature_gaussian_process',
               **gp_output_kwargs):
    """Initializes a random-feature Gaussian process layer instance.

    Args:
      units: (int) Number of output units.
      num_inducing: (int) Number of random Fourier features used for
        approximating the Gaussian process.
      gp_kernel_type: (string) The type of kernel function to use for Gaussian
        process. Currently default to 'gaussian' which is the Gaussian RBF
        kernel.
      gp_kernel_scale: (float) The length-scale parameter of the a
        shift-invariant kernel function, i.e., for RBF kernel:
        exp(-|x1 - x2|**2 / gp_kernel_scale).
      gp_output_bias: (float) Scalar initial value for the bias vector.
      normalize_input: (bool) Whether to normalize the input to Gaussian
        process.
      gp_kernel_scale_trainable: (bool) Whether the length scale variable is
        trainable.
      gp_output_bias_trainable: (bool) Whether the bias is trainable.
      gp_cov_momentum: (float) A discount factor used to compute the moving
        average for posterior covariance matrix.
      gp_cov_ridge_penalty: (float) Initial Ridge penalty to posterior
        covariance matrix.
      scale_random_features: (bool) Whether to scale the random feature
        by sqrt(2. / num_inducing).
      use_custom_random_features: (bool) Whether to use custom random
        features implemented using tf.keras.layers.Dense.
      custom_random_features_initializer: (str or callable) Initializer for
        the random features. Default to random normal which approximates a RBF
        kernel function if activation function is cos.
      custom_random_features_activation: (callable) Activation function for the
        random feature layer. Default to cosine which approximates a RBF
        kernel function.
      l2_regularization: (float) The strength of l2 regularization on the output
        weights.
      gp_cov_likelihood: (string) Likelihood to use for computing Laplace
        approximation for covariance matrix. Default to `gaussian`.
      return_gp_cov: (bool) Whether to also return GP covariance matrix.
        If False then no covariance learning is performed.
      return_random_features: (bool) Whether to also return random features.
      dtype: (tf.DType) Input data type.
      name: (string) Layer name.
      **gp_output_kwargs: Additional keyword arguments to dense output layer.
    """
    super(RandomFeatureGaussianProcess, self).__init__(name=name, dtype=dtype)
    self.units = units
    self.num_inducing = num_inducing

    self.normalize_input = normalize_input
    # Input is rescaled by 1/sqrt(lengthscale) so a unit-scale random feature
    # layer realizes the requested kernel lengthscale.
    self.gp_input_scale = 1. / tf.sqrt(gp_kernel_scale)
    # sqrt(2 / num_inducing) scaling from the Monte-Carlo kernel estimate [1].
    self.gp_feature_scale = tf.sqrt(2. / float(num_inducing))

    self.scale_random_features = scale_random_features
    self.return_random_features = return_random_features
    self.return_gp_cov = return_gp_cov

    self.gp_kernel_type = gp_kernel_type
    self.gp_kernel_scale = gp_kernel_scale
    self.gp_output_bias = gp_output_bias
    self.gp_kernel_scale_trainable = gp_kernel_scale_trainable
    self.gp_output_bias_trainable = gp_output_bias_trainable

    self.use_custom_random_features = use_custom_random_features
    self.custom_random_features_initializer = custom_random_features_initializer
    self.custom_random_features_activation = custom_random_features_activation

    self.l2_regularization = l2_regularization
    self.gp_output_kwargs = gp_output_kwargs

    self.gp_cov_momentum = gp_cov_momentum
    self.gp_cov_ridge_penalty = gp_cov_ridge_penalty
    self.gp_cov_likelihood = gp_cov_likelihood

    if self.use_custom_random_features:
      # Default to Gaussian RBF kernel: random normal weights with uniform
      # bias in [0, 2*pi) and cosine activation.
      self.random_features_bias_initializer = tf.random_uniform_initializer(
          minval=0., maxval=2. * math.pi)
      if self.custom_random_features_initializer is None:
        self.custom_random_features_initializer = (
            tf.keras.initializers.RandomNormal(stddev=1.))
      if self.custom_random_features_activation is None:
        self.custom_random_features_activation = tf.math.cos

  def build(self, input_shape):
    """Creates sub-layers; each is built explicitly so `input_shape` can be
    threaded through the pipeline (normalization -> random feature -> output).
    """
    # Defines model layers.
    if self.normalize_input:
      self._input_norm_layer = tf.keras.layers.LayerNormalization(
          name='gp_input_normalization')
      self._input_norm_layer.build(input_shape)
      input_shape = self._input_norm_layer.compute_output_shape(input_shape)

    self._random_feature = self._make_random_feature_layer(
        name='gp_random_feature')
    self._random_feature.build(input_shape)
    input_shape = self._random_feature.compute_output_shape(input_shape)

    if self.return_gp_cov:
      # Only created when covariance learning is requested; methods that use
      # `self._gp_cov_layer` (e.g. `reset_covariance_matrix`) therefore
      # require `return_gp_cov=True`.
      self._gp_cov_layer = LaplaceRandomFeatureCovariance(
          momentum=self.gp_cov_momentum,
          ridge_penalty=self.gp_cov_ridge_penalty,
          likelihood=self.gp_cov_likelihood,
          dtype=self.dtype,
          name='gp_covariance')
      self._gp_cov_layer.build(input_shape)

    self._gp_output_layer = tf.keras.layers.Dense(
        units=self.units,
        use_bias=False,
        kernel_regularizer=tf.keras.regularizers.l2(self.l2_regularization),
        dtype=self.dtype,
        name='gp_output_weights',
        **self.gp_output_kwargs)
    self._gp_output_layer.build(input_shape)

    # Bias is kept as a separate (optionally trainable) variable because the
    # output Dense layer is created with use_bias=False.
    self._gp_output_bias = tf.Variable(
        initial_value=[self.gp_output_bias] * self.units,
        dtype=self.dtype,
        trainable=self.gp_output_bias_trainable,
        name='gp_output_bias')

    self.built = True

  def _make_random_feature_layer(self, name):
    """Defines random feature layer depending on kernel type."""
    if not self.use_custom_random_features:
      # Use default RandomFourierFeatures layer from tf.keras.
      return tf.keras.layers.experimental.RandomFourierFeatures(
          output_dim=self.num_inducing,
          kernel_initializer=self.gp_kernel_type,
          scale=self.gp_kernel_scale,
          trainable=self.gp_kernel_scale_trainable,
          dtype=self.dtype,
          name=name)

    if self.gp_kernel_type.lower() == 'linear':
      # Identity mapping: the "random feature" is the input itself.
      custom_random_feature_layer = tf.keras.layers.Lambda(
          lambda x: x, name=name)
    else:
      # Use user-supplied configurations. The layer is frozen: random feature
      # weights are sampled once and never trained.
      custom_random_feature_layer = tf.keras.layers.Dense(
          units=self.num_inducing,
          use_bias=True,
          activation=self.custom_random_features_activation,
          kernel_initializer=self.custom_random_features_initializer,
          bias_initializer=self.random_features_bias_initializer,
          trainable=False,
          name=name)

    return custom_random_feature_layer

  def reset_covariance_matrix(self):
    """Resets covariance matrix of the GP layer.

    This function is useful for reseting the model's covariance matrix at the
    begining of a new epoch.

    Only valid when the layer was constructed with `return_gp_cov=True`,
    since `self._gp_cov_layer` is created in `build()` under that flag.
    """
    self._gp_cov_layer.reset_precision_matrix()

  def call(self, inputs, global_step=None, training=None):
    """Computes GP logits and, optionally, covariance and random features.

    Args:
      inputs: (tf.Tensor) Input features, shape (batch_size, input_dim).
      global_step: Unused; accepted for call-signature compatibility.
      training: (bool) Whether the layer is in training mode; forwarded to
        the covariance layer to decide between update and inference.

    Returns:
      A list `[gp_output]`, extended with the GP covariance matrix if
      `return_gp_cov=True` and with the random features if
      `return_random_features=True`.
    """
    # Computes random features.
    gp_inputs = inputs
    if self.normalize_input:
      gp_inputs = self._input_norm_layer(gp_inputs)
    elif self.use_custom_random_features:
      # Supports lengthscale for custom random feature layer by directly
      # rescaling the input. Note: skipped when normalize_input=True.
      gp_input_scale = tf.cast(self.gp_input_scale, inputs.dtype)
      gp_inputs = gp_inputs * gp_input_scale

    gp_feature = self._random_feature(gp_inputs)

    if self.scale_random_features:
      # Scale random feature by sqrt(2. / num_inducing) following [1].
      # When using GP layer as the output layer of a neural network,
      # it is recommended to turn this scaling off to prevent it from changing
      # the learning rate to the hidden layers.
      gp_feature_scale = tf.cast(self.gp_feature_scale, inputs.dtype)
      gp_feature = gp_feature * gp_feature_scale

    # Computes posterior center (i.e., MAP estimate) and variance.
    gp_output = self._gp_output_layer(gp_feature) + self._gp_output_bias

    if self.return_gp_cov:
      gp_covmat = self._gp_cov_layer(gp_feature, gp_output, training)

    # Assembles model output.
    model_output = [gp_output,]
    if self.return_gp_cov:
      model_output.append(gp_covmat)
    if self.return_random_features:
      model_output.append(gp_feature)

    return model_output
class LaplaceRandomFeatureCovariance(tf.keras.layers.Layer):
  """Computes the Gaussian Process covariance using Laplace method.

  At training time, this layer updates the Gaussian process posterior using
  model features in minibatches.

  Attributes:
    momentum: (float) A discount factor used to compute the moving average for
      posterior precision matrix. Analogous to the momentum factor in batch
      normalization. If -1 then update covariance matrix using a naive sum
      without momentum, which is desirable if the goal is to compute the exact
      covariance matrix by passing through data once (say in the final epoch).
    ridge_penalty: (float) Initial Ridge penalty to weight covariance matrix.
      This value is used to stablize the eigenvalues of weight covariance
      estimate so that the matrix inverse can be computed for Cov = inv(t(X) * X
      + s * I). The ridge factor s cannot be too large since otherwise it will
      dominate the t(X) * X term and make covariance estimate not meaningful.
    likelihood: (str) The likelihood to use for computing Laplace approximation
      for the covariance matrix. Can be one of ('binary_logistic', 'poisson',
      'gaussian').
  """

  def __init__(self,
               momentum=0.999,
               ridge_penalty=1.,
               likelihood='gaussian',
               dtype=None,
               name='laplace_covariance'):
    # Validate eagerly so misconfiguration fails at construction time.
    if likelihood not in _SUPPORTED_LIKELIHOOD:
      raise ValueError(
          f'"likelihood" must be one of {_SUPPORTED_LIKELIHOOD}, got {likelihood}.'
      )
    self.ridge_penalty = ridge_penalty
    self.momentum = momentum
    self.likelihood = likelihood
    super(LaplaceRandomFeatureCovariance, self).__init__(dtype=dtype, name=name)

  def compute_output_shape(self, input_shape):
    # Output is the square precision/covariance matrix over feature dims.
    gp_feature_dim = input_shape[-1]
    return tf.TensorShape([gp_feature_dim, gp_feature_dim])

  def build(self, input_shape):
    """Creates the (non-trainable) precision-matrix state variable."""
    gp_feature_dim = input_shape[-1]

    # Convert gp_feature_dim to int value for TF1 compatibility.
    if isinstance(gp_feature_dim, tf.compat.v1.Dimension):
      gp_feature_dim = gp_feature_dim.value

    # Posterior precision matrix for the GP's random feature coefficients.
    # Initialized to ridge_penalty * I; `reset_precision_matrix` restores it.
    self.initial_precision_matrix = (
        self.ridge_penalty * tf.eye(gp_feature_dim, dtype=self.dtype))

    self.precision_matrix = (
        self.add_weight(
            name='gp_precision_matrix',
            shape=(gp_feature_dim, gp_feature_dim),
            dtype=self.dtype,
            initializer=tf.keras.initializers.Identity(self.ridge_penalty),
            trainable=False,
            # Under distribution strategies only the first replica's value is
            # used, avoiding double-counting of minibatch statistics.
            aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA))

    self.built = True

  def make_precision_matrix_update_op(self,
                                      gp_feature,
                                      logits,
                                      precision_matrix):
    """Defines update op for the precision matrix of feature weights."""
    if self.likelihood != 'gaussian':
      # Non-Gaussian likelihoods weight features by the GLM variance term,
      # which is computed from the (univariate) logits.
      if logits is None:
        raise ValueError(
            f'"logits" cannot be None when likelihood={self.likelihood}')

      if logits.shape[-1] != 1:
        raise ValueError(
            f'likelihood={self.likelihood} only support univariate logits.'
            f'Got logits dimension: {logits.shape[-1]}')

    batch_size = tf.shape(gp_feature)[0]
    batch_size = tf.cast(batch_size, dtype=gp_feature.dtype)

    # Computes batch-specific normalized precision matrix.
    if self.likelihood == 'binary_logistic':
      prob = tf.sigmoid(logits)
      prob_multiplier = prob * (1. - prob)
    elif self.likelihood == 'poisson':
      prob_multiplier = tf.exp(logits)
    else:
      # Gaussian likelihood: unit weighting.
      prob_multiplier = 1.

    gp_feature_adjusted = tf.sqrt(prob_multiplier) * gp_feature
    # t(X) * X accumulated for this minibatch.
    precision_matrix_minibatch = tf.matmul(
        gp_feature_adjusted, gp_feature_adjusted, transpose_a=True)

    # Updates the population-wise precision matrix.
    if self.momentum > 0:
      # Use moving-average updates to accumulate batch-specific precision
      # matrices.
      precision_matrix_minibatch = precision_matrix_minibatch / batch_size
      precision_matrix_new = (
          self.momentum * precision_matrix +
          (1. - self.momentum) * precision_matrix_minibatch)
    else:
      # Compute exact population-wise covariance without momentum.
      # If use this option, make sure to pass through data only once.
      precision_matrix_new = precision_matrix + precision_matrix_minibatch

    # Returns the update op.
    return precision_matrix.assign(precision_matrix_new)

  def reset_precision_matrix(self):
    """Resets precision matrix to its initial value.

    This function is useful for reseting the model's covariance matrix at the
    begining of a new epoch.
    """
    precision_matrix_reset_op = self.precision_matrix.assign(
        self.initial_precision_matrix)
    self.add_update(precision_matrix_reset_op)

  def compute_predictive_covariance(self, gp_feature):
    """Computes posterior predictive variance.

    Approximates the Gaussian process posterior using random features.
    Given training random feature Phi_tr (num_train, num_hidden) and testing
    random feature Phi_ts (batch_size, num_hidden). The predictive covariance
    matrix is computed as (assuming Gaussian likelihood):

    s * Phi_ts @ inv(t(Phi_tr) * Phi_tr + s * I) @ t(Phi_ts),

    where s is the ridge factor to be used for stablizing the inverse, and I is
    the identity matrix with shape (num_hidden, num_hidden).

    Args:
      gp_feature: (tf.Tensor) The random feature of testing data to be used for
        computing the covariance matrix. Shape (batch_size, gp_hidden_size).

    Returns:
      (tf.Tensor) Predictive covariance matrix, shape (batch_size, batch_size).
    """
    # Computes the covariance matrix of the feature coefficient.
    feature_cov_matrix = tf.linalg.inv(self.precision_matrix)

    # Computes the covariance matrix of the gp prediction.
    cov_feature_product = tf.matmul(
        feature_cov_matrix, gp_feature, transpose_b=True) * self.ridge_penalty
    gp_cov_matrix = tf.matmul(gp_feature, cov_feature_product)
    return gp_cov_matrix

  def _get_training_value(self, training=None):
    # Fall back to the Keras learning phase when `training` is not given.
    if training is None:
      training = tf.keras.backend.learning_phase()

    if isinstance(training, int):
      training = bool(training)

    return training

  def call(self, inputs, logits=None, training=None):
    """Minibatch updates the GP's posterior precision matrix estimate.

    Args:
      inputs: (tf.Tensor) GP random features, shape (batch_size,
        gp_hidden_size).
      logits: (tf.Tensor) Pre-activation output from the model. Needed
        for Laplace approximation under a non-Gaussian likelihood.
      training: (tf.bool) whether or not the layer is in training mode. If in
        training mode, the gp_weight covariance is updated using gp_feature.

    Returns:
      gp_stddev (tf.Tensor): GP posterior predictive variance,
        shape (batch_size, batch_size).
    """
    batch_size = tf.shape(inputs)[0]
    training = self._get_training_value(training)

    if training:
      # Define and register the update op for feature precision matrix.
      precision_matrix_update_op = self.make_precision_matrix_update_op(
          gp_feature=inputs,
          logits=logits,
          precision_matrix=self.precision_matrix)
      self.add_update(precision_matrix_update_op)
      # Return null estimate during training.
      return tf.eye(batch_size, dtype=self.dtype)
    else:
      # Return covariance estimate during inference.
      return self.compute_predictive_covariance(gp_feature=inputs)
def mean_field_logits(logits, covariance_matrix=None, mean_field_factor=1.):
  """Adjust the model logits so its softmax approximates the posterior mean [1].

  [1]: Zhiyun Lu, Eugene Ie, Fei Sha. Uncertainty Estimation with Infinitesimal
       Jackknife. _arXiv preprint arXiv:2006.07584_, 2020.
       https://arxiv.org/abs/2006.07584

  Arguments:
    logits: A float tensor of shape (batch_size, num_classes).
    covariance_matrix: The covariance matrix of shape (batch_size, batch_size).
      If None then it assumes the covariance_matrix is an identity matrix.
    mean_field_factor: The scale factor for mean-field approximation, used to
      adjust the influence of posterior variance in posterior mean
      approximation. If covariance_matrix=None then it is used as the
      temperature parameter for temperature scaling.

  Returns:
    Tensor of adjusted logits, shape (batch_size, num_classes).
  """
  # A None or negative factor disables the mean-field adjustment entirely.
  if mean_field_factor is None or mean_field_factor < 0:
    return logits

  # Per-example predictive variances; identity covariance when none given.
  per_example_variance = (
      1. if covariance_matrix is None
      else tf.linalg.diag_part(covariance_matrix))

  # Mean-field scaling coefficient: sqrt(1 + lambda * variance).
  scaling_coeff = tf.sqrt(1. + per_example_variance * mean_field_factor)
  if len(logits.shape) > 1:
    # Broadcast the per-example coefficient across the class dimension.
    scaling_coeff = tf.expand_dims(scaling_coeff, axis=-1)

  return logits / scaling_coeff
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for Gaussian process functions."""
import os
import shutil
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from official.nlp.modeling.layers import gaussian_process
def exact_gaussian_kernel(x1, x2):
  """Computes the exact Gaussian (RBF) kernel matrix between x1 and x2."""
  # Squared norms, reduced over all non-batch dimensions.
  sq_norm1 = tf.reduce_sum(tf.square(x1), list(range(1, len(x1.shape))))
  sq_norm2 = tf.reduce_sum(tf.square(x2), list(range(1, len(x2.shape))))

  # |a - b|^2 = |a|^2 + |b|^2 - 2 a.b, computed pairwise via broadcasting.
  pairwise_sq_dist = (
      sq_norm1[:, tf.newaxis] + sq_norm2[tf.newaxis, :]
      - 2 * tf.matmul(x1, x2, transpose_b=True))

  return tf.math.exp(-pairwise_sq_dist / 2.)
def _generate_normal_data(num_sample, num_dim, loc):
"""Generates random data sampled from i.i.d. normal distribution."""
return np.random.normal(
size=(num_sample, num_dim), loc=loc, scale=1. / np.sqrt(num_dim))
def _generate_rbf_data(x_data, orthogonal=True):
  """Generates high-dim data that is the eigen components of a RBF kernel."""
  kernel_matrix = exact_gaussian_kernel(x_data, x_data)
  eigvecs, eigvals, _ = np.linalg.svd(kernel_matrix)
  if orthogonal:
    # Orthonormal eigenvector matrix of the kernel.
    return eigvecs
  # Rows scaled by sqrt of eigenvalues: X such that t(X) * X ~= K.
  return np.diag(np.sqrt(eigvals)).dot(eigvecs.T)
def _make_minibatch_iterator(data_numpy, batch_size, num_epoch):
  """Makes a tf.data.Dataset iterator for given batch size and num epoches."""
  # Repeat first, then batch, so epochs are concatenated before batching.
  return iter(
      tf.data.Dataset.from_tensor_slices(data_numpy)
      .repeat(num_epoch)
      .batch(batch_size))
def _compute_posterior_kernel(x_tr, x_ts, kernel_func, ridge_penalty):
  """Computes the posterior covariance matrix of a Gaussian process."""
  num_train = x_tr.shape[0]

  # Ridge-regularized inverse of the train-train kernel.
  k_tr_tr_regularized = (
      kernel_func(x_tr, x_tr) + ridge_penalty * np.eye(num_train))
  k_tr_tr_inv = tf.linalg.inv(k_tr_tr_regularized)

  k_tr_ts = kernel_func(x_tr, x_ts)
  k_ts_ts = kernel_func(x_ts, x_ts)

  # Posterior: K_ss - t(K_ts) @ inv(K_tt + s*I) @ K_ts.
  posterior_reduction = tf.matmul(
      k_tr_ts, tf.matmul(k_tr_tr_inv, k_tr_ts), transpose_a=True)
  return k_ts_ts - posterior_reduction
class GaussianProcessTest(tf.test.TestCase, parameterized.TestCase):
  """End-to-end tests for RandomFeatureGaussianProcess and its covariance."""

  def setUp(self):
    super(GaussianProcessTest, self).setUp()
    self.num_data_dim = 10
    self.num_inducing = 1024
    self.num_train_sample = 1024
    self.num_test_sample = 256
    # Tolerances for precision-matrix and covariance-matrix comparisons;
    # loose because the random-feature approximation is stochastic.
    self.prec_tolerance = {'atol': 1e-3, 'rtol': 5e-2}
    self.cov_tolerance = {'atol': 5e-2, 'rtol': 2.}

    self.rbf_kern_func = exact_gaussian_kernel

    # Train/test data drawn from normals with different means (loc 0 vs 1).
    self.x_tr = _generate_normal_data(
        self.num_train_sample, self.num_data_dim, loc=0.)
    self.x_ts = _generate_normal_data(
        self.num_test_sample, self.num_data_dim, loc=1.)

  def test_layer_build(self):
    """Tests if layer.built=True after building."""
    rfgp_model = gaussian_process.RandomFeatureGaussianProcess(units=1)
    rfgp_model.build(input_shape=self.x_tr.shape)

    self.assertTrue(rfgp_model.built)

  @parameterized.named_parameters(('rbf_data', False),
                                  ('orthogonal_data', True))
  def test_laplace_covariance_minibatch(self, generate_orthogonal_data):
    """Tests if model correctly learns population-level precision matrix."""
    batch_size = 50
    epochs = 1000
    x_data = _generate_rbf_data(self.x_ts, generate_orthogonal_data)
    data_iterator = _make_minibatch_iterator(x_data, batch_size, epochs)

    # Estimates precision matrix using minibatch.
    cov_estimator = gaussian_process.LaplaceRandomFeatureCovariance(
        momentum=0.999, ridge_penalty=0)

    for minibatch_data in data_iterator:
      _ = cov_estimator(minibatch_data, training=True)

    # Evaluation: the moving average (normalized by batch size) should
    # converge to t(X) * X / num_sample, so scale back up before comparing.
    prec_mat_expected = x_data.T.dot(x_data)
    prec_mat_computed = (
        cov_estimator.precision_matrix.numpy() * self.num_test_sample)

    np.testing.assert_allclose(prec_mat_computed, prec_mat_expected,
                               **self.prec_tolerance)

  def test_random_feature_prior_approximation(self):
    """Tests random feature GP's ability in approximating exact GP prior."""
    # More inducing points than default for a tighter kernel approximation.
    num_inducing = 10240
    rfgp_model = gaussian_process.RandomFeatureGaussianProcess(
        units=1,
        num_inducing=num_inducing,
        normalize_input=False,
        gp_kernel_type='gaussian',
        return_random_features=True)

    # Extract random features.
    _, _, gp_feature = rfgp_model(self.x_tr, training=True)
    gp_feature_np = gp_feature.numpy()

    # Phi @ t(Phi) should approximate the exact RBF kernel matrix.
    prior_kernel_computed = gp_feature_np.dot(gp_feature_np.T)
    prior_kernel_expected = self.rbf_kern_func(self.x_tr, self.x_tr)
    np.testing.assert_allclose(prior_kernel_computed, prior_kernel_expected,
                               **self.cov_tolerance)

  def test_random_feature_posterior_approximation(self):
    """Tests random feature GP's ability in approximating exact GP posterior."""
    # Set momentum = 0.5 so posterior precision matrix is 0.5 * (I + K).
    gp_cov_momentum = 0.5
    gp_cov_ridge_penalty = 1.
    num_inducing = 1024

    rfgp_model = gaussian_process.RandomFeatureGaussianProcess(
        units=1,
        num_inducing=num_inducing,
        normalize_input=False,
        gp_kernel_type='gaussian',
        gp_cov_momentum=gp_cov_momentum,
        gp_cov_ridge_penalty=gp_cov_ridge_penalty)

    # Computes posterior covariance on test data: one training pass to
    # accumulate the precision matrix, then an inference pass.
    _, _ = rfgp_model(self.x_tr, training=True)
    _, gp_cov_ts = rfgp_model(self.x_ts, training=False)

    # Scale up covariance estimate since prec matrix is down-scaled by momentum.
    post_kernel_computed = gp_cov_ts * gp_cov_momentum
    post_kernel_expected = _compute_posterior_kernel(self.x_tr, self.x_ts,
                                                     self.rbf_kern_func,
                                                     gp_cov_ridge_penalty)
    np.testing.assert_allclose(post_kernel_computed, post_kernel_expected,
                               **self.cov_tolerance)

  def test_random_feature_linear_kernel(self):
    """Tests if linear kernel indeed leads to an identity mapping."""
    # Specify linear kernel
    gp_kernel_type = 'linear'
    normalize_input = False
    scale_random_features = False
    use_custom_random_features = True

    rfgp_model = gaussian_process.RandomFeatureGaussianProcess(
        units=1,
        normalize_input=normalize_input,
        gp_kernel_type=gp_kernel_type,
        scale_random_features=scale_random_features,
        use_custom_random_features=use_custom_random_features,
        return_random_features=True)

    _, _, gp_feature = rfgp_model(self.x_tr, training=True)

    # Check if linear kernel leads to identity mapping.
    np.testing.assert_allclose(gp_feature, self.x_tr, **self.prec_tolerance)

  def test_no_matrix_update_during_test(self):
    """Tests if the precision matrix is not updated during testing."""
    rfgp_model = gaussian_process.RandomFeatureGaussianProcess(units=1)

    # Training.
    _, gp_covmat_null = rfgp_model(self.x_tr, training=True)
    precision_mat_before_test = rfgp_model._gp_cov_layer.precision_matrix

    # Testing.
    _ = rfgp_model(self.x_ts, training=False)
    precision_mat_after_test = rfgp_model._gp_cov_layer.precision_matrix

    # During training the covariance output is a null (identity) estimate.
    self.assertAllClose(
        gp_covmat_null, tf.eye(self.num_train_sample), atol=1e-4)
    self.assertAllClose(
        precision_mat_before_test, precision_mat_after_test, atol=1e-4)

  def test_state_saving_and_loading(self):
    """Tests if the loaded model returns same results."""
    input_data = np.random.random((1, 2))
    rfgp_model = gaussian_process.RandomFeatureGaussianProcess(units=1)

    inputs = tf.keras.Input((2,), batch_size=1)
    outputs = rfgp_model(inputs)
    model = tf.keras.Model(inputs, outputs)
    gp_output, gp_covmat = model.predict(input_data)

    # Save and then load the model.
    temp_dir = self.get_temp_dir()
    self.addCleanup(shutil.rmtree, temp_dir)
    saved_model_dir = os.path.join(temp_dir, 'rfgp_model')
    model.save(saved_model_dir)
    new_model = tf.keras.models.load_model(saved_model_dir)

    gp_output_new, gp_covmat_new = new_model.predict(input_data)
    self.assertAllClose(gp_output, gp_output_new, atol=1e-4)
    self.assertAllClose(gp_covmat, gp_covmat_new, atol=1e-4)
class MeanFieldLogitsTest(tf.test.TestCase):
  """Unit tests for gaussian_process.mean_field_logits."""

  def testMeanFieldLogitsLikelihood(self):
    """Checks the mean-field scaling sqrt(1 + variance * factor)."""
    num_examples = 10
    num_classes = 12
    gp_variance = 1.5
    lambda_factor = 2.

    rng = np.random.RandomState(0)
    tf.random.set_seed(1)
    raw_logits = rng.randn(num_examples, num_classes)
    diagonal_covmat = tf.linalg.diag([gp_variance] * num_examples)

    # Scaling coefficient is sqrt(1 + 1.5 * 2) = 2, so logits are halved.
    scaled_logits = gaussian_process.mean_field_logits(
        raw_logits, diagonal_covmat, mean_field_factor=lambda_factor)
    self.assertAllClose(scaled_logits, raw_logits / 2., atol=1e-4)

  def testMeanFieldLogitsTemperatureScaling(self):
    """Checks mean_field_logits acts as temperature scaling w/o covariance."""
    num_examples = 10
    num_classes = 12

    rng = np.random.RandomState(0)
    tf.random.set_seed(1)
    raw_logits = rng.randn(num_examples, num_classes)

    # A negative mean_field_factor should leave the logits untouched.
    logits_unchanged = gaussian_process.mean_field_logits(
        raw_logits, covariance_matrix=None, mean_field_factor=-1)

    # With covariance_matrix=None the temperature is
    # sqrt(1 + mean_field_factor) = sqrt(4) = 2.
    logits_halved = gaussian_process.mean_field_logits(
        raw_logits, covariance_matrix=None, mean_field_factor=3.)

    self.assertAllClose(logits_unchanged, raw_logits, atol=1e-4)
    self.assertAllClose(logits_halved, raw_logits / 2., atol=1e-4)
# Standard TensorFlow test-runner entry point for this test module.
if __name__ == '__main__':
  tf.test.main()
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -11,114 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Masked language model network."""
# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import tensorflow as tf
from official.modeling import tf_utils
@tf.keras.utils.register_keras_serializable(package='Text')
class MaskedLM(tf.keras.layers.Layer):
  """Masked language model network head for BERT modeling.

  This network implements a masked language model based on the provided
  network. It assumes that the network being passed has a
  "get_embedding_table()" method.

  Arguments:
    embedding_table: The embedding table of the targets.
    activation: The activation, if any, for the dense layer.
    initializer: The initializer for the dense layer. Defaults to a Glorot
      uniform initializer.
    output: The output style for this network. Can be either 'logits' or
      'predictions'.
  """

  def __init__(self,
               embedding_table,
               activation=None,
               initializer='glorot_uniform',
               output='logits',
               name='cls/predictions',
               **kwargs):
    super(MaskedLM, self).__init__(name=name, **kwargs)
    self.embedding_table = embedding_table
    self.activation = activation
    self.initializer = tf.keras.initializers.get(initializer)

    if output not in ('predictions', 'logits'):
      raise ValueError(
          ('Unknown `output` value "%s". `output` can be either "logits" or '
           '"predictions"') % output)
    self._output_type = output

  def build(self, input_shape):
    # The dense "transform" projects back to the embedding width so the
    # output can be matmul'ed against the transposed embedding table.
    self._vocab_size, hidden_size = self.embedding_table.shape
    self.dense = tf.keras.layers.Dense(
        hidden_size,
        activation=self.activation,
        kernel_initializer=self.initializer,
        name='transform/dense')
    self.layer_norm = tf.keras.layers.LayerNormalization(
        axis=-1, epsilon=1e-12, name='transform/LayerNorm')
    self.bias = self.add_weight(
        'output_bias/bias',
        shape=(self._vocab_size,),
        initializer='zeros',
        trainable=True)

    super(MaskedLM, self).build(input_shape)

  def call(self, sequence_data, masked_positions):
    masked_lm_input = self._gather_indexes(sequence_data, masked_positions)
    lm_data = self.dense(masked_lm_input)
    lm_data = self.layer_norm(lm_data)
    # Tie the output projection to the embedding table (transposed).
    lm_data = tf.matmul(lm_data, self.embedding_table, transpose_b=True)
    logits = tf.nn.bias_add(lm_data, self.bias)
    masked_positions_shape = tf_utils.get_shape_list(
        masked_positions, name='masked_positions_tensor')
    logits = tf.reshape(logits,
                        [-1, masked_positions_shape[1], self._vocab_size])
    if self._output_type == 'logits':
      return logits
    return tf.nn.log_softmax(logits)

  def get_config(self):
    raise NotImplementedError('MaskedLM cannot be directly serialized because '
                              'it has variable sharing logic.')

  def _gather_indexes(self, sequence_tensor, positions):
    """Gathers the vectors at the specific positions.

    Args:
      sequence_tensor: Sequence output of `BertModel` layer of shape
        (`batch_size`, `seq_length`, num_hidden) where num_hidden is number of
        hidden units of `BertModel` layer.
      positions: Positions ids of tokens in sequence to mask for pretraining
        of with dimension (batch_size, num_predictions) where
        `num_predictions` is maximum number of tokens to mask out and predict
        per each sequence.

    Returns:
      Masked out sequence tensor of shape (batch_size * num_predictions,
      num_hidden).
    """
    sequence_shape = tf_utils.get_shape_list(
        sequence_tensor, name='sequence_output_tensor')
    batch_size, seq_length, width = sequence_shape

    # Flatten (batch, seq) into one axis and offset each row's positions by
    # its row start so a single tf.gather picks out the masked vectors.
    flat_offsets = tf.reshape(
        tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1])
    flat_positions = tf.reshape(positions + flat_offsets, [-1])
    flat_sequence_tensor = tf.reshape(sequence_tensor,
                                      [batch_size * seq_length, width])
    output_tensor = tf.gather(flat_sequence_tensor, flat_positions)

    return output_tensor
MaskedLM = keras_nlp.layers.MaskedLM
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -11,12 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for masked language model network."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
"""Tests for masked language model network."""
import numpy as np
import tensorflow as tf
......@@ -24,7 +20,7 @@ import tensorflow as tf
from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import
from official.nlp.modeling.layers import masked_lm
from official.nlp.modeling.networks import transformer_encoder
from official.nlp.modeling.networks import bert_encoder
# This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
......@@ -34,25 +30,22 @@ class MaskedLMTest(keras_parameterized.TestCase):
def create_layer(self,
vocab_size,
sequence_length,
hidden_size,
output='predictions',
xformer_stack=None):
# First, create a transformer stack that we can use to get the LM's
# vocabulary weight.
if xformer_stack is None:
xformer_stack = transformer_encoder.TransformerEncoder(
xformer_stack = bert_encoder.BertEncoder(
vocab_size=vocab_size,
num_layers=1,
sequence_length=sequence_length,
hidden_size=hidden_size,
num_attention_heads=4,
)
# Create a maskedLM from the transformer stack.
test_layer = masked_lm.MaskedLM(
embedding_table=xformer_stack.get_embedding_table(),
output=output)
embedding_table=xformer_stack.get_embedding_table(), output=output)
return test_layer
def test_layer_creation(self):
......@@ -61,9 +54,7 @@ class MaskedLMTest(keras_parameterized.TestCase):
hidden_size = 64
num_predictions = 21
test_layer = self.create_layer(
vocab_size=vocab_size,
sequence_length=sequence_length,
hidden_size=hidden_size)
vocab_size=vocab_size, hidden_size=hidden_size)
# Make sure that the output tensor of the masked LM is the right shape.
lm_input_tensor = tf.keras.Input(shape=(sequence_length, hidden_size))
......@@ -78,22 +69,19 @@ class MaskedLMTest(keras_parameterized.TestCase):
sequence_length = 32
hidden_size = 64
num_predictions = 21
xformer_stack = transformer_encoder.TransformerEncoder(
xformer_stack = bert_encoder.BertEncoder(
vocab_size=vocab_size,
num_layers=1,
sequence_length=sequence_length,
hidden_size=hidden_size,
num_attention_heads=4,
)
test_layer = self.create_layer(
vocab_size=vocab_size,
sequence_length=sequence_length,
hidden_size=hidden_size,
xformer_stack=xformer_stack,
output='predictions')
logit_layer = self.create_layer(
vocab_size=vocab_size,
sequence_length=sequence_length,
hidden_size=hidden_size,
xformer_stack=xformer_stack,
output='logits')
......@@ -133,9 +121,7 @@ class MaskedLMTest(keras_parameterized.TestCase):
hidden_size = 64
num_predictions = 21
test_layer = self.create_layer(
vocab_size=vocab_size,
sequence_length=sequence_length,
hidden_size=hidden_size)
vocab_size=vocab_size, hidden_size=hidden_size)
# Create a model from the masked LM layer.
lm_input_tensor = tf.keras.Input(shape=(sequence_length, hidden_size))
......@@ -154,8 +140,7 @@ class MaskedLMTest(keras_parameterized.TestCase):
def test_unknown_output_type_fails(self):
with self.assertRaisesRegex(ValueError, 'Unknown `output` value "bad".*'):
_ = self.create_layer(
vocab_size=8, sequence_length=8, hidden_size=8, output='bad')
_ = self.create_layer(vocab_size=8, hidden_size=8, output='bad')
if __name__ == '__main__':
......
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -11,22 +11,35 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Keras-based softmax layer with optional masking."""
# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import tensorflow as tf
def _large_compatible_negative(tensor_type):
  """Returns a large negative constant that is representable in `tensor_type`.

  The standard additive-mask value used in this module (-1e9) cannot be
  represented with `tf.float16`, so for that dtype the dtype's own minimum
  value is returned instead.

  Args:
    tensor_type: A dtype to determine the type.

  Returns:
    A large negative number.
  """
  return tf.float16.min if tensor_type == tf.float16 else -1e9
@tf.keras.utils.register_keras_serializable(package='Text')
class MaskedSoftmax(tf.keras.layers.Layer):
"""Performs a softmax with optional masking on a tensor.
Arguments:
Args:
mask_expansion_axes: Any axes that should be padded on the mask tensor.
normalization_axes: On which axes the softmax should perform.
"""
......@@ -50,9 +63,9 @@ class MaskedSoftmax(tf.keras.layers.Layer):
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for
# positions we want to attend and -10000.0 for masked positions.
adder = (1.0 - tf.cast(mask, scores.dtype)) * -10000.0
# positions we want to attend and -1.e9 for masked positions.
adder = (1.0 - tf.cast(mask, scores.dtype)) * _large_compatible_negative(
scores.dtype)
# Since we are adding it to the raw scores before the softmax, this is
# effectively the same as removing these entirely.
scores += adder
......
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -11,12 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for Keras-based masked softmax layer."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
"""Tests for Keras-based masked softmax layer."""
import numpy as np
import tensorflow as tf
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Dot product with margin layer."""
# pylint: disable=g-classes-have-attributes
from typing import Tuple
# Import libraries
import tensorflow as tf
from official.modeling import tf_utils
@tf.keras.utils.register_keras_serializable(package='Text')
class MatMulWithMargin(tf.keras.layers.Layer):
  """Computes a dot-product matrix between two batches of encodings.

  Args:
    logit_scale: The scaling factor of dot products when doing training.
    logit_margin: The margin value between the positive and negative examples
      when doing training.
  """

  def __init__(self, logit_scale=1.0, logit_margin=0.0, **kwargs):
    super(MatMulWithMargin, self).__init__(**kwargs)
    self.logit_scale = logit_scale
    self.logit_margin = logit_margin

  def call(self, left_encoded: tf.Tensor,
           right_encoded: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
    batch_size = tf_utils.get_shape_list(
        left_encoded, name='sequence_output_tensor')[0]

    # Left -> Right dot products; subtract the margin from the diagonal
    # (the aligned pairs) before scaling.
    dot_products = tf.matmul(left_encoded, right_encoded, transpose_b=True)
    diagonal_margin = self.logit_margin * tf.eye(batch_size)
    self.left_logits = self.logit_scale * (dot_products - diagonal_margin)

    # Right -> Left logits are simply the transpose of the above.
    self.right_logits = tf.transpose(self.left_logits)

    return (self.left_logits, self.right_logits)

  def get_config(self):
    config = {
        'logit_scale': self.logit_scale,
        'logit_margin': self.logit_margin}
    config.update(super(MatMulWithMargin, self).get_config())
    return config

  @classmethod
  def from_config(cls, config, custom_objects=None):
    return cls(**config)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for mat_mul_with_margin layer."""
import tensorflow as tf
from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import
from official.nlp.modeling.layers import mat_mul_with_margin
class MatMulWithMarginTest(keras_parameterized.TestCase):

  def test_layer_invocation(self):
    """Validate that the Keras object can be created and invoked."""
    input_width = 512
    test_layer = mat_mul_with_margin.MatMulWithMargin()

    # Build symbolic inputs (batch dimension implicit) and run the layer.
    left_encoded = tf.keras.Input(shape=(input_width,), dtype=tf.float32)
    right_encoded = tf.keras.Input(shape=(input_width,), dtype=tf.float32)
    left_logits, right_logits = test_layer(left_encoded, right_encoded)

    # Both logits matrices are square over the (unknown) batch dimension.
    expected_output_shape = [None, None]
    for logits in (left_logits, right_logits):
      self.assertEqual(expected_output_shape, logits.shape.as_list())

  def test_serialize_deserialize(self):
    # Round-trip the layer config through from_config and verify nothing
    # is lost in serialization.
    layer = mat_mul_with_margin.MatMulWithMargin()
    new_layer = mat_mul_with_margin.MatMulWithMargin.from_config(
        layer.get_config())
    self.assertAllEqual(layer.get_config(), new_layer.get_config())


if __name__ == '__main__':
  tf.test.main()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""MobileBERT embedding and transformer layers."""
import tensorflow as tf
from official.nlp import keras_nlp
@tf.keras.utils.register_keras_serializable(package='Text')
class NoNorm(tf.keras.layers.Layer):
  """Apply element-wise linear transformation to the last dimension.

  A cheap replacement for layer normalization that only learns a per-channel
  scale (`gamma`) and bias (`beta`).
  """

  def __init__(self, name=None, **kwargs):
    # Forward **kwargs (e.g. `dtype`, `trainable`) to the base Layer so this
    # behaves like a standard Keras layer; previously they were dropped.
    super(NoNorm, self).__init__(name=name, **kwargs)

  def build(self, shape):
    # One scale and one bias value per channel of the last dimension.
    kernel_size = shape[-1]
    self.bias = self.add_weight('beta',
                                shape=[kernel_size],
                                initializer='zeros')
    self.scale = self.add_weight('gamma',
                                 shape=[kernel_size],
                                 initializer='ones')

  def call(self, feature):
    output = feature * self.scale + self.bias
    return output
def _get_norm_layer(normalization_type='no_norm', name=None):
"""Get normlization layer.
Args:
normalization_type: String. The type of normalization_type, only
`no_norm` and `layer_norm` are supported.
name: Name for the norm layer.
Returns:
layer norm class.
"""
if normalization_type == 'no_norm':
layer = NoNorm(name=name)
elif normalization_type == 'layer_norm':
layer = tf.keras.layers.LayerNormalization(
name=name,
axis=-1,
epsilon=1e-12,
dtype=tf.float32)
else:
raise NotImplementedError('Only "no_norm" and "layer_norm" and supported.')
return layer
@tf.keras.utils.register_keras_serializable(package='Text')
class MobileBertEmbedding(tf.keras.layers.Layer):
  """Performs an embedding lookup for MobileBERT.

  This layer includes word embedding, token type embedding, position embedding.
  The word embeddings are widened to `3 * word_embed_size` via a trigram
  concatenation (see `call`) and then linearly projected to
  `output_embed_size`.
  """

  def __init__(self,
               word_vocab_size,
               word_embed_size,
               type_vocab_size,
               output_embed_size,
               max_sequence_length=512,
               normalization_type='no_norm',
               initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
               dropout_rate=0.1,
               **kwargs):
    """Class initialization.

    Args:
      word_vocab_size: Number of words in the vocabulary.
      word_embed_size: Word embedding size.
      type_vocab_size: Number of word types.
      output_embed_size: Embedding size for the final embedding output.
      max_sequence_length: Maximum length of input sequence.
      normalization_type: String. The type of normalization_type, only
        `no_norm` and `layer_norm` are supported.
      initializer: The initializer to use for the embedding weights and
        linear projection weights.
      dropout_rate: Dropout rate.
      **kwargs: keyword arguments.
    """
    super(MobileBertEmbedding, self).__init__(**kwargs)
    self.word_vocab_size = word_vocab_size
    self.word_embed_size = word_embed_size
    self.type_vocab_size = type_vocab_size
    self.output_embed_size = output_embed_size
    self.max_sequence_length = max_sequence_length
    self.normalization_type = normalization_type
    self.initializer = tf.keras.initializers.get(initializer)
    self.dropout_rate = dropout_rate

    # NOTE: sub-layer names below ('word_embedding', 'type_embedding', ...)
    # determine checkpoint variable scopes; do not rename them.
    self.word_embedding = keras_nlp.layers.OnDeviceEmbedding(
        self.word_vocab_size,
        self.word_embed_size,
        initializer=initializer,
        name='word_embedding')
    # Token-type embedding is already at the output width, since it is added
    # after the word-embedding projection.
    self.type_embedding = keras_nlp.layers.OnDeviceEmbedding(
        self.type_vocab_size,
        self.output_embed_size,
        initializer=initializer,
        name='type_embedding')
    self.pos_embedding = keras_nlp.layers.PositionEmbedding(
        max_length=max_sequence_length,
        initializer=initializer,
        name='position_embedding')
    # Projects the trigram-concatenated word embedding
    # (width 3 * word_embed_size) down/up to output_embed_size.
    self.word_embedding_proj = tf.keras.layers.experimental.EinsumDense(
        'abc,cd->abd',
        output_shape=[None, self.output_embed_size],
        kernel_initializer=initializer,
        bias_axes='d',
        name='embedding_projection')
    self.layer_norm = _get_norm_layer(normalization_type, 'embedding_norm')
    self.dropout_layer = tf.keras.layers.Dropout(
        self.dropout_rate,
        name='embedding_dropout')

  def get_config(self):
    """Returns the config of the layer for serialization."""
    config = {
        'word_vocab_size': self.word_vocab_size,
        'word_embed_size': self.word_embed_size,
        'type_vocab_size': self.type_vocab_size,
        'output_embed_size': self.output_embed_size,
        'max_sequence_length': self.max_sequence_length,
        'normalization_type': self.normalization_type,
        'initializer': tf.keras.initializers.serialize(self.initializer),
        'dropout_rate': self.dropout_rate
    }
    base_config = super(MobileBertEmbedding, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def call(self, input_ids, token_type_ids=None):
    """Computes the embedding output.

    Args:
      input_ids: Int tensor of token ids. Assumed shape
        (batch, seq_length) -- the padding below operates on axis 1.
      token_type_ids: Optional int tensor of token-type ids with the same
        shape as `input_ids`.

    Returns:
      Float tensor of shape (batch, seq_length, output_embed_size).
    """
    word_embedding_out = self.word_embedding(input_ids)
    # Trigram widening: concatenate each position's next-token, own, and
    # previous-token embeddings along the channel axis (shifted views are
    # zero-padded at the sequence boundaries), tripling the channel width.
    word_embedding_out = tf.concat(
        [tf.pad(word_embedding_out[:, 1:], ((0, 0), (0, 1), (0, 0))),
         word_embedding_out,
         tf.pad(word_embedding_out[:, :-1], ((0, 0), (1, 0), (0, 0)))],
        axis=2)
    word_embedding_out = self.word_embedding_proj(word_embedding_out)

    pos_embedding_out = self.pos_embedding(word_embedding_out)
    embedding_out = word_embedding_out + pos_embedding_out
    if token_type_ids is not None:
      type_embedding_out = self.type_embedding(token_type_ids)
      embedding_out += type_embedding_out
    embedding_out = self.layer_norm(embedding_out)
    embedding_out = self.dropout_layer(embedding_out)

    return embedding_out
@tf.keras.utils.register_keras_serializable(package='Text')
class MobileBertTransformer(tf.keras.layers.Layer):
  """Transformer block for MobileBERT.

  An implementation of one layer (block) of Transformer with bottleneck and
  inverted-bottleneck for MobilerBERT.

  Original paper for MobileBERT:
  https://arxiv.org/pdf/2004.02984.pdf
  """

  def __init__(self,
               hidden_size=512,
               num_attention_heads=4,
               intermediate_size=512,
               intermediate_act_fn='relu',
               hidden_dropout_prob=0.1,
               attention_probs_dropout_prob=0.1,
               intra_bottleneck_size=128,
               use_bottleneck_attention=False,
               key_query_shared_bottleneck=True,
               num_feedforward_networks=4,
               normalization_type='no_norm',
               initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
               **kwargs):
    """Class initialization.

    Args:
      hidden_size: Hidden size for the Transformer input and output tensor.
      num_attention_heads: Number of attention heads in the Transformer.
      intermediate_size: The size of the "intermediate" (a.k.a., feed
        forward) layer.
      intermediate_act_fn: The non-linear activation function to apply
        to the output of the intermediate/feed-forward layer.
      hidden_dropout_prob: Dropout probability for the hidden layers.
      attention_probs_dropout_prob: Dropout probability of the attention
        probabilities.
      intra_bottleneck_size: Size of bottleneck.
      use_bottleneck_attention: Use attention inputs from the bottleneck
        transformation. If true, the following `key_query_shared_bottleneck`
        will be ignored.
      key_query_shared_bottleneck: Whether to share linear transformation for
        keys and queries.
      num_feedforward_networks: Number of stacked feed-forward networks.
      normalization_type: The type of normalization_type, only `no_norm` and
        `layer_norm` are supported. `no_norm` represents the element-wise
        linear transformation for the student model, as suggested by the
        original MobileBERT paper. `layer_norm` is used for the teacher model.
      initializer: The initializer to use for the embedding weights and
        linear projection weights.
      **kwargs: keyword arguments.

    Raises:
      ValueError: A Tensor shape or parameter is invalid.
    """
    super(MobileBertTransformer, self).__init__(**kwargs)
    self.hidden_size = hidden_size
    self.num_attention_heads = num_attention_heads
    self.intermediate_size = intermediate_size
    self.intermediate_act_fn = intermediate_act_fn
    self.hidden_dropout_prob = hidden_dropout_prob
    self.attention_probs_dropout_prob = attention_probs_dropout_prob
    self.intra_bottleneck_size = intra_bottleneck_size
    self.use_bottleneck_attention = use_bottleneck_attention
    self.key_query_shared_bottleneck = key_query_shared_bottleneck
    self.num_feedforward_networks = num_feedforward_networks
    self.normalization_type = normalization_type
    self.initializer = tf.keras.initializers.get(initializer)

    if intra_bottleneck_size % num_attention_heads != 0:
      raise ValueError(
          (f'The bottleneck size {intra_bottleneck_size} is not a multiple '
           f'of the number of attention heads {num_attention_heads}.'))
    # Per-head width; attention operates at the bottleneck width, not the
    # full hidden width.
    attention_head_size = int(intra_bottleneck_size / num_attention_heads)

    # Sub-layers are kept in a dict keyed by stage name; construction order
    # and the explicit `name=` strings define the checkpoint variable scopes.
    self.block_layers = {}
    # add input bottleneck
    dense_layer_2d = tf.keras.layers.experimental.EinsumDense(
        'abc,cd->abd',
        output_shape=[None, self.intra_bottleneck_size],
        bias_axes='d',
        kernel_initializer=initializer,
        name='bottleneck_input/dense')
    layer_norm = _get_norm_layer(self.normalization_type,
                                 name='bottleneck_input/norm')
    self.block_layers['bottleneck_input'] = [dense_layer_2d,
                                             layer_norm]

    if self.key_query_shared_bottleneck:
      # Separate projection shared by keys and queries (values stay at the
      # full hidden width in this configuration; see `call`).
      dense_layer_2d = tf.keras.layers.experimental.EinsumDense(
          'abc,cd->abd',
          output_shape=[None, self.intra_bottleneck_size],
          bias_axes='d',
          kernel_initializer=initializer,
          name='kq_shared_bottleneck/dense')
      layer_norm = _get_norm_layer(self.normalization_type,
                                   name='kq_shared_bottleneck/norm')
      self.block_layers['kq_shared_bottleneck'] = [dense_layer_2d,
                                                   layer_norm]

    # add attention layer
    attention_layer = tf.keras.layers.MultiHeadAttention(
        num_heads=self.num_attention_heads,
        key_dim=attention_head_size,
        value_dim=attention_head_size,
        dropout=self.attention_probs_dropout_prob,
        output_shape=self.intra_bottleneck_size,
        kernel_initializer=initializer,
        name='attention')
    layer_norm = _get_norm_layer(self.normalization_type,
                                 name='attention/norm')
    self.block_layers['attention'] = [attention_layer,
                                      layer_norm]

    # add stacked feed-forward networks
    self.block_layers['ffn'] = []
    for ffn_layer_idx in range(self.num_feedforward_networks):
      layer_prefix = f'ffn_layer_{ffn_layer_idx}'
      layer_name = layer_prefix + '/intermediate_dense'
      intermediate_layer = tf.keras.layers.experimental.EinsumDense(
          'abc,cd->abd',
          activation=self.intermediate_act_fn,
          output_shape=[None, self.intermediate_size],
          bias_axes='d',
          kernel_initializer=initializer,
          name=layer_name)
      layer_name = layer_prefix + '/output_dense'
      output_layer = tf.keras.layers.experimental.EinsumDense(
          'abc,cd->abd',
          output_shape=[None, self.intra_bottleneck_size],
          bias_axes='d',
          kernel_initializer=initializer,
          name=layer_name)
      layer_name = layer_prefix + '/norm'
      layer_norm = _get_norm_layer(self.normalization_type,
                                   name=layer_name)
      self.block_layers['ffn'].append([intermediate_layer,
                                       output_layer,
                                       layer_norm])

    # add output bottleneck
    bottleneck = tf.keras.layers.experimental.EinsumDense(
        'abc,cd->abd',
        output_shape=[None, self.hidden_size],
        activation=None,
        bias_axes='d',
        kernel_initializer=initializer,
        name='bottleneck_output/dense')
    dropout_layer = tf.keras.layers.Dropout(
        self.hidden_dropout_prob,
        name='bottleneck_output/dropout')
    layer_norm = _get_norm_layer(self.normalization_type,
                                 name='bottleneck_output/norm')
    self.block_layers['bottleneck_output'] = [bottleneck,
                                              dropout_layer,
                                              layer_norm]

  def get_config(self):
    """Returns the config of the layer for serialization."""
    config = {
        'hidden_size': self.hidden_size,
        'num_attention_heads': self.num_attention_heads,
        'intermediate_size': self.intermediate_size,
        'intermediate_act_fn': self.intermediate_act_fn,
        'hidden_dropout_prob': self.hidden_dropout_prob,
        'attention_probs_dropout_prob': self.attention_probs_dropout_prob,
        'intra_bottleneck_size': self.intra_bottleneck_size,
        'use_bottleneck_attention': self.use_bottleneck_attention,
        'key_query_shared_bottleneck': self.key_query_shared_bottleneck,
        'num_feedforward_networks': self.num_feedforward_networks,
        'normalization_type': self.normalization_type,
        'initializer': tf.keras.initializers.serialize(self.initializer),
    }
    base_config = super(MobileBertTransformer, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def call(self,
           input_tensor,
           attention_mask=None,
           return_attention_scores=False):
    """Implementes the forward pass.

    Args:
      input_tensor: Float tensor of shape
        `(batch_size, seq_length, hidden_size)`.
      attention_mask: (optional) int32 tensor of shape
        `(batch_size, seq_length, seq_length)`, with 1 for positions that can
        be attended to and 0 in positions that should not be.
      return_attention_scores: If return attention score.

    Returns:
      layer_output: Float tensor of shape
        `(batch_size, seq_length, hidden_size)`.
      attention_scores (Optional): Only when return_attention_scores is True.

    Raises:
      ValueError: A Tensor shape or parameter is invalid.
    """
    input_width = input_tensor.shape.as_list()[-1]
    if input_width != self.hidden_size:
      raise ValueError(
          (f'The width of the input tensor {input_width} != '
           f'hidden size {self.hidden_size}'))

    prev_output = input_tensor

    # input bottleneck: project hidden_size down to intra_bottleneck_size.
    dense_layer = self.block_layers['bottleneck_input'][0]
    layer_norm = self.block_layers['bottleneck_input'][1]
    layer_input = dense_layer(prev_output)
    layer_input = layer_norm(layer_input)

    # Select where attention's query/key/value come from, depending on the
    # bottleneck configuration chosen at construction time.
    if self.use_bottleneck_attention:
      key_tensor = layer_input
      query_tensor = layer_input
      value_tensor = layer_input
    elif self.key_query_shared_bottleneck:
      dense_layer = self.block_layers['kq_shared_bottleneck'][0]
      layer_norm = self.block_layers['kq_shared_bottleneck'][1]
      shared_attention_input = dense_layer(prev_output)
      shared_attention_input = layer_norm(shared_attention_input)
      key_tensor = shared_attention_input
      query_tensor = shared_attention_input
      value_tensor = prev_output
    else:
      key_tensor = prev_output
      query_tensor = prev_output
      value_tensor = prev_output

    # attention layer
    attention_layer = self.block_layers['attention'][0]
    layer_norm = self.block_layers['attention'][1]
    # Positional argument order for MultiHeadAttention is
    # (query, value, key, attention_mask).
    attention_output, attention_scores = attention_layer(
        query_tensor,
        value_tensor,
        key_tensor,
        attention_mask,
        return_attention_scores=True,
    )
    attention_output = layer_norm(attention_output + layer_input)

    # stacked feed-forward networks, each with its own residual + norm.
    layer_input = attention_output
    for ffn_idx in range(self.num_feedforward_networks):
      intermediate_layer = self.block_layers['ffn'][ffn_idx][0]
      output_layer = self.block_layers['ffn'][ffn_idx][1]
      layer_norm = self.block_layers['ffn'][ffn_idx][2]
      intermediate_output = intermediate_layer(layer_input)
      layer_output = output_layer(intermediate_output)
      layer_output = layer_norm(layer_output + layer_input)
      layer_input = layer_output

    # output bottleneck: project back to hidden_size, with a residual from
    # the original block input.
    bottleneck = self.block_layers['bottleneck_output'][0]
    dropout_layer = self.block_layers['bottleneck_output'][1]
    layer_norm = self.block_layers['bottleneck_output'][2]
    layer_output = bottleneck(layer_output)
    layer_output = dropout_layer(layer_output)
    layer_output = layer_norm(layer_output + prev_output)

    if return_attention_scores:
      return layer_output, attention_scores
    else:
      return layer_output
@tf.keras.utils.register_keras_serializable(package='Text')
class MobileBertMaskedLM(tf.keras.layers.Layer):
  """Masked language model network head for BERT modeling.

  This layer implements a masked language model based on the provided
  transformer based encoder. It assumes that the encoder network being passed
  has a "get_embedding_table()" method. Different from canonical BERT's masked
  LM layer, when the embedding width is smaller than hidden_size, it adds an
  extra output weights in shape [vocab_size, (hidden_size - embedding_width)].
  """

  def __init__(self,
               embedding_table,
               activation=None,
               initializer='glorot_uniform',
               output='logits',
               **kwargs):
    """Class initialization.

    Args:
      embedding_table: The embedding table from encoder network.
      activation: The activation, if any, for the dense layer.
      initializer: The initializer for the dense layer. Defaults to a Glorot
        uniform initializer.
      output: The output style for this layer. Can be either `logits` or
        `predictions`.
      **kwargs: keyword arguments.
    """
    super(MobileBertMaskedLM, self).__init__(**kwargs)
    self.embedding_table = embedding_table
    self.activation = activation
    self.initializer = tf.keras.initializers.get(initializer)

    if output not in ('predictions', 'logits'):
      raise ValueError(
          ('Unknown `output` value "%s". `output` can be either "logits" or '
           '"predictions"') % output)
    self._output_type = output

  def build(self, input_shape):
    self._vocab_size, embedding_width = self.embedding_table.shape
    hidden_size = input_shape[-1]
    self.dense = tf.keras.layers.Dense(
        hidden_size,
        activation=self.activation,
        kernel_initializer=self.initializer,
        name='transform/dense')
    # When hidden_size exceeds the embedding width, learn extra output
    # columns so the output projection [vocab_size, hidden_size] can be
    # formed by concatenating the shared embedding table with these weights.
    if hidden_size > embedding_width:
      self.extra_output_weights = self.add_weight(
          'extra_output_weights',
          shape=(self._vocab_size, hidden_size - embedding_width),
          initializer=self.initializer,
          trainable=True)
    elif hidden_size == embedding_width:
      self.extra_output_weights = None
    else:
      raise ValueError(
          'hidden size %d cannot be smaller than embedding width %d.' %
          (hidden_size, embedding_width))
    self.layer_norm = tf.keras.layers.LayerNormalization(
        axis=-1, epsilon=1e-12, name='transform/LayerNorm')
    self.bias = self.add_weight(
        'output_bias/bias',
        shape=(self._vocab_size,),
        initializer='zeros',
        trainable=True)

    super(MobileBertMaskedLM, self).build(input_shape)

  def call(self, sequence_data, masked_positions):
    masked_lm_input = self._gather_indexes(sequence_data, masked_positions)
    lm_data = self.dense(masked_lm_input)
    lm_data = self.layer_norm(lm_data)
    # Tie the output projection to the (possibly extended) embedding table.
    if self.extra_output_weights is None:
      lm_data = tf.matmul(lm_data, self.embedding_table, transpose_b=True)
    else:
      lm_data = tf.matmul(
          lm_data,
          tf.concat([self.embedding_table, self.extra_output_weights], axis=1),
          transpose_b=True)
    logits = tf.nn.bias_add(lm_data, self.bias)
    # Prefer the static length when known; otherwise fall back to the
    # dynamic shape.
    masked_positions_length = masked_positions.shape.as_list()[1] or tf.shape(
        masked_positions)[1]
    logits = tf.reshape(logits,
                        [-1, masked_positions_length, self._vocab_size])
    if self._output_type == 'logits':
      return logits
    return tf.nn.log_softmax(logits)

  def get_config(self):
    raise NotImplementedError('MaskedLM cannot be directly serialized because '
                              'it has variable sharing logic.')

  def _gather_indexes(self, sequence_tensor, positions):
    """Gathers the vectors at the specific positions.

    Args:
      sequence_tensor: Sequence output of `BertModel` layer of shape
        `(batch_size, seq_length, num_hidden)` where `num_hidden` is number of
        hidden units of `BertModel` layer.
      positions: Positions ids of tokens in sequence to mask for pretraining
        of with dimension `(batch_size, num_predictions)` where
        `num_predictions` is maximum number of tokens to mask out and predict
        per each sequence.

    Returns:
      Masked out sequence tensor of shape
      `(batch_size * num_predictions, num_hidden)`.
    """
    sequence_shape = tf.shape(sequence_tensor)
    batch_size, seq_length = sequence_shape[0], sequence_shape[1]
    # Use the static width when available so the reshape keeps a known
    # last dimension.
    width = sequence_tensor.shape.as_list()[2] or sequence_shape[2]

    # Flatten (batch, seq) into one axis and offset each row's positions by
    # its row start so a single tf.gather picks out the masked vectors.
    flat_offsets = tf.reshape(
        tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1])
    flat_positions = tf.reshape(positions + flat_offsets, [-1])
    flat_sequence_tensor = tf.reshape(sequence_tensor,
                                      [batch_size * seq_length, width])
    output_tensor = tf.gather(flat_sequence_tensor, flat_positions)

    return output_tensor
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from official.nlp.modeling.layers import mobile_bert_layers
from official.nlp.modeling.networks import mobile_bert_encoder
def generate_fake_input(batch_size=1, seq_len=5, vocab_size=10000, seed=0):
  """Generate consistent fake integer input sequences."""
  np.random.seed(seed)
  # Draw one token id at a time, row-major, so a given seed always yields
  # exactly the same array (same draw order as a nested append loop).
  rows = [[np.random.randint(0, vocab_size) for _ in range(seq_len)]
          for _ in range(batch_size)]
  return np.asarray(rows)
class MobileBertEncoderTest(parameterized.TestCase, tf.test.TestCase):
  """Tests for MobileBERT embedding and transformer building blocks."""

  def test_embedding_layer_with_token_type(self):
    embedding_layer = mobile_bert_layers.MobileBertEmbedding(10, 8, 2, 16)
    word_ids = tf.Variable([[2, 3, 4, 5]])
    type_ids = tf.Variable([[0, 1, 1, 1]])
    embeddings = embedding_layer(word_ids, type_ids)
    # One example, four tokens, output embedding width 16.
    self.assertListEqual(embeddings.shape.as_list(), [1, 4, 16])

  def test_embedding_layer_without_token_type(self):
    embedding_layer = mobile_bert_layers.MobileBertEmbedding(10, 8, 2, 16)
    word_ids = tf.Variable([[2, 3, 4, 5]])
    embeddings = embedding_layer(word_ids)
    self.assertListEqual(embeddings.shape.as_list(), [1, 4, 16])

  def test_embedding_layer_get_config(self):
    embedding_layer = mobile_bert_layers.MobileBertEmbedding(
        word_vocab_size=16,
        word_embed_size=32,
        type_vocab_size=4,
        output_embed_size=32,
        max_sequence_length=32,
        normalization_type='layer_norm',
        initializer=tf.keras.initializers.TruncatedNormal(stddev=0.01),
        dropout_rate=0.5)
    config = embedding_layer.get_config()
    # Round-tripping through from_config must preserve the configuration.
    rebuilt = mobile_bert_layers.MobileBertEmbedding.from_config(config)
    self.assertEqual(config, rebuilt.get_config())

  def test_no_norm(self):
    no_norm_layer = mobile_bert_layers.NoNorm()
    inputs = tf.random.normal([2, 3, 4])
    outputs = no_norm_layer(inputs)
    # NoNorm is shape-preserving.
    self.assertListEqual(outputs.shape.as_list(), [2, 3, 4])

  @parameterized.named_parameters(('with_kq_shared_bottleneck', False),
                                  ('without_kq_shared_bottleneck', True))
  def test_transfomer_kq_shared_bottleneck(self, is_kq_shared):
    inputs = tf.random.uniform([2, 3, 512])
    transformer = mobile_bert_layers.MobileBertTransformer(
        key_query_shared_bottleneck=is_kq_shared)
    outputs = transformer(inputs)
    self.assertListEqual(outputs.shape.as_list(), [2, 3, 512])

  def test_transfomer_with_mask(self):
    inputs = tf.random.uniform([2, 3, 512])
    attention_mask = np.asarray(
        [[[0., 0., 1.], [0., 0., 1.], [0., 0., 1.]],
         [[0., 1., 1.], [0., 1., 1.], [0., 1., 1.]]])
    transformer = mobile_bert_layers.MobileBertTransformer()
    outputs = transformer(inputs, attention_mask)
    self.assertListEqual(outputs.shape.as_list(), [2, 3, 512])

  def test_transfomer_return_attention_score(self):
    seq_len = 5
    num_heads = 8
    inputs = tf.random.uniform([2, seq_len, 512])
    transformer = mobile_bert_layers.MobileBertTransformer(
        num_attention_heads=num_heads)
    _, attention_scores = transformer(inputs, return_attention_scores=True)
    self.assertListEqual(attention_scores.shape.as_list(),
                         [2, num_heads, seq_len, seq_len])

  def test_transformer_get_config(self):
    transformer = mobile_bert_layers.MobileBertTransformer(
        hidden_size=32,
        num_attention_heads=2,
        intermediate_size=48,
        intermediate_act_fn='gelu',
        hidden_dropout_prob=0.5,
        attention_probs_dropout_prob=0.4,
        intra_bottleneck_size=64,
        use_bottleneck_attention=True,
        key_query_shared_bottleneck=False,
        num_feedforward_networks=2,
        normalization_type='layer_norm',
        initializer=tf.keras.initializers.TruncatedNormal(stddev=0.01),
        name='block')
    config = transformer.get_config()
    rebuilt = mobile_bert_layers.MobileBertTransformer.from_config(config)
    self.assertEqual(config, rebuilt.get_config())
class MobileBertMaskedLMTest(tf.test.TestCase):
  """Tests for the MobileBERT masked language-model head."""

  def create_layer(self,
                   vocab_size,
                   hidden_size,
                   embedding_width,
                   output='predictions',
                   xformer_stack=None):
    """Builds a `MobileBertMaskedLM` sharing an encoder's embedding table.

    Args:
      vocab_size: Size of the output vocabulary.
      hidden_size: Hidden size of the encoder providing the embedding table.
      embedding_width: Width of the word embeddings.
      output: Output mode for the LM head, 'predictions' or 'logits'.
      xformer_stack: Optional pre-built `MobileBERTEncoder` whose embedding
        table is shared with the LM head; a fresh one is created when None.

    Returns:
      A `MobileBertMaskedLM` layer.
    """
    # First, create a transformer stack that we can use to get the LM's
    # vocabulary weight.
    if xformer_stack is None:
      xformer_stack = mobile_bert_encoder.MobileBERTEncoder(
          word_vocab_size=vocab_size,
          num_blocks=1,
          hidden_size=hidden_size,
          num_attention_heads=4,
          word_embed_size=embedding_width)

    # Create a maskedLM from the transformer stack.
    test_layer = mobile_bert_layers.MobileBertMaskedLM(
        embedding_table=xformer_stack.get_embedding_table(), output=output)
    return test_layer

  def test_layer_creation(self):
    vocab_size = 100
    sequence_length = 32
    hidden_size = 64
    embedding_width = 32
    num_predictions = 21
    test_layer = self.create_layer(
        vocab_size=vocab_size,
        hidden_size=hidden_size,
        embedding_width=embedding_width)

    # Make sure that the output tensor of the masked LM is the right shape.
    lm_input_tensor = tf.keras.Input(shape=(sequence_length, hidden_size))
    masked_positions = tf.keras.Input(shape=(num_predictions,), dtype=tf.int32)
    output = test_layer(lm_input_tensor, masked_positions=masked_positions)

    expected_output_shape = [None, num_predictions, vocab_size]
    self.assertEqual(expected_output_shape, output.shape.as_list())

  def test_layer_invocation_with_external_logits(self):
    vocab_size = 100
    sequence_length = 32
    hidden_size = 64
    embedding_width = 32
    num_predictions = 21
    xformer_stack = mobile_bert_encoder.MobileBERTEncoder(
        word_vocab_size=vocab_size,
        num_blocks=1,
        hidden_size=hidden_size,
        num_attention_heads=4,
        word_embed_size=embedding_width)
    test_layer = self.create_layer(
        vocab_size=vocab_size,
        hidden_size=hidden_size,
        embedding_width=embedding_width,
        xformer_stack=xformer_stack,
        output='predictions')
    logit_layer = self.create_layer(
        vocab_size=vocab_size,
        hidden_size=hidden_size,
        embedding_width=embedding_width,
        xformer_stack=xformer_stack,
        output='logits')

    # Create a model from the masked LM layer.
    lm_input_tensor = tf.keras.Input(shape=(sequence_length, hidden_size))
    masked_positions = tf.keras.Input(shape=(num_predictions,), dtype=tf.int32)
    output = test_layer(lm_input_tensor, masked_positions)
    logit_output = logit_layer(lm_input_tensor, masked_positions)
    logit_output = tf.keras.layers.Activation(tf.nn.log_softmax)(logit_output)
    # Tie the weights so predictions and log_softmax(logits) must agree.
    logit_layer.set_weights(test_layer.get_weights())
    model = tf.keras.Model([lm_input_tensor, masked_positions], output)
    logits_model = tf.keras.Model(([lm_input_tensor, masked_positions]),
                                  logit_output)

    # Invoke the masked LM on some fake data to make sure there are no runtime
    # errors in the code.
    batch_size = 3
    lm_input_data = 10 * np.random.random_sample(
        (batch_size, sequence_length, hidden_size))
    masked_position_data = np.random.randint(
        sequence_length, size=(batch_size, num_predictions))
    ref_outputs = model([lm_input_data, masked_position_data])
    outputs = logits_model([lm_input_data, masked_position_data])

    # Ensure that the tensor shapes are correct.
    expected_output_shape = (batch_size, num_predictions, vocab_size)
    self.assertEqual(expected_output_shape, ref_outputs.shape)
    self.assertEqual(expected_output_shape, outputs.shape)
    self.assertAllClose(ref_outputs, outputs)

  def test_layer_invocation(self):
    vocab_size = 100
    sequence_length = 32
    hidden_size = 64
    embedding_width = 32
    num_predictions = 21
    test_layer = self.create_layer(
        vocab_size=vocab_size,
        hidden_size=hidden_size,
        embedding_width=embedding_width)

    # Create a model from the masked LM layer.
    lm_input_tensor = tf.keras.Input(shape=(sequence_length, hidden_size))
    masked_positions = tf.keras.Input(shape=(num_predictions,), dtype=tf.int32)
    output = test_layer(lm_input_tensor, masked_positions)
    model = tf.keras.Model([lm_input_tensor, masked_positions], output)

    # Invoke the masked LM on some fake data to make sure there are no runtime
    # errors in the code. Sample positions across the whole sequence,
    # consistent with test_layer_invocation_with_external_logits above
    # (previously only positions 0/1 were exercised).
    batch_size = 3
    lm_input_data = 10 * np.random.random_sample(
        (batch_size, sequence_length, hidden_size))
    masked_position_data = np.random.randint(
        sequence_length, size=(batch_size, num_predictions))
    _ = model.predict([lm_input_data, masked_position_data])

  def test_unknown_output_type_fails(self):
    with self.assertRaisesRegex(ValueError, 'Unknown `output` value "bad".*'):
      _ = self.create_layer(
          vocab_size=8, hidden_size=8, embedding_width=4, output='bad')

  def test_hidden_size_smaller_than_embedding_width(self):
    hidden_size = 8
    sequence_length = 32
    num_predictions = 20
    with self.assertRaisesRegex(
        ValueError, 'hidden size 8 cannot be smaller than embedding width 16.'):
      test_layer = self.create_layer(
          vocab_size=8, hidden_size=8, embedding_width=16)
      lm_input_tensor = tf.keras.Input(shape=(sequence_length, hidden_size))
      masked_positions = tf.keras.Input(
          shape=(num_predictions,), dtype=tf.int32)
      _ = test_layer(lm_input_tensor, masked_positions)
# Run all test cases via the TensorFlow test runner when executed directly.
if __name__ == '__main__':
  tf.test.main()
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -12,29 +11,23 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Multi-channel Attention."""
# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import math
import tensorflow as tf
from official.modeling import tf_utils
from official.nlp.modeling.layers import attention
from official.nlp.modeling.layers import masked_softmax
class VotingAttention(tf.keras.layers.Layer):
"""Voting Attention layer.
Arguments:
num_heads: the number of attention heads.
head_size: per-head hidden size.
Args:
num_heads: The number of attention heads.
head_size: Per-head hidden size.
kernel_initializer: Initializer for dense layer kernels.
bias_initializer: Initializer for dense layer biases.
kernel_regularizer: Regularizer for dense layer kernels.
......@@ -107,43 +100,61 @@ class VotingAttention(tf.keras.layers.Layer):
return tf.nn.softmax(doc_attention_probs + infadder)
class MultiChannelAttention(attention.MultiHeadAttention):
class MultiChannelAttention(tf.keras.layers.MultiHeadAttention):
"""Multi-channel Attention layer.
Introduced in: https://arxiv.org/abs/2001.09386. Expects multiple
cross-attention target sequences.
Introduced in, [Generating Representative Headlines for News Stories
](https://arxiv.org/abs/2001.09386). Expects multiple cross-attention
target sequences.
Call args:
query: Query `Tensor` of shape `[B, T, dim]`.
value: Value `Tensor` of shape `[B, A, S, dim]`, where A denotes the
context_attention_weights: Context weights of shape `[B, N, T, A]`, where N
is the number of attention heads. Combines multi-channel sources
context tensors according to the distribution among channels.
key: Optional key `Tensor` of shape `[B, A, S, dim]`. If not given, will use
`value` for both `key` and `value`, which is the most common case.
attention_mask: A boolean mask of shape `[B, T, S]`, that prevents attention
to certain positions.
"""
def _build_attention(self, qkv_rank):
super(MultiChannelAttention, self)._build_attention(qkv_rank)
def _build_attention(self, rank):
super(MultiChannelAttention, self)._build_attention(rank)
self._masked_softmax = masked_softmax.MaskedSoftmax(mask_expansion_axes=[2])
def call(self, inputs, attention_mask=None):
from_tensor = inputs[0]
to_tensor = inputs[1]
doc_attention_probs = inputs[2]
def call(self,
query,
value,
key=None,
context_attention_weights=None,
attention_mask=None):
if not self._built_from_signature:
self._build_from_signature(query, value, key=key)
if key is None:
key = value
# Scalar dimensions referenced here:
# B = batch size (number of stories)
# A = num_docs (number of docs)
# F = `from_tensor` sequence length
# T = `to_tensor` sequence length
# F = target sequence length
# T = source sequence length
# N = `num_attention_heads`
# H = `size_per_head`
# `query_tensor` = [B, F, N ,H]
query_tensor = self._query_dense(from_tensor)
query_tensor = self._query_dense(query)
# `key_tensor` = [B, A, T, N, H]
key_tensor = self._key_dense(to_tensor)
key_tensor = self._key_dense(key)
# `value_tensor` = [B, A, T, N, H]
value_tensor = self._value_dense(to_tensor)
value_tensor = self._value_dense(value)
# Take the dot product between "query" and "key" to get the raw
# attention scores.
attention_scores = tf.einsum("BATNH,BFNH->BANFT", key_tensor, query_tensor)
attention_scores = tf.multiply(attention_scores,
1.0 / math.sqrt(float(self._key_size)))
1.0 / math.sqrt(float(self._key_dim)))
# Normalize the attention scores to probabilities.
# `attention_probs` = [B, A, N, F, T]
......@@ -156,7 +167,7 @@ class MultiChannelAttention(attention.MultiHeadAttention):
# `context_layer` = [B, F, N, H]
context_layer = tf.einsum("BANFT,BATNH->BAFNH", attention_probs,
value_tensor)
attention_output = tf.einsum("BNFA,BAFNH->BFNH", doc_attention_probs,
attention_output = tf.einsum("BNFA,BAFNH->BFNH", context_attention_weights,
context_layer)
attention_output = self._output_dense(attention_output)
return attention_output
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -12,12 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for nlp.nhnet.multi_channel_attention."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
"""Tests for nlp.nhnet.multi_channel_attention."""
import numpy as np
import tensorflow as tf
......@@ -41,14 +36,18 @@ class MultiChannelAttentionTest(tf.test.TestCase):
num_heads = 2
num_docs = 5
attention_layer = multi_channel_attention.MultiChannelAttention(
num_heads, key_size=2)
num_heads, key_dim=2)
from_data = 10 * np.random.random_sample((3, 4, 8))
to_data = 10 * np.random.random_sample((3, num_docs, 2, 8))
mask_data = np.random.randint(2, size=(3, num_docs, 4, 2))
doc_probs = np.random.randint(
2, size=(3, num_heads, 4, num_docs)).astype(float)
outputs = attention_layer([from_data, to_data, doc_probs], mask_data)
outputs = attention_layer(
query=from_data,
value=to_data,
context_attention_weights=doc_probs,
attention_mask=mask_data)
self.assertEqual(outputs.shape, (3, 4, 8))
......
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -11,78 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Keras-based one-hot embedding layer."""
# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import tensorflow as tf
@tf.keras.utils.register_keras_serializable(package="Text")
class OnDeviceEmbedding(tf.keras.layers.Layer):
"""Performs an embedding lookup suitable for accelerator devices.
This layer uses either tf.gather or tf.one_hot to translate integer indices to
float embeddings.
Arguments:
vocab_size: Number of elements in the vocabulary.
embedding_width: Output size of the embedding layer.
initializer: The initializer to use for the embedding weights. Defaults to
"glorot_uniform".
use_one_hot: Whether to use tf.one_hot over tf.gather for the embedding
lookup. Defaults to False (that is, using tf.gather). Setting this option
to True may improve performance, especially on small vocabulary sizes, but
will generally require more memory.
"""
def __init__(self,
vocab_size,
embedding_width,
initializer="glorot_uniform",
use_one_hot=False,
**kwargs):
super(OnDeviceEmbedding, self).__init__(**kwargs)
self._vocab_size = vocab_size
self._embedding_width = embedding_width
self._initializer = initializer
self._use_one_hot = use_one_hot
def get_config(self):
config = {
"vocab_size": self._vocab_size,
"embedding_width": self._embedding_width,
"initializer": self._initializer,
"use_one_hot": self._use_one_hot,
}
base_config = super(OnDeviceEmbedding, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def build(self, input_shape):
self.embeddings = self.add_weight(
"embeddings",
shape=[self._vocab_size, self._embedding_width],
initializer=self._initializer,
dtype=tf.float32)
from official.nlp import keras_nlp
super(OnDeviceEmbedding, self).build(input_shape)
def call(self, inputs):
flat_inputs = tf.reshape(inputs, [-1])
if self._use_one_hot:
one_hot_data = tf.one_hot(
flat_inputs, depth=self._vocab_size, dtype=self.embeddings.dtype)
embeddings = tf.matmul(one_hot_data, self.embeddings)
else:
embeddings = tf.gather(self.embeddings, flat_inputs)
embeddings = tf.reshape(
embeddings,
# Work around b/142213824: prefer concat to shape over a Python list.
tf.concat([tf.shape(inputs), [self._embedding_width]], axis=0))
embeddings.set_shape(inputs.shape.as_list() + [self._embedding_width])
return embeddings
OnDeviceEmbedding = keras_nlp.layers.OnDeviceEmbedding
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -11,115 +11,17 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Keras-based positional embedding layer."""
# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import math
from typing import Optional
import tensorflow as tf
from official.modeling import tf_utils
@tf.keras.utils.register_keras_serializable(package="Text")
class PositionEmbedding(tf.keras.layers.Layer):
  """Creates a positional embedding.

  This layer creates a positional embedding as described in "BERT: Pre-training
  of Deep Bidirectional Transformers for Language Understanding"
  (https://arxiv.org/abs/1810.04805).

  This layer can be set up to either create a statically shaped slice or a
  dynamically shaped slice. If `use_dynamic_slicing` is True, the input tensor
  can have a dynamic 1st dimension, while if `use_dynamic_slicing` is False the
  input size must be fixed.

  Arguments:
    use_dynamic_slicing: Whether to use the dynamic slicing path.
    max_sequence_length: The maximum size of the dynamic sequence. Only
      applicable if `use_dynamic_slicing` is True.
    initializer: The initializer to use for the embedding weights. Defaults to
      "glorot_uniform".
  """

  def __init__(self,
               initializer="glorot_uniform",
               use_dynamic_slicing=False,
               max_sequence_length=None,
               **kwargs):
    # We need to have a default dtype of float32, since the inputs (which Keras
    # usually uses to infer the dtype) will always be int32.
    if "dtype" not in kwargs:
      kwargs["dtype"] = "float32"

    super(PositionEmbedding, self).__init__(**kwargs)
    # Dynamic slicing needs an upper bound to size the weight table.
    if use_dynamic_slicing and max_sequence_length is None:
      raise ValueError(
          "If `use_dynamic_slicing` is True, `max_sequence_length` must be set."
      )
    self._max_sequence_length = max_sequence_length
    self._initializer = tf.keras.initializers.get(initializer)
    self._use_dynamic_slicing = use_dynamic_slicing

  def get_config(self):
    config = {
        "max_sequence_length": self._max_sequence_length,
        "initializer": tf.keras.initializers.serialize(self._initializer),
        "use_dynamic_slicing": self._use_dynamic_slicing,
    }
    base_config = super(PositionEmbedding, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def build(self, input_shape):
    """Implements build() for the layer."""
    dimension_list = input_shape.as_list()

    if len(dimension_list) != 3:
      raise ValueError("PositionEmbedding expects a 3-dimensional input tensor "
                       "of shape [batch, sequence, width]")
    seq_length = dimension_list[1]
    width = dimension_list[2]

    # If we are not using dynamic slicing, we must assume that the sequence
    # length is fixed and max_sequence_length should not be specified.
    if not self._use_dynamic_slicing:
      if seq_length is None:
        raise ValueError(
            "PositionEmbedding must have `use_dynamic_slicing` set "
            "to True (and max_sequence_length set) when the "
            "sequence (1st) dimension of the input is None.")
      if self._max_sequence_length is not None:
        raise ValueError(
            "When `use_dynamic_slicing` is False, max_sequence_length should "
            "not be specified and we ought to use seq_length to get the "
            "variable shape.")

    # The weight table covers the max length (dynamic) or the fixed length.
    if self._max_sequence_length is not None:
      weight_sequence_length = self._max_sequence_length
    else:
      weight_sequence_length = seq_length

    self._position_embeddings = self.add_weight(
        "embeddings",
        shape=[weight_sequence_length, width],
        initializer=self._initializer)

    super(PositionEmbedding, self).build(input_shape)

  def call(self, inputs):
    """Implements call() for the layer."""
    input_shape = tf_utils.get_shape_list(inputs, expected_rank=3)
    if self._use_dynamic_slicing:
      # Slice the weight table down to the actual sequence length.
      position_embeddings = self._position_embeddings[:input_shape[1], :]
    else:
      position_embeddings = self._position_embeddings
    return tf.broadcast_to(position_embeddings, input_shape)
# Alias used for the `embeddings_initializer` type annotation below.
Initializer = tf.keras.initializers.Initializer
@tf.keras.utils.register_keras_serializable(package="Text")
......@@ -131,16 +33,16 @@ class RelativePositionEmbedding(tf.keras.layers.Layer):
"Attention is All You Need", section 3.5.
(https://arxiv.org/abs/1706.03762).
Arguments:
Args:
hidden_size: Size of the hidden layer.
min_timescale: Minimum scale that will be applied at each position
max_timescale: Maximum scale that will be applied at each position.
"""
def __init__(self,
hidden_size,
min_timescale=1.0,
max_timescale=1.0e4,
hidden_size: int,
min_timescale: float = 1.0,
max_timescale: float = 1.0e4,
**kwargs):
# We need to have a default dtype of float32, since the inputs (which Keras
# usually uses to infer the dtype) will always be int32.
......@@ -150,7 +52,7 @@ class RelativePositionEmbedding(tf.keras.layers.Layer):
if "dtype" not in kwargs:
kwargs["dtype"] = "float32"
super(RelativePositionEmbedding, self).__init__(**kwargs)
super().__init__(**kwargs)
self._hidden_size = hidden_size
self._min_timescale = min_timescale
self._max_timescale = max_timescale
......@@ -160,7 +62,6 @@ class RelativePositionEmbedding(tf.keras.layers.Layer):
"hidden_size": self._hidden_size,
"min_timescale": self._min_timescale,
"max_timescale": self._max_timescale,
"length": self._length,
}
base_config = super(RelativePositionEmbedding, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
......@@ -172,22 +73,20 @@ class RelativePositionEmbedding(tf.keras.layers.Layer):
inputs: An tensor whose second dimension will be used as `length`. If
`None`, the other `length` argument must be specified.
length: An optional integer specifying the number of positions. If both
`inputs` and `length` are spcified, `length` must be equal to the
second dimension of `inputs`.
`inputs` and `length` are spcified, `length` must be equal to the second
dimension of `inputs`.
Returns:
A tensor in shape of [length, hidden_size].
A tensor in shape of `(length, hidden_size)`.
"""
if inputs is None and length is None:
raise ValueError(
"If inputs is None, `length` must be set in "
"RelativePositionEmbedding().")
raise ValueError("If inputs is None, `length` must be set in "
"RelativePositionEmbedding().")
if inputs is not None:
input_shape = tf_utils.get_shape_list(inputs)
if length is not None and length != input_shape[1]:
raise ValueError(
"If inputs is not None, `length` must equal to input_shape[1]."
)
"If inputs is not None, `length` must equal to input_shape[1].")
length = input_shape[1]
position = tf.cast(tf.range(length), tf.float32)
num_timescales = self._hidden_size // 2
......@@ -198,8 +97,141 @@ class RelativePositionEmbedding(tf.keras.layers.Layer):
inv_timescales = min_timescale * tf.exp(
tf.cast(tf.range(num_timescales), tf.float32) *
-log_timescale_increment)
scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales,
0)
position_embeddings = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)],
axis=1)
scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(
inv_timescales, 0)
position_embeddings = tf.concat(
[tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)
return position_embeddings
def _relative_position_bucket(relative_position,
                              bidirectional=True,
                              num_buckets=32,
                              max_distance=128):
  """Translate relative position to a bucket number for relative attention.

  The relative position is defined as memory_position - query_position, i.e.
  the distance in tokens from the attending position to the attended-to
  position.

  If `bidirectional=False`, then positive relative positions are invalid.

  We use smaller buckets for small absolute relative_position and larger
  buckets for larger absolute relative_positions.

  All relative positions >=max_distance map to the same bucket.
  All relative positions <=-max_distance map to the same bucket.

  This should allow for more graceful generalization to longer sequences
  than the model has been trained on.

  Args:
    relative_position: An int32 Tensor
    bidirectional: A boolean - whether the attention is bidirectional
    num_buckets: An integer
    max_distance: An integer

  Returns:
    A Tensor with the same shape as relative_position, containing int32
    values in the range [0, num_buckets)
  """
  ret = 0
  n = -relative_position
  if bidirectional:
    # Split the buckets in half: one half per sign of the offset, selected
    # by adding `num_buckets` when n is negative.
    num_buckets //= 2
    ret += tf.cast(tf.math.less(n, 0), tf.int32) * num_buckets
    n = tf.math.abs(n)
  else:
    n = tf.math.maximum(n, 0)
  # now n is in the range [0, inf)
  max_exact = num_buckets // 2
  is_small = tf.math.less(n, max_exact)
  # Distances >= max_exact are assigned buckets on a log scale, saturating
  # at the last bucket for distances >= max_distance.
  val_if_large = max_exact + tf.dtypes.cast(
      tf.math.log(tf.cast(n, tf.float32) / max_exact) /
      math.log(max_distance / max_exact) * (num_buckets - max_exact),
      tf.int32,
  )
  val_if_large = tf.math.minimum(val_if_large, num_buckets - 1)
  ret += tf.where(is_small, n, val_if_large)
  return ret
@tf.keras.utils.register_keras_serializable(package="Text")
class RelativePositionBias(tf.keras.layers.Layer):
  """Relative position embedding via per-head bias in T5 style.

  Reference implementation in MeshTF:
  https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/transformer/transformer_layers.py#L1000

  This layer implements the relative position bias used in "Exploring the Limits
  of Transfer Learning with a Unified Text-to-Text Transformer"
  (https://arxiv.org/abs/1910.10683)
  """

  def __init__(self,
               num_heads: int,
               relative_attention_num_buckets: int = 32,
               relative_attention_max_distance: int = 128,
               bidirectional: bool = True,
               embeddings_initializer: Optional[Initializer] = None,
               **kwargs):
    super().__init__(**kwargs)
    self.num_heads = num_heads
    self.relative_attention_num_buckets = relative_attention_num_buckets
    self.bidirectional = bidirectional
    self.relative_attention_max_distance = relative_attention_max_distance
    if embeddings_initializer:
      self._embed_init = embeddings_initializer
    else:
      self._embed_init = tf.keras.initializers.TruncatedNormal(stddev=1.0)
    # One learnable bias scalar per (bucket, head) pair, created eagerly in
    # __init__ (not in build) under the layer's name scope.
    with tf.name_scope(self.name):
      self._relative_attention_bias = self.add_weight(
          "rel_embedding",
          shape=[self.relative_attention_num_buckets, self.num_heads],
          initializer=self._embed_init,
          dtype=self.dtype,
          trainable=True)

  def get_config(self):
    config = {
        "num_heads":
            self.num_heads,
        "relative_attention_num_buckets":
            self.relative_attention_num_buckets,
        "relative_attention_max_distance":
            self.relative_attention_max_distance,
        "bidirectional":
            self.bidirectional,
        "embeddings_initializer":
            tf.keras.initializers.serialize(self._embed_init),
    }
    base_config = super().get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def call(self, query: tf.Tensor, key: tf.Tensor):
    """Implements the forward pass.

    Args:
      query: query input tensor shape [batch, query length, hidden size].
      key: key input tensor shape [batch, key length, hidden size].

    Returns:
      A tensor in shape of [batch, heads, query length, key length].
    """
    batch_size, qlen = tf_utils.get_shape_list(query)[:2]
    klen = tf_utils.get_shape_list(key)[1]
    # relative_position[i, j] = j - i, shape [qlen, klen].
    context_position = tf.range(qlen)[:, None]
    memory_position = tf.range(klen)[None, :]
    relative_position = memory_position - context_position
    rp_bucket = _relative_position_bucket(
        relative_position,
        bidirectional=self.bidirectional,
        num_buckets=self.relative_attention_num_buckets,
        max_distance=self.relative_attention_max_distance)
    # Look up the per-head bias for each bucket, then broadcast over batch.
    values = tf.nn.embedding_lookup(self._relative_attention_bias, rp_bucket)
    values = tf.expand_dims(
        tf.transpose(values, [2, 0, 1]),
        axis=0)  # shape (1, num_heads, qlen, klen)
    values = tf.tile(values, [batch_size, 1, 1, 1])
    return values
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -11,13 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for Keras-based positional embedding layer."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
"""Tests for Keras-based positional embedding layer."""
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
......@@ -28,75 +25,7 @@ from official.nlp.modeling.layers import position_embedding
# This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
# guarantees forward compatibility of this code for the V2 switchover.
@keras_parameterized.run_all_keras_modes
class PositionEmbeddingLayerTest(keras_parameterized.TestCase):
  """Tests for `position_embedding.PositionEmbedding`."""

  def test_static_layer_output_shape(self):
    layer = position_embedding.PositionEmbedding()
    # Build a rank-3 input; tf.keras.Input leaves the batch dim implicit.
    sequence_length, width = 21, 30
    inputs = tf.keras.Input(shape=(sequence_length, width))
    outputs = layer(inputs)
    # With static shapes, every non-batch output dimension must match the
    # input.
    self.assertEqual([None, sequence_length, width], outputs.shape.as_list())
    # float32 is the layer's default output dtype.
    self.assertEqual(tf.float32, outputs.dtype)

  def test_float16_dtype(self):
    layer = position_embedding.PositionEmbedding(dtype="float16")
    sequence_length, width = 21, 30
    inputs = tf.keras.Input(shape=(sequence_length, width))
    outputs = layer(inputs)
    # Shapes behave exactly as in the float32 case.
    self.assertEqual([None, sequence_length, width], outputs.shape.as_list())
    # The requested dtype propagates to the output tensor.
    self.assertEqual(tf.float16, outputs.dtype)

  def test_dynamic_layer_output_shape(self):
    max_sequence_length = 40
    layer = position_embedding.PositionEmbedding(
        use_dynamic_slicing=True, max_sequence_length=max_sequence_length)
    width = 30
    inputs = tf.keras.Input(shape=(None, width))
    outputs = layer(inputs)
    # With dynamic slicing, dimensions that are unknown on the input remain
    # unknown on the output.
    self.assertEqual([None, None, width], outputs.shape.as_list())

  def test_dynamic_layer_slicing(self):
    max_sequence_length = 40
    layer = position_embedding.PositionEmbedding(
        use_dynamic_slicing=True, max_sequence_length=max_sequence_length)
    width = 30
    inputs = tf.keras.Input(shape=(None, width))
    model = tf.keras.Model(inputs, layer(inputs))

    # Feed a sequence shorter than max_sequence_length to force a down-slice.
    # Batch size 1 sidesteps Keras' requirement that Model inputs and outputs
    # share batch cardinality; in practice this layer lives inside a model.
    input_length = 17
    input_data = np.ones((1, input_length, width))
    output_data = model.predict(input_data)
    self.assertAllEqual([1, input_length, width], output_data.shape)
class RelativePositionEmbeddingLayerTest(keras_parameterized.TestCase):
def test_relative_tensor_input(self):
hidden_size = 8
......@@ -127,5 +56,33 @@ class PositionEmbeddingLayerTest(keras_parameterized.TestCase):
expected_output_tensor = tf.constant([[0, 0, 0, 0, 1, 1, 1, 1]])
self.assertAllEqual(output_tensor, expected_output_tensor)
@keras_parameterized.run_all_keras_modes
class RelativePositionBiasTest(keras_parameterized.TestCase):
  """Tests for `position_embedding.RelativePositionBias`."""

  @parameterized.named_parameters(("bidirectional", True),
                                  ("unidirectional", False))
  def test_relative_position_bias(self, bidirectional):
    queries = tf.zeros((4, 4, 2))
    keys = tf.zeros((4, 2, 2))
    bias_layer = position_embedding.RelativePositionBias(
        num_heads=3,
        bidirectional=bidirectional,
        name="foo")
    # Output is [batch, num_heads, query_length, key_length].
    self.assertEqual(bias_layer(queries, keys).shape, (4, 3, 4, 2))
    # Exactly one trainable embedding table, scoped under the layer name.
    self.assertLen(bias_layer.trainable_variables, 1)
    self.assertEqual(bias_layer.trainable_variables[0].name,
                     "foo/rel_embedding:0")

  def test_relative_position_bucket(self):
    # relative_position[i][j] = memory_position[j] - context_position[i].
    context_position = tf.range(3)[:, None]
    memory_position = tf.range(2)[None, :]
    relative_position = memory_position - context_position
    # Bidirectional (default): positive offsets map into the upper half of
    # the bucket range.
    buckets = position_embedding._relative_position_bucket(relative_position)
    self.assertAllEqual(buckets.numpy(), np.array([[0, 17], [1, 0], [2, 1]]))
    # Unidirectional: positive (future) offsets collapse to bucket 0.
    buckets = position_embedding._relative_position_bucket(
        relative_position, bidirectional=False)
    self.assertAllEqual(buckets.numpy(), np.array([[0, 0], [1, 0], [2, 1]]))
# Run every test case in this module when executed as a script.
if __name__ == "__main__":
  tf.test.main()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Keras-based relative attention layers."""
import math
import string
import tensorflow as tf
_CHR_IDX = string.ascii_lowercase
def _build_proj_equation(free_dims, bound_dims, output_dims):
"""Builds an einsum equation for projections inside multi-head attention."""
input_str = ""
kernel_str = ""
output_str = ""
bias_axes = ""
letter_offset = 0
for i in range(free_dims):
char = _CHR_IDX[i + letter_offset]
input_str += char
output_str += char
letter_offset += free_dims
for i in range(bound_dims):
char = _CHR_IDX[i + letter_offset]
input_str += char
kernel_str += char
letter_offset += bound_dims
for i in range(output_dims):
char = _CHR_IDX[i + letter_offset]
kernel_str += char
output_str += char
bias_axes += char
equation = "%s,%s->%s" % (input_str, kernel_str, output_str)
return equation, bias_axes, len(output_str)
def _get_output_shape(output_rank, known_last_dims):
return [None] * (output_rank - len(known_last_dims)) + list(known_last_dims)
def _rel_shift(x, klen=-1):
  """Performs relative shift to form the relative attention score.

  Args:
    x: Rank-4 `Tensor` of positional attention logits. NOTE(review): the
      transposes below assume the last two axes hold the (query position,
      relative position) grid, i.e. a `[B, N, T, L]` layout — confirm against
      `compute_attention`, which passes the einsum output here.
    klen: Length to which the shifted relative axis is sliced. Defaults to -1
      (keep everything).

  Returns:
    A `Tensor` of the same rank as `x` with the relative-position axis
    realigned so that each query position lines up with the correct offsets.
  """
  # Bring the (query, relative) axes to the front so the shift can be done
  # with reshapes over the leading dimensions.
  x = tf.transpose(x, perm=[2, 3, 0, 1])
  x_size = tf.shape(x)
  # Reinterpreting the buffer with the first two dims swapped (a reshape, not
  # a transpose) staggers each row by one position — the relative-shift trick.
  x = tf.reshape(x, [x_size[1], x_size[0], x_size[2], x_size[3]])
  # Drop the first row, which holds the misaligned entries.
  x = tf.slice(x, [1, 0, 0, 0], [-1, -1, -1, -1])
  # Restore the original leading layout, now one column shorter.
  x = tf.reshape(x, [x_size[0], x_size[1] - 1, x_size[2], x_size[3]])
  # Keep only the first `klen` relative positions.
  x = tf.slice(x, [0, 0, 0, 0], [-1, klen, -1, -1])
  # Undo the initial transpose.
  x = tf.transpose(x, perm=[2, 3, 0, 1])
  return x
@tf.keras.utils.register_keras_serializable(package="Text")
class MultiHeadRelativeAttention(tf.keras.layers.MultiHeadAttention):
  """A multi-head attention layer with relative attention + position encoding.

  This layer shares the same input/output projections as the common
  `tf.keras.layers.MultiHeadAttention` layer.

  When it calculates attention logits, position encoding is projected to form
  relative keys. The logits are composed by shifted relative logits and
  content logits.

  **Note: This layer is currently experimental.

  Attributes:
    kernel_initializer: The kernel initializer. Defaults to variance_scaling.

  Call args:
    query: Query `Tensor` of shape `[B, T, dim]`.
    value: Value `Tensor` of shape `[B, S, dim]`.
    content_attention_bias: Bias `Tensor` for content based attention of shape
      `[num_heads, dim]`.
    positional_attention_bias: Bias `Tensor` for position based attention of
      shape `[num_heads, dim]`.
    key: Optional key `Tensor` of shape `[B, S, dim]`. If not given, will use
      `value` for both `key` and `value`, which is the most common case.
    relative_position_encoding: Relative positional encoding `Tensor` of shape
      `[B, L, dim]`.
    segment_matrix: Optional `Tensor` representing segmentation IDs used in
      XLNet of shape `[B, S, S + M]`.
    segment_encoding: Optional `Tensor` representing the segmentation encoding
      as used in XLNet of shape `[2, num_heads, dim]`.
    segment_attention_bias: Optional trainable bias parameter added to the
      query head when calculating the segment-based attention score used in
      XLNet of shape `[num_heads, dim]`.
    state: Optional `Tensor` of shape `[B, M, E]` where M is the length of the
      state or memory. If passed, this is also attended over as in
      Transformer XL.
    attention_mask: A boolean mask of shape `[B, T, S]` that prevents
      attention to certain positions.
  """

  def __init__(self,
               kernel_initializer="variance_scaling",
               **kwargs):
    super().__init__(kernel_initializer=kernel_initializer,
                     **kwargs)

  def _build_from_signature(self, query, value, key=None):
    """Builds projection layers, plus one extra for the position encoding."""
    super(MultiHeadRelativeAttention, self)._build_from_signature(
        query=query,
        value=value,
        key=key)
    # Accept either Tensors (with a `.shape`) or TensorShapes directly.
    if hasattr(value, "shape"):
      value_shape = tf.TensorShape(value.shape)
    else:
      value_shape = value
    if key is None:
      key_shape = value_shape
    elif hasattr(key, "shape"):
      key_shape = tf.TensorShape(key.shape)
    else:
      key_shape = key

    common_kwargs = dict(
        kernel_initializer=self._kernel_initializer,
        bias_initializer=self._bias_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer,
        activity_regularizer=self._activity_regularizer,
        kernel_constraint=self._kernel_constraint,
        bias_constraint=self._bias_constraint)

    with tf.init_scope():
      # Mirror the key projection for the relative position encoding: project
      # `[..., dim]` into `[..., num_heads, key_dim]` with no bias.
      # NOTE(review): `tf.keras.layers.experimental.EinsumDense` is the
      # pre-2.6 path; newer TF exposes it as `tf.keras.layers.EinsumDense`.
      einsum_equation, _, output_rank = _build_proj_equation(
          key_shape.rank - 1, bound_dims=1, output_dims=2)
      self._encoding_dense = tf.keras.layers.experimental.EinsumDense(
          einsum_equation,
          output_shape=_get_output_shape(output_rank - 1,
                                         [self._num_heads, self._key_dim]),
          bias_axes=None,
          name="encoding",
          **common_kwargs)

  def compute_attention(self,
                        query,
                        key,
                        value,
                        position,
                        content_attention_bias,
                        positional_attention_bias,
                        segment_matrix=None,
                        segment_encoding=None,
                        segment_attention_bias=None,
                        attention_mask=None):
    """Computes the attention.

    This function defines the computation inside `call` with projected
    multihead Q, K, V, R inputs.

    Args:
      query: Projected query `Tensor` of shape `[B, T, N, key_dim]`.
      key: Projected key `Tensor` of shape `[B, S + M, N, key_dim]`.
      value: Projected value `Tensor` of shape `[B, S + M, N, key_dim]`.
      position: Projected position `Tensor` of shape `[B, L, N, key_dim]`.
      content_attention_bias: Trainable bias parameter added to the query head
        when calculating the content-based attention score.
      positional_attention_bias: Trainable bias parameter added to the query
        head when calculating the position-based attention score.
      segment_matrix: Optional `Tensor` representing segmentation IDs used in
        XLNet.
      segment_encoding: Optional trainable `Tensor` representing the
        segmentation encoding as used in XLNet.
      segment_attention_bias: Optional trainable bias parameter added to the
        query head when calculating the segment-based attention score used in
        XLNet.
      attention_mask: (default None) Optional mask that is added to attention
        logits. If state is not None, the mask source sequence dimension
        should extend M.

    Returns:
      attention_output: Multi-headed output of attention computation of shape
        `[B, S, N, key_dim]`.
    """
    # Content-based logits: (query + content bias) against the keys.
    content_attention = tf.einsum(self._dot_product_equation,
                                  key,
                                  query + content_attention_bias)
    # Position-based logits: (query + positional bias) against relative keys.
    positional_attention = tf.einsum(self._dot_product_equation,
                                     position,
                                     query + positional_attention_bias)
    # Realign the relative axis so each query position sees its own offsets,
    # then truncate to the key length.
    positional_attention = _rel_shift(
        positional_attention, klen=tf.shape(content_attention)[3])

    if segment_matrix is not None:
      # `segment_encoding` holds two per-head embeddings (axis 0 of size 2);
      # `tf.where` picks index 1 where `segment_matrix` is True and index 0
      # elsewhere, broadcasting both over the full logits shape.
      segment_attention = tf.einsum("bind,snd->bnis",
                                    query + segment_attention_bias,
                                    segment_encoding)
      target_shape = tf.shape(positional_attention)
      segment_attention = tf.where(
          tf.broadcast_to(tf.expand_dims(segment_matrix, 1), target_shape),
          tf.broadcast_to(segment_attention[:, :, :, 1:], target_shape),
          tf.broadcast_to(segment_attention[:, :, :, :1], target_shape))
      attention_sum = (
          content_attention + positional_attention + segment_attention)
    else:
      attention_sum = content_attention + positional_attention

    # Standard scaled softmax attention over the summed logits, with optional
    # masking and dropout.
    attention_scores = tf.multiply(
        attention_sum, 1.0 / math.sqrt(float(self._key_dim)))
    attention_scores = self._masked_softmax(attention_scores, attention_mask)
    attention_output = self._dropout_layer(attention_scores)
    attention_output = tf.einsum(self._combine_equation,
                                 attention_output,
                                 value)
    return attention_output

  def call(self,
           query,
           value,
           content_attention_bias,
           positional_attention_bias,
           key=None,
           relative_position_encoding=None,
           segment_matrix=None,
           segment_encoding=None,
           segment_attention_bias=None,
           state=None,
           attention_mask=None):
    """Compute multi-head relative attention over inputs.

    Size glossary:
      * Number of heads (H): the number of attention heads.
      * Value size (V): the size of each value embedding per head.
      * Key size (K): the size of each key embedding per head. Equally, the
        size of each query embedding per head. Typically K <= V.
      * Batch dimensions (B).
      * Query (target) attention axes shape (T).
      * Value (source) attention axes shape (S), the rank must match the
        target.
      * Encoding length (L): The relative positional encoding length.

    Args:
      query: attention input.
      value: attention input.
      content_attention_bias: A trainable bias parameter added to the query
        head when calculating the content-based attention score.
      positional_attention_bias: A trainable bias parameter added to the query
        head when calculating the position-based attention score.
      key: attention input.
      relative_position_encoding: relative positional encoding for key and
        value.
      segment_matrix: Optional `Tensor` representing segmentation IDs used in
        XLNet.
      segment_encoding: Optional `Tensor` representing the segmentation
        encoding as used in XLNet.
      segment_attention_bias: Optional trainable bias parameter added to the
        query head when calculating the segment-based attention score used in
        XLNet.
      state: (default None) optional state. If passed, this is also attended
        over as in TransformerXL.
      attention_mask: (default None) Optional mask that is added to attention
        logits. If state is not None, the mask source sequence dimension
        should extend M.

    Returns:
      attention_output: The result of the computation, of shape [B, T, E],
        where `T` is for target sequence shapes and `E` is the query input
        last dimension if `output_shape` is `None`. Otherwise, the multi-head
        outputs are projected to the shape specified by `output_shape`.
    """
    # Lazily build projection layers on first call, once shapes are known.
    if not self._built_from_signature:
      self._build_from_signature(query, value, key=key)
    if key is None:
      key = value

    # Prepend the memory/state to key and value (Transformer-XL style) so the
    # current segment attends over it; queries stay current-segment only.
    if state is not None and state.shape.ndims > 1:
      value = tf.concat([state, value], 1)
      key = tf.concat([state, key], 1)

    # `query` = [B, T, N, H]
    query = self._query_dense(query)

    # `key` = [B, S + M, N, H]
    key = self._key_dense(key)

    # `value` = [B, S + M, N, H]
    value = self._value_dense(value)

    # `position` = [B, L, N, H]
    position = self._encoding_dense(relative_position_encoding)

    attention_output = self.compute_attention(
        query=query,
        key=key,
        value=value,
        position=position,
        content_attention_bias=content_attention_bias,
        positional_attention_bias=positional_attention_bias,
        segment_matrix=segment_matrix,
        segment_encoding=segment_encoding,
        segment_attention_bias=segment_attention_bias,
        attention_mask=attention_mask)

    # `attention_output` = [B, S, N, H]; project back to the output width.
    attention_output = self._output_dense(attention_output)

    return attention_output
@tf.keras.utils.register_keras_serializable(package="Text")
class TwoStreamRelativeAttention(MultiHeadRelativeAttention):
  """Two-stream relative self-attention for XLNet.

  In XLNet, each token has two associated vectors at each self-attention
  layer, the content stream (h) and the query stream (g). The content stream
  is the self-attention stream as in Transformer XL and represents the context
  and content (the token itself). The query stream only has access to
  contextual information and the position, but not the content.

  This layer shares the same build signature as
  `tf.keras.layers.MultiHeadAttention` but has different input/output
  projections.

  **Note: This layer is currently experimental.

  Call args:
    content_stream: `Tensor` of shape `[B, T, dim]`.
    content_attention_bias: Bias `Tensor` for content based attention of shape
      `[num_heads, dim]`.
    positional_attention_bias: Bias `Tensor` for position based attention of
      shape `[num_heads, dim]`.
    query_stream: `Tensor` of shape `[B, P, dim]`.
    target_mapping: `Tensor` of shape `[B, P, S]`.
    relative_position_encoding: Relative positional encoding `Tensor` of shape
      `[B, L, dim]`.
    segment_matrix: Optional `Tensor` representing segmentation IDs used in
      XLNet of shape `[B, S, S + M]`.
    segment_encoding: Optional `Tensor` representing the segmentation encoding
      as used in XLNet of shape `[2, num_heads, dim]`.
    segment_attention_bias: Optional trainable bias parameter added to the
      query head when calculating the segment-based attention score used in
      XLNet of shape `[num_heads, dim]`.
    state: Optional `Tensor` of shape [B, M, E] where M is the length of the
      state or memory. If passed, this is also attended over as in
      Transformer XL.
    content_attention_mask: a boolean mask of shape `[B, T, S]` that prevents
      attention to certain positions for content attention computation.
    query_attention_mask: a boolean mask of shape `[B, T, S]` that prevents
      attention to certain position for query attention computation.
  """

  def call(self,
           content_stream,
           content_attention_bias,
           positional_attention_bias,
           query_stream,
           relative_position_encoding,
           target_mapping=None,
           segment_matrix=None,
           segment_encoding=None,
           segment_attention_bias=None,
           state=None,
           content_attention_mask=None,
           query_attention_mask=None):
    """Compute multi-head relative attention over inputs.

    Size glossary:
      * Number of heads (H): the number of attention heads.
      * Value size (V): the size of each value embedding per head.
      * Key size (K): the size of each key embedding per head. Equally, the
        size of each query embedding per head. Typically K <= V.
      * Number of predictions (P): the number of predictions.
      * Batch dimensions (B).
      * Query (target) attention axes shape (T).
      * Value (source) attention axes shape (S), the rank must match the
        target.
      * Encoding length (L): The relative positional encoding length.

    Args:
      content_stream: The content representation, commonly referred to as h.
        This serves a similar role to the standard hidden states in
        Transformer-XL.
      content_attention_bias: A trainable bias parameter added to the query
        head when calculating the content-based attention score.
      positional_attention_bias: A trainable bias parameter added to the query
        head when calculating the position-based attention score.
      query_stream: The query representation, commonly referred to as g. This
        only has access to contextual information and position, but not
        content. If not provided, then this is MultiHeadRelativeAttention with
        self-attention.
      relative_position_encoding: relative positional encoding for key and
        value.
      target_mapping: Optional `Tensor` representing the target mapping used
        in partial prediction.
      segment_matrix: Optional `Tensor` representing segmentation IDs used in
        XLNet.
      segment_encoding: Optional `Tensor` representing the segmentation
        encoding as used in XLNet.
      segment_attention_bias: Optional trainable bias parameter added to the
        query head when calculating the segment-based attention score.
      state: (default None) optional state. If passed, this is also attended
        over as in TransformerXL and XLNet.
      content_attention_mask: (default None) Optional mask that is added to
        content attention logits. If state is not None, the mask source
        sequence dimension should extend M.
      query_attention_mask: (default None) Optional mask that is added to
        query attention logits. If state is not None, the mask source sequence
        dimension should extend M.

    Returns:
      content_attention_output, query_attention_output: the results of the
        computation, both of shape [B, T, E]. `T` is for target sequence
        shapes, `E` is the query input last dimension if `output_shape` is
        `None`. Otherwise, the multi-head outputs are projected to the shape
        specified by `output_shape`.
    """
    # Both streams share projections built from the content stream's shape.
    if not self._built_from_signature:
      self._build_from_signature(content_stream, content_stream,
                                 content_stream)
    # Keys/values cover the memory (if any) plus the current segment.
    if state is not None and state.shape.ndims > 1:
      content_and_memory_stream = tf.concat([state, content_stream], 1)
    else:
      content_and_memory_stream = content_stream

    # `query` = [B, T, N, H]
    query = self._query_dense(content_stream)

    # `key` = [B, S + M, N, H]
    key = self._key_dense(content_and_memory_stream)

    # `value` = [B, S + M, N, H]
    value = self._value_dense(content_and_memory_stream)

    # `position` = [B, L, N, H]
    position = self._encoding_dense(relative_position_encoding)

    # Content stream (h): ordinary relative self-attention over key/value.
    content_attention_output = self.compute_attention(
        query=query,
        key=key,
        value=value,
        position=position,
        content_attention_bias=content_attention_bias,
        positional_attention_bias=positional_attention_bias,
        segment_matrix=segment_matrix,
        segment_encoding=segment_encoding,
        segment_attention_bias=segment_attention_bias,
        attention_mask=content_attention_mask)

    # `content_attention_output` = [B, S, N, H]
    content_attention_output = self._output_dense(content_attention_output)

    # Query stream (g): reuses the content stream's keys/values but projects
    # its own queries; may use its own (stricter) attention mask.
    query_attention_output = None
    if query_stream is not None:
      query = self._query_dense(query_stream)
      if target_mapping is not None:
        # Map the P prediction queries onto sequence positions before
        # attending, then map the attention output back to prediction slots.
        query = tf.einsum("bmnd,bml->blnd", query, target_mapping)
        query_attention_output = self.compute_attention(
            query=query,
            key=key,
            value=value,
            position=position,
            content_attention_bias=content_attention_bias,
            positional_attention_bias=positional_attention_bias,
            segment_matrix=segment_matrix,
            segment_encoding=segment_encoding,
            segment_attention_bias=segment_attention_bias,
            attention_mask=query_attention_mask)
        query_attention_output = tf.einsum("blnd,bml->bmnd",
                                           query_attention_output,
                                           target_mapping)
      else:
        query_attention_output = self.compute_attention(
            query=query,
            key=key,
            value=value,
            position=position,
            content_attention_bias=content_attention_bias,
            positional_attention_bias=positional_attention_bias,
            segment_matrix=segment_matrix,
            segment_encoding=segment_encoding,
            segment_attention_bias=segment_attention_bias,
            attention_mask=query_attention_mask)
      query_attention_output = self._output_dense(query_attention_output)

    return content_attention_output, query_attention_output
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for the attention layer."""
import numpy as np
import tensorflow as tf
from tensorflow.python.distribute import combinations
from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import
from official.nlp.modeling.layers import relative_attention
def _create_mock_attention_data(
    num_heads,
    key_dim,
    value_dim,
    seq_length,
    batch_size,
    memory_length=0,
    num_predictions=2,
    two_stream=False,
    include_state=False,
    include_mask=False,
    include_segment=False):
  """Creates mock testing data.

  Args:
    num_heads: `int`, Number of attention heads.
    key_dim: `int`, Size of query head.
    value_dim: `int`, Size of key, value dim.
    seq_length: `int`, Sequence length of the input.
    batch_size: `int`, the batch size.
    memory_length: optional `int`, the length of the state. Defaults to 0.
    num_predictions: `int`, the number of predictions used in two stream
      attention.
    two_stream: `bool`, whether or not to generate two stream data.
    include_state: optional `bool`, whether or not to include state data.
    include_mask: optional `bool`, whether or not to include mask data.
    include_segment: optional `bool`, whether or not to include segment data.

  Returns:
    A dictionary with `str` as keys and `Tensor` as values.
  """
  bias_shape = (num_heads, key_dim)
  # Shared inputs: relative encoding spans twice the sequence length.
  data = dict(
      relative_position_encoding=tf.random.normal(
          shape=(batch_size, seq_length * 2, key_dim)),
      content_attention_bias=tf.random.normal(shape=bias_shape),
      positional_attention_bias=tf.random.normal(shape=bias_shape))

  query_shape = (batch_size, seq_length, key_dim)
  if two_stream:
    # Two-stream (XLNet) inputs: content stream, query stream, and the
    # prediction-to-position mapping.
    data.update(
        content_stream=tf.random.normal(shape=query_shape),
        query_stream=tf.random.normal(
            shape=(batch_size, num_predictions, key_dim)),
        target_mapping=tf.random.normal(
            shape=(batch_size, num_predictions, seq_length)))
  else:
    # Single-stream inputs: plain query/key/value.
    value_shape = (batch_size, seq_length, value_dim)
    data.update(
        query=tf.random.normal(shape=query_shape),
        value=tf.random.normal(shape=value_shape),
        key=tf.random.normal(shape=value_shape))

  # State extends the attended-over (source) length by `memory_length`.
  total_seq_length = seq_length
  if include_state:
    total_seq_length += memory_length
    data.update(
        state=tf.random.normal(shape=(batch_size, memory_length, value_dim)))

  if include_mask:
    # Random 0/1 float mask; two-stream layers take separate masks per stream.
    mask = np.random.randint(
        2, size=(batch_size, num_heads, seq_length,
                 total_seq_length)).astype("float32")
    if two_stream:
      data.update(content_attention_mask=mask, query_attention_mask=mask)
    else:
      data.update(attention_mask=mask)

  if include_segment:
    # Boolean same-segment matrix plus the segment embedding and bias.
    segment_matrix = tf.math.equal(
        np.random.randint(
            2, size=(batch_size, seq_length, total_seq_length)), 1)
    data.update(
        segment_attention_bias=tf.random.normal(shape=bias_shape),
        segment_encoding=tf.random.normal(shape=(2, num_heads, key_dim)),
        segment_matrix=segment_matrix)

  return data
@keras_parameterized.run_all_keras_modes
class MultiHeadRelativeAttentionTest(keras_parameterized.TestCase):
  """Shape tests for `relative_attention.MultiHeadRelativeAttention`."""

  @combinations.generate(combinations.combine(
      value_dim=[32, 64],
      memory_length=[0, 4],
      state=[True, False],
      mask=[True, False],
      segment=[True, False]))
  def test_attention_scores(self,
                            value_dim,
                            memory_length,
                            state,
                            mask,
                            segment):
    """Tests combinations of attention score calculations."""
    batch_size, num_heads, key_dim, seq_length = 2, 12, 64, 8
    test_layer = relative_attention.MultiHeadRelativeAttention(
        num_heads=num_heads,
        key_dim=key_dim,
        value_dim=value_dim)
    # Mock inputs exercise the optional state/mask/segment code paths.
    data = _create_mock_attention_data(
        num_heads=num_heads,
        key_dim=key_dim,
        value_dim=value_dim,
        seq_length=seq_length,
        memory_length=memory_length,
        two_stream=False,
        batch_size=batch_size,
        include_state=state,
        include_mask=mask,
        include_segment=segment)
    output = test_layer(**data)
    # Output stays [batch, seq, hidden] regardless of optional inputs.
    self.assertEqual(output.shape, [batch_size, seq_length, key_dim])
@keras_parameterized.run_all_keras_modes
class TwoStreamRelativeAttentionTest(keras_parameterized.TestCase):
  """Shape tests for `relative_attention.TwoStreamRelativeAttention`."""

  @combinations.generate(combinations.combine(
      num_predictions=[2, 10],
      memory_length=[0, 4],
      state=[True, False],
      mask=[True, False],
      segment=[True, False]))
  def test_attention_scores(self,
                            num_predictions,
                            memory_length,
                            state,
                            mask,
                            segment):
    """Tests combinations of attention score calculations."""
    batch_size = 2
    num_heads = 12
    key_dim = 64
    seq_length = 8
    layer = relative_attention.TwoStreamRelativeAttention(
        num_heads=num_heads, key_dim=key_dim, value_dim=key_dim)
    # Mock inputs exercise the optional state/mask/segment code paths.
    inputs = _create_mock_attention_data(
        num_heads=num_heads,
        key_dim=key_dim,
        value_dim=key_dim,
        seq_length=seq_length,
        memory_length=memory_length,
        num_predictions=num_predictions,
        two_stream=True,
        batch_size=batch_size,
        include_state=state,
        include_mask=mask,
        include_segment=segment)
    content_output, query_output = layer(**inputs)
    # Content stream keeps the input sequence length; the query stream has
    # one row per prediction.
    self.assertEqual(content_output.shape, [batch_size, seq_length, key_dim])
    self.assertEqual(query_output.shape,
                     [batch_size, num_predictions, key_dim])
if __name__ == "__main__":
  # Seed both RNGs so mock-data generation is deterministic across runs.
  np.random.seed(0)
  tf.random.set_seed(0)
  tf.test.main()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -11,19 +11,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Keras-based rezero-transformer block layer (Transformer with ReZero)."""
# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import gin
import tensorflow as tf
from official.nlp.modeling.layers import attention
@tf.keras.utils.register_keras_serializable(package="Text")
@gin.configurable
......@@ -35,7 +29,7 @@ class ReZeroTransformer(tf.keras.layers.Layer):
The residual connection implements the ReZero method.
(https://arxiv.org/abs/2003.04887)
Arguments:
Args:
num_attention_heads: Number of attention heads.
intermediate_size: Size of the intermediate layer.
intermediate_activation: Activation for the intermediate layer.
......@@ -88,7 +82,7 @@ class ReZeroTransformer(tf.keras.layers.Layer):
def build(self, input_shape):
input_tensor = input_shape[0] if len(input_shape) == 2 else input_shape
input_tensor_shape = tf.TensorShape(input_tensor)
if len(input_tensor_shape) != 3:
if len(input_tensor_shape.as_list()) != 3:
raise ValueError("TransformerLayer expects a three-dimensional input of "
"shape [batch, sequence, width].")
batch_size, sequence_length, hidden_size = input_tensor_shape
......@@ -116,9 +110,9 @@ class ReZeroTransformer(tf.keras.layers.Layer):
activity_regularizer=self._activity_regularizer,
kernel_constraint=self._kernel_constraint,
bias_constraint=self._bias_constraint)
self._attention_layer = attention.MultiHeadAttention(
self._attention_layer = tf.keras.layers.MultiHeadAttention(
num_heads=self._num_heads,
key_size=self._attention_head_size,
key_dim=self._attention_head_size,
dropout=self._attention_dropout_rate,
name="self_attention",
**common_kwargs)
......@@ -138,7 +132,7 @@ class ReZeroTransformer(tf.keras.layers.Layer):
bias_axes="d",
name="intermediate",
**common_kwargs)
policy = tf.keras.mixed_precision.experimental.global_policy()
policy = tf.keras.mixed_precision.global_policy()
if policy.name == "mixed_bfloat16":
# bfloat16 causes BERT with the LAMB optimizer to not converge
# as well, so we use float32.
......@@ -161,7 +155,8 @@ class ReZeroTransformer(tf.keras.layers.Layer):
self._rezero_a = self.add_weight(
name="rezero_alpha",
initializer=tf.keras.initializers.Zeros(),
trainable=True, dtype=tf.float32)
trainable=True,
dtype=tf.float32)
super(ReZeroTransformer, self).build(input_shape)
......@@ -213,9 +208,9 @@ class ReZeroTransformer(tf.keras.layers.Layer):
attention_mask = attention_mask[:, 0:self._output_range, :]
else:
target_tensor = input_tensor
attention_inputs = [target_tensor, input_tensor]
attention_output = self._attention_layer(attention_inputs, attention_mask)
attention_output = self._attention_layer(
query=target_tensor, value=input_tensor, attention_mask=attention_mask)
attention_output = self._attention_dropout(attention_output)
attention_output = target_tensor + self._rezero_a * attention_output
if self._use_layer_norm:
......
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -11,12 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for Keras-based rezero-transformer block layer."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
"""Tests for Keras-based rezero-transformer block layer."""
import numpy as np
import tensorflow as tf
......@@ -32,10 +28,10 @@ class TransformerWithReZeroLayerTest(keras_parameterized.TestCase):
def tearDown(self):
super(TransformerWithReZeroLayerTest, self).tearDown()
tf.keras.mixed_precision.experimental.set_policy('float32')
tf.keras.mixed_precision.set_global_policy('float32')
def test_layer_invocation_with_float16_dtype(self):
tf.keras.mixed_precision.experimental.set_policy('mixed_float16')
tf.keras.mixed_precision.set_global_policy('mixed_float16')
test_layer = rezero_transformer.ReZeroTransformer(
num_attention_heads=10,
intermediate_size=2048,
......@@ -95,9 +91,9 @@ class TransformerWithReZeroLayerTest(keras_parameterized.TestCase):
input_data = np.random.rand(2, input_length, width) + 2.0
output_data = model.predict(input_data)
input_data_normed = (
input_data - np.mean(input_data, axis=-1, keepdims=True)) / (
np.std(input_data, axis=-1, keepdims=True))
input_data_normed = (input_data -
np.mean(input_data, axis=-1, keepdims=True)) / (
np.std(input_data, axis=-1, keepdims=True))
self.assertAllClose(input_data_normed, output_data)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment