"vscode:/vscode.git/clone" did not exist on "53787c69eb37877cba025323aa71fdd30aebaea7"
Commit 32e4ca51 authored by qianyj

Update code to v2.11.0

parents 9485aa1d 71060f67
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
......@@ -20,8 +20,7 @@ examples.
* [`losses`](losses) contains common loss computation used in NLP tasks.
Please see the colab
[nlp_modeling_library_intro.ipynb]
(https://colab.sandbox.google.com/github/tensorflow/models/blob/master/official/colab/nlp/nlp_modeling_library_intro.ipynb)
[NLP modeling library intro.ipynb](https://colab.sandbox.google.com/github/tensorflow/models/blob/master/docs/nlp/index.ipynb)
for how to build transformer-based NLP models using the above primitives.
Besides the pre-defined primitives, it also provides scaffold classes to allow
......@@ -44,8 +43,7 @@ custom hidden layer (which will replace the Transformer instantiation in the
encoder).
Please see the colab
[customize_encoder.ipynb]
(https://colab.sandbox.google.com/github/tensorflow/models/blob/master/official/colab/nlp/customize_encoder.ipynb)
[customize_encoder.ipynb](https://colab.sandbox.google.com/github/tensorflow/models/blob/master/docs/nlp/customize_encoder.ipynb)
for how to use scaffold classes to build novel architectures.
BERT and ALBERT models in this repo are implemented using this library.
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
......@@ -13,7 +13,7 @@ assemble new `tf.keras` layers or models.
["Big Bird: Transformers for Longer Sequences"](https://arxiv.org/abs/2007.14062).
* [CachedAttention](attention.py) implements an attention layer with cache
used for auto-agressive decoding.
used for autoregressive decoding.
* [KernelAttention](kernel_attention.py) implements a group of attention
mechanisms that express self-attention as a linear dot-product of
......
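For intuition, here is a minimal, hypothetical sketch of the linearization idea (not the repo's KernelAttention API): with a positive feature map phi, softmax attention is approximated by phi(Q)(phi(K)^T V), which is linear rather than quadratic in sequence length.
import tensorflow as tf
def linear_attention(q, k, v):
  """Toy kernelized attention, O(seq_len) instead of O(seq_len**2).
  q, k: [batch, seq, dim]; v: [batch, seq, dim_v].
  Uses the elu(x) + 1 feature map; any positive feature map works.
  """
  phi_q = tf.nn.elu(q) + 1.0
  phi_k = tf.nn.elu(k) + 1.0
  kv = tf.einsum("bsd,bse->bde", phi_k, v)  # sum over keys computed once
  normalizer = tf.einsum("bsd,bd->bs", phi_q, tf.reduce_sum(phi_k, axis=1))
  return tf.einsum("bsd,bde->bse", phi_q, kv) / normalizer[..., None]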
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -20,7 +20,9 @@ They can be used to assemble new `tf.keras` layers or models.
from official.nlp.modeling.layers.attention import *
from official.nlp.modeling.layers.bigbird_attention import BigBirdAttention
from official.nlp.modeling.layers.bigbird_attention import BigBirdMasks
from official.nlp.modeling.layers.block_diag_feedforward import BlockDiagFeedforward
from official.nlp.modeling.layers.cls_head import *
from official.nlp.modeling.layers.factorized_embedding import FactorizedEmbedding
from official.nlp.modeling.layers.gated_feedforward import GatedFeedforward
from official.nlp.modeling.layers.gaussian_process import RandomFeatureGaussianProcess
from official.nlp.modeling.layers.kernel_attention import KernelAttention
......@@ -28,11 +30,19 @@ from official.nlp.modeling.layers.kernel_attention import KernelMask
from official.nlp.modeling.layers.masked_lm import MaskedLM
from official.nlp.modeling.layers.masked_softmax import MaskedSoftmax
from official.nlp.modeling.layers.mat_mul_with_margin import MatMulWithMargin
from official.nlp.modeling.layers.mixing import FourierTransformLayer
from official.nlp.modeling.layers.mixing import HartleyTransformLayer
from official.nlp.modeling.layers.mixing import LinearTransformLayer
from official.nlp.modeling.layers.mixing import MixingMechanism
from official.nlp.modeling.layers.mobile_bert_layers import MobileBertEmbedding
from official.nlp.modeling.layers.mobile_bert_layers import MobileBertMaskedLM
from official.nlp.modeling.layers.mobile_bert_layers import MobileBertTransformer
from official.nlp.modeling.layers.multi_channel_attention import *
from official.nlp.modeling.layers.on_device_embedding import OnDeviceEmbedding
from official.nlp.modeling.layers.pack_optimization import PackBertEmbeddings
from official.nlp.modeling.layers.pack_optimization import StridedTransformerEncoderBlock
from official.nlp.modeling.layers.pack_optimization import StridedTransformerScaffold
from official.nlp.modeling.layers.per_dim_scale_attention import PerDimScaleAttention
from official.nlp.modeling.layers.position_embedding import PositionEmbedding
from official.nlp.modeling.layers.position_embedding import RelativePositionBias
from official.nlp.modeling.layers.position_embedding import RelativePositionEmbedding
......@@ -41,6 +51,7 @@ from official.nlp.modeling.layers.relative_attention import TwoStreamRelativeAtt
from official.nlp.modeling.layers.reuse_attention import ReuseMultiHeadAttention
from official.nlp.modeling.layers.reuse_transformer import ReuseTransformer
from official.nlp.modeling.layers.rezero_transformer import ReZeroTransformer
from official.nlp.modeling.layers.routing import *
from official.nlp.modeling.layers.self_attention_mask import SelfAttentionMask
from official.nlp.modeling.layers.spectral_normalization import *
from official.nlp.modeling.layers.talking_heads_attention import TalkingHeadsAttention
......@@ -49,7 +60,8 @@ from official.nlp.modeling.layers.text_layers import BertTokenizer
from official.nlp.modeling.layers.text_layers import FastWordpieceBertTokenizer
from official.nlp.modeling.layers.text_layers import SentencepieceTokenizer
from official.nlp.modeling.layers.tn_transformer_expand_condense import TNTransformerExpandCondense
from official.nlp.modeling.layers.transformer import *
from official.nlp.modeling.layers.transformer import Transformer
from official.nlp.modeling.layers.transformer import TransformerDecoderBlock
from official.nlp.modeling.layers.transformer_encoder_block import TransformerEncoderBlock
from official.nlp.modeling.layers.transformer_scaffold import TransformerScaffold
from official.nlp.modeling.layers.transformer_xl import TransformerXL
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -18,13 +18,13 @@ import math
import tensorflow as tf
EinsumDense = tf.keras.layers.experimental.EinsumDense
EinsumDense = tf.keras.layers.EinsumDense
MultiHeadAttention = tf.keras.layers.MultiHeadAttention
@tf.keras.utils.register_keras_serializable(package="Text")
class CachedAttention(tf.keras.layers.MultiHeadAttention):
"""Attention layer with cache used for auto-agressive decoding.
"""Attention layer with cache used for autoregressive decoding.
Arguments are the same as `tf.keras.layers.MultiHeadAttention` layer.
"""
......
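For intuition, a hypothetical sketch of why the cache helps autoregressive decoding (illustrative names, not the layer's exact API): each step appends its new key/value projections so earlier positions are never recomputed.
import tensorflow as tf
def append_to_cache(cache, new_key, new_value):
  # cache["key"]/cache["value"]: [batch, steps_so_far, num_heads, head_dim];
  # new_key/new_value: [batch, 1, num_heads, head_dim] for the current step.
  cache["key"] = tf.concat([cache["key"], new_key], axis=1)
  cache["value"] = tf.concat([cache["value"], new_value], axis=1)
  return cache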
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Keras-based gated feedforward layer."""
# pylint: disable=g-classes-have-attributes
from typing import Optional
import tensorflow as tf
from official.modeling import tf_utils
class BlockDiagFeedforward(tf.keras.layers.Layer):
"""Block diagonal feedforward layer.
This layer replaces the weight matrix of the output_dense layer with a block
diagonal matrix to save parameters and FLOPs. A linear mixing layer can
optionally be added to improve the layer's expressivity.
Args:
intermediate_size: Size of the intermediate layer.
intermediate_activation: Activation for the intermediate layer.
dropout: Dropout probability for the output dropout.
num_blocks: The number of blocks for the block diagonal matrix of the
output_dense layer.
apply_mixing: Apply linear mixing if True.
kernel_initializer: Initializer for dense layer kernels.
bias_initializer: Initializer for dense layer biases.
kernel_regularizer: Regularizer for dense layer kernels.
bias_regularizer: Regularizer for dense layer biases.
activity_regularizer: Regularizer for dense layer activity.
kernel_constraint: Constraint for dense layer kernels.
bias_constraint: Constraint for dense layer biases.
"""
def __init__(
self,
intermediate_size: int,
intermediate_activation: str,
dropout: float,
num_blocks: int = 1,
apply_mixing: bool = True,
kernel_initializer: str = "glorot_uniform",
bias_initializer: str = "zeros",
kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
activity_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
kernel_constraint: Optional[tf.keras.constraints.Constraint] = None,
bias_constraint: Optional[tf.keras.constraints.Constraint] = None,
**kwargs): # pylint: disable=g-doc-args
super().__init__(**kwargs)
self._intermediate_size = intermediate_size
self._intermediate_activation = intermediate_activation
self._dropout = dropout
self._num_blocks = num_blocks
self._apply_mixing = apply_mixing
if intermediate_size % num_blocks != 0:
raise ValueError("Intermediate_size (%d) isn't a multiple of num_blocks "
"(%d)." % (intermediate_size, num_blocks))
self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
self._bias_initializer = tf.keras.initializers.get(bias_initializer)
self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer)
self._activity_regularizer = tf.keras.regularizers.get(activity_regularizer)
self._kernel_constraint = tf.keras.constraints.get(kernel_constraint)
self._bias_constraint = tf.keras.constraints.get(bias_constraint)
def build(self, input_shape):
hidden_size = input_shape.as_list()[-1]
common_kwargs = dict(
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer,
activity_regularizer=self._activity_regularizer,
kernel_constraint=self._kernel_constraint,
bias_constraint=self._bias_constraint)
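# Einsum "abc,cde->abde": a=batch, b=sequence, c=hidden size, d=num_blocks,
# e=intermediate size per block; the full hidden vector feeds every block.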
self._intermediate_dense = tf.keras.layers.EinsumDense(
"abc,cde->abde",
output_shape=(None, self._num_blocks,
self._intermediate_size // self._num_blocks),
bias_axes="de",
name="intermediate",
kernel_initializer=tf_utils.clone_initializer(self._kernel_initializer),
bias_initializer=tf_utils.clone_initializer(self._bias_initializer),
**common_kwargs)
policy = tf.keras.mixed_precision.global_policy()
if policy.name == "mixed_bfloat16":
# bfloat16 causes BERT with the LAMB optimizer to not converge
# as well, so we use float32.
policy = tf.float32
self._intermediate_activation_layer = tf.keras.layers.Activation(
self._intermediate_activation, dtype=policy)
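# Einsum "abde,deo->abdo": each block d maps its own slice e to its own
# output slice o; the stacked per-block kernels form a block diagonal matrix.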
self._output_dense = tf.keras.layers.EinsumDense(
"abde,deo->abdo",
output_shape=(None, self._num_blocks, hidden_size // self._num_blocks),
bias_axes="do",
name="output",
kernel_initializer=tf_utils.clone_initializer(self._kernel_initializer),
bias_initializer=tf_utils.clone_initializer(self._bias_initializer),
**common_kwargs)
if self._apply_mixing:
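# Einsum "abdo,de->abeo": linearly mixes information across blocks (d -> e).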
self._output_mixing = tf.keras.layers.EinsumDense(
"abdo,de->abeo",
output_shape=(None, self._num_blocks,
hidden_size // self._num_blocks),
name="output_mixing",
kernel_initializer=tf_utils.clone_initializer(
self._kernel_initializer),
bias_initializer=tf_utils.clone_initializer(self._bias_initializer),
**common_kwargs)
self._output_reshape = tf.keras.layers.Reshape((-1, hidden_size))
self._output_dropout = tf.keras.layers.Dropout(rate=self._dropout)
def get_config(self):
config = {
"intermediate_size":
self._intermediate_size,
"intermediate_activation":
self._intermediate_activation,
"dropout":
self._dropout,
"num_blocks":
self._num_blocks,
"apply_mixing":
self._apply_mixing,
"kernel_initializer":
tf.keras.initializers.serialize(self._kernel_initializer),
"bias_initializer":
tf.keras.initializers.serialize(self._bias_initializer),
"kernel_regularizer":
tf.keras.regularizers.serialize(self._kernel_regularizer),
"bias_regularizer":
tf.keras.regularizers.serialize(self._bias_regularizer),
"activity_regularizer":
tf.keras.regularizers.serialize(self._activity_regularizer),
"kernel_constraint":
tf.keras.constraints.serialize(self._kernel_constraint),
"bias_constraint":
tf.keras.constraints.serialize(self._bias_constraint)
}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, inputs):
intermediate_output = self._intermediate_dense(inputs)
intermediate_output = self._intermediate_activation_layer(
intermediate_output)
layer_output = self._output_dense(intermediate_output)
if self._apply_mixing:
layer_output = self._output_mixing(layer_output)
layer_output = self._output_reshape(layer_output)
layer_output = self._output_dropout(layer_output)
return layer_output
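A minimal usage sketch (sizes are illustrative): only the output projection's kernel is block diagonal, so with num_blocks=4 that kernel holds roughly a quarter of the parameters of a dense equivalent.
import tensorflow as tf
from official.nlp.modeling.layers import block_diag_feedforward
layer = block_diag_feedforward.BlockDiagFeedforward(
    intermediate_size=3072,
    intermediate_activation="gelu",
    dropout=0.1,
    num_blocks=4,
    apply_mixing=True)
inputs = tf.keras.Input(shape=(128, 768))  # [batch, seq_len, hidden]
outputs = layer(inputs)                    # same shape: [batch, 128, 768]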
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for Keras-based gated feedforward layer."""
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import
from official.nlp.modeling.layers import block_diag_feedforward
# This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
# guarantees forward compatibility of this code for the V2 switchover.
@keras_parameterized.run_all_keras_modes
class BlockDiagFeedforwardTest(keras_parameterized.TestCase):
def tearDown(self):
super(BlockDiagFeedforwardTest, self).tearDown()
tf.keras.mixed_precision.set_global_policy("float32")
@parameterized.parameters(
(1, True, "float32"),
(1, True, "mixed_float16"),
(1, False, "float32"),
(1, False, "mixed_float16"),
(2, True, "float32"),
(2, True, "mixed_float16"),
(2, False, "float32"),
(2, False, "mixed_float16"),
)
def test_layer_creation(self, num_blocks, apply_mixing, dtype):
tf.keras.mixed_precision.set_global_policy(dtype)
kwargs = dict(
intermediate_size=128,
intermediate_activation="relu",
dropout=0.1,
num_blocks=num_blocks,
apply_mixing=apply_mixing,
kernel_initializer="glorot_uniform",
bias_initializer="zeros")
test_layer = block_diag_feedforward.BlockDiagFeedforward(**kwargs)
sequence_length = 64
width = 128
# Create a 3-dimensional input (the first dimension is implicit).
data_tensor = tf.keras.Input(shape=(sequence_length, width))
output_tensor = test_layer(data_tensor)
# The output should have the same shape as the input.
self.assertEqual(data_tensor.shape.as_list(), output_tensor.shape.as_list())
@parameterized.parameters(
(1, True, "float32"),
(1, True, "mixed_float16"),
(1, False, "float32"),
(1, False, "mixed_float16"),
(2, True, "float32"),
(2, True, "mixed_float16"),
(2, False, "float32"),
(2, False, "mixed_float16"),
)
def test_layer_invocation(self, num_blocks, apply_mixing, dtype):
tf.keras.mixed_precision.set_global_policy(dtype)
kwargs = dict(
intermediate_size=16,
intermediate_activation="relu",
dropout=0.1,
num_blocks=num_blocks,
apply_mixing=apply_mixing,
kernel_initializer="glorot_uniform",
bias_initializer="zeros")
test_layer = block_diag_feedforward.BlockDiagFeedforward(**kwargs)
sequence_length = 16
width = 32
# Create a 3-dimensional input (the first dimension is implicit).
data_tensor = tf.keras.Input(shape=(sequence_length, width))
output_tensor = test_layer(data_tensor)
# Create a model from the test layer.
model = tf.keras.Model(data_tensor, output_tensor)
# Invoke the model on test data.
batch_size = 6
input_data = 10 * np.random.random_sample(
(batch_size, sequence_length, width))
output_data = model.predict(input_data)
self.assertEqual(output_data.shape, (batch_size, sequence_length, width))
def test_get_config(self):
kwargs = dict(
intermediate_size=16,
intermediate_activation="relu",
dropout=0.1,
num_blocks=2,
apply_mixing=True,
kernel_initializer="glorot_uniform",
bias_initializer="zeros")
test_layer = block_diag_feedforward.BlockDiagFeedforward(**kwargs)
new_layer = block_diag_feedforward.BlockDiagFeedforward.from_config(
test_layer.get_config())
self.assertAllEqual(test_layer.get_config(), new_layer.get_config())
if __name__ == "__main__":
tf.test.main()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -57,12 +57,14 @@ class ClassificationHead(tf.keras.layers.Layer):
self.dense = tf.keras.layers.Dense(
units=self.inner_dim,
activation=self.activation,
kernel_initializer=self.initializer,
kernel_initializer=tf_utils.clone_initializer(self.initializer),
name="pooler_dense")
self.dropout = tf.keras.layers.Dropout(rate=self.dropout_rate)
self.out_proj = tf.keras.layers.Dense(
units=num_classes, kernel_initializer=self.initializer, name="logits")
units=num_classes,
kernel_initializer=tf_utils.clone_initializer(self.initializer),
name="logits")
def call(self, features: tf.Tensor, only_project: bool = False):
"""Implements call().
......@@ -146,14 +148,15 @@ class MultiClsHeads(tf.keras.layers.Layer):
self.dense = tf.keras.layers.Dense(
units=inner_dim,
activation=self.activation,
kernel_initializer=self.initializer,
kernel_initializer=tf_utils.clone_initializer(self.initializer),
name="pooler_dense")
self.dropout = tf.keras.layers.Dropout(rate=self.dropout_rate)
self.out_projs = []
for name, num_classes in cls_list:
self.out_projs.append(
tf.keras.layers.Dense(
units=num_classes, kernel_initializer=self.initializer,
units=num_classes,
kernel_initializer=tf_utils.clone_initializer(self.initializer),
name=name))
def call(self, features: tf.Tensor, only_project: bool = False):
......@@ -277,7 +280,7 @@ class GaussianProcessClassificationHead(ClassificationHead):
if use_gp_layer:
self.out_proj = gaussian_process.RandomFeatureGaussianProcess(
self.num_classes,
kernel_initializer=self.initializer,
kernel_initializer=tf_utils.clone_initializer(self.initializer),
name="logits",
**self.gp_layer_kwargs)
......@@ -361,3 +364,97 @@ def extract_spec_norm_kwargs(kwargs):
return dict(
iteration=kwargs.pop("iteration", 1),
norm_multiplier=kwargs.pop("norm_multiplier", .99))
class PerQueryDenseHead(tf.keras.layers.Layer):
"""Pooling head used for EncT5 style models.
This module applies a different projection to each query.
For an input of shape [bs, num_queries, hidden_size], it projects each query
down to `features` outputs, ending up with shape [bs, num_queries, features].
For example, for classification with a few classes, one may set num_queries
to 1 and features to the number of classes. For multilabel classification, one
may set num_queries to the number of classes and features to 2, so that each
query represents a binary classification for one label.
"""
def __init__(self,
num_queries: int,
features: int,
use_bias: bool = False,
kernel_initializer: str = "glorot_uniform",
**kwargs):
"""Initializes the `PerQueryDenseHead`.
Args:
num_queries: number of queries (the learnable embeddings in the input
sequences) from the decoder.
features: int, the number of output features. Each query will be
projected to this size with its own projection.
use_bias: whether to add a bias to the output.
kernel_initializer: Initializer for dense layer kernels.
**kwargs: Keyword arguments.
"""
super().__init__(**kwargs)
self.num_queries = num_queries
self.features = features
self.use_bias = use_bias
self.kernel_initializer = tf.keras.initializers.get(kernel_initializer)
def build(self, input_shape):
input_shape = tf.TensorShape(input_shape)
# Hidden size.
last_dim = tf.compat.dimension_value(input_shape[-1])
self.hidden_size = last_dim
self.kernel = self.add_weight(
"kernel",
shape=[self.num_queries, last_dim, self.features],
initializer=self.kernel_initializer,
dtype=self.dtype,
trainable=True)
if self.use_bias:
self.bias = self.add_weight(
"bias",
shape=[
self.num_queries,
self.features,
],
dtype=self.dtype,
trainable=True)
else:
self.bias = None
def call(self, inputs: tf.Tensor) -> tf.Tensor:
"""Implements call().
Args:
inputs: a rank-3 Tensor of shape= [bs, num_queries, hidden_size].
Returns:
A Tensor, shape= [batch size, num_queries, features].
"""
outputs = tf.einsum("bqh,qhf->bqf", inputs, self.kernel)
if self.use_bias:
outputs += self.bias
return outputs
def get_config(self):
config = {
"num_queries":
self.num_queries,
"features":
self.features,
"kernel_initializer":
tf.keras.initializers.serialize(self.kernel_initializer),
}
config.update(super(PerQueryDenseHead, self).get_config())
return config
@classmethod
def from_config(cls, config, custom_objects=None):
return cls(**config)
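A usage sketch for the multilabel case the docstring describes (sizes are illustrative):
import tensorflow as tf
from official.nlp.modeling.layers import cls_head
num_labels = 10
head = cls_head.PerQueryDenseHead(num_queries=num_labels, features=2)
decoder_out = tf.zeros([8, num_labels, 512])  # [batch, num_queries, hidden]
logits = head(decoder_out)  # [8, 10, 2]: binary logits for each label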
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -199,5 +199,29 @@ class GaussianProcessClassificationHead(tf.test.TestCase,
self.assertEqual(layer_config["norm_multiplier"], 1.)
self.assertEqual(layer_config["num_inducing"], 512)
class PerQueryDenseHeadTest(tf.test.TestCase, parameterized.TestCase):
@parameterized.named_parameters(("single_query", 1, 3, False),
("multi_queries", 10, 2, False),
("with_bias", 10, 2, True))
def test_layer_invocation(self, num_queries, features, use_bias):
batch_size = 5
hidden_size = 10
layer = cls_head.PerQueryDenseHead(
num_queries=num_queries, features=features, use_bias=use_bias)
inputs = tf.zeros(
shape=(batch_size, num_queries, hidden_size), dtype=tf.float32)
outputs = layer(inputs)
self.assertEqual(outputs.shape, [batch_size, num_queries, features])
def test_layer_serialization(self):
layer = cls_head.PerQueryDenseHead(
num_queries=10, features=2, use_bias=True)
new_layer = cls_head.PerQueryDenseHead.from_config(layer.get_config())
# If the serialization was successful, the new config should match the old.
self.assertAllEqual(layer.get_config(), new_layer.get_config())
if __name__ == "__main__":
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""A factorized embedding layer."""
# pylint: disable=g-classes-have-attributes
import tensorflow as tf
from official.modeling import tf_utils
from official.nlp.modeling.layers import on_device_embedding
@tf.keras.utils.register_keras_serializable(package='Text')
class FactorizedEmbedding(on_device_embedding.OnDeviceEmbedding):
"""A factorized embeddings layer for supporting larger embeddings.
Arguments:
vocab_size: Number of elements in the vocabulary.
embedding_width: Width of word embeddings.
output_dim: The output dimension of this layer.
initializer: The initializer to use for the embedding weights. Defaults to
"glorot_uniform".
use_one_hot: Whether to use tf.one_hot over tf.gather for the embedding
lookup. Defaults to False (that is, using tf.gather). Setting this option
to True may improve performance, especially on small vocabulary sizes, but
will generally require more memory.
scale_factor: Whether to scale the output embeddings. Defaults to None (that
is, no scaling). Setting this option to a float multiplies the output
embeddings by scale_factor.
"""
def __init__(self,
vocab_size: int,
embedding_width: int,
output_dim: int,
initializer='glorot_uniform',
use_one_hot=False,
scale_factor=None,
**kwargs):
super().__init__(
vocab_size=vocab_size,
embedding_width=embedding_width,
initializer=initializer,
use_one_hot=use_one_hot,
scale_factor=scale_factor,
**kwargs)
self._output_dim = output_dim
def get_config(self):
config = {'output_dim': self._output_dim}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def build(self, input_shape):
self._embedding_projection = tf.keras.layers.EinsumDense(
'...x,xy->...y',
output_shape=self._output_dim,
bias_axes=None,
kernel_initializer=tf_utils.clone_initializer(self._initializer),
name='embedding_projection')
super().build(input_shape)
def call(self, inputs):
output = super().call(inputs)
return self._embedding_projection(output)
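This is the ALBERT-style factorization: a vocab_size x embedding_width lookup followed by an embedding_width x output_dim projection, storing V*E + E*H parameters instead of V*H. A rough sketch with illustrative sizes:
from official.nlp.modeling.layers import factorized_embedding
emb = factorized_embedding.FactorizedEmbedding(
    vocab_size=30000, embedding_width=128, output_dim=768)
# Parameters: 30000*128 + 128*768 (~3.9M) versus 30000*768 (~23M) unfactorized.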
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for FactorizedEmbedding layer."""
import numpy as np
import tensorflow as tf
from official.nlp.modeling.layers import factorized_embedding
class FactorizedEmbeddingTest(tf.test.TestCase):
def test_layer_creation(self):
vocab_size = 31
embedding_width = 27
output_dim = 45
test_layer = factorized_embedding.FactorizedEmbedding(
vocab_size=vocab_size,
embedding_width=embedding_width,
output_dim=output_dim)
# Create a 2-dimensional input (the first dimension is implicit).
sequence_length = 23
input_tensor = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
output_tensor = test_layer(input_tensor)
# The output should be the same as the input, save that it has an extra
# output_dim dimension on the end.
expected_output_shape = [None, sequence_length, output_dim]
self.assertEqual(expected_output_shape, output_tensor.shape.as_list())
self.assertEqual(output_tensor.dtype, tf.float32)
def test_layer_invocation(self):
vocab_size = 31
embedding_width = 27
output_dim = 45
test_layer = factorized_embedding.FactorizedEmbedding(
vocab_size=vocab_size,
embedding_width=embedding_width,
output_dim=output_dim)
# Create a 2-dimensional input (the first dimension is implicit).
sequence_length = 23
input_tensor = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
output_tensor = test_layer(input_tensor)
# Create a model from the test layer.
model = tf.keras.Model(input_tensor, output_tensor)
# Invoke the model on test data. We can't validate the output data itself
# (the NN is too complex) but this will rule out structural runtime errors.
batch_size = 3
input_data = np.random.randint(
vocab_size, size=(batch_size, sequence_length))
output = model.predict(input_data)
self.assertEqual(tf.float32, output.dtype)
if __name__ == "__main__":
tf.test.main()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -18,6 +18,9 @@
import gin
import tensorflow as tf
from official.modeling import tf_utils
from official.nlp.modeling.layers import util
@tf.keras.utils.register_keras_serializable(package="Text")
@gin.configurable
......@@ -55,9 +58,9 @@ class GatedFeedforward(tf.keras.layers.Layer):
"""
def __init__(self,
intermediate_size,
intermediate_activation,
dropout,
inner_dim=768,
inner_activation=tf_utils.get_activation("gelu"),
dropout=0.0,
use_gate=True,
apply_output_layer_norm=True,
num_blocks=1,
......@@ -70,9 +73,12 @@ class GatedFeedforward(tf.keras.layers.Layer):
kernel_constraint=None,
bias_constraint=None,
**kwargs):
super(GatedFeedforward, self).__init__(**kwargs)
self._intermediate_size = intermediate_size
self._intermediate_activation = intermediate_activation
inner_dim = kwargs.pop("intermediate_size", inner_dim)
inner_activation = kwargs.pop("intermediate_activation", inner_activation)
util.filter_kwargs(kwargs)
super().__init__(**kwargs)
self._inner_dim = inner_dim
self._inner_activation = inner_activation
self._dropout = dropout
self._use_gate = use_gate
self._num_blocks = num_blocks
......@@ -95,15 +101,13 @@ class GatedFeedforward(tf.keras.layers.Layer):
hidden_size = input_shape.as_list()[-1]
common_kwargs = dict(
kernel_initializer=self._kernel_initializer,
bias_initializer=self._bias_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer,
activity_regularizer=self._activity_regularizer,
kernel_constraint=self._kernel_constraint,
bias_constraint=self._bias_constraint)
self._intermediate_dense = []
self._intermediate_activation_layers = []
self._inner_activation_layers = []
self._gate_dense = []
self._output_dense = []
self._output_dropout = []
......@@ -116,29 +120,41 @@ class GatedFeedforward(tf.keras.layers.Layer):
activation_policy = tf.float32
for i in range(self._num_blocks):
self._intermediate_dense.append(
tf.keras.layers.experimental.EinsumDense(
tf.keras.layers.EinsumDense(
"abc,cd->abd",
output_shape=(None, self._intermediate_size),
output_shape=(None, self._inner_dim),
bias_axes="d",
name="intermediate_%d" % i,
kernel_initializer=tf_utils.clone_initializer(
self._kernel_initializer),
bias_initializer=tf_utils.clone_initializer(
self._bias_initializer),
**common_kwargs))
self._intermediate_activation_layers.append(
self._inner_activation_layers.append(
tf.keras.layers.Activation(
self._intermediate_activation, dtype=activation_policy))
self._inner_activation, dtype=activation_policy))
if self._use_gate:
self._gate_dense.append(
tf.keras.layers.experimental.EinsumDense(
tf.keras.layers.EinsumDense(
"abc,cd->abd",
output_shape=(None, self._intermediate_size),
output_shape=(None, self._inner_dim),
bias_axes="d",
name="gate_%d" % i,
kernel_initializer=tf_utils.clone_initializer(
self._kernel_initializer),
bias_initializer=tf_utils.clone_initializer(
self._bias_initializer),
**common_kwargs))
self._output_dense.append(
tf.keras.layers.experimental.EinsumDense(
tf.keras.layers.EinsumDense(
"abc,cd->abd",
output_shape=(None, hidden_size),
bias_axes="d",
name="output_%d" % i,
kernel_initializer=tf_utils.clone_initializer(
self._kernel_initializer),
bias_initializer=tf_utils.clone_initializer(
self._bias_initializer),
**common_kwargs))
self._output_dropout.append(tf.keras.layers.Dropout(rate=self._dropout))
# Use float32 in layernorm for numeric stability.
......@@ -152,10 +168,10 @@ class GatedFeedforward(tf.keras.layers.Layer):
def get_config(self):
config = {
"intermediate_size":
self._intermediate_size,
"intermediate_activation":
self._intermediate_activation,
"inner_dim":
self._inner_dim,
"inner_activation":
self._inner_activation,
"dropout":
self._dropout,
"use_gate":
......@@ -179,7 +195,7 @@ class GatedFeedforward(tf.keras.layers.Layer):
"bias_constraint":
tf.keras.constraints.serialize(self._bias_constraint)
}
base_config = super(GatedFeedforward, self).get_config()
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, inputs):
......@@ -187,7 +203,7 @@ class GatedFeedforward(tf.keras.layers.Layer):
for i in range(self._num_blocks):
layer_input = layer_output
intermediate_output = self._intermediate_dense[i](layer_input)
intermediate_output = self._intermediate_activation_layers[i](
intermediate_output = self._inner_activation_layers[i](
intermediate_output)
if self._use_gate:
gated_linear = self._gate_dense[i](layer_input)
......
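The constructor change above also shows the backward-compatibility pattern used for the rename: legacy keyword names are popped from **kwargs and used as defaults before the remaining kwargs reach the base class. A generic sketch of the pattern (MyLayer is hypothetical):
import tensorflow as tf
class MyLayer(tf.keras.layers.Layer):
  def __init__(self, inner_dim=768, **kwargs):
    # Accept the pre-rename argument name as an alias for inner_dim.
    inner_dim = kwargs.pop("intermediate_size", inner_dim)
    super().__init__(**kwargs)
    self._inner_dim = inner_dim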
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -44,8 +44,8 @@ class GatedFeedforwardTest(keras_parameterized.TestCase):
def test_layer_creation(self, use_gate, num_blocks, dropout_position, dtype):
tf.keras.mixed_precision.set_global_policy(dtype)
kwargs = dict(
intermediate_size=128,
intermediate_activation="relu",
inner_dim=128,
inner_activation="relu",
dropout=0.1,
use_gate=use_gate,
num_blocks=num_blocks,
......@@ -76,8 +76,8 @@ class GatedFeedforwardTest(keras_parameterized.TestCase):
dtype):
tf.keras.mixed_precision.set_global_policy(dtype)
kwargs = dict(
intermediate_size=16,
intermediate_activation="relu",
inner_dim=16,
inner_activation="relu",
dropout=0.1,
use_gate=use_gate,
num_blocks=num_blocks,
......@@ -104,8 +104,8 @@ class GatedFeedforwardTest(keras_parameterized.TestCase):
def test_serialize_deserialize(self):
kwargs = dict(
intermediate_size=16,
intermediate_activation="relu",
inner_dim=16,
inner_activation="relu",
dropout=0.1,
use_gate=False,
num_blocks=4,
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Definitions for random feature Gaussian process layer."""
import math
import tensorflow as tf
......@@ -117,7 +116,7 @@ class RandomFeatureGaussianProcess(tf.keras.layers.Layer):
name: (string) Layer name.
**gp_output_kwargs: Additional keyword arguments to dense output layer.
"""
super(RandomFeatureGaussianProcess, self).__init__(name=name, dtype=dtype)
super().__init__(name=name, dtype=dtype)
self.units = units
self.num_inducing = num_inducing
......@@ -227,7 +226,7 @@ class RandomFeatureGaussianProcess(tf.keras.layers.Layer):
"""Resets covariance matrix of the GP layer.
This function is useful for resetting the model's covariance matrix at the
begining of a new epoch.
beginning of a new epoch.
"""
self._gp_cov_layer.reset_precision_matrix()
......@@ -381,7 +380,7 @@ class LaplaceRandomFeatureCovariance(tf.keras.layers.Layer):
"""Resets precision matrix to its initial value.
This function is useful for resetting the model's covariance matrix at the
begining of a new epoch.
beginning of a new epoch.
"""
precision_matrix_reset_op = self.precision_matrix.assign(
self.initial_precision_matrix)
......
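A hedged sketch of wiring this reset into training, assuming the public method is named reset_covariance_matrix() as the docstring suggests (the callback class here is illustrative):
import tensorflow as tf
class ResetCovarianceCallback(tf.keras.callbacks.Callback):
  """Resets the GP layer's covariance estimate at the start of each epoch."""
  def __init__(self, gp_layer):
    super().__init__()
    self._gp_layer = gp_layer
  def on_epoch_begin(self, epoch, logs=None):
    self._gp_layer.reset_covariance_matrix()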
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for Gaussian process functions."""
import os
import shutil
......