Commit 790e49e5 authored by stephenwu

Merge branch 'master' of https://github.com/tensorflow/models into run_superglue

parents 8ab018b0 5bb827c3
......@@ -25,10 +25,10 @@ def _large_compatible_negative(tensor_type):
in this module (-1e9) cannot be represented using `tf.float16`.
Args:
tensor_type: a dtype to determine the type.
tensor_type: A dtype to determine the type.
Returns:
a large negative number.
A large negative number.
"""
if tensor_type == tf.float16:
return tf.float16.min
......
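This helper is typically used to mask attention logits: adding a very large negative number to disallowed positions drives their softmax weights to zero, and `tf.float16.min` replaces -1e9 when the logits are half precision. A minimal sketch of that use (the masking helper below is an illustrative assumption, not code from this diff):

import tensorflow as tf

def apply_attention_mask(logits, mask):
  # mask: 1 where attention is allowed, 0 where it is not.
  big_negative = tf.float16.min if logits.dtype == tf.float16 else -1e9
  return logits + (1.0 - tf.cast(mask, logits.dtype)) * big_negative

masked_logits = apply_attention_mask(tf.random.normal([2, 4, 4]), tf.ones([2, 4, 4]))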
......@@ -44,7 +44,7 @@ def _get_norm_layer(normalization_type='no_norm', name=None):
Args:
normalization_type: String. The type of normalization; only
'no_norm' and 'layer_norm' are supported.
`no_norm` and `layer_norm` are supported.
name: Name for the norm layer.
Returns:
......@@ -89,7 +89,7 @@ class MobileBertEmbedding(tf.keras.layers.Layer):
output_embed_size: Embedding size for the final embedding output.
max_sequence_length: Maximum length of input sequence.
normalization_type: String. The type of normalization; only
'no_norm' and 'layer_norm' are supported.
`no_norm` and `layer_norm` are supported.
initializer: The initializer to use for the embedding weights and
linear projection weights.
dropout_rate: Dropout rate.
......@@ -208,10 +208,10 @@ class MobileBertTransformer(tf.keras.layers.Layer):
key_query_shared_bottleneck: Whether to share linear transformation for
keys and queries.
num_feedforward_networks: Number of stacked feed-forward networks.
normalization_type: The type of normalization_type, only 'no_norm' and
'layer_norm' are supported. 'no_norm' represents the element-wise
normalization_type: The type of normalization, only `no_norm` and
`layer_norm` are supported. `no_norm` represents the element-wise
linear transformation for the student model, as suggested by the
original MobileBERT paper. 'layer_norm' is used for the teacher model.
original MobileBERT paper. `layer_norm` is used for the teacher model.
initializer: The initializer to use for the embedding weights and
linear projection weights.
**kwargs: keyword arguments.
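As a point of reference for the `no_norm` option described above, the element-wise linear transformation from the MobileBERT paper can be sketched as a tiny Keras layer (an illustrative assumption; the actual `_get_norm_layer` implementation is not shown in this hunk):

import tensorflow as tf

class NoNorm(tf.keras.layers.Layer):
  """Element-wise affine transform; no mean/variance statistics are computed."""

  def build(self, input_shape):
    feature_size = input_shape[-1]
    self.beta = self.add_weight('beta', shape=[feature_size], initializer='zeros')
    self.gamma = self.add_weight('gamma', shape=[feature_size], initializer='ones')

  def call(self, x):
    return x * self.gamma + self.beta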
......@@ -346,14 +346,16 @@ class MobileBertTransformer(tf.keras.layers.Layer):
"""Implementes the forward pass.
Args:
input_tensor: Float tensor of shape [batch_size, seq_length, hidden_size].
attention_mask: (optional) int32 tensor of shape [batch_size, seq_length,
seq_length], with 1 for positions that can be attended to and 0 in
positions that should not be.
input_tensor: Float tensor of shape
`(batch_size, seq_length, hidden_size)`.
attention_mask: (optional) int32 tensor of shape
`(batch_size, seq_length, seq_length)`, with 1 for positions that can
be attended to and 0 in positions that should not be.
return_attention_scores: Whether to return the attention scores.
Returns:
layer_output: Float tensor of shape [batch_size, seq_length, hidden_size].
layer_output: Float tensor of shape
`(batch_size, seq_length, hidden_size)`.
attention_scores (Optional): Only when return_attention_scores is True.
Raises:
......@@ -450,8 +452,8 @@ class MobileBertMaskedLM(tf.keras.layers.Layer):
activation: The activation, if any, for the dense layer.
initializer: The initializer for the dense layer. Defaults to a Glorot
uniform initializer.
output: The output style for this layer. Can be either 'logits' or
'predictions'.
output: The output style for this layer. Can be either `logits` or
`predictions`.
**kwargs: keyword arguments.
"""
super(MobileBertMaskedLM, self).__init__(**kwargs)
......@@ -527,16 +529,16 @@ class MobileBertMaskedLM(tf.keras.layers.Layer):
Args:
sequence_tensor: Sequence output of `BertModel` layer of shape
(`batch_size`, `seq_length`, num_hidden) where num_hidden is number of
`(batch_size, seq_length, num_hidden)` where `num_hidden` is number of
hidden units of `BertModel` layer.
positions: Position ids of tokens in the sequence to mask for pretraining
of with dimension (batch_size, num_predictions) where
with dimension `(batch_size, num_predictions)` where
`num_predictions` is maximum number of tokens to mask out and predict
per each sequence.
Returns:
Masked out sequence tensor of shape (batch_size * num_predictions,
num_hidden).
Masked out sequence tensor of shape
`(batch_size * num_predictions, num_hidden)`.
"""
sequence_shape = tf.shape(sequence_tensor)
batch_size, seq_length = sequence_shape[0], sequence_shape[1]
......
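For context, the standard BERT-style gather that produces the `(batch_size * num_predictions, num_hidden)` tensor described above can be sketched as follows (an assumption about the truncated function body, kept here for illustration):

import tensorflow as tf

def gather_indexes(sequence_tensor, positions):
  # sequence_tensor: [batch_size, seq_length, num_hidden]; positions: [batch_size, num_predictions].
  shape = tf.shape(sequence_tensor)
  batch_size, seq_length, width = shape[0], shape[1], shape[2]
  flat_offsets = tf.reshape(tf.range(batch_size) * seq_length, [-1, 1])
  flat_positions = tf.reshape(positions + flat_offsets, [-1])
  flat_sequence = tf.reshape(sequence_tensor, [batch_size * seq_length, width])
  return tf.gather(flat_sequence, flat_positions)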
......@@ -26,8 +26,8 @@ class VotingAttention(tf.keras.layers.Layer):
"""Voting Attention layer.
Args:
num_heads: the number of attention heads.
head_size: per-head hidden size.
num_heads: The number of attention heads.
head_size: Per-head hidden size.
kernel_initializer: Initializer for dense layer kernels.
bias_initializer: Initializer for dense layer biases.
kernel_regularizer: Regularizer for dense layer kernels.
......@@ -115,7 +115,7 @@ class MultiChannelAttention(tf.keras.layers.MultiHeadAttention):
context tensors according to the distribution among channels.
key: Optional key `Tensor` of shape `[B, A, S, dim]`. If not given, will use
`value` for both `key` and `value`, which is the most common case.
attention_mask: a boolean mask of shape `[B, T, S]`, that prevents attention
attention_mask: A boolean mask of shape `[B, T, S]`, that prevents attention
to certain positions.
"""
......
......@@ -77,7 +77,7 @@ class RelativePositionEmbedding(tf.keras.layers.Layer):
dimension of `inputs`.
Returns:
A tensor in shape of [length, hidden_size].
A tensor in shape of `(length, hidden_size)`.
"""
if inputs is None and length is None:
raise ValueError("If inputs is None, `length` must be set in "
......@@ -114,7 +114,7 @@ def _relative_position_bucket(relative_position,
the distance in tokens from the attending position to the attended-to
position.
If bidirectional=False, then positive relative positions are invalid.
If `bidirectional=False`, then positive relative positions are invalid.
We use smaller buckets for small absolute relative_position and larger
buckets for larger absolute relative_positions.
......@@ -127,13 +127,13 @@ def _relative_position_bucket(relative_position,
than the model has been trained on.
Args:
relative_position: an int32 Tensor
bidirectional: a boolean - whether the attention is bidirectional
num_buckets: an integer
max_distance: an integer
relative_position: An int32 Tensor
bidirectional: A boolean - whether the attention is bidirectional
num_buckets: An integer
max_distance: An integer
Returns:
a Tensor with the same shape as relative_position, containing int32
A Tensor with the same shape as relative_position, containing int32
values in the range [0, num_buckets)
"""
ret = 0
......
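A hedged NumPy sketch of the T5-style bucketing this docstring describes (the actual function body is truncated in this hunk, so names and constants below are illustrative): small distances get their own exact bucket, larger distances share logarithmically sized buckets, and everything beyond max_distance falls into the last bucket.

import numpy as np

def relative_position_bucket(relative_position, bidirectional=True,
                             num_buckets=32, max_distance=128):
  ret = 0
  n = -relative_position
  if bidirectional:
    num_buckets //= 2
    ret += (n < 0).astype(np.int32) * num_buckets   # separate buckets for attending ahead
    n = np.abs(n)
  else:
    n = np.maximum(n, 0)                            # positive relative positions are invalid
  max_exact = num_buckets // 2
  is_small = n < max_exact                          # exact buckets for small distances
  scaled = np.log(np.maximum(n, 1) / max_exact) / np.log(max_distance / max_exact)
  val_if_large = max_exact + (scaled * (num_buckets - max_exact)).astype(np.int32)
  val_if_large = np.minimum(val_if_large, num_buckets - 1)
  return ret + np.where(is_small, n, val_if_large)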
......@@ -103,10 +103,10 @@ class MultiHeadRelativeAttention(tf.keras.layers.MultiHeadAttention):
segment_attention_bias: Optional trainable bias parameter added to the
query head when calculating the segment-based attention score used in
XLNet of shape `[num_heads, dim]`.
state: Optional `Tensor` of shape [B, M, E] where M is the length of the
state: Optional `Tensor` of shape `[B, M, E]` where M is the length of the
state or memory.
If passed, this is also attended over as in Transformer XL.
attention_mask: a boolean mask of shape `[B, T, S]` that prevents attention
attention_mask: A boolean mask of shape `[B, T, S]` that prevents attention
to certain positions.
"""
......
......@@ -21,15 +21,15 @@ from official.nlp.keras_nlp import layers
@tf.keras.utils.register_keras_serializable(package='Text')
class SelfAttentionMask(layers.SelfAttentionMask):
"""Create 3D attention mask from a 2D tensor mask.
"""Creates 3D attention mask from a 2D tensor mask.
**Warning: Please use the `keras_nlp.layers.SelfAttentionMask`.**
inputs[0]: from_tensor: 2D or 3D Tensor of shape
[batch_size, from_seq_length, ...].
inputs[1]: to_mask: int32 Tensor of shape [batch_size, to_seq_length].
`(batch_size, from_seq_length, ...)`.
inputs[1]: to_mask: int32 Tensor of shape `(batch_size, to_seq_length)`.
Returns:
float Tensor of shape [batch_size, from_seq_length, to_seq_length].
Float Tensor of shape `(batch_size, from_seq_length, to_seq_length)`.
"""
def call(self, inputs):
......
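A minimal sketch of how such a 3D mask is commonly built from a 2D `to_mask` by broadcasting (illustrative only; the real `call` body lives in `keras_nlp.layers.SelfAttentionMask` and is not shown in this hunk):

import tensorflow as tf

def self_attention_mask(from_tensor, to_mask):
  # from_tensor: (batch_size, from_seq_length, ...); to_mask: (batch_size, to_seq_length).
  from_shape = tf.shape(from_tensor)
  batch_size, from_seq_length = from_shape[0], from_shape[1]
  to_seq_length = tf.shape(to_mask)[1]
  to_mask = tf.cast(tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32)
  # Every query position sees the same key mask, broadcast along from_seq_length.
  return tf.ones([batch_size, from_seq_length, 1], dtype=tf.float32) * to_mask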
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Normalization layers.
## References:
[1] Yuichi Yoshida, Takeru Miyato. Spectral Norm Regularization for Improving
the Generalizability of Deep Learning.
_arXiv preprint arXiv:1705.10941_, 2017. https://arxiv.org/abs/1705.10941
[2] Takeru Miyato, Toshiki Kataoka, Masanori Koyama, Yuichi Yoshida.
Spectral normalization for generative adversarial networks.
In _International Conference on Learning Representations_, 2018.
[3] Henry Gouk, Eibe Frank, Bernhard Pfahringer, Michael Cree.
Regularisation of neural networks by enforcing Lipschitz continuity.
_arXiv preprint arXiv:1804.04368_, 2018. https://arxiv.org/abs/1804.04368
"""
import numpy as np
import tensorflow as tf
class SpectralNormalization(tf.keras.layers.Wrapper):
"""Implements spectral normalization for Dense layer."""
def __init__(self,
layer,
iteration=1,
norm_multiplier=0.95,
training=True,
aggregation=tf.VariableAggregation.MEAN,
inhere_layer_name=False,
**kwargs):
"""Initializer.
Args:
layer: (tf.keras.layers.Layer) A TF Keras layer to apply normalization to.
iteration: (int) The number of power iterations to perform to estimate the
weight matrix's largest singular value.
norm_multiplier: (float) Multiplicative constant to threshold the
normalization. Usually under normalization, the singular value will
converge to this value.
training: (bool) Whether to perform power iteration to update the singular
value estimate.
aggregation: (tf.VariableAggregation) Indicates how a distributed variable
will be aggregated. Accepted values are constants defined in the class
tf.VariableAggregation.
inhere_layer_name: (bool) Whether to inherit the name of the input layer.
**kwargs: (dict) Other keyword arguments for the layers.Wrapper class.
"""
self.iteration = iteration
self.do_power_iteration = training
self.aggregation = aggregation
self.norm_multiplier = norm_multiplier
# Set layer name.
wrapper_name = kwargs.pop('name', None)
if inhere_layer_name:
wrapper_name = layer.name
if not isinstance(layer, tf.keras.layers.Layer):
raise ValueError('`layer` must be a `tf.keras.layers.Layer`. '
'Observed `{}`'.format(layer))
super(SpectralNormalization, self).__init__(
layer, name=wrapper_name, **kwargs)
def build(self, input_shape):
super(SpectralNormalization, self).build(input_shape)
self.layer.kernel._aggregation = self.aggregation # pylint: disable=protected-access
self._dtype = self.layer.kernel.dtype
self.w = self.layer.kernel
self.w_shape = self.w.shape.as_list()
self.uv_initializer = tf.initializers.random_normal()
self.v = self.add_weight(
shape=(1, np.prod(self.w_shape[:-1])),
initializer=self.uv_initializer,
trainable=False,
name='v',
dtype=self.dtype,
aggregation=self.aggregation)
self.u = self.add_weight(
shape=(1, self.w_shape[-1]),
initializer=self.uv_initializer,
trainable=False,
name='u',
dtype=self.dtype,
aggregation=self.aggregation)
self.update_weights()
def call(self, inputs, *, training=None):
training = self.do_power_iteration if training is None else training
u_update_op, v_update_op, w_update_op = self.update_weights(
training=training)
output = self.layer(inputs)
w_restore_op = self.restore_weights()
# Register update ops.
self.add_update(u_update_op)
self.add_update(v_update_op)
self.add_update(w_update_op)
self.add_update(w_restore_op)
return output
def update_weights(self, *, training=True):
w_reshaped = tf.reshape(self.w, [-1, self.w_shape[-1]])
u_hat = self.u
v_hat = self.v
if training:
for _ in range(self.iteration):
v_hat = tf.nn.l2_normalize(tf.matmul(u_hat, tf.transpose(w_reshaped)))
u_hat = tf.nn.l2_normalize(tf.matmul(v_hat, w_reshaped))
sigma = tf.matmul(tf.matmul(v_hat, w_reshaped), tf.transpose(u_hat))
# Convert sigma from a 1x1 matrix to a scalar.
sigma = tf.reshape(sigma, [])
u_update_op = self.u.assign(u_hat)
v_update_op = self.v.assign(v_hat)
# Bound spectral norm to be not larger than self.norm_multiplier.
w_norm = tf.cond((self.norm_multiplier / sigma) < 1, lambda: # pylint:disable=g-long-lambda
(self.norm_multiplier / sigma) * self.w, lambda: self.w)
w_update_op = self.layer.kernel.assign(w_norm)
return u_update_op, v_update_op, w_update_op
def restore_weights(self):
"""Restores layer weights to maintain gradient update (See Alg 1 of [1])."""
return self.layer.kernel.assign(self.w)
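# Example usage (illustrative, mirroring the unit test included later in this
# commit): wrap a Dense layer so that its kernel's spectral norm is driven
# toward `norm_multiplier` by power iteration.
#
#   dense = tf.keras.layers.Dense(10)
#   sn_dense = SpectralNormalization(dense, iteration=1, norm_multiplier=0.95)
#   outputs = sn_dense(tf.random.uniform((16, 10)))  # power iteration runs per call
#
# After enough calls (or with a large `iteration`), the largest singular value
# of `sn_dense.layer.kernel` is approximately `norm_multiplier`, making the
# wrapped layer roughly norm_multiplier-Lipschitz.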
class SpectralNormalizationConv2D(tf.keras.layers.Wrapper):
"""Implements spectral normalization for Conv2D layer based on [3]."""
def __init__(self,
layer,
iteration=1,
norm_multiplier=0.95,
training=True,
aggregation=tf.VariableAggregation.MEAN,
legacy_mode=False,
**kwargs):
"""Initializer.
Args:
layer: (tf.keras.layers.Layer) A TF Keras layer to apply normalization to.
iteration: (int) The number of power iterations to perform to estimate the
weight matrix's largest singular value.
norm_multiplier: (float) Multiplicative constant to threshold the
normalization. Usually under normalization, the singular value will
converge to this value.
training: (bool) Whether to perform power iteration to update the singular
value estimate.
aggregation: (tf.VariableAggregation) Indicates how a distributed variable
will be aggregated. Accepted values are constants defined in the class
tf.VariableAggregation.
legacy_mode: (bool) Whether to use the legacy implementation where the
dimensions of the u and v vectors are set to the batch size. It should
not be enabled except for backward compatibility reasons.
**kwargs: (dict) Other keyword arguments for the layers.Wrapper class.
"""
self.iteration = iteration
self.do_power_iteration = training
self.aggregation = aggregation
self.norm_multiplier = norm_multiplier
self.legacy_mode = legacy_mode
# Set layer attributes.
layer._name += '_spec_norm'
if not isinstance(layer, tf.keras.layers.Conv2D):
raise ValueError(
'layer must be a `tf.keras.layers.Conv2D` instance. You passed: {input}'
.format(input=layer))
super(SpectralNormalizationConv2D, self).__init__(layer, **kwargs)
def build(self, input_shape):
self.layer.build(input_shape)
self.layer.kernel._aggregation = self.aggregation # pylint: disable=protected-access
self._dtype = self.layer.kernel.dtype
# Shape (kernel_size_1, kernel_size_2, in_channel, out_channel).
self.w = self.layer.kernel
self.w_shape = self.w.shape.as_list()
self.strides = self.layer.strides
# Set the dimensions of u and v vectors.
batch_size = input_shape[0]
uv_dim = batch_size if self.legacy_mode else 1
# Resolve shapes.
in_height = input_shape[1]
in_width = input_shape[2]
in_channel = self.w_shape[2]
out_height = in_height // self.strides[0]
out_width = in_width // self.strides[1]
out_channel = self.w_shape[3]
self.in_shape = (uv_dim, in_height, in_width, in_channel)
self.out_shape = (uv_dim, out_height, out_width, out_channel)
self.uv_initializer = tf.initializers.random_normal()
self.v = self.add_weight(
shape=self.in_shape,
initializer=self.uv_initializer,
trainable=False,
name='v',
dtype=self.dtype,
aggregation=self.aggregation)
self.u = self.add_weight(
shape=self.out_shape,
initializer=self.uv_initializer,
trainable=False,
name='u',
dtype=self.dtype,
aggregation=self.aggregation)
super(SpectralNormalizationConv2D, self).build()
def call(self, inputs):
u_update_op, v_update_op, w_update_op = self.update_weights()
output = self.layer(inputs)
w_restore_op = self.restore_weights()
# Register update ops.
self.add_update(u_update_op)
self.add_update(v_update_op)
self.add_update(w_update_op)
self.add_update(w_restore_op)
return output
def update_weights(self):
"""Computes power iteration for convolutional filters based on [3]."""
# Initialize u, v vectors.
u_hat = self.u
v_hat = self.v
if self.do_power_iteration:
for _ in range(self.iteration):
# Updates v.
v_ = tf.nn.conv2d_transpose(
u_hat,
self.w,
output_shape=self.in_shape,
strides=self.strides,
padding='SAME')
v_hat = tf.nn.l2_normalize(tf.reshape(v_, [1, -1]))
v_hat = tf.reshape(v_hat, v_.shape)
# Updates u.
u_ = tf.nn.conv2d(v_hat, self.w, strides=self.strides, padding='SAME')
u_hat = tf.nn.l2_normalize(tf.reshape(u_, [1, -1]))
u_hat = tf.reshape(u_hat, u_.shape)
v_w_hat = tf.nn.conv2d(v_hat, self.w, strides=self.strides, padding='SAME')
sigma = tf.matmul(tf.reshape(v_w_hat, [1, -1]), tf.reshape(u_hat, [-1, 1]))
# Convert sigma from a 1x1 matrix to a scalar.
sigma = tf.reshape(sigma, [])
u_update_op = self.u.assign(u_hat)
v_update_op = self.v.assign(v_hat)
w_norm = tf.cond((self.norm_multiplier / sigma) < 1, lambda: # pylint:disable=g-long-lambda
(self.norm_multiplier / sigma) * self.w, lambda: self.w)
w_update_op = self.layer.kernel.assign(w_norm)
return u_update_op, v_update_op, w_update_op
def restore_weights(self):
"""Restores layer weights to maintain gradient update (See Alg 1 of [1])."""
return self.layer.kernel.assign(self.w)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for normalization layers.
## References:
[1] Hanie Sedghi, Vineet Gupta, Philip M. Long.
The Singular Values of Convolutional Layers.
In _International Conference on Learning Representations_, 2019.
"""
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from official.nlp.modeling.layers import spectral_normalization
DenseLayer = tf.keras.layers.Dense(10)
Conv2DLayer = tf.keras.layers.Conv2D(filters=64, kernel_size=3, padding='valid')
def _compute_spectral_norm(weight):
if weight.ndim > 2:
# Computes Conv2D via FFT transform as in [1].
weight = np.fft.fft2(weight, weight.shape[1:3], axes=[0, 1])
return np.max(np.linalg.svd(weight, compute_uv=False))
class NormalizationTest(tf.test.TestCase, parameterized.TestCase):
def setUp(self):
super(NormalizationTest, self).setUp()
self.num_iterations = 1000
self.norm_multiplier = 0.95
@parameterized.named_parameters(
('Dense',
(None, 10), DenseLayer, spectral_normalization.SpectralNormalization),
('Conv2D', (None, 32, 32, 3), Conv2DLayer,
spectral_normalization.SpectralNormalizationConv2D))
def test_spec_norm_magnitude(self, input_shape, layer, norm_wrapper):
"""Tests if the weights spectral norm converges to norm_multiplier."""
layer.build(input_shape)
sn_layer = norm_wrapper(
layer,
iteration=self.num_iterations,
norm_multiplier=self.norm_multiplier)
# Perform normalization.
sn_layer.build(input_shape)
sn_layer.update_weights()
normalized_kernel = sn_layer.layer.kernel.numpy()
spectral_norm_computed = _compute_spectral_norm(normalized_kernel)
spectral_norm_expected = self.norm_multiplier
self.assertAllClose(
spectral_norm_computed, spectral_norm_expected, atol=5e-2)
# Test that the normalized layer is K-Lipschitz. In particular, if the layer
# is a function f, then ||f(x1) - f(x2)||_2 <= K * ||(x1 - x2)||_2, where K
# is the norm multiplier.
new_input_shape = (16,) + input_shape[1:]
new_input = tf.random.uniform(new_input_shape)
delta_vec = tf.random.uniform(new_input_shape)
output1 = sn_layer(new_input)
output2 = sn_layer(new_input + delta_vec)
delta_input = tf.norm(tf.reshape(delta_vec, (-1,))).numpy()
delta_output = tf.norm(tf.reshape(output2 - output1, (-1,))).numpy()
self.assertLessEqual(delta_output, self.norm_multiplier * delta_input)
if __name__ == '__main__':
tf.test.main()
......@@ -63,7 +63,7 @@ class TalkingHeadsAttention(tf.keras.layers.MultiHeadAttention):
that will be applied on attention scores before and after softmax.
Args:
qkv_rank: the rank of query, key, value tensors after projection.
qkv_rank: The rank of query, key, value tensors after projection.
"""
super(TalkingHeadsAttention, self)._build_attention(qkv_rank)
......
......@@ -100,10 +100,10 @@ class BertTokenizer(tf.keras.layers.Layer):
tokenize_with_offsets: If true, calls
`text.BertTokenizer.tokenize_with_offsets()` instead of plain
`text.BertTokenizer.tokenize()` and outputs a triple of
(tokens, start_offsets, limit_offsets).
raw_table_access: An object with methods .lookup(keys) and .size()
`(tokens, start_offsets, limit_offsets)`.
raw_table_access: An object with methods `.lookup(keys)` and `.size()`
that operate on the raw lookup table of tokens. It can be used to
look up special token synbols like [MASK].
look up special token symbols like `[MASK]`.
"""
def __init__(self, *,
......@@ -121,16 +121,16 @@ class BertTokenizer(tf.keras.layers.Layer):
lower_case: A Python boolean forwarded to `text.BertTokenizer`.
If true, input text is converted to lower case (where applicable)
before tokenization. This must be set to match the way in which
the vocab_file was created.
the `vocab_file` was created.
tokenize_with_offsets: A Python boolean. If true, this layer calls
`text.BertTokenizer.tokenize_with_offsets()` instead of plain
`text.BertTokenizer.tokenize()` and outputs a triple of
(tokens, start_offsets, limit_offsets)
insead of just tokens.
**kwargs: standard arguments to Layer().
`text.BertTokenizer.tokenize_with_offsets()` instead of plain
`text.BertTokenizer.tokenize()` and outputs a triple of
`(tokens, start_offsets, limit_offsets)`
instead of just tokens.
**kwargs: Standard arguments to `Layer()`.
Raises:
ImportError: if importing `tensorflow_text` failed.
ImportError: If importing `tensorflow_text` failed.
"""
_check_if_tf_text_installed()
......@@ -167,18 +167,19 @@ class BertTokenizer(tf.keras.layers.Layer):
"""Calls `text.BertTokenizer` on inputs.
Args:
inputs: A string Tensor of shape [batch_size].
inputs: A string Tensor of shape `(batch_size,)`.
Returns:
One or three of `RaggedTensors` if `tokenize_with_offsets` is False or
True, respectively. These are
tokens: A `RaggedTensor` of shape [batch_size, (words), (pieces_per_word)]
and type int32. tokens[i,j,k] contains the k-th wordpiece of the
j-th word in the i-th input.
start_offsets, limit_offsets: If `tokenize_with_offsets` is True,
RaggedTensors of type int64 with the same indices as tokens.
Element [i,j,k] contains the byte offset at the start, or past the
end, resp., for the k-th wordpiece of the j-th word in the i-th input.
tokens: A `RaggedTensor` of shape
`[batch_size, (words), (pieces_per_word)]`
and type int32. `tokens[i,j,k]` contains the k-th wordpiece of the
j-th word in the i-th input.
start_offsets, limit_offsets: If `tokenize_with_offsets` is True,
RaggedTensors of type int64 with the same indices as tokens.
Element `[i,j,k]` contains the byte offset at the start, or past the
end, resp., for the k-th wordpiece of the j-th word in the i-th input.
"""
# Prepare to reshape the result to work around broken shape inference.
batch_size = tf.shape(inputs)[0]
......@@ -201,12 +202,7 @@ class BertTokenizer(tf.keras.layers.Layer):
def get_config(self):
# Skip in tf.saved_model.save(); fail if called directly.
# TODO(arnoegw): Implement when switching to MutableHashTable, which gets
# initialized from the checkpoint and not from a vocab file.
# We cannot just put the original, user-supplied vocab file name into
# the config, because the path has to change as the SavedModel is copied
# around.
raise NotImplementedError("Not implemented yet.")
raise NotImplementedError("TODO(b/170480226): implement")
def get_special_tokens_dict(self):
"""Returns dict of token ids, keyed by standard names for their purpose.
......@@ -268,13 +264,13 @@ class BertTokenizer(tf.keras.layers.Layer):
class SentencepieceTokenizer(tf.keras.layers.Layer):
"""Wraps tf_text.SentencepieceTokenizer as a Keras Layer.
"""Wraps `tf_text.SentencepieceTokenizer` as a Keras Layer.
Attributes:
tokenize_with_offsets: If true, calls
SentencepieceTokenizer.tokenize_with_offsets()
instead of plain .tokenize() and outputs a triple of
(tokens, start_offsets, limit_offsets).
`SentencepieceTokenizer.tokenize_with_offsets()`
instead of plain `.tokenize()` and outputs a triple of
`(tokens, start_offsets, limit_offsets)`.
"""
def __init__(self,
......@@ -300,9 +296,9 @@ class SentencepieceTokenizer(tf.keras.layers.Layer):
store the actual proto (not a filename passed here).
model_serialized_proto: The sentencepiece model serialized proto string.
tokenize_with_offsets: A Python boolean. If true, this layer calls
SentencepieceTokenizer.tokenize_with_offsets() instead of
plain .tokenize() and outputs a triple of
(tokens, start_offsets, limit_offsets) insead of just tokens.
`SentencepieceTokenizer.tokenize_with_offsets()` instead of
plain `.tokenize()` and outputs a triple of
`(tokens, start_offsets, limit_offsets)` instead of just tokens.
Note that when `strip_diacritics` below is set to True, returning
offsets is not currently supported.
nbest_size: A scalar for sampling:
......@@ -320,7 +316,7 @@ class SentencepieceTokenizer(tf.keras.layers.Layer):
`tokenize_with_offsets`. NOTE: New models are encouraged to put this
into custom normalization rules for the Sentencepiece model itself to
avoid this extra step and the limitation regarding offsets.
**kwargs: standard arguments to Layer().
**kwargs: standard arguments to `Layer()`.
Raises:
ImportError: if importing tensorflow_text failed.
......@@ -360,19 +356,19 @@ class SentencepieceTokenizer(tf.keras.layers.Layer):
return self._tokenizer.vocab_size()
def call(self, inputs: tf.Tensor):
"""Calls text.SentencepieceTokenizer on inputs.
"""Calls `text.SentencepieceTokenizer` on inputs.
Args:
inputs: A string Tensor of shape [batch_size].
inputs: A string Tensor of shape `(batch_size,)`.
Returns:
One or three of RaggedTensors if tokenize_with_offsets is False or True,
respectively. These are
tokens: A RaggedTensor of shape [batch_size, (pieces)] and type int32.
tokens[i,j] contains the j-th piece in the i-th input.
start_offsets, limit_offsets: If tokenize_with_offsets is True,
RaggedTensors of type int64 with the same indices as tokens.
Element [i,j] contains the byte offset at the start, or past the
tokens: A RaggedTensor of shape `[batch_size, (pieces)]` and type `int32`.
`tokens[i,j]` contains the j-th piece in the i-th input.
start_offsets, limit_offsets: If `tokenize_with_offsets` is True,
RaggedTensors of type `int64` with the same indices as tokens.
Element `[i,j]` contains the byte offset at the start, or past the
end, resp., for the j-th piece in the i-th input.
"""
if self._strip_diacritics:
......@@ -403,19 +399,8 @@ class SentencepieceTokenizer(tf.keras.layers.Layer):
return _reshape(tokens)
def get_config(self):
raise NotImplementedError("b/170480226")
# TODO(b/170480226): Uncomment and improve to fix the bug.
# config = {
# "model_serialized_proto": self._model_serialized_proto,
# "lower_case": self._lower_case,
# "tokenize_with_offsets": self.tokenize_with_offsets,
# "nbest_size": self._nbest_size,
# "alpha": self._alpha,
# "strip_diacritics": self._strip_diacritics,
# }
# base_config = super(SentencepieceTokenizer, self).get_config()
# base_config.update(config)
# return base_config
# Skip in tf.saved_model.save(); fail if called directly.
raise NotImplementedError("TODO(b/170480226): implement")
def get_special_tokens_dict(self):
"""Returns dict of token ids, keyed by standard names for their purpose.
......@@ -492,7 +477,7 @@ class BertPackInputs(tf.keras.layers.Layer):
special_tokens_dict=None,
truncator="round_robin",
**kwargs):
"""Initializes with a target seq_length, relevant token ids and truncator.
"""Initializes with a target `seq_length`, relevant token ids and truncator.
Args:
seq_length: The desired output length. Must not exceed the max_seq_length
......@@ -505,13 +490,13 @@ class BertPackInputs(tf.keras.layers.Layer):
unused positions after the last segment in the sequence
(called "[PAD]" for BERT).
special_tokens_dict: Optionally, a dict from Python strings to Python
integers that contains values for start_of_sequence_id,
end_of_segment_id and padding_id. (Further values in the dict are
integers that contains values for `start_of_sequence_id`,
`end_of_segment_id` and `padding_id`. (Further values in the dict are
silently ignored.) If this is passed, separate *_id arguments must be
omitted.
truncator: The algorithm to truncate a list of batched segments to fit a
per-example length limit. The value can be either "round_robin" or
"waterfall":
per-example length limit. The value can be either `round_robin` or
`waterfall`:
(1) For "round_robin" algorithm, available space is assigned
one token at a time in a round-robin fashion to the inputs that still
need some, until the limit is reached. It currently only supports
......@@ -521,10 +506,10 @@ class BertPackInputs(tf.keras.layers.Layer):
left-to-right manner and fills up the buckets until we run out of
budget. It supports an arbitrary number of segments.
**kwargs: standard arguments to Layer().
**kwargs: standard arguments to `Layer()`.
Raises:
ImportError: if importing tensorflow_text failed.
ImportError: if importing `tensorflow_text` failed.
"""
_check_if_tf_text_installed()
super().__init__(**kwargs)
......
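A small worked example of the two truncators described above, with a per-example budget of 8 tokens (after special tokens) and two segments of lengths 6 and 5; the helpers below are a simplified sketch, not the tensorflow_text trimmers used internally:

def waterfall(lengths, budget):
  # Fill segments left to right until the budget runs out: (6, 5) -> (6, 2).
  kept = []
  for n in lengths:
    take = min(n, budget)
    kept.append(take)
    budget -= take
  return kept

def round_robin(lengths, budget):
  # Hand out one token at a time to segments that still want more: (6, 5) -> (4, 4).
  kept = [0] * len(lengths)
  while budget > 0 and any(k < n for k, n in zip(kept, lengths)):
    for i, n in enumerate(lengths):
      if budget > 0 and kept[i] < n:
        kept[i] += 1
        budget -= 1
  return kept

assert waterfall([6, 5], 8) == [6, 2]
assert round_robin([6, 5], 8) == [4, 4]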
......@@ -37,8 +37,8 @@ class TNExpandCondense(Layer):
Note the input shape and output shape will be identical.
Args:
proj_multiplier: Positive integer, multiple of input_shape[-1] to project
up to. Must be one of [2, 4, 6, 8].
proj_multiplier: Positive integer, multiple of `input_shape[-1]` to project
up to. Must be one of `[2, 4, 6, 8]`.
use_bias: Boolean, whether the layer uses a bias vector.
activation: Activation function to use between Expand and Condense. If you
don't specify anything, no activation is applied
......
......@@ -232,7 +232,8 @@ class TransformerDecoderBlock(tf.keras.layers.Layer):
tf.keras.layers.LayerNormalization(
name="self_attention_layer_norm",
axis=-1,
epsilon=self._norm_epsilon))
epsilon=self._norm_epsilon,
dtype="float32"))
# Encoder-decoder attention.
self.encdec_attention = self._cross_attention_cls(
num_heads=self.num_attention_heads,
......@@ -250,7 +251,8 @@ class TransformerDecoderBlock(tf.keras.layers.Layer):
tf.keras.layers.LayerNormalization(
name="attention/encdec_output_layer_norm",
axis=-1,
epsilon=self._norm_epsilon))
epsilon=self._norm_epsilon,
dtype="float32"))
# Feed-forward projection.
self.intermediate_dense = tf.keras.layers.experimental.EinsumDense(
......@@ -273,7 +275,8 @@ class TransformerDecoderBlock(tf.keras.layers.Layer):
**common_kwargs)
self.output_dropout = tf.keras.layers.Dropout(rate=self.dropout_rate)
self.output_layer_norm = tf.keras.layers.LayerNormalization(
name="output_layer_norm", axis=-1, epsilon=self._norm_epsilon)
name="output_layer_norm", axis=-1,
epsilon=self._norm_epsilon, dtype="float32")
super().build(input_shape)
def get_config(self):
......
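The dtype="float32" additions above keep layer normalization in full precision even when the rest of the block runs under a mixed-precision policy; a minimal standalone sketch of the same pattern (assuming the global mixed_float16 policy is in effect):

import tensorflow as tf

tf.keras.mixed_precision.set_global_policy('mixed_float16')

dense = tf.keras.layers.Dense(64)                        # computes in float16 under the policy
layer_norm = tf.keras.layers.LayerNormalization(
    axis=-1, epsilon=1e-12, dtype='float32')             # statistics stay in float32

x = tf.random.uniform((2, 8, 64))
y = layer_norm(dense(x))  # float32 output; cast back to float16 downstream if needed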
......@@ -112,8 +112,9 @@ class TransformerScaffold(tf.keras.layers.Layer):
self._bias_constraint = tf.keras.constraints.get(bias_constraint)
def build(self, input_shape):
input_tensor = input_shape[0] if len(input_shape) == 2 else input_shape
input_tensor_shape = tf.TensorShape(input_tensor)
input_tensor_shape = input_shape[0] if (
len(input_shape) == 2) else input_shape
input_tensor_shape = tf.TensorShape(input_tensor_shape)
if len(input_tensor_shape.as_list()) != 3:
raise ValueError(
"TransformerScaffold expects a three-dimensional input of "
......@@ -170,6 +171,8 @@ class TransformerScaffold(tf.keras.layers.Layer):
else:
self._feedforward_block = None
# self._dropout_rate controls dropout rates at two places:
# after attention, and after FFN.
self._attention_dropout = tf.keras.layers.Dropout(rate=self._dropout_rate)
# Use float32 in layernorm for numeric stability.
# It is probably safe in mixed_float16, but we haven't validated this yet.
......
......@@ -12,5 +12,5 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Activations package definition. Subject to change."""
"""Losses contains common loss computation used in NLP (subject to change)."""
from official.nlp.modeling.losses.weighted_sparse_categorical_crossentropy import loss as weighted_sparse_categorical_crossentropy_loss
# Models
Models are combinations of layers and networks that would be trained.
Models are combinations of `tf.keras` layers and models that can be trained.
Several pre-built canned models are provided to train encoder networks. These
models are intended as both convenience functions and canonical examples.
Several pre-built canned models are provided to train encoder networks.
These models are intended as both convenience functions and canonical examples.
* [`BertClassifier`](bert_classifier.py) implements a simple classification
model containing a single classification head using the Classification network.
......
......@@ -12,7 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Models package definition."""
"""Models are combinations of `tf.keras` layers and models that can be trained.
Several pre-built canned models are provided to train encoder networks.
These models are intended as both convenience functions and canonical examples.
"""
from official.nlp.modeling.models.bert_classifier import BertClassifier
from official.nlp.modeling.models.bert_pretrainer import *
from official.nlp.modeling.models.bert_span_labeler import BertSpanLabeler
......
......@@ -50,8 +50,8 @@ class BertPretrainer(tf.keras.Model):
None, no activation will be used.
initializer: The initializer (if any) to use in the masked LM and
classification networks. Defaults to a Glorot uniform initializer.
output: The output style for this network. Can be either 'logits' or
'predictions'.
output: The output style for this network. Can be either `logits` or
`predictions`.
"""
def __init__(self,
......
......@@ -37,11 +37,11 @@ class BertSpanLabeler(tf.keras.Model):
Args:
network: A transformer network. This network should output a sequence output
and a classification output. Furthermore, it should expose its embedding
table via a "get_embedding_table" method.
table via a `get_embedding_table` method.
initializer: The initializer (if any) to use in the span labeling network.
Defaults to a Glorot uniform initializer.
output: The output style for this network. Can be either 'logits' or
'predictions'.
output: The output style for this network. Can be either `logits` or
`predictions`.
"""
def __init__(self,
......
......@@ -36,12 +36,15 @@ class BertTokenClassifier(tf.keras.Model):
Args:
network: A transformer network. This network should output a sequence output
and a classification output. Furthermore, it should expose its embedding
table via a "get_embedding_table" method.
table via a `get_embedding_table` method.
num_classes: Number of classes to predict from the classification network.
initializer: The initializer (if any) to use in the classification networks.
Defaults to a Glorot uniform initializer.
output: The output style for this network. Can be either 'logits' or
'predictions'.
output: The output style for this network. Can be either `logits` or
`predictions`.
dropout_rate: The dropout probability of the token classification head.
output_encoder_outputs: Whether to include intermediate sequence output
in the final output.
"""
def __init__(self,
......@@ -50,6 +53,7 @@ class BertTokenClassifier(tf.keras.Model):
initializer='glorot_uniform',
output='logits',
dropout_rate=0.1,
output_encoder_outputs=False,
**kwargs):
# We want to use the inputs of the passed network as the inputs to this
......@@ -74,14 +78,19 @@ class BertTokenClassifier(tf.keras.Model):
name='predictions/transform/logits')
logits = classifier(sequence_output)
if output == 'logits':
output_tensors = logits
output_tensors = {'logits': logits}
elif output == 'predictions':
output_tensors = tf.keras.layers.Activation(tf.nn.log_softmax)(logits)
output_tensors = {
'predictions': tf.keras.layers.Activation(tf.nn.log_softmax)(logits)
}
else:
raise ValueError(
('Unknown `output` value "%s". `output` can be either "logits" or '
'"predictions"') % output)
if output_encoder_outputs:
output_tensors['encoder_outputs'] = sequence_output
# b/164516224
# Once we've created the network using the Functional API, we call
# super().__init__ as though we were invoking the Functional API Model
......@@ -98,6 +107,7 @@ class BertTokenClassifier(tf.keras.Model):
'num_classes': num_classes,
'initializer': initializer,
'output': output,
'output_encoder_outputs': output_encoder_outputs
}
# We are storing the config dict as a namedtuple here to ensure checkpoint
......
......@@ -27,22 +27,26 @@ from official.nlp.modeling.models import bert_token_classifier
@keras_parameterized.run_all_keras_modes
class BertTokenClassifierTest(keras_parameterized.TestCase):
@parameterized.parameters(True, False)
def test_bert_trainer(self, dict_outputs):
@parameterized.parameters((True, True), (False, False))
def test_bert_trainer(self, dict_outputs, output_encoder_outputs):
"""Validate that the Keras object can be created."""
# Build a transformer network to use within the BERT trainer.
vocab_size = 100
sequence_length = 512
hidden_size = 768
test_network = networks.BertEncoder(
vocab_size=vocab_size,
num_layers=2,
max_sequence_length=sequence_length,
dict_outputs=dict_outputs)
dict_outputs=dict_outputs,
hidden_size=hidden_size)
# Create a BERT trainer with the created network.
num_classes = 3
bert_trainer_model = bert_token_classifier.BertTokenClassifier(
test_network, num_classes=num_classes)
test_network,
num_classes=num_classes,
output_encoder_outputs=output_encoder_outputs)
# Create a set of 2-dimensional inputs (the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
......@@ -50,12 +54,18 @@ class BertTokenClassifierTest(keras_parameterized.TestCase):
type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
# Invoke the trainer model on the inputs. This causes the layer to be built.
sequence_outs = bert_trainer_model([word_ids, mask, type_ids])
outputs = bert_trainer_model([word_ids, mask, type_ids])
if output_encoder_outputs:
logits = outputs['logits']
encoder_outputs = outputs['encoder_outputs']
self.assertAllEqual(encoder_outputs.shape.as_list(),
[None, sequence_length, hidden_size])
else:
logits = outputs['logits']
# Validate that the outputs are of the expected shape.
expected_classification_shape = [None, sequence_length, num_classes]
self.assertAllEqual(expected_classification_shape,
sequence_outs.shape.as_list())
self.assertAllEqual(expected_classification_shape, logits.shape.as_list())
def test_bert_trainer_tensor_call(self):
"""Validate that the Keras object can be invoked."""
......