ModelZoo / ResNet50_tensorflow · Commits · 96cbd362

Commit 96cbd362, authored Jul 14, 2022 by A. Unique TensorFlower
Parent: 2c4ea3d8

    Internal change

    PiperOrigin-RevId: 461063108

Showing 5 changed files with 189 additions and 11 deletions (+189 / -11):
official/nlp/modeling/layers/__init__.py                       +2   -0
official/nlp/modeling/layers/pack_optimization.py              +14  -4
official/nlp/modeling/layers/per_dim_scale_attention.py        +98  -0
official/nlp/modeling/layers/per_dim_scale_attention_test.py   +52  -0
official/nlp/modeling/layers/transformer_scaffold.py           +23  -7
official/nlp/modeling/layers/__init__.py

@@ -36,6 +36,8 @@ from official.nlp.modeling.layers.multi_channel_attention import *
 from official.nlp.modeling.layers.on_device_embedding import OnDeviceEmbedding
 from official.nlp.modeling.layers.pack_optimization import PackBertEmbeddings
 from official.nlp.modeling.layers.pack_optimization import StridedTransformerEncoderBlock
+from official.nlp.modeling.layers.pack_optimization import StridedTransformerScaffold
+from official.nlp.modeling.layers.per_dim_scale_attention import PerDimScaleAttention
 from official.nlp.modeling.layers.position_embedding import PositionEmbedding
 from official.nlp.modeling.layers.position_embedding import RelativePositionBias
 from official.nlp.modeling.layers.position_embedding import RelativePositionEmbedding
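As an aside (not part of the commit): once these two exports land, both classes resolve through the top-level layers package as well as their submodules. A minimal sketch, assuming the Model Garden `official` package is on the Python path:

import tensorflow as tf
from official.nlp.modeling import layers

# Both names now resolve via the package namespace.
attn = layers.PerDimScaleAttention(num_heads=4, key_dim=32)
print(isinstance(attn, tf.keras.layers.MultiHeadAttention))  # True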
official/nlp/modeling/layers/pack_optimization.py

@@ -202,10 +202,20 @@ class StridedTransformerScaffold(transformer_scaffold.TransformerScaffold):
   """TransformerScaffold for packing optimization to stride over inputs."""
 
   def call(self, inputs, stride: tf.Tensor, training=None):
-    if isinstance(inputs, (list, tuple)) and len(inputs) == 2:
-      input_tensor, attention_mask = inputs
+    if isinstance(inputs, (list, tuple)):
+      if len(inputs) == 2:
+        input_tensor, attention_mask = inputs
+        key_value = None
+      elif len(inputs) == 3:
+        input_tensor, key_value, attention_mask = inputs
+      else:
+        raise ValueError('Unexpected inputs to %s with length at %d' %
+                         (self.__class__, len(inputs)))
     else:
-      input_tensor, attention_mask = (inputs, None)
+      input_tensor, key_value, attention_mask = (inputs, None, None)
+
+    if key_value is None:
+      key_value = input_tensor
 
     if self._norm_first:
       source_tensor = input_tensor[:, ::stride, :]

@@ -215,7 +225,7 @@ class StridedTransformerScaffold(transformer_scaffold.TransformerScaffold):
       target_tensor = input_tensor[:, ::stride, :]
 
     attention_output = self._attention_layer(
-        query=target_tensor, value=input_tensor, attention_mask=attention_mask,
+        query=target_tensor, value=key_value, attention_mask=attention_mask,
        training=training)
     attention_output = self._attention_dropout(attention_output,
                                                training=training)
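As an aside (not part of the commit): the `[:, ::stride, :]` slice is what makes the scaffold "strided": queries are drawn from every `stride`-th position of the packed sequence, while the key/value tensor still covers the full sequence. A standalone sketch of that slicing in plain TensorFlow, with made-up shapes:

import tensorflow as tf

x = tf.reshape(tf.range(2 * 8 * 4, dtype=tf.float32), [2, 8, 4])  # [batch, seq, hidden]
stride = 2

# Queries come from every `stride`-th position; keys/values keep the full sequence.
target = x[:, ::stride, :]
print(x.shape, target.shape)  # (2, 8, 4) (2, 4, 4)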
official/nlp/modeling/layers/per_dim_scale_attention.py (new file, 0 → 100644)

# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Keras-based attention layer with learnable per dim scaling."""
import numpy as np
import tensorflow as tf


@tf.keras.utils.register_keras_serializable(package='Text')
class PerDimScaleAttention(tf.keras.layers.MultiHeadAttention):
  """Learn scales for individual dims.

  It can improve quality but might hurt training stability.
  """

  def _build_from_signature(self, query, value, key=None):
    super()._build_from_signature(query=query, value=value, key=key)
    # pytype: disable=attribute-error
    self._scale_dim = self._key_dim
    self.per_dim_scale = self.add_weight(
        name='per_dim_scale',
        shape=(self._scale_dim,),
        initializer='zeros',
        dtype=self.dtype,
        trainable=True)

  def _scale_query(self, query):
    # 1.0/tf.nn.softplus(0.0) = 1.442695041. Hard code this number so that we
    # can avoid unnecessary XLA op fusion mess on TPU.
    r_softplus_0 = 1.442695041
    scale = tf.constant(
        r_softplus_0 / np.sqrt(float(self._scale_dim)), dtype=query.dtype)
    scale *= tf.nn.softplus(self.per_dim_scale)
    return query * scale

  def _compute_attention(self,
                         query,
                         key,
                         value,
                         attention_mask=None,
                         training=None):
    query = self._scale_query(query)

    attention_scores = tf.einsum(self._dot_product_equation, key, query)
    attention_scores = self._masked_softmax(attention_scores, attention_mask)
    attention_scores_dropout = self._dropout_layer(
        attention_scores, training=training)

    # `context_layer` = [B, T, N, H]
    attention_output = tf.einsum(self._combine_equation,
                                 attention_scores_dropout, value)
    return attention_output, attention_scores

  def call(self,
           query,
           value,
           key=None,
           attention_mask=None,
           return_attention_scores=False,
           training=None):
    if not self._built_from_signature:
      self._build_from_signature(query=query, value=value, key=key)
    if key is None:
      key = value

    # N = `num_attention_heads`
    # H = `size_per_head`
    # `query` = [B, T, N ,H]
    query = self._query_dense(query)

    # `key` = [B, S, N, H]
    key = self._key_dense(key)

    # `value` = [B, S, N, H]
    value = self._value_dense(value)

    attention_output, attention_scores = self._compute_attention(
        query, key, value, attention_mask, training)
    attention_output = self._output_dense(attention_output)

    if return_attention_scores:
      return attention_output, attention_scores
    return attention_output
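As an aside (not part of the commit): because `per_dim_scale` is zero-initialized, `softplus(per_dim_scale)` starts at `softplus(0) = ln 2`, and the hard-coded `r_softplus_0 = 1/softplus(0)` cancels it, so the layer begins as ordinary `1/sqrt(key_dim)` scaled dot-product attention and only learns per-dimension deviations during training. A quick standalone check of that identity:

import numpy as np
import tensorflow as tf

key_dim = 64
r_softplus_0 = 1.442695041
per_dim_scale = tf.zeros([key_dim])  # the layer's initial 'per_dim_scale' weight

# At initialization the per-dim scale collapses to the usual 1/sqrt(key_dim).
scale = r_softplus_0 / np.sqrt(float(key_dim)) * tf.nn.softplus(per_dim_scale)
print(np.allclose(scale.numpy(), 1.0 / np.sqrt(key_dim)))  # True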
official/nlp/modeling/layers/per_dim_scale_attention_test.py (new file, 0 → 100644)

# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for PerDimScaleAttention."""
import tensorflow as tf

from official.nlp.modeling.layers import per_dim_scale_attention as attention


class PerDimScaleAttentionTest(tf.test.TestCase):

  def test_attention(self):
    num_heads = 12
    key_dim = 64
    seq_length = 1024
    batch_size = 2
    test_layer = attention.PerDimScaleAttention(
        num_heads=num_heads, key_dim=key_dim)
    query = tf.random.normal(
        shape=(batch_size, seq_length, key_dim * num_heads))
    value = query
    output = test_layer(query=query, value=value)
    self.assertEqual(output.shape,
                     [batch_size, seq_length, key_dim * num_heads])

  def test_config(self):
    num_heads = 12
    key_dim = 64
    test_layer = attention.PerDimScaleAttention(
        num_heads=num_heads, key_dim=key_dim)
    print(test_layer.get_config())
    new_layer = attention.PerDimScaleAttention.from_config(
        test_layer.get_config())
    # If the serialization was successful, the new config should match the old.
    self.assertAllEqual(test_layer.get_config(), new_layer.get_config())


if __name__ == '__main__':
  tf.test.main()
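As an aside (not part of the commit): the tests above only cover unmasked self-attention and config round-tripping. A hedged sketch of calling the layer with an attention mask and returned scores, using the standard Keras MultiHeadAttention shape conventions:

import tensorflow as tf
from official.nlp.modeling.layers import per_dim_scale_attention as attention

layer = attention.PerDimScaleAttention(num_heads=2, key_dim=8)
query = tf.random.normal([1, 4, 16])
mask = tf.ones([1, 4, 4])  # [batch, target_len, source_len]; 1 = may attend

output, scores = layer(
    query=query, value=query, attention_mask=mask,
    return_attention_scores=True)
print(output.shape)  # (1, 4, 16)
print(scores.shape)  # (1, 2, 4, 4): [batch, heads, target_len, source_len]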
official/nlp/modeling/layers/transformer_scaffold.py

@@ -119,9 +119,15 @@ class TransformerScaffold(tf.keras.layers.Layer):
     self._bias_constraint = tf.keras.constraints.get(bias_constraint)
 
   def build(self, input_shape):
-    input_tensor_shape = input_shape[0] if (
-        len(input_shape) == 2) else input_shape
-    input_tensor_shape = tf.TensorShape(input_tensor_shape)
+    if isinstance(input_shape, tf.TensorShape):
+      input_tensor_shape = input_shape
+    elif isinstance(input_shape, (list, tuple)):
+      input_tensor_shape = tf.TensorShape(input_shape[0])
+    else:
+      raise ValueError(
+          "The type of input shape argument is not supported, got: %s" %
+          type(input_shape))
     if len(input_tensor_shape.as_list()) != 3:
       raise ValueError(
           "TransformerScaffold expects a three-dimensional input of "

@@ -271,17 +277,27 @@ class TransformerScaffold(tf.keras.layers.Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
   def call(self, inputs, training=None):
-    if isinstance(inputs, (list, tuple)) and len(inputs) == 2:
-      input_tensor, attention_mask = inputs
+    if isinstance(inputs, (list, tuple)):
+      if len(inputs) == 2:
+        input_tensor, attention_mask = inputs
+        key_value = None
+      elif len(inputs) == 3:
+        input_tensor, key_value, attention_mask = inputs
+      else:
+        raise ValueError("Unexpected inputs to %s with length at %d" %
+                         (self.__class__, len(inputs)))
     else:
-      input_tensor, attention_mask = (inputs, None)
+      input_tensor, key_value, attention_mask = (inputs, None, None)
+
+    if key_value is None:
+      key_value = input_tensor
 
     if self._norm_first:
       source_tensor = input_tensor
       input_tensor = self._attention_layer_norm(input_tensor,
                                                 training=training)
 
     attention_output = self._attention_layer(
-        query=input_tensor, value=input_tensor, attention_mask=attention_mask,
+        query=input_tensor, value=key_value, attention_mask=attention_mask,
         training=training)
     attention_output = self._attention_dropout(attention_output,
                                                training=training)
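As an aside (not part of the commit): with `key_value` threaded through `call`, the scaffold can now run cross-attention by passing a three-element input `[input_tensor, key_value, attention_mask]`, and it falls back to self-attention when no memory tensor is given. A rough sketch of that calling convention; the constructor arguments below (`num_attention_heads`, `inner_dim`, `inner_activation`, `attention_cls`) are assumptions based on the surrounding Model Garden code and should be checked against the actual TransformerScaffold signature:

import tensorflow as tf
from official.nlp.modeling import layers

# Assumed constructor arguments; verify against TransformerScaffold.__init__.
block = layers.TransformerScaffold(
    num_attention_heads=2,
    inner_dim=64,
    inner_activation='relu',
    attention_cls=layers.PerDimScaleAttention)

inputs = tf.random.normal([1, 8, 32])   # queries: [batch, target_len, hidden]
memory = tf.random.normal([1, 16, 32])  # key/value memory: [batch, source_len, hidden]
mask = tf.ones([1, 8, 16])              # [batch, target_len, source_len]

# New three-element input form: (input_tensor, key_value, attention_mask).
output = block([inputs, memory, mask])
print(output.shape)  # (1, 8, 32)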