Commit c02bca43
Authored Jan 28, 2021 by Hongkun Yu; committed by A. Unique TensorFlower, Jan 28, 2021
Parent: 2fba0107

Adds T5/MTF style relative position bias layer.

PiperOrigin-RevId: 354401143
Changes: 3 changed files, with 169 additions and 5 deletions (+169 −5)

  official/nlp/modeling/layers/__init__.py                  +1    −0
  official/nlp/modeling/layers/position_embedding.py        +139  −5
  official/nlp/modeling/layers/position_embedding_test.py   +29   −0
official/nlp/modeling/layers/__init__.py (view file @ c02bca43)

@@ -26,6 +26,7 @@ from official.nlp.modeling.layers.mobile_bert_layers import MobileBertMaskedLM
 from official.nlp.modeling.layers.mobile_bert_layers import MobileBertTransformer
 from official.nlp.modeling.layers.multi_channel_attention import *
 from official.nlp.modeling.layers.on_device_embedding import OnDeviceEmbedding
+from official.nlp.modeling.layers.position_embedding import RelativePositionBias
 from official.nlp.modeling.layers.position_embedding import RelativePositionEmbedding
 from official.nlp.modeling.layers.relative_attention import MultiHeadRelativeAttention
 from official.nlp.modeling.layers.relative_attention import TwoStreamRelativeAttention
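The new export makes the layer reachable through the public `layers` namespace. A minimal usage sketch, assuming the standard TF Model Garden import path at this commit (the tensor shapes below are arbitrary; only the constructor and call signature come from this diff):

    import tensorflow as tf
    from official.nlp.modeling import layers

    # Build the bias layer for a 3-head attention block.
    rel_bias = layers.RelativePositionBias(num_heads=3)

    query = tf.zeros((2, 8, 16))  # [batch, query_length, hidden_size]
    key = tf.zeros((2, 8, 16))    # [batch, key_length, hidden_size]

    bias = rel_bias(query, key)   # [batch, num_heads, query_length, key_length]
    print(bias.shape)             # (2, 3, 8, 8)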
official/nlp/modeling/layers/position_embedding.py (view file @ c02bca43)

@@ -14,13 +14,15 @@
 # ==============================================================================
 """Keras-based positional embedding layer."""
 # pylint: disable=g-classes-have-attributes
 import math
+from typing import Optional

 import tensorflow as tf

 from official.modeling import tf_utils

+Initializer = tf.keras.initializers.Initializer


 @tf.keras.utils.register_keras_serializable(package="Text")
 class RelativePositionEmbedding(tf.keras.layers.Layer):

@@ -38,9 +40,9 @@ class RelativePositionEmbedding(tf.keras.layers.Layer):
   """

   def __init__(self,
-               hidden_size,
-               min_timescale=1.0,
-               max_timescale=1.0e4,
+               hidden_size: int,
+               min_timescale: float = 1.0,
+               max_timescale: float = 1.0e4,
                **kwargs):
     # We need to have a default dtype of float32, since the inputs (which Keras
     # usually uses to infer the dtype) will always be int32.

@@ -50,7 +52,7 @@ class RelativePositionEmbedding(tf.keras.layers.Layer):
     if "dtype" not in kwargs:
       kwargs["dtype"] = "float32"

-    super(RelativePositionEmbedding, self).__init__(**kwargs)
+    super().__init__(**kwargs)
     self._hidden_size = hidden_size
     self._min_timescale = min_timescale
     self._max_timescale = max_timescale

@@ -101,3 +103,135 @@ class RelativePositionEmbedding(tf.keras.layers.Layer):
         [tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)

     return position_embeddings
+
+
+def _relative_position_bucket(relative_position,
+                              bidirectional=True,
+                              num_buckets=32,
+                              max_distance=128):
+  """Translate relative position to a bucket number for relative attention.
+
+  The relative position is defined as memory_position - query_position, i.e.
+  the distance in tokens from the attending position to the attended-to
+  position.
+
+  If bidirectional=False, then positive relative positions are invalid.
+  We use smaller buckets for small absolute relative_position and larger
+  buckets for larger absolute relative_positions.
+  All relative positions >=max_distance map to the same bucket.
+  All relative positions <=-max_distance map to the same bucket.
+  This should allow for more graceful generalization to longer sequences
+  than the model has been trained on.
+
+  Args:
+    relative_position: an int32 Tensor
+    bidirectional: a boolean - whether the attention is bidirectional
+    num_buckets: an integer
+    max_distance: an integer
+
+  Returns:
+    a Tensor with the same shape as relative_position, containing int32
+    values in the range [0, num_buckets)
+  """
+  ret = 0
+  n = -relative_position
+  if bidirectional:
+    num_buckets //= 2
+    ret += tf.cast(tf.math.less(n, 0), tf.int32) * num_buckets
+    n = tf.math.abs(n)
+  else:
+    n = tf.math.maximum(n, 0)
+  # now n is in the range [0, inf)
+  max_exact = num_buckets // 2
+  is_small = tf.math.less(n, max_exact)
+  val_if_large = max_exact + tf.dtypes.cast(
+      tf.math.log(tf.cast(n, tf.float32) / max_exact) /
+      math.log(max_distance / max_exact) * (num_buckets - max_exact),
+      tf.int32,
+  )
+  val_if_large = tf.math.minimum(val_if_large, num_buckets - 1)
+  ret += tf.where(is_small, n, val_if_large)
+  return ret
+
+
+@tf.keras.utils.register_keras_serializable(package="Text")
+class RelativePositionBias(tf.keras.layers.Layer):
+  """Relative position embedding via per-head bias in T5 style.
+
+  Reference implementation in MeshTF:
+  https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/transformer/transformer_layers.py#L1000
+
+  This layer implements the relative position bias used in "Exploring the
+  Limits of Transfer Learning with a Unified Text-to-Text Transformer"
+  (https://arxiv.org/abs/1910.10683).
+  """
+
+  def __init__(self,
+               num_heads: int,
+               relative_attention_num_buckets: int = 32,
+               relative_attention_max_distance: int = 128,
+               bidirectional: bool = True,
+               embeddings_initializer: Optional[Initializer] = None,
+               **kwargs):
+    super().__init__(**kwargs)
+    self.num_heads = num_heads
+    self.relative_attention_num_buckets = relative_attention_num_buckets
+    self.bidirectional = bidirectional
+    self.relative_attention_max_distance = relative_attention_max_distance
+    if embeddings_initializer:
+      self._embed_init = embeddings_initializer
+    else:
+      self._embed_init = tf.keras.initializers.TruncatedNormal(stddev=1.0)
+    with tf.name_scope(self.name):
+      self._relative_attention_bias = self.add_weight(
+          "rel_embedding",
+          shape=[self.relative_attention_num_buckets, self.num_heads],
+          initializer=self._embed_init,
+          dtype=self.dtype,
+          trainable=True)
+
+  def get_config(self):
+    config = {
+        "num_heads": self.num_heads,
+        "relative_attention_num_buckets": self.relative_attention_num_buckets,
+        "relative_attention_max_distance":
+            self.relative_attention_max_distance,
+        "bidirectional": self.bidirectional,
+        "embeddings_initializer":
+            tf.keras.initializers.serialize(self._embed_init),
+    }
+    base_config = super().get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+  def call(self, query: tf.Tensor, key: tf.Tensor):
+    """Implements the forward pass.
+
+    Args:
+      query: query input tensor shape [batch, query length, hidden size].
+      key: key input tensor shape [batch, key length, hidden size].
+
+    Returns:
+      A tensor in shape of [batch, heads, query length, key length].
+    """
+    batch_size, qlen = tf_utils.get_shape_list(query)[:2]
+    klen = tf_utils.get_shape_list(key)[1]
+    context_position = tf.range(qlen)[:, None]
+    memory_position = tf.range(klen)[None, :]
+    relative_position = memory_position - context_position
+    rp_bucket = _relative_position_bucket(
+        relative_position,
+        bidirectional=self.bidirectional,
+        num_buckets=self.relative_attention_num_buckets,
+        max_distance=self.relative_attention_max_distance)
+    values = tf.nn.embedding_lookup(self._relative_attention_bias, rp_bucket)
+    values = tf.expand_dims(
+        tf.transpose(values, [2, 0, 1]),
+        axis=0)  # shape (1, num_heads, qlen, klen)
+    values = tf.tile(values, [batch_size, 1, 1, 1])
+    return values
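The commit only adds the bias layer itself; wiring it into attention is out of scope here. In T5 the learned per-head bias is added to the pre-softmax attention logits, so a minimal sketch under that assumption looks as follows (the einsum wiring and all shapes below are illustrative, not taken from this diff):

    import tensorflow as tf
    from official.nlp.modeling.layers import position_embedding

    batch, q_len, k_len, num_heads, head_dim = 2, 5, 7, 4, 8

    # Stand-ins for the outputs of real query/key projections,
    # shape [batch, heads, length, head_dim].
    q = tf.random.normal((batch, num_heads, q_len, head_dim))
    k = tf.random.normal((batch, num_heads, k_len, head_dim))

    # The layer only reads the sequence lengths of its inputs, so any
    # [batch, length, hidden] tensors with the right lengths work here.
    rel_bias_layer = position_embedding.RelativePositionBias(num_heads=num_heads)
    bias = rel_bias_layer(tf.zeros((batch, q_len, 1)), tf.zeros((batch, k_len, 1)))

    # T5 adds the learned per-head bias to the logits before the softmax.
    logits = tf.einsum("bhqd,bhkd->bhqk", q, k)      # [batch, heads, q_len, k_len]
    weights = tf.nn.softmax(logits + bias, axis=-1)  # bias shape matches exactly
    print(weights.shape)                             # (2, 4, 5, 7)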
official/nlp/modeling/layers/position_embedding_test.py (view file @ c02bca43)

@@ -14,6 +14,8 @@
 # ==============================================================================
 """Tests for Keras-based positional embedding layer."""

+from absl.testing import parameterized
+import numpy as np
 import tensorflow as tf

 from tensorflow.python.keras import keras_parameterized  # pylint: disable=g-direct-tensorflow-import

@@ -55,5 +57,32 @@ class RelativePositionEmbeddingLayerTest(keras_parameterized.TestCase):
     self.assertAllEqual(output_tensor, expected_output_tensor)


+@keras_parameterized.run_all_keras_modes
+class RelativePositionBiasTest(keras_parameterized.TestCase):
+
+  @parameterized.named_parameters(("bidirectional", True),
+                                  ("unidirectional", False))
+  def test_relative_position_bias(self, bidirectional):
+    query = tf.zeros((4, 4, 2))
+    key = tf.zeros((4, 2, 2))
+    l = position_embedding.RelativePositionBias(
+        num_heads=3, bidirectional=bidirectional, name="foo")
+    self.assertEqual(l(query, key).shape, (4, 3, 4, 2))
+    self.assertLen(l.trainable_variables, 1)
+    self.assertEqual(l.trainable_variables[0].name, "foo/rel_embedding:0")
+
+  def test_relative_position_bucket(self):
+    context_position = tf.range(3)[:, None]
+    memory_position = tf.range(2)[None, :]
+    relative_position = memory_position - context_position
+    outputs = position_embedding._relative_position_bucket(relative_position)
+    self.assertAllEqual(outputs.numpy(),
+                        np.array([[0, 17], [1, 0], [2, 1]]))
+    outputs = position_embedding._relative_position_bucket(
+        relative_position, bidirectional=False)
+    self.assertAllEqual(outputs.numpy(),
+                        np.array([[0, 0], [1, 0], [2, 1]]))
+
+
 if __name__ == "__main__":
   tf.test.main()
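The expected arrays in test_relative_position_bucket can be checked by hand. A small numpy re-derivation of the bidirectional case, which works only because every distance in this 3x2 example stays below max_exact = 8, so the logarithmic branch of _relative_position_bucket never triggers:

    import numpy as np

    # relative_position = memory_position - context_position for qlen=3, klen=2.
    relative_position = np.array([[0, 1], [-1, 0], [-2, -1]])

    n = -relative_position                  # distance from query to key
    bucket = (n < 0).astype(np.int32) * 16  # right-of-query positions get +num_buckets//2
    bucket += np.abs(n)                     # small distances map to themselves
    print(bucket)                           # [[ 0 17] [ 1  0] [ 2  1]]

In the unidirectional case positive relative positions are clipped to zero and no halving occurs, giving max(-relative_position, 0) = [[0, 0], [1, 0], [2, 1]], which matches the second expectation.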