Commit 2b166642 authored by Zhenyu Tan, committed by A. Unique TensorFlower

Add PositionEmbedding for keras_nlp.

PiperOrigin-RevId: 329568227
parent b74b4ee1
@@ -13,4 +13,5 @@
# limitations under the License.
# ==============================================================================
"""Keras-NLP layers package definition."""
+from official.nlp.keras_nlp.layers.position_embedding import PositionEmbedding
from official.nlp.keras_nlp.layers.transformer_encoder_block import TransformerEncoderBlock
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Keras-based positional embedding layer."""
# pylint: disable=g-classes-have-attributes
import tensorflow as tf
@tf.keras.utils.register_keras_serializable(package="Text")
class PositionEmbedding(tf.keras.layers.Layer):
"""Creates a positional embedding.
Example:
```python
position_embedding = PositionEmbedding(max_length=100)
inputs = tf.keras.Input((100, 32), dtype=tf.float32)
outputs = position_embedding(inputs)
```
Arguments:
max_length: The maximum size of the dynamic sequence.
initializer: The initializer to use for the embedding weights. Defaults to
"glorot_uniform".
Reference: This layer creates a positional embedding as described in
[BERT: Pre-training of Deep Bidirectional Transformers for Language
Understanding](https://arxiv.org/abs/1810.04805).
"""
def __init__(self,
max_length,
initializer="glorot_uniform",
**kwargs):
super(PositionEmbedding, self).__init__(**kwargs)
if max_length is None:
raise ValueError(
"`max_length` must be an Integer, not `None`."
)
self._max_length = max_length
self._initializer = tf.keras.initializers.get(initializer)
def get_config(self):
config = {
"max_length": self._max_length,
"initializer": tf.keras.initializers.serialize(self._initializer),
}
base_config = super(PositionEmbedding, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def build(self, input_shape):
dimension_list = input_shape.as_list()
if len(dimension_list) != 3:
raise ValueError("PositionEmbedding expects a 3-dimensional input tensor "
"of shape [batch, sequence, width], got "
"{}".format(input_shape))
seq_length = dimension_list[1]
width = dimension_list[2]
if self._max_length is not None:
weight_sequence_length = self._max_length
else:
weight_sequence_length = seq_length
self._position_embeddings = self.add_weight(
"embeddings",
shape=[weight_sequence_length, width],
initializer=self._initializer)
super(PositionEmbedding, self).build(input_shape)
def call(self, inputs):
input_shape = tf.shape(inputs)
position_embeddings = self._position_embeddings[:input_shape[1], :]
return tf.broadcast_to(position_embeddings, input_shape)
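For orientation, a minimal usage sketch of the new layer (illustrative only, not part of the commit; the vocabulary size, embedding width, and `max_length` below are placeholder values):

```python
import tensorflow as tf

from official.nlp.keras_nlp.layers.position_embedding import PositionEmbedding

# Token ids with a dynamic sequence length; the batch dimension is implicit.
word_ids = tf.keras.Input(shape=(None,), dtype=tf.int32)
word_embeddings = tf.keras.layers.Embedding(input_dim=30522, output_dim=32)(word_ids)

# The weight table holds `max_length` rows; in call(), it is sliced down to the
# runtime sequence length and broadcast across the batch dimension.
position_embeddings = PositionEmbedding(max_length=512)(word_embeddings)
outputs = word_embeddings + position_embeddings

model = tf.keras.Model(word_ids, outputs)
print(model(tf.ones((2, 21), dtype=tf.int32)).shape)  # (2, 21, 32)
```

Because the slice happens on the runtime shape, the same model handles any sequence length up to `max_length`.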
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for Keras-based positional embedding layer."""
import numpy as np
import tensorflow as tf
from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import
from official.nlp.keras_nlp.layers import position_embedding
# This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
# guarantees forward compatibility of this code for the V2 switchover.
@keras_parameterized.run_all_keras_modes
class PositionEmbeddingLayerTest(keras_parameterized.TestCase):
def test_static_layer_output_shape(self):
# Create a 3-dimensional input (the first dimension is implicit).
sequence_length = 21
test_layer = position_embedding.PositionEmbedding(
max_length=sequence_length)
width = 30
input_tensor = tf.keras.Input(shape=(sequence_length, width))
output_tensor = test_layer(input_tensor)
# When using static positional embedding shapes, the output is expected
# to be the same as the input shape in all dimensions save batch.
expected_output_shape = [None, sequence_length, width]
self.assertEqual(expected_output_shape, output_tensor.shape.as_list())
# The default output dtype for this layer should be tf.float32.
self.assertEqual(tf.float32, output_tensor.dtype)
def test_float16_dtype(self):
# Create a 3-dimensional input (the first dimension is implicit).
sequence_length = 21
test_layer = position_embedding.PositionEmbedding(
max_length=sequence_length, dtype="float16")
width = 30
input_tensor = tf.keras.Input(shape=(sequence_length, width))
output_tensor = test_layer(input_tensor)
# When using static positional embedding shapes, the output is expected
# to be the same as the input shape in all dimensions save batch.
expected_output_shape = [None, sequence_length, width]
self.assertEqual(expected_output_shape, output_tensor.shape.as_list())
# Since `dtype` was set to float16, the output dtype should be tf.float16.
self.assertEqual(tf.float16, output_tensor.dtype)
def test_dynamic_layer_output_shape(self):
max_sequence_length = 40
test_layer = position_embedding.PositionEmbedding(
max_length=max_sequence_length)
# Create a 3-dimensional input (the first dimension is implicit).
width = 30
input_tensor = tf.keras.Input(shape=(None, width))
output_tensor = test_layer(input_tensor)
# When using dynamic positional embedding shapes, the output is expected
# to be the same as the input shape in all dimensions - but may be None if
# the input shape is None there.
expected_output_shape = [None, None, width]
self.assertEqual(expected_output_shape, output_tensor.shape.as_list())
def test_dynamic_layer_slicing(self):
max_sequence_length = 40
test_layer = position_embedding.PositionEmbedding(
max_length=max_sequence_length)
# Create a 3-dimensional input (the first dimension is implicit).
width = 30
input_tensor = tf.keras.Input(shape=(None, width))
output_tensor = test_layer(input_tensor)
model = tf.keras.Model(input_tensor, output_tensor)
# Create input data that is shorter than max_sequence_length, which should
# trigger a down-slice.
input_length = 17
# Note: This test explicitly uses a batch size of 1. This is to get around
# Keras' restriction on Model invocations: inputs are expected to have the
# same batch cardinality as outputs. In practice, this layer should be used
# inside a model, where it can be projected when added to another tensor.
input_data = np.ones((1, input_length, width))
output_data = model.predict(input_data)
self.assertAllEqual([1, input_length, width], output_data.shape)
if __name__ == "__main__":
tf.test.main()
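Since the layer implements `get_config` and is registered as a serializable Keras object, its configuration round-trips; a short illustrative check (not part of the commit's tests, initializer chosen arbitrarily):

```python
import tensorflow as tf

from official.nlp.keras_nlp.layers.position_embedding import PositionEmbedding

layer = PositionEmbedding(max_length=100, initializer="truncated_normal")
config = layer.get_config()

# `max_length` and the serialized initializer survive the round trip.
restored = PositionEmbedding.from_config(config)
assert restored.get_config()["max_length"] == 100

# A functional model using the layer can also be rebuilt from its config,
# because the class is registered under the "Text" serialization package.
inputs = tf.keras.Input(shape=(None, 16))
model = tf.keras.Model(inputs, PositionEmbedding(max_length=100)(inputs))
rebuilt = tf.keras.Model.from_config(model.get_config())
```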
@@ -23,7 +23,6 @@ from official.nlp.modeling.layers.masked_softmax import MaskedSoftmax
from official.nlp.modeling.layers.mat_mul_with_margin import MatMulWithMargin
from official.nlp.modeling.layers.multi_channel_attention import *
from official.nlp.modeling.layers.on_device_embedding import OnDeviceEmbedding
-from official.nlp.modeling.layers.position_embedding import PositionEmbedding
from official.nlp.modeling.layers.position_embedding import RelativePositionEmbedding
from official.nlp.modeling.layers.rezero_transformer import ReZeroTransformer
from official.nlp.modeling.layers.self_attention_mask import SelfAttentionMask
...
@@ -22,7 +22,6 @@ import tensorflow as tf
from official.modeling import tf_utils
-@tf.keras.utils.register_keras_serializable(package="Text")
class PositionEmbedding(tf.keras.layers.Layer):
"""Creates a positional embedding.
...
@@ -14,7 +14,6 @@
# ==============================================================================
"""Tests for Keras-based positional embedding layer."""
-import numpy as np
import tensorflow as tf
from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import
@@ -24,75 +23,7 @@ from official.nlp.modeling.layers import position_embedding
# This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
# guarantees forward compatibility of this code for the V2 switchover.
@keras_parameterized.run_all_keras_modes
-class PositionEmbeddingLayerTest(keras_parameterized.TestCase):
+class RelativePositionEmbeddingLayerTest(keras_parameterized.TestCase):
-def test_static_layer_output_shape(self):
-test_layer = position_embedding.PositionEmbedding()
-# Create a 3-dimensional input (the first dimension is implicit).
-sequence_length = 21
-width = 30
-input_tensor = tf.keras.Input(shape=(sequence_length, width))
-output_tensor = test_layer(input_tensor)
-# When using static positional embedding shapes, the output is expected
-# to be the same as the input shape in all dimensions save batch.
-expected_output_shape = [None, sequence_length, width]
-self.assertEqual(expected_output_shape, output_tensor.shape.as_list())
-# The default output dtype for this layer should be tf.float32.
-self.assertEqual(tf.float32, output_tensor.dtype)
-def test_float16_dtype(self):
-test_layer = position_embedding.PositionEmbedding(dtype="float16")
-# Create a 3-dimensional input (the first dimension is implicit).
-sequence_length = 21
-width = 30
-input_tensor = tf.keras.Input(shape=(sequence_length, width))
-output_tensor = test_layer(input_tensor)
-# When using static positional embedding shapes, the output is expected
-# to be the same as the input shape in all dimensions save batch.
-expected_output_shape = [None, sequence_length, width]
-self.assertEqual(expected_output_shape, output_tensor.shape.as_list())
-# The default output dtype for this layer should be tf.float32.
-self.assertEqual(tf.float16, output_tensor.dtype)
-def test_dynamic_layer_output_shape(self):
-max_sequence_length = 40
-test_layer = position_embedding.PositionEmbedding(
-use_dynamic_slicing=True, max_sequence_length=max_sequence_length)
-# Create a 3-dimensional input (the first dimension is implicit).
-width = 30
-input_tensor = tf.keras.Input(shape=(None, width))
-output_tensor = test_layer(input_tensor)
-# When using dynamic positional embedding shapes, the output is expected
-# to be the same as the input shape in all dimensions - but may be None if
-# the input shape is None there.
-expected_output_shape = [None, None, width]
-self.assertEqual(expected_output_shape, output_tensor.shape.as_list())
-def test_dynamic_layer_slicing(self):
-max_sequence_length = 40
-test_layer = position_embedding.PositionEmbedding(
-use_dynamic_slicing=True, max_sequence_length=max_sequence_length)
-# Create a 3-dimensional input (the first dimension is implicit).
-width = 30
-input_tensor = tf.keras.Input(shape=(None, width))
-output_tensor = test_layer(input_tensor)
-model = tf.keras.Model(input_tensor, output_tensor)
-# Create input data that is shorter than max_sequence_length, which should
-# trigger a down-slice.
-input_length = 17
-# Note: This test explicitly uses a batch size of 1. This is to get around
-# Keras' restriction on Model invocations: inputs are expected to have the
-# same batch cardinality as outputs. In practice, this layer should be used
-# inside a model, where it can be projected when added to another tensor.
-input_data = np.ones((1, input_length, width))
-output_data = model.predict(input_data)
-self.assertAllEqual([1, input_length, width], output_data.shape)
def test_relative_tensor_input(self):
hidden_size = 8
...
@@ -116,10 +116,9 @@ class AlbertTransformerEncoder(tf.keras.Model):
word_embeddings = self._embedding_layer(word_ids)
# Always uses dynamic slicing for simplicity.
-self._position_embedding_layer = layers.PositionEmbedding(
+self._position_embedding_layer = keras_nlp.PositionEmbedding(
initializer=initializer,
-use_dynamic_slicing=True,
-max_sequence_length=max_sequence_length,
+max_length=max_sequence_length,
name='position_embedding')
position_embeddings = self._position_embedding_layer(word_embeddings)
...
@@ -92,7 +92,6 @@ class AlbertTransformerEncoderTest(keras_parameterized.TestCase):
num_attention_heads=2,
num_layers=3,
type_vocab_size=num_types)
-self.assertTrue(test_network._position_embedding_layer._use_dynamic_slicing)
# Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
@@ -122,7 +121,6 @@ class AlbertTransformerEncoderTest(keras_parameterized.TestCase):
num_attention_heads=2,
num_layers=3,
type_vocab_size=num_types)
-self.assertTrue(test_network._position_embedding_layer._use_dynamic_slicing)
model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])
_ = model.predict([word_id_data, mask_data, type_id_data])
...
@@ -18,6 +18,7 @@
import tensorflow as tf
from official.modeling import activations
+from official.nlp import keras_nlp
from official.nlp.modeling import layers
@@ -132,10 +133,9 @@ class BertEncoder(tf.keras.Model):
word_embeddings = self._embedding_layer(word_ids)
# Always uses dynamic slicing for simplicity.
-self._position_embedding_layer = layers.PositionEmbedding(
+self._position_embedding_layer = keras_nlp.PositionEmbedding(
initializer=initializer,
-use_dynamic_slicing=True,
-max_sequence_length=max_sequence_length,
+max_length=max_sequence_length,
name='position_embedding')
position_embeddings = self._position_embedding_layer(word_embeddings)
self._type_embedding_layer = layers.OnDeviceEmbedding(
...
@@ -134,7 +134,6 @@ class BertEncoderTest(keras_parameterized.TestCase):
num_layers=3,
type_vocab_size=num_types,
output_range=output_range)
-self.assertTrue(test_network._position_embedding_layer._use_dynamic_slicing)
# Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
@@ -163,7 +162,6 @@ class BertEncoderTest(keras_parameterized.TestCase):
num_attention_heads=2,
num_layers=3,
type_vocab_size=num_types)
-self.assertTrue(test_network._position_embedding_layer._use_dynamic_slicing)
model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])
outputs = model.predict([word_id_data, mask_data, type_id_data])
self.assertEqual(outputs[0].shape[1], out_seq_len)
...
@@ -26,6 +26,7 @@ from absl import logging
import gin
import tensorflow as tf
+from official.nlp import keras_nlp
from official.nlp.modeling import layers
@@ -146,10 +147,9 @@ class EncoderScaffold(tf.keras.Model):
word_embeddings = self._embedding_layer(word_ids)
# Always uses dynamic slicing for simplicity.
-self._position_embedding_layer = layers.PositionEmbedding(
+self._position_embedding_layer = keras_nlp.PositionEmbedding(
initializer=embedding_cfg['initializer'],
-use_dynamic_slicing=True,
-max_sequence_length=embedding_cfg['max_seq_length'],
+max_length=embedding_cfg['max_seq_length'],
name='position_embedding')
position_embeddings = self._position_embedding_layer(word_embeddings)
...
@@ -16,6 +16,7 @@
import gin
import tensorflow as tf
+from official.nlp import keras_nlp
from official.nlp.modeling import layers
@@ -111,9 +112,8 @@ class MobileBertEmbedding(tf.keras.layers.Layer):
use_one_hot=True,
initializer=initializer,
name='type_embedding')
-self.pos_embedding = layers.PositionEmbedding(
-use_dynamic_slicing=True,
-max_sequence_length=max_sequence_length,
+self.pos_embedding = keras_nlp.PositionEmbedding(
+max_length=max_sequence_length,
initializer=initializer,
name='position_embedding')
self.word_embedding_proj = tf.keras.layers.experimental.EinsumDense(
...
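All of the call-site hunks above follow the same migration pattern. As a rough summary sketch (the surrounding variables are placeholders standing in for each encoder's own configuration):

```python
import tensorflow as tf

from official.nlp import keras_nlp

max_sequence_length = 512  # placeholder for each encoder's configured maximum
initializer = tf.keras.initializers.TruncatedNormal(stddev=0.02)  # placeholder

# Old call site (removed in this commit): dynamic slicing was opt-in.
#   layers.PositionEmbedding(
#       initializer=initializer,
#       use_dynamic_slicing=True,
#       max_sequence_length=max_sequence_length,
#       name='position_embedding')

# New call site: slicing is always dynamic, and the table size is `max_length`.
position_embedding = keras_nlp.PositionEmbedding(
    initializer=initializer,
    max_length=max_sequence_length,
    name='position_embedding')
```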