Commit 2b166642 authored by Zhenyu Tan, committed by A. Unique TensorFlower

Add PositionEmbedding for keras_nlp.

PiperOrigin-RevId: 329568227
parent b74b4ee1
@@ -13,4 +13,5 @@
# limitations under the License.
# ==============================================================================
"""Keras-NLP layers package definition."""
+from official.nlp.keras_nlp.layers.position_embedding import PositionEmbedding
from official.nlp.keras_nlp.layers.transformer_encoder_block import TransformerEncoderBlock
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Keras-based positional embedding layer."""
# pylint: disable=g-classes-have-attributes
import tensorflow as tf
@tf.keras.utils.register_keras_serializable(package="Text")
class PositionEmbedding(tf.keras.layers.Layer):
"""Creates a positional embedding.
Example:
```python
position_embedding = PositionEmbedding(max_length=100)
inputs = tf.keras.Input((100, 32), dtype=tf.float32)
outputs = position_embedding(inputs)
```
Arguments:
max_length: The maximum size of the dynamic sequence.
initializer: The initializer to use for the embedding weights. Defaults to
"glorot_uniform".
Reference: This layer creates a positional embedding as described in
[BERT: Pre-training of Deep Bidirectional Transformers for Language
Understanding](https://arxiv.org/abs/1810.04805).
"""
def __init__(self,
max_length,
initializer="glorot_uniform",
**kwargs):
super(PositionEmbedding, self).__init__(**kwargs)
if max_length is None:
raise ValueError(
"`max_length` must be an Integer, not `None`."
)
self._max_length = max_length
self._initializer = tf.keras.initializers.get(initializer)
def get_config(self):
config = {
"max_length": self._max_length,
"initializer": tf.keras.initializers.serialize(self._initializer),
}
base_config = super(PositionEmbedding, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def build(self, input_shape):
dimension_list = input_shape.as_list()
if len(dimension_list) != 3:
raise ValueError("PositionEmbedding expects a 3-dimensional input tensor "
"of shape [batch, sequence, width], got "
"{}".format(input_shape))
seq_length = dimension_list[1]
width = dimension_list[2]
if self._max_length is not None:
weight_sequence_length = self._max_length
else:
weight_sequence_length = seq_length
self._position_embeddings = self.add_weight(
"embeddings",
shape=[weight_sequence_length, width],
initializer=self._initializer)
super(PositionEmbedding, self).build(input_shape)
def call(self, inputs):
input_shape = tf.shape(inputs)
position_embeddings = self._position_embeddings[:input_shape[1], :]
return tf.broadcast_to(position_embeddings, input_shape)
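For orientation, a minimal usage sketch of the new layer (illustrative only, not part of the commit; the vocabulary size, embedding width, and `max_length` below are placeholder values):

```python
import tensorflow as tf

from official.nlp.keras_nlp.layers.position_embedding import PositionEmbedding

# Token ids with a dynamic sequence length; the batch dimension is implicit.
word_ids = tf.keras.Input(shape=(None,), dtype=tf.int32)
word_embeddings = tf.keras.layers.Embedding(input_dim=30522, output_dim=32)(word_ids)

# The weight table holds `max_length` rows; in call(), it is sliced down to the
# runtime sequence length and broadcast across the batch dimension.
position_embeddings = PositionEmbedding(max_length=512)(word_embeddings)
outputs = word_embeddings + position_embeddings

model = tf.keras.Model(word_ids, outputs)
print(model(tf.ones((2, 21), dtype=tf.int32)).shape)  # (2, 21, 32)
```

Because the slice happens on the runtime shape, the same model handles any sequence length up to `max_length`.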
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for Keras-based positional embedding layer."""
import numpy as np
import tensorflow as tf
from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import
from official.nlp.keras_nlp.layers import position_embedding
# This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
# guarantees forward compatibility of this code for the V2 switchover.
@keras_parameterized.run_all_keras_modes
class PositionEmbeddingLayerTest(keras_parameterized.TestCase):
def test_static_layer_output_shape(self):
# Create a 3-dimensional input (the first dimension is implicit).
sequence_length = 21
test_layer = position_embedding.PositionEmbedding(
max_length=sequence_length)
width = 30
input_tensor = tf.keras.Input(shape=(sequence_length, width))
output_tensor = test_layer(input_tensor)
# When using static positional embedding shapes, the output is expected
# to be the same as the input shape in all dimensions save batch.
expected_output_shape = [None, sequence_length, width]
self.assertEqual(expected_output_shape, output_tensor.shape.as_list())
# The default output dtype for this layer should be tf.float32.
self.assertEqual(tf.float32, output_tensor.dtype)
def test_float16_dtype(self):
# Create a 3-dimensional input (the first dimension is implicit).
sequence_length = 21
test_layer = position_embedding.PositionEmbedding(
max_length=sequence_length, dtype="float16")
width = 30
input_tensor = tf.keras.Input(shape=(sequence_length, width))
output_tensor = test_layer(input_tensor)
# When using static positional embedding shapes, the output is expected
# to be the same as the input shape in all dimensions save batch.
expected_output_shape = [None, sequence_length, width]
self.assertEqual(expected_output_shape, output_tensor.shape.as_list())
# Since `dtype` was set to float16, the output dtype should be tf.float16.
self.assertEqual(tf.float16, output_tensor.dtype)
def test_dynamic_layer_output_shape(self):
max_sequence_length = 40
test_layer = position_embedding.PositionEmbedding(
max_length=max_sequence_length)
# Create a 3-dimensional input (the first dimension is implicit).
width = 30
input_tensor = tf.keras.Input(shape=(None, width))
output_tensor = test_layer(input_tensor)
# When using dynamic positional embedding shapes, the output is expected
# to be the same as the input shape in all dimensions - but may be None if
# the input shape is None there.
expected_output_shape = [None, None, width]
self.assertEqual(expected_output_shape, output_tensor.shape.as_list())
def test_dynamic_layer_slicing(self):
max_sequence_length = 40
test_layer = position_embedding.PositionEmbedding(
max_length=max_sequence_length)
# Create a 3-dimensional input (the first dimension is implicit).
width = 30
input_tensor = tf.keras.Input(shape=(None, width))
output_tensor = test_layer(input_tensor)
model = tf.keras.Model(input_tensor, output_tensor)
# Create input data that is shorter than max_sequence_length, which should
# trigger a down-slice.
input_length = 17
# Note: This test explicitly uses a batch size of 1. This is to get around
# Keras' restriction on Model invocations: inputs are expected to have the
# same batch cardinality as outputs. In practice, this layer should be used
# inside a model, where it can be projected when added to another tensor.
input_data = np.ones((1, input_length, width))
output_data = model.predict(input_data)
self.assertAllEqual([1, input_length, width], output_data.shape)
if __name__ == "__main__":
tf.test.main()
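Since the layer implements `get_config` and is registered as a serializable Keras object, its configuration round-trips; a short illustrative check (not part of the commit's tests, initializer chosen arbitrarily):

```python
import tensorflow as tf

from official.nlp.keras_nlp.layers.position_embedding import PositionEmbedding

layer = PositionEmbedding(max_length=100, initializer="truncated_normal")
config = layer.get_config()

# `max_length` and the serialized initializer survive the round trip.
restored = PositionEmbedding.from_config(config)
assert restored.get_config()["max_length"] == 100

# A functional model using the layer can also be rebuilt from its config,
# because the class is registered under the "Text" serialization package.
inputs = tf.keras.Input(shape=(None, 16))
model = tf.keras.Model(inputs, PositionEmbedding(max_length=100)(inputs))
rebuilt = tf.keras.Model.from_config(model.get_config())
```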
@@ -23,7 +23,6 @@ from official.nlp.modeling.layers.masked_softmax import MaskedSoftmax
from official.nlp.modeling.layers.mat_mul_with_margin import MatMulWithMargin
from official.nlp.modeling.layers.multi_channel_attention import *
from official.nlp.modeling.layers.on_device_embedding import OnDeviceEmbedding
-from official.nlp.modeling.layers.position_embedding import PositionEmbedding
from official.nlp.modeling.layers.position_embedding import RelativePositionEmbedding
from official.nlp.modeling.layers.rezero_transformer import ReZeroTransformer
from official.nlp.modeling.layers.self_attention_mask import SelfAttentionMask
...
@@ -22,7 +22,6 @@ import tensorflow as tf
from official.modeling import tf_utils
-@tf.keras.utils.register_keras_serializable(package="Text")
class PositionEmbedding(tf.keras.layers.Layer):
"""Creates a positional embedding.
...
@@ -14,7 +14,6 @@
# ==============================================================================
"""Tests for Keras-based positional embedding layer."""
-import numpy as np
import tensorflow as tf
from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import
@@ -24,75 +23,7 @@ from official.nlp.modeling.layers import position_embedding
# This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
# guarantees forward compatibility of this code for the V2 switchover.
@keras_parameterized.run_all_keras_modes
-class PositionEmbeddingLayerTest(keras_parameterized.TestCase):
+class RelativePositionEmbeddingLayerTest(keras_parameterized.TestCase):
-def test_static_layer_output_shape(self):
-test_layer = position_embedding.PositionEmbedding()
-# Create a 3-dimensional input (the first dimension is implicit).
-sequence_length = 21
-width = 30
-input_tensor = tf.keras.Input(shape=(sequence_length, width))
-output_tensor = test_layer(input_tensor)
-# When using static positional embedding shapes, the output is expected
-# to be the same as the input shape in all dimensions save batch.
-expected_output_shape = [None, sequence_length, width]
-self.assertEqual(expected_output_shape, output_tensor.shape.as_list())
-# The default output dtype for this layer should be tf.float32.
-self.assertEqual(tf.float32, output_tensor.dtype)
-def test_float16_dtype(self):
-test_layer = position_embedding.PositionEmbedding(dtype="float16")
-# Create a 3-dimensional input (the first dimension is implicit).
-sequence_length = 21
-width = 30
-input_tensor = tf.keras.Input(shape=(sequence_length, width))
-output_tensor = test_layer(input_tensor)
-# When using static positional embedding shapes, the output is expected
-# to be the same as the input shape in all dimensions save batch.
-expected_output_shape = [None, sequence_length, width]
-self.assertEqual(expected_output_shape, output_tensor.shape.as_list())
-# The default output dtype for this layer should be tf.float32.
-self.assertEqual(tf.float16, output_tensor.dtype)
-def test_dynamic_layer_output_shape(self):
-max_sequence_length = 40
-test_layer = position_embedding.PositionEmbedding(
-use_dynamic_slicing=True, max_sequence_length=max_sequence_length)
-# Create a 3-dimensional input (the first dimension is implicit).
-width = 30
-input_tensor = tf.keras.Input(shape=(None, width))
-output_tensor = test_layer(input_tensor)
-# When using dynamic positional embedding shapes, the output is expected
-# to be the same as the input shape in all dimensions - but may be None if
-# the input shape is None there.
-expected_output_shape = [None, None, width]
-self.assertEqual(expected_output_shape, output_tensor.shape.as_list())
-def test_dynamic_layer_slicing(self):
-max_sequence_length = 40
-test_layer = position_embedding.PositionEmbedding(
-use_dynamic_slicing=True, max_sequence_length=max_sequence_length)
-# Create a 3-dimensional input (the first dimension is implicit).
-width = 30
-input_tensor = tf.keras.Input(shape=(None, width))
-output_tensor = test_layer(input_tensor)
-model = tf.keras.Model(input_tensor, output_tensor)
-# Create input data that is shorter than max_sequence_length, which should
-# trigger a down-slice.
-input_length = 17
-# Note: This test explicitly uses a batch size of 1. This is to get around
-# Keras' restriction on Model invocations: inputs are expected to have the
-# same batch cardinality as outputs. In practice, this layer should be used
-# inside a model, where it can be projected when added to another tensor.
-input_data = np.ones((1, input_length, width))
-output_data = model.predict(input_data)
-self.assertAllEqual([1, input_length, width], output_data.shape)
def test_relative_tensor_input(self):
hidden_size = 8
...
@@ -116,10 +116,9 @@ class AlbertTransformerEncoder(tf.keras.Model):
word_embeddings = self._embedding_layer(word_ids)
# Always uses dynamic slicing for simplicity.
-self._position_embedding_layer = layers.PositionEmbedding(
+self._position_embedding_layer = keras_nlp.PositionEmbedding(
initializer=initializer,
-use_dynamic_slicing=True,
-max_sequence_length=max_sequence_length,
+max_length=max_sequence_length,
name='position_embedding')
position_embeddings = self._position_embedding_layer(word_embeddings)
...
@@ -92,7 +92,6 @@ class AlbertTransformerEncoderTest(keras_parameterized.TestCase):
num_attention_heads=2,
num_layers=3,
type_vocab_size=num_types)
-self.assertTrue(test_network._position_embedding_layer._use_dynamic_slicing)
# Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
@@ -122,7 +121,6 @@ class AlbertTransformerEncoderTest(keras_parameterized.TestCase):
num_attention_heads=2,
num_layers=3,
type_vocab_size=num_types)
-self.assertTrue(test_network._position_embedding_layer._use_dynamic_slicing)
model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])
_ = model.predict([word_id_data, mask_data, type_id_data])
...
@@ -18,6 +18,7 @@
import tensorflow as tf
from official.modeling import activations
+from official.nlp import keras_nlp
from official.nlp.modeling import layers
@@ -132,10 +133,9 @@ class BertEncoder(tf.keras.Model):
word_embeddings = self._embedding_layer(word_ids)
# Always uses dynamic slicing for simplicity.
-self._position_embedding_layer = layers.PositionEmbedding(
+self._position_embedding_layer = keras_nlp.PositionEmbedding(
initializer=initializer,
-use_dynamic_slicing=True,
-max_sequence_length=max_sequence_length,
+max_length=max_sequence_length,
name='position_embedding')
position_embeddings = self._position_embedding_layer(word_embeddings)
self._type_embedding_layer = layers.OnDeviceEmbedding(
...
@@ -134,7 +134,6 @@ class BertEncoderTest(keras_parameterized.TestCase):
num_layers=3,
type_vocab_size=num_types,
output_range=output_range)
-self.assertTrue(test_network._position_embedding_layer._use_dynamic_slicing)
# Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
@@ -163,7 +162,6 @@ class BertEncoderTest(keras_parameterized.TestCase):
num_attention_heads=2,
num_layers=3,
type_vocab_size=num_types)
-self.assertTrue(test_network._position_embedding_layer._use_dynamic_slicing)
model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])
outputs = model.predict([word_id_data, mask_data, type_id_data])
self.assertEqual(outputs[0].shape[1], out_seq_len)
...
@@ -26,6 +26,7 @@ from absl import logging
import gin
import tensorflow as tf
+from official.nlp import keras_nlp
from official.nlp.modeling import layers
@@ -146,10 +147,9 @@ class EncoderScaffold(tf.keras.Model):
word_embeddings = self._embedding_layer(word_ids)
# Always uses dynamic slicing for simplicity.
-self._position_embedding_layer = layers.PositionEmbedding(
+self._position_embedding_layer = keras_nlp.PositionEmbedding(
initializer=embedding_cfg['initializer'],
-use_dynamic_slicing=True,
-max_sequence_length=embedding_cfg['max_seq_length'],
+max_length=embedding_cfg['max_seq_length'],
name='position_embedding')
position_embeddings = self._position_embedding_layer(word_embeddings)
...
@@ -16,6 +16,7 @@
import gin
import tensorflow as tf
+from official.nlp import keras_nlp
from official.nlp.modeling import layers
@@ -111,9 +112,8 @@ class MobileBertEmbedding(tf.keras.layers.Layer):
use_one_hot=True,
initializer=initializer,
name='type_embedding')
-self.pos_embedding = layers.PositionEmbedding(
-use_dynamic_slicing=True,
-max_sequence_length=max_sequence_length,
+self.pos_embedding = keras_nlp.PositionEmbedding(
+max_length=max_sequence_length,
initializer=initializer,
name='position_embedding')
self.word_embedding_proj = tf.keras.layers.experimental.EinsumDense(
...
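All of the call-site hunks above follow the same migration pattern. As a rough summary sketch (the surrounding variables are placeholders standing in for each encoder's own configuration):

```python
import tensorflow as tf

from official.nlp import keras_nlp

max_sequence_length = 512  # placeholder for each encoder's configured maximum
initializer = tf.keras.initializers.TruncatedNormal(stddev=0.02)  # placeholder

# Old call site (removed in this commit): dynamic slicing was opt-in.
#   layers.PositionEmbedding(
#       initializer=initializer,
#       use_dynamic_slicing=True,
#       max_sequence_length=max_sequence_length,
#       name='position_embedding')

# New call site: slicing is always dynamic, and the table size is `max_length`.
position_embedding = keras_nlp.PositionEmbedding(
    initializer=initializer,
    max_length=max_sequence_length,
    name='position_embedding')
```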