Unverified Commit 09d9656f authored by Srihari Humbarwadi's avatar Srihari Humbarwadi Committed by GitHub
Browse files

Merge branch 'panoptic-segmentation' into panoptic-deeplab-modeling

parents ac671306 49a5706c
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Keras-based gated feedforward layer."""
# pylint: disable=g-classes-have-attributes
from typing import Optional
import tensorflow as tf
class BlockDiagFeedforward(tf.keras.layers.Layer):
  """Block diagonal feedforward layer.

  This layer replaces the weight matrix of the output_dense layer with a block
  diagonal matrix to save layer parameters and FLOPs. A linear mixing layer can
  be added optionally to improve layer expressibility.

  Args:
    intermediate_size: Size of the intermediate layer. Must be a multiple of
      `num_blocks`.
    intermediate_activation: Activation for the intermediate layer.
    dropout: Dropout probability for the output dropout.
    num_blocks: The number of blocks for the block diagonal matrix of the
      output_dense layer.
    apply_mixing: Apply linear mixing if True.
    kernel_initializer: Initializer for dense layer kernels.
    bias_initializer: Initializer for dense layer biases.
    kernel_regularizer: Regularizer for dense layer kernels.
    bias_regularizer: Regularizer for dense layer biases.
    activity_regularizer: Regularizer for dense layer activity.
    kernel_constraint: Constraint for dense layer kernels.
    bias_constraint: Constraint for dense layer biases.
  """

  def __init__(
      self,
      intermediate_size: int,
      intermediate_activation: str,
      dropout: float,
      num_blocks: int = 1,
      apply_mixing: bool = True,
      kernel_initializer: str = "glorot_uniform",
      bias_initializer: str = "zeros",
      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
      bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
      activity_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
      kernel_constraint: Optional[tf.keras.constraints.Constraint] = None,
      bias_constraint: Optional[tf.keras.constraints.Constraint] = None,
      **kwargs):  # pylint: disable=g-doc-args
    super(BlockDiagFeedforward, self).__init__(**kwargs)
    self._intermediate_size = intermediate_size
    self._intermediate_activation = intermediate_activation
    self._dropout = dropout
    self._num_blocks = num_blocks
    self._apply_mixing = apply_mixing

    # The block-diagonal factorization requires the intermediate dimension to
    # split evenly across blocks.
    if intermediate_size % num_blocks != 0:
      raise ValueError("Intermediate_size (%d) isn't a multiple of num_blocks "
                       "(%d)." % (intermediate_size, num_blocks))

    self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
    self._bias_initializer = tf.keras.initializers.get(bias_initializer)
    self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
    self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer)
    self._activity_regularizer = tf.keras.regularizers.get(activity_regularizer)
    self._kernel_constraint = tf.keras.constraints.get(kernel_constraint)
    self._bias_constraint = tf.keras.constraints.get(bias_constraint)

  def build(self, input_shape):
    # NOTE(review): `hidden_size` (the input's last dim) is assumed to be a
    # multiple of `num_blocks`; this is not validated here — TODO confirm.
    hidden_size = input_shape.as_list()[-1]

    common_kwargs = dict(
        kernel_initializer=self._kernel_initializer,
        bias_initializer=self._bias_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer,
        activity_regularizer=self._activity_regularizer,
        kernel_constraint=self._kernel_constraint,
        bias_constraint=self._bias_constraint)

    # Projects (batch, seq, hidden) -> (batch, seq, blocks, interm/blocks),
    # i.e. each block gets its own slice of the intermediate dimension.
    self._intermediate_dense = tf.keras.layers.experimental.EinsumDense(
        "abc,cde->abde",
        output_shape=(None, self._num_blocks,
                      self._intermediate_size // self._num_blocks),
        bias_axes="de",
        name="intermediate",
        **common_kwargs)

    policy = tf.keras.mixed_precision.global_policy()
    if policy.name == "mixed_bfloat16":
      # bfloat16 causes BERT with the LAMB optimizer to not converge
      # as well, so we use float32.
      policy = tf.float32
    self._intermediate_activation_layer = tf.keras.layers.Activation(
        self._intermediate_activation, dtype=policy)

    # Per-block output projection: only the `de,deo` contraction is done per
    # block, which is what makes the overall weight matrix block diagonal.
    self._output_dense = tf.keras.layers.experimental.EinsumDense(
        "abde,deo->abdo",
        output_shape=(None, self._num_blocks,
                      hidden_size // self._num_blocks),
        bias_axes="do",
        name="output",
        **common_kwargs)

    if self._apply_mixing:
      # Optional linear mixing across blocks (no bias) to recover some of the
      # expressibility lost to the block-diagonal constraint.
      self._output_mixing = tf.keras.layers.experimental.EinsumDense(
          "abdo,de->abeo",
          output_shape=(None, self._num_blocks,
                        hidden_size // self._num_blocks),
          name="output_mixing",
          **common_kwargs)
    # Collapses (blocks, hidden/blocks) back into a flat hidden dimension.
    self._output_reshape = tf.keras.layers.Reshape((-1, hidden_size))
    self._output_dropout = tf.keras.layers.Dropout(rate=self._dropout)

  def get_config(self):
    """Returns the config of the layer for serialization."""
    config = {
        "intermediate_size":
            self._intermediate_size,
        "intermediate_activation":
            self._intermediate_activation,
        "dropout":
            self._dropout,
        "num_blocks":
            self._num_blocks,
        "apply_mixing":
            self._apply_mixing,
        "kernel_initializer":
            tf.keras.initializers.serialize(self._kernel_initializer),
        "bias_initializer":
            tf.keras.initializers.serialize(self._bias_initializer),
        "kernel_regularizer":
            tf.keras.regularizers.serialize(self._kernel_regularizer),
        "bias_regularizer":
            tf.keras.regularizers.serialize(self._bias_regularizer),
        "activity_regularizer":
            tf.keras.regularizers.serialize(self._activity_regularizer),
        "kernel_constraint":
            tf.keras.constraints.serialize(self._kernel_constraint),
        "bias_constraint":
            tf.keras.constraints.serialize(self._bias_constraint)
    }
    base_config = super(BlockDiagFeedforward, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def call(self, inputs):
    """Applies intermediate dense, block-diagonal output dense, and dropout."""
    intermediate_output = self._intermediate_dense(inputs)
    intermediate_output = self._intermediate_activation_layer(
        intermediate_output)
    layer_output = self._output_dense(intermediate_output)
    if self._apply_mixing:
      layer_output = self._output_mixing(layer_output)
    layer_output = self._output_reshape(layer_output)
    layer_output = self._output_dropout(layer_output)
    return layer_output
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for Keras-based gated feedforward layer."""
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import
from official.nlp.modeling.layers import block_diag_feedforward
# This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
# guarantees forward compatibility of this code for the V2 switchover.
@keras_parameterized.run_all_keras_modes
class BlockDiagFeedforwardTest(keras_parameterized.TestCase):
  """Exercises BlockDiagFeedforward across block counts and dtype policies."""

  def tearDown(self):
    # Restore the default precision policy so state does not leak into other
    # tests.
    super().tearDown()
    tf.keras.mixed_precision.set_global_policy("float32")

  @parameterized.parameters(*(
      (blocks, mixing, policy)
      for blocks in (1, 2)
      for mixing in (True, False)
      for policy in ("float32", "mixed_float16")))
  def test_layer_creation(self, num_blocks, apply_mixing, dtype):
    tf.keras.mixed_precision.set_global_policy(dtype)
    layer = block_diag_feedforward.BlockDiagFeedforward(
        intermediate_size=128,
        intermediate_activation="relu",
        dropout=0.1,
        num_blocks=num_blocks,
        apply_mixing=apply_mixing,
        kernel_initializer="glorot_uniform",
        bias_initializer="zeros")

    seq_len = 64
    width = 128
    # Build a 3D symbolic input; the batch dimension is implicit.
    inputs = tf.keras.Input(shape=(seq_len, width))
    outputs = layer(inputs)
    # A feedforward block must preserve the input shape.
    self.assertEqual(inputs.shape.as_list(), outputs.shape.as_list())

  @parameterized.parameters(*(
      (blocks, mixing, policy)
      for blocks in (1, 2)
      for mixing in (True, False)
      for policy in ("float32", "mixed_float16")))
  def test_layer_invocation(self, num_blocks, apply_mixing, dtype):
    tf.keras.mixed_precision.set_global_policy(dtype)
    layer = block_diag_feedforward.BlockDiagFeedforward(
        intermediate_size=16,
        intermediate_activation="relu",
        dropout=0.1,
        num_blocks=num_blocks,
        apply_mixing=apply_mixing,
        kernel_initializer="glorot_uniform",
        bias_initializer="zeros")

    seq_len = 16
    width = 32
    # Build a 3D symbolic input; the batch dimension is implicit.
    inputs = tf.keras.Input(shape=(seq_len, width))
    outputs = layer(inputs)

    # Wrap the layer in a model and run real data through it.
    model = tf.keras.Model(inputs, outputs)
    batch_size = 6
    feed = 10 * np.random.random_sample((batch_size, seq_len, width))
    predictions = model.predict(feed)
    self.assertEqual(predictions.shape, (batch_size, seq_len, width))

  def test_get_config(self):
    # A layer rebuilt from its own config must produce the same config.
    layer = block_diag_feedforward.BlockDiagFeedforward(
        intermediate_size=16,
        intermediate_activation="relu",
        dropout=0.1,
        num_blocks=2,
        apply_mixing=True,
        kernel_initializer="glorot_uniform",
        bias_initializer="zeros")
    rebuilt = block_diag_feedforward.BlockDiagFeedforward.from_config(
        layer.get_config())
    self.assertAllEqual(layer.get_config(), rebuilt.get_config())
# Run the test suite when this file is executed as a script.
if __name__ == "__main__":
  tf.test.main()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Keras-based einsum layer."""
# pylint: disable=g-classes-have-attributes
import tensorflow as tf
from tensorflow.python.util import deprecation
# Dimension labels available for generated einsum equations; bounds the total
# number of free + bound + output dimensions to 13.
_CHR_IDX = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m"]


@tf.keras.utils.register_keras_serializable(package="Text")
class DenseEinsum(tf.keras.layers.Layer):
  """A densely connected layer that uses `tf.einsum` as the backing computation.

  This layer can perform einsum calculations of arbitrary dimensionality.

  Args:
    output_shape: Positive integer or tuple, dimensionality of the output space.
    num_summed_dimensions: The number of dimensions to sum over. Standard 2D
      matmul should use 1, 3D matmul should use 2, and so forth.
    activation: Activation function to use. If you don't specify anything, no
      activation is applied
      (ie. "linear" activation: `a(x) = x`).
    use_bias: Boolean, whether the layer uses a bias vector.
    kernel_initializer: Initializer for the `kernel` weights matrix.
    bias_initializer: Initializer for the bias vector.
    kernel_regularizer: Regularizer function applied to the `kernel` weights
      matrix.
    bias_regularizer: Regularizer function applied to the bias vector.
    activity_regularizer: Regularizer function applied to the output of the
      layer (its "activation").
    kernel_constraint: Constraint function applied to the `kernel` weights
      matrix.
    bias_constraint: Constraint function applied to the bias vector.

  Input shape:
    N-D tensor with shape: `(batch_size, ..., input_dim)`. The most common
    situation would be a 2D input with shape `(batch_size, input_dim)`.

  Output shape:
    N-D tensor with shape: `(batch_size, ..., units)`. For instance, for a 2D
    input with shape `(batch_size, input_dim)`, the output would have shape
    `(batch_size, units)`.
  """

  @deprecation.deprecated(None, "DenseEinsum is deprecated. Please use "
                          "tf.keras.experimental.EinsumDense layer instead.")
  def __init__(self,
               output_shape,
               num_summed_dimensions=1,
               activation=None,
               use_bias=True,
               kernel_initializer="glorot_uniform",
               bias_initializer="zeros",
               kernel_regularizer=None,
               bias_regularizer=None,
               activity_regularizer=None,
               kernel_constraint=None,
               bias_constraint=None,
               **kwargs):
    # Bug fix: forward `activity_regularizer` to the base Layer. Previously
    # the argument was accepted but neither stored nor forwarded, so it was
    # silently ignored and `get_config` always serialized the base-class
    # default (None).
    super(DenseEinsum, self).__init__(
        activity_regularizer=tf.keras.regularizers.get(activity_regularizer),
        **kwargs)
    # Normalize scalar output shapes to a 1-tuple so the rest of the layer can
    # treat `output_shape` uniformly as a sequence.
    self._output_shape = output_shape if isinstance(
        output_shape, (list, tuple)) else (output_shape,)
    self._activation = tf.keras.activations.get(activation)
    self._use_bias = use_bias
    self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
    self._bias_initializer = tf.keras.initializers.get(bias_initializer)
    self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
    self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer)
    self._kernel_constraint = tf.keras.constraints.get(kernel_constraint)
    self._bias_constraint = tf.keras.constraints.get(bias_constraint)
    self._num_summed_dimensions = num_summed_dimensions
    # Computed in build() once the input rank is known.
    self._einsum_string = None

  def _build_einsum_string(self, free_input_dims, bound_dims, output_dims):
    """Builds an einsum equation like "abc,cde->abde".

    Free input dims appear in both the input and output terms, bound dims are
    shared between the input and kernel (and summed over), and output dims are
    shared between the kernel and output.
    """
    input_str = ""
    kernel_str = ""
    output_str = ""
    letter_offset = 0
    for i in range(free_input_dims):
      char = _CHR_IDX[i + letter_offset]
      input_str += char
      output_str += char

    letter_offset += free_input_dims
    for i in range(bound_dims):
      char = _CHR_IDX[i + letter_offset]
      input_str += char
      kernel_str += char

    letter_offset += bound_dims
    for i in range(output_dims):
      char = _CHR_IDX[i + letter_offset]
      kernel_str += char
      output_str += char

    return input_str + "," + kernel_str + "->" + output_str

  def build(self, input_shape):
    """Creates the kernel (and optional bias) from the concrete input shape."""
    input_shape = tf.TensorShape(input_shape)
    input_rank = input_shape.rank
    free_input_dims = input_rank - self._num_summed_dimensions
    output_dims = len(self._output_shape)

    self._einsum_string = self._build_einsum_string(free_input_dims,
                                                    self._num_summed_dimensions,
                                                    output_dims)

    # This is only saved for testing purposes.
    self._kernel_shape = (
        input_shape[free_input_dims:].concatenate(self._output_shape))

    self._kernel = self.add_weight(
        "kernel",
        shape=self._kernel_shape,
        initializer=self._kernel_initializer,
        regularizer=self._kernel_regularizer,
        constraint=self._kernel_constraint,
        dtype=self.dtype,
        trainable=True)
    if self._use_bias:
      self._bias = self.add_weight(
          "bias",
          shape=self._output_shape,
          initializer=self._bias_initializer,
          regularizer=self._bias_regularizer,
          constraint=self._bias_constraint,
          dtype=self.dtype,
          trainable=True)
    else:
      self._bias = None
    super(DenseEinsum, self).build(input_shape)

  def get_config(self):
    """Returns the config of the layer for serialization."""
    config = {
        "output_shape":
            self._output_shape,
        "num_summed_dimensions":
            self._num_summed_dimensions,
        "activation":
            tf.keras.activations.serialize(self._activation),
        "use_bias":
            self._use_bias,
        "kernel_initializer":
            tf.keras.initializers.serialize(self._kernel_initializer),
        "bias_initializer":
            tf.keras.initializers.serialize(self._bias_initializer),
        "kernel_regularizer":
            tf.keras.regularizers.serialize(self._kernel_regularizer),
        "bias_regularizer":
            tf.keras.regularizers.serialize(self._bias_regularizer),
        # `_activity_regularizer` is maintained by the base Layer; it now
        # reflects the constructor argument (see __init__).
        "activity_regularizer":
            tf.keras.regularizers.serialize(self._activity_regularizer),
        "kernel_constraint":
            tf.keras.constraints.serialize(self._kernel_constraint),
        "bias_constraint":
            tf.keras.constraints.serialize(self._bias_constraint)
    }
    base_config = super(DenseEinsum, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def call(self, inputs):
    """Applies the einsum contraction, then optional bias and activation."""
    ret = tf.einsum(self._einsum_string, inputs, self._kernel)
    if self._use_bias:
      ret += self._bias
    if self._activation is not None:
      ret = self._activation(ret)
    return ret
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for Keras-based einsum layer."""
import numpy as np
import tensorflow as tf
from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import
from official.nlp.modeling.layers import dense_einsum
# This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
# guarantees forward compatibility of this code for the V2 switchover.
@keras_parameterized.run_all_keras_modes
class DenseEinsumLayer(keras_parameterized.TestCase):
  """Tests for the deprecated DenseEinsum layer."""

  def test_3D_einsum_with_two_bound_dimensions(self):
    """Summing over two trailing dims produces an "abcd,cde->abe" equation."""
    test_layer = dense_einsum.DenseEinsum(
        output_shape=(64,), num_summed_dimensions=2)
    # Create a 4-dimensional input (the first dimension is implicit).
    input_tensor = tf.keras.Input(shape=(None, 40, 80))
    _ = test_layer(input_tensor)
    self.assertEqual(test_layer._einsum_string, "abcd,cde->abe")
    self.assertEqual(test_layer._kernel_shape, (40, 80, 64))

  def test_3D_einsum_with_one_bound_dimensions(self):
    """A multi-dim output shape expands the kernel/output einsum terms."""
    test_layer = dense_einsum.DenseEinsum(
        output_shape=(64, 32), num_summed_dimensions=1)
    # Create a 3-dimensional input (the first dimension is implicit).
    input_tensor = tf.keras.Input(shape=(None, 80))
    _ = test_layer(input_tensor)
    self.assertEqual(test_layer._einsum_string, "abc,cde->abde")
    self.assertEqual(test_layer._kernel_shape, (80, 64, 32))

  def test_2D_einsum_with_one_bound_dimensions(self):
    """Plain matmul case: one bound dim, one output dim."""
    test_layer = dense_einsum.DenseEinsum(
        output_shape=(64,), num_summed_dimensions=1)
    # Create a 3-dimensional input (the first dimension is implicit).
    input_tensor = tf.keras.Input(shape=(None, 80))
    _ = test_layer(input_tensor)
    self.assertEqual(test_layer._einsum_string, "abc,cd->abd")
    self.assertEqual(test_layer._kernel_shape, (80, 64))

  def test_bias_term_can_be_disabled(self):
    """`use_bias=False` removes the bias variable from the layer."""
    # A layer created using the bias should have two weights.
    test_layer = dense_einsum.DenseEinsum(
        output_shape=64, num_summed_dimensions=1, use_bias=True)
    input_tensor = tf.keras.Input(shape=(None, 80))
    _ = test_layer(input_tensor)
    self.assertEqual(2, len(test_layer.get_weights()))

    # A layer created without the bias should have only one weight.
    test_layer = dense_einsum.DenseEinsum(
        output_shape=64, num_summed_dimensions=1, use_bias=False)
    input_tensor = tf.keras.Input(shape=(None, 80))
    _ = test_layer(input_tensor)
    self.assertEqual(1, len(test_layer.get_weights()))

  def test_activation(self):
    """The activation argument must change the layer's output."""
    # Create a model that does not use an activation.
    no_activation_layer = dense_einsum.DenseEinsum(
        output_shape=64, num_summed_dimensions=1, activation=None)
    input_tensor = tf.keras.Input(shape=(None, 80))
    output_tensor = no_activation_layer(input_tensor)
    no_activation_model = tf.keras.Model(input_tensor, output_tensor)

    # Create a model that uses a softmax activation.
    activation_layer = dense_einsum.DenseEinsum(
        output_shape=64, num_summed_dimensions=1, activation="softmax")
    input_tensor = tf.keras.Input(shape=(None, 80))
    output_tensor = activation_layer(input_tensor)
    activation_model = tf.keras.Model(input_tensor, output_tensor)

    # Make sure the models' weights are identical.
    activation_model.set_weights(no_activation_model.get_weights())

    # Predict using each model on the same input data. The output should be
    # different, since one is using a softmax - even though the models' weights
    # are the same.
    input_values = 10 * np.random.random_sample((10, 4, 80))
    non_activated_data = no_activation_model.predict(input_values)
    activated_data = activation_model.predict(input_values)
    self.assertNotAllClose(activated_data, non_activated_data)

  def test_non_iterable_output_shape(self):
    """A bare int output_shape is normalized to a 1-tuple internally."""
    test_layer = dense_einsum.DenseEinsum(
        output_shape=64, num_summed_dimensions=1)
    # Create a 3-dimensional input (the first dimension is implicit).
    input_tensor = tf.keras.Input(shape=(None, 80))
    _ = test_layer(input_tensor)
    self.assertEqual(test_layer._einsum_string, "abc,cd->abd")
    self.assertEqual(test_layer._kernel_shape, (80, 64))

  def test_with_explicit_initializer(self):
    """An explicit initializer object is accepted in place of a string name."""
    test_layer = dense_einsum.DenseEinsum(
        output_shape=(64,),
        num_summed_dimensions=2,
        kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02))
    # Create a 4-dimensional input (the first dimension is implicit).
    input_tensor = tf.keras.Input(shape=(None, 40, 80))
    _ = test_layer(input_tensor)
    self.assertEqual(test_layer._einsum_string, "abcd,cde->abe")
    self.assertEqual(test_layer._kernel_shape, (40, 80, 64))
# Run the test suite when this file is executed as a script.
if __name__ == "__main__":
  tf.test.main()
...@@ -68,7 +68,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase): ...@@ -68,7 +68,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase):
# Invoke the model on test data. We can't validate the output data itself # Invoke the model on test data. We can't validate the output data itself
# (the NN is too complex) but this will rule out structural runtime errors. # (the NN is too complex) but this will rule out structural runtime errors.
batch_size = 6 batch_size = 6
input_data = 10 * np.random.random_sample( input_data = np.random.random_sample(
(batch_size, sequence_length, width)) (batch_size, sequence_length, width))
_ = model.predict(input_data) _ = model.predict(input_data)
...@@ -89,7 +89,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase): ...@@ -89,7 +89,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase):
# Invoke the model on test data. We can't validate the output data itself # Invoke the model on test data. We can't validate the output data itself
# (the NN is too complex) but this will rule out structural runtime errors. # (the NN is too complex) but this will rule out structural runtime errors.
batch_size = 6 batch_size = 6
input_data = 10 * np.random.random_sample( input_data = np.random.random_sample(
(batch_size, sequence_length, width)) (batch_size, sequence_length, width))
# The attention mask should be of shape (batch, from_seq_len, to_seq_len), # The attention mask should be of shape (batch, from_seq_len, to_seq_len),
# which here is (batch, sequence_length, sequence_length) # which here is (batch, sequence_length, sequence_length)
...@@ -104,7 +104,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase): ...@@ -104,7 +104,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase):
width = 80 width = 80
batch_size = 6 batch_size = 6
input_data = 10 * np.random.random_sample( input_data = np.random.random_sample(
(batch_size, sequence_length, width)) (batch_size, sequence_length, width))
mask_data = np.random.randint( mask_data = np.random.randint(
2, size=(batch_size, sequence_length, sequence_length)) 2, size=(batch_size, sequence_length, sequence_length))
...@@ -121,7 +121,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase): ...@@ -121,7 +121,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase):
new_layer.set_weights(test_layer.get_weights()) new_layer.set_weights(test_layer.get_weights())
new_output_tensor, _ = new_layer([input_data, mask_data]) new_output_tensor, _ = new_layer([input_data, mask_data])
self.assertAllClose( self.assertAllClose(
new_output_tensor, output_tensor[:, 0:1, :], atol=5e-5, rtol=0.003) new_output_tensor, output_tensor[:, 0:1, :], atol=0.002, rtol=0.01)
def test_layer_output_range_with_relative_pe(self, transformer_cls): def test_layer_output_range_with_relative_pe(self, transformer_cls):
test_layer = transformer_cls( test_layer = transformer_cls(
...@@ -131,7 +131,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase): ...@@ -131,7 +131,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase):
width = 80 width = 80
batch_size = 6 batch_size = 6
input_data = 10 * np.random.random_sample( input_data = np.random.random_sample(
(batch_size, sequence_length, width)) (batch_size, sequence_length, width))
mask_data = np.random.randint( mask_data = np.random.randint(
2, size=(batch_size, sequence_length, sequence_length)) 2, size=(batch_size, sequence_length, sequence_length))
...@@ -149,7 +149,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase): ...@@ -149,7 +149,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase):
new_layer.set_weights(test_layer.get_weights()) new_layer.set_weights(test_layer.get_weights())
new_output_tensor, _ = new_layer([input_data, mask_data]) new_output_tensor, _ = new_layer([input_data, mask_data])
self.assertAllClose( self.assertAllClose(
new_output_tensor, output_tensor[:, 0:1, :], atol=5e-5, rtol=0.003) new_output_tensor, output_tensor[:, 0:1, :], atol=0.002, rtol=0.01)
def test_layer_output_range_without_mask(self, transformer_cls): def test_layer_output_range_without_mask(self, transformer_cls):
test_layer = transformer_cls( test_layer = transformer_cls(
...@@ -159,7 +159,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase): ...@@ -159,7 +159,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase):
width = 80 width = 80
batch_size = 6 batch_size = 6
input_data = 10 * np.random.random_sample( input_data = np.random.random_sample(
(batch_size, sequence_length, width)) (batch_size, sequence_length, width))
output_tensor, _ = test_layer(input_data) output_tensor, _ = test_layer(input_data)
...@@ -175,7 +175,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase): ...@@ -175,7 +175,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase):
new_layer.set_weights(test_layer.get_weights()) new_layer.set_weights(test_layer.get_weights())
new_output_tensor, _ = new_layer(input_data) new_output_tensor, _ = new_layer(input_data)
self.assertAllClose( self.assertAllClose(
new_output_tensor, output_tensor[:, 0:1, :], atol=5e-5, rtol=0.003) new_output_tensor, output_tensor[:, 0:1, :], atol=0.002, rtol=0.01)
def test_layer_output_range_with_pre_norm(self, transformer_cls): def test_layer_output_range_with_pre_norm(self, transformer_cls):
test_layer = transformer_cls( test_layer = transformer_cls(
...@@ -185,7 +185,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase): ...@@ -185,7 +185,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase):
width = 80 width = 80
batch_size = 6 batch_size = 6
input_data = 10 * np.random.random_sample( input_data = np.random.random_sample(
(batch_size, sequence_length, width)) (batch_size, sequence_length, width))
mask_data = np.random.randint( mask_data = np.random.randint(
2, size=(batch_size, sequence_length, sequence_length)) 2, size=(batch_size, sequence_length, sequence_length))
...@@ -203,7 +203,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase): ...@@ -203,7 +203,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase):
new_layer.set_weights(test_layer.get_weights()) new_layer.set_weights(test_layer.get_weights())
new_output_tensor, _ = new_layer([input_data, mask_data]) new_output_tensor, _ = new_layer([input_data, mask_data])
self.assertAllClose( self.assertAllClose(
new_output_tensor, output_tensor[:, 0:1, :], atol=5e-5, rtol=0.003) new_output_tensor, output_tensor[:, 0:1, :], atol=0.002, rtol=0.01)
def test_layer_invocation_with_float16_dtype(self, transformer_cls): def test_layer_invocation_with_float16_dtype(self, transformer_cls):
tf.keras.mixed_precision.set_global_policy('mixed_float16') tf.keras.mixed_precision.set_global_policy('mixed_float16')
...@@ -223,7 +223,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase): ...@@ -223,7 +223,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase):
# Invoke the model on test data. We can't validate the output data itself # Invoke the model on test data. We can't validate the output data itself
# (the NN is too complex) but this will rule out structural runtime errors. # (the NN is too complex) but this will rule out structural runtime errors.
batch_size = 6 batch_size = 6
input_data = (10 * np.random.random_sample( input_data = (np.random.random_sample(
(batch_size, sequence_length, width))) (batch_size, sequence_length, width)))
# The attention mask should be of shape (batch, from_seq_len, to_seq_len), # The attention mask should be of shape (batch, from_seq_len, to_seq_len),
# which here is (batch, sequence_length, sequence_length) # which here is (batch, sequence_length, sequence_length)
...@@ -368,7 +368,7 @@ class ReuseTransformerArgumentTest(tf.test.TestCase, parameterized.TestCase): ...@@ -368,7 +368,7 @@ class ReuseTransformerArgumentTest(tf.test.TestCase, parameterized.TestCase):
# Invoke the model on test data. We can't validate the output data itself # Invoke the model on test data. We can't validate the output data itself
# (the NN is too complex) but this will rule out structural runtime errors. # (the NN is too complex) but this will rule out structural runtime errors.
batch_size = 6 batch_size = 6
input_data = 10 * np.random.random_sample( input_data = np.random.random_sample(
(batch_size, sequence_length, width)) (batch_size, sequence_length, width))
# The attention mask should be of shape (batch, from_seq_len, to_seq_len), # The attention mask should be of shape (batch, from_seq_len, to_seq_len),
# which here is (batch, sequence_length, sequence_length) # which here is (batch, sequence_length, sequence_length)
...@@ -404,7 +404,7 @@ class ReuseTransformerArgumentTest(tf.test.TestCase, parameterized.TestCase): ...@@ -404,7 +404,7 @@ class ReuseTransformerArgumentTest(tf.test.TestCase, parameterized.TestCase):
# Invoke the model on test data. We can't validate the output data itself # Invoke the model on test data. We can't validate the output data itself
# (the NN is too complex) but this will rule out structural runtime errors. # (the NN is too complex) but this will rule out structural runtime errors.
batch_size = 6 batch_size = 6
input_data = (10 * np.random.random_sample( input_data = (np.random.random_sample(
(batch_size, sequence_length, width))) (batch_size, sequence_length, width)))
# The attention mask should be of shape (batch, from_seq_len, to_seq_len), # The attention mask should be of shape (batch, from_seq_len, to_seq_len),
# which here is (batch, sequence_length, sequence_length) # which here is (batch, sequence_length, sequence_length)
......
...@@ -18,6 +18,8 @@ ...@@ -18,6 +18,8 @@
import gin import gin
import tensorflow as tf import tensorflow as tf
from official.nlp.modeling.layers import util
@tf.keras.utils.register_keras_serializable(package="Text") @tf.keras.utils.register_keras_serializable(package="Text")
@gin.configurable @gin.configurable
...@@ -45,6 +47,7 @@ class ReZeroTransformer(tf.keras.layers.Layer): ...@@ -45,6 +47,7 @@ class ReZeroTransformer(tf.keras.layers.Layer):
kernel_constraint: Constraint for dense layer kernels. kernel_constraint: Constraint for dense layer kernels.
bias_constraint: Constraint for dense layer kernels. bias_constraint: Constraint for dense layer kernels.
use_layer_norm: If add layer_norm on top of the ReZero. use_layer_norm: If add layer_norm on top of the ReZero.
share_rezero: If attention layer and FFN layer share the same alpha.
""" """
def __init__(self, def __init__(self,
...@@ -62,7 +65,14 @@ class ReZeroTransformer(tf.keras.layers.Layer): ...@@ -62,7 +65,14 @@ class ReZeroTransformer(tf.keras.layers.Layer):
kernel_constraint=None, kernel_constraint=None,
bias_constraint=None, bias_constraint=None,
use_layer_norm=False, use_layer_norm=False,
share_rezero=True,
**kwargs): **kwargs):
# attention_dropout will override attention_dropout_rate.
# This is to unify the input params with TransformerEncoderBlock.
attention_dropout_rate = kwargs.pop("attention_dropout",
attention_dropout_rate)
dropout_rate = kwargs.pop("output_dropout", dropout_rate)
util.filter_kwargs(kwargs)
super(ReZeroTransformer, self).__init__(**kwargs) super(ReZeroTransformer, self).__init__(**kwargs)
self._num_heads = num_attention_heads self._num_heads = num_attention_heads
...@@ -78,10 +88,18 @@ class ReZeroTransformer(tf.keras.layers.Layer): ...@@ -78,10 +88,18 @@ class ReZeroTransformer(tf.keras.layers.Layer):
self._kernel_constraint = tf.keras.constraints.get(kernel_constraint) self._kernel_constraint = tf.keras.constraints.get(kernel_constraint)
self._bias_constraint = tf.keras.constraints.get(bias_constraint) self._bias_constraint = tf.keras.constraints.get(bias_constraint)
self._use_layer_norm = use_layer_norm self._use_layer_norm = use_layer_norm
self._share_rezero = share_rezero
def build(self, input_shape): def build(self, input_shape):
input_tensor = input_shape[0] if len(input_shape) == 2 else input_shape if isinstance(input_shape, tf.TensorShape):
input_tensor_shape = tf.TensorShape(input_tensor) input_tensor_shape = input_shape
elif isinstance(input_shape, (list, tuple)):
input_tensor_shape = tf.TensorShape(input_shape[0])
else:
raise ValueError(
"The type of input shape argument is not supported, got: %s" %
type(input_shape))
if len(input_tensor_shape.as_list()) != 3: if len(input_tensor_shape.as_list()) != 3:
raise ValueError("TransformerLayer expects a three-dimensional input of " raise ValueError("TransformerLayer expects a three-dimensional input of "
"shape [batch, sequence, width].") "shape [batch, sequence, width].")
...@@ -158,6 +176,15 @@ class ReZeroTransformer(tf.keras.layers.Layer): ...@@ -158,6 +176,15 @@ class ReZeroTransformer(tf.keras.layers.Layer):
trainable=True, trainable=True,
dtype=tf.float32) dtype=tf.float32)
if self._share_rezero:
self._rezero_a_ffn = self._rezero_a
else:
self._rezero_a_ffn = self.add_weight(
name="rezero_alpha_ffn",
initializer=tf.keras.initializers.Zeros(),
trainable=True,
dtype=tf.float32)
super(ReZeroTransformer, self).build(input_shape) super(ReZeroTransformer, self).build(input_shape)
def get_config(self): def get_config(self):
...@@ -176,6 +203,8 @@ class ReZeroTransformer(tf.keras.layers.Layer): ...@@ -176,6 +203,8 @@ class ReZeroTransformer(tf.keras.layers.Layer):
self._output_range, self._output_range,
"use_layer_norm": "use_layer_norm":
self._use_layer_norm, self._use_layer_norm,
"share_rezero":
self._share_rezero,
"kernel_initializer": "kernel_initializer":
tf.keras.initializers.serialize(self._kernel_initializer), tf.keras.initializers.serialize(self._kernel_initializer),
"bias_initializer": "bias_initializer":
...@@ -196,21 +225,34 @@ class ReZeroTransformer(tf.keras.layers.Layer): ...@@ -196,21 +225,34 @@ class ReZeroTransformer(tf.keras.layers.Layer):
def reset_rezero(self): def reset_rezero(self):
self._rezero_a.assign(0.) self._rezero_a.assign(0.)
if not self._share_rezero:
self._rezero_a_ffn.assign(0.)
def call(self, inputs): def call(self, inputs):
if isinstance(inputs, (list, tuple)) and len(inputs) == 2: if isinstance(inputs, (list, tuple)):
input_tensor, attention_mask = inputs if len(inputs) == 2:
input_tensor, attention_mask = inputs
key_value = None
elif len(inputs) == 3:
input_tensor, key_value, attention_mask = inputs
else:
raise ValueError("Unexpected inputs to %s with length at %d" %
(self.__class__, len(inputs)))
else: else:
input_tensor, attention_mask = (inputs, None) input_tensor, key_value, attention_mask = (inputs, None, None)
if self._output_range: if self._output_range:
target_tensor = input_tensor[:, 0:self._output_range, :] target_tensor = input_tensor[:, 0:self._output_range, :]
attention_mask = attention_mask[:, 0:self._output_range, :] if attention_mask is not None:
attention_mask = attention_mask[:, 0:self._output_range, :]
else: else:
target_tensor = input_tensor target_tensor = input_tensor
if key_value is None:
key_value = input_tensor
attention_output = self._attention_layer( attention_output = self._attention_layer(
query=target_tensor, value=input_tensor, attention_mask=attention_mask) query=target_tensor, value=key_value, attention_mask=attention_mask)
attention_output = self._attention_dropout(attention_output) attention_output = self._attention_dropout(attention_output)
attention_output = target_tensor + self._rezero_a * attention_output attention_output = target_tensor + self._rezero_a * attention_output
if self._use_layer_norm: if self._use_layer_norm:
...@@ -225,7 +267,7 @@ class ReZeroTransformer(tf.keras.layers.Layer): ...@@ -225,7 +267,7 @@ class ReZeroTransformer(tf.keras.layers.Layer):
layer_output = self._output_dropout(layer_output) layer_output = self._output_dropout(layer_output)
# During mixed precision training, attention_output is from layer norm and # During mixed precision training, attention_output is from layer norm and
# is always fp32 for now. Cast layer_output to fp32 for the subsequent add. # is always fp32 for now. Cast layer_output to fp32 for the subsequent add.
layer_output = attention_output + tf.cast(self._rezero_a * layer_output, layer_output = attention_output + tf.cast(self._rezero_a_ffn * layer_output,
tf.float32) tf.float32)
if self._use_layer_norm: if self._use_layer_norm:
layer_output = self._output_layer_norm(layer_output) layer_output = self._output_layer_norm(layer_output)
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
"""Tests for Keras-based rezero-transformer block layer.""" """Tests for Keras-based rezero-transformer block layer."""
from absl.testing import parameterized
import numpy as np import numpy as np
import tensorflow as tf import tensorflow as tf
...@@ -30,12 +31,15 @@ class TransformerWithReZeroLayerTest(keras_parameterized.TestCase): ...@@ -30,12 +31,15 @@ class TransformerWithReZeroLayerTest(keras_parameterized.TestCase):
super(TransformerWithReZeroLayerTest, self).tearDown() super(TransformerWithReZeroLayerTest, self).tearDown()
tf.keras.mixed_precision.set_global_policy('float32') tf.keras.mixed_precision.set_global_policy('float32')
def test_layer_invocation_with_float16_dtype(self): @parameterized.named_parameters(('no_share_attn_ffn', False),
('share_attn_ffn', True))
def test_layer_invocation_with_float16_dtype(self, share_rezero):
tf.keras.mixed_precision.set_global_policy('mixed_float16') tf.keras.mixed_precision.set_global_policy('mixed_float16')
test_layer = rezero_transformer.ReZeroTransformer( test_layer = rezero_transformer.ReZeroTransformer(
num_attention_heads=10, num_attention_heads=10,
intermediate_size=2048, intermediate_size=2048,
intermediate_activation='relu') intermediate_activation='relu',
share_rezero=share_rezero)
sequence_length = 21 sequence_length = 21
width = 80 width = 80
# Create a 3-dimensional input (the first dimension is implicit). # Create a 3-dimensional input (the first dimension is implicit).
...@@ -124,6 +128,20 @@ class TransformerWithReZeroLayerTest(keras_parameterized.TestCase): ...@@ -124,6 +128,20 @@ class TransformerWithReZeroLayerTest(keras_parameterized.TestCase):
new_output_tensor = new_layer([input_data, mask_data]) new_output_tensor = new_layer([input_data, mask_data])
self.assertAllClose(new_output_tensor, output_tensor[:, 0:1, :]) self.assertAllClose(new_output_tensor, output_tensor[:, 0:1, :])
  def test_separate_qkv(self):
    """Checks that a 3-tuple input [query, key/value, mask] is accepted."""
    test_layer = rezero_transformer.ReZeroTransformer(
        num_attention_heads=2,
        intermediate_size=128,
        intermediate_activation='relu',
        kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02))
    # Forward path. Query and key/value deliberately have different sequence
    # lengths (4 vs. 8); the output must follow the query's shape.
    q_tensor = tf.zeros([2, 4, 16], dtype=tf.float32)
    kv_tensor = tf.zeros([2, 8, 16], dtype=tf.float32)
    dummy_mask = tf.zeros([2, 4, 8], dtype=tf.float32)
    inputs = [q_tensor, kv_tensor, dummy_mask]
    output = test_layer(inputs)
    self.assertEqual(output.shape, q_tensor.shape)
if __name__ == '__main__': if __name__ == '__main__':
tf.test.main() tf.test.main()
...@@ -13,18 +13,22 @@ ...@@ -13,18 +13,22 @@
# limitations under the License. # limitations under the License.
"""Keras Layers for BERT-specific preprocessing.""" """Keras Layers for BERT-specific preprocessing."""
# pylint: disable=g-import-not-at-top
from typing import Any, Dict, List, Optional, Union from typing import Any, Dict, List, Optional, Union
from absl import logging from absl import logging
import tensorflow as tf import tensorflow as tf
try: try:
import tensorflow_text as text # pylint: disable=g-import-not-at-top import tensorflow_text as text
from tensorflow_text.python.ops import bert_tokenizer
except ImportError: except ImportError:
text = None text = None
bert_tokenizer = None
except tf.errors.NotFoundError as e: except tf.errors.NotFoundError as e:
logging.warn("Encountered error when importing tensorflow_text: %s", e) logging.warn("Encountered error when importing tensorflow_text: %s", e)
text = None text = None
bert_tokenizer = None
def _check_if_tf_text_installed(): def _check_if_tf_text_installed():
...@@ -587,3 +591,139 @@ class BertPackInputs(tf.keras.layers.Layer): ...@@ -587,3 +591,139 @@ class BertPackInputs(tf.keras.layers.Layer):
return dict(input_word_ids=_reshape(input_word_ids), return dict(input_word_ids=_reshape(input_word_ids),
input_mask=_reshape(input_mask), input_mask=_reshape(input_mask),
input_type_ids=_reshape(input_type_ids)) input_type_ids=_reshape(input_type_ids))
class FastWordpieceBertTokenizer(tf.keras.layers.Layer):
  """A bert tokenizer keras layer using text.FastWordpieceTokenizer.

  See details: "Fast WordPiece Tokenization" (https://arxiv.org/abs/2012.15524)
  """

  def __init__(self,
               *,
               vocab_file: str,
               lower_case: bool,
               tokenize_with_offsets: bool = False,
               **kwargs):
    """Initializes a FastWordpieceBertTokenizer layer.

    Args:
      vocab_file: A Python string with the path of the vocabulary file. This is
        a text file with newline-separated wordpiece tokens. This layer loads
        a list of tokens from it to create text.FastWordpieceTokenizer.
      lower_case: A Python boolean forwarded to text.BasicTokenizer. If true,
        input text is converted to lower case (where applicable) before
        tokenization. This must be set to match the way in which the vocab_file
        was created.
      tokenize_with_offsets: A Python boolean. If true, this layer calls
        FastWordpieceTokenizer.tokenize_with_offsets() instead of plain
        .tokenize() and outputs a triple of (tokens, start_offsets,
        limit_offsets) instead of just tokens.
      **kwargs: standard arguments to Layer().
    """
    super().__init__(**kwargs)
    logging.info("Initialize a FastWordpieceBertTokenizer.")
    self.tokenize_with_offsets = tokenize_with_offsets
    # Basic tokenization (whitespace/punctuation split) runs before wordpiece.
    self._basic_tokenizer = bert_tokenizer.BasicTokenizer(lower_case=lower_case)
    # Read the vocab file into a list of tokens to create `fast_wp_tokenizer`.
    self._vocab = [line.rstrip() for line in tf.io.gfile.GFile(vocab_file)]
    self._fast_wp_tokenizer = text.FastWordpieceTokenizer(
        vocab=self._vocab, token_out_type=tf.int32, no_pretokenization=True)
    self._special_tokens_dict = self._create_special_tokens_dict()

  @property
  def vocab_size(self):
    # One wordpiece per vocab-file line, so the list length is the vocab size.
    return len(self._vocab)

  def get_config(self):
    # Skip in tf.saved_model.save(); fail if called directly.
    # We cannot just put the original, user-supplied vocab file name into
    # the config, because the path has to change as the SavedModel is copied
    # around.
    raise NotImplementedError("Not implemented yet.")

  def get_special_tokens_dict(self):
    """Returns dict of token ids, keyed by standard names for their purpose.

    Returns:
      A dict from Python strings to Python integers. Each key is a standard
      name for a special token describing its use. (For example, "padding_id"
      is what BERT traditionally calls "[PAD]" but others may call "<pad>".)
      The corresponding value is the integer token id. If a special token
      is not found, its entry is omitted from the dict.

      The supported keys and tokens are:
        * start_of_sequence_id: looked up from "[CLS]"
        * end_of_segment_id: looked up from "[SEP]"
        * padding_id: looked up from "[PAD]"
        * mask_id: looked up from "[MASK]"
        * vocab_size: one past the largest token id used
    """
    return self._special_tokens_dict

  def _create_special_tokens_dict(self):
    """Creates dict of token ids, keyed by standard names for their purpose."""
    special_tokens = {"vocab_size": self.vocab_size}

    def add_special_token(key, token):
      # The token id is simply the token's position in the vocab list.
      try:
        token_id = self._vocab.index(token)
        special_tokens[key] = token_id
      except ValueError:
        # Similar as nlp.modeling.layers.BertTokenizer, if a special token
        # is not found, its entry is omitted from the dict.
        logging.warning("Could not find %s as token \"%s\" in vocab file", key,
                        token)

    add_special_token("start_of_sequence_id", "[CLS]")
    add_special_token("end_of_segment_id", "[SEP]")
    add_special_token("padding_id", "[PAD]")
    add_special_token("mask_id", "[MASK]")
    return special_tokens

  def _tokenize_with_offsets(self, text_input: tf.Tensor):
    # Wordpiece offsets are relative to each basic-tokenized word, so add the
    # word's begin offset to map wordpiece offsets back into the input string.
    tokens, begin, _ = self._basic_tokenizer.tokenize_with_offsets(text_input)
    wordpieces, wp_begin, wp_end = (
        self._fast_wp_tokenizer.tokenize_with_offsets(tokens))
    begin_expanded = tf.expand_dims(begin, axis=2)
    final_begin = begin_expanded + wp_begin
    final_end = begin_expanded + wp_end
    return wordpieces, final_begin, final_end

  def _tokenize(self, text_input: tf.Tensor):
    # Basic tokenization into words, then wordpiece on each word.
    tokens = self._basic_tokenizer.tokenize(text_input)
    return self._fast_wp_tokenizer.tokenize(tokens)

  def call(self, inputs: tf.Tensor):
    """Calls text.BertTokenizer on inputs.

    Args:
      inputs: A string Tensor of shape [batch_size].

    Returns:
      One or three of RaggedTensors if tokenize_with_offsets is False or True,
      respectively. These are
      tokens: A RaggedTensor of shape [batch_size, (words), (pieces_per_word)]
        and type int32. tokens[i,j,k] contains the k-th wordpiece of the
        j-th word in the i-th input.
      start_offsets, limit_offsets: If tokenize_with_offsets is True,
        RaggedTensors of type int64 with the same indices as tokens.
        Element [i,j,k] contains the byte offset at the start, or past the
        end, resp., for the k-th wordpiece of the j-th word in the i-th input.
    """
    # Prepare to reshape the result to work around broken shape inference.
    batch_size = tf.shape(inputs)[0]

    def _reshape(rt):
      # Rebuild the RaggedTensor with a statically reshaped row_splits so
      # downstream Keras layers see a known outer dimension.
      values = rt.values
      row_splits = rt.row_splits
      row_splits = tf.reshape(row_splits, [batch_size + 1])
      return tf.RaggedTensor.from_row_splits(values, row_splits)

    if self.tokenize_with_offsets:
      tokens, start_offsets, limit_offsets = self._tokenize_with_offsets(inputs)
      return _reshape(tokens), _reshape(start_offsets), _reshape(limit_offsets)
    else:
      tokens = self._tokenize(inputs)
      return _reshape(tokens)
...@@ -442,5 +442,109 @@ class BertPackInputsTest(tf.test.TestCase): ...@@ -442,5 +442,109 @@ class BertPackInputsTest(tf.test.TestCase):
[1001, 21, 22, 23, 24, 25, 26, 27, 28, 1002]])) [1001, 21, 22, 23, 24, 25, 26, 27, 28, 1002]]))
# This test covers the in-process behavior of FastWordpieceBertTokenizer layer.
class FastWordPieceBertTokenizerTest(tf.test.TestCase):

  def _make_vocab_file(self, vocab, filename="vocab.txt"):
    """Writes `vocab` (one token per line) to a fresh temp file; returns path."""
    path = os.path.join(
        tempfile.mkdtemp(dir=self.get_temp_dir()),  # New subdir each time.
        filename)
    with tf.io.gfile.GFile(path, "w") as f:
      f.write("\n".join(vocab + [""]))
    return path

  def test_uncased(self):
    vocab_file = self._make_vocab_file(
        ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "d", "##ef", "abc", "xy"])
    bert_tokenize = text_layers.FastWordpieceBertTokenizer(
        vocab_file=vocab_file, lower_case=True)
    inputs = tf.constant(["abc def", "ABC DEF d"])
    token_ids = bert_tokenize(inputs)
    self.assertAllEqual(token_ids, tf.ragged.constant([[[6], [4, 5]],
                                                       [[6], [4, 5], [4]]]))
    # Flipping tokenize_with_offsets after construction also switches call().
    bert_tokenize.tokenize_with_offsets = True
    token_ids_2, start_offsets, limit_offsets = bert_tokenize(inputs)
    self.assertAllEqual(token_ids, token_ids_2)
    self.assertAllEqual(start_offsets, tf.ragged.constant([[[0], [4, 5]],
                                                           [[0], [4, 5], [8]]]))
    self.assertAllEqual(limit_offsets, tf.ragged.constant([[[3], [5, 7]],
                                                           [[3], [5, 7], [9]]]))
    self.assertEqual(bert_tokenize.vocab_size, 8)

  # Repeat the above and test that case matters with lower_case=False.
  def test_cased(self):
    vocab_file = self._make_vocab_file(
        ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "d", "##ef", "abc", "ABC"])
    bert_tokenize = text_layers.FastWordpieceBertTokenizer(
        vocab_file=vocab_file, lower_case=False, tokenize_with_offsets=True)
    inputs = tf.constant(["abc def", "ABC DEF"])
    token_ids, start_offsets, limit_offsets = bert_tokenize(inputs)
    self.assertAllEqual(token_ids, tf.ragged.constant([[[6], [4, 5]],
                                                       [[7], [1]]]))
    self.assertAllEqual(start_offsets, tf.ragged.constant([[[0], [4, 5]],
                                                           [[0], [4]]]))
    self.assertAllEqual(limit_offsets, tf.ragged.constant([[[3], [5, 7]],
                                                           [[3], [7]]]))

  def test_special_tokens_complete(self):
    vocab_file = self._make_vocab_file(
        ["foo", "[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "xy"])
    bert_tokenize = text_layers.FastWordpieceBertTokenizer(
        vocab_file=vocab_file, lower_case=True)
    self.assertDictEqual(bert_tokenize.get_special_tokens_dict(),
                         dict(padding_id=1,
                              start_of_sequence_id=3,
                              end_of_segment_id=4,
                              mask_id=5,
                              vocab_size=7))

  def test_special_tokens_partial(self):
    # [UNK] token is required by fast wordpiece tokenizer.
    vocab_file = self._make_vocab_file(
        ["[PAD]", "[CLS]", "[SEP]", "[UNK]"])
    bert_tokenize = text_layers.FastWordpieceBertTokenizer(
        vocab_file=vocab_file, lower_case=True)
    self.assertDictEqual(bert_tokenize.get_special_tokens_dict(),
                         dict(padding_id=0,
                              start_of_sequence_id=1,
                              end_of_segment_id=2,
                              vocab_size=4))  # No mask_id.

  def test_special_tokens_in_estimator(self):
    """Tests getting special tokens without an Eager init context."""
    vocab_file = self._make_vocab_file(
        ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "d", "##ef", "abc", "xy"])

    def input_fn():
      # Estimator input_fns run in graph mode; verify that here.
      with tf.init_scope():
        self.assertFalse(tf.executing_eagerly())
      # Build a preprocessing Model.
      sentences = tf.keras.layers.Input(shape=[], dtype=tf.string)
      bert_tokenizer = text_layers.FastWordpieceBertTokenizer(
          vocab_file=vocab_file, lower_case=True)
      special_tokens_dict = bert_tokenizer.get_special_tokens_dict()
      for k, v in special_tokens_dict.items():
        self.assertIsInstance(v, int, "Unexpected type for {}".format(k))
      tokens = bert_tokenizer(sentences)
      packed_inputs = text_layers.BertPackInputs(
          4, special_tokens_dict=special_tokens_dict)(tokens)
      preprocessing = tf.keras.Model(sentences, packed_inputs)
      # Map the dataset.
      ds = tf.data.Dataset.from_tensors(
          (tf.constant(["abc", "DEF"]), tf.constant([0, 1])))
      ds = ds.map(lambda features, labels: (preprocessing(features), labels))
      return ds

    def model_fn(features, labels, mode):
      del labels  # Unused.
      return tf.estimator.EstimatorSpec(mode=mode,
                                        predictions=features["input_word_ids"])

    estimator = tf.estimator.Estimator(model_fn=model_fn)
    outputs = list(estimator.predict(input_fn))
    self.assertAllEqual(outputs, np.array([[2, 6, 3, 0],
                                           [2, 4, 5, 3]]))
if __name__ == "__main__": if __name__ == "__main__":
tf.test.main() tf.test.main()
...@@ -16,6 +16,8 @@ ...@@ -16,6 +16,8 @@
import tensorflow as tf import tensorflow as tf
from official.nlp.modeling.layers import util
@tf.keras.utils.register_keras_serializable(package="Text") @tf.keras.utils.register_keras_serializable(package="Text")
class TransformerEncoderBlock(tf.keras.layers.Layer): class TransformerEncoderBlock(tf.keras.layers.Layer):
...@@ -86,8 +88,9 @@ class TransformerEncoderBlock(tf.keras.layers.Layer): ...@@ -86,8 +88,9 @@ class TransformerEncoderBlock(tf.keras.layers.Layer):
kernel. kernel.
attention_axes: axes over which the attention is applied. `None` means attention_axes: axes over which the attention is applied. `None` means
attention over all axes, but batch, heads, and features. attention over all axes, but batch, heads, and features.
**kwargs: keyword arguments/ **kwargs: keyword arguments.
""" """
util.filter_kwargs(kwargs)
super().__init__(**kwargs) super().__init__(**kwargs)
self._num_heads = num_attention_heads self._num_heads = num_attention_heads
......
...@@ -30,13 +30,13 @@ class TfFunctionIfEagerDecorator(object): ...@@ -30,13 +30,13 @@ class TfFunctionIfEagerDecorator(object):
@functools.wraps(func) @functools.wraps(func)
def wrapped_func(*args): def wrapped_func(*args):
# TODO(b/150147476, b/150024785): Fix tf.function in TF1 crash. # TODO(b/150147476, b/150024785): Fix tf.function in TF1 crash.
if not hasattr(tf.compat.v1, "executing_eagerly_outside_functions" if not hasattr(tf.compat.v1, 'executing_eagerly_outside_functions'
) or tf.compat.v1.executing_eagerly_outside_functions(): ) or tf.compat.v1.executing_eagerly_outside_functions():
return tf.function(func=func, **self.func_kwargs)(*args) return tf.function(func=func, **self.func_kwargs)(*args)
return func(*args) return func(*args)
# Cache the created function in self._call_impl. # Cache the created function in self._call_impl.
if not hasattr(self, "_call_impl"): if not hasattr(self, '_call_impl'):
self._call_impl = wrapped_func self._call_impl = wrapped_func
return self._call_impl return self._call_impl
...@@ -44,3 +44,29 @@ class TfFunctionIfEagerDecorator(object): ...@@ -44,3 +44,29 @@ class TfFunctionIfEagerDecorator(object):
def tf_function_if_eager(**kwargs): def tf_function_if_eager(**kwargs):
"""Applies the @tf.function decorator only if running in eager mode.""" """Applies the @tf.function decorator only if running in eager mode."""
return TfFunctionIfEagerDecorator(**kwargs) return TfFunctionIfEagerDecorator(**kwargs)
def filter_kwargs(kwargs):
  """Removes, in place, construction-signature options from `kwargs`.

  Strips constructor arguments such as `num_attention_heads` (from
  TransformerEncoderBlock) out of `kwargs` before they reach the Keras
  base layer, which would otherwise complain about unknown options.

  Args:
    kwargs: keyword arguments to be filtered (mutated in place).
  """
  # This is the union of the constructor signatures of TransformerEncoderBlock
  # and ReZeroTransformer. Every Transformer block with a compatible signature
  # should call this function before super().__init__(**kwargs).
  unsupported = frozenset((
      'num_attention_heads', 'intermediate_size', 'intermediate_activation',
      'inner_dim', 'inner_activation', 'output_range', 'kernel_initializer',
      'bias_initializer', 'kernel_regularizer', 'bias_regularizer',
      'activity_regularizer', 'kernel_constraint', 'bias_constraint',
      'use_bias', 'norm_first', 'norm_epsilon', 'output_dropout',
      'attention_dropout', 'inner_dropout', 'attention_initializer',
      'attention_axes', 'share_rezero'))
  for key in list(kwargs):
    if key in unsupported:
      del kwargs[key]
...@@ -260,11 +260,9 @@ class Seq2SeqTransformer(tf.keras.Model): ...@@ -260,11 +260,9 @@ class Seq2SeqTransformer(tf.keras.Model):
return {"outputs": top_decoded_ids, "scores": top_scores} return {"outputs": top_decoded_ids, "scores": top_scores}
decoder_inputs = self.embedding_lookup(targets)
embedding_mask = tf.cast(tf.not_equal(targets, 0), decoder_inputs.dtype)
decoder_inputs *= tf.expand_dims(embedding_mask, -1)
# Shift targets to the right, and remove the last element # Shift targets to the right, and remove the last element
decoder_inputs = tf.pad(decoder_inputs, [[0, 0], [1, 0], [0, 0]])[:, :-1, :] targets = tf.pad(targets, [[0, 0], [1, 0]])[:, :-1]
decoder_inputs = self.embedding_lookup(targets)
length = tf.shape(decoder_inputs)[1] length = tf.shape(decoder_inputs)[1]
pos_encoding = self.position_embedding(decoder_inputs) pos_encoding = self.position_embedding(decoder_inputs)
pos_encoding = tf.cast(pos_encoding, embedded_inputs.dtype) pos_encoding = tf.cast(pos_encoding, embedded_inputs.dtype)
...@@ -325,12 +323,7 @@ class Seq2SeqTransformer(tf.keras.Model): ...@@ -325,12 +323,7 @@ class Seq2SeqTransformer(tf.keras.Model):
decoder_input = ids[:, -1:] decoder_input = ids[:, -1:]
# Preprocess decoder input by getting embeddings and adding timing signal. # Preprocess decoder input by getting embeddings and adding timing signal.
# decoder_input = self.embedding_softmax_layer(decoder_input)
source_decoder_input = decoder_input
decoder_input = self.embedding_lookup(decoder_input) decoder_input = self.embedding_lookup(decoder_input)
embedding_mask = tf.cast(
tf.not_equal(source_decoder_input, 0), decoder_input.dtype)
decoder_input *= tf.expand_dims(embedding_mask, -1)
decoder_input += timing_signal[i] decoder_input += timing_signal[i]
if self._padded_decode: if self._padded_decode:
# indexing does not work on TPU. # indexing does not work on TPU.
......
...@@ -20,29 +20,30 @@ import numpy as np ...@@ -20,29 +20,30 @@ import numpy as np
import tensorflow as tf import tensorflow as tf
from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import
from official.nlp.modeling.networks import bert_dense_encoder from official.nlp.modeling.networks import bert_encoder
# This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It # This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
# guarantees forward compatibility of this code for the V2 switchover. # guarantees forward compatibility of this code for the V2 switchover.
@keras_parameterized.run_all_keras_modes @keras_parameterized.run_all_keras_modes
class BertDenseEncoderTest(keras_parameterized.TestCase): class BertEncoderV2Test(keras_parameterized.TestCase):
def tearDown(self): def tearDown(self):
super(BertDenseEncoderTest, self).tearDown() super(BertEncoderV2Test, self).tearDown()
tf.keras.mixed_precision.set_global_policy("float32") tf.keras.mixed_precision.set_global_policy("float32")
def test_dict_outputs_network_creation(self): def test_dict_outputs_network_creation(self):
hidden_size = 32 hidden_size = 32
sequence_length = 21 sequence_length = 21
dense_sequence_length = 20 dense_sequence_length = 20
# Create a small dense BertDenseEncoder for testing. # Create a small dense BertEncoderV2 for testing.
kwargs = {} kwargs = {}
test_network = bert_dense_encoder.BertDenseEncoder( test_network = bert_encoder.BertEncoderV2(
vocab_size=100, vocab_size=100,
hidden_size=hidden_size, hidden_size=hidden_size,
num_attention_heads=2, num_attention_heads=2,
num_layers=3, num_layers=3,
with_dense_inputs=True,
**kwargs) **kwargs)
# Create the inputs (note that the first dimension is implicit). # Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
...@@ -86,12 +87,13 @@ class BertDenseEncoderTest(keras_parameterized.TestCase): ...@@ -86,12 +87,13 @@ class BertDenseEncoderTest(keras_parameterized.TestCase):
sequence_length = 21 sequence_length = 21
dense_sequence_length = 20 dense_sequence_length = 20
# Create a small BertEncoder for testing. # Create a small BertEncoder for testing.
test_network = bert_dense_encoder.BertDenseEncoder( test_network = bert_encoder.BertEncoderV2(
vocab_size=100, vocab_size=100,
hidden_size=hidden_size, hidden_size=hidden_size,
num_attention_heads=2, num_attention_heads=2,
num_layers=3, num_layers=3,
dict_outputs=True) dict_outputs=True,
with_dense_inputs=True)
# Create the inputs (note that the first dimension is implicit). # Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
...@@ -134,12 +136,13 @@ class BertDenseEncoderTest(keras_parameterized.TestCase): ...@@ -134,12 +136,13 @@ class BertDenseEncoderTest(keras_parameterized.TestCase):
dense_sequence_length = 20 dense_sequence_length = 20
tf.keras.mixed_precision.set_global_policy("mixed_float16") tf.keras.mixed_precision.set_global_policy("mixed_float16")
# Create a small BertEncoder for testing. # Create a small BertEncoder for testing.
test_network = bert_dense_encoder.BertDenseEncoder( test_network = bert_encoder.BertEncoderV2(
vocab_size=100, vocab_size=100,
hidden_size=hidden_size, hidden_size=hidden_size,
num_attention_heads=2, num_attention_heads=2,
num_layers=3, num_layers=3,
dict_outputs=True) dict_outputs=True,
with_dense_inputs=True)
# Create the inputs (note that the first dimension is implicit). # Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
...@@ -176,9 +179,8 @@ class BertDenseEncoderTest(keras_parameterized.TestCase): ...@@ -176,9 +179,8 @@ class BertDenseEncoderTest(keras_parameterized.TestCase):
self.assertAllEqual(tf.float16, pooled.dtype) self.assertAllEqual(tf.float16, pooled.dtype)
@parameterized.named_parameters( @parameterized.named_parameters(
("all_sequence_encoder_v2", bert_dense_encoder.BertDenseEncoder, None, ("all_sequence_encoder_v2", bert_encoder.BertEncoderV2, None, 41),
41), ("output_range_encoder_v2", bert_encoder.BertEncoderV2, 1, 1),
("output_range_encoder_v2", bert_dense_encoder.BertDenseEncoder, 1, 1),
) )
def test_dict_outputs_network_invocation( def test_dict_outputs_network_invocation(
self, encoder_cls, output_range, out_seq_len): self, encoder_cls, output_range, out_seq_len):
...@@ -195,7 +197,8 @@ class BertDenseEncoderTest(keras_parameterized.TestCase): ...@@ -195,7 +197,8 @@ class BertDenseEncoderTest(keras_parameterized.TestCase):
num_layers=3, num_layers=3,
type_vocab_size=num_types, type_vocab_size=num_types,
output_range=output_range, output_range=output_range,
dict_outputs=True) dict_outputs=True,
with_dense_inputs=True)
# Create the inputs (note that the first dimension is implicit). # Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
...@@ -276,7 +279,7 @@ class BertDenseEncoderTest(keras_parameterized.TestCase): ...@@ -276,7 +279,7 @@ class BertDenseEncoderTest(keras_parameterized.TestCase):
# Creates a BertEncoder with embedding_width != hidden_size # Creates a BertEncoder with embedding_width != hidden_size
embedding_width = 16 embedding_width = 16
test_network = bert_dense_encoder.BertDenseEncoder( test_network = bert_encoder.BertEncoderV2(
vocab_size=vocab_size, vocab_size=vocab_size,
hidden_size=hidden_size, hidden_size=hidden_size,
max_sequence_length=max_sequence_length, max_sequence_length=max_sequence_length,
...@@ -316,11 +319,12 @@ class BertDenseEncoderTest(keras_parameterized.TestCase): ...@@ -316,11 +319,12 @@ class BertDenseEncoderTest(keras_parameterized.TestCase):
sequence_length = 21 sequence_length = 21
dense_sequence_length = 20 dense_sequence_length = 20
# Create a small BertEncoder for testing. # Create a small BertEncoder for testing.
test_network = bert_dense_encoder.BertDenseEncoder( test_network = bert_encoder.BertEncoderV2(
vocab_size=100, vocab_size=100,
hidden_size=hidden_size, hidden_size=hidden_size,
num_attention_heads=2, num_attention_heads=2,
num_layers=3) num_layers=3,
with_dense_inputs=True)
# Create the inputs (note that the first dimension is implicit). # Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length), dtype=tf.int32) word_ids = tf.keras.Input(shape=(sequence_length), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
......
...@@ -23,6 +23,8 @@ from official.nlp.modeling import layers ...@@ -23,6 +23,8 @@ from official.nlp.modeling import layers
_Initializer = Union[str, tf.keras.initializers.Initializer] _Initializer = Union[str, tf.keras.initializers.Initializer]
_Activation = Union[str, Callable[..., Any]]
_approx_gelu = lambda x: tf.keras.activations.gelu(x, approximate=True) _approx_gelu = lambda x: tf.keras.activations.gelu(x, approximate=True)
...@@ -72,6 +74,7 @@ class BertEncoderV2(tf.keras.layers.Layer): ...@@ -72,6 +74,7 @@ class BertEncoderV2(tf.keras.layers.Layer):
norm_first: Whether to normalize inputs to attention and intermediate dense norm_first: Whether to normalize inputs to attention and intermediate dense
layers. If set False, output of attention and intermediate dense layers is layers. If set False, output of attention and intermediate dense layers is
normalized. normalized.
with_dense_inputs: Whether to accept dense embeddings as the input.
""" """
def __init__( def __init__(
...@@ -83,7 +86,7 @@ class BertEncoderV2(tf.keras.layers.Layer): ...@@ -83,7 +86,7 @@ class BertEncoderV2(tf.keras.layers.Layer):
max_sequence_length: int = 512, max_sequence_length: int = 512,
type_vocab_size: int = 16, type_vocab_size: int = 16,
inner_dim: int = 3072, inner_dim: int = 3072,
inner_activation: Callable[..., Any] = _approx_gelu, inner_activation: _Activation = _approx_gelu,
output_dropout: float = 0.1, output_dropout: float = 0.1,
attention_dropout: float = 0.1, attention_dropout: float = 0.1,
initializer: _Initializer = tf.keras.initializers.TruncatedNormal( initializer: _Initializer = tf.keras.initializers.TruncatedNormal(
...@@ -92,6 +95,7 @@ class BertEncoderV2(tf.keras.layers.Layer): ...@@ -92,6 +95,7 @@ class BertEncoderV2(tf.keras.layers.Layer):
embedding_width: Optional[int] = None, embedding_width: Optional[int] = None,
embedding_layer: Optional[tf.keras.layers.Layer] = None, embedding_layer: Optional[tf.keras.layers.Layer] = None,
norm_first: bool = False, norm_first: bool = False,
with_dense_inputs: bool = False,
**kwargs): **kwargs):
# Pops kwargs that are used in V1 implementation. # Pops kwargs that are used in V1 implementation.
if 'dict_outputs' in kwargs: if 'dict_outputs' in kwargs:
...@@ -190,11 +194,23 @@ class BertEncoderV2(tf.keras.layers.Layer): ...@@ -190,11 +194,23 @@ class BertEncoderV2(tf.keras.layers.Layer):
'embedding_width': embedding_width, 'embedding_width': embedding_width,
'embedding_layer': embedding_layer, 'embedding_layer': embedding_layer,
'norm_first': norm_first, 'norm_first': norm_first,
'with_dense_inputs': with_dense_inputs,
} }
self.inputs = dict( if with_dense_inputs:
input_word_ids=tf.keras.Input(shape=(None,), dtype=tf.int32), self.inputs = dict(
input_mask=tf.keras.Input(shape=(None,), dtype=tf.int32), input_word_ids=tf.keras.Input(shape=(None,), dtype=tf.int32),
input_type_ids=tf.keras.Input(shape=(None,), dtype=tf.int32)) input_mask=tf.keras.Input(shape=(None,), dtype=tf.int32),
input_type_ids=tf.keras.Input(shape=(None,), dtype=tf.int32),
dense_inputs=tf.keras.Input(
shape=(None, embedding_width), dtype=tf.float32),
dense_mask=tf.keras.Input(shape=(None,), dtype=tf.int32),
dense_type_ids=tf.keras.Input(shape=(None,), dtype=tf.int32),
)
else:
self.inputs = dict(
input_word_ids=tf.keras.Input(shape=(None,), dtype=tf.int32),
input_mask=tf.keras.Input(shape=(None,), dtype=tf.int32),
input_type_ids=tf.keras.Input(shape=(None,), dtype=tf.int32))
def call(self, inputs): def call(self, inputs):
word_embeddings = None word_embeddings = None
...@@ -203,11 +219,22 @@ class BertEncoderV2(tf.keras.layers.Layer): ...@@ -203,11 +219,22 @@ class BertEncoderV2(tf.keras.layers.Layer):
mask = inputs.get('input_mask') mask = inputs.get('input_mask')
type_ids = inputs.get('input_type_ids') type_ids = inputs.get('input_type_ids')
word_embeddings = inputs.get('input_word_embeddings', None) word_embeddings = inputs.get('input_word_embeddings', None)
dense_inputs = inputs.get('dense_inputs', None)
dense_mask = inputs.get('dense_mask', None)
dense_type_ids = inputs.get('dense_type_ids', None)
else: else:
raise ValueError('Unexpected inputs type to %s.' % self.__class__) raise ValueError('Unexpected inputs type to %s.' % self.__class__)
if word_embeddings is None: if word_embeddings is None:
word_embeddings = self._embedding_layer(word_ids) word_embeddings = self._embedding_layer(word_ids)
if dense_inputs is not None:
# Concat the dense embeddings at sequence end.
word_embeddings = tf.concat([word_embeddings, dense_inputs], axis=1)
type_ids = tf.concat([type_ids, dense_type_ids], axis=1)
mask = tf.concat([mask, dense_mask], axis=1)
# absolute position embeddings. # absolute position embeddings.
position_embeddings = self._position_embedding_layer(word_embeddings) position_embeddings = self._position_embedding_layer(word_embeddings)
type_embeddings = self._type_embedding_layer(type_ids) type_embeddings = self._type_embedding_layer(type_ids)
......
...@@ -15,17 +15,32 @@ ...@@ -15,17 +15,32 @@
"""Funnel Transformer network.""" """Funnel Transformer network."""
# pylint: disable=g-classes-have-attributes # pylint: disable=g-classes-have-attributes
from typing import Union, Sequence from typing import Any, Callable, Optional, Union, Sequence
from absl import logging from absl import logging
import numpy as np import numpy as np
import tensorflow as tf import tensorflow as tf
from official.nlp.modeling import layers from official.nlp.modeling import layers
_Initializer = Union[str, tf.keras.initializers.Initializer]
_Activation = Union[str, Callable[..., Any]]
_MAX = 'max' _MAX = 'max'
_AVG = 'avg' _AVG = 'avg'
_TRUNCATED_AVG = 'truncated_avg' _TRUNCATED_AVG = 'truncated_avg'
_transformer_cls2str = {
layers.TransformerEncoderBlock: 'TransformerEncoderBlock',
layers.ReZeroTransformer: 'ReZeroTransformer'
}
_str2transformer_cls = {
'TransformerEncoderBlock': layers.TransformerEncoderBlock,
'ReZeroTransformer': layers.ReZeroTransformer
}
_approx_gelu = lambda x: tf.keras.activations.gelu(x, approximate=True)
def _get_policy_dtype(): def _get_policy_dtype():
try: try:
...@@ -206,29 +221,37 @@ class FunnelTransformerEncoder(tf.keras.layers.Layer): ...@@ -206,29 +221,37 @@ class FunnelTransformerEncoder(tf.keras.layers.Layer):
embeddings for the input word IDs. embeddings for the input word IDs.
norm_first: Whether to normalize inputs to attention and intermediate dense norm_first: Whether to normalize inputs to attention and intermediate dense
layers. If set False, output of attention and intermediate dense layers is layers. If set False, output of attention and intermediate dense layers is
normalized. normalized. This does not apply to ReZero.
transformer_cls: str or a keras Layer. This is the base TransformerBlock the
funnel encoder relies on.
share_rezero: bool. Whether to share ReZero alpha between the attention
layer and the ffn layer. This option is specific to ReZero.
""" """
def __init__( def __init__(
self, self,
vocab_size, vocab_size: int,
hidden_size=768, hidden_size: int = 768,
num_layers=12, num_layers: int = 12,
num_attention_heads=12, num_attention_heads: int = 12,
max_sequence_length=512, max_sequence_length: int = 512,
type_vocab_size=16, type_vocab_size: int = 16,
inner_dim=3072, inner_dim: int = 3072,
inner_activation=lambda x: tf.keras.activations.gelu(x, approximate=True), inner_activation: _Activation = _approx_gelu,
output_dropout=0.1, output_dropout: float = 0.1,
attention_dropout=0.1, attention_dropout: float = 0.1,
pool_type=_MAX, pool_type: str = _MAX,
pool_stride=2, pool_stride: int = 2,
unpool_length=0, unpool_length: int = 0,
initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02), initializer: _Initializer = tf.keras.initializers.TruncatedNormal(
output_range=None, stddev=0.02),
embedding_width=None, output_range: Optional[int] = None,
embedding_layer=None, embedding_width: Optional[int] = None,
norm_first=False, embedding_layer: Optional[tf.keras.layers.Layer] = None,
norm_first: bool = False,
transformer_cls: Union[
str, tf.keras.layers.Layer] = layers.TransformerEncoderBlock,
share_rezero: bool = True,
**kwargs): **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
activation = tf.keras.activations.get(inner_activation) activation = tf.keras.activations.get(inner_activation)
...@@ -278,16 +301,22 @@ class FunnelTransformerEncoder(tf.keras.layers.Layer): ...@@ -278,16 +301,22 @@ class FunnelTransformerEncoder(tf.keras.layers.Layer):
self._transformer_layers = [] self._transformer_layers = []
self._attention_mask_layer = layers.SelfAttentionMask( self._attention_mask_layer = layers.SelfAttentionMask(
name='self_attention_mask') name='self_attention_mask')
# Will raise an error if the string is not supported.
if isinstance(transformer_cls, str):
transformer_cls = _str2transformer_cls[transformer_cls]
for i in range(num_layers): for i in range(num_layers):
layer = layers.TransformerEncoderBlock( layer = transformer_cls(
num_attention_heads=num_attention_heads, num_attention_heads=num_attention_heads,
intermediate_size=inner_dim,
inner_dim=inner_dim, inner_dim=inner_dim,
intermediate_activation=inner_activation,
inner_activation=inner_activation, inner_activation=inner_activation,
output_dropout=output_dropout, output_dropout=output_dropout,
attention_dropout=attention_dropout, attention_dropout=attention_dropout,
norm_first=norm_first, norm_first=norm_first,
output_range=output_range if i == num_layers - 1 else None, output_range=output_range if i == num_layers - 1 else None,
kernel_initializer=initializer, kernel_initializer=initializer,
share_rezero=share_rezero,
name='transformer/layer_%d' % i) name='transformer/layer_%d' % i)
self._transformer_layers.append(layer) self._transformer_layers.append(layer)
...@@ -333,24 +362,44 @@ class FunnelTransformerEncoder(tf.keras.layers.Layer): ...@@ -333,24 +362,44 @@ class FunnelTransformerEncoder(tf.keras.layers.Layer):
self._pool_type = pool_type self._pool_type = pool_type
self._config = { self._config = {
'vocab_size': vocab_size, 'vocab_size':
'hidden_size': hidden_size, vocab_size,
'num_layers': num_layers, 'hidden_size':
'num_attention_heads': num_attention_heads, hidden_size,
'max_sequence_length': max_sequence_length, 'num_layers':
'type_vocab_size': type_vocab_size, num_layers,
'inner_dim': inner_dim, 'num_attention_heads':
'inner_activation': tf.keras.activations.serialize(activation), num_attention_heads,
'output_dropout': output_dropout, 'max_sequence_length':
'attention_dropout': attention_dropout, max_sequence_length,
'initializer': tf.keras.initializers.serialize(initializer), 'type_vocab_size':
'output_range': output_range, type_vocab_size,
'embedding_width': embedding_width, 'inner_dim':
'embedding_layer': embedding_layer, inner_dim,
'norm_first': norm_first, 'inner_activation':
'pool_type': pool_type, tf.keras.activations.serialize(activation),
'pool_stride': pool_stride, 'output_dropout':
'unpool_length': unpool_length, output_dropout,
'attention_dropout':
attention_dropout,
'initializer':
tf.keras.initializers.serialize(initializer),
'output_range':
output_range,
'embedding_width':
embedding_width,
'embedding_layer':
embedding_layer,
'norm_first':
norm_first,
'pool_type':
pool_type,
'pool_stride':
pool_stride,
'unpool_length':
unpool_length,
'transformer_cls':
_transformer_cls2str.get(transformer_cls, str(transformer_cls))
} }
def call(self, inputs): def call(self, inputs):
......
...@@ -38,13 +38,20 @@ class FunnelTransformerEncoderTest(parameterized.TestCase, tf.test.TestCase): ...@@ -38,13 +38,20 @@ class FunnelTransformerEncoderTest(parameterized.TestCase, tf.test.TestCase):
tf.keras.mixed_precision.set_global_policy("float32") tf.keras.mixed_precision.set_global_policy("float32")
@parameterized.named_parameters( @parameterized.named_parameters(
("mix_truncated_avg", "mixed_float16", tf.float16, "truncated_avg"), ("mix_truncated_avg_rezero", "mixed_float16", tf.float16, "truncated_avg",
("float32_truncated_avg", "float32", tf.float32, "truncated_avg"), "ReZeroTransformer"), ("float32_truncated_avg_rezero", "float32",
("mix_max", "mixed_float16", tf.float16, "max"), tf.float32, "truncated_avg", "ReZeroTransformer"),
("float32_max", "float32", tf.float32, "max"), ("mix_truncated_avg", "mixed_float16", tf.float16, "truncated_avg",
("mix_avg", "mixed_float16", tf.float16, "avg"), "TransformerEncoderBlock"),
("float32_avg", "float32", tf.float32, "avg")) ("float32_truncated_avg", "float32", tf.float32, "truncated_avg",
def test_network_creation(self, policy, pooled_dtype, pool_type): "TransformerEncoderBlock"), ("mix_max", "mixed_float16", tf.float16,
"max", "TransformerEncoderBlock"),
("float32_max", "float32", tf.float32, "max", "TransformerEncoderBlock"),
("mix_avg", "mixed_float16", tf.float16, "avg",
"TransformerEncoderBlock"),
("float32_avg", "float32", tf.float32, "avg", "TransformerEncoderBlock"))
def test_network_creation(self, policy, pooled_dtype, pool_type,
transformer_cls):
tf.keras.mixed_precision.set_global_policy(policy) tf.keras.mixed_precision.set_global_policy(policy)
hidden_size = 32 hidden_size = 32
...@@ -60,7 +67,8 @@ class FunnelTransformerEncoderTest(parameterized.TestCase, tf.test.TestCase): ...@@ -60,7 +67,8 @@ class FunnelTransformerEncoderTest(parameterized.TestCase, tf.test.TestCase):
pool_stride=pool_stride, pool_stride=pool_stride,
pool_type=pool_type, pool_type=pool_type,
max_sequence_length=sequence_length, max_sequence_length=sequence_length,
unpool_length=0) unpool_length=0,
transformer_cls=transformer_cls)
# Create the inputs (note that the first dimension is implicit). # Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
...@@ -253,7 +261,8 @@ class FunnelTransformerEncoderTest(parameterized.TestCase, tf.test.TestCase): ...@@ -253,7 +261,8 @@ class FunnelTransformerEncoderTest(parameterized.TestCase, tf.test.TestCase):
norm_first=False, norm_first=False,
pool_type="max", pool_type="max",
pool_stride=2, pool_stride=2,
unpool_length=0) unpool_length=0,
transformer_cls="TransformerEncoderBlock")
network = funnel_transformer.FunnelTransformerEncoder(**kwargs) network = funnel_transformer.FunnelTransformerEncoder(**kwargs)
expected_config = dict(kwargs) expected_config = dict(kwargs)
expected_config["inner_activation"] = tf.keras.activations.serialize( expected_config["inner_activation"] = tf.keras.activations.serialize(
......
...@@ -13,12 +13,14 @@ ...@@ -13,12 +13,14 @@
# limitations under the License. # limitations under the License.
"""A binary/library to export TF-NLP serving `SavedModel`.""" """A binary/library to export TF-NLP serving `SavedModel`."""
import dataclasses
import os import os
from typing import Any, Dict, Text from typing import Any, Dict, Text
from absl import app from absl import app
from absl import flags from absl import flags
import dataclasses
import yaml import yaml
from official.core import base_task from official.core import base_task
from official.core import task_factory from official.core import task_factory
from official.modeling import hyperparams from official.modeling import hyperparams
...@@ -29,6 +31,7 @@ from official.nlp.tasks import masked_lm ...@@ -29,6 +31,7 @@ from official.nlp.tasks import masked_lm
from official.nlp.tasks import question_answering from official.nlp.tasks import question_answering
from official.nlp.tasks import sentence_prediction from official.nlp.tasks import sentence_prediction
from official.nlp.tasks import tagging from official.nlp.tasks import tagging
from official.nlp.tasks import translation
FLAGS = flags.FLAGS FLAGS = flags.FLAGS
...@@ -40,7 +43,9 @@ SERVING_MODULES = { ...@@ -40,7 +43,9 @@ SERVING_MODULES = {
question_answering.QuestionAnsweringTask: question_answering.QuestionAnsweringTask:
serving_modules.QuestionAnswering, serving_modules.QuestionAnswering,
tagging.TaggingTask: tagging.TaggingTask:
serving_modules.Tagging serving_modules.Tagging,
translation.TranslationTask:
serving_modules.Translation
} }
...@@ -60,6 +65,10 @@ def define_flags(): ...@@ -60,6 +65,10 @@ def define_flags():
flags.DEFINE_string( flags.DEFINE_string(
"function_keys", None, "function_keys", None,
"A string key to retrieve pre-defined serving signatures.") "A string key to retrieve pre-defined serving signatures.")
flags.DEFINE_string(
"module_key", None,
"For multi-task case, load the export module weights from a specific "
"checkpoint item.")
flags.DEFINE_bool("convert_tpu", False, "") flags.DEFINE_bool("convert_tpu", False, "")
flags.DEFINE_multi_integer("allowed_batch_size", None, flags.DEFINE_multi_integer("allowed_batch_size", None,
"Allowed batch sizes for batching ops.") "Allowed batch sizes for batching ops.")
...@@ -116,7 +125,8 @@ def main(_): ...@@ -116,7 +125,8 @@ def main(_):
export_module, export_module,
function_keys=[FLAGS.function_keys], function_keys=[FLAGS.function_keys],
checkpoint_path=FLAGS.checkpoint_path, checkpoint_path=FLAGS.checkpoint_path,
export_savedmodel_dir=FLAGS.export_savedmodel_dir) export_savedmodel_dir=FLAGS.export_savedmodel_dir,
module_key=FLAGS.module_key)
if FLAGS.convert_tpu: if FLAGS.convert_tpu:
# pylint: disable=g-import-not-at-top # pylint: disable=g-import-not-at-top
......
...@@ -13,24 +13,21 @@ ...@@ -13,24 +13,21 @@
# limitations under the License. # limitations under the License.
"""Common library to export a SavedModel from the export module.""" """Common library to export a SavedModel from the export module."""
import os
import time
from typing import Dict, List, Optional, Text, Union from typing import Dict, List, Optional, Text, Union
from absl import logging
import tensorflow as tf import tensorflow as tf
from official.core import export_base from official.core import export_base
get_timestamped_export_dir = export_base.get_timestamped_export_dir
MAX_DIRECTORY_CREATION_ATTEMPTS = 10
def export(export_module: export_base.ExportModule, def export(export_module: export_base.ExportModule,
function_keys: Union[List[Text], Dict[Text, Text]], function_keys: Union[List[Text], Dict[Text, Text]],
export_savedmodel_dir: Text, export_savedmodel_dir: Text,
checkpoint_path: Optional[Text] = None, checkpoint_path: Optional[Text] = None,
timestamped: bool = True) -> Text: timestamped: bool = True,
module_key: Optional[Text] = None) -> Text:
"""Exports to SavedModel format. """Exports to SavedModel format.
Args: Args:
...@@ -41,6 +38,8 @@ def export(export_module: export_base.ExportModule, ...@@ -41,6 +38,8 @@ def export(export_module: export_base.ExportModule,
export_savedmodel_dir: Output saved model directory. export_savedmodel_dir: Output saved model directory.
checkpoint_path: Object-based checkpoint path or directory. checkpoint_path: Object-based checkpoint path or directory.
timestamped: Whether to export the savedmodel to a timestamped directory. timestamped: Whether to export the savedmodel to a timestamped directory.
module_key: Optional string to identify a checkpoint object to load for the
model in the export module.
Returns: Returns:
The savedmodel directory path. The savedmodel directory path.
...@@ -48,37 +47,16 @@ def export(export_module: export_base.ExportModule, ...@@ -48,37 +47,16 @@ def export(export_module: export_base.ExportModule,
save_options = tf.saved_model.SaveOptions(function_aliases={ save_options = tf.saved_model.SaveOptions(function_aliases={
'tpu_candidate': export_module.serve, 'tpu_candidate': export_module.serve,
}) })
return export_base.export(export_module, function_keys, export_savedmodel_dir, if module_key:
checkpoint_path, timestamped, save_options) kwargs = {module_key: export_module.model}
checkpoint = tf.train.Checkpoint(**kwargs)
else:
def get_timestamped_export_dir(export_dir_base): checkpoint = None
"""Builds a path to a new subdirectory within the base directory. return export_base.export(
export_module,
Args: function_keys,
export_dir_base: A string containing a directory to write the exported graph export_savedmodel_dir,
and checkpoints. checkpoint_path,
timestamped,
Returns: save_options,
The full path of the new subdirectory (which is not actually created yet). checkpoint=checkpoint)
Raises:
RuntimeError: if repeated attempts fail to obtain a unique timestamped
directory name.
"""
attempts = 0
while attempts < MAX_DIRECTORY_CREATION_ATTEMPTS:
timestamp = int(time.time())
result_dir = os.path.join(export_dir_base, str(timestamp))
if not tf.io.gfile.exists(result_dir):
# Collisions are still possible (though extremely unlikely): this
# directory is not actually created yet, but it will be almost
# instantly on return from this function.
return result_dir
time.sleep(1)
attempts += 1
logging.warning('Directory %s already exists; retrying (attempt %s/%s)',
str(result_dir), attempts, MAX_DIRECTORY_CREATION_ATTEMPTS)
raise RuntimeError('Failed to obtain a unique export directory name after '
f'{MAX_DIRECTORY_CREATION_ATTEMPTS} attempts.')
...@@ -14,10 +14,12 @@ ...@@ -14,10 +14,12 @@
"""Serving export modules for TF Model Garden NLP models.""" """Serving export modules for TF Model Garden NLP models."""
# pylint:disable=missing-class-docstring # pylint:disable=missing-class-docstring
import dataclasses
from typing import Dict, List, Optional, Text from typing import Dict, List, Optional, Text
import dataclasses
import tensorflow as tf import tensorflow as tf
import tensorflow_text as tf_text
from official.core import export_base from official.core import export_base
from official.modeling.hyperparams import base_config from official.modeling.hyperparams import base_config
from official.nlp.data import sentence_prediction_dataloader from official.nlp.data import sentence_prediction_dataloader
...@@ -407,3 +409,48 @@ class Tagging(export_base.ExportModule): ...@@ -407,3 +409,48 @@ class Tagging(export_base.ExportModule):
signatures[signature_key] = self.serve_examples.get_concrete_function( signatures[signature_key] = self.serve_examples.get_concrete_function(
tf.TensorSpec(shape=[None], dtype=tf.string, name="examples")) tf.TensorSpec(shape=[None], dtype=tf.string, name="examples"))
return signatures return signatures
class Translation(export_base.ExportModule):
"""The export module for the translation task."""
@dataclasses.dataclass
class Params(base_config.Config):
sentencepiece_model_path: str = ""
def __init__(self, params, model: tf.keras.Model, inference_step=None):
super().__init__(params, model, inference_step)
self._sp_tokenizer = tf_text.SentencepieceTokenizer(
model=tf.io.gfile.GFile(params.sentencepiece_model_path, "rb").read(),
add_eos=True)
try:
empty_str_tokenized = self._sp_tokenizer.tokenize("").numpy()
except tf.errors.InternalError:
raise ValueError(
"EOS token not in tokenizer vocab."
"Please make sure the tokenizer generates a single token for an "
"empty string.")
self._eos_id = empty_str_tokenized.item()
@tf.function
def serve(self, inputs) -> Dict[str, tf.Tensor]:
return self.inference_step(inputs)
@tf.function
def serve_text(self, text: tf.Tensor) -> Dict[str, tf.Tensor]:
tokenized = self._sp_tokenizer.tokenize(text).to_tensor(0)
return self._sp_tokenizer.detokenize(
self.serve({"inputs": tokenized})["outputs"])
def get_inference_signatures(self, function_keys: Dict[Text, Text]):
signatures = {}
valid_keys = ("serve_text")
for func_key, signature_key in function_keys.items():
if func_key not in valid_keys:
raise ValueError("Invalid function key for the module: %s with key %s. "
"Valid keys are: %s" %
(self.__class__, func_key, valid_keys))
if func_key == "serve_text":
signatures[signature_key] = self.serve_text.get_concrete_function(
tf.TensorSpec(shape=[None], dtype=tf.string, name="text"))
return signatures
...@@ -15,8 +15,11 @@ ...@@ -15,8 +15,11 @@
"""Tests for nlp.serving.serving_modules.""" """Tests for nlp.serving.serving_modules."""
import os import os
from absl.testing import parameterized from absl.testing import parameterized
import tensorflow as tf import tensorflow as tf
from sentencepiece import SentencePieceTrainer
from official.nlp.configs import bert from official.nlp.configs import bert
from official.nlp.configs import encoders from official.nlp.configs import encoders
from official.nlp.serving import serving_modules from official.nlp.serving import serving_modules
...@@ -24,6 +27,7 @@ from official.nlp.tasks import masked_lm ...@@ -24,6 +27,7 @@ from official.nlp.tasks import masked_lm
from official.nlp.tasks import question_answering from official.nlp.tasks import question_answering
from official.nlp.tasks import sentence_prediction from official.nlp.tasks import sentence_prediction
from official.nlp.tasks import tagging from official.nlp.tasks import tagging
from official.nlp.tasks import translation
def _create_fake_serialized_examples(features_dict): def _create_fake_serialized_examples(features_dict):
...@@ -59,6 +63,33 @@ def _create_fake_vocab_file(vocab_file_path): ...@@ -59,6 +63,33 @@ def _create_fake_vocab_file(vocab_file_path):
outfile.write("\n".join(tokens)) outfile.write("\n".join(tokens))
def _train_sentencepiece(input_path, vocab_size, model_path, eos_id=1):
argstr = " ".join([
f"--input={input_path}", f"--vocab_size={vocab_size}",
"--character_coverage=0.995",
f"--model_prefix={model_path}", "--model_type=bpe",
"--bos_id=-1", "--pad_id=0", f"--eos_id={eos_id}", "--unk_id=2"
])
SentencePieceTrainer.Train(argstr)
def _generate_line_file(filepath, lines):
with tf.io.gfile.GFile(filepath, "w") as f:
for l in lines:
f.write("{}\n".format(l))
def _make_sentencepeice(output_dir):
src_lines = ["abc ede fg", "bbcd ef a g", "de f a a g"]
tgt_lines = ["dd cc a ef g", "bcd ef a g", "gef cd ba"]
sentencepeice_input_path = os.path.join(output_dir, "inputs.txt")
_generate_line_file(sentencepeice_input_path, src_lines + tgt_lines)
sentencepeice_model_prefix = os.path.join(output_dir, "sp")
_train_sentencepiece(sentencepeice_input_path, 11, sentencepeice_model_prefix)
sentencepeice_model_path = "{}.model".format(sentencepeice_model_prefix)
return sentencepeice_model_path
class ServingModulesTest(tf.test.TestCase, parameterized.TestCase): class ServingModulesTest(tf.test.TestCase, parameterized.TestCase):
@parameterized.parameters( @parameterized.parameters(
...@@ -312,6 +343,31 @@ class ServingModulesTest(tf.test.TestCase, parameterized.TestCase): ...@@ -312,6 +343,31 @@ class ServingModulesTest(tf.test.TestCase, parameterized.TestCase):
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
_ = export_module.get_inference_signatures({"foo": None}) _ = export_module.get_inference_signatures({"foo": None})
def test_translation(self):
sp_path = _make_sentencepeice(self.get_temp_dir())
encdecoder = translation.EncDecoder(
num_attention_heads=4, intermediate_size=256)
config = translation.TranslationConfig(
model=translation.ModelConfig(
encoder=encdecoder,
decoder=encdecoder,
embedding_width=256,
padded_decode=False,
decode_max_length=100),
sentencepiece_model_path=sp_path,
)
task = translation.TranslationTask(config)
model = task.build_model()
params = serving_modules.Translation.Params(
sentencepiece_model_path=sp_path)
export_module = serving_modules.Translation(params=params, model=model)
functions = export_module.get_inference_signatures({
"serve_text": "serving_default"
})
outputs = functions["serving_default"](tf.constant(["abcd", "ef gh"]))
self.assertEqual(outputs.shape, (2,))
self.assertEqual(outputs.dtype, tf.string)
if __name__ == "__main__": if __name__ == "__main__":
tf.test.main() tf.test.main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment