Unverified Commit 09d9656f authored by Srihari Humbarwadi, committed by GitHub

Merge branch 'panoptic-segmentation' into panoptic-deeplab-modeling

parents ac671306 49a5706c
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Keras-based gated feedforward layer."""
# pylint: disable=g-classes-have-attributes
from typing import Optional
import tensorflow as tf
class BlockDiagFeedforward(tf.keras.layers.Layer):
"""Block diagonal feedforward layer.
This layer replaces the weight matrix of the output_dense layer with a block
diagonal matrix to save parameters and FLOPs. A linear mixing layer can
optionally be added to improve the layer's expressiveness.
Args:
intermediate_size: Size of the intermediate layer.
intermediate_activation: Activation for the intermediate layer.
dropout: Dropout probability for the output dropout.
num_blocks: The number of blocks for the block diagonal matrix of the
output_dense layer.
apply_mixing: Apply linear mixing if True.
kernel_initializer: Initializer for dense layer kernels.
bias_initializer: Initializer for dense layer biases.
kernel_regularizer: Regularizer for dense layer kernels.
bias_regularizer: Regularizer for dense layer biases.
activity_regularizer: Regularizer for dense layer activity.
kernel_constraint: Constraint for dense layer kernels.
bias_constraint: Constraint for dense layer biases.
"""
def __init__(
self,
intermediate_size: int,
intermediate_activation: str,
dropout: float,
num_blocks: int = 1,
apply_mixing: bool = True,
kernel_initializer: str = "glorot_uniform",
bias_initializer: str = "zeros",
kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
activity_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
kernel_constraint: Optional[tf.keras.constraints.Constraint] = None,
bias_constraint: Optional[tf.keras.constraints.Constraint] = None,
**kwargs): # pylint: disable=g-doc-args
super(BlockDiagFeedforward, self).__init__(**kwargs)
self._intermediate_size = intermediate_size
self._intermediate_activation = intermediate_activation
self._dropout = dropout
self._num_blocks = num_blocks
self._apply_mixing = apply_mixing
if intermediate_size % num_blocks != 0:
raise ValueError("Intermediate_size (%d) isn't a multiple of num_blocks "
"(%d)." % (intermediate_size, num_blocks))
self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
self._bias_initializer = tf.keras.initializers.get(bias_initializer)
self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer)
self._activity_regularizer = tf.keras.regularizers.get(activity_regularizer)
self._kernel_constraint = tf.keras.constraints.get(kernel_constraint)
self._bias_constraint = tf.keras.constraints.get(bias_constraint)
def build(self, input_shape):
hidden_size = input_shape.as_list()[-1]
common_kwargs = dict(
kernel_initializer=self._kernel_initializer,
bias_initializer=self._bias_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer,
activity_regularizer=self._activity_regularizer,
kernel_constraint=self._kernel_constraint,
bias_constraint=self._bias_constraint)
self._intermediate_dense = tf.keras.layers.experimental.EinsumDense(
"abc,cde->abde",
output_shape=(None, self._num_blocks,
self._intermediate_size // self._num_blocks),
bias_axes="de",
name="intermediate",
**common_kwargs)
policy = tf.keras.mixed_precision.global_policy()
if policy.name == "mixed_bfloat16":
# bfloat16 causes BERT with the LAMB optimizer to not converge
# as well, so we use float32.
policy = tf.float32
self._intermediate_activation_layer = tf.keras.layers.Activation(
self._intermediate_activation, dtype=policy)
self._output_dense = tf.keras.layers.experimental.EinsumDense(
"abde,deo->abdo",
output_shape=(None, self._num_blocks,
hidden_size // self._num_blocks),
bias_axes="do",
name="output",
**common_kwargs)
if self._apply_mixing:
self._output_mixing = tf.keras.layers.experimental.EinsumDense(
"abdo,de->abeo",
output_shape=(None, self._num_blocks,
hidden_size // self._num_blocks),
name="output_mixing",
**common_kwargs)
self._output_reshape = tf.keras.layers.Reshape((-1, hidden_size))
self._output_dropout = tf.keras.layers.Dropout(rate=self._dropout)
def get_config(self):
config = {
"intermediate_size":
self._intermediate_size,
"intermediate_activation":
self._intermediate_activation,
"dropout":
self._dropout,
"num_blocks":
self._num_blocks,
"apply_mixing":
self._apply_mixing,
"kernel_initializer":
tf.keras.initializers.serialize(self._kernel_initializer),
"bias_initializer":
tf.keras.initializers.serialize(self._bias_initializer),
"kernel_regularizer":
tf.keras.regularizers.serialize(self._kernel_regularizer),
"bias_regularizer":
tf.keras.regularizers.serialize(self._bias_regularizer),
"activity_regularizer":
tf.keras.regularizers.serialize(self._activity_regularizer),
"kernel_constraint":
tf.keras.constraints.serialize(self._kernel_constraint),
"bias_constraint":
tf.keras.constraints.serialize(self._bias_constraint)
}
base_config = super(BlockDiagFeedforward, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, inputs):
intermediate_output = self._intermediate_dense(inputs)
intermediate_output = self._intermediate_activation_layer(
intermediate_output)
layer_output = self._output_dense(intermediate_output)
if self._apply_mixing:
layer_output = self._output_mixing(layer_output)
layer_output = self._output_reshape(layer_output)
layer_output = self._output_dropout(layer_output)
return layer_output
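# A minimal usage sketch (illustrative only; the sizes below are assumptions,
# not part of this module). With num_blocks=4, the output projection kernel is
# stored as 4 independent (intermediate_size // 4, hidden_size // 4) blocks,
# i.e. a block diagonal matrix with num_blocks times fewer parameters than a
# full dense output projection:
#   layer = BlockDiagFeedforward(
#       intermediate_size=128,
#       intermediate_activation="relu",
#       dropout=0.1,
#       num_blocks=4,
#       apply_mixing=True)
#   outputs = layer(tf.ones((2, 16, 64)))  # outputs.shape == (2, 16, 64)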
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for Keras-based gated feedforward layer."""
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import
from official.nlp.modeling.layers import block_diag_feedforward
# This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
# guarantees forward compatibility of this code for the V2 switchover.
@keras_parameterized.run_all_keras_modes
class BlockDiagFeedforwardTest(keras_parameterized.TestCase):
def tearDown(self):
super(BlockDiagFeedforwardTest, self).tearDown()
tf.keras.mixed_precision.set_global_policy("float32")
@parameterized.parameters(
(1, True, "float32"),
(1, True, "mixed_float16"),
(1, False, "float32"),
(1, False, "mixed_float16"),
(2, True, "float32"),
(2, True, "mixed_float16"),
(2, False, "float32"),
(2, False, "mixed_float16"),
)
def test_layer_creation(self, num_blocks, apply_mixing, dtype):
tf.keras.mixed_precision.set_global_policy(dtype)
kwargs = dict(
intermediate_size=128,
intermediate_activation="relu",
dropout=0.1,
num_blocks=num_blocks,
apply_mixing=apply_mixing,
kernel_initializer="glorot_uniform",
bias_initializer="zeros")
test_layer = block_diag_feedforward.BlockDiagFeedforward(**kwargs)
sequence_length = 64
width = 128
# Create a 3-dimensional input (the first dimension is implicit).
data_tensor = tf.keras.Input(shape=(sequence_length, width))
output_tensor = test_layer(data_tensor)
# The output shape of the layer should be the same as the input shape.
self.assertEqual(data_tensor.shape.as_list(), output_tensor.shape.as_list())
@parameterized.parameters(
(1, True, "float32"),
(1, True, "mixed_float16"),
(1, False, "float32"),
(1, False, "mixed_float16"),
(2, True, "float32"),
(2, True, "mixed_float16"),
(2, False, "float32"),
(2, False, "mixed_float16"),
)
def test_layer_invocation(self, num_blocks, apply_mixing, dtype):
tf.keras.mixed_precision.set_global_policy(dtype)
kwargs = dict(
intermediate_size=16,
intermediate_activation="relu",
dropout=0.1,
num_blocks=num_blocks,
apply_mixing=apply_mixing,
kernel_initializer="glorot_uniform",
bias_initializer="zeros")
test_layer = block_diag_feedforward.BlockDiagFeedforward(**kwargs)
sequence_length = 16
width = 32
# Create a 3-dimensional input (the first dimension is implicit).
data_tensor = tf.keras.Input(shape=(sequence_length, width))
output_tensor = test_layer(data_tensor)
# Create a model from the test layer.
model = tf.keras.Model(data_tensor, output_tensor)
# Invoke the model on test data.
batch_size = 6
input_data = 10 * np.random.random_sample(
(batch_size, sequence_length, width))
output_data = model.predict(input_data)
self.assertEqual(output_data.shape, (batch_size, sequence_length, width))
def test_get_config(self):
kwargs = dict(
intermediate_size=16,
intermediate_activation="relu",
dropout=0.1,
num_blocks=2,
apply_mixing=True,
kernel_initializer="glorot_uniform",
bias_initializer="zeros")
test_layer = block_diag_feedforward.BlockDiagFeedforward(**kwargs)
new_layer = block_diag_feedforward.BlockDiagFeedforward.from_config(
test_layer.get_config())
self.assertAllEqual(test_layer.get_config(), new_layer.get_config())
if __name__ == "__main__":
tf.test.main()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Keras-based einsum layer."""
# pylint: disable=g-classes-have-attributes
import tensorflow as tf
from tensorflow.python.util import deprecation
_CHR_IDX = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m"]
@tf.keras.utils.register_keras_serializable(package="Text")
class DenseEinsum(tf.keras.layers.Layer):
"""A densely connected layer that uses `tf.einsum` as the backing computation.
This layer can perform einsum calculations of arbitrary dimensionality.
Args:
output_shape: Positive integer or tuple, dimensionality of the output space.
num_summed_dimensions: The number of dimensions to sum over. Standard 2D
matmul should use 1, 3D matmul should use 2, and so forth.
activation: Activation function to use. If you don't specify anything, no
activation is applied (i.e. "linear" activation: `a(x) = x`).
use_bias: Boolean, whether the layer uses a bias vector.
kernel_initializer: Initializer for the `kernel` weights matrix.
bias_initializer: Initializer for the bias vector.
kernel_regularizer: Regularizer function applied to the `kernel` weights
matrix.
bias_regularizer: Regularizer function applied to the bias vector.
activity_regularizer: Regularizer function applied to the output of the
layer (its "activation").
kernel_constraint: Constraint function applied to the `kernel` weights
matrix.
bias_constraint: Constraint function applied to the bias vector.
Input shape:
N-D tensor with shape: `(batch_size, ..., input_dim)`. The most common
situation would be a 2D input with shape `(batch_size, input_dim)`.
Output shape:
N-D tensor with shape: `(batch_size, ..., units)`. For instance, for a 2D
input with shape `(batch_size, input_dim)`, the output would have shape
`(batch_size, units)`.
"""
@deprecation.deprecated(None, "DenseEinsum is deprecated. Please use "
"tf.keras.experimental.EinsumDense layer instead.")
def __init__(self,
output_shape,
num_summed_dimensions=1,
activation=None,
use_bias=True,
kernel_initializer="glorot_uniform",
bias_initializer="zeros",
kernel_regularizer=None,
bias_regularizer=None,
activity_regularizer=None,
kernel_constraint=None,
bias_constraint=None,
**kwargs):
super(DenseEinsum, self).__init__(**kwargs)
self._output_shape = output_shape if isinstance(
output_shape, (list, tuple)) else (output_shape,)
self._activation = tf.keras.activations.get(activation)
self._use_bias = use_bias
self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
self._bias_initializer = tf.keras.initializers.get(bias_initializer)
self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer)
self._kernel_constraint = tf.keras.constraints.get(kernel_constraint)
self._bias_constraint = tf.keras.constraints.get(bias_constraint)
self._num_summed_dimensions = num_summed_dimensions
self._einsum_string = None
def _build_einsum_string(self, free_input_dims, bound_dims, output_dims):
input_str = ""
kernel_str = ""
output_str = ""
letter_offset = 0
for i in range(free_input_dims):
char = _CHR_IDX[i + letter_offset]
input_str += char
output_str += char
letter_offset += free_input_dims
for i in range(bound_dims):
char = _CHR_IDX[i + letter_offset]
input_str += char
kernel_str += char
letter_offset += bound_dims
for i in range(output_dims):
char = _CHR_IDX[i + letter_offset]
kernel_str += char
output_str += char
return input_str + "," + kernel_str + "->" + output_str
def build(self, input_shape):
input_shape = tf.TensorShape(input_shape)
input_rank = input_shape.rank
free_input_dims = input_rank - self._num_summed_dimensions
output_dims = len(self._output_shape)
self._einsum_string = self._build_einsum_string(free_input_dims,
self._num_summed_dimensions,
output_dims)
# This is only saved for testing purposes.
self._kernel_shape = (
input_shape[free_input_dims:].concatenate(self._output_shape))
self._kernel = self.add_weight(
"kernel",
shape=self._kernel_shape,
initializer=self._kernel_initializer,
regularizer=self._kernel_regularizer,
constraint=self._kernel_constraint,
dtype=self.dtype,
trainable=True)
if self._use_bias:
self._bias = self.add_weight(
"bias",
shape=self._output_shape,
initializer=self._bias_initializer,
regularizer=self._bias_regularizer,
constraint=self._bias_constraint,
dtype=self.dtype,
trainable=True)
else:
self._bias = None
super(DenseEinsum, self).build(input_shape)
def get_config(self):
config = {
"output_shape":
self._output_shape,
"num_summed_dimensions":
self._num_summed_dimensions,
"activation":
tf.keras.activations.serialize(self._activation),
"use_bias":
self._use_bias,
"kernel_initializer":
tf.keras.initializers.serialize(self._kernel_initializer),
"bias_initializer":
tf.keras.initializers.serialize(self._bias_initializer),
"kernel_regularizer":
tf.keras.regularizers.serialize(self._kernel_regularizer),
"bias_regularizer":
tf.keras.regularizers.serialize(self._bias_regularizer),
"activity_regularizer":
tf.keras.regularizers.serialize(self._activity_regularizer),
"kernel_constraint":
tf.keras.constraints.serialize(self._kernel_constraint),
"bias_constraint":
tf.keras.constraints.serialize(self._bias_constraint)
}
base_config = super(DenseEinsum, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, inputs):
ret = tf.einsum(self._einsum_string, inputs, self._kernel)
if self._use_bias:
ret += self._bias
if self._activation is not None:
ret = self._activation(ret)
return ret
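# Illustrative sketch (the shapes are assumptions, not part of the API): for a
# rank-3 input of shape [batch, seq, 80] with output_shape=(64,) and
# num_summed_dimensions=1, build() derives the equation "abc,cd->abd" and a
# kernel of shape (80, 64), i.e. a plain dense projection of the last axis:
#   layer = DenseEinsum(output_shape=(64,), num_summed_dimensions=1)
#   outputs = layer(tf.ones((2, 10, 80)))  # outputs.shape == (2, 10, 64)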
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for Keras-based einsum layer."""
import numpy as np
import tensorflow as tf
from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import
from official.nlp.modeling.layers import dense_einsum
# This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
# guarantees forward compatibility of this code for the V2 switchover.
@keras_parameterized.run_all_keras_modes
class DenseEinsumLayer(keras_parameterized.TestCase):
def test_3D_einsum_with_two_bound_dimensions(self):
test_layer = dense_einsum.DenseEinsum(
output_shape=(64,), num_summed_dimensions=2)
# Create a 4-dimensional input (the first dimension is implicit).
input_tensor = tf.keras.Input(shape=(None, 40, 80))
_ = test_layer(input_tensor)
self.assertEqual(test_layer._einsum_string, "abcd,cde->abe")
self.assertEqual(test_layer._kernel_shape, (40, 80, 64))
def test_3D_einsum_with_one_bound_dimensions(self):
test_layer = dense_einsum.DenseEinsum(
output_shape=(64, 32), num_summed_dimensions=1)
# Create a 3-dimensional input (the first dimension is implicit).
input_tensor = tf.keras.Input(shape=(None, 80))
_ = test_layer(input_tensor)
self.assertEqual(test_layer._einsum_string, "abc,cde->abde")
self.assertEqual(test_layer._kernel_shape, (80, 64, 32))
def test_2D_einsum_with_one_bound_dimensions(self):
test_layer = dense_einsum.DenseEinsum(
output_shape=(64,), num_summed_dimensions=1)
# Create a 3-dimensional input (the first dimension is implicit).
input_tensor = tf.keras.Input(shape=(None, 80))
_ = test_layer(input_tensor)
self.assertEqual(test_layer._einsum_string, "abc,cd->abd")
self.assertEqual(test_layer._kernel_shape, (80, 64))
def test_bias_term_can_be_disabled(self):
# A layer created using the bias should have two weights.
test_layer = dense_einsum.DenseEinsum(
output_shape=64, num_summed_dimensions=1, use_bias=True)
input_tensor = tf.keras.Input(shape=(None, 80))
_ = test_layer(input_tensor)
self.assertEqual(2, len(test_layer.get_weights()))
# A layer created without the bias should have only one weight.
test_layer = dense_einsum.DenseEinsum(
output_shape=64, num_summed_dimensions=1, use_bias=False)
input_tensor = tf.keras.Input(shape=(None, 80))
_ = test_layer(input_tensor)
self.assertEqual(1, len(test_layer.get_weights()))
def test_activation(self):
# Create a model that does not use an activation.
no_activation_layer = dense_einsum.DenseEinsum(
output_shape=64, num_summed_dimensions=1, activation=None)
input_tensor = tf.keras.Input(shape=(None, 80))
output_tensor = no_activation_layer(input_tensor)
no_activation_model = tf.keras.Model(input_tensor, output_tensor)
# Create a model that uses a softmax activation.
activation_layer = dense_einsum.DenseEinsum(
output_shape=64, num_summed_dimensions=1, activation="softmax")
input_tensor = tf.keras.Input(shape=(None, 80))
output_tensor = activation_layer(input_tensor)
activation_model = tf.keras.Model(input_tensor, output_tensor)
# Make sure the models' weights are identical.
activation_model.set_weights(no_activation_model.get_weights())
# Predict using each model on the same input data. The output should be
# different, since one is using a softmax - even though the models' weights
# are the same.
input_values = 10 * np.random.random_sample((10, 4, 80))
non_activated_data = no_activation_model.predict(input_values)
activated_data = activation_model.predict(input_values)
self.assertNotAllClose(activated_data, non_activated_data)
def test_non_iterable_output_shape(self):
test_layer = dense_einsum.DenseEinsum(
output_shape=64, num_summed_dimensions=1)
# Create a 3-dimensional input (the first dimension is implicit).
input_tensor = tf.keras.Input(shape=(None, 80))
_ = test_layer(input_tensor)
self.assertEqual(test_layer._einsum_string, "abc,cd->abd")
self.assertEqual(test_layer._kernel_shape, (80, 64))
def test_with_explicit_initializer(self):
test_layer = dense_einsum.DenseEinsum(
output_shape=(64,),
num_summed_dimensions=2,
kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02))
# Create a 4-dimensional input (the first dimension is implicit).
input_tensor = tf.keras.Input(shape=(None, 40, 80))
_ = test_layer(input_tensor)
self.assertEqual(test_layer._einsum_string, "abcd,cde->abe")
self.assertEqual(test_layer._kernel_shape, (40, 80, 64))
if __name__ == "__main__":
tf.test.main()
@@ -68,7 +68,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase):
# Invoke the model on test data. We can't validate the output data itself
# (the NN is too complex) but this will rule out structural runtime errors.
batch_size = 6
input_data = 10 * np.random.random_sample(
input_data = np.random.random_sample(
(batch_size, sequence_length, width))
_ = model.predict(input_data)
@@ -89,7 +89,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase):
# Invoke the model on test data. We can't validate the output data itself
# (the NN is too complex) but this will rule out structural runtime errors.
batch_size = 6
input_data = 10 * np.random.random_sample(
input_data = np.random.random_sample(
(batch_size, sequence_length, width))
# The attention mask should be of shape (batch, from_seq_len, to_seq_len),
# which here is (batch, sequence_length, sequence_length)
@@ -104,7 +104,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase):
width = 80
batch_size = 6
input_data = 10 * np.random.random_sample(
input_data = np.random.random_sample(
(batch_size, sequence_length, width))
mask_data = np.random.randint(
2, size=(batch_size, sequence_length, sequence_length))
@@ -121,7 +121,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase):
new_layer.set_weights(test_layer.get_weights())
new_output_tensor, _ = new_layer([input_data, mask_data])
self.assertAllClose(
new_output_tensor, output_tensor[:, 0:1, :], atol=5e-5, rtol=0.003)
new_output_tensor, output_tensor[:, 0:1, :], atol=0.002, rtol=0.01)
def test_layer_output_range_with_relative_pe(self, transformer_cls):
test_layer = transformer_cls(
@@ -131,7 +131,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase):
width = 80
batch_size = 6
input_data = 10 * np.random.random_sample(
input_data = np.random.random_sample(
(batch_size, sequence_length, width))
mask_data = np.random.randint(
2, size=(batch_size, sequence_length, sequence_length))
@@ -149,7 +149,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase):
new_layer.set_weights(test_layer.get_weights())
new_output_tensor, _ = new_layer([input_data, mask_data])
self.assertAllClose(
new_output_tensor, output_tensor[:, 0:1, :], atol=5e-5, rtol=0.003)
new_output_tensor, output_tensor[:, 0:1, :], atol=0.002, rtol=0.01)
def test_layer_output_range_without_mask(self, transformer_cls):
test_layer = transformer_cls(
@@ -159,7 +159,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase):
width = 80
batch_size = 6
input_data = 10 * np.random.random_sample(
input_data = np.random.random_sample(
(batch_size, sequence_length, width))
output_tensor, _ = test_layer(input_data)
@@ -175,7 +175,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase):
new_layer.set_weights(test_layer.get_weights())
new_output_tensor, _ = new_layer(input_data)
self.assertAllClose(
new_output_tensor, output_tensor[:, 0:1, :], atol=5e-5, rtol=0.003)
new_output_tensor, output_tensor[:, 0:1, :], atol=0.002, rtol=0.01)
def test_layer_output_range_with_pre_norm(self, transformer_cls):
test_layer = transformer_cls(
@@ -185,7 +185,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase):
width = 80
batch_size = 6
input_data = 10 * np.random.random_sample(
input_data = np.random.random_sample(
(batch_size, sequence_length, width))
mask_data = np.random.randint(
2, size=(batch_size, sequence_length, sequence_length))
@@ -203,7 +203,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase):
new_layer.set_weights(test_layer.get_weights())
new_output_tensor, _ = new_layer([input_data, mask_data])
self.assertAllClose(
new_output_tensor, output_tensor[:, 0:1, :], atol=5e-5, rtol=0.003)
new_output_tensor, output_tensor[:, 0:1, :], atol=0.002, rtol=0.01)
def test_layer_invocation_with_float16_dtype(self, transformer_cls):
tf.keras.mixed_precision.set_global_policy('mixed_float16')
@@ -223,7 +223,7 @@ class ReuseTransformerLayerTest(tf.test.TestCase, parameterized.TestCase):
# Invoke the model on test data. We can't validate the output data itself
# (the NN is too complex) but this will rule out structural runtime errors.
batch_size = 6
input_data = (10 * np.random.random_sample(
input_data = (np.random.random_sample(
(batch_size, sequence_length, width)))
# The attention mask should be of shape (batch, from_seq_len, to_seq_len),
# which here is (batch, sequence_length, sequence_length)
@@ -368,7 +368,7 @@ class ReuseTransformerArgumentTest(tf.test.TestCase, parameterized.TestCase):
# Invoke the model on test data. We can't validate the output data itself
# (the NN is too complex) but this will rule out structural runtime errors.
batch_size = 6
input_data = 10 * np.random.random_sample(
input_data = np.random.random_sample(
(batch_size, sequence_length, width))
# The attention mask should be of shape (batch, from_seq_len, to_seq_len),
# which here is (batch, sequence_length, sequence_length)
@@ -404,7 +404,7 @@ class ReuseTransformerArgumentTest(tf.test.TestCase, parameterized.TestCase):
# Invoke the model on test data. We can't validate the output data itself
# (the NN is too complex) but this will rule out structural runtime errors.
batch_size = 6
input_data = (10 * np.random.random_sample(
input_data = (np.random.random_sample(
(batch_size, sequence_length, width)))
# The attention mask should be of shape (batch, from_seq_len, to_seq_len),
# which here is (batch, sequence_length, sequence_length)
@@ -18,6 +18,8 @@
import gin
import tensorflow as tf
from official.nlp.modeling.layers import util
@tf.keras.utils.register_keras_serializable(package="Text")
@gin.configurable
@@ -45,6 +47,7 @@ class ReZeroTransformer(tf.keras.layers.Layer):
kernel_constraint: Constraint for dense layer kernels.
bias_constraint: Constraint for dense layer biases.
use_layer_norm: Whether to add layer_norm on top of the ReZero update.
share_rezero: Whether the attention layer and the FFN layer share the same alpha.
"""
def __init__(self,
@@ -62,7 +65,14 @@ class ReZeroTransformer(tf.keras.layers.Layer):
kernel_constraint=None,
bias_constraint=None,
use_layer_norm=False,
share_rezero=True,
**kwargs):
# attention_dropout will override attention_dropout_rate.
# This is to unify the input params with TransformerEncoderBlock.
attention_dropout_rate = kwargs.pop("attention_dropout",
attention_dropout_rate)
dropout_rate = kwargs.pop("output_dropout", dropout_rate)
util.filter_kwargs(kwargs)
super(ReZeroTransformer, self).__init__(**kwargs)
self._num_heads = num_attention_heads
@@ -78,10 +88,18 @@ class ReZeroTransformer(tf.keras.layers.Layer):
self._kernel_constraint = tf.keras.constraints.get(kernel_constraint)
self._bias_constraint = tf.keras.constraints.get(bias_constraint)
self._use_layer_norm = use_layer_norm
self._share_rezero = share_rezero
def build(self, input_shape):
input_tensor = input_shape[0] if len(input_shape) == 2 else input_shape
input_tensor_shape = tf.TensorShape(input_tensor)
if isinstance(input_shape, tf.TensorShape):
input_tensor_shape = input_shape
elif isinstance(input_shape, (list, tuple)):
input_tensor_shape = tf.TensorShape(input_shape[0])
else:
raise ValueError(
"The type of input shape argument is not supported, got: %s" %
type(input_shape))
if len(input_tensor_shape.as_list()) != 3:
raise ValueError("TransformerLayer expects a three-dimensional input of "
"shape [batch, sequence, width].")
@@ -158,6 +176,15 @@ class ReZeroTransformer(tf.keras.layers.Layer):
trainable=True,
dtype=tf.float32)
if self._share_rezero:
self._rezero_a_ffn = self._rezero_a
else:
self._rezero_a_ffn = self.add_weight(
name="rezero_alpha_ffn",
initializer=tf.keras.initializers.Zeros(),
trainable=True,
dtype=tf.float32)
super(ReZeroTransformer, self).build(input_shape)
def get_config(self):
@@ -176,6 +203,8 @@ class ReZeroTransformer(tf.keras.layers.Layer):
self._output_range,
"use_layer_norm":
self._use_layer_norm,
"share_rezero":
self._share_rezero,
"kernel_initializer":
tf.keras.initializers.serialize(self._kernel_initializer),
"bias_initializer":
@@ -196,21 +225,34 @@ class ReZeroTransformer(tf.keras.layers.Layer):
def reset_rezero(self):
self._rezero_a.assign(0.)
if not self._share_rezero:
self._rezero_a_ffn.assign(0.)
def call(self, inputs):
if isinstance(inputs, (list, tuple)) and len(inputs) == 2:
if isinstance(inputs, (list, tuple)):
if len(inputs) == 2:
input_tensor, attention_mask = inputs
key_value = None
elif len(inputs) == 3:
input_tensor, key_value, attention_mask = inputs
else:
input_tensor, attention_mask = (inputs, None)
raise ValueError("Unexpected inputs to %s with length at %d" %
(self.__class__, len(inputs)))
else:
input_tensor, key_value, attention_mask = (inputs, None, None)
if self._output_range:
target_tensor = input_tensor[:, 0:self._output_range, :]
if attention_mask is not None:
attention_mask = attention_mask[:, 0:self._output_range, :]
else:
target_tensor = input_tensor
if key_value is None:
key_value = input_tensor
attention_output = self._attention_layer(
query=target_tensor, value=input_tensor, attention_mask=attention_mask)
query=target_tensor, value=key_value, attention_mask=attention_mask)
attention_output = self._attention_dropout(attention_output)
attention_output = target_tensor + self._rezero_a * attention_output
if self._use_layer_norm:
@@ -225,7 +267,7 @@ class ReZeroTransformer(tf.keras.layers.Layer):
layer_output = self._output_dropout(layer_output)
# During mixed precision training, attention_output is from layer norm and
# is always fp32 for now. Cast layer_output to fp32 for the subsequent add.
layer_output = attention_output + tf.cast(self._rezero_a * layer_output,
layer_output = attention_output + tf.cast(self._rezero_a_ffn * layer_output,
tf.float32)
if self._use_layer_norm:
layer_output = self._output_layer_norm(layer_output)
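# Descriptive note: each sublayer follows the ReZero residual form
# y = x + alpha * sublayer(x), with alpha initialized to zero so the block
# starts as the identity. With share_rezero=True the attention and
# feed-forward sublayers reuse the same alpha (self._rezero_a); with
# share_rezero=False the feed-forward path uses its own self._rezero_a_ffn.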
@@ -14,6 +14,7 @@
"""Tests for Keras-based rezero-transformer block layer."""
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
@@ -30,12 +31,15 @@ class TransformerWithReZeroLayerTest(keras_parameterized.TestCase):
super(TransformerWithReZeroLayerTest, self).tearDown()
tf.keras.mixed_precision.set_global_policy('float32')
def test_layer_invocation_with_float16_dtype(self):
@parameterized.named_parameters(('no_share_attn_ffn', False),
('share_attn_ffn', True))
def test_layer_invocation_with_float16_dtype(self, share_rezero):
tf.keras.mixed_precision.set_global_policy('mixed_float16')
test_layer = rezero_transformer.ReZeroTransformer(
num_attention_heads=10,
intermediate_size=2048,
intermediate_activation='relu')
intermediate_activation='relu',
share_rezero=share_rezero)
sequence_length = 21
width = 80
# Create a 3-dimensional input (the first dimension is implicit).
@@ -124,6 +128,20 @@ class TransformerWithReZeroLayerTest(keras_parameterized.TestCase):
new_output_tensor = new_layer([input_data, mask_data])
self.assertAllClose(new_output_tensor, output_tensor[:, 0:1, :])
def test_separate_qkv(self):
test_layer = rezero_transformer.ReZeroTransformer(
num_attention_heads=2,
intermediate_size=128,
intermediate_activation='relu',
kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02))
# Forward path.
q_tensor = tf.zeros([2, 4, 16], dtype=tf.float32)
kv_tensor = tf.zeros([2, 8, 16], dtype=tf.float32)
dummy_mask = tf.zeros([2, 4, 8], dtype=tf.float32)
inputs = [q_tensor, kv_tensor, dummy_mask]
output = test_layer(inputs)
self.assertEqual(output.shape, q_tensor.shape)
if __name__ == '__main__':
tf.test.main()
@@ -13,18 +13,22 @@
# limitations under the License.
"""Keras Layers for BERT-specific preprocessing."""
# pylint: disable=g-import-not-at-top
from typing import Any, Dict, List, Optional, Union
from absl import logging
import tensorflow as tf
try:
import tensorflow_text as text # pylint: disable=g-import-not-at-top
import tensorflow_text as text
from tensorflow_text.python.ops import bert_tokenizer
except ImportError:
text = None
bert_tokenizer = None
except tf.errors.NotFoundError as e:
logging.warn("Encountered error when importing tensorflow_text: %s", e)
text = None
bert_tokenizer = None
def _check_if_tf_text_installed():
@@ -587,3 +591,139 @@ class BertPackInputs(tf.keras.layers.Layer):
return dict(input_word_ids=_reshape(input_word_ids),
input_mask=_reshape(input_mask),
input_type_ids=_reshape(input_type_ids))
class FastWordpieceBertTokenizer(tf.keras.layers.Layer):
"""A bert tokenizer keras layer using text.FastWordpieceTokenizer.
See details: "Fast WordPiece Tokenization" (https://arxiv.org/abs/2012.15524)
"""
def __init__(self,
*,
vocab_file: str,
lower_case: bool,
tokenize_with_offsets: bool = False,
**kwargs):
"""Initializes a FastWordpieceBertTokenizer layer.
Args:
vocab_file: A Python string with the path of the vocabulary file. This is
a text file with newline-separated wordpiece tokens. This layer loads
a list of tokens from it to create text.FastWordpieceTokenizer.
lower_case: A Python boolean forwarded to text.BasicTokenizer. If true,
input text is converted to lower case (where applicable) before
tokenization. This must be set to match the way in which the vocab_file
was created.
tokenize_with_offsets: A Python boolean. If true, this layer calls
FastWordpieceTokenizer.tokenize_with_offsets() instead of plain
.tokenize() and outputs a triple of (tokens, start_offsets,
limit_offsets) instead of just tokens.
**kwargs: standard arguments to Layer().
"""
super().__init__(**kwargs)
logging.info("Initialize a FastWordpieceBertTokenizer.")
self.tokenize_with_offsets = tokenize_with_offsets
self._basic_tokenizer = bert_tokenizer.BasicTokenizer(lower_case=lower_case)
# Read the vocab file into a list of tokens to create `fast_wp_tokenizer`.
self._vocab = [line.rstrip() for line in tf.io.gfile.GFile(vocab_file)]
self._fast_wp_tokenizer = text.FastWordpieceTokenizer(
vocab=self._vocab, token_out_type=tf.int32, no_pretokenization=True)
self._special_tokens_dict = self._create_special_tokens_dict()
@property
def vocab_size(self):
return len(self._vocab)
def get_config(self):
# Skip in tf.saved_model.save(); fail if called directly.
# We cannot just put the original, user-supplied vocab file name into
# the config, because the path has to change as the SavedModel is copied
# around.
raise NotImplementedError("Not implemented yet.")
def get_special_tokens_dict(self):
"""Returns dict of token ids, keyed by standard names for their purpose.
Returns:
A dict from Python strings to Python integers. Each key is a standard
name for a special token describing its use. (For example, "padding_id"
is what BERT traditionally calls "[PAD]" but others may call "<pad>".)
The corresponding value is the integer token id. If a special token
is not found, its entry is omitted from the dict.
The supported keys and tokens are:
* start_of_sequence_id: looked up from "[CLS]"
* end_of_segment_id: looked up from "[SEP]"
* padding_id: looked up from "[PAD]"
* mask_id: looked up from "[MASK]"
* vocab_size: one past the largest token id used
"""
return self._special_tokens_dict
def _create_special_tokens_dict(self):
"""Creates dict of token ids, keyed by standard names for their purpose."""
special_tokens = {"vocab_size": self.vocab_size}
def add_special_token(key, token):
try:
token_id = self._vocab.index(token)
special_tokens[key] = token_id
except ValueError:
# As in nlp.modeling.layers.BertTokenizer, if a special token
# is not found, its entry is omitted from the dict.
logging.warning("Could not find %s as token \"%s\" in vocab file", key,
token)
add_special_token("start_of_sequence_id", "[CLS]")
add_special_token("end_of_segment_id", "[SEP]")
add_special_token("padding_id", "[PAD]")
add_special_token("mask_id", "[MASK]")
return special_tokens
def _tokenize_with_offsets(self, text_input: tf.Tensor):
tokens, begin, _ = self._basic_tokenizer.tokenize_with_offsets(text_input)
wordpieces, wp_begin, wp_end = (
self._fast_wp_tokenizer.tokenize_with_offsets(tokens))
begin_expanded = tf.expand_dims(begin, axis=2)
final_begin = begin_expanded + wp_begin
final_end = begin_expanded + wp_end
return wordpieces, final_begin, final_end
def _tokenize(self, text_input: tf.Tensor):
tokens = self._basic_tokenizer.tokenize(text_input)
return self._fast_wp_tokenizer.tokenize(tokens)
def call(self, inputs: tf.Tensor):
"""Calls text.BertTokenizer on inputs.
Args:
inputs: A string Tensor of shape [batch_size].
Returns:
One or three RaggedTensors if tokenize_with_offsets is False or True,
respectively. These are
tokens: A RaggedTensor of shape [batch_size, (words), (pieces_per_word)]
and type int32. tokens[i,j,k] contains the k-th wordpiece of the
j-th word in the i-th input.
start_offsets, limit_offsets: If tokenize_with_offsets is True,
RaggedTensors of type int64 with the same indices as tokens.
Element [i,j,k] contains the byte offset at the start, or past the
end, resp., for the k-th wordpiece of the j-th word in the i-th input.
"""
# Prepare to reshape the result to work around broken shape inference.
batch_size = tf.shape(inputs)[0]
def _reshape(rt):
values = rt.values
row_splits = rt.row_splits
row_splits = tf.reshape(row_splits, [batch_size + 1])
return tf.RaggedTensor.from_row_splits(values, row_splits)
if self.tokenize_with_offsets:
tokens, start_offsets, limit_offsets = self._tokenize_with_offsets(inputs)
return _reshape(tokens), _reshape(start_offsets), _reshape(limit_offsets)
else:
tokens = self._tokenize(inputs)
return _reshape(tokens)
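# A minimal usage sketch (the vocab path and inputs are assumptions for
# illustration): given a newline-separated wordpiece vocab file, the layer maps
# a batch of raw strings to a RaggedTensor of token ids shaped
# [batch_size, (words), (pieces_per_word)]:
#   tokenizer = FastWordpieceBertTokenizer(
#       vocab_file="/tmp/vocab.txt", lower_case=True)
#   token_ids = tokenizer(tf.constant(["abc def"]))
# With tokenize_with_offsets=True, the call returns
# (token_ids, start_offsets, limit_offsets) instead.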
@@ -442,5 +442,109 @@ class BertPackInputsTest(tf.test.TestCase):
[1001, 21, 22, 23, 24, 25, 26, 27, 28, 1002]]))
# This test covers the in-process behavior of FastWordpieceBertTokenizer layer.
class FastWordPieceBertTokenizerTest(tf.test.TestCase):
def _make_vocab_file(self, vocab, filename="vocab.txt"):
path = os.path.join(
tempfile.mkdtemp(dir=self.get_temp_dir()), # New subdir each time.
filename)
with tf.io.gfile.GFile(path, "w") as f:
f.write("\n".join(vocab + [""]))
return path
def test_uncased(self):
vocab_file = self._make_vocab_file(
["[PAD]", "[UNK]", "[CLS]", "[SEP]", "d", "##ef", "abc", "xy"])
bert_tokenize = text_layers.FastWordpieceBertTokenizer(
vocab_file=vocab_file, lower_case=True)
inputs = tf.constant(["abc def", "ABC DEF d"])
token_ids = bert_tokenize(inputs)
self.assertAllEqual(token_ids, tf.ragged.constant([[[6], [4, 5]],
[[6], [4, 5], [4]]]))
bert_tokenize.tokenize_with_offsets = True
token_ids_2, start_offsets, limit_offsets = bert_tokenize(inputs)
self.assertAllEqual(token_ids, token_ids_2)
self.assertAllEqual(start_offsets, tf.ragged.constant([[[0], [4, 5]],
[[0], [4, 5], [8]]]))
self.assertAllEqual(limit_offsets, tf.ragged.constant([[[3], [5, 7]],
[[3], [5, 7], [9]]]))
self.assertEqual(bert_tokenize.vocab_size, 8)
# Repeat the above and test that case matters with lower_case=False.
def test_cased(self):
vocab_file = self._make_vocab_file(
["[PAD]", "[UNK]", "[CLS]", "[SEP]", "d", "##ef", "abc", "ABC"])
bert_tokenize = text_layers.FastWordpieceBertTokenizer(
vocab_file=vocab_file, lower_case=False, tokenize_with_offsets=True)
inputs = tf.constant(["abc def", "ABC DEF"])
token_ids, start_offsets, limit_offsets = bert_tokenize(inputs)
self.assertAllEqual(token_ids, tf.ragged.constant([[[6], [4, 5]],
[[7], [1]]]))
self.assertAllEqual(start_offsets, tf.ragged.constant([[[0], [4, 5]],
[[0], [4]]]))
self.assertAllEqual(limit_offsets, tf.ragged.constant([[[3], [5, 7]],
[[3], [7]]]))
def test_special_tokens_complete(self):
vocab_file = self._make_vocab_file(
["foo", "[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "xy"])
bert_tokenize = text_layers.FastWordpieceBertTokenizer(
vocab_file=vocab_file, lower_case=True)
self.assertDictEqual(bert_tokenize.get_special_tokens_dict(),
dict(padding_id=1,
start_of_sequence_id=3,
end_of_segment_id=4,
mask_id=5,
vocab_size=7))
def test_special_tokens_partial(self):
# [UNK] token is required by fast wordpiece tokenizer.
vocab_file = self._make_vocab_file(
["[PAD]", "[CLS]", "[SEP]", "[UNK]"])
bert_tokenize = text_layers.FastWordpieceBertTokenizer(
vocab_file=vocab_file, lower_case=True)
self.assertDictEqual(bert_tokenize.get_special_tokens_dict(),
dict(padding_id=0,
start_of_sequence_id=1,
end_of_segment_id=2,
vocab_size=4))  # No mask_id.
def test_special_tokens_in_estimator(self):
"""Tests getting special tokens without an Eager init context."""
vocab_file = self._make_vocab_file(
["[PAD]", "[UNK]", "[CLS]", "[SEP]", "d", "##ef", "abc", "xy"])
def input_fn():
with tf.init_scope():
self.assertFalse(tf.executing_eagerly())
# Build a preprocessing Model.
sentences = tf.keras.layers.Input(shape=[], dtype=tf.string)
bert_tokenizer = text_layers.FastWordpieceBertTokenizer(
vocab_file=vocab_file, lower_case=True)
special_tokens_dict = bert_tokenizer.get_special_tokens_dict()
for k, v in special_tokens_dict.items():
self.assertIsInstance(v, int, "Unexpected type for {}".format(k))
tokens = bert_tokenizer(sentences)
packed_inputs = text_layers.BertPackInputs(
4, special_tokens_dict=special_tokens_dict)(tokens)
preprocessing = tf.keras.Model(sentences, packed_inputs)
# Map the dataset.
ds = tf.data.Dataset.from_tensors(
(tf.constant(["abc", "DEF"]), tf.constant([0, 1])))
ds = ds.map(lambda features, labels: (preprocessing(features), labels))
return ds
def model_fn(features, labels, mode):
del labels # Unused.
return tf.estimator.EstimatorSpec(mode=mode,
predictions=features["input_word_ids"])
estimator = tf.estimator.Estimator(model_fn=model_fn)
outputs = list(estimator.predict(input_fn))
self.assertAllEqual(outputs, np.array([[2, 6, 3, 0],
[2, 4, 5, 3]]))
if __name__ == "__main__":
tf.test.main()
@@ -16,6 +16,8 @@
import tensorflow as tf
from official.nlp.modeling.layers import util
@tf.keras.utils.register_keras_serializable(package="Text")
class TransformerEncoderBlock(tf.keras.layers.Layer):
@@ -86,8 +88,9 @@ class TransformerEncoderBlock(tf.keras.layers.Layer):
kernel.
attention_axes: axes over which the attention is applied. `None` means
attention over all axes, but batch, heads, and features.
**kwargs: keyword arguments/
**kwargs: keyword arguments.
"""
util.filter_kwargs(kwargs)
super().__init__(**kwargs)
self._num_heads = num_attention_heads
@@ -30,13 +30,13 @@ class TfFunctionIfEagerDecorator(object):
@functools.wraps(func)
def wrapped_func(*args):
# TODO(b/150147476, b/150024785): Fix tf.function in TF1 crash.
if not hasattr(tf.compat.v1, "executing_eagerly_outside_functions"
if not hasattr(tf.compat.v1, 'executing_eagerly_outside_functions'
) or tf.compat.v1.executing_eagerly_outside_functions():
return tf.function(func=func, **self.func_kwargs)(*args)
return func(*args)
# Cache the created function in self._call_impl.
if not hasattr(self, "_call_impl"):
if not hasattr(self, '_call_impl'):
self._call_impl = wrapped_func
return self._call_impl
@@ -44,3 +44,29 @@
def tf_function_if_eager(**kwargs):
"""Applies the @tf.function decorator only if running in eager mode."""
return TfFunctionIfEagerDecorator(**kwargs)
def filter_kwargs(kwargs):
"""In place removes unused options in kwargs.
This function removes the construction signatures: e.g.
number_attention_heads... in TransformerEncoderBlock. This is needed,
otherwise base_layer.py in Keras will complain.
Args:
kwargs: keyword arguments to be filtered.
"""
# This is the union of the signatures of TransformerEncoderBlock and
# ReZeroTransformer. Every Transformer block that uses a signature compatible
# with TransformerEncoderBlock should call this function before the base
# constructor super().__init__(**kwargs).
denylist = [
'num_attention_heads', 'intermediate_size', 'intermediate_activation',
'inner_dim', 'inner_activation', 'output_range', 'kernel_initializer',
'bias_initializer', 'kernel_regularizer', 'bias_regularizer',
'activity_regularizer', 'kernel_constraint', 'bias_constraint',
'use_bias', 'norm_first', 'norm_epsilon', 'output_dropout',
'attention_dropout', 'inner_dropout', 'attention_initializer',
'attention_axes', 'share_rezero'
]
for unused_key in denylist:
kwargs.pop(unused_key, None)
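# Example (a sketch; the values are illustrative): a block that forwards its
# full config to the Keras base class can drop the block-specific options
# first, leaving only arguments that tf.keras.layers.Layer understands.
#   kwargs = dict(num_attention_heads=8, inner_dim=2048, name="block")
#   filter_kwargs(kwargs)  # kwargs is now {"name": "block"}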
@@ -260,11 +260,9 @@ class Seq2SeqTransformer(tf.keras.Model):
return {"outputs": top_decoded_ids, "scores": top_scores}
decoder_inputs = self.embedding_lookup(targets)
embedding_mask = tf.cast(tf.not_equal(targets, 0), decoder_inputs.dtype)
decoder_inputs *= tf.expand_dims(embedding_mask, -1)
# Shift targets to the right, and remove the last element
decoder_inputs = tf.pad(decoder_inputs, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
targets = tf.pad(targets, [[0, 0], [1, 0]])[:, :-1]
decoder_inputs = self.embedding_lookup(targets)
length = tf.shape(decoder_inputs)[1]
pos_encoding = self.position_embedding(decoder_inputs)
pos_encoding = tf.cast(pos_encoding, embedded_inputs.dtype)
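# Note on the shift above: padding one step on the left and dropping the last
# column turns targets such as [[5, 7, 9]] into [[0, 5, 7]], so the embedding
# fed at position t only depends on tokens before t (teacher forcing), with
# the pad id 0 acting as the start placeholder.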
@@ -325,12 +323,7 @@ class Seq2SeqTransformer(tf.keras.Model):
decoder_input = ids[:, -1:]
# Preprocess decoder input by getting embeddings and adding timing signal.
# decoder_input = self.embedding_softmax_layer(decoder_input)
source_decoder_input = decoder_input
decoder_input = self.embedding_lookup(decoder_input)
embedding_mask = tf.cast(
tf.not_equal(source_decoder_input, 0), decoder_input.dtype)
decoder_input *= tf.expand_dims(embedding_mask, -1)
decoder_input += timing_signal[i]
if self._padded_decode:
# indexing does not work on TPU.
@@ -20,29 +20,30 @@ import numpy as np
import tensorflow as tf
from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import
from official.nlp.modeling.networks import bert_dense_encoder
from official.nlp.modeling.networks import bert_encoder
# This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
# guarantees forward compatibility of this code for the V2 switchover.
@keras_parameterized.run_all_keras_modes
class BertDenseEncoderTest(keras_parameterized.TestCase):
class BertEncoderV2Test(keras_parameterized.TestCase):
def tearDown(self):
super(BertDenseEncoderTest, self).tearDown()
super(BertEncoderV2Test, self).tearDown()
tf.keras.mixed_precision.set_global_policy("float32")
def test_dict_outputs_network_creation(self):
hidden_size = 32
sequence_length = 21
dense_sequence_length = 20
# Create a small dense BertDenseEncoder for testing.
# Create a small dense BertEncoderV2 for testing.
kwargs = {}
test_network = bert_dense_encoder.BertDenseEncoder(
test_network = bert_encoder.BertEncoderV2(
vocab_size=100,
hidden_size=hidden_size,
num_attention_heads=2,
num_layers=3,
with_dense_inputs=True,
**kwargs)
# Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
@@ -86,12 +87,13 @@ class BertDenseEncoderTest(keras_parameterized.TestCase):
sequence_length = 21
dense_sequence_length = 20
# Create a small BertEncoder for testing.
test_network = bert_dense_encoder.BertDenseEncoder(
test_network = bert_encoder.BertEncoderV2(
vocab_size=100,
hidden_size=hidden_size,
num_attention_heads=2,
num_layers=3,
dict_outputs=True)
dict_outputs=True,
with_dense_inputs=True)
# Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
@@ -134,12 +136,13 @@ class BertDenseEncoderTest(keras_parameterized.TestCase):
dense_sequence_length = 20
tf.keras.mixed_precision.set_global_policy("mixed_float16")
# Create a small BertEncoder for testing.
test_network = bert_dense_encoder.BertDenseEncoder(
test_network = bert_encoder.BertEncoderV2(
vocab_size=100,
hidden_size=hidden_size,
num_attention_heads=2,
num_layers=3,
dict_outputs=True)
dict_outputs=True,
with_dense_inputs=True)
# Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
@@ -176,9 +179,8 @@ class BertDenseEncoderTest(keras_parameterized.TestCase):
self.assertAllEqual(tf.float16, pooled.dtype)
@parameterized.named_parameters(
("all_sequence_encoder_v2", bert_dense_encoder.BertDenseEncoder, None,
41),
("output_range_encoder_v2", bert_dense_encoder.BertDenseEncoder, 1, 1),
("all_sequence_encoder_v2", bert_encoder.BertEncoderV2, None, 41),
("output_range_encoder_v2", bert_encoder.BertEncoderV2, 1, 1),
)
def test_dict_outputs_network_invocation(
self, encoder_cls, output_range, out_seq_len):
@@ -195,7 +197,8 @@ class BertDenseEncoderTest(keras_parameterized.TestCase):
num_layers=3,
type_vocab_size=num_types,
output_range=output_range,
dict_outputs=True)
dict_outputs=True,
with_dense_inputs=True)
# Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
@@ -276,7 +279,7 @@ class BertDenseEncoderTest(keras_parameterized.TestCase):
# Creates a BertEncoder with embedding_width != hidden_size
embedding_width = 16
test_network = bert_dense_encoder.BertDenseEncoder(
test_network = bert_encoder.BertEncoderV2(
vocab_size=vocab_size,
hidden_size=hidden_size,
max_sequence_length=max_sequence_length,
@@ -316,11 +319,12 @@ class BertDenseEncoderTest(keras_parameterized.TestCase):
sequence_length = 21
dense_sequence_length = 20
# Create a small BertEncoder for testing.
test_network = bert_dense_encoder.BertDenseEncoder(
test_network = bert_encoder.BertEncoderV2(
vocab_size=100,
hidden_size=hidden_size,
num_attention_heads=2,
num_layers=3)
num_layers=3,
with_dense_inputs=True)
# Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
@@ -23,6 +23,8 @@ from official.nlp.modeling import layers
_Initializer = Union[str, tf.keras.initializers.Initializer]
_Activation = Union[str, Callable[..., Any]]
_approx_gelu = lambda x: tf.keras.activations.gelu(x, approximate=True)
@@ -72,6 +74,7 @@ class BertEncoderV2(tf.keras.layers.Layer):
norm_first: Whether to normalize inputs to attention and intermediate dense
layers. If set False, output of attention and intermediate dense layers is
normalized.
with_dense_inputs: Whether to accept dense embeddings as the input.
"""
def __init__(
@@ -83,7 +86,7 @@ class BertEncoderV2(tf.keras.layers.Layer):
max_sequence_length: int = 512,
type_vocab_size: int = 16,
inner_dim: int = 3072,
inner_activation: Callable[..., Any] = _approx_gelu,
inner_activation: _Activation = _approx_gelu,
output_dropout: float = 0.1,
attention_dropout: float = 0.1,
initializer: _Initializer = tf.keras.initializers.TruncatedNormal(
@@ -92,6 +95,7 @@ class BertEncoderV2(tf.keras.layers.Layer):
embedding_width: Optional[int] = None,
embedding_layer: Optional[tf.keras.layers.Layer] = None,
norm_first: bool = False,
with_dense_inputs: bool = False,
**kwargs):
# Pops kwargs that are used in V1 implementation.
if 'dict_outputs' in kwargs:
@@ -190,7 +194,19 @@ class BertEncoderV2(tf.keras.layers.Layer):
'embedding_width': embedding_width,
'embedding_layer': embedding_layer,
'norm_first': norm_first,
'with_dense_inputs': with_dense_inputs,
}
if with_dense_inputs:
self.inputs = dict(
input_word_ids=tf.keras.Input(shape=(None,), dtype=tf.int32),
input_mask=tf.keras.Input(shape=(None,), dtype=tf.int32),
input_type_ids=tf.keras.Input(shape=(None,), dtype=tf.int32),
dense_inputs=tf.keras.Input(
shape=(None, embedding_width), dtype=tf.float32),
dense_mask=tf.keras.Input(shape=(None,), dtype=tf.int32),
dense_type_ids=tf.keras.Input(shape=(None,), dtype=tf.int32),
)
else:
self.inputs = dict(
input_word_ids=tf.keras.Input(shape=(None,), dtype=tf.int32),
input_mask=tf.keras.Input(shape=(None,), dtype=tf.int32),
@@ -203,11 +219,22 @@ class BertEncoderV2(tf.keras.layers.Layer):
mask = inputs.get('input_mask')
type_ids = inputs.get('input_type_ids')
word_embeddings = inputs.get('input_word_embeddings', None)
dense_inputs = inputs.get('dense_inputs', None)
dense_mask = inputs.get('dense_mask', None)
dense_type_ids = inputs.get('dense_type_ids', None)
else:
raise ValueError('Unexpected inputs type to %s.' % self.__class__)
if word_embeddings is None:
word_embeddings = self._embedding_layer(word_ids)
if dense_inputs is not None:
# Concatenate the dense embeddings at the end of the sequence.
word_embeddings = tf.concat([word_embeddings, dense_inputs], axis=1)
type_ids = tf.concat([type_ids, dense_type_ids], axis=1)
mask = tf.concat([mask, dense_mask], axis=1)
# absolute position embeddings.
position_embeddings = self._position_embedding_layer(word_embeddings)
type_embeddings = self._type_embedding_layer(type_ids)
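# Shape note (descriptive only): word_embeddings of shape
# [batch, seq_len, width] and dense_inputs of shape [batch, dense_len, width]
# are concatenated along the sequence axis, and mask/type_ids are extended the
# same way, so downstream layers see a single sequence of length
# seq_len + dense_len.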
@@ -15,17 +15,32 @@
"""Funnel Transformer network."""
# pylint: disable=g-classes-have-attributes
from typing import Union, Sequence
from typing import Any, Callable, Optional, Union, Sequence
from absl import logging
import numpy as np
import tensorflow as tf
from official.nlp.modeling import layers
_Initializer = Union[str, tf.keras.initializers.Initializer]
_Activation = Union[str, Callable[..., Any]]
_MAX = 'max'
_AVG = 'avg'
_TRUNCATED_AVG = 'truncated_avg'
_transformer_cls2str = {
layers.TransformerEncoderBlock: 'TransformerEncoderBlock',
layers.ReZeroTransformer: 'ReZeroTransformer'
}
_str2transformer_cls = {
'TransformerEncoderBlock': layers.TransformerEncoderBlock,
'ReZeroTransformer': layers.ReZeroTransformer
}
_approx_gelu = lambda x: tf.keras.activations.gelu(x, approximate=True)
def _get_policy_dtype():
try:
@@ -206,29 +221,37 @@ class FunnelTransformerEncoder(tf.keras.layers.Layer):
embeddings for the input word IDs.
norm_first: Whether to normalize inputs to attention and intermediate dense
layers. If set False, output of attention and intermediate dense layers is
normalized.
normalized. This does not apply to ReZero.
transformer_cls: A str or a Keras Layer class. The base Transformer block
that the funnel encoder is built from.
share_rezero: bool. Whether to share the ReZero alpha between the attention
layer and the FFN layer. This option is specific to ReZero.
"""
def __init__(
self,
vocab_size,
hidden_size=768,
num_layers=12,
num_attention_heads=12,
max_sequence_length=512,
type_vocab_size=16,
inner_dim=3072,
inner_activation=lambda x: tf.keras.activations.gelu(x, approximate=True),
output_dropout=0.1,
attention_dropout=0.1,
pool_type=_MAX,
pool_stride=2,
unpool_length=0,
initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
output_range=None,
embedding_width=None,
embedding_layer=None,
norm_first=False,
vocab_size: int,
hidden_size: int = 768,
num_layers: int = 12,
num_attention_heads: int = 12,
max_sequence_length: int = 512,
type_vocab_size: int = 16,
inner_dim: int = 3072,
inner_activation: _Activation = _approx_gelu,
output_dropout: float = 0.1,
attention_dropout: float = 0.1,
pool_type: str = _MAX,
pool_stride: int = 2,
unpool_length: int = 0,
initializer: _Initializer = tf.keras.initializers.TruncatedNormal(
stddev=0.02),
output_range: Optional[int] = None,
embedding_width: Optional[int] = None,
embedding_layer: Optional[tf.keras.layers.Layer] = None,
norm_first: bool = False,
transformer_cls: Union[
str, tf.keras.layers.Layer] = layers.TransformerEncoderBlock,
share_rezero: bool = True,
**kwargs):
super().__init__(**kwargs)
activation = tf.keras.activations.get(inner_activation)
@@ -278,16 +301,22 @@ class FunnelTransformerEncoder(tf.keras.layers.Layer):
self._transformer_layers = []
self._attention_mask_layer = layers.SelfAttentionMask(
name='self_attention_mask')
# Will raise an error if the string is not supported.
if isinstance(transformer_cls, str):
transformer_cls = _str2transformer_cls[transformer_cls]
for i in range(num_layers):
layer = layers.TransformerEncoderBlock(
layer = transformer_cls(
num_attention_heads=num_attention_heads,
intermediate_size=inner_dim,
inner_dim=inner_dim,
intermediate_activation=inner_activation,
inner_activation=inner_activation,
output_dropout=output_dropout,
attention_dropout=attention_dropout,
norm_first=norm_first,
output_range=output_range if i == num_layers - 1 else None,
kernel_initializer=initializer,
share_rezero=share_rezero,
name='transformer/layer_%d' % i)
self._transformer_layers.append(layer)
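# --- Hedged usage sketch (illustrative only, not part of this change) ---
# `transformer_cls` accepts either one of the registered strings above or the
# class itself; strings are resolved through `_str2transformer_cls` before the
# per-layer loop. All hyperparameter values below are made up for the example,
# and the input key names are assumed to follow the BERT-style encoders in
# this repository.
import tensorflow as tf
from official.nlp.modeling.networks import funnel_transformer

encoder = funnel_transformer.FunnelTransformerEncoder(
    vocab_size=30522,
    hidden_size=128,
    num_layers=4,
    num_attention_heads=4,
    inner_dim=512,
    pool_type='truncated_avg',
    pool_stride=2,
    transformer_cls='ReZeroTransformer',
    share_rezero=True)
word_ids = tf.keras.Input(shape=(64,), dtype=tf.int32)
mask = tf.keras.Input(shape=(64,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(64,), dtype=tf.int32)
outputs = encoder(
    dict(input_word_ids=word_ids, input_mask=mask, input_type_ids=type_ids))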
......@@ -333,24 +362,44 @@ class FunnelTransformerEncoder(tf.keras.layers.Layer):
self._pool_type = pool_type
self._config = {
'vocab_size': vocab_size,
'hidden_size': hidden_size,
'num_layers': num_layers,
'num_attention_heads': num_attention_heads,
'max_sequence_length': max_sequence_length,
'type_vocab_size': type_vocab_size,
'inner_dim': inner_dim,
'inner_activation': tf.keras.activations.serialize(activation),
'output_dropout': output_dropout,
'attention_dropout': attention_dropout,
'initializer': tf.keras.initializers.serialize(initializer),
'output_range': output_range,
'embedding_width': embedding_width,
'embedding_layer': embedding_layer,
'norm_first': norm_first,
'pool_type': pool_type,
'pool_stride': pool_stride,
'unpool_length': unpool_length,
        'vocab_size': vocab_size,
        'hidden_size': hidden_size,
        'num_layers': num_layers,
        'num_attention_heads': num_attention_heads,
        'max_sequence_length': max_sequence_length,
        'type_vocab_size': type_vocab_size,
        'inner_dim': inner_dim,
        'inner_activation': tf.keras.activations.serialize(activation),
        'output_dropout': output_dropout,
        'attention_dropout': attention_dropout,
        'initializer': tf.keras.initializers.serialize(initializer),
        'output_range': output_range,
        'embedding_width': embedding_width,
        'embedding_layer': embedding_layer,
        'norm_first': norm_first,
        'pool_type': pool_type,
        'pool_stride': pool_stride,
        'unpool_length': unpool_length,
        'transformer_cls':
            _transformer_cls2str.get(transformer_cls, str(transformer_cls)),
    }
def call(self, inputs):
......
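# --- Hedged serialization sketch (illustrative, not part of this change) ---
# Storing `transformer_cls` in `_config` as a registered string keeps the
# config JSON-serializable; unregistered classes fall back to
# `str(transformer_cls)`, which `__init__` cannot resolve, so only the two
# registered names survive a save/reload round trip. This uses the
# module-level maps and the `layers` import defined earlier in this file.
cls_name = _transformer_cls2str.get(layers.ReZeroTransformer)  # 'ReZeroTransformer'
restored_cls = _str2transformer_cls[cls_name]
assert restored_cls is layers.ReZeroTransformer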
......@@ -38,13 +38,20 @@ class FunnelTransformerEncoderTest(parameterized.TestCase, tf.test.TestCase):
tf.keras.mixed_precision.set_global_policy("float32")
@parameterized.named_parameters(
("mix_truncated_avg", "mixed_float16", tf.float16, "truncated_avg"),
("float32_truncated_avg", "float32", tf.float32, "truncated_avg"),
("mix_max", "mixed_float16", tf.float16, "max"),
("float32_max", "float32", tf.float32, "max"),
("mix_avg", "mixed_float16", tf.float16, "avg"),
("float32_avg", "float32", tf.float32, "avg"))
def test_network_creation(self, policy, pooled_dtype, pool_type):
("mix_truncated_avg_rezero", "mixed_float16", tf.float16, "truncated_avg",
"ReZeroTransformer"), ("float32_truncated_avg_rezero", "float32",
tf.float32, "truncated_avg", "ReZeroTransformer"),
("mix_truncated_avg", "mixed_float16", tf.float16, "truncated_avg",
"TransformerEncoderBlock"),
("float32_truncated_avg", "float32", tf.float32, "truncated_avg",
"TransformerEncoderBlock"), ("mix_max", "mixed_float16", tf.float16,
"max", "TransformerEncoderBlock"),
("float32_max", "float32", tf.float32, "max", "TransformerEncoderBlock"),
("mix_avg", "mixed_float16", tf.float16, "avg",
"TransformerEncoderBlock"),
("float32_avg", "float32", tf.float32, "avg", "TransformerEncoderBlock"))
def test_network_creation(self, policy, pooled_dtype, pool_type,
transformer_cls):
tf.keras.mixed_precision.set_global_policy(policy)
hidden_size = 32
......@@ -60,7 +67,8 @@ class FunnelTransformerEncoderTest(parameterized.TestCase, tf.test.TestCase):
pool_stride=pool_stride,
pool_type=pool_type,
max_sequence_length=sequence_length,
unpool_length=0)
unpool_length=0,
transformer_cls=transformer_cls)
# Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
......@@ -253,7 +261,8 @@ class FunnelTransformerEncoderTest(parameterized.TestCase, tf.test.TestCase):
norm_first=False,
pool_type="max",
pool_stride=2,
unpool_length=0)
unpool_length=0,
transformer_cls="TransformerEncoderBlock")
network = funnel_transformer.FunnelTransformerEncoder(**kwargs)
expected_config = dict(kwargs)
expected_config["inner_activation"] = tf.keras.activations.serialize(
......
......@@ -13,12 +13,14 @@
# limitations under the License.
"""A binary/library to export TF-NLP serving `SavedModel`."""
import dataclasses
import os
from typing import Any, Dict, Text
from absl import app
from absl import flags
import dataclasses
import yaml
from official.core import base_task
from official.core import task_factory
from official.modeling import hyperparams
......@@ -29,6 +31,7 @@ from official.nlp.tasks import masked_lm
from official.nlp.tasks import question_answering
from official.nlp.tasks import sentence_prediction
from official.nlp.tasks import tagging
from official.nlp.tasks import translation
FLAGS = flags.FLAGS
......@@ -40,7 +43,9 @@ SERVING_MODULES = {
question_answering.QuestionAnsweringTask:
serving_modules.QuestionAnswering,
tagging.TaggingTask:
serving_modules.Tagging
serving_modules.Tagging,
translation.TranslationTask:
serving_modules.Translation
}
......@@ -60,6 +65,10 @@ def define_flags():
flags.DEFINE_string(
"function_keys", None,
"A string key to retrieve pre-defined serving signatures.")
flags.DEFINE_string(
"module_key", None,
"For multi-task case, load the export module weights from a specific "
"checkpoint item.")
flags.DEFINE_bool("convert_tpu", False, "")
flags.DEFINE_multi_integer("allowed_batch_size", None,
"Allowed batch sizes for batching ops.")
......@@ -116,7 +125,8 @@ def main(_):
export_module,
function_keys=[FLAGS.function_keys],
checkpoint_path=FLAGS.checkpoint_path,
export_savedmodel_dir=FLAGS.export_savedmodel_dir)
export_savedmodel_dir=FLAGS.export_savedmodel_dir,
module_key=FLAGS.module_key)
if FLAGS.convert_tpu:
# pylint: disable=g-import-not-at-top
......
......@@ -13,24 +13,21 @@
# limitations under the License.
"""Common library to export a SavedModel from the export module."""
import os
import time
from typing import Dict, List, Optional, Text, Union
from absl import logging
import tensorflow as tf
from official.core import export_base
MAX_DIRECTORY_CREATION_ATTEMPTS = 10
get_timestamped_export_dir = export_base.get_timestamped_export_dir
def export(export_module: export_base.ExportModule,
function_keys: Union[List[Text], Dict[Text, Text]],
export_savedmodel_dir: Text,
checkpoint_path: Optional[Text] = None,
timestamped: bool = True) -> Text:
timestamped: bool = True,
module_key: Optional[Text] = None) -> Text:
"""Exports to SavedModel format.
Args:
......@@ -41,6 +38,8 @@ def export(export_module: export_base.ExportModule,
export_savedmodel_dir: Output saved model directory.
checkpoint_path: Object-based checkpoint path or directory.
timestamped: Whether to export the savedmodel to a timestamped directory.
    module_key: Optional string naming the checkpoint item under which the
      export module's model should be loaded (used for multi-task checkpoints).
Returns:
The savedmodel directory path.
......@@ -48,37 +47,16 @@ def export(export_module: export_base.ExportModule,
save_options = tf.saved_model.SaveOptions(function_aliases={
'tpu_candidate': export_module.serve,
})
return export_base.export(export_module, function_keys, export_savedmodel_dir,
checkpoint_path, timestamped, save_options)
def get_timestamped_export_dir(export_dir_base):
"""Builds a path to a new subdirectory within the base directory.
Args:
export_dir_base: A string containing a directory to write the exported graph
and checkpoints.
Returns:
The full path of the new subdirectory (which is not actually created yet).
Raises:
RuntimeError: if repeated attempts fail to obtain a unique timestamped
directory name.
"""
attempts = 0
while attempts < MAX_DIRECTORY_CREATION_ATTEMPTS:
timestamp = int(time.time())
result_dir = os.path.join(export_dir_base, str(timestamp))
if not tf.io.gfile.exists(result_dir):
# Collisions are still possible (though extremely unlikely): this
# directory is not actually created yet, but it will be almost
# instantly on return from this function.
return result_dir
time.sleep(1)
attempts += 1
logging.warning('Directory %s already exists; retrying (attempt %s/%s)',
str(result_dir), attempts, MAX_DIRECTORY_CREATION_ATTEMPTS)
raise RuntimeError('Failed to obtain a unique export directory name after '
f'{MAX_DIRECTORY_CREATION_ATTEMPTS} attempts.')
if module_key:
kwargs = {module_key: export_module.model}
checkpoint = tf.train.Checkpoint(**kwargs)
else:
checkpoint = None
return export_base.export(
export_module,
function_keys,
export_savedmodel_dir,
checkpoint_path,
timestamped,
save_options,
checkpoint=checkpoint)
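# --- Hedged usage sketch (illustrative, not part of this change) ---
# `module_key` matters when the checkpoint was written by a multi-task job
# that nests each task's model under its own key, e.g. something like
# tf.train.Checkpoint(translation=translation_model, ...). Wrapping the export
# model under the same key lets the restore inside export_base.export pick up
# only that item. The module instance, paths, and key name are hypothetical.
export_dir = export(
    export_module=my_export_module,  # a built export_base.ExportModule
    function_keys=['serve_text'],
    export_savedmodel_dir='/tmp/translation_savedmodel',
    checkpoint_path='/path/to/multitask/ckpt-100',
    module_key='translation')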
......@@ -14,10 +14,12 @@
"""Serving export modules for TF Model Garden NLP models."""
# pylint:disable=missing-class-docstring
import dataclasses
from typing import Dict, List, Optional, Text
import dataclasses
import tensorflow as tf
import tensorflow_text as tf_text
from official.core import export_base
from official.modeling.hyperparams import base_config
from official.nlp.data import sentence_prediction_dataloader
......@@ -407,3 +409,48 @@ class Tagging(export_base.ExportModule):
signatures[signature_key] = self.serve_examples.get_concrete_function(
tf.TensorSpec(shape=[None], dtype=tf.string, name="examples"))
return signatures
class Translation(export_base.ExportModule):
"""The export module for the translation task."""
@dataclasses.dataclass
class Params(base_config.Config):
sentencepiece_model_path: str = ""
def __init__(self, params, model: tf.keras.Model, inference_step=None):
super().__init__(params, model, inference_step)
self._sp_tokenizer = tf_text.SentencepieceTokenizer(
model=tf.io.gfile.GFile(params.sentencepiece_model_path, "rb").read(),
add_eos=True)
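    # With add_eos=True, tokenizing the empty string should produce exactly one
    # token, the EOS id; `.item()` below relies on that single-element result.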
try:
empty_str_tokenized = self._sp_tokenizer.tokenize("").numpy()
except tf.errors.InternalError:
raise ValueError(
"EOS token not in tokenizer vocab."
"Please make sure the tokenizer generates a single token for an "
"empty string.")
self._eos_id = empty_str_tokenized.item()
@tf.function
def serve(self, inputs) -> Dict[str, tf.Tensor]:
return self.inference_step(inputs)
@tf.function
  def serve_text(self, text: tf.Tensor) -> tf.Tensor:
tokenized = self._sp_tokenizer.tokenize(text).to_tensor(0)
return self._sp_tokenizer.detokenize(
self.serve({"inputs": tokenized})["outputs"])
def get_inference_signatures(self, function_keys: Dict[Text, Text]):
signatures = {}
valid_keys = ("serve_text")
for func_key, signature_key in function_keys.items():
if func_key not in valid_keys:
raise ValueError("Invalid function key for the module: %s with key %s. "
"Valid keys are: %s" %
(self.__class__, func_key, valid_keys))
if func_key == "serve_text":
signatures[signature_key] = self.serve_text.get_concrete_function(
tf.TensorSpec(shape=[None], dtype=tf.string, name="text"))
return signatures
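# --- Hedged end-to-end sketch (illustrative, not part of this change) ---
# Exporting a built Translation module and invoking the text signature on the
# reloaded SavedModel. The export helper import, the `translation_module`
# instance, and all paths are assumptions; the signature input name 'text'
# comes from the TensorSpec above.
import tensorflow as tf
from official.nlp.serving import export_savedmodel_util  # assumed module path

export_dir = export_savedmodel_util.export(
    translation_module,  # a built serving_modules.Translation instance
    function_keys={'serve_text': 'serving_default'},
    export_savedmodel_dir='/tmp/translation_export')
reloaded = tf.saved_model.load(export_dir)
outputs = reloaded.signatures['serving_default'](
    text=tf.constant(['abcd', 'ef gh']))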
......@@ -15,8 +15,11 @@
"""Tests for nlp.serving.serving_modules."""
import os
from absl.testing import parameterized
import tensorflow as tf
from sentencepiece import SentencePieceTrainer
from official.nlp.configs import bert
from official.nlp.configs import encoders
from official.nlp.serving import serving_modules
......@@ -24,6 +27,7 @@ from official.nlp.tasks import masked_lm
from official.nlp.tasks import question_answering
from official.nlp.tasks import sentence_prediction
from official.nlp.tasks import tagging
from official.nlp.tasks import translation
def _create_fake_serialized_examples(features_dict):
......@@ -59,6 +63,33 @@ def _create_fake_vocab_file(vocab_file_path):
outfile.write("\n".join(tokens))
def _train_sentencepiece(input_path, vocab_size, model_path, eos_id=1):
argstr = " ".join([
f"--input={input_path}", f"--vocab_size={vocab_size}",
"--character_coverage=0.995",
f"--model_prefix={model_path}", "--model_type=bpe",
"--bos_id=-1", "--pad_id=0", f"--eos_id={eos_id}", "--unk_id=2"
])
SentencePieceTrainer.Train(argstr)
def _generate_line_file(filepath, lines):
with tf.io.gfile.GFile(filepath, "w") as f:
for l in lines:
f.write("{}\n".format(l))
def _make_sentencepiece(output_dir):
  src_lines = ["abc ede fg", "bbcd ef a g", "de f a a g"]
  tgt_lines = ["dd cc a ef g", "bcd ef a g", "gef cd ba"]
  sentencepiece_input_path = os.path.join(output_dir, "inputs.txt")
  _generate_line_file(sentencepiece_input_path, src_lines + tgt_lines)
  sentencepiece_model_prefix = os.path.join(output_dir, "sp")
  _train_sentencepiece(sentencepiece_input_path, 11, sentencepiece_model_prefix)
  sentencepiece_model_path = "{}.model".format(sentencepiece_model_prefix)
  return sentencepiece_model_path
class ServingModulesTest(tf.test.TestCase, parameterized.TestCase):
@parameterized.parameters(
......@@ -312,6 +343,31 @@ class ServingModulesTest(tf.test.TestCase, parameterized.TestCase):
with self.assertRaises(ValueError):
_ = export_module.get_inference_signatures({"foo": None})
def test_translation(self):
    sp_path = _make_sentencepiece(self.get_temp_dir())
encdecoder = translation.EncDecoder(
num_attention_heads=4, intermediate_size=256)
config = translation.TranslationConfig(
model=translation.ModelConfig(
encoder=encdecoder,
decoder=encdecoder,
embedding_width=256,
padded_decode=False,
decode_max_length=100),
sentencepiece_model_path=sp_path,
)
task = translation.TranslationTask(config)
model = task.build_model()
params = serving_modules.Translation.Params(
sentencepiece_model_path=sp_path)
export_module = serving_modules.Translation(params=params, model=model)
functions = export_module.get_inference_signatures({
"serve_text": "serving_default"
})
outputs = functions["serving_default"](tf.constant(["abcd", "ef gh"]))
self.assertEqual(outputs.shape, (2,))
self.assertEqual(outputs.dtype, tf.string)
if __name__ == "__main__":
tf.test.main()