Unverified Commit f16a7b5b authored by vedanshu's avatar vedanshu Committed by GitHub
Browse files

Merge pull request #1 from tensorflow/master

new pull
parents 8e9296ff 8f58f396
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -11,12 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for Keras-based gated feedforward layer."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
"""Tests for Keras-based gated feedforward layer."""
from absl.testing import parameterized
import numpy as np
......@@ -33,7 +29,7 @@ class GatedFeedforwardTest(keras_parameterized.TestCase):
def tearDown(self):
super(GatedFeedforwardTest, self).tearDown()
tf.keras.mixed_precision.experimental.set_policy("float32")
tf.keras.mixed_precision.set_global_policy("float32")
@parameterized.parameters(
(True, 1, "after_residual", "float32"),
......@@ -46,7 +42,7 @@ class GatedFeedforwardTest(keras_parameterized.TestCase):
(False, 1, "before_residual", "mixed_float16"),
)
def test_layer_creation(self, use_gate, num_blocks, dropout_position, dtype):
tf.keras.mixed_precision.experimental.set_policy(dtype)
tf.keras.mixed_precision.set_global_policy(dtype)
kwargs = dict(
intermediate_size=128,
intermediate_activation="relu",
......@@ -78,7 +74,7 @@ class GatedFeedforwardTest(keras_parameterized.TestCase):
)
def test_layer_invocation(self, use_gate, num_blocks, dropout_position,
dtype):
tf.keras.mixed_precision.experimental.set_policy(dtype)
tf.keras.mixed_precision.set_global_policy(dtype)
kwargs = dict(
intermediate_size=16,
intermediate_activation="relu",
......@@ -123,5 +119,6 @@ class GatedFeedforwardTest(keras_parameterized.TestCase):
# If the serialization was successful, the new config should match the old.
self.assertAllEqual(test_layer.get_config(), new_layer.get_config())
if __name__ == "__main__":
tf.test.main()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Definitions for random feature Gaussian process layer."""
import math
import tensorflow as tf
# Likelihood families accepted by `LaplaceRandomFeatureCovariance` when
# computing the Laplace approximation to the posterior covariance.
_SUPPORTED_LIKELIHOOD = ('binary_logistic', 'poisson', 'gaussian')
class RandomFeatureGaussianProcess(tf.keras.layers.Layer):
  """Gaussian process layer with random feature approximation [1].

  During training, the model updates the maximum a posteriori (MAP) logits
  estimates and posterior precision matrix using minibatch statistics. During
  inference, the model divides the MAP logit estimates by the predictive
  standard deviation, which is equivalent to approximating the posterior mean
  of the predictive probability via the mean-field approximation.

  User can specify different types of random features by setting
  `use_custom_random_features=True`, and change the initializer and activations
  of the custom random features. For example:

    MLP Kernel: initializer='random_normal', activation=tf.nn.relu
    RBF Kernel: initializer='random_normal', activation=tf.math.cos

  A linear kernel can also be specified by setting gp_kernel_type='linear' and
  `use_custom_random_features=True`.

  [1]: Ali Rahimi and Benjamin Recht. Random Features for Large-Scale Kernel
       Machines. In _Neural Information Processing Systems_, 2007.
       https://people.eecs.berkeley.edu/~brecht/papers/07.rah.rec.nips.pdf

  Attributes:
    units: (int) The dimensionality of layer.
    num_inducing: (int) The number of random features for the approximation.
    is_training: (tf.bool) Whether the layer is set in training mode. If so the
      layer updates the Gaussian process' variance estimate using statistics
      computed from the incoming minibatches.
  """

  def __init__(self,
               units,
               num_inducing=1024,
               gp_kernel_type='gaussian',
               gp_kernel_scale=1.,
               gp_output_bias=0.,
               normalize_input=False,
               gp_kernel_scale_trainable=False,
               gp_output_bias_trainable=False,
               gp_cov_momentum=0.999,
               gp_cov_ridge_penalty=1.,
               scale_random_features=True,
               use_custom_random_features=True,
               custom_random_features_initializer=None,
               custom_random_features_activation=None,
               l2_regularization=1e-6,
               gp_cov_likelihood='gaussian',
               return_gp_cov=True,
               return_random_features=False,
               dtype=None,
               name='random_feature_gaussian_process',
               **gp_output_kwargs):
    """Initializes a random-feature Gaussian process layer instance.

    Args:
      units: (int) Number of output units.
      num_inducing: (int) Number of random Fourier features used for
        approximating the Gaussian process.
      gp_kernel_type: (string) The type of kernel function to use for Gaussian
        process. Currently default to 'gaussian' which is the Gaussian RBF
        kernel.
      gp_kernel_scale: (float) The length-scale parameter of the a
        shift-invariant kernel function, i.e., for RBF kernel:
        exp(-|x1 - x2|**2 / gp_kernel_scale).
      gp_output_bias: (float) Scalar initial value for the bias vector.
      normalize_input: (bool) Whether to normalize the input to Gaussian
        process.
      gp_kernel_scale_trainable: (bool) Whether the length scale variable is
        trainable.
      gp_output_bias_trainable: (bool) Whether the bias is trainable.
      gp_cov_momentum: (float) A discount factor used to compute the moving
        average for posterior covariance matrix.
      gp_cov_ridge_penalty: (float) Initial Ridge penalty to posterior
        covariance matrix.
      scale_random_features: (bool) Whether to scale the random feature
        by sqrt(2. / num_inducing).
      use_custom_random_features: (bool) Whether to use custom random
        features implemented using tf.keras.layers.Dense.
      custom_random_features_initializer: (str or callable) Initializer for
        the random features. Default to random normal which approximates a RBF
        kernel function if activation function is cos.
      custom_random_features_activation: (callable) Activation function for the
        random feature layer. Default to cosine which approximates a RBF
        kernel function.
      l2_regularization: (float) The strength of l2 regularization on the output
        weights.
      gp_cov_likelihood: (string) Likelihood to use for computing Laplace
        approximation for covariance matrix. Default to `gaussian`.
      return_gp_cov: (bool) Whether to also return GP covariance matrix.
        If False then no covariance learning is performed.
      return_random_features: (bool) Whether to also return random features.
      dtype: (tf.DType) Input data type.
      name: (string) Layer name.
      **gp_output_kwargs: Additional keyword arguments to dense output layer.
    """
    super(RandomFeatureGaussianProcess, self).__init__(name=name, dtype=dtype)
    self.units = units
    self.num_inducing = num_inducing

    self.normalize_input = normalize_input
    # Input is rescaled by 1/sqrt(lengthscale) so a unit-scale random feature
    # layer realizes the requested kernel lengthscale.
    self.gp_input_scale = 1. / tf.sqrt(gp_kernel_scale)
    # sqrt(2 / num_inducing) scaling from the Monte-Carlo kernel estimate [1].
    self.gp_feature_scale = tf.sqrt(2. / float(num_inducing))

    self.scale_random_features = scale_random_features
    self.return_random_features = return_random_features
    self.return_gp_cov = return_gp_cov

    self.gp_kernel_type = gp_kernel_type
    self.gp_kernel_scale = gp_kernel_scale
    self.gp_output_bias = gp_output_bias
    self.gp_kernel_scale_trainable = gp_kernel_scale_trainable
    self.gp_output_bias_trainable = gp_output_bias_trainable

    self.use_custom_random_features = use_custom_random_features
    self.custom_random_features_initializer = custom_random_features_initializer
    self.custom_random_features_activation = custom_random_features_activation

    self.l2_regularization = l2_regularization
    self.gp_output_kwargs = gp_output_kwargs

    self.gp_cov_momentum = gp_cov_momentum
    self.gp_cov_ridge_penalty = gp_cov_ridge_penalty
    self.gp_cov_likelihood = gp_cov_likelihood

    if self.use_custom_random_features:
      # Default to Gaussian RBF kernel: random normal weights with uniform
      # bias in [0, 2*pi) and cosine activation.
      self.random_features_bias_initializer = tf.random_uniform_initializer(
          minval=0., maxval=2. * math.pi)
      if self.custom_random_features_initializer is None:
        self.custom_random_features_initializer = (
            tf.keras.initializers.RandomNormal(stddev=1.))
      if self.custom_random_features_activation is None:
        self.custom_random_features_activation = tf.math.cos

  def build(self, input_shape):
    """Creates sub-layers; each is built explicitly so `input_shape` can be
    threaded through the pipeline (normalization -> random feature -> output).
    """
    # Defines model layers.
    if self.normalize_input:
      self._input_norm_layer = tf.keras.layers.LayerNormalization(
          name='gp_input_normalization')
      self._input_norm_layer.build(input_shape)
      input_shape = self._input_norm_layer.compute_output_shape(input_shape)

    self._random_feature = self._make_random_feature_layer(
        name='gp_random_feature')
    self._random_feature.build(input_shape)
    input_shape = self._random_feature.compute_output_shape(input_shape)

    if self.return_gp_cov:
      # Only created when covariance learning is requested; methods that use
      # `self._gp_cov_layer` (e.g. `reset_covariance_matrix`) therefore
      # require `return_gp_cov=True`.
      self._gp_cov_layer = LaplaceRandomFeatureCovariance(
          momentum=self.gp_cov_momentum,
          ridge_penalty=self.gp_cov_ridge_penalty,
          likelihood=self.gp_cov_likelihood,
          dtype=self.dtype,
          name='gp_covariance')
      self._gp_cov_layer.build(input_shape)

    self._gp_output_layer = tf.keras.layers.Dense(
        units=self.units,
        use_bias=False,
        kernel_regularizer=tf.keras.regularizers.l2(self.l2_regularization),
        dtype=self.dtype,
        name='gp_output_weights',
        **self.gp_output_kwargs)
    self._gp_output_layer.build(input_shape)

    # Bias is kept as a separate (optionally trainable) variable because the
    # output Dense layer is created with use_bias=False.
    self._gp_output_bias = tf.Variable(
        initial_value=[self.gp_output_bias] * self.units,
        dtype=self.dtype,
        trainable=self.gp_output_bias_trainable,
        name='gp_output_bias')

    self.built = True

  def _make_random_feature_layer(self, name):
    """Defines random feature layer depending on kernel type."""
    if not self.use_custom_random_features:
      # Use default RandomFourierFeatures layer from tf.keras.
      return tf.keras.layers.experimental.RandomFourierFeatures(
          output_dim=self.num_inducing,
          kernel_initializer=self.gp_kernel_type,
          scale=self.gp_kernel_scale,
          trainable=self.gp_kernel_scale_trainable,
          dtype=self.dtype,
          name=name)

    if self.gp_kernel_type.lower() == 'linear':
      # Identity mapping: the "random feature" is the input itself.
      custom_random_feature_layer = tf.keras.layers.Lambda(
          lambda x: x, name=name)
    else:
      # Use user-supplied configurations. The layer is frozen: random feature
      # weights are sampled once and never trained.
      custom_random_feature_layer = tf.keras.layers.Dense(
          units=self.num_inducing,
          use_bias=True,
          activation=self.custom_random_features_activation,
          kernel_initializer=self.custom_random_features_initializer,
          bias_initializer=self.random_features_bias_initializer,
          trainable=False,
          name=name)

    return custom_random_feature_layer

  def reset_covariance_matrix(self):
    """Resets covariance matrix of the GP layer.

    This function is useful for reseting the model's covariance matrix at the
    begining of a new epoch.

    Only valid when the layer was constructed with `return_gp_cov=True`,
    since `self._gp_cov_layer` is created in `build()` under that flag.
    """
    self._gp_cov_layer.reset_precision_matrix()

  def call(self, inputs, global_step=None, training=None):
    """Computes GP logits and, optionally, covariance and random features.

    Args:
      inputs: (tf.Tensor) Input features, shape (batch_size, input_dim).
      global_step: Unused; accepted for call-signature compatibility.
      training: (bool) Whether the layer is in training mode; forwarded to
        the covariance layer to decide between update and inference.

    Returns:
      A list `[gp_output]`, extended with the GP covariance matrix if
      `return_gp_cov=True` and with the random features if
      `return_random_features=True`.
    """
    # Computes random features.
    gp_inputs = inputs
    if self.normalize_input:
      gp_inputs = self._input_norm_layer(gp_inputs)
    elif self.use_custom_random_features:
      # Supports lengthscale for custom random feature layer by directly
      # rescaling the input. Note: skipped when normalize_input=True.
      gp_input_scale = tf.cast(self.gp_input_scale, inputs.dtype)
      gp_inputs = gp_inputs * gp_input_scale

    gp_feature = self._random_feature(gp_inputs)

    if self.scale_random_features:
      # Scale random feature by sqrt(2. / num_inducing) following [1].
      # When using GP layer as the output layer of a neural network,
      # it is recommended to turn this scaling off to prevent it from changing
      # the learning rate to the hidden layers.
      gp_feature_scale = tf.cast(self.gp_feature_scale, inputs.dtype)
      gp_feature = gp_feature * gp_feature_scale

    # Computes posterior center (i.e., MAP estimate) and variance.
    gp_output = self._gp_output_layer(gp_feature) + self._gp_output_bias

    if self.return_gp_cov:
      gp_covmat = self._gp_cov_layer(gp_feature, gp_output, training)

    # Assembles model output.
    model_output = [gp_output,]
    if self.return_gp_cov:
      model_output.append(gp_covmat)
    if self.return_random_features:
      model_output.append(gp_feature)

    return model_output
class LaplaceRandomFeatureCovariance(tf.keras.layers.Layer):
  """Computes the Gaussian Process covariance using Laplace method.

  At training time, this layer updates the Gaussian process posterior using
  model features in minibatches.

  Attributes:
    momentum: (float) A discount factor used to compute the moving average for
      posterior precision matrix. Analogous to the momentum factor in batch
      normalization. If -1 then update covariance matrix using a naive sum
      without momentum, which is desirable if the goal is to compute the exact
      covariance matrix by passing through data once (say in the final epoch).
    ridge_penalty: (float) Initial Ridge penalty to weight covariance matrix.
      This value is used to stablize the eigenvalues of weight covariance
      estimate so that the matrix inverse can be computed for Cov = inv(t(X) * X
      + s * I). The ridge factor s cannot be too large since otherwise it will
      dominate the t(X) * X term and make covariance estimate not meaningful.
    likelihood: (str) The likelihood to use for computing Laplace approximation
      for the covariance matrix. Can be one of ('binary_logistic', 'poisson',
      'gaussian').
  """

  def __init__(self,
               momentum=0.999,
               ridge_penalty=1.,
               likelihood='gaussian',
               dtype=None,
               name='laplace_covariance'):
    # Validate eagerly so misconfiguration fails at construction time.
    if likelihood not in _SUPPORTED_LIKELIHOOD:
      raise ValueError(
          f'"likelihood" must be one of {_SUPPORTED_LIKELIHOOD}, got {likelihood}.'
      )
    self.ridge_penalty = ridge_penalty
    self.momentum = momentum
    self.likelihood = likelihood
    super(LaplaceRandomFeatureCovariance, self).__init__(dtype=dtype, name=name)

  def compute_output_shape(self, input_shape):
    # Output is the square precision/covariance matrix over feature dims.
    gp_feature_dim = input_shape[-1]
    return tf.TensorShape([gp_feature_dim, gp_feature_dim])

  def build(self, input_shape):
    """Creates the (non-trainable) precision-matrix state variable."""
    gp_feature_dim = input_shape[-1]

    # Convert gp_feature_dim to int value for TF1 compatibility.
    if isinstance(gp_feature_dim, tf.compat.v1.Dimension):
      gp_feature_dim = gp_feature_dim.value

    # Posterior precision matrix for the GP's random feature coefficients.
    # Initialized to ridge_penalty * I; `reset_precision_matrix` restores it.
    self.initial_precision_matrix = (
        self.ridge_penalty * tf.eye(gp_feature_dim, dtype=self.dtype))

    self.precision_matrix = (
        self.add_weight(
            name='gp_precision_matrix',
            shape=(gp_feature_dim, gp_feature_dim),
            dtype=self.dtype,
            initializer=tf.keras.initializers.Identity(self.ridge_penalty),
            trainable=False,
            # Under distribution strategies only the first replica's value is
            # used, avoiding double-counting of minibatch statistics.
            aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA))

    self.built = True

  def make_precision_matrix_update_op(self,
                                      gp_feature,
                                      logits,
                                      precision_matrix):
    """Defines update op for the precision matrix of feature weights."""
    if self.likelihood != 'gaussian':
      # Non-Gaussian likelihoods weight features by the GLM variance term,
      # which is computed from the (univariate) logits.
      if logits is None:
        raise ValueError(
            f'"logits" cannot be None when likelihood={self.likelihood}')

      if logits.shape[-1] != 1:
        raise ValueError(
            f'likelihood={self.likelihood} only support univariate logits.'
            f'Got logits dimension: {logits.shape[-1]}')

    batch_size = tf.shape(gp_feature)[0]
    batch_size = tf.cast(batch_size, dtype=gp_feature.dtype)

    # Computes batch-specific normalized precision matrix.
    if self.likelihood == 'binary_logistic':
      prob = tf.sigmoid(logits)
      prob_multiplier = prob * (1. - prob)
    elif self.likelihood == 'poisson':
      prob_multiplier = tf.exp(logits)
    else:
      # Gaussian likelihood: unit weighting.
      prob_multiplier = 1.

    gp_feature_adjusted = tf.sqrt(prob_multiplier) * gp_feature
    # t(X) * X accumulated for this minibatch.
    precision_matrix_minibatch = tf.matmul(
        gp_feature_adjusted, gp_feature_adjusted, transpose_a=True)

    # Updates the population-wise precision matrix.
    if self.momentum > 0:
      # Use moving-average updates to accumulate batch-specific precision
      # matrices.
      precision_matrix_minibatch = precision_matrix_minibatch / batch_size
      precision_matrix_new = (
          self.momentum * precision_matrix +
          (1. - self.momentum) * precision_matrix_minibatch)
    else:
      # Compute exact population-wise covariance without momentum.
      # If use this option, make sure to pass through data only once.
      precision_matrix_new = precision_matrix + precision_matrix_minibatch

    # Returns the update op.
    return precision_matrix.assign(precision_matrix_new)

  def reset_precision_matrix(self):
    """Resets precision matrix to its initial value.

    This function is useful for reseting the model's covariance matrix at the
    begining of a new epoch.
    """
    precision_matrix_reset_op = self.precision_matrix.assign(
        self.initial_precision_matrix)
    self.add_update(precision_matrix_reset_op)

  def compute_predictive_covariance(self, gp_feature):
    """Computes posterior predictive variance.

    Approximates the Gaussian process posterior using random features.
    Given training random feature Phi_tr (num_train, num_hidden) and testing
    random feature Phi_ts (batch_size, num_hidden). The predictive covariance
    matrix is computed as (assuming Gaussian likelihood):

    s * Phi_ts @ inv(t(Phi_tr) * Phi_tr + s * I) @ t(Phi_ts),

    where s is the ridge factor to be used for stablizing the inverse, and I is
    the identity matrix with shape (num_hidden, num_hidden).

    Args:
      gp_feature: (tf.Tensor) The random feature of testing data to be used for
        computing the covariance matrix. Shape (batch_size, gp_hidden_size).

    Returns:
      (tf.Tensor) Predictive covariance matrix, shape (batch_size, batch_size).
    """
    # Computes the covariance matrix of the feature coefficient.
    feature_cov_matrix = tf.linalg.inv(self.precision_matrix)

    # Computes the covariance matrix of the gp prediction.
    cov_feature_product = tf.matmul(
        feature_cov_matrix, gp_feature, transpose_b=True) * self.ridge_penalty
    gp_cov_matrix = tf.matmul(gp_feature, cov_feature_product)
    return gp_cov_matrix

  def _get_training_value(self, training=None):
    # Fall back to the Keras learning phase when `training` is not given.
    if training is None:
      training = tf.keras.backend.learning_phase()

    if isinstance(training, int):
      training = bool(training)

    return training

  def call(self, inputs, logits=None, training=None):
    """Minibatch updates the GP's posterior precision matrix estimate.

    Args:
      inputs: (tf.Tensor) GP random features, shape (batch_size,
        gp_hidden_size).
      logits: (tf.Tensor) Pre-activation output from the model. Needed
        for Laplace approximation under a non-Gaussian likelihood.
      training: (tf.bool) whether or not the layer is in training mode. If in
        training mode, the gp_weight covariance is updated using gp_feature.

    Returns:
      gp_stddev (tf.Tensor): GP posterior predictive variance,
        shape (batch_size, batch_size).
    """
    batch_size = tf.shape(inputs)[0]
    training = self._get_training_value(training)

    if training:
      # Define and register the update op for feature precision matrix.
      precision_matrix_update_op = self.make_precision_matrix_update_op(
          gp_feature=inputs,
          logits=logits,
          precision_matrix=self.precision_matrix)
      self.add_update(precision_matrix_update_op)
      # Return null estimate during training.
      return tf.eye(batch_size, dtype=self.dtype)
    else:
      # Return covariance estimate during inference.
      return self.compute_predictive_covariance(gp_feature=inputs)
def mean_field_logits(logits, covariance_matrix=None, mean_field_factor=1.):
  """Adjust the model logits so its softmax approximates the posterior mean [1].

  [1]: Zhiyun Lu, Eugene Ie, Fei Sha. Uncertainty Estimation with Infinitesimal
       Jackknife. _arXiv preprint arXiv:2006.07584_, 2020.
       https://arxiv.org/abs/2006.07584

  Arguments:
    logits: A float tensor of shape (batch_size, num_classes).
    covariance_matrix: The covariance matrix of shape (batch_size, batch_size).
      If None then it assumes the covariance_matrix is an identity matrix.
    mean_field_factor: The scale factor for mean-field approximation, used to
      adjust the influence of posterior variance in posterior mean
      approximation. If covariance_matrix=None then it is used as the
      temperature parameter for temperature scaling.

  Returns:
    Tensor of adjusted logits, shape (batch_size, num_classes).
  """
  # A None or negative factor disables the mean-field adjustment entirely.
  if mean_field_factor is None or mean_field_factor < 0:
    return logits

  # Per-example predictive variances; identity covariance when none given.
  per_example_variance = (
      1. if covariance_matrix is None
      else tf.linalg.diag_part(covariance_matrix))

  # Mean-field scaling coefficient: sqrt(1 + lambda * variance).
  scaling_coeff = tf.sqrt(1. + per_example_variance * mean_field_factor)
  if len(logits.shape) > 1:
    # Broadcast the per-example coefficient across the class dimension.
    scaling_coeff = tf.expand_dims(scaling_coeff, axis=-1)

  return logits / scaling_coeff
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for Gaussian process functions."""
import os
import shutil
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from official.nlp.modeling.layers import gaussian_process
def exact_gaussian_kernel(x1, x2):
  """Computes the exact Gaussian (RBF) kernel matrix between x1 and x2."""
  # Squared norms, reduced over all non-batch dimensions.
  sq_norm1 = tf.reduce_sum(tf.square(x1), list(range(1, len(x1.shape))))
  sq_norm2 = tf.reduce_sum(tf.square(x2), list(range(1, len(x2.shape))))

  # |a - b|^2 = |a|^2 + |b|^2 - 2 a.b, computed pairwise via broadcasting.
  pairwise_sq_dist = (
      sq_norm1[:, tf.newaxis] + sq_norm2[tf.newaxis, :]
      - 2 * tf.matmul(x1, x2, transpose_b=True))

  return tf.math.exp(-pairwise_sq_dist / 2.)
def _generate_normal_data(num_sample, num_dim, loc):
"""Generates random data sampled from i.i.d. normal distribution."""
return np.random.normal(
size=(num_sample, num_dim), loc=loc, scale=1. / np.sqrt(num_dim))
def _generate_rbf_data(x_data, orthogonal=True):
  """Generates high-dim data that is the eigen components of a RBF kernel."""
  kernel_matrix = exact_gaussian_kernel(x_data, x_data)
  eigvecs, eigvals, _ = np.linalg.svd(kernel_matrix)
  if orthogonal:
    # Orthonormal eigenvector matrix of the kernel.
    return eigvecs
  # Rows scaled by sqrt of eigenvalues: X such that t(X) * X ~= K.
  return np.diag(np.sqrt(eigvals)).dot(eigvecs.T)
def _make_minibatch_iterator(data_numpy, batch_size, num_epoch):
  """Makes a tf.data.Dataset iterator for given batch size and num epoches."""
  # Repeat first, then batch, so epochs are concatenated before batching.
  return iter(
      tf.data.Dataset.from_tensor_slices(data_numpy)
      .repeat(num_epoch)
      .batch(batch_size))
def _compute_posterior_kernel(x_tr, x_ts, kernel_func, ridge_penalty):
  """Computes the posterior covariance matrix of a Gaussian process."""
  num_train = x_tr.shape[0]

  # Ridge-regularized inverse of the train-train kernel.
  k_tr_tr_regularized = (
      kernel_func(x_tr, x_tr) + ridge_penalty * np.eye(num_train))
  k_tr_tr_inv = tf.linalg.inv(k_tr_tr_regularized)

  k_tr_ts = kernel_func(x_tr, x_ts)
  k_ts_ts = kernel_func(x_ts, x_ts)

  # Posterior: K_ss - t(K_ts) @ inv(K_tt + s*I) @ K_ts.
  posterior_reduction = tf.matmul(
      k_tr_ts, tf.matmul(k_tr_tr_inv, k_tr_ts), transpose_a=True)
  return k_ts_ts - posterior_reduction
class GaussianProcessTest(tf.test.TestCase, parameterized.TestCase):
  """End-to-end tests for RandomFeatureGaussianProcess and its covariance."""

  def setUp(self):
    super(GaussianProcessTest, self).setUp()
    self.num_data_dim = 10
    self.num_inducing = 1024
    self.num_train_sample = 1024
    self.num_test_sample = 256
    # Tolerances for precision-matrix and covariance-matrix comparisons;
    # loose because the random-feature approximation is stochastic.
    self.prec_tolerance = {'atol': 1e-3, 'rtol': 5e-2}
    self.cov_tolerance = {'atol': 5e-2, 'rtol': 2.}

    self.rbf_kern_func = exact_gaussian_kernel

    # Train/test data drawn from normals with different means (loc 0 vs 1).
    self.x_tr = _generate_normal_data(
        self.num_train_sample, self.num_data_dim, loc=0.)
    self.x_ts = _generate_normal_data(
        self.num_test_sample, self.num_data_dim, loc=1.)

  def test_layer_build(self):
    """Tests if layer.built=True after building."""
    rfgp_model = gaussian_process.RandomFeatureGaussianProcess(units=1)
    rfgp_model.build(input_shape=self.x_tr.shape)

    self.assertTrue(rfgp_model.built)

  @parameterized.named_parameters(('rbf_data', False),
                                  ('orthogonal_data', True))
  def test_laplace_covariance_minibatch(self, generate_orthogonal_data):
    """Tests if model correctly learns population-level precision matrix."""
    batch_size = 50
    epochs = 1000
    x_data = _generate_rbf_data(self.x_ts, generate_orthogonal_data)
    data_iterator = _make_minibatch_iterator(x_data, batch_size, epochs)

    # Estimates precision matrix using minibatch.
    cov_estimator = gaussian_process.LaplaceRandomFeatureCovariance(
        momentum=0.999, ridge_penalty=0)

    for minibatch_data in data_iterator:
      _ = cov_estimator(minibatch_data, training=True)

    # Evaluation: the moving average (normalized by batch size) should
    # converge to t(X) * X / num_sample, so scale back up before comparing.
    prec_mat_expected = x_data.T.dot(x_data)
    prec_mat_computed = (
        cov_estimator.precision_matrix.numpy() * self.num_test_sample)

    np.testing.assert_allclose(prec_mat_computed, prec_mat_expected,
                               **self.prec_tolerance)

  def test_random_feature_prior_approximation(self):
    """Tests random feature GP's ability in approximating exact GP prior."""
    # More inducing points than default for a tighter kernel approximation.
    num_inducing = 10240
    rfgp_model = gaussian_process.RandomFeatureGaussianProcess(
        units=1,
        num_inducing=num_inducing,
        normalize_input=False,
        gp_kernel_type='gaussian',
        return_random_features=True)

    # Extract random features.
    _, _, gp_feature = rfgp_model(self.x_tr, training=True)
    gp_feature_np = gp_feature.numpy()

    # Phi @ t(Phi) should approximate the exact RBF kernel matrix.
    prior_kernel_computed = gp_feature_np.dot(gp_feature_np.T)
    prior_kernel_expected = self.rbf_kern_func(self.x_tr, self.x_tr)
    np.testing.assert_allclose(prior_kernel_computed, prior_kernel_expected,
                               **self.cov_tolerance)

  def test_random_feature_posterior_approximation(self):
    """Tests random feature GP's ability in approximating exact GP posterior."""
    # Set momentum = 0.5 so posterior precision matrix is 0.5 * (I + K).
    gp_cov_momentum = 0.5
    gp_cov_ridge_penalty = 1.
    num_inducing = 1024

    rfgp_model = gaussian_process.RandomFeatureGaussianProcess(
        units=1,
        num_inducing=num_inducing,
        normalize_input=False,
        gp_kernel_type='gaussian',
        gp_cov_momentum=gp_cov_momentum,
        gp_cov_ridge_penalty=gp_cov_ridge_penalty)

    # Computes posterior covariance on test data: one training pass to
    # accumulate the precision matrix, then an inference pass.
    _, _ = rfgp_model(self.x_tr, training=True)
    _, gp_cov_ts = rfgp_model(self.x_ts, training=False)

    # Scale up covariance estimate since prec matrix is down-scaled by momentum.
    post_kernel_computed = gp_cov_ts * gp_cov_momentum
    post_kernel_expected = _compute_posterior_kernel(self.x_tr, self.x_ts,
                                                     self.rbf_kern_func,
                                                     gp_cov_ridge_penalty)
    np.testing.assert_allclose(post_kernel_computed, post_kernel_expected,
                               **self.cov_tolerance)

  def test_random_feature_linear_kernel(self):
    """Tests if linear kernel indeed leads to an identity mapping."""
    # Specify linear kernel
    gp_kernel_type = 'linear'
    normalize_input = False
    scale_random_features = False
    use_custom_random_features = True

    rfgp_model = gaussian_process.RandomFeatureGaussianProcess(
        units=1,
        normalize_input=normalize_input,
        gp_kernel_type=gp_kernel_type,
        scale_random_features=scale_random_features,
        use_custom_random_features=use_custom_random_features,
        return_random_features=True)

    _, _, gp_feature = rfgp_model(self.x_tr, training=True)

    # Check if linear kernel leads to identity mapping.
    np.testing.assert_allclose(gp_feature, self.x_tr, **self.prec_tolerance)

  def test_no_matrix_update_during_test(self):
    """Tests if the precision matrix is not updated during testing."""
    rfgp_model = gaussian_process.RandomFeatureGaussianProcess(units=1)

    # Training.
    _, gp_covmat_null = rfgp_model(self.x_tr, training=True)
    precision_mat_before_test = rfgp_model._gp_cov_layer.precision_matrix

    # Testing.
    _ = rfgp_model(self.x_ts, training=False)
    precision_mat_after_test = rfgp_model._gp_cov_layer.precision_matrix

    # During training the covariance output is a null (identity) estimate.
    self.assertAllClose(
        gp_covmat_null, tf.eye(self.num_train_sample), atol=1e-4)
    self.assertAllClose(
        precision_mat_before_test, precision_mat_after_test, atol=1e-4)

  def test_state_saving_and_loading(self):
    """Tests if the loaded model returns same results."""
    input_data = np.random.random((1, 2))
    rfgp_model = gaussian_process.RandomFeatureGaussianProcess(units=1)

    inputs = tf.keras.Input((2,), batch_size=1)
    outputs = rfgp_model(inputs)
    model = tf.keras.Model(inputs, outputs)
    gp_output, gp_covmat = model.predict(input_data)

    # Save and then load the model.
    temp_dir = self.get_temp_dir()
    self.addCleanup(shutil.rmtree, temp_dir)
    saved_model_dir = os.path.join(temp_dir, 'rfgp_model')
    model.save(saved_model_dir)
    new_model = tf.keras.models.load_model(saved_model_dir)

    gp_output_new, gp_covmat_new = new_model.predict(input_data)
    self.assertAllClose(gp_output, gp_output_new, atol=1e-4)
    self.assertAllClose(gp_covmat, gp_covmat_new, atol=1e-4)
class MeanFieldLogitsTest(tf.test.TestCase):
  """Unit tests for gaussian_process.mean_field_logits."""

  def testMeanFieldLogitsLikelihood(self):
    """Checks the mean-field scaling sqrt(1 + variance * factor)."""
    num_examples = 10
    num_classes = 12
    gp_variance = 1.5
    lambda_factor = 2.

    rng = np.random.RandomState(0)
    tf.random.set_seed(1)
    raw_logits = rng.randn(num_examples, num_classes)
    diagonal_covmat = tf.linalg.diag([gp_variance] * num_examples)

    # Scaling coefficient is sqrt(1 + 1.5 * 2) = 2, so logits are halved.
    scaled_logits = gaussian_process.mean_field_logits(
        raw_logits, diagonal_covmat, mean_field_factor=lambda_factor)
    self.assertAllClose(scaled_logits, raw_logits / 2., atol=1e-4)

  def testMeanFieldLogitsTemperatureScaling(self):
    """Checks mean_field_logits acts as temperature scaling w/o covariance."""
    num_examples = 10
    num_classes = 12

    rng = np.random.RandomState(0)
    tf.random.set_seed(1)
    raw_logits = rng.randn(num_examples, num_classes)

    # A negative mean_field_factor should leave the logits untouched.
    logits_unchanged = gaussian_process.mean_field_logits(
        raw_logits, covariance_matrix=None, mean_field_factor=-1)

    # With covariance_matrix=None the temperature is
    # sqrt(1 + mean_field_factor) = sqrt(4) = 2.
    logits_halved = gaussian_process.mean_field_logits(
        raw_logits, covariance_matrix=None, mean_field_factor=3.)

    self.assertAllClose(logits_unchanged, raw_logits, atol=1e-4)
    self.assertAllClose(logits_halved, raw_logits / 2., atol=1e-4)
# Standard TensorFlow test-runner entry point for this test module.
if __name__ == '__main__':
  tf.test.main()
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -11,114 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Masked language model network."""
# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import tensorflow as tf
from official.modeling import tf_utils
@tf.keras.utils.register_keras_serializable(package='Text')
class MaskedLM(tf.keras.layers.Layer):
  """Masked language model network head for BERT modeling.

  This network implements a masked language model based on the provided
  network. It assumes that the network being passed has a
  "get_embedding_table()" method.

  Arguments:
    embedding_table: The embedding table of the targets.
    activation: The activation, if any, for the dense layer.
    initializer: The initializer for the dense layer. Defaults to a Glorot
      uniform initializer.
    output: The output style for this network. Can be either 'logits' or
      'predictions'.
  """

  def __init__(self,
               embedding_table,
               activation=None,
               initializer='glorot_uniform',
               output='logits',
               name='cls/predictions',
               **kwargs):
    super(MaskedLM, self).__init__(name=name, **kwargs)
    self.embedding_table = embedding_table
    self.activation = activation
    self.initializer = tf.keras.initializers.get(initializer)

    if output not in ('predictions', 'logits'):
      raise ValueError(
          ('Unknown `output` value "%s". `output` can be either "logits" or '
           '"predictions"') % output)
    self._output_type = output

  def build(self, input_shape):
    # The dense "transform" projects back to the embedding width so the
    # output can be matmul'ed against the transposed embedding table.
    self._vocab_size, hidden_size = self.embedding_table.shape
    self.dense = tf.keras.layers.Dense(
        hidden_size,
        activation=self.activation,
        kernel_initializer=self.initializer,
        name='transform/dense')
    self.layer_norm = tf.keras.layers.LayerNormalization(
        axis=-1, epsilon=1e-12, name='transform/LayerNorm')
    self.bias = self.add_weight(
        'output_bias/bias',
        shape=(self._vocab_size,),
        initializer='zeros',
        trainable=True)

    super(MaskedLM, self).build(input_shape)

  def call(self, sequence_data, masked_positions):
    masked_lm_input = self._gather_indexes(sequence_data, masked_positions)
    lm_data = self.dense(masked_lm_input)
    lm_data = self.layer_norm(lm_data)
    # Tie the output projection to the embedding table (transposed).
    lm_data = tf.matmul(lm_data, self.embedding_table, transpose_b=True)
    logits = tf.nn.bias_add(lm_data, self.bias)
    masked_positions_shape = tf_utils.get_shape_list(
        masked_positions, name='masked_positions_tensor')
    logits = tf.reshape(logits,
                        [-1, masked_positions_shape[1], self._vocab_size])
    if self._output_type == 'logits':
      return logits
    return tf.nn.log_softmax(logits)

  def get_config(self):
    raise NotImplementedError('MaskedLM cannot be directly serialized because '
                              'it has variable sharing logic.')

  def _gather_indexes(self, sequence_tensor, positions):
    """Gathers the vectors at the specific positions.

    Args:
      sequence_tensor: Sequence output of `BertModel` layer of shape
        (`batch_size`, `seq_length`, num_hidden) where num_hidden is number of
        hidden units of `BertModel` layer.
      positions: Positions ids of tokens in sequence to mask for pretraining
        of with dimension (batch_size, num_predictions) where
        `num_predictions` is maximum number of tokens to mask out and predict
        per each sequence.

    Returns:
      Masked out sequence tensor of shape (batch_size * num_predictions,
      num_hidden).
    """
    sequence_shape = tf_utils.get_shape_list(
        sequence_tensor, name='sequence_output_tensor')
    batch_size, seq_length, width = sequence_shape

    # Flatten (batch, seq) into one axis and offset each row's positions by
    # its row start so a single tf.gather picks out the masked vectors.
    flat_offsets = tf.reshape(
        tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1])
    flat_positions = tf.reshape(positions + flat_offsets, [-1])
    flat_sequence_tensor = tf.reshape(sequence_tensor,
                                      [batch_size * seq_length, width])
    output_tensor = tf.gather(flat_sequence_tensor, flat_positions)

    return output_tensor
MaskedLM = keras_nlp.layers.MaskedLM
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -11,12 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for masked language model network."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
"""Tests for masked language model network."""
import numpy as np
import tensorflow as tf
......@@ -24,7 +20,7 @@ import tensorflow as tf
from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import
from official.nlp.modeling.layers import masked_lm
from official.nlp.modeling.networks import transformer_encoder
from official.nlp.modeling.networks import bert_encoder
# This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
......@@ -34,25 +30,22 @@ class MaskedLMTest(keras_parameterized.TestCase):
def create_layer(self,
vocab_size,
sequence_length,
hidden_size,
output='predictions',
xformer_stack=None):
# First, create a transformer stack that we can use to get the LM's
# vocabulary weight.
if xformer_stack is None:
xformer_stack = transformer_encoder.TransformerEncoder(
xformer_stack = bert_encoder.BertEncoder(
vocab_size=vocab_size,
num_layers=1,
sequence_length=sequence_length,
hidden_size=hidden_size,
num_attention_heads=4,
)
# Create a maskedLM from the transformer stack.
test_layer = masked_lm.MaskedLM(
embedding_table=xformer_stack.get_embedding_table(),
output=output)
embedding_table=xformer_stack.get_embedding_table(), output=output)
return test_layer
def test_layer_creation(self):
......@@ -61,9 +54,7 @@ class MaskedLMTest(keras_parameterized.TestCase):
hidden_size = 64
num_predictions = 21
test_layer = self.create_layer(
vocab_size=vocab_size,
sequence_length=sequence_length,
hidden_size=hidden_size)
vocab_size=vocab_size, hidden_size=hidden_size)
# Make sure that the output tensor of the masked LM is the right shape.
lm_input_tensor = tf.keras.Input(shape=(sequence_length, hidden_size))
......@@ -78,22 +69,19 @@ class MaskedLMTest(keras_parameterized.TestCase):
sequence_length = 32
hidden_size = 64
num_predictions = 21
xformer_stack = transformer_encoder.TransformerEncoder(
xformer_stack = bert_encoder.BertEncoder(
vocab_size=vocab_size,
num_layers=1,
sequence_length=sequence_length,
hidden_size=hidden_size,
num_attention_heads=4,
)
test_layer = self.create_layer(
vocab_size=vocab_size,
sequence_length=sequence_length,
hidden_size=hidden_size,
xformer_stack=xformer_stack,
output='predictions')
logit_layer = self.create_layer(
vocab_size=vocab_size,
sequence_length=sequence_length,
hidden_size=hidden_size,
xformer_stack=xformer_stack,
output='logits')
......@@ -133,9 +121,7 @@ class MaskedLMTest(keras_parameterized.TestCase):
hidden_size = 64
num_predictions = 21
test_layer = self.create_layer(
vocab_size=vocab_size,
sequence_length=sequence_length,
hidden_size=hidden_size)
vocab_size=vocab_size, hidden_size=hidden_size)
# Create a model from the masked LM layer.
lm_input_tensor = tf.keras.Input(shape=(sequence_length, hidden_size))
......@@ -154,8 +140,7 @@ class MaskedLMTest(keras_parameterized.TestCase):
def test_unknown_output_type_fails(self):
with self.assertRaisesRegex(ValueError, 'Unknown `output` value "bad".*'):
_ = self.create_layer(
vocab_size=8, sequence_length=8, hidden_size=8, output='bad')
_ = self.create_layer(vocab_size=8, hidden_size=8, output='bad')
if __name__ == '__main__':
......
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -11,22 +11,35 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Keras-based softmax layer with optional masking."""
# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import tensorflow as tf
def _large_compatible_negative(tensor_type):
  """Returns a large negative constant that is representable in `tensor_type`.

  The standard additive-mask value used in this module (-1e9) cannot be
  represented with `tf.float16`, so for that dtype the dtype's own minimum
  value is returned instead.

  Args:
    tensor_type: A dtype to determine the type.

  Returns:
    A large negative number.
  """
  return tf.float16.min if tensor_type == tf.float16 else -1e9
@tf.keras.utils.register_keras_serializable(package='Text')
class MaskedSoftmax(tf.keras.layers.Layer):
"""Performs a softmax with optional masking on a tensor.
Arguments:
Args:
mask_expansion_axes: Any axes that should be padded on the mask tensor.
normalization_axes: On which axes the softmax should perform.
"""
......@@ -50,9 +63,9 @@ class MaskedSoftmax(tf.keras.layers.Layer):
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for
# positions we want to attend and -10000.0 for masked positions.
adder = (1.0 - tf.cast(mask, scores.dtype)) * -10000.0
# positions we want to attend and -1.e9 for masked positions.
adder = (1.0 - tf.cast(mask, scores.dtype)) * _large_compatible_negative(
scores.dtype)
# Since we are adding it to the raw scores before the softmax, this is
# effectively the same as removing these entirely.
scores += adder
......
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -11,12 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for Keras-based masked softmax layer."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
"""Tests for Keras-based masked softmax layer."""
import numpy as np
import tensorflow as tf
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Dot product with margin layer."""
# pylint: disable=g-classes-have-attributes
from typing import Tuple
# Import libraries
import tensorflow as tf
from official.modeling import tf_utils
@tf.keras.utils.register_keras_serializable(package='Text')
class MatMulWithMargin(tf.keras.layers.Layer):
  """Computes a dot-product matrix between two batches of encodings.

  Args:
    logit_scale: The scaling factor of dot products when doing training.
    logit_margin: The margin value between the positive and negative examples
      when doing training.
  """

  def __init__(self, logit_scale=1.0, logit_margin=0.0, **kwargs):
    super(MatMulWithMargin, self).__init__(**kwargs)
    self.logit_scale = logit_scale
    self.logit_margin = logit_margin

  def call(self, left_encoded: tf.Tensor,
           right_encoded: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
    batch_size = tf_utils.get_shape_list(
        left_encoded, name='sequence_output_tensor')[0]

    # Left -> Right dot products; subtract the margin from the diagonal
    # (the aligned pairs) before scaling.
    dot_products = tf.matmul(left_encoded, right_encoded, transpose_b=True)
    diagonal_margin = self.logit_margin * tf.eye(batch_size)
    self.left_logits = self.logit_scale * (dot_products - diagonal_margin)

    # Right -> Left logits are simply the transpose of the above.
    self.right_logits = tf.transpose(self.left_logits)

    return (self.left_logits, self.right_logits)

  def get_config(self):
    config = {
        'logit_scale': self.logit_scale,
        'logit_margin': self.logit_margin}
    config.update(super(MatMulWithMargin, self).get_config())
    return config

  @classmethod
  def from_config(cls, config, custom_objects=None):
    return cls(**config)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for mat_mul_with_margin layer."""
import tensorflow as tf
from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import
from official.nlp.modeling.layers import mat_mul_with_margin
class MatMulWithMarginTest(keras_parameterized.TestCase):

  def test_layer_invocation(self):
    """Validate that the Keras object can be created and invoked."""
    input_width = 512
    test_layer = mat_mul_with_margin.MatMulWithMargin()

    # Build symbolic inputs (batch dimension implicit) and run the layer.
    left_encoded = tf.keras.Input(shape=(input_width,), dtype=tf.float32)
    right_encoded = tf.keras.Input(shape=(input_width,), dtype=tf.float32)
    left_logits, right_logits = test_layer(left_encoded, right_encoded)

    # Both logits matrices are square over the (unknown) batch dimension.
    expected_output_shape = [None, None]
    for logits in (left_logits, right_logits):
      self.assertEqual(expected_output_shape, logits.shape.as_list())

  def test_serialize_deserialize(self):
    # Round-trip the layer config through from_config and verify nothing
    # is lost in serialization.
    layer = mat_mul_with_margin.MatMulWithMargin()
    new_layer = mat_mul_with_margin.MatMulWithMargin.from_config(
        layer.get_config())
    self.assertAllEqual(layer.get_config(), new_layer.get_config())


if __name__ == '__main__':
  tf.test.main()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""MobileBERT embedding and transformer layers."""
import tensorflow as tf
from official.nlp import keras_nlp
@tf.keras.utils.register_keras_serializable(package='Text')
class NoNorm(tf.keras.layers.Layer):
  """Apply element-wise linear transformation to the last dimension.

  A cheap replacement for layer normalization that only learns a per-channel
  scale (`gamma`) and bias (`beta`).
  """

  def __init__(self, name=None, **kwargs):
    # Forward **kwargs (e.g. `dtype`, `trainable`) to the base Layer so this
    # behaves like a standard Keras layer; previously they were dropped.
    super(NoNorm, self).__init__(name=name, **kwargs)

  def build(self, shape):
    # One scale and one bias value per channel of the last dimension.
    kernel_size = shape[-1]
    self.bias = self.add_weight('beta',
                                shape=[kernel_size],
                                initializer='zeros')
    self.scale = self.add_weight('gamma',
                                 shape=[kernel_size],
                                 initializer='ones')

  def call(self, feature):
    output = feature * self.scale + self.bias
    return output
def _get_norm_layer(normalization_type='no_norm', name=None):
"""Get normlization layer.
Args:
normalization_type: String. The type of normalization_type, only
`no_norm` and `layer_norm` are supported.
name: Name for the norm layer.
Returns:
layer norm class.
"""
if normalization_type == 'no_norm':
layer = NoNorm(name=name)
elif normalization_type == 'layer_norm':
layer = tf.keras.layers.LayerNormalization(
name=name,
axis=-1,
epsilon=1e-12,
dtype=tf.float32)
else:
raise NotImplementedError('Only "no_norm" and "layer_norm" and supported.')
return layer
@tf.keras.utils.register_keras_serializable(package='Text')
class MobileBertEmbedding(tf.keras.layers.Layer):
  """Performs an embedding lookup for MobileBERT.

  This layer includes word embedding, token type embedding, position embedding.
  The word embeddings are widened to `3 * word_embed_size` via a trigram
  concatenation (see `call`) and then linearly projected to
  `output_embed_size`.
  """

  def __init__(self,
               word_vocab_size,
               word_embed_size,
               type_vocab_size,
               output_embed_size,
               max_sequence_length=512,
               normalization_type='no_norm',
               initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
               dropout_rate=0.1,
               **kwargs):
    """Class initialization.

    Args:
      word_vocab_size: Number of words in the vocabulary.
      word_embed_size: Word embedding size.
      type_vocab_size: Number of word types.
      output_embed_size: Embedding size for the final embedding output.
      max_sequence_length: Maximum length of input sequence.
      normalization_type: String. The type of normalization_type, only
        `no_norm` and `layer_norm` are supported.
      initializer: The initializer to use for the embedding weights and
        linear projection weights.
      dropout_rate: Dropout rate.
      **kwargs: keyword arguments.
    """
    super(MobileBertEmbedding, self).__init__(**kwargs)
    self.word_vocab_size = word_vocab_size
    self.word_embed_size = word_embed_size
    self.type_vocab_size = type_vocab_size
    self.output_embed_size = output_embed_size
    self.max_sequence_length = max_sequence_length
    self.normalization_type = normalization_type
    self.initializer = tf.keras.initializers.get(initializer)
    self.dropout_rate = dropout_rate

    # NOTE: sub-layer names below ('word_embedding', 'type_embedding', ...)
    # determine checkpoint variable scopes; do not rename them.
    self.word_embedding = keras_nlp.layers.OnDeviceEmbedding(
        self.word_vocab_size,
        self.word_embed_size,
        initializer=initializer,
        name='word_embedding')
    # Token-type embedding is already at the output width, since it is added
    # after the word-embedding projection.
    self.type_embedding = keras_nlp.layers.OnDeviceEmbedding(
        self.type_vocab_size,
        self.output_embed_size,
        initializer=initializer,
        name='type_embedding')
    self.pos_embedding = keras_nlp.layers.PositionEmbedding(
        max_length=max_sequence_length,
        initializer=initializer,
        name='position_embedding')
    # Projects the trigram-concatenated word embedding
    # (width 3 * word_embed_size) down/up to output_embed_size.
    self.word_embedding_proj = tf.keras.layers.experimental.EinsumDense(
        'abc,cd->abd',
        output_shape=[None, self.output_embed_size],
        kernel_initializer=initializer,
        bias_axes='d',
        name='embedding_projection')
    self.layer_norm = _get_norm_layer(normalization_type, 'embedding_norm')
    self.dropout_layer = tf.keras.layers.Dropout(
        self.dropout_rate,
        name='embedding_dropout')

  def get_config(self):
    """Returns the config of the layer for serialization."""
    config = {
        'word_vocab_size': self.word_vocab_size,
        'word_embed_size': self.word_embed_size,
        'type_vocab_size': self.type_vocab_size,
        'output_embed_size': self.output_embed_size,
        'max_sequence_length': self.max_sequence_length,
        'normalization_type': self.normalization_type,
        'initializer': tf.keras.initializers.serialize(self.initializer),
        'dropout_rate': self.dropout_rate
    }
    base_config = super(MobileBertEmbedding, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def call(self, input_ids, token_type_ids=None):
    """Computes the embedding output.

    Args:
      input_ids: Int tensor of token ids. Assumed shape
        (batch, seq_length) -- the padding below operates on axis 1.
      token_type_ids: Optional int tensor of token-type ids with the same
        shape as `input_ids`.

    Returns:
      Float tensor of shape (batch, seq_length, output_embed_size).
    """
    word_embedding_out = self.word_embedding(input_ids)
    # Trigram widening: concatenate each position's next-token, own, and
    # previous-token embeddings along the channel axis (shifted views are
    # zero-padded at the sequence boundaries), tripling the channel width.
    word_embedding_out = tf.concat(
        [tf.pad(word_embedding_out[:, 1:], ((0, 0), (0, 1), (0, 0))),
         word_embedding_out,
         tf.pad(word_embedding_out[:, :-1], ((0, 0), (1, 0), (0, 0)))],
        axis=2)
    word_embedding_out = self.word_embedding_proj(word_embedding_out)

    pos_embedding_out = self.pos_embedding(word_embedding_out)
    embedding_out = word_embedding_out + pos_embedding_out
    if token_type_ids is not None:
      type_embedding_out = self.type_embedding(token_type_ids)
      embedding_out += type_embedding_out
    embedding_out = self.layer_norm(embedding_out)
    embedding_out = self.dropout_layer(embedding_out)

    return embedding_out
@tf.keras.utils.register_keras_serializable(package='Text')
class MobileBertTransformer(tf.keras.layers.Layer):
  """Transformer block for MobileBERT.

  An implementation of one layer (block) of Transformer with bottleneck and
  inverted-bottleneck for MobilerBERT.

  Original paper for MobileBERT:
  https://arxiv.org/pdf/2004.02984.pdf
  """

  def __init__(self,
               hidden_size=512,
               num_attention_heads=4,
               intermediate_size=512,
               intermediate_act_fn='relu',
               hidden_dropout_prob=0.1,
               attention_probs_dropout_prob=0.1,
               intra_bottleneck_size=128,
               use_bottleneck_attention=False,
               key_query_shared_bottleneck=True,
               num_feedforward_networks=4,
               normalization_type='no_norm',
               initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
               **kwargs):
    """Class initialization.

    Args:
      hidden_size: Hidden size for the Transformer input and output tensor.
      num_attention_heads: Number of attention heads in the Transformer.
      intermediate_size: The size of the "intermediate" (a.k.a., feed
        forward) layer.
      intermediate_act_fn: The non-linear activation function to apply
        to the output of the intermediate/feed-forward layer.
      hidden_dropout_prob: Dropout probability for the hidden layers.
      attention_probs_dropout_prob: Dropout probability of the attention
        probabilities.
      intra_bottleneck_size: Size of bottleneck.
      use_bottleneck_attention: Use attention inputs from the bottleneck
        transformation. If true, the following `key_query_shared_bottleneck`
        will be ignored.
      key_query_shared_bottleneck: Whether to share linear transformation for
        keys and queries.
      num_feedforward_networks: Number of stacked feed-forward networks.
      normalization_type: The type of normalization_type, only `no_norm` and
        `layer_norm` are supported. `no_norm` represents the element-wise
        linear transformation for the student model, as suggested by the
        original MobileBERT paper. `layer_norm` is used for the teacher model.
      initializer: The initializer to use for the embedding weights and
        linear projection weights.
      **kwargs: keyword arguments.

    Raises:
      ValueError: A Tensor shape or parameter is invalid.
    """
    super(MobileBertTransformer, self).__init__(**kwargs)
    self.hidden_size = hidden_size
    self.num_attention_heads = num_attention_heads
    self.intermediate_size = intermediate_size
    self.intermediate_act_fn = intermediate_act_fn
    self.hidden_dropout_prob = hidden_dropout_prob
    self.attention_probs_dropout_prob = attention_probs_dropout_prob
    self.intra_bottleneck_size = intra_bottleneck_size
    self.use_bottleneck_attention = use_bottleneck_attention
    self.key_query_shared_bottleneck = key_query_shared_bottleneck
    self.num_feedforward_networks = num_feedforward_networks
    self.normalization_type = normalization_type
    self.initializer = tf.keras.initializers.get(initializer)

    if intra_bottleneck_size % num_attention_heads != 0:
      raise ValueError(
          (f'The bottleneck size {intra_bottleneck_size} is not a multiple '
           f'of the number of attention heads {num_attention_heads}.'))
    # Per-head width; attention operates at the bottleneck width, not the
    # full hidden width.
    attention_head_size = int(intra_bottleneck_size / num_attention_heads)

    # Sub-layers are kept in a dict keyed by stage name; construction order
    # and the explicit `name=` strings define the checkpoint variable scopes.
    self.block_layers = {}
    # add input bottleneck
    dense_layer_2d = tf.keras.layers.experimental.EinsumDense(
        'abc,cd->abd',
        output_shape=[None, self.intra_bottleneck_size],
        bias_axes='d',
        kernel_initializer=initializer,
        name='bottleneck_input/dense')
    layer_norm = _get_norm_layer(self.normalization_type,
                                 name='bottleneck_input/norm')
    self.block_layers['bottleneck_input'] = [dense_layer_2d,
                                             layer_norm]

    if self.key_query_shared_bottleneck:
      # Separate projection shared by keys and queries (values stay at the
      # full hidden width in this configuration; see `call`).
      dense_layer_2d = tf.keras.layers.experimental.EinsumDense(
          'abc,cd->abd',
          output_shape=[None, self.intra_bottleneck_size],
          bias_axes='d',
          kernel_initializer=initializer,
          name='kq_shared_bottleneck/dense')
      layer_norm = _get_norm_layer(self.normalization_type,
                                   name='kq_shared_bottleneck/norm')
      self.block_layers['kq_shared_bottleneck'] = [dense_layer_2d,
                                                   layer_norm]

    # add attention layer
    attention_layer = tf.keras.layers.MultiHeadAttention(
        num_heads=self.num_attention_heads,
        key_dim=attention_head_size,
        value_dim=attention_head_size,
        dropout=self.attention_probs_dropout_prob,
        output_shape=self.intra_bottleneck_size,
        kernel_initializer=initializer,
        name='attention')
    layer_norm = _get_norm_layer(self.normalization_type,
                                 name='attention/norm')
    self.block_layers['attention'] = [attention_layer,
                                      layer_norm]

    # add stacked feed-forward networks
    self.block_layers['ffn'] = []
    for ffn_layer_idx in range(self.num_feedforward_networks):
      layer_prefix = f'ffn_layer_{ffn_layer_idx}'
      layer_name = layer_prefix + '/intermediate_dense'
      intermediate_layer = tf.keras.layers.experimental.EinsumDense(
          'abc,cd->abd',
          activation=self.intermediate_act_fn,
          output_shape=[None, self.intermediate_size],
          bias_axes='d',
          kernel_initializer=initializer,
          name=layer_name)
      layer_name = layer_prefix + '/output_dense'
      output_layer = tf.keras.layers.experimental.EinsumDense(
          'abc,cd->abd',
          output_shape=[None, self.intra_bottleneck_size],
          bias_axes='d',
          kernel_initializer=initializer,
          name=layer_name)
      layer_name = layer_prefix + '/norm'
      layer_norm = _get_norm_layer(self.normalization_type,
                                   name=layer_name)
      self.block_layers['ffn'].append([intermediate_layer,
                                       output_layer,
                                       layer_norm])

    # add output bottleneck
    bottleneck = tf.keras.layers.experimental.EinsumDense(
        'abc,cd->abd',
        output_shape=[None, self.hidden_size],
        activation=None,
        bias_axes='d',
        kernel_initializer=initializer,
        name='bottleneck_output/dense')
    dropout_layer = tf.keras.layers.Dropout(
        self.hidden_dropout_prob,
        name='bottleneck_output/dropout')
    layer_norm = _get_norm_layer(self.normalization_type,
                                 name='bottleneck_output/norm')
    self.block_layers['bottleneck_output'] = [bottleneck,
                                              dropout_layer,
                                              layer_norm]

  def get_config(self):
    """Returns the config of the layer for serialization."""
    config = {
        'hidden_size': self.hidden_size,
        'num_attention_heads': self.num_attention_heads,
        'intermediate_size': self.intermediate_size,
        'intermediate_act_fn': self.intermediate_act_fn,
        'hidden_dropout_prob': self.hidden_dropout_prob,
        'attention_probs_dropout_prob': self.attention_probs_dropout_prob,
        'intra_bottleneck_size': self.intra_bottleneck_size,
        'use_bottleneck_attention': self.use_bottleneck_attention,
        'key_query_shared_bottleneck': self.key_query_shared_bottleneck,
        'num_feedforward_networks': self.num_feedforward_networks,
        'normalization_type': self.normalization_type,
        'initializer': tf.keras.initializers.serialize(self.initializer),
    }
    base_config = super(MobileBertTransformer, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def call(self,
           input_tensor,
           attention_mask=None,
           return_attention_scores=False):
    """Implementes the forward pass.

    Args:
      input_tensor: Float tensor of shape
        `(batch_size, seq_length, hidden_size)`.
      attention_mask: (optional) int32 tensor of shape
        `(batch_size, seq_length, seq_length)`, with 1 for positions that can
        be attended to and 0 in positions that should not be.
      return_attention_scores: If return attention score.

    Returns:
      layer_output: Float tensor of shape
        `(batch_size, seq_length, hidden_size)`.
      attention_scores (Optional): Only when return_attention_scores is True.

    Raises:
      ValueError: A Tensor shape or parameter is invalid.
    """
    input_width = input_tensor.shape.as_list()[-1]
    if input_width != self.hidden_size:
      raise ValueError(
          (f'The width of the input tensor {input_width} != '
           f'hidden size {self.hidden_size}'))

    prev_output = input_tensor

    # input bottleneck: project hidden_size down to intra_bottleneck_size.
    dense_layer = self.block_layers['bottleneck_input'][0]
    layer_norm = self.block_layers['bottleneck_input'][1]
    layer_input = dense_layer(prev_output)
    layer_input = layer_norm(layer_input)

    # Select where attention's query/key/value come from, depending on the
    # bottleneck configuration chosen at construction time.
    if self.use_bottleneck_attention:
      key_tensor = layer_input
      query_tensor = layer_input
      value_tensor = layer_input
    elif self.key_query_shared_bottleneck:
      dense_layer = self.block_layers['kq_shared_bottleneck'][0]
      layer_norm = self.block_layers['kq_shared_bottleneck'][1]
      shared_attention_input = dense_layer(prev_output)
      shared_attention_input = layer_norm(shared_attention_input)
      key_tensor = shared_attention_input
      query_tensor = shared_attention_input
      value_tensor = prev_output
    else:
      key_tensor = prev_output
      query_tensor = prev_output
      value_tensor = prev_output

    # attention layer
    attention_layer = self.block_layers['attention'][0]
    layer_norm = self.block_layers['attention'][1]
    # Positional argument order for MultiHeadAttention is
    # (query, value, key, attention_mask).
    attention_output, attention_scores = attention_layer(
        query_tensor,
        value_tensor,
        key_tensor,
        attention_mask,
        return_attention_scores=True,
    )
    attention_output = layer_norm(attention_output + layer_input)

    # stacked feed-forward networks, each with its own residual + norm.
    layer_input = attention_output
    for ffn_idx in range(self.num_feedforward_networks):
      intermediate_layer = self.block_layers['ffn'][ffn_idx][0]
      output_layer = self.block_layers['ffn'][ffn_idx][1]
      layer_norm = self.block_layers['ffn'][ffn_idx][2]
      intermediate_output = intermediate_layer(layer_input)
      layer_output = output_layer(intermediate_output)
      layer_output = layer_norm(layer_output + layer_input)
      layer_input = layer_output

    # output bottleneck: project back to hidden_size, with a residual from
    # the original block input.
    bottleneck = self.block_layers['bottleneck_output'][0]
    dropout_layer = self.block_layers['bottleneck_output'][1]
    layer_norm = self.block_layers['bottleneck_output'][2]
    layer_output = bottleneck(layer_output)
    layer_output = dropout_layer(layer_output)
    layer_output = layer_norm(layer_output + prev_output)

    if return_attention_scores:
      return layer_output, attention_scores
    else:
      return layer_output
@tf.keras.utils.register_keras_serializable(package='Text')
class MobileBertMaskedLM(tf.keras.layers.Layer):
  """Masked language model network head for BERT modeling.

  This layer implements a masked language model based on the provided
  transformer based encoder. It assumes that the encoder network being passed
  has a "get_embedding_table()" method. Different from canonical BERT's masked
  LM layer, when the embedding width is smaller than hidden_size, it adds an
  extra output weights in shape [vocab_size, (hidden_size - embedding_width)].
  """

  def __init__(self,
               embedding_table,
               activation=None,
               initializer='glorot_uniform',
               output='logits',
               **kwargs):
    """Class initialization.

    Args:
      embedding_table: The embedding table from encoder network.
      activation: The activation, if any, for the dense layer.
      initializer: The initializer for the dense layer. Defaults to a Glorot
        uniform initializer.
      output: The output style for this layer. Can be either `logits` or
        `predictions`.
      **kwargs: keyword arguments.
    """
    super(MobileBertMaskedLM, self).__init__(**kwargs)
    self.embedding_table = embedding_table
    self.activation = activation
    self.initializer = tf.keras.initializers.get(initializer)

    if output not in ('predictions', 'logits'):
      raise ValueError(
          ('Unknown `output` value "%s". `output` can be either "logits" or '
           '"predictions"') % output)
    self._output_type = output

  def build(self, input_shape):
    self._vocab_size, embedding_width = self.embedding_table.shape
    hidden_size = input_shape[-1]
    self.dense = tf.keras.layers.Dense(
        hidden_size,
        activation=self.activation,
        kernel_initializer=self.initializer,
        name='transform/dense')
    # When hidden_size exceeds the embedding width, learn extra output
    # columns so the output projection [vocab_size, hidden_size] can be
    # formed by concatenating the shared embedding table with these weights.
    if hidden_size > embedding_width:
      self.extra_output_weights = self.add_weight(
          'extra_output_weights',
          shape=(self._vocab_size, hidden_size - embedding_width),
          initializer=self.initializer,
          trainable=True)
    elif hidden_size == embedding_width:
      self.extra_output_weights = None
    else:
      raise ValueError(
          'hidden size %d cannot be smaller than embedding width %d.' %
          (hidden_size, embedding_width))
    self.layer_norm = tf.keras.layers.LayerNormalization(
        axis=-1, epsilon=1e-12, name='transform/LayerNorm')
    self.bias = self.add_weight(
        'output_bias/bias',
        shape=(self._vocab_size,),
        initializer='zeros',
        trainable=True)

    super(MobileBertMaskedLM, self).build(input_shape)

  def call(self, sequence_data, masked_positions):
    masked_lm_input = self._gather_indexes(sequence_data, masked_positions)
    lm_data = self.dense(masked_lm_input)
    lm_data = self.layer_norm(lm_data)
    # Tie the output projection to the (possibly extended) embedding table.
    if self.extra_output_weights is None:
      lm_data = tf.matmul(lm_data, self.embedding_table, transpose_b=True)
    else:
      lm_data = tf.matmul(
          lm_data,
          tf.concat([self.embedding_table, self.extra_output_weights], axis=1),
          transpose_b=True)
    logits = tf.nn.bias_add(lm_data, self.bias)
    # Prefer the static length when known; otherwise fall back to the
    # dynamic shape.
    masked_positions_length = masked_positions.shape.as_list()[1] or tf.shape(
        masked_positions)[1]
    logits = tf.reshape(logits,
                        [-1, masked_positions_length, self._vocab_size])
    if self._output_type == 'logits':
      return logits
    return tf.nn.log_softmax(logits)

  def get_config(self):
    raise NotImplementedError('MaskedLM cannot be directly serialized because '
                              'it has variable sharing logic.')

  def _gather_indexes(self, sequence_tensor, positions):
    """Gathers the vectors at the specific positions.

    Args:
      sequence_tensor: Sequence output of `BertModel` layer of shape
        `(batch_size, seq_length, num_hidden)` where `num_hidden` is number of
        hidden units of `BertModel` layer.
      positions: Positions ids of tokens in sequence to mask for pretraining
        of with dimension `(batch_size, num_predictions)` where
        `num_predictions` is maximum number of tokens to mask out and predict
        per each sequence.

    Returns:
      Masked out sequence tensor of shape
      `(batch_size * num_predictions, num_hidden)`.
    """
    sequence_shape = tf.shape(sequence_tensor)
    batch_size, seq_length = sequence_shape[0], sequence_shape[1]
    # Use the static width when available so the reshape keeps a known
    # last dimension.
    width = sequence_tensor.shape.as_list()[2] or sequence_shape[2]

    # Flatten (batch, seq) into one axis and offset each row's positions by
    # its row start so a single tf.gather picks out the masked vectors.
    flat_offsets = tf.reshape(
        tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1])
    flat_positions = tf.reshape(positions + flat_offsets, [-1])
    flat_sequence_tensor = tf.reshape(sequence_tensor,
                                      [batch_size * seq_length, width])
    output_tensor = tf.gather(flat_sequence_tensor, flat_positions)

    return output_tensor
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from official.nlp.modeling.layers import mobile_bert_layers
from official.nlp.modeling.networks import mobile_bert_encoder
def generate_fake_input(batch_size=1, seq_len=5, vocab_size=10000, seed=0):
  """Generate consistent fake integer input sequences."""
  np.random.seed(seed)
  # Draw one token id at a time, row-major, so a given seed always yields
  # exactly the same array (same draw order as a nested append loop).
  rows = [[np.random.randint(0, vocab_size) for _ in range(seq_len)]
          for _ in range(batch_size)]
  return np.asarray(rows)
class MobileBertEncoderTest(parameterized.TestCase, tf.test.TestCase):
  """Tests for MobileBERT embedding and transformer building blocks."""

  def test_embedding_layer_with_token_type(self):
    embedding_layer = mobile_bert_layers.MobileBertEmbedding(10, 8, 2, 16)
    word_ids = tf.Variable([[2, 3, 4, 5]])
    type_ids = tf.Variable([[0, 1, 1, 1]])
    embeddings = embedding_layer(word_ids, type_ids)
    # One example, four tokens, output embedding width 16.
    self.assertListEqual(embeddings.shape.as_list(), [1, 4, 16])

  def test_embedding_layer_without_token_type(self):
    embedding_layer = mobile_bert_layers.MobileBertEmbedding(10, 8, 2, 16)
    word_ids = tf.Variable([[2, 3, 4, 5]])
    embeddings = embedding_layer(word_ids)
    self.assertListEqual(embeddings.shape.as_list(), [1, 4, 16])

  def test_embedding_layer_get_config(self):
    embedding_layer = mobile_bert_layers.MobileBertEmbedding(
        word_vocab_size=16,
        word_embed_size=32,
        type_vocab_size=4,
        output_embed_size=32,
        max_sequence_length=32,
        normalization_type='layer_norm',
        initializer=tf.keras.initializers.TruncatedNormal(stddev=0.01),
        dropout_rate=0.5)
    config = embedding_layer.get_config()
    # Round-tripping through from_config must preserve the configuration.
    rebuilt = mobile_bert_layers.MobileBertEmbedding.from_config(config)
    self.assertEqual(config, rebuilt.get_config())

  def test_no_norm(self):
    no_norm_layer = mobile_bert_layers.NoNorm()
    inputs = tf.random.normal([2, 3, 4])
    outputs = no_norm_layer(inputs)
    # NoNorm is shape-preserving.
    self.assertListEqual(outputs.shape.as_list(), [2, 3, 4])

  @parameterized.named_parameters(('with_kq_shared_bottleneck', False),
                                  ('without_kq_shared_bottleneck', True))
  def test_transfomer_kq_shared_bottleneck(self, is_kq_shared):
    inputs = tf.random.uniform([2, 3, 512])
    transformer = mobile_bert_layers.MobileBertTransformer(
        key_query_shared_bottleneck=is_kq_shared)
    outputs = transformer(inputs)
    self.assertListEqual(outputs.shape.as_list(), [2, 3, 512])

  def test_transfomer_with_mask(self):
    inputs = tf.random.uniform([2, 3, 512])
    attention_mask = np.asarray(
        [[[0., 0., 1.], [0., 0., 1.], [0., 0., 1.]],
         [[0., 1., 1.], [0., 1., 1.], [0., 1., 1.]]])
    transformer = mobile_bert_layers.MobileBertTransformer()
    outputs = transformer(inputs, attention_mask)
    self.assertListEqual(outputs.shape.as_list(), [2, 3, 512])

  def test_transfomer_return_attention_score(self):
    seq_len = 5
    num_heads = 8
    inputs = tf.random.uniform([2, seq_len, 512])
    transformer = mobile_bert_layers.MobileBertTransformer(
        num_attention_heads=num_heads)
    _, attention_scores = transformer(inputs, return_attention_scores=True)
    self.assertListEqual(attention_scores.shape.as_list(),
                         [2, num_heads, seq_len, seq_len])

  def test_transformer_get_config(self):
    transformer = mobile_bert_layers.MobileBertTransformer(
        hidden_size=32,
        num_attention_heads=2,
        intermediate_size=48,
        intermediate_act_fn='gelu',
        hidden_dropout_prob=0.5,
        attention_probs_dropout_prob=0.4,
        intra_bottleneck_size=64,
        use_bottleneck_attention=True,
        key_query_shared_bottleneck=False,
        num_feedforward_networks=2,
        normalization_type='layer_norm',
        initializer=tf.keras.initializers.TruncatedNormal(stddev=0.01),
        name='block')
    config = transformer.get_config()
    rebuilt = mobile_bert_layers.MobileBertTransformer.from_config(config)
    self.assertEqual(config, rebuilt.get_config())
class MobileBertMaskedLMTest(tf.test.TestCase):
  """Tests for the MobileBERT masked language-model head."""

  def create_layer(self,
                   vocab_size,
                   hidden_size,
                   embedding_width,
                   output='predictions',
                   xformer_stack=None):
    """Builds a `MobileBertMaskedLM` sharing an encoder's embedding table.

    Args:
      vocab_size: Size of the output vocabulary.
      hidden_size: Hidden size of the encoder providing the embedding table.
      embedding_width: Width of the word embeddings.
      output: Output mode for the LM head, 'predictions' or 'logits'.
      xformer_stack: Optional pre-built `MobileBERTEncoder` whose embedding
        table is shared with the LM head; a fresh one is created when None.

    Returns:
      A `MobileBertMaskedLM` layer.
    """
    # First, create a transformer stack that we can use to get the LM's
    # vocabulary weight.
    if xformer_stack is None:
      xformer_stack = mobile_bert_encoder.MobileBERTEncoder(
          word_vocab_size=vocab_size,
          num_blocks=1,
          hidden_size=hidden_size,
          num_attention_heads=4,
          word_embed_size=embedding_width)

    # Create a maskedLM from the transformer stack.
    test_layer = mobile_bert_layers.MobileBertMaskedLM(
        embedding_table=xformer_stack.get_embedding_table(), output=output)
    return test_layer

  def test_layer_creation(self):
    vocab_size = 100
    sequence_length = 32
    hidden_size = 64
    embedding_width = 32
    num_predictions = 21
    test_layer = self.create_layer(
        vocab_size=vocab_size,
        hidden_size=hidden_size,
        embedding_width=embedding_width)

    # Make sure that the output tensor of the masked LM is the right shape.
    lm_input_tensor = tf.keras.Input(shape=(sequence_length, hidden_size))
    masked_positions = tf.keras.Input(shape=(num_predictions,), dtype=tf.int32)
    output = test_layer(lm_input_tensor, masked_positions=masked_positions)

    expected_output_shape = [None, num_predictions, vocab_size]
    self.assertEqual(expected_output_shape, output.shape.as_list())

  def test_layer_invocation_with_external_logits(self):
    vocab_size = 100
    sequence_length = 32
    hidden_size = 64
    embedding_width = 32
    num_predictions = 21
    xformer_stack = mobile_bert_encoder.MobileBERTEncoder(
        word_vocab_size=vocab_size,
        num_blocks=1,
        hidden_size=hidden_size,
        num_attention_heads=4,
        word_embed_size=embedding_width)
    test_layer = self.create_layer(
        vocab_size=vocab_size,
        hidden_size=hidden_size,
        embedding_width=embedding_width,
        xformer_stack=xformer_stack,
        output='predictions')
    logit_layer = self.create_layer(
        vocab_size=vocab_size,
        hidden_size=hidden_size,
        embedding_width=embedding_width,
        xformer_stack=xformer_stack,
        output='logits')

    # Create a model from the masked LM layer.
    lm_input_tensor = tf.keras.Input(shape=(sequence_length, hidden_size))
    masked_positions = tf.keras.Input(shape=(num_predictions,), dtype=tf.int32)
    output = test_layer(lm_input_tensor, masked_positions)
    logit_output = logit_layer(lm_input_tensor, masked_positions)
    logit_output = tf.keras.layers.Activation(tf.nn.log_softmax)(logit_output)
    # Tie the weights so predictions and log_softmax(logits) must agree.
    logit_layer.set_weights(test_layer.get_weights())
    model = tf.keras.Model([lm_input_tensor, masked_positions], output)
    logits_model = tf.keras.Model(([lm_input_tensor, masked_positions]),
                                  logit_output)

    # Invoke the masked LM on some fake data to make sure there are no runtime
    # errors in the code.
    batch_size = 3
    lm_input_data = 10 * np.random.random_sample(
        (batch_size, sequence_length, hidden_size))
    masked_position_data = np.random.randint(
        sequence_length, size=(batch_size, num_predictions))
    ref_outputs = model([lm_input_data, masked_position_data])
    outputs = logits_model([lm_input_data, masked_position_data])

    # Ensure that the tensor shapes are correct.
    expected_output_shape = (batch_size, num_predictions, vocab_size)
    self.assertEqual(expected_output_shape, ref_outputs.shape)
    self.assertEqual(expected_output_shape, outputs.shape)
    self.assertAllClose(ref_outputs, outputs)

  def test_layer_invocation(self):
    vocab_size = 100
    sequence_length = 32
    hidden_size = 64
    embedding_width = 32
    num_predictions = 21
    test_layer = self.create_layer(
        vocab_size=vocab_size,
        hidden_size=hidden_size,
        embedding_width=embedding_width)

    # Create a model from the masked LM layer.
    lm_input_tensor = tf.keras.Input(shape=(sequence_length, hidden_size))
    masked_positions = tf.keras.Input(shape=(num_predictions,), dtype=tf.int32)
    output = test_layer(lm_input_tensor, masked_positions)
    model = tf.keras.Model([lm_input_tensor, masked_positions], output)

    # Invoke the masked LM on some fake data to make sure there are no runtime
    # errors in the code. Sample positions across the whole sequence,
    # consistent with test_layer_invocation_with_external_logits above
    # (previously only positions 0/1 were exercised).
    batch_size = 3
    lm_input_data = 10 * np.random.random_sample(
        (batch_size, sequence_length, hidden_size))
    masked_position_data = np.random.randint(
        sequence_length, size=(batch_size, num_predictions))
    _ = model.predict([lm_input_data, masked_position_data])

  def test_unknown_output_type_fails(self):
    with self.assertRaisesRegex(ValueError, 'Unknown `output` value "bad".*'):
      _ = self.create_layer(
          vocab_size=8, hidden_size=8, embedding_width=4, output='bad')

  def test_hidden_size_smaller_than_embedding_width(self):
    hidden_size = 8
    sequence_length = 32
    num_predictions = 20
    with self.assertRaisesRegex(
        ValueError, 'hidden size 8 cannot be smaller than embedding width 16.'):
      test_layer = self.create_layer(
          vocab_size=8, hidden_size=8, embedding_width=16)
      lm_input_tensor = tf.keras.Input(shape=(sequence_length, hidden_size))
      masked_positions = tf.keras.Input(
          shape=(num_predictions,), dtype=tf.int32)
      _ = test_layer(lm_input_tensor, masked_positions)
# Run all test cases via the TensorFlow test runner when executed directly.
if __name__ == '__main__':
  tf.test.main()
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -12,29 +11,23 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Multi-channel Attention."""
# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import math
import tensorflow as tf
from official.modeling import tf_utils
from official.nlp.modeling.layers import attention
from official.nlp.modeling.layers import masked_softmax
class VotingAttention(tf.keras.layers.Layer):
"""Voting Attention layer.
Arguments:
num_heads: the number of attention heads.
head_size: per-head hidden size.
Args:
num_heads: The number of attention heads.
head_size: Per-head hidden size.
kernel_initializer: Initializer for dense layer kernels.
bias_initializer: Initializer for dense layer biases.
kernel_regularizer: Regularizer for dense layer kernels.
......@@ -107,43 +100,61 @@ class VotingAttention(tf.keras.layers.Layer):
return tf.nn.softmax(doc_attention_probs + infadder)
class MultiChannelAttention(attention.MultiHeadAttention):
class MultiChannelAttention(tf.keras.layers.MultiHeadAttention):
"""Multi-channel Attention layer.
Introduced in: https://arxiv.org/abs/2001.09386. Expects multiple
cross-attention target sequences.
Introduced in, [Generating Representative Headlines for News Stories
](https://arxiv.org/abs/2001.09386). Expects multiple cross-attention
target sequences.
Call args:
query: Query `Tensor` of shape `[B, T, dim]`.
value: Value `Tensor` of shape `[B, A, S, dim]`, where A denotes the
context_attention_weights: Context weights of shape `[B, N, T, A]`, where N
is the number of attention heads. Combines multi-channel sources
context tensors according to the distribution among channels.
key: Optional key `Tensor` of shape `[B, A, S, dim]`. If not given, will use
`value` for both `key` and `value`, which is the most common case.
attention_mask: A boolean mask of shape `[B, T, S]`, that prevents attention
to certain positions.
"""
def _build_attention(self, qkv_rank):
super(MultiChannelAttention, self)._build_attention(qkv_rank)
def _build_attention(self, rank):
super(MultiChannelAttention, self)._build_attention(rank)
self._masked_softmax = masked_softmax.MaskedSoftmax(mask_expansion_axes=[2])
def call(self, inputs, attention_mask=None):
from_tensor = inputs[0]
to_tensor = inputs[1]
doc_attention_probs = inputs[2]
def call(self,
query,
value,
key=None,
context_attention_weights=None,
attention_mask=None):
if not self._built_from_signature:
self._build_from_signature(query, value, key=key)
if key is None:
key = value
# Scalar dimensions referenced here:
# B = batch size (number of stories)
# A = num_docs (number of docs)
# F = `from_tensor` sequence length
# T = `to_tensor` sequence length
# F = target sequence length
# T = source sequence length
# N = `num_attention_heads`
# H = `size_per_head`
# `query_tensor` = [B, F, N ,H]
query_tensor = self._query_dense(from_tensor)
query_tensor = self._query_dense(query)
# `key_tensor` = [B, A, T, N, H]
key_tensor = self._key_dense(to_tensor)
key_tensor = self._key_dense(key)
# `value_tensor` = [B, A, T, N, H]
value_tensor = self._value_dense(to_tensor)
value_tensor = self._value_dense(value)
# Take the dot product between "query" and "key" to get the raw
# attention scores.
attention_scores = tf.einsum("BATNH,BFNH->BANFT", key_tensor, query_tensor)
attention_scores = tf.multiply(attention_scores,
1.0 / math.sqrt(float(self._key_size)))
1.0 / math.sqrt(float(self._key_dim)))
# Normalize the attention scores to probabilities.
# `attention_probs` = [B, A, N, F, T]
......@@ -156,7 +167,7 @@ class MultiChannelAttention(attention.MultiHeadAttention):
# `context_layer` = [B, F, N, H]
context_layer = tf.einsum("BANFT,BATNH->BAFNH", attention_probs,
value_tensor)
attention_output = tf.einsum("BNFA,BAFNH->BFNH", doc_attention_probs,
attention_output = tf.einsum("BNFA,BAFNH->BFNH", context_attention_weights,
context_layer)
attention_output = self._output_dense(attention_output)
return attention_output
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -12,12 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for nlp.nhnet.multi_channel_attention."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
"""Tests for nlp.nhnet.multi_channel_attention."""
import numpy as np
import tensorflow as tf
......@@ -41,14 +36,18 @@ class MultiChannelAttentionTest(tf.test.TestCase):
num_heads = 2
num_docs = 5
attention_layer = multi_channel_attention.MultiChannelAttention(
num_heads, key_size=2)
num_heads, key_dim=2)
from_data = 10 * np.random.random_sample((3, 4, 8))
to_data = 10 * np.random.random_sample((3, num_docs, 2, 8))
mask_data = np.random.randint(2, size=(3, num_docs, 4, 2))
doc_probs = np.random.randint(
2, size=(3, num_heads, 4, num_docs)).astype(float)
outputs = attention_layer([from_data, to_data, doc_probs], mask_data)
outputs = attention_layer(
query=from_data,
value=to_data,
context_attention_weights=doc_probs,
attention_mask=mask_data)
self.assertEqual(outputs.shape, (3, 4, 8))
......
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -11,78 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Keras-based one-hot embedding layer."""
# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import tensorflow as tf
@tf.keras.utils.register_keras_serializable(package="Text")
class OnDeviceEmbedding(tf.keras.layers.Layer):
"""Performs an embedding lookup suitable for accelerator devices.
This layer uses either tf.gather or tf.one_hot to translate integer indices to
float embeddings.
Arguments:
vocab_size: Number of elements in the vocabulary.
embedding_width: Output size of the embedding layer.
initializer: The initializer to use for the embedding weights. Defaults to
"glorot_uniform".
use_one_hot: Whether to use tf.one_hot over tf.gather for the embedding
lookup. Defaults to False (that is, using tf.gather). Setting this option
to True may improve performance, especially on small vocabulary sizes, but
will generally require more memory.
"""
def __init__(self,
vocab_size,
embedding_width,
initializer="glorot_uniform",
use_one_hot=False,
**kwargs):
super(OnDeviceEmbedding, self).__init__(**kwargs)
self._vocab_size = vocab_size
self._embedding_width = embedding_width
self._initializer = initializer
self._use_one_hot = use_one_hot
def get_config(self):
config = {
"vocab_size": self._vocab_size,
"embedding_width": self._embedding_width,
"initializer": self._initializer,
"use_one_hot": self._use_one_hot,
}
base_config = super(OnDeviceEmbedding, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def build(self, input_shape):
self.embeddings = self.add_weight(
"embeddings",
shape=[self._vocab_size, self._embedding_width],
initializer=self._initializer,
dtype=tf.float32)
from official.nlp import keras_nlp
super(OnDeviceEmbedding, self).build(input_shape)
def call(self, inputs):
flat_inputs = tf.reshape(inputs, [-1])
if self._use_one_hot:
one_hot_data = tf.one_hot(
flat_inputs, depth=self._vocab_size, dtype=self.embeddings.dtype)
embeddings = tf.matmul(one_hot_data, self.embeddings)
else:
embeddings = tf.gather(self.embeddings, flat_inputs)
embeddings = tf.reshape(
embeddings,
# Work around b/142213824: prefer concat to shape over a Python list.
tf.concat([tf.shape(inputs), [self._embedding_width]], axis=0))
embeddings.set_shape(inputs.shape.as_list() + [self._embedding_width])
return embeddings
OnDeviceEmbedding = keras_nlp.layers.OnDeviceEmbedding
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -11,115 +11,17 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Keras-based positional embedding layer."""
# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import math
from typing import Optional
import tensorflow as tf
from official.modeling import tf_utils
@tf.keras.utils.register_keras_serializable(package="Text")
class PositionEmbedding(tf.keras.layers.Layer):
  """Creates a positional embedding.

  This layer creates a positional embedding as described in "BERT: Pre-training
  of Deep Bidirectional Transformers for Language Understanding"
  (https://arxiv.org/abs/1810.04805).

  This layer can be set up to either create a statically shaped slice or a
  dynamically shaped slice. If `use_dynamic_slicing` is True, the input tensor
  can have a dynamic 1st dimension, while if `use_dynamic_slicing` is False the
  input size must be fixed.

  Arguments:
    use_dynamic_slicing: Whether to use the dynamic slicing path.
    max_sequence_length: The maximum size of the dynamic sequence. Only
      applicable if `use_dynamic_slicing` is True.
    initializer: The initializer to use for the embedding weights. Defaults to
      "glorot_uniform".
  """

  def __init__(self,
               initializer="glorot_uniform",
               use_dynamic_slicing=False,
               max_sequence_length=None,
               **kwargs):
    # We need to have a default dtype of float32, since the inputs (which Keras
    # usually uses to infer the dtype) will always be int32.
    if "dtype" not in kwargs:
      kwargs["dtype"] = "float32"

    super(PositionEmbedding, self).__init__(**kwargs)
    # Dynamic slicing needs an upper bound to size the weight table.
    if use_dynamic_slicing and max_sequence_length is None:
      raise ValueError(
          "If `use_dynamic_slicing` is True, `max_sequence_length` must be set."
      )
    self._max_sequence_length = max_sequence_length
    self._initializer = tf.keras.initializers.get(initializer)
    self._use_dynamic_slicing = use_dynamic_slicing

  def get_config(self):
    config = {
        "max_sequence_length": self._max_sequence_length,
        "initializer": tf.keras.initializers.serialize(self._initializer),
        "use_dynamic_slicing": self._use_dynamic_slicing,
    }
    base_config = super(PositionEmbedding, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def build(self, input_shape):
    """Implements build() for the layer."""
    dimension_list = input_shape.as_list()

    if len(dimension_list) != 3:
      raise ValueError("PositionEmbedding expects a 3-dimensional input tensor "
                       "of shape [batch, sequence, width]")
    seq_length = dimension_list[1]
    width = dimension_list[2]

    # If we are not using dynamic slicing, we must assume that the sequence
    # length is fixed and max_sequence_length should not be specified.
    if not self._use_dynamic_slicing:
      if seq_length is None:
        raise ValueError(
            "PositionEmbedding must have `use_dynamic_slicing` set "
            "to True (and max_sequence_length set) when the "
            "sequence (1st) dimension of the input is None.")
      if self._max_sequence_length is not None:
        raise ValueError(
            "When `use_dynamic_slicing` is False, max_sequence_length should "
            "not be specified and we ought to use seq_length to get the "
            "variable shape.")

    # The weight table covers the max length (dynamic) or the fixed length.
    if self._max_sequence_length is not None:
      weight_sequence_length = self._max_sequence_length
    else:
      weight_sequence_length = seq_length

    self._position_embeddings = self.add_weight(
        "embeddings",
        shape=[weight_sequence_length, width],
        initializer=self._initializer)

    super(PositionEmbedding, self).build(input_shape)

  def call(self, inputs):
    """Implements call() for the layer."""
    input_shape = tf_utils.get_shape_list(inputs, expected_rank=3)
    if self._use_dynamic_slicing:
      # Slice the weight table down to the actual sequence length.
      position_embeddings = self._position_embeddings[:input_shape[1], :]
    else:
      position_embeddings = self._position_embeddings
    return tf.broadcast_to(position_embeddings, input_shape)
# Alias used for the `embeddings_initializer` type annotation below.
Initializer = tf.keras.initializers.Initializer
@tf.keras.utils.register_keras_serializable(package="Text")
......@@ -131,16 +33,16 @@ class RelativePositionEmbedding(tf.keras.layers.Layer):
"Attention is All You Need", section 3.5.
(https://arxiv.org/abs/1706.03762).
Arguments:
Args:
hidden_size: Size of the hidden layer.
min_timescale: Minimum scale that will be applied at each position
max_timescale: Maximum scale that will be applied at each position.
"""
def __init__(self,
hidden_size,
min_timescale=1.0,
max_timescale=1.0e4,
hidden_size: int,
min_timescale: float = 1.0,
max_timescale: float = 1.0e4,
**kwargs):
# We need to have a default dtype of float32, since the inputs (which Keras
# usually uses to infer the dtype) will always be int32.
......@@ -150,7 +52,7 @@ class RelativePositionEmbedding(tf.keras.layers.Layer):
if "dtype" not in kwargs:
kwargs["dtype"] = "float32"
super(RelativePositionEmbedding, self).__init__(**kwargs)
super().__init__(**kwargs)
self._hidden_size = hidden_size
self._min_timescale = min_timescale
self._max_timescale = max_timescale
......@@ -160,7 +62,6 @@ class RelativePositionEmbedding(tf.keras.layers.Layer):
"hidden_size": self._hidden_size,
"min_timescale": self._min_timescale,
"max_timescale": self._max_timescale,
"length": self._length,
}
base_config = super(RelativePositionEmbedding, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
......@@ -172,22 +73,20 @@ class RelativePositionEmbedding(tf.keras.layers.Layer):
inputs: An tensor whose second dimension will be used as `length`. If
`None`, the other `length` argument must be specified.
length: An optional integer specifying the number of positions. If both
`inputs` and `length` are spcified, `length` must be equal to the
second dimension of `inputs`.
`inputs` and `length` are spcified, `length` must be equal to the second
dimension of `inputs`.
Returns:
A tensor in shape of [length, hidden_size].
A tensor in shape of `(length, hidden_size)`.
"""
if inputs is None and length is None:
raise ValueError(
"If inputs is None, `length` must be set in "
"RelativePositionEmbedding().")
raise ValueError("If inputs is None, `length` must be set in "
"RelativePositionEmbedding().")
if inputs is not None:
input_shape = tf_utils.get_shape_list(inputs)
if length is not None and length != input_shape[1]:
raise ValueError(
"If inputs is not None, `length` must equal to input_shape[1]."
)
"If inputs is not None, `length` must equal to input_shape[1].")
length = input_shape[1]
position = tf.cast(tf.range(length), tf.float32)
num_timescales = self._hidden_size // 2
......@@ -198,8 +97,141 @@ class RelativePositionEmbedding(tf.keras.layers.Layer):
inv_timescales = min_timescale * tf.exp(
tf.cast(tf.range(num_timescales), tf.float32) *
-log_timescale_increment)
scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales,
0)
position_embeddings = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)],
axis=1)
scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(
inv_timescales, 0)
position_embeddings = tf.concat(
[tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)
return position_embeddings
def _relative_position_bucket(relative_position,
                              bidirectional=True,
                              num_buckets=32,
                              max_distance=128):
  """Translate relative position to a bucket number for relative attention.

  The relative position is defined as memory_position - query_position, i.e.
  the distance in tokens from the attending position to the attended-to
  position.

  If `bidirectional=False`, then positive relative positions are invalid.

  We use smaller buckets for small absolute relative_position and larger
  buckets for larger absolute relative_positions.

  All relative positions >=max_distance map to the same bucket.
  All relative positions <=-max_distance map to the same bucket.

  This should allow for more graceful generalization to longer sequences
  than the model has been trained on.

  Args:
    relative_position: An int32 Tensor
    bidirectional: A boolean - whether the attention is bidirectional
    num_buckets: An integer
    max_distance: An integer

  Returns:
    A Tensor with the same shape as relative_position, containing int32
    values in the range [0, num_buckets)
  """
  ret = 0
  n = -relative_position
  if bidirectional:
    # Split the buckets in half: one half per sign of the offset, selected
    # by adding `num_buckets` when n is negative.
    num_buckets //= 2
    ret += tf.cast(tf.math.less(n, 0), tf.int32) * num_buckets
    n = tf.math.abs(n)
  else:
    n = tf.math.maximum(n, 0)
  # now n is in the range [0, inf)
  max_exact = num_buckets // 2
  is_small = tf.math.less(n, max_exact)
  # Distances >= max_exact are assigned buckets on a log scale, saturating
  # at the last bucket for distances >= max_distance.
  val_if_large = max_exact + tf.dtypes.cast(
      tf.math.log(tf.cast(n, tf.float32) / max_exact) /
      math.log(max_distance / max_exact) * (num_buckets - max_exact),
      tf.int32,
  )
  val_if_large = tf.math.minimum(val_if_large, num_buckets - 1)
  ret += tf.where(is_small, n, val_if_large)
  return ret
@tf.keras.utils.register_keras_serializable(package="Text")
class RelativePositionBias(tf.keras.layers.Layer):
  """Relative position embedding via per-head bias in T5 style.

  Reference implementation in MeshTF:
  https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/transformer/transformer_layers.py#L1000

  This layer implements the relative position bias used in "Exploring the Limits
  of Transfer Learning with a Unified Text-to-Text Transformer"
  (https://arxiv.org/abs/1910.10683)
  """

  def __init__(self,
               num_heads: int,
               relative_attention_num_buckets: int = 32,
               relative_attention_max_distance: int = 128,
               bidirectional: bool = True,
               embeddings_initializer: Optional[Initializer] = None,
               **kwargs):
    super().__init__(**kwargs)
    self.num_heads = num_heads
    self.relative_attention_num_buckets = relative_attention_num_buckets
    self.bidirectional = bidirectional
    self.relative_attention_max_distance = relative_attention_max_distance
    if embeddings_initializer:
      self._embed_init = embeddings_initializer
    else:
      self._embed_init = tf.keras.initializers.TruncatedNormal(stddev=1.0)
    # One learnable bias scalar per (bucket, head) pair, created eagerly in
    # __init__ (not in build) under the layer's name scope.
    with tf.name_scope(self.name):
      self._relative_attention_bias = self.add_weight(
          "rel_embedding",
          shape=[self.relative_attention_num_buckets, self.num_heads],
          initializer=self._embed_init,
          dtype=self.dtype,
          trainable=True)

  def get_config(self):
    config = {
        "num_heads":
            self.num_heads,
        "relative_attention_num_buckets":
            self.relative_attention_num_buckets,
        "relative_attention_max_distance":
            self.relative_attention_max_distance,
        "bidirectional":
            self.bidirectional,
        "embeddings_initializer":
            tf.keras.initializers.serialize(self._embed_init),
    }
    base_config = super().get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def call(self, query: tf.Tensor, key: tf.Tensor):
    """Implements the forward pass.

    Args:
      query: query input tensor shape [batch, query length, hidden size].
      key: key input tensor shape [batch, key length, hidden size].

    Returns:
      A tensor in shape of [batch, heads, query length, key length].
    """
    batch_size, qlen = tf_utils.get_shape_list(query)[:2]
    klen = tf_utils.get_shape_list(key)[1]
    # relative_position[i, j] = j - i, shape [qlen, klen].
    context_position = tf.range(qlen)[:, None]
    memory_position = tf.range(klen)[None, :]
    relative_position = memory_position - context_position
    rp_bucket = _relative_position_bucket(
        relative_position,
        bidirectional=self.bidirectional,
        num_buckets=self.relative_attention_num_buckets,
        max_distance=self.relative_attention_max_distance)
    # Look up the per-head bias for each bucket, then broadcast over batch.
    values = tf.nn.embedding_lookup(self._relative_attention_bias, rp_bucket)
    values = tf.expand_dims(
        tf.transpose(values, [2, 0, 1]),
        axis=0)  # shape (1, num_heads, qlen, klen)
    values = tf.tile(values, [batch_size, 1, 1, 1])
    return values
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -11,13 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for Keras-based positional embedding layer."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
"""Tests for Keras-based positional embedding layer."""
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
......@@ -28,75 +25,7 @@ from official.nlp.modeling.layers import position_embedding
# This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
# guarantees forward compatibility of this code for the V2 switchover.
@keras_parameterized.run_all_keras_modes
class PositionEmbeddingLayerTest(keras_parameterized.TestCase):
  """Tests for `position_embedding.PositionEmbedding`."""

  def test_static_layer_output_shape(self):
    layer = position_embedding.PositionEmbedding()
    # Build a rank-3 input; tf.keras.Input leaves the batch dim implicit.
    sequence_length, width = 21, 30
    inputs = tf.keras.Input(shape=(sequence_length, width))
    outputs = layer(inputs)
    # With static shapes, every non-batch output dimension must match the
    # input.
    self.assertEqual([None, sequence_length, width], outputs.shape.as_list())
    # float32 is the layer's default output dtype.
    self.assertEqual(tf.float32, outputs.dtype)

  def test_float16_dtype(self):
    layer = position_embedding.PositionEmbedding(dtype="float16")
    sequence_length, width = 21, 30
    inputs = tf.keras.Input(shape=(sequence_length, width))
    outputs = layer(inputs)
    # Shapes behave exactly as in the float32 case.
    self.assertEqual([None, sequence_length, width], outputs.shape.as_list())
    # The requested dtype propagates to the output tensor.
    self.assertEqual(tf.float16, outputs.dtype)

  def test_dynamic_layer_output_shape(self):
    max_sequence_length = 40
    layer = position_embedding.PositionEmbedding(
        use_dynamic_slicing=True, max_sequence_length=max_sequence_length)
    width = 30
    inputs = tf.keras.Input(shape=(None, width))
    outputs = layer(inputs)
    # With dynamic slicing, dimensions that are unknown on the input remain
    # unknown on the output.
    self.assertEqual([None, None, width], outputs.shape.as_list())

  def test_dynamic_layer_slicing(self):
    max_sequence_length = 40
    layer = position_embedding.PositionEmbedding(
        use_dynamic_slicing=True, max_sequence_length=max_sequence_length)
    width = 30
    inputs = tf.keras.Input(shape=(None, width))
    model = tf.keras.Model(inputs, layer(inputs))

    # Feed a sequence shorter than max_sequence_length to force a down-slice.
    # Batch size 1 sidesteps Keras' requirement that Model inputs and outputs
    # share batch cardinality; in practice this layer lives inside a model.
    input_length = 17
    input_data = np.ones((1, input_length, width))
    output_data = model.predict(input_data)
    self.assertAllEqual([1, input_length, width], output_data.shape)
class RelativePositionEmbeddingLayerTest(keras_parameterized.TestCase):
def test_relative_tensor_input(self):
hidden_size = 8
......@@ -127,5 +56,33 @@ class PositionEmbeddingLayerTest(keras_parameterized.TestCase):
expected_output_tensor = tf.constant([[0, 0, 0, 0, 1, 1, 1, 1]])
self.assertAllEqual(output_tensor, expected_output_tensor)
@keras_parameterized.run_all_keras_modes
class RelativePositionBiasTest(keras_parameterized.TestCase):
  """Tests for `position_embedding.RelativePositionBias`."""

  @parameterized.named_parameters(("bidirectional", True),
                                  ("unidirectional", False))
  def test_relative_position_bias(self, bidirectional):
    queries = tf.zeros((4, 4, 2))
    keys = tf.zeros((4, 2, 2))
    bias_layer = position_embedding.RelativePositionBias(
        num_heads=3,
        bidirectional=bidirectional,
        name="foo")
    # Output is [batch, num_heads, query_length, key_length].
    self.assertEqual(bias_layer(queries, keys).shape, (4, 3, 4, 2))
    # Exactly one trainable embedding table, scoped under the layer name.
    self.assertLen(bias_layer.trainable_variables, 1)
    self.assertEqual(bias_layer.trainable_variables[0].name,
                     "foo/rel_embedding:0")

  def test_relative_position_bucket(self):
    # relative_position[i][j] = memory_position[j] - context_position[i].
    context_position = tf.range(3)[:, None]
    memory_position = tf.range(2)[None, :]
    relative_position = memory_position - context_position
    # Bidirectional (default): positive offsets map into the upper half of
    # the bucket range.
    buckets = position_embedding._relative_position_bucket(relative_position)
    self.assertAllEqual(buckets.numpy(), np.array([[0, 17], [1, 0], [2, 1]]))
    # Unidirectional: positive (future) offsets collapse to bucket 0.
    buckets = position_embedding._relative_position_bucket(
        relative_position, bidirectional=False)
    self.assertAllEqual(buckets.numpy(), np.array([[0, 0], [1, 0], [2, 1]]))
# Run every test case in this module when executed as a script.
if __name__ == "__main__":
  tf.test.main()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Keras-based relative attention layers."""
import math
import string
import tensorflow as tf
_CHR_IDX = string.ascii_lowercase
def _build_proj_equation(free_dims, bound_dims, output_dims):
"""Builds an einsum equation for projections inside multi-head attention."""
input_str = ""
kernel_str = ""
output_str = ""
bias_axes = ""
letter_offset = 0
for i in range(free_dims):
char = _CHR_IDX[i + letter_offset]
input_str += char
output_str += char
letter_offset += free_dims
for i in range(bound_dims):
char = _CHR_IDX[i + letter_offset]
input_str += char
kernel_str += char
letter_offset += bound_dims
for i in range(output_dims):
char = _CHR_IDX[i + letter_offset]
kernel_str += char
output_str += char
bias_axes += char
equation = "%s,%s->%s" % (input_str, kernel_str, output_str)
return equation, bias_axes, len(output_str)
def _get_output_shape(output_rank, known_last_dims):
return [None] * (output_rank - len(known_last_dims)) + list(known_last_dims)
def _rel_shift(x, klen=-1):
  """Performs relative shift to form the relative attention score.

  Args:
    x: Rank-4 `Tensor` of positional attention logits. NOTE(review): the
      transposes below assume the last two axes hold the (query position,
      relative position) grid, i.e. a `[B, N, T, L]` layout — confirm against
      `compute_attention`, which passes the einsum output here.
    klen: Length to which the shifted relative axis is sliced. Defaults to -1
      (keep everything).

  Returns:
    A `Tensor` of the same rank as `x` with the relative-position axis
    realigned so that each query position lines up with the correct offsets.
  """
  # Bring the (query, relative) axes to the front so the shift can be done
  # with reshapes over the leading dimensions.
  x = tf.transpose(x, perm=[2, 3, 0, 1])
  x_size = tf.shape(x)
  # Reinterpreting the buffer with the first two dims swapped (a reshape, not
  # a transpose) staggers each row by one position — the relative-shift trick.
  x = tf.reshape(x, [x_size[1], x_size[0], x_size[2], x_size[3]])
  # Drop the first row, which holds the misaligned entries.
  x = tf.slice(x, [1, 0, 0, 0], [-1, -1, -1, -1])
  # Restore the original leading layout, now one column shorter.
  x = tf.reshape(x, [x_size[0], x_size[1] - 1, x_size[2], x_size[3]])
  # Keep only the first `klen` relative positions.
  x = tf.slice(x, [0, 0, 0, 0], [-1, klen, -1, -1])
  # Undo the initial transpose.
  x = tf.transpose(x, perm=[2, 3, 0, 1])
  return x
@tf.keras.utils.register_keras_serializable(package="Text")
class MultiHeadRelativeAttention(tf.keras.layers.MultiHeadAttention):
  """A multi-head attention layer with relative attention + position encoding.

  This layer shares the same input/output projections as the common
  `tf.keras.layers.MultiHeadAttention` layer.

  When it calculates attention logits, position encoding is projected to form
  relative keys. The logits are composed by shifted relative logits and
  content logits.

  **Note: This layer is currently experimental.

  Attributes:
    kernel_initializer: The kernel initializer. Defaults to variance_scaling.

  Call args:
    query: Query `Tensor` of shape `[B, T, dim]`.
    value: Value `Tensor` of shape `[B, S, dim]`.
    content_attention_bias: Bias `Tensor` for content based attention of shape
      `[num_heads, dim]`.
    positional_attention_bias: Bias `Tensor` for position based attention of
      shape `[num_heads, dim]`.
    key: Optional key `Tensor` of shape `[B, S, dim]`. If not given, will use
      `value` for both `key` and `value`, which is the most common case.
    relative_position_encoding: Relative positional encoding `Tensor` of shape
      `[B, L, dim]`.
    segment_matrix: Optional `Tensor` representing segmentation IDs used in
      XLNet of shape `[B, S, S + M]`.
    segment_encoding: Optional `Tensor` representing the segmentation encoding
      as used in XLNet of shape `[2, num_heads, dim]`.
    segment_attention_bias: Optional trainable bias parameter added to the
      query head when calculating the segment-based attention score used in
      XLNet of shape `[num_heads, dim]`.
    state: Optional `Tensor` of shape `[B, M, E]` where M is the length of the
      state or memory. If passed, this is also attended over as in
      Transformer XL.
    attention_mask: A boolean mask of shape `[B, T, S]` that prevents
      attention to certain positions.
  """

  def __init__(self,
               kernel_initializer="variance_scaling",
               **kwargs):
    super().__init__(kernel_initializer=kernel_initializer,
                     **kwargs)

  def _build_from_signature(self, query, value, key=None):
    """Builds projection layers, plus one extra for the position encoding."""
    super(MultiHeadRelativeAttention, self)._build_from_signature(
        query=query,
        value=value,
        key=key)
    # Accept either Tensors (with a `.shape`) or TensorShapes directly.
    if hasattr(value, "shape"):
      value_shape = tf.TensorShape(value.shape)
    else:
      value_shape = value
    if key is None:
      key_shape = value_shape
    elif hasattr(key, "shape"):
      key_shape = tf.TensorShape(key.shape)
    else:
      key_shape = key

    common_kwargs = dict(
        kernel_initializer=self._kernel_initializer,
        bias_initializer=self._bias_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer,
        activity_regularizer=self._activity_regularizer,
        kernel_constraint=self._kernel_constraint,
        bias_constraint=self._bias_constraint)

    with tf.init_scope():
      # Mirror the key projection for the relative position encoding: project
      # `[..., dim]` into `[..., num_heads, key_dim]` with no bias.
      # NOTE(review): `tf.keras.layers.experimental.EinsumDense` is the
      # pre-2.6 path; newer TF exposes it as `tf.keras.layers.EinsumDense`.
      einsum_equation, _, output_rank = _build_proj_equation(
          key_shape.rank - 1, bound_dims=1, output_dims=2)
      self._encoding_dense = tf.keras.layers.experimental.EinsumDense(
          einsum_equation,
          output_shape=_get_output_shape(output_rank - 1,
                                         [self._num_heads, self._key_dim]),
          bias_axes=None,
          name="encoding",
          **common_kwargs)

  def compute_attention(self,
                        query,
                        key,
                        value,
                        position,
                        content_attention_bias,
                        positional_attention_bias,
                        segment_matrix=None,
                        segment_encoding=None,
                        segment_attention_bias=None,
                        attention_mask=None):
    """Computes the attention.

    This function defines the computation inside `call` with projected
    multihead Q, K, V, R inputs.

    Args:
      query: Projected query `Tensor` of shape `[B, T, N, key_dim]`.
      key: Projected key `Tensor` of shape `[B, S + M, N, key_dim]`.
      value: Projected value `Tensor` of shape `[B, S + M, N, key_dim]`.
      position: Projected position `Tensor` of shape `[B, L, N, key_dim]`.
      content_attention_bias: Trainable bias parameter added to the query head
        when calculating the content-based attention score.
      positional_attention_bias: Trainable bias parameter added to the query
        head when calculating the position-based attention score.
      segment_matrix: Optional `Tensor` representing segmentation IDs used in
        XLNet.
      segment_encoding: Optional trainable `Tensor` representing the
        segmentation encoding as used in XLNet.
      segment_attention_bias: Optional trainable bias parameter added to the
        query head when calculating the segment-based attention score used in
        XLNet.
      attention_mask: (default None) Optional mask that is added to attention
        logits. If state is not None, the mask source sequence dimension
        should extend M.

    Returns:
      attention_output: Multi-headed output of attention computation of shape
        `[B, S, N, key_dim]`.
    """
    # Content-based logits: (query + content bias) against the keys.
    content_attention = tf.einsum(self._dot_product_equation,
                                  key,
                                  query + content_attention_bias)
    # Position-based logits: (query + positional bias) against relative keys.
    positional_attention = tf.einsum(self._dot_product_equation,
                                     position,
                                     query + positional_attention_bias)
    # Realign the relative axis so each query position sees its own offsets,
    # then truncate to the key length.
    positional_attention = _rel_shift(
        positional_attention, klen=tf.shape(content_attention)[3])

    if segment_matrix is not None:
      # `segment_encoding` holds two per-head embeddings (axis 0 of size 2);
      # `tf.where` picks index 1 where `segment_matrix` is True and index 0
      # elsewhere, broadcasting both over the full logits shape.
      segment_attention = tf.einsum("bind,snd->bnis",
                                    query + segment_attention_bias,
                                    segment_encoding)
      target_shape = tf.shape(positional_attention)
      segment_attention = tf.where(
          tf.broadcast_to(tf.expand_dims(segment_matrix, 1), target_shape),
          tf.broadcast_to(segment_attention[:, :, :, 1:], target_shape),
          tf.broadcast_to(segment_attention[:, :, :, :1], target_shape))
      attention_sum = (
          content_attention + positional_attention + segment_attention)
    else:
      attention_sum = content_attention + positional_attention

    # Standard scaled softmax attention over the summed logits, with optional
    # masking and dropout.
    attention_scores = tf.multiply(
        attention_sum, 1.0 / math.sqrt(float(self._key_dim)))
    attention_scores = self._masked_softmax(attention_scores, attention_mask)
    attention_output = self._dropout_layer(attention_scores)
    attention_output = tf.einsum(self._combine_equation,
                                 attention_output,
                                 value)
    return attention_output

  def call(self,
           query,
           value,
           content_attention_bias,
           positional_attention_bias,
           key=None,
           relative_position_encoding=None,
           segment_matrix=None,
           segment_encoding=None,
           segment_attention_bias=None,
           state=None,
           attention_mask=None):
    """Compute multi-head relative attention over inputs.

    Size glossary:
      * Number of heads (H): the number of attention heads.
      * Value size (V): the size of each value embedding per head.
      * Key size (K): the size of each key embedding per head. Equally, the
        size of each query embedding per head. Typically K <= V.
      * Batch dimensions (B).
      * Query (target) attention axes shape (T).
      * Value (source) attention axes shape (S), the rank must match the
        target.
      * Encoding length (L): The relative positional encoding length.

    Args:
      query: attention input.
      value: attention input.
      content_attention_bias: A trainable bias parameter added to the query
        head when calculating the content-based attention score.
      positional_attention_bias: A trainable bias parameter added to the query
        head when calculating the position-based attention score.
      key: attention input.
      relative_position_encoding: relative positional encoding for key and
        value.
      segment_matrix: Optional `Tensor` representing segmentation IDs used in
        XLNet.
      segment_encoding: Optional `Tensor` representing the segmentation
        encoding as used in XLNet.
      segment_attention_bias: Optional trainable bias parameter added to the
        query head when calculating the segment-based attention score used in
        XLNet.
      state: (default None) optional state. If passed, this is also attended
        over as in TransformerXL.
      attention_mask: (default None) Optional mask that is added to attention
        logits. If state is not None, the mask source sequence dimension
        should extend M.

    Returns:
      attention_output: The result of the computation, of shape [B, T, E],
        where `T` is for target sequence shapes and `E` is the query input
        last dimension if `output_shape` is `None`. Otherwise, the multi-head
        outputs are projected to the shape specified by `output_shape`.
    """
    # Lazily build projection layers on first call, once shapes are known.
    if not self._built_from_signature:
      self._build_from_signature(query, value, key=key)
    if key is None:
      key = value

    # Prepend the memory/state to key and value (Transformer-XL style) so the
    # current segment attends over it; queries stay current-segment only.
    if state is not None and state.shape.ndims > 1:
      value = tf.concat([state, value], 1)
      key = tf.concat([state, key], 1)

    # `query` = [B, T, N, H]
    query = self._query_dense(query)

    # `key` = [B, S + M, N, H]
    key = self._key_dense(key)

    # `value` = [B, S + M, N, H]
    value = self._value_dense(value)

    # `position` = [B, L, N, H]
    position = self._encoding_dense(relative_position_encoding)

    attention_output = self.compute_attention(
        query=query,
        key=key,
        value=value,
        position=position,
        content_attention_bias=content_attention_bias,
        positional_attention_bias=positional_attention_bias,
        segment_matrix=segment_matrix,
        segment_encoding=segment_encoding,
        segment_attention_bias=segment_attention_bias,
        attention_mask=attention_mask)

    # `attention_output` = [B, S, N, H]; project back to the output width.
    attention_output = self._output_dense(attention_output)

    return attention_output
@tf.keras.utils.register_keras_serializable(package="Text")
class TwoStreamRelativeAttention(MultiHeadRelativeAttention):
  """Two-stream relative self-attention for XLNet.

  In XLNet, each token has two associated vectors at each self-attention
  layer, the content stream (h) and the query stream (g). The content stream
  is the self-attention stream as in Transformer XL and represents the context
  and content (the token itself). The query stream only has access to
  contextual information and the position, but not the content.

  This layer shares the same build signature as
  `tf.keras.layers.MultiHeadAttention` but has different input/output
  projections.

  **Note: This layer is currently experimental.

  Call args:
    content_stream: `Tensor` of shape `[B, T, dim]`.
    content_attention_bias: Bias `Tensor` for content based attention of shape
      `[num_heads, dim]`.
    positional_attention_bias: Bias `Tensor` for position based attention of
      shape `[num_heads, dim]`.
    query_stream: `Tensor` of shape `[B, P, dim]`.
    target_mapping: `Tensor` of shape `[B, P, S]`.
    relative_position_encoding: Relative positional encoding `Tensor` of shape
      `[B, L, dim]`.
    segment_matrix: Optional `Tensor` representing segmentation IDs used in
      XLNet of shape `[B, S, S + M]`.
    segment_encoding: Optional `Tensor` representing the segmentation encoding
      as used in XLNet of shape `[2, num_heads, dim]`.
    segment_attention_bias: Optional trainable bias parameter added to the
      query head when calculating the segment-based attention score used in
      XLNet of shape `[num_heads, dim]`.
    state: Optional `Tensor` of shape [B, M, E] where M is the length of the
      state or memory. If passed, this is also attended over as in
      Transformer XL.
    content_attention_mask: a boolean mask of shape `[B, T, S]` that prevents
      attention to certain positions for content attention computation.
    query_attention_mask: a boolean mask of shape `[B, T, S]` that prevents
      attention to certain position for query attention computation.
  """

  def call(self,
           content_stream,
           content_attention_bias,
           positional_attention_bias,
           query_stream,
           relative_position_encoding,
           target_mapping=None,
           segment_matrix=None,
           segment_encoding=None,
           segment_attention_bias=None,
           state=None,
           content_attention_mask=None,
           query_attention_mask=None):
    """Compute multi-head relative attention over inputs.

    Size glossary:
      * Number of heads (H): the number of attention heads.
      * Value size (V): the size of each value embedding per head.
      * Key size (K): the size of each key embedding per head. Equally, the
        size of each query embedding per head. Typically K <= V.
      * Number of predictions (P): the number of predictions.
      * Batch dimensions (B).
      * Query (target) attention axes shape (T).
      * Value (source) attention axes shape (S), the rank must match the
        target.
      * Encoding length (L): The relative positional encoding length.

    Args:
      content_stream: The content representation, commonly referred to as h.
        This serves a similar role to the standard hidden states in
        Transformer-XL.
      content_attention_bias: A trainable bias parameter added to the query
        head when calculating the content-based attention score.
      positional_attention_bias: A trainable bias parameter added to the query
        head when calculating the position-based attention score.
      query_stream: The query representation, commonly referred to as g. This
        only has access to contextual information and position, but not
        content. If not provided, then this is MultiHeadRelativeAttention with
        self-attention.
      relative_position_encoding: relative positional encoding for key and
        value.
      target_mapping: Optional `Tensor` representing the target mapping used
        in partial prediction.
      segment_matrix: Optional `Tensor` representing segmentation IDs used in
        XLNet.
      segment_encoding: Optional `Tensor` representing the segmentation
        encoding as used in XLNet.
      segment_attention_bias: Optional trainable bias parameter added to the
        query head when calculating the segment-based attention score.
      state: (default None) optional state. If passed, this is also attended
        over as in TransformerXL and XLNet.
      content_attention_mask: (default None) Optional mask that is added to
        content attention logits. If state is not None, the mask source
        sequence dimension should extend M.
      query_attention_mask: (default None) Optional mask that is added to
        query attention logits. If state is not None, the mask source sequence
        dimension should extend M.

    Returns:
      content_attention_output, query_attention_output: the results of the
        computation, both of shape [B, T, E]. `T` is for target sequence
        shapes, `E` is the query input last dimension if `output_shape` is
        `None`. Otherwise, the multi-head outputs are projected to the shape
        specified by `output_shape`.
    """
    # Both streams share projections built from the content stream's shape.
    if not self._built_from_signature:
      self._build_from_signature(content_stream, content_stream,
                                 content_stream)
    # Keys/values cover the memory (if any) plus the current segment.
    if state is not None and state.shape.ndims > 1:
      content_and_memory_stream = tf.concat([state, content_stream], 1)
    else:
      content_and_memory_stream = content_stream

    # `query` = [B, T, N, H]
    query = self._query_dense(content_stream)

    # `key` = [B, S + M, N, H]
    key = self._key_dense(content_and_memory_stream)

    # `value` = [B, S + M, N, H]
    value = self._value_dense(content_and_memory_stream)

    # `position` = [B, L, N, H]
    position = self._encoding_dense(relative_position_encoding)

    # Content stream (h): ordinary relative self-attention over key/value.
    content_attention_output = self.compute_attention(
        query=query,
        key=key,
        value=value,
        position=position,
        content_attention_bias=content_attention_bias,
        positional_attention_bias=positional_attention_bias,
        segment_matrix=segment_matrix,
        segment_encoding=segment_encoding,
        segment_attention_bias=segment_attention_bias,
        attention_mask=content_attention_mask)

    # `content_attention_output` = [B, S, N, H]
    content_attention_output = self._output_dense(content_attention_output)

    # Query stream (g): reuses the content stream's keys/values but projects
    # its own queries; may use its own (stricter) attention mask.
    query_attention_output = None
    if query_stream is not None:
      query = self._query_dense(query_stream)
      if target_mapping is not None:
        # Map the P prediction queries onto sequence positions before
        # attending, then map the attention output back to prediction slots.
        query = tf.einsum("bmnd,bml->blnd", query, target_mapping)
        query_attention_output = self.compute_attention(
            query=query,
            key=key,
            value=value,
            position=position,
            content_attention_bias=content_attention_bias,
            positional_attention_bias=positional_attention_bias,
            segment_matrix=segment_matrix,
            segment_encoding=segment_encoding,
            segment_attention_bias=segment_attention_bias,
            attention_mask=query_attention_mask)
        query_attention_output = tf.einsum("blnd,bml->bmnd",
                                           query_attention_output,
                                           target_mapping)
      else:
        query_attention_output = self.compute_attention(
            query=query,
            key=key,
            value=value,
            position=position,
            content_attention_bias=content_attention_bias,
            positional_attention_bias=positional_attention_bias,
            segment_matrix=segment_matrix,
            segment_encoding=segment_encoding,
            segment_attention_bias=segment_attention_bias,
            attention_mask=query_attention_mask)
      query_attention_output = self._output_dense(query_attention_output)

    return content_attention_output, query_attention_output
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for the attention layer."""
import numpy as np
import tensorflow as tf
from tensorflow.python.distribute import combinations
from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import
from official.nlp.modeling.layers import relative_attention
def _create_mock_attention_data(
    num_heads,
    key_dim,
    value_dim,
    seq_length,
    batch_size,
    memory_length=0,
    num_predictions=2,
    two_stream=False,
    include_state=False,
    include_mask=False,
    include_segment=False):
  """Creates mock testing data.

  Args:
    num_heads: `int`, Number of attention heads.
    key_dim: `int`, Size of query head.
    value_dim: `int`, Size of key, value dim.
    seq_length: `int`, Sequence length of the input.
    batch_size: `int`, the batch size.
    memory_length: optional `int`, the length of the state. Defaults to 0.
    num_predictions: `int`, the number of predictions used in two stream
      attention.
    two_stream: `bool`, whether or not to generate two stream data.
    include_state: optional `bool`, whether or not to include state data.
    include_mask: optional `bool`, whether or not to include mask data.
    include_segment: optional `bool`, whether or not to include segment data.

  Returns:
    A dictionary with `str` as keys and `Tensor` as values.
  """
  bias_shape = (num_heads, key_dim)
  # Shared inputs: relative encoding spans twice the sequence length.
  data = dict(
      relative_position_encoding=tf.random.normal(
          shape=(batch_size, seq_length * 2, key_dim)),
      content_attention_bias=tf.random.normal(shape=bias_shape),
      positional_attention_bias=tf.random.normal(shape=bias_shape))

  query_shape = (batch_size, seq_length, key_dim)
  if two_stream:
    # Two-stream (XLNet) inputs: content stream, query stream, and the
    # prediction-to-position mapping.
    data.update(
        content_stream=tf.random.normal(shape=query_shape),
        query_stream=tf.random.normal(
            shape=(batch_size, num_predictions, key_dim)),
        target_mapping=tf.random.normal(
            shape=(batch_size, num_predictions, seq_length)))
  else:
    # Single-stream inputs: plain query/key/value.
    value_shape = (batch_size, seq_length, value_dim)
    data.update(
        query=tf.random.normal(shape=query_shape),
        value=tf.random.normal(shape=value_shape),
        key=tf.random.normal(shape=value_shape))

  # State extends the attended-over (source) length by `memory_length`.
  total_seq_length = seq_length
  if include_state:
    total_seq_length += memory_length
    data.update(
        state=tf.random.normal(shape=(batch_size, memory_length, value_dim)))

  if include_mask:
    # Random 0/1 float mask; two-stream layers take separate masks per stream.
    mask = np.random.randint(
        2, size=(batch_size, num_heads, seq_length,
                 total_seq_length)).astype("float32")
    if two_stream:
      data.update(content_attention_mask=mask, query_attention_mask=mask)
    else:
      data.update(attention_mask=mask)

  if include_segment:
    # Boolean same-segment matrix plus the segment embedding and bias.
    segment_matrix = tf.math.equal(
        np.random.randint(
            2, size=(batch_size, seq_length, total_seq_length)), 1)
    data.update(
        segment_attention_bias=tf.random.normal(shape=bias_shape),
        segment_encoding=tf.random.normal(shape=(2, num_heads, key_dim)),
        segment_matrix=segment_matrix)

  return data
@keras_parameterized.run_all_keras_modes
class MultiHeadRelativeAttentionTest(keras_parameterized.TestCase):
  """Shape tests for `relative_attention.MultiHeadRelativeAttention`."""

  @combinations.generate(combinations.combine(
      value_dim=[32, 64],
      memory_length=[0, 4],
      state=[True, False],
      mask=[True, False],
      segment=[True, False]))
  def test_attention_scores(self,
                            value_dim,
                            memory_length,
                            state,
                            mask,
                            segment):
    """Tests combinations of attention score calculations."""
    batch_size, num_heads, key_dim, seq_length = 2, 12, 64, 8
    test_layer = relative_attention.MultiHeadRelativeAttention(
        num_heads=num_heads,
        key_dim=key_dim,
        value_dim=value_dim)
    # Mock inputs exercise the optional state/mask/segment code paths.
    data = _create_mock_attention_data(
        num_heads=num_heads,
        key_dim=key_dim,
        value_dim=value_dim,
        seq_length=seq_length,
        memory_length=memory_length,
        two_stream=False,
        batch_size=batch_size,
        include_state=state,
        include_mask=mask,
        include_segment=segment)
    output = test_layer(**data)
    # Output stays [batch, seq, hidden] regardless of optional inputs.
    self.assertEqual(output.shape, [batch_size, seq_length, key_dim])
@keras_parameterized.run_all_keras_modes
class TwoStreamRelativeAttentionTest(keras_parameterized.TestCase):
  """Shape tests for `relative_attention.TwoStreamRelativeAttention`."""

  @combinations.generate(combinations.combine(
      num_predictions=[2, 10],
      memory_length=[0, 4],
      state=[True, False],
      mask=[True, False],
      segment=[True, False]))
  def test_attention_scores(self,
                            num_predictions,
                            memory_length,
                            state,
                            mask,
                            segment):
    """Tests combinations of attention score calculations."""
    batch_size = 2
    num_heads = 12
    key_dim = 64
    seq_length = 8
    layer = relative_attention.TwoStreamRelativeAttention(
        num_heads=num_heads, key_dim=key_dim, value_dim=key_dim)
    # Mock inputs exercise the optional state/mask/segment code paths.
    inputs = _create_mock_attention_data(
        num_heads=num_heads,
        key_dim=key_dim,
        value_dim=key_dim,
        seq_length=seq_length,
        memory_length=memory_length,
        num_predictions=num_predictions,
        two_stream=True,
        batch_size=batch_size,
        include_state=state,
        include_mask=mask,
        include_segment=segment)
    content_output, query_output = layer(**inputs)
    # Content stream keeps the input sequence length; the query stream has
    # one row per prediction.
    self.assertEqual(content_output.shape, [batch_size, seq_length, key_dim])
    self.assertEqual(query_output.shape,
                     [batch_size, num_predictions, key_dim])
if __name__ == "__main__":
  # Seed both RNGs so mock-data generation is deterministic across runs.
  np.random.seed(0)
  tf.random.set_seed(0)
  tf.test.main()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -11,19 +11,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Keras-based rezero-transformer block layer (Transformer with ReZero)."""
# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import gin
import tensorflow as tf
from official.nlp.modeling.layers import attention
@tf.keras.utils.register_keras_serializable(package="Text")
@gin.configurable
......@@ -35,7 +29,7 @@ class ReZeroTransformer(tf.keras.layers.Layer):
The residual connection implements the ReZero method.
(https://arxiv.org/abs/2003.04887)
Arguments:
Args:
num_attention_heads: Number of attention heads.
intermediate_size: Size of the intermediate layer.
intermediate_activation: Activation for the intermediate layer.
......@@ -88,7 +82,7 @@ class ReZeroTransformer(tf.keras.layers.Layer):
def build(self, input_shape):
input_tensor = input_shape[0] if len(input_shape) == 2 else input_shape
input_tensor_shape = tf.TensorShape(input_tensor)
if len(input_tensor_shape) != 3:
if len(input_tensor_shape.as_list()) != 3:
raise ValueError("TransformerLayer expects a three-dimensional input of "
"shape [batch, sequence, width].")
batch_size, sequence_length, hidden_size = input_tensor_shape
......@@ -116,9 +110,9 @@ class ReZeroTransformer(tf.keras.layers.Layer):
activity_regularizer=self._activity_regularizer,
kernel_constraint=self._kernel_constraint,
bias_constraint=self._bias_constraint)
self._attention_layer = attention.MultiHeadAttention(
self._attention_layer = tf.keras.layers.MultiHeadAttention(
num_heads=self._num_heads,
key_size=self._attention_head_size,
key_dim=self._attention_head_size,
dropout=self._attention_dropout_rate,
name="self_attention",
**common_kwargs)
......@@ -138,7 +132,7 @@ class ReZeroTransformer(tf.keras.layers.Layer):
bias_axes="d",
name="intermediate",
**common_kwargs)
policy = tf.keras.mixed_precision.experimental.global_policy()
policy = tf.keras.mixed_precision.global_policy()
if policy.name == "mixed_bfloat16":
# bfloat16 causes BERT with the LAMB optimizer to not converge
# as well, so we use float32.
......@@ -161,7 +155,8 @@ class ReZeroTransformer(tf.keras.layers.Layer):
self._rezero_a = self.add_weight(
name="rezero_alpha",
initializer=tf.keras.initializers.Zeros(),
trainable=True, dtype=tf.float32)
trainable=True,
dtype=tf.float32)
super(ReZeroTransformer, self).build(input_shape)
......@@ -213,9 +208,9 @@ class ReZeroTransformer(tf.keras.layers.Layer):
attention_mask = attention_mask[:, 0:self._output_range, :]
else:
target_tensor = input_tensor
attention_inputs = [target_tensor, input_tensor]
attention_output = self._attention_layer(attention_inputs, attention_mask)
attention_output = self._attention_layer(
query=target_tensor, value=input_tensor, attention_mask=attention_mask)
attention_output = self._attention_dropout(attention_output)
attention_output = target_tensor + self._rezero_a * attention_output
if self._use_layer_norm:
......
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -11,12 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for Keras-based rezero-transformer block layer."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
"""Tests for Keras-based rezero-transformer block layer."""
import numpy as np
import tensorflow as tf
......@@ -32,10 +28,10 @@ class TransformerWithReZeroLayerTest(keras_parameterized.TestCase):
def tearDown(self):
super(TransformerWithReZeroLayerTest, self).tearDown()
tf.keras.mixed_precision.experimental.set_policy('float32')
tf.keras.mixed_precision.set_global_policy('float32')
def test_layer_invocation_with_float16_dtype(self):
tf.keras.mixed_precision.experimental.set_policy('mixed_float16')
tf.keras.mixed_precision.set_global_policy('mixed_float16')
test_layer = rezero_transformer.ReZeroTransformer(
num_attention_heads=10,
intermediate_size=2048,
......@@ -95,9 +91,9 @@ class TransformerWithReZeroLayerTest(keras_parameterized.TestCase):
input_data = np.random.rand(2, input_length, width) + 2.0
output_data = model.predict(input_data)
input_data_normed = (
input_data - np.mean(input_data, axis=-1, keepdims=True)) / (
np.std(input_data, axis=-1, keepdims=True))
input_data_normed = (input_data -
np.mean(input_data, axis=-1, keepdims=True)) / (
np.std(input_data, axis=-1, keepdims=True))
self.assertAllClose(input_data_normed, output_data)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment