ModelZoo / ResNet50_tensorflow · Commits

Commit f16a7b5b (unverified)
Authored May 04, 2021 by vedanshu; committed by GitHub on May 04, 2021.

    Merge pull request #1 from tensorflow/master

    new pull

Parents: 8e9296ff, 8f58f396
Changes: showing 20 of 298 changed files, with 2,698 additions and 487 deletions (+2698 / -487).
  official/nlp/modeling/layers/gated_feedforward_test.py          +6     -9
  official/nlp/modeling/layers/gaussian_process.py               +495     0
  official/nlp/modeling/layers/gaussian_process_test.py          +268     0
  official/nlp/modeling/layers/masked_lm.py                        +4   -108
  official/nlp/modeling/layers/masked_lm_test.py                   +9    -24
  official/nlp/modeling/layers/masked_softmax.py                  +23    -10
  official/nlp/modeling/layers/masked_softmax_test.py              +2     -6
  official/nlp/modeling/layers/mat_mul_with_margin.py             +69      0
  official/nlp/modeling/layers/mat_mul_with_margin_test.py        +52      0
  official/nlp/modeling/layers/mobile_bert_layers.py             +554      0
  official/nlp/modeling/layers/mobile_bert_layers_test.py        +273      0
  official/nlp/modeling/layers/multi_channel_attention.py         +39    -28
  official/nlp/modeling/layers/multi_channel_attention_test.py     +8     -9
  official/nlp/modeling/layers/on_device_embedding.py              +4    -71
  official/nlp/modeling/layers/position_embedding.py             +152   -120
  official/nlp/modeling/layers/position_embedding_test.py         +32    -75
  official/nlp/modeling/layers/relative_attention.py             +499      0
  official/nlp/modeling/layers/relative_attention_test.py        +191      0
  official/nlp/modeling/layers/rezero_transformer.py              +11    -16
  official/nlp/modeling/layers/rezero_transformer_test.py          +7    -11
official/nlp/modeling/layers/gated_feedforward_test.py  (view file @ f16a7b5b)

-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
...
@@ -11,12 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ==============================================================================
 """Tests for Keras-based gated feedforward layer."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
 from absl.testing import parameterized
 import numpy as np
...
@@ -33,7 +29,7 @@ class GatedFeedforwardTest(keras_parameterized.TestCase):

   def tearDown(self):
     super(GatedFeedforwardTest, self).tearDown()
-    tf.keras.mixed_precision.experimental.set_policy("float32")
+    tf.keras.mixed_precision.set_global_policy("float32")

   @parameterized.parameters(
       (True, 1, "after_residual", "float32"),
...
@@ -46,7 +42,7 @@ class GatedFeedforwardTest(keras_parameterized.TestCase):
       (False, 1, "before_residual", "mixed_float16"),
   )
   def test_layer_creation(self, use_gate, num_blocks, dropout_position, dtype):
-    tf.keras.mixed_precision.experimental.set_policy(dtype)
+    tf.keras.mixed_precision.set_global_policy(dtype)
     kwargs = dict(
         intermediate_size=128,
         intermediate_activation="relu",
...
@@ -78,7 +74,7 @@ class GatedFeedforwardTest(keras_parameterized.TestCase):
   )
   def test_layer_invocation(self, use_gate, num_blocks, dropout_position,
                             dtype):
-    tf.keras.mixed_precision.experimental.set_policy(dtype)
+    tf.keras.mixed_precision.set_global_policy(dtype)
     kwargs = dict(
         intermediate_size=16,
         intermediate_activation="relu",
...
@@ -123,5 +119,6 @@ class GatedFeedforwardTest(keras_parameterized.TestCase):
     # If the serialization was successful, the new config should match the old.
     self.assertAllEqual(test_layer.get_config(), new_layer.get_config())

 if __name__ == "__main__":
   tf.test.main()
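Aside from the copyright bump and the dropped `__future__` imports, the substantive change in this file is the move off the experimental mixed-precision API, which newer TensorFlow releases deprecate in favor of `tf.keras.mixed_precision.set_global_policy`. A minimal sketch of the replacement call, using only the API name that appears in the diff (the Dense layer and printout are illustrative additions, not from the commit):

import tensorflow as tf

# Old (pre-TF 2.4) spelling, as removed in this diff:
#   tf.keras.mixed_precision.experimental.set_policy("mixed_float16")
tf.keras.mixed_precision.set_global_policy("mixed_float16")

# Layers built from here on compute in float16 but keep float32 variables.
dense = tf.keras.layers.Dense(8)
print(dense.dtype_policy)  # <Policy "mixed_float16">

# Reset, as the tearDown in this test file does, so later code is unaffected.
tf.keras.mixed_precision.set_global_policy("float32")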
official/nlp/modeling/layers/gaussian_process.py  (new file, mode 100644; view file @ f16a7b5b)

# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Lint as: python3
"""Definitions for random feature Gaussian process layer."""
import math
import tensorflow as tf

_SUPPORTED_LIKELIHOOD = ('binary_logistic', 'poisson', 'gaussian')


class RandomFeatureGaussianProcess(tf.keras.layers.Layer):
  """Gaussian process layer with random feature approximation [1].

  During training, the model updates the maximum a posteriori (MAP) logit
  estimates and posterior precision matrix using minibatch statistics. During
  inference, the model divides the MAP logit estimates by the predictive
  standard deviation, which is equivalent to approximating the posterior mean
  of the predictive probability via the mean-field approximation.

  Users can specify different types of random features by setting
  `use_custom_random_features=True`, and change the initializer and activations
  of the custom random features. For example:

    MLP Kernel: initializer='random_normal', activation=tf.nn.relu
    RBF Kernel: initializer='random_normal', activation=tf.math.cos

  A linear kernel can also be specified by setting gp_kernel_type='linear' and
  `use_custom_random_features=True`.

  [1]: Ali Rahimi and Benjamin Recht. Random Features for Large-Scale Kernel
       Machines. In _Neural Information Processing Systems_, 2007.
       https://people.eecs.berkeley.edu/~brecht/papers/07.rah.rec.nips.pdf

  Attributes:
    units: (int) The dimensionality of the layer.
    num_inducing: (int) The number of random features for the approximation.
    is_training: (tf.bool) Whether the layer is set in training mode. If so,
      the layer updates the Gaussian process' variance estimate using
      statistics computed from the incoming minibatches.
  """

  def __init__(self,
               units,
               num_inducing=1024,
               gp_kernel_type='gaussian',
               gp_kernel_scale=1.,
               gp_output_bias=0.,
               normalize_input=False,
               gp_kernel_scale_trainable=False,
               gp_output_bias_trainable=False,
               gp_cov_momentum=0.999,
               gp_cov_ridge_penalty=1.,
               scale_random_features=True,
               use_custom_random_features=True,
               custom_random_features_initializer=None,
               custom_random_features_activation=None,
               l2_regularization=1e-6,
               gp_cov_likelihood='gaussian',
               return_gp_cov=True,
               return_random_features=False,
               dtype=None,
               name='random_feature_gaussian_process',
               **gp_output_kwargs):
    """Initializes a random-feature Gaussian process layer instance.

    Args:
      units: (int) Number of output units.
      num_inducing: (int) Number of random Fourier features used for
        approximating the Gaussian process.
      gp_kernel_type: (string) The type of kernel function to use for the
        Gaussian process. Currently defaults to 'gaussian', which is the
        Gaussian RBF kernel.
      gp_kernel_scale: (float) The length-scale parameter of a shift-invariant
        kernel function, i.e., for the RBF kernel:
        exp(-|x1 - x2|**2 / gp_kernel_scale).
      gp_output_bias: (float) Scalar initial value for the bias vector.
      normalize_input: (bool) Whether to normalize the input to the Gaussian
        process.
      gp_kernel_scale_trainable: (bool) Whether the length scale variable is
        trainable.
      gp_output_bias_trainable: (bool) Whether the bias is trainable.
      gp_cov_momentum: (float) A discount factor used to compute the moving
        average for the posterior covariance matrix.
      gp_cov_ridge_penalty: (float) Initial ridge penalty to the posterior
        covariance matrix.
      scale_random_features: (bool) Whether to scale the random features
        by sqrt(2. / num_inducing).
      use_custom_random_features: (bool) Whether to use custom random
        features implemented using tf.keras.layers.Dense.
      custom_random_features_initializer: (str or callable) Initializer for
        the random features. Defaults to random normal, which approximates an
        RBF kernel if the activation function is cos.
      custom_random_features_activation: (callable) Activation function for
        the random feature layer. Defaults to cosine, which approximates an
        RBF kernel.
      l2_regularization: (float) The strength of l2 regularization on the
        output weights.
      gp_cov_likelihood: (string) Likelihood to use for computing the Laplace
        approximation for the covariance matrix. Defaults to `gaussian`.
      return_gp_cov: (bool) Whether to also return the GP covariance matrix.
        If False then no covariance learning is performed.
      return_random_features: (bool) Whether to also return random features.
      dtype: (tf.DType) Input data type.
      name: (string) Layer name.
      **gp_output_kwargs: Additional keyword arguments to the dense output
        layer.
    """
    super(RandomFeatureGaussianProcess, self).__init__(name=name, dtype=dtype)
    self.units = units
    self.num_inducing = num_inducing

    self.normalize_input = normalize_input
    self.gp_input_scale = 1. / tf.sqrt(gp_kernel_scale)
    self.gp_feature_scale = tf.sqrt(2. / float(num_inducing))
    self.scale_random_features = scale_random_features

    self.return_random_features = return_random_features
    self.return_gp_cov = return_gp_cov

    self.gp_kernel_type = gp_kernel_type
    self.gp_kernel_scale = gp_kernel_scale
    self.gp_output_bias = gp_output_bias
    self.gp_kernel_scale_trainable = gp_kernel_scale_trainable
    self.gp_output_bias_trainable = gp_output_bias_trainable

    self.use_custom_random_features = use_custom_random_features
    self.custom_random_features_initializer = custom_random_features_initializer
    self.custom_random_features_activation = custom_random_features_activation

    self.l2_regularization = l2_regularization
    self.gp_output_kwargs = gp_output_kwargs

    self.gp_cov_momentum = gp_cov_momentum
    self.gp_cov_ridge_penalty = gp_cov_ridge_penalty
    self.gp_cov_likelihood = gp_cov_likelihood

    if self.use_custom_random_features:
      # Default to Gaussian RBF kernel.
      self.random_features_bias_initializer = tf.random_uniform_initializer(
          minval=0., maxval=2. * math.pi)
      if self.custom_random_features_initializer is None:
        self.custom_random_features_initializer = (
            tf.keras.initializers.RandomNormal(stddev=1.))
      if self.custom_random_features_activation is None:
        self.custom_random_features_activation = tf.math.cos

  def build(self, input_shape):
    # Defines model layers.
    if self.normalize_input:
      self._input_norm_layer = tf.keras.layers.LayerNormalization(
          name='gp_input_normalization')
      self._input_norm_layer.build(input_shape)
      input_shape = self._input_norm_layer.compute_output_shape(input_shape)

    self._random_feature = self._make_random_feature_layer(
        name='gp_random_feature')
    self._random_feature.build(input_shape)
    input_shape = self._random_feature.compute_output_shape(input_shape)

    if self.return_gp_cov:
      self._gp_cov_layer = LaplaceRandomFeatureCovariance(
          momentum=self.gp_cov_momentum,
          ridge_penalty=self.gp_cov_ridge_penalty,
          likelihood=self.gp_cov_likelihood,
          dtype=self.dtype,
          name='gp_covariance')
      self._gp_cov_layer.build(input_shape)

    self._gp_output_layer = tf.keras.layers.Dense(
        units=self.units,
        use_bias=False,
        kernel_regularizer=tf.keras.regularizers.l2(self.l2_regularization),
        dtype=self.dtype,
        name='gp_output_weights',
        **self.gp_output_kwargs)
    self._gp_output_layer.build(input_shape)

    self._gp_output_bias = tf.Variable(
        initial_value=[self.gp_output_bias] * self.units,
        dtype=self.dtype,
        trainable=self.gp_output_bias_trainable,
        name='gp_output_bias')

    self.built = True

  def _make_random_feature_layer(self, name):
    """Defines random feature layer depending on kernel type."""
    if not self.use_custom_random_features:
      # Use default RandomFourierFeatures layer from tf.keras.
      return tf.keras.layers.experimental.RandomFourierFeatures(
          output_dim=self.num_inducing,
          kernel_initializer=self.gp_kernel_type,
          scale=self.gp_kernel_scale,
          trainable=self.gp_kernel_scale_trainable,
          dtype=self.dtype,
          name=name)

    if self.gp_kernel_type.lower() == 'linear':
      custom_random_feature_layer = tf.keras.layers.Lambda(
          lambda x: x, name=name)
    else:
      # Use user-supplied configurations.
      custom_random_feature_layer = tf.keras.layers.Dense(
          units=self.num_inducing,
          use_bias=True,
          activation=self.custom_random_features_activation,
          kernel_initializer=self.custom_random_features_initializer,
          bias_initializer=self.random_features_bias_initializer,
          trainable=False,
          name=name)

    return custom_random_feature_layer

  def reset_covariance_matrix(self):
    """Resets covariance matrix of the GP layer.

    This function is useful for resetting the model's covariance matrix at the
    beginning of a new epoch.
    """
    self._gp_cov_layer.reset_precision_matrix()

  def call(self, inputs, global_step=None, training=None):
    # Computes random features.
    gp_inputs = inputs
    if self.normalize_input:
      gp_inputs = self._input_norm_layer(gp_inputs)
    elif self.use_custom_random_features:
      # Supports lengthscale for custom random feature layer by directly
      # rescaling the input.
      gp_input_scale = tf.cast(self.gp_input_scale, inputs.dtype)
      gp_inputs = gp_inputs * gp_input_scale

    gp_feature = self._random_feature(gp_inputs)

    if self.scale_random_features:
      # Scale random feature by sqrt(2. / num_inducing) following [1].
      # When using the GP layer as the output layer of a neural network,
      # it is recommended to turn this scaling off to prevent it from changing
      # the learning rate of the hidden layers.
      gp_feature_scale = tf.cast(self.gp_feature_scale, inputs.dtype)
      gp_feature = gp_feature * gp_feature_scale

    # Computes posterior center (i.e., MAP estimate) and variance.
    gp_output = self._gp_output_layer(gp_feature) + self._gp_output_bias

    if self.return_gp_cov:
      gp_covmat = self._gp_cov_layer(gp_feature, gp_output, training)

    # Assembles model output.
    model_output = [gp_output,]
    if self.return_gp_cov:
      model_output.append(gp_covmat)
    if self.return_random_features:
      model_output.append(gp_feature)

    return model_output


class LaplaceRandomFeatureCovariance(tf.keras.layers.Layer):
  """Computes the Gaussian process covariance using the Laplace method.

  At training time, this layer updates the Gaussian process posterior using
  model features in minibatches.

  Attributes:
    momentum: (float) A discount factor used to compute the moving average for
      the posterior precision matrix. Analogous to the momentum factor in
      batch normalization. If -1 then the covariance matrix is updated using
      a naive sum without momentum, which is desirable if the goal is to
      compute the exact covariance matrix by passing through the data once
      (say in the final epoch).
    ridge_penalty: (float) Initial ridge penalty to the weight covariance
      matrix. This value is used to stabilize the eigenvalues of the weight
      covariance estimate so that the matrix inverse can be computed for
      Cov = inv(t(X) * X + s * I). The ridge factor s cannot be too large,
      since otherwise it will dominate the t(X) * X term and make the
      covariance estimate not meaningful.
    likelihood: (str) The likelihood to use for computing the Laplace
      approximation for the covariance matrix. Can be one of
      ('binary_logistic', 'poisson', 'gaussian').
  """

  def __init__(self,
               momentum=0.999,
               ridge_penalty=1.,
               likelihood='gaussian',
               dtype=None,
               name='laplace_covariance'):
    if likelihood not in _SUPPORTED_LIKELIHOOD:
      raise ValueError(
          f'"likelihood" must be one of {_SUPPORTED_LIKELIHOOD}, '
          f'got {likelihood}.')
    self.ridge_penalty = ridge_penalty
    self.momentum = momentum
    self.likelihood = likelihood
    super(LaplaceRandomFeatureCovariance, self).__init__(
        dtype=dtype, name=name)

  def compute_output_shape(self, input_shape):
    gp_feature_dim = input_shape[-1]
    return tf.TensorShape([gp_feature_dim, gp_feature_dim])

  def build(self, input_shape):
    gp_feature_dim = input_shape[-1]

    # Convert gp_feature_dim to int value for TF1 compatibility.
    if isinstance(gp_feature_dim, tf.compat.v1.Dimension):
      gp_feature_dim = gp_feature_dim.value

    # Posterior precision matrix for the GP's random feature coefficients.
    self.initial_precision_matrix = (
        self.ridge_penalty * tf.eye(gp_feature_dim, dtype=self.dtype))

    self.precision_matrix = (
        self.add_weight(
            name='gp_precision_matrix',
            shape=(gp_feature_dim, gp_feature_dim),
            dtype=self.dtype,
            initializer=tf.keras.initializers.Identity(self.ridge_penalty),
            trainable=False,
            aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA))

    self.built = True

  def make_precision_matrix_update_op(self, gp_feature, logits,
                                      precision_matrix):
    """Defines update op for the precision matrix of feature weights."""
    if self.likelihood != 'gaussian':
      if logits is None:
        raise ValueError(
            f'"logits" cannot be None when likelihood={self.likelihood}')

      if logits.shape[-1] != 1:
        raise ValueError(
            f'likelihood={self.likelihood} only supports univariate logits. '
            f'Got logits dimension: {logits.shape[-1]}')

    batch_size = tf.shape(gp_feature)[0]
    batch_size = tf.cast(batch_size, dtype=gp_feature.dtype)

    # Computes batch-specific normalized precision matrix.
    if self.likelihood == 'binary_logistic':
      prob = tf.sigmoid(logits)
      prob_multiplier = prob * (1. - prob)
    elif self.likelihood == 'poisson':
      prob_multiplier = tf.exp(logits)
    else:
      prob_multiplier = 1.

    gp_feature_adjusted = tf.sqrt(prob_multiplier) * gp_feature
    precision_matrix_minibatch = tf.matmul(
        gp_feature_adjusted, gp_feature_adjusted, transpose_a=True)

    # Updates the population-wise precision matrix.
    if self.momentum > 0:
      # Use moving-average updates to accumulate batch-specific precision
      # matrices.
      precision_matrix_minibatch = precision_matrix_minibatch / batch_size
      precision_matrix_new = (
          self.momentum * precision_matrix +
          (1. - self.momentum) * precision_matrix_minibatch)
    else:
      # Compute exact population-wise covariance without momentum.
      # If using this option, make sure to pass through the data only once.
      precision_matrix_new = precision_matrix + precision_matrix_minibatch

    # Returns the update op.
    return precision_matrix.assign(precision_matrix_new)

  def reset_precision_matrix(self):
    """Resets precision matrix to its initial value.

    This function is useful for resetting the model's covariance matrix at the
    beginning of a new epoch.
    """
    precision_matrix_reset_op = self.precision_matrix.assign(
        self.initial_precision_matrix)
    self.add_update(precision_matrix_reset_op)

  def compute_predictive_covariance(self, gp_feature):
    """Computes posterior predictive variance.

    Approximates the Gaussian process posterior using random features.
    Given training random feature Phi_tr (num_train, num_hidden) and testing
    random feature Phi_ts (batch_size, num_hidden), the predictive covariance
    matrix is computed as (assuming Gaussian likelihood):

      s * Phi_ts @ inv(t(Phi_tr) * Phi_tr + s * I) @ t(Phi_ts),

    where s is the ridge factor used for stabilizing the inverse, and I is
    the identity matrix with shape (num_hidden, num_hidden).

    Args:
      gp_feature: (tf.Tensor) The random feature of testing data to be used
        for computing the covariance matrix. Shape (batch_size,
        gp_hidden_size).

    Returns:
      (tf.Tensor) Predictive covariance matrix, shape (batch_size, batch_size).
    """
    # Computes the covariance matrix of the feature coefficient.
    feature_cov_matrix = tf.linalg.inv(self.precision_matrix)

    # Computes the covariance matrix of the gp prediction.
    cov_feature_product = tf.matmul(
        feature_cov_matrix, gp_feature, transpose_b=True) * self.ridge_penalty
    gp_cov_matrix = tf.matmul(gp_feature, cov_feature_product)
    return gp_cov_matrix

  def _get_training_value(self, training=None):
    if training is None:
      training = tf.keras.backend.learning_phase()

    if isinstance(training, int):
      training = bool(training)

    return training

  def call(self, inputs, logits=None, training=None):
    """Minibatch updates the GP's posterior precision matrix estimate.

    Args:
      inputs: (tf.Tensor) GP random features, shape (batch_size,
        gp_hidden_size).
      logits: (tf.Tensor) Pre-activation output from the model. Needed
        for Laplace approximation under a non-Gaussian likelihood.
      training: (tf.bool) whether or not the layer is in training mode. If in
        training mode, the gp_weight covariance is updated using gp_feature.

    Returns:
      gp_stddev (tf.Tensor): GP posterior predictive variance,
        shape (batch_size, batch_size).
    """
    batch_size = tf.shape(inputs)[0]
    training = self._get_training_value(training)

    if training:
      # Define and register the update op for feature precision matrix.
      precision_matrix_update_op = self.make_precision_matrix_update_op(
          gp_feature=inputs,
          logits=logits,
          precision_matrix=self.precision_matrix)
      self.add_update(precision_matrix_update_op)
      # Return null estimate during training.
      return tf.eye(batch_size, dtype=self.dtype)
    else:
      # Return covariance estimate during inference.
      return self.compute_predictive_covariance(gp_feature=inputs)


def mean_field_logits(logits, covariance_matrix=None, mean_field_factor=1.):
  """Adjusts the model logits so its softmax approximates the posterior mean [1].

  [1]: Zhiyun Lu, Eugene Ie, Fei Sha. Uncertainty Estimation with Infinitesimal
       Jackknife. _arXiv preprint arXiv:2006.07584_, 2020.
       https://arxiv.org/abs/2006.07584

  Arguments:
    logits: A float tensor of shape (batch_size, num_classes).
    covariance_matrix: The covariance matrix of shape (batch_size, batch_size).
      If None then it assumes the covariance_matrix is an identity matrix.
    mean_field_factor: The scale factor for mean-field approximation, used to
      adjust the influence of posterior variance in the posterior mean
      approximation. If covariance_matrix=None then it is used as the
      temperature parameter for temperature scaling.

  Returns:
    Tensor of adjusted logits, shape (batch_size, num_classes).
  """
  if mean_field_factor is None or mean_field_factor < 0:
    return logits

  # Compute standard deviation.
  if covariance_matrix is None:
    variances = 1.
  else:
    variances = tf.linalg.diag_part(covariance_matrix)

  # Compute scaling coefficient for mean-field approximation.
  logits_scale = tf.sqrt(1. + variances * mean_field_factor)

  if len(logits.shape) > 1:
    # Cast logits_scale to compatible dimension.
    logits_scale = tf.expand_dims(logits_scale, axis=-1)

  return logits / logits_scale
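Together, `RandomFeatureGaussianProcess` and `mean_field_logits` form the train-then-calibrate loop the docstrings above describe: accumulate the precision matrix over training minibatches, then temper the logits with the predicted covariance at inference. A minimal usage sketch (the batch size, feature width, unit count, and the pi/8 mean-field factor are illustrative choices, not taken from this commit):

import math

import tensorflow as tf

from official.nlp.modeling.layers import gaussian_process

# A GP output head for a hypothetical 3-class problem.
gp_layer = gaussian_process.RandomFeatureGaussianProcess(
    units=3, num_inducing=1024)

features = tf.random.normal([32, 128])

# Training: the returned covariance is a placeholder identity matrix; as a
# side effect the layer registers the precision-matrix update for this batch.
logits, _ = gp_layer(features, training=True)

# Inference: the covariance is the Laplace-approximated posterior covariance,
# which mean_field_logits uses to rescale each example's logits.
logits, covmat = gp_layer(features, training=False)
adjusted_logits = gaussian_process.mean_field_logits(
    logits, covmat, mean_field_factor=math.pi / 8.)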
official/nlp/modeling/layers/gaussian_process_test.py  (new file, mode 100644; view file @ f16a7b5b)

# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Lint as: python3
"""Tests for Gaussian process functions."""
import os
import shutil

from absl.testing import parameterized
import numpy as np
import tensorflow as tf

from official.nlp.modeling.layers import gaussian_process


def exact_gaussian_kernel(x1, x2):
  """Computes exact Gaussian kernel value(s) for tensors x1 and x2."""
  x1_squared = tf.reduce_sum(tf.square(x1), list(range(1, len(x1.shape))))
  x2_squared = tf.reduce_sum(tf.square(x2), list(range(1, len(x2.shape))))
  square = (x1_squared[:, tf.newaxis] + x2_squared[tf.newaxis, :] -
            2 * tf.matmul(x1, x2, transpose_b=True))
  return tf.math.exp(-square / 2.)


def _generate_normal_data(num_sample, num_dim, loc):
  """Generates random data sampled from i.i.d. normal distribution."""
  return np.random.normal(
      size=(num_sample, num_dim), loc=loc, scale=1. / np.sqrt(num_dim))


def _generate_rbf_data(x_data, orthogonal=True):
  """Generates high-dim data that is the eigen components of a RBF kernel."""
  k_rbf = exact_gaussian_kernel(x_data, x_data)
  x_orth, x_diag, _ = np.linalg.svd(k_rbf)
  if orthogonal:
    return x_orth
  return np.diag(np.sqrt(x_diag)).dot(x_orth.T)


def _make_minibatch_iterator(data_numpy, batch_size, num_epoch):
  """Makes a tf.data.Dataset for a given batch size and number of epochs."""
  dataset = tf.data.Dataset.from_tensor_slices(data_numpy)
  dataset = dataset.repeat(num_epoch).batch(batch_size)
  return iter(dataset)


def _compute_posterior_kernel(x_tr, x_ts, kernel_func, ridge_penalty):
  """Computes the posterior covariance matrix of a Gaussian process."""
  num_sample = x_tr.shape[0]
  k_tt_inv = tf.linalg.inv(
      kernel_func(x_tr, x_tr) + ridge_penalty * np.eye(num_sample))
  k_ts = kernel_func(x_tr, x_ts)
  k_ss = kernel_func(x_ts, x_ts)
  return k_ss - tf.matmul(k_ts, tf.matmul(k_tt_inv, k_ts), transpose_a=True)


class GaussianProcessTest(tf.test.TestCase, parameterized.TestCase):

  def setUp(self):
    super(GaussianProcessTest, self).setUp()
    self.num_data_dim = 10
    self.num_inducing = 1024
    self.num_train_sample = 1024
    self.num_test_sample = 256
    self.prec_tolerance = {'atol': 1e-3, 'rtol': 5e-2}
    self.cov_tolerance = {'atol': 5e-2, 'rtol': 2.}

    self.rbf_kern_func = exact_gaussian_kernel

    self.x_tr = _generate_normal_data(
        self.num_train_sample, self.num_data_dim, loc=0.)
    self.x_ts = _generate_normal_data(
        self.num_test_sample, self.num_data_dim, loc=1.)

  def test_layer_build(self):
    """Tests if layer.built=True after building."""
    rfgp_model = gaussian_process.RandomFeatureGaussianProcess(units=1)
    rfgp_model.build(input_shape=self.x_tr.shape)
    self.assertTrue(rfgp_model.built)

  @parameterized.named_parameters(('rbf_data', False),
                                  ('orthogonal_data', True))
  def test_laplace_covariance_minibatch(self, generate_orthogonal_data):
    """Tests if model correctly learns population-level precision matrix."""
    batch_size = 50
    epochs = 1000
    x_data = _generate_rbf_data(self.x_ts, generate_orthogonal_data)
    data_iterator = _make_minibatch_iterator(x_data, batch_size, epochs)

    # Estimates precision matrix using minibatch.
    cov_estimator = gaussian_process.LaplaceRandomFeatureCovariance(
        momentum=0.999, ridge_penalty=0)

    for minibatch_data in data_iterator:
      _ = cov_estimator(minibatch_data, training=True)

    # Evaluation
    prec_mat_expected = x_data.T.dot(x_data)
    prec_mat_computed = (
        cov_estimator.precision_matrix.numpy() * self.num_test_sample)

    np.testing.assert_allclose(prec_mat_computed, prec_mat_expected,
                               **self.prec_tolerance)

  def test_random_feature_prior_approximation(self):
    """Tests random feature GP's ability in approximating exact GP prior."""
    num_inducing = 10240
    rfgp_model = gaussian_process.RandomFeatureGaussianProcess(
        units=1,
        num_inducing=num_inducing,
        normalize_input=False,
        gp_kernel_type='gaussian',
        return_random_features=True)

    # Extract random features.
    _, _, gp_feature = rfgp_model(self.x_tr, training=True)
    gp_feature_np = gp_feature.numpy()

    prior_kernel_computed = gp_feature_np.dot(gp_feature_np.T)
    prior_kernel_expected = self.rbf_kern_func(self.x_tr, self.x_tr)

    np.testing.assert_allclose(prior_kernel_computed, prior_kernel_expected,
                               **self.cov_tolerance)

  def test_random_feature_posterior_approximation(self):
    """Tests random feature GP's ability in approximating exact GP posterior."""
    # Set momentum = 0.5 so posterior precision matrix is 0.5 * (I + K).
    gp_cov_momentum = 0.5
    gp_cov_ridge_penalty = 1.
    num_inducing = 1024

    rfgp_model = gaussian_process.RandomFeatureGaussianProcess(
        units=1,
        num_inducing=num_inducing,
        normalize_input=False,
        gp_kernel_type='gaussian',
        gp_cov_momentum=gp_cov_momentum,
        gp_cov_ridge_penalty=gp_cov_ridge_penalty)

    # Computes posterior covariance on test data.
    _, _ = rfgp_model(self.x_tr, training=True)
    _, gp_cov_ts = rfgp_model(self.x_ts, training=False)

    # Scale up covariance estimate since prec matrix is down-scaled by momentum.
    post_kernel_computed = gp_cov_ts * gp_cov_momentum
    post_kernel_expected = _compute_posterior_kernel(self.x_tr, self.x_ts,
                                                     self.rbf_kern_func,
                                                     gp_cov_ridge_penalty)
    np.testing.assert_allclose(post_kernel_computed, post_kernel_expected,
                               **self.cov_tolerance)

  def test_random_feature_linear_kernel(self):
    """Tests if linear kernel indeed leads to an identity mapping."""
    # Specify linear kernel.
    gp_kernel_type = 'linear'
    normalize_input = False
    scale_random_features = False
    use_custom_random_features = True

    rfgp_model = gaussian_process.RandomFeatureGaussianProcess(
        units=1,
        normalize_input=normalize_input,
        gp_kernel_type=gp_kernel_type,
        scale_random_features=scale_random_features,
        use_custom_random_features=use_custom_random_features,
        return_random_features=True)
    _, _, gp_feature = rfgp_model(self.x_tr, training=True)

    # Check if linear kernel leads to identity mapping.
    np.testing.assert_allclose(gp_feature, self.x_tr, **self.prec_tolerance)

  def test_no_matrix_update_during_test(self):
    """Tests if the precision matrix is not updated during testing."""
    rfgp_model = gaussian_process.RandomFeatureGaussianProcess(units=1)

    # Training.
    _, gp_covmat_null = rfgp_model(self.x_tr, training=True)
    precision_mat_before_test = rfgp_model._gp_cov_layer.precision_matrix

    # Testing.
    _ = rfgp_model(self.x_ts, training=False)
    precision_mat_after_test = rfgp_model._gp_cov_layer.precision_matrix

    self.assertAllClose(
        gp_covmat_null, tf.eye(self.num_train_sample), atol=1e-4)
    self.assertAllClose(
        precision_mat_before_test, precision_mat_after_test, atol=1e-4)

  def test_state_saving_and_loading(self):
    """Tests if the loaded model returns same results."""
    input_data = np.random.random((1, 2))
    rfgp_model = gaussian_process.RandomFeatureGaussianProcess(units=1)

    inputs = tf.keras.Input((2,), batch_size=1)
    outputs = rfgp_model(inputs)
    model = tf.keras.Model(inputs, outputs)
    gp_output, gp_covmat = model.predict(input_data)

    # Save and then load the model.
    temp_dir = self.get_temp_dir()
    self.addCleanup(shutil.rmtree, temp_dir)
    saved_model_dir = os.path.join(temp_dir, 'rfgp_model')
    model.save(saved_model_dir)
    new_model = tf.keras.models.load_model(saved_model_dir)

    gp_output_new, gp_covmat_new = new_model.predict(input_data)
    self.assertAllClose(gp_output, gp_output_new, atol=1e-4)
    self.assertAllClose(gp_covmat, gp_covmat_new, atol=1e-4)


class MeanFieldLogitsTest(tf.test.TestCase):

  def testMeanFieldLogitsLikelihood(self):
    """Tests if scaling is correct under different likelihood."""
    batch_size = 10
    num_classes = 12
    variance = 1.5
    mean_field_factor = 2.

    rng = np.random.RandomState(0)
    tf.random.set_seed(1)
    logits = rng.randn(batch_size, num_classes)
    covmat = tf.linalg.diag([variance] * batch_size)

    logits_logistic = gaussian_process.mean_field_logits(
        logits, covmat, mean_field_factor=mean_field_factor)

    self.assertAllClose(logits_logistic, logits / 2., atol=1e-4)

  def testMeanFieldLogitsTemperatureScaling(self):
    """Tests using mean_field_logits as temperature scaling method."""
    batch_size = 10
    num_classes = 12

    rng = np.random.RandomState(0)
    tf.random.set_seed(1)
    logits = rng.randn(batch_size, num_classes)

    # Test if there's no change to logits when mean_field_factor < 0.
    logits_no_change = gaussian_process.mean_field_logits(
        logits, covariance_matrix=None, mean_field_factor=-1)

    # Test if mean_field_logits functions as a temperature scaling method when
    # mean_field_factor > 0, with temperature = sqrt(1. + mean_field_factor).
    logits_scale_by_two = gaussian_process.mean_field_logits(
        logits, covariance_matrix=None, mean_field_factor=3.)

    self.assertAllClose(logits_no_change, logits, atol=1e-4)
    self.assertAllClose(logits_scale_by_two, logits / 2., atol=1e-4)


if __name__ == '__main__':
  tf.test.main()
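For reference, the `_compute_posterior_kernel` helper above is the textbook Gaussian process posterior covariance that the random-feature approximation is tested against. Written out in the code's notation, with ridge factor s = `ridge_penalty`:

K_post(X_ts, X_ts) = K_ss - K_ts^T (K_tt + s I)^{-1} K_ts,

where K_tt = k(X_tr, X_tr), K_ts = k(X_tr, X_ts), and K_ss = k(X_ts, X_ts), or in LaTeX:

K_{\text{post}}(X_{ts}, X_{ts}) = K_{ss} - K_{ts}^{\top}\bigl(K_{tt} + sI\bigr)^{-1} K_{ts}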
official/nlp/modeling/layers/masked_lm.py  (view file @ f16a7b5b)

-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
...
@@ -11,114 +11,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ==============================================================================
 """Masked language model network."""
 # pylint: disable=g-classes-have-attributes
-from __future__ import absolute_import
-from __future__ import division
-# from __future__ import google_type_annotations
-from __future__ import print_function
-
-import tensorflow as tf
-
-from official.modeling import tf_utils
-
-
-@tf.keras.utils.register_keras_serializable(package='Text')
-class MaskedLM(tf.keras.layers.Layer):
-  """Masked language model network head for BERT modeling.
-
-  This network implements a masked language model based on the provided
-  network. It assumes that the network being passed has a
-  "get_embedding_table()" method.
-
-  Arguments:
-    embedding_table: The embedding table of the targets.
-    activation: The activation, if any, for the dense layer.
-    initializer: The initializer for the dense layer. Defaults to a Glorot
-      uniform initializer.
-    output: The output style for this network. Can be either 'logits' or
-      'predictions'.
-  """
-
-  def __init__(self,
-               embedding_table,
-               activation=None,
-               initializer='glorot_uniform',
-               output='logits',
-               name='cls/predictions',
-               **kwargs):
-    super(MaskedLM, self).__init__(name=name, **kwargs)
-    self.embedding_table = embedding_table
-    self.activation = activation
-    self.initializer = tf.keras.initializers.get(initializer)
-
-    if output not in ('predictions', 'logits'):
-      raise ValueError(
-          ('Unknown `output` value "%s". `output` can be either "logits" or '
-           '"predictions"') % output)
-    self._output_type = output
-
-  def build(self, input_shape):
-    self._vocab_size, hidden_size = self.embedding_table.shape
-    self.dense = tf.keras.layers.Dense(
-        hidden_size,
-        activation=self.activation,
-        kernel_initializer=self.initializer,
-        name='transform/dense')
-    self.layer_norm = tf.keras.layers.LayerNormalization(
-        axis=-1, epsilon=1e-12, name='transform/LayerNorm')
-    self.bias = self.add_weight(
-        'output_bias/bias',
-        shape=(self._vocab_size,),
-        initializer='zeros',
-        trainable=True)
-
-    super(MaskedLM, self).build(input_shape)
-
-  def call(self, sequence_data, masked_positions):
-    masked_lm_input = self._gather_indexes(sequence_data, masked_positions)
-    lm_data = self.dense(masked_lm_input)
-    lm_data = self.layer_norm(lm_data)
-    lm_data = tf.matmul(lm_data, self.embedding_table, transpose_b=True)
-    logits = tf.nn.bias_add(lm_data, self.bias)
-    masked_positions_shape = tf_utils.get_shape_list(
-        masked_positions, name='masked_positions_tensor')
-    logits = tf.reshape(logits,
-                        [-1, masked_positions_shape[1], self._vocab_size])
-    if self._output_type == 'logits':
-      return logits
-    return tf.nn.log_softmax(logits)
-
-  def get_config(self):
-    raise NotImplementedError('MaskedLM cannot be directly serialized because '
-                              'it has variable sharing logic.')
-
-  def _gather_indexes(self, sequence_tensor, positions):
-    """Gathers the vectors at the specific positions.
-
-    Args:
-      sequence_tensor: Sequence output of `BertModel` layer of shape
-        (`batch_size`, `seq_length`, num_hidden) where num_hidden is the
-        number of hidden units of the `BertModel` layer.
-      positions: Position ids of tokens in the sequence to mask for
-        pretraining, of dimension (batch_size, num_predictions), where
-        `num_predictions` is the maximum number of tokens to mask out and
-        predict per sequence.
-
-    Returns:
-      Masked out sequence tensor of shape (batch_size * num_predictions,
-      num_hidden).
-    """
-    sequence_shape = tf_utils.get_shape_list(
-        sequence_tensor, name='sequence_output_tensor')
-    batch_size, seq_length, width = sequence_shape
-
-    flat_offsets = tf.reshape(
-        tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1])
-    flat_positions = tf.reshape(positions + flat_offsets, [-1])
-    flat_sequence_tensor = tf.reshape(sequence_tensor,
-                                      [batch_size * seq_length, width])
-    output_tensor = tf.gather(flat_sequence_tensor, flat_positions)
-
-    return output_tensor
+from official.nlp import keras_nlp
+
+MaskedLM = keras_nlp.layers.MaskedLM
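The deleted `_gather_indexes` helper (now provided by `keras_nlp.layers.MaskedLM`) relies on a flatten-and-offset trick that is easy to miss in the diff: per-example mask positions become row indices into a flattened (batch_size * seq_length, width) view. A standalone trace of the same indexing, with made-up shapes for illustration:

import tensorflow as tf

batch_size, seq_length, width = 2, 4, 3
sequence_tensor = tf.reshape(
    tf.range(batch_size * seq_length * width, dtype=tf.float32),
    [batch_size, seq_length, width])
positions = tf.constant([[1, 3], [0, 2]])  # masked positions per example

# Offset each row's positions by row_index * seq_length, then gather from
# the flattened (batch_size * seq_length, width) view of the sequence.
flat_offsets = tf.reshape(tf.range(0, batch_size) * seq_length, [-1, 1])
flat_positions = tf.reshape(positions + flat_offsets, [-1])  # [1, 3, 4, 6]
flat_sequence = tf.reshape(sequence_tensor, [batch_size * seq_length, width])
gathered = tf.gather(flat_sequence, flat_positions)  # shape (4, 3)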
official/nlp/modeling/layers/masked_lm_test.py  (view file @ f16a7b5b)

-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
...
@@ -11,12 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ==============================================================================
 """Tests for masked language model network."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
 import numpy as np
 import tensorflow as tf
...
@@ -24,7 +20,7 @@ import tensorflow as tf
 from tensorflow.python.keras import keras_parameterized  # pylint: disable=g-direct-tensorflow-import
 from official.nlp.modeling.layers import masked_lm
-from official.nlp.modeling.networks import transformer_encoder
+from official.nlp.modeling.networks import bert_encoder

 # This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
...
@@ -34,25 +30,22 @@ class MaskedLMTest(keras_parameterized.TestCase):

   def create_layer(self,
                    vocab_size,
-                   sequence_length,
                    hidden_size,
                    output='predictions',
                    xformer_stack=None):
     # First, create a transformer stack that we can use to get the LM's
     # vocabulary weight.
     if xformer_stack is None:
-      xformer_stack = transformer_encoder.TransformerEncoder(
+      xformer_stack = bert_encoder.BertEncoder(
           vocab_size=vocab_size,
           num_layers=1,
-          sequence_length=sequence_length,
           hidden_size=hidden_size,
           num_attention_heads=4,
       )

     # Create a maskedLM from the transformer stack.
     test_layer = masked_lm.MaskedLM(
         embedding_table=xformer_stack.get_embedding_table(),
         output=output)
     return test_layer

   def test_layer_creation(self):
...
@@ -61,9 +54,7 @@ class MaskedLMTest(keras_parameterized.TestCase):
     hidden_size = 64
     num_predictions = 21
     test_layer = self.create_layer(
         vocab_size=vocab_size,
-        sequence_length=sequence_length,
         hidden_size=hidden_size)

     # Make sure that the output tensor of the masked LM is the right shape.
     lm_input_tensor = tf.keras.Input(shape=(sequence_length, hidden_size))
...
@@ -78,22 +69,19 @@ class MaskedLMTest(keras_parameterized.TestCase):
     sequence_length = 32
     hidden_size = 64
     num_predictions = 21
-    xformer_stack = transformer_encoder.TransformerEncoder(
+    xformer_stack = bert_encoder.BertEncoder(
         vocab_size=vocab_size,
         num_layers=1,
-        sequence_length=sequence_length,
         hidden_size=hidden_size,
         num_attention_heads=4,
     )
     test_layer = self.create_layer(
         vocab_size=vocab_size,
-        sequence_length=sequence_length,
         hidden_size=hidden_size,
         xformer_stack=xformer_stack,
         output='predictions')
     logit_layer = self.create_layer(
         vocab_size=vocab_size,
-        sequence_length=sequence_length,
         hidden_size=hidden_size,
         xformer_stack=xformer_stack,
         output='logits')
...
@@ -133,9 +121,7 @@ class MaskedLMTest(keras_parameterized.TestCase):
     hidden_size = 64
     num_predictions = 21
     test_layer = self.create_layer(
         vocab_size=vocab_size,
-        sequence_length=sequence_length,
         hidden_size=hidden_size)

     # Create a model from the masked LM layer.
     lm_input_tensor = tf.keras.Input(shape=(sequence_length, hidden_size))
...
@@ -154,8 +140,7 @@ class MaskedLMTest(keras_parameterized.TestCase):
   def test_unknown_output_type_fails(self):
     with self.assertRaisesRegex(ValueError, 'Unknown `output` value "bad".*'):
-      _ = self.create_layer(
-          vocab_size=8, sequence_length=8, hidden_size=8, output='bad')
+      _ = self.create_layer(vocab_size=8, hidden_size=8, output='bad')

 if __name__ == '__main__':
...
official/nlp/modeling/layers/masked_softmax.py  (view file @ f16a7b5b)

-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
...
@@ -11,22 +11,35 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ==============================================================================
 """Keras-based softmax layer with optional masking."""
 # pylint: disable=g-classes-have-attributes
-from __future__ import absolute_import
-from __future__ import division
-# from __future__ import google_type_annotations
-from __future__ import print_function
-
 import tensorflow as tf


+def _large_compatible_negative(tensor_type):
+  """Large negative number as Tensor.
+
+  This function is necessary because the standard value for epsilon
+  in this module (-1e9) cannot be represented using `tf.float16`.
+
+  Args:
+    tensor_type: A dtype to determine the type.
+
+  Returns:
+    A large negative number.
+  """
+  if tensor_type == tf.float16:
+    return tf.float16.min
+  return -1e9
+
+
 @tf.keras.utils.register_keras_serializable(package='Text')
 class MaskedSoftmax(tf.keras.layers.Layer):
   """Performs a softmax with optional masking on a tensor.

-  Arguments:
+  Args:
     mask_expansion_axes: Any axes that should be padded on the mask tensor.
     normalization_axes: On which axes the softmax should perform.
   """
...
@@ -50,9 +63,9 @@ class MaskedSoftmax(tf.keras.layers.Layer):
       # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
       # masked positions, this operation will create a tensor which is 0.0 for
-      # positions we want to attend and -10000.0 for masked positions.
-      adder = (1.0 - tf.cast(mask, scores.dtype)) * -10000.0
+      # positions we want to attend and -1.e9 for masked positions.
+      adder = (1.0 - tf.cast(mask, scores.dtype)) * _large_compatible_negative(
+          scores.dtype)

       # Since we are adding it to the raw scores before the softmax, this is
       # effectively the same as removing these entirely.
       scores += adder
...
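The motivation for `_large_compatible_negative` is that the old masking constant cannot survive half precision. A quick check in plain TensorFlow (independent of this commit):

import tensorflow as tf

# -1e9 overflows float16 (max magnitude ~65504) to -inf, which can poison
# subsequent softmax math with NaNs; tf.float16.min stays finite.
print(tf.cast(-1e9, tf.float16).numpy())  # -inf
print(tf.float16.min)                     # -65504.0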
official/nlp/modeling/layers/masked_softmax_test.py  (view file @ f16a7b5b)

-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
...
@@ -11,12 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ==============================================================================
 """Tests for Keras-based masked softmax layer."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
 import numpy as np
 import tensorflow as tf
...
official/nlp/modeling/layers/mat_mul_with_margin.py  (new file, mode 100644; view file @ f16a7b5b)

# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Dot product with margin layer."""
# pylint: disable=g-classes-have-attributes
from typing import Tuple

# Import libraries
import tensorflow as tf

from official.modeling import tf_utils


@tf.keras.utils.register_keras_serializable(package='Text')
class MatMulWithMargin(tf.keras.layers.Layer):
  """This layer computes a dot product matrix given two encoded inputs.

  Args:
    logit_scale: The scaling factor of dot products when doing training.
    logit_margin: The margin value between the positive and negative examples
      when doing training.
  """

  def __init__(self,
               logit_scale=1.0,
               logit_margin=0.0,
               **kwargs):
    super(MatMulWithMargin, self).__init__(**kwargs)
    self.logit_scale = logit_scale
    self.logit_margin = logit_margin

  def call(self, left_encoded: tf.Tensor,
           right_encoded: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
    batch_size = tf_utils.get_shape_list(
        left_encoded, name='sequence_output_tensor')[0]

    # Left -> Right dot product.
    left_dot_products = tf.matmul(
        left_encoded, right_encoded, transpose_b=True)

    self.left_logits = self.logit_scale * (
        left_dot_products - self.logit_margin * tf.eye(batch_size))

    # Right -> Left dot product.
    self.right_logits = tf.transpose(self.left_logits)

    return (self.left_logits, self.right_logits)

  def get_config(self):
    config = {
        'logit_scale': self.logit_scale,
        'logit_margin': self.logit_margin}
    config.update(super(MatMulWithMargin, self).get_config())
    return config

  @classmethod
  def from_config(cls, config, custom_objects=None):
    return cls(**config)
0 → 100644
View file @
f16a7b5b
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for mat_mul_with_margin layer."""
import
tensorflow
as
tf
from
tensorflow.python.keras
import
keras_parameterized
# pylint: disable=g-direct-tensorflow-import
from
official.nlp.modeling.layers
import
mat_mul_with_margin
class
MatMulWithMarginTest
(
keras_parameterized
.
TestCase
):
def
test_layer_invocation
(
self
):
"""Validate that the Keras object can be created and invoked."""
input_width
=
512
test_layer
=
mat_mul_with_margin
.
MatMulWithMargin
()
# Create a 2-dimensional input (the first dimension is implicit).
left_encoded
=
tf
.
keras
.
Input
(
shape
=
(
input_width
,),
dtype
=
tf
.
float32
)
right_encoded
=
tf
.
keras
.
Input
(
shape
=
(
input_width
,),
dtype
=
tf
.
float32
)
left_logits
,
right_logits
=
test_layer
(
left_encoded
,
right_encoded
)
# Validate that the outputs are of the expected shape.
expected_output_shape
=
[
None
,
None
]
self
.
assertEqual
(
expected_output_shape
,
left_logits
.
shape
.
as_list
())
self
.
assertEqual
(
expected_output_shape
,
right_logits
.
shape
.
as_list
())
def
test_serialize_deserialize
(
self
):
# Create a layer object that sets all of its config options.
layer
=
mat_mul_with_margin
.
MatMulWithMargin
()
# Create another layer object from the first object's config.
new_layer
=
mat_mul_with_margin
.
MatMulWithMargin
.
from_config
(
layer
.
get_config
())
# If the serialization was successful, the new config should match the old.
self
.
assertAllEqual
(
layer
.
get_config
(),
new_layer
.
get_config
())
if
__name__
==
'__main__'
:
tf
.
test
.
main
()
official/nlp/modeling/layers/mobile_bert_layers.py
0 → 100644
View file @
f16a7b5b
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""MobileBERT embedding and transformer layers."""

import tensorflow as tf

from official.nlp import keras_nlp


@tf.keras.utils.register_keras_serializable(package='Text')
class NoNorm(tf.keras.layers.Layer):
  """Applies an element-wise linear transformation to the last dimension."""

  def __init__(self, name=None):
    super(NoNorm, self).__init__(name=name)

  def build(self, shape):
    kernel_size = shape[-1]
    self.bias = self.add_weight('beta',
                                shape=[kernel_size],
                                initializer='zeros')
    self.scale = self.add_weight('gamma',
                                 shape=[kernel_size],
                                 initializer='ones')

  def call(self, feature):
    output = feature * self.scale + self.bias
    return output
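For contrast with layer normalization, a short sketch of what `NoNorm` computes at initialization (shapes chosen arbitrarily for illustration):

import tensorflow as tf

# NoNorm learns per-channel gamma/beta of shape [d] and computes
# x * gamma + beta: an affine map with no mean/variance statistics.
x = tf.random.normal([2, 3, 4])
gamma = tf.ones([4])    # the 'gamma' weight, initialized to ones
beta = tf.zeros([4])    # the 'beta' weight, initialized to zeros
out = x * gamma + beta  # identical to x until the weights are trained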
def _get_norm_layer(normalization_type='no_norm', name=None):
  """Gets a normalization layer.

  Args:
    normalization_type: String. The type of normalization; only `no_norm`
      and `layer_norm` are supported.
    name: Name for the norm layer.

  Returns:
    A normalization layer instance.
  """
  if normalization_type == 'no_norm':
    layer = NoNorm(name=name)
  elif normalization_type == 'layer_norm':
    layer = tf.keras.layers.LayerNormalization(
        name=name,
        axis=-1,
        epsilon=1e-12,
        dtype=tf.float32)
  else:
    raise NotImplementedError(
        'Only "no_norm" and "layer_norm" are supported.')
  return layer
@tf.keras.utils.register_keras_serializable(package='Text')
class MobileBertEmbedding(tf.keras.layers.Layer):
  """Performs an embedding lookup for MobileBERT.

  This layer includes word embedding, token type embedding and position
  embedding.
  """

  def __init__(self,
               word_vocab_size,
               word_embed_size,
               type_vocab_size,
               output_embed_size,
               max_sequence_length=512,
               normalization_type='no_norm',
               initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
               dropout_rate=0.1,
               **kwargs):
    """Class initialization.

    Args:
      word_vocab_size: Number of words in the vocabulary.
      word_embed_size: Word embedding size.
      type_vocab_size: Number of word types.
      output_embed_size: Embedding size for the final embedding output.
      max_sequence_length: Maximum length of the input sequence.
      normalization_type: String. The type of normalization; only `no_norm`
        and `layer_norm` are supported.
      initializer: The initializer to use for the embedding weights and
        linear projection weights.
      dropout_rate: Dropout rate.
      **kwargs: keyword arguments.
    """
    super(MobileBertEmbedding, self).__init__(**kwargs)
    self.word_vocab_size = word_vocab_size
    self.word_embed_size = word_embed_size
    self.type_vocab_size = type_vocab_size
    self.output_embed_size = output_embed_size
    self.max_sequence_length = max_sequence_length
    self.normalization_type = normalization_type
    self.initializer = tf.keras.initializers.get(initializer)
    self.dropout_rate = dropout_rate

    self.word_embedding = keras_nlp.layers.OnDeviceEmbedding(
        self.word_vocab_size,
        self.word_embed_size,
        initializer=initializer,
        name='word_embedding')
    self.type_embedding = keras_nlp.layers.OnDeviceEmbedding(
        self.type_vocab_size,
        self.output_embed_size,
        initializer=initializer,
        name='type_embedding')
    self.pos_embedding = keras_nlp.layers.PositionEmbedding(
        max_length=max_sequence_length,
        initializer=initializer,
        name='position_embedding')
    self.word_embedding_proj = tf.keras.layers.experimental.EinsumDense(
        'abc,cd->abd',
        output_shape=[None, self.output_embed_size],
        kernel_initializer=initializer,
        bias_axes='d',
        name='embedding_projection')
    self.layer_norm = _get_norm_layer(normalization_type, 'embedding_norm')
    self.dropout_layer = tf.keras.layers.Dropout(
        self.dropout_rate,
        name='embedding_dropout')

  def get_config(self):
    config = {
        'word_vocab_size': self.word_vocab_size,
        'word_embed_size': self.word_embed_size,
        'type_vocab_size': self.type_vocab_size,
        'output_embed_size': self.output_embed_size,
        'max_sequence_length': self.max_sequence_length,
        'normalization_type': self.normalization_type,
        'initializer': tf.keras.initializers.serialize(self.initializer),
        'dropout_rate': self.dropout_rate
    }
    base_config = super(MobileBertEmbedding, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def call(self, input_ids, token_type_ids=None):
    word_embedding_out = self.word_embedding(input_ids)
    word_embedding_out = tf.concat(
        [tf.pad(word_embedding_out[:, 1:], ((0, 0), (0, 1), (0, 0))),
         word_embedding_out,
         tf.pad(word_embedding_out[:, :-1], ((0, 0), (1, 0), (0, 0)))],
        axis=2)
    word_embedding_out = self.word_embedding_proj(word_embedding_out)

    pos_embedding_out = self.pos_embedding(word_embedding_out)
    embedding_out = word_embedding_out + pos_embedding_out
    if token_type_ids is not None:
      type_embedding_out = self.type_embedding(token_type_ids)
      embedding_out += type_embedding_out

    embedding_out = self.layer_norm(embedding_out)
    embedding_out = self.dropout_layer(embedding_out)
    return embedding_out
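The `call()` above widens each word embedding into a trigram of [next, current, previous] embeddings before projecting down to `output_embed_size`. A shape-level sketch of that pad/concat step, with toy sizes assumed:

import tensorflow as tf

# Toy word embeddings: batch 1, sequence 4, embed dim 8.
emb = tf.random.normal([1, 4, 8])
next_tok = tf.pad(emb[:, 1:], ((0, 0), (0, 1), (0, 0)))   # shift left, pad the end
prev_tok = tf.pad(emb[:, :-1], ((0, 0), (1, 0), (0, 0)))  # shift right, pad the front
trigram = tf.concat([next_tok, emb, prev_tok], axis=2)    # [1, 4, 24]
# 'embedding_projection' then maps 24 -> output_embed_size via EinsumDense.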
@tf.keras.utils.register_keras_serializable(package='Text')
class MobileBertTransformer(tf.keras.layers.Layer):
  """Transformer block for MobileBERT.

  An implementation of one layer (block) of Transformer with bottleneck and
  inverted-bottleneck for MobileBERT.

  Original paper for MobileBERT:
  https://arxiv.org/pdf/2004.02984.pdf
  """

  def __init__(self,
               hidden_size=512,
               num_attention_heads=4,
               intermediate_size=512,
               intermediate_act_fn='relu',
               hidden_dropout_prob=0.1,
               attention_probs_dropout_prob=0.1,
               intra_bottleneck_size=128,
               use_bottleneck_attention=False,
               key_query_shared_bottleneck=True,
               num_feedforward_networks=4,
               normalization_type='no_norm',
               initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
               **kwargs):
    """Class initialization.

    Args:
      hidden_size: Hidden size for the Transformer input and output tensor.
      num_attention_heads: Number of attention heads in the Transformer.
      intermediate_size: The size of the "intermediate" (a.k.a., feed
        forward) layer.
      intermediate_act_fn: The non-linear activation function to apply
        to the output of the intermediate/feed-forward layer.
      hidden_dropout_prob: Dropout probability for the hidden layers.
      attention_probs_dropout_prob: Dropout probability of the attention
        probabilities.
      intra_bottleneck_size: Size of the bottleneck.
      use_bottleneck_attention: Use attention inputs from the bottleneck
        transformation. If true, the following `key_query_shared_bottleneck`
        will be ignored.
      key_query_shared_bottleneck: Whether to share the linear transformation
        for keys and queries.
      num_feedforward_networks: Number of stacked feed-forward networks.
      normalization_type: The type of normalization; only `no_norm` and
        `layer_norm` are supported. `no_norm` represents the element-wise
        linear transformation for the student model, as suggested by the
        original MobileBERT paper. `layer_norm` is used for the teacher model.
      initializer: The initializer to use for the embedding weights and
        linear projection weights.
      **kwargs: keyword arguments.

    Raises:
      ValueError: A Tensor shape or parameter is invalid.
    """
    super(MobileBertTransformer, self).__init__(**kwargs)
    self.hidden_size = hidden_size
    self.num_attention_heads = num_attention_heads
    self.intermediate_size = intermediate_size
    self.intermediate_act_fn = intermediate_act_fn
    self.hidden_dropout_prob = hidden_dropout_prob
    self.attention_probs_dropout_prob = attention_probs_dropout_prob
    self.intra_bottleneck_size = intra_bottleneck_size
    self.use_bottleneck_attention = use_bottleneck_attention
    self.key_query_shared_bottleneck = key_query_shared_bottleneck
    self.num_feedforward_networks = num_feedforward_networks
    self.normalization_type = normalization_type
    self.initializer = tf.keras.initializers.get(initializer)

    if intra_bottleneck_size % num_attention_heads != 0:
      raise ValueError(
          (f'The bottleneck size {intra_bottleneck_size} is not a multiple '
           f'of the number of attention heads {num_attention_heads}.'))
    attention_head_size = int(intra_bottleneck_size / num_attention_heads)

    self.block_layers = {}
    # add input bottleneck
    dense_layer_2d = tf.keras.layers.experimental.EinsumDense(
        'abc,cd->abd',
        output_shape=[None, self.intra_bottleneck_size],
        bias_axes='d',
        kernel_initializer=initializer,
        name='bottleneck_input/dense')
    layer_norm = _get_norm_layer(self.normalization_type,
                                 name='bottleneck_input/norm')
    self.block_layers['bottleneck_input'] = [dense_layer_2d, layer_norm]

    if self.key_query_shared_bottleneck:
      dense_layer_2d = tf.keras.layers.experimental.EinsumDense(
          'abc,cd->abd',
          output_shape=[None, self.intra_bottleneck_size],
          bias_axes='d',
          kernel_initializer=initializer,
          name='kq_shared_bottleneck/dense')
      layer_norm = _get_norm_layer(self.normalization_type,
                                   name='kq_shared_bottleneck/norm')
      self.block_layers['kq_shared_bottleneck'] = [dense_layer_2d, layer_norm]

    # add attention layer
    attention_layer = tf.keras.layers.MultiHeadAttention(
        num_heads=self.num_attention_heads,
        key_dim=attention_head_size,
        value_dim=attention_head_size,
        dropout=self.attention_probs_dropout_prob,
        output_shape=self.intra_bottleneck_size,
        kernel_initializer=initializer,
        name='attention')
    layer_norm = _get_norm_layer(self.normalization_type,
                                 name='attention/norm')
    self.block_layers['attention'] = [attention_layer, layer_norm]

    # add stacked feed-forward networks
    self.block_layers['ffn'] = []
    for ffn_layer_idx in range(self.num_feedforward_networks):
      layer_prefix = f'ffn_layer_{ffn_layer_idx}'
      layer_name = layer_prefix + '/intermediate_dense'
      intermediate_layer = tf.keras.layers.experimental.EinsumDense(
          'abc,cd->abd',
          activation=self.intermediate_act_fn,
          output_shape=[None, self.intermediate_size],
          bias_axes='d',
          kernel_initializer=initializer,
          name=layer_name)
      layer_name = layer_prefix + '/output_dense'
      output_layer = tf.keras.layers.experimental.EinsumDense(
          'abc,cd->abd',
          output_shape=[None, self.intra_bottleneck_size],
          bias_axes='d',
          kernel_initializer=initializer,
          name=layer_name)
      layer_name = layer_prefix + '/norm'
      layer_norm = _get_norm_layer(self.normalization_type,
                                   name=layer_name)
      self.block_layers['ffn'].append([intermediate_layer,
                                       output_layer,
                                       layer_norm])

    # add output bottleneck
    bottleneck = tf.keras.layers.experimental.EinsumDense(
        'abc,cd->abd',
        output_shape=[None, self.hidden_size],
        activation=None,
        bias_axes='d',
        kernel_initializer=initializer,
        name='bottleneck_output/dense')
    dropout_layer = tf.keras.layers.Dropout(
        self.hidden_dropout_prob,
        name='bottleneck_output/dropout')
    layer_norm = _get_norm_layer(self.normalization_type,
                                 name='bottleneck_output/norm')
    self.block_layers['bottleneck_output'] = [bottleneck,
                                              dropout_layer,
                                              layer_norm]

  def get_config(self):
    config = {
        'hidden_size': self.hidden_size,
        'num_attention_heads': self.num_attention_heads,
        'intermediate_size': self.intermediate_size,
        'intermediate_act_fn': self.intermediate_act_fn,
        'hidden_dropout_prob': self.hidden_dropout_prob,
        'attention_probs_dropout_prob': self.attention_probs_dropout_prob,
        'intra_bottleneck_size': self.intra_bottleneck_size,
        'use_bottleneck_attention': self.use_bottleneck_attention,
        'key_query_shared_bottleneck': self.key_query_shared_bottleneck,
        'num_feedforward_networks': self.num_feedforward_networks,
        'normalization_type': self.normalization_type,
        'initializer': tf.keras.initializers.serialize(self.initializer),
    }
    base_config = super(MobileBertTransformer, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def call(self,
           input_tensor,
           attention_mask=None,
           return_attention_scores=False):
    """Implements the forward pass.

    Args:
      input_tensor: Float tensor of shape
        `(batch_size, seq_length, hidden_size)`.
      attention_mask: (optional) int32 tensor of shape
        `(batch_size, seq_length, seq_length)`, with 1 for positions that can
        be attended to and 0 in positions that should not be.
      return_attention_scores: Whether to also return the attention scores.

    Returns:
      layer_output: Float tensor of shape
        `(batch_size, seq_length, hidden_size)`.
      attention_scores (optional): Only returned when
        `return_attention_scores` is True.

    Raises:
      ValueError: A Tensor shape or parameter is invalid.
    """
    input_width = input_tensor.shape.as_list()[-1]
    if input_width != self.hidden_size:
      raise ValueError(
          (f'The width of the input tensor {input_width} != '
           f'hidden size {self.hidden_size}'))

    prev_output = input_tensor

    # input bottleneck
    dense_layer = self.block_layers['bottleneck_input'][0]
    layer_norm = self.block_layers['bottleneck_input'][1]
    layer_input = dense_layer(prev_output)
    layer_input = layer_norm(layer_input)

    if self.use_bottleneck_attention:
      key_tensor = layer_input
      query_tensor = layer_input
      value_tensor = layer_input
    elif self.key_query_shared_bottleneck:
      dense_layer = self.block_layers['kq_shared_bottleneck'][0]
      layer_norm = self.block_layers['kq_shared_bottleneck'][1]
      shared_attention_input = dense_layer(prev_output)
      shared_attention_input = layer_norm(shared_attention_input)
      key_tensor = shared_attention_input
      query_tensor = shared_attention_input
      value_tensor = prev_output
    else:
      key_tensor = prev_output
      query_tensor = prev_output
      value_tensor = prev_output

    # attention layer
    attention_layer = self.block_layers['attention'][0]
    layer_norm = self.block_layers['attention'][1]
    attention_output, attention_scores = attention_layer(
        query_tensor,
        value_tensor,
        key_tensor,
        attention_mask,
        return_attention_scores=True,
    )
    attention_output = layer_norm(attention_output + layer_input)

    # stacked feed-forward networks
    layer_input = attention_output
    for ffn_idx in range(self.num_feedforward_networks):
      intermediate_layer = self.block_layers['ffn'][ffn_idx][0]
      output_layer = self.block_layers['ffn'][ffn_idx][1]
      layer_norm = self.block_layers['ffn'][ffn_idx][2]
      intermediate_output = intermediate_layer(layer_input)
      layer_output = output_layer(intermediate_output)
      layer_output = layer_norm(layer_output + layer_input)
      layer_input = layer_output

    # output bottleneck
    bottleneck = self.block_layers['bottleneck_output'][0]
    dropout_layer = self.block_layers['bottleneck_output'][1]
    layer_norm = self.block_layers['bottleneck_output'][2]
    layer_output = bottleneck(layer_output)
    layer_output = dropout_layer(layer_output)
    layer_output = layer_norm(layer_output + prev_output)

    if return_attention_scores:
      return layer_output, attention_scores
    else:
      return layer_output
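To see the block end to end, a minimal usage sketch with toy shapes; the unit tests in mobile_bert_layers_test.py below exercise the same path:

import tensorflow as tf

# One MobileBERT block with the default 512 hidden size and 128 bottleneck.
block = MobileBertTransformer(hidden_size=512, num_attention_heads=4)
features = tf.random.uniform([2, 3, 512])   # (batch, seq_len, hidden)
out, scores = block(features, return_attention_scores=True)
# out: (2, 3, 512); scores: (2, 4, 3, 3), i.e. (batch, heads, seq, seq)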
@tf.keras.utils.register_keras_serializable(package='Text')
class MobileBertMaskedLM(tf.keras.layers.Layer):
  """Masked language model network head for BERT modeling.

  This layer implements a masked language model based on the provided
  transformer based encoder. It assumes that the encoder network being passed
  has a "get_embedding_table()" method. Different from the canonical BERT
  masked LM layer, when the embedding width is smaller than hidden_size, it
  adds extra output weights of shape
  [vocab_size, (hidden_size - embedding_width)].
  """

  def __init__(self,
               embedding_table,
               activation=None,
               initializer='glorot_uniform',
               output='logits',
               **kwargs):
    """Class initialization.

    Args:
      embedding_table: The embedding table from the encoder network.
      activation: The activation, if any, for the dense layer.
      initializer: The initializer for the dense layer. Defaults to a Glorot
        uniform initializer.
      output: The output style for this layer. Can be either `logits` or
        `predictions`.
      **kwargs: keyword arguments.
    """
    super(MobileBertMaskedLM, self).__init__(**kwargs)
    self.embedding_table = embedding_table
    self.activation = activation
    self.initializer = tf.keras.initializers.get(initializer)

    if output not in ('predictions', 'logits'):
      raise ValueError(
          ('Unknown `output` value "%s". `output` can be either "logits" or '
           '"predictions"') % output)
    self._output_type = output

  def build(self, input_shape):
    self._vocab_size, embedding_width = self.embedding_table.shape
    hidden_size = input_shape[-1]
    self.dense = tf.keras.layers.Dense(
        hidden_size,
        activation=self.activation,
        kernel_initializer=self.initializer,
        name='transform/dense')
    if hidden_size > embedding_width:
      self.extra_output_weights = self.add_weight(
          'extra_output_weights',
          shape=(self._vocab_size, hidden_size - embedding_width),
          initializer=self.initializer,
          trainable=True)
    elif hidden_size == embedding_width:
      self.extra_output_weights = None
    else:
      raise ValueError(
          'hidden size %d cannot be smaller than embedding width %d.' %
          (hidden_size, embedding_width))

    self.layer_norm = tf.keras.layers.LayerNormalization(
        axis=-1, epsilon=1e-12, name='transform/LayerNorm')
    self.bias = self.add_weight(
        'output_bias/bias',
        shape=(self._vocab_size,),
        initializer='zeros',
        trainable=True)

    super(MobileBertMaskedLM, self).build(input_shape)

  def call(self, sequence_data, masked_positions):
    masked_lm_input = self._gather_indexes(sequence_data, masked_positions)
    lm_data = self.dense(masked_lm_input)
    lm_data = self.layer_norm(lm_data)
    if self.extra_output_weights is None:
      lm_data = tf.matmul(lm_data, self.embedding_table, transpose_b=True)
    else:
      lm_data = tf.matmul(
          lm_data,
          tf.concat([self.embedding_table, self.extra_output_weights],
                    axis=1),
          transpose_b=True)
    logits = tf.nn.bias_add(lm_data, self.bias)

    masked_positions_length = masked_positions.shape.as_list()[1] or tf.shape(
        masked_positions)[1]
    logits = tf.reshape(logits,
                        [-1, masked_positions_length, self._vocab_size])
    if self._output_type == 'logits':
      return logits
    return tf.nn.log_softmax(logits)

  def get_config(self):
    raise NotImplementedError('MaskedLM cannot be directly serialized because '
                              'it has variable sharing logic.')

  def _gather_indexes(self, sequence_tensor, positions):
    """Gathers the vectors at the specific positions.

    Args:
      sequence_tensor: Sequence output of the `BertModel` layer of shape
        `(batch_size, seq_length, num_hidden)` where `num_hidden` is the
        number of hidden units of the `BertModel` layer.
      positions: Position ids of the tokens in the sequence to mask for
        pretraining, of shape `(batch_size, num_predictions)` where
        `num_predictions` is the maximum number of tokens to mask out and
        predict per sequence.

    Returns:
      Masked-out sequence tensor of shape
      `(batch_size * num_predictions, num_hidden)`.
    """
    sequence_shape = tf.shape(sequence_tensor)
    batch_size, seq_length = sequence_shape[0], sequence_shape[1]
    width = sequence_tensor.shape.as_list()[2] or sequence_shape[2]

    flat_offsets = tf.reshape(
        tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1])
    flat_positions = tf.reshape(positions + flat_offsets, [-1])
    flat_sequence_tensor = tf.reshape(sequence_tensor,
                                      [batch_size * seq_length, width])
    output_tensor = tf.gather(flat_sequence_tensor, flat_positions)

    return output_tensor
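A quick sketch of the index flattening in `_gather_indexes`, with toy values made up for illustration:

import tensorflow as tf

# Batch of 2 sequences of length 4, hidden width 3.
seq = tf.reshape(tf.range(2 * 4 * 3, dtype=tf.float32), [2, 4, 3])
positions = tf.constant([[0, 2], [1, 3]])          # masked positions per example
offsets = tf.reshape(tf.range(0, 2) * 4, [-1, 1])  # [[0], [4]]: per-row offsets
flat_positions = tf.reshape(positions + offsets, [-1])  # [0, 2, 5, 7]
gathered = tf.gather(tf.reshape(seq, [8, 3]), flat_positions)  # [4, 3]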
official/nlp/modeling/layers/mobile_bert_layers_test.py (new file, mode 100644), view file @ f16a7b5b
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from absl.testing import parameterized
import numpy as np
import tensorflow as tf

from official.nlp.modeling.layers import mobile_bert_layers
from official.nlp.modeling.networks import mobile_bert_encoder


def generate_fake_input(batch_size=1, seq_len=5, vocab_size=10000, seed=0):
  """Generates consistent fake integer input sequences."""
  np.random.seed(seed)
  fake_input = []
  for _ in range(batch_size):
    fake_input.append([])
    for _ in range(seq_len):
      fake_input[-1].append(np.random.randint(0, vocab_size))
  fake_input = np.asarray(fake_input)
  return fake_input


class MobileBertEncoderTest(parameterized.TestCase, tf.test.TestCase):

  def test_embedding_layer_with_token_type(self):
    layer = mobile_bert_layers.MobileBertEmbedding(10, 8, 2, 16)
    input_seq = tf.Variable([[2, 3, 4, 5]])
    token_type = tf.Variable([[0, 1, 1, 1]])
    output = layer(input_seq, token_type)
    output_shape = output.shape.as_list()
    expected_shape = [1, 4, 16]
    self.assertListEqual(output_shape, expected_shape, msg=None)

  def test_embedding_layer_without_token_type(self):
    layer = mobile_bert_layers.MobileBertEmbedding(10, 8, 2, 16)
    input_seq = tf.Variable([[2, 3, 4, 5]])
    output = layer(input_seq)
    output_shape = output.shape.as_list()
    expected_shape = [1, 4, 16]
    self.assertListEqual(output_shape, expected_shape, msg=None)

  def test_embedding_layer_get_config(self):
    layer = mobile_bert_layers.MobileBertEmbedding(
        word_vocab_size=16,
        word_embed_size=32,
        type_vocab_size=4,
        output_embed_size=32,
        max_sequence_length=32,
        normalization_type='layer_norm',
        initializer=tf.keras.initializers.TruncatedNormal(stddev=0.01),
        dropout_rate=0.5)
    layer_config = layer.get_config()
    new_layer = mobile_bert_layers.MobileBertEmbedding.from_config(
        layer_config)
    self.assertEqual(layer_config, new_layer.get_config())

  def test_no_norm(self):
    layer = mobile_bert_layers.NoNorm()
    feature = tf.random.normal([2, 3, 4])
    output = layer(feature)
    output_shape = output.shape.as_list()
    expected_shape = [2, 3, 4]
    self.assertListEqual(output_shape, expected_shape, msg=None)

  @parameterized.named_parameters(('with_kq_shared_bottleneck', True),
                                  ('without_kq_shared_bottleneck', False))
  def test_transformer_kq_shared_bottleneck(self, is_kq_shared):
    feature = tf.random.uniform([2, 3, 512])
    layer = mobile_bert_layers.MobileBertTransformer(
        key_query_shared_bottleneck=is_kq_shared)
    output = layer(feature)
    output_shape = output.shape.as_list()
    expected_shape = [2, 3, 512]
    self.assertListEqual(output_shape, expected_shape, msg=None)

  def test_transformer_with_mask(self):
    feature = tf.random.uniform([2, 3, 512])
    input_mask = [[[0., 0., 1.], [0., 0., 1.], [0., 0., 1.]],
                  [[0., 1., 1.], [0., 1., 1.], [0., 1., 1.]]]
    input_mask = np.asarray(input_mask)
    layer = mobile_bert_layers.MobileBertTransformer()
    output = layer(feature, input_mask)
    output_shape = output.shape.as_list()
    expected_shape = [2, 3, 512]
    self.assertListEqual(output_shape, expected_shape, msg=None)

  def test_transformer_return_attention_score(self):
    sequence_length = 5
    num_attention_heads = 8
    feature = tf.random.uniform([2, sequence_length, 512])
    layer = mobile_bert_layers.MobileBertTransformer(
        num_attention_heads=num_attention_heads)
    _, attention_score = layer(feature, return_attention_scores=True)
    expected_shape = [2, num_attention_heads, sequence_length,
                      sequence_length]
    self.assertListEqual(attention_score.shape.as_list(), expected_shape,
                         msg=None)

  def test_transformer_get_config(self):
    layer = mobile_bert_layers.MobileBertTransformer(
        hidden_size=32,
        num_attention_heads=2,
        intermediate_size=48,
        intermediate_act_fn='gelu',
        hidden_dropout_prob=0.5,
        attention_probs_dropout_prob=0.4,
        intra_bottleneck_size=64,
        use_bottleneck_attention=True,
        key_query_shared_bottleneck=False,
        num_feedforward_networks=2,
        normalization_type='layer_norm',
        initializer=tf.keras.initializers.TruncatedNormal(stddev=0.01),
        name='block')
    layer_config = layer.get_config()
    new_layer = mobile_bert_layers.MobileBertTransformer.from_config(
        layer_config)
    self.assertEqual(layer_config, new_layer.get_config())


class MobileBertMaskedLMTest(tf.test.TestCase):

  def create_layer(self,
                   vocab_size,
                   hidden_size,
                   embedding_width,
                   output='predictions',
                   xformer_stack=None):
    # First, create a transformer stack that we can use to get the LM's
    # vocabulary weight.
    if xformer_stack is None:
      xformer_stack = mobile_bert_encoder.MobileBERTEncoder(
          word_vocab_size=vocab_size,
          num_blocks=1,
          hidden_size=hidden_size,
          num_attention_heads=4,
          word_embed_size=embedding_width)

    # Create a maskedLM from the transformer stack.
    test_layer = mobile_bert_layers.MobileBertMaskedLM(
        embedding_table=xformer_stack.get_embedding_table(),
        output=output)
    return test_layer

  def test_layer_creation(self):
    vocab_size = 100
    sequence_length = 32
    hidden_size = 64
    embedding_width = 32
    num_predictions = 21
    test_layer = self.create_layer(
        vocab_size=vocab_size,
        hidden_size=hidden_size,
        embedding_width=embedding_width)

    # Make sure that the output tensor of the masked LM is the right shape.
    lm_input_tensor = tf.keras.Input(shape=(sequence_length, hidden_size))
    masked_positions = tf.keras.Input(shape=(num_predictions,),
                                      dtype=tf.int32)
    output = test_layer(lm_input_tensor, masked_positions=masked_positions)

    expected_output_shape = [None, num_predictions, vocab_size]
    self.assertEqual(expected_output_shape, output.shape.as_list())

  def test_layer_invocation_with_external_logits(self):
    vocab_size = 100
    sequence_length = 32
    hidden_size = 64
    embedding_width = 32
    num_predictions = 21
    xformer_stack = mobile_bert_encoder.MobileBERTEncoder(
        word_vocab_size=vocab_size,
        num_blocks=1,
        hidden_size=hidden_size,
        num_attention_heads=4,
        word_embed_size=embedding_width)
    test_layer = self.create_layer(
        vocab_size=vocab_size,
        hidden_size=hidden_size,
        embedding_width=embedding_width,
        xformer_stack=xformer_stack,
        output='predictions')
    logit_layer = self.create_layer(
        vocab_size=vocab_size,
        hidden_size=hidden_size,
        embedding_width=embedding_width,
        xformer_stack=xformer_stack,
        output='logits')

    # Create a model from the masked LM layer.
    lm_input_tensor = tf.keras.Input(shape=(sequence_length, hidden_size))
    masked_positions = tf.keras.Input(shape=(num_predictions,),
                                      dtype=tf.int32)
    output = test_layer(lm_input_tensor, masked_positions)
    logit_output = logit_layer(lm_input_tensor, masked_positions)
    logit_output = tf.keras.layers.Activation(tf.nn.log_softmax)(logit_output)
    logit_layer.set_weights(test_layer.get_weights())
    model = tf.keras.Model([lm_input_tensor, masked_positions], output)
    logits_model = tf.keras.Model(([lm_input_tensor, masked_positions]),
                                  logit_output)

    # Invoke the masked LM on some fake data to make sure there are no
    # runtime errors in the code.
    batch_size = 3
    lm_input_data = 10 * np.random.random_sample(
        (batch_size, sequence_length, hidden_size))
    masked_position_data = np.random.randint(
        sequence_length, size=(batch_size, num_predictions))
    # ref_outputs = model.predict([lm_input_data, masked_position_data])
    # outputs = logits_model.predict([lm_input_data, masked_position_data])
    ref_outputs = model([lm_input_data, masked_position_data])
    outputs = logits_model([lm_input_data, masked_position_data])

    # Ensure that the tensor shapes are correct.
    expected_output_shape = (batch_size, num_predictions, vocab_size)
    self.assertEqual(expected_output_shape, ref_outputs.shape)
    self.assertEqual(expected_output_shape, outputs.shape)
    self.assertAllClose(ref_outputs, outputs)

  def test_layer_invocation(self):
    vocab_size = 100
    sequence_length = 32
    hidden_size = 64
    embedding_width = 32
    num_predictions = 21
    test_layer = self.create_layer(
        vocab_size=vocab_size,
        hidden_size=hidden_size,
        embedding_width=embedding_width)

    # Create a model from the masked LM layer.
    lm_input_tensor = tf.keras.Input(shape=(sequence_length, hidden_size))
    masked_positions = tf.keras.Input(shape=(num_predictions,),
                                      dtype=tf.int32)
    output = test_layer(lm_input_tensor, masked_positions)
    model = tf.keras.Model([lm_input_tensor, masked_positions], output)

    # Invoke the masked LM on some fake data to make sure there are no
    # runtime errors in the code.
    batch_size = 3
    lm_input_data = 10 * np.random.random_sample(
        (batch_size, sequence_length, hidden_size))
    masked_position_data = np.random.randint(
        2, size=(batch_size, num_predictions))
    _ = model.predict([lm_input_data, masked_position_data])

  def test_unknown_output_type_fails(self):
    with self.assertRaisesRegex(ValueError,
                                'Unknown `output` value "bad".*'):
      _ = self.create_layer(
          vocab_size=8, hidden_size=8, embedding_width=4, output='bad')

  def test_hidden_size_smaller_than_embedding_width(self):
    hidden_size = 8
    sequence_length = 32
    num_predictions = 20
    with self.assertRaisesRegex(
        ValueError,
        'hidden size 8 cannot be smaller than embedding width 16.'):
      test_layer = self.create_layer(
          vocab_size=8, hidden_size=8, embedding_width=16)
      lm_input_tensor = tf.keras.Input(shape=(sequence_length, hidden_size))
      masked_positions = tf.keras.Input(shape=(num_predictions,),
                                        dtype=tf.int32)
      _ = test_layer(lm_input_tensor, masked_positions)


if __name__ == '__main__':
  tf.test.main()
official/nlp/modeling/layers/multi_channel_attention.py, view file @ f16a7b5b
-# Lint as: python3
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
...
@@ -12,29 +11,23 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ==============================================================================
 """Multi-channel Attention."""
 # pylint: disable=g-classes-have-attributes
-from __future__ import absolute_import
-from __future__ import division
-# from __future__ import google_type_annotations
-from __future__ import print_function
 import math
 import tensorflow as tf
 from official.modeling import tf_utils
-from official.nlp.modeling.layers import attention
 from official.nlp.modeling.layers import masked_softmax


 class VotingAttention(tf.keras.layers.Layer):
   """Voting Attention layer.

-  Arguments:
-    num_heads: the number of attention heads.
-    head_size: per-head hidden size.
+  Args:
+    num_heads: The number of attention heads.
+    head_size: Per-head hidden size.
     kernel_initializer: Initializer for dense layer kernels.
     bias_initializer: Initializer for dense layer biases.
     kernel_regularizer: Regularizer for dense layer kernels.
...
@@ -107,43 +100,61 @@ class VotingAttention(tf.keras.layers.Layer):
     return tf.nn.softmax(doc_attention_probs + infadder)


-class MultiChannelAttention(attention.MultiHeadAttention):
+class MultiChannelAttention(tf.keras.layers.MultiHeadAttention):
   """Multi-channel Attention layer.

-  Introduced in: https://arxiv.org/abs/2001.09386. Expects multiple
-  cross-attention target sequences.
+  Introduced in [Generating Representative Headlines for News Stories](
+  https://arxiv.org/abs/2001.09386). Expects multiple cross-attention
+  target sequences.
+
+  Call args:
+    query: Query `Tensor` of shape `[B, T, dim]`.
+    value: Value `Tensor` of shape `[B, A, S, dim]`, where A denotes the
+      number of documents.
+    context_attention_weights: Context weights of shape `[B, N, T, A]`, where
+      N is the number of attention heads. Combines the context tensors from
+      multi-channel sources according to the distribution among channels.
+    key: Optional key `Tensor` of shape `[B, A, S, dim]`. If not given, will
+      use `value` for both `key` and `value`, which is the most common case.
+    attention_mask: A boolean mask of shape `[B, T, S]`, that prevents
+      attention to certain positions.
   """

-  def _build_attention(self, qkv_rank):
-    super(MultiChannelAttention, self)._build_attention(qkv_rank)
+  def _build_attention(self, rank):
+    super(MultiChannelAttention, self)._build_attention(rank)
     self._masked_softmax = masked_softmax.MaskedSoftmax(
         mask_expansion_axes=[2])

-  def call(self, inputs, attention_mask=None):
-    from_tensor = inputs[0]
-    to_tensor = inputs[1]
-    doc_attention_probs = inputs[2]
+  def call(self,
+           query,
+           value,
+           key=None,
+           context_attention_weights=None,
+           attention_mask=None):
+    if not self._built_from_signature:
+      self._build_from_signature(query, value, key=key)
+    if key is None:
+      key = value

     # Scalar dimensions referenced here:
     # B = batch size (number of stories)
     # A = num_docs (number of docs)
-    # F = `from_tensor` sequence length
-    # T = `to_tensor` sequence length
+    # F = target sequence length
+    # T = source sequence length
     # N = `num_attention_heads`
     # H = `size_per_head`
     # `query_tensor` = [B, F, N, H]
-    query_tensor = self._query_dense(from_tensor)
+    query_tensor = self._query_dense(query)

     # `key_tensor` = [B, A, T, N, H]
-    key_tensor = self._key_dense(to_tensor)
+    key_tensor = self._key_dense(key)

     # `value_tensor` = [B, A, T, N, H]
-    value_tensor = self._value_dense(to_tensor)
+    value_tensor = self._value_dense(value)

     # Take the dot product between "query" and "key" to get the raw
     # attention scores.
     attention_scores = tf.einsum("BATNH,BFNH->BANFT", key_tensor, query_tensor)
     attention_scores = tf.multiply(attention_scores,
-                                   1.0 / math.sqrt(float(self._key_size)))
+                                   1.0 / math.sqrt(float(self._key_dim)))

     # Normalize the attention scores to probabilities.
     # `attention_probs` = [B, A, N, F, T]
...
@@ -156,7 +167,7 @@ class MultiChannelAttention(attention.MultiHeadAttention):
     # `context_layer` = [B, F, N, H]
     context_layer = tf.einsum("BANFT,BATNH->BAFNH", attention_probs,
                               value_tensor)
-    attention_output = tf.einsum("BNFA,BAFNH->BFNH", doc_attention_probs,
+    attention_output = tf.einsum("BNFA,BAFNH->BFNH", context_attention_weights,
                                  context_layer)
     attention_output = self._output_dense(attention_output)
     return attention_output
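Under the new Keras-style signature, a call sketch using the shape conventions from the comments above (toy sizes, and assuming the module is imported as `multi_channel_attention`, as in the updated test below):

import numpy as np

# B=3 stories, A=5 docs, T=4 target tokens, S=2 source tokens, dim=8, N=2 heads.
layer = multi_channel_attention.MultiChannelAttention(num_heads=2, key_dim=2)
query = np.random.random_sample((3, 4, 8)).astype('float32')
value = np.random.random_sample((3, 5, 2, 8)).astype('float32')
weights = np.random.random_sample((3, 2, 4, 5)).astype('float32')  # [B, N, T, A]
out = layer(query=query, value=value, context_attention_weights=weights)
# out: (3, 4, 8); the per-head doc distributions mix the A channels into one.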
official/nlp/modeling/layers/multi_channel_attention_test.py, view file @ f16a7b5b
-# Lint as: python3
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
...
@@ -12,12 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ==============================================================================
 """Tests for nlp.nhnet.multi_channel_attention."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
 import numpy as np
 import tensorflow as tf
...
@@ -41,14 +36,18 @@ class MultiChannelAttentionTest(tf.test.TestCase):
     num_heads = 2
     num_docs = 5
     attention_layer = multi_channel_attention.MultiChannelAttention(
-        num_heads, key_size=2)
+        num_heads, key_dim=2)
     from_data = 10 * np.random.random_sample((3, 4, 8))
     to_data = 10 * np.random.random_sample((3, num_docs, 2, 8))
     mask_data = np.random.randint(2, size=(3, num_docs, 4, 2))
     doc_probs = np.random.randint(
         2, size=(3, num_heads, 4, num_docs)).astype(float)
-    outputs = attention_layer([from_data, to_data, doc_probs], mask_data)
+    outputs = attention_layer(
+        query=from_data,
+        value=to_data,
+        context_attention_weights=doc_probs,
+        attention_mask=mask_data)
     self.assertEqual(outputs.shape, (3, 4, 8))
...
official/nlp/modeling/layers/on_device_embedding.py, view file @ f16a7b5b
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
...
@@ -11,78 +11,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ==============================================================================
 """Keras-based one-hot embedding layer."""
 # pylint: disable=g-classes-have-attributes
-from __future__ import absolute_import
-from __future__ import division
-# from __future__ import google_type_annotations
-from __future__ import print_function
-
-import tensorflow as tf
-
-
-@tf.keras.utils.register_keras_serializable(package="Text")
-class OnDeviceEmbedding(tf.keras.layers.Layer):
-  """Performs an embedding lookup suitable for accelerator devices.
-
-  This layer uses either tf.gather or tf.one_hot to translate integer indices
-  to float embeddings.
-
-  Arguments:
-    vocab_size: Number of elements in the vocabulary.
-    embedding_width: Output size of the embedding layer.
-    initializer: The initializer to use for the embedding weights. Defaults to
-      "glorot_uniform".
-    use_one_hot: Whether to use tf.one_hot over tf.gather for the embedding
-      lookup. Defaults to False (that is, using tf.gather). Setting this
-      option to True may improve performance, especially on small vocabulary
-      sizes, but will generally require more memory.
-  """
-
-  def __init__(self,
-               vocab_size,
-               embedding_width,
-               initializer="glorot_uniform",
-               use_one_hot=False,
-               **kwargs):
-    super(OnDeviceEmbedding, self).__init__(**kwargs)
-    self._vocab_size = vocab_size
-    self._embedding_width = embedding_width
-    self._initializer = initializer
-    self._use_one_hot = use_one_hot
-
-  def get_config(self):
-    config = {
-        "vocab_size": self._vocab_size,
-        "embedding_width": self._embedding_width,
-        "initializer": self._initializer,
-        "use_one_hot": self._use_one_hot,
-    }
-    base_config = super(OnDeviceEmbedding, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
-
-  def build(self, input_shape):
-    self.embeddings = self.add_weight(
-        "embeddings",
-        shape=[self._vocab_size, self._embedding_width],
-        initializer=self._initializer,
-        dtype=tf.float32)
-
-    super(OnDeviceEmbedding, self).build(input_shape)
-
-  def call(self, inputs):
-    flat_inputs = tf.reshape(inputs, [-1])
-    if self._use_one_hot:
-      one_hot_data = tf.one_hot(
-          flat_inputs, depth=self._vocab_size, dtype=self.embeddings.dtype)
-      embeddings = tf.matmul(one_hot_data, self.embeddings)
-    else:
-      embeddings = tf.gather(self.embeddings, flat_inputs)
-    embeddings = tf.reshape(
-        embeddings,
-        # Work around b/142213824: prefer concat to shape over a Python list.
-        tf.concat([tf.shape(inputs), [self._embedding_width]], axis=0))
-    embeddings.set_shape(inputs.shape.as_list() + [self._embedding_width])
-    return embeddings
+
+from official.nlp import keras_nlp
+
+OnDeviceEmbedding = keras_nlp.layers.OnDeviceEmbedding
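The file now simply re-exports the keras_nlp implementation, but the two lookup strategies the removed code chose between are worth keeping in mind. A small equivalence sketch (toy sizes assumed):

import tensorflow as tf

vocab, width = 10, 4
table = tf.random.normal([vocab, width])
ids = tf.reshape(tf.constant([[1, 3, 5]]), [-1])
gathered = tf.gather(table, ids)                       # tf.gather path
one_hot = tf.matmul(tf.one_hot(ids, vocab), table)     # tf.one_hot path
# Both yield the same [3, width] embeddings; the one-hot matmul can be
# faster on accelerators for small vocabularies, at a memory cost.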
official/nlp/modeling/layers/position_embedding.py, view file @ f16a7b5b
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
...
@@ -11,115 +11,17 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ==============================================================================
 """Keras-based positional embedding layer."""
 # pylint: disable=g-classes-have-attributes
-from __future__ import absolute_import
-from __future__ import division
-# from __future__ import google_type_annotations
-from __future__ import print_function

 import math
+from typing import Optional

 import tensorflow as tf

 from official.modeling import tf_utils

+Initializer = tf.keras.initializers.Initializer
+
-@tf.keras.utils.register_keras_serializable(package="Text")
-class PositionEmbedding(tf.keras.layers.Layer):
-  """Creates a positional embedding.
-
-  This layer creates a positional embedding as described in "BERT:
-  Pre-training of Deep Bidirectional Transformers for Language Understanding"
-  (https://arxiv.org/abs/1810.04805).
-
-  This layer can be set up to either create a statically shaped slice or a
-  dynamically shaped slice. If `use_dynamic_slicing` is True, the input
-  tensor can have a dynamic 1st dimension, while if `use_dynamic_slicing` is
-  False the input size must be fixed.
-
-  Arguments:
-    use_dynamic_slicing: Whether to use the dynamic slicing path.
-    max_sequence_length: The maximum size of the dynamic sequence. Only
-      applicable if `use_dynamic_slicing` is True.
-    initializer: The initializer to use for the embedding weights. Defaults
-      to "glorot_uniform".
-  """
-
-  def __init__(self,
-               initializer="glorot_uniform",
-               use_dynamic_slicing=False,
-               max_sequence_length=None,
-               **kwargs):
-    # We need to have a default dtype of float32, since the inputs (which
-    # Keras usually uses to infer the dtype) will always be int32.
-    if "dtype" not in kwargs:
-      kwargs["dtype"] = "float32"
-
-    super(PositionEmbedding, self).__init__(**kwargs)
-    if use_dynamic_slicing and max_sequence_length is None:
-      raise ValueError(
-          "If `use_dynamic_slicing` is True, `max_sequence_length` must be "
-          "set.")
-    self._max_sequence_length = max_sequence_length
-    self._initializer = tf.keras.initializers.get(initializer)
-    self._use_dynamic_slicing = use_dynamic_slicing
-
-  def get_config(self):
-    config = {
-        "max_sequence_length": self._max_sequence_length,
-        "initializer": tf.keras.initializers.serialize(self._initializer),
-        "use_dynamic_slicing": self._use_dynamic_slicing,
-    }
-    base_config = super(PositionEmbedding, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
-
-  def build(self, input_shape):
-    """Implements build() for the layer."""
-    dimension_list = input_shape.as_list()
-
-    if len(dimension_list) != 3:
-      raise ValueError("PositionEmbedding expects a 3-dimensional input "
-                       "tensor of shape [batch, sequence, width]")
-    seq_length = dimension_list[1]
-    width = dimension_list[2]
-
-    # If we are not using dynamic slicing, we must assume that the sequence
-    # length is fixed and max_sequence_length should not be specified.
-    if not self._use_dynamic_slicing:
-      if seq_length is None:
-        raise ValueError("PositionEmbedding must have `use_dynamic_slicing` "
-                         "set to True (and max_sequence_length set) when the "
-                         "sequence (1st) dimension of the input is None.")
-      if self._max_sequence_length is not None:
-        raise ValueError(
-            "When `use_dynamic_slicing` is False, max_sequence_length should "
-            "not be specified and we ought to use seq_length to get the "
-            "variable shape.")
-
-    if self._max_sequence_length is not None:
-      weight_sequence_length = self._max_sequence_length
-    else:
-      weight_sequence_length = seq_length
-
-    self._position_embeddings = self.add_weight(
-        "embeddings",
-        shape=[weight_sequence_length, width],
-        initializer=self._initializer)
-
-    super(PositionEmbedding, self).build(input_shape)
-
-  def call(self, inputs):
-    """Implements call() for the layer."""
-    input_shape = tf_utils.get_shape_list(inputs, expected_rank=3)
-    if self._use_dynamic_slicing:
-      position_embeddings = self._position_embeddings[:input_shape[1], :]
-    else:
-      position_embeddings = self._position_embeddings
-
-    return tf.broadcast_to(position_embeddings, input_shape)
-
-
 @tf.keras.utils.register_keras_serializable(package="Text")
...
@@ -131,16 +33,16 @@ class RelativePositionEmbedding(tf.keras.layers.Layer):
   "Attention is All You Need", section 3.5.
   (https://arxiv.org/abs/1706.03762).

-  Arguments:
+  Args:
     hidden_size: Size of the hidden layer.
     min_timescale: Minimum scale that will be applied at each position.
     max_timescale: Maximum scale that will be applied at each position.
   """

   def __init__(self,
-               hidden_size,
-               min_timescale=1.0,
-               max_timescale=1.0e4,
+               hidden_size: int,
+               min_timescale: float = 1.0,
+               max_timescale: float = 1.0e4,
                **kwargs):
     # We need to have a default dtype of float32, since the inputs (which Keras
     # usually uses to infer the dtype) will always be int32.
@@ -150,7 +52,7 @@ class RelativePositionEmbedding(tf.keras.layers.Layer):
     if "dtype" not in kwargs:
       kwargs["dtype"] = "float32"

-    super(RelativePositionEmbedding, self).__init__(**kwargs)
+    super().__init__(**kwargs)
     self._hidden_size = hidden_size
     self._min_timescale = min_timescale
     self._max_timescale = max_timescale
@@ -160,7 +62,6 @@ class RelativePositionEmbedding(tf.keras.layers.Layer):
         "hidden_size": self._hidden_size,
         "min_timescale": self._min_timescale,
         "max_timescale": self._max_timescale,
-        "length": self._length,
     }
     base_config = super(RelativePositionEmbedding, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
@@ -172,22 +73,20 @@ class RelativePositionEmbedding(tf.keras.layers.Layer):
       inputs: A tensor whose second dimension will be used as `length`. If
         `None`, the other `length` argument must be specified.
       length: An optional integer specifying the number of positions. If both
        `inputs` and `length` are specified, `length` must be equal to the
        second dimension of `inputs`.

     Returns:
-      A tensor in shape of [length, hidden_size].
+      A tensor in shape of `(length, hidden_size)`.
     """
     if inputs is None and length is None:
       raise ValueError("If inputs is None, `length` must be set in "
                        "RelativePositionEmbedding().")
     if inputs is not None:
       input_shape = tf_utils.get_shape_list(inputs)
       if length is not None and length != input_shape[1]:
         raise ValueError(
             "If inputs is not None, `length` must equal to input_shape[1].")
       length = input_shape[1]
     position = tf.cast(tf.range(length), tf.float32)
     num_timescales = self._hidden_size // 2
@@ -198,8 +97,141 @@ class RelativePositionEmbedding(tf.keras.layers.Layer):
     inv_timescales = min_timescale * tf.exp(
         tf.cast(tf.range(num_timescales), tf.float32) *
         -log_timescale_increment)
-    scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales,
-                                                               0)
-    position_embeddings = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)],
-                                    axis=1)
+    scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(
+        inv_timescales, 0)
+    position_embeddings = tf.concat(
+        [tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)
     return position_embeddings
+def _relative_position_bucket(relative_position,
+                              bidirectional=True,
+                              num_buckets=32,
+                              max_distance=128):
+  """Translates relative position to a bucket number for relative attention.
+
+  The relative position is defined as memory_position - query_position, i.e.
+  the distance in tokens from the attending position to the attended-to
+  position. If `bidirectional=False`, then positive relative positions are
+  invalid. We use smaller buckets for small absolute relative positions and
+  larger buckets for larger absolute relative positions. All relative
+  positions >= max_distance map to the same bucket. All relative positions
+  <= -max_distance map to the same bucket. This should allow for more
+  graceful generalization to longer sequences than the model has been
+  trained on.
+
+  Args:
+    relative_position: An int32 Tensor.
+    bidirectional: A boolean - whether the attention is bidirectional.
+    num_buckets: An integer.
+    max_distance: An integer.
+
+  Returns:
+    A Tensor with the same shape as relative_position, containing int32
+    values in the range [0, num_buckets).
+  """
+  ret = 0
+  n = -relative_position
+  if bidirectional:
+    num_buckets //= 2
+    ret += tf.cast(tf.math.less(n, 0), tf.int32) * num_buckets
+    n = tf.math.abs(n)
+  else:
+    n = tf.math.maximum(n, 0)
+  # Now n is in the range [0, inf).
+  max_exact = num_buckets // 2
+  is_small = tf.math.less(n, max_exact)
+  val_if_large = max_exact + tf.dtypes.cast(
+      tf.math.log(tf.cast(n, tf.float32) / max_exact) /
+      math.log(max_distance / max_exact) * (num_buckets - max_exact),
+      tf.int32,
+  )
+  val_if_large = tf.math.minimum(val_if_large, num_buckets - 1)
+  ret += tf.where(is_small, n, val_if_large)
+  return ret
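To make the bucketing concrete, a small sketch under the defaults (num_buckets=32, max_distance=128, bidirectional=True); the input distances are arbitrary:

import tensorflow as tf

# Distances -1, 0, +1 and a long-range -100.
rel = tf.constant([[-1, 0, 1, -100]])
buckets = _relative_position_bucket(rel)
# Small |distance| values land in exact per-distance buckets; large ones
# share coarser, logarithmically spaced buckets, and the sign selects
# between the two halves of the bucket range when bidirectional=True.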
+@tf.keras.utils.register_keras_serializable(package="Text")
+class RelativePositionBias(tf.keras.layers.Layer):
+  """Relative position embedding via per-head bias in T5 style.
+
+  Reference implementation in MeshTF:
+  https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/transformer/transformer_layers.py#L1000
+
+  This layer implements the relative position bias used in "Exploring the
+  Limits of Transfer Learning with a Unified Text-to-Text Transformer"
+  (https://arxiv.org/abs/1910.10683)
+  """
+
+  def __init__(self,
+               num_heads: int,
+               relative_attention_num_buckets: int = 32,
+               relative_attention_max_distance: int = 128,
+               bidirectional: bool = True,
+               embeddings_initializer: Optional[Initializer] = None,
+               **kwargs):
+    super().__init__(**kwargs)
+    self.num_heads = num_heads
+    self.relative_attention_num_buckets = relative_attention_num_buckets
+    self.bidirectional = bidirectional
+    self.relative_attention_max_distance = relative_attention_max_distance
+    if embeddings_initializer:
+      self._embed_init = embeddings_initializer
+    else:
+      self._embed_init = tf.keras.initializers.TruncatedNormal(stddev=1.0)
+    with tf.name_scope(self.name):
+      self._relative_attention_bias = self.add_weight(
+          "rel_embedding",
+          shape=[self.relative_attention_num_buckets, self.num_heads],
+          initializer=self._embed_init,
+          dtype=self.dtype,
+          trainable=True)
+
+  def get_config(self):
+    config = {
+        "num_heads": self.num_heads,
+        "relative_attention_num_buckets":
+            self.relative_attention_num_buckets,
+        "relative_attention_max_distance":
+            self.relative_attention_max_distance,
+        "bidirectional": self.bidirectional,
+        "embeddings_initializer":
+            tf.keras.initializers.serialize(self._embed_init),
+    }
+    base_config = super().get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+  def call(self, query: tf.Tensor, key: tf.Tensor):
+    """Implements the forward pass.
+
+    Args:
+      query: query input tensor shape [batch, query length, hidden size].
+      key: key input tensor shape [batch, key length, hidden size].
+
+    Returns:
+      A tensor in shape of [batch, heads, query length, key length].
+    """
+    batch_size, qlen = tf_utils.get_shape_list(query)[:2]
+    klen = tf_utils.get_shape_list(key)[1]
+    context_position = tf.range(qlen)[:, None]
+    memory_position = tf.range(klen)[None, :]
+    relative_position = memory_position - context_position
+    rp_bucket = _relative_position_bucket(
+        relative_position,
+        bidirectional=self.bidirectional,
+        num_buckets=self.relative_attention_num_buckets,
+        max_distance=self.relative_attention_max_distance)
+    values = tf.nn.embedding_lookup(self._relative_attention_bias, rp_bucket)
+    values = tf.expand_dims(
+        tf.transpose(values, [2, 0, 1]),
+        axis=0)  # shape (1, num_heads, qlen, klen)
+    values = tf.tile(values, [batch_size, 1, 1, 1])
+    return values
official/nlp/modeling/layers/position_embedding_test.py
View file @
f16a7b5b
# Copyright 201
9
The TensorFlow Authors. All Rights Reserved.
# Copyright 20
2
1 The TensorFlow Authors. All Rights Reserved.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# you may not use this file except in compliance with the License.
...
@@ -11,13 +11,10 @@
...
@@ -11,13 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# See the License for the specific language governing permissions and
# limitations under the License.
# limitations under the License.
# ==============================================================================
"""Tests for Keras-based positional embedding layer."""
from
__future__
import
absolute_import
"""Tests for Keras-based positional embedding layer."""
from
__future__
import
division
from
__future__
import
print_function
from
absl.testing
import
parameterized
import
numpy
as
np
import
numpy
as
np
import
tensorflow
as
tf
import
tensorflow
as
tf
...
@@ -28,75 +25,7 @@ from official.nlp.modeling.layers import position_embedding
...
@@ -28,75 +25,7 @@ from official.nlp.modeling.layers import position_embedding
# This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
# This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
# guarantees forward compatibility of this code for the V2 switchover.
# guarantees forward compatibility of this code for the V2 switchover.
@
keras_parameterized
.
run_all_keras_modes
@
keras_parameterized
.
run_all_keras_modes
class
PositionEmbeddingLayerTest
(
keras_parameterized
.
TestCase
):
class
RelativePositionEmbeddingLayerTest
(
keras_parameterized
.
TestCase
):
def
test_static_layer_output_shape
(
self
):
test_layer
=
position_embedding
.
PositionEmbedding
()
# Create a 3-dimensional input (the first dimension is implicit).
sequence_length
=
21
width
=
30
input_tensor
=
tf
.
keras
.
Input
(
shape
=
(
sequence_length
,
width
))
output_tensor
=
test_layer
(
input_tensor
)
# When using static positional embedding shapes, the output is expected
# to be the same as the input shape in all dimensions save batch.
expected_output_shape
=
[
None
,
sequence_length
,
width
]
self
.
assertEqual
(
expected_output_shape
,
output_tensor
.
shape
.
as_list
())
# The default output dtype for this layer should be tf.float32.
self
.
assertEqual
(
tf
.
float32
,
output_tensor
.
dtype
)
def
test_float16_dtype
(
self
):
test_layer
=
position_embedding
.
PositionEmbedding
(
dtype
=
"float16"
)
# Create a 3-dimensional input (the first dimension is implicit).
sequence_length
=
21
width
=
30
input_tensor
=
tf
.
keras
.
Input
(
shape
=
(
sequence_length
,
width
))
output_tensor
=
test_layer
(
input_tensor
)
# When using static positional embedding shapes, the output is expected
# to be the same as the input shape in all dimensions save batch.
expected_output_shape
=
[
None
,
sequence_length
,
width
]
self
.
assertEqual
(
expected_output_shape
,
output_tensor
.
shape
.
as_list
())
# The default output dtype for this layer should be tf.float32.
self
.
assertEqual
(
tf
.
float16
,
output_tensor
.
dtype
)
def
test_dynamic_layer_output_shape
(
self
):
max_sequence_length
=
40
test_layer
=
position_embedding
.
PositionEmbedding
(
use_dynamic_slicing
=
True
,
max_sequence_length
=
max_sequence_length
)
# Create a 3-dimensional input (the first dimension is implicit).
width
=
30
input_tensor
=
tf
.
keras
.
Input
(
shape
=
(
None
,
width
))
output_tensor
=
test_layer
(
input_tensor
)
# When using dynamic positional embedding shapes, the output is expected
# to be the same as the input shape in all dimensions - but may be None if
# the input shape is None there.
expected_output_shape
=
[
None
,
None
,
width
]
self
.
assertEqual
(
expected_output_shape
,
output_tensor
.
shape
.
as_list
())
def
test_dynamic_layer_slicing
(
self
):
max_sequence_length
=
40
test_layer
=
position_embedding
.
PositionEmbedding
(
use_dynamic_slicing
=
True
,
max_sequence_length
=
max_sequence_length
)
# Create a 3-dimensional input (the first dimension is implicit).
width
=
30
input_tensor
=
tf
.
keras
.
Input
(
shape
=
(
None
,
width
))
output_tensor
=
test_layer
(
input_tensor
)
model
=
tf
.
keras
.
Model
(
input_tensor
,
output_tensor
)
# Create input data that is shorter than max_sequence_length, which should
# trigger a down-slice.
input_length
=
17
# Note: This test explicitly uses a batch size of 1. This is to get around
# Keras' restriction on Model invocations: inputs are expected to have the
# same batch cardinality as outputs. In practice, this layer should be used
# inside a model, where it can be projected when added to another tensor.
input_data
=
np
.
ones
((
1
,
input_length
,
width
))
output_data
=
model
.
predict
(
input_data
)
self
.
assertAllEqual
([
1
,
input_length
,
width
],
output_data
.
shape
)
def
test_relative_tensor_input
(
self
):
def
test_relative_tensor_input
(
self
):
hidden_size
=
8
hidden_size
=
8
...
@@ -127,5 +56,33 @@ class PositionEmbeddingLayerTest(keras_parameterized.TestCase):
...
@@ -127,5 +56,33 @@ class PositionEmbeddingLayerTest(keras_parameterized.TestCase):
expected_output_tensor
=
tf
.
constant
([[
0
,
0
,
0
,
0
,
1
,
1
,
1
,
1
]])
expected_output_tensor
=
tf
.
constant
([[
0
,
0
,
0
,
0
,
1
,
1
,
1
,
1
]])
self
.
assertAllEqual
(
output_tensor
,
expected_output_tensor
)
self
.
assertAllEqual
(
output_tensor
,
expected_output_tensor
)
@
keras_parameterized
.
run_all_keras_modes
class
RelativePositionBiasTest
(
keras_parameterized
.
TestCase
):
@
parameterized
.
named_parameters
((
"bidirectional"
,
True
),
(
"unidirectional"
,
False
))
def
test_relative_position_bias
(
self
,
bidirectional
):
query
=
tf
.
zeros
((
4
,
4
,
2
))
key
=
tf
.
zeros
((
4
,
2
,
2
))
l
=
position_embedding
.
RelativePositionBias
(
num_heads
=
3
,
bidirectional
=
bidirectional
,
name
=
"foo"
)
self
.
assertEqual
(
l
(
query
,
key
).
shape
,
(
4
,
3
,
4
,
2
))
self
.
assertLen
(
l
.
trainable_variables
,
1
)
self
.
assertEqual
(
l
.
trainable_variables
[
0
].
name
,
"foo/rel_embedding:0"
)
def
test_relative_position_bucket
(
self
):
context_position
=
tf
.
range
(
3
)[:,
None
]
memory_position
=
tf
.
range
(
2
)[
None
,
:]
relative_position
=
memory_position
-
context_position
outputs
=
position_embedding
.
_relative_position_bucket
(
relative_position
)
self
.
assertAllEqual
(
outputs
.
numpy
(),
np
.
array
([[
0
,
17
],
[
1
,
0
],
[
2
,
1
]]))
outputs
=
position_embedding
.
_relative_position_bucket
(
relative_position
,
bidirectional
=
False
)
self
.
assertAllEqual
(
outputs
.
numpy
(),
np
.
array
([[
0
,
0
],
[
1
,
0
],
[
2
,
1
]]))
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
tf
.
test
.
main
()
tf
.
test
.
main
()
official/nlp/modeling/layers/relative_attention.py
0 → 100644
View file @
f16a7b5b
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Keras-based relative attention layers."""
import
math
import
string
import
tensorflow
as
tf
_CHR_IDX
=
string
.
ascii_lowercase
def
_build_proj_equation
(
free_dims
,
bound_dims
,
output_dims
):
"""Builds an einsum equation for projections inside multi-head attention."""
input_str
=
""
kernel_str
=
""
output_str
=
""
bias_axes
=
""
letter_offset
=
0
for
i
in
range
(
free_dims
):
char
=
_CHR_IDX
[
i
+
letter_offset
]
input_str
+=
char
output_str
+=
char
letter_offset
+=
free_dims
for
i
in
range
(
bound_dims
):
char
=
_CHR_IDX
[
i
+
letter_offset
]
input_str
+=
char
kernel_str
+=
char
letter_offset
+=
bound_dims
for
i
in
range
(
output_dims
):
char
=
_CHR_IDX
[
i
+
letter_offset
]
kernel_str
+=
char
output_str
+=
char
bias_axes
+=
char
equation
=
"%s,%s->%s"
%
(
input_str
,
kernel_str
,
output_str
)
return
equation
,
bias_axes
,
len
(
output_str
)
def
_get_output_shape
(
output_rank
,
known_last_dims
):
return
[
None
]
*
(
output_rank
-
len
(
known_last_dims
))
+
list
(
known_last_dims
)
def
_rel_shift
(
x
,
klen
=-
1
):
"""Performs relative shift to form the relative attention score."""
x
=
tf
.
transpose
(
x
,
perm
=
[
2
,
3
,
0
,
1
])
x_size
=
tf
.
shape
(
x
)
x
=
tf
.
reshape
(
x
,
[
x_size
[
1
],
x_size
[
0
],
x_size
[
2
],
x_size
[
3
]])
x
=
tf
.
slice
(
x
,
[
1
,
0
,
0
,
0
],
[
-
1
,
-
1
,
-
1
,
-
1
])
x
=
tf
.
reshape
(
x
,
[
x_size
[
0
],
x_size
[
1
]
-
1
,
x_size
[
2
],
x_size
[
3
]])
x
=
tf
.
slice
(
x
,
[
0
,
0
,
0
,
0
],
[
-
1
,
klen
,
-
1
,
-
1
])
x
=
tf
.
transpose
(
x
,
perm
=
[
2
,
3
,
0
,
1
])
return
x
@
tf
.
keras
.
utils
.
register_keras_serializable
(
package
=
"Text"
)
class
MultiHeadRelativeAttention
(
tf
.
keras
.
layers
.
MultiHeadAttention
):
"""A multi-head attention layer with relative attention + position encoding.
This layer shares the same input/output projections as the common
`tf.keras.layers.MultiHeadAttention` layer.
When it calculates attention logits, position encoding is projected to form
relative keys. The logits are composed by shifted relative logits and content
logits.
**Note: This layer is currently experimental.
Attributes:
kernel_initializer: The kernel initializer. Defaults to variance_scaling.
Call args:
query: Query `Tensor` of shape `[B, T, dim]`.
value: Value `Tensor` of shape `[B, S, dim]`.
content_attention_bias: Bias `Tensor` for content based attention of shape
`[num_heads, dim]`.
positional_attention_bias: Bias `Tensor` for position based attention of
shape `[num_heads, dim]`.
key: Optional key `Tensor` of shape `[B, S, dim]`. If not given, will use
`value` for both `key` and `value`, which is the most common case.
relative_position_encoding: Relative positional encoding `Tensor` of shape
`[B, L, dim]`.
segment_matrix: Optional `Tensor` representing segmentation IDs used in
XLNet of shape `[B, S, S + M]`.
segment_encoding: Optional `Tensor` representing the segmentation
encoding as used in XLNet of shape `[2, num_heads, dim]`.
segment_attention_bias: Optional trainable bias parameter added to the
query had when calculating the segment-based attention score used in
XLNet of shape `[num_heads, dim]`.
state: Optional `Tensor` of shape `[B, M, E]` where M is the length of the
state or memory.
If passed, this is also attended over as in Transformer XL.
attention_mask: A boolean mask of shape `[B, T, S]` that prevents attention
to certain positions.
"""
def
__init__
(
self
,
kernel_initializer
=
"variance_scaling"
,
**
kwargs
):
super
().
__init__
(
kernel_initializer
=
kernel_initializer
,
**
kwargs
)
def
_build_from_signature
(
self
,
query
,
value
,
key
=
None
):
super
(
MultiHeadRelativeAttention
,
self
).
_build_from_signature
(
query
=
query
,
value
=
value
,
key
=
key
)
if
hasattr
(
value
,
"shape"
):
value_shape
=
tf
.
TensorShape
(
value
.
shape
)
else
:
value_shape
=
value
if
key
is
None
:
key_shape
=
value_shape
elif
hasattr
(
key
,
"shape"
):
key_shape
=
tf
.
TensorShape
(
key
.
shape
)
else
:
key_shape
=
key
common_kwargs
=
dict
(
kernel_initializer
=
self
.
_kernel_initializer
,
bias_initializer
=
self
.
_bias_initializer
,
kernel_regularizer
=
self
.
_kernel_regularizer
,
bias_regularizer
=
self
.
_bias_regularizer
,
activity_regularizer
=
self
.
_activity_regularizer
,
kernel_constraint
=
self
.
_kernel_constraint
,
bias_constraint
=
self
.
_bias_constraint
)
with
tf
.
init_scope
():
einsum_equation
,
_
,
output_rank
=
_build_proj_equation
(
key_shape
.
rank
-
1
,
bound_dims
=
1
,
output_dims
=
2
)
self
.
_encoding_dense
=
tf
.
keras
.
layers
.
experimental
.
EinsumDense
(
einsum_equation
,
output_shape
=
_get_output_shape
(
output_rank
-
1
,
[
self
.
_num_heads
,
self
.
_key_dim
]),
bias_axes
=
None
,
name
=
"encoding"
,
**
common_kwargs
)
def
compute_attention
(
self
,
query
,
key
,
value
,
position
,
content_attention_bias
,
positional_attention_bias
,
segment_matrix
=
None
,
segment_encoding
=
None
,
segment_attention_bias
=
None
,
attention_mask
=
None
):
"""Computes the attention.
This function defines the computation inside `call` with projected
multihead Q, K, V, R inputs.
Args:
query: Projected query `Tensor` of shape `[B, T, N, key_dim]`.
key: Projected key `Tensor` of shape `[B, S + M, N, key_dim]`.
value: Projected value `Tensor` of shape `[B, S + M, N, key_dim]`.
position: Projected position `Tensor` of shape `[B, L, N, key_dim]`.
content_attention_bias: Trainable bias parameter added to the query head
when calculating the content-based attention score.
positional_attention_bias: Trainable bias parameter added to the query
head when calculating the position-based attention score.
segment_matrix: Optional `Tensor` representing segmentation IDs used in
XLNet.
segment_encoding: Optional trainable `Tensor` representing the
segmentation encoding as used in XLNet.
segment_attention_bias: Optional trainable bias parameter added to the
query had when calculating the segment-based attention score used in
XLNet.
attention_mask: (default None) Optional mask that is added to attention
logits. If state is not None, the mask source sequence dimension should
extend M.
Returns:
attention_output: Multi-headed output of attention computation of shape
`[B, S, N, key_dim]`.
"""
content_attention
=
tf
.
einsum
(
self
.
_dot_product_equation
,
key
,
query
+
content_attention_bias
)
positional_attention
=
tf
.
einsum
(
self
.
_dot_product_equation
,
position
,
query
+
positional_attention_bias
)
positional_attention
=
_rel_shift
(
positional_attention
,
klen
=
tf
.
shape
(
content_attention
)[
3
])
if
segment_matrix
is
not
None
:
segment_attention
=
tf
.
einsum
(
"bind,snd->bnis"
,
query
+
segment_attention_bias
,
segment_encoding
)
target_shape
=
tf
.
shape
(
positional_attention
)
segment_attention
=
tf
.
where
(
tf
.
broadcast_to
(
tf
.
expand_dims
(
segment_matrix
,
1
),
target_shape
),
tf
.
broadcast_to
(
segment_attention
[:,
:,
:,
1
:],
target_shape
),
tf
.
broadcast_to
(
segment_attention
[:,
:,
:,
:
1
],
target_shape
))
attention_sum
=
(
content_attention
+
positional_attention
+
segment_attention
)
else
:
attention_sum
=
content_attention
+
positional_attention
attention_scores
=
tf
.
multiply
(
attention_sum
,
1.0
/
math
.
sqrt
(
float
(
self
.
_key_dim
)))
attention_scores
=
self
.
_masked_softmax
(
attention_scores
,
attention_mask
)
attention_output
=
self
.
_dropout_layer
(
attention_scores
)
attention_output
=
tf
.
einsum
(
self
.
_combine_equation
,
attention_output
,
value
)
return
attention_output
def
call
(
self
,
query
,
value
,
content_attention_bias
,
positional_attention_bias
,
key
=
None
,
relative_position_encoding
=
None
,
segment_matrix
=
None
,
segment_encoding
=
None
,
segment_attention_bias
=
None
,
state
=
None
,
attention_mask
=
None
):
"""Compute multi-head relative attention over inputs.
Size glossary:
* Number of heads (H): the number of attention heads.
* Value size (V): the size of each value embedding per head.
* Key size (K): the size of each key embedding per head. Equally, the size
of each query embedding per head. Typically K <= V.
* Batch dimensions (B).
* Query (target) attention axes shape (T).
* Value (source) attention axes shape (S), the rank must match the target.
* Encoding length (L): The relative positional encoding length.
Args:
query: attention input.
value: attention input.
content_attention_bias: A trainable bias parameter added to the query
head when calculating the content-based attention score.
positional_attention_bias: A trainable bias parameter added to the query
head when calculating the position-based attention score.
key: attention input.
relative_position_encoding: relative positional encoding for key and
value.
segment_matrix: Optional `Tensor` representing segmentation IDs used in
XLNet.
segment_encoding: Optional `Tensor` representing the segmentation
encoding as used in XLNet.
segment_attention_bias: Optional trainable bias parameter added to the
query had when calculating the segment-based attention score used in
XLNet.
state: (default None) optional state. If passed, this is also attended
over as in TransformerXL.
attention_mask: (default None) Optional mask that is added to attention
logits. If state is not None, the mask source sequence dimension should
extend M.
Returns:
attention_output: The result of the computation, of shape [B, T, E],
where `T` is for target sequence shapes and `E` is the query input last
dimension if `output_shape` is `None`. Otherwise, the multi-head outputs
are projected to the shape specified by `output_shape`.
"""
if
not
self
.
_built_from_signature
:
self
.
_build_from_signature
(
query
,
value
,
key
=
key
)
if
key
is
None
:
key
=
value
if
state
is
not
None
and
state
.
shape
.
ndims
>
1
:
value
=
tf
.
concat
([
state
,
value
],
1
)
key
=
tf
.
concat
([
state
,
key
],
1
)
# `query` = [B, T, N ,H]
query
=
self
.
_query_dense
(
query
)
# `key` = [B, S + M, N, H]
key
=
self
.
_key_dense
(
key
)
# `value` = [B, S + M, N, H]
value
=
self
.
_value_dense
(
value
)
# `position` = [B, L, N, H]
position
=
self
.
_encoding_dense
(
relative_position_encoding
)
attention_output
=
self
.
compute_attention
(
query
=
query
,
key
=
key
,
value
=
value
,
position
=
position
,
content_attention_bias
=
content_attention_bias
,
positional_attention_bias
=
positional_attention_bias
,
segment_matrix
=
segment_matrix
,
segment_encoding
=
segment_encoding
,
segment_attention_bias
=
segment_attention_bias
,
attention_mask
=
attention_mask
)
# `attention_output` = [B, S, N, H]
attention_output
=
self
.
_output_dense
(
attention_output
)
return
attention_output
@
tf
.
keras
.
utils
.
register_keras_serializable
(
package
=
"Text"
)
class
TwoStreamRelativeAttention
(
MultiHeadRelativeAttention
):
"""Two-stream relative self-attention for XLNet.
In XLNet, each token has two associated vectors at each self-attention layer,
the content stream (h) and the query stream (g).
The content stream is the self-attention stream as in Transformer XL and
represents the context and content (the token itself).
The query stream only has access to contextual information and the position,
but not the content.
This layer shares the same build signature as
`tf.keras.layers.MultiHeadAttention` but has different input/output
projections.
**Note: This layer is currently experimental.
Call args:
content_stream: `Tensor` of shape `[B, T, dim]`.
content_attention_bias: Bias `Tensor` for content based attention of shape
`[num_heads, dim]`.
positional_attention_bias: Bias `Tensor` for position based attention of
shape `[num_heads, dim]`.
query_stream: `Tensor` of shape `[B, P, dim]`.
target_mapping: `Tensor` of shape `[B, P, S]`.
relative_position_encoding: Relative positional encoding `Tensor` of shape
`[B, L, dim]`.
segment_matrix: Optional `Tensor` representing segmentation IDs used in
XLNet of shape `[B, S, S + M]`.
segment_encoding: Optional `Tensor` representing the segmentation
encoding as used in XLNet of shape `[2, num_heads, dim]`.
segment_attention_bias: Optional trainable bias parameter added to the
query had when calculating the segment-based attention score used in
XLNet of shape `[num_heads, dim]`.
state: Optional `Tensor` of shape [B, M, E] where M is the length of the
state or memory.
If passed, this is also attended over as in Transformer XL.
content_attention_mask: a boolean mask of shape `[B, T, S]` that
prevents attention to certain positions for content attention computation.
query_attention_mask: a boolean mask of shape `[B, T, S]` that
prevents attention to certain position for query attention computation.
"""
def
call
(
self
,
content_stream
,
content_attention_bias
,
positional_attention_bias
,
query_stream
,
relative_position_encoding
,
target_mapping
=
None
,
segment_matrix
=
None
,
segment_encoding
=
None
,
segment_attention_bias
=
None
,
state
=
None
,
content_attention_mask
=
None
,
query_attention_mask
=
None
):
"""Compute multi-head relative attention over inputs.
Size glossary:
* Number of heads (H): the number of attention heads.
* Value size (V): the size of each value embedding per head.
* Key size (K): the size of each key embedding per head. Equally, the size
of each query embedding per head. Typically K <= V.
* Number of predictions (P): the number of predictions.
* Batch dimensions (B).
* Query (target) attention axes shape (T).
* Value (source) attention axes shape (S), the rank must match the target.
* Encoding length (L): The relative positional encoding length.
Args:
content_stream: The content representation, commonly referred to as h.
This serves a similar role to the standard hidden states in
Transformer-XL.
content_attention_bias: A trainable bias parameter added to the query
head when calculating the content-based attention score.
positional_attention_bias: A trainable bias parameter added to the query
head when calculating the position-based attention score.
query_stream: The query representation, commonly referred to as g.
This only has access to contextual information and position, but not
content. If not provided, then this is MultiHeadRelativeAttention with
self-attention.
relative_position_encoding: relative positional encoding for key and
value.
target_mapping: Optional `Tensor` representing the target mapping used
in partial prediction.
segment_matrix: Optional `Tensor` representing segmentation IDs used in
XLNet.
segment_encoding: Optional `Tensor` representing the segmentation
encoding as used in XLNet.
segment_attention_bias: Optional trainable bias parameter added to the
query head when calculating the segment-based attention score.
state: (default None) optional state. If passed, this is also attended
over as in TransformerXL and XLNet.
content_attention_mask: (default None) Optional mask that is added to
content attention logits. If state is not None, the mask source sequence
dimension should extend M.
query_attention_mask: (default None) Optional mask that is added to
query attention logits. If state is not None, the mask source sequence
dimension should extend M.
Returns:
content_attention_output, query_attention_output: the results of the
computation, both of shape [B, T, E]. `T` is for target sequence shapes,
`E` is the query input last dimension if `output_shape` is `None`.
Otherwise, the multi-head outputs are projected to the shape specified
by `output_shape`.
"""
if
not
self
.
_built_from_signature
:
self
.
_build_from_signature
(
content_stream
,
content_stream
,
content_stream
)
if
state
is
not
None
and
state
.
shape
.
ndims
>
1
:
content_and_memory_stream
=
tf
.
concat
([
state
,
content_stream
],
1
)
else
:
content_and_memory_stream
=
content_stream
# `query` = [B, T, N, H]
query
=
self
.
_query_dense
(
content_stream
)
# `key` = [B, S + M, N, H]
key
=
self
.
_key_dense
(
content_and_memory_stream
)
# `value` = [B, S + M, N, H]
value
=
self
.
_value_dense
(
content_and_memory_stream
)
# `position` = [B, L, N, H]
position
=
self
.
_encoding_dense
(
relative_position_encoding
)
content_attention_output
=
self
.
compute_attention
(
query
=
query
,
key
=
key
,
value
=
value
,
position
=
position
,
content_attention_bias
=
content_attention_bias
,
positional_attention_bias
=
positional_attention_bias
,
segment_matrix
=
segment_matrix
,
segment_encoding
=
segment_encoding
,
segment_attention_bias
=
segment_attention_bias
,
attention_mask
=
content_attention_mask
)
# `content_attention_output` = [B, S, N, H]
content_attention_output
=
self
.
_output_dense
(
content_attention_output
)
query_attention_output
=
None
if
query_stream
is
not
None
:
query
=
self
.
_query_dense
(
query_stream
)
if
target_mapping
is
not
None
:
query
=
tf
.
einsum
(
"bmnd,bml->blnd"
,
query
,
target_mapping
)
query_attention_output
=
self
.
compute_attention
(
query
=
query
,
key
=
key
,
value
=
value
,
position
=
position
,
content_attention_bias
=
content_attention_bias
,
positional_attention_bias
=
positional_attention_bias
,
segment_matrix
=
segment_matrix
,
segment_encoding
=
segment_encoding
,
segment_attention_bias
=
segment_attention_bias
,
attention_mask
=
query_attention_mask
)
query_attention_output
=
tf
.
einsum
(
"blnd,bml->bmnd"
,
query_attention_output
,
target_mapping
)
else
:
query_attention_output
=
self
.
compute_attention
(
query
=
query
,
key
=
key
,
value
=
value
,
position
=
position
,
content_attention_bias
=
content_attention_bias
,
positional_attention_bias
=
positional_attention_bias
,
segment_matrix
=
segment_matrix
,
segment_encoding
=
segment_encoding
,
segment_attention_bias
=
segment_attention_bias
,
attention_mask
=
query_attention_mask
)
query_attention_output
=
self
.
_output_dense
(
query_attention_output
)
return
content_attention_output
,
query_attention_output
official/nlp/modeling/layers/relative_attention_test.py
0 → 100644
View file @
f16a7b5b
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for the attention layer."""
import
numpy
as
np
import
tensorflow
as
tf
from
tensorflow.python.distribute
import
combinations
from
tensorflow.python.keras
import
keras_parameterized
# pylint: disable=g-direct-tensorflow-import
from
official.nlp.modeling.layers
import
relative_attention
def
_create_mock_attention_data
(
num_heads
,
key_dim
,
value_dim
,
seq_length
,
batch_size
,
memory_length
=
0
,
num_predictions
=
2
,
two_stream
=
False
,
include_state
=
False
,
include_mask
=
False
,
include_segment
=
False
):
"""Creates mock testing data.
Args:
num_heads: `int`, Number of attention heads.
key_dim: `int`, Size of query head.
value_dim: `int`, Size of key, value dim.
seq_length: `int`, Sequence length of the input.
batch_size: `int`, the batch size.
memory_length: optional `int`, the length of the state. Defaults to 0.
num_predictions: `int`, the number of predictions used in two stream
attention.
two_stream: `bool`, whether or not to generate two stream data.
include_state: optional `bool`, whether or not to include state data.
include_mask: optional `bool`, whether or not to include mask data.
include_segment: optional `bool`, whether or not to include segment data.
Returns:
A dictionary with `str` as keys and `Tensor` as values.
"""
query_shape
=
(
batch_size
,
seq_length
,
key_dim
)
value_shape
=
(
batch_size
,
seq_length
,
value_dim
)
encoding_shape
=
(
batch_size
,
seq_length
*
2
,
key_dim
)
attention_bias_shape
=
(
num_heads
,
key_dim
)
data
=
dict
(
relative_position_encoding
=
tf
.
random
.
normal
(
shape
=
encoding_shape
),
content_attention_bias
=
tf
.
random
.
normal
(
shape
=
attention_bias_shape
),
positional_attention_bias
=
tf
.
random
.
normal
(
shape
=
attention_bias_shape
))
if
two_stream
:
query_stream_shape
=
(
batch_size
,
num_predictions
,
key_dim
)
target_mapping_shape
=
(
batch_size
,
num_predictions
,
seq_length
)
stream_data
=
dict
(
content_stream
=
tf
.
random
.
normal
(
shape
=
query_shape
),
query_stream
=
tf
.
random
.
normal
(
shape
=
query_stream_shape
),
target_mapping
=
tf
.
random
.
normal
(
shape
=
target_mapping_shape
))
else
:
stream_data
=
dict
(
query
=
tf
.
random
.
normal
(
shape
=
query_shape
),
value
=
tf
.
random
.
normal
(
shape
=
value_shape
),
key
=
tf
.
random
.
normal
(
shape
=
value_shape
))
data
.
update
(
stream_data
)
if
include_state
:
total_seq_length
=
seq_length
+
memory_length
state_data
=
dict
(
state
=
tf
.
random
.
normal
(
shape
=
(
batch_size
,
memory_length
,
value_dim
)))
data
.
update
(
state_data
)
else
:
total_seq_length
=
seq_length
if
include_mask
:
mask_shape
=
(
batch_size
,
num_heads
,
seq_length
,
total_seq_length
)
mask_data
=
np
.
random
.
randint
(
2
,
size
=
mask_shape
).
astype
(
"float32"
)
if
two_stream
:
mask_data
=
dict
(
content_attention_mask
=
mask_data
,
query_attention_mask
=
mask_data
)
else
:
mask_data
=
dict
(
attention_mask
=
mask_data
)
data
.
update
(
mask_data
)
if
include_segment
:
segment_encoding_shape
=
(
2
,
num_heads
,
key_dim
)
segment_matrix
=
np
.
random
.
randint
(
2
,
size
=
(
batch_size
,
seq_length
,
total_seq_length
))
segment_matrix
=
tf
.
math
.
equal
(
segment_matrix
,
1
)
segment_data
=
dict
(
segment_attention_bias
=
tf
.
random
.
normal
(
shape
=
attention_bias_shape
),
segment_encoding
=
tf
.
random
.
normal
(
shape
=
segment_encoding_shape
),
segment_matrix
=
segment_matrix
)
data
.
update
(
segment_data
)
return
data
@
keras_parameterized
.
run_all_keras_modes
class
MultiHeadRelativeAttentionTest
(
keras_parameterized
.
TestCase
):
@
combinations
.
generate
(
combinations
.
combine
(
value_dim
=
[
32
,
64
],
memory_length
=
[
0
,
4
],
state
=
[
True
,
False
],
mask
=
[
True
,
False
],
segment
=
[
True
,
False
]))
def
test_attention_scores
(
self
,
value_dim
,
memory_length
,
state
,
mask
,
segment
):
"""Tests combinations of attention score calculations."""
batch_size
,
num_heads
,
key_dim
,
seq_length
=
2
,
12
,
64
,
8
test_layer
=
relative_attention
.
MultiHeadRelativeAttention
(
num_heads
=
num_heads
,
key_dim
=
key_dim
,
value_dim
=
value_dim
)
data
=
_create_mock_attention_data
(
num_heads
=
num_heads
,
key_dim
=
key_dim
,
value_dim
=
value_dim
,
seq_length
=
seq_length
,
memory_length
=
memory_length
,
two_stream
=
False
,
batch_size
=
batch_size
,
include_state
=
state
,
include_mask
=
mask
,
include_segment
=
segment
)
output
=
test_layer
(
**
data
)
self
.
assertEqual
(
output
.
shape
,
[
batch_size
,
seq_length
,
key_dim
])
@
keras_parameterized
.
run_all_keras_modes
class
TwoStreamRelativeAttentionTest
(
keras_parameterized
.
TestCase
):
@
combinations
.
generate
(
combinations
.
combine
(
num_predictions
=
[
2
,
10
],
memory_length
=
[
0
,
4
],
state
=
[
True
,
False
],
mask
=
[
True
,
False
],
segment
=
[
True
,
False
]))
def
test_attention_scores
(
self
,
num_predictions
,
memory_length
,
state
,
mask
,
segment
):
"""Tests combinations of attention score calculations."""
batch_size
,
num_heads
,
key_dim
,
seq_length
=
2
,
12
,
64
,
8
test_layer
=
relative_attention
.
TwoStreamRelativeAttention
(
num_heads
=
num_heads
,
key_dim
=
key_dim
,
value_dim
=
key_dim
)
data
=
_create_mock_attention_data
(
num_heads
=
num_heads
,
key_dim
=
key_dim
,
value_dim
=
key_dim
,
seq_length
=
seq_length
,
memory_length
=
memory_length
,
num_predictions
=
num_predictions
,
two_stream
=
True
,
batch_size
=
batch_size
,
include_state
=
state
,
include_mask
=
mask
,
include_segment
=
segment
)
content_output
,
query_output
,
=
test_layer
(
**
data
)
self
.
assertEqual
(
content_output
.
shape
,
[
batch_size
,
seq_length
,
key_dim
])
self
.
assertEqual
(
query_output
.
shape
,
[
batch_size
,
num_predictions
,
key_dim
])
if
__name__
==
"__main__"
:
np
.
random
.
seed
(
0
)
tf
.
random
.
set_seed
(
0
)
tf
.
test
.
main
()
official/nlp/modeling/layers/rezero_transformer.py
View file @
f16a7b5b
# Copyright 202
0
The TensorFlow Authors. All Rights Reserved.
# Copyright 202
1
The TensorFlow Authors. All Rights Reserved.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# you may not use this file except in compliance with the License.
...
@@ -11,19 +11,13 @@
...
@@ -11,19 +11,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# See the License for the specific language governing permissions and
# limitations under the License.
# limitations under the License.
# ==============================================================================
"""Keras-based rezero-transformer block layer (Transformer with ReZero)."""
"""Keras-based rezero-transformer block layer (Transformer with ReZero)."""
# pylint: disable=g-classes-have-attributes
# pylint: disable=g-classes-have-attributes
from
__future__
import
absolute_import
from
__future__
import
division
# from __future__ import google_type_annotations
from
__future__
import
print_function
import
gin
import
gin
import
tensorflow
as
tf
import
tensorflow
as
tf
from
official.nlp.modeling.layers
import
attention
@
tf
.
keras
.
utils
.
register_keras_serializable
(
package
=
"Text"
)
@
tf
.
keras
.
utils
.
register_keras_serializable
(
package
=
"Text"
)
@
gin
.
configurable
@
gin
.
configurable
...
@@ -35,7 +29,7 @@ class ReZeroTransformer(tf.keras.layers.Layer):
...
@@ -35,7 +29,7 @@ class ReZeroTransformer(tf.keras.layers.Layer):
The residual connection implements the ReZero method.
The residual connection implements the ReZero method.
(https://arxiv.org/abs/2003.04887)
(https://arxiv.org/abs/2003.04887)
Arg
ument
s:
Args:
num_attention_heads: Number of attention heads.
num_attention_heads: Number of attention heads.
intermediate_size: Size of the intermediate layer.
intermediate_size: Size of the intermediate layer.
intermediate_activation: Activation for the intermediate layer.
intermediate_activation: Activation for the intermediate layer.
...
@@ -88,7 +82,7 @@ class ReZeroTransformer(tf.keras.layers.Layer):
...
@@ -88,7 +82,7 @@ class ReZeroTransformer(tf.keras.layers.Layer):
def
build
(
self
,
input_shape
):
def
build
(
self
,
input_shape
):
input_tensor
=
input_shape
[
0
]
if
len
(
input_shape
)
==
2
else
input_shape
input_tensor
=
input_shape
[
0
]
if
len
(
input_shape
)
==
2
else
input_shape
input_tensor_shape
=
tf
.
TensorShape
(
input_tensor
)
input_tensor_shape
=
tf
.
TensorShape
(
input_tensor
)
if
len
(
input_tensor_shape
)
!=
3
:
if
len
(
input_tensor_shape
.
as_list
()
)
!=
3
:
raise
ValueError
(
"TransformerLayer expects a three-dimensional input of "
raise
ValueError
(
"TransformerLayer expects a three-dimensional input of "
"shape [batch, sequence, width]."
)
"shape [batch, sequence, width]."
)
batch_size
,
sequence_length
,
hidden_size
=
input_tensor_shape
batch_size
,
sequence_length
,
hidden_size
=
input_tensor_shape
...
@@ -116,9 +110,9 @@ class ReZeroTransformer(tf.keras.layers.Layer):
...
@@ -116,9 +110,9 @@ class ReZeroTransformer(tf.keras.layers.Layer):
activity_regularizer
=
self
.
_activity_regularizer
,
activity_regularizer
=
self
.
_activity_regularizer
,
kernel_constraint
=
self
.
_kernel_constraint
,
kernel_constraint
=
self
.
_kernel_constraint
,
bias_constraint
=
self
.
_bias_constraint
)
bias_constraint
=
self
.
_bias_constraint
)
self
.
_attention_layer
=
attention
.
MultiHeadAttention
(
self
.
_attention_layer
=
tf
.
keras
.
layers
.
MultiHeadAttention
(
num_heads
=
self
.
_num_heads
,
num_heads
=
self
.
_num_heads
,
key_
size
=
self
.
_attention_head_size
,
key_
dim
=
self
.
_attention_head_size
,
dropout
=
self
.
_attention_dropout_rate
,
dropout
=
self
.
_attention_dropout_rate
,
name
=
"self_attention"
,
name
=
"self_attention"
,
**
common_kwargs
)
**
common_kwargs
)
...
@@ -138,7 +132,7 @@ class ReZeroTransformer(tf.keras.layers.Layer):
...
@@ -138,7 +132,7 @@ class ReZeroTransformer(tf.keras.layers.Layer):
bias_axes
=
"d"
,
bias_axes
=
"d"
,
name
=
"intermediate"
,
name
=
"intermediate"
,
**
common_kwargs
)
**
common_kwargs
)
policy
=
tf
.
keras
.
mixed_precision
.
experimental
.
global_policy
()
policy
=
tf
.
keras
.
mixed_precision
.
global_policy
()
if
policy
.
name
==
"mixed_bfloat16"
:
if
policy
.
name
==
"mixed_bfloat16"
:
# bfloat16 causes BERT with the LAMB optimizer to not converge
# bfloat16 causes BERT with the LAMB optimizer to not converge
# as well, so we use float32.
# as well, so we use float32.
...
@@ -161,7 +155,8 @@ class ReZeroTransformer(tf.keras.layers.Layer):
...
@@ -161,7 +155,8 @@ class ReZeroTransformer(tf.keras.layers.Layer):
self
.
_rezero_a
=
self
.
add_weight
(
self
.
_rezero_a
=
self
.
add_weight
(
name
=
"rezero_alpha"
,
name
=
"rezero_alpha"
,
initializer
=
tf
.
keras
.
initializers
.
Zeros
(),
initializer
=
tf
.
keras
.
initializers
.
Zeros
(),
trainable
=
True
,
dtype
=
tf
.
float32
)
trainable
=
True
,
dtype
=
tf
.
float32
)
super
(
ReZeroTransformer
,
self
).
build
(
input_shape
)
super
(
ReZeroTransformer
,
self
).
build
(
input_shape
)
...
@@ -213,9 +208,9 @@ class ReZeroTransformer(tf.keras.layers.Layer):
...
@@ -213,9 +208,9 @@ class ReZeroTransformer(tf.keras.layers.Layer):
attention_mask
=
attention_mask
[:,
0
:
self
.
_output_range
,
:]
attention_mask
=
attention_mask
[:,
0
:
self
.
_output_range
,
:]
else
:
else
:
target_tensor
=
input_tensor
target_tensor
=
input_tensor
attention_inputs
=
[
target_tensor
,
input_tensor
]
attention_output
=
self
.
_attention_layer
(
attention_inputs
,
attention_mask
)
attention_output
=
self
.
_attention_layer
(
query
=
target_tensor
,
value
=
input_tensor
,
attention_mask
=
attention_mask
)
attention_output
=
self
.
_attention_dropout
(
attention_output
)
attention_output
=
self
.
_attention_dropout
(
attention_output
)
attention_output
=
target_tensor
+
self
.
_rezero_a
*
attention_output
attention_output
=
target_tensor
+
self
.
_rezero_a
*
attention_output
if
self
.
_use_layer_norm
:
if
self
.
_use_layer_norm
:
...
...
official/nlp/modeling/layers/rezero_transformer_test.py
View file @
f16a7b5b
# Copyright 202
0
The TensorFlow Authors. All Rights Reserved.
# Copyright 202
1
The TensorFlow Authors. All Rights Reserved.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# you may not use this file except in compliance with the License.
...
@@ -11,12 +11,8 @@
...
@@ -11,12 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# See the License for the specific language governing permissions and
# limitations under the License.
# limitations under the License.
# ==============================================================================
"""Tests for Keras-based rezero-transformer block layer."""
from
__future__
import
absolute_import
"""Tests for Keras-based rezero-transformer block layer."""
from
__future__
import
division
from
__future__
import
print_function
import
numpy
as
np
import
numpy
as
np
import
tensorflow
as
tf
import
tensorflow
as
tf
...
@@ -32,10 +28,10 @@ class TransformerWithReZeroLayerTest(keras_parameterized.TestCase):
...
@@ -32,10 +28,10 @@ class TransformerWithReZeroLayerTest(keras_parameterized.TestCase):
def
tearDown
(
self
):
def
tearDown
(
self
):
super
(
TransformerWithReZeroLayerTest
,
self
).
tearDown
()
super
(
TransformerWithReZeroLayerTest
,
self
).
tearDown
()
tf
.
keras
.
mixed_precision
.
experimental
.
set
_policy
(
'float32'
)
tf
.
keras
.
mixed_precision
.
set_global
_policy
(
'float32'
)
def
test_layer_invocation_with_float16_dtype
(
self
):
def
test_layer_invocation_with_float16_dtype
(
self
):
tf
.
keras
.
mixed_precision
.
experimental
.
set
_policy
(
'mixed_float16'
)
tf
.
keras
.
mixed_precision
.
set_global
_policy
(
'mixed_float16'
)
test_layer
=
rezero_transformer
.
ReZeroTransformer
(
test_layer
=
rezero_transformer
.
ReZeroTransformer
(
num_attention_heads
=
10
,
num_attention_heads
=
10
,
intermediate_size
=
2048
,
intermediate_size
=
2048
,
...
@@ -95,9 +91,9 @@ class TransformerWithReZeroLayerTest(keras_parameterized.TestCase):
...
@@ -95,9 +91,9 @@ class TransformerWithReZeroLayerTest(keras_parameterized.TestCase):
input_data
=
np
.
random
.
rand
(
2
,
input_length
,
width
)
+
2.0
input_data
=
np
.
random
.
rand
(
2
,
input_length
,
width
)
+
2.0
output_data
=
model
.
predict
(
input_data
)
output_data
=
model
.
predict
(
input_data
)
input_data_normed
=
(
input_data_normed
=
(
input_data
-
input_data
-
np
.
mean
(
input_data
,
axis
=-
1
,
keepdims
=
True
))
/
(
np
.
mean
(
input_data
,
axis
=-
1
,
keepdims
=
True
))
/
(
np
.
std
(
input_data
,
axis
=-
1
,
keepdims
=
True
))
np
.
std
(
input_data
,
axis
=-
1
,
keepdims
=
True
))
self
.
assertAllClose
(
input_data_normed
,
output_data
)
self
.
assertAllClose
(
input_data_normed
,
output_data
)
...
...
Prev
1
…
9
10
11
12
13
14
15
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment