ModelZoo / ResNet50_tensorflow · Commits

Commit 002b4240, authored Oct 07, 2021 by Frederick Liu, committed by A. Unique TensorFlower on Oct 07, 2021.

[keras_nlp] Merge keras_nlp into tf_nlp.

PiperOrigin-RevId: 401593694
Parent: 03c096ab

Changes: 27 — showing 20 changed files with 1276 additions and 622 deletions.
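The pattern repeated across the diffs below: each layer implementation moves under `official.nlp.modeling.layers`, and the corresponding `official.nlp.keras_nlp.layers` module keeps only an alias to it, so existing imports keep working. A minimal sketch of what that means for calling code (hypothetical usage, not part of the diff; it assumes the keras_nlp `layers` package re-exports the module-level aliases added in this commit):

```python
# Hypothetical usage sketch. After this commit the keras_nlp modules
# re-export the implementations that now live under
# official.nlp.modeling.layers, so both import paths should name one class.
from official.nlp.keras_nlp import layers as keras_nlp_layers
from official.nlp.modeling import layers as modeling_layers

block = modeling_layers.TransformerEncoderBlock(
    num_attention_heads=8, inner_dim=2048, inner_activation="relu")

# Assumes the keras_nlp layers package re-exports the alias shown below.
assert (keras_nlp_layers.TransformerEncoderBlock
        is modeling_layers.TransformerEncoderBlock)
```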
Changed files:

- official/nlp/keras_nlp/layers/masked_lm.py (+2 −106)
- official/nlp/keras_nlp/layers/on_device_embedding.py (+2 −89)
- official/nlp/keras_nlp/layers/position_embedding.py (+2 −70)
- official/nlp/keras_nlp/layers/self_attention_mask.py (+2 −37)
- official/nlp/keras_nlp/layers/transformer_encoder_block.py (+2 −290)
- official/nlp/modeling/layers/README.md (+4 −0)
- official/nlp/modeling/layers/__init__.py (+3 −0)
- official/nlp/modeling/layers/masked_lm.py (+105 −2)
- official/nlp/modeling/layers/mobile_bert_layers.py (+5 −4)
- official/nlp/modeling/layers/on_device_embedding.py (+87 −2)
- official/nlp/modeling/layers/on_device_embedding_test.py (+213 −0)
- official/nlp/modeling/layers/position_embedding.py (+70 −0)
- official/nlp/modeling/layers/position_embedding_test.py (+107 −0)
- official/nlp/modeling/layers/self_attention_mask.py (+32 −13)
- official/nlp/modeling/layers/transformer.py (+2 −2)
- official/nlp/modeling/layers/transformer_encoder_block.py (+308 −0)
- official/nlp/modeling/layers/transformer_encoder_block_test.py (+324 −0)
- official/nlp/modeling/models/seq2seq_transformer.py (+2 −3)
- official/nlp/modeling/networks/__init__.py (+1 −0)
- official/nlp/modeling/networks/albert_encoder.py (+3 −4)
official/nlp/keras_nlp/layers/masked_lm.py (+2 −106), hunk @@ -13,111 +13,7 @@

The module is reduced to a re-export of the implementation that now lives under official.nlp.modeling.layers:

```python
"""Masked language model network."""
from official.nlp.modeling import layers

MaskedLM = layers.MaskedLM
```

Removed: the `# pylint: disable=g-classes-have-attributes` directive, `import tensorflow as tf`, the `@tf.keras.utils.register_keras_serializable(package='keras_nlp')` decorator, and the full `MaskedLM(tf.keras.layers.Layer)` implementation. The removed class body is identical to the one added to official/nlp/modeling/layers/masked_lm.py later in this diff.
official/nlp/keras_nlp/layers/on_device_embedding.py (+2 −89), hunk @@ -13,94 +13,7 @@

The module is reduced to a re-export:

```python
"""Keras-based one-hot embedding layer."""
from official.nlp.modeling import layers

OnDeviceEmbedding = layers.OnDeviceEmbedding
```

Removed: the pylint directive, `import tensorflow as tf`, the `@tf.keras.utils.register_keras_serializable(package="keras_nlp")` decorator, and the full `OnDeviceEmbedding(tf.keras.layers.Layer)` implementation, identical to the class body added to official/nlp/modeling/layers/on_device_embedding.py later in this diff.
official/nlp/keras_nlp/layers/position_embedding.py (+2 −70), hunk @@ -13,75 +13,7 @@

The module is reduced to a re-export:

```python
"""Keras-based positional embedding layer."""
from official.nlp.modeling import layers

PositionEmbedding = layers.PositionEmbedding
```

Removed: the pylint directive, `import tensorflow as tf`, the `@tf.keras.utils.register_keras_serializable(package="keras_nlp")` decorator, and the full `PositionEmbedding(tf.keras.layers.Layer)` implementation, identical to the class added to official/nlp/modeling/layers/position_embedding.py later in this diff.
official/nlp/keras_nlp/layers/self_attention_mask.py (+2 −37), hunk @@ -14,42 +14,7 @@

The module is reduced to a re-export:

```python
"""Keras layer that creates a self-attention mask."""
from official.nlp.modeling import layers

SelfAttentionMask = layers.SelfAttentionMask
```

Removed: `import tensorflow as tf`, the `@tf.keras.utils.register_keras_serializable(package='keras_nlp')` decorator, and the `SelfAttentionMask(tf.keras.layers.Layer)` implementation; its mask-building code (with a `call(self, inputs, to_mask)` signature) reappears in official/nlp/modeling/layers/self_attention_mask.py later in this diff.
official/nlp/keras_nlp/layers/transformer_encoder_block.py (+2 −290), hunk @@ -14,295 +14,7 @@

The module is reduced to a re-export:

```python
"""Keras-based TransformerEncoder block layer."""
from official.nlp.modeling import layers

TransformerEncoderBlock = layers.TransformerEncoderBlock
```

Removed implementation (this code moves to official/nlp/modeling/layers/transformer_encoder_block.py, whose +308-line diff is not expanded on this page):

```python
import tensorflow as tf


@tf.keras.utils.register_keras_serializable(package="keras_nlp")
class TransformerEncoderBlock(tf.keras.layers.Layer):
  """TransformerEncoderBlock layer.

  This layer implements the Transformer Encoder from
  "Attention Is All You Need" (https://arxiv.org/abs/1706.03762),
  which combines a `tf.keras.layers.MultiHeadAttention` layer with a
  two-layer feedforward network.

  References:
    [Attention Is All You Need](https://arxiv.org/abs/1706.03762)
    [BERT: Pre-training of Deep Bidirectional Transformers for Language
    Understanding](https://arxiv.org/abs/1810.04805)
  """

  def __init__(self,
               num_attention_heads,
               inner_dim,
               inner_activation,
               output_range=None,
               kernel_initializer="glorot_uniform",
               bias_initializer="zeros",
               kernel_regularizer=None,
               bias_regularizer=None,
               activity_regularizer=None,
               kernel_constraint=None,
               bias_constraint=None,
               use_bias=True,
               norm_first=False,
               norm_epsilon=1e-12,
               output_dropout=0.0,
               attention_dropout=0.0,
               inner_dropout=0.0,
               attention_initializer=None,
               attention_axes=None,
               **kwargs):
    """Initializes `TransformerEncoderBlock`.

    Args:
      num_attention_heads: Number of attention heads.
      inner_dim: The output dimension of the first Dense layer in a two-layer
        feedforward network.
      inner_activation: The activation for the first Dense layer in a two-layer
        feedforward network.
      output_range: the sequence output range, [0, output_range) for slicing
        the target sequence. `None` means the target sequence is not sliced.
      kernel_initializer: Initializer for dense layer kernels.
      bias_initializer: Initializer for dense layer biases.
      kernel_regularizer: Regularizer for dense layer kernels.
      bias_regularizer: Regularizer for dense layer biases.
      activity_regularizer: Regularizer for dense layer activity.
      kernel_constraint: Constraint for dense layer kernels.
      bias_constraint: Constraint for dense layer biases.
      use_bias: Whether to enable use_bias in attention layer. If set False,
        use_bias in attention layer is disabled.
      norm_first: Whether to normalize inputs to attention and intermediate
        dense layers. If set False, output of attention and intermediate dense
        layers is normalized.
      norm_epsilon: Epsilon value to initialize normalization layers.
      output_dropout: Dropout probability for the post-attention and output
        dropout.
      attention_dropout: Dropout probability for within the attention layer.
      inner_dropout: Dropout probability for the first Dense layer in a
        two-layer feedforward network.
      attention_initializer: Initializer for kernels of attention layers. If
        set `None`, attention layers use kernel_initializer as initializer for
        kernel.
      attention_axes: axes over which the attention is applied. `None` means
        attention over all axes, but batch, heads, and features.
      **kwargs: keyword arguments.
    """
    super().__init__(**kwargs)

    self._num_heads = num_attention_heads
    self._inner_dim = inner_dim
    self._inner_activation = inner_activation
    self._attention_dropout = attention_dropout
    self._attention_dropout_rate = attention_dropout
    self._output_dropout = output_dropout
    self._output_dropout_rate = output_dropout
    self._output_range = output_range
    self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
    self._bias_initializer = tf.keras.initializers.get(bias_initializer)
    self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
    self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer)
    self._activity_regularizer = tf.keras.regularizers.get(activity_regularizer)
    self._kernel_constraint = tf.keras.constraints.get(kernel_constraint)
    self._bias_constraint = tf.keras.constraints.get(bias_constraint)
    self._use_bias = use_bias
    self._norm_first = norm_first
    self._norm_epsilon = norm_epsilon
    self._inner_dropout = inner_dropout
    if attention_initializer:
      self._attention_initializer = tf.keras.initializers.get(
          attention_initializer)
    else:
      self._attention_initializer = self._kernel_initializer
    self._attention_axes = attention_axes

  def build(self, input_shape):
    if isinstance(input_shape, tf.TensorShape):
      input_tensor_shape = input_shape
    elif isinstance(input_shape, (list, tuple)):
      input_tensor_shape = tf.TensorShape(input_shape[0])
    else:
      raise ValueError(
          "The type of input shape argument is not supported, got: %s" %
          type(input_shape))
    einsum_equation = "abc,cd->abd"
    if len(input_tensor_shape.as_list()) > 3:
      einsum_equation = "...bc,cd->...bd"
    hidden_size = input_tensor_shape[-1]
    if hidden_size % self._num_heads != 0:
      raise ValueError(
          "The input size (%d) is not a multiple of the number of attention "
          "heads (%d)" % (hidden_size, self._num_heads))
    self._attention_head_size = int(hidden_size // self._num_heads)
    common_kwargs = dict(
        bias_initializer=self._bias_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer,
        activity_regularizer=self._activity_regularizer,
        kernel_constraint=self._kernel_constraint,
        bias_constraint=self._bias_constraint)
    self._attention_layer = tf.keras.layers.MultiHeadAttention(
        num_heads=self._num_heads,
        key_dim=self._attention_head_size,
        dropout=self._attention_dropout,
        use_bias=self._use_bias,
        kernel_initializer=self._attention_initializer,
        attention_axes=self._attention_axes,
        name="self_attention",
        **common_kwargs)
    self._attention_dropout = tf.keras.layers.Dropout(rate=self._output_dropout)
    # Use float32 in layernorm for numeric stability.
    # It is probably safe in mixed_float16, but we haven't validated this yet.
    self._attention_layer_norm = (
        tf.keras.layers.LayerNormalization(
            name="self_attention_layer_norm",
            axis=-1,
            epsilon=self._norm_epsilon,
            dtype=tf.float32))
    self._intermediate_dense = tf.keras.layers.experimental.EinsumDense(
        einsum_equation,
        output_shape=(None, self._inner_dim),
        bias_axes="d",
        kernel_initializer=self._kernel_initializer,
        name="intermediate",
        **common_kwargs)
    policy = tf.keras.mixed_precision.global_policy()
    if policy.name == "mixed_bfloat16":
      # bfloat16 causes BERT with the LAMB optimizer to not converge
      # as well, so we use float32.
      # TODO(b/154538392): Investigate this.
      policy = tf.float32
    self._intermediate_activation_layer = tf.keras.layers.Activation(
        self._inner_activation, dtype=policy)
    self._inner_dropout_layer = tf.keras.layers.Dropout(
        rate=self._inner_dropout)
    self._output_dense = tf.keras.layers.experimental.EinsumDense(
        einsum_equation,
        output_shape=(None, hidden_size),
        bias_axes="d",
        name="output",
        kernel_initializer=self._kernel_initializer,
        **common_kwargs)
    self._output_dropout = tf.keras.layers.Dropout(rate=self._output_dropout)
    # Use float32 in layernorm for numeric stability.
    self._output_layer_norm = tf.keras.layers.LayerNormalization(
        name="output_layer_norm",
        axis=-1,
        epsilon=self._norm_epsilon,
        dtype=tf.float32)

    super(TransformerEncoderBlock, self).build(input_shape)

  def get_config(self):
    config = {
        "num_attention_heads": self._num_heads,
        "inner_dim": self._inner_dim,
        "inner_activation": self._inner_activation,
        "output_dropout": self._output_dropout_rate,
        "attention_dropout": self._attention_dropout_rate,
        "output_range": self._output_range,
        "kernel_initializer":
            tf.keras.initializers.serialize(self._kernel_initializer),
        "bias_initializer":
            tf.keras.initializers.serialize(self._bias_initializer),
        "kernel_regularizer":
            tf.keras.regularizers.serialize(self._kernel_regularizer),
        "bias_regularizer":
            tf.keras.regularizers.serialize(self._bias_regularizer),
        "activity_regularizer":
            tf.keras.regularizers.serialize(self._activity_regularizer),
        "kernel_constraint":
            tf.keras.constraints.serialize(self._kernel_constraint),
        "bias_constraint":
            tf.keras.constraints.serialize(self._bias_constraint),
        "use_bias": self._use_bias,
        "norm_first": self._norm_first,
        "norm_epsilon": self._norm_epsilon,
        "inner_dropout": self._inner_dropout,
        "attention_initializer":
            tf.keras.initializers.serialize(self._attention_initializer),
        "attention_axes": self._attention_axes,
    }
    base_config = super(TransformerEncoderBlock, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def call(self, inputs):
    """Transformer self-attention encoder block call.

    Args:
      inputs: a single tensor or a list of tensors.
        `input tensor` as the single sequence of embeddings.
        [`input tensor`, `attention mask`] to have the additional attention
          mask.
        [`query tensor`, `key value tensor`, `attention mask`] to have separate
          input streams for the query, and key/value to the multi-head
          attention.

    Returns:
      An output tensor with the same dimensions as input/query tensor.
    """
    if isinstance(inputs, (list, tuple)):
      if len(inputs) == 2:
        input_tensor, attention_mask = inputs
        key_value = None
      elif len(inputs) == 3:
        input_tensor, key_value, attention_mask = inputs
      else:
        raise ValueError("Unexpected inputs to %s with length at %d" %
                         (self.__class__, len(inputs)))
    else:
      input_tensor, key_value, attention_mask = (inputs, None, None)

    if self._output_range:
      if self._norm_first:
        source_tensor = input_tensor[:, 0:self._output_range, :]
        input_tensor = self._attention_layer_norm(input_tensor)
        if key_value is not None:
          key_value = self._attention_layer_norm(key_value)
      target_tensor = input_tensor[:, 0:self._output_range, :]
      if attention_mask is not None:
        attention_mask = attention_mask[:, 0:self._output_range, :]
    else:
      if self._norm_first:
        source_tensor = input_tensor
        input_tensor = self._attention_layer_norm(input_tensor)
        if key_value is not None:
          key_value = self._attention_layer_norm(key_value)
      target_tensor = input_tensor

    if key_value is None:
      key_value = input_tensor
    attention_output = self._attention_layer(
        query=target_tensor, value=key_value, attention_mask=attention_mask)
    attention_output = self._attention_dropout(attention_output)
    if self._norm_first:
      attention_output = source_tensor + attention_output
    else:
      attention_output = self._attention_layer_norm(target_tensor +
                                                    attention_output)
    if self._norm_first:
      source_attention_output = attention_output
      attention_output = self._output_layer_norm(attention_output)
    inner_output = self._intermediate_dense(attention_output)
    inner_output = self._intermediate_activation_layer(inner_output)
    inner_output = self._inner_dropout_layer(inner_output)
    layer_output = self._output_dense(inner_output)
    layer_output = self._output_dropout(layer_output)

    if self._norm_first:
      return source_attention_output + layer_output

    # During mixed precision training, layer norm output is always fp32 for
    # now. Casts fp32 for the subsequent add.
    layer_output = tf.cast(layer_output, tf.float32)
    return self._output_layer_norm(layer_output + attention_output)
```
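The docstring of `call` above describes the three input conventions the block accepts. A minimal usage sketch (hypothetical shapes and hyperparameters, not part of the diff; it assumes `official.nlp.modeling.layers` exports the class, as the `__init__.py` change later in this diff adds):

```python
import tensorflow as tf
from official.nlp.modeling import layers

# Hidden size 64 must be divisible by the number of heads; build() checks this.
block = layers.TransformerEncoderBlock(
    num_attention_heads=4, inner_dim=256, inner_activation="relu")

embeddings = tf.random.uniform([2, 16, 64])   # [batch, seq_len, hidden]
mask = tf.ones([2, 16, 16])                   # [batch, from_seq, to_seq]

out_plain = block(embeddings)                 # single-tensor input
out_masked = block([embeddings, mask])        # [input tensor, attention mask]

# The block keeps the input/query dimensions.
assert out_plain.shape == embeddings.shape
assert out_masked.shape == embeddings.shape
```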
official/nlp/modeling/layers/README.md (+4 −0), hunk @@ -121,3 +121,7 @@

Added after the existing entry "[BertTokenizer](text_layers.py) and [SentencepieceTokenizer](text_layers.py) implements the layer to tokenize raw text and pack them into the inputs for BERT models.":

* [TransformerEncoderBlock](transformer_encoder_block.py) implements
  an optionally masked transformer as described in
  ["Attention Is All You Need"](https://arxiv.org/abs/1706.03762).
official/nlp/modeling/layers/__init__.py (+3 −0)

Three imports are added to the package's public surface:

```diff
@@ -22,6 +22,7 @@ from official.nlp.modeling.layers.bigbird_attention import BigBirdAttention
 from official.nlp.modeling.layers.bigbird_attention import BigBirdMasks
 from official.nlp.modeling.layers.cls_head import *
 from official.nlp.modeling.layers.dense_einsum import DenseEinsum
+from official.nlp.modeling.layers.exbert_layers import *
 from official.nlp.modeling.layers.gated_feedforward import GatedFeedforward
 from official.nlp.modeling.layers.gaussian_process import RandomFeatureGaussianProcess
 from official.nlp.modeling.layers.kernel_attention import KernelAttention
@@ -34,6 +35,7 @@ from official.nlp.modeling.layers.mobile_bert_layers import MobileBertMaskedLM
 from official.nlp.modeling.layers.mobile_bert_layers import MobileBertTransformer
 from official.nlp.modeling.layers.multi_channel_attention import *
 from official.nlp.modeling.layers.on_device_embedding import OnDeviceEmbedding
+from official.nlp.modeling.layers.position_embedding import PositionEmbedding
 from official.nlp.modeling.layers.position_embedding import RelativePositionBias
 from official.nlp.modeling.layers.position_embedding import RelativePositionEmbedding
 from official.nlp.modeling.layers.relative_attention import MultiHeadRelativeAttention
@@ -47,6 +49,7 @@ from official.nlp.modeling.layers.text_layers import BertTokenizer
 from official.nlp.modeling.layers.text_layers import SentencepieceTokenizer
 from official.nlp.modeling.layers.tn_transformer_expand_condense import TNTransformerExpandCondense
 from official.nlp.modeling.layers.transformer import *
+from official.nlp.modeling.layers.transformer_encoder_block import TransformerEncoderBlock
 from official.nlp.modeling.layers.transformer_scaffold import TransformerScaffold
 from official.nlp.modeling.layers.transformer_xl import TransformerXL
 from official.nlp.modeling.layers.transformer_xl import TransformerXLBlock
```
official/nlp/modeling/layers/masked_lm.py (+105 −2), hunk @@ -14,7 +14,110 @@

Removed the alias to keras_nlp:

```python
from official.nlp import keras_nlp

MaskedLM = keras_nlp.layers.MaskedLM
```

Added the implementation (previously in official/nlp/keras_nlp/layers/masked_lm.py, now registered under the 'Text' package):

````python
import tensorflow as tf


@tf.keras.utils.register_keras_serializable(package='Text')
class MaskedLM(tf.keras.layers.Layer):
  """Masked language model network head for BERT modeling.

  This layer implements a masked language model based on the provided
  transformer based encoder. It assumes that the encoder network being passed
  has a "get_embedding_table()" method.

  Example:
  ```python
  encoder=keras_nlp.BertEncoder(...)
  lm_layer=MaskedLM(embedding_table=encoder.get_embedding_table())
  ```

  Args:
    embedding_table: The embedding table from encoder network.
    activation: The activation, if any, for the dense layer.
    initializer: The initializer for the dense layer. Defaults to a Glorot
      uniform initializer.
    output: The output style for this layer. Can be either 'logits' or
      'predictions'.
  """

  def __init__(self,
               embedding_table,
               activation=None,
               initializer='glorot_uniform',
               output='logits',
               name=None,
               **kwargs):
    super(MaskedLM, self).__init__(name=name, **kwargs)
    self.embedding_table = embedding_table
    self.activation = activation
    self.initializer = tf.keras.initializers.get(initializer)

    if output not in ('predictions', 'logits'):
      raise ValueError(
          ('Unknown `output` value "%s". `output` can be either "logits" or '
           '"predictions"') % output)
    self._output_type = output

  def build(self, input_shape):
    self._vocab_size, hidden_size = self.embedding_table.shape
    self.dense = tf.keras.layers.Dense(
        hidden_size,
        activation=self.activation,
        kernel_initializer=self.initializer,
        name='transform/dense')
    self.layer_norm = tf.keras.layers.LayerNormalization(
        axis=-1, epsilon=1e-12, name='transform/LayerNorm')
    self.bias = self.add_weight(
        'output_bias/bias',
        shape=(self._vocab_size,),
        initializer='zeros',
        trainable=True)

    super(MaskedLM, self).build(input_shape)

  def call(self, sequence_data, masked_positions):
    masked_lm_input = self._gather_indexes(sequence_data, masked_positions)
    lm_data = self.dense(masked_lm_input)
    lm_data = self.layer_norm(lm_data)
    lm_data = tf.matmul(lm_data, self.embedding_table, transpose_b=True)
    logits = tf.nn.bias_add(lm_data, self.bias)
    masked_positions_length = masked_positions.shape.as_list()[1] or tf.shape(
        masked_positions)[1]
    logits = tf.reshape(logits,
                        [-1, masked_positions_length, self._vocab_size])
    if self._output_type == 'logits':
      return logits
    return tf.nn.log_softmax(logits)

  def get_config(self):
    raise NotImplementedError('MaskedLM cannot be directly serialized because '
                              'it has variable sharing logic.')

  def _gather_indexes(self, sequence_tensor, positions):
    """Gathers the vectors at the specific positions, for performance.

    Args:
      sequence_tensor: Sequence output of shape
        (`batch_size`, `seq_length`, num_hidden) where num_hidden is number of
        hidden units.
      positions: Positions ids of tokens in sequence to mask for pretraining
        of with dimension (batch_size, num_predictions) where
        `num_predictions` is maximum number of tokens to mask out and predict
        per each sequence.

    Returns:
      Masked out sequence tensor of shape (batch_size * num_predictions,
      num_hidden).
    """
    sequence_shape = tf.shape(sequence_tensor)
    batch_size, seq_length = sequence_shape[0], sequence_shape[1]
    width = sequence_tensor.shape.as_list()[2] or sequence_shape[2]

    flat_offsets = tf.reshape(
        tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1])
    flat_positions = tf.reshape(positions + flat_offsets, [-1])
    flat_sequence_tensor = tf.reshape(sequence_tensor,
                                      [batch_size * seq_length, width])
    output_tensor = tf.gather(flat_sequence_tensor, flat_positions)

    return output_tensor
````
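The class docstring shows only construction against an encoder's embedding table; here is a slightly fuller call sketch, with hypothetical shapes and a stand-in variable in place of `encoder.get_embedding_table()` (not part of the diff):

```python
import tensorflow as tf
from official.nlp.modeling.layers import masked_lm

vocab_size, hidden_size = 100, 32
# Stand-in for encoder.get_embedding_table(): any [vocab, hidden] variable.
embedding_table = tf.Variable(tf.random.normal([vocab_size, hidden_size]))

lm_head = masked_lm.MaskedLM(embedding_table=embedding_table, output='logits')

sequence_output = tf.random.uniform([2, 16, hidden_size])  # encoder output
masked_positions = tf.constant([[1, 4, 7], [2, 3, 9]])     # [batch, num_predictions]

logits = lm_head(sequence_output, masked_positions)
assert logits.shape == (2, 3, vocab_size)  # [batch, num_predictions, vocab]
```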
official/nlp/modeling/layers/mobile_bert_layers.py (+5 −4)

MobileBERT stops importing through the keras_nlp facade and uses the modeling-layer modules directly:

```diff
@@ -15,7 +15,8 @@
 """MobileBERT embedding and transformer layers."""
 import tensorflow as tf
 
-from official.nlp import keras_nlp
+from official.nlp.modeling.layers import on_device_embedding
+from official.nlp.modeling.layers import position_embedding
 
 
 @tf.keras.utils.register_keras_serializable(package='Text')
@@ -105,17 +106,17 @@ class MobileBertEmbedding(tf.keras.layers.Layer):
     self.initializer = tf.keras.initializers.get(initializer)
     self.dropout_rate = dropout_rate
 
-    self.word_embedding = keras_nlp.layers.OnDeviceEmbedding(
+    self.word_embedding = on_device_embedding.OnDeviceEmbedding(
         self.word_vocab_size,
         self.word_embed_size,
         initializer=initializer,
         name='word_embedding')
-    self.type_embedding = keras_nlp.layers.OnDeviceEmbedding(
+    self.type_embedding = on_device_embedding.OnDeviceEmbedding(
         self.type_vocab_size,
         self.output_embed_size,
         initializer=initializer,
         name='type_embedding')
-    self.pos_embedding = keras_nlp.layers.PositionEmbedding(
+    self.pos_embedding = position_embedding.PositionEmbedding(
         max_length=max_sequence_length,
         initializer=initializer,
         name='position_embedding')
```
official/nlp/modeling/layers/on_device_embedding.py (+87 −2), hunk @@ -15,7 +15,92 @@

Removed the alias to keras_nlp:

```python
from official.nlp import keras_nlp

OnDeviceEmbedding = keras_nlp.layers.OnDeviceEmbedding
```

Added the implementation, registered under the 'Text' package:

```python
import tensorflow as tf


@tf.keras.utils.register_keras_serializable(package="Text")
class OnDeviceEmbedding(tf.keras.layers.Layer):
  """Performs an embedding lookup suitable for accelerator devices.

  This layer uses either tf.gather or tf.one_hot to translate integer indices
  to float embeddings.

  Args:
    vocab_size: Number of elements in the vocabulary.
    embedding_width: Output size of the embedding layer.
    initializer: The initializer to use for the embedding weights. Defaults to
      "glorot_uniform".
    use_one_hot: Whether to use tf.one_hot over tf.gather for the embedding
      lookup. Defaults to False (that is, using tf.gather). Setting this
      option to True may improve performance, especially on small vocabulary
      sizes, but will generally require more memory.
    scale_factor: Whether to scale the output embeddings. Defaults to None
      (that is, no scaling). Setting this option to a float multiplies the
      output embeddings by scale_factor.
  """

  def __init__(self,
               vocab_size,
               embedding_width,
               initializer="glorot_uniform",
               use_one_hot=False,
               scale_factor=None,
               **kwargs):
    super(OnDeviceEmbedding, self).__init__(**kwargs)
    self._vocab_size = vocab_size
    self._embedding_width = embedding_width
    self._initializer = initializer
    self._use_one_hot = use_one_hot
    self._scale_factor = scale_factor

  def get_config(self):
    config = {
        "vocab_size": self._vocab_size,
        "embedding_width": self._embedding_width,
        "initializer": self._initializer,
        "use_one_hot": self._use_one_hot,
        "scale_factor": self._scale_factor,
    }
    base_config = super(OnDeviceEmbedding, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def build(self, input_shape):
    self.embeddings = self.add_weight(
        "embeddings",
        shape=[self._vocab_size, self._embedding_width],
        initializer=self._initializer,
        dtype=tf.float32)

    super(OnDeviceEmbedding, self).build(input_shape)

  def call(self, inputs):
    flat_inputs = tf.reshape(inputs, [-1])
    if self._use_one_hot:
      dtype = self._compute_dtype
      if not tf.dtypes.as_dtype(dtype).is_floating:
        # TensorFlow 1 compatibility. In TF1, self._compute_dtype is int32
        # instead of a floating-point dtype, as the dtype is inferred from the
        # dtype of the inputs.
        dtype = tf.float32
      one_hot_data = tf.one_hot(
          flat_inputs, depth=self._vocab_size, dtype=dtype)
      embeddings = tf.matmul(one_hot_data, self.embeddings)
    else:
      embeddings = tf.gather(self.embeddings, flat_inputs)
    embeddings = tf.reshape(
        embeddings,
        # Work around b/142213824: prefer concat to shape over a Python list.
        tf.concat([tf.shape(inputs), [self._embedding_width]], axis=0))
    embeddings.set_shape(inputs.shape.as_list() + [self._embedding_width])
    if self._scale_factor:
      embeddings *= self._scale_factor
    return embeddings

  @property
  def vocab_size(self):
    return self._vocab_size

  @property
  def embedding_width(self):
    return self._embedding_width
```
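As the docstring above notes, `use_one_hot=True` swaps the `tf.gather` lookup for a one-hot matmul that can be faster on accelerators at the cost of memory. A minimal sketch with hypothetical sizes showing the two paths agree once their weights match (not part of the diff):

```python
import tensorflow as tf
from official.nlp.modeling.layers import on_device_embedding

ids = tf.constant([[1, 2, 3], [4, 5, 0]])  # [batch, seq_len] integer ids

gather_layer = on_device_embedding.OnDeviceEmbedding(
    vocab_size=8, embedding_width=4)
one_hot_layer = on_device_embedding.OnDeviceEmbedding(
    vocab_size=8, embedding_width=4, use_one_hot=True)

out = gather_layer(ids)   # shape [2, 3, 4], via tf.gather
_ = one_hot_layer(ids)    # same shape, via one-hot matmul

# With identical weights the two lookup paths agree (up to float rounding).
one_hot_layer.set_weights(gather_layer.get_weights())
tf.debugging.assert_near(gather_layer(ids), one_hot_layer(ids))
```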
official/nlp/modeling/layers/on_device_embedding_test.py (+213 −0, new file)

```python
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for Keras-based one-hot embedding layer."""

import numpy as np
import tensorflow as tf

from tensorflow.python.keras import keras_parameterized  # pylint: disable=g-direct-tensorflow-import
from official.nlp.modeling.layers import on_device_embedding


# This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
# guarantees forward compatibility of this code for the V2 switchover.
@keras_parameterized.run_all_keras_modes
class OnDeviceEmbeddingTest(keras_parameterized.TestCase):

  def test_layer_creation(self):
    vocab_size = 31
    embedding_width = 27
    test_layer = on_device_embedding.OnDeviceEmbedding(
        vocab_size=vocab_size, embedding_width=embedding_width)
    # Create a 2-dimensional input (the first dimension is implicit).
    sequence_length = 23
    input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32)
    output_tensor = test_layer(input_tensor)

    # The output should be the same as the input, save that it has an extra
    # embedding_width dimension on the end.
    expected_output_shape = [None, sequence_length, embedding_width]
    self.assertEqual(expected_output_shape, output_tensor.shape.as_list())
    self.assertEqual(output_tensor.dtype, tf.float32)

  def test_layer_creation_with_mixed_precision(self):
    vocab_size = 31
    embedding_width = 27
    test_layer = on_device_embedding.OnDeviceEmbedding(
        vocab_size=vocab_size, embedding_width=embedding_width,
        dtype="mixed_float16")
    # Create a 2-dimensional input (the first dimension is implicit).
    sequence_length = 23
    input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32)
    output_tensor = test_layer(input_tensor)

    # The output should be the same as the input, save that it has an extra
    # embedding_width dimension on the end.
    expected_output_shape = [None, sequence_length, embedding_width]
    self.assertEqual(expected_output_shape, output_tensor.shape.as_list())
    self.assertEqual(output_tensor.dtype, tf.float16)

  def test_layer_invocation(self):
    vocab_size = 31
    embedding_width = 27
    test_layer = on_device_embedding.OnDeviceEmbedding(
        vocab_size=vocab_size, embedding_width=embedding_width)
    # Create a 2-dimensional input (the first dimension is implicit).
    sequence_length = 23
    input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32)
    output_tensor = test_layer(input_tensor)

    # Create a model from the test layer.
    model = tf.keras.Model(input_tensor, output_tensor)

    # Invoke the model on test data. We can't validate the output data itself
    # (the NN is too complex) but this will rule out structural runtime errors.
    batch_size = 3
    input_data = np.random.randint(
        vocab_size, size=(batch_size, sequence_length))
    output = model.predict(input_data)
    self.assertEqual(tf.float32, output.dtype)

  def test_layer_invocation_with_mixed_precision(self):
    vocab_size = 31
    embedding_width = 27
    test_layer = on_device_embedding.OnDeviceEmbedding(
        vocab_size=vocab_size, embedding_width=embedding_width,
        dtype="mixed_float16")
    # Create a 2-dimensional input (the first dimension is implicit).
    sequence_length = 23
    input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32)
    output_tensor = test_layer(input_tensor)

    # Create a model from the test layer.
    model = tf.keras.Model(input_tensor, output_tensor)

    # Invoke the model on test data. We can't validate the output data itself
    # (the NN is too complex) but this will rule out structural runtime errors.
    batch_size = 3
    input_data = np.random.randint(
        vocab_size, size=(batch_size, sequence_length))
    output = model.predict(input_data)
    self.assertEqual(tf.float16, output.dtype)

  def test_one_hot_layer_creation(self):
    vocab_size = 31
    embedding_width = 27
    test_layer = on_device_embedding.OnDeviceEmbedding(
        vocab_size=vocab_size, embedding_width=embedding_width,
        use_one_hot=True)
    # Create a 2-dimensional input (the first dimension is implicit).
    sequence_length = 23
    input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32)
    output_tensor = test_layer(input_tensor)

    # The output should be the same as the input, save that it has an extra
    # embedding_width dimension on the end.
    expected_output_shape = [None, sequence_length, embedding_width]
    self.assertEqual(expected_output_shape, output_tensor.shape.as_list())
    self.assertEqual(output_tensor.dtype, tf.float32)

  def test_one_hot_layer_creation_with_mixed_precision(self):
    vocab_size = 31
    embedding_width = 27
    test_layer = on_device_embedding.OnDeviceEmbedding(
        vocab_size=vocab_size, embedding_width=embedding_width,
        dtype="mixed_float16", use_one_hot=True)
    # Create a 2-dimensional input (the first dimension is implicit).
    sequence_length = 23
    input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32)
    output_tensor = test_layer(input_tensor)

    # The output should be the same as the input, save that it has an extra
    # embedding_width dimension on the end.
    expected_output_shape = [None, sequence_length, embedding_width]
    self.assertEqual(expected_output_shape, output_tensor.shape.as_list())
    self.assertEqual(output_tensor.dtype, tf.float16)

  def test_one_hot_layer_invocation(self):
    vocab_size = 31
    embedding_width = 27
    test_layer = on_device_embedding.OnDeviceEmbedding(
        vocab_size=vocab_size, embedding_width=embedding_width,
        use_one_hot=True)
    # Create a 2-dimensional input (the first dimension is implicit).
    sequence_length = 23
    input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32)
    output_tensor = test_layer(input_tensor)

    # Create a model from the test layer.
    model = tf.keras.Model(input_tensor, output_tensor)

    # Invoke the model on test data. We can't validate the output data itself
    # (the NN is too complex) but this will rule out structural runtime errors.
    batch_size = 3
    input_data = np.random.randint(
        vocab_size, size=(batch_size, sequence_length))
    output = model.predict(input_data)
    self.assertEqual(tf.float32, output.dtype)

  def test_one_hot_layer_invocation_with_mixed_precision(self):
    vocab_size = 31
    embedding_width = 27
    test_layer = on_device_embedding.OnDeviceEmbedding(
        vocab_size=vocab_size, embedding_width=embedding_width,
        dtype="mixed_float16", use_one_hot=True)
    # Create a 2-dimensional input (the first dimension is implicit).
    sequence_length = 23
    input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32)
    output_tensor = test_layer(input_tensor)

    # Create a model from the test layer.
    model = tf.keras.Model(input_tensor, output_tensor)

    # Invoke the model on test data. We can't validate the output data itself
    # (the NN is too complex) but this will rule out structural runtime errors.
    batch_size = 3
    input_data = np.random.randint(
        vocab_size, size=(batch_size, sequence_length))
    output = model.predict(input_data)
    self.assertEqual(tf.float16, output.dtype)

  def test_use_scale_layer_invocation(self):
    vocab_size = 31
    embedding_width = 27
    test_layer = on_device_embedding.OnDeviceEmbedding(
        vocab_size=vocab_size, embedding_width=embedding_width,
        scale_factor=embedding_width**0.5)
    # Create a 2-dimensional input (the first dimension is implicit).
    sequence_length = 23
    input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32)
    output_tensor = test_layer(input_tensor)

    # Create a model from the test layer.
    model = tf.keras.Model(input_tensor, output_tensor)

    # Invoke the model on test data. We can't validate the output data itself
    # (the NN is too complex) but this will rule out structural runtime errors.
    batch_size = 3
    input_data = np.random.randint(
        vocab_size, size=(batch_size, sequence_length))
    output = model.predict(input_data)
    self.assertEqual(tf.float32, output.dtype)


if __name__ == "__main__":
  tf.test.main()
```
official/nlp/modeling/layers/position_embedding.py (+70 −0), hunk @@ -24,6 +24,76 @@

The `PositionEmbedding` class is added after the existing `Initializer = tf.keras.initializers.Initializer` alias and ahead of the existing `RelativePositionEmbedding` class:

````python
@tf.keras.utils.register_keras_serializable(package="Text")
class PositionEmbedding(tf.keras.layers.Layer):
  """Creates a positional embedding.

  Example:
  ```python
  position_embedding = PositionEmbedding(max_length=100)
  inputs = tf.keras.Input((100, 32), dtype=tf.float32)
  outputs = position_embedding(inputs)
  ```

  Args:
    max_length: The maximum size of the dynamic sequence.
    initializer: The initializer to use for the embedding weights. Defaults to
      "glorot_uniform".
    seq_axis: The axis of the input tensor where we add the embeddings.

  Reference: This layer creates a positional embedding as described in
  [BERT: Pre-training of Deep Bidirectional Transformers for Language
  Understanding](https://arxiv.org/abs/1810.04805).
  """

  def __init__(self,
               max_length,
               initializer="glorot_uniform",
               seq_axis=1,
               **kwargs):

    super(PositionEmbedding, self).__init__(**kwargs)
    if max_length is None:
      raise ValueError(
          "`max_length` must be an Integer, not `None`.")
    self._max_length = max_length
    self._initializer = tf.keras.initializers.get(initializer)
    self._seq_axis = seq_axis

  def get_config(self):
    config = {
        "max_length": self._max_length,
        "initializer": tf.keras.initializers.serialize(self._initializer),
        "seq_axis": self._seq_axis,
    }
    base_config = super(PositionEmbedding, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def build(self, input_shape):
    dimension_list = input_shape.as_list()
    width = dimension_list[-1]
    weight_sequence_length = self._max_length

    self._position_embeddings = self.add_weight(
        "embeddings",
        shape=[weight_sequence_length, width],
        initializer=self._initializer)

    super(PositionEmbedding, self).build(input_shape)

  def call(self, inputs):
    input_shape = tf.shape(inputs)
    actual_seq_len = input_shape[self._seq_axis]
    position_embeddings = self._position_embeddings[:actual_seq_len, :]
    new_shape = [1 for _ in inputs.get_shape().as_list()]
    new_shape[self._seq_axis] = actual_seq_len
    new_shape[-1] = position_embeddings.get_shape().as_list()[-1]
    position_embeddings = tf.reshape(position_embeddings, new_shape)
    return tf.broadcast_to(position_embeddings, input_shape)
````
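A minimal sketch of the slicing behavior `call` implements above: the `[max_length, width]` table is cut down to the actual sequence length and broadcast against the input (hypothetical sizes, not part of the diff):

```python
import tensorflow as tf
from official.nlp.modeling.layers import position_embedding

layer = position_embedding.PositionEmbedding(max_length=50)

short_inputs = tf.random.uniform([2, 17, 32])  # seq_len 17 < max_length 50
pos = layer(short_inputs)

# The [50, 32] embedding table is sliced to its first 17 rows and broadcast
# across the batch dimension, so the output matches the input's shape.
assert pos.shape == short_inputs.shape
```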
official/nlp/modeling/layers/position_embedding_test.py
View file @
002b4240
...
@@ -22,6 +22,113 @@ from tensorflow.python.keras import keras_parameterized # pylint: disable=g-dir
...
@@ -22,6 +22,113 @@ from tensorflow.python.keras import keras_parameterized # pylint: disable=g-dir
from
official.nlp.modeling.layers
import
position_embedding
from
official.nlp.modeling.layers
import
position_embedding
# This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
# guarantees forward compatibility of this code for the V2 switchover.
@
keras_parameterized
.
run_all_keras_modes
class
PositionEmbeddingLayerTest
(
keras_parameterized
.
TestCase
):
def
test_static_layer_output_shape
(
self
):
# Create a 3-dimensional input (the first dimension is implicit).
sequence_length
=
21
test_layer
=
position_embedding
.
PositionEmbedding
(
max_length
=
sequence_length
)
width
=
30
input_tensor
=
tf
.
keras
.
Input
(
shape
=
(
sequence_length
,
width
))
output_tensor
=
test_layer
(
input_tensor
)
# When using static positional embedding shapes, the output is expected
# to be the same as the input shape in all dimensions save batch.
expected_output_shape
=
[
None
,
sequence_length
,
width
]
self
.
assertEqual
(
expected_output_shape
,
output_tensor
.
shape
.
as_list
())
# The default output dtype for this layer should be tf.float32.
self
.
assertEqual
(
tf
.
float32
,
output_tensor
.
dtype
)
def
test_non_default_axis_static
(
self
):
# Create a 3-dimensional input (the first dimension is implicit).
sequence_length
=
21
test_layer
=
position_embedding
.
PositionEmbedding
(
max_length
=
sequence_length
,
seq_axis
=
2
)
width
=
30
input_tensor
=
tf
.
keras
.
Input
(
shape
=
(
width
,
sequence_length
,
width
))
output_tensor
=
test_layer
(
input_tensor
)
# When using static positional embedding shapes, the output is expected
# to be the same as the input shape in all dimensions save batch.
expected_output_shape
=
[
None
,
width
,
sequence_length
,
width
]
self
.
assertEqual
(
expected_output_shape
,
output_tensor
.
shape
.
as_list
())
# The default output dtype for this layer should be tf.float32.
self
.
assertEqual
(
tf
.
float32
,
output_tensor
.
dtype
)
def
test_float16_dtype
(
self
):
# Create a 3-dimensional input (the first dimension is implicit).
sequence_length
=
21
test_layer
=
position_embedding
.
PositionEmbedding
(
max_length
=
sequence_length
,
dtype
=
"float16"
)
width
=
30
input_tensor
=
tf
.
keras
.
Input
(
shape
=
(
sequence_length
,
width
))
output_tensor
=
test_layer
(
input_tensor
)
# When using static positional embedding shapes, the output is expected
# to be the same as the input shape in all dimensions save batch.
expected_output_shape
=
[
None
,
sequence_length
,
width
]
self
.
assertEqual
(
expected_output_shape
,
output_tensor
.
shape
.
as_list
())
# The default output dtype for this layer should be tf.float32.
self
.
assertEqual
(
tf
.
float16
,
output_tensor
.
dtype
)
def
test_dynamic_layer_output_shape
(
self
):
max_sequence_length
=
40
test_layer
=
position_embedding
.
PositionEmbedding
(
max_length
=
max_sequence_length
)
# Create a 3-dimensional input (the first dimension is implicit).
width
=
30
input_tensor
=
tf
.
keras
.
Input
(
shape
=
(
None
,
width
))
output_tensor
=
test_layer
(
input_tensor
)
# When using dynamic positional embedding shapes, the output is expected
# to be the same as the input shape in all dimensions - but may be None if
# the input shape is None there.
expected_output_shape
=
[
None
,
None
,
width
]
self
.
assertEqual
(
expected_output_shape
,
output_tensor
.
shape
.
as_list
())
def
test_non_default_axis_dynamic
(
self
):
max_sequence_length
=
60
test_layer
=
position_embedding
.
PositionEmbedding
(
max_length
=
max_sequence_length
,
seq_axis
=
2
)
# Create a 3-dimensional input (the first dimension is implicit).
width
=
30
input_tensor
=
tf
.
keras
.
Input
(
shape
=
(
None
,
None
,
width
))
output_tensor
=
test_layer
(
input_tensor
)
# When using dynamic positional embedding shapes, the output is expected
# to be the same as the input shape in all dimensions - but may be None if
# the input shape is None there.
expected_output_shape
=
[
None
,
None
,
None
,
width
]
self
.
assertEqual
(
expected_output_shape
,
output_tensor
.
shape
.
as_list
())
  def test_dynamic_layer_slicing(self):
    max_sequence_length = 40
    test_layer = position_embedding.PositionEmbedding(
        max_length=max_sequence_length)
    # Create a 3-dimensional input (the first dimension is implicit).
    width = 30
    input_tensor = tf.keras.Input(shape=(None, width))
    output_tensor = test_layer(input_tensor)

    model = tf.keras.Model(input_tensor, output_tensor)

    # Create input data that is shorter than max_sequence_length, which
    # should trigger a down-slice.
    input_length = 17
    # Note: This test explicitly uses a batch size of 1. This is to get
    # around Keras' restriction on Model invocations: inputs are expected to
    # have the same batch cardinality as outputs. In practice, this layer
    # should be used inside a model, where it can be projected when added to
    # another tensor.
    input_data = np.ones((1, input_length, width))
    output_data = model.predict(input_data)

    self.assertAllEqual([1, input_length, width], output_data.shape)
  # This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
  # guarantees forward compatibility of this code for the V2 switchover.
  @keras_parameterized.run_all_keras_modes
...
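For context on the tests above, the following is a minimal usage sketch (not part of this commit) of how a position embedding is typically combined with word embeddings; the vocabulary size, hidden width, and sequence length are illustrative assumptions, and the layer names come from `official.nlp.modeling.layers`:

import tensorflow as tf
from official.nlp.modeling import layers

# Illustrative sizes, not taken from the commit.
vocab_size, hidden_size, max_length = 100, 32, 16

word_ids = tf.keras.Input(shape=(max_length,), dtype=tf.int32)
word_embeddings = layers.OnDeviceEmbedding(
    vocab_size=vocab_size, embedding_width=hidden_size)(word_ids)
# PositionEmbedding learns a (max_length, hidden_size) table and slices it
# down to the sequence length of its input, so the sum below broadcasts.
position_embeddings = layers.PositionEmbedding(max_length=max_length)(
    word_embeddings)
embeddings = word_embeddings + position_embeddings
model = tf.keras.Model(word_ids, embeddings)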
official/nlp/modeling/layers/self_attention_mask.py
View file @
002b4240
...
@@ -16,24 +16,43 @@
 import tensorflow as tf
 
-from official.nlp.keras_nlp import layers
 
 @tf.keras.utils.register_keras_serializable(package='Text')
-class SelfAttentionMask(layers.SelfAttentionMask):
-  """Creates 3D attention mask from a 2D tensor mask.
-
-  **Warning: Please use the `keras_nlp.layers.SelfAttentionMask`.**
+class SelfAttentionMask(tf.keras.layers.Layer):
+  """Create 3D attention mask from a 2D tensor mask.
 
   inputs[0]: from_tensor: 2D or 3D Tensor of shape
-    `(batch_size, from_seq_length, ...)`.
-  inputs[1]: to_mask: int32 Tensor of shape `(batch_size, to_seq_length)`.
+    [batch_size, from_seq_length, ...].
+  inputs[1]: to_mask: int32 Tensor of shape [batch_size, to_seq_length].
 
   Returns:
-    Float Tensor of shape `(batch_size, from_seq_length, to_seq_length)`.
+    float Tensor of shape [batch_size, from_seq_length, to_seq_length].
   """
 
-  def call(self, inputs):
-    if isinstance(inputs, list):
-      return super().call(inputs[0], inputs[1])
-    else:
-      return super().call(inputs)
+  def call(self, inputs, to_mask=None):
+    if isinstance(inputs, list) and to_mask is None:
+      to_mask = inputs[1]
+      inputs = inputs[0]
+    from_shape = tf.shape(inputs)
+    batch_size = from_shape[0]
+    from_seq_length = from_shape[1]
+
+    to_shape = tf.shape(to_mask)
+    to_seq_length = to_shape[1]
+
+    to_mask = tf.cast(
+        tf.reshape(to_mask, [batch_size, 1, to_seq_length]),
+        dtype=inputs.dtype)
+
+    # We don't assume that `from_tensor` is a mask (although it could be). We
+    # don't actually care if we attend *from* padding tokens (only *to*
+    # padding) tokens so we create a tensor of all ones.
+    #
+    # `broadcast_ones` = [batch_size, from_seq_length, 1]
+    broadcast_ones = tf.ones(
+        shape=[batch_size, from_seq_length, 1], dtype=inputs.dtype)
+
+    # Here we broadcast along two dimensions to create the mask.
+    mask = broadcast_ones * to_mask
+
+    return mask
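As a quick illustration of the layer above (a sketch with made-up tensors, not part of the diff): given a batch of embeddings and a 2D padding mask, `SelfAttentionMask` broadcasts the mask into the 3D attention mask consumed by the transformer blocks.

import tensorflow as tf
from official.nlp.modeling import layers

batch_size, seq_length, hidden_size = 2, 4, 8
embeddings = tf.random.uniform([batch_size, seq_length, hidden_size])
# 1 marks a real token, 0 marks padding.
padding_mask = tf.constant([[1, 1, 1, 0],
                            [1, 1, 0, 0]], dtype=tf.int32)

attention_mask = layers.SelfAttentionMask()(embeddings, padding_mask)
# attention_mask has shape [batch_size, seq_length, seq_length]; entry
# [b, from, to] is 1 exactly when the `to` position of example b is a real
# token, so padding is never attended *to*.
print(attention_mask.shape)  # (2, 4, 4)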
official/nlp/modeling/layers/transformer.py
View file @
002b4240
...
@@ -18,14 +18,14 @@
 import gin
 import tensorflow as tf
 
-from official.nlp import keras_nlp
 from official.nlp.modeling.layers import attention
 from official.nlp.modeling.layers import multi_channel_attention
+from official.nlp.modeling.layers import transformer_encoder_block
 from official.nlp.modeling.layers.util import tf_function_if_eager
 
 
 @tf.keras.utils.register_keras_serializable(package="Text")
-class Transformer(keras_nlp.layers.TransformerEncoderBlock):
+class Transformer(transformer_encoder_block.TransformerEncoderBlock):
   """Transformer layer.
 
   This layer implements the Transformer from "Attention Is All You Need".
...
official/nlp/modeling/layers/transformer_encoder_block.py
0 → 100644
View file @
002b4240
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Keras-based TransformerEncoder block layer."""

import tensorflow as tf


@tf.keras.utils.register_keras_serializable(package="Text")
class TransformerEncoderBlock(tf.keras.layers.Layer):
  """TransformerEncoderBlock layer.

  This layer implements the Transformer Encoder from
  "Attention Is All You Need" (https://arxiv.org/abs/1706.03762),
  which combines a `tf.keras.layers.MultiHeadAttention` layer with a
  two-layer feedforward network.

  References:
    [Attention Is All You Need](https://arxiv.org/abs/1706.03762)
    [BERT: Pre-training of Deep Bidirectional Transformers for Language
     Understanding](https://arxiv.org/abs/1810.04805)
  """

  def __init__(self,
               num_attention_heads,
               inner_dim,
               inner_activation,
               output_range=None,
               kernel_initializer="glorot_uniform",
               bias_initializer="zeros",
               kernel_regularizer=None,
               bias_regularizer=None,
               activity_regularizer=None,
               kernel_constraint=None,
               bias_constraint=None,
               use_bias=True,
               norm_first=False,
               norm_epsilon=1e-12,
               output_dropout=0.0,
               attention_dropout=0.0,
               inner_dropout=0.0,
               attention_initializer=None,
               attention_axes=None,
               **kwargs):
    """Initializes `TransformerEncoderBlock`.

    Args:
      num_attention_heads: Number of attention heads.
      inner_dim: The output dimension of the first Dense layer in a two-layer
        feedforward network.
      inner_activation: The activation for the first Dense layer in a
        two-layer feedforward network.
      output_range: the sequence output range, [0, output_range), for slicing
        the target sequence. `None` means the target sequence is not sliced.
      kernel_initializer: Initializer for dense layer kernels.
      bias_initializer: Initializer for dense layer biases.
      kernel_regularizer: Regularizer for dense layer kernels.
      bias_regularizer: Regularizer for dense layer biases.
      activity_regularizer: Regularizer for dense layer activity.
      kernel_constraint: Constraint for dense layer kernels.
      bias_constraint: Constraint for dense layer biases.
      use_bias: Whether to enable use_bias in the attention layer. If set
        False, use_bias in the attention layer is disabled.
      norm_first: Whether to normalize inputs to the attention and
        intermediate dense layers. If set False, the output of the attention
        and intermediate dense layers is normalized instead.
      norm_epsilon: Epsilon value to initialize normalization layers.
      output_dropout: Dropout probability for the post-attention and output
        dropout.
      attention_dropout: Dropout probability within the attention layer.
      inner_dropout: Dropout probability for the first Dense layer in a
        two-layer feedforward network.
      attention_initializer: Initializer for kernels of attention layers. If
        set `None`, attention layers use kernel_initializer as initializer
        for kernel.
      attention_axes: axes over which the attention is applied. `None` means
        attention over all axes, but batch, heads, and features.
      **kwargs: keyword arguments.
    """
    super().__init__(**kwargs)

    self._num_heads = num_attention_heads
    self._inner_dim = inner_dim
    self._inner_activation = inner_activation
    self._attention_dropout = attention_dropout
    self._attention_dropout_rate = attention_dropout
    self._output_dropout = output_dropout
    self._output_dropout_rate = output_dropout
    self._output_range = output_range
    self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
    self._bias_initializer = tf.keras.initializers.get(bias_initializer)
    self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
    self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer)
    self._activity_regularizer = tf.keras.regularizers.get(
        activity_regularizer)
    self._kernel_constraint = tf.keras.constraints.get(kernel_constraint)
    self._bias_constraint = tf.keras.constraints.get(bias_constraint)
    self._use_bias = use_bias
    self._norm_first = norm_first
    self._norm_epsilon = norm_epsilon
    self._inner_dropout = inner_dropout
    if attention_initializer:
      self._attention_initializer = tf.keras.initializers.get(
          attention_initializer)
    else:
      self._attention_initializer = self._kernel_initializer
    self._attention_axes = attention_axes
  def build(self, input_shape):
    if isinstance(input_shape, tf.TensorShape):
      input_tensor_shape = input_shape
    elif isinstance(input_shape, (list, tuple)):
      input_tensor_shape = tf.TensorShape(input_shape[0])
    else:
      raise ValueError(
          "The type of input shape argument is not supported, got: %s" %
          type(input_shape))

    einsum_equation = "abc,cd->abd"
    if len(input_tensor_shape.as_list()) > 3:
      einsum_equation = "...bc,cd->...bd"
    hidden_size = input_tensor_shape[-1]
    if hidden_size % self._num_heads != 0:
      raise ValueError(
          "The input size (%d) is not a multiple of the number of attention "
          "heads (%d)" % (hidden_size, self._num_heads))
    self._attention_head_size = int(hidden_size // self._num_heads)
    common_kwargs = dict(
        bias_initializer=self._bias_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer,
        activity_regularizer=self._activity_regularizer,
        kernel_constraint=self._kernel_constraint,
        bias_constraint=self._bias_constraint)
    self._attention_layer = tf.keras.layers.MultiHeadAttention(
        num_heads=self._num_heads,
        key_dim=self._attention_head_size,
        dropout=self._attention_dropout,
        use_bias=self._use_bias,
        kernel_initializer=self._attention_initializer,
        attention_axes=self._attention_axes,
        name="self_attention",
        **common_kwargs)
    self._attention_dropout = tf.keras.layers.Dropout(
        rate=self._output_dropout)
    # Use float32 in layernorm for numeric stability.
    # It is probably safe in mixed_float16, but we haven't validated this yet.
    self._attention_layer_norm = (
        tf.keras.layers.LayerNormalization(
            name="self_attention_layer_norm",
            axis=-1,
            epsilon=self._norm_epsilon,
            dtype=tf.float32))
    self._intermediate_dense = tf.keras.layers.experimental.EinsumDense(
        einsum_equation,
        output_shape=(None, self._inner_dim),
        bias_axes="d",
        kernel_initializer=self._kernel_initializer,
        name="intermediate",
        **common_kwargs)
    policy = tf.keras.mixed_precision.global_policy()
    if policy.name == "mixed_bfloat16":
      # bfloat16 causes BERT with the LAMB optimizer to not converge
      # as well, so we use float32.
      # TODO(b/154538392): Investigate this.
      policy = tf.float32
    self._intermediate_activation_layer = tf.keras.layers.Activation(
        self._inner_activation, dtype=policy)
    self._inner_dropout_layer = tf.keras.layers.Dropout(
        rate=self._inner_dropout)
    self._output_dense = tf.keras.layers.experimental.EinsumDense(
        einsum_equation,
        output_shape=(None, hidden_size),
        bias_axes="d",
        name="output",
        kernel_initializer=self._kernel_initializer,
        **common_kwargs)
    self._output_dropout = tf.keras.layers.Dropout(rate=self._output_dropout)
    # Use float32 in layernorm for numeric stability.
    self._output_layer_norm = tf.keras.layers.LayerNormalization(
        name="output_layer_norm",
        axis=-1,
        epsilon=self._norm_epsilon,
        dtype=tf.float32)

    super(TransformerEncoderBlock, self).build(input_shape)
  def get_config(self):
    config = {
        "num_attention_heads": self._num_heads,
        "inner_dim": self._inner_dim,
        "inner_activation": self._inner_activation,
        "output_dropout": self._output_dropout_rate,
        "attention_dropout": self._attention_dropout_rate,
        "output_range": self._output_range,
        "kernel_initializer":
            tf.keras.initializers.serialize(self._kernel_initializer),
        "bias_initializer":
            tf.keras.initializers.serialize(self._bias_initializer),
        "kernel_regularizer":
            tf.keras.regularizers.serialize(self._kernel_regularizer),
        "bias_regularizer":
            tf.keras.regularizers.serialize(self._bias_regularizer),
        "activity_regularizer":
            tf.keras.regularizers.serialize(self._activity_regularizer),
        "kernel_constraint":
            tf.keras.constraints.serialize(self._kernel_constraint),
        "bias_constraint":
            tf.keras.constraints.serialize(self._bias_constraint),
        "use_bias": self._use_bias,
        "norm_first": self._norm_first,
        "norm_epsilon": self._norm_epsilon,
        "inner_dropout": self._inner_dropout,
        "attention_initializer":
            tf.keras.initializers.serialize(self._attention_initializer),
        "attention_axes": self._attention_axes,
    }
    base_config = super(TransformerEncoderBlock, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))
  def call(self, inputs):
    """Transformer self-attention encoder block call.

    Args:
      inputs: a single tensor or a list of tensors.
        `input tensor` as the single sequence of embeddings.
        [`input tensor`, `attention mask`] to have the additional attention
          mask.
        [`query tensor`, `key value tensor`, `attention mask`] to have
          separate input streams for the query, and key/value to the
          multi-head attention.

    Returns:
      An output tensor with the same dimensions as input/query tensor.
    """
    if isinstance(inputs, (list, tuple)):
      if len(inputs) == 2:
        input_tensor, attention_mask = inputs
        key_value = None
      elif len(inputs) == 3:
        input_tensor, key_value, attention_mask = inputs
      else:
        raise ValueError("Unexpected inputs to %s with length at %d" %
                         (self.__class__, len(inputs)))
    else:
      input_tensor, key_value, attention_mask = (inputs, None, None)

    if self._output_range:
      if self._norm_first:
        source_tensor = input_tensor[:, 0:self._output_range, :]
        input_tensor = self._attention_layer_norm(input_tensor)
        if key_value is not None:
          key_value = self._attention_layer_norm(key_value)
      target_tensor = input_tensor[:, 0:self._output_range, :]
      if attention_mask is not None:
        attention_mask = attention_mask[:, 0:self._output_range, :]
    else:
      if self._norm_first:
        source_tensor = input_tensor
        input_tensor = self._attention_layer_norm(input_tensor)
        if key_value is not None:
          key_value = self._attention_layer_norm(key_value)
      target_tensor = input_tensor

    if key_value is None:
      key_value = input_tensor
    attention_output = self._attention_layer(
        query=target_tensor, value=key_value, attention_mask=attention_mask)
    attention_output = self._attention_dropout(attention_output)
    if self._norm_first:
      attention_output = source_tensor + attention_output
    else:
      attention_output = self._attention_layer_norm(target_tensor +
                                                    attention_output)
    if self._norm_first:
      source_attention_output = attention_output
      attention_output = self._output_layer_norm(attention_output)
    inner_output = self._intermediate_dense(attention_output)
    inner_output = self._intermediate_activation_layer(inner_output)
    inner_output = self._inner_dropout_layer(inner_output)
    layer_output = self._output_dense(inner_output)
    layer_output = self._output_dropout(layer_output)

    if self._norm_first:
      return source_attention_output + layer_output

    # During mixed precision training, layer norm output is always fp32 for
    # now. Casts fp32 for the subsequent add.
    layer_output = tf.cast(layer_output, tf.float32)
    return self._output_layer_norm(layer_output + attention_output)
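To make the new block's calling conventions concrete, here is a brief usage sketch (editor-added, with made-up shapes and hyperparameters): a single encoder block applied to a batch of embeddings, first without and then with an attention mask.

import tensorflow as tf
from official.nlp.modeling import layers

block = layers.TransformerEncoderBlock(
    num_attention_heads=4,
    inner_dim=64,
    inner_activation='relu')

batch_size, seq_length, hidden_size = 2, 6, 16  # hidden_size % heads == 0
embeddings = tf.random.uniform([batch_size, seq_length, hidden_size])
attention_mask = tf.ones([batch_size, seq_length, seq_length])

# Single-tensor input: unmasked self-attention over the full sequence.
out = block(embeddings)
# [input, mask] input: masked self-attention; the output shape is unchanged.
out_masked = block([embeddings, attention_mask])
print(out.shape, out_masked.shape)  # (2, 6, 16) (2, 6, 16)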
official/nlp/modeling/layers/transformer_encoder_block_test.py
0 → 100644
View file @
002b4240
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for Keras-based transformer block layer."""

from absl.testing import parameterized
import numpy as np
import tensorflow as tf

from tensorflow.python.keras import keras_parameterized  # pylint: disable=g-direct-tensorflow-import
from official.nlp.modeling.layers.transformer_encoder_block import TransformerEncoderBlock


@keras_parameterized.run_all_keras_modes
@parameterized.named_parameters(('base', TransformerEncoderBlock))
class TransformerEncoderBlockLayerTest(keras_parameterized.TestCase):

  def tearDown(self):
    super(TransformerEncoderBlockLayerTest, self).tearDown()
    tf.keras.mixed_precision.set_global_policy('float32')
  def test_layer_creation(self, transformer_cls):
    test_layer = transformer_cls(
        num_attention_heads=10, inner_dim=2048, inner_activation='relu')
    sequence_length = 21
    width = 80
    # Create a 3-dimensional input (the first dimension is implicit).
    data_tensor = tf.keras.Input(shape=(sequence_length, width))
    output_tensor = test_layer(data_tensor)
    # The default output of a transformer layer should be the same as the
    # input.
    self.assertEqual(data_tensor.shape.as_list(),
                     output_tensor.shape.as_list())

  def test_layer_creation_with_mask(self, transformer_cls):
    test_layer = transformer_cls(
        num_attention_heads=10, inner_dim=2048, inner_activation='relu')
    sequence_length = 21
    width = 80
    # Create a 3-dimensional input (the first dimension is implicit).
    data_tensor = tf.keras.Input(shape=(sequence_length, width))
    # Create a 2-dimensional input (the first dimension is implicit).
    mask_tensor = tf.keras.Input(shape=(sequence_length, sequence_length))
    output_tensor = test_layer([data_tensor, mask_tensor])
    # The default output of a transformer layer should be the same as the
    # input.
    self.assertEqual(data_tensor.shape.as_list(),
                     output_tensor.shape.as_list())

  def test_layer_invocation(self, transformer_cls):
    test_layer = transformer_cls(
        num_attention_heads=10, inner_dim=2048, inner_activation='relu')
    sequence_length = 21
    width = 80
    # Create a 3-dimensional input (the first dimension is implicit).
    data_tensor = tf.keras.Input(shape=(sequence_length, width))
    output_tensor = test_layer(data_tensor)

    # Create a model from the test layer.
    model = tf.keras.Model(data_tensor, output_tensor)

    # Invoke the model on test data. We can't validate the output data itself
    # (the NN is too complex) but this will rule out structural runtime
    # errors.
    batch_size = 6
    input_data = 10 * np.random.random_sample(
        (batch_size, sequence_length, width))
    _ = model.predict(input_data)

  def test_layer_invocation_with_mask(self, transformer_cls):
    test_layer = transformer_cls(
        num_attention_heads=10, inner_dim=2048, inner_activation='relu')
    sequence_length = 21
    width = 80
    # Create a 3-dimensional input (the first dimension is implicit).
    data_tensor = tf.keras.Input(shape=(sequence_length, width))
    # Create a 2-dimensional input (the first dimension is implicit).
    mask_tensor = tf.keras.Input(shape=(sequence_length, sequence_length))
    output_tensor = test_layer([data_tensor, mask_tensor])

    # Create a model from the test layer.
    model = tf.keras.Model([data_tensor, mask_tensor], output_tensor)

    # Invoke the model on test data. We can't validate the output data itself
    # (the NN is too complex) but this will rule out structural runtime
    # errors.
    batch_size = 6
    input_data = 10 * np.random.random_sample(
        (batch_size, sequence_length, width))
    # The attention mask should be of shape (batch, from_seq_len, to_seq_len),
    # which here is (batch, sequence_length, sequence_length).
    mask_data = np.random.randint(
        2, size=(batch_size, sequence_length, sequence_length))
    _ = model.predict([input_data, mask_data])
  def test_layer_output_range(self, transformer_cls):
    test_layer = transformer_cls(
        num_attention_heads=10, inner_dim=2048, inner_activation='relu')
    sequence_length = 21
    width = 80

    batch_size = 6
    input_data = 10 * np.random.random_sample(
        (batch_size, sequence_length, width))
    mask_data = np.random.randint(
        2, size=(batch_size, sequence_length, sequence_length))
    output_tensor = test_layer([input_data, mask_data])

    # The layer only attends to the first token and outputs the first token
    # embedding.
    new_layer = transformer_cls(
        num_attention_heads=10,
        inner_dim=2048,
        inner_activation='relu',
        output_range=1)
    _ = new_layer([input_data, mask_data])
    new_layer.set_weights(test_layer.get_weights())
    new_output_tensor = new_layer([input_data, mask_data])
    self.assertAllClose(
        new_output_tensor, output_tensor[:, 0:1, :], atol=5e-5, rtol=0.003)

  def test_layer_output_range_without_mask(self, transformer_cls):
    test_layer = transformer_cls(
        num_attention_heads=10,
        inner_dim=2048,
        inner_activation='relu',
        norm_first=True)
    sequence_length = 21
    width = 80

    batch_size = 6
    input_data = 10 * np.random.random_sample(
        (batch_size, sequence_length, width))
    output_tensor = test_layer(input_data)

    # The layer only attends to the first token and outputs the first token
    # embedding.
    new_layer = transformer_cls(
        num_attention_heads=10,
        inner_dim=2048,
        inner_activation='relu',
        output_range=1,
        norm_first=True)
    _ = new_layer(input_data)
    new_layer.set_weights(test_layer.get_weights())
    new_output_tensor = new_layer(input_data)
    self.assertAllClose(
        new_output_tensor, output_tensor[:, 0:1, :], atol=5e-5, rtol=0.003)

  def test_layer_output_range_with_pre_norm(self, transformer_cls):
    test_layer = transformer_cls(
        num_attention_heads=10,
        inner_dim=2048,
        inner_activation='relu',
        norm_first=True)
    sequence_length = 21
    width = 80

    batch_size = 6
    input_data = 10 * np.random.random_sample(
        (batch_size, sequence_length, width))
    mask_data = np.random.randint(
        2, size=(batch_size, sequence_length, sequence_length))
    output_tensor = test_layer([input_data, mask_data])

    # The layer only attends to the first token and outputs the first token
    # embedding.
    new_layer = transformer_cls(
        num_attention_heads=10,
        inner_dim=2048,
        inner_activation='relu',
        output_range=1,
        norm_first=True)
    _ = new_layer([input_data, mask_data])
    new_layer.set_weights(test_layer.get_weights())
    new_output_tensor = new_layer([input_data, mask_data])
    self.assertAllClose(
        new_output_tensor, output_tensor[:, 0:1, :], atol=5e-5, rtol=0.003)
  def test_layer_invocation_with_float16_dtype(self, transformer_cls):
    tf.keras.mixed_precision.set_global_policy('mixed_float16')
    test_layer = transformer_cls(
        num_attention_heads=10, inner_dim=2048, inner_activation='relu')
    sequence_length = 21
    width = 80
    # Create a 3-dimensional input (the first dimension is implicit).
    data_tensor = tf.keras.Input(shape=(sequence_length, width))
    # Create a 2-dimensional input (the first dimension is implicit).
    mask_tensor = tf.keras.Input(shape=(sequence_length, sequence_length))
    output_tensor = test_layer([data_tensor, mask_tensor])

    # Create a model from the test layer.
    model = tf.keras.Model([data_tensor, mask_tensor], output_tensor)

    # Invoke the model on test data. We can't validate the output data itself
    # (the NN is too complex) but this will rule out structural runtime
    # errors.
    batch_size = 6
    input_data = (
        10 * np.random.random_sample((batch_size, sequence_length, width)))
    # The attention mask should be of shape (batch, from_seq_len, to_seq_len),
    # which here is (batch, sequence_length, sequence_length).
    mask_data = np.random.randint(
        2, size=(batch_size, sequence_length, sequence_length))
    _ = model.predict([input_data, mask_data])

  def test_transform_with_initializer(self, transformer_cls):
    test_layer = transformer_cls(
        num_attention_heads=10,
        inner_dim=2048,
        inner_activation='relu',
        kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02))
    sequence_length = 21
    width = 80
    # Create a 3-dimensional input (the first dimension is implicit).
    data_tensor = tf.keras.Input(shape=(sequence_length, width))
    output = test_layer(data_tensor)
    # The default output of a transformer layer should be the same as the
    # input.
    self.assertEqual(data_tensor.shape.as_list(), output.shape.as_list())

  def test_dynamic_layer_sequence(self, transformer_cls):
    test_layer = transformer_cls(
        num_attention_heads=10,
        inner_dim=2048,
        inner_activation='relu',
        kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02))
    # Create a 3-dimensional input (the first dimension is implicit).
    width = 30
    input_tensor = tf.keras.Input(shape=(None, width))
    output_tensor = test_layer(input_tensor)
    model = tf.keras.Model(input_tensor, output_tensor)

    input_length = 17
    input_data = np.ones((1, input_length, width))
    output_data = model.predict(input_data)

    self.assertAllEqual([1, input_length, width], output_data.shape)

  def test_separate_qkv(self, transformer_cls):
    test_layer = transformer_cls(
        num_attention_heads=2,
        inner_dim=128,
        inner_activation='relu',
        kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02))
    # Forward path.
    q_tensor = tf.zeros([2, 4, 16], dtype=tf.float32)
    kv_tensor = tf.zeros([2, 8, 16], dtype=tf.float32)
    dummy_mask = tf.zeros([2, 4, 8], dtype=tf.float32)
    inputs = [q_tensor, kv_tensor, dummy_mask]
    output = test_layer(inputs)
    self.assertEqual(output.shape, q_tensor.shape)
@keras_parameterized.run_all_keras_modes
class TransformerArgumentTest(keras_parameterized.TestCase):

  def test_use_bias_norm_first(self):
    num_attention_heads = 2
    hidden_size = 16
    encoder_block = TransformerEncoderBlock(
        num_attention_heads=num_attention_heads,
        inner_dim=32,
        inner_activation='relu',
        output_dropout=0.1,
        attention_dropout=0.1,
        use_bias=False,
        norm_first=True,
        norm_epsilon=1e-6,
        inner_dropout=0.1,
        attention_initializer=tf.keras.initializers.RandomUniform(
            minval=0., maxval=1.))
    # Forward path.
    dummy_tensor = tf.zeros([2, 4, 16], dtype=tf.float32)
    dummy_mask = tf.zeros([2, 4, 4], dtype=tf.float32)
    inputs = [dummy_tensor, dummy_mask]
    output = encoder_block(inputs)
    self.assertEqual(output.shape, (2, 4, hidden_size))

  def test_get_config(self):
    num_attention_heads = 2
    encoder_block = TransformerEncoderBlock(
        num_attention_heads=num_attention_heads,
        inner_dim=32,
        inner_activation='relu',
        output_dropout=0.1,
        attention_dropout=0.1,
        use_bias=False,
        norm_first=True,
        norm_epsilon=1e-6,
        inner_dropout=0.1,
        attention_initializer=tf.keras.initializers.RandomUniform(
            minval=0., maxval=1.))
    encoder_block_config = encoder_block.get_config()
    new_encoder_block = TransformerEncoderBlock.from_config(
        encoder_block_config)
    self.assertEqual(encoder_block_config, new_encoder_block.get_config())

  @parameterized.parameters({'attention_axes': None}, {'attention_axes': [1]},
                            {'attention_axes': [2]},
                            {'attention_axes': [1, 2]})
  def test_several_attention_axes(self, attention_axes):
    test_layer = TransformerEncoderBlock(
        inner_dim=32,
        inner_activation='relu',
        output_dropout=0.1,
        attention_dropout=0.1,
        use_bias=False,
        norm_first=True,
        norm_epsilon=1e-6,
        inner_dropout=0.1,
        num_attention_heads=10,
        attention_axes=attention_axes)
    num_rows = 21
    num_cols = 13
    width = 80
    # Create a 4-dimensional input (the first dimension is implicit).
    data_tensor = tf.keras.Input(shape=(num_rows, num_cols, width))
    output_tensor = test_layer(data_tensor)
    # The default output of a transformer layer should be the same as the
    # input.
    self.assertEqual(data_tensor.shape.as_list(),
                     output_tensor.shape.as_list())


if __name__ == '__main__':
  tf.test.main()
official/nlp/modeling/models/seq2seq_transformer.py
View file @
002b4240
...
@@ -20,7 +20,6 @@ import math
 import tensorflow as tf
 
 from official.modeling import tf_utils
-from official.nlp import keras_nlp
 from official.nlp.modeling import layers
 from official.nlp.modeling.ops import beam_search
...
@@ -79,7 +78,7 @@ class Seq2SeqTransformer(tf.keras.Model):
     self._beam_size = beam_size
     self._alpha = alpha
     self._eos_id = eos_id
-    self.embedding_lookup = keras_nlp.layers.OnDeviceEmbedding(
+    self.embedding_lookup = layers.OnDeviceEmbedding(
         vocab_size=self._vocab_size,
         embedding_width=self._embedding_width,
         initializer=tf.random_normal_initializer(
...
@@ -393,7 +392,7 @@ class TransformerEncoder(tf.keras.layers.Layer):
     self.encoder_layers = []
     for i in range(self.num_layers):
       self.encoder_layers.append(
-          keras_nlp.layers.TransformerEncoderBlock(
+          layers.TransformerEncoderBlock(
              num_attention_heads=self.num_attention_heads,
              inner_dim=self._intermediate_size,
              inner_activation=self._activation,
...
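The only substantive change in this file is the import path: the embedding and encoder layers now come from `official.nlp.modeling.layers` rather than `keras_nlp`. As a hedged illustration of the embedding layer being referenced (the vocabulary size, width, and initializer values below are arbitrary, not taken from the model config):

import tensorflow as tf
from official.nlp.modeling import layers

embedding_lookup = layers.OnDeviceEmbedding(
    vocab_size=1000,
    embedding_width=64,
    initializer=tf.random_normal_initializer(mean=0., stddev=64**-0.5))

token_ids = tf.constant([[5, 9, 2], [7, 0, 0]])
embedded = embedding_lookup(token_ids)
print(embedded.shape)  # (2, 3, 64)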
official/nlp/modeling/networks/__init__.py
View file @
002b4240
...
@@ -20,6 +20,7 @@ handled object with a standardized configuration.
 """
 from official.nlp.modeling.networks.albert_encoder import AlbertEncoder
 from official.nlp.modeling.networks.bert_encoder import BertEncoder
+from official.nlp.modeling.networks.bert_encoder import BertEncoderV2
 from official.nlp.modeling.networks.classification import Classification
 from official.nlp.modeling.networks.encoder_scaffold import EncoderScaffold
 from official.nlp.modeling.networks.funnel_transformer import FunnelTransformerEncoder
...
official/nlp/modeling/networks/albert_encoder.py
View file @
002b4240
...
@@ -18,7 +18,6 @@ import collections
 import tensorflow as tf
 
 from official.modeling import activations
-from official.nlp import keras_nlp
 from official.nlp.modeling import layers
...
@@ -98,7 +97,7 @@ class AlbertEncoder(tf.keras.Model):
     word_embeddings = embedding_layer(word_ids)
     # Always uses dynamic slicing for simplicity.
-    position_embedding_layer = keras_nlp.layers.PositionEmbedding(
+    position_embedding_layer = layers.PositionEmbedding(
         initializer=initializer,
         max_length=max_sequence_length,
         name='position_embedding')
...
@@ -133,8 +132,8 @@ class AlbertEncoder(tf.keras.Model):
         embeddings)
     data = embeddings
-    attention_mask = keras_nlp.layers.SelfAttentionMask()(data, mask)
-    shared_layer = keras_nlp.layers.TransformerEncoderBlock(
+    attention_mask = layers.SelfAttentionMask()(data, mask)
+    shared_layer = layers.TransformerEncoderBlock(
        num_attention_heads=num_attention_heads,
        inner_dim=intermediate_size,
        inner_activation=activation,
...
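The ALBERT encoder change above swaps the `keras_nlp` layers for their `official.nlp.modeling.layers` counterparts; the single `shared_layer` it builds is then reused for every transformer pass (ALBERT's cross-layer parameter sharing). A minimal sketch of that reuse pattern, with illustrative sizes and a made-up number of layers, assuming eager execution:

import tensorflow as tf
from official.nlp.modeling import layers

num_layers, batch, seq_len, hidden = 3, 2, 8, 16
data = tf.random.uniform([batch, seq_len, hidden])
mask = tf.ones([batch, seq_len], dtype=tf.int32)

attention_mask = layers.SelfAttentionMask()(data, mask)
shared_layer = layers.TransformerEncoderBlock(
    num_attention_heads=4, inner_dim=32, inner_activation='relu')

# One set of weights, applied num_layers times.
for _ in range(num_layers):
  data = shared_layer([data, attention_mask])
print(data.shape)  # (2, 8, 16)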