ModelZoo / ResNet50_tensorflow / Commits / b0ccdb11

Commit b0ccdb11, authored Sep 28, 2020 by Shixin Luo

    resolve conflict with master

Parents: e61588cd, 1611a8c5
Changes: 210
Showing 20 changed files with 2252 additions and 546 deletions (+2252, -546)
  official/nlp/keras_nlp/requirements.txt                          +1    -0
  official/nlp/keras_nlp/setup.py                                  +69   -0
  official/nlp/modeling/layers/README.md                           +22   -1
  official/nlp/modeling/layers/__init__.py                         +5    -0
  official/nlp/modeling/layers/attention.py                        +0    -279
  official/nlp/modeling/layers/attention_test.py                   +0    -33
  official/nlp/modeling/layers/gated_feedforward.py                +11   -7
  official/nlp/modeling/layers/masked_lm.py                        +2    -101
  official/nlp/modeling/layers/on_device_embedding.py              +2    -73
  official/nlp/modeling/layers/relative_attention.py               +525  -0
  official/nlp/modeling/layers/relative_attention_test.py          +191  -0
  official/nlp/modeling/layers/tn_expand_condense.py               +180  -0
  official/nlp/modeling/layers/tn_expand_condense_test.py          +176  -0
  official/nlp/modeling/layers/tn_transformer_expand_condense.py   +253  -0
  official/nlp/modeling/layers/tn_transformer_test.py              +214  -0
  official/nlp/modeling/layers/transformer.py                      +7    -7
  official/nlp/modeling/layers/transformer_scaffold.py             +28   -16
  official/nlp/modeling/layers/transformer_scaffold_test.py        +0    -24
  official/nlp/modeling/layers/transformer_test.py                 +5    -5
  official/nlp/modeling/layers/transformer_xl.py                   +561  -0
official/nlp/keras_nlp/requirements.txt (new file, mode 100644)

numpy>=1.15.4
official/nlp/keras_nlp/setup.py (new file, mode 100644)
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Setup script."""
import os

from setuptools import find_packages
from setuptools import setup

version = '0.0.1'


def _get_requirements():
  """Parses requirements.txt file."""
  install_requires_tmp = []
  dependency_links_tmp = []
  with open(
      os.path.join(os.path.dirname(__file__), './requirements.txt'), 'r') as f:
    for line in f:
      package_name = line.strip()
      # Skip empty line or comments starting with "#".
      if not package_name or package_name[0] == '#':
        continue
      if package_name.startswith('-e '):
        dependency_links_tmp.append(package_name[3:].strip())
      else:
        install_requires_tmp.append(package_name)
  return install_requires_tmp, dependency_links_tmp

install_requires, dependency_links = _get_requirements()
install_requires.append('tf-nightly')

setup(
    name='keras-nlp',
    version=version,
    description='Keras Natural Language Processing Library',
    url='https://github.com/keras-team/keras-nlp',
    author='The Keras authors',
    author_email='keras-team@google.com',
    license='Apache License 2.0',
    install_requires=install_requires,
    classifiers=[
        'Programming Language :: Python',
        'Programming Language :: Python :: 3.6',
        'Operating System :: Unix',
        'Operating System :: Microsoft :: Windows',
        'Operating System :: MacOS',
        'Intended Audience :: Science/Research',
        'Topic :: Scientific/Engineering',
        'Topic :: Software Development'
    ],
    packages=find_packages(exclude=('tests',)),
    exclude_package_data={'': ['*_test.py',],},
    dependency_links=dependency_links,
    python_requires='>=3.6',
)
official/nlp/modeling/layers/README.md

@@ -29,7 +29,7 @@ assemble new layers, networks, or models.
    described in
    ["Attention Is All You Need"](https://arxiv.org/abs/1706.03762).
-*  [TransformerDecoder Layer](transformer.py) TransformerDecoder Layer is made up
+*  [TransformerDecoder Block](transformer.py) TransformerDecoder Block is made up
    of self multi-head attention, cross multi-head attention and feedforward
    network.

@@ -63,3 +63,24 @@ assemble new layers, networks, or models.
 *  [GatedFeedforward](gated_feedforward.py) implements the gated linear layer
    feedforward as described in
    ["GLU Variants Improve Transformer"](https://arxiv.org/abs/2002.05202).
+*  [MultiHeadRelativeAttention](relative_attention.py) implements a variant
+   of multi-head attention with support for relative position encodings as
+   described in "Transformer-XL: Attentive Language Models Beyond a
+   Fixed-Length Context" (https://arxiv.org/abs/1901.02860). This also has
+   extended support for segment-based attention, a re-parameterization
+   introduced in "XLNet: Generalized Autoregressive Pretraining for Language
+   Understanding" (https://arxiv.org/abs/1906.08237).
+*  [TwoStreamRelativeAttention](relative_attention.py) implements a variant
+   of multi-head relative attention as described in "XLNet: Generalized
+   Autoregressive Pretraining for Language Understanding"
+   (https://arxiv.org/abs/1906.08237). This takes in a query and content
+   stream and applies self attention.
+*  [TransformerXL](transformer_xl.py) implements Transformer XL introduced in
+   "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context"
+   (https://arxiv.org/abs/1901.02860). This contains `TransformerXLBlock`, a
+   block containing either one or two stream relative self-attention as well
+   as subsequent feedforward networks. It also contains `TransformerXL`, which
+   contains attention biases as well as multiple `TransformerXLBlocks`.
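A minimal usage sketch of the relative-attention layer documented above. The shapes follow the docstrings and tests in this commit; the import path assumes the layer is re-exported from the layers package as shown in __init__.py below, and the sizes are hypothetical, chosen only for illustration.

import tensorflow as tf
from official.nlp.modeling import layers

batch_size, seq_length, num_heads, key_dim = 2, 8, 12, 64

attention = layers.MultiHeadRelativeAttention(
    num_heads=num_heads, key_dim=key_dim)

query = tf.random.normal((batch_size, seq_length, key_dim))
# The tests in this commit use a relative encoding of length 2 * seq_length.
relative_position_encoding = tf.random.normal(
    (batch_size, seq_length * 2, key_dim))
content_attention_bias = tf.random.normal((num_heads, key_dim))
positional_attention_bias = tf.random.normal((num_heads, key_dim))

output = attention(
    query=query,
    value=query,
    content_attention_bias=content_attention_bias,
    positional_attention_bias=positional_attention_bias,
    relative_position_encoding=relative_position_encoding)
print(output.shape)  # (2, 8, 64): same leading shape as the query.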
official/nlp/modeling/layers/__init__.py

@@ -24,8 +24,13 @@ from official.nlp.modeling.layers.mat_mul_with_margin import MatMulWithMargin
 from official.nlp.modeling.layers.multi_channel_attention import *
 from official.nlp.modeling.layers.on_device_embedding import OnDeviceEmbedding
 from official.nlp.modeling.layers.position_embedding import RelativePositionEmbedding
+from official.nlp.modeling.layers.relative_attention import MultiHeadRelativeAttention
+from official.nlp.modeling.layers.relative_attention import TwoStreamRelativeAttention
 from official.nlp.modeling.layers.rezero_transformer import ReZeroTransformer
 from official.nlp.modeling.layers.self_attention_mask import SelfAttentionMask
 from official.nlp.modeling.layers.talking_heads_attention import TalkingHeadsAttention
+from official.nlp.modeling.layers.tn_transformer_expand_condense import TNTransformerExpandCondense
 from official.nlp.modeling.layers.transformer import *
 from official.nlp.modeling.layers.transformer_scaffold import TransformerScaffold
+from official.nlp.modeling.layers.transformer_xl import TransformerXL
+from official.nlp.modeling.layers.transformer_xl import TransformerXLBlock
official/nlp/modeling/layers/attention.py

@@ -16,16 +16,11 @@
 """Keras-based attention layer."""
 # pylint: disable=g-classes-have-attributes
 import math
-import string
 import tensorflow as tf
-from official.nlp.modeling.layers import masked_softmax

 EinsumDense = tf.keras.layers.experimental.EinsumDense
 MultiHeadAttention = tf.keras.layers.MultiHeadAttention
-_CHR_IDX = string.ascii_lowercase

 @tf.keras.utils.register_keras_serializable(package="Text")

@@ -111,277 +106,3 @@ class CachedAttention(tf.keras.layers.MultiHeadAttention):
     if return_attention_scores:
       return attention_output, attention_scores, cache
     return attention_output, cache

(removed: the relative-attention helpers and the MultiHeadRelativeAttention
class below, which move to the new relative_attention.py)

def _rel_shift(x, klen=-1):
  """Performs relative shift to form the relative attention score."""
  x = tf.transpose(x, perm=[1, 2, 0, 3])
  x_size = tf.shape(x)

  x = tf.reshape(x, [x_size[1], x_size[0], x_size[2], x_size[3]])
  x = tf.slice(x, [1, 0, 0, 0], [-1, -1, -1, -1])
  x = tf.reshape(x, [x_size[0], x_size[1] - 1, x_size[2], x_size[3]])
  x = tf.slice(x, [0, 0, 0, 0], [-1, klen, -1, -1])
  x = tf.transpose(x, perm=[2, 0, 1, 3])

  return x


def _build_proj_equation(free_dims, bound_dims, output_dims):
  """Builds an einsum equation for projections inside multi-head attention."""
  input_str = ""
  kernel_str = ""
  output_str = ""
  bias_axes = ""
  letter_offset = 0
  for i in range(free_dims):
    char = _CHR_IDX[i + letter_offset]
    input_str += char
    output_str += char

  letter_offset += free_dims
  for i in range(bound_dims):
    char = _CHR_IDX[i + letter_offset]
    input_str += char
    kernel_str += char

  letter_offset += bound_dims
  for i in range(output_dims):
    char = _CHR_IDX[i + letter_offset]
    kernel_str += char
    output_str += char
    bias_axes += char
  equation = "%s,%s->%s" % (input_str, kernel_str, output_str)

  return equation, bias_axes, len(output_str)


def _get_output_shape(output_rank, known_last_dims):
  return [None] * (output_rank - len(known_last_dims)) + list(known_last_dims)


@tf.keras.utils.register_keras_serializable(package="Text")
class MultiHeadRelativeAttention(MultiHeadAttention):
  """A multi-head attention layer with relative attention + position encoding.

  This layer shares the same input/output projections as the common
  MultiHeadAttention layer.

  When it calculates attention logits, position encoding is projected to form
  relative keys. The logits are composed by shifted relative logits and content
  logits.

  **Note: This layer is currently experimental.

  Arguments:
    num_heads: The number of attention heads.
    key_dim: Size of each attention head for query and key.
    value_dim: Size of attention head for value.
    dropout: Dropout probability for attention.
    use_bias: Boolean, whether the dense layers use bias vectors/matrices.
    kernel_initializer: Initializer for dense layer kernels.
    bias_initializer: Initializer for dense layer biases.

  Call args:
    query: Query `Tensor` of shape `[B, T, dim]`.
    value: Value `Tensor` of shape `[B, S, dim]`.
    content_attention_bias: Bias `Tensor` for content based attention of shape
      `[num_heads, dim]`.
    position_attention_bias: Bias `Tensor` for position based attention of
      shape `[num_heads, dim]`.
    relative_position_encoding: Relative positional encoding `Tensor` of shape
      `[B, L, dim]`.
    state: Optional `Tensor` of shape `[B, M, E]` where M is the length of the
      state or memory. If passed, this is also attended over as in
      Transformer XL.
    key: Optional key `Tensor` of shape `[B, S, dim]`. If not given, will use
      `value` for both `key` and `value`, which is the most common case.
    attention_mask: a boolean mask of shape `[B, T, S]`, that prevents
      attention to certain positions.
  """

  def _build_from_signature(self, query, value, key=None):
    super(MultiHeadRelativeAttention, self)._build_from_signature(
        query=query,
        value=value,
        key=key)
    if hasattr(query, "shape"):
      query_shape = tf.TensorShape(query.shape)
    else:
      query_shape = query
    if hasattr(value, "shape"):
      value_shape = tf.TensorShape(value.shape)
    else:
      value_shape = value
    if key is None:
      key_shape = value_shape
    elif hasattr(key, "shape"):
      key_shape = tf.TensorShape(key.shape)
    else:
      key_shape = key

    common_kwargs = dict(
        kernel_initializer=self._kernel_initializer,
        bias_initializer=self._bias_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer,
        activity_regularizer=self._activity_regularizer,
        kernel_constraint=self._kernel_constraint,
        bias_constraint=self._bias_constraint)

    with tf.init_scope():
      free_dims = query_shape.rank - 1
      einsum_equation, bias_axes, output_rank = _build_proj_equation(
          key_shape.rank - 1, bound_dims=1, output_dims=2)
      self._encoding_dense = EinsumDense(
          einsum_equation,
          output_shape=_get_output_shape(output_rank - 1,
                                         [self._num_heads, self._key_dim]),
          bias_axes=bias_axes if self._use_bias else None,
          name="encoding",
          **common_kwargs)

      output_shape = [query_shape[-1]]
      einsum_equation, bias_axes, output_rank = _build_proj_equation(
          free_dims, bound_dims=2, output_dims=len(output_shape))
      # TODO(allencwang) - replace all einsums with programmatic equations.
      einsum_equation = "abcd,ecd->abe"
      self._output_dense = EinsumDense(
          einsum_equation,
          output_shape=_get_output_shape(output_rank - 1, output_shape),
          bias_axes=bias_axes if self._use_bias else None,
          name="attention_output",
          **common_kwargs)

  def _build_attention(self, rank):
    self._masked_softmax = masked_softmax.MaskedSoftmax(
        mask_expansion_axes=[1], normalization_axes=[2])
    self._dropout_layer = tf.keras.layers.Dropout(rate=self._dropout)

  def compute_attention(self,
                        query,
                        key,
                        value,
                        position,
                        content_attention_bias,
                        positional_attention_bias,
                        attention_mask=None):
    """Computes the attention.

    This function defines the computation inside `call` with projected
    multihead Q, K, V, R inputs.

    Args:
      query: Projected query `Tensor` of shape `[B, T, N, key_dim]`.
      key: Projected key `Tensor` of shape `[B, S + M, N, key_dim]`.
      value: Projected value `Tensor` of shape `[B, S + M, N, key_dim]`.
      position: Projected position `Tensor` of shape `[B, L, N, key_dim]`.
      content_attention_bias: Trainable bias parameter added to the query head
        when calculating the content-based attention score.
      positional_attention_bias: Trainable bias parameter added to the query
        head when calculating the position-based attention score.
      attention_mask: (default None) Optional mask that is added to attention
        logits. If state is not None, the mask source sequence dimension
        should extend M.

    Returns:
      attention_output: Multi-headed output of attention computation of shape
        `[B, T, N, key_dim]`.
    """
    content_attention = tf.einsum("bind,bjnd->bijn",
                                  query + content_attention_bias,
                                  key)
    positional_attention = tf.einsum("bind,bjnd->bijn",
                                     query + positional_attention_bias,
                                     position)
    positional_attention = _rel_shift(
        positional_attention, klen=tf.shape(content_attention)[2])
    attention_scores = tf.multiply((content_attention + positional_attention),
                                   1.0 / math.sqrt(float(self._key_dim)))

    attention_scores = self._masked_softmax(attention_scores, attention_mask)

    attention_output = self._dropout_layer(attention_scores)

    attention_output = tf.einsum("bijn,bjnd->bind", attention_output, value)
    return attention_output

  def call(self,
           query,
           value,
           content_attention_bias,
           positional_attention_bias,
           key=None,
           relative_position_encoding=None,
           state=None,
           attention_mask=None):
    """Compute multi-head relative attention over inputs.

    Size glossary:
      * Number of heads (H): the number of attention heads.
      * Value size (V): the size of each value embedding per head.
      * Key size (K): the size of each key embedding per head. Equally, the
        size of each query embedding per head. Typically K <= V.
      * Batch dimensions (B).
      * Query (target) attention axes shape (T).
      * Value (source) attention axes shape (S), the rank must match the
        target.
      * Encoding length (L): The relative positional encoding length.

    Args:
      query: attention input.
      value: attention input.
      content_attention_bias: A trainable bias parameter added to the query
        head when calculating the content-based attention score.
      positional_attention_bias: A trainable bias parameter added to the query
        head when calculating the position-based attention score.
      key: attention input.
      relative_position_encoding: relative positional encoding for key and
        value.
      state: (default None) optional state. If passed, this is also attended
        over as in TransformerXL.
      attention_mask: (default None) Optional mask that is added to attention
        logits. If state is not None, the mask source sequence dimension
        should extend M.

    Returns:
      attention_output: The result of the computation, of shape [B, T, E],
        where `T` is for target sequence shapes and `E` is the query input
        last dimension if `output_shape` is `None`. Otherwise, the multi-head
        outputs are projected to the shape specified by `output_shape`.
    """
    if not self._built_from_signature:
      self._build_from_signature(query, value, key=key)
    if key is None:
      key = value
    if state is not None and state.shape.ndims > 1:
      value = tf.concat([state, value], 1)
      key = tf.concat([state, key], 1)

    # `query` = [B, T, N ,H]
    query = self._query_dense(query)

    # `key` = [B, S + M, N, H]
    key = self._key_dense(key)

    # `value` = [B, S + M, N, H]
    value = self._value_dense(value)

    # `position` = [B, L, N, H]
    position = self._encoding_dense(relative_position_encoding)

    attention_output = self.compute_attention(
        query=query,
        key=key,
        value=value,
        position=position,
        content_attention_bias=content_attention_bias,
        positional_attention_bias=positional_attention_bias,
        attention_mask=attention_mask)

    attention_output = self._output_dense(attention_output)

    return attention_output
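The relative-shift trick in `_rel_shift` above is easier to see on concrete shapes. The sketch below simply reuses the function as written in the removed hunk and applies it to a small random tensor; the shapes are hypothetical, chosen only for illustration.

import tensorflow as tf

def _rel_shift(x, klen=-1):
  """Performs relative shift to form the relative attention score."""
  x = tf.transpose(x, perm=[1, 2, 0, 3])
  x_size = tf.shape(x)
  x = tf.reshape(x, [x_size[1], x_size[0], x_size[2], x_size[3]])
  x = tf.slice(x, [1, 0, 0, 0], [-1, -1, -1, -1])
  x = tf.reshape(x, [x_size[0], x_size[1] - 1, x_size[2], x_size[3]])
  x = tf.slice(x, [0, 0, 0, 0], [-1, klen, -1, -1])
  x = tf.transpose(x, perm=[2, 0, 1, 3])
  return x

# Positional attention logits of shape [batch, target_len, encoding_len, heads].
batch, target_len, encoding_len, heads = 1, 2, 5, 1
logits = tf.random.normal((batch, target_len, encoding_len, heads))

# After the shift, the encoding axis is trimmed to the key length `klen`,
# aligning each query position with its relative offsets.
shifted = _rel_shift(logits, klen=3)
print(shifted.shape)  # (1, 2, 3, 1)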
official/nlp/modeling/layers/attention_test.py

@@ -92,38 +92,5 @@ class CachedAttentionTest(keras_parameterized.TestCase):
     self.assertEqual(cache["value"].shape, (3, 4, 2, 2))

(removed: the MultiHeadRelativeAttentionTest class below, whose coverage moves
to the new relative_attention_test.py)

@keras_parameterized.run_all_keras_modes
class MultiHeadRelativeAttentionTest(keras_parameterized.TestCase):

  def test_attention_scores(self):
    num_heads = 12
    key_dim = 64
    value_dim = 32
    seq_length = 8
    batch_size = 2
    test_layer = attention.MultiHeadRelativeAttention(
        num_heads=num_heads, key_dim=key_dim, value_dim=value_dim)
    query = tf.random.normal(shape=(batch_size, seq_length, key_dim))
    value = query
    relative_position_encoding = tf.random.normal(
        shape=(batch_size, seq_length * 2, key_dim))
    content_attention_bias = tf.random.normal(shape=(num_heads, key_dim))
    positional_attention_bias = tf.random.normal(shape=(num_heads, key_dim))
    output = test_layer(
        query=query,
        value=value,
        content_attention_bias=content_attention_bias,
        positional_attention_bias=positional_attention_bias,
        relative_position_encoding=relative_position_encoding,
        state=None,
        attention_mask=None)
    self.assertEqual(output.shape, [batch_size, seq_length, key_dim])

(unchanged context at the end of the hunk:)

 if __name__ == "__main__":
   tf.test.main()
official/nlp/modeling/layers/gated_feedforward.py

@@ -59,6 +59,7 @@ class GatedFeedforward(tf.keras.layers.Layer):
                intermediate_activation,
                dropout,
                use_gate=True,
+               apply_output_layer_norm=True,
                num_blocks=1,
                dropout_position="before_residual",
                kernel_initializer="glorot_uniform",

@@ -75,6 +76,7 @@ class GatedFeedforward(tf.keras.layers.Layer):
     self._dropout = dropout
     self._use_gate = use_gate
     self._num_blocks = num_blocks
+    self._apply_output_layer_norm = apply_output_layer_norm
     self._dropout_position = dropout_position
     if self._dropout_position not in ("before_residual", "after_residual"):
       raise ValueError(

@@ -140,12 +142,13 @@ class GatedFeedforward(tf.keras.layers.Layer):
               **common_kwargs))
       self._output_dropout.append(
           tf.keras.layers.Dropout(rate=self._dropout))
       # Use float32 in layernorm for numeric stability.
-      self._output_layer_norm.append(
-          tf.keras.layers.LayerNormalization(
-              name="output_layer_norm_%d" % i,
-              axis=-1,
-              epsilon=1e-12,
-              dtype=tf.float32))
+      if self._apply_output_layer_norm:
+        self._output_layer_norm.append(
+            tf.keras.layers.LayerNormalization(
+                name="output_layer_norm_%d" % i,
+                axis=-1,
+                epsilon=1e-12,
+                dtype=tf.float32))

   def get_config(self):
     config = {

@@ -199,7 +202,8 @@ class GatedFeedforward(tf.keras.layers.Layer):
       # add.
       if layer_input.dtype == tf.float32:
         layer_output = tf.cast(layer_output, tf.float32)
-      layer_output = self._output_layer_norm[i](layer_output + layer_input)
+      if self._apply_output_layer_norm:
+        layer_output = self._output_layer_norm[i](layer_output + layer_input)
       if self._dropout_position == "after_residual":
         layer_output = self._output_dropout[i](layer_output)
official/nlp/modeling/layers/masked_lm.py

@@ -14,106 +14,7 @@
 # ==============================================================================
 """Masked language model network."""
 # pylint: disable=g-classes-have-attributes
 import tensorflow as tf
-from official.modeling import tf_utils
+from official.nlp import keras_nlp

+MaskedLM = keras_nlp.layers.MaskedLM

(removed: the in-place MaskedLM implementation below; the class is now
provided by keras_nlp and re-exported here)

@tf.keras.utils.register_keras_serializable(package='Text')
class MaskedLM(tf.keras.layers.Layer):
  """Masked language model network head for BERT modeling.

  This network implements a masked language model based on the provided
  network. It assumes that the network being passed has a
  "get_embedding_table()" method.

  Arguments:
    embedding_table: The embedding table of the targets.
    activation: The activation, if any, for the dense layer.
    initializer: The initializer for the dense layer. Defaults to a Glorot
      uniform initializer.
    output: The output style for this network. Can be either 'logits' or
      'predictions'.
  """

  def __init__(self,
               embedding_table,
               activation=None,
               initializer='glorot_uniform',
               output='logits',
               name='cls/predictions',
               **kwargs):
    super(MaskedLM, self).__init__(name=name, **kwargs)
    self.embedding_table = embedding_table
    self.activation = activation
    self.initializer = tf.keras.initializers.get(initializer)

    if output not in ('predictions', 'logits'):
      raise ValueError(
          ('Unknown `output` value "%s". `output` can be either "logits" or '
           '"predictions"') % output)
    self._output_type = output

  def build(self, input_shape):
    self._vocab_size, hidden_size = self.embedding_table.shape
    self.dense = tf.keras.layers.Dense(
        hidden_size,
        activation=self.activation,
        kernel_initializer=self.initializer,
        name='transform/dense')
    self.layer_norm = tf.keras.layers.LayerNormalization(
        axis=-1, epsilon=1e-12, name='transform/LayerNorm')
    self.bias = self.add_weight(
        'output_bias/bias',
        shape=(self._vocab_size,),
        initializer='zeros',
        trainable=True)

    super(MaskedLM, self).build(input_shape)

  def call(self, sequence_data, masked_positions):
    masked_lm_input = self._gather_indexes(sequence_data, masked_positions)
    lm_data = self.dense(masked_lm_input)
    lm_data = self.layer_norm(lm_data)
    lm_data = tf.matmul(lm_data, self.embedding_table, transpose_b=True)
    logits = tf.nn.bias_add(lm_data, self.bias)
    masked_positions_shape = tf_utils.get_shape_list(
        masked_positions, name='masked_positions_tensor')
    logits = tf.reshape(logits,
                        [-1, masked_positions_shape[1], self._vocab_size])
    if self._output_type == 'logits':
      return logits
    return tf.nn.log_softmax(logits)

  def get_config(self):
    raise NotImplementedError('MaskedLM cannot be directly serialized because '
                              'it has variable sharing logic.')

  def _gather_indexes(self, sequence_tensor, positions):
    """Gathers the vectors at the specific positions.

    Args:
      sequence_tensor: Sequence output of `BertModel` layer of shape
        (`batch_size`, `seq_length`, num_hidden) where num_hidden is number of
        hidden units of `BertModel` layer.
      positions: Positions ids of tokens in sequence to mask for pretraining
        of with dimension (batch_size, num_predictions) where
        `num_predictions` is maximum number of tokens to mask out and predict
        per each sequence.

    Returns:
      Masked out sequence tensor of shape (batch_size * num_predictions,
      num_hidden).
    """
    sequence_shape = tf_utils.get_shape_list(
        sequence_tensor, name='sequence_output_tensor')
    batch_size, seq_length, width = sequence_shape

    flat_offsets = tf.reshape(
        tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1])
    flat_positions = tf.reshape(positions + flat_offsets, [-1])
    flat_sequence_tensor = tf.reshape(sequence_tensor,
                                      [batch_size * seq_length, width])
    output_tensor = tf.gather(flat_sequence_tensor, flat_positions)

    return output_tensor
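The index-flattening trick in `_gather_indexes` above can be reproduced with plain TensorFlow ops. A small sketch with hypothetical sizes, without the `tf_utils` shape helper:

import tensorflow as tf

batch_size, seq_length, hidden = 2, 4, 3
sequence_tensor = tf.reshape(
    tf.range(batch_size * seq_length * hidden, dtype=tf.float32),
    [batch_size, seq_length, hidden])
# Two masked positions per example.
positions = tf.constant([[0, 2], [1, 3]], dtype=tf.int32)

# Offset each example's positions by its row start in the flattened batch.
flat_offsets = tf.reshape(
    tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1])
flat_positions = tf.reshape(positions + flat_offsets, [-1])   # [0, 2, 5, 7]
flat_sequence_tensor = tf.reshape(sequence_tensor,
                                  [batch_size * seq_length, hidden])
gathered = tf.gather(flat_sequence_tensor, flat_positions)
print(gathered.shape)  # (4, 3): batch_size * num_predictions rows of width hidden.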
official/nlp/modeling/layers/on_device_embedding.py

@@ -15,78 +15,7 @@
 """Keras-based one-hot embedding layer."""
 # pylint: disable=g-classes-have-attributes
 import tensorflow as tf
+from official.nlp import keras_nlp

+OnDeviceEmbedding = keras_nlp.layers.OnDeviceEmbedding

(removed: the in-place OnDeviceEmbedding implementation below; the class is
now provided by keras_nlp and re-exported here)

@tf.keras.utils.register_keras_serializable(package="Text")
class OnDeviceEmbedding(tf.keras.layers.Layer):
  """Performs an embedding lookup suitable for accelerator devices.

  This layer uses either tf.gather or tf.one_hot to translate integer indices
  to float embeddings.

  Arguments:
    vocab_size: Number of elements in the vocabulary.
    embedding_width: Output size of the embedding layer.
    initializer: The initializer to use for the embedding weights. Defaults to
      "glorot_uniform".
    use_one_hot: Whether to use tf.one_hot over tf.gather for the embedding
      lookup. Defaults to False (that is, using tf.gather). Setting this
      option to True may improve performance, especially on small vocabulary
      sizes, but will generally require more memory.
    use_scale: Whether to scale the output embeddings. Defaults to False (that
      is, not to scale). Setting this option to True will let values in output
      embeddings multiplied by self._embedding_width ** 0.5.
  """

  def __init__(self,
               vocab_size,
               embedding_width,
               initializer="glorot_uniform",
               use_one_hot=False,
               use_scale=False,
               **kwargs):

    super(OnDeviceEmbedding, self).__init__(**kwargs)
    self._vocab_size = vocab_size
    self._embedding_width = embedding_width
    self._initializer = initializer
    self._use_one_hot = use_one_hot
    self._use_scale = use_scale

  def get_config(self):
    config = {
        "vocab_size": self._vocab_size,
        "embedding_width": self._embedding_width,
        "initializer": self._initializer,
        "use_one_hot": self._use_one_hot,
        "use_scale": self._use_scale,
    }
    base_config = super(OnDeviceEmbedding, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def build(self, input_shape):
    self.embeddings = self.add_weight(
        "embeddings",
        shape=[self._vocab_size, self._embedding_width],
        initializer=self._initializer,
        dtype=tf.float32)

    super(OnDeviceEmbedding, self).build(input_shape)

  def call(self, inputs):
    flat_inputs = tf.reshape(inputs, [-1])
    if self._use_one_hot:
      one_hot_data = tf.one_hot(
          flat_inputs, depth=self._vocab_size, dtype=self.embeddings.dtype)
      embeddings = tf.matmul(one_hot_data, self.embeddings)
    else:
      embeddings = tf.gather(self.embeddings, flat_inputs)
    embeddings = tf.reshape(
        embeddings,
        # Work around b/142213824: prefer concat to shape over a Python list.
        tf.concat([tf.shape(inputs), [self._embedding_width]], axis=0))
    embeddings.set_shape(inputs.shape.as_list() + [self._embedding_width])
    if self._use_scale:
      embeddings *= self._embedding_width**0.5
    return embeddings
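A short sketch of the two lookup paths the removed OnDeviceEmbedding docstring describes (tf.gather vs. one-hot matmul); the sizes are hypothetical, and the two paths yield the same rows of the embedding table. The docstring notes the one-hot path may be faster for small vocabularies at the cost of memory.

import tensorflow as tf

vocab_size, embedding_width = 16, 8
table = tf.random.normal([vocab_size, embedding_width])
ids = tf.constant([[1, 3, 5], [2, 4, 6]])          # [batch, seq_length]
flat_ids = tf.reshape(ids, [-1])

# Path 1: tf.gather (the default, use_one_hot=False).
gathered = tf.gather(table, flat_ids)

# Path 2: one-hot matmul (use_one_hot=True).
one_hot = tf.one_hot(flat_ids, depth=vocab_size, dtype=table.dtype)
matmul = tf.matmul(one_hot, table)

tf.debugging.assert_near(gathered, matmul)
out = tf.reshape(gathered, tf.concat([tf.shape(ids), [embedding_width]], axis=0))
print(out.shape)  # (2, 3, 8)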
official/nlp/modeling/layers/relative_attention.py (new file, mode 100644)

(This diff is collapsed in the commit view; its contents are not shown here.)
official/nlp/modeling/layers/relative_attention_test.py (new file, mode 100644)
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for the attention layer."""
import numpy as np
import tensorflow as tf

from tensorflow.python.distribute import combinations
from tensorflow.python.keras import keras_parameterized  # pylint: disable=g-direct-tensorflow-import
from official.nlp.modeling.layers import relative_attention


def _create_mock_attention_data(num_heads,
                                key_dim,
                                value_dim,
                                seq_length,
                                batch_size,
                                memory_length=0,
                                num_predictions=2,
                                two_stream=False,
                                include_state=False,
                                include_mask=False,
                                include_segment=False):
  """Creates mock testing data.

  Args:
    num_heads: `int`, Number of attention heads.
    key_dim: `int`, Size of query head.
    value_dim: `int`, Size of key, value dim.
    seq_length: `int`, Sequence length of the input.
    batch_size: `int`, the batch size.
    memory_length: optional `int`, the length of the state. Defaults to 0.
    num_predictions: `int`, the number of predictions used in two stream
      attention.
    two_stream: `bool`, whether or not to generate two stream data.
    include_state: optional `bool`, whether or not to include state data.
    include_mask: optional `bool`, whether or not to include mask data.
    include_segment: optional `bool`, whether or not to include segment data.

  Returns:
    A dictionary with `str` as keys and `Tensor` as values.
  """
  query_shape = (batch_size, seq_length, key_dim)
  value_shape = (batch_size, seq_length, value_dim)
  encoding_shape = (batch_size, seq_length * 2, key_dim)
  attention_bias_shape = (num_heads, key_dim)

  data = dict(
      relative_position_encoding=tf.random.normal(shape=encoding_shape),
      content_attention_bias=tf.random.normal(shape=attention_bias_shape),
      positional_attention_bias=tf.random.normal(shape=attention_bias_shape))

  if two_stream:
    query_stream_shape = (batch_size, num_predictions, key_dim)
    target_mapping_shape = (batch_size, num_predictions, seq_length)
    stream_data = dict(
        content_stream=tf.random.normal(shape=query_shape),
        query_stream=tf.random.normal(shape=query_stream_shape),
        target_mapping=tf.random.normal(shape=target_mapping_shape))
  else:
    stream_data = dict(
        query=tf.random.normal(shape=query_shape),
        value=tf.random.normal(shape=value_shape),
        key=tf.random.normal(shape=value_shape))
  data.update(stream_data)

  if include_state:
    total_seq_length = seq_length + memory_length
    state_data = dict(
        state=tf.random.normal(shape=(batch_size, memory_length, value_dim)))
    data.update(state_data)
  else:
    total_seq_length = seq_length

  if include_mask:
    mask_shape = (batch_size, num_heads, seq_length, total_seq_length)
    mask_data = np.random.randint(2, size=mask_shape).astype("float32")
    if two_stream:
      mask_data = dict(
          content_attention_mask=mask_data, query_attention_mask=mask_data)
    else:
      mask_data = dict(attention_mask=mask_data)
    data.update(mask_data)

  if include_segment:
    segment_encoding_shape = (2, num_heads, key_dim)
    segment_matrix = np.random.randint(
        2, size=(batch_size, seq_length, total_seq_length))
    segment_matrix = tf.math.equal(segment_matrix, 1)
    segment_data = dict(
        segment_attention_bias=tf.random.normal(shape=attention_bias_shape),
        segment_encoding=tf.random.normal(shape=segment_encoding_shape),
        segment_matrix=segment_matrix)
    data.update(segment_data)

  return data


@keras_parameterized.run_all_keras_modes
class MultiHeadRelativeAttentionTest(keras_parameterized.TestCase):

  @combinations.generate(
      combinations.combine(
          value_dim=[32, 64],
          memory_length=[0, 4],
          state=[True, False],
          mask=[True, False],
          segment=[True, False]))
  def test_attention_scores(self, value_dim, memory_length, state, mask,
                            segment):
    """Tests combinations of attention score calculations."""
    batch_size, num_heads, key_dim, seq_length = 2, 12, 64, 8
    test_layer = relative_attention.MultiHeadRelativeAttention(
        num_heads=num_heads, key_dim=key_dim, value_dim=value_dim)
    data = _create_mock_attention_data(
        num_heads=num_heads,
        key_dim=key_dim,
        value_dim=value_dim,
        seq_length=seq_length,
        memory_length=memory_length,
        two_stream=False,
        batch_size=batch_size,
        include_state=state,
        include_mask=mask,
        include_segment=segment)
    output = test_layer(**data)
    self.assertEqual(output.shape, [batch_size, seq_length, key_dim])


@keras_parameterized.run_all_keras_modes
class TwoStreamRelativeAttentionTest(keras_parameterized.TestCase):

  @combinations.generate(
      combinations.combine(
          num_predictions=[2, 10],
          memory_length=[0, 4],
          state=[True, False],
          mask=[True, False],
          segment=[True, False]))
  def test_attention_scores(self, num_predictions, memory_length, state, mask,
                            segment):
    """Tests combinations of attention score calculations."""
    batch_size, num_heads, key_dim, seq_length = 2, 12, 64, 8
    test_layer = relative_attention.TwoStreamRelativeAttention(
        num_heads=num_heads, key_dim=key_dim, value_dim=key_dim)
    data = _create_mock_attention_data(
        num_heads=num_heads,
        key_dim=key_dim,
        value_dim=key_dim,
        seq_length=seq_length,
        memory_length=memory_length,
        num_predictions=num_predictions,
        two_stream=True,
        batch_size=batch_size,
        include_state=state,
        include_mask=mask,
        include_segment=segment)
    content_output, query_output, = test_layer(**data)
    self.assertEqual(content_output.shape, [batch_size, seq_length, key_dim])
    self.assertEqual(query_output.shape,
                     [batch_size, num_predictions, key_dim])


if __name__ == "__main__":
  np.random.seed(0)
  tf.random.set_seed(0)
  tf.test.main()
official/nlp/modeling/layers/tn_expand_condense.py (new file, mode 100644)
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""ExpandCondense tensor network layer used in TN-BERT."""
# pylint: disable=g-classes-have-attributes
from typing import List, Optional, Text, Any, Dict
import tensorflow as tf

Layer = tf.keras.layers.Layer
activations = tf.keras.activations
initializers = tf.keras.initializers


@tf.keras.utils.register_keras_serializable(package='Text')
class TNExpandCondense(Layer):
  """A TPU-optimized TensorNetwork layer.

  Designed for use in models that currently use Dense layers to achieve
  up projection followed by down projection.

  This layer is a TPU-optimized combination of 3 operations:
  Expand, Apply Activation, and Condense. The layer projects up from
  `input_shape[-1]` to `input_shape[-1] * proj_multiplier`, applies
  `self.activation`, and then condenses back to `input_shape[-1]`.

  Note the input shape and output shape will be identical.

  Arguments:
    proj_multiplier: Positive integer, multiple of input_shape[-1] to project
      up to. Must be one of [2, 4, 6, 8].
    use_bias: Boolean, whether the layer uses a bias vector.
    activation: Activation function to use between Expand and Condense. If you
      don't specify anything, no activation is applied
      (ie. "linear" activation: `a(x) = x`).
    kernel_initializer: Initializer for the weight matrices.
    bias_initializer: Initializer for the bias vector.

  Input shape:
    N-D tensor with shape: `(batch_size, ..., input_shape[-1])`.

  Output shape:
    N-D tensor with shape: `(batch_size, ..., input_shape[-1])`.
  """

  def __init__(self,
               proj_multiplier: int,
               use_bias: Optional[bool] = True,
               activation: Optional[Text] = 'relu',
               kernel_initializer: Optional[Text] = 'glorot_uniform',
               bias_initializer: Optional[Text] = 'zeros',
               **kwargs) -> None:

    # Allow specification of input_dim instead of input_shape,
    # for compatability with Keras layers that support this
    if 'input_shape' not in kwargs and 'input_dim' in kwargs:
      kwargs['input_shape'] = (kwargs.pop('input_dim'),)

    super(TNExpandCondense, self).__init__(**kwargs)

    assert proj_multiplier in [
        2, 4, 6, 8, 10, 12
    ], 'proj_multiplier needs to be one of [2, 4, 6, 8, 10, 12]'
    self.proj_multiplier = proj_multiplier

    self.use_bias = use_bias
    self.activation = activations.get(activation)
    self.kernel_initializer = initializers.get(kernel_initializer)
    self.bias_initializer = initializers.get(bias_initializer)

  def build(self, input_shape: List[int]) -> None:
    # Disable the attribute-defined-outside-init violations in this function
    # pylint: disable=attribute-defined-outside-init
    if input_shape[-1] is None:
      raise ValueError('The last dimension of the inputs to `TNExpandCondense` '
                       'should be defined. Found `None`.')

    super(TNExpandCondense, self).build(input_shape)

    self.proj_size = self.proj_multiplier * input_shape[-1]

    assert (self.proj_size // input_shape[-1]) * input_shape[-1] == self.proj_size, (
        f'{self.proj_size} / {input_shape[-1]} must be round')
    assert (input_shape[-1] // 128) * 128 == input_shape[-1], (
        f'{input_shape[-1]} / 128 must be round')

    self.w1 = self.add_weight(
        name='w1',
        shape=(input_shape[-1], input_shape[-1]),
        trainable=True,
        initializer=self.kernel_initializer)
    self.w2 = self.add_weight(
        name='w2',
        shape=(128, (128 * (self.proj_size // input_shape[-1]))),
        trainable=True,
        initializer=self.kernel_initializer)
    self.w3 = self.add_weight(
        name='w3',
        shape=(128 * (self.proj_size // input_shape[-1]), 128),
        trainable=True,
        initializer=self.kernel_initializer)
    self.w4 = self.add_weight(
        name='w4',
        shape=(input_shape[-1] // 128, 128, input_shape[-1]),
        trainable=True,
        initializer=self.kernel_initializer)

    if self.use_bias:
      self.bias = self.add_weight(
          name='b',
          shape=(input_shape[-1] // 128, 1,
                 128 * (self.proj_size // input_shape[-1])),
          trainable=True,
          initializer=self.bias_initializer)
    else:
      self.bias = None

  def call(self, inputs: tf.Tensor, **kwargs):
    orig_shape = tf.shape(inputs)
    input_dim = inputs.shape[-1]
    tmp = tf.reshape(inputs, (-1, input_dim))
    # Shape is (BatchSeq, input_dim)

    # Expansion network
    tmp = tf.einsum('ab,Qb->aQ', self.w1, tmp)
    # Note: Letter Q will always represent the BatchSeq axis.
    tmp = tf.reshape(tmp, (input_dim // 128, 128, -1))
    tmp = tf.einsum('abQ,bd->aQd', tmp, self.w2)

    # Apply activation and then Condense
    tmp = self.activation(tmp + self.bias)

    tmp = tf.einsum('aQd,db->aQb', tmp, self.w3)
    tmp = tf.einsum('aQb,abd->Qd', tmp, self.w4)

    out = tf.reshape(tmp, orig_shape)

    return out

  def compute_output_shape(self, input_shape: List[int]) -> List[int]:
    return input_shape

  def get_config(self) -> Dict[Any, Any]:
    """Returns the config of the layer.

    The same layer can be reinstantiated later
    (without its trained weights) from this configuration.

    Returns:
      Python dictionary containing the configuration of the layer.
    """
    config = {}

    # Include the layer-specific arguments
    args = ['proj_multiplier', 'use_bias']
    for arg in args:
      config[arg] = getattr(self, arg)

    # Serialize the activation
    config['activation'] = activations.serialize(getattr(self, 'activation'))

    # Serialize the initializers
    decomp_initializers = ['kernel_initializer', 'bias_initializer']
    for initializer_arg in decomp_initializers:
      config[initializer_arg] = initializers.serialize(
          getattr(self, initializer_arg))

    # Get base config
    base_config = super(TNExpandCondense, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))
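A minimal usage sketch of TNExpandCondense with hypothetical sizes. Per the asserts in `build`, the input width must be a multiple of 128, and the layer's output shape matches its input shape.

import tensorflow as tf
from official.nlp.modeling.layers.tn_expand_condense import TNExpandCondense

layer = TNExpandCondense(proj_multiplier=4, use_bias=True, activation='relu')

# [batch, seq_length, width]; width = 768 is a multiple of 128.
x = tf.random.normal([2, 16, 768])
y = layer(x)
print(y.shape)  # (2, 16, 768): expands to 4 * 768 internally, condenses back.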
official/nlp/modeling/layers/tn_expand_condense_test.py (new file, mode 100644)
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for ExpandCondense tensor network layer."""
import os

from absl.testing import parameterized
import numpy as np
import tensorflow as tf

# pylint: disable=g-direct-tensorflow-import
from tensorflow.python.keras.testing_utils import layer_test
from official.nlp.modeling.layers.tn_expand_condense import TNExpandCondense


class TNLayerTest(tf.test.TestCase, parameterized.TestCase):
  """Unit tests for ExpandCondense TN layer."""

  def setUp(self):
    super(TNLayerTest, self).setUp()
    self.labels = np.concatenate((np.ones((50, 1)), np.zeros((50, 1))), axis=0)

  def _build_model(self, data, proj_multiple=2):
    model = tf.keras.models.Sequential()
    model.add(
        TNExpandCondense(
            proj_multiplier=proj_multiple,
            use_bias=True,
            activation='relu',
            input_shape=(data.shape[-1],)))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
    return model

  @parameterized.parameters((768, 6), (1024, 2))
  def test_keras_layer(self, input_dim, proj_multiple):
    data = np.random.normal(size=(100, input_dim))
    data = data.astype(np.float32)
    layer_test(
        TNExpandCondense,
        kwargs={
            'proj_multiplier': proj_multiple,
            'input_shape': data.shape
        },
        input_shape=data.shape,
        input_data=data,
        expected_output_shape=(None, data.shape[-1]),
        expected_output_dtype=data.dtype)

  @parameterized.parameters((768, 6), (1024, 2))
  def test_train(self, input_dim, proj_multiple):
    data = np.random.randint(10, size=(100, input_dim))
    model = self._build_model(data, proj_multiple)
    tf.random.set_seed(0)
    model.compile(
        optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Train the model for 5 epochs
    history = model.fit(data, self.labels, epochs=5, batch_size=32)

    # Check that loss decreases and accuracy increases
    self.assertGreater(history.history['loss'][0], history.history['loss'][-1])
    self.assertLess(history.history['accuracy'][0],
                    history.history['accuracy'][-1])

  @parameterized.parameters((768, 6), (1024, 2))
  def test_weights_change(self, input_dim, proj_multiple):
    tf.random.set_seed(0)
    data = np.random.randint(10, size=(100, input_dim))
    model = self._build_model(data, proj_multiple)
    model.compile(
        optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    before = model.get_weights()
    model.fit(data, self.labels, epochs=5, batch_size=32)
    after = model.get_weights()
    # Make sure every layer's weights changed
    for i, _ in enumerate(before):
      self.assertTrue((after[i] != before[i]).any())

  @parameterized.parameters((768, 6), (1024, 2))
  def test_output_shape(self, input_dim, proj_multiple):
    data = np.random.randint(10, size=(100, input_dim))
    model = self._build_model(data, proj_multiple)
    input_shape = data.shape

    actual_output_shape = model(data).shape
    expected_output_shape = model.compute_output_shape(input_shape)

    self.assertEqual(expected_output_shape, actual_output_shape)

  @parameterized.parameters((768, 6), (1024, 2))
  def test_expandcondense_num_parameters(self, input_dim, proj_multiple):
    data = np.random.randint(10, size=(100, input_dim))
    proj_size = proj_multiple * data.shape[-1]
    model = tf.keras.models.Sequential()
    model.add(
        TNExpandCondense(
            proj_multiplier=proj_multiple,
            use_bias=True,
            activation='relu',
            input_shape=(data.shape[-1],)))

    w1_params = data.shape[-1]**2
    w2_params = 128 * 128 * (proj_size // data.shape[-1])
    w3_params = 128 * 128 * (proj_size // data.shape[-1])
    w4_params = (data.shape[-1] // 128) * 128 * data.shape[-1]
    bias_params = ((data.shape[-1] // 128) * 128 *
                   (proj_size // data.shape[-1]))

    expected_num_parameters = (w1_params + w2_params + w3_params +
                               w4_params) + bias_params

    self.assertEqual(expected_num_parameters, model.count_params())

  @parameterized.parameters((912, 6), (200, 2))
  def test_incorrect_sizes(self, input_dim, proj_multiple):
    data = np.random.randint(10, size=(100, input_dim))
    with self.assertRaises(AssertionError):
      model = self._build_model(data, proj_multiple)
      model.compile(optimizer='adam', loss='binary_crossentropy')

  @parameterized.parameters((768, 6), (1024, 2))
  def test_config(self, input_dim, proj_multiple):
    data = np.random.randint(10, size=(100, input_dim))
    model = self._build_model(data, proj_multiple)

    expected_num_parameters = model.layers[0].count_params()

    # Serialize model and use config to create new layer
    model_config = model.get_config()
    layer_config = model_config['layers'][1]['config']
    new_model = TNExpandCondense.from_config(layer_config)

    # Build the layer so we can count params below
    new_model.build(layer_config['batch_input_shape'])

    # Check that original layer had same num params as layer built from config
    self.assertEqual(expected_num_parameters, new_model.count_params())

  @parameterized.parameters((768, 6), (1024, 2))
  def test_model_save(self, input_dim, proj_multiple):
    data = np.random.randint(10, size=(100, input_dim))
    model = self._build_model(data, proj_multiple)
    model.compile(
        optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Train the model for 5 epochs
    model.fit(data, self.labels, epochs=5, batch_size=32)

    save_path = os.path.join(self.get_temp_dir(), 'test_model')
    model.save(save_path)
    loaded_model = tf.keras.models.load_model(save_path)

    # Compare model predictions and loaded_model predictions
    self.assertAllEqual(model.predict(data), loaded_model.predict(data))


if __name__ == '__main__':
  tf.test.main()
official/nlp/modeling/layers/tn_transformer_expand_condense.py (new file, mode 100644)
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""TN-BERT TNTransformerExpandCondense employing Expand-Condense layer instead of Dense."""
# pylint: disable=g-classes-have-attributes
# Import libraries
import
gin
import
tensorflow
as
tf
from
official.nlp.modeling.layers.tn_expand_condense
import
TNExpandCondense
@
tf
.
keras
.
utils
.
register_keras_serializable
(
package
=
"Text"
)
@
gin
.
configurable
class
TNTransformerExpandCondense
(
tf
.
keras
.
layers
.
Layer
):
"""Transformer layer using tensor network Expand-Condense layer.
This layer implements the Transformer from transformer.py, with a single
tensor network layer replacing the usual intermediate and output Dense
layers.
Arguments:
num_attention_heads: Number of attention heads.
intermediate_size: Size of the intermediate layer.
intermediate_activation: Activation for the intermediate layer.
dropout_rate: Dropout probability for the post-attention and output dropout.
attention_dropout_rate: Dropout probability for within the attention layer.
output_range: the sequence output range, [0, output_range) by slicing the
target sequence. `None` means the target sequence is not sliced.
kernel_initializer: Initializer for dense layer kernels.
bias_initializer: Initializer for dense layer biases.
kernel_regularizer: Regularizer for dense layer kernels.
bias_regularizer: Regularizer for dense layer biases.
activity_regularizer: Regularizer for dense layer activity.
kernel_constraint: Constraint for dense layer kernels.
bias_constraint: Constraint for dense layer kernels.
use_bias: Whether to enable use_bias in attention layer. If set to False,
use_bias in attention layer is disabled.
norm_first: Whether to normalize inputs to attention and intermediate dense
layers. If set False, output of attention and intermediate dense layers is
normalized.
norm_epsilon: Epsilon value to initialize normalization layers.
intermediate_dropout: Dropout probability for intermediate_dropout_layer.
attention_initializer: Initializer for kernels of attention layers. If set
`None`, attention layers use kernel_initializer as initializer for kernel.
"""
def
__init__
(
self
,
num_attention_heads
,
intermediate_size
,
intermediate_activation
,
dropout_rate
=
0.0
,
attention_dropout_rate
=
0.0
,
output_range
=
None
,
kernel_initializer
=
"glorot_uniform"
,
bias_initializer
=
"zeros"
,
kernel_regularizer
=
None
,
bias_regularizer
=
None
,
activity_regularizer
=
None
,
kernel_constraint
=
None
,
bias_constraint
=
None
,
use_bias
=
True
,
norm_first
=
False
,
norm_epsilon
=
1e-12
,
intermediate_dropout
=
0.0
,
attention_initializer
=
None
,
**
kwargs
):
super
(
TNTransformerExpandCondense
,
self
).
__init__
(
**
kwargs
)
self
.
_num_heads
=
num_attention_heads
self
.
_intermediate_size
=
intermediate_size
self
.
_intermediate_activation
=
intermediate_activation
self
.
_attention_dropout_rate
=
attention_dropout_rate
self
.
_dropout_rate
=
dropout_rate
self
.
_output_range
=
output_range
self
.
_kernel_initializer
=
tf
.
keras
.
initializers
.
get
(
kernel_initializer
)
self
.
_bias_initializer
=
tf
.
keras
.
initializers
.
get
(
bias_initializer
)
self
.
_kernel_regularizer
=
tf
.
keras
.
regularizers
.
get
(
kernel_regularizer
)
self
.
_bias_regularizer
=
tf
.
keras
.
regularizers
.
get
(
bias_regularizer
)
self
.
_activity_regularizer
=
tf
.
keras
.
regularizers
.
get
(
activity_regularizer
)
self
.
_kernel_constraint
=
tf
.
keras
.
constraints
.
get
(
kernel_constraint
)
self
.
_bias_constraint
=
tf
.
keras
.
constraints
.
get
(
bias_constraint
)
self
.
_use_bias
=
use_bias
self
.
_norm_first
=
norm_first
self
.
_norm_epsilon
=
norm_epsilon
self
.
_intermediate_dropout
=
intermediate_dropout
if
attention_initializer
:
self
.
_attention_initializer
=
tf
.
keras
.
initializers
.
get
(
attention_initializer
)
else
:
self
.
_attention_initializer
=
self
.
_kernel_initializer
  def build(self, input_shape):
    input_tensor = input_shape[0] if len(input_shape) == 2 else input_shape
    input_tensor_shape = tf.TensorShape(input_tensor)
    if len(input_tensor_shape.as_list()) != 3:
      raise ValueError(
          "TNTransformerExpandCondense expects a three-dimensional input of "
          "shape [batch, sequence, width].")
    batch_size, sequence_length, hidden_size = input_tensor_shape

    if len(input_shape) == 2:
      mask_tensor_shape = tf.TensorShape(input_shape[1])
      expected_mask_tensor_shape = tf.TensorShape(
          [batch_size, sequence_length, sequence_length])
      if not expected_mask_tensor_shape.is_compatible_with(mask_tensor_shape):
        raise ValueError(
            "When passing a mask tensor to TNTransformerExpandCondense, the "
            "mask tensor must be of shape [batch, "
            "sequence_length, sequence_length] (here %s). Got a "
            "mask tensor of shape %s." %
            (expected_mask_tensor_shape, mask_tensor_shape))
    if hidden_size % self._num_heads != 0:
      raise ValueError(
          "The input size (%d) is not a multiple of the number of attention "
          "heads (%d)" % (hidden_size, self._num_heads))
    self._attention_head_size = int(hidden_size // self._num_heads)

    common_kwargs = dict(
        bias_initializer=self._bias_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer,
        activity_regularizer=self._activity_regularizer,
        kernel_constraint=self._kernel_constraint,
        bias_constraint=self._bias_constraint)

    self._attention_layer = tf.keras.layers.MultiHeadAttention(
        num_heads=self._num_heads,
        key_dim=self._attention_head_size,
        dropout=self._attention_dropout_rate,
        use_bias=self._use_bias,
        kernel_initializer=self._attention_initializer,
        name="self_attention",
        **common_kwargs)
    self._attention_dropout = tf.keras.layers.Dropout(rate=self._dropout_rate)
    # Use float32 in layernorm for numeric stability.
    # It is probably safe in mixed_float16, but we haven't validated this yet.
    self._attention_layer_norm = (
        tf.keras.layers.LayerNormalization(
            name="self_attention_layer_norm",
            axis=-1,
            epsilon=self._norm_epsilon,
            dtype=tf.float32))

    # Substitute Dense layers with a single Expand-Condense layer.
    self._output_dense = TNExpandCondense(
        4,
        use_bias=True,
        activation=self._intermediate_activation,
        kernel_initializer=self._kernel_initializer,
        bias_initializer=self._bias_initializer)

    self._output_dropout = tf.keras.layers.Dropout(rate=self._dropout_rate)
    # Use float32 in layernorm for numeric stability.
    self._output_layer_norm = tf.keras.layers.LayerNormalization(
        name="output_layer_norm",
        axis=-1,
        epsilon=self._norm_epsilon,
        dtype=tf.float32)

    super(TNTransformerExpandCondense, self).build(input_shape)
  def get_config(self):
    config = {
        "num_attention_heads": self._num_heads,
        "intermediate_size": self._intermediate_size,
        "intermediate_activation": self._intermediate_activation,
        "dropout_rate": self._dropout_rate,
        "attention_dropout_rate": self._attention_dropout_rate,
        "output_range": self._output_range,
        "kernel_initializer":
            tf.keras.initializers.serialize(self._kernel_initializer),
        "bias_initializer":
            tf.keras.initializers.serialize(self._bias_initializer),
        "kernel_regularizer":
            tf.keras.regularizers.serialize(self._kernel_regularizer),
        "bias_regularizer":
            tf.keras.regularizers.serialize(self._bias_regularizer),
        "activity_regularizer":
            tf.keras.regularizers.serialize(self._activity_regularizer),
        "kernel_constraint":
            tf.keras.constraints.serialize(self._kernel_constraint),
        "bias_constraint":
            tf.keras.constraints.serialize(self._bias_constraint),
        "use_bias": self._use_bias,
        "norm_first": self._norm_first,
        "norm_epsilon": self._norm_epsilon,
        "intermediate_dropout": self._intermediate_dropout,
        "attention_initializer":
            tf.keras.initializers.serialize(self._attention_initializer)
    }
    base_config = super(TNTransformerExpandCondense, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))
  def call(self, inputs):
    if isinstance(inputs, (list, tuple)) and len(inputs) == 2:
      input_tensor, attention_mask = inputs
    else:
      input_tensor, attention_mask = (inputs, None)

    if self._output_range:
      target_tensor = input_tensor[:, 0:self._output_range, :]
      attention_mask = attention_mask[:, 0:self._output_range, :]
    else:
      if self._norm_first:
        source_tensor = input_tensor
        input_tensor = self._attention_layer_norm(input_tensor)
      target_tensor = input_tensor

    attention_output = self._attention_layer(
        query=target_tensor, value=input_tensor, attention_mask=attention_mask)
    attention_output = self._attention_dropout(attention_output)
    if self._norm_first:
      attention_output = source_tensor + attention_output
    else:
      attention_output = self._attention_layer_norm(target_tensor +
                                                    attention_output)
    if self._norm_first:
      source_attention_output = attention_output
      attention_output = self._output_layer_norm(attention_output)

    layer_output = self._output_dense(attention_output)
    layer_output = self._output_dropout(layer_output)
    # During mixed precision training, attention_output is from layer norm and
    # is always fp32 for now. Cast layer_output to fp32 for the subsequent add.
    layer_output = tf.cast(layer_output, tf.float32)
    if self._norm_first:
      layer_output = source_attention_output + layer_output
    else:
      layer_output = self._output_layer_norm(layer_output + attention_output)

    return layer_output
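For readers skimming this diff, a minimal usage sketch of the layer above may help. The sketch is not part of the commit; the hyperparameters and shapes are borrowed from the unit tests added below, and the all-ones mask is only illustrative.

# Minimal usage sketch (illustrative; values mirror tn_transformer_test.py).
import tensorflow as tf

from official.nlp.modeling.layers.tn_transformer_expand_condense import TNTransformerExpandCondense

layer = TNTransformerExpandCondense(
    num_attention_heads=16,   # the input width (256) must be divisible by this
    intermediate_size=2048,
    intermediate_activation='relu')

batch_size, sequence_length, width = 2, 21, 256
data = tf.random.uniform((batch_size, sequence_length, width))
# Optional mask of shape [batch, sequence_length, sequence_length];
# 1 = attend, 0 = ignore. All-ones here just for illustration.
mask = tf.ones((batch_size, sequence_length, sequence_length))

output = layer([data, mask])  # or layer(data) when no mask is needed
# The layer preserves the input shape: (2, 21, 256).
print(output.shape)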
official/nlp/modeling/layers/tn_transformer_test.py
0 → 100644 View file @ b0ccdb11
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for TN-BERT transformer."""
from absl.testing import parameterized
import numpy as np
import tensorflow as tf

from tensorflow.python.keras import keras_parameterized  # pylint: disable=g-direct-tensorflow-import
from official.nlp.modeling.layers.tn_transformer_expand_condense import TNTransformerExpandCondense
# This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
# guarantees forward compatibility of this code for the V2 switchover.
@keras_parameterized.run_all_keras_modes
@parameterized.named_parameters(('tn', TNTransformerExpandCondense))
class TransformerLayerTest(keras_parameterized.TestCase):
  def tearDown(self):
    super(TransformerLayerTest, self).tearDown()
    tf.keras.mixed_precision.experimental.set_policy('float32')
  def test_layer_creation(self, transformer_cls):
    test_layer = transformer_cls(
        num_attention_heads=16,
        intermediate_size=2048,
        intermediate_activation='relu')
    sequence_length = 21
    width = 256
    # Create a 3-dimensional input (the first dimension is implicit).
    data_tensor = tf.keras.Input(shape=(sequence_length, width))
    output_tensor = test_layer(data_tensor)
    # The default output of a transformer layer should be the same as the input.
    self.assertEqual(data_tensor.shape.as_list(), output_tensor.shape.as_list())
  def test_layer_creation_with_mask(self, transformer_cls):
    test_layer = transformer_cls(
        num_attention_heads=16,
        intermediate_size=2048,
        intermediate_activation='relu')
    sequence_length = 21
    width = 256
    # Create a 3-dimensional input (the first dimension is implicit).
    data_tensor = tf.keras.Input(shape=(sequence_length, width))
    # Create a 2-dimensional input (the first dimension is implicit).
    mask_tensor = tf.keras.Input(shape=(sequence_length, sequence_length))
    output_tensor = test_layer([data_tensor, mask_tensor])
    # The default output of a transformer layer should be the same as the input.
    self.assertEqual(data_tensor.shape.as_list(), output_tensor.shape.as_list())
  def test_layer_creation_with_incorrect_mask_fails(self, transformer_cls):
    test_layer = transformer_cls(
        num_attention_heads=16,
        intermediate_size=2048,
        intermediate_activation='relu')
    sequence_length = 21
    width = 256
    # Create a 3-dimensional input (the first dimension is implicit).
    data_tensor = tf.keras.Input(shape=(sequence_length, width))
    # Create a 2-dimensional input (the first dimension is implicit).
    mask_tensor = tf.keras.Input(shape=(sequence_length, sequence_length - 3))
    with self.assertRaisesRegex(ValueError, 'When passing a mask tensor.*'):
      _ = test_layer([data_tensor, mask_tensor])
  def test_layer_invocation(self, transformer_cls):
    test_layer = transformer_cls(
        num_attention_heads=16,
        intermediate_size=2048,
        intermediate_activation='relu')
    sequence_length = 21
    width = 256
    # Create a 3-dimensional input (the first dimension is implicit).
    data_tensor = tf.keras.Input(shape=(sequence_length, width))
    output_tensor = test_layer(data_tensor)

    # Create a model from the test layer.
    model = tf.keras.Model(data_tensor, output_tensor)

    # Invoke the model on test data. We can't validate the output data itself
    # (the NN is too complex) but this will rule out structural runtime errors.
    batch_size = 6
    input_data = 16 * np.random.random_sample(
        (batch_size, sequence_length, width))
    _ = model.predict(input_data)
  def test_layer_invocation_with_mask(self, transformer_cls):
    test_layer = transformer_cls(
        num_attention_heads=16,
        intermediate_size=2048,
        intermediate_activation='relu')
    sequence_length = 21
    width = 256
    # Create a 3-dimensional input (the first dimension is implicit).
    data_tensor = tf.keras.Input(shape=(sequence_length, width))
    # Create a 2-dimensional input (the first dimension is implicit).
    mask_tensor = tf.keras.Input(shape=(sequence_length, sequence_length))
    output_tensor = test_layer([data_tensor, mask_tensor])

    # Create a model from the test layer.
    model = tf.keras.Model([data_tensor, mask_tensor], output_tensor)

    # Invoke the model on test data. We can't validate the output data itself
    # (the NN is too complex) but this will rule out structural runtime errors.
    batch_size = 6
    input_data = 16 * np.random.random_sample(
        (batch_size, sequence_length, width))
    # The attention mask should be of shape (batch, from_seq_len, to_seq_len),
    # which here is (batch, sequence_length, sequence_length)
    mask_data = np.random.randint(
        2, size=(batch_size, sequence_length, sequence_length))
    _ = model.predict([input_data, mask_data])
  def test_layer_output_range(self, transformer_cls):
    test_layer = transformer_cls(
        num_attention_heads=16,
        intermediate_size=2048,
        intermediate_activation='relu')
    sequence_length = 21
    width = 256
    batch_size = 6
    input_data = 16 * np.random.random_sample(
        (batch_size, sequence_length, width))
    mask_data = np.random.randint(
        2, size=(batch_size, sequence_length, sequence_length))
    output_tensor = test_layer([input_data, mask_data])

    # The layer only attends to the first token and outputs the first token
    # embedding.
    new_layer = transformer_cls(
        num_attention_heads=16,
        intermediate_size=2048,
        intermediate_activation='relu',
        output_range=1)
    _ = new_layer([input_data, mask_data])
    new_layer.set_weights(test_layer.get_weights())
    new_output_tensor = new_layer([input_data, mask_data])
    self.assertAllClose(
        new_output_tensor, output_tensor[:, 0:1, :], atol=5e-5, rtol=0.003)
  def test_layer_invocation_with_float16_dtype(self, transformer_cls):
    tf.keras.mixed_precision.experimental.set_policy('mixed_float16')
    test_layer = transformer_cls(
        num_attention_heads=16,
        intermediate_size=2048,
        intermediate_activation='relu')
    sequence_length = 21
    width = 256
    # Create a 3-dimensional input (the first dimension is implicit).
    data_tensor = tf.keras.Input(shape=(sequence_length, width))
    # Create a 2-dimensional input (the first dimension is implicit).
    mask_tensor = tf.keras.Input(shape=(sequence_length, sequence_length))
    output_tensor = test_layer([data_tensor, mask_tensor])

    # Create a model from the test layer.
    model = tf.keras.Model([data_tensor, mask_tensor], output_tensor)

    # Invoke the model on test data. We can't validate the output data itself
    # (the NN is too complex) but this will rule out structural runtime errors.
    batch_size = 6
    input_data = (16 * np.random.random_sample(
        (batch_size, sequence_length, width)))
    # The attention mask should be of shape (batch, from_seq_len, to_seq_len),
    # which here is (batch, sequence_length, sequence_length)
    mask_data = np.random.randint(
        2, size=(batch_size, sequence_length, sequence_length))
    _ = model.predict([input_data, mask_data])
  def test_transform_with_initializer(self, transformer_cls):
    test_layer = transformer_cls(
        num_attention_heads=16,
        intermediate_size=2048,
        intermediate_activation='relu',
        kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02))
    sequence_length = 21
    width = 256
    # Create a 3-dimensional input (the first dimension is implicit).
    data_tensor = tf.keras.Input(shape=(sequence_length, width))
    output = test_layer(data_tensor)
    # The default output of a transformer layer should be the same as the input.
    self.assertEqual(data_tensor.shape.as_list(), output.shape.as_list())
  def test_dynamic_layer_sequence(self, transformer_cls):
    test_layer = transformer_cls(
        num_attention_heads=16,
        intermediate_size=2048,
        intermediate_activation='relu',
        kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02))
    # Create a 3-dimensional input (the first dimension is implicit).
    width = 256
    input_tensor = tf.keras.Input(shape=(None, width))
    output_tensor = test_layer(input_tensor)
    model = tf.keras.Model(input_tensor, output_tensor)

    input_length = 17
    input_data = np.ones((1, input_length, width))
    output_data = model.predict(input_data)

    self.assertAllEqual([1, input_length, width], output_data.shape)
if __name__ == '__main__':
  tf.test.main()
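The tests above use random 0/1 arrays of shape (batch, sequence_length, sequence_length) as attention masks. In real pipelines such a 3-D mask is usually derived from a 2-D padding mask; the following is a small, self-contained sketch of one such derivation using plain TensorFlow ops, independent of any helper layers in this repository.

# Sketch: expand a [batch, seq_len] padding mask into the 3-D attention mask
# shape expected by the transformer layers ([batch, seq_len, seq_len]).
import tensorflow as tf

def padding_to_attention_mask(padding_mask):
  """Query position i may attend to key position j only if both are real tokens."""
  mask = tf.cast(padding_mask, tf.float32)
  # [batch, seq_len, 1] * [batch, 1, seq_len] broadcasts to [batch, seq_len, seq_len].
  return mask[:, :, tf.newaxis] * mask[:, tf.newaxis, :]

# Example: batch of 2, sequence length 4, second example has one padded token.
padding_mask = tf.constant([[1, 1, 1, 1],
                            [1, 1, 1, 0]])
attention_mask = padding_to_attention_mask(padding_mask)
print(attention_mask.shape)  # (2, 4, 4)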
official/nlp/modeling/layers/transformer.py
View file @ b0ccdb11
@@ -25,7 +25,7 @@ from official.nlp.modeling.layers.util import tf_function_if_eager

 @tf.keras.utils.register_keras_serializable(package="Text")
-class Transformer(keras_nlp.TransformerEncoderBlock):
+class Transformer(keras_nlp.layers.TransformerEncoderBlock):
   """Transformer layer.

   This layer implements the Transformer from "Attention Is All You Need".

@@ -109,7 +109,7 @@ class CompiledTransformer(Transformer):

 @tf.keras.utils.register_keras_serializable(package="Text")
-class TransformerDecoderLayer(tf.keras.layers.Layer):
+class TransformerDecoderBlock(tf.keras.layers.Layer):
   """Single transformer layer for decoder.

   It has three sub-layers:

@@ -163,7 +163,7 @@ class TransformerDecoderLayer(tf.keras.layers.Layer):
                intermediate_dropout=0.0,
                attention_initializer=None,
                **kwargs):
-    super(TransformerDecoderLayer, self).__init__(**kwargs)
+    super().__init__(**kwargs)
     self.num_attention_heads = num_attention_heads
     self.intermediate_size = intermediate_size
     self.intermediate_activation = tf.keras.activations.get(

@@ -274,7 +274,7 @@ class TransformerDecoderLayer(tf.keras.layers.Layer):
     self.output_dropout = tf.keras.layers.Dropout(rate=self.dropout_rate)
     self.output_layer_norm = tf.keras.layers.LayerNormalization(
         name="output_layer_norm", axis=-1, epsilon=self._norm_epsilon)
-    super(TransformerDecoderLayer, self).build(input_shape)
+    super().build(input_shape)

   def get_config(self):
     config = {

@@ -315,7 +315,7 @@ class TransformerDecoderLayer(tf.keras.layers.Layer):
         "attention_initializer":
             tf.keras.initializers.serialize(self._attention_initializer)
     }
-    base_config = super(TransformerDecoderLayer, self).get_config()
+    base_config = super().get_config()
     return dict(list(base_config.items()) + list(config.items()))

   def common_layers_with_encoder(self):

@@ -329,11 +329,11 @@ class TransformerDecoderLayer(tf.keras.layers.Layer):
     if self.multi_channel_cross_attention:
       if len(inputs) != 5:
         raise ValueError(
-            "TransformerDecoderLayer must have 5 inputs, when it uses "
+            "TransformerDecoderBlock must have 5 inputs, when it uses "
             "multi_channel_cross_attention. But it got: %d" % len(inputs))
     elif len(inputs) != 4:
       raise ValueError(
-          "TransformerDecoderLayer must have 4 inputs, but it got: %d" %
+          "TransformerDecoderBlock must have 4 inputs, but it got: %d" %
           len(inputs))
     input_tensor, memory, attention_mask, self_attention_mask = inputs[:4]
     source_tensor = input_tensor
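As a quick orientation, here is a hedged sketch of how the renamed TransformerDecoderBlock is driven. The constructor arguments mirror the updated tests in transformer_test.py; the tensor shapes and all-ones masks are illustrative, and the exact return structure is whatever the layer's call() produces.

# Sketch only; shapes and masks are illustrative, not part of the commit.
import tensorflow as tf
from official.nlp.modeling.layers import transformer

decoder_block = transformer.TransformerDecoderBlock(
    num_attention_heads=2,
    intermediate_size=32,
    intermediate_activation='relu')

batch, target_len, source_len, hidden = 2, 5, 7, 16
targets = tf.random.uniform((batch, target_len, hidden))   # decoder input
memory = tf.random.uniform((batch, source_len, hidden))    # encoder output
cross_attention_mask = tf.ones((batch, target_len, source_len))
self_attention_mask = tf.ones((batch, target_len, target_len))

# Without multi_channel_cross_attention the block expects exactly these four
# inputs, in this order (see the error messages in the diff above).
outputs = decoder_block(
    [targets, memory, cross_attention_mask, self_attention_mask])
# `outputs` follows whatever structure TransformerDecoderBlock.call() returns;
# its primary component is the decoded [batch, target_len, hidden] activations.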
official/nlp/modeling/layers/transformer_scaffold.py
View file @ b0ccdb11
@@ -82,6 +82,7 @@ class TransformerScaffold(tf.keras.layers.Layer):
                feedforward_cfg=None,
                dropout_rate=0.0,
                attention_dropout_rate=0.0,
+               norm_first=False,
                kernel_initializer="glorot_uniform",
                bias_initializer="zeros",
                kernel_regularizer=None,

@@ -96,6 +97,7 @@ class TransformerScaffold(tf.keras.layers.Layer):
     self._attention_cls = attention_cls
     self._feedforward_cls = feedforward_cls
     self._feedforward_cfg = feedforward_cfg
+    self._norm_first = norm_first
     self._num_heads = num_attention_heads
     self._intermediate_size = intermediate_size
     self._intermediate_activation = intermediate_activation

@@ -115,18 +117,7 @@ class TransformerScaffold(tf.keras.layers.Layer):
       raise ValueError(
           "TransformerScaffold expects a three-dimensional input of "
           "shape [batch, sequence, width].")
-    batch_size, sequence_length, hidden_size = input_tensor_shape
-
-    if len(input_shape) == 2:
-      mask_tensor_shape = tf.TensorShape(input_shape[1])
-      expected_mask_tensor_shape = tf.TensorShape(
-          [batch_size, sequence_length, sequence_length])
-      if not expected_mask_tensor_shape.is_compatible_with(mask_tensor_shape):
-        raise ValueError("When passing a mask tensor to TransformerLayer, the "
-                         "mask tensor must be of shape [batch, "
-                         "sequence_length, sequence_length] (here %s). Got a "
-                         "mask tensor of shape %s." %
-                         (expected_mask_tensor_shape, mask_tensor_shape))
+    hidden_size = input_tensor_shape[-1]

     if hidden_size % self._num_heads != 0:
       raise ValueError(
           "The input size (%d) is not a multiple of the number of attention "

@@ -257,11 +248,23 @@ class TransformerScaffold(tf.keras.layers.Layer):
     else:
       input_tensor, attention_mask = (inputs, None)

+    if self._norm_first:
+      source_tensor = input_tensor
+      input_tensor = self._attention_layer_norm(input_tensor)
+
     attention_output = self._attention_layer(
         query=input_tensor, value=input_tensor, attention_mask=attention_mask)
     attention_output = self._attention_dropout(attention_output)
-    attention_output = self._attention_layer_norm(input_tensor +
-                                                  attention_output)
+
+    if self._norm_first:
+      attention_output = source_tensor + attention_output
+    else:
+      attention_output = self._attention_layer_norm(input_tensor +
+                                                    attention_output)
+
+    if self._norm_first:
+      source_attention_output = attention_output
+      attention_output = self._output_layer_norm(attention_output)
+
     if self._feedforward_block is None:
       intermediate_output = self._intermediate_dense(attention_output)
       intermediate_output = self._intermediate_activation_layer(

@@ -272,8 +275,17 @@ class TransformerScaffold(tf.keras.layers.Layer):
       # and is always fp32 for now. Cast layer_output to fp32 for the subsequent
       # add.
       layer_output = tf.cast(layer_output, tf.float32)
-      layer_output = self._output_layer_norm(layer_output + attention_output)
+      if self._norm_first:
+        layer_output = source_attention_output + layer_output
+      else:
+        layer_output = self._output_layer_norm(layer_output + attention_output)
     else:
-      layer_output = self._feedforward_block(attention_output)
+      if self._norm_first:
+        # If norm_first, assume the feedforward block will not apply layer norm.
+        layer_output = self._feedforward_block(attention_output)
+        layer_output += source_attention_output
+      else:
+        # If not norm_first, assume that the feedforward block does apply layer norm.
+        layer_output = self._feedforward_block(attention_output)

     return layer_output
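The norm_first wiring added above follows the usual pre-LN versus post-LN residual orderings. The sketch below is a simplified, framework-agnostic illustration of that control flow only; sublayer_with_residual and the toy Dense sublayer are hypothetical stand-ins, not the scaffold's actual components.

# Sketch of the two residual orderings toggled by norm_first.
import tensorflow as tf

def sublayer_with_residual(x, sublayer, layer_norm, norm_first):
  """Applies `sublayer` with a residual connection in pre- or post-LN order."""
  if norm_first:
    # Pre-LN: normalize the input, run the sublayer, then add the residual.
    # The sublayer itself is assumed NOT to apply layer norm.
    return x + sublayer(layer_norm(x))
  # Post-LN: run the sublayer, add the residual, then normalize the sum.
  return layer_norm(x + sublayer(x))

# Example with a toy "sublayer".
norm = tf.keras.layers.LayerNormalization(axis=-1)
dense = tf.keras.layers.Dense(8)
x = tf.random.uniform((2, 4, 8))
pre_ln_out = sublayer_with_residual(x, dense, norm, norm_first=True)
post_ln_out = sublayer_with_residual(x, dense, norm, norm_first=False)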
official/nlp/modeling/layers/transformer_scaffold_test.py
View file @ b0ccdb11
@@ -182,30 +182,6 @@ class TransformerLayerTest(keras_parameterized.TestCase):
     self.assertNotEmpty(call_list)
     self.assertTrue(call_list[0], "The passed layer class wasn't instantiated.")

-  def test_layer_creation_with_incorrect_mask_fails(self):
-    sequence_length = 21
-    width = 80
-
-    call_list = []
-    attention_layer_cfg = {
-        'num_heads': 10,
-        'key_dim': 8,
-        'call_list': call_list,
-    }
-    test_layer = transformer_scaffold.TransformerScaffold(
-        attention_cls=ValidatedAttentionLayer,
-        attention_cfg=attention_layer_cfg,
-        num_attention_heads=10,
-        intermediate_size=2048,
-        intermediate_activation='relu')
-
-    # Create a 3-dimensional input (the first dimension is implicit).
-    data_tensor = tf.keras.Input(shape=(sequence_length, width))
-    # Create a 2-dimensional input (the first dimension is implicit).
-    mask_tensor = tf.keras.Input(shape=(sequence_length, sequence_length - 3))
-
-    with self.assertRaisesRegex(ValueError, 'When passing a mask tensor.*'):
-      _ = test_layer([data_tensor, mask_tensor])
-
   def test_layer_invocation(self):
     sequence_length = 21
     width = 80
official/nlp/modeling/layers/transformer_test.py
View file @ b0ccdb11
@@ -32,12 +32,12 @@ def _create_cache(batch_size, init_decode_length, num_heads, head_size):

 @keras_parameterized.run_all_keras_modes
-class TransformerDecoderLayerTest(keras_parameterized.TestCase):
+class TransformerDecoderBlockTest(keras_parameterized.TestCase):

   def test_decoder_block_with_cache(self):
     num_attention_heads = 2
     hidden_size = 16
-    decoder_block = transformer.TransformerDecoderLayer(
+    decoder_block = transformer.TransformerDecoderBlock(
         num_attention_heads=num_attention_heads,
         intermediate_size=32,
         intermediate_activation='relu',

@@ -56,7 +56,7 @@ class TransformerDecoderLayerTest(keras_parameterized.TestCase):
   def test_use_bias_norm_first(self):
     num_attention_heads = 2
     hidden_size = 16
-    decoder_block = transformer.TransformerDecoderLayer(
+    decoder_block = transformer.TransformerDecoderBlock(
         num_attention_heads=num_attention_heads,
         intermediate_size=32,
         intermediate_activation='relu',

@@ -77,7 +77,7 @@ class TransformerDecoderLayerTest(keras_parameterized.TestCase):
   def test_get_config(self):
     num_attention_heads = 2
-    decoder_block = transformer.TransformerDecoderLayer(
+    decoder_block = transformer.TransformerDecoderBlock(
         num_attention_heads=num_attention_heads,
         intermediate_size=32,
         intermediate_activation='relu',

@@ -90,7 +90,7 @@ class TransformerDecoderLayerTest(keras_parameterized.TestCase):
         attention_initializer=tf.keras.initializers.RandomUniform(
             minval=0., maxval=1.))
     decoder_block_config = decoder_block.get_config()
-    new_decoder_block = transformer.TransformerDecoderLayer.from_config(
+    new_decoder_block = transformer.TransformerDecoderBlock.from_config(
         decoder_block_config)
     self.assertEqual(decoder_block_config, new_decoder_block.get_config())
official/nlp/modeling/layers/transformer_xl.py
0 → 100644 View file @ b0ccdb11
This diff is collapsed.