ModelZoo / ResNet50_tensorflow · Commits

Commit 790e49e5 authored Mar 23, 2021 by stephenwu

Merge branch 'master' of https://github.com/tensorflow/models into run_superglue

parents 8ab018b0 5bb827c3
Changes: 378 changed files in this merge. This page shows 20 changed files with 521 additions and 123 deletions (+521 −123).
official/nlp/modeling/layers/masked_softmax.py                  +2    −2
official/nlp/modeling/layers/mobile_bert_layers.py              +18   −16
official/nlp/modeling/layers/multi_channel_attention.py         +3    −3
official/nlp/modeling/layers/position_embedding.py              +7    −7
official/nlp/modeling/layers/relative_attention.py              +2    −2
official/nlp/modeling/layers/self_attention_mask.py             +4    −4
official/nlp/modeling/layers/spectral_normalization.py          +295  −0
official/nlp/modeling/layers/spectral_normalization_test.py     +86   −0
official/nlp/modeling/layers/talking_heads_attention.py         +1    −1
official/nlp/modeling/layers/text_layers.py                     +44   −59
official/nlp/modeling/layers/tn_expand_condense.py              +2    −2
official/nlp/modeling/layers/transformer.py                     +6    −3
official/nlp/modeling/layers/transformer_scaffold.py            +5    −2
official/nlp/modeling/losses/__init__.py                        +1    −1
official/nlp/modeling/models/README.md                          +3    −3
official/nlp/modeling/models/__init__.py                        +5    −1
official/nlp/modeling/models/bert_pretrainer.py                 +2    −2
official/nlp/modeling/models/bert_span_labeler.py               +3    −3
official/nlp/modeling/models/bert_token_classifier.py           +15   −5
official/nlp/modeling/models/bert_token_classifier_test.py      +17   −7
official/nlp/modeling/layers/masked_softmax.py

@@ -25,10 +25,10 @@ def _large_compatible_negative(tensor_type):
   in this module (-1e9) cannot be represented using `tf.float16`.

   Args:
-    tensor_type: a dtype to determine the type.
+    tensor_type: A dtype to determine the type.

   Returns:
-    a large negative number.
+    A large negative number.
   """
   if tensor_type == tf.float16:
     return tf.float16.min
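The hunk above only recapitalizes the docstring, but the helper it documents is worth a closer look: -1e9 overflows float16, whose largest finite magnitude is about 65504, so the function falls back to `tf.float16.min` for half-precision inputs. A minimal standalone check, independent of the model code:

import tensorflow as tf

# -1e9 is not representable in float16; the cast overflows to -inf,
# which would break the masked-softmax arithmetic.
print(tf.cast(tf.constant(-1.0e9), tf.float16))   # tf.Tensor(-inf, shape=(), dtype=float16)

# The dtype-aware fallback returned by _large_compatible_negative instead:
print(tf.float16.min)                              # -65504.0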
official/nlp/modeling/layers/mobile_bert_layers.py

@@ -44,7 +44,7 @@ def _get_norm_layer(normalization_type='no_norm', name=None):
   Args:
     normalization_type: String. The type of normalization_type, only
-      'no_norm' and 'layer_norm' are supported.
+      `no_norm` and `layer_norm` are supported.
     name: Name for the norm layer.

   Returns:

@@ -89,7 +89,7 @@ class MobileBertEmbedding(tf.keras.layers.Layer):
     output_embed_size: Embedding size for the final embedding output.
     max_sequence_length: Maximum length of input sequence.
     normalization_type: String. The type of normalization_type, only
-      'no_norm' and 'layer_norm' are supported.
+      `no_norm` and `layer_norm` are supported.
     initializer: The initializer to use for the embedding weights and
       linear projection weights.
     dropout_rate: Dropout rate.

@@ -208,10 +208,10 @@ class MobileBertTransformer(tf.keras.layers.Layer):
     key_query_shared_bottleneck: Whether to share linear transformation for
       keys and queries.
     num_feedforward_networks: Number of stacked feed-forward networks.
-    normalization_type: The type of normalization_type, only 'no_norm' and
-      'layer_norm' are supported. 'no_norm' represents the element-wise
+    normalization_type: The type of normalization_type, only `no_norm` and
+      `layer_norm` are supported. `no_norm` represents the element-wise
       linear transformation for the student model, as suggested by the
-      original MobileBERT paper. 'layer_norm' is used for the teacher model.
+      original MobileBERT paper. `layer_norm` is used for the teacher model.
     initializer: The initializer to use for the embedding weights and
       linear projection weights.
     **kwargs: keyword arguments.

@@ -346,14 +346,16 @@ class MobileBertTransformer(tf.keras.layers.Layer):
     """Implementes the forward pass.

     Args:
-      input_tensor: Float tensor of shape [batch_size, seq_length, hidden_size].
-      attention_mask: (optional) int32 tensor of shape [batch_size, seq_length,
-        seq_length], with 1 for positions that can be attended to and 0 in
-        positions that should not be.
+      input_tensor: Float tensor of shape
+        `(batch_size, seq_length, hidden_size)`.
+      attention_mask: (optional) int32 tensor of shape
+        `(batch_size, seq_length, seq_length)`, with 1 for positions that can
+        be attended to and 0 in positions that should not be.
       return_attention_scores: If return attention score.

     Returns:
-      layer_output: Float tensor of shape [batch_size, seq_length, hidden_size].
+      layer_output: Float tensor of shape
+        `(batch_size, seq_length, hidden_size)`.
       attention_scores (Optional): Only when return_attention_scores is True.

     Raises:

@@ -450,8 +452,8 @@ class MobileBertMaskedLM(tf.keras.layers.Layer):
       activation: The activation, if any, for the dense layer.
       initializer: The initializer for the dense layer. Defaults to a Glorot
         uniform initializer.
-      output: The output style for this layer. Can be either 'logits' or
-        'predictions'.
+      output: The output style for this layer. Can be either `logits` or
+        `predictions`.
       **kwargs: keyword arguments.
     """
     super(MobileBertMaskedLM, self).__init__(**kwargs)

@@ -527,16 +529,16 @@ class MobileBertMaskedLM(tf.keras.layers.Layer):
     Args:
       sequence_tensor: Sequence output of `BertModel` layer of shape
-        (`batch_size`, `seq_length`, num_hidden) where num_hidden is number of
+        `(batch_size, seq_length, num_hidden)` where `num_hidden` is number of
         hidden units of `BertModel` layer.
       positions: Positions ids of tokens in sequence to mask for pretraining
-        of with dimension (batch_size, num_predictions) where
+        of with dimension `(batch_size, num_predictions)` where
         `num_predictions` is maximum number of tokens to mask out and predict
         per each sequence.

     Returns:
-      Masked out sequence tensor of shape (batch_size * num_predictions,
-      num_hidden).
+      Masked out sequence tensor of shape `(batch_size * num_predictions,
+      num_hidden)`.
     """
     sequence_shape = tf.shape(sequence_tensor)
     batch_size, seq_length = sequence_shape[0], sequence_shape[1]
official/nlp/modeling/layers/multi_channel_attention.py

@@ -26,8 +26,8 @@ class VotingAttention(tf.keras.layers.Layer):
   """Voting Attention layer.

   Args:
-    num_heads: the number of attention heads.
-    head_size: per-head hidden size.
+    num_heads: The number of attention heads.
+    head_size: Per-head hidden size.
     kernel_initializer: Initializer for dense layer kernels.
     bias_initializer: Initializer for dense layer biases.
     kernel_regularizer: Regularizer for dense layer kernels.

@@ -115,7 +115,7 @@ class MultiChannelAttention(tf.keras.layers.MultiHeadAttention):
         context tensors according to the distribution among channels.
       key: Optional key `Tensor` of shape `[B, A, S, dim]`. If not given, will use
         `value` for both `key` and `value`, which is the most common case.
-      attention_mask: a boolean mask of shape `[B, T, S]`, that prevents attention
+      attention_mask: A boolean mask of shape `[B, T, S]`, that prevents attention
         to certain positions.
     """
official/nlp/modeling/layers/position_embedding.py

@@ -77,7 +77,7 @@ class RelativePositionEmbedding(tf.keras.layers.Layer):
         dimension of `inputs`.

     Returns:
-      A tensor in shape of [length, hidden_size].
+      A tensor in shape of `(length, hidden_size)`.
     """
     if inputs is None and length is None:
       raise ValueError("If inputs is None, `length` must be set in "

@@ -114,7 +114,7 @@ def _relative_position_bucket(relative_position,
   the distance in tokens from the attending position to the attended-to
   position.

-  If bidirectional=False, then positive relative positions are invalid.
+  If `bidirectional=False`, then positive relative positions are invalid.

   We use smaller buckets for small absolute relative_position and larger
   buckets for larger absolute relative_positions.

@@ -127,13 +127,13 @@ def _relative_position_bucket(relative_position,
   than the model has been trained on.

   Args:
-    relative_position: an int32 Tensor
-    bidirectional: a boolean - whether the attention is bidirectional
-    num_buckets: an integer
-    max_distance: an integer
+    relative_position: An int32 Tensor
+    bidirectional: A boolean - whether the attention is bidirectional
+    num_buckets: An integer
+    max_distance: An integer

   Returns:
-    a Tensor with the same shape as relative_position, containing int32
+    A Tensor with the same shape as relative_position, containing int32
     values in the range [0, num_buckets)
   """
   ret = 0
official/nlp/modeling/layers/relative_attention.py

@@ -103,10 +103,10 @@ class MultiHeadRelativeAttention(tf.keras.layers.MultiHeadAttention):
       segment_attention_bias: Optional trainable bias parameter added to the
         query had when calculating the segment-based attention score used in
         XLNet of shape `[num_heads, dim]`.
-      state: Optional `Tensor` of shape [B, M, E] where M is the length of the
+      state: Optional `Tensor` of shape `[B, M, E]` where M is the length of the
         state or memory.
         If passed, this is also attended over as in Transformer XL.
-      attention_mask: a boolean mask of shape `[B, T, S]` that prevents attention
+      attention_mask: A boolean mask of shape `[B, T, S]` that prevents attention
         to certain positions.
     """
official/nlp/modeling/layers/self_attention_mask.py

@@ -21,15 +21,15 @@ from official.nlp.keras_nlp import layers
 @tf.keras.utils.register_keras_serializable(package='Text')
 class SelfAttentionMask(layers.SelfAttentionMask):
-  """Create 3D attention mask from a 2D tensor mask.
+  """Creates 3D attention mask from a 2D tensor mask.

   **Warning: Please use the `keras_nlp.layers.SelfAttentionMask`.**

-    inputs[0]: from_tensor: 2D or 3D Tensor of shape
-      [batch_size, from_seq_length, ...].
-    inputs[1]: to_mask: int32 Tensor of shape [batch_size, to_seq_length].
+    inputs[0]: from_tensor: 2D or 3D Tensor of shape
+      `(batch_size, from_seq_length, ...)`.
+    inputs[1]: to_mask: int32 Tensor of shape `(batch_size, to_seq_length)`.

   Returns:
-    float Tensor of shape [batch_size, from_seq_length, to_seq_length].
+    Float Tensor of shape `(batch_size, from_seq_length, to_seq_length)`.
   """

   def call(self, inputs):
official/nlp/modeling/layers/spectral_normalization.py (new file, 0 → 100644)

# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Normalization layers.

## References:

[1] Yuichi Yoshida, Takeru Miyato. Spectral Norm Regularization for Improving
    the Generalizability of Deep Learning.
    _arXiv preprint arXiv:1705.10941_, 2017. https://arxiv.org/abs/1705.10941
[2] Takeru Miyato, Toshiki Kataoka, Masanori Koyama, Yuichi Yoshida.
    Spectral normalization for generative adversarial networks.
    In _International Conference on Learning Representations_, 2018.
[3] Henry Gouk, Eibe Frank, Bernhard Pfahringer, Michael Cree.
    Regularisation of neural networks by enforcing lipschitz continuity.
    _arXiv preprint arXiv:1804.04368_, 2018. https://arxiv.org/abs/1804.04368
"""

import numpy as np
import tensorflow as tf


class SpectralNormalization(tf.keras.layers.Wrapper):
  """Implements spectral normalization for Dense layer."""

  def __init__(self,
               layer,
               iteration=1,
               norm_multiplier=0.95,
               training=True,
               aggregation=tf.VariableAggregation.MEAN,
               inhere_layer_name=False,
               **kwargs):
    """Initializer.

    Args:
      layer: (tf.keras.layers.Layer) A TF Keras layer to apply normalization to.
      iteration: (int) The number of power iteration to perform to estimate
        weight matrix's singular value.
      norm_multiplier: (float) Multiplicative constant to threshold the
        normalization. Usually under normalization, the singular value will
        converge to this value.
      training: (bool) Whether to perform power iteration to update the singular
        value estimate.
      aggregation: (tf.VariableAggregation) Indicates how a distributed variable
        will be aggregated. Accepted values are constants defined in the class
        tf.VariableAggregation.
      inhere_layer_name: (bool) Whether to inhere the name of the input layer.
      **kwargs: (dict) Other keyword arguments for the layers.Wrapper class.
    """
    self.iteration = iteration
    self.do_power_iteration = training
    self.aggregation = aggregation
    self.norm_multiplier = norm_multiplier

    # Set layer name.
    wrapper_name = kwargs.pop('name', None)
    if inhere_layer_name:
      wrapper_name = layer.name

    if not isinstance(layer, tf.keras.layers.Layer):
      raise ValueError('`layer` must be a `tf.keras.layer.Layer`. '
                       'Observed `{}`'.format(layer))
    super(SpectralNormalization, self).__init__(
        layer, name=wrapper_name, **kwargs)

  def build(self, input_shape):
    super(SpectralNormalization, self).build(input_shape)
    self.layer.kernel._aggregation = self.aggregation  # pylint: disable=protected-access
    self._dtype = self.layer.kernel.dtype

    self.w = self.layer.kernel
    self.w_shape = self.w.shape.as_list()

    self.uv_initializer = tf.initializers.random_normal()

    self.v = self.add_weight(
        shape=(1, np.prod(self.w_shape[:-1])),
        initializer=self.uv_initializer,
        trainable=False,
        name='v',
        dtype=self.dtype,
        aggregation=self.aggregation)

    self.u = self.add_weight(
        shape=(1, self.w_shape[-1]),
        initializer=self.uv_initializer,
        trainable=False,
        name='u',
        dtype=self.dtype,
        aggregation=self.aggregation)

    self.update_weights()

  def call(self, inputs, *, training=None):
    training = self.do_power_iteration if training is None else training
    u_update_op, v_update_op, w_update_op = self.update_weights(
        training=training)
    output = self.layer(inputs)
    w_restore_op = self.restore_weights()

    # Register update ops.
    self.add_update(u_update_op)
    self.add_update(v_update_op)
    self.add_update(w_update_op)
    self.add_update(w_restore_op)

    return output

  def update_weights(self, *, training=True):
    w_reshaped = tf.reshape(self.w, [-1, self.w_shape[-1]])

    u_hat = self.u
    v_hat = self.v

    if training:
      for _ in range(self.iteration):
        v_hat = tf.nn.l2_normalize(tf.matmul(u_hat, tf.transpose(w_reshaped)))
        u_hat = tf.nn.l2_normalize(tf.matmul(v_hat, w_reshaped))

    sigma = tf.matmul(tf.matmul(v_hat, w_reshaped), tf.transpose(u_hat))
    # Convert sigma from a 1x1 matrix to a scalar.
    sigma = tf.reshape(sigma, [])

    u_update_op = self.u.assign(u_hat)
    v_update_op = self.v.assign(v_hat)

    # Bound spectral norm to be not larger than self.norm_multiplier.
    w_norm = tf.cond((self.norm_multiplier / sigma) < 1, lambda:  # pylint:disable=g-long-lambda
                     (self.norm_multiplier / sigma) * self.w, lambda: self.w)

    w_update_op = self.layer.kernel.assign(w_norm)
    return u_update_op, v_update_op, w_update_op

  def restore_weights(self):
    """Restores layer weights to maintain gradient update (See Alg 1 of [1])."""
    return self.layer.kernel.assign(self.w)


class SpectralNormalizationConv2D(tf.keras.layers.Wrapper):
  """Implements spectral normalization for Conv2D layer based on [3]."""

  def __init__(self,
               layer,
               iteration=1,
               norm_multiplier=0.95,
               training=True,
               aggregation=tf.VariableAggregation.MEAN,
               legacy_mode=False,
               **kwargs):
    """Initializer.

    Args:
      layer: (tf.keras.layers.Layer) A TF Keras layer to apply normalization to.
      iteration: (int) The number of power iteration to perform to estimate
        weight matrix's singular value.
      norm_multiplier: (float) Multiplicative constant to threshold the
        normalization. Usually under normalization, the singular value will
        converge to this value.
      training: (bool) Whether to perform power iteration to update the singular
        value estimate.
      aggregation: (tf.VariableAggregation) Indicates how a distributed variable
        will be aggregated. Accepted values are constants defined in the class
        tf.VariableAggregation.
      legacy_mode: (bool) Whether to use the legacy implementation where the
        dimension of the u and v vectors are set to the batch size. It should
        not be enabled unless for backward compatibility reasons.
      **kwargs: (dict) Other keyword arguments for the layers.Wrapper class.
    """
    self.iteration = iteration
    self.do_power_iteration = training
    self.aggregation = aggregation
    self.norm_multiplier = norm_multiplier
    self.legacy_mode = legacy_mode

    # Set layer attributes.
    layer._name += '_spec_norm'

    if not isinstance(layer, tf.keras.layers.Conv2D):
      raise ValueError(
          'layer must be a `tf.keras.layer.Conv2D` instance. You passed: {input}'
          .format(input=layer))
    super(SpectralNormalizationConv2D, self).__init__(layer, **kwargs)

  def build(self, input_shape):
    self.layer.build(input_shape)
    self.layer.kernel._aggregation = self.aggregation  # pylint: disable=protected-access
    self._dtype = self.layer.kernel.dtype

    # Shape (kernel_size_1, kernel_size_2, in_channel, out_channel).
    self.w = self.layer.kernel
    self.w_shape = self.w.shape.as_list()
    self.strides = self.layer.strides

    # Set the dimensions of u and v vectors.
    batch_size = input_shape[0]
    uv_dim = batch_size if self.legacy_mode else 1

    # Resolve shapes.
    in_height = input_shape[1]
    in_width = input_shape[2]
    in_channel = self.w_shape[2]

    out_height = in_height // self.strides[0]
    out_width = in_width // self.strides[1]
    out_channel = self.w_shape[3]

    self.in_shape = (uv_dim, in_height, in_width, in_channel)
    self.out_shape = (uv_dim, out_height, out_width, out_channel)
    self.uv_initializer = tf.initializers.random_normal()

    self.v = self.add_weight(
        shape=self.in_shape,
        initializer=self.uv_initializer,
        trainable=False,
        name='v',
        dtype=self.dtype,
        aggregation=self.aggregation)

    self.u = self.add_weight(
        shape=self.out_shape,
        initializer=self.uv_initializer,
        trainable=False,
        name='u',
        dtype=self.dtype,
        aggregation=self.aggregation)

    super(SpectralNormalizationConv2D, self).build()

  def call(self, inputs):
    u_update_op, v_update_op, w_update_op = self.update_weights()
    output = self.layer(inputs)
    w_restore_op = self.restore_weights()

    # Register update ops.
    self.add_update(u_update_op)
    self.add_update(v_update_op)
    self.add_update(w_update_op)
    self.add_update(w_restore_op)

    return output

  def update_weights(self):
    """Computes power iteration for convolutional filters based on [3]."""
    # Initialize u, v vectors.
    u_hat = self.u
    v_hat = self.v

    if self.do_power_iteration:
      for _ in range(self.iteration):
        # Updates v.
        v_ = tf.nn.conv2d_transpose(
            u_hat,
            self.w,
            output_shape=self.in_shape,
            strides=self.strides,
            padding='SAME')
        v_hat = tf.nn.l2_normalize(tf.reshape(v_, [1, -1]))
        v_hat = tf.reshape(v_hat, v_.shape)

        # Updates u.
        u_ = tf.nn.conv2d(v_hat, self.w, strides=self.strides, padding='SAME')
        u_hat = tf.nn.l2_normalize(tf.reshape(u_, [1, -1]))
        u_hat = tf.reshape(u_hat, u_.shape)

    v_w_hat = tf.nn.conv2d(v_hat, self.w, strides=self.strides, padding='SAME')

    sigma = tf.matmul(tf.reshape(v_w_hat, [1, -1]), tf.reshape(u_hat, [-1, 1]))
    # Convert sigma from a 1x1 matrix to a scalar.
    sigma = tf.reshape(sigma, [])

    u_update_op = self.u.assign(u_hat)
    v_update_op = self.v.assign(v_hat)

    w_norm = tf.cond((self.norm_multiplier / sigma) < 1, lambda:  # pylint:disable=g-long-lambda
                     (self.norm_multiplier / sigma) * self.w, lambda: self.w)

    w_update_op = self.layer.kernel.assign(w_norm)

    return u_update_op, v_update_op, w_update_op

  def restore_weights(self):
    """Restores layer weights to maintain gradient update (See Alg 1 of [1])."""
    return self.layer.kernel.assign(self.w)
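For readers of this new file, a brief usage sketch may help; it is not part of the commit, and the layer sizes and inputs below are arbitrary. The wrappers cap the spectral norm of the wrapped kernel at `norm_multiplier` via power iteration, which is what the test file that follows verifies.

import tensorflow as tf
from official.nlp.modeling.layers import spectral_normalization

# Wrap a Dense layer so its kernel's spectral norm is bounded by 0.95.
dense = tf.keras.layers.Dense(10)
sn_dense = spectral_normalization.SpectralNormalization(
    dense, iteration=1, norm_multiplier=0.95)
outputs = sn_dense(tf.random.uniform((4, 16)))  # builds the wrapper and runs power iteration

# The Conv2D variant sizes its u/v vectors from the conv input/output shapes,
# so it needs the spatial dimensions to be known when it is built.
conv = tf.keras.layers.Conv2D(filters=8, kernel_size=3, padding='same')
sn_conv = spectral_normalization.SpectralNormalizationConv2D(conv)
feature_maps = sn_conv(tf.random.uniform((1, 32, 32, 3)))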
official/nlp/modeling/layers/spectral_normalization_test.py (new file, 0 → 100644)

# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for normalization layers.

## References:

[1] Hanie Sedghi, Vineet Gupta, Philip M. Long.
    The Singular Values of Convolutional Layers.
    In _International Conference on Learning Representations_, 2019.
"""

from absl.testing import parameterized
import numpy as np
import tensorflow as tf

from official.nlp.modeling.layers import spectral_normalization

DenseLayer = tf.keras.layers.Dense(10)
Conv2DLayer = tf.keras.layers.Conv2D(filters=64, kernel_size=3, padding='valid')


def _compute_spectral_norm(weight):
  if weight.ndim > 2:
    # Computes Conv2D via FFT transform as in [1].
    weight = np.fft.fft2(weight, weight.shape[1:3], axes=[0, 1])
  return np.max(np.linalg.svd(weight, compute_uv=False))


class NormalizationTest(tf.test.TestCase, parameterized.TestCase):

  def setUp(self):
    super(NormalizationTest, self).setUp()
    self.num_iterations = 1000
    self.norm_multiplier = 0.95

  @parameterized.named_parameters(
      ('Dense', (None, 10), DenseLayer,
       spectral_normalization.SpectralNormalization),
      ('Conv2D', (None, 32, 32, 3), Conv2DLayer,
       spectral_normalization.SpectralNormalizationConv2D))
  def test_spec_norm_magnitude(self, input_shape, layer, norm_wrapper):
    """Tests if the weights spectral norm converges to norm_multiplier."""
    layer.build(input_shape)
    sn_layer = norm_wrapper(
        layer,
        iteration=self.num_iterations,
        norm_multiplier=self.norm_multiplier)

    # Perform normalization.
    sn_layer.build(input_shape)
    sn_layer.update_weights()
    normalized_kernel = sn_layer.layer.kernel.numpy()

    spectral_norm_computed = _compute_spectral_norm(normalized_kernel)
    spectral_norm_expected = self.norm_multiplier
    self.assertAllClose(
        spectral_norm_computed, spectral_norm_expected, atol=5e-2)

    # Test that the normalized layer is K-Lipschitz. In particular, if the layer
    # is a function f, then ||f(x1) - f(x2)||_2 <= K * ||(x1 - x2)||_2, where K
    # is the norm multiplier.
    new_input_shape = (16,) + input_shape[1:]
    new_input = tf.random.uniform(new_input_shape)
    delta_vec = tf.random.uniform(new_input_shape)
    output1 = sn_layer(new_input)
    output2 = sn_layer(new_input + delta_vec)

    delta_input = tf.norm(tf.reshape(delta_vec, (-1,))).numpy()
    delta_output = tf.norm(tf.reshape(output2 - output1, (-1,))).numpy()
    self.assertLessEqual(delta_output, self.norm_multiplier * delta_input)


if __name__ == '__main__':
  tf.test.main()
official/nlp/modeling/layers/talking_heads_attention.py

@@ -63,7 +63,7 @@ class TalkingHeadsAttention(tf.keras.layers.MultiHeadAttention):
       that will be applied on attention scores before and after softmax.

     Args:
-      qkv_rank: the rank of query, key, value tensors after projection.
+      qkv_rank: The rank of query, key, value tensors after projection.
     """
     super(TalkingHeadsAttention, self)._build_attention(qkv_rank)
official/nlp/modeling/layers/text_layers.py

@@ -100,10 +100,10 @@ class BertTokenizer(tf.keras.layers.Layer):
     tokenize_with_offsets: If true, calls
       `text.BertTokenizer.tokenize_with_offsets()` instead of plain
       `text.BertTokenizer.tokenize()` and outputs a triple of
-      (tokens, start_offsets, limit_offsets).
-    raw_table_access: An object with methods .lookup(keys) and .size()
+      `(tokens, start_offsets, limit_offsets)`.
+    raw_table_access: An object with methods `.lookup(keys)` and `.size()`
       that operate on the raw lookup table of tokens. It can be used to
-      look up special token synbols like [MASK].
+      look up special token synbols like `[MASK]`.
   """

   def __init__(self, *,

@@ -121,16 +121,16 @@ class BertTokenizer(tf.keras.layers.Layer):
       lower_case: A Python boolean forwarded to `text.BertTokenizer`.
         If true, input text is converted to lower case (where applicable)
         before tokenization. This must be set to match the way in which
-        the vocab_file was created.
+        the `vocab_file` was created.
       tokenize_with_offsets: A Python boolean. If true, this layer calls
         `text.BertTokenizer.tokenize_with_offsets()` instead of plain
         `text.BertTokenizer.tokenize()` and outputs a triple of
-        (tokens, start_offsets, limit_offsets) insead of just tokens.
-      **kwargs: standard arguments to Layer().
+        `(tokens, start_offsets, limit_offsets)` insead of just tokens.
+      **kwargs: Standard arguments to `Layer()`.

     Raises:
-      ImportError: if importing `tensorflow_text` failed.
+      ImportError: If importing `tensorflow_text` failed.
     """
     _check_if_tf_text_installed()

@@ -167,18 +167,19 @@ class BertTokenizer(tf.keras.layers.Layer):
     """Calls `text.BertTokenizer` on inputs.

     Args:
-      inputs: A string Tensor of shape [batch_size].
+      inputs: A string Tensor of shape `(batch_size,)`.

     Returns:
       One or three of `RaggedTensors` if `tokenize_with_offsets` is False or
       True, respectively. These are
-      tokens: A `RaggedTensor` of shape [batch_size, (words), (pieces_per_word)]
-        and type int32. tokens[i,j,k] contains the k-th wordpiece of the
-        j-th word in the i-th input.
-      start_offsets, limit_offsets: If `tokenize_with_offsets` is True,
-        RaggedTensors of type int64 with the same indices as tokens.
-        Element [i,j,k] contains the byte offset at the start, or past the
-        end, resp., for the k-th wordpiece of the j-th word in the i-th input.
+      tokens: A `RaggedTensor` of shape
+        `[batch_size, (words), (pieces_per_word)]`
+        and type int32. `tokens[i,j,k]` contains the k-th wordpiece of the
+        j-th word in the i-th input.
+      start_offsets, limit_offsets: If `tokenize_with_offsets` is True,
+        RaggedTensors of type int64 with the same indices as tokens.
+        Element `[i,j,k]` contains the byte offset at the start, or past the
+        end, resp., for the k-th wordpiece of the j-th word in the i-th input.
     """
     # Prepare to reshape the result to work around broken shape inference.
     batch_size = tf.shape(inputs)[0]

@@ -201,12 +202,7 @@ class BertTokenizer(tf.keras.layers.Layer):
   def get_config(self):
     # Skip in tf.saved_model.save(); fail if called direcly.
-    # TODO(arnoegw): Implement when switching to MutableHashTable, which gets
-    # initialized from the checkpoint and not from a vocab file.
-    # We cannot just put the original, user-supplied vocab file name into
-    # the config, because the path has to change as the SavedModel is copied
-    # around.
-    raise NotImplementedError("Not implemented yet.")
+    raise NotImplementedError("TODO(b/170480226): implement")

   def get_special_tokens_dict(self):
     """Returns dict of token ids, keyed by standard names for their purpose.

@@ -268,13 +264,13 @@ class BertTokenizer(tf.keras.layers.Layer):
 class SentencepieceTokenizer(tf.keras.layers.Layer):
-  """Wraps tf_text.SentencepieceTokenizer as a Keras Layer.
+  """Wraps `tf_text.SentencepieceTokenizer` as a Keras Layer.

   Attributes:
     tokenize_with_offsets: If true, calls
-      SentencepieceTokenizer.tokenize_with_offsets()
-      instead of plain .tokenize() and outputs a triple of
-      (tokens, start_offsets, limit_offsets).
+      `SentencepieceTokenizer.tokenize_with_offsets()`
+      instead of plain `.tokenize()` and outputs a triple of
+      `(tokens, start_offsets, limit_offsets)`.
   """

   def __init__(self,

@@ -300,9 +296,9 @@ class SentencepieceTokenizer(tf.keras.layers.Layer):
         store the actual proto (not a filename passed here).
       model_serialized_proto: The sentencepiece model serialized proto string.
       tokenize_with_offsets: A Python boolean. If true, this layer calls
-        SentencepieceTokenizer.tokenize_with_offsets() instead of
-        plain .tokenize() and outputs a triple of
-        (tokens, start_offsets, limit_offsets) insead of just tokens.
+        `SentencepieceTokenizer.tokenize_with_offsets()` instead of
+        plain `.tokenize()` and outputs a triple of
+        `(tokens, start_offsets, limit_offsets)` insead of just tokens.
         Note that when following `strip_diacritics` is set to True, returning
         offsets is not supported now.
       nbest_size: A scalar for sampling:

@@ -320,7 +316,7 @@ class SentencepieceTokenizer(tf.keras.layers.Layer):
         `tokenize_with_offsets`. NOTE: New models are encouraged to put this
         into custom normalization rules for the Sentencepiece model itself to
         avoid this extra step and the limitation regarding offsets.
-      **kwargs: standard arguments to Layer().
+      **kwargs: standard arguments to `Layer()`.

     Raises:
       ImportError: if importing tensorflow_text failed.

@@ -360,19 +356,19 @@ class SentencepieceTokenizer(tf.keras.layers.Layer):
     return self._tokenizer.vocab_size()

   def call(self, inputs: tf.Tensor):
-    """Calls text.SentencepieceTokenizer on inputs.
+    """Calls `text.SentencepieceTokenizer` on inputs.

     Args:
-      inputs: A string Tensor of shape [batch_size].
+      inputs: A string Tensor of shape `(batch_size,)`.

     Returns:
       One or three of RaggedTensors if tokenize_with_offsets is False or True,
       respectively. These are
-      tokens: A RaggedTensor of shape [batch_size, (pieces)] and type int32.
-        tokens[i,j] contains the j-th piece in the i-th input.
-      start_offsets, limit_offsets: If tokenize_with_offsets is True,
-        RaggedTensors of type int64 with the same indices as tokens.
-        Element [i,j] contains the byte offset at the start, or past the
+      tokens: A RaggedTensor of shape `[batch_size, (pieces)]` and type `int32`.
+        `tokens[i,j]` contains the j-th piece in the i-th input.
+      start_offsets, limit_offsets: If `tokenize_with_offsets` is True,
+        RaggedTensors of type `int64` with the same indices as tokens.
+        Element `[i,j]` contains the byte offset at the start, or past the
         end, resp., for the j-th piece in the i-th input.
     """
     if self._strip_diacritics:

@@ -403,19 +399,8 @@ class SentencepieceTokenizer(tf.keras.layers.Layer):
     return _reshape(tokens)

   def get_config(self):
-    raise NotImplementedError("b/170480226")
-    # TODO(b/170480226): Uncomment and improve to fix the bug.
-    # config = {
-    #     "model_serialized_proto": self._model_serialized_proto,
-    #     "lower_case": self._lower_case,
-    #     "tokenize_with_offsets": self.tokenize_with_offsets,
-    #     "nbest_size": self._nbest_size,
-    #     "alpha": self._alpha,
-    #     "strip_diacritics": self._strip_diacritics,
-    # }
-    # base_config = super(SentencepieceTokenizer, self).get_config()
-    # base_config.update(config)
-    # return base_config
+    # Skip in tf.saved_model.save(); fail if called direcly.
+    raise NotImplementedError("TODO(b/170480226): implement")

   def get_special_tokens_dict(self):
     """Returns dict of token ids, keyed by standard names for their purpose.

@@ -492,7 +477,7 @@ class BertPackInputs(tf.keras.layers.Layer):
                special_tokens_dict=None,
                truncator="round_robin",
                **kwargs):
-    """Initializes with a target seq_length, relevant token ids and truncator.
+    """Initializes with a target `seq_length`, relevant token ids and truncator.

     Args:
       seq_length: The desired output length. Must not exceed the max_seq_length

@@ -505,13 +490,13 @@ class BertPackInputs(tf.keras.layers.Layer):
         unused positions after the last segment in the sequence
         (called "[PAD]" for BERT).
       special_tokens_dict: Optionally, a dict from Python strings to Python
-        integers that contains values for start_of_sequence_id,
-        end_of_segment_id and padding_id. (Further values in the dict are
+        integers that contains values for `start_of_sequence_id`,
+        `end_of_segment_id` and `padding_id`. (Further values in the dict are
         silenty ignored.) If this is passed, separate *_id arguments must be
         omitted.
       truncator: The algorithm to truncate a list of batched segments to fit a
-        per-example length limit. The value can be either "round_robin" or
-        "waterfall":
+        per-example length limit. The value can be either `round_robin` or
+        `waterfall`:
         (1) For "round_robin" algorithm, available space is assigned
           one token at a time in a round-robin fashion to the inputs that still
           need some, until the limit is reached. It currently only supports

@@ -521,10 +506,10 @@ class BertPackInputs(tf.keras.layers.Layer):
           left-to-right manner and fills up the buckets until we run out of
           budget. It support arbitrary number of segments.
-      **kwargs: standard arguments to Layer().
+      **kwargs: standard arguments to `Layer()`.

     Raises:
-      ImportError: if importing tensorflow_text failed.
+      ImportError: if importing `tensorflow_text` failed.
     """
     _check_if_tf_text_installed()
     super().__init__(**kwargs)
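A hedged usage sketch of the call contract documented in these hunks; the vocab path is hypothetical and the constructor arguments (`vocab_file`, `lower_case`) are inferred from the docstrings above rather than verified against the full file.

import tensorflow as tf
from official.nlp.modeling.layers import text_layers

# Assumed constructor arguments; requires tensorflow_text to be installed.
tokenizer = text_layers.BertTokenizer(
    vocab_file='/tmp/vocab.txt',      # hypothetical WordPiece vocab file
    lower_case=True,
    tokenize_with_offsets=True)

inputs = tf.constant(['hello world', 'tokenize me'])       # shape (batch_size,)
tokens, start_offsets, limit_offsets = tokenizer(inputs)
# tokens[i, j, k] is the k-th wordpiece of the j-th word in the i-th input;
# start_offsets/limit_offsets are int64 RaggedTensors with the same indices.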
official/nlp/modeling/layers/tn_expand_condense.py

@@ -37,8 +37,8 @@ class TNExpandCondense(Layer):
   Note the input shape and output shape will be identical.

   Args:
-    proj_multiplier: Positive integer, multiple of input_shape[-1] to project
-      up to. Must be one of [2, 4, 6, 8].
+    proj_multiplier: Positive integer, multiple of `input_shape[-1]` to project
+      up to. Must be one of `[2, 4, 6, 8]`.
     use_bias: Boolean, whether the layer uses a bias vector.
     activation: Activation function to use between Expand and Condense. If you
       don't specify anything, no activation is applied
official/nlp/modeling/layers/transformer.py

@@ -232,7 +232,8 @@ class TransformerDecoderBlock(tf.keras.layers.Layer):
         tf.keras.layers.LayerNormalization(
             name="self_attention_layer_norm",
             axis=-1,
-            epsilon=self._norm_epsilon))
+            epsilon=self._norm_epsilon,
+            dtype="float32"))
     # Encoder-decoder attention.
     self.encdec_attention = self._cross_attention_cls(
         num_heads=self.num_attention_heads,

@@ -250,7 +251,8 @@ class TransformerDecoderBlock(tf.keras.layers.Layer):
         tf.keras.layers.LayerNormalization(
             name="attention/encdec_output_layer_norm",
             axis=-1,
-            epsilon=self._norm_epsilon))
+            epsilon=self._norm_epsilon,
+            dtype="float32"))
     # Feed-forward projection.
     self.intermediate_dense = tf.keras.layers.experimental.EinsumDense(

@@ -273,7 +275,8 @@ class TransformerDecoderBlock(tf.keras.layers.Layer):
         **common_kwargs)
     self.output_dropout = tf.keras.layers.Dropout(rate=self.dropout_rate)
     self.output_layer_norm = tf.keras.layers.LayerNormalization(
-        name="output_layer_norm", axis=-1, epsilon=self._norm_epsilon)
+        name="output_layer_norm", axis=-1, epsilon=self._norm_epsilon,
+        dtype="float32")
     super().build(input_shape)

   def get_config(self):
official/nlp/modeling/layers/transformer_scaffold.py

@@ -112,8 +112,9 @@ class TransformerScaffold(tf.keras.layers.Layer):
     self._bias_constraint = tf.keras.constraints.get(bias_constraint)

   def build(self, input_shape):
-    input_tensor = input_shape[0] if len(input_shape) == 2 else input_shape
-    input_tensor_shape = tf.TensorShape(input_tensor)
+    input_tensor_shape = input_shape[0] if (
+        len(input_shape) == 2) else input_shape
+    input_tensor_shape = tf.TensorShape(input_tensor_shape)
     if len(input_tensor_shape.as_list()) != 3:
       raise ValueError("TransformerScaffold expects a three-dimensional input of "

@@ -170,6 +171,8 @@ class TransformerScaffold(tf.keras.layers.Layer):
     else:
       self._feedforward_block = None

+    # self._dropout_rate controls dropout rates at two places:
+    # after attention, and after FFN.
     self._attention_dropout = tf.keras.layers.Dropout(rate=self._dropout_rate)
     # Use float32 in layernorm for numeric stability.
     # It is probably safe in mixed_float16, but we haven't validated this yet.
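The `dtype="float32"` arguments added in transformer.py, together with the float32 layer-norm comment above, follow a common mixed-precision pattern: pin normalization layers to full precision while the rest of the model computes in float16. A small sketch of that pattern in isolation (the epsilon value and shapes are illustrative, not taken from the model config):

import tensorflow as tf

# Under a mixed_float16 policy, layers compute in float16 by default ...
tf.keras.mixed_precision.set_global_policy('mixed_float16')

# ... unless their dtype is pinned, as the new hunks do for layer norm.
layer_norm = tf.keras.layers.LayerNormalization(
    axis=-1, epsilon=1e-12, dtype='float32')

x = tf.cast(tf.random.uniform((2, 4, 8)), tf.float16)  # float16 activations
y = layer_norm(x)
print(y.dtype)  # float32: inputs are upcast and normalized in full precision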
official/nlp/modeling/losses/__init__.py

@@ -12,5 +12,5 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Activations package definition. Subject to change."""
+"""Losses contains common loss computation used in NLP (subject to change)."""
 from official.nlp.modeling.losses.weighted_sparse_categorical_crossentropy import loss as weighted_sparse_categorical_crossentropy_loss
official/nlp/modeling/models/README.md

 # Models
-Models are combinations of layers and networks that would be trained.
+Models are combinations of `tf.keras` layers and models that can be trained.

 Several pre-built canned models are provided to train encoder networks. These
 models are intended as both convenience functions and canonical examples.

 * [`BertClassifier`](bert_classifier.py) implements a simple classification
 model containing a single classification head using the Classification network.
official/nlp/modeling/models/__init__.py

@@ -12,7 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Models package definition."""
+"""Models are combinations of `tf.keras` layers and models that can be trained.
+
+Several pre-built canned models are provided to train encoder networks.
+These models are intended as both convenience functions and canonical examples.
+"""
 from official.nlp.modeling.models.bert_classifier import BertClassifier
 from official.nlp.modeling.models.bert_pretrainer import *
 from official.nlp.modeling.models.bert_span_labeler import BertSpanLabeler
official/nlp/modeling/models/bert_pretrainer.py

@@ -50,8 +50,8 @@ class BertPretrainer(tf.keras.Model):
       None, no activation will be used.
     initializer: The initializer (if any) to use in the masked LM and
       classification networks. Defaults to a Glorot uniform initializer.
-    output: The output style for this network. Can be either 'logits' or
-      'predictions'.
+    output: The output style for this network. Can be either `logits` or
+      `predictions`.
   """

   def __init__(self,
official/nlp/modeling/models/bert_span_labeler.py

@@ -37,11 +37,11 @@ class BertSpanLabeler(tf.keras.Model):
   Args:
     network: A transformer network. This network should output a sequence output
       and a classification output. Furthermore, it should expose its embedding
-      table via a "get_embedding_table" method.
+      table via a `get_embedding_table` method.
     initializer: The initializer (if any) to use in the span labeling network.
       Defaults to a Glorot uniform initializer.
-    output: The output style for this network. Can be either 'logits' or
-      'predictions'.
+    output: The output style for this network. Can be either `logits` or
+      `predictions`.
   """

   def __init__(self,
official/nlp/modeling/models/bert_token_classifier.py

@@ -36,12 +36,15 @@ class BertTokenClassifier(tf.keras.Model):
   Args:
     network: A transformer network. This network should output a sequence output
       and a classification output. Furthermore, it should expose its embedding
-      table via a "get_embedding_table" method.
+      table via a `get_embedding_table` method.
     num_classes: Number of classes to predict from the classification network.
     initializer: The initializer (if any) to use in the classification networks.
       Defaults to a Glorot uniform initializer.
-    output: The output style for this network. Can be either 'logits' or
-      'predictions'.
+    output: The output style for this network. Can be either `logits` or
+      `predictions`.
+    dropout_rate: The dropout probability of the token classification head.
+    output_encoder_outputs: Whether to include intermediate sequence output
+      in the final output.
   """

   def __init__(self,

@@ -50,6 +53,7 @@ class BertTokenClassifier(tf.keras.Model):
                initializer='glorot_uniform',
                output='logits',
                dropout_rate=0.1,
+               output_encoder_outputs=False,
                **kwargs):
     # We want to use the inputs of the passed network as the inputs to this

@@ -74,14 +78,19 @@ class BertTokenClassifier(tf.keras.Model):
         name='predictions/transform/logits')
     logits = classifier(sequence_output)
     if output == 'logits':
-      output_tensors = logits
+      output_tensors = {'logits': logits}
     elif output == 'predictions':
-      output_tensors = tf.keras.layers.Activation(tf.nn.log_softmax)(logits)
+      output_tensors = {
+          'predictions': tf.keras.layers.Activation(tf.nn.log_softmax)(logits)
+      }
     else:
       raise ValueError(
          ('Unknown `output` value "%s". `output` can be either "logits" or '
           '"predictions"') % output)

+    if output_encoder_outputs:
+      output_tensors['encoder_outputs'] = sequence_output
+
     # b/164516224
     # Once we've created the network using the Functional API, we call
     # super().__init__ as though we were invoking the Functional API Model

@@ -98,6 +107,7 @@ class BertTokenClassifier(tf.keras.Model):
         'num_classes': num_classes,
         'initializer': initializer,
         'output': output,
+        'output_encoder_outputs': output_encoder_outputs
     }
     # We are storing the config dict as a namedtuple here to ensure checkpoint
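Since the classifier's outputs change here from a bare tensor to a dict, downstream code now indexes results by name. A short sketch of the new calling convention (the encoder settings are illustrative only), mirroring the updated test that follows:

import tensorflow as tf
from official.nlp.modeling import networks
from official.nlp.modeling.models import bert_token_classifier

encoder = networks.BertEncoder(
    vocab_size=100, num_layers=2, max_sequence_length=128)
model = bert_token_classifier.BertTokenClassifier(
    encoder, num_classes=3, output_encoder_outputs=True)

word_ids = tf.keras.Input(shape=(128,), dtype=tf.int32)
mask = tf.keras.Input(shape=(128,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(128,), dtype=tf.int32)

outputs = model([word_ids, mask, type_ids])
logits = outputs['logits']                    # [batch, seq_length, num_classes]
encoder_outputs = outputs['encoder_outputs']  # last-layer sequence output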
official/nlp/modeling/models/bert_token_classifier_test.py

@@ -27,22 +27,26 @@ from official.nlp.modeling.models import bert_token_classifier
 @keras_parameterized.run_all_keras_modes
 class BertTokenClassifierTest(keras_parameterized.TestCase):

-  @parameterized.parameters(True, False)
-  def test_bert_trainer(self, dict_outputs):
+  @parameterized.parameters((True, True), (False, False))
+  def test_bert_trainer(self, dict_outputs, output_encoder_outputs):
     """Validate that the Keras object can be created."""
     # Build a transformer network to use within the BERT trainer.
     vocab_size = 100
     sequence_length = 512
+    hidden_size = 768
     test_network = networks.BertEncoder(
         vocab_size=vocab_size,
         num_layers=2,
         max_sequence_length=sequence_length,
-        dict_outputs=dict_outputs)
+        dict_outputs=dict_outputs,
+        hidden_size=hidden_size)

     # Create a BERT trainer with the created network.
     num_classes = 3
     bert_trainer_model = bert_token_classifier.BertTokenClassifier(
-        test_network, num_classes=num_classes)
+        test_network,
+        num_classes=num_classes,
+        output_encoder_outputs=output_encoder_outputs)

     # Create a set of 2-dimensional inputs (the first dimension is implicit).
     word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)

@@ -50,12 +54,18 @@ class BertTokenClassifierTest(keras_parameterized.TestCase):
     type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)

     # Invoke the trainer model on the inputs. This causes the layer to be built.
-    sequence_outs = bert_trainer_model([word_ids, mask, type_ids])
+    outputs = bert_trainer_model([word_ids, mask, type_ids])
+    if output_encoder_outputs:
+      logits = outputs['logits']
+      encoder_outputs = outputs['encoder_outputs']
+      self.assertAllEqual(encoder_outputs.shape.as_list(),
+                          [None, sequence_length, hidden_size])
+    else:
+      logits = outputs['logits']

     # Validate that the outputs are of the expected shape.
     expected_classification_shape = [None, sequence_length, num_classes]
-    self.assertAllEqual(expected_classification_shape, sequence_outs.shape.as_list())
+    self.assertAllEqual(expected_classification_shape, logits.shape.as_list())

   def test_bert_trainer_tensor_call(self):
     """Validate that the Keras object can be invoked."""