ModelZoo / ResNet50_tensorflow · Commits · 347f4044

Commit 347f4044
Authored Feb 07, 2020 by Zongwei Zhou
Committed by A. Unique TensorFlower on Feb 07, 2020

Internal change

PiperOrigin-RevId: 293958491
Parent: 67f6015a
Changes: 13 changed files with 71 additions and 121 deletions (+71 / -121).
Files changed:

  official/nlp/albert/export_albert_tfhub.py  (+1, -1)
  official/nlp/bert/bert_models.py  (+6, -18)
  official/nlp/bert/export_tfhub.py  (+1, -1)
  official/nlp/bert/run_classifier.py  (+3, -2)
  official/nlp/bert/run_squad.py  (+5, -4)
  official/nlp/modeling/layers/transformer.py  (+11, -20)
  official/nlp/modeling/layers/transformer_test.py  (+9, -5)
  official/nlp/modeling/networks/albert_transformer_encoder.py  (+3, -15)
  official/nlp/modeling/networks/albert_transformer_encoder_test.py  (+14, -8)
  official/nlp/modeling/networks/encoder_scaffold.py  (+1, -9)
  official/nlp/modeling/networks/encoder_scaffold_test.py  (+7, -23)
  official/nlp/modeling/networks/transformer_encoder.py  (+1, -9)
  official/nlp/modeling/networks/transformer_encoder_test.py  (+9, -6)
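Although the commit message only says "Internal change", the diffs below consistently remove the explicit float_dtype / float_type arguments from the BERT and ALBERT model builders and let precision be chosen by the global Keras mixed-precision policy instead; only numerically sensitive layers (layer norms, the intermediate activation) keep an explicit dtype=tf.float32. As a minimal sketch of that policy-based pattern, using plain Keras layers rather than the repo's classes (layer sizes here are made up for illustration):

    import tensorflow as tf

    # Assumption: the TF 2.1-era experimental mixed precision API used throughout this commit.
    tf.keras.mixed_precision.experimental.set_policy('mixed_float16')

    inputs = tf.keras.Input(shape=(16,))
    # Ordinary layers pick up the global policy: float16 compute, float32 variables.
    x = tf.keras.layers.Dense(32, activation='relu')(inputs)
    print(x.dtype)  # float16 under mixed_float16

    # Numerically sensitive layers opt out by pinning dtype=tf.float32,
    # which is what transformer.py below does for its layer norms and activation.
    x = tf.keras.layers.LayerNormalization(dtype=tf.float32)(x)
    print(x.dtype)  # float32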
official/nlp/albert/export_albert_tfhub.py

@@ -56,7 +56,7 @@ def create_albert_model(
   input_type_ids = tf.keras.layers.Input(
       shape=(None,), dtype=tf.int32, name="input_type_ids")
   transformer_encoder = bert_models.get_transformer_encoder(
-      albert_config, sequence_length=None, float_dtype=tf.float32)
+      albert_config, sequence_length=None)
   sequence_output, pooled_output = transformer_encoder(
       [input_word_ids, input_mask, input_type_ids])
   # To keep consistent with legacy hub modules, the outputs are
official/nlp/bert/bert_models.py

@@ -85,14 +85,12 @@ class BertPretrainLossAndMetricLayer(tf.keras.layers.Layer):
 def get_transformer_encoder(bert_config,
-                            sequence_length,
-                            float_dtype=tf.float32):
+                            sequence_length):
   """Gets a 'TransformerEncoder' object.

   Args:
     bert_config: A 'modeling.BertConfig' or 'modeling.AlbertConfig' object.
     sequence_length: Maximum sequence length of the training data.
-    float_dtype: tf.dtype, tf.float32 or tf.float16.

   Returns:
     A networks.TransformerEncoder object.
@@ -110,8 +108,7 @@ def get_transformer_encoder(bert_config,
       max_sequence_length=bert_config.max_position_embeddings,
       type_vocab_size=bert_config.type_vocab_size,
       initializer=tf.keras.initializers.TruncatedNormal(
-          stddev=bert_config.initializer_range),
-      float_dtype=float_dtype.name)
+          stddev=bert_config.initializer_range))
   if isinstance(bert_config, bert_modeling.AlbertConfig):
     kwargs['embedding_width'] = bert_config.embedding_size
     return networks.AlbertTransformerEncoder(**kwargs)
@@ -191,10 +188,9 @@ def pretrain_model(bert_config,
 class BertSquadLogitsLayer(tf.keras.layers.Layer):
   """Returns a layer that computes custom logits for BERT squad model."""

-  def __init__(self, initializer=None, float_type=tf.float32, **kwargs):
+  def __init__(self, initializer=None, **kwargs):
     super(BertSquadLogitsLayer, self).__init__(**kwargs)
     self.initializer = initializer
-    self.float_type = float_type

   def build(self, unused_input_shapes):
     """Implements build() for the layer."""
@@ -217,14 +213,11 @@ class BertSquadLogitsLayer(tf.keras.layers.Layer):
     logits = tf.keras.backend.reshape(logits, [-1, sequence_length, 2])
     logits = tf.transpose(logits, [2, 0, 1])
     unstacked_logits = tf.unstack(logits, axis=0)
-    if self.float_type == tf.float16:
-      unstacked_logits = tf.cast(unstacked_logits, tf.float32)
     return unstacked_logits[0], unstacked_logits[1]


 def squad_model(bert_config,
                 max_seq_length,
-                float_type,
                 initializer=None,
                 hub_module_url=None):
   """Returns BERT Squad model along with core BERT model to import weights.
@@ -232,7 +225,6 @@ def squad_model(bert_config,
   Args:
     bert_config: BertConfig, the config defines the core Bert model.
     max_seq_length: integer, the maximum input sequence length.
-    float_type: tf.dtype, tf.float32 or tf.bfloat16.
     initializer: Initializer for the final dense layer in the span labeler.
       Defaulted to TruncatedNormal initializer.
     hub_module_url: TF-Hub path/url to Bert module.
@@ -245,8 +237,7 @@ def squad_model(bert_config,
   initializer = tf.keras.initializers.TruncatedNormal(
       stddev=bert_config.initializer_range)
   if not hub_module_url:
-    bert_encoder = get_transformer_encoder(bert_config, max_seq_length,
-                                           float_type)
+    bert_encoder = get_transformer_encoder(bert_config, max_seq_length)
     return bert_span_labeler.BertSpanLabeler(
         network=bert_encoder, initializer=initializer), bert_encoder
@@ -261,7 +252,7 @@ def squad_model(bert_config,
       [input_word_ids, input_mask, input_type_ids])
   squad_logits_layer = BertSquadLogitsLayer(
-      initializer=initializer, float_type=float_type, name='squad_logits')
+      initializer=initializer, name='squad_logits')
   start_logits, end_logits = squad_logits_layer(sequence_output)

   squad = tf.keras.Model(
@@ -276,7 +267,6 @@ def squad_model(bert_config,
 def classifier_model(bert_config,
-                     float_type,
                      num_labels,
                      max_seq_length,
                      final_layer_initializer=None,
@@ -289,7 +279,6 @@ def classifier_model(bert_config,
   Args:
     bert_config: BertConfig or AlbertConfig, the config defines the core
       BERT or ALBERT model.
-    float_type: dtype, tf.float32 or tf.bfloat16.
     num_labels: integer, the number of classes.
     max_seq_length: integer, the maximum input sequence length.
     final_layer_initializer: Initializer for final dense layer. Defaulted
@@ -328,8 +317,7 @@ def classifier_model(bert_config,
   output = tf.keras.layers.Dense(
       num_labels,
       kernel_initializer=initializer,
-      name='output',
-      dtype=float_type)(
+      name='output')(
           output)
   return tf.keras.Model(
       inputs={
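With these signatures trimmed, callers pick the precision once via the Keras policy before building the model, instead of threading a dtype through every constructor. A hedged sketch of the new call pattern (the import paths are inferred from the file paths in this commit, and the config path and sizes are placeholders):

    import tensorflow as tf
    from official.nlp import bert_modeling       # assumed location of BertConfig
    from official.nlp.bert import bert_models    # inferred from official/nlp/bert/bert_models.py

    # Choose the precision once, globally.
    tf.keras.mixed_precision.experimental.set_policy('mixed_float16')

    bert_config = bert_modeling.BertConfig.from_json_file('/path/to/bert_config.json')  # placeholder
    # New signatures: no float_type / float_dtype arguments.
    classifier, core = bert_models.classifier_model(
        bert_config, num_labels=2, max_seq_length=128)
    squad, _ = bert_models.squad_model(bert_config, max_seq_length=384)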
official/nlp/bert/export_tfhub.py

@@ -54,7 +54,7 @@ def create_bert_model(bert_config: bert_modeling.BertConfig) -> tf.keras.Model:
   input_type_ids = tf.keras.layers.Input(
       shape=(None,), dtype=tf.int32, name="input_type_ids")
   transformer_encoder = bert_models.get_transformer_encoder(
-      bert_config, sequence_length=None, float_dtype=tf.float32)
+      bert_config, sequence_length=None)
   sequence_output, pooled_output = transformer_encoder(
       [input_word_ids, input_mask, input_type_ids])
   # To keep consistent with legacy hub modules, the outputs are
official/nlp/bert/run_classifier.py

@@ -123,7 +123,6 @@ def run_bert_classifier(strategy,
   classifier_model, core_model = (
       bert_models.classifier_model(
           bert_config,
-          tf.float32,
           num_classes,
           max_seq_length,
           hub_module_url=FLAGS.hub_module_url))
@@ -271,8 +270,10 @@ def export_classifier(model_export_path, input_meta_data,
   if not model_dir:
     raise ValueError('Export path is not specified: %s' % model_dir)

+  # Export uses float32 for now, even if training uses mixed precision.
+  tf.keras.mixed_precision.experimental.set_policy('float32')
   classifier_model = bert_models.classifier_model(
-      bert_config, tf.float32, input_meta_data['num_labels'],
+      bert_config, input_meta_data['num_labels'],
       input_meta_data['max_seq_length'])[0]

   model_saving_utils.export_bert_model(
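The export path now rebuilds the model under an explicit float32 policy rather than passing tf.float32 into classifier_model; resetting the policy matters because a training run may have left mixed_float16 active in the same process. A small illustration of the effect with a plain Keras layer (the layer itself is just a stand-in):

    import tensorflow as tf

    tf.keras.mixed_precision.experimental.set_policy('mixed_float16')
    mixed_layer = tf.keras.layers.Dense(4)
    print(mixed_layer(tf.zeros([1, 4])).dtype)   # float16: built under mixed_float16

    # Export uses float32 for now, even if training uses mixed precision.
    tf.keras.mixed_precision.experimental.set_policy('float32')
    export_layer = tf.keras.layers.Dense(4)
    print(export_layer(tf.zeros([1, 4])).dtype)  # float32: built after the reset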
official/nlp/bert/run_squad.py

@@ -186,7 +186,7 @@ def predict_squad_customized(strategy, input_meta_data, bert_config,
   # Prediction always uses float32, even if training uses mixed precision.
   tf.keras.mixed_precision.experimental.set_policy('float32')
   squad_model, _ = bert_models.squad_model(
-      bert_config, input_meta_data['max_seq_length'], float_type=tf.float32)
+      bert_config, input_meta_data['max_seq_length'])

   checkpoint_path = tf.train.latest_checkpoint(FLAGS.model_dir)
   logging.info('Restoring checkpoints from %s', checkpoint_path)
@@ -254,7 +254,6 @@ def train_squad(strategy,
   squad_model, core_model = bert_models.squad_model(
       bert_config,
       max_seq_length,
-      float_type=tf.float16 if use_float16 else tf.float32,
       hub_module_url=FLAGS.hub_module_url)
   squad_model.optimizer = optimization.create_optimizer(
       FLAGS.learning_rate, steps_per_epoch * epochs, warmup_steps)
@@ -389,8 +388,10 @@ def export_squad(model_export_path, input_meta_data):
     raise ValueError('Export path is not specified: %s' % model_export_path)
   bert_config = MODEL_CLASSES[FLAGS.model_type][0].from_json_file(
       FLAGS.bert_config_file)
-  squad_model, _ = bert_models.squad_model(
-      bert_config, input_meta_data['max_seq_length'], float_type=tf.float32)
+  # Export uses float32 for now, even if training uses mixed precision.
+  tf.keras.mixed_precision.experimental.set_policy('float32')
+  squad_model, _ = bert_models.squad_model(
+      bert_config, input_meta_data['max_seq_length'])
   model_saving_utils.export_bert_model(
       model_export_path, model=squad_model, checkpoint_dir=FLAGS.model_dir)
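Training precision is likewise selected by policy instead of the removed float_type=tf.float16 if use_float16 else tf.float32 argument. A hedged sketch of how a caller might enable mixed precision for SQuAD training under the new scheme (loss scaling shown with the stock Keras wrapper; the repo's own optimization.create_optimizer and flag handling are omitted, and the learning rate is a placeholder):

    import tensorflow as tf

    use_float16 = True  # stand-in for a use-fp16 style flag
    if use_float16:
      # Replaces the old float_type=tf.float16 argument.
      tf.keras.mixed_precision.experimental.set_policy('mixed_float16')

    # ... build the model via bert_models.squad_model(bert_config, max_seq_length) ...

    optimizer = tf.keras.optimizers.Adam(learning_rate=8e-5)
    if use_float16:
      # Dynamic loss scaling keeps small float16 gradients from underflowing.
      optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
          optimizer, loss_scale='dynamic')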
official/nlp/modeling/layers/transformer.py

@@ -123,6 +123,8 @@ class Transformer(tf.keras.layers.Layer):
         bias_constraint=self._bias_constraint,
         name="self_attention_output")
     self._attention_dropout = tf.keras.layers.Dropout(rate=self._dropout_rate)
+    # Use float32 in layernorm for numeric stability.
+    # It is probably safe in mixed_float16, but we haven't validated this yet.
     self._attention_layer_norm = (
         tf.keras.layers.LayerNormalization(
             name="self_attention_layer_norm",
@@ -140,8 +142,10 @@ class Transformer(tf.keras.layers.Layer):
         kernel_constraint=self._kernel_constraint,
         bias_constraint=self._bias_constraint,
         name="intermediate")
+    # Use float32 in intermediate gelu activation for numeric stability.
+    # TODO(b/149117297): investigate gelu numeric stability.
     self._intermediate_activation_layer = tf.keras.layers.Activation(
-        self._intermediate_activation)
+        self._intermediate_activation, dtype=tf.float32)
     self._output_dense = dense_einsum.DenseEinsum(
         output_shape=hidden_size,
         kernel_initializer=self._kernel_initializer,
@@ -153,6 +157,7 @@ class Transformer(tf.keras.layers.Layer):
         bias_constraint=self._bias_constraint,
         name="output")
     self._output_dropout = tf.keras.layers.Dropout(rate=self._dropout_rate)
+    # Use float32 in layernorm for numeric stability.
     self._output_layer_norm = tf.keras.layers.LayerNormalization(
         name="output_layer_norm", axis=-1, epsilon=1e-12, dtype=tf.float32)
@@ -202,30 +207,16 @@ class Transformer(tf.keras.layers.Layer):
     attention_output = self._attention_layer(attention_inputs)
     attention_output = self._attention_output_dense(attention_output)
     attention_output = self._attention_dropout(attention_output)
-    # Use float32 in keras layer norm and the gelu activation in the
-    # intermediate dense layer for numeric stability
-    if self.dtype == tf.float16:
-      input_tensor = tf.cast(input_tensor, tf.float32)
-      attention_output = tf.cast(attention_output, tf.float32)
     attention_output = self._attention_layer_norm(input_tensor +
                                                   attention_output)
     intermediate_output = self._intermediate_dense(attention_output)
-    if self.dtype == tf.float16:
-      # Casts to float32 so that activation is done in float32.
-      intermediate_output = tf.cast(intermediate_output, tf.float32)
-      intermediate_output = self._intermediate_activation_layer(
-          intermediate_output)
-      intermediate_output = tf.cast(intermediate_output, tf.float16)
-    else:
     intermediate_output = self._intermediate_activation_layer(
         intermediate_output)
     layer_output = self._output_dense(intermediate_output)
     layer_output = self._output_dropout(layer_output)
-    # Use float32 in keras layer norm for numeric stability
-    if self.dtype == tf.float16:
+    # During mixed precision training, attention_output is from layer norm and
+    # is always fp32 for now. cast layer_output to fp32 for the subsequent add.
     layer_output = tf.cast(layer_output, tf.float32)
     layer_output = self._output_layer_norm(layer_output + attention_output)
     if self.dtype == tf.float16:
       layer_output = tf.cast(layer_output, tf.float16)
     return layer_output
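The layer now leans on Keras autocasting: sublayers constructed with dtype=tf.float32 cast their (possibly float16) inputs up to float32 themselves, so most of the hand-written if self.dtype == tf.float16 casts disappear from call(). A minimal standalone illustration of that mechanism (not the repo's Transformer, just the two kinds of layers involved; shapes are arbitrary):

    import tensorflow as tf

    tf.keras.mixed_precision.experimental.set_policy('mixed_float16')

    dense = tf.keras.layers.Dense(8)                                    # follows the policy
    layer_norm = tf.keras.layers.LayerNormalization(dtype=tf.float32)   # pinned to float32
    activation = tf.keras.layers.Activation('relu', dtype=tf.float32)   # pinned to float32

    x = tf.zeros([2, 8])
    h = dense(x)
    print(h.dtype)              # float16
    print(layer_norm(h).dtype)  # float32: input autocast up, computed in float32
    print(activation(h).dtype)  # float32
    # The residual add in call() therefore casts the float16 branch up explicitly:
    print((tf.cast(h, tf.float32) + layer_norm(h)).dtype)  # float32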
official/nlp/modeling/layers/transformer_test.py

@@ -30,6 +30,10 @@ from official.nlp.modeling.layers import transformer
 @keras_parameterized.run_all_keras_modes
 class TransformerLayerTest(keras_parameterized.TestCase):

+  def tearDown(self):
+    super(TransformerLayerTest, self).tearDown()
+    tf.keras.mixed_precision.experimental.set_policy('float32')
+
   def test_layer_creation(self):
     test_layer = transformer.Transformer(
         num_attention_heads=10,
@@ -121,16 +125,15 @@ class TransformerLayerTest(keras_parameterized.TestCase):
     _ = model.predict([input_data, mask_data])

   def test_layer_invocation_with_float16_dtype(self):
+    tf.keras.mixed_precision.experimental.set_policy('mixed_float16')
     test_layer = transformer.Transformer(
         num_attention_heads=10,
         intermediate_size=2048,
-        intermediate_activation='relu',
-        dtype='float16')
+        intermediate_activation='relu')
     sequence_length = 21
     width = 80

     # Create a 3-dimensional input (the first dimension is implicit).
-    data_tensor = tf.keras.Input(
-        shape=(sequence_length, width), dtype=tf.float16)
+    data_tensor = tf.keras.Input(shape=(sequence_length, width))
     # Create a 2-dimensional input (the first dimension is implicit).
     mask_tensor = tf.keras.Input(shape=(sequence_length, sequence_length))
     output_tensor = test_layer([data_tensor, mask_tensor])
@@ -142,7 +145,7 @@ class TransformerLayerTest(keras_parameterized.TestCase):
     # (the NN is too complex) but this will rule out structural runtime errors.
     batch_size = 6
     input_data = (10 * np.random.random_sample(
-        (batch_size, sequence_length, width))).astype(np.float16)
+        (batch_size, sequence_length, width)))
     # The attention mask should be of shape (batch, from_seq_len, to_seq_len),
     # which here is (batch, sequence_length, sequence_length)
     mask_data = np.random.randint(
@@ -165,4 +168,5 @@ class TransformerLayerTest(keras_parameterized.TestCase):

 if __name__ == '__main__':
+  assert tf.version.VERSION.startswith('2.')
   tf.test.main()
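Because the tests now flip the global policy instead of passing dtype='float16' into the layer, each test class gains a tearDown that restores float32 so one test's policy cannot leak into the next. The same pattern in isolation, as a hypothetical trimmed-down test case (not the repo's):

    import tensorflow as tf

    class MixedPrecisionLayerTest(tf.test.TestCase):

      def tearDown(self):
        super(MixedPrecisionLayerTest, self).tearDown()
        # Reset the global policy so later tests start from float32.
        tf.keras.mixed_precision.experimental.set_policy('float32')

      def test_dense_under_mixed_float16(self):
        tf.keras.mixed_precision.experimental.set_policy('mixed_float16')
        layer = tf.keras.layers.Dense(4)
        self.assertEqual(tf.float16, layer(tf.zeros([1, 4])).dtype)

    if __name__ == '__main__':
      assert tf.version.VERSION.startswith('2.')
      tf.test.main()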
official/nlp/modeling/networks/albert_transformer_encoder.py

@@ -65,7 +65,6 @@ class AlbertTransformerEncoder(network.Network):
     attention_dropout_rate: The dropout rate to use for the attention layers
       within the transformer layers.
     initializer: The initialzer to use for all weights in this encoder.
-    float_dtype: The dtype of this encoder. Can be 'float32' or 'float16'.
   """

   def __init__(self,
@@ -82,7 +81,6 @@ class AlbertTransformerEncoder(network.Network):
                dropout_rate=0.1,
                attention_dropout_rate=0.1,
                initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
-               float_dtype='float32',
                **kwargs):
     activation = tf.keras.activations.get(activation)
     initializer = tf.keras.initializers.get(initializer)
@@ -104,7 +102,6 @@ class AlbertTransformerEncoder(network.Network):
         'dropout_rate': dropout_rate,
         'attention_dropout_rate': attention_dropout_rate,
         'initializer': tf.keras.initializers.serialize(initializer),
-        'float_dtype': float_dtype,
     }

     word_ids = tf.keras.layers.Input(
@@ -118,7 +115,6 @@ class AlbertTransformerEncoder(network.Network):
         vocab_size=vocab_size,
         embedding_width=embedding_width,
         initializer=initializer,
-        dtype=float_dtype,
         name='word_embeddings')
     word_embeddings = self._embedding_layer(word_ids)
@@ -126,8 +122,7 @@ class AlbertTransformerEncoder(network.Network):
     self._position_embedding_layer = layers.PositionEmbedding(
         initializer=initializer,
         use_dynamic_slicing=True,
-        max_sequence_length=max_sequence_length,
-        dtype=float_dtype)
+        max_sequence_length=max_sequence_length)
     position_embeddings = self._position_embedding_layer(word_embeddings)
     type_embeddings = (
@@ -136,7 +131,6 @@ class AlbertTransformerEncoder(network.Network):
             embedding_width=embedding_width,
             initializer=initializer,
             use_one_hot=True,
-            dtype=float_dtype,
             name='type_embeddings')(type_ids))

     embeddings = tf.keras.layers.Add()(
@@ -146,10 +140,9 @@ class AlbertTransformerEncoder(network.Network):
             name='embeddings/layer_norm',
             axis=-1,
             epsilon=1e-12,
-            dtype=float_dtype)(embeddings))
+            dtype=tf.float32)(embeddings))
     embeddings = (
-        tf.keras.layers.Dropout(
-            rate=dropout_rate, dtype=tf.float32)(embeddings))
+        tf.keras.layers.Dropout(rate=dropout_rate)(embeddings))

     # We project the 'embedding' output to 'hidden_size' if it is not already
     # 'hidden_size'.
     if embedding_width != hidden_size:
@@ -159,9 +152,6 @@ class AlbertTransformerEncoder(network.Network):
           name='embedding_projection')(embeddings)

-    if float_dtype == 'float16':
-      embeddings = tf.cast(embeddings, tf.float16)
-
     data = embeddings
     attention_mask = layers.SelfAttentionMask()([data, mask])
     shared_layer = layers.Transformer(
@@ -171,7 +161,6 @@ class AlbertTransformerEncoder(network.Network):
         dropout_rate=dropout_rate,
         attention_dropout_rate=attention_dropout_rate,
         kernel_initializer=initializer,
-        dtype=float_dtype,
         name='transformer')
     for _ in range(num_layers):
       data = shared_layer([data, attention_mask])
@@ -183,7 +172,6 @@ class AlbertTransformerEncoder(network.Network):
         units=hidden_size,
         activation='tanh',
         kernel_initializer=initializer,
-        dtype=float_dtype,
         name='pooler_transform')(first_token_tensor)
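After this change the encoder no longer takes float_dtype; its embedding, transformer, and pooler layers simply build under whatever policy is active, while the embedding layer norm stays pinned to float32. A hedged construction sketch (the constructor arguments mirror those used in the test file that follows; the import path mirrors the file path):

    import tensorflow as tf
    from official.nlp.modeling.networks import albert_transformer_encoder

    tf.keras.mixed_precision.experimental.set_policy('mixed_float16')

    # No float_dtype argument any more; precision comes from the policy.
    encoder = albert_transformer_encoder.AlbertTransformerEncoder(
        vocab_size=100,
        hidden_size=32,
        sequence_length=21,
        num_attention_heads=2,
        num_layers=3)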
official/nlp/modeling/networks/albert_transformer_encoder_test.py

@@ -31,14 +31,17 @@ from official.nlp.modeling.networks import albert_transformer_encoder
 @keras_parameterized.run_all_keras_modes
 class AlbertTransformerEncoderTest(keras_parameterized.TestCase):

+  def tearDown(self):
+    super(AlbertTransformerEncoderTest, self).tearDown()
+    tf.keras.mixed_precision.experimental.set_policy("float32")
+
   @parameterized.named_parameters(
       dict(testcase_name="default", expected_dtype=tf.float32),
       dict(
           testcase_name="with_float16_dtype",
-          expected_dtype=tf.float16,
-          float_dtype="float16"),
+          expected_dtype=tf.float16),
   )
-  def test_network_creation(self, expected_dtype, float_dtype=None):
+  def test_network_creation(self, expected_dtype):
     hidden_size = 32
     sequence_length = 21
@@ -48,8 +51,8 @@ class AlbertTransformerEncoderTest(keras_parameterized.TestCase):
         sequence_length=sequence_length,
         num_attention_heads=2,
         num_layers=3)
-    if float_dtype is not None:
-      kwargs["float_dtype"] = float_dtype
+    if expected_dtype == tf.float16:
+      tf.keras.mixed_precision.experimental.set_policy("mixed_float16")

     # Create a small TransformerEncoder for testing.
     test_network = albert_transformer_encoder.AlbertTransformerEncoder(**kwargs)
@@ -65,7 +68,9 @@ class AlbertTransformerEncoderTest(keras_parameterized.TestCase):
     self.assertAllEqual(expected_data_shape, data.shape.as_list())
     self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list())
-    self.assertEqual(expected_dtype, data.dtype)
+    # If float_dtype is set to float16, the data output is float32 (from a layer
+    # norm) and pool output should be float16.
+    self.assertEqual(tf.float32, data.dtype)
     self.assertEqual(expected_dtype, pooled.dtype)

     # ALBERT has additonal 'embedding_hidden_mapping_in' weights and
@@ -128,6 +133,7 @@ class AlbertTransformerEncoderTest(keras_parameterized.TestCase):
     _ = model.predict([word_id_data, mask_data, type_id_data])

   def test_serialize_deserialize(self):
+    tf.keras.mixed_precision.experimental.set_policy("mixed_float16")
     # Create a network object that sets all of its config options.
     kwargs = dict(
         vocab_size=100,
@@ -142,8 +148,7 @@ class AlbertTransformerEncoderTest(keras_parameterized.TestCase):
         activation="relu",
         dropout_rate=0.05,
         attention_dropout_rate=0.22,
-        initializer="glorot_uniform",
-        float_dtype="float16")
+        initializer="glorot_uniform")
     network = albert_transformer_encoder.AlbertTransformerEncoder(**kwargs)
     expected_config = dict(kwargs)
@@ -166,4 +171,5 @@ class AlbertTransformerEncoderTest(keras_parameterized.TestCase):

 if __name__ == "__main__":
+  assert tf.version.VERSION.startswith('2.')
   tf.test.main()
official/nlp/modeling/networks/encoder_scaffold.py

@@ -50,7 +50,6 @@ class EncoderScaffold(network.Network):
     num_output_classes: The output size of the classification layer.
     classification_layer_initializer: The initializer for the classification
       layer.
-    classification_layer_dtype: The dtype for the classification layer.
     embedding_cls: The class or instance to use to embed the input data. This
       class or instance defines the inputs to this encoder. If embedding_cls is
       not set, a default embedding network (from the original BERT paper) will
@@ -65,7 +64,6 @@ class EncoderScaffold(network.Network):
       "seq_length": The sequence length for this encoder.
       "initializer": The initializer for the embedding portion of this encoder.
       "dropout_rate": The dropout rate to apply before the encoding layers.
-      "dtype": (Optional): The dtype of the embedding layers.
     embedding_data: A reference to the embedding weights that will be used to
       train the masked language model, if necessary. This is optional, and only
       needed if (1) you are overriding embedding_cls and (2) are doing standard
@@ -84,7 +82,6 @@ class EncoderScaffold(network.Network):
       "dropout_rate": The overall dropout rate for the transformer layers.
       "attention_dropout_rate": The dropout rate for the attention layers.
      "kernel_initializer": The initializer for the transformer layers.
-      "dtype": The dtype of the transformer.
   """

   def __init__(
@@ -92,7 +89,6 @@ class EncoderScaffold(network.Network):
       num_output_classes,
       classification_layer_initializer=tf.keras.initializers.TruncatedNormal(
           stddev=0.02),
-      classification_layer_dtype=tf.float32,
       embedding_cls=None,
       embedding_cfg=None,
       embedding_data=None,
@@ -168,10 +164,7 @@ class EncoderScaffold(network.Network):
              dtype=tf.float32)(embeddings))
      embeddings = (
          tf.keras.layers.Dropout(
-              rate=embedding_cfg['dropout_rate'],
-              dtype=tf.float32)(embeddings))
-      if embedding_cfg.get('dtype') == 'float16':
-        embeddings = tf.cast(embeddings, tf.float16)
+              rate=embedding_cfg['dropout_rate'])(embeddings))

      attention_mask = layers.SelfAttentionMask()([embeddings, mask])
      data = embeddings
@@ -190,7 +183,6 @@ class EncoderScaffold(network.Network):
         units=num_output_classes,
         activation='tanh',
         kernel_initializer=classification_layer_initializer,
-        dtype=classification_layer_dtype,
         name='cls_transform')(first_token_tensor)
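With classification_layer_dtype gone, the cls_transform dense layer simply builds under the active policy, which is why the updated tests that follow expect a float16 pooled output while the sequence output (coming from a float32 layer norm) stays float32. A minimal illustration with plain Keras layers (hypothetical shapes, not the scaffold itself):

    import tensorflow as tf

    tf.keras.mixed_precision.experimental.set_policy('mixed_float16')

    sequence = tf.zeros([2, 21, 32])
    # Sequence output comes out of a float32-pinned layer norm.
    data = tf.keras.layers.LayerNormalization(dtype=tf.float32)(sequence)
    # The classification transform follows the policy (no explicit dtype argument).
    first_token = data[:, 0, :]
    pooled = tf.keras.layers.Dense(32, activation='tanh',
                                   name='cls_transform')(first_token)
    print(data.dtype, pooled.dtype)  # float32 float16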
official/nlp/modeling/networks/encoder_scaffold_test.py

@@ -53,6 +53,10 @@ class ValidatedTransformerLayer(layers.Transformer):
 @keras_parameterized.run_all_keras_modes
 class EncoderScaffoldLayerClassTest(keras_parameterized.TestCase):

+  def tearDown(self):
+    super(EncoderScaffoldLayerClassTest, self).tearDown()
+    tf.keras.mixed_precision.experimental.set_policy("float32")
+
   def test_network_creation(self):
     hidden_size = 32
     sequence_length = 21
@@ -81,8 +85,6 @@ class EncoderScaffoldLayerClassTest(keras_parameterized.TestCase):
             0.1,
         "kernel_initializer":
             tf.keras.initializers.TruncatedNormal(stddev=0.02),
-        "dtype":
-            "float32",
         "call_list":
             call_list
     }
@@ -127,7 +129,6 @@ class EncoderScaffoldLayerClassTest(keras_parameterized.TestCase):
         "max_seq_length": sequence_length,
         "initializer": tf.keras.initializers.TruncatedNormal(stddev=0.02),
         "dropout_rate": 0.1,
-        "dtype": "float16",
     }
     hidden_cfg = {
         "num_attention_heads":
@@ -142,8 +143,6 @@ class EncoderScaffoldLayerClassTest(keras_parameterized.TestCase):
             0.1,
         "kernel_initializer":
             tf.keras.initializers.TruncatedNormal(stddev=0.02),
-        "dtype":
-            "float16",
     }
     # Create a small EncoderScaffold for testing.
     test_network = encoder_scaffold.EncoderScaffold(
@@ -151,7 +150,6 @@ class EncoderScaffoldLayerClassTest(keras_parameterized.TestCase):
         num_output_classes=hidden_size,
         classification_layer_initializer=tf.keras.initializers.TruncatedNormal(
             stddev=0.02),
-        classification_layer_dtype=tf.float16,
         hidden_cfg=hidden_cfg,
         embedding_cfg=embedding_cfg)
     # Create the inputs (note that the first dimension is implicit).
@@ -165,8 +163,9 @@ class EncoderScaffoldLayerClassTest(keras_parameterized.TestCase):
     self.assertAllEqual(expected_data_shape, data.shape.as_list())
     self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list())
-    # If float_dtype is set to float16, the output should always be float16.
-    self.assertAllEqual(tf.float16, data.dtype)
+    # If float_dtype is set to float16, the data output is float32 (from a layer
+    # norm) and pool output should be float16.
+    self.assertAllEqual(tf.float32, data.dtype)
     self.assertAllEqual(tf.float16, pooled.dtype)

   def test_network_invocation(self):
@@ -196,10 +195,7 @@ class EncoderScaffoldLayerClassTest(keras_parameterized.TestCase):
             0.1,
         "kernel_initializer":
             tf.keras.initializers.TruncatedNormal(stddev=0.02),
-        "dtype":
-            "float32",
     }
-    tf.keras.mixed_precision.experimental.set_policy("float32")
     print(hidden_cfg)
     print(embedding_cfg)
     # Create a small EncoderScaffold for testing.
@@ -293,8 +289,6 @@ class EncoderScaffoldLayerClassTest(keras_parameterized.TestCase):
             0.1,
         "kernel_initializer":
             tf.keras.initializers.TruncatedNormal(stddev=0.02),
-        "dtype":
-            "float32",
     }
     # Create a small EncoderScaffold for testing.
     network = encoder_scaffold.EncoderScaffold(
@@ -353,8 +347,6 @@ class EncoderScaffoldEmbeddingNetworkTest(keras_parameterized.TestCase):
             0.1,
         "kernel_initializer":
             tf.keras.initializers.TruncatedNormal(stddev=0.02),
-        "dtype":
-            "float32",
     }
     # Create a small EncoderScaffold for testing.
@@ -422,8 +414,6 @@ class EncoderScaffoldEmbeddingNetworkTest(keras_parameterized.TestCase):
             0.1,
         "kernel_initializer":
             tf.keras.initializers.TruncatedNormal(stddev=0.02),
-        "dtype":
-            "float32",
     }
     # Create a small EncoderScaffold for testing.
@@ -493,7 +483,6 @@ class EncoderScaffoldHiddenInstanceTest(keras_parameterized.TestCase):
         "max_seq_length": sequence_length,
         "initializer": tf.keras.initializers.TruncatedNormal(stddev=0.02),
         "dropout_rate": 0.1,
-        "dtype": "float32",
     }

     call_list = []
@@ -510,8 +499,6 @@ class EncoderScaffoldHiddenInstanceTest(keras_parameterized.TestCase):
             0.1,
         "kernel_initializer":
             tf.keras.initializers.TruncatedNormal(stddev=0.02),
-        "dtype":
-            "float32",
         "call_list":
             call_list
     }
@@ -566,7 +553,6 @@ class EncoderScaffoldHiddenInstanceTest(keras_parameterized.TestCase):
         "max_seq_length": sequence_length,
         "initializer": tf.keras.initializers.TruncatedNormal(stddev=0.02),
         "dropout_rate": 0.1,
-        "dtype": "float32",
     }

     call_list = []
@@ -583,8 +569,6 @@ class EncoderScaffoldHiddenInstanceTest(keras_parameterized.TestCase):
             0.1,
         "kernel_initializer":
             tf.keras.initializers.TruncatedNormal(stddev=0.02),
-        "dtype":
-            "float32",
         "call_list":
             call_list
     }
official/nlp/modeling/networks/transformer_encoder.py

@@ -59,7 +59,6 @@ class TransformerEncoder(network.Network):
     attention_dropout_rate: The dropout rate to use for the attention layers
       within the transformer layers.
     initializer: The initialzer to use for all weights in this encoder.
-    float_dtype: The dtype of this encoder. Can be 'float32' or 'float16'.
     return_all_encoder_outputs: Whether to output sequence embedding outputs of
       all encoder transformer layers.
   """
@@ -77,7 +76,6 @@ class TransformerEncoder(network.Network):
                dropout_rate=0.1,
                attention_dropout_rate=0.1,
                initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
-               float_dtype='float32',
                return_all_encoder_outputs=False,
                **kwargs):
     activation = tf.keras.activations.get(activation)
@@ -99,7 +97,6 @@ class TransformerEncoder(network.Network):
         'dropout_rate': dropout_rate,
         'attention_dropout_rate': attention_dropout_rate,
         'initializer': tf.keras.initializers.serialize(initializer),
-        'float_dtype': float_dtype,
         'return_all_encoder_outputs': return_all_encoder_outputs,
     }
@@ -141,11 +138,7 @@ class TransformerEncoder(network.Network):
             epsilon=1e-12,
             dtype=tf.float32)(embeddings))
     embeddings = (
-        tf.keras.layers.Dropout(
-            rate=dropout_rate,
-            dtype=tf.float32)(embeddings))
-    if float_dtype == 'float16':
-      embeddings = tf.cast(embeddings, tf.float16)
+        tf.keras.layers.Dropout(rate=dropout_rate)(embeddings))

     self._transformer_layers = []
     data = embeddings
@@ -159,7 +152,6 @@ class TransformerEncoder(network.Network):
           dropout_rate=dropout_rate,
           attention_dropout_rate=attention_dropout_rate,
           kernel_initializer=initializer,
-          dtype=float_dtype,
           name='transformer/layer_%d' % i)
       self._transformer_layers.append(layer)
       data = layer([data, attention_mask])
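The encoder previously ran the embedding dropout in float32 and then hand-cast the result to float16 when float_dtype == 'float16'; now the dropout simply inherits the policy's compute dtype, so the cast is unnecessary. Illustrated with plain Keras layers under an assumed mixed_float16 policy (toy shapes):

    import tensorflow as tf

    tf.keras.mixed_precision.experimental.set_policy('mixed_float16')

    embeddings = tf.zeros([2, 21, 32])
    # The embedding layer norm stays pinned to float32 for numeric stability.
    embeddings = tf.keras.layers.LayerNormalization(
        axis=-1, epsilon=1e-12, dtype=tf.float32)(embeddings)
    # The dropout now follows the policy, so its output is already float16;
    # the old explicit tf.cast(embeddings, tf.float16) is no longer needed.
    embeddings = tf.keras.layers.Dropout(rate=0.1)(embeddings)
    print(embeddings.dtype)  # float16 under mixed_float16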
official/nlp/modeling/networks/transformer_encoder_test.py

@@ -30,6 +30,10 @@ from official.nlp.modeling.networks import transformer_encoder
 @keras_parameterized.run_all_keras_modes
 class TransformerEncoderTest(keras_parameterized.TestCase):

+  def tearDown(self):
+    super(TransformerEncoderTest, self).tearDown()
+    tf.keras.mixed_precision.experimental.set_policy("float32")
+
   def test_network_creation(self):
     hidden_size = 32
     sequence_length = 21
@@ -93,8 +97,7 @@ class TransformerEncoderTest(keras_parameterized.TestCase):
         hidden_size=hidden_size,
         sequence_length=sequence_length,
         num_attention_heads=2,
-        num_layers=3,
-        float_dtype="float16")
+        num_layers=3)
     # Create the inputs (note that the first dimension is implicit).
     word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
     mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
@@ -106,8 +109,9 @@ class TransformerEncoderTest(keras_parameterized.TestCase):
     self.assertAllEqual(expected_data_shape, data.shape.as_list())
     self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list())
-    # If float_dtype is set to float16, the output should always be float16.
-    self.assertAllEqual(tf.float16, data.dtype)
+    # If float_dtype is set to float16, the data output is float32 (from a layer
+    # norm) and pool output should be float16.
+    self.assertAllEqual(tf.float32, data.dtype)
     self.assertAllEqual(tf.float16, pooled.dtype)

   def test_network_invocation(self):
@@ -115,7 +119,6 @@ class TransformerEncoderTest(keras_parameterized.TestCase):
     sequence_length = 21
     vocab_size = 57
     num_types = 7
-    tf.keras.mixed_precision.experimental.set_policy("float32")
     # Create a small TransformerEncoder for testing.
     test_network = transformer_encoder.TransformerEncoder(
         vocab_size=vocab_size,
@@ -160,6 +163,7 @@ class TransformerEncoderTest(keras_parameterized.TestCase):
     _ = model.predict([word_id_data, mask_data, type_id_data])

   def test_serialize_deserialize(self):
+    tf.keras.mixed_precision.experimental.set_policy("mixed_float16")
     # Create a network object that sets all of its config options.
     kwargs = dict(
         vocab_size=100,
@@ -174,7 +178,6 @@ class TransformerEncoderTest(keras_parameterized.TestCase):
         dropout_rate=0.05,
         attention_dropout_rate=0.22,
         initializer="glorot_uniform",
-        float_dtype="float16",
         return_all_encoder_outputs=False)
     network = transformer_encoder.TransformerEncoder(**kwargs)