ModelZoo / ResNet50_tensorflow

Commit 057895af, authored Apr 12, 2020 by Chen Chen, committed by A. Unique TensorFlower on Apr 12, 2020

Internal change

PiperOrigin-RevId: 306182576

Parent: d466d4e6
Showing 2 changed files with 14 additions and 18 deletions:
official/nlp/modeling/layers/transformer_scaffold.py (+6, -13)
official/nlp/modeling/layers/transformer_scaffold_test.py (+8, -5)
official/nlp/modeling/layers/transformer_scaffold.py
@@ -145,6 +145,8 @@ class TransformerScaffold(tf.keras.layers.Layer):
        bias_constraint=self._bias_constraint,
        name="self_attention_output")
    self._attention_dropout = tf.keras.layers.Dropout(rate=self._dropout_rate)
    # Use float32 in layernorm for numeric stability.
    # It is probably safe in mixed_float16, but we haven't validated this yet.
    self._attention_layer_norm = (
        tf.keras.layers.LayerNormalization(
            name="self_attention_layer_norm",
            axis=-1,
            epsilon=1e-12,
@@ -159,7 +161,6 @@ class TransformerScaffold(tf.keras.layers.Layer):
        activity_regularizer=self._activity_regularizer,
        kernel_constraint=self._kernel_constraint,
        bias_constraint=self._bias_constraint,
        dtype=tf.float32,
        # This layer is always float32 for numeric stability.
        name="intermediate")
    self._output_dense = dense_einsum.DenseEinsum(
        output_shape=hidden_size,
@@ -172,6 +173,7 @@ class TransformerScaffold(tf.keras.layers.Layer):
        bias_constraint=self._bias_constraint,
        name="output")
    self._output_dropout = tf.keras.layers.Dropout(rate=self._dropout_rate)
    # Use float32 in layernorm for numeric stability.
    self._output_layer_norm = tf.keras.layers.LayerNormalization(
        name="output_layer_norm",
        axis=-1,
        epsilon=1e-12,
        dtype=tf.float32)
@@ -223,23 +225,14 @@ class TransformerScaffold(tf.keras.layers.Layer):
    attention_output = self._attention_layer(attention_inputs)
    attention_output = self._attention_output_dense(attention_output)
    attention_output = self._attention_dropout(attention_output)
    # Use float32 in keras layer norm and the gelu activation in the
    # intermediate dense layer for numeric stability
    if self.dtype == tf.float16:
      input_tensor = tf.cast(input_tensor, tf.float32)
      attention_output = tf.cast(attention_output, tf.float32)
    attention_output = self._attention_layer_norm(input_tensor +
                                                  attention_output)
    intermediate_output = self._intermediate_dense(attention_output)
    if self.dtype == tf.float16:
      intermediate_output = tf.cast(intermediate_output, tf.float16)
    layer_output = self._output_dense(intermediate_output)
    layer_output = self._output_dropout(layer_output)
    # Use float32 in keras layer norm for numeric stability
    if self.dtype == tf.float16:
      # During mixed precision training, attention_output is from layer norm and
      # is always fp32 for now. Cast layer_output to fp32 for the subsequent add.
      layer_output = tf.cast(layer_output, tf.float32)
    layer_output = self._output_layer_norm(layer_output + attention_output)
    if self.dtype == tf.float16:
      layer_output = tf.cast(layer_output, tf.float16)
    return layer_output
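Taken together, the transformer_scaffold.py changes pin the layer norms (and the intermediate dense layer) to float32 while the matmul-heavy sublayers run in half precision, casting activations to float32 around the residual add and back to float16 afterwards. Below is a minimal, self-contained sketch of that pattern, not the repository's TransformerScaffold: the Float32NormBlock layer and its sublayer names are illustrative assumptions, and it uses the tf.keras.mixed_precision.experimental API that was current when this commit landed (newer TF releases expose tf.keras.mixed_precision.set_global_policy instead).

import numpy as np
import tensorflow as tf

# Global policy as in the updated test: float32 variables, float16 compute.
tf.keras.mixed_precision.experimental.set_policy('mixed_float16')


class Float32NormBlock(tf.keras.layers.Layer):
  """Dense projection + residual + layer norm, with the norm kept in float32."""

  def __init__(self, hidden_size, **kwargs):
    super(Float32NormBlock, self).__init__(**kwargs)
    # Runs in float16 under the mixed_float16 policy.
    self._dense = tf.keras.layers.Dense(hidden_size, name="dense")
    # Pinned to float32, mirroring the dtype=tf.float32 arguments this commit
    # adds to the layer norms.
    self._layer_norm = tf.keras.layers.LayerNormalization(
        axis=-1, epsilon=1e-12, dtype=tf.float32, name="layer_norm")

  def call(self, inputs):
    outputs = self._dense(inputs)
    # Do the residual add and the normalization in float32, then cast back so
    # downstream layers keep receiving half-precision activations.
    outputs = self._layer_norm(
        tf.cast(inputs, tf.float32) + tf.cast(outputs, tf.float32))
    return tf.cast(outputs, inputs.dtype)


block = Float32NormBlock(hidden_size=80)
data = tf.constant(np.random.random_sample((6, 21, 80)), dtype=tf.float32)
print(block(data).dtype)  # float16: only the normalization ran in float32.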
official/nlp/modeling/layers/transformer_scaffold_test.py
@@ -54,6 +54,10 @@ class ValidatedAttentionLayer(attention.MultiHeadAttention):
@keras_parameterized.run_all_keras_modes
class TransformerLayerTest(keras_parameterized.TestCase):

  def tearDown(self):
    super(TransformerLayerTest, self).tearDown()
    tf.keras.mixed_precision.experimental.set_policy('float32')

  def test_layer_creation(self):
    sequence_length = 21
    width = 80
@@ -212,6 +216,7 @@ class TransformerLayerTest(keras_parameterized.TestCase):
    self.assertTrue(call_list[0],
                    "The passed layer class wasn't instantiated.")

  def test_layer_invocation_with_float16_dtype(self):
    tf.keras.mixed_precision.experimental.set_policy('mixed_float16')
    sequence_length = 21
    width = 80
@@ -226,12 +231,10 @@ class TransformerLayerTest(keras_parameterized.TestCase):
        attention_cfg=attention_layer_cfg,
        num_attention_heads=10,
        intermediate_size=2048,
        intermediate_activation='relu',
        dtype='float16')
        intermediate_activation='relu')
    # Create a 3-dimensional input (the first dimension is implicit).
    data_tensor = tf.keras.Input(
        shape=(sequence_length, width), dtype=tf.float16)
    data_tensor = tf.keras.Input(shape=(sequence_length, width))
    # Create a 2-dimensional input (the first dimension is implicit).
    mask_tensor = tf.keras.Input(shape=(sequence_length, sequence_length))
    output_tensor = test_layer([data_tensor, mask_tensor])
@@ -243,7 +246,7 @@ class TransformerLayerTest(keras_parameterized.TestCase):
    # (the NN is too complex) but this will rule out structural runtime errors.
    batch_size = 6
    input_data = (10 * np.random.random_sample(
        (batch_size, sequence_length, width))).astype(np.float16)
        (batch_size, sequence_length, width)))
    # The attention mask should be of shape (batch, from_seq_len, to_seq_len),
    # which here is (batch, sequence_length, sequence_length)
    mask_data = np.random.randint(
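The test-side change is the mirror image: instead of constructing the layer with dtype='float16' and feeding float16 tensors, the test now switches the global Keras policy to 'mixed_float16', feeds ordinary float32 data, and resets the policy in tearDown so later tests still run in float32. A minimal sketch of that pattern follows, using tf.test.TestCase and a plain Dense layer rather than the repository's keras_parameterized harness and TransformerScaffold; the class and test names are illustrative, and the experimental policy API matches the one used in this diff.

import numpy as np
import tensorflow as tf


class MixedPrecisionPolicyTest(tf.test.TestCase):

  def tearDown(self):
    super(MixedPrecisionPolicyTest, self).tearDown()
    # Restore the default policy so other tests keep running in float32.
    tf.keras.mixed_precision.experimental.set_policy('float32')

  def test_layer_invocation_with_mixed_float16_policy(self):
    tf.keras.mixed_precision.experimental.set_policy('mixed_float16')
    sequence_length = 21
    width = 80

    # Inputs stay float32; the policy casts them inside the layer.
    data = tf.keras.Input(shape=(sequence_length, width))
    output = tf.keras.layers.Dense(width)(data)
    model = tf.keras.Model(data, output)

    # Under mixed_float16 the layer computes, and therefore outputs, float16.
    self.assertEqual(output.dtype, tf.float16)

    batch_size = 6
    input_data = 10 * np.random.random_sample(
        (batch_size, sequence_length, width))
    _ = model.predict(input_data)


if __name__ == '__main__':
  tf.test.main()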