ModelZoo / ResNet50_tensorflow / Commits / 448c31b6

Commit 448c31b6, authored Jul 12, 2019 by Zongwei Zhou, committed by zongweiz on Jul 19, 2019

[Transformer] Use float16 input and output for softmax in mixed-precision training

parent 49b90e86
Showing 1 changed file with 4 additions and 19 deletions.

official/transformer/v2/attention_layer.py  (+4, -19)
@@ -21,24 +21,6 @@ from __future__ import print_function
 
 import tensorflow as tf
 
 
-def _float32_softmax(logits, name=None):
-  """Computes a softmax activation in float32.
-
-  When training a model using float16, softmax is still done in float32 for
-  numeric stability.
-
-  Args:
-    logits: A tensor, with any shape accepted by `tf.nn.softmax`.
-
-  Returns:
-    A tensor with the same dtype as `logits`.
-  """
-  input_dtype = logits.dtype
-  logits = tf.cast(logits, tf.float32)
-  output = tf.nn.softmax(logits, name=name)
-  return tf.cast(output, input_dtype)
-
-
 class Attention(tf.keras.layers.Layer):
   """Multi-headed attention layer."""
@@ -166,7 +148,10 @@ class Attention(tf.keras.layers.Layer):
     # Calculate dot product attention
     logits = tf.matmul(q, k, transpose_b=True)
     logits += bias
-    weights = _float32_softmax(logits, name="attention_weights")
+    # Note that softmax internally performs math operations using float32
+    # for numeric stability. When training with float16, we keep the input
+    # and output in float16 for better performance.
+    weights = tf.nn.softmax(logits, name="attention_weights")
     if training:
       weights = tf.nn.dropout(weights, rate=self.attention_dropout)
     attention_output = tf.matmul(weights, v)
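
To see the new code path end to end, a minimal sketch follows. It assumes a TensorFlow 2.x runtime; the tensor shapes and random toy values are hypothetical and only demonstrate the property the new in-code comment relies on: `tf.nn.softmax` accepts float16 logits and returns float16 attention weights, so no explicit casts are needed when the surrounding layer runs in float16.

import tensorflow as tf

# Hypothetical shapes: batch=1, heads=1, length=2, depth=4. In the real model
# these tensors come out of the q/k/v projection layers, which produce float16
# activations when Keras mixed precision ("mixed_float16") is enabled.
q = tf.cast(tf.random.normal([1, 1, 2, 4]), tf.float16)
k = tf.cast(tf.random.normal([1, 1, 2, 4]), tf.float16)
v = tf.cast(tf.random.normal([1, 1, 2, 4]), tf.float16)
bias = tf.zeros([1, 1, 2, 2], dtype=tf.float16)

logits = tf.matmul(q, k, transpose_b=True)
logits += bias
weights = tf.nn.softmax(logits, name="attention_weights")  # float16 in, float16 out
attention_output = tf.matmul(weights, v)

print(weights.dtype, attention_output.dtype)  # float16 float16

Keeping the cast out of the layer avoids two dtype conversions per attention call; the numeric-stability concern is left to the softmax kernel itself, as the new comment in the diff notes.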