"research/attention_ocr/python/demo_inference.py" did not exist on "da341f70faaade5dbdc854be04bb13ea9c777909"
Commit 448c31b6 authored by Zongwei Zhou, committed by zongweiz

[Transformer] Use float16 input and output for softmax in mixed-precision training

parent 49b90e86
@@ -21,24 +21,6 @@ from __future__ import print_function

 import tensorflow as tf


-def _float32_softmax(logits, name=None):
-  """Computes a softmax activation in float32.
-
-  When training a model using float16, softmax is still done in float32 for
-  numeric stability.
-
-  Args:
-    logits: A tensor, with any shape accepted by `tf.nn.softmax`.
-
-  Returns:
-    A tensor with the same dtype as `logits`.
-  """
-  input_dtype = logits.dtype
-  logits = tf.cast(logits, tf.float32)
-  output = tf.nn.softmax(logits, name=name)
-  return tf.cast(output, input_dtype)
-
-
 class Attention(tf.keras.layers.Layer):
   """Multi-headed attention layer."""

@@ -166,7 +148,10 @@ class Attention(tf.keras.layers.Layer):
     # Calculate dot product attention
     logits = tf.matmul(q, k, transpose_b=True)
     logits += bias
-    weights = _float32_softmax(logits, name="attention_weights")
+    # Note that softmax internally performs math operations using float32
+    # for numeric stability. When training with float16, we keep the input
+    # and output in float16 for better performance.
+    weights = tf.nn.softmax(logits, name="attention_weights")
     if training:
       weights = tf.nn.dropout(weights, rate=self.attention_dropout)
     attention_output = tf.matmul(weights, v)
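Below is a minimal sketch (not part of the commit) illustrating the dtype behavior the change relies on: `tf.nn.softmax` applied directly to float16 logits returns float16 attention weights, so the casts to and from float32 in the removed `_float32_softmax` wrapper are no longer needed at the input/output boundary; per the new in-code comment, the op is still expected to do its internal math in float32 for numeric stability. Shapes and values here are made up for illustration.

```python
# Illustrative sketch only; shapes and values are arbitrary.
import numpy as np
import tensorflow as tf

# Fake attention logits in float16, as produced under mixed-precision training.
logits = tf.constant(np.random.randn(2, 4, 8), dtype=tf.float16)

# After this commit: softmax runs directly on the float16 logits and the
# resulting attention weights stay in float16.
weights = tf.nn.softmax(logits, name="attention_weights")
print(weights.dtype)  # float16

# The removed helper produced the same output dtype, but paid for two extra
# casts by round-tripping through float32.
def _float32_softmax(logits, name=None):
  input_dtype = logits.dtype
  logits = tf.cast(logits, tf.float32)
  output = tf.nn.softmax(logits, name=name)
  return tf.cast(output, input_dtype)

print(_float32_softmax(logits).dtype)  # also float16, with extra casts
```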