ModelZoo / ResNet50_tensorflow

Commit 057895af, authored Apr 12, 2020 by Chen Chen, committed by A. Unique TensorFlower on Apr 12, 2020

Internal change

PiperOrigin-RevId: 306182576

Parent: d466d4e6
Showing 2 changed files with 14 additions and 18 deletions:
official/nlp/modeling/layers/transformer_scaffold.py (+6, -13)
official/nlp/modeling/layers/transformer_scaffold_test.py (+8, -5)
official/nlp/modeling/layers/transformer_scaffold.py
@@ -145,6 +145,8 @@ class TransformerScaffold(tf.keras.layers.Layer):
        bias_constraint=self._bias_constraint,
        name="self_attention_output")
    self._attention_dropout = tf.keras.layers.Dropout(rate=self._dropout_rate)
    # Use float32 in layernorm for numeric stability.
    # It is probably safe in mixed_float16, but we haven't validated this yet.
    self._attention_layer_norm = (
        tf.keras.layers.LayerNormalization(
            name="self_attention_layer_norm",
            axis=-1,
            epsilon=1e-12,
@@ -159,7 +161,6 @@ class TransformerScaffold(tf.keras.layers.Layer):
        activity_regularizer=self._activity_regularizer,
        kernel_constraint=self._kernel_constraint,
        bias_constraint=self._bias_constraint,
        dtype=tf.float32,
        # This layer is always float32 for numeric stability.
        name="intermediate")
    self._output_dense = dense_einsum.DenseEinsum(
        output_shape=hidden_size,
@@ -172,6 +173,7 @@ class TransformerScaffold(tf.keras.layers.Layer):
        bias_constraint=self._bias_constraint,
        name="output")
    self._output_dropout = tf.keras.layers.Dropout(rate=self._dropout_rate)
    # Use float32 in layernorm for numeric stability.
    self._output_layer_norm = tf.keras.layers.LayerNormalization(
        name="output_layer_norm",
        axis=-1,
        epsilon=1e-12,
        dtype=tf.float32)
@@ -223,23 +225,14 @@ class TransformerScaffold(tf.keras.layers.Layer):
    attention_output = self._attention_layer(attention_inputs)
    attention_output = self._attention_output_dense(attention_output)
    attention_output = self._attention_dropout(attention_output)
    # Use float32 in keras layer norm and the gelu activation in the
    # intermediate dense layer for numeric stability
    if self.dtype == tf.float16:
      input_tensor = tf.cast(input_tensor, tf.float32)
      attention_output = tf.cast(attention_output, tf.float32)
    attention_output = self._attention_layer_norm(input_tensor +
                                                  attention_output)
    intermediate_output = self._intermediate_dense(attention_output)
    if self.dtype == tf.float16:
      intermediate_output = tf.cast(intermediate_output, tf.float16)
    layer_output = self._output_dense(intermediate_output)
    layer_output = self._output_dropout(layer_output)
    # Use float32 in keras layer norm for numeric stability
    if self.dtype == tf.float16:
      # During mixed precision training, attention_output is from layer norm and
      # is always fp32 for now. Cast layer_output to fp32 for the subsequent add.
      layer_output = tf.cast(layer_output, tf.float32)
    layer_output = self._output_layer_norm(layer_output + attention_output)
    if self.dtype == tf.float16:
      layer_output = tf.cast(layer_output, tf.float16)
    return layer_output
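Taken together, the transformer_scaffold.py changes pin the layer norms (and the intermediate dense layer) to float32 while the matmul-heavy sublayers run in half precision, casting activations to float32 around the residual add and back to float16 afterwards. Below is a minimal, self-contained sketch of that pattern, not the repository's TransformerScaffold: the Float32NormBlock layer and its sublayer names are illustrative assumptions, and it uses the tf.keras.mixed_precision.experimental API that was current when this commit landed (newer TF releases expose tf.keras.mixed_precision.set_global_policy instead).

import numpy as np
import tensorflow as tf

# Global policy as in the updated test: float32 variables, float16 compute.
tf.keras.mixed_precision.experimental.set_policy('mixed_float16')


class Float32NormBlock(tf.keras.layers.Layer):
  """Dense projection + residual + layer norm, with the norm kept in float32."""

  def __init__(self, hidden_size, **kwargs):
    super(Float32NormBlock, self).__init__(**kwargs)
    # Runs in float16 under the mixed_float16 policy.
    self._dense = tf.keras.layers.Dense(hidden_size, name="dense")
    # Pinned to float32, mirroring the dtype=tf.float32 arguments this commit
    # adds to the layer norms.
    self._layer_norm = tf.keras.layers.LayerNormalization(
        axis=-1, epsilon=1e-12, dtype=tf.float32, name="layer_norm")

  def call(self, inputs):
    outputs = self._dense(inputs)
    # Do the residual add and the normalization in float32, then cast back so
    # downstream layers keep receiving half-precision activations.
    outputs = self._layer_norm(
        tf.cast(inputs, tf.float32) + tf.cast(outputs, tf.float32))
    return tf.cast(outputs, inputs.dtype)


block = Float32NormBlock(hidden_size=80)
data = tf.constant(np.random.random_sample((6, 21, 80)), dtype=tf.float32)
print(block(data).dtype)  # float16: only the normalization ran in float32.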
official/nlp/modeling/layers/transformer_scaffold_test.py
@@ -54,6 +54,10 @@ class ValidatedAttentionLayer(attention.MultiHeadAttention):
@keras_parameterized.run_all_keras_modes
class TransformerLayerTest(keras_parameterized.TestCase):

  def tearDown(self):
    super(TransformerLayerTest, self).tearDown()
    tf.keras.mixed_precision.experimental.set_policy('float32')

  def test_layer_creation(self):
    sequence_length = 21
    width = 80
@@ -212,6 +216,7 @@ class TransformerLayerTest(keras_parameterized.TestCase):
    self.assertTrue(call_list[0],
                    "The passed layer class wasn't instantiated.")

  def test_layer_invocation_with_float16_dtype(self):
    tf.keras.mixed_precision.experimental.set_policy('mixed_float16')
    sequence_length = 21
    width = 80
@@ -226,12 +231,10 @@ class TransformerLayerTest(keras_parameterized.TestCase):
        attention_cfg=attention_layer_cfg,
        num_attention_heads=10,
        intermediate_size=2048,
        intermediate_activation='relu',
        dtype='float16')
        intermediate_activation='relu')
    # Create a 3-dimensional input (the first dimension is implicit).
    data_tensor = tf.keras.Input(
        shape=(sequence_length, width), dtype=tf.float16)
    data_tensor = tf.keras.Input(shape=(sequence_length, width))
    # Create a 2-dimensional input (the first dimension is implicit).
    mask_tensor = tf.keras.Input(shape=(sequence_length, sequence_length))
    output_tensor = test_layer([data_tensor, mask_tensor])
@@ -243,7 +246,7 @@ class TransformerLayerTest(keras_parameterized.TestCase):
    # (the NN is too complex) but this will rule out structural runtime errors.
    batch_size = 6
    input_data = (10 * np.random.random_sample(
        (batch_size, sequence_length, width))).astype(np.float16)
        (batch_size, sequence_length, width)))
    # The attention mask should be of shape (batch, from_seq_len, to_seq_len),
    # which here is (batch, sequence_length, sequence_length)
    mask_data = np.random.randint(
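The test-side change is the mirror image: instead of constructing the layer with dtype='float16' and feeding float16 tensors, the test now switches the global Keras policy to 'mixed_float16', feeds ordinary float32 data, and resets the policy in tearDown so later tests still run in float32. A minimal sketch of that pattern follows, using tf.test.TestCase and a plain Dense layer rather than the repository's keras_parameterized harness and TransformerScaffold; the class and test names are illustrative, and the experimental policy API matches the one used in this diff.

import numpy as np
import tensorflow as tf


class MixedPrecisionPolicyTest(tf.test.TestCase):

  def tearDown(self):
    super(MixedPrecisionPolicyTest, self).tearDown()
    # Restore the default policy so other tests keep running in float32.
    tf.keras.mixed_precision.experimental.set_policy('float32')

  def test_layer_invocation_with_mixed_float16_policy(self):
    tf.keras.mixed_precision.experimental.set_policy('mixed_float16')
    sequence_length = 21
    width = 80

    # Inputs stay float32; the policy casts them inside the layer.
    data = tf.keras.Input(shape=(sequence_length, width))
    output = tf.keras.layers.Dense(width)(data)
    model = tf.keras.Model(data, output)

    # Under mixed_float16 the layer computes, and therefore outputs, float16.
    self.assertEqual(output.dtype, tf.float16)

    batch_size = 6
    input_data = 10 * np.random.random_sample(
        (batch_size, sequence_length, width))
    _ = model.predict(input_data)


if __name__ == '__main__':
  tf.test.main()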