"git@developer.sourcefind.cn:modelzoo/resnet50_tensorflow.git" did not exist on "30e5fb000c8b4b1ab4469b45c6e50d460e8de2bd"
Commit 057895af authored by Chen Chen, committed by A. Unique TensorFlower

Internal change

PiperOrigin-RevId: 306182576
parent d466d4e6
@@ -145,6 +145,8 @@ class TransformerScaffold(tf.keras.layers.Layer):
         bias_constraint=self._bias_constraint,
         name="self_attention_output")
     self._attention_dropout = tf.keras.layers.Dropout(rate=self._dropout_rate)
+    # Use float32 in layernorm for numeric stability.
+    # It is probably safe in mixed_float16, but we haven't validated this yet.
     self._attention_layer_norm = (
         tf.keras.layers.LayerNormalization(
             name="self_attention_layer_norm", axis=-1, epsilon=1e-12,
@@ -159,7 +161,6 @@ class TransformerScaffold(tf.keras.layers.Layer):
         activity_regularizer=self._activity_regularizer,
         kernel_constraint=self._kernel_constraint,
         bias_constraint=self._bias_constraint,
-        dtype=tf.float32,  # This layer is always float32 for numeric stability.
         name="intermediate")
     self._output_dense = dense_einsum.DenseEinsum(
         output_shape=hidden_size,
@@ -172,6 +173,7 @@ class TransformerScaffold(tf.keras.layers.Layer):
         bias_constraint=self._bias_constraint,
         name="output")
     self._output_dropout = tf.keras.layers.Dropout(rate=self._dropout_rate)
+    # Use float32 in layernorm for numeric stability.
     self._output_layer_norm = tf.keras.layers.LayerNormalization(
         name="output_layer_norm", axis=-1, epsilon=1e-12, dtype=tf.float32)
@@ -223,23 +225,14 @@ class TransformerScaffold(tf.keras.layers.Layer):
     attention_output = self._attention_layer(attention_inputs)
     attention_output = self._attention_output_dense(attention_output)
     attention_output = self._attention_dropout(attention_output)
-    # Use float32 in keras layer norm and the gelu activation in the
-    # intermediate dense layer for numeric stability
-    if self.dtype == tf.float16:
-      input_tensor = tf.cast(input_tensor, tf.float32)
-      attention_output = tf.cast(attention_output, tf.float32)
     attention_output = self._attention_layer_norm(input_tensor +
                                                   attention_output)
     intermediate_output = self._intermediate_dense(attention_output)
-    if self.dtype == tf.float16:
-      intermediate_output = tf.cast(intermediate_output, tf.float16)
     layer_output = self._output_dense(intermediate_output)
     layer_output = self._output_dropout(layer_output)
-    # Use float32 in keras layer norm for numeric stability
-    if self.dtype == tf.float16:
-      layer_output = tf.cast(layer_output, tf.float32)
+    # During mixed precision training, attention_output is from layer norm and
+    # is always fp32 for now. Cast layer_output to fp32 for the subsequent add.
+    layer_output = tf.cast(layer_output, tf.float32)
     layer_output = self._output_layer_norm(layer_output + attention_output)
-    if self.dtype == tf.float16:
-      layer_output = tf.cast(layer_output, tf.float16)
     return layer_output
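The one cast that remains in `call()` is there because TensorFlow does not promote dtypes in a plain add: a float16 `layer_output` cannot be summed with the float32 `attention_output` coming out of the pinned layer norm. A small standalone sketch of that failure mode and the fix (TF 2.x eager mode assumed; the tensors are illustrative):

```python
import tensorflow as tf

# Sketch only: illustrative tensors standing in for layer_output (float16) and
# attention_output (float32 from the pinned layer norm).
fp16_layer_output = tf.constant([1.0, 2.0], dtype=tf.float16)
fp32_attention_output = tf.constant([0.5, 0.5], dtype=tf.float32)

try:
  _ = fp16_layer_output + fp32_attention_output      # mismatched dtypes: TF raises here
except (TypeError, tf.errors.InvalidArgumentError) as e:
  print('mixed-dtype add failed:', type(e).__name__)

# Casting the float16 side up first, as the new call() does, makes the add valid.
summed = tf.cast(fp16_layer_output, tf.float32) + fp32_attention_output
print(summed.dtype)   # float32, ready for the float32 output layer norm
```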
@@ -54,6 +54,10 @@ class ValidatedAttentionLayer(attention.MultiHeadAttention):
 @keras_parameterized.run_all_keras_modes
 class TransformerLayerTest(keras_parameterized.TestCase):
 
+  def tearDown(self):
+    super(TransformerLayerTest, self).tearDown()
+    tf.keras.mixed_precision.experimental.set_policy('float32')
+
   def test_layer_creation(self):
     sequence_length = 21
     width = 80
@@ -212,6 +216,7 @@ class TransformerLayerTest(keras_parameterized.TestCase):
     self.assertTrue(call_list[0], "The passed layer class wasn't instantiated.")
 
   def test_layer_invocation_with_float16_dtype(self):
+    tf.keras.mixed_precision.experimental.set_policy('mixed_float16')
     sequence_length = 21
     width = 80
@@ -226,12 +231,10 @@ class TransformerLayerTest(keras_parameterized.TestCase):
         attention_cfg=attention_layer_cfg,
         num_attention_heads=10,
         intermediate_size=2048,
-        intermediate_activation='relu',
-        dtype='float16')
+        intermediate_activation='relu')
 
     # Create a 3-dimensional input (the first dimension is implicit).
-    data_tensor = tf.keras.Input(
-        shape=(sequence_length, width), dtype=tf.float16)
+    data_tensor = tf.keras.Input(shape=(sequence_length, width))
     # Create a 2-dimensional input (the first dimension is implicit).
     mask_tensor = tf.keras.Input(shape=(sequence_length, sequence_length))
     output_tensor = test_layer([data_tensor, mask_tensor])
@@ -243,7 +246,7 @@ class TransformerLayerTest(keras_parameterized.TestCase):
     # (the NN is too complex) but this will rule out structural runtime errors.
     batch_size = 6
     input_data = (10 * np.random.random_sample(
-        (batch_size, sequence_length, width))).astype(np.float16)
+        (batch_size, sequence_length, width)))
     # The attention mask should be of shape (batch, from_seq_len, to_seq_len),
     # which here is (batch, sequence_length, sequence_length)
     mask_data = np.random.randint(
...
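The test-side changes follow the same idea: dtype now comes from the global policy rather than per-layer `dtype` arguments or float16 inputs, and the new `tearDown` restores `float32` so the policy cannot leak into other tests. A hedged sketch of that pattern outside the test harness (the Dense model is a stand-in, not the `TransformerScaffold` under test):

```python
import numpy as np
import tensorflow as tf

# Sketch only: a stand-in model, not the TransformerScaffold under test.
tf.keras.mixed_precision.experimental.set_policy('mixed_float16')
try:
  data_tensor = tf.keras.Input(shape=(21, 80))       # no explicit float16 dtype needed
  output_tensor = tf.keras.layers.Dense(80)(data_tensor)
  model = tf.keras.Model(data_tensor, output_tensor)

  input_data = 10 * np.random.random_sample((6, 21, 80))   # plain float numpy input
  result = model.predict(input_data)
  print(result.dtype)                                # float16 under mixed_float16
finally:
  # Mirrors the new tearDown: always restore the default policy.
  tf.keras.mixed_precision.experimental.set_policy('float32')
```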