Commit 4334a892 authored by Reed Wanderman-Milne, committed by A. Unique TensorFlower

Use nonexperimental mixed precision API.

This replaces symbols in tf.keras.mixed_precision.experimental with the corresponding nonexperimental symbols. In some cases, passing a Policy is replaced with passing a policy name for conciseness.
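
For reference, a minimal sketch of how the old calls map onto the new ones (the Dense layer below is only an illustrative placeholder, not code from this change):

  import tensorflow as tf

  # Old (experimental) API, as seen in the removed lines below:
  #   tf.keras.mixed_precision.experimental.set_policy("mixed_float16")
  #   policy = tf.keras.mixed_precision.experimental.global_policy()
  #   layer = tf.keras.layers.Dense(
  #       4, dtype=tf.keras.mixed_precision.experimental.Policy("mixed_float16"))

  # New (nonexperimental) API, as used in the added lines below:
  tf.keras.mixed_precision.set_global_policy("mixed_float16")
  policy = tf.keras.mixed_precision.global_policy()

  # Layers accept a policy name directly, so constructing a Policy object is
  # no longer necessary in the common case:
  layer = tf.keras.layers.Dense(4, dtype="mixed_float16")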

Additionally, the loss_scale flag is removed from the Shakespeare model, since supporting it with the nonexperimental API is slightly more verbose and the default loss scale is recommended anyway.
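
For context, a minimal sketch of what keeping a configurable loss scale would look like with the nonexperimental API (the SGD optimizer and the fixed scale of 1024 are illustrative assumptions, not values from the Shakespeare model): the loss scale is configured by wrapping the optimizer in tf.keras.mixed_precision.LossScaleOptimizer instead of being attached to the policy.

  import tensorflow as tf

  tf.keras.mixed_precision.set_global_policy("mixed_float16")

  # Default, recommended behavior: dynamic loss scaling.
  opt = tf.keras.mixed_precision.LossScaleOptimizer(tf.keras.optimizers.SGD())

  # A fixed loss scale (roughly what a loss_scale flag would map to) needs the
  # extra arguments, which is the added verbosity mentioned above.
  opt_fixed = tf.keras.mixed_precision.LossScaleOptimizer(
      tf.keras.optimizers.SGD(), dynamic=False, initial_scale=1024)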

PiperOrigin-RevId: 368123944
parent 19d18c00
@@ -29,7 +29,7 @@ class BertEncoderTest(keras_parameterized.TestCase):
def tearDown(self):
super(BertEncoderTest, self).tearDown()
- tf.keras.mixed_precision.experimental.set_policy("float32")
+ tf.keras.mixed_precision.set_global_policy("float32")
def test_network_creation(self):
hidden_size = 32
@@ -92,7 +92,7 @@ class BertEncoderTest(keras_parameterized.TestCase):
def test_network_creation_with_float16_dtype(self):
hidden_size = 32
sequence_length = 21
- tf.keras.mixed_precision.experimental.set_policy("mixed_float16")
+ tf.keras.mixed_precision.set_global_policy("mixed_float16")
# Create a small BertEncoder for testing.
test_network = bert_encoder.BertEncoder(
vocab_size=100,
......
@@ -45,9 +45,9 @@ class OnDeviceEmbeddingTest(keras_parameterized.TestCase):
def test_layer_creation_with_mixed_precision(self):
vocab_size = 31
embedding_width = 27
- policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16")
test_layer = on_device_embedding.OnDeviceEmbedding(
- vocab_size=vocab_size, embedding_width=embedding_width, dtype=policy)
+ vocab_size=vocab_size, embedding_width=embedding_width,
+ dtype="mixed_float16")
# Create a 2-dimensional input (the first dimension is implicit).
sequence_length = 23
input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32)
@@ -83,9 +83,9 @@ class OnDeviceEmbeddingTest(keras_parameterized.TestCase):
def test_layer_invocation_with_mixed_precision(self):
vocab_size = 31
embedding_width = 27
- policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16")
test_layer = on_device_embedding.OnDeviceEmbedding(
- vocab_size=vocab_size, embedding_width=embedding_width, dtype=policy)
+ vocab_size=vocab_size, embedding_width=embedding_width,
+ dtype="mixed_float16")
# Create a 2-dimensional input (the first dimension is implicit).
sequence_length = 23
input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32)
@@ -123,11 +123,10 @@ class OnDeviceEmbeddingTest(keras_parameterized.TestCase):
def test_one_hot_layer_creation_with_mixed_precision(self):
vocab_size = 31
embedding_width = 27
- policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16")
test_layer = on_device_embedding.OnDeviceEmbedding(
vocab_size=vocab_size,
embedding_width=embedding_width,
- dtype=policy,
+ dtype="mixed_float16",
use_one_hot=True)
# Create a 2-dimensional input (the first dimension is implicit).
sequence_length = 23
@@ -166,11 +165,10 @@ class OnDeviceEmbeddingTest(keras_parameterized.TestCase):
def test_one_hot_layer_invocation_with_mixed_precision(self):
vocab_size = 31
embedding_width = 27
- policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16")
test_layer = on_device_embedding.OnDeviceEmbedding(
vocab_size=vocab_size,
embedding_width=embedding_width,
- dtype=policy,
+ dtype="mixed_float16",
use_one_hot=True)
# Create a 2-dimensional input (the first dimension is implicit).
sequence_length = 23
......
@@ -159,7 +159,7 @@ class TransformerEncoderBlock(tf.keras.layers.Layer):
kernel_initializer=self._kernel_initializer,
name="intermediate",
**common_kwargs)
- policy = tf.keras.mixed_precision.experimental.global_policy()
+ policy = tf.keras.mixed_precision.global_policy()
if policy.name == "mixed_bfloat16":
# bfloat16 causes BERT with the LAMB optimizer to not converge
# as well, so we use float32.
......
@@ -29,7 +29,7 @@ class TransformerEncoderBlockLayerTest(keras_parameterized.TestCase):
def tearDown(self):
super(TransformerEncoderBlockLayerTest, self).tearDown()
- tf.keras.mixed_precision.experimental.set_policy('float32')
+ tf.keras.mixed_precision.set_global_policy('float32')
def test_layer_creation(self, transformer_cls):
test_layer = transformer_cls(
@@ -180,7 +180,7 @@ class TransformerEncoderBlockLayerTest(keras_parameterized.TestCase):
new_output_tensor, output_tensor[:, 0:1, :], atol=5e-5, rtol=0.003)
def test_layer_invocation_with_float16_dtype(self, transformer_cls):
- tf.keras.mixed_precision.experimental.set_policy('mixed_float16')
+ tf.keras.mixed_precision.set_global_policy('mixed_float16')
test_layer = transformer_cls(
num_attention_heads=10, inner_dim=2048, inner_activation='relu')
sequence_length = 21
......
@@ -108,7 +108,7 @@ class GatedFeedforward(tf.keras.layers.Layer):
self._output_dense = []
self._output_dropout = []
self._output_layer_norm = []
- activation_policy = tf.keras.mixed_precision.experimental.global_policy()
+ activation_policy = tf.keras.mixed_precision.global_policy()
if activation_policy.name == "mixed_bfloat16":
# bfloat16 causes BERT with the LAMB optimizer to not converge
# as well, so we use float32.
......
@@ -29,7 +29,7 @@ class GatedFeedforwardTest(keras_parameterized.TestCase):
def tearDown(self):
super(GatedFeedforwardTest, self).tearDown()
- tf.keras.mixed_precision.experimental.set_policy("float32")
+ tf.keras.mixed_precision.set_global_policy("float32")
@parameterized.parameters(
(True, 1, "after_residual", "float32"),
@@ -42,7 +42,7 @@ class GatedFeedforwardTest(keras_parameterized.TestCase):
(False, 1, "before_residual", "mixed_float16"),
)
def test_layer_creation(self, use_gate, num_blocks, dropout_position, dtype):
- tf.keras.mixed_precision.experimental.set_policy(dtype)
+ tf.keras.mixed_precision.set_global_policy(dtype)
kwargs = dict(
intermediate_size=128,
intermediate_activation="relu",
@@ -74,7 +74,7 @@ class GatedFeedforwardTest(keras_parameterized.TestCase):
)
def test_layer_invocation(self, use_gate, num_blocks, dropout_position,
dtype):
- tf.keras.mixed_precision.experimental.set_policy(dtype)
+ tf.keras.mixed_precision.set_global_policy(dtype)
kwargs = dict(
intermediate_size=16,
intermediate_activation="relu",
......
@@ -132,7 +132,7 @@ class ReZeroTransformer(tf.keras.layers.Layer):
bias_axes="d",
name="intermediate",
**common_kwargs)
- policy = tf.keras.mixed_precision.experimental.global_policy()
+ policy = tf.keras.mixed_precision.global_policy()
if policy.name == "mixed_bfloat16":
# bfloat16 causes BERT with the LAMB optimizer to not converge
# as well, so we use float32.
......
@@ -28,10 +28,10 @@ class TransformerWithReZeroLayerTest(keras_parameterized.TestCase):
def tearDown(self):
super(TransformerWithReZeroLayerTest, self).tearDown()
- tf.keras.mixed_precision.experimental.set_policy('float32')
+ tf.keras.mixed_precision.set_global_policy('float32')
def test_layer_invocation_with_float16_dtype(self):
- tf.keras.mixed_precision.experimental.set_policy('mixed_float16')
+ tf.keras.mixed_precision.set_global_policy('mixed_float16')
test_layer = rezero_transformer.ReZeroTransformer(
num_attention_heads=10,
intermediate_size=2048,
......
@@ -30,7 +30,7 @@ class TransformerLayerTest(keras_parameterized.TestCase):
def tearDown(self):
super(TransformerLayerTest, self).tearDown()
- tf.keras.mixed_precision.experimental.set_policy('float32')
+ tf.keras.mixed_precision.set_global_policy('float32')
def test_layer_creation(self, transformer_cls):
test_layer = transformer_cls(
@@ -151,7 +151,7 @@ class TransformerLayerTest(keras_parameterized.TestCase):
new_output_tensor, output_tensor[:, 0:1, :], atol=5e-5, rtol=0.003)
def test_layer_invocation_with_float16_dtype(self, transformer_cls):
- tf.keras.mixed_precision.experimental.set_policy('mixed_float16')
+ tf.keras.mixed_precision.set_global_policy('mixed_float16')
test_layer = transformer_cls(
num_attention_heads=16,
intermediate_size=2048,
......
@@ -190,7 +190,7 @@ class TransformerScaffold(tf.keras.layers.Layer):
bias_axes="d",
name="intermediate",
**common_kwargs)
- policy = tf.keras.mixed_precision.experimental.global_policy()
+ policy = tf.keras.mixed_precision.global_policy()
if policy.name == "mixed_bfloat16":
# bfloat16 causes BERT with the LAMB optimizer to not converge
# as well, so we use float32.
......
@@ -83,7 +83,7 @@ class TransformerLayerTest(keras_parameterized.TestCase):
def tearDown(self):
super(TransformerLayerTest, self).tearDown()
- tf.keras.mixed_precision.experimental.set_policy('float32')
+ tf.keras.mixed_precision.set_global_policy('float32')
def test_layer_creation(self):
sequence_length = 21
@@ -308,7 +308,7 @@ class TransformerLayerTest(keras_parameterized.TestCase):
self.assertTrue(call_list[0], "The passed layer class wasn't instantiated.")
def test_layer_invocation_with_float16_dtype(self):
- tf.keras.mixed_precision.experimental.set_policy('mixed_float16')
+ tf.keras.mixed_precision.set_global_policy('mixed_float16')
sequence_length = 21
width = 80
......
@@ -33,7 +33,7 @@ class AlbertEncoderTest(keras_parameterized.TestCase):
def tearDown(self):
super(AlbertEncoderTest, self).tearDown()
- tf.keras.mixed_precision.experimental.set_policy("float32")
+ tf.keras.mixed_precision.set_global_policy("float32")
@parameterized.named_parameters(
dict(testcase_name="default", expected_dtype=tf.float32),
@@ -49,7 +49,7 @@ class AlbertEncoderTest(keras_parameterized.TestCase):
num_attention_heads=2,
num_layers=3)
if expected_dtype == tf.float16:
- tf.keras.mixed_precision.experimental.set_policy("mixed_float16")
+ tf.keras.mixed_precision.set_global_policy("mixed_float16")
# Create a small TransformerEncoder for testing.
test_network = albert_encoder.AlbertEncoder(**kwargs)
@@ -148,7 +148,7 @@ class AlbertEncoderTest(keras_parameterized.TestCase):
self.assertLen(dict_outputs["pooled_output"], num_layers)
def test_serialize_deserialize(self):
- tf.keras.mixed_precision.experimental.set_policy("mixed_float16")
+ tf.keras.mixed_precision.set_global_policy("mixed_float16")
# Create a network object that sets all of its config options.
kwargs = dict(
vocab_size=100,
......
@@ -30,7 +30,7 @@ class BertEncoderTest(keras_parameterized.TestCase):
def tearDown(self):
super(BertEncoderTest, self).tearDown()
- tf.keras.mixed_precision.experimental.set_policy("float32")
+ tf.keras.mixed_precision.set_global_policy("float32")
def test_network_creation(self):
hidden_size = 32
@@ -119,7 +119,7 @@ class BertEncoderTest(keras_parameterized.TestCase):
def test_network_creation_with_float16_dtype(self):
hidden_size = 32
sequence_length = 21
- tf.keras.mixed_precision.experimental.set_policy("mixed_float16")
+ tf.keras.mixed_precision.set_global_policy("mixed_float16")
# Create a small BertEncoder for testing.
test_network = bert_encoder.BertEncoder(
vocab_size=100,
......
@@ -59,7 +59,7 @@ class Classification(tf.keras.Model):
if output == 'logits':
output_tensors = logits
elif output == 'predictions':
- policy = tf.keras.mixed_precision.experimental.global_policy()
+ policy = tf.keras.mixed_precision.global_policy()
if policy.name == 'mixed_bfloat16':
# b/158514794: bf16 is not stable with post-softmax cross-entropy.
policy = tf.float32
......
@@ -52,7 +52,7 @@ class EncoderScaffoldLayerClassTest(keras_parameterized.TestCase):
def tearDown(self):
super(EncoderScaffoldLayerClassTest, self).tearDown()
- tf.keras.mixed_precision.experimental.set_policy("float32")
+ tf.keras.mixed_precision.set_global_policy("float32")
@parameterized.named_parameters(
dict(testcase_name="only_final_output", return_all_layer_outputs=False),
@@ -132,7 +132,7 @@ class EncoderScaffoldLayerClassTest(keras_parameterized.TestCase):
self.assertTrue(hasattr(test_network, "_output_layer_norm"))
def test_network_creation_with_float16_dtype(self):
- tf.keras.mixed_precision.experimental.set_policy("mixed_float16")
+ tf.keras.mixed_precision.set_global_policy("mixed_float16")
hidden_size = 32
sequence_length = 21
embedding_cfg = {
......
@@ -27,7 +27,7 @@ class PackedSequenceEmbeddingTest(tf.test.TestCase, parameterized.TestCase):
def tearDown(self):
super(PackedSequenceEmbeddingTest, self).tearDown()
- tf.keras.mixed_precision.experimental.set_policy('float32')
+ tf.keras.mixed_precision.set_global_policy('float32')
@parameterized.parameters([
(True, True, True),
@@ -39,7 +39,7 @@ class PackedSequenceEmbeddingTest(tf.test.TestCase, parameterized.TestCase):
use_float16):
"""Validate that the Keras object can be created."""
if use_float16:
- tf.keras.mixed_precision.experimental.set_policy('mixed_float16')
+ tf.keras.mixed_precision.set_global_policy('mixed_float16')
seq_length = 16
vocab_size = 100
max_position_embeddings = 32
@@ -99,7 +99,7 @@ class PackedSequenceEmbeddingTest(tf.test.TestCase, parameterized.TestCase):
self.assertAllEqual(expected_attention_mask_shape, attention_mask.shape)
def test_serialize_deserialize(self):
- tf.keras.mixed_precision.experimental.set_policy('mixed_float16')
+ tf.keras.mixed_precision.set_global_policy('mixed_float16')
# Create a network object that sets all of its config options.
embedding_cfg = dict(
vocab_size=100,
......
@@ -67,10 +67,10 @@ class TransformerTaskTest(tf.test.TestCase):
self.bleu_source = os.path.join(temp_dir, 'bleu_source')
self.bleu_ref = os.path.join(temp_dir, 'bleu_ref')
self.orig_policy = (
- tf.compat.v2.keras.mixed_precision.experimental.global_policy())
+ tf.compat.v2.keras.mixed_precision.global_policy())
def tearDown(self): # pylint: disable=g-missing-super-call
- tf.compat.v2.keras.mixed_precision.experimental.set_policy(self.orig_policy)
+ tf.compat.v2.keras.mixed_precision.set_global_policy(self.orig_policy)
def _assert_exists(self, filepath):
self.assertTrue(os.path.exists(filepath))
......
@@ -70,9 +70,7 @@ def run_executor(params,
"""Runs the object detection model on distribution strategy defined by the user."""
if params.architecture.use_bfloat16:
- policy = tf.compat.v2.keras.mixed_precision.experimental.Policy(
- 'mixed_bfloat16')
- tf.compat.v2.keras.mixed_precision.experimental.set_policy(policy)
+ tf.compat.v2.keras.mixed_precision.set_global_policy('mixed_bfloat16')
model_builder = model_factory.model_generator(params)
......
@@ -60,9 +60,7 @@ class Model(object):
self._use_bfloat16 = params.architecture.use_bfloat16
if params.architecture.use_bfloat16:
- policy = tf.compat.v2.keras.mixed_precision.experimental.Policy(
- 'mixed_bfloat16')
- tf.compat.v2.keras.mixed_precision.experimental.set_policy(policy)
+ tf.compat.v2.keras.mixed_precision.set_global_policy('mixed_bfloat16')
# Optimization.
self._optimizer_fn = optimizers.OptimizerFactory(params.train.optimizer)
......