Commit b92025a9 authored by anivegesana

Merge branch 'master' of https://github.com/tensorflow/models into detection_generator_pr_2

parents 1b425791 37536370
......@@ -211,6 +211,44 @@ class PowerDecayWithOffsetLrConfig(base_config.Config):
pre_offset_learning_rate: float = 1.0e6
@dataclasses.dataclass
class StepCosineLrConfig(base_config.Config):
"""Configuration for stepwise learning rate decay.
This class is a container for the piecewise cosine learning rate scheduling
configs. It will configure an instance of the StepConsineDecayWithOffset Keras
learning rate schedule.
```python
boundaries: [100000, 110000]
values: [1.0, 0.5]
lr_decayed_fn = (
lr_schedule.StepConsineDecayWithOffset(
boundaries,
values))
```
From step 0 to 100000 the learning rate cosine-decays from 1.0 to 0.5, and
from step 100000 to 110000 it cosine-decays from 0.5 to 0.0.
Attributes:
name: The name of the learning rate schedule. Defaults to
StepConsineDecayWithOffset.
boundaries: A list of ints of strictly increasing entries; `boundaries[i]` is
the step at which the i-th cosine interval starts. Defaults to None.
values: A list of floats of the same length as `boundaries` that specifies
the learning rate at the start of each interval. The learning rate is
computed as follows:
[boundaries[0], boundaries[1]] -> cosine decay from values[0] to values[1]
[boundaries[1], boundaries[2]] -> cosine decay from values[1] to values[2]
...
[boundaries[n], end] -> cosine decay from values[n] to 0.
offset: An int. The offset applied to steps. Defaults to 0.
"""
name: str = 'StepConsineDecayWithOffset'
boundaries: Optional[List[int]] = None
values: Optional[List[float]] = None
offset: int = 0
@dataclasses.dataclass
class LinearWarmupConfig(base_config.Config):
"""Configuration for linear warmup schedule config.
......
......@@ -70,6 +70,7 @@ class LrConfig(oneof.OneOfConfig):
power_linear: learning rate config of step^power followed by
step^power*linear.
power_with_offset: power decay with a step offset.
step_cosine_with_offset: Step cosine with a step offset.
"""
type: Optional[str] = None
constant: lr_cfg.ConstantLrConfig = lr_cfg.ConstantLrConfig()
......@@ -82,6 +83,8 @@ class LrConfig(oneof.OneOfConfig):
lr_cfg.PowerAndLinearDecayLrConfig())
power_with_offset: lr_cfg.PowerDecayWithOffsetLrConfig = (
lr_cfg.PowerDecayWithOffsetLrConfig())
step_cosine_with_offset: lr_cfg.StepCosineLrConfig = (
lr_cfg.StepCosineLrConfig())
@dataclasses.dataclass
......
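For reference, a minimal sketch of selecting the new one-of entry. `LrConfig` and `lr_cfg` are the names used in this file; the numeric values are only illustrative.

```python
# Hedged sketch: field and type names come from the diff above; the numbers
# are illustrative, not recommended settings.
lr_config = LrConfig(
    type='step_cosine_with_offset',
    step_cosine_with_offset=lr_cfg.StepCosineLrConfig(
        boundaries=[0, 500000],
        values=[1e-4, 5e-5],
        offset=10000))
# The optimizer factory (see the LR_CLS mapping below) resolves the `type`
# string to the StepConsineDecayWithOffset schedule class.
```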
......@@ -14,6 +14,7 @@
"""Learning rate schedule classes."""
import math
from typing import Mapping, Any, Union, Optional
import tensorflow as tf
......@@ -383,3 +384,113 @@ class PowerDecayWithOffset(tf.keras.optimizers.schedules.LearningRateSchedule):
"pre_offset_learning_rate": self._pre_offset_lr,
"name": self._name,
}
class StepConsineDecayWithOffset(
tf.keras.optimizers.schedules.LearningRateSchedule):
"""Stepwise cosine learning rate decay with offset.
Learning rate is equivalent to one or more cosine decays, each starting and
ending at an interval boundary.
Example:
```python
boundaries: [100000, 110000]
values: [1.0, 0.5]
lr_decayed_fn = (
lr_schedule.StepConsineDecayWithOffset(
boundaries,
values))
```
From step 0 to 100000 the learning rate cosine-decays from 1.0 to 0.5, and
from step 100000 to 110000 it cosine-decays from 0.5 to 0.0.
"""
def __init__(self,
boundaries,
values,
offset: int = 0,
name: str = "StepConsineDecayWithOffset"):
"""Initialize configuration of the learning rate schedule.
Args:
boundaries: A list of `Tensor`s or `int`s with strictly
increasing entries, and with all elements having the same type as the
optimizer step.
values: A list of `Tensor`s or `float`s that specifies the
learning rate at the start of each interval defined by `boundaries`. It
should have the same length as `boundaries`, and all elements should have
the same type.
offset: An int. The offset subtracted from the global step before the
decay is computed.
name: Optional, name of learning rate schedule.
"""
super().__init__()
self.values = values
self.boundaries = boundaries
self.offset = offset
self.name = name
if len(self.values) < 1:
raise ValueError(f"Expect non-empty values, got {self.values}")
if len(self.boundaries) != len(self.values):
raise ValueError(
"Boundaries length must be equal to learning rate levels length: "
f"{len(self.boundaries)} != {len(self.values)}")
self.total_steps = (
[boundaries[i + 1] - boundaries[i] for i in range(len(boundaries) - 1)
] + [0])
def __call__(self, global_step):
with tf.name_scope(self.name or "StepConsineDecayWithOffset"):
global_step = tf.cast(global_step - self.offset, tf.float32)
lr_levels = self.values
lr_steps = self.boundaries
level_total_steps = self.total_steps
num_levels = len(lr_levels)
init_lr = lr_levels[0]
next_init_lr = lr_levels[1] if num_levels > 1 else 0.
init_total_steps = level_total_steps[0]
cosine_learning_rate = ((init_lr - next_init_lr) * (tf.cos(
tf.constant(math.pi) * (global_step) /
(init_total_steps)) + 1.0) / 2.0 + next_init_lr)
learning_rate = cosine_learning_rate
tf.compat.v1.logging.info("DEBUG lr %r next lr %r", learning_rate,
cosine_learning_rate)
tf.compat.v1.logging.info("DEBUG lr %r next lr %r inittotalstep %r",
init_lr, next_init_lr, init_total_steps)
for i in range(1, num_levels):
next_init_lr = lr_levels[i]
next_start_step = lr_steps[i]
next_total_steps = level_total_steps[i]
next_next_init_lr = lr_levels[i + 1] if num_levels > i + 1 else 0.
tf.compat.v1.logging.info(
"DEBUG step %r nilr %r nss %r nts %r nnilr %r", global_step,
next_init_lr, next_start_step, next_total_steps, next_next_init_lr)
next_cosine_learning_rate = ((next_init_lr - next_next_init_lr) *
(tf.cos(
tf.constant(math.pi) *
(global_step - next_start_step) /
(next_total_steps)) + 1.0) / 2.0 +
next_next_init_lr)
learning_rate = tf.where(global_step >= next_start_step,
next_cosine_learning_rate, learning_rate)
tf.compat.v1.logging.info("DEBUG lr %r next lr %r", learning_rate,
next_cosine_learning_rate)
return learning_rate
def get_config(self):
return {
"boundaries": self.boundaries,
"values": self.values,
"offset": self.offset,
"name": self.name
}
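A minimal usage sketch of the schedule defined above; the import path and the probe steps are assumptions, while the boundaries, values, and offset mirror the factory unit test further down.

```python
# Hedged sketch: the import path and probe steps are illustrative; the
# schedule parameters mirror the optimizer factory unit test below.
import tensorflow as tf

from official.modeling.optimization import lr_schedule

schedule = lr_schedule.StepConsineDecayWithOffset(
    boundaries=[0, 500000],  # start step of each cosine interval
    values=[1e-4, 5e-5],     # learning rate at the start of each interval
    offset=10000)            # steps subtracted before the decay is evaluated

optimizer = tf.keras.optimizers.SGD(learning_rate=schedule, momentum=0.9)

for step in (10000, 260000, 499999):
    print(step, float(schedule(step)))  # ~1e-4, ~7.5e-5, ~5e-5
```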
......@@ -47,6 +47,7 @@ LR_CLS = {
'power': lr_schedule.DirectPowerDecay,
'power_linear': lr_schedule.PowerAndLinearDecay,
'power_with_offset': lr_schedule.PowerDecayWithOffset,
'step_cosine_with_offset': lr_schedule.StepConsineDecayWithOffset,
}
WARMUP_CLS = {
......
......@@ -394,5 +394,38 @@ class OptimizerFactoryTest(tf.test.TestCase, parameterized.TestCase):
for step, value in expected_lr_step_values:
self.assertAlmostEqual(lr(step).numpy(), value)
def test_step_cosine_lr_schedule_with_warmup(self):
params = {
'optimizer': {
'type': 'sgd',
'sgd': {
'momentum': 0.9
}
},
'learning_rate': {
'type': 'step_cosine_with_offset',
'step_cosine_with_offset': {
'values': (0.0001, 0.00005),
'boundaries': (0, 500000),
'offset': 10000,
}
},
'warmup': {
'type': 'linear',
'linear': {
'warmup_steps': 10000,
'warmup_learning_rate': 0.0
}
}
}
expected_lr_step_values = [[0, 0.0], [5000, 1e-4/2.0], [10000, 1e-4],
[20000, 9.994863e-05], [499999, 5e-05]]
opt_config = optimization_config.OptimizationConfig(params)
opt_factory = optimizer_factory.OptimizerFactory(opt_config)
lr = opt_factory.build_learning_rate()
for step, value in expected_lr_step_values:
self.assertAlmostEqual(lr(step).numpy(), value)
if __name__ == '__main__':
tf.test.main()
......@@ -44,6 +44,8 @@ class SentencePredictionDataConfig(cfg.DataConfig):
# Maps the key in TfExample to feature name.
# E.g. 'label_ids' to 'next_sentence_labels'.
label_name: Optional[Tuple[str, str]] = None
# Either tfrecord, sstable, or recordio.
file_type: str = 'tfrecord'
@data_loader_factory.register_data_loader_cls(SentencePredictionDataConfig)
......@@ -111,7 +113,10 @@ class SentencePredictionDataLoader(data_loader.DataLoader):
def load(self, input_context: Optional[tf.distribute.InputContext] = None):
"""Returns a tf.dataset.Dataset."""
reader = input_reader.InputReader(
params=self._params, decoder_fn=self._decode, parser_fn=self._parse)
dataset_fn=dataset_fn.pick_dataset_fn(self._params.file_type),
params=self._params,
decoder_fn=self._decode,
parser_fn=self._parse)
return reader.read(input_context)
......@@ -168,7 +173,8 @@ class TextProcessor(tf.Module):
vocab_file=vocab_file, lower_case=lower_case)
elif tokenization == 'SentencePiece':
self._tokenizer = modeling.layers.SentencepieceTokenizer(
model_file_path=vocab_file, lower_case=lower_case,
model_file_path=vocab_file,
lower_case=lower_case,
strip_diacritics=True) # Strip diacritics to follow ALBERT model
else:
raise ValueError('Unsupported tokenization: %s' % tokenization)
......
......@@ -66,14 +66,8 @@ class PositionEmbedding(tf.keras.layers.Layer):
def build(self, input_shape):
dimension_list = input_shape.as_list()
seq_length = dimension_list[self._seq_axis]
width = dimension_list[-1]
if self._max_length is not None:
weight_sequence_length = self._max_length
else:
weight_sequence_length = seq_length
weight_sequence_length = self._max_length
self._position_embeddings = self.add_weight(
"embeddings",
......
......@@ -39,23 +39,6 @@ class NoNorm(tf.keras.layers.Layer):
return output
@tf.keras.utils.register_keras_serializable(package='Text')
class NoNormClipped(NoNorm):
"""Quantization friendly implementation for the NoNorm.
The output of NoNorm layer is clipped to [-6.0, 6.0] to make it quantization
friendly.
"""
def __init__(self, name=None):
super(NoNormClipped, self).__init__(name=name)
def call(self, feature):
output = feature * self.scale + self.bias
clipped_output = tf.clip_by_value(output, -6.0, 6.0)
return clipped_output
def _get_norm_layer(normalization_type='no_norm', name=None):
"""Get normlization layer.
......@@ -69,8 +52,6 @@ def _get_norm_layer(normalization_type='no_norm', name=None):
"""
if normalization_type == 'no_norm':
layer = NoNorm(name=name)
elif normalization_type == 'no_norm_clipped':
layer = NoNormClipped(name=name)
elif normalization_type == 'layer_norm':
layer = tf.keras.layers.LayerNormalization(
name=name,
......
......@@ -33,22 +33,6 @@ def generate_fake_input(batch_size=1, seq_len=5, vocab_size=10000, seed=0):
return fake_input
class EdgeTPUNoNormTest(tf.test.TestCase):
def test_no_norm(self):
layer = mobile_bert_layers.NoNormClipped()
feature = tf.random.uniform(
[2, 3, 4], minval=-8, maxval=8, dtype=tf.float32)
output = layer(feature)
output_shape = output.shape.as_list()
expected_shape = [2, 3, 4]
self.assertListEqual(output_shape, expected_shape, msg=None)
output_min = tf.reduce_min(output)
output_max = tf.reduce_max(output)
self.assertGreaterEqual(6.0, output_max)
self.assertLessEqual(-6.0, output_min)
class MobileBertEncoderTest(parameterized.TestCase, tf.test.TestCase):
def test_embedding_layer_with_token_type(self):
......
......@@ -544,7 +544,8 @@ class TransformerDecoder(tf.keras.layers.Layer):
self_attention_mask=None,
cross_attention_mask=None,
cache=None,
decode_loop_step=None):
decode_loop_step=None,
return_all_decoder_outputs=False):
"""Return the output of the decoder layer stacks.
Args:
......@@ -561,6 +562,9 @@ class TransformerDecoder(tf.keras.layers.Layer):
...}
decode_loop_step: An integer, the step number of the decoding loop. Used
only for autoregressive inference on TPU.
return_all_decoder_outputs: Return all decoder layer outputs.
Note that the outputs are layer normed.
This is useful when introducing per layer auxiliary loss.
Returns:
Output of decoder.
......@@ -568,6 +572,7 @@ class TransformerDecoder(tf.keras.layers.Layer):
"""
output_tensor = target
decoder_outputs = []
for layer_idx in range(self.num_layers):
transformer_inputs = [
output_tensor, memory, cross_attention_mask, self_attention_mask
......@@ -581,7 +586,13 @@ class TransformerDecoder(tf.keras.layers.Layer):
transformer_inputs,
cache=cache[cache_layer_idx],
decode_loop_step=decode_loop_step)
return self.output_normalization(output_tensor)
if return_all_decoder_outputs:
decoder_outputs.append(self.output_normalization(output_tensor))
if return_all_decoder_outputs:
return decoder_outputs
else:
return self.output_normalization(output_tensor)
def attention_initializer(hidden_size):
......
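The new `return_all_decoder_outputs` flag is aimed at per-layer auxiliary losses; the sketch below shows one way a caller might consume the per-layer outputs. Every name other than the flag itself (`decoder_stack`, `target_embeddings`, `encoder_memory`, `aux_loss_fn`) is a hypothetical placeholder.

```python
# Hedged sketch: all names except `return_all_decoder_outputs` are
# hypothetical placeholders for the caller's own objects.
import tensorflow as tf

layer_outputs = decoder_stack(
    target_embeddings,
    encoder_memory,
    return_all_decoder_outputs=True)  # one layer-normed tensor per layer

# Auxiliary loss on the intermediate layers, main loss on the final output.
aux_loss = tf.add_n([aux_loss_fn(out) for out in layer_outputs[:-1]])
final_output = layer_outputs[-1]
```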
......@@ -16,6 +16,7 @@
# pylint: disable=g-classes-have-attributes
import collections
import tensorflow as tf
from tensorflow.python.util import deprecation
@tf.keras.utils.register_keras_serializable(package='Text')
......@@ -39,6 +40,8 @@ class Classification(tf.keras.Model):
`predictions`.
"""
@deprecation.deprecated(None, 'Classification as a network is deprecated. '
'Please use the layers.ClassificationHead instead.')
def __init__(self,
input_width,
num_classes,
......
......@@ -13,18 +13,18 @@
# limitations under the License.
"""Progressive distillation for MobileBERT student model."""
import dataclasses
from typing import List, Optional
from absl import logging
import dataclasses
import orbit
import tensorflow as tf
from official.core import base_task
from official.core import config_definitions as cfg
from official.modeling import optimization
from official.modeling import tf_utils
from official.modeling.fast_training.progressive import policies
from official.modeling.hyperparams import base_config
from official.modeling.progressive import policies
from official.nlp import keras_nlp
from official.nlp.configs import bert
from official.nlp.configs import encoders
......
......@@ -22,7 +22,7 @@ import tensorflow as tf
from official.core import config_definitions as cfg
from official.modeling import optimization
from official.modeling import tf_utils
from official.modeling.progressive import trainer as prog_trainer_lib
from official.modeling.fast_training.progressive import trainer as prog_trainer_lib
from official.nlp.configs import bert
from official.nlp.configs import encoders
from official.nlp.data import pretrain_dataloader
......
......@@ -28,8 +28,8 @@ from official.core import train_utils
from official.modeling import hyperparams
from official.modeling import optimization
from official.modeling import performance
from official.modeling.progressive import train_lib
from official.modeling.progressive import trainer as prog_trainer_lib
from official.modeling.fast_training.progressive import train_lib
from official.modeling.fast_training.progressive import trainer as prog_trainer_lib
from official.nlp.data import pretrain_dataloader
from official.nlp.projects.mobilebert import distillation
......
......@@ -84,7 +84,8 @@ class SentencePrediction(export_base.ExportModule):
def serve(self,
input_word_ids,
input_mask=None,
input_type_ids=None) -> Dict[str, tf.Tensor]:
input_type_ids=None,
use_prob=False) -> Dict[str, tf.Tensor]:
if input_type_ids is None:
# Requires CLS token is the first token of inputs.
input_type_ids = tf.zeros_like(input_word_ids)
......@@ -97,7 +98,10 @@ class SentencePrediction(export_base.ExportModule):
input_word_ids=input_word_ids,
input_mask=input_mask,
input_type_ids=input_type_ids)
return dict(outputs=self.inference_step(inputs))
if not use_prob:
return dict(outputs=self.inference_step(inputs))
else:
return dict(outputs=tf.nn.softmax(self.inference_step(inputs)))
@tf.function
def serve_examples(self, inputs) -> Dict[str, tf.Tensor]:
......
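A hedged sketch of the new `use_prob` switch in `serve`; `export_module` and the token ids are illustrative, only the argument names come from this change.

```python
# Hedged sketch: `export_module` and the ids are illustrative.
import tensorflow as tf

word_ids = tf.constant([[101, 2023, 2003, 1037, 3231, 102]])
logits = export_module.serve(input_word_ids=word_ids)['outputs']
probs = export_module.serve(input_word_ids=word_ids, use_prob=True)['outputs']
# Each row of `probs` sums to 1.0; `logits` are the raw classifier outputs.
```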
......@@ -13,10 +13,10 @@
# limitations under the License.
"""Sentence prediction (classification) task."""
import dataclasses
from typing import List, Union, Optional
from absl import logging
import dataclasses
import numpy as np
import orbit
from scipy import stats
......@@ -140,15 +140,26 @@ class SentencePredictionTask(base_task.Task):
del training
if self.task_config.model.num_classes == 1:
metrics = [tf.keras.metrics.MeanSquaredError()]
elif self.task_config.model.num_classes == 2:
metrics = [
tf.keras.metrics.SparseCategoricalAccuracy(name='cls_accuracy'),
tf.keras.metrics.AUC(name='auc', curve='PR'),
]
else:
metrics = [
tf.keras.metrics.SparseCategoricalAccuracy(name='cls_accuracy')
tf.keras.metrics.SparseCategoricalAccuracy(name='cls_accuracy'),
]
return metrics
def process_metrics(self, metrics, labels, model_outputs):
for metric in metrics:
metric.update_state(labels[self.label_field], model_outputs)
if metric.name == 'auc':
# Convert the logits to probabilities and extract the probability of the
# positive class.
metric.update_state(
labels[self.label_field],
tf.expand_dims(tf.nn.softmax(model_outputs)[:, 1], axis=1))
if metric.name == 'cls_accuracy':
metric.update_state(labels[self.label_field], model_outputs)
def process_compiled_metrics(self, compiled_metrics, labels, model_outputs):
compiled_metrics.update_state(labels[self.label_field], model_outputs)
......
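To make the AUC branch concrete, a small hedged illustration of the shape transformation applied before updating the metric; the logits are made up.

```python
# Hedged illustration: binary-classification logits of shape [batch, 2] are
# reduced to the positive-class probability of shape [batch, 1] for AUC.
import tensorflow as tf

logits = tf.constant([[2.0, 0.0], [0.0, 3.0]])                  # [2, 2]
pos_prob = tf.expand_dims(tf.nn.softmax(logits)[:, 1], axis=1)  # [2, 1]
# pos_prob ~= [[0.119], [0.953]]
```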
......@@ -13,11 +13,11 @@
# limitations under the License.
"""Defines the translation task."""
import dataclasses
import os
from typing import Optional
from absl import logging
import dataclasses
import sacrebleu
import tensorflow as tf
import tensorflow_text as tftxt
......
......@@ -85,7 +85,8 @@ class TranslationTaskTest(tf.test.TestCase):
def test_task(self):
config = translation.TranslationConfig(
model=translation.ModelConfig(
encoder=translation.EncDecoder(), decoder=translation.EncDecoder()),
encoder=translation.EncDecoder(num_layers=1),
decoder=translation.EncDecoder(num_layers=1)),
train_data=wmt_dataloader.WMTDataConfig(
input_path=self._record_input_path,
src_lang="en", tgt_lang="reverse_en",
......@@ -102,7 +103,8 @@ class TranslationTaskTest(tf.test.TestCase):
def test_no_sentencepiece_path(self):
config = translation.TranslationConfig(
model=translation.ModelConfig(
encoder=translation.EncDecoder(), decoder=translation.EncDecoder()),
encoder=translation.EncDecoder(num_layers=1),
decoder=translation.EncDecoder(num_layers=1)),
train_data=wmt_dataloader.WMTDataConfig(
input_path=self._record_input_path,
src_lang="en", tgt_lang="reverse_en",
......@@ -122,7 +124,8 @@ class TranslationTaskTest(tf.test.TestCase):
sentencepeice_model_prefix)
config = translation.TranslationConfig(
model=translation.ModelConfig(
encoder=translation.EncDecoder(), decoder=translation.EncDecoder()),
encoder=translation.EncDecoder(num_layers=1),
decoder=translation.EncDecoder(num_layers=1)),
train_data=wmt_dataloader.WMTDataConfig(
input_path=self._record_input_path,
src_lang="en", tgt_lang="reverse_en",
......@@ -137,7 +140,8 @@ class TranslationTaskTest(tf.test.TestCase):
def test_evaluation(self):
config = translation.TranslationConfig(
model=translation.ModelConfig(
encoder=translation.EncDecoder(), decoder=translation.EncDecoder(),
encoder=translation.EncDecoder(num_layers=1),
decoder=translation.EncDecoder(num_layers=1),
padded_decode=False,
decode_max_length=64),
validation_data=wmt_dataloader.WMTDataConfig(
......
......@@ -27,9 +27,15 @@ from official.core import task_factory
from official.core import train_lib
from official.core import train_utils
from official.modeling import performance
from official.nlp import continuous_finetune_lib
FLAGS = flags.FLAGS
flags.DEFINE_integer(
'pretrain_steps',
default=None,
help='The number of total training steps for the pretraining job.')
def main(_):
gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
......@@ -40,27 +46,33 @@ def main(_):
# may race against the train job for writing the same file.
train_utils.serialize_config(params, model_dir)
# Sets mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16'
# can have significant impact on model speeds by utilizing float16 in case of
# GPUs, and bfloat16 in the case of TPUs. loss_scale takes effect only when
# dtype is float16
if params.runtime.mixed_precision_dtype:
performance.set_mixed_precision_policy(params.runtime.mixed_precision_dtype)
distribution_strategy = distribute_utils.get_distribution_strategy(
distribution_strategy=params.runtime.distribution_strategy,
all_reduce_alg=params.runtime.all_reduce_alg,
num_gpus=params.runtime.num_gpus,
tpu_address=params.runtime.tpu,
**params.runtime.model_parallelism())
with distribution_strategy.scope():
task = task_factory.get_task(params.task, logging_dir=model_dir)
if FLAGS.mode == 'continuous_train_and_eval':
continuous_finetune_lib.run_continuous_finetune(
FLAGS.mode, params, model_dir, pretrain_steps=FLAGS.pretrain_steps)
else:
# Sets mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16'
# can have significant impact on model speeds by utilizing float16 in case
# of GPUs, and bfloat16 in the case of TPUs. loss_scale takes effect only
# when dtype is float16
if params.runtime.mixed_precision_dtype:
performance.set_mixed_precision_policy(
params.runtime.mixed_precision_dtype)
distribution_strategy = distribute_utils.get_distribution_strategy(
distribution_strategy=params.runtime.distribution_strategy,
all_reduce_alg=params.runtime.all_reduce_alg,
num_gpus=params.runtime.num_gpus,
tpu_address=params.runtime.tpu,
**params.runtime.model_parallelism())
with distribution_strategy.scope():
task = task_factory.get_task(params.task, logging_dir=model_dir)
train_lib.run_experiment(
distribution_strategy=distribution_strategy,
task=task,
mode=FLAGS.mode,
params=params,
model_dir=model_dir)
train_lib.run_experiment(
distribution_strategy=distribution_strategy,
task=task,
mode=FLAGS.mode,
params=params,
model_dir=model_dir)
train_utils.save_gin_config(FLAGS.mode, model_dir)
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""TFM continuous finetuning+eval training driver."""
from absl import app
from absl import flags
import gin
# pylint: disable=unused-import
from official.common import registry_imports
# pylint: enable=unused-import
from official.common import flags as tfm_flags
from official.core import train_utils
from official.nlp import continuous_finetune_lib
FLAGS = flags.FLAGS
flags.DEFINE_integer(
'pretrain_steps',
default=None,
help='The number of total training steps for the pretraining job.')
def main(_):
# TODO(b/177863554): consolidate to nlp/train.py
gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
params = train_utils.parse_configuration(FLAGS)
model_dir = FLAGS.model_dir
train_utils.serialize_config(params, model_dir)
continuous_finetune_lib.run_continuous_finetune(
FLAGS.mode, params, model_dir, pretrain_steps=FLAGS.pretrain_steps)
train_utils.save_gin_config(FLAGS.mode, model_dir)
if __name__ == '__main__':
tfm_flags.define_flags()
app.run(main)