Merge branch 'panoptic-segmentation' into panoptic-deeplab-modeling

09d9656f · Srihari Humbarwadi · GitHub · ac671306 · 49a5706c · 09d9656f
Unverified Commit 09d9656f authored Jan 13, 2022 by Srihari Humbarwadi Committed by GitHub Jan 13, 2022
20 changed files
--- a/official/README.md
+++ b/official/README.md
@@ -58,12 +58,12 @@ In the near future, we will add:

 | Model | Reference (Paper) |
 |-------|-------------------|
-| [ALBERT (A Lite BERT)](nlp/albert) | [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942) |
-| [BERT (Bidirectional Encoder Representations from Transformers)](nlp/bert) | [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) |
+| [ALBERT (A Lite BERT)](nlp/MODEL_GARDEN.md#available-model-configs) | [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942) |
+| [BERT (Bidirectional Encoder Representations from Transformers)](nlp/MODEL_GARDEN.md#available-model-configs) | [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) |
 | [NHNet (News Headline generation model)](projects/nhnet) | [Generating Representative Headlines for News Stories](https://arxiv.org/abs/2001.09386) |
-| [Transformer](nlp/transformer) | [Attention Is All You Need](https://arxiv.org/abs/1706.03762) |
+| [Transformer](nlp/MODEL_GARDEN.md#available-model-configs) | [Attention Is All You Need](https://arxiv.org/abs/1706.03762) |
 | [XLNet](nlp/xlnet) | [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) |
-| [MobileBERT](nlp/projects/mobilebert) | [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) |
+| [MobileBERT](projects/mobilebert) | [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) |

 ### Recommendation


--- a/official/common/distribute_utils_test.py
+++ b/official/common/distribute_utils_test.py
@@ -14,10 +14,13 @@

 """Tests for distribution util functions."""

+import sys
 import tensorflow as tf

 from official.common import distribute_utils

+TPU_TEST = 'test_tpu' in sys.argv[0]
+

 class DistributeUtilsTest(tf.test.TestCase):
  """Tests for distribute util functions."""
@@ -51,6 +54,9 @@ class DistributeUtilsTest(tf.test.TestCase):
    self.assertIn('GPU', ds.extended.worker_devices[0])

  def test_mirrored_strategy(self):
+    # CPU only.
+    _ = distribute_utils.get_distribution_strategy(num_gpus=0)
+    # 5 GPUs.
    ds = distribute_utils.get_distribution_strategy(num_gpus=5)
    self.assertEquals(ds.num_replicas_in_sync, 5)
    self.assertEquals(len(ds.extended.worker_devices), 5)
@@ -78,10 +84,26 @@ class DistributeUtilsTest(tf.test.TestCase):
    self.assertIsInstance(
        ds, tf.distribute.experimental.MultiWorkerMirroredStrategy)

+    with self.assertRaisesRegex(
+        ValueError,
+        'When used with `multi_worker_mirrored`, valid values.*'):
+      _ = distribute_utils.get_distribution_strategy(
+          'multi_worker_mirrored', all_reduce_alg='dummy')
+
  def test_no_strategy(self):
    ds = distribute_utils.get_distribution_strategy('off')
    self.assertIs(ds, tf.distribute.get_strategy())

+  def test_tpu_strategy(self):
+    if not TPU_TEST:
+      self.skipTest('Only Cloud TPU VM instances can have local TPUs.')
+    with self.assertRaises(ValueError):
+      _ = distribute_utils.get_distribution_strategy('tpu')
+
+    ds = distribute_utils.get_distribution_strategy('tpu', tpu_address='local')
+    self.assertIsInstance(
+        ds, tf.distribute.TPUStrategy)
+
  def test_invalid_strategy(self):
    with self.assertRaisesRegexp(
        ValueError,

--- a/official/core/actions.py
+++ b/official/core/actions.py
@@ -28,7 +28,7 @@ from official.core import config_definitions
 from official.modeling import optimization


-class PruningActions:
+class PruningAction:
  """Train action to updates pruning related information.

  This action updates pruning steps at the end of trainig loop, and log
@@ -66,7 +66,7 @@ class PruningActions:
    """Update pruning step and log pruning summaries.

    Args:
-      output: The train output to test.
+      output: The train output.
    """
    self.update_pruning_step.on_epoch_end(batch=None)
    self.pruning_summaries.on_epoch_begin(epoch=None)
@@ -81,8 +81,11 @@ class EMACheckpointing:
  than training.
  """

-  def __init__(self, export_dir: str, optimizer: tf.keras.optimizers.Optimizer,
-               checkpoint: tf.train.Checkpoint, max_to_keep: int = 1):
+  def __init__(self,
+               export_dir: str,
+               optimizer: tf.keras.optimizers.Optimizer,
+               checkpoint: tf.train.Checkpoint,
+               max_to_keep: int = 1):
    """Initializes the instance.

    Args:
@@ -99,8 +102,7 @@ class EMACheckpointing:
                       'EMACheckpointing action')

    export_dir = os.path.join(export_dir, 'ema_checkpoints')
-    tf.io.gfile.makedirs(
-        os.path.dirname(export_dir))
+    tf.io.gfile.makedirs(os.path.dirname(export_dir))
    self._optimizer = optimizer
    self._checkpoint = checkpoint
    self._checkpoint_manager = tf.train.CheckpointManager(
@@ -113,7 +115,7 @@ class EMACheckpointing:
    """Swaps model weights, and saves the checkpoint.

    Args:
-      output: The train or eval output to test.
+      output: The train or eval output.
    """
    self._optimizer.swap_weights()
    self._checkpoint_manager.save(checkpoint_number=self._optimizer.iterations)
@@ -173,10 +175,9 @@ class RecoveryCondition:


 @gin.configurable
-def get_eval_actions(
-    params: config_definitions.ExperimentConfig,
-    trainer: base_trainer.Trainer,
-    model_dir: str) -> List[orbit.Action]:
+def get_eval_actions(params: config_definitions.ExperimentConfig,
+                     trainer: base_trainer.Trainer,
+                     model_dir: str) -> List[orbit.Action]:
  """Gets eval actions for TFM trainer."""
  eval_actions = []
  # Adds ema checkpointing action to save the average weights under
@@ -202,7 +203,7 @@ def get_train_actions(
  # Adds pruning callback actions.
  if hasattr(params.task, 'pruning'):
    train_actions.append(
-        PruningActions(
+        PruningAction(
            export_dir=model_dir,
            model=trainer.model,
            optimizer=trainer.optimizer))

--- a/official/core/actions_test.py
+++ b/official/core/actions_test.py
@@ -27,14 +27,16 @@ from official.core import actions
 from official.modeling import optimization


-class TestModel(tf.Module):
+class TestModel(tf.keras.Model):

  def __init__(self):
-    self.value = tf.Variable(0)
+    super().__init__()
+    self.value = tf.Variable(0.0)
+    self.dense = tf.keras.layers.Dense(2)
+    _ = self.dense(tf.zeros((2, 2), tf.float32))

-  @tf.function(input_signature=[])
-  def __call__(self):
-    return self.value
+  def call(self, x, training=None):
+    return self.value + x


 class ActionsTest(tf.test.TestCase, parameterized.TestCase):
@@ -43,7 +45,7 @@ class ActionsTest(tf.test.TestCase, parameterized.TestCase):
      combinations.combine(
          distribution=[
              strategy_combinations.cloud_tpu_strategy,
-              strategy_combinations.one_device_strategy_gpu,
+              strategy_combinations.one_device_strategy,
          ],))
  def test_ema_checkpointing(self, distribution):
    with distribution.scope():
@@ -62,18 +64,25 @@ class ActionsTest(tf.test.TestCase, parameterized.TestCase):
      model.value.assign(3)

      # Checks model.value is 3
-      self.assertEqual(model(), 3)
+      self.assertEqual(model(0.), 3)
      ema_action = actions.EMACheckpointing(directory, optimizer, checkpoint)

      ema_action({})
      self.assertNotEmpty(
          tf.io.gfile.glob(os.path.join(directory, 'ema_checkpoints')))

-      checkpoint.read(tf.train.latest_checkpoint(
-          os.path.join(directory, 'ema_checkpoints')))
+      checkpoint.read(
+          tf.train.latest_checkpoint(
+              os.path.join(directory, 'ema_checkpoints')))

      # Checks model.value is 0 after swapping.
-      self.assertEqual(model(), 0)
+      self.assertEqual(model(0.), 0)
+
+      # Raises an error for a normal optimizer.
+      with self.assertRaisesRegex(ValueError,
+                                  'Optimizer has to be instance of.*'):
+        _ = actions.EMACheckpointing(directory, tf.keras.optimizers.SGD(),
+                                     checkpoint)

  @combinations.generate(
      combinations.combine(
@@ -102,6 +111,21 @@ class ActionsTest(tf.test.TestCase, parameterized.TestCase):
      with self.assertRaises(RuntimeError):
        recover_condition(outputs)

+  @combinations.generate(
+      combinations.combine(
+          distribution=[
+              strategy_combinations.one_device_strategy_gpu,
+              strategy_combinations.one_device_strategy,
+          ],))
+  def test_pruning(self, distribution):
+    with distribution.scope():
+      directory = self.get_temp_dir()
+      model = TestModel()
+      optimizer = tf.keras.optimizers.SGD()
+      pruning = actions.PruningAction(directory, model, optimizer)
+
+      pruning({})
+

 if __name__ == '__main__':
  tf.test.main()
--- a/official/core/base_trainer.py
+++ b/official/core/base_trainer.py
@@ -247,14 +247,12 @@ class Trainer(_AsyncTrainer):
    self._validation_loss = tf.keras.metrics.Mean(
        "validation_loss", dtype=tf.float32)
    model_metrics = model.metrics if hasattr(model, "metrics") else []
-    self._train_metrics = self.task.build_metrics(
-        training=True) + model_metrics
-    self._validation_metrics = self.task.build_metrics(
-        training=False) + model_metrics

    self.init_async()

    if train:
+      self._train_metrics = self.task.build_metrics(
+          training=True) + model_metrics
      train_dataset = train_dataset or self.distribute_dataset(
          self.task.build_inputs, self.config.task.train_data)
      orbit.StandardTrainer.__init__(
@@ -266,6 +264,8 @@ class Trainer(_AsyncTrainer):
              use_tpu_summary_optimization=config.trainer.allow_tpu_summary))

    if evaluate:
+      self._validation_metrics = self.task.build_metrics(
+          training=False) + model_metrics
      validation_dataset = validation_dataset or self.distribute_dataset(
          self.task.build_inputs, self.config.task.validation_data)
      orbit.StandardEvaluator.__init__(
@@ -370,16 +370,6 @@ class Trainer(_AsyncTrainer):
    """Accesses the training checkpoint."""
    return self._checkpoint

-  # TODO(yejiayu): Remove this once all deps are fixed.
-  def add_recovery(self, params: TrainerConfig,
-                   checkpoint_manager: tf.train.CheckpointManager):
-    if params.recovery_max_trials >= 0:
-      self._recovery = Recovery(
-          loss_upper_bound=params.loss_upper_bound,
-          recovery_begin_steps=params.recovery_begin_steps,
-          recovery_max_trials=params.recovery_max_trials,
-          checkpoint_manager=checkpoint_manager)
-
  def train_loop_end(self):
    """See base class."""
    self.join()

--- a/official/core/export_base.py
+++ b/official/core/export_base.py
@@ -16,10 +16,13 @@

 import abc
 import functools
+import time
 from typing import Any, Callable, Dict, Mapping, List, Optional, Text, Union

+from absl import logging
 import tensorflow as tf
-from tensorflow.python.saved_model.model_utils import export_utils
+
+MAX_DIRECTORY_CREATION_ATTEMPTS = 10


 class ExportModule(tf.Module, metaclass=abc.ABCMeta):
@@ -89,7 +92,8 @@ def export(export_module: ExportModule,
           export_savedmodel_dir: Text,
           checkpoint_path: Optional[Text] = None,
           timestamped: bool = True,
-           save_options: Optional[tf.saved_model.SaveOptions] = None) -> Text:
+           save_options: Optional[tf.saved_model.SaveOptions] = None,
+           checkpoint: Optional[tf.train.Checkpoint] = None) -> Text:
  """Exports to SavedModel format.

  Args:
@@ -101,6 +105,8 @@ def export(export_module: ExportModule,
    checkpoint_path: Object-based checkpoint path or directory.
    timestamped: Whether to export the savedmodel to a timestamped directory.
    save_options: `SaveOptions` for `tf.saved_model.save`.
+    checkpoint: An optional tf.train.Checkpoint. If provided, the export module
+      will use it to read the weights.

  Returns:
    The savedmodel directory path.
@@ -109,7 +115,8 @@ def export(export_module: ExportModule,
  if ckpt_dir_or_file is not None and tf.io.gfile.isdir(ckpt_dir_or_file):
    ckpt_dir_or_file = tf.train.latest_checkpoint(ckpt_dir_or_file)
  if ckpt_dir_or_file:
-    checkpoint = tf.train.Checkpoint(model=export_module.model)
+    if checkpoint is None:
+      checkpoint = tf.train.Checkpoint(model=export_module.model)
    checkpoint.read(
        ckpt_dir_or_file).assert_existing_objects_matched().expect_partial()
  if isinstance(function_keys, list):
@@ -119,15 +126,48 @@ def export(export_module: ExportModule,
      }
    else:
      raise ValueError(
-          "If the function_keys is a list, it must contain a single element. %s"
+          'If the function_keys is a list, it must contain a single element. %s'
          % function_keys)

  signatures = export_module.get_inference_signatures(function_keys)
  if timestamped:
-    export_dir = export_utils.get_timestamped_export_dir(
-        export_savedmodel_dir).decode("utf-8")
+    export_dir = get_timestamped_export_dir(export_savedmodel_dir).decode(
+        'utf-8')
  else:
    export_dir = export_savedmodel_dir
  tf.saved_model.save(
      export_module, export_dir, signatures=signatures, options=save_options)
  return export_dir
+
+
+def get_timestamped_export_dir(export_dir_base):
+  """Builds a path to a new subdirectory within the base directory.
+
+  Args:
+    export_dir_base: A string containing a directory to write the exported graph
+      and checkpoints.
+
+  Returns:
+    The full path of the new subdirectory (which is not actually created yet).
+
+  Raises:
+    RuntimeError: if repeated attempts fail to obtain a unique timestamped
+      directory name.
+  """
+  attempts = 0
+  while attempts < MAX_DIRECTORY_CREATION_ATTEMPTS:
+    timestamp = int(time.time())
+
+    result_dir = tf.io.gfile.join(
+        tf.compat.as_bytes(export_dir_base), tf.compat.as_bytes(str(timestamp)))
+    if not tf.io.gfile.exists(result_dir):
+      # Collisions are still possible (though extremely unlikely): this
+      # directory is not actually created yet, but it will be almost
+      # instantly on return from this function.
+      return result_dir
+    time.sleep(1)
+    attempts += 1
+    logging.warning('Directory %s already exists; retrying (attempt %s/%s)',
+                    str(result_dir), attempts, MAX_DIRECTORY_CREATION_ATTEMPTS)
+  raise RuntimeError('Failed to obtain a unique export directory name after '
+                     f'{MAX_DIRECTORY_CREATION_ATTEMPTS} attempts.')
--- a/official/core/export_base_test.py
+++ b/official/core/export_base_test.py
@@ -121,6 +121,13 @@ class ExportBaseTest(tf.test.TestCase):
    output = module.serve(inputs)
    self.assertAllClose(output['outputs'].numpy(), 1.11)

+  def test_get_timestamped_export_dir(self):
+    export_dir = self.get_temp_dir()
+    timed_dir = export_base.get_timestamped_export_dir(
+        export_dir_base=export_dir)
+    self.assertFalse(tf.io.gfile.exists(timed_dir))
+    self.assertIn(export_dir, str(timed_dir))
+

 if __name__ == '__main__':
  tf.test.main()
--- a/official/core/train_utils.py
+++ b/official/core/train_utils.py
@@ -14,13 +14,13 @@

 """Training utils."""
 import copy
+import dataclasses
 import json
 import os
 import pprint
 from typing import Any, Callable, Dict, List, Optional, Union

 from absl import logging
-import dataclasses
 import gin
 import orbit
 import tensorflow as tf
@@ -244,49 +244,87 @@ class ParseConfigOptions:
    return name in dataclasses.asdict(self)


-def parse_configuration(flags_obj, lock_return=True, print_return=True):
-  """Parses ExperimentConfig from flags."""
+class ExperimentParser:
+  """Constructs the Experiment config from Flags or equivalent object.
+
+  In most cases, users only need to call the `parse()` function:
+  ```
+  builder = ExperimentParser(FLAGS)
+  params = builder.parse()
+  ```

-  if flags_obj.experiment is None:
-    raise ValueError('The flag --experiment must be specified.')
-
-  # 1. Get the default config from the registered experiment.
-  params = exp_factory.get_exp_config(flags_obj.experiment)
-
-  # 2. Get the first level of override from `--config_file`.
-  #    `--config_file` is typically used as a template that specifies the common
-  #    override for a particular experiment.
-  for config_file in flags_obj.config_file or []:
-    params = hyperparams.override_params_dict(
-        params, config_file, is_strict=True)
-
-  # 3. Override the TPU address and tf.data service address.
-  params.override({
-      'runtime': {
-          'tpu': flags_obj.tpu,
-      },
-  })
-  if ('tf_data_service' in flags_obj and flags_obj.tf_data_service and
-      isinstance(params.task, config_definitions.TaskConfig)):
+  The advanced users can modify the flow by calling the parse_*() functions
+  separately.
+  """
+
+  def __init__(self, flags_obj):
+    self._flags_obj = flags_obj
+
+  def parse(self):
+    """Overall process of constructing Experiment config."""
+    params = self.base_experiment()
+    params = self.parse_config_file(params)
+    params = self.parse_runtime(params)
+    params = self.parse_data_service(params)
+    params = self.parse_params_override(params)
+    return params
+
+  def base_experiment(self):
+    """Get the base experiment config from --experiment field."""
+    if self._flags_obj.experiment is None:
+      raise ValueError('The flag --experiment must be specified.')
+    return exp_factory.get_exp_config(self._flags_obj.experiment)
+
+  def parse_config_file(self, params):
+    """Override the configs of params from the config_file."""
+    for config_file in self._flags_obj.config_file or []:
+      params = hyperparams.override_params_dict(
+          params, config_file, is_strict=True)
+    return params
+
+  def parse_runtime(self, params):
+    """Override the runtime configs of params from flags."""
+    # Override the TPU address.
    params.override({
-        'task': {
-            'train_data': {
-                'tf_data_service_address': flags_obj.tf_data_service,
-            },
-            'validation_data': {
-                'tf_data_service_address': flags_obj.tf_data_service,
-            }
-        }
+        'runtime': {
+            'tpu': self._flags_obj.tpu,
+        },
    })
+    return params
+
+  def parse_data_service(self, params):
+    """Override the data service configs of params from flags."""
+    if ('tf_data_service' in self._flags_obj and
+        self._flags_obj.tf_data_service and
+        isinstance(params.task, config_definitions.TaskConfig)):
+      params.override({
+          'task': {
+              'train_data': {
+                  'tf_data_service_address': self._flags_obj.tf_data_service,
+              },
+              'validation_data': {
+                  'tf_data_service_address': self._flags_obj.tf_data_service,
+              }
+          }
+      })
+    return params
+
+  def parse_params_override(self, params):
+    # Get the second level of override from `--params_override`.
+    # `--params_override` is typically used as a further override over the
+    # template. For example, one may define a particular template for training
+    # ResNet50 on ImageNet in a config file and pass it via `--config_file`,
+    # then define different learning rates and pass it via `--params_override`.
+    if self._flags_obj.params_override:
+      params = hyperparams.override_params_dict(
+          params, self._flags_obj.params_override, is_strict=True)
+    return params
+
+
+def parse_configuration(flags_obj, lock_return=True, print_return=True):
+  """Parses ExperimentConfig from flags."""

-  # 4. Get the second level of override from `--params_override`.
-  #    `--params_override` is typically used as a further override over the
-  #    template. For example, one may define a particular template for training
-  #    ResNet50 on ImageNet in a config file and pass it via `--config_file`,
-  #    then define different learning rates and pass it via `--params_override`.
-  if flags_obj.params_override:
-    params = hyperparams.override_params_dict(
-        params, flags_obj.params_override, is_strict=True)
+  params = ExperimentParser(flags_obj).parse()

  params.validate()
  if lock_return:

--- a/official/core/train_utils_test.py
+++ b/official/core/train_utils_test.py
@@ -13,14 +13,38 @@
 # limitations under the License.

 """Tests for official.core.train_utils."""
-
+import json
 import os
+import pprint

 import numpy as np
 import tensorflow as tf

+from official.core import exp_factory
 from official.core import test_utils
 from official.core import train_utils
+from official.modeling import hyperparams
+
+
+@exp_factory.register_config_factory('foo')
+def foo():
+  """Multitask experiment for test."""
+  experiment_config = hyperparams.Config(
+      default_params={
+          'runtime': {
+              'tpu': 'fake',
+          },
+          'task': {
+              'model': {
+                  'model_id': 'bar',
+              },
+          },
+          'trainer': {
+              'train_steps': -1,
+              'validation_steps': -1,
+          },
+      })
+  return experiment_config


 class TrainUtilsTest(tf.test.TestCase):
@@ -93,6 +117,82 @@ class TrainUtilsTest(tf.test.TestCase):
    ]
    self.assertEqual(actual, expected)

+  def test_construct_experiment_from_flags(self):
+    options = train_utils.ParseConfigOptions(
+        experiment='foo',
+        config_file=[],
+        tpu='bar',
+        tf_data_service='',
+        params_override='task.model.model_id=new,'
+        'trainer.train_steps=10,'
+        'trainer.validation_steps=11')
+    builder = train_utils.ExperimentParser(options)
+    params_from_obj = builder.parse()
+    params_from_func = train_utils.parse_configuration(options)
+    pp = pprint.PrettyPrinter()
+    self.assertEqual(
+        pp.pformat(params_from_obj.as_dict()),
+        pp.pformat(params_from_func.as_dict()))
+    self.assertEqual(params_from_obj.runtime.tpu, 'bar')
+    self.assertEqual(params_from_obj.task.model.model_id, 'new')
+    self.assertEqual(params_from_obj.trainer.train_steps, 10)
+    self.assertEqual(params_from_obj.trainer.validation_steps, 11)
+
+
+class BestCheckpointExporterTest(tf.test.TestCase):
+
+  def test_maybe_export(self):
+    model_dir = self.create_tempdir().full_path
+    best_ckpt_path = os.path.join(model_dir, 'best_ckpt-1')
+    metric_name = 'test_metric|metric_1'
+    exporter = train_utils.BestCheckpointExporter(
+        model_dir, metric_name, 'higher')
+    v = tf.Variable(1.0)
+    checkpoint = tf.train.Checkpoint(v=v)
+    ret = exporter.maybe_export_checkpoint(
+        checkpoint, {'test_metric': {'metric_1': 5.0}}, 100)
+    with self.subTest(name='Successful first save.'):
+      self.assertEqual(ret, True)
+      v_2 = tf.Variable(2.0)
+      checkpoint_2 = tf.train.Checkpoint(v=v_2)
+      checkpoint_2.restore(best_ckpt_path)
+      self.assertEqual(v_2.numpy(), 1.0)
+
+    v = tf.Variable(3.0)
+    checkpoint = tf.train.Checkpoint(v=v)
+    ret = exporter.maybe_export_checkpoint(
+        checkpoint, {'test_metric': {'metric_1': 6.0}}, 200)
+    with self.subTest(name='Successful better metic save.'):
+      self.assertEqual(ret, True)
+      v_2 = tf.Variable(2.0)
+      checkpoint_2 = tf.train.Checkpoint(v=v_2)
+      checkpoint_2.restore(best_ckpt_path)
+      self.assertEqual(v_2.numpy(), 3.0)
+
+    v = tf.Variable(5.0)
+    checkpoint = tf.train.Checkpoint(v=v)
+    ret = exporter.maybe_export_checkpoint(
+        checkpoint, {'test_metric': {'metric_1': 1.0}}, 300)
+    with self.subTest(name='Worse metic no save.'):
+      self.assertEqual(ret, False)
+      v_2 = tf.Variable(2.0)
+      checkpoint_2 = tf.train.Checkpoint(v=v_2)
+      checkpoint_2.restore(best_ckpt_path)
+      self.assertEqual(v_2.numpy(), 3.0)
+
+  def test_export_best_eval_metric(self):
+    model_dir = self.create_tempdir().full_path
+    metric_name = 'test_metric|metric_1'
+    exporter = train_utils.BestCheckpointExporter(model_dir, metric_name,
+                                                  'higher')
+    exporter.export_best_eval_metric({'test_metric': {'metric_1': 5.0}}, 100)
+    with tf.io.gfile.GFile(os.path.join(model_dir, 'info.json'),
+                           'rb') as reader:
+      metric = json.loads(reader.read())
+      self.assertAllEqual(
+          metric,
+          {'test_metric': {'metric_1': 5.0}, 'best_ckpt_global_step': 100.0})
+

 if __name__ == '__main__':
  tf.test.main()
--- a/official/nlp/albert/__init__.py
+++ b/official/nlp/albert/__init__.py
--- a/official/legacy/albert/README.md
+++ b/official/legacy/albert/README.md
+# ALBERT (ALBERT: A Lite BERT for Self-supervised Learning of Language Representations)
+
+**WARNING**: This directory is deprecated.
+See `nlp/docs/MODEL_GARDEN.md` for the new ALBERT implementation.
--- a/official/nlp/projects/__init__.py
+++ b/official/nlp/projects/__init__.py
--- a/official/nlp/albert/configs.py
+++ b/official/nlp/albert/configs.py
--- a/official/legacy/detection/README.md
+++ b/official/legacy/detection/README.md
+# Object Detection Models on TensorFlow 2
+
+**WARNING**: This repository will be deprecated and replaced by the solid
+implementations inside vision/beta/.
+
+## Prerequisites
+To get started, download the code from TensorFlow models GitHub repository or
+use the pre-installed Google Cloud VM.
+
+```bash
+git clone https://github.com/tensorflow/models.git
+```
+
+Next, make sure to use TensorFlow 2.1+ on Google Cloud. Also, here are
+a few packages you need to install to get started:
+
+```bash
+sudo apt-get install -y python-tk && \
+pip3 install -r ~/models/official/requirements.txt
+```
+
+## Train RetinaNet on TPU
+
+### Train a vanilla ResNet-50 based RetinaNet.
+
+```bash
+TPU_NAME="<your GCP TPU name>"
+MODEL_DIR="<path to the directory to store model files>"
+RESNET_CHECKPOINT="<path to the pre-trained Resnet-50 checkpoint>"
+TRAIN_FILE_PATTERN="<path to the TFRecord training data>"
+EVAL_FILE_PATTERN="<path to the TFRecord validation data>"
+VAL_JSON_FILE="<path to the validation annotation JSON file>"
+python3 ~/models/official/legacy/detection/main.py \
+  --strategy_type=tpu \
+  --tpu="${TPU_NAME?}" \
+  --model_dir="${MODEL_DIR?}" \
+  --mode=train \
+  --params_override="{ type: retinanet, train: { checkpoint: { path: ${RESNET_CHECKPOINT?}, prefix: resnet50/ }, train_file_pattern: ${TRAIN_FILE_PATTERN?} }, eval: { val_json_file: ${VAL_JSON_FILE?}, eval_file_pattern: ${EVAL_FILE_PATTERN?} } }"
+```
+
+The pre-trained ResNet-50 checkpoint can be downloaded [here](https://storage.cloud.google.com/cloud-tpu-checkpoints/model-garden-vision/detection/resnet50-2018-02-07.tar.gz).
+
+Note: The ResNet implementation under
+[detection/](https://github.com/tensorflow/models/tree/master/official/legacy/detection)
+is currently different from the one under
+[classification/](https://github.com/tensorflow/models/tree/master/official/vision/image_classification),
+so the checkpoints are not compatible.
+We will unify the implementation soon.
+
+
+### Train a SpineNet-49 based RetinaNet.
+
+```bash
+TPU_NAME="<your GCP TPU name>"
+MODEL_DIR="<path to the directory to store model files>"
+TRAIN_FILE_PATTERN="<path to the TFRecord training data>"
+EVAL_FILE_PATTERN="<path to the TFRecord validation data>"
+VAL_JSON_FILE="<path to the validation annotation JSON file>"
+python3 ~/models/official/legacy/detection/main.py \
+  --strategy_type=tpu \
+  --tpu="${TPU_NAME?}" \
+  --model_dir="${MODEL_DIR?}" \
+  --mode=train \
+  --params_override="{ type: retinanet, architecture: {backbone: spinenet, multilevel_features: identity}, spinenet: {model_id: 49}, train: { train_file_pattern: ${TRAIN_FILE_PATTERN?} }, eval: { val_json_file: ${VAL_JSON_FILE?}, eval_file_pattern: ${EVAL_FILE_PATTERN?} } }"
+```
+
+
+### Train a custom RetinaNet using the config file.
+
+First, create a YAML config file, e.g. *my_retinanet.yaml*. This file specifies
+the parameters to be overridden, which should at least include the following
+fields.
+
+```YAML
+# my_retinanet.yaml
+type: 'retinanet'
+train:
+  train_file_pattern: <path to the TFRecord training data>
+eval:
+  eval_file_pattern: <path to the TFRecord validation data>
+  val_json_file: <path to the validation annotation JSON file>
+```
+
+Once the YAML config file is created, you can launch the training using the
+following command.
+
+```bash
+TPU_NAME="<your GCP TPU name>"
+MODEL_DIR="<path to the directory to store model files>"
+python3 ~/models/official/legacy/detection/main.py \
+  --strategy_type=tpu \
+  --tpu="${TPU_NAME?}" \
+  --model_dir="${MODEL_DIR?}" \
+  --mode=train \
+  --config_file="my_retinanet.yaml"
+```
+
+## Train RetinaNet on GPU
+
+Training on GPU is similar to that on TPU. The major change is the strategy
+type (use "[mirrored](https://www.tensorflow.org/api_docs/python/tf/distribute/MirroredStrategy)" for multiple GPU and
+"[one_device](https://www.tensorflow.org/api_docs/python/tf/distribute/OneDeviceStrategy)" for single GPU).
+
+Multi-GPU example (assuming there are 8 GPUs connected to the host):
+
+```bash
+MODEL_DIR="<path to the directory to store model files>"
+python3 ~/models/official/legacy/detection/main.py \
+  --strategy_type=mirrored \
+  --num_gpus=8 \
+  --model_dir="${MODEL_DIR?}" \
+  --mode=train \
+  --config_file="my_retinanet.yaml"
+```
+
+```bash
+MODEL_DIR="<path to the directory to store model files>"
+python3 ~/models/official/legacy/detection/main.py \
+  --strategy_type=one_device \
+  --num_gpus=1 \
+  --model_dir="${MODEL_DIR?}" \
+  --mode=train \
+  --config_file="my_retinanet.yaml"
+```
+
+An example with inline configuration (YAML or JSON format):
+
+```
+python3 ~/models/official/legacy/detection/main.py \
+  --model_dir=<model folder> \
+  --strategy_type=one_device \
+  --num_gpus=1 \
+  --mode=train \
+  --params_override="eval:
+ eval_file_pattern: <Eval TFRecord file pattern>
+ batch_size: 8
+ val_json_file: <COCO format groundtruth JSON file>
+predict:
+ predict_batch_size: 8
+architecture:
+ use_bfloat16: False
+train:
+ total_steps: 1
+ batch_size: 8
+ train_file_pattern: <Train TFRecord file pattern>
+use_tpu: False
+"
+```
+
+---
+
+## Train Mask R-CNN on TPU
+
+### Train a vanilla ResNet-50 based Mask R-CNN.
+
+```bash
+TPU_NAME="<your GCP TPU name>"
+MODEL_DIR="<path to the directory to store model files>"
+RESNET_CHECKPOINT="<path to the pre-trained Resnet-50 checkpoint>"
+TRAIN_FILE_PATTERN="<path to the TFRecord training data>"
+EVAL_FILE_PATTERN="<path to the TFRecord validation data>"
+VAL_JSON_FILE="<path to the validation annotation JSON file>"
+python3 ~/models/official/legacy/detection/main.py \
+  --strategy_type=tpu \
+  --tpu=${TPU_NAME} \
+  --model_dir=${MODEL_DIR} \
+  --mode=train \
+  --model=mask_rcnn \
+  --params_override="{train: { checkpoint: { path: ${RESNET_CHECKPOINT}, prefix: resnet50/ }, train_file_pattern: ${TRAIN_FILE_PATTERN} }, eval: { val_json_file: ${VAL_JSON_FILE}, eval_file_pattern: ${EVAL_FILE_PATTERN} } }"
+```
+
+The pre-trained ResNet-50 checkpoint can be downloaded [here](https://storage.cloud.google.com/cloud-tpu-checkpoints/model-garden-vision/detection/resnet50-2018-02-07.tar.gz).
+
+Note: The ResNet implementation under
+[detection/](https://github.com/tensorflow/models/tree/master/official/legacy/detection)
+is currently different from the one under
+[classification/](https://github.com/tensorflow/models/tree/master/official/vision/image_classification),
+so the checkpoints are not compatible.
+We will unify the implementation soon.
+
+
+### Train a SpineNet-49 based Mask R-CNN.
+
+```bash
+TPU_NAME="<your GCP TPU name>"
+MODEL_DIR="<path to the directory to store model files>"
+TRAIN_FILE_PATTERN="<path to the TFRecord training data>"
+EVAL_FILE_PATTERN="<path to the TFRecord validation data>"
+VAL_JSON_FILE="<path to the validation annotation JSON file>"
+python3 ~/models/official/legacy/detection/main.py \
+  --strategy_type=tpu \
+  --tpu="${TPU_NAME?}" \
+  --model_dir="${MODEL_DIR?}" \
+  --mode=train \
+  --model=mask_rcnn \
+  --params_override="{architecture: {backbone: spinenet, multilevel_features: identity}, spinenet: {model_id: 49}, train: { train_file_pattern: ${TRAIN_FILE_PATTERN?} }, eval: { val_json_file: ${VAL_JSON_FILE?}, eval_file_pattern: ${EVAL_FILE_PATTERN?} } }"
+```
+
+
+### Train a custom Mask R-CNN using the config file.
+
+First, create a YAML config file, e.g. *my_maskrcnn.yaml*.
+This file specifies the parameters to be overridden,
+which should at least include the following fields.
+
+```YAML
+# my_maskrcnn.yaml
+train:
+  train_file_pattern: <path to the TFRecord training data>
+eval:
+  eval_file_pattern: <path to the TFRecord validation data>
+  val_json_file: <path to the validation annotation JSON file>
+```
+
+Once the YAML config file is created, you can launch the training using the
+following command.
+
+```bash
+TPU_NAME="<your GCP TPU name>"
+MODEL_DIR="<path to the directory to store model files>"
+python3 ~/models/official/legacy/detection/main.py \
+  --strategy_type=tpu \
+  --tpu=${TPU_NAME} \
+  --model_dir=${MODEL_DIR} \
+  --mode=train \
+  --model=mask_rcnn \
+  --config_file="my_maskrcnn.yaml"
+```
+
+## Train Mask R-CNN on GPU
+
+Training on GPU is similar to that on TPU. The major change is the strategy type
+(use
+"[mirrored](https://www.tensorflow.org/api_docs/python/tf/distribute/MirroredStrategy)"
+for multiple GPU and
+"[one_device](https://www.tensorflow.org/api_docs/python/tf/distribute/OneDeviceStrategy)"
+for single GPU).
+
+Multi-GPU example (assuming there are 8 GPUs connected to the host):
+
+```bash
+MODEL_DIR="<path to the directory to store model files>"
+python3 ~/models/official/legacy/detection/main.py \
+  --strategy_type=mirrored \
+  --num_gpus=8 \
+  --model_dir=${MODEL_DIR} \
+  --mode=train \
+  --model=mask_rcnn \
+  --config_file="my_maskrcnn.yaml"
+```
+
+A single GPU example:
+
+```bash
+MODEL_DIR="<path to the directory to store model files>"
+python3 ~/models/official/legacy/detection/main.py \
+  --strategy_type=one_device \
+  --num_gpus=1 \
+  --model_dir=${MODEL_DIR} \
+  --mode=train \
+  --model=mask_rcnn \
+  --config_file="my_maskrcnn.yaml"
+```
+
+An example with inline configuration (YAML or JSON format):
+
+```
+python3 ~/models/official/legacy/detection/main.py \
+  --model_dir=<model folder> \
+  --strategy_type=one_device \
+  --num_gpus=1 \
+  --mode=train \
+  --model=mask_rcnn \
+  --params_override="eval:
+ eval_file_pattern: <Eval TFRecord file pattern>
+ batch_size: 8
+ val_json_file: <COCO format groundtruth JSON file>
+predict:
+ predict_batch_size: 8
+architecture:
+ use_bfloat16: False
+train:
+ total_steps: 1000
+ batch_size: 8
+ train_file_pattern: <Train TFRecord file pattern>
+use_tpu: False
+"
+```
+
+## Train ShapeMask on TPU
+
+### Train a ResNet-50 based ShapeMask.
+
+```bash
+TPU_NAME="<your GCP TPU name>"
+MODEL_DIR="<path to the directory to store model files>"
+RESNET_CHECKPOINT="<path to the pre-trained ResNet-50 checkpoint>"
+TRAIN_FILE_PATTERN="<path to the TFRecord training data>"
+EVAL_FILE_PATTERN="<path to the TFRecord validation data>"
+VAL_JSON_FILE="<path to the validation annotation JSON file>"
+SHAPE_PRIOR_PATH="<path to shape priors>"
+python3 ~/models/official/legacy/detection/main.py \
+  --strategy_type=tpu \
+  --tpu=${TPU_NAME} \
+  --model_dir=${MODEL_DIR} \
+  --mode=train \
+  --model=shapemask \
+  --params_override="{train: { checkpoint: { path: ${RESNET_CHECKPOINT}, prefix: resnet50/ }, train_file_pattern: ${TRAIN_FILE_PATTERN} }, eval: { val_json_file: ${VAL_JSON_FILE}, eval_file_pattern: ${EVAL_FILE_PATTERN} }, shapemask_head: {use_category_for_mask: true, shape_prior_path: ${SHAPE_PRIOR_PATH}} }"
+```
+
+The pre-trained ResNet-50 checkpoint can be downloaded [here](https://storage.cloud.google.com/cloud-tpu-checkpoints/model-garden-vision/detection/resnet50-2018-02-07.tar.gz).
+
+The shape priors can be downloaded [here](https://storage.googleapis.com/cloud-tpu-checkpoints/shapemask/kmeans_class_priors_91x20x32x32.npy).
+
+
+### Train a custom ShapeMask using the config file.
+
+First, create a YAML config file, e.g. *my_shapemask.yaml*.
+This file specifies the parameters to be overridden:
+
+```YAML
+# my_shapemask.yaml
+train:
+  train_file_pattern: <path to the TFRecord training data>
+  total_steps: <total steps to train>
+  batch_size: <training batch size>
+eval:
+  eval_file_pattern: <path to the TFRecord validation data>
+  val_json_file: <path to the validation annotation JSON file>
+  batch_size: <evaluation batch size>
+shapemask_head:
+  shape_prior_path: <path to shape priors>
+```
+
+Once the YAML config file is created, you can launch the training using the
+following command.
+
+```bash
+TPU_NAME="<your GCP TPU name>"
+MODEL_DIR="<path to the directory to store model files>"
+python3 ~/models/official/legacy/detection/main.py \
+  --strategy_type=tpu \
+  --tpu=${TPU_NAME} \
+  --model_dir=${MODEL_DIR} \
+  --mode=train \
+  --model=shapemask \
+  --config_file="my_shapemask.yaml"
+```
+
+## Train ShapeMask on GPU
+
+Training on GPU is similar to that on TPU. The major change is the strategy type
+(use
+"[mirrored](https://www.tensorflow.org/api_docs/python/tf/distribute/MirroredStrategy)"
+for multiple GPU and
+"[one_device](https://www.tensorflow.org/api_docs/python/tf/distribute/OneDeviceStrategy)"
+for single GPU).
+
+Multi-GPU example (assuming there are 8 GPUs connected to the host):
+
+```bash
+MODEL_DIR="<path to the directory to store model files>"
+python3 ~/models/official/legacy/detection/main.py \
+  --strategy_type=mirrored \
+  --num_gpus=8 \
+  --model_dir=${MODEL_DIR} \
+  --mode=train \
+  --model=shapemask \
+  --config_file="my_shapemask.yaml"
+```
+
+A single GPU example:
+
+```bash
+MODEL_DIR="<path to the directory to store model files>"
+python3 ~/models/official/legacy/detection/main.py \
+  --strategy_type=one_device \
+  --num_gpus=1 \
+  --model_dir=${MODEL_DIR} \
+  --mode=train \
+  --model=shapemask \
+  --config_file="my_shapemask.yaml"
+```
+
+
+An example with inline configuration (YAML or JSON format):
+
+```
+python3 ~/models/official/legacy/detection/main.py \
+  --model_dir=<model folder> \
+  --strategy_type=one_device \
+  --num_gpus=1 \
+  --mode=train \
+  --model=shapemask \
+  --params_override="eval:
+ eval_file_pattern: <Eval TFRecord file pattern>
+ batch_size: 8
+ val_json_file: <COCO format groundtruth JSON file>
+train:
+ total_steps: 1000
+ batch_size: 8
+ train_file_pattern: <Train TFRecord file pattern>
+use_tpu: False
+"
+```
+
+
+### Run the evaluation (after training)
+
+```
+python3 /usr/share/models/official/legacy/detection/main.py \
+   --strategy_type=tpu \
+   --tpu=${TPU_NAME} \
+   --model_dir=${MODEL_DIR} \
+   --mode=eval \
+   --model=shapemask \
+   --params_override="{eval: { val_json_file: ${VAL_JSON_FILE}, eval_file_pattern: ${EVAL_FILE_PATTERN}, eval_samples: 5000 } }"
+```
+
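+`MODEL_DIR` needs to point to the directory that contains the trained ShapeMask model.
+To run the evaluation on a GPU instead, pass `--strategy_type=mirrored` and `--num_gpus=1` in place of `--strategy_type=tpu` and `--tpu=${TPU_NAME}`.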
+
+Note: The JSON groundtruth file is useful for the [COCO dataset](http://cocodataset.org/#home) and can be
+downloaded from the [COCO website](http://cocodataset.org/#download). For a custom dataset, it is unnecessary because the groundtruth can be included in the TFRecord files.
+
+## References
+
+1.  [Focal Loss for Dense Object Detection](https://arxiv.org/abs/1708.02002).
+    Tsung-Yi Lin, Priya Goyal, Ross Girshick, Kaiming He, and Piotr Dollár. IEEE
+    International Conference on Computer Vision (ICCV), 2017.
--- a/official/nlp/projects/bigbird/__init__.py
+++ b/official/nlp/projects/bigbird/__init__.py
--- a/official/nlp/projects/mobilebert/__init__.py
+++ b/official/nlp/projects/mobilebert/__init__.py
--- a/official/vision/detection/configs/base_config.py
+++ b/official/vision/detection/configs/base_config.py
--- a/official/vision/detection/configs/factory.py
+++ b/official/vision/detection/configs/factory.py
@@ -14,11 +14,11 @@

 """Factory to provide model configs."""

+from official.legacy.detection.configs import maskrcnn_config
+from official.legacy.detection.configs import olnmask_config
+from official.legacy.detection.configs import retinanet_config
+from official.legacy.detection.configs import shapemask_config
 from official.modeling.hyperparams import params_dict
-from official.vision.detection.configs import maskrcnn_config
-from official.vision.detection.configs import olnmask_config
-from official.vision.detection.configs import retinanet_config
-from official.vision.detection.configs import shapemask_config


 def config_generator(model):

--- a/official/vision/detection/configs/maskrcnn_config.py
+++ b/official/vision/detection/configs/maskrcnn_config.py
@@ -14,8 +14,8 @@

 """Config template to train Mask R-CNN."""

+from official.legacy.detection.configs import base_config
 from official.modeling.hyperparams import params_dict
-from official.vision.detection.configs import base_config


 # pylint: disable=line-too-long

--- a/official/vision/detection/configs/olnmask_config.py
+++ b/official/vision/detection/configs/olnmask_config.py
@@ -14,8 +14,8 @@

 """Config template to train Object Localization Network (OLN)."""

+from official.legacy.detection.configs import base_config
 from official.modeling.hyperparams import params_dict
-from official.vision.detection.configs import base_config


 # pylint: disable=line-too-long