ModelZoo / ResNet50_tensorflow · Commits

Commit b92025a9, authored Aug 18, 2021 by anivegesana

Merge branch 'master' of https://github.com/tensorflow/models into detection_generator_pr_2

Parents: 1b425791, 37536370
Changes: 108 changed files in this commit; this page shows 20 of them, with 532 additions and 125 deletions (+532 −125).
official/core/actions.py  +69 −3
official/core/actions_test.py  +35 −9
official/core/base_trainer.py  +1 −5
official/core/base_trainer_test.py  +8 −56
official/core/registry.py  +7 −3
official/core/train_lib.py  +2 −3
official/core/train_lib_test.py  +89 −1
official/modeling/fast_training/experimental/tf2_utils_2x_wide.py  +186 −0
official/modeling/fast_training/experimental/tf2_utils_2x_wide_test.py  +101 −0
official/modeling/fast_training/progressive/policies.py  +11 −2
official/modeling/fast_training/progressive/train.py  +1 −1
official/modeling/fast_training/progressive/train_lib.py  +1 −1
official/modeling/fast_training/progressive/train_lib_test.py  +3 −3
official/modeling/fast_training/progressive/trainer.py  +4 −4
official/modeling/fast_training/progressive/trainer_test.py  +2 −2
official/modeling/fast_training/progressive/utils.py  +0 −0
official/modeling/multitask/base_model.py  +0 −15
official/modeling/multitask/base_trainer.py  +0 −15
official/modeling/multitask/evaluator.py  +8 −1
official/modeling/multitask/task_sampler.py  +4 −1
official/core/actions.py  @ b92025a9

@@ -16,6 +16,7 @@
 import os
 from typing import List

 from absl import logging
 import gin
 import orbit

@@ -119,6 +120,58 @@ class EMACheckpointing:
       self._optimizer.swap_weights()


+class RecoveryAction:
+  """Train action to recover from loss blowup.
+
+  Checks the loss value against the given threshold. If applicable, recovers
+  the model by reading the checkpoint on disk.
+  """
+
+  def __init__(self, checkpoint_manager: tf.train.CheckpointManager):
+    self.checkpoint_manager = checkpoint_manager
+
+  def __call__(self, _):
+    """Recovers the training by triggering checkpoint restoration."""
+    # Loads the previous good checkpoint.
+    checkpoint_path = self.checkpoint_manager.restore_or_initialize()
+    logging.warning('Recovering the model from checkpoint: %s.',
+                    checkpoint_path)
+
+
+class RecoveryCondition:
+  """Recovery condition."""
+
+  def __init__(self,
+               global_step: tf.Variable,
+               loss_upper_bound: float,
+               recovery_begin_steps: int = 0,
+               recovery_max_trials: int = 3):
+    self.recover_counter = 0
+    self.recovery_begin_steps = recovery_begin_steps
+    self.recovery_max_trials = recovery_max_trials
+    self.loss_upper_bound = loss_upper_bound
+    self.global_step = global_step
+
+  def __call__(self, outputs: orbit.runner.Output):
+    loss_value = outputs['training_loss']
+    if tf.math.is_nan(loss_value):
+      self.recover_counter += 1
+      if self.recover_counter > self.recovery_max_trials:
+        raise RuntimeError(
+            'The loss value is NaN after training loop and it happens %d times.'
+            % self.recover_counter)
+      return True
+    if (self.global_step >= self.recovery_begin_steps and
+        loss_value > self.loss_upper_bound):
+      self.recover_counter += 1
+      if self.recover_counter > self.recovery_max_trials:
+        raise RuntimeError(
+            f'The loss value is {loss_value}, which is larger than the bound '
+            f'{self.loss_upper_bound}, happens {self.recover_counter} times.')
+      return True
+    return False
+
+
 @gin.configurable
 def get_eval_actions(params: config_definitions.ExperimentConfig,
 ...

@@ -140,9 +193,10 @@ def get_eval_actions(
 @gin.configurable
-def get_train_actions(params: config_definitions.ExperimentConfig,
-                      trainer: base_trainer.Trainer,
-                      model_dir: str) -> List[orbit.Action]:
+def get_train_actions(
+    params: config_definitions.ExperimentConfig,
+    trainer: base_trainer.Trainer,
+    model_dir: str,
+    checkpoint_manager: tf.train.CheckpointManager) -> List[orbit.Action]:
   """Gets train actions for TFM trainer."""
   train_actions = []
   # Adds pruning callback actions.

@@ -153,4 +207,16 @@ def get_train_actions(params: config_definitions.ExperimentConfig,
           model=trainer.model,
           optimizer=trainer.optimizer))

+  if params.trainer.recovery_max_trials >= 0:
+    recovery_condition = RecoveryCondition(
+        global_step=trainer.global_step,
+        loss_upper_bound=params.trainer.loss_upper_bound,
+        recovery_begin_steps=params.trainer.recovery_begin_steps,
+        recovery_max_trials=params.trainer.recovery_max_trials,
+    )
+    recover_action = orbit.actions.ConditionalAction(
+        condition=recovery_condition,
+        action=RecoveryAction(checkpoint_manager),
+    )
+    train_actions.append(recover_action)
   return train_actions
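Note: the condition/action pair above composes through orbit.actions.ConditionalAction, as the last hunk shows. A minimal standalone sketch of the counting behavior (values illustrative; it mirrors the new test in actions_test.py below):

import orbit
import tensorflow as tf

from official.core import actions

global_step = orbit.utils.create_global_step()
condition = actions.RecoveryCondition(
    global_step, loss_upper_bound=0.5, recovery_max_trials=2)

# Each over-bound (or NaN) loss returns True and bumps the internal counter.
outputs = {'training_loss': tf.constant(0.9)}
assert condition(outputs)  # trial 1: recovery is triggered
assert condition(outputs)  # trial 2: recovery is triggered
# A third over-bound loss would exceed recovery_max_trials and raise
# RuntimeError instead of recovering again.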
official/core/actions_test.py  @ b92025a9

@@ -17,6 +17,8 @@
 import os

 from absl.testing import parameterized
+import numpy as np
+import orbit
 import tensorflow as tf

 from tensorflow.python.distribute import combinations

@@ -35,17 +37,14 @@ class TestModel(tf.Module):
     return self.value


-def all_strategy_combinations():
-  return combinations.combine(
-      distribution=[
-          strategy_combinations.cloud_tpu_strategy,
-          strategy_combinations.one_device_strategy_gpu,
-      ],)
-
-
 class ActionsTest(tf.test.TestCase, parameterized.TestCase):

-  @combinations.generate(all_strategy_combinations())
+  @combinations.generate(
+      combinations.combine(
+          distribution=[
+              strategy_combinations.cloud_tpu_strategy,
+              strategy_combinations.one_device_strategy_gpu,
+          ],))
   def test_ema_checkpointing(self, distribution):
     with distribution.scope():
       directory = self.create_tempdir()

@@ -76,6 +75,33 @@ class ActionsTest(tf.test.TestCase, parameterized.TestCase):
       # Checks model.value is 0 after swapping.
       self.assertEqual(model(), 0)

+  @combinations.generate(
+      combinations.combine(
+          distribution=[
+              strategy_combinations.default_strategy,
+              strategy_combinations.cloud_tpu_strategy,
+              strategy_combinations.one_device_strategy_gpu,
+          ],))
+  def test_recovery_condition(self, distribution):
+    with distribution.scope():
+      global_step = orbit.utils.create_global_step()
+      recover_condition = actions.RecoveryCondition(
+          global_step, loss_upper_bound=0.5, recovery_max_trials=2)
+      outputs = {'training_loss': 0.6}
+      self.assertTrue(recover_condition(outputs))
+      self.assertTrue(recover_condition(outputs))
+      with self.assertRaises(RuntimeError):
+        recover_condition(outputs)
+
+      global_step = orbit.utils.create_global_step()
+      recover_condition = actions.RecoveryCondition(
+          global_step, loss_upper_bound=0.5, recovery_max_trials=2)
+      outputs = {'training_loss': tf.constant([np.nan], tf.float32)}
+      self.assertTrue(recover_condition(outputs))
+      self.assertTrue(recover_condition(outputs))
+      with self.assertRaises(RuntimeError):
+        recover_condition(outputs)
+

 if __name__ == '__main__':
   tf.test.main()
official/core/base_trainer.py  @ b92025a9

@@ -370,6 +370,7 @@ class Trainer(_AsyncTrainer):
     """Accesses the training checkpoint."""
     return self._checkpoint

+  # TODO(yejiayu): Remove this once all deps are fixed.
   def add_recovery(self, params: TrainerConfig,
                    checkpoint_manager: tf.train.CheckpointManager):
     if params.recovery_max_trials >= 0:

@@ -382,11 +383,6 @@ class Trainer(_AsyncTrainer):
   def train_loop_end(self):
     """See base class."""
     self.join()
-    # Checks if the model numeric status is stable and conducts the checkpoint
-    # recovery accordingly.
-    if self._recovery:
-      self._recovery.maybe_recover(self.train_loss.result().numpy(),
-                                   self.global_step.numpy())
     logs = {}
     for metric in self.train_metrics + [self.train_loss]:
       logs[metric.name] = metric.result()
official/core/base_trainer_test.py  @ b92025a9

@@ -14,12 +14,12 @@
 """Tests for tensorflow_models.core.trainers.trainer."""
 # pylint: disable=g-direct-tensorflow-import
+import gc
 import multiprocessing
 import os
 import sys

 from absl.testing import parameterized
-import numpy as np
 import orbit
 import portpicker
 import tensorflow as tf

@@ -165,6 +165,13 @@ class TrainerTest(tf.test.TestCase, parameterized.TestCase):
         }
     })))

+  def tearDown(self):
+    gc.collect()
+    # This will only contain uncollectable garbage, i.e. reference cycles
+    # involving objects with __del__ defined.
+    self.assertEmpty(gc.garbage)
+    super().tearDown()
+
   def create_test_trainer(self, config, model_dir=None, task=None):
     task = task or mock_task.MockTask(config.task, logging_dir=model_dir)
     ckpt_exporter = train_lib.maybe_create_best_ckpt_exporter(config, model_dir)

@@ -337,61 +344,6 @@ class TrainerTest(tf.test.TestCase, parameterized.TestCase):
     self.assertTrue(
         tf.io.gfile.exists(os.path.join(model_dir, 'best_ckpt', 'info.json')))

-  def test_recovery(self):
-    config = cfg.ExperimentConfig(
-        trainer=cfg.TrainerConfig(
-            loss_upper_bound=0.5,
-            recovery_max_trials=2,
-            optimizer_config=cfg.OptimizationConfig({
-                'optimizer': {'type': 'sgd'},
-                'learning_rate': {'type': 'constant'}
-            })))
-    model_dir = self.get_temp_dir()
-    trainer = self.create_test_trainer(config, model_dir=model_dir)
-    checkpoint_manager = tf.train.CheckpointManager(
-        trainer.checkpoint, self.get_temp_dir(), max_to_keep=2)
-    checkpoint_manager.save()
-    trainer.add_recovery(config.trainer, checkpoint_manager=checkpoint_manager)
-    before_weights = trainer.model.get_weights()
-    _ = trainer.train(tf.convert_to_tensor(1, dtype=tf.int32))
-    # The training loss is 1.0 and upper_bound is 0.5, so the recover happens.
-    after_weights = trainer.model.get_weights()
-    for left, right in zip(before_weights, after_weights):
-      self.assertAllEqual(left, right)
-
-    # Let the loss be NaN and max_trials = 0 to see RuntimeError.
-    config = cfg.ExperimentConfig(
-        trainer=cfg.TrainerConfig(
-            recovery_max_trials=0,
-            optimizer_config=cfg.OptimizationConfig({
-                'optimizer': {'type': 'sgd'},
-                'learning_rate': {'type': 'constant'}
-            })))
-    task = mock_task.MockTask(config.task, logging_dir=model_dir)
-
-    def build_losses(labels, model_outputs, aux_losses=None):
-      del labels, model_outputs
-      return tf.constant([np.nan], tf.float32) + aux_losses
-
-    task.build_losses = build_losses
-    trainer = trainer_lib.Trainer(
-        config,
-        task,
-        model=task.build_model(),
-        optimizer=task.create_optimizer(config.trainer.optimizer_config,
-                                        config.runtime))
-    trainer.add_recovery(config.trainer, checkpoint_manager=checkpoint_manager)
-    with self.assertRaises(RuntimeError):
-      _ = trainer.train(tf.convert_to_tensor(2, dtype=tf.int32))
-
   def test_model_with_compiled_loss(self):
     task = mock_task.MockTask()
     model = task.build_model()
official/core/registry.py  @ b92025a9

@@ -87,11 +87,15 @@ def lookup(registered_collection, reg_key):
     for h_idx, entry_name in enumerate(hierarchy):
       if entry_name not in collection:
         raise LookupError(
-            "collection path {} at position {} never registered.".format(
-                entry_name, h_idx))
+            f"collection path {entry_name} at position {h_idx} is never "
+            f"registered. Please make sure the {entry_name} and its library is "
+            "imported and linked to the trainer binary.")
       collection = collection[entry_name]
     return collection
   else:
     if reg_key not in registered_collection:
-      raise LookupError("registration key {} never registered.".format(reg_key))
+      raise LookupError(
+          f"registration key {reg_key} is never "
+          f"registered. Please make sure the {reg_key} and its library is "
+          "imported and linked to the trainer binary.")
     return registered_collection[reg_key]
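Note: lookup walks a string key segment by segment (split on '/'), which is why the new message can name the exact segment that is missing. A small sketch, assuming the register/lookup pair that registry.py exposes (the collection and keys here are made up):

from official.core import registry

_BACKBONES = {}


@registry.register(_BACKBONES, 'backbone/toy')
def build_toy_backbone():
  return 'toy-backbone'


registry.lookup(_BACKBONES, 'backbone/toy')      # resolves 'backbone', then 'toy'
registry.lookup(_BACKBONES, 'backbone/missing')  # raises LookupError with the new hint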
official/core/train_lib.py  @ b92025a9

@@ -87,8 +87,6 @@ def run_experiment(
         step_counter=trainer.global_step,
         checkpoint_interval=params.trainer.checkpoint_interval,
         init_fn=trainer.initialize)
-    # Adds recovery handling.
-    trainer.add_recovery(params.trainer, checkpoint_manager=checkpoint_manager)
   else:
     checkpoint_manager = None

@@ -105,7 +103,8 @@ def run_experiment(
           (save_summary) else None,
       summary_interval=params.trainer.summary_interval if
       (save_summary) else None,
-      train_actions=actions.get_train_actions(params, trainer, model_dir),
+      train_actions=actions.get_train_actions(
+          params, trainer, model_dir, checkpoint_manager=checkpoint_manager),
       eval_actions=actions.get_eval_actions(params, trainer, model_dir))

   logging.info('Starts to execute mode: %s', mode)
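Note: recovery now rides along with the other train actions instead of being bolted onto the trainer, and the knobs that get_train_actions reads all live on TrainerConfig. A hedged sketch of enabling recovery from config (values illustrative; a negative recovery_max_trials leaves the action out, per the >= 0 check in actions.py):

from official.core import config_definitions as cfg

params = cfg.ExperimentConfig(
    trainer=cfg.TrainerConfig(
        loss_upper_bound=10.0,      # recover once training_loss exceeds this
        recovery_begin_steps=1000,  # ignore the bound during warm-up steps
        recovery_max_trials=3))     # raise RuntimeError after this many recoveries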
official/core/train_lib_test.py  @ b92025a9

@@ -19,6 +19,7 @@ import os
 from absl import flags
 from absl.testing import flagsaver
 from absl.testing import parameterized
+import numpy as np
 import tensorflow as tf

 from tensorflow.python.distribute import combinations

@@ -30,6 +31,7 @@ from official.common import registry_imports
 from official.core import task_factory
 from official.core import train_lib
 from official.core import train_utils
+from official.utils.testing import mock_task

 FLAGS = flags.FLAGS

@@ -114,7 +116,93 @@ class TrainTest(tf.test.TestCase, parameterized.TestCase):
         params=params,
         model_dir=model_dir,
         run_post_eval=run_post_eval)

     print(logs)

+  @combinations.generate(
+      combinations.combine(
+          distribution_strategy=[
+              strategy_combinations.default_strategy,
+              strategy_combinations.cloud_tpu_strategy,
+              strategy_combinations.one_device_strategy_gpu,
+          ],
+          flag_mode=['train', 'train_and_eval'],
+      ))
+  def test_recovery_nan_error(self, distribution_strategy, flag_mode):
+    model_dir = self.get_temp_dir()
+    flags_dict = dict(
+        experiment='mock',
+        mode=flag_mode,
+        model_dir=model_dir,
+        params_override=json.dumps(self._test_config))
+    with flagsaver.flagsaver(**flags_dict):
+      params = train_utils.parse_configuration(flags.FLAGS)
+      train_utils.serialize_config(params, model_dir)
+      with distribution_strategy.scope():
+        # task = task_factory.get_task(params.task, logging_dir=model_dir)
+        task = mock_task.MockTask(params.task, logging_dir=model_dir)
+
+        # Set the loss to NaN to trigger RuntimeError.
+        def build_losses(labels, model_outputs, aux_losses=None):
+          del labels, model_outputs
+          return tf.constant([np.nan], tf.float32) + aux_losses
+
+        task.build_losses = build_losses
+
+      with self.assertRaises(RuntimeError):
+        train_lib.run_experiment(
+            distribution_strategy=distribution_strategy,
+            task=task,
+            mode=flag_mode,
+            params=params,
+            model_dir=model_dir)
+
+  @combinations.generate(
+      combinations.combine(
+          distribution_strategy=[
+              strategy_combinations.default_strategy,
+              strategy_combinations.cloud_tpu_strategy,
+              strategy_combinations.one_device_strategy_gpu,
+          ],
+          flag_mode=['train'],
+      ))
+  def test_recovery(self, distribution_strategy, flag_mode):
+    loss_threshold = 1.0
+    model_dir = self.get_temp_dir()
+    flags_dict = dict(
+        experiment='mock',
+        mode=flag_mode,
+        model_dir=model_dir,
+        params_override=json.dumps(self._test_config))
+    with flagsaver.flagsaver(**flags_dict):
+      params = train_utils.parse_configuration(flags.FLAGS)
+      params.trainer.loss_upper_bound = loss_threshold
+      params.trainer.recovery_max_trials = 1
+      train_utils.serialize_config(params, model_dir)
+      with distribution_strategy.scope():
+        task = task_factory.get_task(params.task, logging_dir=model_dir)
+
+        # Saves a checkpoint for reference.
+        model = task.build_model()
+        checkpoint = tf.train.Checkpoint(model=model)
+        checkpoint_manager = tf.train.CheckpointManager(
+            checkpoint, self.get_temp_dir(), max_to_keep=2)
+        checkpoint_manager.save()
+        before_weights = model.get_weights()
+
+        def build_losses(labels, model_outputs, aux_losses=None):
+          del labels, model_outputs
+          return tf.constant([loss_threshold], tf.float32) + aux_losses
+
+        task.build_losses = build_losses
+
+      model, _ = train_lib.run_experiment(
+          distribution_strategy=distribution_strategy,
+          task=task,
+          mode=flag_mode,
+          params=params,
+          model_dir=model_dir)
+      after_weights = model.get_weights()
+      for left, right in zip(before_weights, after_weights):
+        self.assertAllEqual(left, right)
+
   def test_parse_configuration(self):
     model_dir = self.get_temp_dir()
official/modeling/fast_training/experimental/tf2_utils_2x_wide.py  (new file, 0 → 100644)  @ b92025a9

# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Stacking model horizontally."""

from absl import logging
import numpy as np
import tensorflow as tf


def expand_vector(v: np.ndarray) -> np.ndarray:
  """Expands a vector with batch dimensions.

  Equivalent to expand_1_axis(v, epsilon=0.0, axis=-1)

  Args:
    v: A vector with shape [..., a].

  Returns:
    A vector with shape [..., 2 * a].
  """
  return np.repeat(v, 2, axis=-1)


def expand_1_axis(w: np.ndarray, epsilon: float, axis: int) -> np.ndarray:
  """Expands either the first dimension or the last dimension of w.

  If `axis = 0`, the following constraint will be satisfied:
    matmul(x, w) ==
        matmul(expand_vector(x), expand_1_axis(w, epsilon=0.1, axis=0))

  If `axis = -1`, the following constraint will be satisfied if `epsilon = 0.0`:
    expand_vector(matmul(x, w)) ==
        2 * matmul(x, expand_1_axis(w, epsilon=0.0, axis=-1))

  Args:
    w: Numpy array of shape [a_0, a_1, ..., a_i-1, a_i].
    epsilon: Symmetric noise added to the expanded tensor.
    axis: Must be either 0 or -1.

  Returns:
    Expanded numpy array.
  """
  assert axis in (0, -1), (
      "Only support expanding the first or the last dimension. "
      "Got: {}".format(axis))

  rank = len(w.shape)

  d_w = np.random.normal(np.zeros_like(w), np.fabs(w) * epsilon, w.shape)
  d_w = np.repeat(d_w, 2, axis=axis)

  sign_flip = np.array([1, -1])
  for _ in range(rank - 1):
    sign_flip = np.expand_dims(sign_flip, axis=-1 if axis == 0 else 0)
  sign_flip = np.tile(sign_flip,
                      [w.shape[0]] + [1] * (rank - 2) + [w.shape[-1]])

  d_w *= sign_flip
  w_expand = (np.repeat(w, 2, axis=axis) + d_w) / 2
  return w_expand


def expand_2_axes(w: np.ndarray, epsilon: float) -> np.ndarray:
  """Expands the first dimension and the last dimension of w.

  The following constraint will be satisfied:
    expand_vector(matmul(x, w)) == matmul(expand_vector(x), expand_2_axes(w))

  Args:
    w: Numpy array of shape [a_0, a_1, ..., a_i-1, a_i].
    epsilon: Symmetric noise added to the expanded tensor.

  Returns:
    Expanded numpy array.
  """
  rank = len(w.shape)

  d_w = np.random.normal(np.zeros_like(w), np.fabs(w) * epsilon, w.shape)
  d_w = np.repeat(np.repeat(d_w, 2, axis=0), 2, axis=-1)

  sign_flip = np.array([1, -1])
  for _ in range(rank - 1):
    sign_flip = np.expand_dims(sign_flip, axis=-1)
  sign_flip = np.tile(sign_flip,
                      [w.shape[0]] + [1] * (rank - 2) + [w.shape[-1] * 2])
  d_w *= sign_flip

  w_expand = (np.repeat(np.repeat(w, 2, axis=0), 2, axis=-1) + d_w) / 2
  return w_expand


def var_to_var(var_from: tf.Variable, var_to: tf.Variable, epsilon: float):
  """Expands a variable to another variable.

  Assume the shape of `var_from` is (a, b, ..., y, z), the shape of `var_to`
  can be (a, ..., z * 2), (a * 2, ..., z * 2), (a * 2, ..., z).

  If the shape of `var_to` is (a, ..., 2 * z):
    For any x, tf.matmul(x, var_to) ~= expand_vector(tf.matmul(x, var_from)) / 2
    Note that there will be noise added to the left hand side, if epsilon != 0.

  If the shape of `var_to` is (2 * a, ..., z):
    For any x, tf.matmul(expand_vector(x), var_to) == tf.matmul(x, var_from)

  If the shape of `var_to` is (2 * a, ..., 2 * z):
    For any x, tf.matmul(expand_vector(x), var_to) ==
        expand_vector(tf.matmul(expand_vector(x), var_from))

  Args:
    var_from: input variable to expand.
    var_to: output variable.
    epsilon: the noise ratio that will be added, when splitting `var_from`.
  """
  shape_from = var_from.shape
  shape_to = var_to.shape

  if shape_from == shape_to:
    var_to.assign(var_from)

  elif len(shape_from) == 1 and len(shape_to) == 1:
    var_to.assign(expand_vector(var_from.numpy()))

  elif shape_from[0] * 2 == shape_to[0] and shape_from[-1] == shape_to[-1]:
    var_to.assign(expand_1_axis(var_from.numpy(), epsilon=epsilon, axis=0))

  elif shape_from[0] == shape_to[0] and shape_from[-1] * 2 == shape_to[-1]:
    var_to.assign(expand_1_axis(var_from.numpy(), epsilon=epsilon, axis=-1))

  elif shape_from[0] * 2 == shape_to[0] and shape_from[-1] * 2 == shape_to[-1]:
    var_to.assign(expand_2_axes(var_from.numpy(), epsilon=epsilon))

  else:
    raise ValueError("Shape not supported, {}, {}".format(shape_from, shape_to))


def model_to_model_2x_wide(model_from: tf.Module,
                           model_to: tf.Module,
                           epsilon: float = 0.1):
  """Expands a model to a wider version.

  Also makes sure that the output of the model is not changed after expanding.
  For example:
  ```
  model_narrow = tf.keras.Sequential()
  model_narrow.add(tf.keras.Input(shape=(3,)))
  model_narrow.add(tf.keras.layers.Dense(4))
  model_narrow.add(tf.keras.layers.Dense(1))

  model_wide = tf.keras.Sequential()
  model_wide.add(tf.keras.Input(shape=(6,)))
  model_wide.add(tf.keras.layers.Dense(8))
  model_wide.add(tf.keras.layers.Dense(1))

  model_to_model_2x_wide(model_narrow, model_wide)
  assert model_narrow([[1, 2, 3]]) == model_wide([[1, 1, 2, 2, 3, 3]])
  ```

  We assume that `model_from` and `model_to` have the same architecture and
  only their widths differ.

  Args:
    model_from: input model to expand.
    model_to: output model whose variables will be assigned expanded values
      according to `model_from`.
    epsilon: the noise ratio that will be added, when splitting `var_from`.
  """
  for w_from, w_to in zip(model_from.trainable_variables,
                          model_to.trainable_variables):
    logging.info("expanding %s %s to %s %s", w_from.name, w_from.shape,
                 w_to.name, w_to.shape)
    var_to_var(w_from, w_to, epsilon=epsilon)
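Note: for the axis=0 and two-axis expansions, the docstring identities hold exactly because the symmetric ±d_w noise cancels pairwise when the duplicated inputs are summed. A quick numeric check (a sketch using only the functions above):

import numpy as np

from official.modeling.fast_training.experimental import tf2_utils_2x_wide

x = np.array([10.0, 11.0])
w = np.random.rand(2, 2)

# Row expansion: the widened kernel maps the duplicated input to the
# original output, i.e. matmul(expand_vector(x), expand_1_axis(w, axis=0)).
w_rows = tf2_utils_2x_wide.expand_1_axis(w, epsilon=0.1, axis=0)
np.testing.assert_allclose(
    np.matmul(tf2_utils_2x_wide.expand_vector(x), w_rows), np.matmul(x, w))

# Both axes: the widened kernel maps the duplicated input to the duplicated
# output, which is what keeps stacked layers function-preserving.
w_both = tf2_utils_2x_wide.expand_2_axes(w, epsilon=0.1)
np.testing.assert_allclose(
    np.matmul(tf2_utils_2x_wide.expand_vector(x), w_both),
    tf2_utils_2x_wide.expand_vector(np.matmul(x, w)))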
official/modeling/fast_training/experimental/tf2_utils_2x_wide_test.py  (new file, 0 → 100644)  @ b92025a9

# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for tf2_utils_2x_wide."""

import numpy as np
import tensorflow as tf

from official.modeling.fast_training.experimental import tf2_utils_2x_wide


class Tf2Utils2XWideTest(tf.test.TestCase):

  def test_expand_vector(self):
    x = np.array([1, 2])
    self.assertAllClose(tf2_utils_2x_wide.expand_vector(x),
                        np.array([1, 1, 2, 2]))

  def test_expand_matrix(self):
    x = np.array([[1, 2], [3, 4]])
    x = tf2_utils_2x_wide.expand_2_axes(x, epsilon=0.1)
    self.assertAllClose(x[0, :] + x[1, :], np.array([1, 1, 2, 2]))
    self.assertAllClose(x[2, :] + x[3, :], np.array([3, 3, 4, 4]))

  def test_expand_matrix_axis_0(self):
    x = np.array([[1, 2], [3, 4]])
    x = tf2_utils_2x_wide.expand_1_axis(x, axis=0, epsilon=0.1)
    self.assertAllClose(x[0, :] + x[1, :], np.array([1, 2]))
    self.assertAllClose(x[2, :] + x[3, :], np.array([3, 4]))

  def test_expand_matrix_axis_1(self):
    x = np.array([[1, 2], [3, 4]])
    x = tf2_utils_2x_wide.expand_1_axis(x, axis=-1, epsilon=0.1)
    self.assertAllClose(x[:, 0] + x[:, 1], np.array([1, 3]))
    self.assertAllClose(x[:, 2] + x[:, 3], np.array([2, 4]))

  def test_expand_3d_tensor(self):
    x0 = np.array([10, 11])
    x1 = np.array([10, 10, 11, 11])
    w0 = np.random.rand(2, 2)
    w1 = tf2_utils_2x_wide.expand_2_axes(w0, epsilon=0.1)
    o0 = np.matmul(x0, w0)
    o1 = np.matmul(x1, w1)
    self.assertAllClose(np.repeat(o0, 2, axis=-1), o1)

  def test_expand_3d_tensor_axis_0(self):
    x0 = np.array([10, 11])
    x1 = np.array([10, 10, 11, 11])
    w0 = np.random.rand(2, 2)
    w1 = tf2_utils_2x_wide.expand_1_axis(w0, axis=0, epsilon=0.1)
    o0 = np.matmul(x0, w0)
    o1 = np.matmul(x1, w1)
    self.assertAllClose(o0, o1)

  def test_expand_3d_tensor_axis_2(self):
    x = np.array([10, 11])
    w0 = np.random.rand(2, 2)
    w1 = tf2_utils_2x_wide.expand_1_axis(w0, axis=-1, epsilon=0.1)
    o0 = np.matmul(x, w0)
    o1 = np.matmul(x, w1)
    self.assertAllClose(o0, np.sum(o1.reshape(2, 2), axis=-1))

  def test_end_to_end(self):
    """Covers expand_vector, expand_2_axes, and expand_1_axis."""
    model_narrow = tf.keras.Sequential()
    model_narrow.add(tf.keras.Input(shape=(3,)))
    model_narrow.add(tf.keras.layers.Dense(4))
    model_narrow.add(tf.keras.layers.Dense(4))
    model_narrow.add(tf.keras.layers.Dense(1))

    model_wide = tf.keras.Sequential()
    model_wide.add(tf.keras.Input(shape=(6,)))
    model_wide.add(tf.keras.layers.Dense(8))
    model_wide.add(tf.keras.layers.Dense(8))
    model_wide.add(tf.keras.layers.Dense(1))

    x0 = np.array([[1, 2, 3]])
    x1 = np.array([[1, 1, 2, 2, 3, 3]])
    # Call model once to build variables first.
    _, _ = model_narrow(x0), model_wide(x1)
    tf2_utils_2x_wide.model_to_model_2x_wide(
        model_narrow, model_wide, epsilon=0.2)
    self.assertAllClose(model_narrow(x0), model_wide(x1),
                        rtol=1e-05, atol=1e-05)


if __name__ == "__main__":
  tf.test.main()
official/modeling/progressive/policies.py → official/modeling/fast_training/progressive/policies.py  @ b92025a9

@@ -19,13 +19,20 @@ abstract methods to handle each training stage.
 """
 import abc
+import dataclasses
 from typing import Any, Mapping

 from absl import logging
-import dataclasses
 import six
 import tensorflow as tf
+from tensorflow.python.eager import monitoring

+from official.modeling.fast_training.progressive import utils
 from official.modeling.hyperparams import base_config
-from official.modeling.progressive import utils
+
+_progressive_policy_creation_counter = monitoring.Counter(
+    '/tensorflow/training/fast_training/progressive_policy_creation',
+    'Counter for the number of ProgressivePolicy creations.')

@@ -69,6 +76,8 @@ class ProgressivePolicy:
         optimizer=self.get_optimizer(stage_id),
         model=self.get_model(stage_id, old_model=None))

+    _progressive_policy_creation_counter.get_cell().increase_by(1)
+
   def compute_stage_id(self, global_step: int) -> int:
     for stage_id in range(self.num_stages()):
       global_step -= self.num_steps(stage_id)
...
official/modeling/progressive/train.py
→
official/modeling/
fast_training/
progressive/train.py
View file @
b92025a9
...
...
@@ -26,7 +26,7 @@ from official.common import flags as tfm_flags
from
official.core
import
task_factory
from
official.core
import
train_utils
from
official.modeling
import
performance
from
official.modeling.progressive
import
train_lib
from
official.modeling.
fast_training.
progressive
import
train_lib
FLAGS
=
flags
.
FLAGS
...
...
official/modeling/progressive/train_lib.py → official/modeling/fast_training/progressive/train_lib.py  @ b92025a9

@@ -29,7 +29,7 @@ import tensorflow as tf
 from official.core import base_task
 from official.core import config_definitions
 from official.core import train_lib as base_train_lib
-from official.modeling.progressive import trainer as prog_trainer_lib
+from official.modeling.fast_training.progressive import trainer as prog_trainer_lib


 def run_experiment(distribution_strategy: tf.distribute.Strategy,
official/modeling/progressive/train_lib_test.py → official/modeling/fast_training/progressive/train_lib_test.py  @ b92025a9

@@ -31,9 +31,9 @@ from official.core import config_definitions as cfg
 from official.core import task_factory
 from official.modeling import optimization
 from official.modeling.hyperparams import params_dict
-from official.modeling.progressive import policies
-from official.modeling.progressive import train_lib
-from official.modeling.progressive import trainer as prog_trainer_lib
+from official.modeling.fast_training.progressive import policies
+from official.modeling.fast_training.progressive import train_lib
+from official.modeling.fast_training.progressive import trainer as prog_trainer_lib
 from official.utils.testing import mock_task

 FLAGS = flags.FLAGS
...
official/modeling/progressive/trainer.py
→
official/modeling/
fast_training/
progressive/trainer.py
View file @
b92025a9
...
...
@@ -18,21 +18,21 @@ The trainer implements the Orbit `StandardTrainable` and
`StandardEvaluable` interfaces. Trainers inside this project should be
interchangable and independent on model architectures and tasks.
"""
import
dataclasses
import
os
from
typing
import
Any
,
Optional
# Import libraries
from
absl
import
logging
import
dataclasses
import
gin
import
orbit
import
tensorflow
as
tf
from
official.core
import
base_task
from
official.core
import
base_trainer
as
trainer_lib
from
official.core
import
config_definitions
from
official.modeling.progressive
import
policies
from
official.modeling.progressive
import
utils
from
official.modeling.
fast_training.
progressive
import
policies
from
official.modeling.
fast_training.
progressive
import
utils
ExperimentConfig
=
config_definitions
.
ExperimentConfig
...
...
official/modeling/progressive/trainer_test.py → official/modeling/fast_training/progressive/trainer_test.py  @ b92025a9

@@ -24,8 +24,8 @@ from tensorflow.python.distribute import combinations
 from tensorflow.python.distribute import strategy_combinations
 from official.core import config_definitions as cfg
 from official.modeling import optimization
-from official.modeling.progressive import policies
-from official.modeling.progressive import trainer as trainer_lib
+from official.modeling.fast_training.progressive import policies
+from official.modeling.fast_training.progressive import trainer as trainer_lib
 from official.nlp.configs import bert
 from official.utils.testing import mock_task
official/modeling/progressive/utils.py → official/modeling/fast_training/progressive/utils.py  @ b92025a9

File moved without changes.
official/modeling/multitask/base_model.py  @ b92025a9

@@ -12,21 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-# Lint as: python3
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
 """Abstraction of multi-task model."""
 from typing import Text, Dict
official/modeling/multitask/base_trainer.py  @ b92025a9

@@ -12,21 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-# Lint as: python3
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
 """Multitask base trainer implementation.

 The trainer derives from the Orbit `StandardTrainer` class.
official/modeling/multitask/evaluator.py  @ b92025a9

@@ -54,8 +54,15 @@ class MultiTaskEvaluator(orbit.AbstractEvaluator):
     self._model = model
     self._global_step = global_step or orbit.utils.create_global_step()
     self._checkpoint_exporter = checkpoint_exporter
+    if hasattr(self.model, "checkpoint_items"):
+      checkpoint_items = self.model.checkpoint_items
+    else:
+      checkpoint_items = {}
     self._checkpoint = tf.train.Checkpoint(
-        global_step=self.global_step, model=self.model)
+        model=self.model,
+        global_step=self.global_step,
+        **checkpoint_items)

     self._validation_losses = None
     self._validation_metrics = None
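Note: the hasattr branch supports models that expose extra trackables through a checkpoint_items property, so they are captured in the evaluation checkpoint as well. A minimal sketch of that convention (toy model; names illustrative):

import tensorflow as tf


class ModelWithItems(tf.keras.Model):
  """Toy model following the `checkpoint_items` convention checked above."""

  def __init__(self):
    super().__init__()
    self.dense = tf.keras.layers.Dense(1)
    self.extra_counter = tf.Variable(0, trainable=False)

  @property
  def checkpoint_items(self):
    # Extra objects to fold into the checkpoint alongside the model.
    return {'extra_counter': self.extra_counter}


model = ModelWithItems()
items = model.checkpoint_items if hasattr(model, 'checkpoint_items') else {}
checkpoint = tf.train.Checkpoint(model=model, **items)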
official/modeling/multitask/task_sampler.py  @ b92025a9

@@ -78,7 +78,10 @@ class ProportionalTaskSampler(TaskSampler):
 class AnnealingTaskSampler(TaskSampler):
-  """Sample tasks according to task weights as well as training progress."""
+  """Sample tasks according to task weights as well as training progress.
+
+  See http://proceedings.mlr.press/v97/stickland19a/stickland19a.pdf
+  """

   def __init__(self,
                task_weights: Dict[Text, Union[float, int]],