Commit 3d61d6b3 authored by qianyj's avatar qianyj
Browse files

initial files for ResNet50

parent d3a70caf
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Multitask Evaluator implementation.
The evaluator implements the Orbit `AbstractEvaluator` interface.
"""
from typing import Dict, List, Optional, Union
import gin
import orbit
import tensorflow as tf
from official.core import base_task
from official.core import train_utils
from official.modeling.multitask import base_model
@gin.configurable
class MultiTaskEvaluator(orbit.AbstractEvaluator):
  """Implements the common trainer shared for TensorFlow models."""

  def __init__(
      self,
      eval_tasks: List[base_task.Task],
      model: Union[tf.keras.Model, base_model.MultiTaskBaseModel],
      global_step: Optional[tf.Variable] = None,
      eval_steps: Optional[Dict[str, int]] = None,
      checkpoint_exporter: Optional[train_utils.BestCheckpointExporter] = None):
    """Initialize common trainer for TensorFlow models.

    Args:
      eval_tasks: A list of tasks to evaluate.
      model: tf.keras.Model instance.
      global_step: the global step variable.
      eval_steps: a dictionary of steps to run eval keyed by task names.
      checkpoint_exporter: an object that has the `maybe_export_checkpoint`
        interface.
    """
    # Gets the current distribution strategy. If not inside any strategy scope,
    # it gets a single-replica no-op strategy.
    self._strategy = tf.distribute.get_strategy()
    self._tasks = eval_tasks
    self._model = model
    self._global_step = global_step or orbit.utils.create_global_step()
    self._checkpoint_exporter = checkpoint_exporter
    # Include any extra trackables the model exposes in the checkpoint.
    if hasattr(self.model, "checkpoint_items"):
      checkpoint_items = self.model.checkpoint_items
    else:
      checkpoint_items = {}
    self._checkpoint = tf.train.Checkpoint(
        model=self.model,
        global_step=self.global_step,
        **checkpoint_items)

    # Losses and metrics are created lazily by the corresponding properties.
    self._validation_losses = None
    self._validation_metrics = None

    # Builds per-task datasets.
    self.eval_datasets = {}
    self.eval_steps = eval_steps or {}
    for task in self.tasks:
      self.eval_datasets[task.name] = orbit.utils.make_distributed_dataset(
          self.strategy, task.build_inputs, task.task_config.validation_data)

    # Builds per-task validation loops.
    def get_function(task_name, task):

      task_metrics = self.validation_metrics[task_name]
      task_loss = self.validation_losses[task_name]
      # A multi-task model evaluates each task on its own sub-model; a plain
      # Keras model is shared by every task.
      if isinstance(self.model, base_model.MultiTaskBaseModel):
        model = self.model.sub_tasks[task_name]
      else:
        model = self.model

      def step_fn(inputs):
        logs = task.validation_step(inputs, model=model, metrics=task_metrics)
        task_loss.update_state(logs[task.loss])
        return logs

      @tf.function
      def eval_step_fn(iterator):
        distributed_outputs = self.strategy.run(step_fn, args=(next(iterator),))
        # Unwrap per-replica values into local results so they can be
        # aggregated by `task.aggregate_logs`.
        return tf.nest.map_structure(self.strategy.experimental_local_results,
                                     distributed_outputs)

      return orbit.utils.create_loop_fn(eval_step_fn)

    self.task_fns = {
        task.name: get_function(task.name, task) for task in self.tasks
    }

  @property
  def strategy(self):
    # Distribution strategy captured at construction time.
    return self._strategy

  @property
  def tasks(self):
    # The list of tasks under evaluation.
    return self._tasks

  @property
  def model(self):
    # The model (or MultiTaskBaseModel) being evaluated.
    return self._model

  @property
  def global_step(self):
    # Global step variable tracked in the checkpoint.
    return self._global_step

  @property
  def validation_losses(self):
    """Accesses the validation loss metric object."""
    if self._validation_losses is None:
      # Builds the per-task metrics and losses.
      self._validation_losses = {}
      for task in self.tasks:
        self._validation_losses[task.name] = tf.keras.metrics.Mean(
            "validation_loss", dtype=tf.float32)
    return self._validation_losses

  @property
  def validation_metrics(self):
    """Accesses all validation metric metric objects."""
    if self._validation_metrics is None:
      # Builds the per-task metrics and losses.
      self._validation_metrics = {}
      for task in self.tasks:
        self._validation_metrics[task.name] = task.build_metrics(training=False)
    return self._validation_metrics

  @property
  def checkpoint(self):
    """Accesses the training checkpoint."""
    return self._checkpoint

  def evaluate(self, num_steps: tf.Tensor):
    """Performs evaluation for each `EvalTask`."""
    # Reset all loss and metric states so each call evaluates from scratch.
    for metric in self.validation_losses.values():
      metric.reset_states()
    for metrics in self.validation_metrics.values():
      for metric in metrics:
        metric.reset_states()
    results = {}
    eval_iters = tf.nest.map_structure(iter, self.eval_datasets)

    for task in self.tasks:
      outputs = None
      name = task.name
      eval_iter = eval_iters[name]
      # A per-task step count configured at construction overrides the
      # caller-provided `num_steps`.
      task_eval_steps = self.eval_steps.get(name, None) or num_steps
      outputs = self.task_fns[name](
          eval_iter,
          task_eval_steps,
          state=outputs,
          reduce_fn=task.aggregate_logs)
      task_metrics = self.validation_metrics[name]
      task_loss = self.validation_losses[name]
      logs = {}
      for metric in task_metrics + [task_loss]:
        logs[metric.name] = metric.result()
      # Aggregated step outputs (if any) are reduced into final metrics.
      if outputs:
        metrics = task.reduce_aggregated_logs(
            outputs, global_step=self.global_step)
        logs.update(metrics)
      results[name] = logs

    if self._checkpoint_exporter:
      self._checkpoint_exporter.maybe_export_checkpoint(
          self.checkpoint, results, self.global_step.numpy())
    return results
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for multitask.evaluator."""
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import strategy_combinations
from official.core import base_task
from official.core import config_definitions as cfg
from official.modeling.multitask import evaluator
def all_strategy_combinations():
  """Returns eager-mode test combinations over the supported strategies."""
  strategies = [
      strategy_combinations.default_strategy,
      strategy_combinations.cloud_tpu_strategy,
      strategy_combinations.one_device_strategy_gpu,
  ]
  return combinations.combine(distribution=strategies, mode="eager")
class MockModel(tf.keras.Model):
  """Toy model whose added loss depends on whether a "y" feature exists."""

  def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.dense = tf.keras.layers.Dense(1)

  def call(self, inputs):
    print(inputs, type(inputs))
    # Zero loss when "y" is present, unit loss otherwise — lets tests tell
    # the two mock tasks apart by their validation loss.
    loss_value = (
        tf.zeros((1,), dtype=tf.float32)
        if "y" in inputs else tf.ones((1,), dtype=tf.float32))
    self.add_loss(loss_value)
    return self.dense(inputs["x"])
class MockTask(base_task.Task):
  """Mock task object for testing."""

  def build_metrics(self, training: bool = True):
    del training
    return [tf.keras.metrics.Accuracy(name="acc")]

  def build_inputs(self, params):

    def generate_data(_):
      x = tf.zeros(shape=(2,), dtype=tf.float32)
      label = tf.zeros([1], dtype=tf.int32)
      # The "bar" task gets an extra "y" feature so MockModel produces a
      # different (zero) loss for it than for other tasks.
      if self.name == "bar":
        return dict(x=x, y=x), label
      else:
        return dict(x=x), label

    dataset = tf.data.Dataset.range(1)
    dataset = dataset.repeat()
    dataset = dataset.map(
        generate_data, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    return dataset.prefetch(buffer_size=1).batch(2, drop_remainder=True)

  def validation_step(self, inputs, model: tf.keras.Model, metrics=None):
    logs = super().validation_step(inputs, model, metrics)
    # Constant per-step output so tests can count executed eval steps.
    logs["counter"] = tf.ones((1,), dtype=tf.float32)
    return logs

  def aggregate_logs(self, state, step_outputs):
    # Accumulates step outputs; each value is a list of per-replica tensors
    # which are stacked into one numpy array per step.
    if state is None:
      state = {}
    for key, value in step_outputs.items():
      if key not in state:
        state[key] = []
      state[key].append(
          np.concatenate([np.expand_dims(v.numpy(), axis=0) for v in value]))
    return state

  def reduce_aggregated_logs(self, aggregated_logs, global_step=None):
    # Collapses each accumulated list of arrays into a single scalar sum.
    for k, v in aggregated_logs.items():
      aggregated_logs[k] = np.sum(np.stack(v, axis=0))
    return aggregated_logs
class EvaluatorTest(tf.test.TestCase, parameterized.TestCase):
  """Tests for MultiTaskEvaluator across distribution strategies."""

  @combinations.generate(all_strategy_combinations())
  def test_multitask_evaluator(self, distribution):
    with distribution.scope():
      tasks = [
          MockTask(params=cfg.TaskConfig(), name="bar"),
          MockTask(params=cfg.TaskConfig(), name="foo")
      ]
      model = MockModel()
      test_evaluator = evaluator.MultiTaskEvaluator(
          eval_tasks=tasks, model=model)
    results = test_evaluator.evaluate(tf.convert_to_tensor(1, dtype=tf.int32))
    self.assertContainsSubset(["validation_loss", "acc"], results["bar"].keys())
    self.assertContainsSubset(["validation_loss", "acc"], results["foo"].keys())
    # MockModel adds a zero loss when the "y" feature exists ("bar" task) and
    # a unit loss otherwise ("foo" task).
    self.assertEqual(results["bar"]["validation_loss"], 0.0)
    self.assertEqual(results["foo"]["validation_loss"], 1.0)

  @combinations.generate(all_strategy_combinations())
  def test_multitask_evaluator_numpy_metrics(self, distribution):
    with distribution.scope():
      tasks = [
          MockTask(params=cfg.TaskConfig(), name="bar"),
          MockTask(params=cfg.TaskConfig(), name="foo")
      ]
      model = MockModel()
      test_evaluator = evaluator.MultiTaskEvaluator(
          eval_tasks=tasks, model=model)
    results = test_evaluator.evaluate(tf.convert_to_tensor(5, dtype=tf.int32))
    # MockTask logs a per-replica "counter" of ones each step, so its reduced
    # sum equals steps * replicas.
    self.assertEqual(results["bar"]["counter"],
                     5. * distribution.num_replicas_in_sync)
    self.assertEqual(results["foo"]["counter"],
                     5. * distribution.num_replicas_in_sync)
# Test runner entry point.
if __name__ == "__main__":
  tf.test.main()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Multitask trainer that interleaves each task's train step."""
from typing import Union
import gin
import orbit
import tensorflow as tf
from official.modeling.multitask import base_model
from official.modeling.multitask import base_trainer
from official.modeling.multitask import multitask
from official.modeling.multitask import task_sampler as sampler
@gin.configurable
class MultiTaskInterleavingTrainer(base_trainer.MultiTaskBaseTrainer):
  """MultiTask trainer that interleaves task update."""

  def __init__(self,
               multi_task: multitask.MultiTask,
               multi_task_model: Union[tf.keras.Model,
                                       base_model.MultiTaskBaseModel],
               optimizer: tf.optimizers.Optimizer,
               task_sampler: sampler.TaskSampler,
               trainer_options=None):
    """Initializes the interleaving trainer.

    Args:
      multi_task: a MultiTask instance managing the tasks to train.
      multi_task_model: a tf.keras.Model or MultiTaskBaseModel instance.
      optimizer: the optimizer shared by all tasks.
      task_sampler: a TaskSampler that decides which task trains each step.
      trainer_options: options forwarded to the base trainer.
    """
    super().__init__(
        multi_task=multi_task,
        multi_task_model=multi_task_model,
        optimizer=optimizer,
        trainer_options=trainer_options)
    self._task_sampler = task_sampler

    # Build per task train step.
    def _get_task_step(task_name, task):

      def step_fn(inputs):
        # A multi-task model trains the per-task sub-model; a plain Keras
        # model is shared by all tasks.
        if isinstance(self.multi_task_model, base_model.MultiTaskBaseModel):
          task_model = self.multi_task_model.sub_tasks[task_name]
        else:
          task_model = self.multi_task_model
        task_logs = task.train_step(
            inputs,
            model=task_model,
            optimizer=self.optimizer,
            metrics=self.training_metrics[task_name])
        self.training_losses[task_name].update_state(task_logs[task.loss])

      return step_fn

    self._task_train_step_map = {
        name: _get_task_step(name, task)
        for name, task in self.multi_task.tasks.items()
    }

    # TODO(haozhangthu): Add taskwise step counter to train_loop_end for logging
    # on TensorBoard.
    self._task_step_counters = {
        name: orbit.utils.create_global_step() for name in self.multi_task.tasks
    }

  def task_step_counter(self, name):
    # Per-task counter of how many steps that task has been sampled for.
    return self._task_step_counters[name]

  def train_step(self, iterator_map):
    # Sample one task to train according to a multinomial distribution
    # NOTE: seeding with the global step makes the sampling deterministic
    # given the step value.
    rn = tf.random.stateless_uniform(shape=[], seed=(0, self.global_step))
    cumulative_sample_distribution = self._task_sampler.task_cumulative_distribution(
        self.global_step)
    # Prepend a [0.0] for indexing convenience.
    cumulative_sample_distribution = tf.concat(
        [tf.constant([0.0], dtype=tf.float32), cumulative_sample_distribution],
        axis=0)

    # Run exactly the one task whose half-open interval [begin, end)
    # contains the sampled value.
    for idx, (name, _) in enumerate(self.multi_task.tasks.items()):
      begin = cumulative_sample_distribution[idx]
      end = cumulative_sample_distribution[idx + 1]
      if rn >= begin and rn < end:
        self._strategy.run(
            self._task_train_step_map[name], args=(next(iterator_map[name]),))
        self.global_step.assign_add(1)
        self.task_step_counter(name).assign_add(1)

  def train_loop_end(self):
    """Record loss and metric values per task."""
    result = super().train_loop_end()
    # Interleaving training does not have a good semantic for `total_loss`. In
    # fact, it is always zero. To avoid confusion, we filter the `total_loss`
    # from the result logs.
    if 'total_loss' in result:
      result.pop('total_loss')
    return result
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for multitask.interleaving_trainer."""
from absl.testing import parameterized
import tensorflow as tf
from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import strategy_combinations
from official.modeling.multitask import configs
from official.modeling.multitask import interleaving_trainer
from official.modeling.multitask import multitask
from official.modeling.multitask import task_sampler
from official.modeling.multitask import test_utils
def all_strategy_combinations():
  """Returns eager-mode test combinations over the supported strategies."""
  strategies = [
      strategy_combinations.default_strategy,
      strategy_combinations.cloud_tpu_strategy,
      strategy_combinations.one_device_strategy_gpu,
  ]
  return combinations.combine(distribution=strategies, mode="eager")
class InterleavingTrainerTest(tf.test.TestCase, parameterized.TestCase):
  """Tests for MultiTaskInterleavingTrainer across strategies."""

  @combinations.generate(all_strategy_combinations())
  def test_multitask_interleaving_trainer(self, distribution):
    with distribution.scope():
      tasks = [
          test_utils.MockFooTask(params=test_utils.FooConfig(), name="foo"),
          test_utils.MockBarTask(params=test_utils.BarConfig(), name="bar")
      ]
      test_multitask = multitask.MultiTask(tasks=tasks)
      test_optimizer = tf.keras.optimizers.SGD(0.1)
      model = test_utils.MockMultiTaskModel()
      sampler = task_sampler.UniformTaskSampler(
          task_weights=test_multitask.task_weights)
      test_trainer = interleaving_trainer.MultiTaskInterleavingTrainer(
          multi_task=test_multitask,
          multi_task_model=model,
          optimizer=test_optimizer,
          task_sampler=sampler)
      results = test_trainer.train(tf.convert_to_tensor(5, dtype=tf.int32))
      self.assertContainsSubset(["training_loss", "bar_acc"],
                                results["bar"].keys())
      self.assertContainsSubset(["training_loss", "foo_acc"],
                                results["foo"].keys())
      # The interleaving trainer filters the meaningless `total_loss` out of
      # its logs.
      self.assertNotIn("total_loss", results)

  @combinations.generate(all_strategy_combinations())
  def test_trainer_with_configs(self, distribution):
    config = configs.MultiTaskConfig(
        task_routines=(configs.TaskRoutine(
            task_name="foo",
            task_config=test_utils.FooConfig(),
            task_weight=3.0),
                       configs.TaskRoutine(
                           task_name="bar",
                           task_config=test_utils.BarConfig(),
                           task_weight=1.0)))
    with distribution.scope():
      test_multitask = multitask.MultiTask.from_config(config)
    test_optimizer = tf.keras.optimizers.SGD(0.1)
    model = test_utils.MockMultiTaskModel()
    num_step = 1000
    sampler = task_sampler.AnnealingTaskSampler(
        task_weights=test_multitask.task_weights,
        steps_per_epoch=num_step/5,
        total_steps=num_step)
    test_trainer = interleaving_trainer.MultiTaskInterleavingTrainer(
        multi_task=test_multitask,
        multi_task_model=model,
        optimizer=test_optimizer,
        task_sampler=sampler)
    results = test_trainer.train(tf.convert_to_tensor(num_step, dtype=tf.int32))
    self.assertContainsSubset(["training_loss", "bar_acc"],
                              results["bar"].keys())
    self.assertContainsSubset(["training_loss", "foo_acc"],
                              results["foo"].keys())
    self.assertEqual(test_trainer.global_step.numpy(), num_step)
    # Exactly one task is sampled per step, so the per-task step counters
    # must sum to the total number of steps.
    bar_sampled_step = test_trainer.task_step_counter("bar").numpy()
    foo_sampled_step = test_trainer.task_step_counter("foo").numpy()
    self.assertEqual(bar_sampled_step + foo_sampled_step, num_step)
# Test runner entry point.
if __name__ == "__main__":
  tf.test.main()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Experimental MultiTask base class for multi-task training/evaluation."""
import abc
from typing import Dict, List, Optional, Text, Union
import tensorflow as tf
from official.core import base_task
from official.core import config_definitions
from official.core import task_factory
from official.modeling import optimization
from official.modeling.multitask import base_model
from official.modeling.multitask import configs
OptimizationConfig = optimization.OptimizationConfig
RuntimeConfig = config_definitions.RuntimeConfig
class MultiTask(tf.Module, metaclass=abc.ABCMeta):
  """A multi-task class to manage multiple tasks."""

  def __init__(self,
               tasks: Union[Dict[Text, base_task.Task], List[base_task.Task]],
               task_weights: Optional[Dict[str, Union[float, int]]] = None,
               task_eval_steps: Optional[Dict[str, int]] = None,
               name: Optional[str] = None):
    """MultiTask initialization.

    Args:
      tasks: a list or a flat dict of Task.
      task_weights: a dict of (task, task weight), task weight can be applied
        directly during loss summation in a joint backward step, or it can be
        used to sample task among interleaved backward step.
      task_eval_steps: a dict of (task, eval steps).
      name: the instance name of a MultiTask object.
    """
    super().__init__(name=name)
    # Normalize `tasks` into a name-keyed dict, rejecting duplicate names.
    if isinstance(tasks, list):
      self._tasks = {}
      for task in tasks:
        if task.name in self._tasks:
          raise ValueError("Duplicated tasks found, task.name is %s" %
                           task.name)
        self._tasks[task.name] = task
    elif isinstance(tasks, dict):
      self._tasks = tasks
    else:
      raise ValueError("The tasks argument has an invalid type: %s" %
                       type(tasks))
    self.task_eval_steps = task_eval_steps or {}
    self._task_weights = task_weights or {}
    # Tasks without an explicit weight default to 1.0.
    self._task_weights = dict([
        (name, self._task_weights.get(name, 1.0)) for name in self.tasks
    ])

  @classmethod
  def from_config(cls, config: configs.MultiTaskConfig, logging_dir=None):
    """Builds a MultiTask from the task routines in a MultiTaskConfig."""
    tasks = {}
    task_eval_steps = {}
    task_weights = {}
    for task_routine in config.task_routines:
      # Routine-level task_name wins over the task config's own name.
      task_name = task_routine.task_name or task_routine.task_config.name
      tasks[task_name] = task_factory.get_task(
          task_routine.task_config, logging_dir=logging_dir, name=task_name)
      task_eval_steps[task_name] = task_routine.eval_steps
      task_weights[task_name] = task_routine.task_weight
    return cls(
        tasks, task_eval_steps=task_eval_steps, task_weights=task_weights)

  @property
  def tasks(self):
    # Flat dict of task name -> Task.
    return self._tasks

  def task_weight(self, task_name):
    """Returns the configured weight of a single task."""
    return self._task_weights[task_name]

  @property
  def task_weights(self):
    # Dict of task name -> weight, with 1.0 filled in for missing entries.
    return self._task_weights

  @classmethod
  def create_optimizer(cls,
                       optimizer_config: OptimizationConfig,
                       runtime_config: Optional[RuntimeConfig] = None):
    """Creates an optimizer by delegating to base_task.Task."""
    return base_task.Task.create_optimizer(
        optimizer_config=optimizer_config, runtime_config=runtime_config)

  def joint_train_step(self, task_inputs,
                       multi_task_model: base_model.MultiTaskBaseModel,
                       optimizer: tf.keras.optimizers.Optimizer, task_metrics,
                       **kwargs):
    """The joint train step.

    Args:
      task_inputs: a dictionary of task names and per-task features.
      multi_task_model: a MultiTaskBaseModel instance.
      optimizer: a tf.optimizers.Optimizer.
      task_metrics: a dictionary of task names and per-task metrics.
      **kwargs: other arguments to pass through.

    Returns:
      A dictionary of losses, including per-task losses and their weighted sum.
    """
    losses = {}
    with tf.GradientTape() as tape:
      total_loss = 0.0
      for name, model in multi_task_model.sub_tasks.items():
        inputs = task_inputs[name]
        # Iterator outputs are either a (features, labels) tuple or a dict
        # used as both features and labels.
        if isinstance(inputs, tuple) and len(inputs) == 2:
          features, labels = inputs
        elif isinstance(inputs, dict):
          features, labels = inputs, inputs
        else:
          raise ValueError("The iterator output is neither a tuple nor a "
                           "dictionary. It is not implemented to support "
                           "such outputs.")
        outputs = model(features, training=True)
        task_loss = self.tasks[name].build_losses(labels, outputs)
        task_weight = self.task_weight(name)
        total_loss += task_weight * task_loss
        losses[name] = task_loss
        self.tasks[name].process_metrics(task_metrics[name], labels, outputs,
                                         **kwargs)

      # Scales loss as the default gradients allreduce performs sum inside
      # the optimizer.
      scaled_loss = total_loss / tf.distribute.get_strategy(
      ).num_replicas_in_sync
    tvars = multi_task_model.trainable_variables
    grads = tape.gradient(scaled_loss, tvars)
    optimizer.apply_gradients(list(zip(grads, tvars)))
    losses["total_loss"] = total_loss
    return losses
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utils to sample tasks for interleaved optimization."""
import abc
from typing import Union, Dict, Text
import tensorflow as tf
from official.modeling.multitask import configs
class TaskSampler(tf.Module, metaclass=abc.ABCMeta):
  """An abstract class defining task sampling API for interleaving trainer."""

  def __init__(self, task_weights: Dict[Text, Union[float, int]]):
    self._task_weights = task_weights

  @property
  def task_weights(self):
    # Dict of task name -> raw (unnormalized) task weight.
    return self._task_weights

  @abc.abstractmethod
  def task_cumulative_distribution(self, global_step: tf.Tensor) -> tf.Tensor:
    """Compute cumulative distribution to sample tasks.

    It calculates the cumulative distribution of the multinomial task
    distribution with respect to which to be sampled against.

    Args:
      global_step: A tensor indicating current progress of training.

    Returns:
      A float tensor with shape (#(task), 1) that represents the cumulative
      sampling distribution.
    """
    pass
class UniformTaskSampler(TaskSampler):
  """Sample all tasks uniformly."""

  def __init__(self, task_weights: Dict[Text, Union[float, int]]):
    super().__init__(task_weights=task_weights)
    # Each of the N tasks gets probability 1/N; precompute the cumulative
    # distribution once since it never changes.
    num_tasks = len(self._task_weights)
    uniform_probs = tf.constant([1.0 / num_tasks] * num_tasks,
                                dtype=tf.float32)
    self._uniform_cumulative = tf.math.cumsum(uniform_probs)

  def task_cumulative_distribution(self, global_step: tf.Tensor) -> tf.Tensor:
    """Returns the fixed uniform cumulative distribution; step is ignored."""
    del global_step
    return self._uniform_cumulative
class ProportionalTaskSampler(TaskSampler):
  """Sample tasks proportional to task weights."""

  def __init__(self,
               task_weights: Dict[Text, Union[float, int]],
               alpha: float = 1.0):
    super().__init__(task_weights=task_weights)
    self._alpha = tf.cast(alpha, dtype=tf.float32)
    # Raise each weight to the power alpha, normalize, and precompute the
    # cumulative distribution once since it never changes.
    ordered_weights = tf.constant(
        list(self._task_weights.values()), dtype=tf.float32)
    scaled_sizes = tf.math.pow(ordered_weights, self._alpha)
    normalized = scaled_sizes / tf.reduce_sum(scaled_sizes)
    self._proportional_cumulative = tf.math.cumsum(normalized)

  def task_cumulative_distribution(self, global_step: tf.Tensor) -> tf.Tensor:
    """Returns the fixed proportional cumulative distribution."""
    del global_step
    return self._proportional_cumulative
class AnnealingTaskSampler(TaskSampler):
  """Sample tasks according to task weights as well as training progress.

  See http://proceedings.mlr.press/v97/stickland19a/stickland19a.pdf
  """

  def __init__(self,
               task_weights: Dict[Text, Union[float, int]],
               steps_per_epoch: int,
               total_steps: int):
    super().__init__(task_weights=task_weights)
    self._steps_per_epoch = tf.cast(steps_per_epoch, dtype=tf.float32)
    self._total_epochs = tf.cast(
        total_steps / self._steps_per_epoch, dtype=tf.float32)

  def task_cumulative_distribution(self, global_step: tf.Tensor) -> tf.Tensor:
    """Returns an epoch-dependent cumulative sampling distribution."""
    # The annealing exponent decays linearly with the current epoch.
    current_epoch = tf.math.floor(
        tf.cast(global_step, dtype=tf.float32) / self._steps_per_epoch)
    exponent = 1.0 - 0.8 * (current_epoch - 1) / (
        self._total_epochs - 1 + 1e-10)
    ordered_weights = tf.constant(
        list(self._task_weights.values()), dtype=tf.float32)
    scaled_sizes = tf.math.pow(ordered_weights,
                               tf.cast(exponent, dtype=tf.float32))
    return tf.math.cumsum(scaled_sizes / tf.reduce_sum(scaled_sizes))
def get_task_sampler(config: configs.TaskSamplingConfig,
                     task_weights: Dict[Text, float]) -> TaskSampler:
  """Utils to create task sampler with configuration and task weights."""
  oneof_config = config.get()
  sampler_type = config.type
  if sampler_type == 'uniform':
    return UniformTaskSampler(task_weights=task_weights)
  if sampler_type == 'proportional':
    return ProportionalTaskSampler(
        task_weights=task_weights, alpha=oneof_config.alpha)
  if sampler_type == 'annealing':
    return AnnealingTaskSampler(
        task_weights=task_weights,
        steps_per_epoch=oneof_config.steps_per_epoch,
        total_steps=oneof_config.total_steps)
  raise RuntimeError('Task sampler type not supported')
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for multitask.task_sampler."""
import tensorflow as tf
from official.modeling.multitask import configs
from official.modeling.multitask import task_sampler as sampler
class TaskSamplerTest(tf.test.TestCase):
  """Tests for the uniform, proportional and annealing task samplers."""

  def setUp(self):
    super(TaskSamplerTest, self).setUp()
    self._task_weights = {'A': 1.0, 'B': 2.0, 'C': 3.0}

  def test_uniform_sample_distribution(self):
    uniform_sampler = sampler.get_task_sampler(
        configs.TaskSamplingConfig(type='uniform'), self._task_weights)
    # Uniform sampling ignores both the task weights and the step.
    for step in range(5):
      cumulative_distribution = uniform_sampler.task_cumulative_distribution(
          tf.constant(step, dtype=tf.int64))
      self.assertAllClose([0.333333, 0.666666, 1.0],
                          cumulative_distribution.numpy())

  def test_proportional_sample_distribution(self):
    prop_sampler = sampler.get_task_sampler(
        configs.TaskSamplingConfig(
            type='proportional',
            proportional=configs.ProportionalSampleConfig(alpha=2.0)),
        self._task_weights)
    # CumulativeOf(Normalize([1.0^2, 2.0^2, 3.0^2]))
    for step in range(5):
      cumulative_distribution = prop_sampler.task_cumulative_distribution(
          tf.constant(step, dtype=tf.int64))
      self.assertAllClose([0.07142857, 0.35714286, 1.0],
                          cumulative_distribution.numpy())

  def test_annealing_sample_distribution(self):
    num_epoch = 3
    step_per_epoch = 6
    annel_sampler = sampler.get_task_sampler(
        configs.TaskSamplingConfig(
            type='annealing',
            annealing=configs.AnnealingSampleConfig(
                steps_per_epoch=step_per_epoch,
                total_steps=step_per_epoch * num_epoch)), self._task_weights)
    global_step = tf.Variable(
        0, dtype=tf.int64, name='global_step', trainable=False)
    # One expected cumulative distribution per epoch; the middle epoch equals
    # the plain proportional (alpha=1) distribution.
    expected_cumulative_epochs = [[0.12056106, 0.4387236, 1.0],
                                  [0.16666667, 0.5, 1.0],
                                  [0.22477472, 0.5654695, 1.0]]
    for epoch in range(num_epoch):
      for _ in range(step_per_epoch):
        cumulative_distribution = annel_sampler.task_cumulative_distribution(
            tf.constant(global_step, dtype=tf.int64))
        global_step.assign_add(1)
        self.assertAllClose(expected_cumulative_epochs[epoch],
                            cumulative_distribution.numpy())
# Test runner entry point.
if __name__ == '__main__':
  tf.test.main()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing utils for mock models and tasks."""
from typing import Dict, Text
import tensorflow as tf
from official.core import base_task
from official.core import config_definitions as cfg
from official.core import task_factory
from official.modeling.multitask import base_model
class MockFooModel(tf.keras.Model):
  """A mock model can consume 'foo' and 'bar' inputs."""

  def __init__(self, shared_layer, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self._share_layer = shared_layer
    self._foo_specific_layer = tf.keras.layers.Dense(1)

  def call(self, inputs):
    self.add_loss(tf.zeros((1,), dtype=tf.float32))
    # Prefer the "foo" feature; fall back to "bar" when it is absent.
    feature_key = "foo" if "foo" in inputs else "bar"
    shared_out = self._share_layer(inputs[feature_key])
    return self._foo_specific_layer(shared_out)
class MockBarModel(tf.keras.Model):
  """A mock model that consumes only the 'bar' input."""

  def __init__(self, shared_layer, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self._share_layer = shared_layer
    self._bar_specific_layer = tf.keras.layers.Dense(1)

  def call(self, inputs):
    self.add_loss(tf.zeros((2,), dtype=tf.float32))
    shared_out = self._share_layer(inputs["bar"])
    return self._bar_specific_layer(shared_out)
class MockMultiTaskModel(base_model.MultiTaskBaseModel):
  """Bundles the foo and bar mock models around one shared dense layer."""

  def __init__(self, *args, **kwargs):
    # The shared layer must exist before super().__init__ runs, because the
    # base constructor calls _instantiate_sub_tasks which reads it.
    self._shared_dense = tf.keras.layers.Dense(1)
    super().__init__(*args, **kwargs)

  def _instantiate_sub_tasks(self) -> Dict[Text, tf.keras.Model]:
    sub_tasks = {}
    sub_tasks["foo"] = MockFooModel(self._shared_dense)
    sub_tasks["bar"] = MockBarModel(self._shared_dense)
    return sub_tasks
def mock_data(feature_name):
  """Mock dataset function."""

  def _generate_data(_):
    # A constant (features, label) pair keyed by the requested feature name.
    features = {feature_name: tf.zeros(shape=(2,), dtype=tf.float32)}
    label = tf.zeros([1], dtype=tf.int32)
    return features, label

  return (
      tf.data.Dataset.range(1)
      .repeat()
      .map(_generate_data, num_parallel_calls=tf.data.experimental.AUTOTUNE)
      .prefetch(buffer_size=1)
      .batch(2, drop_remainder=True))
class FooConfig(cfg.TaskConfig):
  # Marker config used by task_factory to route to MockFooTask.
  pass
class BarConfig(cfg.TaskConfig):
  # Marker config used by task_factory to route to MockBarTask.
  pass
@task_factory.register_task_cls(FooConfig)
class MockFooTask(base_task.Task):
  """Mock foo task object for testing."""

  def build_metrics(self, training: bool = True):
    del training
    return [tf.keras.metrics.Accuracy(name="foo_acc")]

  def build_inputs(self, params):
    return mock_data("foo")

  def build_model(self) -> tf.keras.Model:
    return MockFooModel(shared_layer=tf.keras.layers.Dense(1))

  def build_losses(self, labels, model_outputs, aux_losses=None) -> tf.Tensor:
    # Mean-squared error plus any auxiliary losses the model registered.
    mse = tf.keras.losses.mean_squared_error(labels, model_outputs)
    total = mse + tf.add_n(aux_losses) if aux_losses else mse
    return tf.reduce_mean(total)
@task_factory.register_task_cls(BarConfig)
class MockBarTask(base_task.Task):
  """Mock bar task object for testing."""

  def build_metrics(self, training: bool = True):
    del training
    return [tf.keras.metrics.Accuracy(name="bar_acc")]

  def build_inputs(self, params):
    return mock_data("bar")

  def build_losses(self, labels, model_outputs, aux_losses=None) -> tf.Tensor:
    # Mean-squared error plus any auxiliary losses the model registered.
    mse = tf.keras.losses.mean_squared_error(labels, model_outputs)
    total = mse + tf.add_n(aux_losses) if aux_losses else mse
    return tf.reduce_mean(total)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Multitask training driver library."""
# pytype: disable=attribute-error
import os
from typing import Any, List, Optional, Tuple
from absl import logging
import orbit
import tensorflow as tf
from official.core import base_task
from official.core import base_trainer as core_lib
from official.core import train_utils
from official.modeling.multitask import base_model
from official.modeling.multitask import base_trainer
from official.modeling.multitask import configs
from official.modeling.multitask import evaluator as evaluator_lib
from official.modeling.multitask import interleaving_trainer
from official.modeling.multitask import multitask
from official.modeling.multitask import task_sampler
# Maps the `params.trainer.trainer_type` config string to the multi-task
# trainer implementation used by `run_experiment`.
TRAINERS = {
    'interleaving': interleaving_trainer.MultiTaskInterleavingTrainer,
    'joint': base_trainer.MultiTaskBaseTrainer
}
def run_experiment(
    *,
    distribution_strategy: tf.distribute.Strategy,
    task: multitask.MultiTask,
    model: base_model.MultiTaskBaseModel,
    mode: str,
    params: configs.MultiTaskExperimentConfig,
    model_dir: str,
    trainer: Optional[base_trainer.MultiTaskBaseTrainer] = None
) -> base_model.MultiTaskBaseModel:
  """Runs train/eval configured by the experiment params.

  Args:
    distribution_strategy: A distribution strategy under which all model
      building and execution happens.
    task: A MultiTask instance describing the tasks to train/evaluate.
    model: A MultiTaskBaseModel instance.
    mode: A 'str', specifying the mode. Can be 'train', 'eval', 'train_and_eval'
      or 'continuous_eval'.
    params: MultiTaskExperimentConfig instance.
    model_dir: A 'str', a path to store model checkpoints and summaries.
    trainer: (optional) A multi-task trainer to use. If none is provided, a
      default one will be created based on `params`.

  Returns:
    model: `base_model.MultiTaskBaseModel` instance.
  """
  is_training = 'train' in mode
  is_eval = 'eval' in mode
  with distribution_strategy.scope():
    optimizer = task.create_optimizer(params.trainer.optimizer_config,
                                      params.runtime)
    kwargs = dict(multi_task=task, multi_task_model=model, optimizer=optimizer)
    if params.trainer.trainer_type == 'interleaving':
      # The interleaving trainer additionally needs a sampler deciding which
      # task to step at each iteration.
      sampler = task_sampler.get_task_sampler(params.trainer.task_sampler,
                                              task.task_weights)
      kwargs.update(dict(task_sampler=sampler))
    # Eval-only runs keep `trainer` as None and rely on the evaluator for the
    # checkpoint and global step below.
    if trainer is None:
      trainer = TRAINERS[params.trainer.trainer_type](
          **kwargs) if is_training else None
    if is_eval:
      eval_steps = task.task_eval_steps
      evaluator = evaluator_lib.MultiTaskEvaluator(
          eval_tasks=task.tasks.values(),
          model=model,
          eval_steps=eval_steps,
          # Share the trainer's step counter so eval summaries align with
          # training progress; standalone eval creates its own counter.
          global_step=trainer.global_step if is_training else None,
          checkpoint_exporter=train_utils.maybe_create_best_ckpt_exporter(
              params, model_dir))
    else:
      evaluator = None
  if trainer:
    checkpoint = trainer.checkpoint
    global_step = trainer.global_step
  else:
    checkpoint = evaluator.checkpoint
    global_step = evaluator.global_step
  # TODO(hongkuny,haozhangthu): Revisit initialization method.
  checkpoint_manager = tf.train.CheckpointManager(
      checkpoint,
      directory=model_dir,
      max_to_keep=params.trainer.max_to_keep,
      step_counter=global_step,
      checkpoint_interval=params.trainer.checkpoint_interval,
      init_fn=model.initialize)
  controller = orbit.Controller(
      strategy=distribution_strategy,
      trainer=trainer,
      evaluator=evaluator,
      global_step=global_step,
      steps_per_loop=params.trainer.steps_per_loop,
      checkpoint_manager=checkpoint_manager,
      summary_dir=os.path.join(model_dir, 'train'),
      eval_summary_dir=os.path.join(model_dir, 'validation'),
      summary_interval=params.trainer.summary_interval)
  logging.info('Starts to execute mode: %s', mode)
  with distribution_strategy.scope():
    if mode == 'train':
      controller.train(steps=params.trainer.train_steps)
    elif mode == 'train_and_eval':
      controller.train_and_evaluate(
          train_steps=params.trainer.train_steps,
          eval_steps=params.trainer.validation_steps,
          eval_interval=params.trainer.validation_interval)
    elif mode == 'eval':
      controller.evaluate(steps=params.trainer.validation_steps)
    elif mode == 'continuous_eval':

      def timeout_fn():
        # Stop continuous eval once checkpoints for the final training step
        # have been evaluated.
        if evaluator.global_step.numpy() >= params.trainer.train_steps:
          return True
        return False

      controller.evaluate_continuously(
          steps=params.trainer.validation_steps,
          timeout=params.trainer.continuous_eval_timeout,
          timeout_fn=timeout_fn)
    else:
      raise NotImplementedError('The mode is not implemented: %s' % mode)
  return model
def run_experiment_with_multitask_eval(
    *,
    distribution_strategy: tf.distribute.Strategy,
    train_task: base_task.Task,
    eval_tasks: List[base_task.Task],
    mode: str,
    params: configs.MultiEvalExperimentConfig,
    model_dir: str,
    run_post_eval: bool = False,
    save_summary: bool = True,
    trainer: Optional[core_lib.Trainer] = None) -> Tuple[Any, Any]:
  """Runs single-task training with multi-task evaluation.

  Args:
    distribution_strategy: A distribution strategy under which all model
      building and execution happens.
    train_task: A base_task.Task instance.
    eval_tasks: A list of evaluation tasks.
    mode: A 'str', specifying the mode. Can be 'train', 'eval', 'train_and_eval'
      or 'continuous_eval'.
    params: MultiEvalExperimentConfig instance.
    model_dir: A 'str', a path to store model checkpoints and summaries.
    run_post_eval: Whether to run post eval once after training, metrics logs
      are returned.
    save_summary: Whether to save train and validation summary.
    trainer: the core_lib.Trainer instance. It should be created within the
      strategy.scope(). If not provided, an instance will be created by default
      if `mode` contains 'train'.

  Returns:
    A tuple of (model, eval metrics dict): the metrics are only populated when
    `run_post_eval` is True, otherwise an empty dict is returned.
  """
  is_training = 'train' in mode
  is_eval = 'eval' in mode
  with distribution_strategy.scope():
    if is_training:
      trainer = trainer or core_lib.Trainer(
          config=params,
          task=train_task,
          model=train_task.build_model(),
          optimizer=train_task.create_optimizer(params.trainer.optimizer_config,
                                                params.runtime),
          train=True,
          evaluate=False)
    else:
      trainer = None
    # Reuse the trainer's model when training; otherwise build a fresh one
    # just for evaluation.
    model = trainer.model if trainer else train_task.build_model()
    if is_eval:
      # Per-task eval step budget, keyed by task name from the config.
      eval_steps = dict([(task_routine.task_config.name,
                          task_routine.eval_steps)
                         for task_routine in params.eval_tasks])
      evaluator = evaluator_lib.MultiTaskEvaluator(
          eval_tasks=eval_tasks,
          model=model,
          # Share the trainer's step counter so eval summaries align with
          # training progress; standalone eval creates its own counter.
          global_step=trainer.global_step if is_training else None,
          eval_steps=eval_steps,
          checkpoint_exporter=train_utils.maybe_create_best_ckpt_exporter(
              params, model_dir))
    else:
      evaluator = None
  if trainer:
    checkpoint = trainer.checkpoint
    global_step = trainer.global_step
  else:
    checkpoint = evaluator.checkpoint
    global_step = evaluator.global_step
  checkpoint_manager = tf.train.CheckpointManager(
      checkpoint,
      directory=model_dir,
      max_to_keep=params.trainer.max_to_keep,
      step_counter=global_step,
      checkpoint_interval=params.trainer.checkpoint_interval,
      init_fn=trainer.initialize if trainer else None)
  controller = orbit.Controller(
      strategy=distribution_strategy,
      trainer=trainer,
      evaluator=evaluator,
      global_step=global_step,
      steps_per_loop=params.trainer.steps_per_loop,
      checkpoint_manager=checkpoint_manager,
      summary_dir=os.path.join(model_dir, 'train') if save_summary else None,
      eval_summary_dir=os.path.join(model_dir, 'validation') if
      (save_summary) else None,
      summary_interval=params.trainer.summary_interval if
      (save_summary) else None)
  logging.info('Starts to execute mode: %s', mode)
  with distribution_strategy.scope():
    if mode == 'train':
      controller.train(steps=params.trainer.train_steps)
    elif mode == 'train_and_eval':
      controller.train_and_evaluate(
          train_steps=params.trainer.train_steps,
          eval_steps=params.trainer.validation_steps,
          eval_interval=params.trainer.validation_interval)
    elif mode == 'eval':
      controller.evaluate(steps=params.trainer.validation_steps)
    elif mode == 'continuous_eval':

      def timeout_fn():
        # Stop continuous eval once checkpoints for the final training step
        # have been evaluated.
        if evaluator.global_step.numpy() >= params.trainer.train_steps:
          return True
        return False

      controller.evaluate_continuously(
          steps=params.trainer.validation_steps,
          timeout=params.trainer.continuous_eval_timeout,
          timeout_fn=timeout_fn)
    else:
      raise NotImplementedError('The mode is not implemented: %s' % mode)
  if run_post_eval:
    return model, evaluator.evaluate(
        tf.convert_to_tensor(params.trainer.validation_steps))  # pytype: disable=bad-return-type  # typed-keras
  else:
    return model, {}  # pytype: disable=bad-return-type  # typed-keras
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for multitask.train_lib."""
from absl.testing import parameterized
import tensorflow as tf
from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import strategy_combinations
from official.core import task_factory
from official.modeling.hyperparams import params_dict
from official.modeling.multitask import configs
from official.modeling.multitask import multitask
from official.modeling.multitask import test_utils
from official.modeling.multitask import train_lib
class TrainLibTest(tf.test.TestCase, parameterized.TestCase):
  """End-to-end smoke tests for the multitask train_lib drivers."""

  def setUp(self):
    super().setUp()
    # Minimal trainer config shared by all test cases; merged (non-strict)
    # into the experiment configs built in each test.
    self._test_config = {
        'trainer': {
            'checkpoint_interval': 10,
            'steps_per_loop': 10,
            'summary_interval': 10,
            'train_steps': 10,
            'validation_steps': 5,
            'validation_interval': 10,
            'continuous_eval_timeout': 1,
            'optimizer_config': {
                'optimizer': {
                    'type': 'sgd',
                },
                'learning_rate': {
                    'type': 'constant'
                }
            }
        },
    }

  @combinations.generate(
      combinations.combine(
          distribution_strategy=[
              strategy_combinations.default_strategy,
              strategy_combinations.cloud_tpu_strategy,
              strategy_combinations.one_device_strategy_gpu,
          ],
          mode='eager',
          flag_mode=['train', 'eval', 'train_and_eval']))
  def test_end_to_end(self, distribution_strategy, flag_mode):
    """Runs run_experiment across strategies and modes without crashing."""
    model_dir = self.get_temp_dir()
    experiment_config = configs.MultiTaskExperimentConfig(
        task=configs.MultiTaskConfig(
            task_routines=(
                configs.TaskRoutine(
                    task_name='foo', task_config=test_utils.FooConfig()),
                configs.TaskRoutine(
                    task_name='bar', task_config=test_utils.BarConfig()))))
    experiment_config = params_dict.override_params_dict(
        experiment_config, self._test_config, is_strict=False)
    # Model/task construction must happen under the strategy scope.
    with distribution_strategy.scope():
      test_multitask = multitask.MultiTask.from_config(experiment_config.task)
      model = test_utils.MockMultiTaskModel()
    train_lib.run_experiment(
        distribution_strategy=distribution_strategy,
        task=test_multitask,
        model=model,
        mode=flag_mode,
        params=experiment_config,
        model_dir=model_dir)

  @combinations.generate(
      combinations.combine(
          distribution_strategy=[
              strategy_combinations.default_strategy,
              strategy_combinations.cloud_tpu_strategy,
              strategy_combinations.one_device_strategy_gpu,
          ],
          mode='eager',
          flag_mode=['train', 'eval', 'train_and_eval']))
  def test_end_to_end_multi_eval(self, distribution_strategy, flag_mode):
    """Runs single-task training with multi-task eval end to end."""
    model_dir = self.get_temp_dir()
    experiment_config = configs.MultiEvalExperimentConfig(
        task=test_utils.FooConfig(),
        eval_tasks=(configs.TaskRoutine(
            task_name='foo', task_config=test_utils.FooConfig(), eval_steps=2),
                    configs.TaskRoutine(
                        task_name='bar',
                        task_config=test_utils.BarConfig(),
                        eval_steps=3)))
    experiment_config = params_dict.override_params_dict(
        experiment_config, self._test_config, is_strict=False)
    # Task construction must happen under the strategy scope.
    with distribution_strategy.scope():
      train_task = task_factory.get_task(experiment_config.task)
      eval_tasks = [
          task_factory.get_task(config.task_config, name=config.task_name)
          for config in experiment_config.eval_tasks
      ]
    train_lib.run_experiment_with_multitask_eval(
        distribution_strategy=distribution_strategy,
        train_task=train_task,
        eval_tasks=eval_tasks,
        mode=flag_mode,
        params=experiment_config,
        model_dir=model_dir)
# Run the test suite when executed as a script.
if __name__ == '__main__':
  tf.test.main()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Optimization package definition."""
# pylint: disable=wildcard-import
from official.modeling.optimization.configs.learning_rate_config import *
from official.modeling.optimization.configs.optimization_config import *
from official.modeling.optimization.configs.optimizer_config import *
from official.modeling.optimization.ema_optimizer import ExponentialMovingAverage
from official.modeling.optimization.lr_schedule import *
from official.modeling.optimization.optimizer_factory import OptimizerFactory
from official.modeling.optimization.optimizer_factory import register_optimizer_cls
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Adafactor optimizer.
A new optimizer that will be open sourced soon.
"""
# pylint: disable=invalid-name, represents an unimplemented class definition.
Adafactor = "Unimplemented"
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Dataclasses for learning rate schedule config."""
from typing import List, Optional
import dataclasses
from official.modeling.hyperparams import base_config
@dataclasses.dataclass
class ConstantLrConfig(base_config.Config):
  """Configuration for constant learning rate.

  This class is a container for the constant learning rate decay configs.

  Attributes:
    name: The name of the learning rate schedule. Defaults to Constant.
    learning_rate: A float. The learning rate. Defaults to 0.1.
  """
  name: str = 'Constant'
  learning_rate: float = 0.1
@dataclasses.dataclass
class StepwiseLrConfig(base_config.Config):
  """Configuration for stepwise learning rate decay.

  This class is a container for the piecewise constant learning rate scheduling
  configs. It will configure an instance of PiecewiseConstantDecay keras
  learning rate schedule.

  An example (from keras docs): use a learning rate that's 1.0 for the first
  100001 steps, 0.5 for the next 10000 steps, and 0.1 for any additional steps.

  ```python
  boundaries: [100000, 110000]
  values: [1.0, 0.5, 0.1]
  ```

  Attributes:
    name: The name of the learning rate schedule. Defaults to PiecewiseConstant.
    boundaries: A list of ints of strictly increasing entries. Defaults to None.
    values: A list of floats that specifies the values for the intervals defined
      by `boundaries`. It should have one more element than `boundaries`.
      The learning rate is computed as follows: [0, boundaries[0]] ->
      values[0] [boundaries[0], boundaries[1]] -> values[1]
      [boundaries[n-1], boundaries[n]] -> values[n] [boundaries[n],
      end] -> values[n+1] Defaults to None.
    offset: An int. The offset applied to steps. Defaults to 0.
  """
  name: str = 'PiecewiseConstantDecay'
  boundaries: Optional[List[int]] = None
  values: Optional[List[float]] = None
  offset: int = 0
@dataclasses.dataclass
class ExponentialLrConfig(base_config.Config):
  """Configuration for exponential learning rate decay.

  This class is a container for the exponential learning rate decay configs.

  Attributes:
    name: The name of the learning rate schedule. Defaults to ExponentialDecay.
    initial_learning_rate: A float. The initial learning rate. Defaults to None.
    decay_steps: A positive integer that is used for decay computation. Defaults
      to None.
    decay_rate: A float. Defaults to None.
    staircase: A boolean, if true, learning rate is decreased at discrete
      intervals. Defaults to None.
    offset: An int. The offset applied to steps. Defaults to 0.
  """
  name: str = 'ExponentialDecay'
  initial_learning_rate: Optional[float] = None
  decay_steps: Optional[int] = None
  decay_rate: Optional[float] = None
  staircase: Optional[bool] = None
  offset: int = 0
@dataclasses.dataclass
class PolynomialLrConfig(base_config.Config):
  """Configuration for polynomial learning rate decay.

  This class is a container for the polynomial learning rate decay configs.

  Attributes:
    name: The name of the learning rate schedule. Defaults to PolynomialDecay.
    initial_learning_rate: A float. The initial learning rate. Defaults to None.
    decay_steps: A positive integer that is used for decay computation. Defaults
      to None.
    end_learning_rate: A float. The minimal end learning rate.
    power: A float. The power of the polynomial. Defaults to linear, 1.0.
    cycle: A boolean, whether or not it should cycle beyond decay_steps.
      Defaults to False.
    offset: An int. The offset applied to steps. Defaults to 0.
  """
  name: str = 'PolynomialDecay'
  initial_learning_rate: Optional[float] = None
  decay_steps: Optional[int] = None
  end_learning_rate: float = 0.0001
  power: float = 1.0
  cycle: bool = False
  offset: int = 0
@dataclasses.dataclass
class CosineLrConfig(base_config.Config):
  """Configuration for Cosine learning rate decay.

  This class is a container for the cosine learning rate decay configs,
  tf.keras.experimental.CosineDecay.

  Attributes:
    name: The name of the learning rate schedule. Defaults to CosineDecay.
    initial_learning_rate: A float. The initial learning rate. Defaults to None.
    decay_steps: A positive integer that is used for decay computation. Defaults
      to None.
    alpha: A float. Minimum learning rate value as a fraction of
      initial_learning_rate.
    offset: An int. The offset applied to steps. Defaults to 0.
  """
  name: str = 'CosineDecay'
  initial_learning_rate: Optional[float] = None
  decay_steps: Optional[int] = None
  alpha: float = 0.0
  offset: int = 0
@dataclasses.dataclass
class DirectPowerLrConfig(base_config.Config):
  """Configuration for DirectPower learning rate decay.

  This class configures a schedule that follows lr * (step)^power.

  Attributes:
    name: The name of the learning rate schedule. Defaults to DirectPowerDecay.
    initial_learning_rate: A float. The initial learning rate. Defaults to None.
    power: A float. Defaults to -0.5, for sqrt decay.
  """
  name: str = 'DirectPowerDecay'
  initial_learning_rate: Optional[float] = None
  power: float = -0.5
@dataclasses.dataclass
class PowerAndLinearDecayLrConfig(base_config.Config):
  """Configuration for power-then-linear learning rate decay.

  The schedule has the following behavior.
  Let offset_step = step - offset.

  1) offset_step < 0, the actual learning rate equals initial_learning_rate.
  2) offset_step <= total_decay_steps * (1 - linear_decay_fraction), the
     actual learning rate equals lr * offset_step^power.
  3) total_decay_steps * (1 - linear_decay_fraction) <= offset_step <
     total_decay_steps, the actual learning rate equals lr * offset_step^power *
     (total_decay_steps - offset_step) / (total_decay_steps *
     linear_decay_fraction).
  4) offset_step >= total_decay_steps, the actual learning rate equals zero.

  Attributes:
    name: The name of the learning rate schedule. Defaults to
      PowerAndLinearDecay.
    initial_learning_rate: A float. The initial learning rate. Defaults to None.
    total_decay_steps: An int. The total number of steps for power + linear
      decay. Defaults to None.
    power: A float. The order of the polynomial. Defaults to -0.5, for sqrt
      decay.
    linear_decay_fraction: A float. In the last `linear_decay_fraction` steps,
      the learning rate will be multiplied by a linear decay. Defaults to 0.1.
    offset: An int. The offset applied to steps. Defaults to 0.
  """
  name: str = 'PowerAndLinearDecay'
  initial_learning_rate: Optional[float] = None
  total_decay_steps: Optional[int] = None
  power: float = -0.5
  linear_decay_fraction: float = 0.1
  offset: int = 0
@dataclasses.dataclass
class PowerDecayWithOffsetLrConfig(base_config.Config):
  """Configuration for power learning rate decay with step offset.

  Learning rate equals to `pre_offset_learning_rate` if `step` < `offset`.
  Otherwise, learning rate equals to lr * (step - offset)^power.

  Attributes:
    name: The name of the learning rate schedule. Defaults to
      PowerDecayWithOffset.
    initial_learning_rate: A float. The initial learning rate. Defaults to None.
    power: A float. Defaults to -0.5, for sqrt decay.
    offset: An integer. Power decay happens after `offset` steps.
    pre_offset_learning_rate: A float. The constant learning rate before
      `offset` steps.
  """
  name: str = 'PowerDecayWithOffset'
  initial_learning_rate: Optional[float] = None
  power: float = -0.5
  offset: int = 0
  pre_offset_learning_rate: float = 1.0e6
@dataclasses.dataclass
class StepCosineLrConfig(base_config.Config):
  """Configuration for stepwise cosine learning rate decay.

  This class is a container for the piecewise cosine learning rate scheduling
  configs. It will configure an instance of the StepConsineDecayWithOffset
  keras learning rate schedule (note: the implementation class name carries
  the "Consine" spelling).

  ```python
  boundaries: [100000, 110000]
  values: [1.0, 0.5]
  lr_decayed_fn = (
      lr_schedule.StepConsineDecayWithOffset(
          boundaries,
          values))
  ```

  From step 0 to 100000, it will cosine decay from 1.0 to 0.5.
  From step 100000 to 110000, it will cosine decay from 0.5 to 0.0.

  Attributes:
    name: The name of the learning rate schedule. Defaults to
      StepConsineDecayWithOffset.
    boundaries: A list of ints of strictly increasing entries. Defaults to None.
    values: A list of floats that specifies the values for the intervals defined
      by `boundaries`. It should have one more element than `boundaries`.
      The learning rate is computed as follows:
        [0, boundaries[0]] -> cosine from values[0] to values[1]
        [boundaries[0], boundaries[1]] -> values[1] to values[2]
        ...
        [boundaries[n-1], boundaries[n]] -> values[n] to values[n+1]
        [boundaries[n], end] -> values[n+1] to 0.
    offset: An int. The offset applied to steps. Defaults to 0.
  """
  name: str = 'StepConsineDecayWithOffset'
  boundaries: Optional[List[int]] = None
  values: Optional[List[float]] = None
  offset: int = 0
@dataclasses.dataclass
class LinearWarmupConfig(base_config.Config):
  """Configuration for linear warmup schedule config.

  This class is a container for the linear warmup schedule configs.
  Warmup_learning_rate is the initial learning rate, the final learning rate of
  the warmup period is the learning_rate of the optimizer in use. The learning
  rate at each step linearly increased according to the following formula:
    warmup_learning_rate = warmup_learning_rate +
      step / warmup_steps * (final_learning_rate - warmup_learning_rate).

  Using warmup overrides the learning rate schedule by the number of warmup
  steps.

  Attributes:
    name: The name of warmup schedule. Defaults to linear.
    warmup_learning_rate: Initial learning rate for the warmup. Defaults to 0.
    warmup_steps: Warmup steps. Defaults to None.
  """
  name: str = 'linear'
  warmup_learning_rate: float = 0
  warmup_steps: Optional[int] = None
@dataclasses.dataclass
class PolynomialWarmupConfig(base_config.Config):
  """Configuration for polynomial warmup schedule config.

  This class is a container for the polynomial warmup schedule configs.

  Attributes:
    name: The name of warmup schedule. Defaults to polynomial.
    power: Polynomial power. Defaults to 1.
    warmup_steps: Warmup steps. Defaults to None.
  """
  name: str = 'polynomial'
  power: float = 1
  warmup_steps: Optional[int] = None
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Dataclasses for optimization configs.
This file define the dataclass for optimization configs (OptimizationConfig).
It also has two helper functions get_optimizer_config, and get_lr_config from
an OptimizationConfig class.
"""
from typing import Optional
import dataclasses
from official.modeling.hyperparams import base_config
from official.modeling.hyperparams import oneof
from official.modeling.optimization.configs import learning_rate_config as lr_cfg
from official.modeling.optimization.configs import optimizer_config as opt_cfg
@dataclasses.dataclass
class OptimizerConfig(oneof.OneOfConfig):
  """Configuration for optimizer.

  Attributes:
    type: 'str', type of optimizer to be used, one of the fields below.
    sgd: sgd optimizer config.
    adam: adam optimizer config.
    adamw: adam with weight decay.
    lamb: lamb optimizer.
    rmsprop: rmsprop optimizer.
    lars: lars optimizer.
    adagrad: adagrad optimizer.
    slide: slide optimizer.
    adafactor: adafactor optimizer.
  """
  type: Optional[str] = None
  sgd: opt_cfg.SGDConfig = opt_cfg.SGDConfig()
  adam: opt_cfg.AdamConfig = opt_cfg.AdamConfig()
  adamw: opt_cfg.AdamWeightDecayConfig = opt_cfg.AdamWeightDecayConfig()
  lamb: opt_cfg.LAMBConfig = opt_cfg.LAMBConfig()
  rmsprop: opt_cfg.RMSPropConfig = opt_cfg.RMSPropConfig()
  lars: opt_cfg.LARSConfig = opt_cfg.LARSConfig()
  adagrad: opt_cfg.AdagradConfig = opt_cfg.AdagradConfig()
  slide: opt_cfg.SLIDEConfig = opt_cfg.SLIDEConfig()
  adafactor: opt_cfg.AdafactorConfig = opt_cfg.AdafactorConfig()
@dataclasses.dataclass
class LrConfig(oneof.OneOfConfig):
  """Configuration for lr schedule.

  Attributes:
    type: 'str', type of lr schedule to be used, one of the fields below.
    constant: constant learning rate config.
    stepwise: stepwise learning rate config.
    exponential: exponential learning rate config.
    polynomial: polynomial learning rate config.
    cosine: cosine learning rate config.
    power: step^power learning rate config.
    power_linear: learning rate config of step^power followed by
      step^power*linear.
    power_with_offset: power decay with a step offset.
    step_cosine_with_offset: step cosine with a step offset.
  """
  type: Optional[str] = None
  constant: lr_cfg.ConstantLrConfig = lr_cfg.ConstantLrConfig()
  stepwise: lr_cfg.StepwiseLrConfig = lr_cfg.StepwiseLrConfig()
  exponential: lr_cfg.ExponentialLrConfig = lr_cfg.ExponentialLrConfig()
  polynomial: lr_cfg.PolynomialLrConfig = lr_cfg.PolynomialLrConfig()
  cosine: lr_cfg.CosineLrConfig = lr_cfg.CosineLrConfig()
  power: lr_cfg.DirectPowerLrConfig = lr_cfg.DirectPowerLrConfig()
  power_linear: lr_cfg.PowerAndLinearDecayLrConfig = (
      lr_cfg.PowerAndLinearDecayLrConfig())
  power_with_offset: lr_cfg.PowerDecayWithOffsetLrConfig = (
      lr_cfg.PowerDecayWithOffsetLrConfig())
  step_cosine_with_offset: lr_cfg.StepCosineLrConfig = (
      lr_cfg.StepCosineLrConfig())
@dataclasses.dataclass
class WarmupConfig(oneof.OneOfConfig):
  """Configuration for warmup schedule.

  Attributes:
    type: 'str', type of warmup schedule to be used, one of the fields below.
    linear: linear warmup config.
    polynomial: polynomial warmup config.
  """
  type: Optional[str] = None
  linear: lr_cfg.LinearWarmupConfig = lr_cfg.LinearWarmupConfig()
  polynomial: lr_cfg.PolynomialWarmupConfig = lr_cfg.PolynomialWarmupConfig()
@dataclasses.dataclass
class OptimizationConfig(base_config.Config):
  """Configuration for optimizer and learning rate schedule.

  Attributes:
    optimizer: optimizer oneof config.
    ema: optional exponential moving average optimizer config. If specified,
      the ema optimizer will be used.
    learning_rate: learning rate oneof config.
    warmup: warmup oneof config.
  """
  optimizer: OptimizerConfig = OptimizerConfig()
  ema: Optional[opt_cfg.EMAConfig] = None
  learning_rate: LrConfig = LrConfig()
  warmup: WarmupConfig = WarmupConfig()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for optimization_config.py."""
import tensorflow as tf
from official.modeling.optimization.configs import learning_rate_config as lr_cfg
from official.modeling.optimization.configs import optimization_config
from official.modeling.optimization.configs import optimizer_config as opt_cfg
class OptimizerConfigTest(tf.test.TestCase):
  """Tests for the OptimizationConfig oneof sub-configs."""

  def test_no_optimizer(self):
    # An empty config leaves the oneof `type` unset, so get() returns None.
    optimizer = optimization_config.OptimizationConfig({}).optimizer.get()
    self.assertIsNone(optimizer)

  def test_no_lr_schedule(self):
    lr = optimization_config.OptimizationConfig({}).learning_rate.get()
    self.assertIsNone(lr)

  def test_no_warmup_schedule(self):
    warmup = optimization_config.OptimizationConfig({}).warmup.get()
    self.assertIsNone(warmup)

  def test_config(self):
    # Selecting a `type` makes get() return the matching default sub-config.
    opt_config = optimization_config.OptimizationConfig({
        'optimizer': {
            'type': 'sgd',
            'sgd': {}  # default config
        },
        'learning_rate': {
            'type': 'polynomial',
            'polynomial': {}
        },
        'warmup': {
            'type': 'linear'
        }
    })
    self.assertEqual(opt_config.optimizer.get(), opt_cfg.SGDConfig())
    self.assertEqual(opt_config.learning_rate.get(),
                     lr_cfg.PolynomialLrConfig())
    self.assertEqual(opt_config.warmup.get(), lr_cfg.LinearWarmupConfig())
# Run the test suite when executed as a script.
if __name__ == '__main__':
  tf.test.main()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Dataclasses for optimizer configs."""
from typing import List, Optional
import dataclasses
from official.modeling.hyperparams import base_config
@dataclasses.dataclass
class BaseOptimizerConfig(base_config.Config):
  """Base optimizer config.

  Holds the gradient-clipping options shared by every optimizer config in
  this module.

  Attributes:
    clipnorm: float >= 0 or None. If not None, gradients will be clipped when
      their L2 norm exceeds this value.
    clipvalue: float >= 0 or None. If not None, gradients will be clipped when
      their absolute value exceeds this value.
    global_clipnorm: float >= 0 or None. If not None, gradient of all weights
      is clipped so that their global norm is no higher than this value.
  """
  # All three default to None, i.e. no gradient clipping is applied.
  clipnorm: Optional[float] = None
  clipvalue: Optional[float] = None
  global_clipnorm: Optional[float] = None
@dataclasses.dataclass
class SGDConfig(BaseOptimizerConfig):
  """Configuration for SGD optimizer.

  The attributes of this class match the arguments of tf.keras.optimizer.SGD.

  Attributes:
    name: name of the optimizer.
    decay: decay rate for SGD optimizer.
    nesterov: whether to apply Nesterov momentum.
    momentum: momentum for SGD optimizer.
  """
  name: str = "SGD"
  decay: float = 0.0
  nesterov: bool = False
  momentum: float = 0.0
@dataclasses.dataclass
class RMSPropConfig(BaseOptimizerConfig):
  """Configuration for RMSProp optimizer.

  The attributes of this class match the arguments of
  tf.keras.optimizers.RMSprop.

  Attributes:
    name: name of the optimizer.
    rho: discounting factor for RMSprop optimizer.
    momentum: momentum for RMSprop optimizer.
    epsilon: epsilon value for RMSprop optimizer, helps with numerical
      stability.
    centered: whether to normalize gradients by an estimate of their variance
      (centered RMSprop) or by the uncentered second moment.
  """
  name: str = "RMSprop"
  rho: float = 0.9
  momentum: float = 0.0
  epsilon: float = 1e-7
  centered: bool = False
@dataclasses.dataclass
class AdagradConfig(BaseOptimizerConfig):
  """Configuration for Adagrad optimizer.

  The attributes of this class match the arguments of
  tf.keras.optimizer.Adagrad.

  Attributes:
    name: name of the optimizer.
    initial_accumulator_value: A floating point value. Starting value for the
      accumulators, must be non-negative.
    epsilon: A small floating point value to avoid zero denominator.
  """
  name: str = "Adagrad"
  initial_accumulator_value: float = 0.1
  epsilon: float = 1e-07
@dataclasses.dataclass
class AdamConfig(BaseOptimizerConfig):
  """Configuration for Adam optimizer.

  The attributes of this class match the arguments of
  tf.keras.optimizer.Adam.

  Attributes:
    name: name of the optimizer.
    beta_1: decay rate for 1st order moments.
    beta_2: decay rate for 2nd order moments.
    epsilon: epsilon value used for numerical stability in Adam optimizer.
    amsgrad: boolean. Whether to apply AMSGrad variant of this algorithm from
      the paper "On the Convergence of Adam and beyond".
  """
  name: str = "Adam"
  beta_1: float = 0.9
  beta_2: float = 0.999
  epsilon: float = 1e-07
  amsgrad: bool = False
@dataclasses.dataclass
class AdamWeightDecayConfig(BaseOptimizerConfig):
  """Configuration for Adam optimizer with weight decay.

  Attributes:
    name: name of the optimizer.
    beta_1: decay rate for 1st order moments.
    beta_2: decay rate for 2nd order moments.
    epsilon: epsilon value used for numerical stability in the optimizer.
    amsgrad: boolean. Whether to apply AMSGrad variant of this algorithm from
      the paper "On the Convergence of Adam and beyond".
    weight_decay_rate: float. Weight decay rate. Default to 0.
    include_in_weight_decay: list[str], or None. List of weight names to
      include in weight decay.
    exclude_from_weight_decay: list[str], or None. List of weight names to not
      include in weight decay.
    gradient_clip_norm: A positive float. Clips the gradients to this maximum
      L2-norm. Default to 1.0.
  """
  name: str = "AdamWeightDecay"
  beta_1: float = 0.9
  beta_2: float = 0.999
  epsilon: float = 1e-07
  amsgrad: bool = False
  weight_decay_rate: float = 0.0
  include_in_weight_decay: Optional[List[str]] = None
  exclude_from_weight_decay: Optional[List[str]] = None
  gradient_clip_norm: float = 1.0
@dataclasses.dataclass
class LAMBConfig(BaseOptimizerConfig):
  """Configuration for LAMB optimizer.

  The attributes of this class match the arguments of
  tensorflow_addons.optimizers.LAMB.

  Attributes:
    name: name of the optimizer.
    beta_1: decay rate for 1st order moments.
    beta_2: decay rate for 2nd order moments.
    epsilon: epsilon value used for numerical stability in LAMB optimizer.
    weight_decay_rate: float. Weight decay rate. Default to 0.
    exclude_from_weight_decay: List of regex patterns of variables excluded
      from weight decay. Variables whose name contain a substring matching the
      pattern will be excluded.
    exclude_from_layer_adaptation: List of regex patterns of variables
      excluded from layer adaptation. Variables whose name contain a substring
      matching the pattern will be excluded.
  """
  name: str = "LAMB"
  beta_1: float = 0.9
  beta_2: float = 0.999
  epsilon: float = 1e-6
  weight_decay_rate: float = 0.0
  exclude_from_weight_decay: Optional[List[str]] = None
  exclude_from_layer_adaptation: Optional[List[str]] = None
@dataclasses.dataclass
class EMAConfig(BaseOptimizerConfig):
  """Exponential moving average optimizer config.

  Mirrors the constructor arguments of the `ExponentialMovingAverage`
  optimizer wrapper defined elsewhere in this package.

  Attributes:
    name: 'str', name of the optimizer.
    trainable_weights_only: 'bool', if True, only model trainable weights will
      be updated. Otherwise, all model weights will be updated. This mainly
      affects batch normalization parameters.
    average_decay: 'float', average decay value.
    start_step: 'int', start step to apply moving average.
    dynamic_decay: 'bool', whether to apply dynamic decay or not.
  """
  name: str = "ExponentialMovingAverage"
  trainable_weights_only: bool = True
  average_decay: float = 0.99
  start_step: int = 0
  dynamic_decay: bool = True
@dataclasses.dataclass
class LARSConfig(BaseOptimizerConfig):
  """Layer-wise adaptive rate scaling config.

  Mirrors the constructor arguments of the `LARS` optimizer defined elsewhere
  in this package.

  Attributes:
    name: 'str', name of the optimizer.
    momentum: `float` hyperparameter >= 0 that accelerates gradient descent in
      the relevant direction and dampens oscillations. Defaults to 0.9.
    eeta: `float` LARS coefficient as used in the paper. Default set to LARS
      coefficient from the paper. (eeta / weight_decay) determines the highest
      scaling factor in LARS.
    weight_decay_rate: `float` for weight decay.
    nesterov: 'boolean' for whether to use nesterov momentum.
    classic_momentum: `boolean` for whether to use classic (or popular)
      momentum. The learning rate is applied during momentum update in classic
      momentum, but after momentum for popular momentum.
    exclude_from_weight_decay: A list of `string` for variable screening, if
      any of the string appears in a variable's name, the variable will be
      excluded for computing weight decay. For example, one could specify the
      list like ['batch_normalization', 'bias'] to exclude BN and bias from
      weight decay.
    exclude_from_layer_adaptation: Similar to exclude_from_weight_decay, but
      for layer adaptation. If it is None, it will be defaulted the same as
      exclude_from_weight_decay.
  """
  name: str = "LARS"
  momentum: float = 0.9
  eeta: float = 0.001
  weight_decay_rate: float = 0.0
  nesterov: bool = False
  classic_momentum: bool = True
  exclude_from_weight_decay: Optional[List[str]] = None
  exclude_from_layer_adaptation: Optional[List[str]] = None
@dataclasses.dataclass
class SLIDEConfig(BaseOptimizerConfig):
  """Configuration for SLIDE optimizer.

  Details coming soon.

  NOTE(review): field semantics are not documented here; the beta/epsilon/
  weight-decay fields appear to mirror the LAMB-style hyperparameters above —
  confirm against the SLIDE optimizer implementation before relying on them.
  """
  name: str = "SLIDE"
  beta_1: float = 0.9
  beta_2: float = 0.999
  epsilon: float = 1e-6
  weight_decay_rate: float = 0.0
  weight_decay_type: str = "inner"
  exclude_from_weight_decay: Optional[List[str]] = None
  exclude_from_layer_adaptation: Optional[List[str]] = None
  include_in_sparse_layer_adaptation: Optional[List[str]] = None
  sparse_layer_learning_rate: float = 0.1
  do_gradient_rescaling: bool = True
  norm_type: str = "layer"
  ratio_clip_norm: float = 1e5
@dataclasses.dataclass
class AdafactorConfig(BaseOptimizerConfig):
  """Configuration for Adafactor optimizer.

  The attributes of this class match the arguments of the Adafactor
  implementation. NOTE(review): the backing implementation is not visible
  here; verify field meanings against it.
  """
  name: str = "Adafactor"
  factored: bool = True
  multiply_by_parameter_scale: bool = True
  beta1: Optional[float] = None
  decay_rate: float = 0.8
  step_offset: int = 0
  clipping_threshold: float = 1.0
  min_dim_size_to_factor: int = 128
  epsilon1: float = 1e-30
  epsilon2: float = 1e-3
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Exponential moving average optimizer."""
from typing import List, Optional, Text
import tensorflow as tf
# pylint: disable=protected-access
class ExponentialMovingAverage(tf.keras.optimizers.Optimizer):
  """Optimizer that computes an exponential moving average of the variables.

  Empirically it has been found that using the moving average of the trained
  parameters of a deep network is better than using its trained parameters
  directly. This optimizer allows you to compute this moving average and swap
  the variables at save time so that any code outside of the training loop
  will use by default the average values instead of the original ones.

  Example of usage for training:
  ```python
  opt = tf.keras.optimizers.SGD(learning_rate)
  opt = ExponentialMovingAverage(opt)
  opt.shadow_copy(model)
  ```

  At test time, swap the shadow variables to evaluate on the averaged weights:
  ```python
  opt.swap_weights()
  # Test eval the model here
  opt.swap_weights()
  ```
  """

  def __init__(self,
               optimizer: tf.keras.optimizers.Optimizer,
               trainable_weights_only: bool = True,
               average_decay: float = 0.99,
               start_step: int = 0,
               dynamic_decay: bool = True,
               name: Text = 'ExponentialMovingAverage',
               **kwargs):
    """Construct a new ExponentialMovingAverage optimizer.

    Args:
      optimizer: `tf.keras.optimizers.Optimizer` that will be
        used to compute and apply gradients.
      trainable_weights_only: 'bool', if True, only model trainable weights
        will be updated. Otherwise, all model weights will be updated. This
        mainly affects batch normalization parameters.
      average_decay: float. Decay to use to maintain the moving averages
        of trained variables.
      start_step: int. What step to start the moving average.
      dynamic_decay: bool. Whether to change the decay based on the number
        of optimizer updates. Decay will start at 0.1 and gradually increase
        up to `average_decay` after each optimizer update. This behavior is
        similar to `tf.train.ExponentialMovingAverage` in TF 1.x.
      name: Optional name for the operations created when applying
        gradients. Defaults to "moving_average".
      **kwargs: keyword arguments. Allowed to be {`clipnorm`,
        `clipvalue`, `lr`, `decay`}.
    """
    super().__init__(name, **kwargs)
    self._average_decay = average_decay
    self._trainable_weights_only = trainable_weights_only
    # Stored as a float32 constant so it can be compared against the cast
    # step inside the tf.function `update_average`.
    # NOTE(review): get_config() serializes this tensor as-is — confirm that
    # round-tripping through from_config() is exercised anywhere.
    self._start_step = tf.constant(start_step, tf.float32)
    self._dynamic_decay = dynamic_decay
    self._optimizer = optimizer
    # Track the wrapped optimizer so its state is checkpointed with ours.
    self._track_trackable(self._optimizer, 'base_optimizer')
    # Populated by shadow_copy(); None until then (see has_shadow_copy).
    self._average_weights = None
    self._model_weights = None

  def shadow_copy(self, model: tf.keras.Model):
    """Creates shadow variables for the given model weights."""
    if self._trainable_weights_only:
      self._model_weights = model.trainable_variables
    else:
      self._model_weights = model.variables
    # One zero-initialized 'average' slot per tracked model weight.
    for var in self._model_weights:
      self.add_slot(var, 'average', initializer='zeros')
    self._average_weights = [
        self.get_slot(var, 'average') for var in self._model_weights
    ]

  @property
  def has_shadow_copy(self):
    """Whether this optimizer has created shadow variables."""
    return self._model_weights is not None and self._average_weights is not None

  def _create_slots(self, var_list):
    # Delegate slot creation to the wrapped optimizer.
    self._optimizer._create_slots(var_list=var_list)  # pylint: disable=protected-access

  def apply_gradients(self, grads_and_vars, name: Optional[Text] = None):
    """Applies gradients via the wrapped optimizer, then updates averages."""
    result = self._optimizer.apply_gradients(grads_and_vars, name)
    self.update_average(self.iterations)
    return result

  @tf.function
  def update_average(self, step: tf.Tensor):
    """Folds current model weights into the moving averages for this step."""
    step = tf.cast(step, tf.float32)
    if step < self._start_step:
      # Before start_step: decay 0 means the average tracks the raw weights.
      decay = tf.constant(0., tf.float32)
    elif self._dynamic_decay:
      # Ramp decay from ~0.1 up toward `average_decay` as training proceeds.
      decay = step - self._start_step
      decay = tf.minimum(self._average_decay, (1. + decay) / (10. + decay))
    else:
      decay = self._average_decay

    def _apply_moving(v_moving, v_normal):
      # v_moving <- decay * v_moving + (1 - decay) * v_normal, expressed as
      # an in-place subtraction of the scaled difference.
      diff = v_moving - v_normal
      v_moving.assign_sub(tf.cast(1. - decay, v_moving.dtype) * diff)
      return v_moving

    def _update(strategy, v_moving_and_v_normal):
      for v_moving, v_normal in v_moving_and_v_normal:
        strategy.extended.update(v_moving, _apply_moving, args=(v_normal,))

    # Update the averages once per step across replicas via merge_call.
    ctx = tf.distribute.get_replica_context()
    return ctx.merge_call(_update, args=(zip(self._average_weights,
                                             self._model_weights),))

  def swap_weights(self):
    """Swap the average and moving weights.

    This is a convenience method to allow one to evaluate the averaged weights
    at test time. Loads the weights stored in `self._average` into the model,
    keeping a copy of the original model weights. Swapping twice will return
    the original weights.
    """
    if tf.distribute.in_cross_replica_context():
      strategy = tf.distribute.get_strategy()
      strategy.run(self._swap_weights, args=())
    else:
      raise ValueError('Swapping weights must occur under a '
                       'tf.distribute.Strategy')

  @tf.function
  def _swap_weights(self):
    # Classic three-step in-place swap (a, b) -> (b, a) without a temporary.
    def fn_0(a, b):
      a.assign_add(b)
      return a
    def fn_1(b, a):
      b.assign(a - b)
      return b
    def fn_2(a, b):
      a.assign_sub(b)
      return a
    def swap(strategy, a_and_b):
      """Swap `a` and `b` and mirror to all devices."""
      for a, b in a_and_b:
        strategy.extended.update(a, fn_0, args=(b,))  # a = a + b
        strategy.extended.update(b, fn_1, args=(a,))  # b = a - b
        strategy.extended.update(a, fn_2, args=(b,))  # a = a - b
    ctx = tf.distribute.get_replica_context()
    return ctx.merge_call(
        swap, args=(zip(self._average_weights, self._model_weights),))

  def assign_average_vars(self, var_list: List[tf.Variable]):
    """Assign variables in var_list with their respective averages.

    Args:
      var_list: List of model variables to be assigned to their average.

    Returns:
      assign_op: The op corresponding to the assignment operation of
        variables to their average.
    """
    # Only trainable variables are assigned; non-trainable ones are skipped.
    assign_op = tf.group([
        var.assign(self.get_slot(var, 'average')) for var in var_list
        if var.trainable
    ])
    return assign_op

  def _create_hypers(self):
    # Delegate hyperparameter creation to the wrapped optimizer.
    self._optimizer._create_hypers()  # pylint: disable=protected-access

  def _prepare(self, var_list):
    return self._optimizer._prepare(var_list=var_list)  # pylint: disable=protected-access

  @property
  def iterations(self):
    # The wrapped optimizer owns the step counter.
    return self._optimizer.iterations

  @iterations.setter
  def iterations(self, variable):
    self._optimizer.iterations = variable

  @property
  def weights(self):
    # return self._weights + self._optimizer.weights
    return self._optimizer.weights

  def variables(self):
    # NOTE(review): uses base-class `self._weights` (slot variables) plus the
    # wrapped optimizer's iteration counter — unlike `weights` above, which
    # delegates entirely; confirm this asymmetry is intentional.
    return self._weights + [self.iterations]

  @property
  def lr(self):
    return self._optimizer._get_hyper('learning_rate')

  @lr.setter
  def lr(self, lr):
    self._optimizer._set_hyper('learning_rate', lr)

  @property
  def learning_rate(self):
    return self._optimizer._get_hyper('learning_rate')

  @learning_rate.setter
  def learning_rate(self, learning_rate):  # pylint: disable=redefined-outer-name
    self._optimizer._set_hyper('learning_rate', learning_rate)

  def _resource_apply_dense(self, grad, var):
    # Gradient application is fully delegated; this wrapper only maintains
    # the moving averages in apply_gradients/update_average.
    return self._optimizer._resource_apply_dense(grad, var)

  def _resource_apply_sparse(self, grad, var, indices):
    return self._optimizer._resource_apply_sparse(grad, var, indices)

  def _resource_apply_sparse_duplicate_indices(self, grad, var, indices):
    return self._optimizer._resource_apply_sparse_duplicate_indices(
        grad, var, indices)

  def get_config(self):
    """Serializes the wrapper plus the wrapped optimizer's config."""
    config = {
        'optimizer': tf.keras.optimizers.serialize(self._optimizer),
        'average_decay': self._average_decay,
        'start_step': self._start_step,
        'dynamic_decay': self._dynamic_decay,
    }
    base_config = super(ExponentialMovingAverage, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  @classmethod
  def from_config(cls, config, custom_objects=None):
    """Rebuilds the wrapper, deserializing the inner optimizer first."""
    optimizer = tf.keras.optimizers.deserialize(
        config.pop('optimizer'),
        custom_objects=custom_objects,
    )
    return cls(optimizer, **config)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Layer-wise adaptive rate scaling optimizer."""
import re
from typing import Text, List, Optional
import tensorflow as tf
# pylint: disable=protected-access
class LARS(tf.keras.optimizers.Optimizer):
  """Layer-wise Adaptive Rate Scaling for large batch training.

  Introduced by "Large Batch Training of Convolutional Networks" by Y. You,
  I. Gitman, and B. Ginsburg. (https://arxiv.org/abs/1708.03888)
  """

  def __init__(self,
               learning_rate: float = 0.01,
               momentum: float = 0.9,
               weight_decay_rate: float = 0.0,
               eeta: float = 0.001,
               nesterov: bool = False,
               classic_momentum: bool = True,
               exclude_from_weight_decay: Optional[List[Text]] = None,
               exclude_from_layer_adaptation: Optional[List[Text]] = None,
               name: Text = "LARS",
               **kwargs):
    """Constructs a LARSOptimizer.

    Args:
      learning_rate: `float` for learning rate. Defaults to 0.01.
      momentum: `float` hyperparameter >= 0 that accelerates gradient descent
        in the relevant direction and dampens oscillations. Defaults to 0.9.
      weight_decay_rate: `float` for weight decay.
      eeta: `float` LARS coefficient as used in the paper. Default set to LARS
        coefficient from the paper. (eeta / weight_decay) determines the
        highest scaling factor in LARS.
      nesterov: 'boolean' for whether to use nesterov momentum.
      classic_momentum: `boolean` for whether to use classic (or popular)
        momentum. The learning rate is applied during momentum update in
        classic momentum, but after momentum for popular momentum.
      exclude_from_weight_decay: A list of `string` for variable screening, if
        any of the string appears in a variable's name, the variable will be
        excluded for computing weight decay. For example, one could specify
        the list like ['batch_normalization', 'bias'] to exclude BN and bias
        from weight decay.
      exclude_from_layer_adaptation: Similar to exclude_from_weight_decay, but
        for layer adaptation. If it is None, it will be defaulted the same as
        exclude_from_weight_decay.
      name: `Text` as optional name for the operations created when applying
        gradients. Defaults to "LARS".
      **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`,
        `lr`, `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is
        clip gradients by value, `decay` is included for backward
        compatibility to allow time inverse decay of learning rate. `lr` is
        included for backward compatibility, recommended to use
        `learning_rate` instead.
    """
    super(LARS, self).__init__(name, **kwargs)
    self._set_hyper("learning_rate", learning_rate)
    self._set_hyper("decay", self._initial_decay)
    self.momentum = momentum
    self.weight_decay_rate = weight_decay_rate
    self.eeta = eeta
    self.nesterov = nesterov
    self.classic_momentum = classic_momentum
    self.exclude_from_weight_decay = exclude_from_weight_decay
    # exclude_from_layer_adaptation is set to exclude_from_weight_decay if the
    # arg is None.
    if exclude_from_layer_adaptation:
      self.exclude_from_layer_adaptation = exclude_from_layer_adaptation
    else:
      self.exclude_from_layer_adaptation = exclude_from_weight_decay

  def _create_slots(self, var_list):
    # One "momentum" accumulator per variable.
    for v in var_list:
      self.add_slot(v, "momentum")

  def _resource_apply_dense(self, grad, param, apply_state=None):
    """Applies one LARS update step to a single dense variable."""
    if grad is None or param is None:
      return tf.no_op()

    var_device, var_dtype = param.device, param.dtype.base_dtype
    coefficients = ((apply_state or {}).get((var_device, var_dtype)) or
                    self._fallback_apply_state(var_device, var_dtype))
    learning_rate = coefficients["lr_t"]

    param_name = param.name

    v = self.get_slot(param, "momentum")

    # L2 weight decay is folded into the gradient before the momentum update.
    if self._use_weight_decay(param_name):
      grad += self.weight_decay_rate * param

    if self.classic_momentum:
      # Classic momentum: trust ratio scales the LR *before* the momentum
      # accumulator update.
      trust_ratio = 1.0
      if self._do_layer_adaptation(param_name):
        w_norm = tf.norm(param, ord=2)
        g_norm = tf.norm(grad, ord=2)
        # trust_ratio = eeta * ||w|| / ||g||, guarded to 1.0 when either
        # norm is zero.
        trust_ratio = tf.where(
            tf.greater(w_norm, 0),
            tf.where(tf.greater(g_norm, 0), (self.eeta * w_norm / g_norm), 1.0),
            1.0)
      scaled_lr = learning_rate * trust_ratio

      next_v = tf.multiply(self.momentum, v) + scaled_lr * grad
      if self.nesterov:
        update = tf.multiply(self.momentum, next_v) + scaled_lr * grad
      else:
        update = next_v
      next_param = param - update
    else:
      # Popular momentum: trust ratio scales the LR *after* the momentum
      # accumulator update, using the norm of the update itself.
      next_v = tf.multiply(self.momentum, v) + grad
      if self.nesterov:
        update = tf.multiply(self.momentum, next_v) + grad
      else:
        update = next_v

      trust_ratio = 1.0
      if self._do_layer_adaptation(param_name):
        w_norm = tf.norm(param, ord=2)
        v_norm = tf.norm(update, ord=2)
        trust_ratio = tf.where(
            tf.greater(w_norm, 0),
            tf.where(tf.greater(v_norm, 0), (self.eeta * w_norm / v_norm), 1.0),
            1.0)
      scaled_lr = trust_ratio * learning_rate
      next_param = param - scaled_lr * update

    # Write back the new parameter value and momentum accumulator.
    return tf.group(*[
        param.assign(next_param, use_locking=False),
        v.assign(next_v, use_locking=False)
    ])

  def _resource_apply_sparse(self, grad, handle, indices, apply_state):
    raise NotImplementedError("Applying sparse gradients is not implemented.")

  def _use_weight_decay(self, param_name):
    """Whether to use L2 weight decay for `param_name`."""
    if not self.weight_decay_rate:
      return False
    if self.exclude_from_weight_decay:
      for r in self.exclude_from_weight_decay:
        if re.search(r, param_name) is not None:
          return False
    return True

  def _do_layer_adaptation(self, param_name):
    """Whether to do layer-wise learning rate adaptation for `param_name`."""
    if self.exclude_from_layer_adaptation:
      for r in self.exclude_from_layer_adaptation:
        if re.search(r, param_name) is not None:
          return False
    return True

  def get_config(self):
    """Serializes the optimizer hyperparameters.

    NOTE(review): exclude_from_weight_decay/exclude_from_layer_adaptation are
    not serialized here, so they are lost on a get_config/from_config
    round trip — confirm whether that is intentional.
    """
    config = super(LARS, self).get_config()
    config.update({
        "learning_rate": self._serialize_hyperparameter("learning_rate"),
        "decay": self._serialize_hyperparameter("decay"),
        "momentum": self.momentum,
        "classic_momentum": self.classic_momentum,
        "weight_decay_rate": self.weight_decay_rate,
        "eeta": self.eeta,
        "nesterov": self.nesterov,
    })
    return config

  @classmethod
  def from_config(cls, config, custom_objects=None):
    return cls(**config)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Learning rate schedule classes."""
import math
from typing import Mapping, Any, Union, Optional
import tensorflow as tf
def _make_offset_wrapper(new_class_name: str, base_lr_class):
  """Generates an offset wrapper of a learning rate schedule.

  It returns a subclass of the `base_lr_class`; the subclass takes an
  `offset` argument in the constructor. When the new class instance is called,
  the behavior is:
    new_class_object(step) = base_lr_class_object(step - offset)

  Example:
  CosineDecayWithOffset = _make_offset_wrapper(
      'CosineDecayWithOffset', tf.keras.experimental.CosineDecay)
  # Use the lr:
  lr = CosineDecayWithOffset(offset=100, initial_learning_rate=0.1,
                             decay_steps=1000)
  lr(101)  # equals to tf.keras.experimental.CosineDecay(...)(101-100)

  Args:
    new_class_name: the name of the new class.
    base_lr_class: the base learning rate schedule class. Should be subclass
      of tf.keras.optimizers.schedules.LearningRateSchedule.

  Returns:
    A new class (subclass of the base_lr_class) that can take an offset.
  """
  assert issubclass(base_lr_class,
                    tf.keras.optimizers.schedules.LearningRateSchedule), (
                        "base_lr_class should be subclass of keras "
                        f"LearningRateSchedule, got {base_lr_class}")

  # pylint: disable=protected-access,pointless-statement
  def offset_learning_rate_init(self, offset=0, **kwargs):
    """Construct learning rate schedule object.

    When this object is called, its behavior is
      self.__call__(step) == base_lr_class.__call__(step - offset)

    Args:
      self: this object.
      offset: The offset when computing the learning rate schedule.
      **kwargs: Pass through to base learning rate class constructor.
    """
    # Explicit base-class __init__ call because these functions become
    # methods of the dynamically created subclass below.
    base_lr_class.__init__(self, **kwargs)
    self._offset = offset

  def offset_learning_rate_call(self, step):
    # Shift the step before delegating to the base schedule.
    step = tf.cast(step - self._offset, tf.float32)
    return base_lr_class.__call__(self, step)

  # pylint: enable=protected-access,pointless-statement
  # Synthesize the subclass dynamically so one helper serves every base
  # schedule class.
  return type(
      new_class_name, (base_lr_class,), {
          "base_lr_class": base_lr_class,
          "__init__": offset_learning_rate_init,
          "__call__": offset_learning_rate_call
      })
# Offset-aware variants of the standard Keras schedules, generated once at
# import time.
PiecewiseConstantDecayWithOffset = _make_offset_wrapper(
    "PiecewiseConstantDecayWithOffset",
    tf.keras.optimizers.schedules.PiecewiseConstantDecay)
PolynomialDecayWithOffset = _make_offset_wrapper(
    "PolynomialDecayWithOffset", tf.keras.optimizers.schedules.PolynomialDecay)
ExponentialDecayWithOffset = _make_offset_wrapper(
    "ExponentialDecayWithOffset",
    tf.keras.optimizers.schedules.ExponentialDecay)
CosineDecayWithOffset = _make_offset_wrapper("CosineDecayWithOffset",
                                             tf.keras.experimental.CosineDecay)
class LinearWarmup(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Linear warmup schedule."""

  def __init__(self,
               after_warmup_lr_sched: Union[
                   tf.keras.optimizers.schedules.LearningRateSchedule, float],
               warmup_steps: int,
               warmup_learning_rate: float,
               name: Optional[str] = None):
    """Add linear warmup schedule to a learning rate schedule.

    warmup_lr is the initial learning rate, the final learning rate of the
    init_warmup period is the initial learning rate of lr_schedule in use.
    The learning rate at each step linearly increased according to the
    following formula:
      learning_rate = warmup_lr + step / warmup_steps
                      * (final_warmup_lr - warmup_lr).
    Using warmup overrides the learning rate schedule by the number of warmup
    steps.

    Args:
      after_warmup_lr_sched: tf.keras.optimizers.schedules
        .LearningRateSchedule or a constant.
      warmup_steps: Number of the warmup steps.
      warmup_learning_rate: Initial learning rate for the warmup.
      name: Optional, name of warmup schedule.
    """
    super().__init__()
    self._name = name
    self._after_warmup_lr_sched = after_warmup_lr_sched
    self._warmup_steps = warmup_steps
    self._init_warmup_lr = warmup_learning_rate
    # The ramp ends at whatever value the wrapped schedule (or constant)
    # takes at `warmup_steps`, so the hand-off is continuous.
    if isinstance(after_warmup_lr_sched,
                  tf.keras.optimizers.schedules.LearningRateSchedule):
      self._final_warmup_lr = after_warmup_lr_sched(warmup_steps)
    else:
      self._final_warmup_lr = tf.cast(after_warmup_lr_sched, dtype=tf.float32)

  def __call__(self, step: int):
    """Returns the learning rate for `step`."""
    global_step = tf.cast(step, dtype=tf.float32)

    # Linearly interpolate between the initial and final warmup rates.
    warmup_span = self._final_warmup_lr - self._init_warmup_lr
    linear_warmup_lr = (
        self._init_warmup_lr + global_step / self._warmup_steps * warmup_span)

    # Value the wrapped schedule (or constant) would produce at this step.
    if isinstance(self._after_warmup_lr_sched,
                  tf.keras.optimizers.schedules.LearningRateSchedule):
      after_warmup_lr = self._after_warmup_lr_sched(step)
    else:
      after_warmup_lr = tf.cast(self._after_warmup_lr_sched, dtype=tf.float32)

    # Graph-safe branch between warmup and post-warmup regimes.
    return tf.cond(global_step < self._warmup_steps,
                   lambda: linear_warmup_lr,
                   lambda: after_warmup_lr)

  def get_config(self) -> Mapping[str, Any]:
    """Serializes the schedule configuration."""
    sched = self._after_warmup_lr_sched
    if isinstance(sched, tf.keras.optimizers.schedules.LearningRateSchedule):
      serialized_sched = sched.get_config()  # pytype: disable=attribute-error
    else:
      serialized_sched = sched  # pytype: disable=attribute-error
    return {
        "after_warmup_lr_sched": serialized_sched,
        "warmup_steps": self._warmup_steps,
        "warmup_learning_rate": self._init_warmup_lr,
        "name": self._name,
    }
class PolynomialWarmUp(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Applies polynomial warmup schedule on a given learning rate decay schedule."""

  def __init__(self,
               after_warmup_lr_sched: Union[
                   tf.keras.optimizers.schedules.LearningRateSchedule, float],
               warmup_steps: int,
               power: float = 1.0,
               name: str = "PolynomialWarmup"):
    """Initializes the polynomial warmup wrapper.

    Args:
      after_warmup_lr_sched: schedule (or constant) used after warmup; its
        value at `warmup_steps` is also the warmup's target rate.
      warmup_steps: number of warmup steps.
      power: exponent of the warmup polynomial (1.0 means linear warmup).
      name: optional name scope for the ops created in `__call__`.
    """
    super().__init__()
    # The warmup ramps up to the value the post-warmup schedule takes at
    # `warmup_steps`, so the transition is continuous.
    if isinstance(after_warmup_lr_sched,
                  tf.keras.optimizers.schedules.LearningRateSchedule):
      self._initial_learning_rate = after_warmup_lr_sched(warmup_steps)
    else:
      self._initial_learning_rate = tf.cast(
          after_warmup_lr_sched, dtype=tf.float32)

    self._warmup_steps = warmup_steps
    self._power = power
    self._after_warmup_lr_sched = after_warmup_lr_sched
    self._name = name

  def __call__(self, step):
    """Returns the learning rate for `step`."""
    with tf.name_scope(self._name or "PolynomialWarmUp") as name:
      # Implements polynomial warmup. i.e., if global_step < warmup_steps, the
      # learning rate will be `global_step/num_warmup_steps * init_lr`.
      global_step_float = tf.cast(step, tf.float32)
      warmup_steps_float = tf.cast(self._warmup_steps, tf.float32)

      if self._warmup_steps <= 0:
        warmup_percent_done = 1.0
      else:
        # A zero `step` may cause Inf. So make `step` positive.
        step_non_zero = tf.math.maximum(global_step_float, 1.0)
        warmup_percent_done = step_non_zero / warmup_steps_float

      warmup_learning_rate = (
          self._initial_learning_rate *
          tf.math.pow(warmup_percent_done, self._power))

      if isinstance(self._after_warmup_lr_sched,
                    tf.keras.optimizers.schedules.LearningRateSchedule):
        after_warmup_lr = self._after_warmup_lr_sched(step)
      else:
        after_warmup_lr = tf.cast(self._after_warmup_lr_sched, dtype=tf.float32)

      # Graph-safe branch between warmup and post-warmup regimes.
      return tf.cond(
          global_step_float < warmup_steps_float,
          lambda: warmup_learning_rate,
          lambda: after_warmup_lr,
          name=name)

  def get_config(self) -> Mapping[str, Any]:
    """Serializes the schedule configuration."""
    if isinstance(self._after_warmup_lr_sched,
                  tf.keras.optimizers.schedules.LearningRateSchedule):
      config = {
          "after_warmup_lr_sched": self._after_warmup_lr_sched.get_config()}  # pytype: disable=attribute-error
    else:
      config = {"after_warmup_lr_sched": self._after_warmup_lr_sched}  # pytype: disable=attribute-error

    config.update({
        "warmup_steps": self._warmup_steps,
        "power": self._power,
        "name": self._name
    })
    return config
class DirectPowerDecay(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Learning rate schedule follows lr * (step)^power."""

  def __init__(self,
               initial_learning_rate: float,
               power: float = 1.0,
               name: str = "DirectPowerDecay"):
    """Initialize configuration of the learning rate schedule.

    Args:
      initial_learning_rate: The initial learning rate.
      power: The order of the polynomial.
      name: Optional, name of learning rate schedule.
    """
    super().__init__()
    self._initial_learning_rate = initial_learning_rate
    self._power = power
    self._name = name

  def __call__(self, step):
    """Returns the learning rate for `step` as lr * step^power."""
    with tf.name_scope(self._name or "DirectPowerDecay"):
      step_float = tf.cast(step, tf.float32)
      # Clamp the step to >= 1: a zero step raised to a negative power
      # would otherwise yield Inf.
      step_clamped = tf.math.maximum(step_float, 1.0)
      scale = tf.math.pow(step_clamped, self._power)
      return self._initial_learning_rate * scale

  def get_config(self):
    """Get the configuration of the learning rate schedule."""
    return {
        "name": self._name,
        "initial_learning_rate": self._initial_learning_rate,
        "power": self._power,
    }
class PowerAndLinearDecay(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Learning rate schedule with multiplied by linear decay at the end.

  The schedule has the following behavior.
  Let offset_step = step - offset.
  1) offset_step < 0, the actual learning rate equals initial_learning_rate.
  2) offset_step <= total_decay_steps * (1 - linear_decay_fraction), the
     actual learning rate equals lr * offset_step^power.
  3) total_decay_steps * (1 - linear_decay_fraction) <= offset_step <
     total_decay_steps, the actual learning rate equals lr * offset_step^power
     * (total_decay_steps - offset_step) / (total_decay_steps *
     linear_decay_fraction).
  4) offset_step >= total_decay_steps, the actual learning rate equals zero.
  """

  def __init__(self,
               initial_learning_rate: float,
               total_decay_steps: int,
               power: float = 1.0,
               linear_decay_fraction: float = 0.1,
               offset: int = 0,
               name: str = "PowerAndLinearDecay"):
    """Initialize configuration of the learning rate schedule.

    Args:
      initial_learning_rate: The initial learning rate.
      total_decay_steps: The total number of steps for power + linear decay.
      power: The order of the polynomial.
      linear_decay_fraction: In the last `linear_decay_fraction` steps, the
        learning rate will be multiplied by a linear decay.
      offset: The offset applied to steps.
      name: Optional, name of learning rate schedule.
    """
    super().__init__()
    self._initial_learning_rate = initial_learning_rate
    self._total_decay_steps = total_decay_steps
    self._power = power
    self._linear_decay_fraction = linear_decay_fraction
    self._offset = offset
    self._name = name

  def __call__(self, step):
    """Returns the learning rate for `step` per the class docstring."""
    with tf.name_scope(self._name or "PowerAndLinearDecay"):
      step = tf.cast(step - self._offset, tf.float32)
      learning_rate = self._initial_learning_rate
      # A zero `step` may cause Inf. So make `step` positive.
      step_non_zero = tf.math.maximum(step, 1.0)
      # Power-decay component: lr * step^power.
      learning_rate *= tf.math.pow(step_non_zero, self._power)
      # Linear tail: ramp the rate down to zero over the final
      # `linear_decay_fraction` of `total_decay_steps` (skipped when the
      # product is zero, i.e. no linear phase configured).
      if self._total_decay_steps * self._linear_decay_fraction > 0:
        learning_rate *= tf.minimum(
            1.0, (self._total_decay_steps - step) /
            (self._total_decay_steps * self._linear_decay_fraction))
        # Clamp at zero once past total_decay_steps (and for negative
        # pre-offset steps the linear factor could also go above 1 — the
        # tf.minimum above caps that).
        learning_rate = tf.maximum(0.0, learning_rate)
      return learning_rate

  def get_config(self):
    """Get the configuration of the learning rate schedule."""
    return {
        "initial_learning_rate": self._initial_learning_rate,
        "total_decay_steps": self._total_decay_steps,
        "power": self._power,
        "linear_decay_fraction": self._linear_decay_fraction,
        "offset": self._offset,
        "name": self._name,
    }
class PowerDecayWithOffset(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Power learning rate decay with offset.

  Learning rate equals `pre_offset_learning_rate` while `step` <= `offset`.
  Afterwards it equals lr * (step - offset)**power, capped at
  `pre_offset_learning_rate`.
  """

  def __init__(self,
               initial_learning_rate: float,
               power: float = 1.0,
               offset: int = 0,
               pre_offset_learning_rate: float = 1.0e6,
               name: str = "PowerDecayWithOffset"):
    """Constructs the offset power-decay schedule.

    Args:
      initial_learning_rate: The initial learning rate.
      power: The order of the polynomial.
      offset: The offset when computing the power decay.
      pre_offset_learning_rate: The maximum learning rate we'll use.
      name: Optional, name of learning rate schedule.
    """
    super().__init__()
    self._initial_learning_rate = initial_learning_rate
    self._power = power
    self._offset = offset
    self._pre_offset_lr = pre_offset_learning_rate
    self._name = name

  def __call__(self, step):
    """Returns the learning rate for the given `step`."""
    with tf.name_scope(self._name or "PowerDecayWithOffset"):
      float_step = tf.cast(step, tf.float32)
      # Clamp at 1 so (step - offset)**power cannot hit 0**negative = Inf.
      decayed_lr = self._initial_learning_rate * tf.math.pow(
          tf.math.maximum(float_step - self._offset, 1.0), self._power)
      # Select pre-offset vs. decayed rate without a data-dependent branch.
      past_offset = tf.cast(float_step > self._offset, tf.float32)
      blended_lr = ((1.0 - past_offset) * self._pre_offset_lr +
                    past_offset * decayed_lr)
      # The power term may be arbitrarily large; cap it at pre_offset_lr.
      return tf.math.minimum(blended_lr, self._pre_offset_lr)

  def get_config(self):
    """Returns the serializable configuration of the schedule."""
    return dict(
        initial_learning_rate=self._initial_learning_rate,
        power=self._power,
        offset=self._offset,
        pre_offset_learning_rate=self._pre_offset_lr,
        name=self._name)
class StepConsineDecayWithOffset(
    tf.keras.optimizers.schedules.LearningRateSchedule):
  """Stepwise cosine learning rate decay with offset.

  The learning rate is equivalent to one or more cosine decay(s), each
  starting and ending at the given interval boundaries.

  (The "Consine" spelling in the class name is kept as-is; renaming would
  break callers that serialize/deserialize by class name.)

  Example:
  ```python
  boundaries: [100000, 110000]
  values: [1.0, 0.5]
  lr_decayed_fn = (
      lr_schedule.StepConsineDecayWithOffset(
          boundaries,
          values))
  ```
  From step 0 to 100000, it will cosine decay from 1.0 to 0.5;
  from step 100000 to 110000, it will cosine decay from 0.5 to 0.0.
  """

  def __init__(self,
               boundaries,
               values,
               offset: int = 0,
               name: str = "StepConsineDecayWithOffset"):
    """Initialize configuration of the learning rate schedule.

    Args:
      boundaries: A list of `Tensor`s or `int`s with strictly
        increasing entries, and with all elements having the same type as the
        optimizer step.
      values: A list of `Tensor`s or `float`s that specifies the
        starting learning rate for the interval beginning at the
        corresponding boundary. It must have the same length as `boundaries`.
      offset: The offset subtracted from the step before evaluating the
        schedule.
      name: Optional, name of learning rate schedule.

    Raises:
      ValueError: If `values` is empty, or if `boundaries` and `values`
        have different lengths.
    """
    super().__init__()
    self.values = values
    self.boundaries = boundaries
    self.offset = offset
    self.name = name
    if len(self.values) < 1:
      raise ValueError(f"Expect non empty {self.values}")
    # NOTE(review): this raises when the lengths are UNEQUAL; the message
    # text reads backwards ("is equal to") but the check itself is correct.
    if len(self.boundaries) != len(self.values):
      raise ValueError(
          "Boundaries length is equal to learning rate levels length"
          f"{len(self.boundaries)} != {len(self.values)}")
    # total_steps[i] is the length of segment i, taken as the gap to the
    # next boundary. The final segment gets length 0.
    # NOTE(review): a 0-length last segment makes the division by
    # `next_total_steps` in __call__ non-finite once past the last
    # boundary — confirm intended usage with callers.
    self.total_steps = (
        [boundaries[i + 1] - boundaries[i] for i in range(len(boundaries) - 1)
        ] + [0])

  def __call__(self, global_step):
    """Returns the learning rate tensor for `global_step`."""
    with tf.name_scope(self.name or "StepConsineDecayWithOffset"):
      # Shift by the offset so all segment math is offset-relative.
      global_step = tf.cast(global_step - self.offset, tf.float32)
      lr_levels = self.values
      lr_steps = self.boundaries
      level_total_steps = self.total_steps
      num_levels = len(lr_levels)
      # First segment: cosine from values[0] down to values[1] (or to 0.0
      # when there is only a single level).
      init_lr = lr_levels[0]
      next_init_lr = lr_levels[1] if num_levels > 1 else 0.
      init_total_steps = level_total_steps[0]
      # NOTE(review): the first segment's period is
      # boundaries[1] - boundaries[0], not boundaries[0]; verify this
      # matches the documented example before relying on it.
      cosine_learning_rate = ((init_lr - next_init_lr) * (tf.cos(
          tf.constant(math.pi) * (global_step) /
          (init_total_steps)) + 1.0) / 2.0 + next_init_lr)
      learning_rate = cosine_learning_rate
      tf.compat.v1.logging.info("DEBUG lr %r next lr %r", learning_rate,
                                cosine_learning_rate)
      tf.compat.v1.logging.info("DEBUG lr %r next lr %r inittotalstep %r",
                                init_lr, next_init_lr, init_total_steps)
      # Later segments: each overrides the running learning_rate for steps
      # at or past its start boundary via tf.where.
      for i in range(1, num_levels):
        next_init_lr = lr_levels[i]
        next_start_step = lr_steps[i]
        next_total_steps = level_total_steps[i]
        next_next_init_lr = lr_levels[i + 1] if num_levels > i + 1 else 0.
        tf.compat.v1.logging.info(
            "DEBUG step %r nilr %r nss %r nts %r nnilr %r", global_step,
            next_init_lr, next_start_step, next_total_steps, next_next_init_lr)
        next_cosine_learning_rate = ((next_init_lr - next_next_init_lr) *
                                     (tf.cos(
                                         tf.constant(math.pi) *
                                         (global_step - next_start_step) /
                                         (next_total_steps)) + 1.0) / 2.0 +
                                     next_next_init_lr)
        learning_rate = tf.where(global_step >= next_start_step,
                                 next_cosine_learning_rate, learning_rate)
        tf.compat.v1.logging.info("DEBUG lr %r next lr %r", learning_rate,
                                  next_cosine_learning_rate)
      return learning_rate

  def get_config(self):
    """Returns the serializable configuration of the schedule."""
    return {
        "boundaries": self.boundaries,
        "values": self.values,
        "offset": self.offset,
        "name": self.name
    }
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment