version 1

05631eec · liangjing · 7e0391d9 · 05631eec · 05631eec · 05631eec
Commit 05631eec authored Apr 10, 2023 by liangjing
20 changed files
--- a/tf2_common/training/__pycache__/standard_runnable.cpython-36.pyc
+++ b/tf2_common/training/__pycache__/standard_runnable.cpython-36.pyc
--- a/tf2_common/training/__pycache__/standard_runnable.cpython-38.pyc
+++ b/tf2_common/training/__pycache__/standard_runnable.cpython-38.pyc
--- a/tf2_common/training/__pycache__/utils.cpython-36.pyc
+++ b/tf2_common/training/__pycache__/utils.cpython-36.pyc
--- a/tf2_common/training/__pycache__/utils.cpython-38.pyc
+++ b/tf2_common/training/__pycache__/utils.cpython-38.pyc
--- a/tf2_common/training/controller.py
+++ b/tf2_common/training/controller.py
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A light weight utilities to train TF2 models."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+
+from absl import logging
+
+import tensorflow as tf
+from typing import Callable, Dict, Optional, Text
+
+from tf2_common.training import utils
+
+
+class Controller(object):
+  """Class that facilitates training and evaluation of models."""
+
+  def __init__(
+      self,
+      strategy: Optional[tf.distribute.Strategy] = None,
+      train_fn: Optional[Callable[[tf.Tensor],
+                                  Optional[Dict[Text, tf.Tensor]]]] = None,
+      eval_fn: Optional[Callable[[tf.Tensor],
+                                 Optional[Dict[Text, tf.Tensor]]]] = None,
+      warmup_fn: Optional[Callable[[tf.Tensor],
+                                   Optional[Dict[Text, tf.Tensor]]]] = None,
+      global_step: Optional[tf.Variable] = None,
+      # Train related
+      train_steps: Optional[int] = None,
+      steps_per_loop: Optional[int] = None,
+      summary_dir: Optional[Text] = None,
+      checkpoint_manager: Optional[tf.train.CheckpointManager] = None,
+      # summary related
+      summary_interval: Optional[int] = None,
+      # Evaluation related
+      eval_summary_dir: Optional[Text] = None,
+      eval_steps: Optional[int] = None,
+      eval_interval: Optional[int] = None,
+      eval_offset: Optional[int] = 0,
+      # Warmup related
+      device_warmup_steps: Optional[int] = None):
+    """Constructs a `Controller` instance.
+
+    Args:
+      strategy: An instance of `tf.distribute.Strategy`. Defaults to
+        `tf.distribute.get_strategy()` when omitted.
+      train_fn: A callable defined as `def train_fn(num_steps)`, where
+        `num_steps` indicates the number of steps to run for each loop.
+      eval_fn: A callable defined as `def eval_fn(num_steps)`, where
+        `num_steps` indicates the number of steps for one evaluation.
+      warmup_fn: A callable defined as `def warmup_fn(num_steps)`, where
+        `num_steps` indicates the number of warmup steps to run.
+      global_step: An integer `tf.Variable` indicating the global training step
+        number. Usually this can be obtained from `iterations` property of the
+        model's optimizer (e.g. `self.optimizer.iterations`), or users can
+        create their own global step variable as well. If the users create their
+        own global step variable, it is recommended to create the `tf.Variable`
+        inside strategy scope, and with
+        `aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA`.
+      train_steps: The total (maximum) number of training steps to perform.
+      steps_per_loop: The number of steps to run in each "inner loop" of
+        training (passed to the `num_steps` parameter of `train_fn`). May be
+        left as None for an evaluation-only controller.
+      summary_dir: The directory to write training summaries to. If None, it
+        falls back to `checkpoint_manager.directory` when a checkpoint manager
+        is given.
+      checkpoint_manager: An instance of `tf.train.CheckpointManager`.
+      summary_interval: Step interval for training summaries. Note that this
+        argument only applies to the summaries outside the training loop. If the
+        value is None, then training summaries are not enabled.
+      eval_summary_dir: The directory to write eval summaries. If None, no eval
+        summary will be written.
+      eval_steps: Number of steps to run evaluation.
+      eval_interval: Step interval for evaluation. If None, will skip
+        evaluation. Note that evaluation only happens outside the training loop,
+        whose length is specified by the `steps_per_loop` parameter.
+      eval_offset: Step number of the first evaluation.
+      device_warmup_steps: The number of steps to run for warmup.
+
+    Raises:
+      ValueError: If both `train_fn` and `eval_fn` are None.
+      ValueError: If `train_fn` is not None and `train_steps` is None.
+      ValueError: If `steps_per_loop` is None when `train_fn` is provided.
+      ValueError: If `steps_per_loop` is given but is not a positive integer.
+      ValueError: If `summary_interval` is given but is not positive.
+      ValueError: If `summary_interval` is set but neither `summary_dir` nor
+        `checkpoint_manager` provides a directory to write summaries to.
+    """
+    if train_fn is None and eval_fn is None:
+      raise ValueError("`train_fn` and `eval_fn` should not both be None")
+
+    # TODO(rxsang): Support training until exhaustion by passing
+    # `train_steps=-1`. Currently it cannot be supported with a host training
+    # loop because break statements are not supported with distributed dataset.
+    if train_fn is not None and train_steps is None:
+      raise ValueError("`train_steps` is required when `train_fn` is provided.")
+    if train_fn is not None and steps_per_loop is None:
+      raise ValueError("`steps_per_loop` is required when `train_fn is "
+                       "provided.")
+    # Only validate `steps_per_loop` when one was supplied, so that an
+    # evaluation-only controller (no `train_fn`) can be built without it.
+    if steps_per_loop is not None and (
+        not isinstance(steps_per_loop, int) or steps_per_loop < 1):
+      raise ValueError("`steps_per_loop` should be a positive integer")
+    if summary_interval is not None and summary_interval <= 0:
+      raise ValueError("`summary_interval` should be larger than 0")
+
+    # Fall back to the checkpoint directory for summaries, but do not
+    # dereference a missing checkpoint manager.
+    if not summary_dir and checkpoint_manager is not None:
+      summary_dir = checkpoint_manager.directory
+    if summary_interval and not summary_dir:
+      raise ValueError("`summary_dir` or `checkpoint_manager` is required when "
+                       "`summary_interval` is set.")
+
+    self.strategy = strategy or tf.distribute.get_strategy()
+
+    self.train_fn = train_fn
+    self.eval_fn = eval_fn
+    self.warmup_fn = warmup_fn
+    self.global_step = global_step
+
+    self.train_steps = train_steps
+
+    self.steps_per_loop = steps_per_loop
+    self.device_warmup_steps = device_warmup_steps
+
+    self.summary_dir = summary_dir
+    self.checkpoint_manager = checkpoint_manager
+
+    self.summary_interval = summary_interval
+    summary_writer = tf.summary.create_file_writer(
+        self.summary_dir) if self.summary_interval else None
+    # TODO(rxsang): Consider pass SummaryManager directly into Controller for
+    # maximum customizability.
+    self.summary_manager = utils.SummaryManager(
+        summary_writer,
+        tf.summary.scalar,
+        global_step=self.global_step,
+        summary_interval=self.summary_interval)
+    # Compare against None explicitly: the truth value of a `tf.Variable` is
+    # the truth value of its contents, so a fresh global step of 0 would
+    # otherwise skip registering the summary step.
+    if self.global_step is not None:
+      tf.summary.experimental.set_step(self.global_step)
+
+    eval_summary_writer = tf.summary.create_file_writer(
+        eval_summary_dir) if eval_summary_dir else None
+    self.eval_summary_manager = utils.SummaryManager(
+        eval_summary_writer, tf.summary.scalar, global_step=self.global_step)
+
+    self.eval_steps = eval_steps
+    self.eval_interval = eval_interval
+    self.eval_offset = eval_offset
+
+    # Restore Model if needed.
+    if self.checkpoint_manager is not None:
+      model_restored = self._restore_model()
+      if not model_restored and self.checkpoint_manager.checkpoint_interval:
+        # If the model is not restored from a checkpoint, save an initial
+        # checkpoint.
+        ckpt_path = self.checkpoint_manager.save(
+            checkpoint_number=self.global_step)
+        logging.info("Saved checkpoins in %s", ckpt_path)
+
+    # Create and initialize the interval triggers.
+    self.eval_trigger = utils.IntervalTrigger(self.eval_interval,
+                                              self.eval_offset)
+
+  def _restore_model(self, checkpoint_path=None):
+    """Restores the model from a checkpoint, inside the strategy scope.
+
+    Args:
+      checkpoint_path: Optional path of a specific checkpoint to restore. When
+        None, restoring is delegated to
+        `self.checkpoint_manager.restore_or_initialize()`.
+
+    Returns:
+      True when an explicit `checkpoint_path` was restored; otherwise whatever
+      `self.checkpoint_manager.restore_or_initialize()` returns.
+    """
+    # Checkpoint restoring should be inside scope. b/139450638
+    with self.strategy.scope():
+      if checkpoint_path is None:
+        return self.checkpoint_manager.restore_or_initialize()
+      self.checkpoint_manager.checkpoint.restore(checkpoint_path)
+      return True
+
+  def _evaluate_once(self, current_step):
+    """Runs one evaluation pass and reports its results.
+
+    Calls `self.eval_fn(self.eval_steps)` under the eval summary writer,
+    converts the returned values to Python/NumPy values, logs them and writes
+    them as summaries.
+
+    Args:
+      current_step: The step number used in the log messages.
+
+    Returns:
+      The value stored under the "continue_training" key of the evaluation
+      outputs if present, otherwise True.
+    """
+    logging.info("Start evaluation at step: %s", current_step)
+
+    with self.eval_summary_manager.summary_writer.as_default():
+      eval_outputs = self.eval_fn(self.eval_steps)
+
+    if eval_outputs:
+      # Plain Python floats/bools are passed through; everything else is
+      # expected to expose `.numpy()`.
+      eval_outputs = tf.nest.map_structure(
+          lambda x: (x if isinstance(x, (float, bool)) else x.numpy()),
+          eval_outputs)
+
+    info = "step: {}        evaluation metric: {}".format(
+        current_step, eval_outputs)
+    self._log_info(info)
+
+    # `eval_fn` is allowed to return None; normalize to an empty dict (as
+    # `train` does for its outputs) so the lookups below cannot fail.
+    eval_outputs = eval_outputs or {}
+    self.eval_summary_manager.write_summaries(eval_outputs)
+    return eval_outputs.get("continue_training", True)
+
+  def _maybe_save_checkpoints(self, current_step, force_trigger=False):
+    """Saves a checkpoint if checkpointing is configured.
+
+    Nothing happens when there is no checkpoint manager or when the manager
+    has no `checkpoint_interval`.
+
+    Args:
+      current_step: Step number passed as the `checkpoint_number`.
+      force_trigger: If True, ask the manager to save without checking the
+        checkpoint interval; if False, the manager only saves when the interval
+        has elapsed.
+    """
+    if self.checkpoint_manager is None:
+      return
+    if self.checkpoint_manager.checkpoint_interval:
+      # `check_interval=True` makes the manager honor `checkpoint_interval`, so
+      # a forced save must pass False.
+      ckpt_path = self.checkpoint_manager.save(
+          checkpoint_number=current_step, check_interval=not force_trigger)
+      if ckpt_path is not None:
+        logging.info("Saved checkpoins in %s", ckpt_path)
+
+  def _maybe_evaluate(self, current_step, force_trigger=False):
+    """Evaluates once if `self.eval_trigger` fires for `current_step`.
+
+    Returns:
+      The result of `_evaluate_once` when the trigger fires, otherwise True.
+    """
+    if not self.eval_trigger(current_step, force_trigger):
+      return True
+    return self._evaluate_once(current_step)
+
+  def _log_info(self, message):
+    """Sends `message` to the `info` log first and then to stdout."""
+    for emit in (logging.info, print):
+      emit(message)
+
+  def train(self, evaluate=True):
+    """Runs the training, with optional evaluation.
+
+    This handles evaluation, gathering summaries, and saving checkpoints.
+    Training proceeds in chunks of at most `self.steps_per_loop` steps until
+    the global step reaches `self.train_steps`, or until an evaluation asks to
+    stop. A mismatch between the expected step and `global_step` after a chunk
+    is only logged as a warning.
+
+    Args:
+      evaluate: A boolean indicates whether to perform evaluation during
+        training.
+
+    Raises:
+      ValueError: If `self.train_fn` or `self.global_step` is None, or if
+        `evaluate` is True and `self.eval_fn` is None.
+    """
+    if self.train_fn is None:
+      raise ValueError("`self.train_fn` is required when calling `train` "
+                       "method.")
+    if self.global_step is None:
+      raise ValueError("`self.global_step` is required when calling `train` "
+                       "method.")
+    if evaluate and self.eval_fn is None:
+      raise ValueError("`self.eval_fn` is required when calling `train` method "
+                       "with `evaluate=True`")
+
+    step_timer = _StepTimer(self.global_step)
+    current_step = self.global_step.numpy()
+    # Defined up front so the final summary write below still works when the
+    # loop body never runs (e.g. the restored step is already at
+    # `train_steps`).
+    train_outputs = {}
+    logging.info("Train at step %s of %s", current_step, self.train_steps)
+    while current_step < self.train_steps:
+      # Calculates steps to run for the next train loop.
+      steps_per_loop = min(self.train_steps - current_step, self.steps_per_loop)
+      logging.info("Entering training loop with %s steps, at step %s of %s",
+                   steps_per_loop, current_step, self.train_steps)
+      current_step += steps_per_loop
+      steps_per_loop = tf.convert_to_tensor(steps_per_loop, dtype=tf.int32)
+
+      with self.summary_manager.summary_writer.as_default():
+        train_outputs = self.train_fn(steps_per_loop)
+
+      # Updates and verifies the current step after a training loop finishes.
+      if current_step != self.global_step.numpy():
+        logging.warning("`self.train_fn` is not updating `global_step` "
+                        "correctly, expected: %s, actual: %s",
+                        current_step, self.global_step.numpy())
+
+      # Print information like metrics and steps_per_second after a training
+      # loop.
+      if train_outputs:
+        train_outputs = tf.nest.map_structure(
+            lambda x: x.numpy(), train_outputs)
+      steps_per_second = step_timer.steps_per_second()
+      info = "step: {}        steps_per_second: {:.2f}        {}".format(
+          current_step, steps_per_second, train_outputs)
+      self._log_info(info)
+
+      train_outputs = train_outputs or {}
+      train_outputs["steps_per_second"] = steps_per_second
+      self.summary_manager.write_summaries(train_outputs)
+
+      self._maybe_save_checkpoints(current_step)
+
+      if evaluate:
+        continue_training = self._maybe_evaluate(current_step)
+        if not continue_training:
+          break
+
+    # Final flush: force-write the last outputs, checkpoint and evaluation.
+    self.summary_manager.write_summaries(train_outputs, always_write=True)
+    self._maybe_save_checkpoints(current_step, force_trigger=True)
+    if evaluate:
+      self._maybe_evaluate(current_step, force_trigger=True)
+
+  def evaluate(self, continuous=False, timeout_fn=None):
+    """Runs the evaluation.
+
+    Args:
+      continuous: If `True`, will continuously monitor the checkpoint directory
+        to evaluate on the latest checkpoint. If `False`, will do the evaluation
+        once.
+      timeout_fn: Optional callable to call after a timeout. If the function
+        returns True, then it means that no new checkpoints will be generated
+        and the iterator will exit.
+
+    Raises:
+      ValueError: If `self.eval_fn`, `self.checkpoint_manager` or
+        `self.global_step` is None.
+      ValueError: If `timeout_fn` is passed while `continuous` is False.
+      ValueError: If no checkpoint found in `self.checkpoint_manager.directory`.
+    """
+    if self.eval_fn is None:
+      raise ValueError("`self.eval_fn` should not be None to call "
+                       "`evaluate()` method.")
+
+    if not continuous and timeout_fn is not None:
+      raise ValueError("`timeout_fn` can be only passed when `continuous` is "
+                       "True")
+
+    # Both attributes are dereferenced unconditionally below; fail with a clear
+    # message instead of an AttributeError on None.
+    if self.checkpoint_manager is None:
+      raise ValueError("`self.checkpoint_manager` is required when calling "
+                       "`evaluate` method.")
+    if self.global_step is None:
+      raise ValueError("`self.global_step` is required when calling "
+                       "`evaluate` method.")
+
+    if continuous:
+      for checkpoint_path in tf.train.checkpoints_iterator(
+          self.checkpoint_manager.directory, timeout_fn=timeout_fn):
+        self._restore_model(checkpoint_path)
+        self._evaluate_once(self.global_step.numpy())
+      return
+
+    latest_checkpoint = self.checkpoint_manager.latest_checkpoint
+    if not latest_checkpoint:
+      raise ValueError("no checkpoint found in dir %s" %
+                       self.checkpoint_manager.directory)
+    self._restore_model()
+    self._evaluate_once(self.global_step.numpy())
+
+  def warmup(self):
+    """Runs device warmup.
+
+    This handles running a training loop on dummy data to move TF function
+    tracing and XLA compilation outside of the training loop.
+
+    All `self.device_warmup_steps` steps are passed to `self.warmup_fn` in a
+    single call. The step counter used for the loop condition is tracked
+    locally and is not re-read from `global_step`.
+
+    Raises:
+      ValueError: If `self.warmup_fn`, `self.global_step` or
+        `self.device_warmup_steps` is None.
+    """
+    if self.warmup_fn is None:
+      raise ValueError("`self.warmup_fn` is required when calling `warmup` "
+                       "method.")
+    if self.global_step is None:
+      raise ValueError("`self.global_step` is required when calling `warmup` "
+                       "method.")
+    if self.device_warmup_steps is None:
+      raise ValueError("`self.device_warmup_steps` is required when calling "
+                       "`warmup` method.")
+
+    step_timer = _StepTimer(self.global_step)
+    current_step = self.global_step.numpy()
+    logging.info("Warmup at step %s of %s", current_step,
+                 self.device_warmup_steps)
+    while current_step < self.device_warmup_steps:
+      # The whole warmup budget is run in one call to `warmup_fn`.
+      steps_per_loop = self.device_warmup_steps
+      logging.info("Entering warmup loop with %s steps, at step %s of %s",
+                   steps_per_loop, current_step, self.device_warmup_steps)
+      current_step += steps_per_loop
+      steps_per_loop = tf.convert_to_tensor(steps_per_loop, dtype=tf.int32)
+
+      with self.summary_manager.summary_writer.as_default():
+        self.warmup_fn(steps_per_loop)
+
+      steps_per_second = step_timer.steps_per_second()
+      info = "step: {}        steps_per_second: {:.2f}".format(
+          current_step, steps_per_second)
+      self._log_info(info)
+
+
+class _StepTimer(object):
+  """Measures how fast a step counter advances, in steps per second.
+
+  The counter is any object exposing `.numpy()` that returns the current step.
+  """
+
+  def __init__(self, step):
+    """Stores the step counter and starts timing immediately."""
+    self.step = step
+    self.start()
+
+  def start(self):
+    """Records the current step value and wall-clock time as the baseline."""
+    self.last_iteration = self.step.numpy()
+    self.last_time = time.time()
+
+  def steps_per_second(self, restart=True):
+    """Returns steps advanced per second since the last baseline.
+
+    Args:
+      restart: If True, reset the baseline to "now" after measuring.
+    """
+    steps_done = self.step.numpy() - self.last_iteration
+    seconds_elapsed = time.time() - self.last_time
+    rate = steps_done / seconds_elapsed
+    if restart:
+      self.start()
+    return rate
--- a/tf2_common/training/optimizer_v2modified.py
+++ b/tf2_common/training/optimizer_v2modified.py
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Modified optimizer_v2 implementation enabling XLA across variable updates."""
+
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+
+from tensorflow.python.distribute import distribution_strategy_context as distribute_ctx
+from tensorflow.python.distribute import parameter_server_strategy
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.keras import backend
+from tensorflow.python.keras.optimizer_v2 import optimizer_v2
+from tensorflow.python.keras.optimizer_v2 import utils as optimizer_utils
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import variables as tf_variables
+
+class OptimizerV2Modified(optimizer_v2.OptimizerV2):
+  """This is a subclass optimizer that performs variable updates in
+  Distribution Strategy replica context. OptimizerV2 base class is currently
+  under refactoring and will have better support of this.
+
+  Please refer to optimizer_v2.OptimizerV2 for more details regarding the APIs.
+  """
+
+  def __init__(self, name, use_experimental_compile=False, **kwargs):
+    """Initializes the optimizer and records the compile preference.
+
+    Args:
+      name: Name forwarded to the base optimizer constructor.
+      use_experimental_compile: When True, `_distributed_apply` dispatches to
+        the `tf.function`-wrapped `_distributed_apply_compile` instead of
+        calling `_distributed_apply_org` directly.
+      **kwargs: Extra keyword arguments forwarded to the base constructor.
+    """
+    # The base constructor must run before any attribute is assigned.
+    super().__init__(name=name, **kwargs)
+    self.use_experimental_compile = use_experimental_compile
+
+  def apply_gradients(self,
+                      grads_and_vars,
+                      name=None,
+                      experimental_aggregate_gradients=True):
+    """Apply gradients to variables.
+
+    Only the last two lines are different from optimizer_v2.OptimizerV2.
+
+    Args:
+      grads_and_vars: List of (gradient, variable) pairs.
+      name: Optional name for the returned operation. Default to the name passed
+        to the `Optimizer` constructor.
+      experimental_aggregate_gradients: Whether to sum gradients from different
+        replicas in the presence of `tf.distribute.Strategy`. If False, it is
+        the user's responsibility to aggregate the gradients. Default to True.
+
+    Returns:
+      The result of `self._iterations.assign_add(1, read_value=False)`, i.e.
+      the increment of `iterations` issued after the updates are applied, or a
+      no-op when no gradients remain after filtering.
+
+    Raises:
+      TypeError: If `grads_and_vars` is malformed.
+      ValueError: If none of the variables have gradients.
+      RuntimeError: If called in cross-replica context.
+      NotImplementedError: If `experimental_aggregate_gradients` is False under
+        a strategy whose `extended` is a `ParameterServerStrategyExtended`.
+    """
+    # pylint: disable=protected-access
+    grads_and_vars = optimizer_utils.filter_empty_gradients(grads_and_vars)
+    # pylint: enable=protected-access
+    var_list = [v for (_, v) in grads_and_vars]
+
+    with ops.name_scope_v2(self._name):
+      # Create iteration if necessary.
+      with ops.init_scope():
+        self._create_all_weights(var_list)
+
+      if not grads_and_vars:
+        # Distribution strategy does not support reducing an empty list of
+        # gradients
+        return control_flow_ops.no_op()
+
+      if distribute_ctx.in_cross_replica_context():
+        raise RuntimeError(
+            "`apply_gradients() cannot be called in cross-replica context. "
+            "Use `tf.distribute.Strategy.run` to enter replica "
+            "context.")
+
+      strategy = distribute_ctx.get_strategy()
+      if (not experimental_aggregate_gradients and strategy and isinstance(
+          strategy.extended,
+          parameter_server_strategy.ParameterServerStrategyExtended)):
+        raise NotImplementedError(
+            "`experimental_aggregate_gradients=False is not supported for "
+            "ParameterServerStrategy and CentralStorageStrategy")
+
+      apply_state = self._prepare(var_list)
+      if experimental_aggregate_gradients:
+        grads_and_vars = self._transform_unaggregated_gradients(grads_and_vars)
+        grads_and_vars = self._aggregate_gradients(grads_and_vars)
+      grads_and_vars = self._transform_gradients(grads_and_vars)
+
+      # The updates are applied directly in the current replica context; None
+      # is passed for `distribution` because `_distributed_apply_org` never
+      # reads that argument.
+      self._distributed_apply(None, grads_and_vars, name, apply_state)
+      return self._iterations.assign_add(1, read_value=False)
+
+  def _distributed_apply_org(self, distribution, grads_and_vars, name, apply_state):
+    """Applies each gradient to its variable and groups the update ops.
+
+    This is derived from the `_distributed_apply` function in optimizer_v2.
+
+    Args:
+      distribution: Unused by this implementation.
+      grads_and_vars: Iterable of (gradient, variable) pairs.
+      name: Optional name scope for the updates; falls back to `self._name`.
+      apply_state: State forwarded to the dense/sparse apply methods when they
+        accept an `apply_state` argument.
+
+    Returns:
+      A single op from `control_flow_ops.group` covering all variable updates.
+    """
+
+    def apply_grad_to_update_var(var, grad):
+      """Apply gradient to variable."""
+      if isinstance(var, ops.Tensor):
+        raise NotImplementedError("Trying to update a Tensor ", var)
+
+      apply_kwargs = {}
+      # Sparse gradients (`IndexedSlices`) take the sparse update path, which
+      # does not support variable constraints.
+      if isinstance(grad, ops.IndexedSlices):
+        if var.constraint is not None:
+          raise RuntimeError(
+              "Cannot use a constraint function on a sparse variable.")
+        if "apply_state" in self._sparse_apply_args:
+          apply_kwargs["apply_state"] = apply_state
+        return self._resource_apply_sparse_duplicate_indices(
+            grad.values, var, grad.indices, **apply_kwargs)
+
+      if "apply_state" in self._dense_apply_args:
+        apply_kwargs["apply_state"] = apply_state
+      update_op = self._resource_apply_dense(grad, var, **apply_kwargs)
+      if var.constraint is not None:
+        # Re-apply the constraint only after the dense update has run.
+        with ops.control_dependencies([update_op]):
+          return var.assign(var.constraint(var))
+      else:
+        return update_op
+
+    update_ops = []
+    with ops.name_scope(name or self._name, skip_on_eager=True):
+      for grad, var in grads_and_vars:
+        update_ops.append(apply_grad_to_update_var(var, grad))
+      return control_flow_ops.group(*update_ops)
+
+  def _distributed_apply(self, distribution, grads_and_vars, name, apply_state):
+    """Dispatches the update to the compiled wrapper or the plain version.
+
+    The choice is driven by `self.use_experimental_compile`. The result of the
+    selected function is not returned.
+    """
+    if self.use_experimental_compile:
+      apply_fn = self._distributed_apply_compile
+    else:
+      apply_fn = self._distributed_apply_org
+    apply_fn(distribution, grads_and_vars, name, apply_state)
+
+  # NOTE(review): the decorator passes `experimental_compile=False`, so this
+  # wrapper as written does not request XLA compilation even though it is the
+  # path taken when `use_experimental_compile` is True - confirm this is
+  # intended for the target platform.
+  @tf.function(experimental_compile=False)
+  def _distributed_apply_compile(self, distribution, grads_and_vars, name,
+                                 apply_state):
+    """Wrapper around `_distributed_apply_org` that runs as a `tf.function`.
+
+    The result of `_distributed_apply_org` is discarded and a scalar boolean
+    tensor is returned instead, so that the traced function returns a tensor.
+    """
+    self._distributed_apply_org(distribution, grads_and_vars,
+                                name, apply_state)
+    return tf.ones((), dtype=tf.bool)
--- a/tf2_common/training/runnable.py
+++ b/tf2_common/training/runnable.py
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""An abstraction that users can easily handle their custom training loops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+import six
+import tensorflow as tf
+from typing import Dict, Optional, Text
+
+
+@six.add_metaclass(abc.ABCMeta)
+class AbstractTrainable(tf.Module):
+  """Abstract interface for objects that can be trained and warmed up."""
+
+  @abc.abstractmethod
+  def train(self,
+            num_steps: Optional[tf.Tensor]) -> Optional[Dict[Text, tf.Tensor]]:
+    """Runs model training for a number of steps.
+
+    Total training is commonly split into several training loops so that
+    checkpointing, summary writing and Python callbacks can happen between
+    them. This matters for TPU performance, because launching a multi worker
+    tf.function can carry a large overhead in Eager mode. In the TPU case it is
+    usually encouraged to build a host training loop (e.g. a `tf.range`
+    wrapping `strategy.run` inside a `tf.function`). When a host training loop
+    is not needed to achieve peak performance, a simple Python loop driving
+    each step is enough.
+
+    Args:
+      num_steps: A guideline for how many training steps to run. What counts as
+        a "step" is up to the model (it may involve more than one update to
+        model parameters, e.g. if training a GAN).
+
+    Returns:
+      Optionally, a dictionary of `Tensors`, which will be written to logs and
+      as TensorBoard summaries.
+    """
+
+  @abc.abstractmethod
+  def warmup(self,
+             num_steps: Optional[tf.Tensor]) -> Optional[Dict[Text, tf.Tensor]]:
+    """Runs device warmup for a number of steps.
+
+    The loop runs the input pipeline on synthetic data before training, which
+    allows XLA compilation and tf.function tracing to happen before the
+    training dataset is accessed.
+
+    Args:
+      num_steps: A guideline for how many training steps to run. What counts as
+        a "step" is up to the model (it may involve more than one update to
+        model parameters, e.g. if training a GAN).
+
+    Returns:
+      Optionally, a dictionary of `Tensors`, which will be written to logs and
+      as TensorBoard summaries.
+    """
+
+
+@six.add_metaclass(abc.ABCMeta)
+class AbstractEvaluable(tf.Module):
+  """Abstract interface for objects that can be evaluated."""
+
+  @abc.abstractmethod
+  def evaluate(
+      self, num_steps: Optional[tf.Tensor]) -> Optional[Dict[Text, tf.Tensor]]:
+    """Runs model evaluation.
+
+    Args:
+      num_steps: A guideline for how many evaluation steps to run. What counts
+        as a "step" is up to the model. Generally, it may be desirable to
+        support both a limited number of eval steps and, when `num_steps` is
+        `None`, iterating over a full dataset (however many steps that takes).
+
+    Returns:
+      Optionally, a dictionary of `Tensors`, which will be written to logs and
+      as TensorBoard summaries.
+    """
--- a/tf2_common/training/standard_runnable.py
+++ b/tf2_common/training/standard_runnable.py
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""An abstraction that users can easily handle their custom training loops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+import six
+import tensorflow as tf
+from typing import Dict, Optional, Text
+
+from tf2_common.training import runnable
+from tf2_common.training import utils
+
+
+@six.add_metaclass(abc.ABCMeta)
+class StandardTrainable(runnable.AbstractTrainable):
+  """Standard implementation of the `AbstractTrainable.train` API.
+
+  `train` calls `self.build_train_dataset()`, which is not declared in this
+  class and therefore has to be supplied by the subclass.
+  """
+
+  def __init__(self, use_tf_while_loop=True, use_tf_function=True):
+    """Stores the loop options and clears the lazily built training state.
+
+    Args:
+      use_tf_while_loop: Whether the loop function is built with
+        `utils.create_tf_while_loop_fn`.
+      use_tf_function: Whether the step function is wrapped in `tf.function`
+        when the plain loop function is used.
+
+    Raises:
+      ValueError: If `use_tf_while_loop` is True while `use_tf_function` is
+        False.
+    """
+    if not use_tf_function and use_tf_while_loop:
+      raise ValueError("`use_tf_while_loop=True` and `use_tf_function=False` "
+                       "is not supported")
+    # Populated on the first call to `train`.
+    self.train_loop_fn = None
+    self.train_iter = None
+    self.train_dataset = None
+    self.use_tf_function = use_tf_function
+    self.use_tf_while_loop = use_tf_while_loop
+
+  def _initialize_train_fn(self):
+    """Builds `self.train_loop_fn` once, according to the loop options."""
+    if self.train_loop_fn is not None:
+      return
+    step_fn = self.train_step
+    if self.use_tf_while_loop:
+      self.train_loop_fn = utils.create_tf_while_loop_fn(step_fn)
+      return
+    if self.use_tf_function:
+      step_fn = tf.function(step_fn)
+    self.train_loop_fn = utils.create_loop_fn(step_fn)
+
+  def train(self,
+            num_steps: Optional[tf.Tensor]) -> Optional[Dict[Text, tf.Tensor]]:
+    """See base class."""
+    if self.train_dataset is None:
+      # First call: build the training input dataset and its iterator(s).
+      self.train_dataset = self.build_train_dataset()
+      self.train_iter = tf.nest.map_structure(iter, self.train_dataset)
+
+    self._initialize_train_fn()
+
+    self.train_loop_begin()
+    self.train_loop_fn(self.train_iter, num_steps)
+    return self.train_loop_end()
+
+  def train_loop_begin(self):
+    """Hook invoked once before each training loop starts.
+
+    A good place to reset metrics that accumulate values over multiple steps
+    of training.
+    """
+
+  @abc.abstractmethod
+  def train_step(self, iterator):
+    """Runs a single training step.
+
+    The implementer decides what a "step" consists of. With distribution
+    strategies, this method should be called in the "cross-replica context"
+    for generality, to allow e.g. multiple iterator dequeues and calls to
+    `strategy.run`.
+
+    Args:
+      iterator: A tf.nest-compatible structure of tf.data Iterator or
+        DistributedIterator.
+    """
+
+  def train_loop_end(self) -> Optional[Dict[Text, tf.Tensor]]:
+    """Hook invoked once after each training loop finishes.
+
+    A good place to collect metric results. Whatever this returns is returned
+    as-is from the train() method.
+
+    Returns:
+      Optionally, a dictionary of `Tensors`, which will be written to logs and
+      as TensorBoard summaries.
+    """
+
+
+@six.add_metaclass(abc.ABCMeta)
+class StandardEvaluable(runnable.AbstractEvaluable):
+  """Implements the standard functionality of AbstractEvaluable APIs."""
+
+  def __init__(self, use_tf_function=True):
+    """Stores the `tf.function` option and clears the lazily built state."""
+    # Both are populated on the first call to `evaluate`.
+    self.eval_loop_fn = None
+    self.eval_dataset = None
+    self.eval_use_tf_function = use_tf_function
+
+  @abc.abstractmethod
+  def build_eval_dataset(self):
+    """Builds the evaluation datasets.
+
+    Returns:
+      A tf.nest-compatible structure of tf.data.Dataset or DistributedDataset.
+    """
+    pass
+
+  def _initialize_eval_fn(self):
+    if self.eval_loop_fn is None:
+      eval_fn = self.eval_step
+      if self.eval_use_tf_function:
+        eval_fn = tf.function(eval_fn)
+      self.eval_loop_fn = utils.create_loop_fn(eval_fn)
+
+  def evaluate(
+      self, num_steps: Optional[tf.Tensor]) -> Optional[Dict[Text, tf.Tensor]]:
+    """See base class."""
+    if self.eval_dataset is None:
+      # Build train input dataset
+      self.eval_dataset = self.build_eval_dataset()
+
+    self._initialize_eval_fn()
+    # TODO(b/147718615): When async RPC is enabled in eager runtime, we make
+    # eval iterator as a class member so it doesn't get destroyed when out of
+    # the function scope.
+    self.eval_iter = tf.nest.map_structure(iter, self.eval_dataset)
+
+    self.eval_begin()
+    self.eval_loop_fn(self.eval_iter, num_steps)
+    return self.eval_end()
+
+  def eval_begin(self):
+    """Called once at the beginning of the evaluation.
+
+    This is a good place to reset metrics that accumulate values over the entire
+    evaluation.
+    """
+    pass
+
+  @abc.abstractmethod
+  def eval_step(self, iterator):
+    """Implements one step of evaluation.
+
+    What a "step" consists of is up to the implementer. If using distribution
+    strategies, the call to this method should take place in the "cross-replica
+    context" for generality, to allow e.g. multiple iterator dequeues and calls
+    to `strategy.run`.
+
+    Args:
+      iterator: A tf.nest-compatible structure of tf.data Iterator or
+        DistributedIterator.
+    """
+    pass
+
+  def eval_end(self) -> Optional[Dict[Text, tf.Tensor]]:
+    """Called at the end of the evaluation.
+
+    This is a good place to get metric results. The value returned from this
+    function will be returned as-is from the evaluate() method.
+
+    Returns:
+      The function may return a dictionary of `Tensors`, which will be
+      written to logs and as TensorBoard summaries.
+    """
+    pass
+
+
+@six.add_metaclass(abc.ABCMeta)
+class StandardRunnableWithWarmup(StandardTrainable, StandardEvaluable):
+  """Train-and-eval runnable that adds a warmup phase on synthetic data."""
+
+  def __init__(self, use_tf_while_loop=True, use_tf_function=True):
+    """Initializes both parent classes and the warmup state.
+
+    Args:
+      use_tf_while_loop: Forwarded to `StandardTrainable.__init__`.
+      use_tf_function: Forwarded to both `StandardTrainable.__init__` and
+        `StandardEvaluable.__init__`.
+    """
+    # Each parent initializer is invoked explicitly, in this order.
+    StandardTrainable.__init__(self, use_tf_while_loop, use_tf_function)
+    StandardEvaluable.__init__(self, use_tf_function)
+
+    # Created lazily on the first call to `warmup`.
+    self.warmup_iter = None
+    self.warmup_dataset = None
+
+  @abc.abstractmethod
+  def build_synthetic_dataset(self):
+    """Creates the synthetic dataset(s) consumed during warmup.
+
+    Returns:
+      A tf.nest-compatible structure of tf.data.Dataset or DistributedDataset.
+    """
+
+  def warmup(self,
+             num_steps: Optional[tf.Tensor]) -> Optional[Dict[Text, tf.Tensor]]:
+    """Warms up the device by running train and eval loops on synthetic data.
+
+    This loop runs the input pipeline on synthetic data before training,
+    thereby allowing XLA compilation and tf.function tracing before the real
+    dataset is accessed. The training loop function runs first, followed by
+    the evaluation loop function; both consume the same warmup iterator.
+
+    Args:
+      num_steps: A guideline for how many steps to run in each of the two
+        loops. Note that it is up to the model what constitutes a "step" (this
+        may involve more than one update to model parameters, e.g. if training
+        a GAN).
+
+    Returns:
+      Whatever `warmup_loop_end()` returns; this may be a dictionary of
+      `Tensors`, which will be written to logs and as TensorBoard summaries.
+    """
+    # NOTE(review): `warmup_loop_begin` and `warmup_loop_end` are not defined
+    # in the visible part of this class -- confirm where they are provided.
+    if self.warmup_dataset is None:
+      # Lazily create the synthetic input pipeline and its iterator(s).
+      self.warmup_dataset = self.build_synthetic_dataset()
+      self.warmup_iter = tf.nest.map_structure(iter, self.warmup_dataset)
+
+    self._initialize_train_fn()
+    self._initialize_eval_fn()
+
+    self.warmup_loop_begin()
+    self.train_loop_fn(self.warmup_iter, num_steps)
+    self.eval_loop_fn(self.warmup_iter, num_steps)
+    warmup_result = self.warmup_loop_end()
+    return warmup_result
--- a/tf2_common/training/utils.py
+++ b/tf2_common/training/utils.py
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Some layered modules/functions to help users writing custom training loop."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+import inspect
+import six
+
+import tensorflow as tf
+
+
+def create_loop_fn(step_fn):
+  """Wraps `step_fn` in a multi-step loop driven by a Python `while` loop.
+
+  Args:
+    step_fn: A function which takes `iterator` as input.
+
+  Returns:
+    A callable with the signature of the `loop_fn` defined below.
+  """
+
+  def loop_fn(iterator, num_steps, state=None, reduce_fn=None):
+    """Calls `step_fn` repeatedly, optionally folding its outputs into a state.
+
+    Args:
+      iterator: A nested structure of tf.data `Iterator` or
+        `DistributedIterator`.
+      num_steps: The number of steps in the loop. If `num_steps==-1`, the loop
+        keeps going until the iterator is exhausted.
+      state: An optional initial state before running the loop.
+      reduce_fn: A callable defined as `def reduce_fn(state, value)`, where
+        `value` is the output of `step_fn`. When `None`, `state` is returned
+        unchanged.
+
+    Returns:
+      The updated state.
+    """
+    completed = 0
+    try:
+      while True:
+        if not (num_steps == -1 or completed < num_steps):
+          break
+        step_output = step_fn(iterator)
+        if reduce_fn is not None:
+          state = reduce_fn(state, step_output)
+        completed += 1
+    except (StopIteration, tf.errors.OutOfRangeError):
+      # Running out of input ends the loop early; the state accumulated so far
+      # is still returned below.
+      pass
+    return state
+
+  return loop_fn
+
+
+def create_tf_while_loop_fn(step_fn):
+  """Create a multiple steps function driven by tf.while_loop on the host.
+
+  Args:
+    step_fn: A function which takes `iterator` as input.
+
+  Returns:
+    A callable defined as the `loop_fn` definition below. It is wrapped in
+    `tf.function`.
+  """
+
+  @tf.function
+  def loop_fn(iterator, num_steps):
+    """A loop function with multiple steps.
+
+    Args:
+      iterator: A nested structure of tf.data `Iterator` or
+        `DistributedIterator`.
+      num_steps: The number of steps in the loop. Must be a tf.Tensor.
+
+    Raises:
+      ValueError: If `num_steps` is not a `tf.Tensor`.
+    """
+    # Reject Python numbers up front: as the error message says, passing a
+    # Python object for `num_steps` may cause retracing.
+    if not isinstance(num_steps, tf.Tensor):
+      raise ValueError("`num_steps` should be an `tf.Tensor`. Python object "
+                       "may cause retracing.")
+
+    # The outputs of `step_fn` are discarded; nothing is returned.
+    for _ in tf.range(num_steps):
+      step_fn(iterator)
+
+  return loop_fn
+
+
+def make_distributed_dataset(strategy, dataset_or_fn, *args, **kwargs):
+  """Creates a distributed dataset from a dataset or a dataset function.
+
+  Args:
+    strategy: An instance of `tf.distribute.Strategy`. If `None`, the strategy
+      returned by `tf.distribute.get_strategy()` is used.
+    dataset_or_fn: An instance of `tf.data.Dataset` or a function which returns
+      a `tf.data.Dataset`. If it is a function, it may optionally declare a
+      parameter named `input_context` (positional-or-keyword, or keyword-only
+      under Python 3); that parameter then receives the
+      `tf.distribute.InputContext` supplied by the strategy.
+    *args: Positional arguments to be passed to `dataset_or_fn`. Unused when
+      `dataset_or_fn` is a `tf.data.Dataset`.
+    **kwargs: Keyword arguments to be passed to `dataset_or_fn`. Unused when
+      `dataset_or_fn` is a `tf.data.Dataset`.
+
+  Returns:
+    A distributed Dataset.
+
+  Raises:
+    ValueError: If `dataset_or_fn` is neither a `tf.data.Dataset` nor callable.
+  """
+  if strategy is None:
+    strategy = tf.distribute.get_strategy()
+
+  if isinstance(dataset_or_fn, tf.data.Dataset):
+    return strategy.experimental_distribute_dataset(dataset_or_fn)
+
+  if not callable(dataset_or_fn):
+    raise ValueError("`dataset_or_fn` should be either callable or an instance "
+                     "of `tf.data.Dataset`")
+
+  def dataset_fn(ctx):
+    """Builds the dataset, forwarding `ctx` as `input_context` if accepted."""
+    if six.PY3:
+      argspec = inspect.getfullargspec(dataset_or_fn)
+    else:
+      argspec = inspect.getargspec(dataset_or_fn)
+
+    # `getfullargspec` reports keyword-only parameters separately from `args`;
+    # the Python 2 argspec has no such field, hence the `getattr` fallback.
+    accepted_names = list(argspec.args)
+    accepted_names.extend(getattr(argspec, "kwonlyargs", None) or [])
+
+    # Work on a copy so that the keyword arguments captured by this closure
+    # are never modified, however many times this function is invoked.
+    call_kwargs = dict(kwargs)
+    if "input_context" in accepted_names:
+      call_kwargs["input_context"] = ctx
+    return dataset_or_fn(*args, **call_kwargs)
+
+  return strategy.experimental_distribute_datasets_from_function(dataset_fn)
+
+
+class SummaryManager(object):
+  """Writes batches of summaries, optionally throttled by a step interval."""
+
+  def __init__(self,
+               summary_writer,
+               summary_fn,
+               global_step=None,
+               summary_interval=None):
+    """Constructs a summary manager.
+
+    Args:
+      summary_writer: A `tf.summary.SummaryWriter` instance for writing
+        summaries. If `None`, a no-op writer is used and the manager never
+        writes anything.
+      summary_fn: A callable defined as `def summary_fn(name, tensor,
+        step=None)`, which describes the summary operation.
+      global_step: A `tf.Variable` instance for checking the current global step
+        value, in case users want to save summaries every N steps. If `None`,
+        the value of `tf.summary.experimental.get_step()` is used.
+      summary_interval: An integer, indicates the minimum step interval between
+        two summaries.
+
+    Raises:
+      ValueError: If `summary_interval` is given but no global step is
+        available.
+    """
+    self._enabled = summary_writer is not None
+    if self._enabled:
+      self._summary_writer = summary_writer
+    else:
+      self._summary_writer = tf.summary.create_noop_writer()
+    self._summary_fn = summary_fn
+
+    self._global_step = (
+        tf.summary.experimental.get_step()
+        if global_step is None else global_step)
+
+    if summary_interval is not None:
+      if self._global_step is None:
+        raise ValueError("`summary_interval` is not None, but no `global_step` "
+                         "can be obtained ")
+      # Only tracked when an interval is configured; `write_summaries` reads
+      # it under the same condition.
+      self._last_summary_step = self._global_step.numpy()
+    self._summary_interval = summary_interval
+
+  @property
+  def summary_interval(self):
+    """Returns the configured summary interval (may be `None`)."""
+    return self._summary_interval
+
+  @property
+  def summary_writer(self):
+    """Returns the summary writer in use."""
+    return self._summary_writer
+
+  def write_summaries(self, items, always_write=True):
+    """Writes a batch of summaries.
+
+    Args:
+      items: A dictionary mapping summary names to `Tensors`.
+      always_write: An optional boolean. If `True`, the manager will always
+        write summaries unless the summaries have been written for the same
+        step. Otherwise the manager only writes once at least
+        `summary_interval` steps have passed since the last write. Both checks
+        apply only when a `summary_interval` was configured.
+
+    Returns:
+      A boolean indicating whether the summaries were written.
+    """
+    # TODO(rxsang): Support writing summaries with nested structure, so users
+    # can split the summaries into different directories for nicer visualization
+    # in Tensorboard, like train and eval metrics.
+    if not self._enabled:
+      return False
+
+    interval = self._summary_interval
+    if interval is not None:
+      step_now = self._global_step.numpy()
+      previous_step = self._last_summary_step
+      if step_now == previous_step:
+        # Summaries were already written for this step.
+        return False
+      if not always_write and step_now < (previous_step + interval):
+        return False
+      self._last_summary_step = step_now
+
+    with self._summary_writer.as_default():
+      for summary_name, summary_value in items.items():
+        self._summary_fn(summary_name, summary_value, step=self._global_step)
+    return True
+
+
+@six.add_metaclass(abc.ABCMeta)
+class Trigger(object):
+  """Abstract interface for an object that decides when an event fires."""
+
+  @abc.abstractmethod
+  def __call__(self, value: float, force_trigger=False):
+    """Decides whether the event fires for the given value.
+
+    Args:
+      value: The value the decision is based on.
+      force_trigger: Whether triggering is being forced.
+
+    Returns:
+      `True` if the trigger fires for the given `value`, and `False`
+      otherwise.
+    """
+
+  @abc.abstractmethod
+  def reset(self):
+    """Resets the internal state of the trigger."""
+
+
+class IntervalTrigger(Trigger):
+  """Fires each time the value advances by a fixed interval."""
+
+  def __init__(self, interval, start=0):
+    """Constructs the IntervalTrigger.
+
+    Args:
+      interval: The triggering interval. A falsy or non-positive interval
+        disables interval-based triggering; forced triggering still works.
+      start: The value treated as the last trigger value initially.
+    """
+    self._last_trigger_value = start
+    self._interval = interval
+
+  def __call__(self, value, force_trigger=False):
+    """Decides whether the event fires for the given value.
+
+    Args:
+      value: The value the decision is based on.
+      force_trigger: If True, the trigger fires regardless of the interval,
+        unless `value` equals the last trigger value.
+
+    Returns:
+      `True` if the trigger fires for the given `value`, and `False`
+      otherwise. Whenever it fires, `value` becomes the last trigger value.
+    """
+    fired = bool(force_trigger and value != self._last_trigger_value)
+    if not fired and self._interval and self._interval > 0:
+      fired = bool(value >= self._last_trigger_value + self._interval)
+    if fired:
+      self._last_trigger_value = value
+    return fired
+
+  def reset(self):
+    """See base class."""
+    # NOTE(review): this resets to 0 rather than to the `start` value given to
+    # the constructor -- confirm that this is intended.
+    self._last_trigger_value = 0
+
+
+class EpochHelper(object):
+  """Tracks epoch boundaries for a customized training loop."""
+
+  def __init__(self, epoch_steps, global_step):
+    """Constructs the EpochHelper.
+
+    Args:
+      epoch_steps: An integer number of steps that make up one epoch.
+      global_step: A `tf.Variable` instance holding the current global step;
+        its value is read with `.numpy()`.
+    """
+    self._global_step = global_step
+    self._epoch_steps = epoch_steps
+    # No epoch is active until `epoch_begin` is called.
+    self._in_epoch = False
+    self._current_epoch = None
+    self._epoch_start_step = None
+
+  def epoch_begin(self):
+    """Starts a new epoch unless one is already in progress.
+
+    Returns:
+      `True` if a new epoch was started by this call, `False` if an epoch was
+      already in progress.
+    """
+    if self._in_epoch:
+      return False
+    step_now = self._global_step.numpy()
+    self._epoch_start_step = step_now
+    # The epoch index is derived from the global step at the time of the call.
+    self._current_epoch = step_now // self._epoch_steps
+    self._in_epoch = True
+    return True
+
+  def epoch_end(self):
+    """Checks whether the current epoch is over, and leaves it if so.
+
+    Returns:
+      `True` if the global step has moved into a later epoch than the one
+      recorded by `epoch_begin`, `False` otherwise.
+
+    Raises:
+      ValueError: If no epoch is in progress.
+    """
+    if not self._in_epoch:
+      raise ValueError("`epoch_end` can only be called inside an epoch")
+    epoch_now = self._global_step.numpy() // self._epoch_steps
+    finished = bool(epoch_now > self._current_epoch)
+    if finished:
+      self._in_epoch = False
+    return finished
+
+  @property
+  def batch_index(self):
+    """Index of the next batch within the current epoch."""
+    steps_into_epoch = self._global_step.numpy() - self._epoch_start_step
+    return steps_into_epoch
+
+  @property
+  def current_epoch(self):
+    """Epoch index recorded by the latest `epoch_begin` that started an epoch."""
+    return self._current_epoch
--- a/tf2_common/utils/flags/__init__.py
+++ b/tf2_common/utils/flags/__init__.py
--- a/tf2_common/utils/flags/__pycache__/__init__.cpython-36.pyc
+++ b/tf2_common/utils/flags/__pycache__/__init__.cpython-36.pyc
--- a/tf2_common/utils/flags/__pycache__/__init__.cpython-38.pyc
+++ b/tf2_common/utils/flags/__pycache__/__init__.cpython-38.pyc
--- a/tf2_common/utils/flags/__pycache__/_base.cpython-36.pyc
+++ b/tf2_common/utils/flags/__pycache__/_base.cpython-36.pyc
--- a/tf2_common/utils/flags/__pycache__/_base.cpython-38.pyc
+++ b/tf2_common/utils/flags/__pycache__/_base.cpython-38.pyc
--- a/tf2_common/utils/flags/__pycache__/_benchmark.cpython-36.pyc
+++ b/tf2_common/utils/flags/__pycache__/_benchmark.cpython-36.pyc
--- a/tf2_common/utils/flags/__pycache__/_benchmark.cpython-38.pyc
+++ b/tf2_common/utils/flags/__pycache__/_benchmark.cpython-38.pyc
--- a/tf2_common/utils/flags/__pycache__/_conventions.cpython-36.pyc
+++ b/tf2_common/utils/flags/__pycache__/_conventions.cpython-36.pyc
--- a/tf2_common/utils/flags/__pycache__/_conventions.cpython-38.pyc
+++ b/tf2_common/utils/flags/__pycache__/_conventions.cpython-38.pyc
--- a/tf2_common/utils/flags/__pycache__/_device.cpython-36.pyc
+++ b/tf2_common/utils/flags/__pycache__/_device.cpython-36.pyc
--- a/tf2_common/utils/flags/__pycache__/_device.cpython-38.pyc
+++ b/tf2_common/utils/flags/__pycache__/_device.cpython-38.pyc