Internal change

PiperOrigin-RevId: 335446217

Internal change
PiperOrigin-RevId: 335446217
2d342592 · Dan Holtmann-Rice · A. Unique TensorFlower · 3a9ed6bd · 2d342592 · 2d342592
Commit 2d342592 authored Oct 05, 2020 by Dan Holtmann-Rice Committed by A. Unique TensorFlower Oct 05, 2020
12 changed files
--- a/orbit/README.md
+++ b/orbit/README.md
-![TensorFlow Requirement: 2.x](https://img.shields.io/badge/TensorFlow%20Requirement-2.x-brightgreen)
-
 # Orbit

-Orbit is a customized training loop library built on top of Tensorflow 2. It
-provides a flexible lightweight library that users can easily use or fork when
-writing [customized training loop code](https://www.tensorflow.org/tutorials/distribute/custom_training)
-in TF2. It intergates with `tf.distribute` seamlessly and supports running on
-different device types (CPU, GPU, and TPU).
+Orbit is a flexible, lightweight library designed to make it easy to write
+[custom training loops][custom_training] in TensorFlow 2. Orbit handles common
+model training tasks such as saving checkpoints, running model evaluations, and
+setting up summary writing, while giving users full control over implementing
+the inner training loop. It integrates with `tf.distribute` seamlessly and
+supports running on different device types (CPU, GPU, and TPU). The core code is
+intended to be easy to read and fork.
+
+See our [g3doc](g3doc) at go/orbit-trainer for additional documentation.
+
+[custom_training]: https://www.tensorflow.org/tutorials/distribute/custom_training
--- a/orbit/__init__.py
+++ b/orbit/__init__.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Defines exported symbols for `orbit` package."""
+"""Defines exported symbols for the `orbit` package."""

 from orbit import utils


--- a/orbit/controller.py
+++ b/orbit/controller.py
@@ -12,14 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""A light weight utilities to train TF2 models."""
+"""Provides a `Controller` class for managing the outer training loop."""

+import pprint
 import time

-from typing import Callable, Dict, Optional, Text, Union
+from typing import Callable, Optional, Union

 from absl import logging
-import numpy as np

 from orbit import runner
 from orbit import utils
@@ -27,14 +27,50 @@ from orbit import utils
 import tensorflow as tf


-def _log_info(message: Text):
+def _log(message: str):
  """Logs `message` to the `info` log, and also prints to stdout."""
  logging.info(message)
  print(message)


+logging.ABSLLogger.register_frame_to_skip(__file__, _log.__name__)
+
+
+def _format_output(output, indent=4):
+  """Renders `output` as text suitable for embedding in a log message.
+
+  Args:
+    output: The object to render; it is formatted with `pprint.pformat`.
+    indent: Number of spaces placed in front of every line when the rendering
+      spans more than one line.
+
+  Returns:
+    The `pprint` rendering unchanged when it consists of exactly one line.
+    Otherwise a string that starts with a newline, followed by every line of
+    the rendering prefixed with `indent` spaces.
+  """
+  text = pprint.pformat(output)
+  rows = text.splitlines()
+  if len(rows) != 1:
+    padding = " " * indent
+    return "\n" + "\n".join(padding + row for row in rows)
+  return text
+
+
 class Controller:
-  """Class that facilitates training and evaluation of models."""
+  """Class that controls the outer loop of model training and evaluation.
+
+  Orbit divides training and evaluation into "inner" and "outer" loops. Inner
+  loops are implemented by users in the form of `AbstractTrainer` and
+  `AbstractEvaluator` subclasses, and define how to run a given number of
+  training or evaluation steps. The outer loop is provided by this `Controller`,
+  and interleaves calls to the user provided inner loops with additional actions
+  such as saving checkpoints, running evaluations, and writing summaries
+  (depending on the arguments passed to `Controller.__init__` and the method
+  being called).
+
+  There are four top-level "outer loops" provided:
+
+    - `train`, which trains until a specified number of global steps is reached;
+    - `evaluate`, for one-off model evaluation;
+    - `train_and_evaluate`, for interleaved training and evaluation;
+    - `evaluate_continuously`, for monitoring a given directory and running
+      evaluations on new model checkpoints.
+
+  While this class attempts to provide out-of-the-box solutions for common
+  training and evaluation use cases, the internal details and method
+  implementations are also intended to be simple enough to make subclassing or
+  other custom outer loop implementations easy to achieve.
+  """

  def __init__(
      self,
@@ -47,63 +83,82 @@ class Controller:
      checkpoint_manager: Optional[tf.train.CheckpointManager] = None,
      # Summary related
      summary_interval: Optional[int] = None,
-      summary_dir: Optional[Text] = None,
+      summary_dir: Optional[str] = None,
      # Evaluation related
-      eval_summary_dir: Optional[Text] = None):
-    """Constructs a `Controller` instance.
+      eval_summary_dir: Optional[str] = None):
+    """Initializes a `Controller` instance.
+
+    Note that if `checkpoint_manager` is provided and there are checkpoints in
+    the associated model directory, the model will be restored from the most
+    recent checkpoint during this `__init__` method.

    Args:
-      strategy: An instance of `tf.distribute.Strategy`.
-      trainer: An instance of `orbit.AbstractTrainer`, which represents model
-        training details.
-      evaluator: An instance of `orbit.AbstractEvaluator`, which represents
-        model evaluation details.
-      global_step: An integer `tf.Variable` indicating the global training step
-        number. Usually this can be obtained from `iterations` property of the
-        model's optimizer (e.g. `self.optimizer.iterations`), or users can
-        create their own global step variable as well. If the users create their
-        own global step variable, it is recommended to create the `tf.Variable`
-        inside strategy scope, and with
-        `aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA`.
-      steps_per_loop: The number of steps to run in each "inner loop" of
-        training (passed to the `num_steps` parameter of `trainer.train`).
-      checkpoint_manager: An instance of `tf.train.CheckpointManager`.
+      strategy: An instance of `tf.distribute.Strategy`. If not provided, the
+        strategy will be initialized from the current in-scope strategy using
+        `tf.distribute.get_strategy()`.
+      trainer: An instance of `orbit.AbstractTrainer`, which implements the
+        inner training loop.
+      evaluator: An instance of `orbit.AbstractEvaluator`, which implements
+        evaluation.
+      global_step: An integer `tf.Variable` storing the global training step
+        number. Usually this can be obtained from the `iterations` property of
+        the model's optimizer (e.g. `trainer.optimizer.iterations`). In cases
+        where multiple optimizers are used, or if one model "step" corresponds
+        to more than one update to model parameters, users can create and
+        increment their own global step variable as well. In this case it is
+        recommended to create the `tf.Variable` inside the distribution strategy
+        scope, with `aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA` (see
+        also `orbit.utils.create_global_step()`).
+      steps_per_loop: The number of steps to run in each inner loop of training
+        (passed as the `num_steps` parameter of `trainer.train`).
+      checkpoint_manager: An instance of `tf.train.CheckpointManager`. If
+        provided and there are checkpoints in the associated model directory,
+        the model will be restored from the most recent checkpoint inside this
+        `__init__` method. If not provided, the `Controller` will not
+        automatically save to or restore from checkpoints.
      summary_interval: Step interval for training summaries. Note that this
-        argument only applies to the summaries inside `trainer.train` function.
-        Summaries outside like "steps_per_second" and outputs from
-        `trainer.train` function will always be enabled. If set, the value
-        should be divisible by steps_per_loop.
-      summary_dir: The directory to restore and write checkpoints and summaries.
-        For example, You can set it to `checkpoint_manager.directory`.
-        If None, it will not write training summarizes.
-      eval_summary_dir: The directory to write eval summaries. If None, it will
-        be set to `summary_dir`. If both `summary_dir` and `eval_summary_dir`
-        are None, it will not write evaluation summarizes.
+        argument only applies to `tf.summary` calls inside the `trainer.train`
+        function. Summaries written by the `Controller` (specifically
+        "steps_per_second" and output from the `trainer.train` method) will
+        always be enabled unless the `summary_dir` parameter is `None`. If set,
+        the value must be divisible by `steps_per_loop`.
+      summary_dir: The directory to write summaries to. To use the same
+        directory as for checkpointing, pass `checkpoint_manager.directory`. If
+        `None`, no training summaries will be written.
+      eval_summary_dir: The directory to write eval summaries to. If `None`, it
+        will be set to `summary_dir`. If both `summary_dir` and
+        `eval_summary_dir` are `None`, no eval summaries will be written.

    Raises:
-      ValueError: If both `trainer` and `evaluator` are None.
+      ValueError: If both `trainer` and `evaluator` are `None`.
      ValueError: If `steps_per_loop` is not a positive integer.
-      ValueError: If `summary_interval` is not a positive integer or it cannot
-        be divisible by `steps_per_loop`.
+      ValueError: If `summary_interval` is not a positive integer or is not
+        divisible by `steps_per_loop`.
    """
    if trainer is None and evaluator is None:
-      raise ValueError("`trainer` and `evaluator` should not both be None")
+      raise ValueError("`trainer` and `evaluator` should not both be `None`.")

    if trainer is not None:
      if steps_per_loop is None:
-        raise ValueError("`steps_per_loop` is required when `trainer` is "
-                         "provided.")
-
-      if not isinstance(steps_per_loop, int) or steps_per_loop < 1:
-        raise ValueError("`steps_per_loop` should be a positive integer")
+        raise ValueError(
+            "`steps_per_loop` is required when `trainer` is provided.")
+      elif not isinstance(steps_per_loop, int) or steps_per_loop < 1:
+        raise ValueError(
+            f"`steps_per_loop` ({steps_per_loop}) must be a positive integer.")

      if summary_interval is not None:
        if summary_interval <= 0:
-          raise ValueError("`summary_interval` should be larger than 0")
-        if summary_interval % steps_per_loop != 0:
-          raise ValueError("The summary interval ({}) must be a multiple "
-                           "of the steps_per_loop ({})".format(
-                               summary_interval, steps_per_loop))
+          raise ValueError(
+              f"`summary_interval` ({summary_interval}) must be larger than 0.")
+        elif summary_interval % steps_per_loop != 0:
+          raise ValueError(
+              f"`summary interval` ({summary_interval}) must be a multiple "
+              f"of `steps_per_loop` ({steps_per_loop}).")
+
+    if global_step is None:
+      raise ValueError("`global_step` is required.")
+    elif not isinstance(global_step, tf.Variable):
+      raise ValueError("`global_step` must be a `tf.Variable`.")

    self.trainer = trainer
    self.evaluator = evaluator
@@ -136,157 +191,129 @@ class Controller:
    # Restores the model if needed.
    # TODO(momernick): We probably only want to do this on certain occasions?
    if self.checkpoint_manager is not None:
-      checkpoint_interval = self.checkpoint_manager.checkpoint_interval
      restored_path = self.restore_checkpoint()
      if restored_path:
-        logging.info("Restored from checkpoint: %s", restored_path)
+        _log(f"restored from checkpoint: {restored_path}")

  def train(self, steps: int, checkpoint_at_completion: bool = True):
-    """Runs training.
+    """Runs training until the specified global step count has been reached.

-    This method calls the `train` method on the Trainable object until the
-    global step count is equal to `steps`. It will optionally save checkpoints,
-    if a CheckpointManager was passed to the Controller instance's `__init__`.
+    This method makes calls to `self.trainer.train()` until the global step
+    count is equal to `steps`. It will additionally save checkpoints (if a
+    `CheckpointManager` was passed to `Controller.__init__`) and summarize
+    training output (if `summary_dir` is set).

    Args:
      steps: The global step count to train up to.
      checkpoint_at_completion: Whether to save a checkpoint when this method
-        returns. Defaults to True (write the checkpoint). This is always
-        triggered, regardless of the checkpointing interval.
+        returns (regardless of the checkpointing interval). Defaults to `True`.
    """
-    if self.trainer is None:
-      raise ValueError("`self.trainer` is required when calling `train` "
-                       "method.")
-    if self.global_step is None:
-      raise ValueError("`self.global_step` is required when calling `train` "
-                       "method.")
+    self._require("trainer", for_method="train")

    # TODO(momernick): Support steps=None or -1 (training to exhaustion).
-    current_step = self.global_step.numpy()  # This is an expensive access.
+    current_step = self.global_step.numpy()  # Cache, since this is expensive.
+    _log(f"train | step: {current_step: 6d} | training until step {steps}...")
    while current_step < steps:
-      logging.info("Train at step %s of %s", current_step, steps)
      # Calculates steps to run for the next train loop.
      num_steps = min(steps - current_step, self.steps_per_loop)
      self._train_n_steps(num_steps)
      self._maybe_save_checkpoint()
-      current_step = self.global_step.numpy()  # This is an expensive access.
+      current_step = self.global_step.numpy()

    if checkpoint_at_completion:
-      self.save_checkpoint()
+      self._maybe_save_checkpoint(check_interval=False)

-  def evaluate(self, steps: int = None) -> Optional[Dict[Text, np.number]]:
-    """Runs evaluation.
+  def evaluate(self, steps: int = -1) -> Optional[runner.Output]:
+    """Runs evaluation for the given number of steps.

-    This method calls the `evaluate` method on the Evaluator object for `steps`
-    steps, then writes the returned summaries (if any).
+    This method calls `self.evaluator.evaluate(steps)`, then writes the returned
+    summaries (if any).

    Args:
-      steps: The number of steps to evaluate for.
+      steps: The number of evaluation steps to run. The value `-1` is reserved
+        as a special sentinel to indicate a "complete" evaluation that runs
+        until the underlying dataset is exhausted. Support for this is dependent
+        on the specific `evaluator` being used. For backward compatibility,
+        `None` (the previous default) is treated the same as `-1`.

    Returns:
-      The evaluation results as a dictionary of numpy values.
+      The evaluation results as a dictionary mapping names to NumPy values.

    Raises:
-      ValueError: If no checkpoint found in `self.checkpoint_manager.directory`.
-      ValueError: If `evaluator` is not provided.
+      ValueError: If `evaluator` was not provided to `Controller.__init__`.
+      ValueError: If no checkpoint is present in `checkpoint_manager.directory`.
+      ValueError: If `steps` is not a positive value or -1.
    """
-    if self.evaluator is None:
-      raise ValueError("`evaluator` must be provided to call `evaluate()` "
-                       "method.")
+    self._require("evaluator", for_method="evaluate")
+
+    # Callers may still pass `None` (the default before `-1` became the
+    # sentinel). Normalize it here; otherwise `steps > 0` below raises
+    # `TypeError` instead of running a complete evaluation.
+    if steps is None:
+      steps = -1

-    steps = steps or -1
-    current_step = self.global_step.numpy()
    if steps > 0:
-      logging.info("Running %s steps of evaluation at train step: %s", steps,
-                   current_step)
-      steps = tf.convert_to_tensor(steps, dtype=tf.int32)
+      steps_msg = f"running {steps} steps of evaluation..."
+    elif steps == -1:
+      steps_msg = "running complete evaluation..."
    else:
-      logging.info("Evaluating at train step: %s", current_step)
+      raise ValueError(f"`steps` ({steps}) should be > 0, or == -1.")

-    with self.eval_summary_manager.summary_writer().as_default():
-      eval_outputs = self.evaluator.evaluate(steps)
+    current_step = self.global_step.numpy()
+    _log(f" eval | step: {current_step: 6d} | {steps_msg}")

-    if eval_outputs:
-      eval_outputs = tf.nest.map_structure(utils.get_value, eval_outputs)
+    start = time.time()
+    with self.eval_summary_manager.summary_writer().as_default():
+      steps_tensor = tf.convert_to_tensor(steps, dtype=tf.int32)
+      eval_output = self.evaluator.evaluate(steps_tensor)
+    eval_output = tf.nest.map_structure(utils.get_value, eval_output or {})
+    elapsed = time.time() - start

-    info = "step: {}        evaluation metric: {}".format(
-        current_step, eval_outputs)
-    _log_info(info)
+    _log(f" eval | step: {current_step: 6d} | "
+         f"eval time: {elapsed: 6.1f} | "
+         f"output: {_format_output(eval_output)}")

-    self.eval_summary_manager.write_summaries(eval_outputs)
+    self.eval_summary_manager.write_summaries(eval_output)
    self.eval_summary_manager.flush()

-    return eval_outputs
-
-  def restore_checkpoint(self, checkpoint_path: Text = None):
-    """Restore or initialize the model.
-
-    Args:
-      checkpoint_path: An optional string indicates the checkpoint path to
-        restore. If None, will restore from `self.checkpoint_manager`.
-
-    Returns:
-      The path to the restored checkpoint if a restore happened, or None
-        if no restore occurred.
-    """
-    with self.strategy.scope():
-      # Checkpoint restoring should be inside scope. b/139450638
-      if checkpoint_path is not None:
-        self.checkpoint_manager.checkpoint.restore(checkpoint_path)
-        return checkpoint_path
-      return self.checkpoint_manager.restore_or_initialize()
-
-  def save_checkpoint(self):
-    """Checkpoint the model.
-
-    This method will write a checkpoint containing the current state of the
-    model.
-
-    Raises:
-      ValueError: if no CheckpointManager was provided to this Controller's
-        init args.
-    """
-    self._maybe_save_checkpoint(force_trigger=True)
+    return eval_output

  def train_and_evaluate(self,
                         train_steps: int = None,
                         eval_steps: int = None,
                         eval_interval: int = None):
-    """Train and evaluate in an interleaved manner.
+    """Runs interleaved training and evaluation.

-    This method will train the model until the global step count equals
-    `train_steps`, running an evaluation for `eval_steps` every `eval_interval`
-    training steps. In addition, this method will run a final evaluation at the
-    end of the training sequence.
+    This method interleaves calls to `self.train()` and `self.evaluate()`,
+    training the model until the global step count equals `train_steps`, and
+    running an evaluation for `eval_steps` every `eval_interval` training steps.
+    In addition, this method will run a final evaluation at the end of the
+    training sequence.

    Args:
      train_steps: The global step count to train up to.
-      eval_steps: The number of steps to run during an evaluation. If None,
-        this method will evaluate over the entire evaluation dataset.
-      eval_interval: The number of training steps to run between evaluations.
-        If set, training will always stop every `eval_interval` steps, even if
-        this results in a shorter inner loop than specified by `steps_per_loop`
+      eval_steps: The number of steps to run during an evaluation. If None, this
+        method will evaluate over the entire evaluation dataset.
+      eval_interval: The number of training steps to run between evaluations. If
+        set, training will always stop every `eval_interval` steps, even if this
+        results in a shorter inner loop than specified by `steps_per_loop`
        setting. If None, evaluation will only be performed after training is
        complete.

    Raises:
      ValueError: If eval_interval is not a multiple of self.steps_per_loop.
+      ValueError: If `trainer` or `evaluator` was not provided to
+        `Controller.__init__`.
    """
-    current_step = self.global_step.numpy()  # This is an expensive access.
+    self._require("trainer", for_method="train_and_evaluate")
+    self._require("evaluator", for_method="train_and_evaluate")
+
+    if eval_steps is None:
+      # `evaluate` uses `-1`, not `None`, as its "evaluate the entire dataset"
+      # sentinel and compares `steps > 0`; forwarding `None` would raise
+      # `TypeError` rather than run the complete evaluation documented above.
+      eval_steps = -1
+
+    current_step = self.global_step.numpy()  # Cache, since this is expensive.
    eval_interval = eval_interval or (train_steps - current_step)
    while current_step < train_steps:
      interval = min(train_steps - current_step, eval_interval)
      num_steps = current_step + interval
      self.train(steps=num_steps, checkpoint_at_completion=False)
      self.evaluate(steps=eval_steps)
-      current_step = self.global_step.numpy()  # This is an expensive access.
-    self.save_checkpoint()
+      current_step = self.global_step.numpy()
+    self._maybe_save_checkpoint(check_interval=False)

  def evaluate_continuously(self,
                            steps: int = None,
                            timeout: Optional[Union[int, float]] = None,
                            timeout_fn: Optional[Callable[[], bool]] = None):
-    """Monitor a directory and evaluate on checkpoints in it.
+    """Continuously monitors a directory and evaluates new checkpoints in it.

    This method continuously monitors a directory as specified by this
    Controller's CheckpointManager init arg and runs evaluation on the
@@ -303,8 +330,10 @@ class Controller:
    Raises:
      ValueError: If no checkpoint found in `self.checkpoint_manager.directory`.
      ValueError: If `evaluator` was not provided as a controller init arg.
-
    """
+    self._require("evaluator", for_method="evaluate_continuously")
+    self._require("checkpoint_manager", for_method="evaluate_continuously")
+
    for checkpoint_path in tf.train.checkpoints_iterator(
        self.checkpoint_manager.directory,
        timeout=timeout,
@@ -312,63 +341,108 @@ class Controller:
      self.restore_checkpoint(checkpoint_path)
      self.evaluate(steps)

+  def restore_checkpoint(self, checkpoint_path: str = None):
+    """Loads model state from a checkpoint through the checkpoint manager.
+
+    Args:
+      checkpoint_path: Path of the checkpoint to restore from. When `None`, the
+        choice is delegated to
+        `self.checkpoint_manager.restore_or_initialize()`.
+
+    Returns:
+      `checkpoint_path` when one was given; otherwise the value returned by
+      `restore_or_initialize()` (a `None` result is logged as
+      "initialized model.").
+
+    Raises:
+      ValueError: If `self.checkpoint_manager` is not set.
+    """
+    self._require("checkpoint_manager", for_method="restore_checkpoint")
+
+    manager = self.checkpoint_manager
+    # Restoring must happen under the distribution strategy scope (b/139450638).
+    with self.strategy.scope():
+      if checkpoint_path is None:
+        _log("restoring or initializing model...")
+        checkpoint_path = manager.restore_or_initialize()
+      else:
+        _log(f"restoring model from {checkpoint_path}...")
+        manager.checkpoint.restore(checkpoint_path)
+
+    outcome = ("initialized model." if checkpoint_path is None
+               else f"restored model from {checkpoint_path}.")
+    _log(outcome)
+
+    return checkpoint_path
+
+  def save_checkpoint(self):
+    """Saves the model to a checkpoint.
+
+    This method will save a checkpoint containing the current state of the
+    model.
+
+    Raises:
+      ValueError: If no `checkpoint_manager` was provided to
+        `Controller.__init__`.
+    """
+    self._require("checkpoint_manager", for_method="save_checkpoint")
+    # `check_interval=False` asks for a save regardless of how many steps have
+    # elapsed since the previous checkpoint.
+    # NOTE(review): `_maybe_save_checkpoint` additionally requires
+    # `checkpoint_manager.checkpoint_interval` to be truthy, so nothing is
+    # written (and no error is raised) when the manager has no interval
+    # configured -- confirm this is intended.
+    self._maybe_save_checkpoint(check_interval=False)
+
  def _train_n_steps(self, num_steps: int):
-    """Run training for `num_steps`.
+    """Runs training for `num_steps` steps.

-    It will also write training outputs to summaries if there is any.
+    Also prints/logs updates about training progress, and summarizes training
+    output (if output is returned from `self.trainer.train()`, and if
+    `self.summary_dir` is set).

    Args:
-      num_steps: An integer indicates how many steps to run for this training
-        loop.
+      num_steps: An integer specifying how many steps of training to run.

    Raises:
-      RuntimeError: If `global_step` is not updated correctly in
-        `trainer.train`.
+      RuntimeError: If `global_step` is not properly incremented by `num_steps`
+        after calling `self.trainer.train(num_steps)`.
    """
    if not self.step_timer:
      self.step_timer = StepTimer(self.global_step)
-
-    # Calculates steps to run for the next train loop.
    current_step = self.global_step.numpy()
-    logging.info("Entering training loop at step %s to run %s steps",
-                 current_step, num_steps)
-    current_step += num_steps
-    num_steps = tf.convert_to_tensor(num_steps, dtype=tf.int32)

    with self.summary_manager.summary_writer().as_default():
-      # Create a lambda that returns true when summaries should be written.
      should_record = False  # Allows static optimization in no-summary cases.
      if self.summary_interval:
+        # Create a predicate to determine when summaries should be written.
        should_record = lambda: (self.global_step % self.summary_interval == 0)
      with tf.summary.record_if(should_record):
-        train_outputs = self.trainer.train(num_steps)
-
-    # Updates and verifies the current step after a training loop finishes.
-    if current_step != self.global_step.numpy():
-      raise RuntimeError("`trainer.train` function is not updating "
-                         "`global_step` correctly, expected: %s, actual: %s" %
-                         (current_step, self.global_step.numpy()))
+        num_steps_tensor = tf.convert_to_tensor(num_steps, dtype=tf.int32)
+        train_output = self.trainer.train(num_steps_tensor)
+    train_output = tf.nest.map_structure(utils.get_value, train_output or {})
+
+    # Verify that global_step was updated properly, then update current_step.
+    expected_step = current_step + num_steps
+    if self.global_step.numpy() != expected_step:
+      raise RuntimeError(
+          f"`trainer.train({num_steps})` did not update `global_step` by "
+          f"{num_steps}. Old value was {current_step}, expected updated value "
+          f"to be {expected_step}, but it was {self.global_step.numpy()}.")
+    current_step = expected_step

-    # Print information like metrics and steps_per_second after a training
-    # loop.
-    if train_outputs:
-      train_outputs = tf.nest.map_structure(utils.get_value, train_outputs)
-
-    train_outputs = train_outputs or {}
    steps_per_second = self.step_timer.steps_per_second()
-    info = "step: {}        steps_per_second: {:.2f}        {}".format(
-        current_step, steps_per_second, train_outputs)
-    _log_info(info)
+    _log(f"train | step: {current_step: 6d} | "
+         f"steps/sec: {steps_per_second: 6.1f} | "
+         f"output: {_format_output(train_output)}")
+
+    train_output["steps_per_second"] = steps_per_second
+    self.summary_manager.write_summaries(train_output)
+    self.summary_manager.flush()

-    train_outputs["steps_per_second"] = steps_per_second
-    self.summary_manager.write_summaries(train_outputs)
+  def _maybe_save_checkpoint(self, check_interval: bool = True):
+    """Conditionally saves a checkpoint.

-  def _maybe_save_checkpoint(self, force_trigger: bool = False):
-    """Save checkpoints if necessary.
+    A checkpoint is saved if a `CheckpointManager` is available, and if the
+    required number of steps has elapsed since the last checkpoint was saved
+    (although this condition can be disabled by setting `check_interval=False`).

    Args:
-      force_trigger: A boolean indicates whether to force saving checkpoints
-        regardless of the checkpoint interval.
+      check_interval: Whether to check if the checkpoint interval has fully
+        elapsed. If `False`, a checkpoint is saved regardless of the elapsed
+        steps since the most recent checkpoint, unless no `checkpoint_manager`
+        was provided to `Controller.__init__`.

    Returns:
      A boolean indicating whether a checkpoint was saved.
@@ -376,12 +450,19 @@ class Controller:
    if self.checkpoint_manager and self.checkpoint_manager.checkpoint_interval:
      ckpt_path = self.checkpoint_manager.save(
          checkpoint_number=self.global_step.numpy(),
-          check_interval=not force_trigger)
+          check_interval=check_interval)
      if ckpt_path is not None:
-        logging.info("Saved checkpoints in %s", ckpt_path)
+        _log(f"saved checkpoint to {ckpt_path}.")
        return True
    return False

+  def _require(self, attribute, for_method):
+    """Raises `ValueError` unless the named attribute is set (not `None`).
+
+    Args:
+      attribute: Name of the attribute to look up on `self`; a missing
+        attribute is treated the same as one set to `None`.
+      for_method: Name of the calling method, used only in the error message.
+
+    Raises:
+      ValueError: If the attribute is missing or `None`.
+    """
+    if getattr(self, attribute, None) is not None:
+      return
+    raise ValueError(
+        f"`{attribute}` is not set. Pass `{attribute}` to "
+        f"`Controller.__init__` before calling `{for_method}()`.")
+

 class StepTimer:
  """Utility class for measuring steps/second."""

--- a/orbit/controller_test.py
+++ b/orbit/controller_test.py
@@ -15,10 +15,14 @@
 """Tests for orbit.controller."""

 import os
+
 from absl import logging
 from absl.testing import parameterized
+
 import numpy as np
+
 from orbit import controller
+from orbit import runner
 from orbit import standard_runner

 import tensorflow as tf
@@ -65,12 +69,8 @@ class TestRunner(standard_runner.StandardTrainer,
    self.train_loss = tf.keras.metrics.Mean("train_loss", dtype=tf.float32)
    self.eval_loss = tf.keras.metrics.Mean("eval_loss", dtype=tf.float32)
    self.return_numpy = return_numpy
-    train_dataset = (
-        self.strategy.experimental_distribute_datasets_from_function(dataset_fn)
-    )
-    eval_dataset = (
-        self.strategy.experimental_distribute_datasets_from_function(dataset_fn)
-    )
+    train_dataset = self.strategy.distribute_datasets_from_function(dataset_fn)
+    eval_dataset = self.strategy.distribute_datasets_from_function(dataset_fn)
    standard_runner.StandardTrainer.__init__(self, train_dataset)
    standard_runner.StandardEvaluator.__init__(self, eval_dataset)

@@ -95,8 +95,7 @@ class TestRunner(standard_runner.StandardTrainer,
    }

  def build_eval_dataset(self):
-    return self.strategy.experimental_distribute_datasets_from_function(
-        dataset_fn)
+    return self.strategy.distribute_datasets_from_function(dataset_fn)

  def eval_begin(self):
    self.eval_loss.reset_states()
@@ -125,8 +124,7 @@ class TestEvaluator(standard_runner.StandardEvaluator):
  def __init__(self):
    self.strategy = tf.distribute.get_strategy()
    self.model = create_model()
-    eval_dataset = self.strategy.experimental_distribute_datasets_from_function(
-        dataset_fn)
+    eval_dataset = self.strategy.distribute_datasets_from_function(dataset_fn)
    standard_runner.StandardEvaluator.__init__(self, eval_dataset)

  def eval_reduce(self, state, output):
@@ -157,16 +155,20 @@ class TestEvaluator(standard_runner.StandardEvaluator):
    }


+class TestEvaluatorNoOutput(runner.AbstractEvaluator):
+  """Evaluator stub whose `evaluate` does no work and returns `None`."""
+
+  def evaluate(self, num_steps):
+    # Intentionally empty: `num_steps` is ignored and `None` is returned.
+    pass
+
+
 class TestEvaluatorWithNestedSummary(standard_runner.StandardEvaluator):
  """Implements the training and evaluation APIs for the test model."""

  def __init__(self):
    self.strategy = tf.distribute.get_strategy()
    self.model = create_model()
-    dataset = self.strategy.experimental_distribute_datasets_from_function(
-        dataset_fn)
-    dataset2 = self.strategy.experimental_distribute_datasets_from_function(
-        dataset_fn)
+    dataset = self.strategy.distribute_datasets_from_function(dataset_fn)
+    dataset2 = self.strategy.distribute_datasets_from_function(dataset_fn)
    self.loss = tf.keras.metrics.Mean("loss", dtype=tf.float32)
    self.accuracy = tf.keras.metrics.CategoricalAccuracy(
        "accuracy", dtype=tf.float32)
@@ -217,9 +219,7 @@ class TestTrainerWithSummaries(standard_runner.StandardTrainer):
    self.optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.1)
    self.global_step = self.optimizer.iterations
    self.train_loss = tf.keras.metrics.Mean("train_loss", dtype=tf.float32)
-    train_dataset = (
-        self.strategy.experimental_distribute_datasets_from_function(dataset_fn)
-    )
+    train_dataset = self.strategy.distribute_datasets_from_function(dataset_fn)
    standard_runner.StandardTrainer.__init__(
        self,
        train_dataset,
@@ -227,8 +227,7 @@ class TestTrainerWithSummaries(standard_runner.StandardTrainer):
            use_tpu_summary_optimization=True))

  def build_train_dataset(self):
-    return self.strategy.experimental_distribute_datasets_from_function(
-        dataset_fn)
+    return self.strategy.distribute_datasets_from_function(dataset_fn)

  def train_step(self, iterator):

@@ -344,6 +343,26 @@ class ControllerTest(tf.test.TestCase, parameterized.TestCase):
    self.assertNotEmpty(tf.io.gfile.glob(
        os.path.join(self.model_dir, "summaries/eval/events.*")))

+  def test_restore_from_most_recent_checkpoint(self):
+    test_runner = TestRunner()
+    checkpoint = tf.train.Checkpoint(model=test_runner.model)
+    checkpoint_manager = tf.train.CheckpointManager(
+        checkpoint,
+        self.model_dir,
+        max_to_keep=None,
+        step_counter=test_runner.global_step,
+        checkpoint_interval=5)
+    test_controller = controller.Controller(
+        trainer=test_runner,
+        global_step=test_runner.global_step,
+        checkpoint_manager=checkpoint_manager,
+        eval_summary_dir=os.path.join(self.model_dir, "summaries/eval"),
+        steps_per_loop=5)
+    test_controller.train(20)
+    self.assertLen(checkpoint_manager.checkpoints, 4)
+    restored_path = test_controller.restore_checkpoint()
+    self.assertEqual(restored_path, checkpoint_manager.checkpoints[-1])
+
  @parameterized.named_parameters(("return_numpy", True),
                                  ("return_tensor", False))
  def test_train_and_evaluate(self, return_numpy):
@@ -601,7 +620,7 @@ class ControllerTest(tf.test.TestCase, parameterized.TestCase):

    self.assertLess(test_runner.global_step, 10)

-  def test_evaluate_with_loss_outputs(self):
+  def test_evaluate_with_loss_output(self):
    test_evaluator = TestEvaluator()

    checkpoint = tf.train.Checkpoint(model=test_evaluator.model)
@@ -622,6 +641,13 @@ class ControllerTest(tf.test.TestCase, parameterized.TestCase):
        summaries_with_matching_keyword(
            "eval_loss", os.path.join(self.model_dir, "summaries/eval")))

+  def test_evaluate_with_no_output(self):
+    test_controller = controller.Controller(
+        evaluator=TestEvaluatorNoOutput(),
+        global_step=tf.Variable(0, dtype=tf.int64),
+        eval_summary_dir=os.path.join(self.model_dir, "summaries/eval"))
+    self.assertEqual(test_controller.evaluate(steps=5), {})
+
  def test_train_and_evaluate_reset_datasets(self):
    test_runner = TestRunner()

@@ -635,11 +661,9 @@ class ControllerTest(tf.test.TestCase, parameterized.TestCase):
        train_steps=10, eval_steps=2, eval_interval=6)

    train_dataset = (
-        test_runner.strategy.experimental_distribute_datasets_from_function(
-            dataset_fn))
+        test_runner.strategy.distribute_datasets_from_function(dataset_fn))
    eval_dataset = (
-        test_runner.strategy.experimental_distribute_datasets_from_function(
-            dataset_fn))
+        test_runner.strategy.distribute_datasets_from_function(dataset_fn))
    test_runner.train_dataset = train_dataset
    test_runner.eval_dataset = eval_dataset


--- a/orbit/runner.py
+++ b/orbit/runner.py
@@ -12,62 +12,72 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""An abstraction that users can easily handle their custom training loops."""
+"""Provides AbstractTrainer/Evaluator base classes, defining train/eval APIs."""

 import abc
-from typing import Dict, Optional, Text
+
+from typing import Dict, Optional, Union
+
+import numpy as np
 import tensorflow as tf


+Output = Dict[str, Union[tf.Tensor, float, np.number, np.ndarray, 'Output']]  # pytype: disable=not-supported-yet
+
+
 class AbstractTrainer(tf.Module, metaclass=abc.ABCMeta):
-  """An abstract class defining the APIs required for training."""
+  """An abstract class defining the API required for training."""

  @abc.abstractmethod
-  def train(self,
-            num_steps: Optional[tf.Tensor]) -> Optional[Dict[Text, tf.Tensor]]:
-    """Implements model training with multiple steps.
-
-    In training, it is common to break the total training steps into several
-    training loops, so users can do checkpointing, write summaries and run some
-    python callbacks. This is necessary for getting good performance in TPU
-    training, as the overhead for launching a multi worker tf.function may be
-    large in Eager mode. It is usually encouraged to create a host training loop
-    (e.g. using a `tf.range` wrapping `strategy.run` inside a
-    `tf.function`) in the TPU case. For the cases that don't require host
-    training loop to achieve peak performance, users can just implement a simple
-    python loop to drive each step.
+  def train(self, num_steps: tf.Tensor) -> Optional[Output]:
+    """Implements `num_steps` steps of training.
+
+    This method will be called by the `Controller` to perform the "inner loop" of
+    training. This inner loop amortizes the cost of bookkeeping associated with
+    checkpointing, evaluation, and writing summaries. Additionally, the inner
+    loop can be implemented (if desired) using TensorFlow's looping constructs
+    (e.g. a `for` loop over a `tf.range` inside a `tf.function`), which can be
+    necessary for getting optimal performance when running on TPU. For cases
+    that don't require peak performance, a simple Python loop can be used
+    instead for simplicity.

    Args:
-      num_steps: A guideline for how many training steps to run. Note that it is
-        up to the model what constitutes a "step" (this may involve more than
-        one update to model parameters, e.g. if training a GAN).
+      num_steps: The number of training steps to run. Note that it is up to the
+        model what constitutes a "step", which may involve more than one update
+        to model parameters (e.g., if training a GAN).

    Returns:
-      The function may return a dictionary of `Tensors` or numpy arrays, which
-      will be written to logs and as TensorBoard summaries. It can also be a
-      nested dictionary, yielding a hierarchy of summary directories.
+      Either `None`, or a dictionary mapping names to `Tensor`s or NumPy values.
+      If a dictionary is returned, it will be written to logs and as TensorBoard
+      summaries. The dictionary may also be nested, which will generate a
+      hierarchy of summary directories.
    """
    pass


 class AbstractEvaluator(tf.Module, metaclass=abc.ABCMeta):
-  """An abstract class defining the APIs required for evaluation."""
+  """An abstract class defining the API required for evaluation."""

  @abc.abstractmethod
-  def evaluate(
-      self, num_steps: Optional[tf.Tensor]) -> Optional[Dict[Text, tf.Tensor]]:
-    """Implements model evaluation.
+  def evaluate(self, num_steps: tf.Tensor) -> Optional[Output]:
+    """Implements `num_steps` steps of evaluation.
+
+    This method will be called by the `Controller` to perform an evaluation. The
+    `num_steps` parameter specifies the number of steps of evaluation to run,
+    which is specified by the user when calling one of the `Controller`'s
+    evaluation methods. A special sentinel value of `-1` is reserved to indicate
+    evaluation should run until the underlying data source is exhausted.

    Args:
-      num_steps: A guideline for how many evaluation steps to run. Note that it
-        is up to the model what constitutes a "step". Generally, it may be
-        desirable to support both a limited number of eval steps and iterating
-        over a full dataset (however many steps are required) when `num_steps`
-        is `None`.
+      num_steps: The number of evaluation steps to run. Note that it is up to
+        the model what constitutes a "step". Evaluations may also want to
+        support "complete" evaluations when `num_steps == -1`, running until a
+        given data source is exhausted.

    Returns:
-      The function may return a dictionary of `Tensors` or numpy arrays, which
-      will be written to logs and as TensorBoard summaries. It can also be a
-      nested dictionary, yielding a hierarchy of summary directories.
+      Either `None`, or a dictionary mapping names to `Tensor`s or NumPy values.
+      If a dictionary is returned, it will be written to logs and as TensorBoard
+      summaries. The dictionary may also be nested, which will generate a
+      hierarchy of summary directories.
    """
    pass
--- a/orbit/standard_runner.py
+++ b/orbit/standard_runner.py
@@ -12,11 +12,30 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""AbstractTrainer/Evaluator implementations for standard settings."""
+"""AbstractTrainer/Evaluator subclasses with added functionality.
+
+The classes in this module provide some additional structure to the bare
+`AbstractTrainer`/`AbstractEvaluator` APIs.
+
+Both `StandardTrainer` and `StandardEvaluator` split the train/eval loops into
+"begin", "step", and "end" methods, and provide an implementation of the loop
+itself that makes calls to the relevant step method.
+
+`StandardTrainer` supports running the loop using the TF while loop construct
+for added performance (particularly on TPUs). It additionally provides some
+functionality to make writing summaries from inside a model more performant when
+running on TPUs.
+
+These classes are intended to work well in common settings, however there may
+be use cases these classes don't support (for instance, `StandardEvaluator` in
+particular doesn't support running full evaluations over multiple different eval
+datasets). Users are encouraged to simply fall back to custom `AbstractTrainer`
+and `AbstractEvaluator` subclasses in these cases.
+"""

 import abc

-from typing import Any, Dict, Optional, Text
+from typing import Any, Optional

 import dataclasses

@@ -65,14 +84,26 @@ def _create_train_loop_fn(train_step_fn, options: StandardTrainerOptions):


 class StandardTrainer(runner.AbstractTrainer, metaclass=abc.ABCMeta):
-  """Implements the standard functionality of AbstractTrainer APIs."""
+  """Implements standard functionality on top of the AbstractTrainer API.
+
+  This class structures the training "inner loop" roughly as follows:
+
+      train_loop_begin()
+      for _ in range(num_steps):
+        train_step(train_iterator)
+      return train_loop_end()
+
+  Calls to `train_loop_begin` and `train_loop_end` are always done in eager
+  mode, while the loop/`train_step` may be implemented using `tf.while` and/or
+  `tf.function`, as determined by the `options` passed to `__init__`.
+  """

  def __init__(self, train_dataset, options: StandardTrainerOptions = None):
-    """Construct a `StandardTrainer` object.
+    """Initializes the `StandardTrainer` instance.

    Args:
-      train_dataset: A tf.nest-compatible structure of tf.data.Dataset or
-        DistributedDataset.
+      train_dataset: A `tf.nest`-compatible structure of `tf.data.Dataset` or
+        `DistributedDataset`.
      options: An `orbit.StandardTrainerOptions` instance.
    """
    options = options or StandardTrainerOptions()
@@ -88,11 +119,16 @@ class StandardTrainer(runner.AbstractTrainer, metaclass=abc.ABCMeta):
    self._train_iter = None
    self._train_loop_fn = None

-  def train(
-      self,
-      num_steps: Optional[tf.Tensor],
-  ) -> Optional[Dict[Text, tf.Tensor]]:
-    """See base class."""
+  def train(self, num_steps: tf.Tensor) -> Optional[runner.Output]:
+    """Implements `num_steps` steps of training.
+
+    Args:
+      num_steps: The number of training steps to run. This corresponds directly
+        to the number of calls made to `train_step`.
+
+    Returns:
+      The output of `train_loop_end`.
+    """
    self.train_loop_begin()

    if self._train_loop_fn is None:
@@ -108,9 +144,10 @@ class StandardTrainer(runner.AbstractTrainer, metaclass=abc.ABCMeta):
  def train_loop_begin(self):
    """Called once at the beginning of the training loop.

-    This method is called before dataset iterators creation.
-    This is a good place to reset metrics that accumulate values over multiple
-    steps of training.
+    This method is always called in eager mode, and is a good place to reset
+    metrics that accumulate values over multiple steps of training.
+
+    Note that this method is called before dataset iterator creation.
    """
    pass

@@ -118,28 +155,30 @@ class StandardTrainer(runner.AbstractTrainer, metaclass=abc.ABCMeta):
  def train_step(self, iterator):
    """Implements one step of training.

-    What a "step" consists of is up to the implementer. If using distribution
-    strategies, the call to this method should take place in the "cross-replica
+    What a "step" consists of is up to the implementer. When using distribution
+    strategies, the call to this method takes place in the "cross-replica
    context" for generality, to allow e.g. multiple iterator dequeues and calls
    to `strategy.run`.

    Note that if `use_tf_function=True`, all the code inside `train_step` should
-    be tf.function compatible, as they will be traced with tf.function. This
-    means you cannot put arbitrary python code in this function. If users have
-    any numpy operations, they should be put in `train_loop_begin` or
-    `train_loop_end` functions.
+    be compatible with `tf.function` tracing (and in particular, any state
+    modifications involving `self` should be avoided). In some cases, non-
+    `tf.function` compatible code can be moved to `train_loop_begin` or
+    `train_loop_end`, which always execute eagerly.

    Args:
-      iterator: A tf.nest-compatible structure of tf.data Iterator or
-        DistributedIterator.
+      iterator: A `tf.nest`-compatible structure of `tf.data.Iterator` or
+        `DistributedIterator`. The structure of this input matches the structure
+        of `train_dataset` as passed to `__init__`.
    """
    pass

-  def train_loop_end(self) -> Optional[Dict[Text, tf.Tensor]]:
-    """Called at the end of the training loop.
+  def train_loop_end(self) -> Optional[runner.Output]:
+    """Called once at the end of the training loop.

-    This is a good place to get metric results. The value returned from this
-    function will be returned as-is from the train() method.
+    This method is always called in eager mode, and is a good place to get
+    metric results. The value returned from this function will be returned as-is
+    from the `train` method implementation provided by `StandardTrainer`.

    Returns:
      The function may return a dictionary of `Tensors`, which will be
@@ -150,18 +189,18 @@ class StandardTrainer(runner.AbstractTrainer, metaclass=abc.ABCMeta):

  @property
  def train_dataset(self):
-    """Returns the train_dataset instance."""
+    """The current training dataset."""
    return self._train_dataset

  @train_dataset.setter
  def train_dataset(self, train_dataset):
-    """Set a new train dataset and replace with the existing one.
+    """Sets a new training dataset, replacing the current one.

-    Any unfinished work in the previous dataset will be discarded.
+    Any unprocessed examples in the current dataset are discarded.

    Args:
-      train_dataset: A tf.nest-compatible structure of tf.data.Dataset or
-        DistributedDataset.
+      train_dataset: A `tf.nest`-compatible structure of `tf.data.Dataset` or
+        `DistributedDataset`.
    """
    self._train_dataset = train_dataset
    self._train_iter = None
@@ -187,25 +226,49 @@ def _create_eval_loop_fn(eval_step_fn, options: StandardEvaluatorOptions):


 class StandardEvaluator(runner.AbstractEvaluator, metaclass=abc.ABCMeta):
-  """Implements the standard functionality of AbstractEvaluator APIs."""
+  """Implements the standard functionality of AbstractEvaluator APIs.
+
+  This class structures evaluation roughly as follows:
+
+      state = eval_begin()
+      for _ in range(num_steps):
+        step_outputs = eval_step(eval_iterator)
+        state = eval_reduce(state, step_outputs)
+      return eval_end(state)
+
+  Calls to `eval_begin`, `eval_reduce`, and `eval_end` are always done in eager
+  mode, while `eval_step` may be compiled with `tf.function` as determined by
+  the `options` passed to `__init__`.
+
+  This class does not support completely evaluating multiple different datasets
+  (i.e., where every example of each dataset should be processed, as opposed to
+  running for a fixed number of evaluation steps). A custom `AbstractEvaluator`
+  is recommended in this case.
+  """

  def __init__(self, eval_dataset, options: StandardEvaluatorOptions = None):
-    """Construct a `StandardEvaluator` object.
+    """Initializes the `StandardEvaluator` instance.

    Args:
-      eval_dataset: A tf.nest-compatible structure of tf.data.Dataset or
-        DistributedDataset.
+      eval_dataset: A `tf.nest`-compatible structure of `tf.data.Dataset` or
+        `DistributedDataset`.
      options: An `orbit.StandardEvaluatorOptions` instance.
    """
    self._eval_options = options or StandardEvaluatorOptions()
    self._eval_dataset = eval_dataset
    self._eval_loop_fn = None

-  def evaluate(
-      self,
-      num_steps: Optional[tf.Tensor],
-  ) -> Optional[Dict[Text, tf.Tensor]]:
-    """See base class."""
+  def evaluate(self, num_steps: tf.Tensor) -> Optional[runner.Output]:
+    """Implements `num_steps` steps of evaluation.
+
+    Args:
+      num_steps: The number of evaluation steps to run. When this is -1,
+        evaluation proceeds until a call to `eval_step` raises a `StopIteration`
+        or `tf.errors.OutOfRangeError`.
+
+    Returns:
+      The output of `self.eval_end()`.
+    """
    outputs = self.eval_begin()  # pylint: disable=assignment-from-no-return

    if self._eval_loop_fn is None:
@@ -224,12 +287,13 @@ class StandardEvaluator(runner.AbstractEvaluator, metaclass=abc.ABCMeta):
  def eval_begin(self) -> Any:
    """Called once at the beginning of the evaluation.

-    This method is called before dataset iterators creation.
-    This is a good place to reset metrics that accumulate values over the entire
-    evaluation.
+    This method is always called in eager mode, and is a good place to reset
+    metrics that accumulate values over the course of evaluation.
+
+    Note that this method is called before dataset iterator creation.

    Returns:
-      An output which is passed as `state` argument into `eval_reduce` function.
+      A value to pass as the `state` argument to `eval_reduce`.
    """
    pass

@@ -237,20 +301,20 @@ class StandardEvaluator(runner.AbstractEvaluator, metaclass=abc.ABCMeta):
  def eval_step(self, iterator) -> Any:
    """Implements one step of evaluation.

-    What a "step" consists of is up to the implementer. If using distribution
-    strategies, the call to this method should take place in the "cross-replica
+    What a "step" consists of is up to the implementer. When using distribution
+    strategies, the call to this method takes place in the "cross-replica
    context" for generality, to allow e.g. multiple iterator dequeues and calls
    to `strategy.run`.

    Note that if `use_tf_function=True`, all the code inside `eval_step` should
-    be tf.function compatible, as they will be traced with tf.function. This
-    means you cannot put arbitrary python code in this function. If users have
-    any numpy operations, they should be put in `eval_begin`, `eval_end` or
-    `eval_reduce` functions.
+    be compatible with `tf.function` tracing (and in particular, any state
+    modifications involving `self` should be avoided). In some cases, non-
+    `tf.function` compatible code can be moved to `eval_begin`,
+    `eval_reduce`, or `eval_end`, which always execute eagerly.

    Args:
-      iterator: A tf.nest-compatible structure of tf.data Iterator or
-        DistributedIterator.
+      iterator: A `tf.nest`-compatible structure of `tf.data.Iterator` or
+        `DistributedIterator`.

    Returns:
      An output which is passed as `step_outputs` argument into `eval_reduce`
@@ -258,14 +322,18 @@ class StandardEvaluator(runner.AbstractEvaluator, metaclass=abc.ABCMeta):
    """
    pass

-  def eval_end(self, *args) -> Optional[Dict[Text, tf.Tensor]]:
+  def eval_end(self, *args) -> Optional[runner.Output]:
    """Called at the end of the evaluation.

-    This is a good place to get metric results. The value returned from this
-    function will be returned as-is from the evaluate() method.
+    Called once at the end of evaluation.
+
+    This method is always called in eager mode, and is a good place to get
+    metric results. The value returned from this function will be returned as-is
+    from the `evaluate` method implementation provided by `StandardEvaluator`.

    Args:
-      *args: the outputs from `eval_reduce` for the last eval step.
+      *args: The outputs from `eval_reduce` for the last eval step, if they are
+        non-`None` (if they are `None`, nothing is passed).

    Returns:
      The function may return a dictionary of `Tensors`, which will be
@@ -274,35 +342,41 @@ class StandardEvaluator(runner.AbstractEvaluator, metaclass=abc.ABCMeta):
    """
    pass

-  def eval_reduce(self, state=None, step_outputs=None) -> Any:
-    """A function to do the reduction on the evaluation outputs per step.
+  def eval_reduce(self,
+                  state: Any = None,
+                  step_outputs: Optional[runner.Output] = None) -> Any:
+    """A function to perform per-step reduction on the evaluation outputs.

-    This is useful for passing states throughout evaluation. E.g. it can be used
-    to maintain the output losses from all the evaluation steps, and compute the
-    mean loss in `eval_end` function.
+    This is useful for passing state throughout evaluation, especially in cases
+    where maintaining or accumulating state is hard to accomplish using
+    `tf.metrics.Metric` or other `tf.Variable`-based approaches. For instance,
+    it can be used to easily accumulate all per-example losses from the full
+    evaluation for subsequent processing in `eval_end()`.

    Args:
-      state: A maintained state throughout the evaluation.
+      state: A state being maintained throughout the evaluation.
      step_outputs: Outputs from the current evaluation step.

    Returns:
-      An output which is passed as `state` argument into `eval_reduce` function
-      for the next step. After evaluation is finished, the output from last step
-      will be passed into `eval_end` function.
+      An output which is passed as the `state` argument to this function for the
+      next step. After evaluation is finished, the output from last step will be
+      passed to `eval_end`.
    """
    pass

  @property
  def eval_dataset(self):
-    """Returns the train_datase instance."""
+    """The current evaluation dataset."""
    return self._eval_dataset

  @eval_dataset.setter
  def eval_dataset(self, eval_dataset):
-    """Set a new eval dataset and replace with the existing one.
+    """Sets a new eval dataset, replacing the current one.
+
+    Any unprocessed examples in the current dataset are discarded.

    Args:
-      eval_dataset: A tf.nest-compatible structure of tf.data.Dataset or
-        DistributedDataset.
+      eval_dataset: A `tf.nest`-compatible structure of `tf.data.Dataset` or
+        `DistributedDataset`.
    """
    self._eval_dataset = eval_dataset
--- a/orbit/standard_runner_test.py
+++ b/orbit/standard_runner_test.py
@@ -39,8 +39,7 @@ class TestTrainer(standard_runner.StandardTrainer):
  def __init__(self, options=None):
    self.strategy = tf.distribute.get_strategy()
    self.global_step = utils.create_global_step()
-    distribute = self.strategy.experimental_distribute_datasets_from_function
-    dataset = distribute(dataset_fn)
+    dataset = self.strategy.distribute_datasets_from_function(dataset_fn)
    super().__init__(train_dataset=dataset, options=options)

  def train_loop_begin(self):
@@ -63,8 +62,7 @@ class TestEvaluator(standard_runner.StandardEvaluator):
  def __init__(self, options=None):
    self.strategy = tf.distribute.get_strategy()
    self.global_step = utils.create_global_step()
-    distribute = self.strategy.experimental_distribute_datasets_from_function
-    dataset = distribute(dataset_fn)
+    dataset = self.strategy.distribute_datasets_from_function(dataset_fn)
    super().__init__(eval_dataset=dataset, options=options)

  def eval_begin(self):

--- a/orbit/utils/__init__.py
+++ b/orbit/utils/__init__.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Defines exported symbols for `orbit.utils` package."""
+"""Defines exported symbols for the `orbit.utils` package."""

 from orbit.utils.common import create_global_step
 from orbit.utils.common import get_value

--- a/orbit/utils/common.py
+++ b/orbit/utils/common.py
@@ -16,7 +16,6 @@

 import inspect

-import numpy as np
 import tensorflow as tf


@@ -46,16 +45,16 @@ def create_global_step() -> tf.Variable:


 def make_distributed_dataset(strategy, dataset_or_fn, *args, **kwargs):
-  """A helper function to create distributed dataset.
+  """A utility function to help create a `tf.distribute.DistributedDataset`.

  Args:
    strategy: An instance of `tf.distribute.Strategy`.
-    dataset_or_fn: A instance of `tf.data.Dataset` or a function which takes an
-      `tf.distribute.InputContext` as input and returns a `tf.data.Dataset`. If
-      it is a function, it could optionally have an argument named
-      `input_context` which is `tf.distribute.InputContext` argument type.
-    *args: The list of arguments to be passed to dataset_or_fn.
-    **kwargs: Any keyword arguments to be passed.
+    dataset_or_fn: An instance of `tf.data.Dataset`, or a "dataset function"
+      returning a `tf.data.Dataset`. If it is a function, it may optionally have
+      an argument named `input_context` which will be passed a
+      `tf.distribute.InputContext` instance.
+    *args: Any positional arguments to pass through to `dataset_or_fn`.
+    **kwargs: Any keyword arguments to pass through to `dataset_or_fn`.

  Returns:
    A distributed Dataset.
@@ -64,38 +63,37 @@ def make_distributed_dataset(strategy, dataset_or_fn, *args, **kwargs):
    strategy = tf.distribute.get_strategy()

  if isinstance(dataset_or_fn, tf.data.Dataset):
-    return strategy.experimental_distribute_dataset(dataset_or_fn)
+    return strategy.distribute_dataset(dataset_or_fn)

  if not callable(dataset_or_fn):
    raise ValueError("`dataset_or_fn` should be either callable or an instance "
-                     "of `tf.data.Dataset`")
+                     "of `tf.data.Dataset`.")

-  def dataset_fn(ctx):
-    """Wrapped dataset function for creating distributed dataset.."""
+  def dataset_fn(input_context):
+    """Wraps `dataset_or_fn` for strategy.distribute_datasets_from_function."""

-    # If `dataset_or_fn` is a function and has `input_context` as argument
-    # names, pass `ctx` as the value of `input_context` when calling
-    # `dataset_or_fn`. Otherwise `ctx` will not be used when calling
-    # `dataset_or_fn`.
+    # If `dataset_or_fn` is a function and has an argument named
+    # `input_context`, pass through the given `input_context`. Otherwise
+    # `input_context` will be ignored.
    argspec = inspect.getfullargspec(dataset_or_fn)
-    args_names = argspec.args
+    arg_names = argspec.args

-    if "input_context" in args_names:
-      kwargs["input_context"] = ctx
-    ds = dataset_or_fn(*args, **kwargs)
-    return ds
+    if "input_context" in arg_names:
+      kwargs["input_context"] = input_context
+    return dataset_or_fn(*args, **kwargs)

-  return strategy.experimental_distribute_datasets_from_function(dataset_fn)
+  return strategy.distribute_datasets_from_function(dataset_fn)


-def get_value(x) -> np.number:
-  """Returns the value of a variable/tensor.
+def get_value(x):
+  """Returns input values, converting any TensorFlow values to NumPy values.

  Args:
-      x: input variable.
+    x: The input. May be a `tf.Tensor` or `tf.Variable`.

  Returns:
-      A Numpy array or number.
+    If the input is a TensorFlow `Tensor`, returns the `Tensor`'s equivalent
+    NumPy value. Otherwise, just returns the input.
  """
  if not tf.is_tensor(x):
    return x

--- a/orbit/utils/epoch_helper.py
+++ b/orbit/utils/epoch_helper.py
@@ -18,14 +18,14 @@ import tensorflow as tf


 class EpochHelper:
-  """A Helper class to handle epochs in Customized Training Loop."""
+  """A helper class to handle epoch bookkeeping in custom training loops."""

  def __init__(self, epoch_steps: int, global_step: tf.Variable):
-    """Constructs the EpochHelper.
+    """Initializes the `EpochHelper` instance.

    Args:
-      epoch_steps: An integer indicates how many steps in an epoch.
-      global_step: A `tf.Variable` instance indicates the current global step.
+      epoch_steps: An integer indicating how many steps are in an epoch.
+      global_step: A `tf.Variable` providing the current global step.
    """
    self._epoch_steps = epoch_steps
    self._global_step = global_step
@@ -46,7 +46,7 @@ class EpochHelper:
  def epoch_end(self):
    """Returns whether the current epoch should end."""
    if not self._in_epoch:
-      raise ValueError("`epoch_end` can only be called inside an epoch")
+      raise ValueError("`epoch_end` can only be called inside an epoch.")
    current_step = self._global_step.numpy()
    epoch = current_step // self._epoch_steps


--- a/orbit/utils/loop_fns.py
+++ b/orbit/utils/loop_fns.py
@@ -20,36 +20,57 @@ import tensorflow as tf


 def create_loop_fn(step_fn):
-  """Creates a multiple steps function driven by the python while loop.
+  """Creates a loop function driven by a Python `while` loop.

  Args:
-    step_fn: A function which takes `iterator` as input.
+    step_fn: A function taking a nested structure of `tf.data.Iterator` or
+      `DistributedIterator`. There are no constraints on the return value of the
+      function (except that it must be compatible with any `reduce_fn` provided
+      to the returned `loop_fn`).

  Returns:
-    A callable defined as the `loop_fn` defination below.
+    A loop function taking required `iterator` and `num_steps` parameters, as
+    well as optional `state` and `reduce_fn` parameters for accumulating state
+    over multiple iterations of the loop. See the `loop_fn` definition below for
+    additional details.
  """

  def loop_fn(iterator, num_steps, state=None, reduce_fn=None):
-    """A loop function with multiple steps.
+    """Makes `num_steps` calls to `step_fn(iterator)`.
+
+    Additionally, state may be accumulated across iterations of the loop.
+    Conceptually, state accumulation is handled roughly as follows:
+
+        for _ in range(num_steps):
+          step_outputs = step_fn(iterator)
+          state = reduce_fn(state, step_outputs)
+        return state
+
+    However, the implementation is slightly more complicated in order to support
+    looping until the iterator is exhausted (when `num_steps == -1`) and to
+    properly catch exceptions when running under async remote eager (as is the
+    case in TPU training setups involving separate coordinator/worker machines).

    Args:
-      iterator: A nested structure of tf.data `Iterator` or
+      iterator: A nested structure of `tf.data.Iterator` or
        `DistributedIterator`.
-      num_steps: The number of steps in the loop. If `num_steps==-1`, will
+      num_steps: The number of steps in the loop. If `num_steps == -1`, will
        iterate until exhausting the iterator.
      state: An optional initial state before running the loop.
-      reduce_fn: a callable defined as `def reduce_fn(state, value)`, where
-        `value` is the outputs from `step_fn`.
+      reduce_fn: A callable taking two inputs, `state` and `value`, where
+        `state` is the previous output from `reduce_fn`, and `value` is the
+        output from `step_fn`.

    Returns:
-      The updated state.
+      The final state returned by `reduce_fn`, or `None` if `state` and
+      `reduce_fn` are not provided.
    """
    try:
      step = 0
-      # To make sure the OutOfRangeError exception can be handled well with
-      # async remote eager, we need to wrap the loop body in a `async_scope`.
+      # To make sure the OutOfRangeError exception can be handled well under
+      # async remote eager, we need to wrap the loop body in `async_scope`.
      with tf.experimental.async_scope():
-        while (num_steps == -1 or step < num_steps):
+        while num_steps == -1 or step < num_steps:
          outputs = step_fn(iterator)
          if reduce_fn is not None:
            state = reduce_fn(state, outputs)
@@ -63,26 +84,32 @@ def create_loop_fn(step_fn):


 def create_tf_while_loop_fn(step_fn):
-  """Create a multiple steps function driven by tf.while_loop on the host.
+  """Creates a loop function compatible with TF's AutoGraph loop conversion.

  Args:
-    step_fn: A function which takes `iterator` as input.
+    step_fn: A function taking a nested structure of `tf.data.Iterator` or
+      `DistributedIterator`. Currently, any return values are ignored.

  Returns:
-    A callable defined as the `loop_fn` defination below.
+    A loop function taking required `iterator` and `num_steps` parameters. If
+    called inside a `tf.function`, the loop will be converted by AutoGraph into
+    a `tf.while_loop` construct. See the `loop_fn` definition below for
+    additional details.
  """

  def loop_fn(iterator, num_steps):
-    """A loop function with multiple steps.
+    """Makes `num_steps` calls to `step_fn(iterator)`.

    Args:
-      iterator: A nested structure of tf.data `Iterator` or
+      iterator: A nested structure of `tf.data.Iterator` or
        `DistributedIterator`.
-      num_steps: The number of steps in the loop. Must be a tf.Tensor.
+      num_steps: The number of steps in the loop. Should be passed as a
+        `tf.Tensor`. Iterating until iterator exhaustion is not supported.
    """
    if not isinstance(num_steps, tf.Tensor):
-      raise ValueError("`num_steps` should be an `tf.Tensor`. Python object "
-                       "may cause retracing.")
+      raise ValueError(
+          "`num_steps` should be a `tf.Tensor`. Passing a Python value can "
+          "cause unnecessary retracing when wrapped by `tf.function`.")

    for _ in tf.range(num_steps):
      step_fn(iterator)

--- a/orbit/utils/summary_manager.py
+++ b/orbit/utils/summary_manager.py
@@ -20,18 +20,19 @@ import tensorflow as tf


 class SummaryManager:
-  """A class manages writing summaries."""
+  """A utility class for managing summary writing."""

  def __init__(self, summary_dir, summary_fn, global_step=None):
-    """Construct a summary manager object.
+    """Initializes the `SummaryManager` instance.

    Args:
-      summary_dir: the directory to write summaries.
-      summary_fn: A callable defined as `def summary_fn(name, tensor,
-        step=None)`, which describes the summary operation.
-      global_step: A `tf.Variable` instance for the global step.
+      summary_dir: The directory in which to write summaries. If `None`, all
+        summary writing operations provided by this class are no-ops.
+      summary_fn: A callable accepting `name`, `value`, and `step`
+        parameters, making calls to `tf.summary` functions to write summaries.
+      global_step: A `tf.Variable` containing the global step value.
    """
-    self._enabled = (summary_dir is not None)
+    self._enabled = summary_dir is not None
    self._summary_dir = summary_dir
    self._summary_fn = summary_fn
    self._summary_writers = {}
@@ -42,12 +43,12 @@ class SummaryManager:
      self._global_step = global_step

  def summary_writer(self, relative_path=""):
-    """Returns the underlying summary writer.
+    """Returns the underlying summary writer for a specific subdirectory.

    Args:
      relative_path: The current path in which to write summaries, relative to
-        the summary directory. By default it is empty, which specifies the root
-        directory.
+        the summary directory. By default it is empty, which corresponds to the
+        root directory.
    """
    if self._summary_writers and relative_path in self._summary_writers:
      return self._summary_writers[relative_path]
@@ -59,43 +60,41 @@ class SummaryManager:
    return self._summary_writers[relative_path]

  def flush(self):
-    """Flush the underlying summary writers."""
+    """Flushes the underlying summary writers."""
    if self._enabled:
      tf.nest.map_structure(tf.summary.flush, self._summary_writers)

  def write_summaries(self, summary_dict):
-    """Write summaries for the given values.
+    """Writes summaries for the given dictionary of values.

    This recursively creates subdirectories for any nested dictionaries
    provided in `summary_dict`, yielding a hierarchy of directories which will
    then be reflected in the TensorBoard UI as different colored curves.

-    E.g. users may evaluate on muliple datasets and return `summary_dict` as a
-    nested dictionary.
-
-    ```
-    {
-        "dataset": {
-            "loss": loss,
-            "accuracy": accuracy
-        },
-        "dataset2": {
-            "loss": loss2,
-            "accuracy": accuracy2
-        },
-    }
-    ```
-
-    This will create two subdirectories "dataset" and "dataset2" inside the
+    For example, users may evaluate on multiple datasets and return
+    `summary_dict` as a nested dictionary:
+
+        {
+            "dataset1": {
+                "loss": loss1,
+                "accuracy": accuracy1
+            },
+            "dataset2": {
+                "loss": loss2,
+                "accuracy": accuracy2
+            },
+        }
+
+    This will create two subdirectories, "dataset1" and "dataset2", inside the
    summary root directory. Each directory will contain event files including
    both "loss" and "accuracy" summaries.

    Args:
      summary_dict: A dictionary of values. If any value in `summary_dict` is
-        itself a dictionary, then the function will recursively create
-        subdirectories with names given by the keys in the dictionary. The
-        Tensor values are summarized using the summary writer instance specific
-        to the parent relative path.
+        itself a dictionary, then the function will create a subdirectory with
+        name given by the corresponding key. This is performed recursively. Leaf
+        values are then summarized using the summary writer instance specific to
+        the parent relative path.
    """
    if not self._enabled:
      return