Internal change

PiperOrigin-RevId: 335446217

Internal change
PiperOrigin-RevId: 335446217
2d342592 · Dan Holtmann-Rice · A. Unique TensorFlower · 3a9ed6bd · 2d342592 · 2d342592
Commit 2d342592 authored Oct 05, 2020 by Dan Holtmann-Rice Committed by A. Unique TensorFlower Oct 05, 2020
12 changed files
--- a/orbit/README.md
+++ b/orbit/README.md
-![TensorFlow Requirement: 2.x](https://img.shields.io/badge/TensorFlow%20Requirement-2.x-brightgreen)
-
 # Orbit

-Orbit is a customized training loop library built on top of Tensorflow 2. It
-provides a flexible lightweight library that users can easily use or fork when
-writing [customized training loop code](https://www.tensorflow.org/tutorials/distribute/custom_training)
-in TF2. It intergates with `tf.distribute` seamlessly and supports running on
-different device types (CPU, GPU, and TPU).
+Orbit is a flexible, lightweight library designed to make it easy to write
+[custom training loops][custom_training] in TensorFlow 2. Orbit handles common
+model training tasks such as saving checkpoints, running model evaluations, and
+setting up summary writing, while giving users full control over implementing
+the inner training loop. It integrates with `tf.distribute` seamlessly and
+supports running on different device types (CPU, GPU, and TPU). The core code is
+intended to be easy to read and fork.
+
+See our [g3doc](g3doc) at go/orbit-trainer for additional documentation.
+
+[custom_training]: https://www.tensorflow.org/tutorials/distribute/custom_training
--- a/orbit/__init__.py
+++ b/orbit/__init__.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Defines exported symbols for `orbit` package."""
+"""Defines exported symbols for the `orbit` package."""

 from orbit import utils


--- a/orbit/controller.py
+++ b/orbit/controller.py
--- a/orbit/controller_test.py
+++ b/orbit/controller_test.py
@@ -15,10 +15,14 @@
 """Tests for orbit.controller."""

 import os
+
 from absl import logging
 from absl.testing import parameterized
+
 import numpy as np
+
 from orbit import controller
+from orbit import runner
 from orbit import standard_runner

 import tensorflow as tf
@@ -65,12 +69,8 @@ class TestRunner(standard_runner.StandardTrainer,
    self.train_loss = tf.keras.metrics.Mean("train_loss", dtype=tf.float32)
    self.eval_loss = tf.keras.metrics.Mean("eval_loss", dtype=tf.float32)
    self.return_numpy = return_numpy
-    train_dataset = (
-        self.strategy.experimental_distribute_datasets_from_function(dataset_fn)
-    )
-    eval_dataset = (
-        self.strategy.experimental_distribute_datasets_from_function(dataset_fn)
-    )
+    train_dataset = self.strategy.distribute_datasets_from_function(dataset_fn)
+    eval_dataset = self.strategy.distribute_datasets_from_function(dataset_fn)
    standard_runner.StandardTrainer.__init__(self, train_dataset)
    standard_runner.StandardEvaluator.__init__(self, eval_dataset)

@@ -95,8 +95,7 @@ class TestRunner(standard_runner.StandardTrainer,
    }

  def build_eval_dataset(self):
-    return self.strategy.experimental_distribute_datasets_from_function(
-        dataset_fn)
+    return self.strategy.distribute_datasets_from_function(dataset_fn)

  def eval_begin(self):
    self.eval_loss.reset_states()
@@ -125,8 +124,7 @@ class TestEvaluator(standard_runner.StandardEvaluator):
  def __init__(self):
    self.strategy = tf.distribute.get_strategy()
    self.model = create_model()
-    eval_dataset = self.strategy.experimental_distribute_datasets_from_function(
-        dataset_fn)
+    eval_dataset = self.strategy.distribute_datasets_from_function(dataset_fn)
    standard_runner.StandardEvaluator.__init__(self, eval_dataset)

  def eval_reduce(self, state, output):
@@ -157,16 +155,20 @@ class TestEvaluator(standard_runner.StandardEvaluator):
    }


+class TestEvaluatorNoOutput(runner.AbstractEvaluator):
+
+  def evaluate(self, num_steps):
+    pass
+
+
 class TestEvaluatorWithNestedSummary(standard_runner.StandardEvaluator):
  """Implements the training and evaluation APIs for the test model."""

  def __init__(self):
    self.strategy = tf.distribute.get_strategy()
    self.model = create_model()
-    dataset = self.strategy.experimental_distribute_datasets_from_function(
-        dataset_fn)
-    dataset2 = self.strategy.experimental_distribute_datasets_from_function(
-        dataset_fn)
+    dataset = self.strategy.distribute_datasets_from_function(dataset_fn)
+    dataset2 = self.strategy.distribute_datasets_from_function(dataset_fn)
    self.loss = tf.keras.metrics.Mean("loss", dtype=tf.float32)
    self.accuracy = tf.keras.metrics.CategoricalAccuracy(
        "accuracy", dtype=tf.float32)
@@ -217,9 +219,7 @@ class TestTrainerWithSummaries(standard_runner.StandardTrainer):
    self.optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.1)
    self.global_step = self.optimizer.iterations
    self.train_loss = tf.keras.metrics.Mean("train_loss", dtype=tf.float32)
-    train_dataset = (
-        self.strategy.experimental_distribute_datasets_from_function(dataset_fn)
-    )
+    train_dataset = self.strategy.distribute_datasets_from_function(dataset_fn)
    standard_runner.StandardTrainer.__init__(
        self,
        train_dataset,
@@ -227,8 +227,7 @@ class TestTrainerWithSummaries(standard_runner.StandardTrainer):
            use_tpu_summary_optimization=True))

  def build_train_dataset(self):
-    return self.strategy.experimental_distribute_datasets_from_function(
-        dataset_fn)
+    return self.strategy.distribute_datasets_from_function(dataset_fn)

  def train_step(self, iterator):

@@ -344,6 +343,26 @@ class ControllerTest(tf.test.TestCase, parameterized.TestCase):
    self.assertNotEmpty(tf.io.gfile.glob(
        os.path.join(self.model_dir, "summaries/eval/events.*")))

+  def test_restore_from_most_recent_checkpoint(self):
+    test_runner = TestRunner()
+    checkpoint = tf.train.Checkpoint(model=test_runner.model)
+    checkpoint_manager = tf.train.CheckpointManager(
+        checkpoint,
+        self.model_dir,
+        max_to_keep=None,
+        step_counter=test_runner.global_step,
+        checkpoint_interval=5)
+    test_controller = controller.Controller(
+        trainer=test_runner,
+        global_step=test_runner.global_step,
+        checkpoint_manager=checkpoint_manager,
+        eval_summary_dir=os.path.join(self.model_dir, "summaries/eval"),
+        steps_per_loop=5)
+    test_controller.train(20)
+    self.assertLen(checkpoint_manager.checkpoints, 4)
+    restored_path = test_controller.restore_checkpoint()
+    self.assertEqual(restored_path, checkpoint_manager.checkpoints[-1])
+
  @parameterized.named_parameters(("return_numpy", True),
                                  ("return_tensor", False))
  def test_train_and_evaluate(self, return_numpy):
@@ -601,7 +620,7 @@ class ControllerTest(tf.test.TestCase, parameterized.TestCase):

    self.assertLess(test_runner.global_step, 10)

-  def test_evaluate_with_loss_outputs(self):
+  def test_evaluate_with_loss_output(self):
    test_evaluator = TestEvaluator()

    checkpoint = tf.train.Checkpoint(model=test_evaluator.model)
@@ -622,6 +641,13 @@ class ControllerTest(tf.test.TestCase, parameterized.TestCase):
        summaries_with_matching_keyword(
            "eval_loss", os.path.join(self.model_dir, "summaries/eval")))

+  def test_evaluate_with_no_output(self):
+    test_controller = controller.Controller(
+        evaluator=TestEvaluatorNoOutput(),
+        global_step=tf.Variable(0, dtype=tf.int64),
+        eval_summary_dir=os.path.join(self.model_dir, "summaries/eval"))
+    self.assertEqual(test_controller.evaluate(steps=5), {})
+
  def test_train_and_evaluate_reset_datasets(self):
    test_runner = TestRunner()

@@ -635,11 +661,9 @@ class ControllerTest(tf.test.TestCase, parameterized.TestCase):
        train_steps=10, eval_steps=2, eval_interval=6)

    train_dataset = (
-        test_runner.strategy.experimental_distribute_datasets_from_function(
-            dataset_fn))
+        test_runner.strategy.distribute_datasets_from_function(dataset_fn))
    eval_dataset = (
-        test_runner.strategy.experimental_distribute_datasets_from_function(
-            dataset_fn))
+        test_runner.strategy.distribute_datasets_from_function(dataset_fn))
    test_runner.train_dataset = train_dataset
    test_runner.eval_dataset = eval_dataset


--- a/orbit/runner.py
+++ b/orbit/runner.py
@@ -12,62 +12,72 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""An abstraction that users can easily handle their custom training loops."""
+"""Provides AbstractTrainer/Evaluator base classes, defining train/eval APIs."""

 import abc
-from typing import Dict, Optional, Text
+
+from typing import Dict, Optional, Union
+
+import numpy as np
 import tensorflow as tf


+Output = Dict[str, Union[tf.Tensor, float, np.number, np.ndarray, 'Output']]  # pytype: disable=not-supported-yet
+
+
 class AbstractTrainer(tf.Module, metaclass=abc.ABCMeta):
-  """An abstract class defining the APIs required for training."""
+  """An abstract class defining the API required for training."""

  @abc.abstractmethod
-  def train(self,
-            num_steps: Optional[tf.Tensor]) -> Optional[Dict[Text, tf.Tensor]]:
-    """Implements model training with multiple steps.
-
-    In training, it is common to break the total training steps into several
-    training loops, so users can do checkpointing, write summaries and run some
-    python callbacks. This is necessary for getting good performance in TPU
-    training, as the overhead for launching a multi worker tf.function may be
-    large in Eager mode. It is usually encouraged to create a host training loop
-    (e.g. using a `tf.range` wrapping `strategy.run` inside a
-    `tf.function`) in the TPU case. For the cases that don't require host
-    training loop to achieve peak performance, users can just implement a simple
-    python loop to drive each step.
+  def train(self, num_steps: tf.Tensor) -> Optional[Output]:
+    """Implements `num_steps` steps of training.
+
+    This method is called by the `Controller` to perform the "inner loop" of
+    training. This inner loop amortizes the cost of bookkeeping associated with
+    checkpointing, evaluation, and writing summaries. Additionally, the inner
+    loop can be implemented (if desired) using TensorFlow's looping constructs
+    (e.g. a `for` loop over a `tf.range` inside a `tf.function`), which can be
+    necessary for getting optimal performance when running on TPU. For cases
+    that don't require peak performance, a simple Python loop can be used
+    instead for simplicity.

    Args:
-      num_steps: A guideline for how many training steps to run. Note that it is
-        up to the model what constitutes a "step" (this may involve more than
-        one update to model parameters, e.g. if training a GAN).
+      num_steps: The number of training steps to run. Note that it is up to the
+        model what constitutes a "step", which may involve more than one update
+        to model parameters (e.g., if training a GAN).

    Returns:
-      The function may return a dictionary of `Tensors` or numpy arrays, which
-      will be written to logs and as TensorBoard summaries. It can also be a
-      nested dictionary, yielding a hierarchy of summary directories.
+      Either `None`, or a dictionary mapping names to `Tensor`s or NumPy values.
+      If a dictionary is returned, it will be written to logs and as TensorBoard
+      summaries. The dictionary may also be nested, which will generate a
+      hierarchy of summary directories.
    """
    pass


 class AbstractEvaluator(tf.Module, metaclass=abc.ABCMeta):
-  """An abstract class defining the APIs required for evaluation."""
+  """An abstract class defining the API required for evaluation."""

  @abc.abstractmethod
-  def evaluate(
-      self, num_steps: Optional[tf.Tensor]) -> Optional[Dict[Text, tf.Tensor]]:
-    """Implements model evaluation.
+  def evaluate(self, num_steps: tf.Tensor) -> Optional[Output]:
+    """Implements `num_steps` steps of evaluation.
+
+    This method is called by the `Controller` to perform an evaluation. The
+    `num_steps` parameter specifies the number of steps of evaluation to run,
+    which is specified by the user when calling one of the `Controller`'s
+    evaluation methods. A special sentinel value of `-1` is reserved to indicate
+    evaluation should run until the underlying data source is exhausted.

    Args:
-      num_steps: A guideline for how many evaluation steps to run. Note that it
-        is up to the model what constitutes a "step". Generally, it may be
-        desirable to support both a limited number of eval steps and iterating
-        over a full dataset (however many steps are required) when `num_steps`
-        is `None`.
+      num_steps: The number of evaluation steps to run. Note that it is up to
+        the model what constitutes a "step". Evaluations may also want to
+        support "complete" evaluations when `num_steps == -1`, running until a
+        given data source is exhausted.

    Returns:
-      The function may return a dictionary of `Tensors` or numpy arrays, which
-      will be written to logs and as TensorBoard summaries. It can also be a
-      nested dictionary, yielding a hierarchy of summary directories.
+      Either `None`, or a dictionary mapping names to `Tensor`s or NumPy values.
+      If a dictionary is returned, it will be written to logs and as TensorBoard
+      summaries. The dictionary may also be nested, which will generate a
+      hierarchy of summary directories.
    """
    pass
--- a/orbit/standard_runner.py
+++ b/orbit/standard_runner.py
@@ -12,11 +12,30 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""AbstractTrainer/Evaluator implementations for standard settings."""
+"""AbstractTrainer/Evaluator subclasses with added functionality.
+
+The classes in this module provide some additional structure to the bare
+`AbstractTrainer`/`AbstractEvaluator` APIs.
+
+Both `StandardTrainer` and `StandardEvaluator` split the train/eval loops into
+"begin", "step", and "end" methods, and provide an implementation of the loop
+itself that makes calls to the relevant step method.
+
+`StandardTrainer` supports running the loop using the TF while loop construct
+for added performance (particularly on TPUs). It additionally provides some
+functionality to make writing summaries from inside a model more performant when
+running on TPUs.
+
+These classes are intended to work well in common settings; however, there may
+be use cases these classes don't support (for instance, `StandardEvaluator` in
+particular doesn't support running full evaluations over multiple different eval
+datasets). Users are encouraged to simply fall back to custom `AbstractTrainer`
+and `AbstractEvaluator` subclasses in these cases.
+"""

 import abc

-from typing import Any, Dict, Optional, Text
+from typing import Any, Optional

 import dataclasses

@@ -65,14 +84,26 @@ def _create_train_loop_fn(train_step_fn, options: StandardTrainerOptions):


 class StandardTrainer(runner.AbstractTrainer, metaclass=abc.ABCMeta):
-  """Implements the standard functionality of AbstractTrainer APIs."""
+  """Implements standard functionality on top of the AbstractTrainer API.
+
+  This class structures the training "inner loop" roughly as follows:
+
+      train_loop_begin()
+      for _ in range(num_steps):
+        train_step(train_iterator)
+      return train_loop_end()
+
+  Calls to `train_loop_begin` and `train_loop_end` are always done in eager
+  mode, while the loop/`train_step` may be implemented using `tf.while_loop`
+  and/or `tf.function`, as determined by the `options` passed to `__init__`.
+  """

  def __init__(self, train_dataset, options: StandardTrainerOptions = None):
-    """Construct a `StandardTrainer` object.
+    """Initializes the `StandardTrainer` instance.

    Args:
-      train_dataset: A tf.nest-compatible structure of tf.data.Dataset or
-        DistributedDataset.
+      train_dataset: A `tf.nest`-compatible structure of `tf.data.Dataset` or
+        `DistributedDataset`.
      options: An `orbit.StandardTrainerOptions` instance.
    """
    options = options or StandardTrainerOptions()
@@ -88,11 +119,16 @@ class StandardTrainer(runner.AbstractTrainer, metaclass=abc.ABCMeta):
    self._train_iter = None
    self._train_loop_fn = None

-  def train(
-      self,
-      num_steps: Optional[tf.Tensor],
-  ) -> Optional[Dict[Text, tf.Tensor]]:
-    """See base class."""
+  def train(self, num_steps: tf.Tensor) -> Optional[runner.Output]:
+    """Implements `num_steps` steps of training.
+
+    Args:
+      num_steps: The number of training steps to run. This corresponds directly
+        to the number of calls made to `train_step`.
+
+    Returns:
+      The output of `train_loop_end`.
+    """
    self.train_loop_begin()

    if self._train_loop_fn is None:
@@ -108,9 +144,10 @@ class StandardTrainer(runner.AbstractTrainer, metaclass=abc.ABCMeta):
  def train_loop_begin(self):
    """Called once at the beginning of the training loop.

-    This method is called before dataset iterators creation.
-    This is a good place to reset metrics that accumulate values over multiple
-    steps of training.
+    This method is always called in eager mode, and is a good place to reset
+    metrics that accumulate values over multiple steps of training.
+
+    Note that this method is called before dataset iterator creation.
    """
    pass

@@ -118,28 +155,30 @@ class StandardTrainer(runner.AbstractTrainer, metaclass=abc.ABCMeta):
  def train_step(self, iterator):
    """Implements one step of training.

-    What a "step" consists of is up to the implementer. If using distribution
-    strategies, the call to this method should take place in the "cross-replica
+    What a "step" consists of is up to the implementer. When using distribution
+    strategies, the call to this method takes place in the "cross-replica
    context" for generality, to allow e.g. multiple iterator dequeues and calls
    to `strategy.run`.

    Note that if `use_tf_function=True`, all the code inside `train_step` should
-    be tf.function compatible, as they will be traced with tf.function. This
-    means you cannot put arbitrary python code in this function. If users have
-    any numpy operations, they should be put in `train_loop_begin` or
-    `train_loop_end` functions.
+    be compatible with `tf.function` tracing (and in particular, any state
+    modifications involving `self` should be avoided). In some cases, non-
+    `tf.function` compatible code can be moved to `train_loop_begin` or
+    `train_loop_end`, which always execute eagerly.

    Args:
-      iterator: A tf.nest-compatible structure of tf.data Iterator or
-        DistributedIterator.
+      iterator: A `tf.nest`-compatible structure of `tf.data.Iterator` or
+        `DistributedIterator`. The structure of this input matches the structure
+        of `train_dataset` as passed to `__init__`.
    """
    pass

-  def train_loop_end(self) -> Optional[Dict[Text, tf.Tensor]]:
-    """Called at the end of the training loop.
+  def train_loop_end(self) -> Optional[runner.Output]:
+    """Called once at the end of the training loop.

-    This is a good place to get metric results. The value returned from this
-    function will be returned as-is from the train() method.
+    This method is always called in eager mode, and is a good place to get
+    metric results. The value returned from this function will be returned as-is
+    from the `train` method implementation provided by `StandardTrainer`.

    Returns:
      The function may return a dictionary of `Tensors`, which will be
@@ -150,18 +189,18 @@ class StandardTrainer(runner.AbstractTrainer, metaclass=abc.ABCMeta):

  @property
  def train_dataset(self):
-    """Returns the train_dataset instance."""
+    """The current training dataset."""
    return self._train_dataset

  @train_dataset.setter
  def train_dataset(self, train_dataset):
-    """Set a new train dataset and replace with the existing one.
+    """Sets a new training dataset, replacing the current one.

-    Any unfinished work in the previous dataset will be discarded.
+    Any unprocessed examples in the current dataset are discarded.

    Args:
-      train_dataset: A tf.nest-compatible structure of tf.data.Dataset or
-        DistributedDataset.
+      train_dataset: A `tf.nest`-compatible structure of `tf.data.Dataset` or
+        `DistributedDataset`.
    """
    self._train_dataset = train_dataset
    self._train_iter = None
@@ -187,25 +226,49 @@ def _create_eval_loop_fn(eval_step_fn, options: StandardEvaluatorOptions):


 class StandardEvaluator(runner.AbstractEvaluator, metaclass=abc.ABCMeta):
-  """Implements the standard functionality of AbstractEvaluator APIs."""
+  """Implements standard functionality on top of the AbstractEvaluator API.
+
+  This class structures evaluation roughly as follows:
+
+      state = eval_begin()
+      for _ in range(num_steps):
+        step_outputs = eval_step(eval_iterator)
+        state = eval_reduce(state, step_outputs)
+      return eval_end(state)
+
+  Calls to `eval_begin`, `eval_reduce`, and `eval_end` are always done in eager
+  mode, while `eval_step` may be compiled with `tf.function` as determined by
+  the `options` passed to `__init__`.
+
+  This class does not support completely evaluating multiple different datasets
+  (i.e., where every example of each dataset should be processed, as opposed to
+  running for a fixed number of evaluation steps). A custom `AbstractEvaluator`
+  is recommended in this case.
+  """

  def __init__(self, eval_dataset, options: StandardEvaluatorOptions = None):
-    """Construct a `StandardEvaluator` object.
+    """Initializes the `StandardEvaluator` instance.

    Args:
-      eval_dataset: A tf.nest-compatible structure of tf.data.Dataset or
-        DistributedDataset.
+      eval_dataset: A `tf.nest`-compatible structure of `tf.data.Dataset` or
+        `DistributedDataset`.
      options: An `orbit.StandardEvaluatorOptions` instance.
    """
    self._eval_options = options or StandardEvaluatorOptions()
    self._eval_dataset = eval_dataset
    self._eval_loop_fn = None

-  def evaluate(
-      self,
-      num_steps: Optional[tf.Tensor],
-  ) -> Optional[Dict[Text, tf.Tensor]]:
-    """See base class."""
+  def evaluate(self, num_steps: tf.Tensor) -> Optional[runner.Output]:
+    """Implements `num_steps` steps of evaluation.
+
+    Args:
+      num_steps: The number of evaluation steps to run. When this is -1,
+        evaluation proceeds until a call to `eval_step` raises a `StopIteration`
+        or `tf.errors.OutOfRangeError`.
+
+    Returns:
+      The output of `self.eval_end()`.
+    """
    outputs = self.eval_begin()  # pylint: disable=assignment-from-no-return

    if self._eval_loop_fn is None:
@@ -224,12 +287,13 @@ class StandardEvaluator(runner.AbstractEvaluator, metaclass=abc.ABCMeta):
  def eval_begin(self) -> Any:
    """Called once at the beginning of the evaluation.

-    This method is called before dataset iterators creation.
-    This is a good place to reset metrics that accumulate values over the entire
-    evaluation.
+    This method is always called in eager mode, and is a good place to reset
+    metrics that accumulate values over the course of evaluation.
+
+    Note that this method is called before dataset iterator creation.

    Returns:
-      An output which is passed as `state` argument into `eval_reduce` function.
+      A value to pass as the `state` argument to `eval_reduce`.
    """
    pass

@@ -237,20 +301,20 @@ class StandardEvaluator(runner.AbstractEvaluator, metaclass=abc.ABCMeta):
  def eval_step(self, iterator) -> Any:
    """Implements one step of evaluation.

-    What a "step" consists of is up to the implementer. If using distribution
-    strategies, the call to this method should take place in the "cross-replica
+    What a "step" consists of is up to the implementer. When using distribution
+    strategies, the call to this method takes place in the "cross-replica
    context" for generality, to allow e.g. multiple iterator dequeues and calls
    to `strategy.run`.

    Note that if `use_tf_function=True`, all the code inside `eval_step` should
-    be tf.function compatible, as they will be traced with tf.function. This
-    means you cannot put arbitrary python code in this function. If users have
-    any numpy operations, they should be put in `eval_begin`, `eval_end` or
-    `eval_reduce` functions.
+    be compatible with `tf.function` tracing (and in particular, any state
+    modifications involving `self` should be avoided). In some cases, non-
+    `tf.function` compatible code can be moved to `eval_begin`, `eval_reduce`,
+    or `eval_end`, which always execute eagerly.

    Args:
-      iterator: A tf.nest-compatible structure of tf.data Iterator or
-        DistributedIterator.
+      iterator: A `tf.nest`-compatible structure of `tf.data.Iterator` or
+        `DistributedIterator`.

    Returns:
      An output which is passed as `step_outputs` argument into `eval_reduce`
@@ -258,14 +322,18 @@ class StandardEvaluator(runner.AbstractEvaluator, metaclass=abc.ABCMeta):
    """
    pass

-  def eval_end(self, *args) -> Optional[Dict[Text, tf.Tensor]]:
+  def eval_end(self, *args) -> Optional[runner.Output]:
    """Called at the end of the evaluation.

-    This is a good place to get metric results. The value returned from this
-    function will be returned as-is from the evaluate() method.
+    Called once at the end of evaluation.
+
+    This method is always called in eager mode, and is a good place to get
+    metric results. The value returned from this function will be returned as-is
+    from the `evaluate` method implementation provided by `StandardEvaluator`.

    Args:
-      *args: the outputs from `eval_reduce` for the last eval step.
+      *args: The outputs from `eval_reduce` for the last eval step, if they are
+        non-`None` (if they are `None`, nothing is passed).

    Returns:
      The function may return a dictionary of `Tensors`, which will be
@@ -274,35 +342,41 @@ class StandardEvaluator(runner.AbstractEvaluator, metaclass=abc.ABCMeta):
    """
    pass

-  def eval_reduce(self, state=None, step_outputs=None) -> Any:
-    """A function to do the reduction on the evaluation outputs per step.
+  def eval_reduce(self,
+                  state: Any = None,
+                  step_outputs: Optional[runner.Output] = None) -> Any:
+    """A function to perform per-step reduction on the evaluation outputs.

-    This is useful for passing states throughout evaluation. E.g. it can be used
-    to maintain the output losses from all the evaluation steps, and compute the
-    mean loss in `eval_end` function.
+    This is useful for passing state throughout evaluation, especially in cases
+    where maintaining or accumulating state is hard to accomplish using
+    `tf.metrics.Metric` or other `tf.Variable`-based approaches. For instance,
+    it can be used to easily accumulate all per-example losses from the full
+    evaluation for subsequent processing in `eval_end()`.

    Args:
-      state: A maintained state throughout the evaluation.
+      state: A state being maintained throughout the evaluation.
      step_outputs: Outputs from the current evaluation step.

    Returns:
-      An output which is passed as `state` argument into `eval_reduce` function
-      for the next step. After evaluation is finished, the output from last step
-      will be passed into `eval_end` function.
+      An output which is passed as the `state` argument to this function for the
+      next step. After evaluation is finished, the output from last step will be
+      passed to `eval_end`.
    """
    pass

  @property
  def eval_dataset(self):
-    """Returns the train_datase instance."""
+    """The current evaluation dataset."""
    return self._eval_dataset

  @eval_dataset.setter
  def eval_dataset(self, eval_dataset):
-    """Set a new eval dataset and replace with the existing one.
+    """Sets a new eval dataset, replacing the current one.
+
+    Any unprocessed examples in the current dataset are discarded.

    Args:
-      eval_dataset: A tf.nest-compatible structure of tf.data.Dataset or
-        DistributedDataset.
+      eval_dataset: A `tf.nest`-compatible structure of `tf.data.Dataset` or
+        `DistributedDataset`.
    """
    self._eval_dataset = eval_dataset
--- a/orbit/standard_runner_test.py
+++ b/orbit/standard_runner_test.py
@@ -39,8 +39,7 @@ class TestTrainer(standard_runner.StandardTrainer):
  def __init__(self, options=None):
    self.strategy = tf.distribute.get_strategy()
    self.global_step = utils.create_global_step()
-    distribute = self.strategy.experimental_distribute_datasets_from_function
-    dataset = distribute(dataset_fn)
+    dataset = self.strategy.distribute_datasets_from_function(dataset_fn)
    super().__init__(train_dataset=dataset, options=options)

  def train_loop_begin(self):
@@ -63,8 +62,7 @@ class TestEvaluator(standard_runner.StandardEvaluator):
  def __init__(self, options=None):
    self.strategy = tf.distribute.get_strategy()
    self.global_step = utils.create_global_step()
-    distribute = self.strategy.experimental_distribute_datasets_from_function
-    dataset = distribute(dataset_fn)
+    dataset = self.strategy.distribute_datasets_from_function(dataset_fn)
    super().__init__(eval_dataset=dataset, options=options)

  def eval_begin(self):

--- a/orbit/utils/__init__.py
+++ b/orbit/utils/__init__.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Defines exported symbols for `orbit.utils` package."""
+"""Defines exported symbols for the `orbit.utils` package."""

 from orbit.utils.common import create_global_step
 from orbit.utils.common import get_value

--- a/orbit/utils/common.py
+++ b/orbit/utils/common.py
@@ -16,7 +16,6 @@

 import inspect

-import numpy as np
 import tensorflow as tf


@@ -46,16 +45,16 @@ def create_global_step() -> tf.Variable:


 def make_distributed_dataset(strategy, dataset_or_fn, *args, **kwargs):
-  """A helper function to create distributed dataset.
+  """A utility function to help create a `tf.distribute.DistributedDataset`.

  Args:
    strategy: An instance of `tf.distribute.Strategy`.
-    dataset_or_fn: A instance of `tf.data.Dataset` or a function which takes an
-      `tf.distribute.InputContext` as input and returns a `tf.data.Dataset`. If
-      it is a function, it could optionally have an argument named
-      `input_context` which is `tf.distribute.InputContext` argument type.
-    *args: The list of arguments to be passed to dataset_or_fn.
-    **kwargs: Any keyword arguments to be passed.
+    dataset_or_fn: An instance of `tf.data.Dataset`, or a "dataset function"
+      returning a `tf.data.Dataset`. If it is a function, it may optionally have
+      an argument named `input_context` which will be passed a
+      `tf.distribute.InputContext` instance.
+    *args: Any positional arguments to pass through to `dataset_or_fn`.
+    **kwargs: Any keyword arguments to pass through to `dataset_or_fn`.

  Returns:
    A distributed Dataset.
@@ -64,38 +63,37 @@ def make_distributed_dataset(strategy, dataset_or_fn, *args, **kwargs):
    strategy = tf.distribute.get_strategy()

  if isinstance(dataset_or_fn, tf.data.Dataset):
-    return strategy.experimental_distribute_dataset(dataset_or_fn)
+    return strategy.distribute_dataset(dataset_or_fn)

  if not callable(dataset_or_fn):
    raise ValueError("`dataset_or_fn` should be either callable or an instance "
-                     "of `tf.data.Dataset`")
+                     "of `tf.data.Dataset`.")

-  def dataset_fn(ctx):
-    """Wrapped dataset function for creating distributed dataset.."""
+  def dataset_fn(input_context):
+    """Wraps `dataset_or_fn` for strategy.distribute_datasets_from_function."""

-    # If `dataset_or_fn` is a function and has `input_context` as argument
-    # names, pass `ctx` as the value of `input_context` when calling
-    # `dataset_or_fn`. Otherwise `ctx` will not be used when calling
-    # `dataset_or_fn`.
+    # If `dataset_or_fn` is a function and has an argument named
+    # `input_context`, pass through the given `input_context`. Otherwise
+    # `input_context` will be ignored.
    argspec = inspect.getfullargspec(dataset_or_fn)
-    args_names = argspec.args
+    arg_names = argspec.args

-    if "input_context" in args_names:
-      kwargs["input_context"] = ctx
-    ds = dataset_or_fn(*args, **kwargs)
-    return ds
+    if "input_context" in arg_names:
+      kwargs["input_context"] = input_context
+    return dataset_or_fn(*args, **kwargs)

-  return strategy.experimental_distribute_datasets_from_function(dataset_fn)
+  return strategy.distribute_datasets_from_function(dataset_fn)


-def get_value(x) -> np.number:
-  """Returns the value of a variable/tensor.
+def get_value(x):
+  """Returns input values, converting any TensorFlow values to NumPy values.

  Args:
-      x: input variable.
+    x: The input. May be a `tf.Tensor` or `tf.Variable`.

  Returns:
-      A Numpy array or number.
+    If the input is a TensorFlow `Tensor`, returns the `Tensor`'s equivalent
+    NumPy value. Otherwise, just returns the input.
  """
  if not tf.is_tensor(x):
    return x

--- a/orbit/utils/epoch_helper.py
+++ b/orbit/utils/epoch_helper.py
@@ -18,14 +18,14 @@ import tensorflow as tf


 class EpochHelper:
-  """A Helper class to handle epochs in Customized Training Loop."""
+  """A helper class to handle bookkeeping of epochs in custom training loops."""

  def __init__(self, epoch_steps: int, global_step: tf.Variable):
-    """Constructs the EpochHelper.
+    """Initializes the `EpochHelper` instance.

    Args:
-      epoch_steps: An integer indicates how many steps in an epoch.
-      global_step: A `tf.Variable` instance indicates the current global step.
+      epoch_steps: An integer indicating how many steps are in an epoch.
+      global_step: A `tf.Variable` providing the current global step.
    """
    self._epoch_steps = epoch_steps
    self._global_step = global_step
@@ -46,7 +46,7 @@ class EpochHelper:
  def epoch_end(self):
    """Returns whether the current epoch should end."""
    if not self._in_epoch:
-      raise ValueError("`epoch_end` can only be called inside an epoch")
+      raise ValueError("`epoch_end` can only be called inside an epoch.")
    current_step = self._global_step.numpy()
    epoch = current_step // self._epoch_steps


--- a/orbit/utils/loop_fns.py
+++ b/orbit/utils/loop_fns.py
@@ -20,36 +20,57 @@ import tensorflow as tf


 def create_loop_fn(step_fn):
-  """Creates a multiple steps function driven by the python while loop.
+  """Creates a loop function driven by a Python `while` loop.

  Args:
-    step_fn: A function which takes `iterator` as input.
+    step_fn: A function taking a nested structure of `tf.data.Iterator` or
+      `DistributedIterator`. There are no constraints on the return value of the
+      function (except that it must be compatible with any `reduce_fn` provided
+      to the returned `loop_fn`).

  Returns:
-    A callable defined as the `loop_fn` defination below.
+    A loop function taking required `iterator` and `num_steps` parameters, as
+    well as optional `state` and `reduce_fn` parameters for accumulating state
+    over multiple iterations of the loop. See the `loop_fn` definition below for
+    additional details.
  """

  def loop_fn(iterator, num_steps, state=None, reduce_fn=None):
-    """A loop function with multiple steps.
+    """Makes `num_steps` calls to `step_fn(iterator)`.
+
+    Additionally, state may be accumulated across iterations of the loop.
+    Conceptually, state accumulation is handled roughly as follows:
+
+        for _ in range(num_steps):
+          step_outputs  = step_fn(iterator)
+          state = reduce_fn(state, step_outputs)
+        return state
+
+    However, the implementation is slightly more complicated in order to support
+    looping until the iterator is exhausted (when `num_steps == -1`) and to
+    properly catch exceptions when running under async remote eager (as is the
+    case in TPU training setups involving separate coordinator/worker machines).

    Args:
-      iterator: A nested structure of tf.data `Iterator` or
+      iterator: A nested structure of `tf.data.Iterator` or
        `DistributedIterator`.
-      num_steps: The number of steps in the loop. If `num_steps==-1`, will
+      num_steps: The number of steps in the loop. If `num_steps == -1`, will
         iterate until exhausting the iterator.
      state: An optional initial state before running the loop.
-      reduce_fn: a callable defined as `def reduce_fn(state, value)`, where
-        `value` is the outputs from `step_fn`.
+      reduce_fn: A callable taking two inputs, `state` and `value`, where
+        `state` is the previous output from `reduce_fn`, and `value` is the
+        output from `step_fn`.

    Returns:
-      The updated state.
+      The final state returned by `reduce_fn`, or `None` if `state` and
+      `reduce_fn` are not provided.
    """
    try:
      step = 0
-      # To make sure the OutOfRangeError exception can be handled well with
-      # async remote eager, we need to wrap the loop body in a `async_scope`.
+      # To make sure the OutOfRangeError exception can be handled well under
+      # async remote eager, we need to wrap the loop body in `async_scope`.
      with tf.experimental.async_scope():
-        while (num_steps == -1 or step < num_steps):
+        while num_steps == -1 or step < num_steps:
          outputs = step_fn(iterator)
          if reduce_fn is not None:
            state = reduce_fn(state, outputs)
@@ -63,26 +84,32 @@ def create_loop_fn(step_fn):


 def create_tf_while_loop_fn(step_fn):
-  """Create a multiple steps function driven by tf.while_loop on the host.
+  """Creates a loop function compatible with TF's AutoGraph loop conversion.

  Args:
-    step_fn: A function which takes `iterator` as input.
+    step_fn: A function taking a nested structure of `tf.data.Iterator` or
+      `DistributedIterator`. Currently, any return values are ignored.

  Returns:
-    A callable defined as the `loop_fn` defination below.
+    A loop function taking required `iterator` and `num_steps` parameters. If
+    called inside a `tf.function`, the loop will be converted by AutoGraph into
+    a `tf.while_loop` construct. See the `loop_fn` definition below for
+    additional details.
  """

  def loop_fn(iterator, num_steps):
-    """A loop function with multiple steps.
+    """Makes `num_steps` calls to `step_fn(iterator)`.

    Args:
-      iterator: A nested structure of tf.data `Iterator` or
+      iterator: A nested structure of `tf.data.Iterator` or
        `DistributedIterator`.
-      num_steps: The number of steps in the loop. Must be a tf.Tensor.
+      num_steps: The number of steps in the loop. Should be passed as a
+        `tf.Tensor`. Iterating until iterator exhaustion is not supported.
    """
    if not isinstance(num_steps, tf.Tensor):
-      raise ValueError("`num_steps` should be an `tf.Tensor`. Python object "
-                       "may cause retracing.")
+      raise ValueError(
+          "`num_steps` should be a `tf.Tensor`. Passing a Python value can "
+          "cause unnecessary retracing when wrapped by `tf.function`.")

    for _ in tf.range(num_steps):
      step_fn(iterator)

--- a/orbit/utils/summary_manager.py
+++ b/orbit/utils/summary_manager.py
@@ -20,18 +20,19 @@ import tensorflow as tf


 class SummaryManager:
-  """A class manages writing summaries."""
+  """A utility class for managing summary writing."""

  def __init__(self, summary_dir, summary_fn, global_step=None):
-    """Construct a summary manager object.
+    """Initializes the `SummaryManager` instance.

    Args:
-      summary_dir: the directory to write summaries.
-      summary_fn: A callable defined as `def summary_fn(name, tensor,
-        step=None)`, which describes the summary operation.
-      global_step: A `tf.Variable` instance for the global step.
+      summary_dir: The directory in which to write summaries. If `None`, all
+        summary writing operations provided by this class are no-ops.
+      summary_fn: A callable accepting `name`, `value`, and `step`
+        parameters, making calls to `tf.summary` functions to write summaries.
+      global_step: A `tf.Variable` containing the global step value.
    """
-    self._enabled = (summary_dir is not None)
+    self._enabled = summary_dir is not None
    self._summary_dir = summary_dir
    self._summary_fn = summary_fn
    self._summary_writers = {}
@@ -42,12 +43,12 @@ class SummaryManager:
      self._global_step = global_step

  def summary_writer(self, relative_path=""):
-    """Returns the underlying summary writer.
+    """Returns the underlying summary writer for a specific subdirectory.

    Args:
      relative_path: The current path in which to write summaries, relative to
-        the summary directory. By default it is empty, which specifies the root
-        directory.
+        the summary directory. By default it is empty, which corresponds to the
+        root directory.
    """
    if self._summary_writers and relative_path in self._summary_writers:
      return self._summary_writers[relative_path]
@@ -59,43 +60,41 @@ class SummaryManager:
    return self._summary_writers[relative_path]

  def flush(self):
-    """Flush the underlying summary writers."""
+    """Flushes the underlying summary writers."""
    if self._enabled:
      tf.nest.map_structure(tf.summary.flush, self._summary_writers)

  def write_summaries(self, summary_dict):
-    """Write summaries for the given values.
+    """Writes summaries for the given dictionary of values.

    This recursively creates subdirectories for any nested dictionaries
    provided in `summary_dict`, yielding a hierarchy of directories which will
    then be reflected in the TensorBoard UI as different colored curves.

-    E.g. users may evaluate on muliple datasets and return `summary_dict` as a
-    nested dictionary.
+    For example, users may evaluate on multiple datasets and return
+    `summary_dict` as a nested dictionary:

-    ```
        {
-        "dataset": {
-            "loss": loss,
-            "accuracy": accuracy
+            "dataset1": {
+                "loss": loss1,
+                "accuracy": accuracy1
            },
            "dataset2": {
                "loss": loss2,
                "accuracy": accuracy2
            },
        }
-    ```

-    This will create two subdirectories "dataset" and "dataset2" inside the
+    This will create two subdirectories, "dataset1" and "dataset2", inside the
    summary root directory. Each directory will contain event files including
    both "loss" and "accuracy" summaries.

    Args:
      summary_dict: A dictionary of values. If any value in `summary_dict` is
-        itself a dictionary, then the function will recursively create
-        subdirectories with names given by the keys in the dictionary. The
-        Tensor values are summarized using the summary writer instance specific
-        to the parent relative path.
+        itself a dictionary, then the function will create a subdirectory with
+        name given by the corresponding key. This is performed recursively. Leaf
+        values are then summarized using the summary writer instance specific to
+        the parent relative path.
    """
    if not self._enabled:
      return