Commit ab5d4180 authored by Dan Holtmann-Rice, committed by A. Unique TensorFlower

Internal change

PiperOrigin-RevId: 335446217
parent 6cf1e5a1
![TensorFlow Requirement: 2.x](https://img.shields.io/badge/TensorFlow%20Requirement-2.x-brightgreen)
# Orbit
Orbit is a customized training loop library built on top of TensorFlow 2. It
provides a flexible, lightweight library that users can easily use or fork when
writing [customized training loop code](https://www.tensorflow.org/tutorials/distribute/custom_training)
in TF2. It integrates with `tf.distribute` seamlessly and supports running on
different device types (CPU, GPU, and TPU).
Orbit is a flexible, lightweight library designed to make it easy to write
[custom training loops][custom_training] in TensorFlow 2. Orbit handles common
model training tasks such as saving checkpoints, running model evaluations, and
setting up summary writing, while giving users full control over implementing
the inner training loop. It integrates with `tf.distribute` seamlessly and
supports running on different device types (CPU, GPU, and TPU). The core code is
intended to be easy to read and fork.
See our [g3doc](g3doc) at go/orbit-trainer for additional documentation.
[custom_training]: https://www.tensorflow.org/tutorials/distribute/custom_training
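
A minimal usage sketch is below. It follows the patterns in this change's test
code (`orbit.controller.Controller` driving a `StandardTrainer` subclass); the
toy model and dataset are illustrative assumptions, not a canonical recipe:

```python
import tensorflow as tf

from orbit import controller
from orbit import standard_runner


class MyTrainer(standard_runner.StandardTrainer):
  """A toy trainer; the model and data here are placeholders."""

  def __init__(self):
    self.model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
    self.optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)
    self.global_step = self.optimizer.iterations
    self.train_loss = tf.keras.metrics.Mean("train_loss", dtype=tf.float32)
    dataset = tf.data.Dataset.from_tensor_slices(
        (tf.random.normal([64, 4]), tf.random.normal([64, 1])))
    super().__init__(train_dataset=dataset.batch(8).repeat())

  def train_loop_begin(self):
    self.train_loss.reset_states()

  def train_step(self, iterator):
    x, y = next(iterator)
    with tf.GradientTape() as tape:
      loss = tf.reduce_mean(tf.keras.losses.mse(y, self.model(x)))
    grads = tape.gradient(loss, self.model.trainable_variables)
    self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))
    self.train_loss.update_state(loss)

  def train_loop_end(self):
    return {"train_loss": self.train_loss.result()}


trainer = MyTrainer()
loop = controller.Controller(
    trainer=trainer, global_step=trainer.global_step, steps_per_loop=10)
loop.train(100)
```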
......@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Defines exported symbols for `orbit` package."""
"""Defines exported symbols for the `orbit` package."""
from orbit import utils
......
......@@ -15,10 +15,14 @@
"""Tests for orbit.controller."""
import os
from absl import logging
from absl.testing import parameterized
import numpy as np
from orbit import controller
from orbit import runner
from orbit import standard_runner
import tensorflow as tf
......@@ -65,12 +69,8 @@ class TestRunner(standard_runner.StandardTrainer,
self.train_loss = tf.keras.metrics.Mean("train_loss", dtype=tf.float32)
self.eval_loss = tf.keras.metrics.Mean("eval_loss", dtype=tf.float32)
self.return_numpy = return_numpy
train_dataset = (
self.strategy.experimental_distribute_datasets_from_function(dataset_fn)
)
eval_dataset = (
self.strategy.experimental_distribute_datasets_from_function(dataset_fn)
)
train_dataset = self.strategy.distribute_datasets_from_function(dataset_fn)
eval_dataset = self.strategy.distribute_datasets_from_function(dataset_fn)
standard_runner.StandardTrainer.__init__(self, train_dataset)
standard_runner.StandardEvaluator.__init__(self, eval_dataset)
......@@ -95,8 +95,7 @@ class TestRunner(standard_runner.StandardTrainer,
}
def build_eval_dataset(self):
return self.strategy.experimental_distribute_datasets_from_function(
dataset_fn)
return self.strategy.distribute_datasets_from_function(dataset_fn)
def eval_begin(self):
self.eval_loss.reset_states()
......@@ -125,8 +124,7 @@ class TestEvaluator(standard_runner.StandardEvaluator):
def __init__(self):
self.strategy = tf.distribute.get_strategy()
self.model = create_model()
eval_dataset = self.strategy.experimental_distribute_datasets_from_function(
dataset_fn)
eval_dataset = self.strategy.distribute_datasets_from_function(dataset_fn)
standard_runner.StandardEvaluator.__init__(self, eval_dataset)
def eval_reduce(self, state, output):
......@@ -157,16 +155,20 @@ class TestEvaluator(standard_runner.StandardEvaluator):
}
class TestEvaluatorNoOutput(runner.AbstractEvaluator):
def evaluate(self, num_steps):
pass
class TestEvaluatorWithNestedSummary(standard_runner.StandardEvaluator):
"""Implements the training and evaluation APIs for the test model."""
def __init__(self):
self.strategy = tf.distribute.get_strategy()
self.model = create_model()
dataset = self.strategy.experimental_distribute_datasets_from_function(
dataset_fn)
dataset2 = self.strategy.experimental_distribute_datasets_from_function(
dataset_fn)
dataset = self.strategy.distribute_datasets_from_function(dataset_fn)
dataset2 = self.strategy.distribute_datasets_from_function(dataset_fn)
self.loss = tf.keras.metrics.Mean("loss", dtype=tf.float32)
self.accuracy = tf.keras.metrics.CategoricalAccuracy(
"accuracy", dtype=tf.float32)
......@@ -217,9 +219,7 @@ class TestTrainerWithSummaries(standard_runner.StandardTrainer):
self.optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.1)
self.global_step = self.optimizer.iterations
self.train_loss = tf.keras.metrics.Mean("train_loss", dtype=tf.float32)
train_dataset = (
self.strategy.experimental_distribute_datasets_from_function(dataset_fn)
)
train_dataset = self.strategy.distribute_datasets_from_function(dataset_fn)
standard_runner.StandardTrainer.__init__(
self,
train_dataset,
......@@ -227,8 +227,7 @@ class TestTrainerWithSummaries(standard_runner.StandardTrainer):
use_tpu_summary_optimization=True))
def build_train_dataset(self):
return self.strategy.experimental_distribute_datasets_from_function(
dataset_fn)
return self.strategy.distribute_datasets_from_function(dataset_fn)
def train_step(self, iterator):
......@@ -344,6 +343,26 @@ class ControllerTest(tf.test.TestCase, parameterized.TestCase):
self.assertNotEmpty(tf.io.gfile.glob(
os.path.join(self.model_dir, "summaries/eval/events.*")))
def test_restore_from_most_recent_checkpoint(self):
test_runner = TestRunner()
checkpoint = tf.train.Checkpoint(model=test_runner.model)
checkpoint_manager = tf.train.CheckpointManager(
checkpoint,
self.model_dir,
max_to_keep=None,
step_counter=test_runner.global_step,
checkpoint_interval=5)
test_controller = controller.Controller(
trainer=test_runner,
global_step=test_runner.global_step,
checkpoint_manager=checkpoint_manager,
eval_summary_dir=os.path.join(self.model_dir, "summaries/eval"),
steps_per_loop=5)
test_controller.train(20)
self.assertLen(checkpoint_manager.checkpoints, 4)
restored_path = test_controller.restore_checkpoint()
self.assertEqual(restored_path, checkpoint_manager.checkpoints[-1])
@parameterized.named_parameters(("return_numpy", True),
("return_tensor", False))
def test_train_and_evaluate(self, return_numpy):
......@@ -601,7 +620,7 @@ class ControllerTest(tf.test.TestCase, parameterized.TestCase):
self.assertLess(test_runner.global_step, 10)
def test_evaluate_with_loss_outputs(self):
def test_evaluate_with_loss_output(self):
test_evaluator = TestEvaluator()
checkpoint = tf.train.Checkpoint(model=test_evaluator.model)
......@@ -622,6 +641,13 @@ class ControllerTest(tf.test.TestCase, parameterized.TestCase):
summaries_with_matching_keyword(
"eval_loss", os.path.join(self.model_dir, "summaries/eval")))
def test_evaluate_with_no_output(self):
test_controller = controller.Controller(
evaluator=TestEvaluatorNoOutput(),
global_step=tf.Variable(0, dtype=tf.int64),
eval_summary_dir=os.path.join(self.model_dir, "summaries/eval"))
self.assertEqual(test_controller.evaluate(steps=5), {})
def test_train_and_evaluate_reset_datasets(self):
test_runner = TestRunner()
......@@ -635,11 +661,9 @@ class ControllerTest(tf.test.TestCase, parameterized.TestCase):
train_steps=10, eval_steps=2, eval_interval=6)
train_dataset = (
test_runner.strategy.experimental_distribute_datasets_from_function(
dataset_fn))
test_runner.strategy.distribute_datasets_from_function(dataset_fn))
eval_dataset = (
test_runner.strategy.experimental_distribute_datasets_from_function(
dataset_fn))
test_runner.strategy.distribute_datasets_from_function(dataset_fn))
test_runner.train_dataset = train_dataset
test_runner.eval_dataset = eval_dataset
......
......@@ -12,62 +12,72 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""An abstraction that users can easily handle their custom training loops."""
"""Provides AbstractTrainer/Evaluator base classes, defining train/eval APIs."""
import abc
from typing import Dict, Optional, Text
from typing import Dict, Optional, Union
import numpy as np
import tensorflow as tf
Output = Dict[str, Union[tf.Tensor, float, np.number, np.ndarray, 'Output']] # pytype: disable=not-supported-yet
class AbstractTrainer(tf.Module, metaclass=abc.ABCMeta):
"""An abstract class defining the APIs required for training."""
"""An abstract class defining the API required for training."""
@abc.abstractmethod
def train(self,
num_steps: Optional[tf.Tensor]) -> Optional[Dict[Text, tf.Tensor]]:
"""Implements model training with multiple steps.
In training, it is common to break the total training steps into several
training loops, so users can do checkpointing, write summaries and run some
python callbacks. This is necessary for getting good performance in TPU
training, as the overhead for launching a multi worker tf.function may be
large in Eager mode. It is usually encouraged to create a host training loop
(e.g. using a `tf.range` wrapping `strategy.run` inside a
`tf.function`) in the TPU case. For the cases that don't require host
training loop to achieve peak performance, users can just implement a simple
python loop to drive each step.
def train(self, num_steps: tf.Tensor) -> Optional[Output]:
"""Implements `num_steps` steps of training.
This method will be called by the `Controller` to perform the "inner loop" of
training. This inner loop amortizes the cost of bookkeeping associated with
checkpointing, evaluation, and writing summaries. Additionally, the inner
loop can be implemented (if desired) using TensorFlow's looping constructs
(e.g. a `for` loop over a `tf.range` inside a `tf.function`), which can be
necessary for getting optimal performance when running on TPU. For cases
that don't require peak performance, a simple Python loop can be used
instead for simplicity.
Args:
num_steps: A guideline for how many training steps to run. Note that it is
up to the model what constitutes a "step" (this may involve more than
one update to model parameters, e.g. if training a GAN).
num_steps: The number of training steps to run. Note that it is up to the
model what constitutes a "step", which may involve more than one update
to model parameters (e.g., if training a GAN).
Returns:
The function may return a dictionary of `Tensors` or numpy arrays, which
will be written to logs and as TensorBoard summaries. It can also be a
nested dictionary, yielding a hierarchy of summary directories.
Either `None`, or a dictionary mapping names to `Tensor`s or NumPy values.
If a dictionary is returned, it will be written to logs and as TensorBoard
summaries. The dictionary may also be nested, which will generate a
hierarchy of summary directories.
"""
pass
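
As a concrete (hypothetical) illustration of the inner-loop guidance above, the
sketch below compiles the loop with `tf.function` over `tf.range`; the
`model`/`optimizer`/`train_iterator` attributes are assumptions of this example,
not part of the `AbstractTrainer` API (and it relies on this module's imports):

```python
class SketchTrainer(AbstractTrainer):
  """Hypothetical trainer with a `tf.function`-compiled inner loop."""

  def __init__(self, model, optimizer, train_iterator):
    super().__init__()
    self.model = model                    # assumed attributes, not part of
    self.optimizer = optimizer            # the `AbstractTrainer` API
    self.train_iterator = train_iterator
    self.train_loss = tf.keras.metrics.Mean("train_loss", dtype=tf.float32)

  @tf.function
  def _inner_loop(self, num_steps):
    # A `for` over `tf.range` inside a `tf.function` is converted by AutoGraph
    # into a single `tf.while_loop`, amortizing per-step launch overhead,
    # which matters most on TPU.
    for _ in tf.range(num_steps):
      x, y = next(self.train_iterator)
      with tf.GradientTape() as tape:
        loss = tf.reduce_mean(tf.keras.losses.mse(y, self.model(x)))
      grads = tape.gradient(loss, self.model.trainable_variables)
      self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))
      self.train_loss.update_state(loss)

  def train(self, num_steps):
    self.train_loss.reset_states()
    self._inner_loop(num_steps)  # pass `num_steps` as a `tf.Tensor`
    return {"train_loss": self.train_loss.result()}
```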
class AbstractEvaluator(tf.Module, metaclass=abc.ABCMeta):
"""An abstract class defining the APIs required for evaluation."""
"""An abstract class defining the API required for evaluation."""
@abc.abstractmethod
def evaluate(
self, num_steps: Optional[tf.Tensor]) -> Optional[Dict[Text, tf.Tensor]]:
"""Implements model evaluation.
def evaluate(self, num_steps: tf.Tensor) -> Optional[Output]:
"""Implements `num_steps` steps of evaluation.
This method will be called by the `Controller` to perform an evaluation. The
`num_steps` parameter specifies the number of steps of evaluation to run,
which is specified by the user when calling one of the `Controller`'s
evaluation methods. A special sentinel value of `-1` is reserved to indicate
that evaluation should run until the underlying data source is exhausted.
Args:
num_steps: A guideline for how many evaluation steps to run. Note that it
is up to the model what constitutes a "step". Generally, it may be
desirable to support both a limited number of eval steps and iterating
over a full dataset (however many steps are required) when `num_steps`
is `None`.
num_steps: The number of evaluation steps to run. Note that it is up to
the model what constitutes a "step". Evaluations may also want to
support "complete" evaluations when `num_steps == -1`, running until a
given data source is exhausted.
Returns:
The function may return a dictionary of `Tensors` or numpy arrays, which
will be written to logs and as TensorBoard summaries. It can also be a
nested dictionary, yielding a hierarchy of summary directories.
Either `None`, or a dictionary mapping names to `Tensor`s or NumPy values.
If a dictionary is returned, it will be written to logs and as TensorBoard
summaries. The dictionary may also be nested, which will generate a
hierarchy of summary directories.
"""
pass
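
A sketch of one way a custom evaluator might honor the `-1` sentinel described
above; the `model`/`eval_dataset` attributes are assumptions of this example,
not part of the `AbstractEvaluator` API:

```python
class ExhaustiveEvaluator(AbstractEvaluator):
  """Hypothetical evaluator honoring the `num_steps == -1` sentinel."""

  def __init__(self, model, eval_dataset):
    super().__init__()
    self.model = model                # assumed attributes, not part of
    self.eval_dataset = eval_dataset  # the `AbstractEvaluator` API
    self.eval_loss = tf.keras.metrics.Mean("eval_loss", dtype=tf.float32)

  def evaluate(self, num_steps):
    self.eval_loss.reset_states()
    iterator = iter(self.eval_dataset)
    step = 0
    # `num_steps == -1` means "run until the data source is exhausted".
    while num_steps == -1 or step < num_steps:
      try:
        x, y = next(iterator)
      except (StopIteration, tf.errors.OutOfRangeError):
        break
      self.eval_loss.update_state(
          tf.reduce_mean(tf.keras.losses.mse(y, self.model(x))))
      step += 1
    return {"eval_loss": self.eval_loss.result()}
```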
......@@ -12,11 +12,30 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""AbstractTrainer/Evaluator implementations for standard settings."""
"""AbstractTrainer/Evaluator subclasses with added functionality.
The classes in this module provide some additional structure to the bare
`AbstractTrainer`/`AbstractEvaluator` APIs.
Both `StandardTrainer` and `StandardEvaluator` split the train/eval loops into
"begin", "step", and "end" methods, and provide an implementation of the loop
itself that makes calls to the relevant step method.
`StandardTrainer` supports running the loop using the TF while loop construct
for added performance (particularly on TPUs). It additionally provides some
functionality to make writing summaries from inside a model more performant when
running on TPUs.
These classes are intended to work well in common settings; however, there may
be use cases they don't support (for instance, `StandardEvaluator` doesn't
support running full evaluations over multiple different eval datasets). Users
are encouraged to simply fall back to custom `AbstractTrainer` and
`AbstractEvaluator` subclasses in these cases.
"""
import abc
from typing import Any, Dict, Optional, Text
from typing import Any, Optional
import dataclasses
......@@ -65,14 +84,26 @@ def _create_train_loop_fn(train_step_fn, options: StandardTrainerOptions):
class StandardTrainer(runner.AbstractTrainer, metaclass=abc.ABCMeta):
"""Implements the standard functionality of AbstractTrainer APIs."""
"""Implements standard functionality on top of the AbstractTrainer API.
This class structures the training "inner loop" roughly as follows:
train_loop_begin()
for _ in range(num_steps):
train_step(train_iterator)
return train_loop_end()
Calls to `train_loop_begin` and `train_loop_end` are always done in eager
mode, while the loop/`train_step` may be implemented using `tf.while` and/or
`tf.function`, as determined by the `options` passed to `__init__`.
"""
def __init__(self, train_dataset, options: StandardTrainerOptions = None):
"""Construct a `StandardTrainer` object.
"""Initializes the `StandardTrainer` instance.
Args:
train_dataset: A tf.nest-compatible structure of tf.data.Dataset or
DistributedDataset.
train_dataset: A `tf.nest`-compatible structure of `tf.data.Dataset` or
`DistributedDataset`.
options: An `orbit.StandardTrainerOptions` instance.
"""
options = options or StandardTrainerOptions()
......@@ -88,11 +119,16 @@ class StandardTrainer(runner.AbstractTrainer, metaclass=abc.ABCMeta):
self._train_iter = None
self._train_loop_fn = None
def train(
self,
num_steps: Optional[tf.Tensor],
) -> Optional[Dict[Text, tf.Tensor]]:
"""See base class."""
def train(self, num_steps: tf.Tensor) -> Optional[runner.Output]:
"""Implements `num_steps` steps of training.
Args:
num_steps: The number of training steps to run. This corresponds directly
to the number of calls made to `train_step`.
Returns:
The output of `train_loop_end`.
"""
self.train_loop_begin()
if self._train_loop_fn is None:
......@@ -108,9 +144,10 @@ class StandardTrainer(runner.AbstractTrainer, metaclass=abc.ABCMeta):
def train_loop_begin(self):
"""Called once at the beginning of the training loop.
This method is called before dataset iterator creation.
This is a good place to reset metrics that accumulate values over multiple
steps of training.
This method is always called in eager mode, and is a good place to reset
metrics that accumulate values over multiple steps of training.
Note that this method is called before dataset iterator creation.
"""
pass
......@@ -118,28 +155,30 @@ class StandardTrainer(runner.AbstractTrainer, metaclass=abc.ABCMeta):
def train_step(self, iterator):
"""Implements one step of training.
What a "step" consists of is up to the implementer. If using distribution
strategies, the call to this method should take place in the "cross-replica
What a "step" consists of is up to the implementer. When using distribution
strategies, the call to this method takes place in the "cross-replica
context" for generality, to allow e.g. multiple iterator dequeues and calls
to `strategy.run`.
Note that if `use_tf_function=True`, all the code inside `train_step` should
be tf.function compatible, as they will be traced with tf.function. This
means you cannot put arbitrary python code in this function. If users have
any numpy operations, they should be put in `train_loop_begin` or
`train_loop_end` functions.
be compatible with `tf.function` tracing (and in particular, any state
modifications involving `self` should be avoided). In some cases, non-
`tf.function` compatible code can be moved to `train_loop_begin` or
`train_loop_end`, which always execute eagerly.
Args:
iterator: A tf.nest-compatible structure of tf.data Iterator or
DistributedIterator.
iterator: A `tf.nest`-compatible structure of `tf.data.Iterator` or
`DistributedIterator`. The structure of this input matches the structure
of `train_dataset` as passed to `__init__`.
"""
pass
def train_loop_end(self) -> Optional[Dict[Text, tf.Tensor]]:
"""Called at the end of the training loop.
def train_loop_end(self) -> Optional[runner.Output]:
"""Called once at the end of the training loop.
This is a good place to get metric results. The value returned from this
function will be returned as-is from the train() method.
This method is always called in eager mode, and is a good place to get
metric results. The value returned from this function will be returned as-is
from the `train` method implementation provided by `StandardTrainer`.
Returns:
The function may return a dictionary of `Tensors`, which will be
......@@ -150,18 +189,18 @@ class StandardTrainer(runner.AbstractTrainer, metaclass=abc.ABCMeta):
@property
def train_dataset(self):
"""Returns the train_dataset instance."""
"""The current training dataset."""
return self._train_dataset
@train_dataset.setter
def train_dataset(self, train_dataset):
"""Set a new train dataset and replace with the existing one.
"""Sets a new training dataset, replacing the current one.
Any unfinished work in the previous dataset will be discarded.
Any unprocessed examples in the current dataset are discarded.
Args:
train_dataset: A tf.nest-compatible structure of tf.data.Dataset or
DistributedDataset.
train_dataset: A `tf.nest`-compatible structure of `tf.data.Dataset` or
`DistributedDataset`.
"""
self._train_dataset = train_dataset
self._train_iter = None
......@@ -187,25 +226,49 @@ def _create_eval_loop_fn(eval_step_fn, options: StandardEvaluatorOptions):
class StandardEvaluator(runner.AbstractEvaluator, metaclass=abc.ABCMeta):
"""Implements the standard functionality of AbstractEvaluator APIs."""
"""Implements the standard functionality of AbstractEvaluator APIs.
This class structures evaluation roughly as follows:
state = eval_begin()
for _ in range(num_steps):
step_outputs = eval_step(eval_iterator)
state = eval_reduce(state, step_outputs)
return eval_end(state)
Calls to `eval_begin`, `eval_reduce`, and `eval_end` are always done in eager
mode, while `eval_step` may be compiled with `tf.function` as determined by
the `options` passed to `__init__`.
This class does not support completely evaluating multiple different datasets
(i.e., where every example of each dataset should be processed, as opposed to
running for a fixed number of evaluation steps). A custom `AbstractEvaluator`
is recommended in this case.
"""
def __init__(self, eval_dataset, options: StandardEvaluatorOptions = None):
"""Construct a `StandardEvaluator` object.
"""Initializes the `StandardEvaluator` instance.
Args:
eval_dataset: A tf.nest-compatible structure of tf.data.Dataset or
DistributedDataset.
eval_dataset: A `tf.nest`-compatible structure of `tf.data.Dataset` or
`DistributedDataset`.
options: An `orbit.StandardEvaluatorOptions` instance.
"""
self._eval_options = options or StandardEvaluatorOptions()
self._eval_dataset = eval_dataset
self._eval_loop_fn = None
def evaluate(
self,
num_steps: Optional[tf.Tensor],
) -> Optional[Dict[Text, tf.Tensor]]:
"""See base class."""
def evaluate(self, num_steps: tf.Tensor) -> Optional[runner.Output]:
"""Implements `num_steps` steps of evaluation.
Args:
num_steps: The number of evaluation steps to run. When this is -1,
evaluation proceeds until a call to `eval_step` raises a `StopIteration`
or `tf.errors.OutOfRangeError`.
Returns:
The output of `self.eval_end()`.
"""
outputs = self.eval_begin() # pylint: disable=assignment-from-no-return
if self._eval_loop_fn is None:
......@@ -224,12 +287,13 @@ class StandardEvaluator(runner.AbstractEvaluator, metaclass=abc.ABCMeta):
def eval_begin(self) -> Any:
"""Called once at the beginning of the evaluation.
This method is called before dataset iterator creation.
This is a good place to reset metrics that accumulate values over the entire
evaluation.
This method is always called in eager mode, and is a good place to reset
metrics that accumulate values over the course of evaluation.
Note that this method is called before dataset iterator creation.
Returns:
An output which is passed as `state` argument into `eval_reduce` function.
A value to pass as the `state` argument to `eval_reduce`.
"""
pass
......@@ -237,20 +301,20 @@ class StandardEvaluator(runner.AbstractEvaluator, metaclass=abc.ABCMeta):
def eval_step(self, iterator) -> Any:
"""Implements one step of evaluation.
What a "step" consists of is up to the implementer. If using distribution
strategies, the call to this method should take place in the "cross-replica
What a "step" consists of is up to the implementer. When using distribution
strategies, the call to this method takes place in the "cross-replica
context" for generality, to allow e.g. multiple iterator dequeues and calls
to `strategy.run`.
Note that if `use_tf_function=True`, all the code inside `eval_step` should
be tf.function compatible, as they will be traced with tf.function. This
means you cannot put arbitrary python code in this function. If users have
any numpy operations, they should be put in `eval_begin`, `eval_end` or
`eval_reduce` functions.
be compatible with `tf.function` tracing (and in particular, any state
modifications involving `self` should be avoided). In some cases, non-
`tf.function` compatible code can be moved to `eval_begin`, `eval_reduce`, or
`eval_end`, which always execute eagerly.
Args:
iterator: A tf.nest-compatible structure of tf.data Iterator or
DistributedIterator.
iterator: A `tf.nest`-compatible structure of `tf.data.Iterator` or
`DistributedIterator`.
Returns:
An output which is passed as `step_outputs` argument into `eval_reduce`
......@@ -258,14 +322,18 @@ class StandardEvaluator(runner.AbstractEvaluator, metaclass=abc.ABCMeta):
"""
pass
def eval_end(self, *args) -> Optional[Dict[Text, tf.Tensor]]:
def eval_end(self, *args) -> Optional[runner.Output]:
"""Called at the end of the evaluation.
This is a good place to get metric results. The value returned from this
function will be returned as-is from the evaluate() method.
Called once at the end of evaluation.
This method is always called in eager mode, and is a good place to get
metric results. The value returned from this function will be returned as-is
from the `evaluate` method implementation provided by `StandardEvaluator`.
Args:
*args: the outputs from `eval_reduce` for the last eval step.
*args: The outputs from `eval_reduce` for the last eval step, if they are
non-`None` (if they are `None`, nothing is passed).
Returns:
The function may return a dictionary of `Tensors`, which will be
......@@ -274,35 +342,41 @@ class StandardEvaluator(runner.AbstractEvaluator, metaclass=abc.ABCMeta):
"""
pass
def eval_reduce(self, state=None, step_outputs=None) -> Any:
"""A function to do the reduction on the evaluation outputs per step.
def eval_reduce(self,
state: Any = None,
step_outputs: Optional[runner.Output] = None) -> Any:
"""A function to perform per-step reduction on the evaluation outputs.
This is useful for passing states throughout evaluation. E.g. it can be used
to maintain the output losses from all the evaluation steps, and compute the
mean loss in `eval_end` function.
This is useful for passing state throughout evaluation, especially in cases
where maintaining or accumulating state is hard to accomplish using
`tf.metrics.Metric` or other `tf.Variable`-based approaches. For instance,
it can be used to easily accumulate all per-example losses from the full
evaluation for subsequent processing in `eval_end()`.
Args:
state: A maintained state throughout the evaluation.
state: A state being maintained throughout the evaluation.
step_outputs: Outputs from the current evaluation step.
Returns:
An output which is passed as `state` argument into `eval_reduce` function
for the next step. After evaluation is finished, the output from last step
will be passed into `eval_end` function.
An output which is passed as the `state` argument to this function for the
next step. After evaluation is finished, the output from last step will be
passed to `eval_end`.
"""
pass
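
A sketch of the loss-accumulation pattern described above, using `eval_reduce`
to carry a Python list as `state` (the model and dataset are placeholders):

```python
class LossCollectingEvaluator(StandardEvaluator):
  """Uses `eval_reduce` to collect per-step losses for `eval_end`."""

  def __init__(self, model, dataset):
    self.model = model  # placeholder
    super().__init__(eval_dataset=dataset)

  def eval_begin(self):
    return []  # the initial `state`

  def eval_step(self, iterator):
    x, y = next(iterator)
    return tf.reduce_mean(tf.keras.losses.mse(y, self.model(x)))

  def eval_reduce(self, state=None, step_outputs=None):
    state.append(step_outputs)  # runs eagerly, so a Python list works
    return state

  def eval_end(self, state):
    return {"mean_loss": tf.reduce_mean(tf.stack(state))}
```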
@property
def eval_dataset(self):
"""Returns the train_datase instance."""
"""The current evaluation dataset."""
return self._eval_dataset
@eval_dataset.setter
def eval_dataset(self, eval_dataset):
"""Set a new eval dataset and replace with the existing one.
"""Sets a new eval dataset, replacing the current one.
Any unprocessed examples in the current dataset are discarded.
Args:
eval_dataset: A tf.nest-compatible structure of tf.data.Dataset or
DistributedDataset.
eval_dataset: A `tf.nest`-compatible structure of `tf.data.Dataset` or
`DistributedDataset`.
"""
self._eval_dataset = eval_dataset
......@@ -39,8 +39,7 @@ class TestTrainer(standard_runner.StandardTrainer):
def __init__(self, options=None):
self.strategy = tf.distribute.get_strategy()
self.global_step = utils.create_global_step()
distribute = self.strategy.experimental_distribute_datasets_from_function
dataset = distribute(dataset_fn)
dataset = self.strategy.distribute_datasets_from_function(dataset_fn)
super().__init__(train_dataset=dataset, options=options)
def train_loop_begin(self):
......@@ -63,8 +62,7 @@ class TestEvaluator(standard_runner.StandardEvaluator):
def __init__(self, options=None):
self.strategy = tf.distribute.get_strategy()
self.global_step = utils.create_global_step()
distribute = self.strategy.experimental_distribute_datasets_from_function
dataset = distribute(dataset_fn)
dataset = self.strategy.distribute_datasets_from_function(dataset_fn)
super().__init__(eval_dataset=dataset, options=options)
def eval_begin(self):
......
......@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Defines exported symbols for `orbit.utils` package."""
"""Defines exported symbols for the `orbit.utils` package."""
from orbit.utils.common import create_global_step
from orbit.utils.common import get_value
......
......@@ -16,7 +16,6 @@
import inspect
import numpy as np
import tensorflow as tf
......@@ -46,16 +45,16 @@ def create_global_step() -> tf.Variable:
def make_distributed_dataset(strategy, dataset_or_fn, *args, **kwargs):
"""A helper function to create distributed dataset.
"""A utility function to help create a `tf.distribute.DistributedDataset`.
Args:
strategy: An instance of `tf.distribute.Strategy`.
dataset_or_fn: An instance of `tf.data.Dataset` or a function which takes an
`tf.distribute.InputContext` as input and returns a `tf.data.Dataset`. If
it is a function, it could optionally have an argument named
`input_context` which is `tf.distribute.InputContext` argument type.
*args: The list of arguments to be passed to dataset_or_fn.
**kwargs: Any keyword arguments to be passed.
dataset_or_fn: An instance of `tf.data.Dataset`, or a "dataset function"
returning a `tf.data.Dataset`. If it is a function, it may optionally have
an argument named `input_context` which will be passed a
`tf.distribute.InputContext` instance.
*args: Any positional arguments to pass through to `dataset_or_fn`.
**kwargs: Any keyword arguments to pass through to `dataset_or_fn`.
Returns:
A distributed Dataset.
......@@ -64,38 +63,37 @@ def make_distributed_dataset(strategy, dataset_or_fn, *args, **kwargs):
strategy = tf.distribute.get_strategy()
if isinstance(dataset_or_fn, tf.data.Dataset):
return strategy.experimental_distribute_dataset(dataset_or_fn)
if not callable(dataset_or_fn):
raise ValueError("`dataset_or_fn` should be either callable or an instance "
"of `tf.data.Dataset`")
"of `tf.data.Dataset`.")
def dataset_fn(ctx):
"""Wrapped dataset function for creating distributed dataset.."""
def dataset_fn(input_context):
"""Wraps `dataset_or_fn` for strategy.distribute_datasets_from_function."""
# If `dataset_or_fn` is a function and has `input_context` as argument
# names, pass `ctx` as the value of `input_context` when calling
# `dataset_or_fn`. Otherwise `ctx` will not be used when calling
# `dataset_or_fn`.
# If `dataset_or_fn` is a function and has an argument named
# `input_context`, pass through the given `input_context`. Otherwise
# `input_context` will be ignored.
argspec = inspect.getfullargspec(dataset_or_fn)
args_names = argspec.args
arg_names = argspec.args
if "input_context" in args_names:
kwargs["input_context"] = ctx
ds = dataset_or_fn(*args, **kwargs)
return ds
if "input_context" in arg_names:
kwargs["input_context"] = input_context
return dataset_or_fn(*args, **kwargs)
return strategy.experimental_distribute_datasets_from_function(dataset_fn)
return strategy.distribute_datasets_from_function(dataset_fn)
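
A hypothetical usage sketch, assuming a dataset function that shards by the
optional `input_context` (the function and its parameters are illustrative):

```python
def batched_range_fn(batch_size, input_context=None):
  ds = tf.data.Dataset.range(1024)
  if input_context is not None:
    # Shard across input pipelines before batching.
    ds = ds.shard(input_context.num_input_pipelines,
                  input_context.input_pipeline_id)
  return ds.batch(batch_size)

strategy = tf.distribute.MirroredStrategy()
distributed_ds = make_distributed_dataset(
    strategy, batched_range_fn, batch_size=32)
```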
def get_value(x) -> np.number:
"""Returns the value of a variable/tensor.
def get_value(x):
"""Returns input values, converting any TensorFlow values to NumPy values.
Args:
x: input variable.
x: The input. May be a `tf.Tensor` or `tf.Variable`.
Returns:
A Numpy array or number.
If the input is a TensorFlow `Tensor`, returns the `Tensor`'s equivalent
NumPy value. Otherwise, just returns the input.
"""
if not tf.is_tensor(x):
return x
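
A short usage sketch, assuming the elided branch converts tensors to NumPy
values as the docstring states:

```python
step = tf.Variable(41, dtype=tf.int64)
step.assign_add(1)
print(get_value(step))      # 42, as a NumPy value
print(get_value("label"))   # "label", returned unchanged
```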
......
......@@ -18,14 +18,14 @@ import tensorflow as tf
class EpochHelper:
"""A Helper class to handle epochs in Customized Training Loop."""
"""A helper class handle bookkeeping of epochs in custom training loops."""
def __init__(self, epoch_steps: int, global_step: tf.Variable):
"""Constructs the EpochHelper.
"""Initializes the `EpochHelper` instance.
Args:
epoch_steps: An integer indicates how many steps in an epoch.
global_step: A `tf.Variable` instance indicates the current global step.
epoch_steps: An integer indicating how many steps are in an epoch.
global_step: A `tf.Variable` providing the current global step.
"""
self._epoch_steps = epoch_steps
self._global_step = global_step
......@@ -46,7 +46,7 @@ class EpochHelper:
def epoch_end(self):
"""Returns whether the current epoch should end."""
if not self._in_epoch:
raise ValueError("`epoch_end` can only be called inside an epoch")
raise ValueError("`epoch_end` can only be called inside an epoch.")
current_step = self._global_step.numpy()
epoch = current_step // self._epoch_steps
......
......@@ -20,36 +20,57 @@ import tensorflow as tf
def create_loop_fn(step_fn):
"""Creates a multiple steps function driven by the python while loop.
"""Creates a loop function driven by a Python `while` loop.
Args:
step_fn: A function which takes `iterator` as input.
step_fn: A function taking a nested structure of `tf.data.Iterator` or
`DistributedIterator`. There are no constraints on the return value of the
function (except that it must be compatible with any `reduce_fn` provided
to the returned `loop_fn`).
Returns:
A callable matching the `loop_fn` definition below.
A loop function taking required `iterator` and `num_steps` parameters, as
well as optional `state` and `reduce_fn` parameters for accumulating state
over multiple iterations of the loop. See the `loop_fn` definition below for
additional details.
"""
def loop_fn(iterator, num_steps, state=None, reduce_fn=None):
"""A loop function with multiple steps.
"""Makes `num_steps` calls to `step_fn(iterator)`.
Additionally, state may be accumulated across iterations of the loop.
Conceptually, state accumulation is handled roughly as follows:
for _ in range(num_steps):
step_outputs = step_fn(iterator)
state = reduce_fn(state, step_outputs)
return state
However, the implementation is slightly more complicated in order to support
looping until the iterator is exhausted (when `num_steps == -1`) and to
properly catch exceptions when running under async remote eager (as is the
case in TPU training setups involving separate coordinator/worker machines).
Args:
iterator: A nested structure of tf.data `Iterator` or
iterator: A nested structure of `tf.data.Iterator` or
`DistributedIterator`.
num_steps: The number of steps in the loop. If `num_steps==-1`, will
num_steps: The number of steps in the loop. If `num_steps == -1`, will
iterate until exhausting the iterator.
state: An optional initial state before running the loop.
reduce_fn: a callable defined as `def reduce_fn(state, value)`, where
`value` is the outputs from `step_fn`.
reduce_fn: A callable taking two inputs, `state` and `value`, where
`state` is the previous output from `reduce_fn`, and `value` is the
output from `step_fn`.
Returns:
The updated state.
The final state returned by `reduce_fn`, or `None` if `state` and
`reduce_fn` are not provided.
"""
try:
step = 0
# To make sure the OutOfRangeError exception can be handled well with
# async remote eager, we need to wrap the loop body in a `async_scope`.
# To make sure the OutOfRangeError exception can be handled well under
# async remote eager, we need to wrap the loop body in `async_scope`.
with tf.experimental.async_scope():
while (num_steps == -1 or step < num_steps):
while num_steps == -1 or step < num_steps:
outputs = step_fn(iterator)
if reduce_fn is not None:
state = reduce_fn(state, outputs)
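
A hypothetical usage sketch of the resulting `loop_fn`, accumulating a running
sum via `reduce_fn`:

```python
dataset = tf.data.Dataset.range(10)
loop_fn = create_loop_fn(lambda it: next(it))  # step_fn returns the element
total = loop_fn(
    iter(dataset),
    num_steps=5,
    state=tf.constant(0, dtype=tf.int64),
    reduce_fn=lambda state, value: state + value)
print(total.numpy())  # 0 + 1 + 2 + 3 + 4 == 10
```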
......@@ -63,26 +84,32 @@ def create_loop_fn(step_fn):
def create_tf_while_loop_fn(step_fn):
"""Create a multiple steps function driven by tf.while_loop on the host.
"""Creates a loop function compatible with TF's AutoGraph loop conversion.
Args:
step_fn: A function which takes `iterator` as input.
step_fn: A function taking a nested structure of `tf.data.Iterator` or
`DistributedIterator`. Currently, any return values are ignored.
Returns:
A callable matching the `loop_fn` definition below.
A loop function taking required `iterator` and `num_steps` parameters. If
called inside a `tf.function`, the loop will be converted by AutoGraph into
a `tf.while_loop` construct. See the `loop_fn` definition below for
additional details.
"""
def loop_fn(iterator, num_steps):
"""A loop function with multiple steps.
"""Makes `num_steps` calls to `step_fn(iterator)`.
Args:
iterator: A nested structure of tf.data `Iterator` or
iterator: A nested structure of `tf.data.Iterator` or
`DistributedIterator`.
num_steps: The number of steps in the loop. Must be a tf.Tensor.
num_steps: The number of steps in the loop. Should be passed as a
`tf.Tensor`. Iterating until iterator exhaustion is not supported.
"""
if not isinstance(num_steps, tf.Tensor):
raise ValueError("`num_steps` should be an `tf.Tensor`. Python object "
"may cause retracing.")
raise ValueError(
"`num_steps` should be a `tf.Tensor`. Passing a Python value can "
"cause unnecessary retracing when wrapped by `tf.function`.")
for _ in tf.range(num_steps):
step_fn(iterator)
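
A hypothetical usage sketch, driving the traced loop from inside a
`tf.function`:

```python
steps_seen = tf.Variable(0, dtype=tf.int64)

def step_fn(iterator):
  next(iterator)            # dequeue one element
  steps_seen.assign_add(1)

loop_fn = create_tf_while_loop_fn(step_fn)

@tf.function
def run(iterator, num_steps):
  loop_fn(iterator, num_steps)  # converted to `tf.while_loop` by AutoGraph

run(iter(tf.data.Dataset.range(100)), tf.constant(10))
print(steps_seen.numpy())  # 10
```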
......
......@@ -20,18 +20,19 @@ import tensorflow as tf
class SummaryManager:
"""A class manages writing summaries."""
"""A utility class for managing summary writing."""
def __init__(self, summary_dir, summary_fn, global_step=None):
"""Construct a summary manager object.
"""Initializes the `SummaryManager` instance.
Args:
summary_dir: the directory to write summaries.
summary_fn: A callable defined as `def summary_fn(name, tensor,
step=None)`, which describes the summary operation.
global_step: A `tf.Variable` instance for the global step.
summary_dir: The directory in which to write summaries. If `None`, all
summary writing operations provided by this class are no-ops.
summary_fn: A callable accepting `name`, `value`, and `step` parameters
that makes calls to `tf.summary` functions to write summaries.
global_step: A `tf.Variable` containing the global step value.
"""
self._enabled = (summary_dir is not None)
self._enabled = summary_dir is not None
self._summary_dir = summary_dir
self._summary_fn = summary_fn
self._summary_writers = {}
......@@ -42,12 +43,12 @@ class SummaryManager:
self._global_step = global_step
def summary_writer(self, relative_path=""):
"""Returns the underlying summary writer.
"""Returns the underlying summary writer for a specific subdirectory.
Args:
relative_path: The current path in which to write summaries, relative to
the summary directory. By default it is empty, which specifies the root
directory.
the summary directory. By default it is empty, which corresponds to the
root directory.
"""
if self._summary_writers and relative_path in self._summary_writers:
return self._summary_writers[relative_path]
......@@ -59,43 +60,41 @@ class SummaryManager:
return self._summary_writers[relative_path]
def flush(self):
"""Flush the underlying summary writers."""
"""Flushes the underlying summary writers."""
if self._enabled:
tf.nest.map_structure(tf.summary.flush, self._summary_writers)
def write_summaries(self, summary_dict):
"""Write summaries for the given values.
"""Writes summaries for the given dictionary of values.
This recursively creates subdirectories for any nested dictionaries
provided in `summary_dict`, yielding a hierarchy of directories which will
then be reflected in the TensorBoard UI as different colored curves.
E.g. users may evaluate on multiple datasets and return `summary_dict` as a
nested dictionary.
For example, users may evaluate on multiple datasets and return
`summary_dict` as a nested dictionary:
```
{
    "dataset1": {
        "loss": loss1,
        "accuracy": accuracy1
    },
    "dataset2": {
        "loss": loss2,
        "accuracy": accuracy2
    },
}
```
This will create two subdirectories "dataset" and "dataset2" inside the
This will create two subdirectories, "dataset1" and "dataset2", inside the
summary root directory. Each directory will contain event files including
both "loss" and "accuracy" summaries.
Args:
summary_dict: A dictionary of values. If any value in `summary_dict` is
itself a dictionary, then the function will recursively create
subdirectories with names given by the keys in the dictionary. The
Tensor values are summarized using the summary writer instance specific
to the parent relative path.
itself a dictionary, then the function will create a subdirectory with a
name given by the corresponding key. This is performed recursively. Leaf
values are then summarized using the summary writer instance specific to
the parent relative path.
"""
if not self._enabled:
return
......
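
A hypothetical usage sketch, using `tf.summary.scalar` as the `summary_fn` and
a placeholder directory:

```python
global_step = tf.Variable(1, dtype=tf.int64)
manager = SummaryManager(
    summary_dir="/tmp/summaries/eval",  # placeholder directory
    summary_fn=lambda name, value, step=None: tf.summary.scalar(
        name, value, step=step),
    global_step=global_step)

manager.write_summaries({
    "dataset1": {"loss": tf.constant(0.25), "accuracy": tf.constant(0.91)},
    "dataset2": {"loss": tf.constant(0.31), "accuracy": tf.constant(0.88)},
})
manager.flush()
```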