Unverified Commit 0cceabfc authored by Yiming Shi's avatar Yiming Shi Committed by GitHub
Browse files

Merge branch 'master' into move_to_keraslayers_fasterrcnn_fpn_keras_feature_extractor

parents 17821c0d 39ee0ac9
......@@ -20,12 +20,12 @@ from __future__ import print_function
import tensorflow as tf
from tensorflow.python.keras import backend
from official.vision.detection.dataloader import mode_keys
from official.vision.detection.evaluation import factory as eval_factory
from official.vision.detection.modeling import base_model
from official.vision.detection.modeling import losses
from official.vision.detection.modeling.architecture import factory
from official.vision.detection.modeling.architecture import keras_utils
from official.vision.detection.ops import postprocess_ops
......@@ -57,7 +57,7 @@ class RetinanetModel(base_model.Model):
params.postprocess)
self._transpose_input = params.train.transpose_input
assert not self._transpose_input, 'Transpose input is not supportted.'
assert not self._transpose_input, 'Transpose input is not supported.'
# Input layer.
input_shape = (
params.retinanet_parser.output_size +
......@@ -120,7 +120,7 @@ class RetinanetModel(base_model.Model):
def build_model(self, params, mode=None):
if self._keras_model is None:
with backend.get_graph().as_default():
with keras_utils.maybe_enter_backend_graph():
outputs = self.model_outputs(self._input_layer, mode)
model = tf.keras.models.Model(
......
......@@ -20,13 +20,13 @@ from __future__ import print_function
import tensorflow as tf
from tensorflow.python.keras import backend
from official.vision.detection.dataloader import anchor
from official.vision.detection.dataloader import mode_keys
from official.vision.detection.evaluation import factory as eval_factory
from official.vision.detection.modeling import base_model
from official.vision.detection.modeling import losses
from official.vision.detection.modeling.architecture import factory
from official.vision.detection.modeling.architecture import keras_utils
from official.vision.detection.ops import postprocess_ops
from official.vision.detection.utils import box_utils
......@@ -265,7 +265,7 @@ class ShapeMaskModel(base_model.Model):
def build_model(self, params, mode):
if self._keras_model is None:
input_layers = self.build_input_layers(self._params, mode)
with backend.get_graph().as_default():
with keras_utils.maybe_enter_backend_graph():
outputs = self.model_outputs(input_layers, mode)
model = tf.keras.models.Model(
......
......@@ -119,6 +119,24 @@ python3 classifier_trainer.py \
--params_override='runtime.num_gpus=$NUM_GPUS'
```
To train on multiple hosts, each with GPUs attached, using
[MultiWorkerMirroredStrategy](https://www.tensorflow.org/api_docs/python/tf/distribute/experimental/MultiWorkerMirroredStrategy),
please update the `runtime` section in gpu.yaml
(or override it using `--params_override`) with:
```YAML
# gpu.yaml
runtime:
distribution_strategy: 'multi_worker_mirrored'
worker_hosts: '$HOST1:port,$HOST2:port'
num_gpus: $NUM_GPUS
task_index: 0
```
Set `task_index: 0` on the first host, `task_index: 1` on the second,
and so on. `$HOST1` and `$HOST2` are the IP addresses of the hosts, and `port`
can be any free port on the hosts. Only the first host will write
TensorBoard Summaries and save checkpoints.
#### On TPU:
```bash
python3 classifier_trainer.py \
......
......@@ -235,9 +235,6 @@ def initialize(params: base_configs.ExperimentConfig,
else:
data_format = 'channels_last'
tf.keras.backend.set_image_data_format(data_format)
distribution_utils.configure_cluster(
params.runtime.worker_hosts,
params.runtime.task_index)
if params.runtime.run_eagerly:
# Enable eager execution to allow step-by-step debugging
tf.config.experimental_run_functions_eagerly(True)
......@@ -296,6 +293,10 @@ def train_and_eval(
"""Runs the train and eval path using compile/fit."""
logging.info('Running train and eval.')
distribution_utils.configure_cluster(
params.runtime.worker_hosts,
params.runtime.task_index)
# Note: for TPUs, strategy and scope should be created before the dataset
strategy = strategy_override or distribution_utils.get_distribution_strategy(
distribution_strategy=params.runtime.distribution_strategy,
......@@ -338,7 +339,8 @@ def train_and_eval(
optimizer = optimizer_factory.build_optimizer(
optimizer_name=params.model.optimizer.name,
base_learning_rate=learning_rate,
params=params.model.optimizer.as_dict())
params=params.model.optimizer.as_dict(),
model=model)
metrics_map = _get_metrics(one_hot)
metrics = [metrics_map[metric] for metric in params.train.metrics]
......
......@@ -18,11 +18,12 @@ from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
from typing import Any, Dict, Text, List
from absl import logging
import tensorflow as tf
import tensorflow_addons as tfa
from typing import Any, Dict, Text, List
from official.vision.image_classification import learning_rate
from official.vision.image_classification.configs import base_configs
......@@ -250,7 +251,8 @@ class MovingAverage(tf.keras.optimizers.Optimizer):
def build_optimizer(
optimizer_name: Text,
base_learning_rate: tf.keras.optimizers.schedules.LearningRateSchedule,
params: Dict[Text, Any]):
params: Dict[Text, Any],
model: tf.keras.Model = None):
"""Build the optimizer based on name.
Args:
......@@ -261,6 +263,8 @@ def build_optimizer(
params: String -> Any dictionary representing the optimizer params.
This should contain optimizer specific parameters such as
`base_learning_rate`, `decay`, etc.
model: The `tf.keras.Model`. This is used for the shadow copy if using
`MovingAverage`.
Returns:
A tf.keras.Optimizer.
......@@ -322,10 +326,13 @@ def build_optimizer(
# Moving average should be applied last, as it's applied at test time
moving_average_decay = params.get('moving_average_decay', 0.)
if moving_average_decay is not None and moving_average_decay > 0.:
if model is None:
raise ValueError('`model` must be provided if using `MovingAverage`.')
logging.info('Including moving average decay.')
optimizer = MovingAverage(
optimizer,
optimizer=optimizer,
average_decay=moving_average_decay)
optimizer.shadow_copy(model)
return optimizer
......
......@@ -19,15 +19,21 @@ from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import tensorflow as tf
from absl.testing import parameterized
import tensorflow as tf
from official.vision.image_classification import optimizer_factory
from official.vision.image_classification.configs import base_configs
class OptimizerFactoryTest(tf.test.TestCase, parameterized.TestCase):
def build_toy_model(self) -> tf.keras.Model:
  """Creates a toy `tf.keras.Model`.

  Returns:
    A `tf.keras.Sequential` model containing a single `Dense(1)` layer that
    accepts inputs of shape `(1,)`.
  """
  model = tf.keras.Sequential()
  model.add(tf.keras.layers.Dense(1, input_shape=(1,)))
  return model
@parameterized.named_parameters(
('sgd', 'sgd', 0., False),
('momentum', 'momentum', 0., False),
......@@ -40,6 +46,7 @@ class OptimizerFactoryTest(tf.test.TestCase, parameterized.TestCase):
('rmsprop_ema', 'rmsprop', 0.999, False))
def test_optimizer(self, optimizer_name, moving_average_decay, lookahead):
"""Smoke test to be sure no syntax errors."""
model = self.build_toy_model()
params = {
'learning_rate': 0.001,
'rho': 0.09,
......@@ -51,7 +58,8 @@ class OptimizerFactoryTest(tf.test.TestCase, parameterized.TestCase):
optimizer = optimizer_factory.build_optimizer(
optimizer_name=optimizer_name,
base_learning_rate=params['learning_rate'],
params=params)
params=params,
model=model)
self.assertTrue(issubclass(type(optimizer), tf.keras.optimizers.Optimizer))
def test_unknown_optimizer(self):
......
......@@ -255,7 +255,7 @@ def define_keras_flags(
name='tpu', default='', help='TPU address to connect to.')
flags.DEFINE_integer(
name='steps_per_loop',
default=500,
default=None,
help='Number of steps per training loop. Only training step happens '
'inside the loop. Callbacks will not be called inside. Will be capped at '
'steps per epoch.')
......
......@@ -14,18 +14,16 @@
# ==============================================================================
"""Runs a ResNet model on the ImageNet dataset using custom training loops."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import os
from absl import app
from absl import flags
from absl import logging
import orbit
import tensorflow as tf
from official.modeling import performance
from official.staging.training import controller
from official.utils.flags import core as flags_core
from official.utils.misc import distribution_utils
from official.utils.misc import keras_utils
......@@ -87,15 +85,6 @@ def get_num_train_iterations(flags_obj):
return train_steps, train_epochs, eval_steps
def _steps_to_run(steps_in_current_epoch, steps_per_epoch, steps_per_loop):
"""Calculates steps to run on device."""
if steps_per_loop <= 0:
raise ValueError('steps_per_loop should be positive integer.')
if steps_per_loop == 1:
return steps_per_loop
return min(steps_per_loop, steps_per_epoch - steps_in_current_epoch)
def run(flags_obj):
"""Run ResNet ImageNet training and eval loop using custom training loops.
......@@ -121,7 +110,6 @@ def run(flags_obj):
datasets_num_private_threads=flags_obj.datasets_num_private_threads)
common.set_cudnn_batchnorm_mode()
# TODO(anj-s): Set data_format without using Keras.
data_format = flags_obj.data_format
if data_format is None:
data_format = ('channels_first' if tf.config.list_physical_devices('GPU')
......@@ -137,7 +125,14 @@ def run(flags_obj):
per_epoch_steps, train_epochs, eval_steps = get_num_train_iterations(
flags_obj)
steps_per_loop = min(flags_obj.steps_per_loop, per_epoch_steps)
if flags_obj.steps_per_loop is None:
steps_per_loop = per_epoch_steps
elif flags_obj.steps_per_loop > per_epoch_steps:
steps_per_loop = per_epoch_steps
logging.warn('Setting steps_per_loop to %d to respect epoch boundary.',
steps_per_loop)
else:
steps_per_loop = flags_obj.steps_per_loop
logging.info(
'Training %d epochs, each epoch has %d steps, '
......@@ -154,8 +149,8 @@ def run(flags_obj):
eval_interval = flags_obj.epochs_between_evals * per_epoch_steps
checkpoint_interval = (
per_epoch_steps if flags_obj.enable_checkpoint_and_export else None)
summary_interval = per_epoch_steps if flags_obj.enable_tensorboard else None
steps_per_loop * 5 if flags_obj.enable_checkpoint_and_export else None)
summary_interval = steps_per_loop if flags_obj.enable_tensorboard else None
checkpoint_manager = tf.train.CheckpointManager(
runnable.checkpoint,
......@@ -164,20 +159,24 @@ def run(flags_obj):
step_counter=runnable.global_step,
checkpoint_interval=checkpoint_interval)
resnet_controller = controller.Controller(
resnet_controller = orbit.Controller(
strategy,
runnable.train,
runnable.evaluate if not flags_obj.skip_eval else None,
runnable,
runnable if not flags_obj.skip_eval else None,
global_step=runnable.global_step,
steps_per_loop=steps_per_loop,
train_steps=per_epoch_steps * train_epochs,
checkpoint_manager=checkpoint_manager,
summary_interval=summary_interval,
eval_steps=eval_steps,
eval_interval=eval_interval)
eval_summary_dir=os.path.join(flags_obj.model_dir, 'eval'))
time_callback.on_train_begin()
resnet_controller.train(evaluate=not flags_obj.skip_eval)
if not flags_obj.skip_eval:
resnet_controller.train_and_evaluate(
train_steps=per_epoch_steps * train_epochs,
eval_steps=eval_steps,
eval_interval=eval_interval)
else:
resnet_controller.train(steps=per_epoch_steps * train_epochs)
time_callback.on_train_end()
stats = build_stats(runnable, time_callback)
......
......@@ -14,33 +14,21 @@
# ==============================================================================
"""Runs a ResNet model on the ImageNet dataset using custom training loops."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import orbit
import tensorflow as tf
from official.modeling import performance
from official.staging.training import grad_utils
from official.staging.training import standard_runnable
from official.staging.training import utils
from official.utils.flags import core as flags_core
from official.vision.image_classification.resnet import common
from official.vision.image_classification.resnet import imagenet_preprocessing
from official.vision.image_classification.resnet import resnet_model
class ResnetRunnable(standard_runnable.StandardTrainable,
standard_runnable.StandardEvaluable):
class ResnetRunnable(orbit.StandardTrainer, orbit.StandardEvaluator):
"""Implements the training and evaluation APIs for Resnet model."""
def __init__(self, flags_obj, time_callback, epoch_steps):
standard_runnable.StandardTrainable.__init__(self,
flags_obj.use_tf_while_loop,
flags_obj.use_tf_function)
standard_runnable.StandardEvaluable.__init__(self,
flags_obj.use_tf_function)
self.strategy = tf.distribute.get_strategy()
self.flags_obj = flags_obj
self.dtype = flags_core.get_tf_dtype(flags_obj)
......@@ -107,11 +95,8 @@ class ResnetRunnable(standard_runnable.StandardTrainable,
# Handling epochs.
self.epoch_steps = epoch_steps
self.epoch_helper = utils.EpochHelper(epoch_steps, self.global_step)
def build_train_dataset(self):
"""See base class."""
return utils.make_distributed_dataset(
self.epoch_helper = orbit.utils.EpochHelper(epoch_steps, self.global_step)
train_dataset = orbit.utils.make_distributed_dataset(
self.strategy,
self.input_fn,
is_training=True,
......@@ -122,10 +107,11 @@ class ResnetRunnable(standard_runnable.StandardTrainable,
.datasets_num_private_threads,
dtype=self.dtype,
drop_remainder=True)
def build_eval_dataset(self):
"""See base class."""
return utils.make_distributed_dataset(
orbit.StandardTrainer.__init__(self, train_dataset,
flags_obj.use_tf_while_loop,
flags_obj.use_tf_function)
if not flags_obj.skip_eval:
eval_dataset = orbit.utils.make_distributed_dataset(
self.strategy,
self.input_fn,
is_training=False,
......@@ -133,6 +119,8 @@ class ResnetRunnable(standard_runnable.StandardTrainable,
batch_size=self.batch_size,
parse_record_fn=imagenet_preprocessing.parse_record,
dtype=self.dtype)
orbit.StandardEvaluator.__init__(self, eval_dataset,
flags_obj.use_tf_function)
def train_loop_begin(self):
"""See base class."""
......
![TensorFlow Requirement: 2.x](https://img.shields.io/badge/TensorFlow%20Requirement-2.x-brightgreen)
# Orbit
Orbit is a customized training loop library built on top of TensorFlow 2. It
provides a flexible lightweight library that users can easily use or fork when
writing [customized training loop code](https://www.tensorflow.org/tutorials/distribute/custom_training)
in TF2. It integrates with `tf.distribute` seamlessly and supports running on
different device types (CPU, GPU, and TPU).
# Copyright 2018 Google, Inc. All Rights Reserved.
# Copyright 2020 The Orbit Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -12,7 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Orbit package definition."""
import sklearn
import linear_regression
from orbit import utils
from orbit.controller import Controller
from orbit.runner import *
from orbit.standard_runner import *
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
# Copyright 2020 The Orbit Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -14,51 +14,47 @@
# ==============================================================================
"""Lightweight utilities to train TF2 models."""
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import time
from typing import Callable, Dict, Optional, Text, Union
from absl import logging
import numpy as np
from orbit import runner
from orbit import utils
import tensorflow.compat.v2 as tf
from typing import Callable, Dict, Optional, Text
import tensorflow as tf
from official.staging.training import utils
def _log_info(message: Text):
"""Logs `message` to the `info` log, and also prints to stdout."""
logging.info(message)
print(message)
class Controller(object):
class Controller:
"""Class that facilitates training and evaluation of models."""
def __init__(
self,
strategy: Optional[tf.distribute.Strategy] = None,
train_fn: Optional[Callable[[tf.Tensor],
Optional[Dict[Text, tf.Tensor]]]] = None,
eval_fn: Optional[Callable[[tf.Tensor],
Optional[Dict[Text, tf.Tensor]]]] = None,
trainer: Optional[runner.AbstractTrainer] = None,
evaluator: Optional[runner.AbstractEvaluator] = None,
global_step: Optional[tf.Variable] = None,
# Train related
train_steps: Optional[int] = None,
steps_per_loop: Optional[int] = None,
summary_dir: Optional[Text] = None,
checkpoint_manager: Optional[tf.train.CheckpointManager] = None,
# summary related
# Summary related
summary_interval: Optional[int] = None,
summary_dir: Optional[Text] = None,
# Evaluation related
eval_summary_dir: Optional[Text] = None,
eval_steps: Optional[int] = None,
eval_interval: Optional[int] = None):
eval_summary_dir: Optional[Text] = None):
"""Constructs a `Controller` instance.
Args:
strategy: An instance of `tf.distribute.Strategy`.
train_fn: A callable defined as `def train_fn(num_steps)`, which
`num_steps` indicates the number of steps to run for each loop.
eval_fn: A callable defined as `def eval_fn(num_steps)`, which `num_steps`
indicates the number of steps for one evaluation.
trainer: An instance of `orbit.AbstractTrainer`, which represents model
training details.
evaluator: An instance of `orbit.AbstractEvaluator`, which represents
model evaluation details.
global_step: An integer `tf.Variable` indicating the global training step
number. Usually this can be obtained from `iterations` property of the
model's optimizer (e.g. `self.optimizer.iterations`), or users can
......@@ -66,105 +62,166 @@ class Controller(object):
own global step variable, it is recommended to create the `tf.Variable`
inside strategy scope, and with
`aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA`.
train_steps: The total (maximum) number of training steps to perform.
steps_per_loop: The number of steps to run in each "inner loop" of
training (passed to the `num_steps` parameter of `train_fn`).
summary_dir: The directory to restore and write checkpoints and summaries.
If None, it will be set to `checkpoint_manager.directory`.
training (passed to the `num_steps` parameter of `trainer.train`).
checkpoint_manager: An instance of `tf.train.CheckpointManager`.
summary_interval: Step interval for training summaries. Note that this
argument only applies to the summaries outside the training loop. If the
value is None, then training summaries are not enabled.
argument only applies to the summaries inside `trainer.train` function.
Summaries outside like "steps_per_second" and outputs from
`trainer.train` function will always be enabled. If set, the value
should be divisible by steps_per_loop.
summary_dir: The directory to restore and write checkpoints and summaries.
If None, it will be set to `checkpoint_manager.directory`.
eval_summary_dir: The directory to write eval summaries. If None, it will
be set to `summary_dir`.
eval_steps: Number of steps to run evaluation.
eval_interval: Step interval for evaluation. If None, will skip evaluation
in the middle of training. Note that evaluation only happens outside the
training loop, which the loop iteration is specify by `steps_per_loop`
parameter.
Raises:
ValueError: If both `train_fn` and `eval_fn` are None.
ValueError: If `train_fn` is not None and `train_steps` is None.
ValueError: If `steps_per_loop` is None when `train_fn` is provided.
ValueError: If both `trainer` and `evaluator` are None.
ValueError: If `steps_per_loop` is not a positive integer.
ValueError: If `summary_interval` is not a positive integer or it cannot
be divisible by `steps_per_loop`.
"""
if train_fn is None and eval_fn is None:
raise ValueError("`train_fn` and `eval_fn` should not both be None")
# TODO(rxsang): Support training until exhaustion by passing
# `train_steps=-1`. Currently it cannot be supported with a host training
# loop because break statements are not supported with distributed dataset.
if train_fn is not None:
if train_steps is None:
raise ValueError("`train_steps` is required when `train_fn` is "
"provided.")
if trainer is None and evaluator is None:
raise ValueError("`trainer` and `evaluator` should not both be None")
if trainer is not None:
if steps_per_loop is None:
raise ValueError("`steps_per_loop` is required when `train_fn is "
raise ValueError("`steps_per_loop` is required when `trainer` is "
"provided.")
if not isinstance(steps_per_loop, int) or steps_per_loop < 1:
raise ValueError("`steps_per_loop` should be a positive integer")
if summary_interval is not None and summary_interval <= 0:
if summary_interval is not None:
if summary_interval <= 0:
raise ValueError("`summary_interval` should be larger than 0")
if summary_interval % steps_per_loop != 0:
raise ValueError("The summary interval ({}) must be a multiple "
"of the steps_per_loop ({})".format(
summary_interval, steps_per_loop))
self.trainer = trainer
self.evaluator = evaluator
self.strategy = strategy or tf.distribute.get_strategy()
self.train_fn = train_fn
self.eval_fn = eval_fn
self.global_step = global_step
self.checkpoint_manager = checkpoint_manager
if self.train_fn is not None:
self.train_steps = train_steps
self.steps_per_loop = steps_per_loop
if summary_dir:
self.summary_dir = summary_dir
elif checkpoint_manager:
self.summary_dir = checkpoint_manager.directory
else:
self.summary_dir = None
if summary_dir is None and checkpoint_manager:
summary_dir = checkpoint_manager.directory
if self.trainer is not None:
self.step_timer = None
self.steps_per_loop = steps_per_loop
self.summary_interval = summary_interval
if self.summary_dir and self.summary_interval:
summary_writer = tf.summary.create_file_writer(self.summary_dir)
else:
summary_writer = None
# TODO(rxsang): Consider pass SummaryManager directly into Controller for
# maximum customizability.
self.summary_manager = utils.SummaryManager(
summary_writer,
tf.summary.scalar,
global_step=self.global_step,
summary_interval=self.summary_interval)
if self.eval_fn is not None:
eval_summary_dir = eval_summary_dir or self.summary_dir
eval_summary_writer = tf.summary.create_file_writer(
eval_summary_dir) if eval_summary_dir else None
summary_dir, tf.summary.scalar, global_step=self.global_step)
eval_summary_writer = None
if self.evaluator is not None:
eval_summary_dir = eval_summary_dir or summary_dir
if eval_summary_dir == summary_dir and self.trainer is not None:
# Reuse the summary writer if train and evaluation summary directory
# are the same.
self.eval_summary_manager = self.summary_manager
else:
self.eval_summary_manager = utils.SummaryManager(
eval_summary_writer, tf.summary.scalar, global_step=self.global_step)
eval_summary_dir, tf.summary.scalar, global_step=self.global_step)
self.eval_steps = eval_steps
self.eval_interval = eval_interval
# Creates and initializes the interval triggers.
self.eval_trigger = utils.IntervalTrigger(self.eval_interval,
self.global_step.numpy()) # pytype: disable=attribute-error
if self.global_step:
if self.global_step is not None:
tf.summary.experimental.set_step(self.global_step)
# Restores the model if needed.
# TODO(momernick): We probably only want to do this on certain occasions?
if self.checkpoint_manager is not None:
model_restored = self._restore_model()
if not model_restored and self.checkpoint_manager.checkpoint_interval:
# If the model is not restored from a checkpoint, save an initial
checkpoint_interval = self.checkpoint_manager.checkpoint_interval
model_restored = self.restore_checkpoint()
if not model_restored and (checkpoint_interval and
self.trainer is not None):
# If the model is not restored from a checkpoint, and
# `checkpoint_interval` is enabled for training, save an initial
# checkpoint.
ckpt_path = self.checkpoint_manager.save(
checkpoint_number=self.global_step)
logging.info("Saved checkpoins in %s", ckpt_path)
self.save_checkpoint()
def train(self, steps: int, checkpoint_at_completion: bool = True):
  """Runs training.

  This method repeatedly calls `self._train_n_steps` until the global step
  count reaches `steps`, running at most `self.steps_per_loop` steps per call
  and calling `self._maybe_save_checkpoint()` after each call. If the global
  step is already at or beyond `steps`, no training steps are run.

  Args:
    steps: The global step count to train up to.
    checkpoint_at_completion: Whether to call `self.save_checkpoint()` when
      this method returns. Defaults to True (write the checkpoint).

  Raises:
    ValueError: If `self.trainer` is None.
    ValueError: If `self.global_step` is None.
  """
  if self.trainer is None:
    raise ValueError("`self.trainer` is required when calling `train` "
                     "method.")
  if self.global_step is None:
    raise ValueError("`self.global_step` is required when calling `train` "
                     "method.")

  # TODO(momernick): Support steps=None or -1 (training to exhaustion).
  current_step = self.global_step.numpy()  # This is an expensive access.
  while current_step < steps:
    logging.info("Train at step %s of %s", current_step, steps)
    # Calculates steps to run for the next train loop; the final loop is
    # shortened so training stops exactly at `steps`.
    num_steps = min(steps - current_step, self.steps_per_loop)
    self._train_n_steps(num_steps)
    self._maybe_save_checkpoint()
    # Re-read the step from the variable rather than adding `num_steps`
    # locally, so the loop condition tracks the actual global step.
    current_step = self.global_step.numpy()  # This is an expensive access.

  if checkpoint_at_completion:
    self.save_checkpoint()
def _restore_model(self, checkpoint_path=None):
def evaluate(self,
             steps: Optional[int] = None) -> Optional[Dict[Text, np.number]]:
  """Runs evaluation.

  This method calls `self.evaluator.evaluate` once, logs the returned outputs,
  and passes them to `self.eval_summary_manager` to be written and flushed.

  Args:
    steps: The number of steps to evaluate for. If None (or any other falsy
      value), -1 is passed to the evaluator instead.

  Returns:
    The evaluation results as a dictionary of numpy values. If the evaluator
    returns an empty or None result, that value is returned unchanged.

  Raises:
    ValueError: If `evaluator` is not provided.
    ValueError: If `global_step` is not provided.
  """
  if self.evaluator is None:
    raise ValueError("`evaluator` must be provided to call `evaluate()` "
                     "method.")
  # Fail with an explicit message instead of an AttributeError on
  # `None.numpy()` below; mirrors the check performed by `train`.
  if self.global_step is None:
    raise ValueError("`self.global_step` is required when calling "
                     "`evaluate` method.")

  steps = steps or -1
  current_step = self.global_step.numpy()
  if steps > 0:
    logging.info("Running %s steps of evaluation at train step: %s", steps,
                 current_step)
    steps = tf.convert_to_tensor(steps, dtype=tf.int32)
  else:
    logging.info("Evaluating at train step: %s", current_step)

  with self.eval_summary_manager.summary_writer().as_default():
    eval_outputs = self.evaluator.evaluate(steps)

  if eval_outputs:
    # Convert every leaf of the (possibly nested) outputs via
    # `utils.get_value` before logging and writing summaries.
    eval_outputs = tf.nest.map_structure(utils.get_value, eval_outputs)

  info = "step: {} evaluation metric: {}".format(
      current_step, eval_outputs)
  _log_info(info)

  self.eval_summary_manager.write_summaries(eval_outputs)
  self.eval_summary_manager.flush()

  return eval_outputs
def restore_checkpoint(self, checkpoint_path: Text = None):
"""Restore or initialize the model.
Args:
......@@ -172,153 +229,164 @@ class Controller(object):
restore. If None, will restore from `self.checkpoint_manager`.
Returns:
True if the latest checkpoint is found or restored. Otherwise False.
The path to the restored checkpoint if a restore happened, or None
if no restore occurred.
"""
with self.strategy.scope():
# Checkpoint restoring should be inside scope. b/139450638
if checkpoint_path is not None:
self.checkpoint_manager.checkpoint.restore(checkpoint_path)
return True
return checkpoint_path
return self.checkpoint_manager.restore_or_initialize()
def _evaluate_once(self, current_step):
"""Runs the evaluation once."""
logging.info("Start evaluation at step: %s", current_step)
def save_checkpoint(self):
"""Checkpoint the model.
with self.eval_summary_manager.summary_writer.as_default():
eval_outputs = self.eval_fn(self.eval_steps)
This method will write a checkpoint containing the current state of the
model.
if eval_outputs:
eval_outputs = tf.nest.map_structure(lambda x: x.numpy(), eval_outputs)
Raises:
ValueError: if no CheckpointManager was provided to this Controller's
init args.
"""
self._maybe_save_checkpoint(force_trigger=True)
info = "step: {} evaluation metric: {}".format(
current_step, eval_outputs)
self._log_info(info)
def train_and_evaluate(self,
train_steps: int = None,
eval_steps: int = None,
eval_interval: int = None):
"""Train and evaluate in an interleaved manner.
self.eval_summary_manager.write_summaries(eval_outputs)
self.eval_summary_manager.flush()
This method will train the model until the global step count equals
`train_steps`, running an evaluation for `eval_steps` every `eval_interval`
training steps. In addition, this method will run a final evaluation at the
end of the training sequence.
def _maybe_save_checkpoints(self, current_step, force_trigger=False):
if self.checkpoint_manager and self.checkpoint_manager.checkpoint_interval:
ckpt_path = self.checkpoint_manager.save(
checkpoint_number=current_step, check_interval=not force_trigger)
if ckpt_path is not None:
logging.info("Saved checkpoins in %s", ckpt_path)
Args:
train_steps: The global step count to train up to.
eval_steps: The number of steps to run during an evaluation. If None,
this method will evaluate over the entire evaluation dataset.
eval_interval: The number of training steps to run between evaluations.
If set, training will always stop every `eval_interval` steps, even if
this results in a shorter inner loop than specified by `steps_per_loop`
setting. If None, evaluation will only be performed after training is
complete.
def _maybe_evaluate(self, current_step, force_trigger=False):
if self.eval_trigger(current_step, force_trigger):
self._evaluate_once(current_step)
Raises:
ValueError: If eval_interval is not a multiple of self.steps_per_loop.
"""
current_step = self.global_step.numpy() # This is an expensive access.
eval_interval = eval_interval or (train_steps - current_step)
while current_step < train_steps:
interval = min(train_steps - current_step, eval_interval)
num_steps = current_step + interval
self.train(steps=num_steps, checkpoint_at_completion=False)
self.evaluate(steps=eval_steps)
current_step = self.global_step.numpy() # This is an expensive access.
self.save_checkpoint()
def evaluate_continuously(self,
steps: int = None,
timeout: Optional[Union[int, float]] = None,
timeout_fn: Optional[Callable[[], bool]] = None):
"""Monitor a directory and evaluate on checkpoints in it.
This method continuously monitors a directory as specified by this
Controller's CheckpointManager init arg and runs evaluation on the
checkpoints found there.
def _log_info(self, message):
"""Logs `message` to the `info` log, and also prints to stdout."""
logging.info(message)
print(message)
Args:
steps: The number of steps to run when evaluating.
timeout: The maximum number of seconds to wait between checkpoints. See
tf.train.checkpoints_iterator documentation.
timeout_fn: Optional callable to call after a timeout. If the function
returns True, then it means that no new checkpoints will be generated
and the iterator will exit.
def train(self, evaluate=True):
"""Runs the training, with optional evaluation.
Raises:
ValueError: If no checkpoint found in `self.checkpoint_manager.directory`.
ValueError: If `evaluator` was not provided as a controller init arg.
"""
for checkpoint_path in tf.train.checkpoints_iterator(
self.checkpoint_manager.directory,
timeout=timeout,
timeout_fn=timeout_fn):
self.restore_checkpoint(checkpoint_path)
self.evaluate(steps)
This handles evaluation, gathering summaries, and saving checkpoints.
def _train_n_steps(self, num_steps: int):
"""Run training for `num_steps`.
It will also write training outputs to summaries if there is any.
Args:
evaluate: A boolean indicates whether to perform evaluation during
training.
num_steps: An integer indicates how many steps to run for this training
loop.
Raises:
RuntimeError: If `global_step` is not updated correctly in `train_fn`.
RuntimeError: If `global_step` is not updated correctly in
`trainer.train`.
"""
if self.train_fn is None:
raise ValueError("`self.train_fn` is required when calling `train` "
"method.")
if self.global_step is None:
raise ValueError("`self.global_step` is required when calling `train` "
"method.")
if evaluate and self.eval_fn is None:
raise ValueError("`self.eval_fn` is required when calling `train` method "
"with `evaluate=True`")
if not self.step_timer:
self.step_timer = StepTimer(self.global_step)
step_timer = _StepTimer(self.global_step)
current_step = self.global_step.numpy()
logging.info("Train at step %s of %s", current_step, self.train_steps)
while current_step < self.train_steps:
# Calculates steps to run for the next train loop.
steps_per_loop = min(self.train_steps - current_step, self.steps_per_loop)
logging.info("Entering training loop with %s steps, at step %s of %s",
steps_per_loop, current_step, self.train_steps)
current_step += steps_per_loop
steps_per_loop = tf.convert_to_tensor(steps_per_loop, dtype=tf.int32)
with self.summary_manager.summary_writer.as_default():
train_outputs = self.train_fn(steps_per_loop)
current_step = self.global_step.numpy()
logging.info("Entering training loop at step %s to run %s steps",
current_step, num_steps)
current_step += num_steps
num_steps = tf.convert_to_tensor(num_steps, dtype=tf.int32)
with self.summary_manager.summary_writer().as_default():
# Create a lambda that returns true when summaries should be written.
should_record = False # Allows static optimization in no-summary cases.
if self.summary_interval:
should_record = lambda: (self.global_step % self.summary_interval == 0)
with tf.summary.record_if(should_record):
train_outputs = self.trainer.train(num_steps)
# Updates and verifies the current step after a training loop finishes.
if current_step != self.global_step.numpy():
raise RuntimeError("`self.train_fn` is not updating `global_step` "
"correctly, expected: %s, actual: %s" %
raise RuntimeError("`trainer.train` function is not updating "
"`global_step` correctly, expected: %s, actual: %s" %
(current_step, self.global_step.numpy()))
# Print information like metrics and steps_per_second after a training
# loop.
if train_outputs:
train_outputs = tf.nest.map_structure(
lambda x: x.numpy(), train_outputs)
steps_per_second = step_timer.steps_per_second()
train_outputs = tf.nest.map_structure(utils.get_value, train_outputs)
train_outputs = train_outputs or {}
steps_per_second = self.step_timer.steps_per_second()
info = "step: {} steps_per_second: {:.2f} {}".format(
current_step, steps_per_second, train_outputs)
self._log_info(info)
_log_info(info)
train_outputs = train_outputs or {}
train_outputs["steps_per_second"] = steps_per_second
self.summary_manager.write_summaries(train_outputs)
self._maybe_save_checkpoints(current_step)
if evaluate:
self._maybe_evaluate(current_step)
self.summary_manager.write_summaries(train_outputs, always_write=True)
self.summary_manager.flush()
self._maybe_save_checkpoints(current_step, force_trigger=True)
if evaluate:
self._maybe_evaluate(current_step, force_trigger=True)
def evaluate(self, continuous=False, timeout_fn=None):
"""Runs the evaluation.
def _maybe_save_checkpoint(self, force_trigger: bool = False):
"""Save checkpoints if necessary.
Args:
continuous: If `True`, will continously monitor the checkpoint directory
to evaluate on the latest checkpoint. If `False`, will do the evaluation
once.
timeout_fn: Optional callable to call after a timeout. If the function
returns True, then it means that no new checkpoints will be generated
and the iterator will exit.
force_trigger: A boolean indicates whether to force saving checkpoints
regardless of the checkpoint interval.
Raises:
ValueError: If no checkpoint found in `self.checkpoint_manager.directory`.
Returns:
A boolean indicating whether a checkpoint was saved.
"""
if self.eval_fn is None:
raise ValueError("`self.eval_fn` should not be None to call "
"`evaluate()` method.")
if not continuous and timeout_fn is not None:
raise ValueError("`timeout_fn` can be only passed when `continuous` is "
"True")
if continuous:
for checkpoint_path in tf.train.checkpoints_iterator(
self.checkpoint_manager.directory, timeout_fn=timeout_fn):
self._restore_model(checkpoint_path)
self._evaluate_once(self.global_step.numpy())
return
latest_checkpoint = self.checkpoint_manager.latest_checkpoint
if not latest_checkpoint:
raise ValueError("no checkpoint found in dir %s" %
self.checkpoint_manager.directory)
self._restore_model()
self._evaluate_once(self.global_step.numpy())
if self.checkpoint_manager and self.checkpoint_manager.checkpoint_interval:
ckpt_path = self.checkpoint_manager.save(
checkpoint_number=self.global_step.numpy(),
check_interval=not force_trigger)
if ckpt_path is not None:
logging.info("Saved checkpoints in %s", ckpt_path)
return True
return False
class _StepTimer(object):
class StepTimer:
"""Utility class for measuring steps/second."""
def __init__(self, step):
......
# Copyright 2020 The Orbit Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for orbit.controller."""
import os
from absl import logging
from absl.testing import parameterized
import numpy as np
from orbit import controller
from orbit import standard_runner
import tensorflow as tf
def create_model():
  """Builds the tiny Keras model used by every runner in this test module.

  Returns:
    A `tf.keras.Model` that feeds a 3-feature input named "input" through a
    single 4-unit dense layer named "dense".
  """
  inputs = tf.keras.layers.Input(shape=(3,), name="input")
  outputs = tf.keras.layers.Dense(4, name="dense")(inputs)
  return tf.keras.Model(inputs, outputs)
def summaries_with_matching_keyword(keyword, summary_dir):
  """Returns summary protos matching given keyword from event file.

  Only the last path returned by the `events*` glob in `summary_dir` is read.

  Args:
    keyword: Substring to look for in the tag of each summary value.
    summary_dir: Directory expected to contain the event file(s).

  Returns:
    A list with one entry per matching summary value; an event summary that
    holds several matching values therefore appears several times. The list
    is empty when `summary_dir` contains no event file.
  """
  event_paths = tf.io.gfile.glob(os.path.join(summary_dir, "events*"))
  if not event_paths:
    # Without this guard `event_paths[-1]` raises a bare IndexError; an empty
    # result lets the caller's assertNotEmpty/assertLen report the failure.
    return []

  matches = []
  for event in tf.compat.v1.train.summary_iterator(event_paths[-1]):
    if event.summary is not None:
      for value in event.summary.value:
        if keyword in value.tag:
          matches.append(event.summary)
  return matches
def dataset_fn(ctx):
  """Creates the synthetic dataset shared by the training and eval runners.

  Args:
    ctx: Argument passed in by the dataset-distribution call; unused.

  Returns:
    A `tf.data.Dataset` built from 10 all-zero feature rows of width 3 paired
    with 10 all-one target rows of width 4, repeated 100 times and batched
    in groups of 10 with the remainder dropped.
  """
  del ctx
  features = np.zeros((10, 3), dtype=np.float32)
  labels = np.ones((10, 4), dtype=np.float32)
  return (
      tf.data.Dataset.from_tensor_slices((features, labels))
      .repeat(100)
      .batch(10, drop_remainder=True))
class TestRunner(standard_runner.StandardTrainer,
                 standard_runner.StandardEvaluator):
  """Combined trainer and evaluator for the test model."""

  def __init__(self, return_numpy=False):
    """Creates the model, optimizer, metrics and distributed datasets.

    Args:
      return_numpy: If True, `train_loop_end` and `eval_end` report the loss
        as a numpy value rather than as a tensor.
    """
    self.strategy = tf.distribute.get_strategy()
    self.model = create_model()
    self.optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.1)
    # The optimizer's iteration counter is reused as the global step.
    self.global_step = self.optimizer.iterations
    self.train_loss = tf.keras.metrics.Mean("train_loss", dtype=tf.float32)
    self.eval_loss = tf.keras.metrics.Mean("eval_loss", dtype=tf.float32)
    self.return_numpy = return_numpy

    distribute = self.strategy.experimental_distribute_datasets_from_function
    train_dataset = distribute(dataset_fn)
    eval_dataset = distribute(dataset_fn)
    standard_runner.StandardTrainer.__init__(self, train_dataset)
    standard_runner.StandardEvaluator.__init__(self, eval_dataset)

  def train_step(self, iterator):
    """Runs one optimization step on the next batch from `iterator`."""

    def _step_fn(batch):
      """Replicated training step."""
      features, targets = batch
      with tf.GradientTape() as tape:
        predictions = self.model(features)
        loss = tf.reduce_mean(tf.keras.losses.MSE(targets, predictions))
      gradients = tape.gradient(loss, self.model.variables)
      self.optimizer.apply_gradients(zip(gradients, self.model.variables))
      self.train_loss.update_state(loss)

    self.strategy.run(_step_fn, args=(next(iterator),))

  def train_loop_end(self):
    """Returns the accumulated training loss under the key "loss"."""
    loss = self.train_loss.result()
    if self.return_numpy:
      loss = loss.numpy()
    return {"loss": loss}

  def build_eval_dataset(self):
    """Returns a freshly distributed copy of the synthetic dataset."""
    distribute = self.strategy.experimental_distribute_datasets_from_function
    return distribute(dataset_fn)

  def eval_begin(self):
    """Clears the evaluation loss metric."""
    self.eval_loss.reset_states()

  def eval_step(self, iterator):
    """Accumulates the loss of the next batch from `iterator`."""

    def _step_fn(batch):
      """Replicated evaluation step."""
      features, targets = batch
      predictions = self.model(features)
      loss = tf.reduce_mean(tf.keras.losses.MSE(targets, predictions))
      self.eval_loss.update_state(loss)

    self.strategy.run(_step_fn, args=(next(iterator),))

  def eval_end(self):
    """Returns the accumulated evaluation loss under the key "eval_loss"."""
    loss = self.eval_loss.result()
    if self.return_numpy:
      loss = loss.numpy()
    return {"eval_loss": loss}
class TestEvaluator(standard_runner.StandardEvaluator):
  """Evaluator for the test model that returns a loss from every step."""

  def __init__(self):
    """Creates the model and the distributed evaluation dataset."""
    self.strategy = tf.distribute.get_strategy()
    self.model = create_model()
    distribute = self.strategy.experimental_distribute_datasets_from_function
    eval_dataset = distribute(dataset_fn)
    standard_runner.StandardEvaluator.__init__(self, eval_dataset)

  def eval_reduce(self, state, output):
    """Appends `output` to `state` and returns the same list."""
    state.append(output)
    return state

  def eval_begin(self):
    """Returns an empty list (presumably the initial `eval_reduce` state)."""
    return []

  def eval_step(self, iterator):
    """Returns the cross-replica mean loss of the next batch."""

    def _step_fn(batch):
      """Replicated evaluation step."""
      features, targets = batch
      predictions = self.model(features)
      return tf.reduce_mean(tf.keras.losses.MSE(targets, predictions))

    per_replica_losses = self.strategy.run(_step_fn, args=(next(iterator),))
    return self.strategy.reduce(
        tf.distribute.ReduceOp.MEAN, per_replica_losses, axis=None)

  def eval_end(self, outputs):
    """Returns the mean of `outputs` under the key "eval_loss"."""
    return {"eval_loss": tf.reduce_mean(outputs)}
class TestEvaluatorWithNestedSummary(standard_runner.StandardEvaluator):
  """Evaluator over two named datasets that reports nested metric dicts."""

  def __init__(self):
    """Creates the model, two datasets and one metric pair per dataset."""
    self.strategy = tf.distribute.get_strategy()
    self.model = create_model()
    distribute = self.strategy.experimental_distribute_datasets_from_function
    dataset = distribute(dataset_fn)
    dataset2 = distribute(dataset_fn)
    self.loss = tf.keras.metrics.Mean("loss", dtype=tf.float32)
    self.accuracy = tf.keras.metrics.CategoricalAccuracy(
        "accuracy", dtype=tf.float32)
    self.loss2 = tf.keras.metrics.Mean("loss", dtype=tf.float32)
    self.accuracy2 = tf.keras.metrics.CategoricalAccuracy(
        "accuracy", dtype=tf.float32)
    eval_datasets = {"dataset": dataset, "dataset2": dataset2}
    standard_runner.StandardEvaluator.__init__(
        self, eval_dataset=eval_datasets)

  def eval_step(self, iterator):
    """Updates each dataset's metrics with that dataset's next batch."""

    def _run_on(dataset_key, loss_metric, accuracy_metric):
      """Runs one replicated evaluation step for a single dataset."""

      def _step_fn(batch):
        """Replicated evaluation step."""
        features, targets = batch
        predictions = self.model(features)
        loss_metric.update_state(tf.keras.losses.MSE(targets, predictions))
        accuracy_metric.update_state(targets, predictions)

      self.strategy.run(_step_fn, args=(next(iterator[dataset_key]),))

    _run_on("dataset", self.loss, self.accuracy)
    _run_on("dataset2", self.loss2, self.accuracy2)

  def eval_end(self):
    """Returns the loss/accuracy results grouped by dataset name."""
    first = {
        "loss": self.loss.result(),
        "accuracy": self.accuracy.result(),
    }
    second = {
        "loss": self.loss2.result(),
        "accuracy": self.accuracy2.result(),
    }
    return {"dataset": first, "dataset2": second}
class TestTrainerWithSummaries(standard_runner.StandardTrainer):
  """A Trainer whose train step writes a scalar summary, for testing."""

  def __init__(self):
    """Creates the model, optimizer, loss metric and training dataset."""
    self.strategy = tf.distribute.get_strategy()
    self.model = create_model()
    self.optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.1)
    # The optimizer's iteration counter is reused as the global step.
    self.global_step = self.optimizer.iterations
    self.train_loss = tf.keras.metrics.Mean("train_loss", dtype=tf.float32)
    distribute = self.strategy.experimental_distribute_datasets_from_function
    train_dataset = distribute(dataset_fn)
    standard_runner.StandardTrainer.__init__(
        self, train_dataset, use_tpu_summary_optimization=True)

  def build_train_dataset(self):
    """Returns a freshly distributed copy of the synthetic dataset."""
    distribute = self.strategy.experimental_distribute_datasets_from_function
    return distribute(dataset_fn)

  def train_step(self, iterator):
    """Runs one optimization step and records the loss as a summary."""

    def _step_fn(batch):
      """Replicated training step."""
      features, targets = batch
      with tf.GradientTape() as tape:
        predictions = self.model(features)
        loss = tf.reduce_mean(tf.keras.losses.MSE(targets, predictions))
        tf.summary.scalar("loss", loss)
      gradients = tape.gradient(loss, self.model.variables)
      self.optimizer.apply_gradients(zip(gradients, self.model.variables))
      self.train_loss.update_state(loss)

    self.strategy.run(_step_fn, args=(next(iterator),))
class ControllerTest(tf.test.TestCase, parameterized.TestCase):
  """Tests `controller.Controller` with the runners defined in this module."""

  def setUp(self):
    super().setUp()
    self.model_dir = self.get_temp_dir()

  def test_no_checkpoint(self):
    """Trains and evaluates twice without any checkpoint manager."""
    train_dir = os.path.join(self.model_dir, "summaries/train")
    eval_dir = os.path.join(self.model_dir, "summaries/eval")
    runner = TestRunner()
    # No checkpoint manager and no strategy.
    ctrl = controller.Controller(
        trainer=runner,
        evaluator=runner,
        global_step=runner.global_step,
        steps_per_loop=2,
        summary_dir=train_dir,
        eval_summary_dir=eval_dir)
    ctrl.train_and_evaluate(train_steps=10, eval_steps=2, eval_interval=6)
    self.assertEqual(runner.global_step, 10)

    # Loss and accuracy values should be written into summaries.
    self.assertNotEmpty(tf.io.gfile.listdir(train_dir))
    self.assertNotEmpty(summaries_with_matching_keyword("loss", train_dir))
    self.assertNotEmpty(tf.io.gfile.listdir(eval_dir))
    self.assertNotEmpty(
        summaries_with_matching_keyword("eval_loss", eval_dir))

    # No checkpoint, so global step starts from 0.
    runner.global_step.assign(0)
    ctrl.train_and_evaluate(train_steps=10, eval_steps=2, eval_interval=6)
    self.assertEqual(runner.global_step, 10)

  def test_no_checkpoint_and_summaries(self):
    """Trains and evaluates with neither checkpoints nor summary dirs."""
    runner = TestRunner()
    # No checkpoint + summary directories.
    ctrl = controller.Controller(
        trainer=runner,
        evaluator=runner,
        global_step=runner.global_step,
        steps_per_loop=2)
    ctrl.train_and_evaluate(train_steps=10, eval_steps=2, eval_interval=6)
    self.assertEqual(runner.global_step, 10)

  @parameterized.named_parameters(("return_numpy", True),
                                  ("return_tensor", False))
  def test_train_and_evaluate(self, return_numpy):
    """Checks checkpoints and both summary dirs after train_and_evaluate."""
    train_dir = os.path.join(self.model_dir, "summaries/train")
    eval_dir = os.path.join(self.model_dir, "summaries/eval")
    runner = TestRunner(return_numpy=return_numpy)
    ckpt = tf.train.Checkpoint(model=runner.model, optimizer=runner.optimizer)
    ckpt_manager = tf.train.CheckpointManager(
        ckpt,
        self.model_dir,
        max_to_keep=None,
        step_counter=runner.global_step,
        checkpoint_interval=10)
    ctrl = controller.Controller(
        trainer=runner,
        evaluator=runner,
        global_step=runner.global_step,
        steps_per_loop=2,
        summary_dir=train_dir,
        checkpoint_manager=ckpt_manager,
        eval_summary_dir=eval_dir)
    ctrl.train_and_evaluate(train_steps=10, eval_steps=2, eval_interval=6)

    # Checkpoints are saved.
    self.assertNotEmpty(
        tf.io.gfile.glob(os.path.join(self.model_dir, "ckpt*")))

    # Loss and accuracy values should be written into summaries.
    self.assertNotEmpty(tf.io.gfile.listdir(train_dir))
    self.assertNotEmpty(summaries_with_matching_keyword("loss", train_dir))
    self.assertNotEmpty(tf.io.gfile.listdir(eval_dir))
    self.assertNotEmpty(
        summaries_with_matching_keyword("eval_loss", eval_dir))

  def test_train_only(self):
    """Checks that train() alone writes checkpoints and train summaries."""
    train_dir = os.path.join(self.model_dir, "summaries/train")
    eval_dir = os.path.join(self.model_dir, "summaries/eval")
    runner = TestRunner()
    ckpt = tf.train.Checkpoint(model=runner.model, optimizer=runner.optimizer)
    ckpt_manager = tf.train.CheckpointManager(
        ckpt,
        self.model_dir,
        max_to_keep=None,
        step_counter=runner.global_step,
        checkpoint_interval=10)
    ctrl = controller.Controller(
        trainer=runner,
        global_step=runner.global_step,
        steps_per_loop=2,
        summary_dir=train_dir,
        checkpoint_manager=ckpt_manager,
        eval_summary_dir=eval_dir,
    )
    ctrl.train(steps=10)

    # Checkpoints are saved.
    self.assertNotEmpty(
        tf.io.gfile.glob(os.path.join(self.model_dir, "ckpt*")))

    # Only train summaries are written.
    self.assertNotEmpty(tf.io.gfile.listdir(train_dir))
    self.assertNotEmpty(summaries_with_matching_keyword("loss", train_dir))
    self.assertFalse(tf.io.gfile.exists(eval_dir))

  def test_evaluate_only(self):
    """Checks one-off evaluation, then continuous evaluation with timeout."""
    train_dir = os.path.join(self.model_dir, "summaries/train")
    eval_dir = os.path.join(self.model_dir, "summaries/eval")
    runner = TestRunner()
    ckpt = tf.train.Checkpoint(model=runner.model)
    ckpt.save(os.path.join(self.model_dir, "ckpt"))
    ckpt_manager = tf.train.CheckpointManager(
        ckpt,
        self.model_dir,
        max_to_keep=None,
        step_counter=runner.global_step)
    ctrl = controller.Controller(
        evaluator=runner,
        global_step=runner.global_step,
        checkpoint_manager=ckpt_manager,
        summary_dir=train_dir,
        eval_summary_dir=eval_dir)
    eval_results = ctrl.evaluate(steps=2)

    # Only eval summaries are written
    self.assertFalse(tf.io.gfile.exists(train_dir))
    self.assertNotEmpty(tf.io.gfile.listdir(eval_dir))
    self.assertNotEmpty(
        summaries_with_matching_keyword("eval_loss", eval_dir))
    self.assertIn("eval_loss", eval_results)

    # Tests continuous eval with timeout and timeout_fn.
    done_file = os.path.join(self.model_dir, "summaries/eval/Done")

    def timeout_fn():
      # Leaves a marker file behind so the test can tell this was called.
      with tf.io.gfile.GFile(done_file, "w") as f:
        f.write("DONE")
        return True

    ctrl = controller.Controller(
        evaluator=runner,
        global_step=runner.global_step,
        checkpoint_manager=ckpt_manager,
        eval_summary_dir=eval_dir)
    ctrl.evaluate_continuously(timeout=1, timeout_fn=timeout_fn, steps=2)
    self.assertNotEmpty(tf.io.gfile.glob(done_file))

  def test_no_eval_steps(self):
    """Checks that evaluate() can be called without a `steps` argument."""
    runner = TestRunner()
    ckpt = tf.train.Checkpoint(model=runner.model)
    ckpt.save(os.path.join(self.model_dir, "ckpt"))
    ckpt_manager = tf.train.CheckpointManager(
        ckpt,
        self.model_dir,
        max_to_keep=None,
        step_counter=runner.global_step)
    ctrl = controller.Controller(
        evaluator=runner,
        global_step=runner.global_step,
        checkpoint_manager=ckpt_manager)
    ctrl.evaluate()

  def test_already_trained_model(self):
    """Checks that train() works when global_step already equals `steps`."""
    runner = TestRunner()
    runner.global_step.assign(10)
    ckpt = tf.train.Checkpoint(model=runner.model, optimizer=runner.optimizer)
    ckpt_manager = tf.train.CheckpointManager(
        ckpt,
        self.model_dir,
        max_to_keep=None,
        step_counter=runner.global_step,
        checkpoint_interval=10)
    ctrl = controller.Controller(
        trainer=runner,
        global_step=runner.global_step,
        steps_per_loop=2,
        checkpoint_manager=ckpt_manager)
    # `global_step` is already `train_steps`.
    ctrl.train(steps=10)

  def test_summaries_inside_train_fn(self):
    """Checks summaries written from inside the train step are recorded."""
    train_dir = os.path.join(self.model_dir, "summaries/train")
    eval_dir = os.path.join(self.model_dir, "summaries/eval")
    runner = TestTrainerWithSummaries()
    ckpt = tf.train.Checkpoint(model=runner.model, optimizer=runner.optimizer)
    ckpt_manager = tf.train.CheckpointManager(
        ckpt,
        self.model_dir,
        max_to_keep=None,
        step_counter=runner.global_step)
    ctrl = controller.Controller(
        trainer=runner,
        global_step=runner.global_step,
        steps_per_loop=2,
        summary_dir=train_dir,
        summary_interval=2,
        checkpoint_manager=ckpt_manager,
    )
    ctrl.train(steps=10)

    # The manager was built without a checkpoint_interval, and this test
    # expects that no checkpoint files are written.
    self.assertEmpty(tf.io.gfile.glob(os.path.join(self.model_dir, "ckpt*")))

    # Only train summaries are written.
    self.assertNotEmpty(tf.io.gfile.listdir(train_dir))
    self.assertNotEmpty(summaries_with_matching_keyword("loss", train_dir))
    self.assertFalse(tf.io.gfile.exists(eval_dir))

  def test_train_and_evaluate_with_same_summary_dir(self):
    """Checks train and eval summaries can share a single directory."""
    shared_dir = os.path.join(self.model_dir, "summaries")
    runner = TestRunner()
    ckpt = tf.train.Checkpoint(model=runner.model, optimizer=runner.optimizer)
    ckpt_manager = tf.train.CheckpointManager(
        ckpt,
        self.model_dir,
        max_to_keep=None,
        step_counter=runner.global_step)
    ctrl = controller.Controller(
        trainer=runner,
        evaluator=runner,
        global_step=runner.global_step,
        steps_per_loop=2,
        summary_dir=shared_dir,
        checkpoint_manager=ckpt_manager,
        eval_summary_dir=shared_dir)
    ctrl.train_and_evaluate(train_steps=10, eval_steps=2, eval_interval=6)

    # Loss and accuracy values should be written into summaries.
    self.assertNotEmpty(tf.io.gfile.listdir(shared_dir))
    self.assertNotEmpty(summaries_with_matching_keyword("loss", shared_dir))
    self.assertNotEmpty(
        summaries_with_matching_keyword("eval_loss", shared_dir))

  def test_early_stop_on_eval_loss(self):
    """Checks a Controller subclass can stop before `train_steps`."""
    test_runner = TestRunner()

    class EarlyStopController(controller.Controller):
      """A subclass of Controller supports early stopping."""

      def train_and_evaluate(self,
                             train_steps: int = None,
                             eval_steps: int = None,
                             eval_interval: int = None):
        while True:
          current_step = self.global_step.numpy()
          if not current_step < train_steps:
            break
          interval = min(train_steps - current_step, eval_interval)
          self.train(
              steps=current_step + interval, checkpoint_at_completion=False)
          self.evaluate(steps=eval_steps)
          # Early stop condition.
          if test_runner.eval_loss.result() < 0.1:
            logging.info(
                "Training early stopped as eval_loss %s is less than 0.1",
                test_runner.eval_loss.result())
            return

    ckpt = tf.train.Checkpoint(
        model=test_runner.model, optimizer=test_runner.optimizer)
    ckpt_manager = tf.train.CheckpointManager(
        ckpt,
        self.model_dir,
        max_to_keep=None,
        step_counter=test_runner.global_step,
        checkpoint_interval=10)
    ctrl = EarlyStopController(
        trainer=test_runner,
        evaluator=test_runner,
        global_step=test_runner.global_step,
        steps_per_loop=2,
        checkpoint_manager=ckpt_manager)
    ctrl.train_and_evaluate(train_steps=10, eval_steps=6, eval_interval=2)

    self.assertLess(test_runner.global_step, 10)

  def test_evaluate_with_loss_outputs(self):
    """Checks evaluation with an evaluator that returns per-step losses."""
    eval_dir = os.path.join(self.model_dir, "summaries/eval")
    evaluator = TestEvaluator()
    ckpt = tf.train.Checkpoint(model=evaluator.model)
    ckpt.save(os.path.join(self.model_dir, "ckpt"))
    ckpt_manager = tf.train.CheckpointManager(
        ckpt, self.model_dir, max_to_keep=None)
    ctrl = controller.Controller(
        evaluator=evaluator,
        global_step=tf.Variable(0, dtype=tf.int64),
        checkpoint_manager=ckpt_manager,
        eval_summary_dir=eval_dir)
    ctrl.evaluate(steps=5)

    # Only eval summaries are written
    self.assertNotEmpty(tf.io.gfile.listdir(eval_dir))
    self.assertNotEmpty(
        summaries_with_matching_keyword("eval_loss", eval_dir))

  def test_train_and_evaluate_reset_datasets(self):
    """Checks train_and_evaluate still runs after datasets are replaced."""
    runner = TestRunner()
    ctrl = controller.Controller(
        trainer=runner,
        evaluator=runner,
        global_step=runner.global_step,
        steps_per_loop=2)
    ctrl.train_and_evaluate(train_steps=10, eval_steps=2, eval_interval=6)

    distribute = runner.strategy.experimental_distribute_datasets_from_function
    new_train_dataset = distribute(dataset_fn)
    new_eval_dataset = distribute(dataset_fn)
    runner.train_dataset = new_train_dataset
    runner.eval_dataset = new_eval_dataset

    ctrl.train_and_evaluate(train_steps=10, eval_steps=2, eval_interval=6)

  def test_eval_and_checkpoint_interval(self):
    """Checks how many checkpoints and evaluations a run produces."""
    runner = TestRunner()
    ckpt = tf.train.Checkpoint(model=runner.model, optimizer=runner.optimizer)
    ckpt_manager = tf.train.CheckpointManager(
        ckpt,
        self.model_dir,
        max_to_keep=None,
        step_counter=runner.global_step,
        checkpoint_interval=5)
    ctrl = controller.Controller(
        trainer=runner,
        evaluator=runner,
        global_step=runner.global_step,
        steps_per_loop=10,
        checkpoint_manager=ckpt_manager)
    ctrl.train_and_evaluate(train_steps=10, eval_steps=2, eval_interval=5)

    # Expect 3 checkpoints to be saved at step: 0, 5, 10.
    checkpoint_files = tf.io.gfile.glob(
        os.path.join(self.model_dir, "ckpt-*.data*"))
    self.assertLen(checkpoint_files, 3)
    # Expect evaluation is performed 2 times at step: 5, 10.
    self.assertLen(
        summaries_with_matching_keyword("eval_loss", self.model_dir), 2)

  def test_evaluate_with_nested_summaries(self):
    """Checks nested eval outputs land in one sub-directory per dataset."""
    evaluator = TestEvaluatorWithNestedSummary()
    ctrl = controller.Controller(
        evaluator=evaluator,
        global_step=tf.Variable(0, dtype=tf.int64),
        eval_summary_dir=self.model_dir)
    ctrl.evaluate(steps=5)

    # Each dataset key gets its own directory holding both of its metrics.
    for dataset_name in ("dataset", "dataset2"):
      dataset_dir = os.path.join(self.model_dir, dataset_name)
      self.assertNotEmpty(tf.io.gfile.listdir(dataset_dir))
      for keyword in ("loss", "accuracy"):
        self.assertNotEmpty(
            summaries_with_matching_keyword(keyword, dataset_dir))
if __name__ == "__main__":
  # Run the test cases in this module when it is executed as a script.
  tf.test.main()
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
# Copyright 2020 The Orbit Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -14,19 +14,12 @@
# ==============================================================================
"""An abstraction that users can easily handle their custom training loops."""
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import abc
import six
import tensorflow.compat.v2 as tf
from typing import Dict, Optional, Text
import tensorflow as tf
@six.add_metaclass(abc.ABCMeta)
class AbstractTrainable(tf.Module):
class AbstractTrainer(tf.Module, metaclass=abc.ABCMeta):
"""An abstract class defining the APIs required for training."""
@abc.abstractmethod
......@@ -50,14 +43,14 @@ class AbstractTrainable(tf.Module):
one update to model parameters, e.g. if training a GAN).
Returns:
The function may return a dictionary of `Tensors`, which will be
written to logs and as TensorBoard summaries.
The function may return a dictionary of `Tensors` or numpy arrays, which
will be written to logs and as TensorBoard summaries. It can also be a
nested dictionary, yielding a hierarchy of summary directories.
"""
pass
@six.add_metaclass(abc.ABCMeta)
class AbstractEvaluable(tf.Module):
class AbstractEvaluator(tf.Module, metaclass=abc.ABCMeta):
"""An abstract class defining the APIs required for evaluation."""
@abc.abstractmethod
......@@ -73,7 +66,8 @@ class AbstractEvaluable(tf.Module):
is `None`.
Returns:
The function may return a dictionary of `Tensors`, which will be
written to logs and as TensorBoard summaries.
The function may return a dictionary of `Tensors` or numpy arrays, which
will be written to logs and as TensorBoard summaries. It can also be a
nested dictionary, yielding a hierarchy of summary directories.
"""
pass
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
# Copyright 2020 The Orbit Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -14,67 +14,101 @@
# ==============================================================================
"""An abstraction that users can easily handle their custom training loops."""
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import abc
import six
import tensorflow.compat.v2 as tf
from typing import Dict, Optional, Text
from typing import Any, Dict, Optional, Text
import dataclasses
from orbit import runner
from orbit import utils
import tensorflow as tf
@dataclasses.dataclass(frozen=True)
class TrainerOverrides:
  """Immutable bundle of advanced switches for Orbit trainers.

  Attributes:
    use_tf_while_loop: Whether the train step is wrapped in a
      `tf.while_loop`.
    use_tf_function: Whether a `tf.function` is used. When False, training
      runs in pure eager mode.
    use_tpu_summary_optimization: Whether to enable the TPU summary
      performance optimization. Writing summaries with outside compilation
      inside the train step is slow on TPUs; when True, two `tf.function`s
      backed by two XLA programs are created (one with summaries, one
      without) and the slower summary-writing program only runs when
      necessary.
  """
  # Wrap the train step in a `tf.while_loop`.
  use_tf_while_loop: bool = True
  # Trace with `tf.function` instead of running eagerly.
  use_tf_function: bool = True
  # Build separate with/without-summary programs for TPUs.
  use_tpu_summary_optimization: bool = False
from official.staging.training import runnable
from official.staging.training import utils
class StandardTrainer(runner.AbstractTrainer, metaclass=abc.ABCMeta):
"""Implements the standard functionality of AbstractTrainer APIs."""
@six.add_metaclass(abc.ABCMeta)
class StandardTrainable(runnable.AbstractTrainable):
"""Implements the standard functionality of AbstractTrainable APIs."""
def __init__(self,
train_dataset,
use_tf_while_loop=True,
use_tf_function=True,
use_tpu_summary_optimization=False):
"""Construct a `StandardTrainer` object.
def __init__(self, use_tf_while_loop=True, use_tf_function=True):
Args:
train_dataset: A tf.nest-compatible structure of tf.data.Dataset or
DistributedDataset.
use_tf_while_loop: A boolean indicates whether to wrap the train step with
a `tf.while_loop`.
use_tf_function: A boolean indicates whether a `tf.function` will be used.
If False, training will run on pure eager mode.
use_tpu_summary_optimization: A boolean indicates whether to enable the
performance optimization for summaries in TPUs. In TPUs, writing
summaries with outside compilation inside train step is slow. If True,
it creates two `tf.function` with two XLA programs: one with summaries
and one without, and run the program with summaries (slow one) only if
necessary.
"""
if use_tf_while_loop and not use_tf_function:
raise ValueError("`use_tf_while_loop=True` and `use_tf_function=False` "
"is not supported")
self.use_tf_while_loop = use_tf_while_loop
self.use_tf_function = use_tf_function
self.train_dataset = None
self.train_iter = None
self.train_loop_fn = None
@abc.abstractmethod
def build_train_dataset(self):
"""Builds the training datasets.
Returns:
A tf.nest-compatible structure of tf.data.Dataset or DistributedDataset.
"""
pass
if use_tpu_summary_optimization and not use_tf_while_loop:
raise ValueError("`use_tpu_summary_optimization=True` and "
"`use_tf_while_loop=False` is not supported")
self._use_tf_while_loop = use_tf_while_loop
self._use_tf_function = use_tf_function
self._train_dataset = train_dataset
self._train_iter = None
self._train_loop_fn = None
self._use_tpu_summary_optimization = use_tpu_summary_optimization
def train(self,
num_steps: Optional[tf.Tensor]) -> Optional[Dict[Text, tf.Tensor]]:
"""See base class."""
if self.train_dataset is None:
# Build train input dataset
self.train_dataset = self.build_train_dataset()
self.train_iter = tf.nest.map_structure(iter, self.train_dataset)
self.train_loop_begin()
if self._train_iter is None:
self._train_iter = tf.nest.map_structure(iter, self.train_dataset)
if self.train_loop_fn is None:
if self._train_loop_fn is None:
train_fn = self.train_step
if self.use_tf_while_loop:
self.train_loop_fn = utils.create_tf_while_loop_fn(train_fn)
if self._use_tf_while_loop:
self._train_loop_fn = utils.create_tf_while_loop_fn(train_fn)
if self._use_tpu_summary_optimization:
self._train_loop_fn = utils.train_function_with_summaries(
self._train_loop_fn)
else:
self._train_loop_fn = tf.function(self._train_loop_fn)
else:
if self.use_tf_function:
if self._use_tf_function:
train_fn = tf.function(train_fn)
self.train_loop_fn = utils.create_loop_fn(train_fn)
self._train_loop_fn = utils.create_loop_fn(train_fn)
self.train_loop_begin()
self.train_loop_fn(self.train_iter, num_steps)
self._train_loop_fn(self._train_iter, num_steps)
return self.train_loop_end()
def train_loop_begin(self):
"""Called once at the beginning of the training loop.
This method is called before dataset iterators creation.
This is a good place to reset metrics that accumulate values over multiple
steps of training.
"""
......@@ -89,6 +123,12 @@ class StandardTrainable(runnable.AbstractTrainable):
context" for generality, to allow e.g. multiple iterator dequeues and calls
to `strategy.run`.
Note that if `use_tf_function=True`, all the code inside `train_step` should
be tf.function compatible, as they will be traced with tf.function. This
means you cannot put arbitrary python code in this function. If users have
any numpy operations, they should be put in `train_loop_begin` or
`train_loop_end` functions.
Args:
iterator: A tf.nest-compatible structure of tf.data Iterator or
DistributedIterator.
......@@ -103,58 +143,90 @@ class StandardTrainable(runnable.AbstractTrainable):
Returns:
The function may return a dictionary of `Tensors`, which will be
written to logs and as TensorBoard summaries.
written to logs and as TensorBoard summaries. It can also be a
nested dictionary, yielding a hierarchy of summary directories.
"""
pass
@property
def train_dataset(self):
"""Returns the train_dataset instance."""
return self._train_dataset
@six.add_metaclass(abc.ABCMeta)
class StandardEvaluable(runnable.AbstractEvaluable):
"""Implements the standard functionality of AbstractEvaluable APIs."""
@train_dataset.setter
def train_dataset(self, train_dataset):
"""Set a new train dataset and replace with the existing one.
def __init__(self, use_tf_function=True):
self.eval_use_tf_function = use_tf_function
self.eval_dataset = None
self.eval_loop_fn = None
Any unfinished work in the previous dataset will be discarded.
@abc.abstractmethod
def build_eval_dataset(self):
"""Builds the evaluation datasets.
Args:
train_dataset: A tf.nest-compatible structure of tf.data.Dataset or
DistributedDataset.
"""
self._train_dataset = train_dataset
self._train_iter = None
Returns:
A tf.nest-compatible structure of tf.data.Dataset or DistributedDataset.
@dataclasses.dataclass(frozen=True)
class EvaluatorOverrides:
"""Advanced overrides for Orbit evaluators.
Attributes:
use_tf_function: A boolean indicates whether a `tf.function` will be used.
If False, evaluation will run on pure eager mode.
"""
pass
use_tf_function: bool = True
class StandardEvaluator(runner.AbstractEvaluator, metaclass=abc.ABCMeta):
"""Implements the standard functionality of AbstractEvaluator APIs."""
def __init__(self, eval_dataset, use_tf_function=True):
"""Construct a `StandardEvaluator` object.
Args:
eval_dataset: A tf.nest-compatible structure of tf.data.Dataset or
DistributedDataset.
use_tf_function: A boolean indicates whether a `tf.function` will be used.
If False, evaluation will run on pure eager mode.
"""
self._eval_use_tf_function = use_tf_function
self._eval_dataset = eval_dataset
self._eval_loop_fn = None
def evaluate(
self, num_steps: Optional[tf.Tensor]) -> Optional[Dict[Text, tf.Tensor]]:
"""See base class."""
if self.eval_dataset is None:
# Build train input dataset
self.eval_dataset = self.build_eval_dataset()
outputs = self.eval_begin() # pylint: disable=assignment-from-no-return
if self.eval_loop_fn is None:
eval_iter = tf.nest.map_structure(iter, self._eval_dataset)
if self._eval_loop_fn is None:
eval_fn = self.eval_step
if self.eval_use_tf_function:
if self._eval_use_tf_function:
eval_fn = tf.function(eval_fn)
self.eval_loop_fn = utils.create_loop_fn(eval_fn)
eval_iter = tf.nest.map_structure(iter, self.eval_dataset)
self._eval_loop_fn = utils.create_loop_fn(eval_fn)
self.eval_begin()
self.eval_loop_fn(eval_iter, num_steps)
outputs = self._eval_loop_fn(
eval_iter, num_steps, state=outputs, reduce_fn=self.eval_reduce)
if outputs is None:
return self.eval_end()
else:
return self.eval_end(outputs)
def eval_begin(self):
def eval_begin(self) -> Any:
"""Called once at the beginning of the evaluation.
This method is called before dataset iterators creation.
This is a good place to reset metrics that accumulate values over the entire
evaluation.
Returns:
An output which is passed as `state` argument into `eval_reduce` function.
"""
pass
@abc.abstractmethod
def eval_step(self, iterator):
def eval_step(self, iterator) -> Any:
"""Implements one step of evaluation.
What a "step" consists of is up to the implementer. If using distribution
......@@ -162,20 +234,67 @@ class StandardEvaluable(runnable.AbstractEvaluable):
context" for generality, to allow e.g. multiple iterator dequeues and calls
to `strategy.run`.
Note that if `use_tf_function=True`, all the code inside `eval_step` should
be tf.function compatible, as they will be traced with tf.function. This
means you cannot put arbitrary python code in this function. If users have
any numpy operations, they should be put in `eval_begin`, `eval_end` or
`eval_reduce` functions.
Args:
iterator: A tf.nest-compatible structure of tf.data Iterator or
DistributedIterator.
Returns:
An output which is passed as `step_outputs` argument into `eval_reduce`
function.
"""
pass
def eval_end(self) -> Optional[Dict[Text, tf.Tensor]]:
def eval_end(self, *args) -> Optional[Dict[Text, tf.Tensor]]:
"""Called at the end of the evaluation.
This is a good place to get metric results. The value returned from this
function will be returned as-is from the evaluate() method.
Args:
*args: the outputs from `eval_reduce` for the last eval step.
Returns:
The function may return a dictionary of `Tensors`, which will be
written to logs and as TensorBoard summaries.
written to logs and as TensorBoard summaries. It can also be a
nested dictionary, yielding a hierarchy of summary directories.
"""
pass
def eval_reduce(self, state=None, step_outputs=None) -> Any:
  """Folds the outputs of one evaluation step into a running state.

  This is useful for passing states throughout evaluation. E.g. it can be used
  to maintain the output losses from all the evaluation steps, and compute the
  mean loss in `eval_end` function.

  `evaluate` hands this method to the evaluation loop as `reduce_fn`, with the
  return value of `eval_begin` as the initial `state`. The default
  implementation does nothing and returns `None`.

  Args:
    state: A maintained state throughout the evaluation.
    step_outputs: Outputs from the current evaluation step.

  Returns:
    An output which is passed as `state` argument into `eval_reduce` function
    for the next step. After evaluation is finished, the output from last step
    will be passed into `eval_end` function (`evaluate` calls `eval_end` with
    no arguments when that final output is `None`).
  """
  pass
@property
def eval_dataset(self):
  """Returns the eval_dataset instance."""
  return self._eval_dataset
@eval_dataset.setter
def eval_dataset(self, eval_dataset):
  """Sets a new eval dataset, replacing the existing one.

  Args:
    eval_dataset: A tf.nest-compatible structure of tf.data.Dataset or
      DistributedDataset.
  """
  self._eval_dataset = eval_dataset
# Copyright 2020 The Orbit Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for orbit.standard_runner."""
from orbit import standard_runner
import tensorflow as tf
def dataset_fn(input_context=None):
  """Builds an endlessly repeating dataset of zero-filled `(1, 1)` tensors.

  Args:
    input_context: Unused. Accepted because this function is handed to
      `strategy.experimental_distribute_datasets_from_function`.

  Returns:
    A `tf.data.Dataset` whose every element is a float32 zero tensor of shape
    `(1, 1)`.
  """
  del input_context  # Unused.

  def _to_zeros(_):
    return tf.zeros((1, 1), dtype=tf.float32)

  return tf.data.Dataset.range(1).repeat().map(
      _to_zeros, num_parallel_calls=tf.data.experimental.AUTOTUNE)
class TestRunner(standard_runner.StandardTrainer,
                 standard_runner.StandardEvaluator):
  """Trainer/evaluator for tests whose steps only increment `global_step`."""

  def __init__(self):
    self.strategy = tf.distribute.get_strategy()
    self.global_step = tf.Variable(
        0,
        trainable=False,
        dtype=tf.int64,
        name='global_step',
        aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA)
    # Datasets are supplied lazily by `train_loop_begin` / `eval_begin`.
    standard_runner.StandardTrainer.__init__(self, train_dataset=None)
    standard_runner.StandardEvaluator.__init__(self, eval_dataset=None)

  def _distributed_dataset(self):
    """Creates a fresh distributed dataset from `dataset_fn`."""
    return self.strategy.experimental_distribute_datasets_from_function(
        dataset_fn)

  def _count_one_step(self, iterator):
    """Consumes one element from `iterator` and adds one to `global_step`."""

    def _increment(_):
      self.global_step.assign_add(1)

    batch = next(iterator)
    self.strategy.run(_increment, args=(batch,))

  def train_loop_begin(self):
    self.train_dataset = self._distributed_dataset()

  def train_step(self, iterator):
    self._count_one_step(iterator)

  def train_loop_end(self):
    return self.global_step.numpy()

  def eval_begin(self):
    self.eval_dataset = self._distributed_dataset()

  def eval_step(self, iterator):
    self._count_one_step(iterator)

  def eval_end(self):
    return self.global_step.numpy()
class StandardRunnerTest(tf.test.TestCase):
  """Checks that `TestRunner` advances the global step once per step run."""

  def test_train(self):
    runner_under_test = TestRunner()
    ten_steps = tf.convert_to_tensor(10, dtype=tf.int32)
    final_step = runner_under_test.train(ten_steps)
    self.assertEqual(final_step, 10)

  def test_eval(self):
    runner_under_test = TestRunner()
    ten_steps = tf.convert_to_tensor(10, dtype=tf.int32)
    final_step = runner_under_test.evaluate(ten_steps)
    self.assertEqual(final_step, 10)
if __name__ == '__main__':
tf.test.main()
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
# Copyright 2020 The Orbit Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -14,16 +14,14 @@
# ==============================================================================
"""Some layered modules/functions to help users writing custom training loop."""
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import abc
import contextlib
import functools
import inspect
import six
import tensorflow.compat.v2 as tf
import os
import numpy as np
import tensorflow as tf
def create_loop_fn(step_fn):
......@@ -79,7 +77,6 @@ def create_tf_while_loop_fn(step_fn):
A callable defined as the `loop_fn` definition below.
"""
@tf.function
def loop_fn(iterator, num_steps):
"""A loop function with multiple steps.
......@@ -98,6 +95,30 @@ def create_tf_while_loop_fn(step_fn):
return loop_fn
def create_global_step() -> tf.Variable:
  """Builds a `tf.Variable` intended to serve as a global step counter.

  An `AbstractTrainer` subclass may need its own global step variable when it
  performs several parameter updates per `Controller` "step", or alternates
  between optimizers on different steps. In those situations an
  `optimizer.iterations` property generally can't be used directly, because it
  counts parameter updates rather than iterations of the `Controller`'s
  training loop. Such trainers should simply call `step.assign_add(1)` at the
  end of each step.

  Returns:
    A non-trainable scalar `tf.Variable` of dtype `tf.int64`, initialized to
    zero, for which only the first replica's value is kept when synchronizing
    across replicas in a distributed setting.
  """
  global_step = tf.Variable(
      0,
      trainable=False,
      dtype=tf.int64,
      aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA)
  return global_step
def make_distributed_dataset(strategy, dataset_or_fn, *args, **kwargs):
"""A helper function to create distributed dataset.
......@@ -130,10 +151,7 @@ def make_distributed_dataset(strategy, dataset_or_fn, *args, **kwargs):
# names, pass `ctx` as the value of `input_context` when calling
# `dataset_or_fn`. Otherwise `ctx` will not be used when calling
# `dataset_or_fn`.
if six.PY3:
argspec = inspect.getfullargspec(dataset_or_fn)
else:
argspec = inspect.getargspec(dataset_or_fn)
args_names = argspec.args
if "input_context" in args_names:
......@@ -144,96 +162,99 @@ def make_distributed_dataset(strategy, dataset_or_fn, *args, **kwargs):
return strategy.experimental_distribute_datasets_from_function(dataset_fn)
class SummaryManager(object):
class SummaryManager:
"""A class manages writing summaries."""
def __init__(self,
summary_writer,
summary_fn,
global_step=None,
summary_interval=None):
def __init__(self, summary_dir, summary_fn, global_step=None):
"""Construct a summary manager object.
Args:
summary_writer: A `tf.summary.SummaryWriter` instance for writing
summaries.
summary_dir: the directory to write summaries.
summary_fn: A callable defined as `def summary_fn(name, tensor,
step=None)`, which describes the summary operation.
global_step: A `tf.Variable` instance for checking the current global step
value, in case users want to save summaries every N steps.
summary_interval: An integer, indicates the minimum step interval between
two summaries.
global_step: A `tf.Variable` instance for the global step.
"""
if summary_writer is not None:
self._summary_writer = summary_writer
self._enabled = True
else:
self._summary_writer = tf.summary.create_noop_writer()
self._enabled = False
self._enabled = (summary_dir is not None)
self._summary_dir = summary_dir
self._summary_fn = summary_fn
self._summary_writers = {}
if global_step is None:
self._global_step = tf.summary.experimental.get_step()
else:
self._global_step = global_step
if summary_interval is not None:
if self._global_step is None:
raise ValueError("`summary_interval` is not None, but no `global_step` "
"can be obtained ")
self._last_summary_step = self._global_step.numpy()
self._summary_interval = summary_interval
def summary_writer(self, relative_path=""):
"""Returns the underlying summary writer.
@property
def summary_interval(self):
return self._summary_interval
@property
def summary_writer(self):
"""Returns the underlying summary writer."""
return self._summary_writer
Args:
relative_path: The current path in which to write summaries, relative to
the summary directory. By default it is empty, which specifies the root
directory.
"""
if self._summary_writers and relative_path in self._summary_writers:
return self._summary_writers[relative_path]
if self._enabled:
self._summary_writers[relative_path] = tf.summary.create_file_writer(
os.path.join(self._summary_dir, relative_path))
else:
self._summary_writers[relative_path] = tf.summary.create_noop_writer()
return self._summary_writers[relative_path]
def flush(self):
"""Flush the underlying summary writer."""
"""Flush the underlying summary writers."""
if self._enabled:
tf.summary.flush(self._summary_writer)
def write_summaries(self, items, always_write=True):
"""Write a bulk of summaries.
tf.nest.map_structure(tf.summary.flush, self._summary_writers)
def write_summaries(self, summary_dict):
"""Write summaries for the given values.
This recursively creates subdirectories for any nested dictionaries
provided in `summary_dict`, yielding a hierarchy of directories which will
then be reflected in the TensorBoard UI as different colored curves.
E.g. users may evaluate on multiple datasets and return `summary_dict` as a
nested dictionary.
```
{
"dataset": {
"loss": loss,
"accuracy": accuracy
},
"dataset2": {
"loss": loss2,
"accuracy": accuracy2
},
}
```
This will create two subdirectories "dataset" and "dataset2" inside the
summary root directory. Each directory will contain event files including
both "loss" and "accuracy" summaries.
Args:
items: a dictionary of `Tensors` for writing summaries.
always_write: An optional boolean. If `True`, the manager will always
write summaries unless the summaries have been written for the same
step. Otherwise the manager will only write the summaries if the
interval between summaries are larger than `summary_interval`.
Returns:
A boolean indicates whether the summaries are written or not.
summary_dict: A dictionary of values. If any value in `summary_dict` is
itself a dictionary, then the function will recursively create
subdirectories with names given by the keys in the dictionary. The
Tensor values are summarized using the summary writer instance specific
to the parent relative path.
"""
# TODO(rxsang): Support writing summaries with nested structure, so users
# can split the summaries into different directories for nicer visualization
# in Tensorboard, like train and eval metrics.
if not self._enabled:
return False
if self._summary_interval is not None:
current_step = self._global_step.numpy()
if current_step == self._last_summary_step:
return False
if not always_write and current_step < (self._last_summary_step +
self._summary_interval):
return False
self._last_summary_step = current_step
with self._summary_writer.as_default():
for name, tensor in items.items():
self._summary_fn(name, tensor, step=self._global_step)
return True
return
self._write_summaries(summary_dict)
def _write_summaries(self, summary_dict, relative_path=""):
  """Recursively writes `summary_dict` under `relative_path`.

  Args:
    summary_dict: A dictionary of values. A value that is itself a dictionary
      is written into a subdirectory named after its key; any other value is
      passed to the summary function under the writer for `relative_path`.
    relative_path: Path of the current nesting level, relative to the summary
      root directory.
  """
  for key, entry in summary_dict.items():
    if not isinstance(entry, dict):
      # Leaf value: emit it with the writer that owns the current directory.
      with self.summary_writer(relative_path).as_default():
        self._summary_fn(key, entry, step=self._global_step)
      continue
    # Nested dictionary: descend one directory level.
    nested_path = os.path.join(relative_path, key)
    self._write_summaries(entry, relative_path=nested_path)
@six.add_metaclass(abc.ABCMeta)
class Trigger(object):
class Trigger(metaclass=abc.ABCMeta):
"""An abstract class representing a "trigger" for some event."""
@abc.abstractmethod
......@@ -294,7 +315,7 @@ class IntervalTrigger(Trigger):
self._last_trigger_value = 0
class EpochHelper(object):
class EpochHelper:
"""A Helper class to handle epochs in Customized Training Loop."""
def __init__(self, epoch_steps, global_step):
......@@ -340,3 +361,86 @@ class EpochHelper(object):
@property
def current_epoch(self):
return self._current_epoch
@contextlib.contextmanager
def _soft_device_placement():
  """Enables soft device placement for the duration of the `with` block.

  The setting that was active on entry is put back on exit, including when the
  body raises.
  """
  previous_setting = tf.config.get_soft_device_placement()
  try:
    tf.config.set_soft_device_placement(True)
    yield
  finally:
    # Always restore, so an exception in the body cannot leak the override.
    tf.config.set_soft_device_placement(previous_setting)
def train_function_with_summaries(*args, **kwargs):
  """Utility function to support TPU summaries via multiple `tf.function`s.

  This permits interleaving summaries inside TPU-compatible code, but without
  any performance impact on steps that do not write summaries.

  Usage is as a decorator, similar to `tf.function`, and any `tf.function`
  arguments will be passed through if supplied:

      @trainer.train_function_with_summaries
      def train(self, num_steps):
        ...

  The decorated function is assumed to be a loop method accepting a `num_steps`
  parameter, as for instance would be called within the `Controller`'s outer
  train loop. The implementation here assumes that `summary_frequency` is
  divisible by `steps_per_loop`. The decorated method should accept two
  arguments, `self` and `num_steps`.

  Two `tf.function` versions of `train_fn` are created: one inside a summary
  writer scope with soft device placement enabled (used on steps that require
  summary writing), and one with no summary writer present and soft device
  placement disabled (used on all other steps).

  Args:
    *args: Arguments to pass through to `tf.function`.
    **kwargs: Keyword arguments to pass through to `tf.function`.

  Returns:
    If the first argument is a callable, returns the decorated callable.
    Otherwise, returns a decorator. The decorated callable returns the output
    of the last `train_fn` invocation it made, or `None` if it made none.
  """

  def decorator(train_fn):
    # TODO(dhr): Validate the signature of train_fn?
    train_fn_with_summaries = tf.function(train_fn, *args, **kwargs)
    train_fn_without_summaries = tf.function(train_fn, *args, **kwargs)

    @functools.wraps(train_fn)
    def wrapper(self, num_steps):
      # Default result for the case where neither branch below runs (summaries
      # are not being recorded and `num_steps < 1`); without it the final
      # `return` would raise UnboundLocalError.
      output = None
      if tf.summary.should_record_summaries():
        # Run exactly one step through the summary-writing program.
        with _soft_device_placement():
          output = train_fn_with_summaries(self, tf.constant(1))
          num_steps -= 1
      if num_steps >= 1:
        # Remaining steps use the program traced with summaries disabled.
        with tf.summary.record_if(False):
          output = train_fn_without_summaries(self, num_steps)
      return output

    return wrapper

  # Bare-decorator usage: the function to wrap arrives as the first positional
  # argument; the rest are forwarded to `tf.function`.
  if args and callable(args[0]):
    train_fn, args = args[0], args[1:]
    return decorator(train_fn)
  return decorator
def get_value(x) -> np.ndarray:
  """Converts a tensor-like input to its NumPy value.

  Args:
    x: input variable.

  Returns:
    `x.numpy()` when `tf.is_tensor(x)` is true; otherwise `x` itself,
    unchanged.
  """
  return x.numpy() if tf.is_tensor(x) else x
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
# Copyright 2020 The Orbit Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -12,16 +12,22 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Special word constants.
"""Tests for orbit.utils."""
NOTE: The ids of the EOS and UNK constants should not be modified. It is assumed
that these always occupy the first two ids.
"""
from orbit import utils
# End of sentence.
EOS = "<eos>"
EOS_ID = 0
import tensorflow as tf
# Unknown.
UNK = "<unk>"
UNK_ID = 1
class UtilsTest(tf.test.TestCase):
  """Tests for the helpers in `orbit.utils`."""

  def test_create_global_step(self):
    # A fresh global step is an int64 variable starting at zero...
    global_step = utils.create_global_step()
    self.assertEqual(global_step.dtype, tf.int64)
    self.assertEqual(global_step, 0)
    # ...and it can be incremented in place.
    global_step.assign_add(1)
    self.assertEqual(global_step, 1)
if __name__ == '__main__':
tf.test.main()
......@@ -7,118 +7,73 @@ This directory contains code implementations and pre-trained models of published
The research models are maintained by their respective authors.
## Table of Contents
- [Modeling Libraries and Models](#modeling-libraries-and-models)
- [Models and Implementations](#models-and-implementations)
* [Computer Vision](#computer-vision)
* [Natural Language Processing](#natural-language-processing)
* [Audio and Speech](#audio-and-speech)
* [Reinforcement Learning](#reinforcement-learning)
* [Others](#others)
- [Archived Models and Implementations](#warning-archived-models-and-implementations) (:no_entry_sign: No longer maintained)
- [TensorFlow Research Models](#tensorflow-research-models)
- [Table of Contents](#table-of-contents)
- [Modeling Libraries and Models](#modeling-libraries-and-models)
- [Models and Implementations](#models-and-implementations)
- [Computer Vision](#computer-vision)
- [Natural Language Processing](#natural-language-processing)
- [Audio and Speech](#audio-and-speech)
- [Reinforcement Learning](#reinforcement-learning)
- [Others](#others)
- [Old Models and Implementations in TensorFlow 1](#old-models-and-implementations-in-tensorflow-1)
- [Contributions](#contributions)
## Modeling Libraries and Models
| Directory | Name | Description | Maintainer(s) |
|-----------|------|-------------|---------------|
| [object_detection](object_detection) | TensorFlow Object Detection API | A framework that makes it easy to construct, train and deploy object detection models<br /><br />A collection of object detection models pre-trained on the COCO dataset, the Kitti dataset, the Open Images dataset, the AVA v2.1 dataset, and the iNaturalist Species Detection Dataset| @jch1, @tombstone, @pkulzc |
| [slim](slim) | TensorFlow-Slim Image Classification Model Library | A lightweight high-level API of TensorFlow for defining, training and evaluating image classification models <br />• Inception V1/V2/V3/V4<br />• Inception-ResNet-v2<br />• ResNet V1/V2<br />• VGG 16/19<br />• MobileNet V1/V2/V3<br />• NASNet-A_Mobile/Large<br />• PNASNet-5_Large/Mobile | @sguada, @marksandler2 |
| [object_detection](object_detection) | TensorFlow Object Detection API | A framework that makes it easy to construct, train and deploy object detection models<br /><br />A collection of object detection models pre-trained on the COCO dataset, the Kitti dataset, the Open Images dataset, the AVA v2.1 dataset, and the iNaturalist Species Detection Dataset| jch1, tombstone, pkulzc |
| [slim](slim) | TensorFlow-Slim Image Classification Model Library | A lightweight high-level API of TensorFlow for defining, training and evaluating image classification models <br />• Inception V1/V2/V3/V4<br />• Inception-ResNet-v2<br />• ResNet V1/V2<br />• VGG 16/19<br />• MobileNet V1/V2/V3<br />• NASNet-A_Mobile/Large<br />• PNASNet-5_Large/Mobile | sguada, marksandler2 |
## Models and Implementations
### Computer Vision
| Directory | Reference (Paper) | Maintainer(s) |
|-----------|--------------------|---------------|
| [attention_ocr](attention_ocr) | [Attention-based Extraction of Structured Information from Street View Imagery](https://arxiv.org/abs/1704.03549) | xavigibert |
| [autoaugment](autoaugment) | [1] [AutoAugment](https://arxiv.org/abs/1805.09501)<br />[2] [Wide Residual Networks](https://arxiv.org/abs/1605.07146)<br />[3] [Shake-Shake regularization](https://arxiv.org/abs/1705.07485)<br />[4] [ShakeDrop Regularization for Deep Residual Learning](https://arxiv.org/abs/1802.02375) | barretzoph |
| [deeplab](deeplab) | [1] [DeepLabv1](https://arxiv.org/abs/1412.7062)<br />[2] [DeepLabv2](https://arxiv.org/abs/1606.00915)<br />[3] [DeepLabv3](https://arxiv.org/abs/1802.02611)<br />[4] [DeepLabv3+](https://arxiv.org/abs/1706.05587) | aquariusjay, yknzhu |
| [delf](delf) | [1] DELF (DEep Local Features): [Large-Scale Image Retrieval with Attentive Deep Local Features](https://arxiv.org/abs/1612.06321)<br />[2] [Detect-to-Retrieve](https://arxiv.org/abs/1812.01584) | andrefaraujo |
| [lstm_object_detection](lstm_object_detection) | [Mobile Video Object Detection with Temporally-Aware Feature Maps](https://arxiv.org/abs/1711.06368) | yinxiaoli, yongzhe2160, lzyuan |
| [marco](marco) | [Classification of crystallization outcomes using deep convolutional neural networks](https://arxiv.org/abs/1803.10342) | vincentvanhoucke |
| [vid2depth](vid2depth) | [Unsupervised Learning of Depth and Ego-Motion from Monocular Video Using 3D Geometric Constraints](https://arxiv.org/abs/1802.05522) | rezama |
| Directory | Paper(s) | Conference | Maintainer(s) |
|-----------|----------|------------|---------------|
| [attention_ocr](attention_ocr) | [Attention-based Extraction of Structured Information from Street View Imagery](https://arxiv.org/abs/1704.03549) | ICDAR 2017 | xavigibert |
| [autoaugment](autoaugment) | [1] [AutoAugment](https://arxiv.org/abs/1805.09501)<br />[2] [Wide Residual Networks](https://arxiv.org/abs/1605.07146)<br />[3] [Shake-Shake regularization](https://arxiv.org/abs/1705.07485)<br />[4] [ShakeDrop Regularization for Deep Residual Learning](https://arxiv.org/abs/1802.02375) | [1] CVPR 2019<br />[2] BMVC 2016<br /> [3] ICLR 2017<br /> [4] ICLR 2018 | barretzoph |
| [deeplab](deeplab) | [1] [DeepLabv1: Semantic Image Segmentation with Deep Convolutional Nets and Fully Connected CRFs](https://arxiv.org/abs/1412.7062)<br />[2] [DeepLabv2: Semantic Image Segmentation with Deep Convolutional Nets, Atrous Convolution, and Fully Connected CRFs](https://arxiv.org/abs/1606.00915)<br />[3] [DeepLabv3: Rethinking Atrous Convolution for Semantic Image Segmentation](https://arxiv.org/abs/1706.05587)<br />[4] [DeepLabv3+: Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation](https://arxiv.org/abs/1802.02611)<br />| [1] ICLR 2015 <br />[2] TPAMI 2017 <br />[4] ECCV 2018 | aquariusjay, yknzhu |
| [delf](delf) | [1] DELF (DEep Local Features): [Large-Scale Image Retrieval with Attentive Deep Local Features](https://arxiv.org/abs/1612.06321)<br />[2] [Detect-to-Retrieve: Efficient Regional Aggregation for Image Search](https://arxiv.org/abs/1812.01584)<br />[3] DELG (DEep Local and Global features): [Unifying Deep Local and Global Features for Image Search](https://arxiv.org/abs/2001.05027)<br />[4] GLDv2: [Google Landmarks Dataset v2 -- A Large-Scale Benchmark for Instance-Level Recognition and Retrieval](https://arxiv.org/abs/2004.01804) | [1] ICCV 2017<br />[2] CVPR 2019<br />[4] CVPR 2020 | andrefaraujo |
| [lstm_object_detection](lstm_object_detection) | [Mobile Video Object Detection with Temporally-Aware Feature Maps](https://arxiv.org/abs/1711.06368) | CVPR 2018 | yinxiaoli, yongzhe2160, lzyuan |
| [marco](marco) | MARCO: [Classification of crystallization outcomes using deep convolutional neural networks](https://arxiv.org/abs/1803.10342) | | vincentvanhoucke |
| [vid2depth](vid2depth) | [Unsupervised Learning of Depth and Ego-Motion from Monocular Video Using 3D Geometric Constraints](https://arxiv.org/abs/1802.05522) | CVPR 2018 | rezama |
### Natural Language Processing
| Directory | Reference (Paper) | Maintainer(s) |
|-----------|--------------------|---------------|
| [adversarial_text](adversarial_text) | [1] [Adversarial Training Methods for Semi-Supervised Text Classification](https://arxiv.org/abs/1605.07725)<br />[2] [Semi-supervised Sequence Learning](https://arxiv.org/abs/1511.01432) | rsepassi, a-dai |
| [cvt_text](cvt_text) | [Semi-supervised sequence learning with cross-view training](https://arxiv.org/abs/1809.08370) | clarkkev, lmthang |
| Directory | Paper(s) | Conference | Maintainer(s) |
|-----------|----------|------------|---------------|
| [adversarial_text](adversarial_text) | [1] [Adversarial Training Methods for Semi-Supervised Text Classification](https://arxiv.org/abs/1605.07725)<br />[2] [Semi-supervised Sequence Learning](https://arxiv.org/abs/1511.01432) | [1] ICLR 2017<br />[2] NIPS 2015 | rsepassi, a-dai |
| [cvt_text](cvt_text) | [Semi-Supervised Sequence Modeling with Cross-View Training](https://arxiv.org/abs/1809.08370) | EMNLP 2018 | clarkkev, lmthang |
### Audio and Speech
| Directory | Reference (Paper) | Maintainer(s) |
|-----------|--------------------|---------------|
| [audioset](audioset) | [1] [AudioSet: A Large Scale Dataset of Audio Events](https://research.google/pubs/pub45857/)<br />[2] [CNN Architectures for Large-Scale Audio Classification](https://research.google/pubs/pub45611/) | plakal, dpwe |
| Directory | Paper(s) | Conference | Maintainer(s) |
|-----------|----------|------------|---------------|
| [audioset](audioset) | [1] [Audio Set: An ontology and human-labeled dataset for audio events](https://research.google/pubs/pub45857/)<br />[2] [CNN Architectures for Large-Scale Audio Classification](https://research.google/pubs/pub45611/) | ICASSP 2017 | plakal, dpwe |
| [deep_speech](deep_speech) | [Deep Speech 2](https://arxiv.org/abs/1512.02595) | ICLR 2016 | yhliang2018 |
### Reinforcement Learning
| Directory | Reference (Paper) | Maintainer(s) |
|-----------|--------------------|---------------|
| [efficient-hrl](efficient-hrl) | [1] [Data-Efficient Hierarchical Reinforcement Learning](https://arxiv.org/abs/1805.08296)<br />[2] [Near-Optimal Representation Learning for Hierarchical Reinforcement Learning](https://arxiv.org/abs/1810.01257) | ofirnachum |
| [pcl_rl](pcl_rl) | [1] [Improving Policy Gradient by Exploring Under-appreciated Rewards](https://arxiv.org/abs/1611.09321)<br />[2] [Bridging the Gap Between Value and Policy Based Reinforcement Learning](https://arxiv.org/abs/1702.08892)<br />[3] [Trust-PCL: An Off-Policy Trust Region Method for Continuous Control](https://arxiv.org/abs/1707.01891) | ofirnachum |
| Directory | Paper(s) | Conference | Maintainer(s) |
|-----------|----------|------------|---------------|
| [efficient-hrl](efficient-hrl) | [1] [Data-Efficient Hierarchical Reinforcement Learning](https://arxiv.org/abs/1805.08296)<br />[2] [Near-Optimal Representation Learning for Hierarchical Reinforcement Learning](https://arxiv.org/abs/1810.01257) | [1] NIPS 2018<br /> [2] ICLR 2019 | ofirnachum |
| [pcl_rl](pcl_rl) | [1] [Improving Policy Gradient by Exploring Under-appreciated Rewards](https://arxiv.org/abs/1611.09321)<br />[2] [Bridging the Gap Between Value and Policy Based Reinforcement Learning](https://arxiv.org/abs/1702.08892)<br />[3] [Trust-PCL: An Off-Policy Trust Region Method for Continuous Control](https://arxiv.org/abs/1707.01891) | [1] ICLR 2017<br />[2] NIPS 2017<br />[3] ICLR 2018 | ofirnachum |
### Others
| Directory | Reference (Paper) | Maintainer(s) |
|-----------|--------------------|---------------|
| [lfads](lfads) | [LFADS - Latent Factor Analysis via Dynamical Systems](https://doi.org/10.1101/152884) | jazcollins, sussillo |
| [rebar](rebar) | [REBAR: Low-variance, unbiased gradient estimates for discrete latent variable models](https://arxiv.org/abs/1703.07370) | gjtucker |
| Directory | Paper(s) | Conference | Maintainer(s) |
|-----------|----------|------------|---------------|
| [lfads](lfads) | [LFADS - Latent Factor Analysis via Dynamical Systems](https://arxiv.org/abs/1608.06315) | | jazcollins, sussillo |
| [rebar](rebar) | [REBAR: Low-variance, unbiased gradient estimates for discrete latent variable models](https://arxiv.org/abs/1703.07370) | NIPS 2017 | gjtucker |
---
### Old Models and Implementations in TensorFlow 1
## :warning: Archived Models and Implementations
The following research models are no longer maintained.
**Note**: We will remove archived models from the master branch in June, 2020.
After removal, you will still be able to access archived models in the archive branch.
| Directory | Reference (Paper) | Maintainer(s) |
|-----------|--------------------|---------------|
| [adv_imagenet_models](adv_imagenet_models) | [1] [Adversarial Machine Learning at Scale](https://arxiv.org/abs/1611.01236)<br />[2] [Ensemble Adversarial Training: Attacks and Defenses](https://arxiv.org/abs/1705.07204) | alexeykurakin |
| [adversarial_crypto](adversarial_crypto) | [Learning to Protect Communications with Adversarial Neural Cryptography](https://arxiv.org/abs/1610.06918) | dave-andersen |
| [adversarial_logit_pairing](adversarial_logit_pairing) | [Adversarial Logit Pairing](https://arxiv.org/abs/1803.06373) | alexeykurakin |
| [autoencoder](autoencoder) | Various autoencoders | snurkabill |
| [brain_coder](brain_coder) | [Neural Program Synthesis with Priority Queue Training](https://arxiv.org/abs/1801.03526) | danabo, mnorouzi |
| [cognitive_mapping_and_planning](cognitive_mapping_and_planning) | [Cognitive Mapping and Planning for Visual Navigation](https://arxiv.org/abs/1702.03920) | s-gupta |
| [compression](compression) | [Full Resolution Image Compression with Recurrent Neural Networks](https://arxiv.org/abs/1608.05148) | nmjohn |
| [deep_contextual_bandits](deep_contextual_bandits) | [Deep Bayesian Bandits Showdown: An Empirical Comparison of Bayesian Deep Networks for Thompson Sampling](https://arxiv.org/abs/1802.09127) | rikel |
| [deep_speech](deep_speech) | [Deep Speech 2](https://arxiv.org/abs/1512.02595) | yhliang2018 |
| [domain_adaptation](domain_adaptation) | [1] [Domain Separation Networks](https://arxiv.org/abs/1608.06019) <br />[2] [Unsupervised Pixel-Level Domain Adaptation with Generative Adversarial Networks](https://arxiv.org/abs/1612.05424) | bousmalis, dmrd |
| [feelvos](feelvos)| [FEELVOS](https://arxiv.org/abs/1902.09513) | pvoigtlaender, yuningchai, aquariusjay |
| [fivo](fivo)| [Filtering variational objectives for training generative sequence models](https://arxiv.org/abs/1705.09279) | dieterichlawson |
| [global_objectives](global_objectives) | [Scalable Learning of Non-Decomposable Objectives](https://arxiv.org/abs/1608.04802) | mackeya-google |
| [im2txt](im2txt) | [Show and Tell: Lessons learned from the 2015 MSCOCO Image Captioning Challenge](https://arxiv.org/abs/1609.06647) | cshallue |
| [inception](inception) | [Rethinking the Inception Architecture for Computer Vision](https://arxiv.org/abs/1512.00567) | shlens, vincentvanhoucke |
| [keypointnet](keypointnet) | [KeypointNet](https://arxiv.org/abs/1807.03146) | mnorouzi |
| [learned_optimizer](learned_optimizer) | [Learned Optimizers that Scale and Generalize](https://arxiv.org/abs/1703.04813) | olganw, nirum |
| [learning_to_remember_rare_events](learning_to_remember_rare_events) | [Learning to Remember Rare Events](https://arxiv.org/abs/1703.03129) | lukaszkaiser, ofirnachum |
| [learning_unsupervised_learning](learning_unsupervised_learning) | [Meta-Learning Update Rules for Unsupervised Representation Learning](https://arxiv.org/abs/1804.00222) | lukemetz, nirum |
| [lexnet_nc](lexnet_nc) | [Olive Oil is Made of Olives, Baby Oil is Made for Babies: Interpreting Noun Compounds using Paraphrases in a Neural Model](https://arxiv.org/abs/1803.08073) | vered1986, waterson |
| [lm_1b](lm_1b) | [Exploring the Limits of Language Modeling](https://arxiv.org/abs/1602.02410) | oriolvinyals, panyx0718 |
| [lm_commonsense](lm_commonsense) | [A Simple Method for Commonsense Reasoning](https://arxiv.org/abs/1806.02847) | thtrieu |
| [maskgan](maskgan)| [MaskGAN: Better Text Generation via Filling in the______](https://arxiv.org/abs/1801.07736) | liamb315, a-dai |
| [namignizer](namignizer)| Namignizer | knathanieltucker |
| [neural_gpu](neural_gpu)| [Neural GPUs Learn Algorithms](https://arxiv.org/abs/1511.08228) | lukaszkaiser |
| [neural_programmer](neural_programmer) | [Learning a Natural Language Interface with Neural Programmer](https://arxiv.org/abs/1611.08945) | arvind2505 |
| [next_frame_prediction](next_frame_prediction) | [Visual Dynamics](https://arxiv.org/abs/1607.02586) | panyx0718 |
| [ptn](ptn) | [Perspective Transformer Nets](https://arxiv.org/abs/1612.00814) | xcyan, arkanath, hellojas, honglaklee |
| [qa_kg](qa_kg) | [Learning to Reason](https://arxiv.org/abs/1704.05526) | yuyuz |
| [real_nvp](real_nvp) | [Density estimation using Real NVP](https://arxiv.org/abs/1605.08803) | laurent-dinh |
| [sentiment_analysis](sentiment_analysis)| [Effective Use of Word Order for Text Categorization with Convolutional Neural Networks](https://arxiv.org/abs/1412.1058) | sculd |
| [seq2species](seq2species) | [Seq2Species: A deep learning approach to pattern recognition for short DNA sequences](https://doi.org/10.1101/353474) | apbusia, depristo |
| [skip_thoughts](skip_thoughts) | [Skip-Thought Vectors](https://arxiv.org/abs/1506.06726) | cshallue |
| [steve](steve) | [Sample-Efficient Reinforcement Learning with Stochastic Ensemble Value Expansion](https://arxiv.org/abs/1807.01675) | buckman-google |
| [street](street) | [End-to-End Interpretation of the French Street Name Signs Dataset](https://arxiv.org/abs/1702.03970) | theraysmith |
| [struct2depth](struct2depth)| [Depth Prediction Without the Sensors: Leveraging Structure for Unsupervised Learning from Monocular Videos](https://arxiv.org/abs/1811.06152) | aneliaangelova |
| [swivel](swivel) | [Swivel: Improving Embeddings by Noticing What's Missing](https://arxiv.org/abs/1602.02215) | waterson |
| [tcn](tcn) | [Time-Contrastive Networks: Self-Supervised Learning from Video](https://arxiv.org/abs/1704.06888) | coreylynch, sermanet |
| [textsum](textsum)| [A Neural Attention Model for Abstractive Sentence Summarization](https://arxiv.org/abs/1509.00685) | panyx0718, peterjliu |
| [transformer](transformer) | [Spatial Transformer Network](https://arxiv.org/abs/1506.02025) | daviddao|
| [video_prediction](video_prediction) | [Unsupervised Learning for Physical Interaction through Video Prediction](https://arxiv.org/abs/1605.07157) | cbfinn |
:warning: If you are looking for old models, please visit the [Archive branch](https://github.com/tensorflow/models/tree/archive/research).
---
## Contributions
If you want to contribute, please review the [contribution guidelines](../../../wiki/How-to-contribute).
If you want to contribute, please review the [contribution guidelines](https://github.com/tensorflow/models/wiki/How-to-contribute).
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment