Internal change

PiperOrigin-RevId: 321025013

Internal change
PiperOrigin-RevId: 321025013
5df3d2e3 · Hongkun Yu · A. Unique TensorFlower · cfb7ef9e · 5df3d2e3 · cfb7ef9e
Commit 5df3d2e3 authored Jul 13, 2020 by Hongkun Yu Committed by A. Unique TensorFlower Jul 13, 2020
8 changed files
--- a/official/benchmark/resnet_ctl_imagenet_benchmark.py
+++ b/official/benchmark/resnet_ctl_imagenet_benchmark.py
@@ -426,7 +426,7 @@ class Resnet50CtlBenchmarkSynth(Resnet50CtlBenchmarkBase):
    def_flags['skip_eval'] = True
    def_flags['use_synthetic_data'] = True
    def_flags['train_steps'] = 110
-    def_flags['steps_per_loop'] = 20
+    def_flags['steps_per_loop'] = 10
    def_flags['log_steps'] = 10

    super(Resnet50CtlBenchmarkSynth, self).__init__(
@@ -441,7 +441,7 @@ class Resnet50CtlBenchmarkReal(Resnet50CtlBenchmarkBase):
    def_flags['skip_eval'] = True
    def_flags['data_dir'] = os.path.join(root_data_dir, 'imagenet')
    def_flags['train_steps'] = 110
-    def_flags['steps_per_loop'] = 20
+    def_flags['steps_per_loop'] = 10
    def_flags['log_steps'] = 10

    super(Resnet50CtlBenchmarkReal, self).__init__(

--- a/official/staging/training/controller.py
+++ b/official/staging/training/controller.py
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Lightweight utilities to train TF2 models."""
-
-from __future__ import absolute_import
-from __future__ import division
-# from __future__ import google_type_annotations
-from __future__ import print_function
-
-import time
-
-from absl import logging
-
-import tensorflow.compat.v2 as tf
-from typing import Callable, Dict, Optional, Text
-
-from official.staging.training import utils
-
-
-class Controller(object):
-  """Class that facilitates training and evaluation of models."""
-
-  def __init__(
-      self,
-      strategy: Optional[tf.distribute.Strategy] = None,
-      train_fn: Optional[Callable[[tf.Tensor],
-                                  Optional[Dict[Text, tf.Tensor]]]] = None,
-      eval_fn: Optional[Callable[[tf.Tensor],
-                                 Optional[Dict[Text, tf.Tensor]]]] = None,
-      global_step: Optional[tf.Variable] = None,
-      # Train related
-      train_steps: Optional[int] = None,
-      steps_per_loop: Optional[int] = None,
-      summary_dir: Optional[Text] = None,
-      checkpoint_manager: Optional[tf.train.CheckpointManager] = None,
-      # summary related
-      summary_interval: Optional[int] = None,
-      # Evaluation related
-      eval_summary_dir: Optional[Text] = None,
-      eval_steps: Optional[int] = None,
-      eval_interval: Optional[int] = None):
-    """Constructs a `Controller` instance.
-
-    Args:
-      strategy: An instance of `tf.distribute.Strategy`.
-      train_fn: A callable defined as `def train_fn(num_steps)`, where
-        `num_steps` indicates the number of steps to run for each loop.
-      eval_fn: A callable defined as `def eval_fn(num_steps)`, where `num_steps`
-        indicates the number of steps for one evaluation.
-      global_step: An integer `tf.Variable` indicating the global training step
-        number. Usually this can be obtained from `iterations` property of the
-        model's optimizer (e.g. `self.optimizer.iterations`), or users can
-        create their own global step variable as well. If the users create their
-        own global step variable, it is recommended to create the `tf.Variable`
-        inside strategy scope, and with
-        `aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA`.
-      train_steps: The total (maximum) number of training steps to perform.
-      steps_per_loop: The number of steps to run in each "inner loop" of
-        training (passed to the `num_steps` parameter of `train_fn`).
-      summary_dir: The directory to restore and write checkpoints and summaries.
-        If None, it will be set to `checkpoint_manager.directory`.
-      checkpoint_manager: An instance of `tf.train.CheckpointManager`.
-      summary_interval: Step interval for training summaries. Note that this
-        argument only applies to the summaries outside the training loop. If the
-        value is None, then training summaries are not enabled.
-      eval_summary_dir: The directory to write eval summaries. If None, it will
-        be set to `summary_dir`.
-      eval_steps: Number of steps to run evaluation.
-      eval_interval: Step interval for evaluation. If None, will skip evaluation
-        in the middle of training. Note that evaluation only happens outside the
-        training loop, whose length is specified by the `steps_per_loop`
-        parameter.
-
-    Raises:
-      ValueError: If both `train_fn` and `eval_fn` are None.
-      ValueError: If `train_fn` is not None and `train_steps` is None.
-      ValueError: If `steps_per_loop` is None when `train_fn` is provided.
-      ValueError: If `steps_per_loop` is not a positive integer.
-    """
-    if train_fn is None and eval_fn is None:
-      raise ValueError("`train_fn` and `eval_fn` should not both be None")
-
-    # TODO(rxsang): Support training until exhaustion by passing
-    # `train_steps=-1`. Currently it cannot be supported with a host training
-    # loop because break statements are not supported with distributed dataset.
-    if train_fn is not None:
-      if train_steps is None:
-        raise ValueError("`train_steps` is required when `train_fn` is "
-                         "provided.")
-      if steps_per_loop is None:
-        raise ValueError("`steps_per_loop` is required when `train_fn is "
-                         "provided.")
-      if not isinstance(steps_per_loop, int) or steps_per_loop < 1:
-        raise ValueError("`steps_per_loop` should be a positive integer")
-    if summary_interval is not None and summary_interval <= 0:
-      raise ValueError("`summary_interval` should be larger than 0")
-
-    self.strategy = strategy or tf.distribute.get_strategy()
-
-    self.train_fn = train_fn
-    self.eval_fn = eval_fn
-    self.global_step = global_step
-    self.checkpoint_manager = checkpoint_manager
-
-    if self.train_fn is not None:
-      self.train_steps = train_steps
-      self.steps_per_loop = steps_per_loop
-      if summary_dir:
-        self.summary_dir = summary_dir
-      elif checkpoint_manager:
-        self.summary_dir = checkpoint_manager.directory
-      else:
-        self.summary_dir = None
-
-      self.summary_interval = summary_interval
-      if self.summary_dir and self.summary_interval:
-        summary_writer = tf.summary.create_file_writer(self.summary_dir)
-      else:
-        summary_writer = None
-      # TODO(rxsang): Consider pass SummaryManager directly into Controller for
-      # maximum customizability.
-      self.summary_manager = utils.SummaryManager(
-          summary_writer,
-          tf.summary.scalar,
-          global_step=self.global_step,
-          summary_interval=self.summary_interval)
-
-    if self.eval_fn is not None:
-      eval_summary_dir = eval_summary_dir or self.summary_dir
-      eval_summary_writer = tf.summary.create_file_writer(
-          eval_summary_dir) if eval_summary_dir else None
-      self.eval_summary_manager = utils.SummaryManager(
-          eval_summary_writer, tf.summary.scalar, global_step=self.global_step)
-
-      self.eval_steps = eval_steps
-      self.eval_interval = eval_interval
-
-      # Creates and initializes the interval triggers.
-      self.eval_trigger = utils.IntervalTrigger(self.eval_interval,
-                                                self.global_step.numpy())  # pytype: disable=attribute-error
-
-    if self.global_step:
-      tf.summary.experimental.set_step(self.global_step)
-
-    # Restores the model if needed.
-    if self.checkpoint_manager is not None:
-      model_restored = self._restore_model()
-      if not model_restored and self.checkpoint_manager.checkpoint_interval:
-        # If the model is not restored from a checkpoint, save an initial
-        # checkpoint.
-        ckpt_path = self.checkpoint_manager.save(
-            checkpoint_number=self.global_step)
-        logging.info("Saved checkpoins in %s", ckpt_path)
-
-  def _restore_model(self, checkpoint_path=None):
-    """Restore or initialize the model.
-
-    Args:
-      checkpoint_path: An optional string indicating the checkpoint path to
-        restore. If None, will restore from `self.checkpoint_manager`.
-
-    Returns:
-      True if the latest checkpoint is found or restored. Otherwise False.
-    """
-    with self.strategy.scope():
-      # Checkpoint restoring should be inside scope. b/139450638
-      if checkpoint_path is not None:
-        self.checkpoint_manager.checkpoint.restore(checkpoint_path)
-        return True
-      return self.checkpoint_manager.restore_or_initialize()
-
-  def _evaluate_once(self, current_step):
-    """Runs the evaluation once."""
-    logging.info("Start evaluation at step: %s", current_step)
-
-    with self.eval_summary_manager.summary_writer.as_default():
-      eval_outputs = self.eval_fn(self.eval_steps)
-
-    if eval_outputs:
-      eval_outputs = tf.nest.map_structure(lambda x: x.numpy(), eval_outputs)
-
-    info = "step: {}        evaluation metric: {}".format(
-        current_step, eval_outputs)
-    self._log_info(info)
-
-    self.eval_summary_manager.write_summaries(eval_outputs)
-    self.eval_summary_manager.flush()
-
-  def _maybe_save_checkpoints(self, current_step, force_trigger=False):
-    if self.checkpoint_manager and self.checkpoint_manager.checkpoint_interval:
-      ckpt_path = self.checkpoint_manager.save(
-          checkpoint_number=current_step, check_interval=not force_trigger)
-      if ckpt_path is not None:
-        logging.info("Saved checkpoins in %s", ckpt_path)
-
-  def _maybe_evaluate(self, current_step, force_trigger=False):
-    if self.eval_trigger(current_step, force_trigger):
-      self._evaluate_once(current_step)
-
-  def _log_info(self, message):
-    """Logs `message` to the `info` log, and also prints to stdout."""
-    logging.info(message)
-    print(message)
-
-  def train(self, evaluate=True):
-    """Runs the training, with optional evaluation.
-
-    This handles evaluation, gathering summaries, and saving checkpoints.
-
-    Args:
-      evaluate: A boolean indicating whether to perform evaluation during
-        training.
-
-    Raises:
-      RuntimeError: If `global_step` is not updated correctly in `train_fn`.
-    """
-    if self.train_fn is None:
-      raise ValueError("`self.train_fn` is required when calling `train` "
-                       "method.")
-    if self.global_step is None:
-      raise ValueError("`self.global_step` is required when calling `train` "
-                       "method.")
-    if evaluate and self.eval_fn is None:
-      raise ValueError("`self.eval_fn` is required when calling `train` method "
-                       "with `evaluate=True`")
-
-    step_timer = _StepTimer(self.global_step)
-    current_step = self.global_step.numpy()
-    logging.info("Train at step %s of %s", current_step, self.train_steps)
-    while current_step < self.train_steps:
-      # Calculates steps to run for the next train loop.
-      steps_per_loop = min(self.train_steps - current_step, self.steps_per_loop)
-      logging.info("Entering training loop with %s steps, at step %s of %s",
-                   steps_per_loop, current_step, self.train_steps)
-      current_step += steps_per_loop
-      steps_per_loop = tf.convert_to_tensor(steps_per_loop, dtype=tf.int32)
-
-      with self.summary_manager.summary_writer.as_default():
-        train_outputs = self.train_fn(steps_per_loop)
-
-      # Updates and verifies the current step after a training loop finishes.
-      if current_step != self.global_step.numpy():
-        raise RuntimeError("`self.train_fn` is not updating `global_step` "
-                           "correctly, expected: %s, actual: %s" %
-                           (current_step, self.global_step.numpy()))
-
-      # Print information like metrics and steps_per_second after a training
-      # loop.
-      if train_outputs:
-        train_outputs = tf.nest.map_structure(
-            lambda x: x.numpy(), train_outputs)
-      steps_per_second = step_timer.steps_per_second()
-      info = "step: {}        steps_per_second: {:.2f}        {}".format(
-          current_step, steps_per_second, train_outputs)
-      self._log_info(info)
-
-      train_outputs = train_outputs or {}
-      train_outputs["steps_per_second"] = steps_per_second
-      self.summary_manager.write_summaries(train_outputs)
-
-      self._maybe_save_checkpoints(current_step)
-
-      if evaluate:
-        self._maybe_evaluate(current_step)
-
-    self.summary_manager.write_summaries(train_outputs, always_write=True)
-    self.summary_manager.flush()
-    self._maybe_save_checkpoints(current_step, force_trigger=True)
-    if evaluate:
-      self._maybe_evaluate(current_step, force_trigger=True)
-
-  def evaluate(self, continuous=False, timeout_fn=None):
-    """Runs the evaluation.
-
-    Args:
-      continuous: If `True`, will continuously monitor the checkpoint directory
-        to evaluate on the latest checkpoint. If `False`, will do the evaluation
-        once.
-      timeout_fn: Optional callable to call after a timeout. If the function
-        returns True, then it means that no new checkpoints will be generated
-        and the iterator will exit.
-
-    Raises:
-      ValueError: If no checkpoint found in `self.checkpoint_manager.directory`.
-    """
-    if self.eval_fn is None:
-      raise ValueError("`self.eval_fn` should not be None to call "
-                       "`evaluate()` method.")
-
-    if not continuous and timeout_fn is not None:
-      raise ValueError("`timeout_fn` can be only passed when `continuous` is "
-                       "True")
-
-    if continuous:
-      for checkpoint_path in tf.train.checkpoints_iterator(
-          self.checkpoint_manager.directory, timeout_fn=timeout_fn):
-        self._restore_model(checkpoint_path)
-        self._evaluate_once(self.global_step.numpy())
-      return
-
-    latest_checkpoint = self.checkpoint_manager.latest_checkpoint
-    if not latest_checkpoint:
-      raise ValueError("no checkpoint found in dir %s" %
-                       self.checkpoint_manager.directory)
-    self._restore_model()
-    self._evaluate_once(self.global_step.numpy())
-
-
-class _StepTimer(object):
-  """Utility class for measuring steps/second."""
-
-  def __init__(self, step):
-    self.step = step
-    self.start()
-
-  def start(self):
-    self.last_iteration = self.step.numpy()
-    self.last_time = time.time()
-
-  def steps_per_second(self, restart=True):
-    value = ((self.step.numpy() - self.last_iteration) /
-             (time.time() - self.last_time))
-    if restart:
-      self.start()
-    return value
--- a/official/staging/training/controller_test.py
+++ b/official/staging/training/controller_test.py
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for official.staging.training.controller."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-
-from absl.testing import parameterized
-import numpy as np
-import tensorflow as tf
-
-from tensorflow.python.distribute import combinations
-from tensorflow.python.distribute import strategy_combinations
-from official.staging.training import controller
-from official.staging.training import standard_runnable
-
-
-def all_strategy_combinations():
-  """Gets combinations of distribution strategies."""
-  return combinations.combine(
-      strategy=[
-          strategy_combinations.one_device_strategy,
-          strategy_combinations.tpu_strategy,
-          strategy_combinations.one_device_strategy_gpu,
-          strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
-      ],
-      mode="eager",
-  )
-
-
-def create_model():
-  x = tf.keras.layers.Input(shape=(3,), name="input")
-  y = tf.keras.layers.Dense(4, name="dense")(x)
-  model = tf.keras.Model(x, y)
-  return model
-
-
-def summaries_with_matching_keyword(keyword, summary_dir):
-  """Yields summary protos matching given keyword from event file."""
-  event_paths = tf.io.gfile.glob(os.path.join(summary_dir, "events*"))
-  for event in tf.compat.v1.train.summary_iterator(event_paths[-1]):
-    if event.summary is not None:
-      for value in event.summary.value:
-        if keyword in value.tag:
-          tf.compat.v1.logging.error(event)
-          yield event.summary
-
-
-def check_eventfile_for_keyword(keyword, summary_dir):
-  """Checks event files for the keyword."""
-  return any(summaries_with_matching_keyword(keyword, summary_dir))
-
-
-def dataset_fn(ctx):
-  del ctx
-  inputs = np.zeros((10, 3), dtype=np.float32)
-  targets = np.zeros((10, 4), dtype=np.float32)
-  dataset = tf.data.Dataset.from_tensor_slices((inputs, targets))
-  dataset = dataset.repeat(100)
-  dataset = dataset.batch(10, drop_remainder=True)
-  return dataset
-
-
-class TestRunnable(standard_runnable.StandardTrainable,
-                   standard_runnable.StandardEvaluable):
-  """Implements the training and evaluation APIs for the test model."""
-
-  def __init__(self):
-    standard_runnable.StandardTrainable.__init__(self)
-    standard_runnable.StandardEvaluable.__init__(self)
-    self.strategy = tf.distribute.get_strategy()
-    self.model = create_model()
-    self.optimizer = tf.keras.optimizers.RMSprop()
-    self.global_step = self.optimizer.iterations
-    self.train_loss = tf.keras.metrics.Mean("train_loss", dtype=tf.float32)
-    self.eval_loss = tf.keras.metrics.Mean("eval_loss", dtype=tf.float32)
-
-  def build_train_dataset(self):
-    return self.strategy.experimental_distribute_datasets_from_function(
-        dataset_fn)
-
-  def train_step(self, iterator):
-
-    def _replicated_step(inputs):
-      """Replicated training step."""
-      inputs, targets = inputs
-      with tf.GradientTape() as tape:
-        outputs = self.model(inputs)
-        loss = tf.math.reduce_sum(outputs - targets)
-      grads = tape.gradient(loss, self.model.variables)
-      self.optimizer.apply_gradients(zip(grads, self.model.variables))
-      self.train_loss.update_state(loss)
-
-    self.strategy.run(_replicated_step, args=(next(iterator),))
-
-  def train_loop_end(self):
-    return {
-        "loss": self.train_loss.result(),
-    }
-
-  def build_eval_dataset(self):
-    return self.strategy.experimental_distribute_datasets_from_function(
-        dataset_fn)
-
-  def eval_begin(self):
-    self.eval_loss.reset_states()
-
-  def eval_step(self, iterator):
-
-    def _replicated_step(inputs):
-      """Replicated evaluation step."""
-      inputs, targets = inputs
-      outputs = self.model(inputs)
-      loss = tf.math.reduce_sum(outputs - targets)
-      self.eval_loss.update_state(loss)
-
-    self.strategy.run(_replicated_step, args=(next(iterator),))
-
-  def eval_end(self):
-    return {
-        "eval_loss": self.eval_loss.result(),
-    }
-
-
-class ControllerTest(tf.test.TestCase, parameterized.TestCase):
-
-  def setUp(self):
-    super(ControllerTest, self).setUp()
-    self.model_dir = self.get_temp_dir()
-
-  def test_no_checkpoint(self):
-    test_runnable = TestRunnable()
-    # No checkpoint manager and no strategy.
-    test_controller = controller.Controller(
-        train_fn=test_runnable.train,
-        eval_fn=test_runnable.evaluate,
-        global_step=test_runnable.global_step,
-        train_steps=10,
-        steps_per_loop=2,
-        summary_dir=os.path.join(self.model_dir, "summaries/train"),
-        summary_interval=2,
-        eval_summary_dir=os.path.join(self.model_dir, "summaries/eval"),
-        eval_steps=2,
-        eval_interval=5)
-    test_controller.train(evaluate=True)
-    self.assertEqual(test_runnable.global_step.numpy(), 10)
-    # Train and eval loss values should be written into summaries.
-    self.assertNotEmpty(
-        tf.io.gfile.listdir(os.path.join(self.model_dir, "summaries/train")))
-    self.assertTrue(
-        check_eventfile_for_keyword(
-            "loss", os.path.join(self.model_dir, "summaries/train")))
-    self.assertNotEmpty(
-        tf.io.gfile.listdir(os.path.join(self.model_dir, "summaries/eval")))
-    self.assertTrue(
-        check_eventfile_for_keyword(
-            "eval_loss", os.path.join(self.model_dir, "summaries/eval")))
-    # No checkpoint, so global step starts from 0.
-    test_runnable.global_step.assign(0)
-    test_controller.train(evaluate=True)
-    self.assertEqual(test_runnable.global_step.numpy(), 10)
-
-  def test_no_checkpoint_and_summaries(self):
-    test_runnable = TestRunnable()
-    # No checkpoint + summary directories.
-    test_controller = controller.Controller(
-        train_fn=test_runnable.train,
-        eval_fn=test_runnable.evaluate,
-        global_step=test_runnable.global_step,
-        train_steps=10,
-        steps_per_loop=2,
-        eval_steps=2,
-        eval_interval=5)
-    test_controller.train(evaluate=True)
-    self.assertEqual(test_runnable.global_step.numpy(), 10)
-
-  @combinations.generate(all_strategy_combinations())
-  def test_train_and_evaluate(self, strategy):
-    with strategy.scope():
-      test_runnable = TestRunnable()
-
-    checkpoint = tf.train.Checkpoint(
-        model=test_runnable.model, optimizer=test_runnable.optimizer)
-    checkpoint_manager = tf.train.CheckpointManager(
-        checkpoint,
-        self.model_dir,
-        max_to_keep=None,
-        step_counter=test_runnable.global_step,
-        checkpoint_interval=10)
-    test_controller = controller.Controller(
-        strategy=strategy,
-        train_fn=test_runnable.train,
-        eval_fn=test_runnable.evaluate,
-        global_step=test_runnable.global_step,
-        train_steps=10,
-        steps_per_loop=2,
-        summary_dir=os.path.join(self.model_dir, "summaries/train"),
-        summary_interval=2,
-        checkpoint_manager=checkpoint_manager,
-        eval_summary_dir=os.path.join(self.model_dir, "summaries/eval"),
-        eval_steps=2,
-        eval_interval=5)
-    test_controller.train(evaluate=True)
-
-    # Checkpoints are saved.
-    self.assertNotEmpty(tf.io.gfile.glob(os.path.join(self.model_dir, "ckpt*")))
-
-    # Train and eval loss values should be written into summaries.
-    self.assertNotEmpty(
-        tf.io.gfile.listdir(os.path.join(self.model_dir, "summaries/train")))
-    self.assertTrue(
-        check_eventfile_for_keyword(
-            "loss", os.path.join(self.model_dir, "summaries/train")))
-    self.assertNotEmpty(
-        tf.io.gfile.listdir(os.path.join(self.model_dir, "summaries/eval")))
-    self.assertTrue(
-        check_eventfile_for_keyword(
-            "eval_loss", os.path.join(self.model_dir, "summaries/eval")))
-
-  @combinations.generate(all_strategy_combinations())
-  def test_train_only(self, strategy):
-    with strategy.scope():
-      test_runnable = TestRunnable()
-
-    checkpoint = tf.train.Checkpoint(
-        model=test_runnable.model, optimizer=test_runnable.optimizer)
-    checkpoint_manager = tf.train.CheckpointManager(
-        checkpoint,
-        self.model_dir,
-        max_to_keep=None,
-        step_counter=test_runnable.global_step,
-        checkpoint_interval=10)
-    test_controller = controller.Controller(
-        strategy=strategy,
-        train_fn=test_runnable.train,
-        global_step=test_runnable.global_step,
-        train_steps=10,
-        steps_per_loop=2,
-        summary_dir=os.path.join(self.model_dir, "summaries/train"),
-        summary_interval=2,
-        checkpoint_manager=checkpoint_manager,
-        eval_summary_dir=os.path.join(self.model_dir, "summaries/eval"),
-    )
-    test_controller.train(evaluate=False)
-
-    # Checkpoints are saved.
-    self.assertNotEmpty(tf.io.gfile.glob(os.path.join(self.model_dir, "ckpt*")))
-
-    # Only train summaries are written.
-    self.assertNotEmpty(
-        tf.io.gfile.listdir(os.path.join(self.model_dir, "summaries/train")))
-    self.assertTrue(
-        check_eventfile_for_keyword(
-            "loss", os.path.join(self.model_dir, "summaries/train")))
-    self.assertFalse(
-        tf.io.gfile.exists(os.path.join(self.model_dir, "summaries/eval")))
-
-  @combinations.generate(all_strategy_combinations())
-  def test_evaluate_only(self, strategy):
-    with strategy.scope():
-      test_runnable = TestRunnable()
-
-    checkpoint = tf.train.Checkpoint(model=test_runnable.model)
-    checkpoint.save(os.path.join(self.model_dir, "ckpt"))
-
-    checkpoint_manager = tf.train.CheckpointManager(
-        checkpoint,
-        self.model_dir,
-        max_to_keep=None,
-        step_counter=test_runnable.global_step)
-    test_controller = controller.Controller(
-        strategy=strategy,
-        eval_fn=test_runnable.evaluate,
-        global_step=test_runnable.global_step,
-        checkpoint_manager=checkpoint_manager,
-        summary_dir=os.path.join(self.model_dir, "summaries/train"),
-        eval_summary_dir=os.path.join(self.model_dir, "summaries/eval"),
-        eval_steps=2,
-        eval_interval=5)
-    test_controller.evaluate()
-
-    # Only eval summaries are written
-    self.assertFalse(
-        tf.io.gfile.exists(os.path.join(self.model_dir, "summaries/train")))
-    self.assertNotEmpty(
-        tf.io.gfile.listdir(os.path.join(self.model_dir, "summaries/eval")))
-    self.assertTrue(
-        check_eventfile_for_keyword(
-            "eval_loss", os.path.join(self.model_dir, "summaries/eval")))
-
-
-if __name__ == "__main__":
-  tf.test.main()
--- a/official/staging/training/runnable.py
+++ b/official/staging/training/runnable.py
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""An abstraction that lets users easily handle their custom training loops."""
-
-from __future__ import absolute_import
-from __future__ import division
-# from __future__ import google_type_annotations
-from __future__ import print_function
-
-import abc
-import six
-import tensorflow.compat.v2 as tf
-from typing import Dict, Optional, Text
-
-
-@six.add_metaclass(abc.ABCMeta)
-class AbstractTrainable(tf.Module):
-  """An abstract class defining the APIs required for training."""
-
-  @abc.abstractmethod
-  def train(self,
-            num_steps: Optional[tf.Tensor]) -> Optional[Dict[Text, tf.Tensor]]:
-    """Implements model training with multiple steps.
-
-    In training, it is common to break the total training steps into several
-    training loops, so users can do checkpointing, write summaries and run some
-    python callbacks. This is necessary for getting good performance in TPU
-    training, as the overhead for launching a multi worker tf.function may be
-    large in Eager mode. It is usually encouraged to create a host training loop
-    (e.g. using a `tf.range` wrapping `strategy.run` inside a
-    `tf.function`) in the TPU case. For the cases that don't require host
-    training loop to achieve peak performance, users can just implement a simple
-    python loop to drive each step.
-
-    Args:
-      num_steps: A guideline for how many training steps to run. Note that it is
-        up to the model what constitutes a "step" (this may involve more than
-        one update to model parameters, e.g. if training a GAN).
-
-    Returns:
-      The function may return a dictionary of `Tensors`, which will be
-      written to logs and as TensorBoard summaries.
-    """
-    pass
-
-
-@six.add_metaclass(abc.ABCMeta)
-class AbstractEvaluable(tf.Module):
-  """An abstract class defining the APIs required for evaluation."""
-
-  @abc.abstractmethod
-  def evaluate(
-      self, num_steps: Optional[tf.Tensor]) -> Optional[Dict[Text, tf.Tensor]]:
-    """Implements model evaluation.
-
-    Args:
-      num_steps: A guideline for how many evaluation steps to run. Note that it
-        is up to the model what constitutes a "step". Generally, it may be
-        desirable to support both a limited number of eval steps and iterating
-        over a full dataset (however many steps are required) when `num_steps`
-        is `None`.
-
-    Returns:
-      The function may return a dictionary of `Tensors`, which will be
-      written to logs and as TensorBoard summaries.
-    """
-    pass
--- a/official/staging/training/standard_runnable.py
+++ b/official/staging/training/standard_runnable.py
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""An abstraction that lets users easily handle their custom training loops."""
-
-from __future__ import absolute_import
-from __future__ import division
-# from __future__ import google_type_annotations
-from __future__ import print_function
-
-import abc
-import six
-import tensorflow.compat.v2 as tf
-from typing import Dict, Optional, Text
-
-from official.staging.training import runnable
-from official.staging.training import utils
-
-
-@six.add_metaclass(abc.ABCMeta)
-class StandardTrainable(runnable.AbstractTrainable):
-  """Implements the standard functionality of AbstractTrainable APIs."""
-
-  def __init__(self, use_tf_while_loop=True, use_tf_function=True):
-    if use_tf_while_loop and not use_tf_function:
-      raise ValueError("`use_tf_while_loop=True` and `use_tf_function=False` "
-                       "is not supported")
-    self.use_tf_while_loop = use_tf_while_loop
-    self.use_tf_function = use_tf_function
-    self.train_dataset = None
-    self.train_iter = None
-    self.train_loop_fn = None
-
-  @abc.abstractmethod
-  def build_train_dataset(self):
-    """Builds the training datasets.
-
-    Returns:
-      A tf.nest-compatible structure of tf.data.Dataset or DistributedDataset.
-    """
-    pass
-
-  def train(self,
-            num_steps: Optional[tf.Tensor]) -> Optional[Dict[Text, tf.Tensor]]:
-    """See base class."""
-    if self.train_dataset is None:
-      # Build train input dataset
-      self.train_dataset = self.build_train_dataset()
-      self.train_iter = tf.nest.map_structure(iter, self.train_dataset)
-
-    if self.train_loop_fn is None:
-      train_fn = self.train_step
-      if self.use_tf_while_loop:
-        self.train_loop_fn = utils.create_tf_while_loop_fn(train_fn)
-      else:
-        if self.use_tf_function:
-          train_fn = tf.function(train_fn)
-        self.train_loop_fn = utils.create_loop_fn(train_fn)
-
-    self.train_loop_begin()
-    self.train_loop_fn(self.train_iter, num_steps)
-    return self.train_loop_end()
-
-  def train_loop_begin(self):
-    """Called once at the beginning of the training loop.
-
-    This is a good place to reset metrics that accumulate values over multiple
-    steps of training.
-    """
-    pass
-
-  @abc.abstractmethod
-  def train_step(self, iterator):
-    """Implements one step of training.
-
-    What a "step" consists of is up to the implementer. If using distribution
-    strategies, the call to this method should take place in the "cross-replica
-    context" for generality, to allow e.g. multiple iterator dequeues and calls
-    to `strategy.run`.
-
-    Args:
-      iterator: A tf.nest-compatible structure of tf.data Iterator or
-        DistributedIterator.
-    """
-    pass
-
-  def train_loop_end(self) -> Optional[Dict[Text, tf.Tensor]]:
-    """Called at the end of the training loop.
-
-    This is a good place to get metric results. The value returned from this
-    function will be returned as-is from the train() method.
-
-    Returns:
-      The function may return a dictionary of `Tensors`, which will be
-      written to logs and as TensorBoard summaries.
-    """
-    pass
-
-
-@six.add_metaclass(abc.ABCMeta)
-class StandardEvaluable(runnable.AbstractEvaluable):
-  """Implements the standard functionality of AbstractEvaluable APIs."""
-
-  def __init__(self, use_tf_function=True):
-    self.eval_use_tf_function = use_tf_function
-    self.eval_dataset = None
-    self.eval_loop_fn = None
-
-  @abc.abstractmethod
-  def build_eval_dataset(self):
-    """Builds the evaluation datasets.
-
-    Returns:
-      A tf.nest-compatible structure of tf.data.Dataset or DistributedDataset.
-    """
-    pass
-
-  def evaluate(
-      self, num_steps: Optional[tf.Tensor]) -> Optional[Dict[Text, tf.Tensor]]:
-    """See base class."""
-    if self.eval_dataset is None:
-      # Build train input dataset
-      self.eval_dataset = self.build_eval_dataset()
-
-    if self.eval_loop_fn is None:
-      eval_fn = self.eval_step
-      if self.eval_use_tf_function:
-        eval_fn = tf.function(eval_fn)
-      self.eval_loop_fn = utils.create_loop_fn(eval_fn)
-
-    eval_iter = tf.nest.map_structure(iter, self.eval_dataset)
-
-    self.eval_begin()
-    self.eval_loop_fn(eval_iter, num_steps)
-    return self.eval_end()
-
-  def eval_begin(self):
-    """Called once at the beginning of the evaluation.
-
-    This is a good place to reset metrics that accumulate values over the entire
-    evaluation.
-    """
-    pass
-
-  @abc.abstractmethod
-  def eval_step(self, iterator):
-    """Implements one step of evaluation.
-
-    What a "step" consists of is up to the implementer. If using distribution
-    strategies, the call to this method should take place in the "cross-replica
-    context" for generality, to allow e.g. multiple iterator dequeues and calls
-    to `strategy.run`.
-
-    Args:
-      iterator: A tf.nest-compatible structure of tf.data Iterator or
-        DistributedIterator.
-    """
-    pass
-
-  def eval_end(self) -> Optional[Dict[Text, tf.Tensor]]:
-    """Called at the end of the evaluation.
-
-    This is a good place to get metric results. The value returned from this
-    function will be returned as-is from the evaluate() method.
-
-    Returns:
-      The function may return a dictionary of `Tensors`, which will be
-      written to logs and as TensorBoard summaries.
-    """
-    pass
--- a/official/staging/training/utils.py
+++ b/official/staging/training/utils.py
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Some layered modules/functions to help users writing custom training loop."""
-
-from __future__ import absolute_import
-from __future__ import division
-# from __future__ import google_type_annotations
-from __future__ import print_function
-
-import abc
-import inspect
-import six
-
-import tensorflow.compat.v2 as tf
-
-
-def create_loop_fn(step_fn):
-  """Creates a multiple steps function driven by the python while loop.
-
-  Args:
-    step_fn: A function which takes `iterator` as input.
-
-  Returns:
-    A callable defined as the `loop_fn` defination below.
-  """
-
-  def loop_fn(iterator, num_steps, state=None, reduce_fn=None):
-    """A loop function with multiple steps.
-
-    Args:
-      iterator: A nested structure of tf.data `Iterator` or
-        `DistributedIterator`.
-      num_steps: The number of steps in the loop. If `num_steps==-1`, will
-        iterate until exausting the iterator.
-      state: An optional initial state before running the loop.
-      reduce_fn: a callable defined as `def reduce_fn(state, value)`, where
-        `value` is the outputs from `step_fn`.
-
-    Returns:
-      The updated state.
-    """
-    try:
-      step = 0
-      # To make sure the OutOfRangeError exception can be handled well with
-      # async remote eager, we need to wrap the loop body in a `async_scope`.
-      with tf.experimental.async_scope():
-        while (num_steps == -1 or step < num_steps):
-          outputs = step_fn(iterator)
-          if reduce_fn is not None:
-            state = reduce_fn(state, outputs)
-          step += 1
-        return state
-    except (StopIteration, tf.errors.OutOfRangeError):
-      tf.experimental.async_clear_error()
-      return state
-
-  return loop_fn
-
-
-def create_tf_while_loop_fn(step_fn):
-  """Create a multiple steps function driven by tf.while_loop on the host.
-
-  Args:
-    step_fn: A function which takes `iterator` as input.
-
-  Returns:
-    A callable defined as the `loop_fn` defination below.
-  """
-
-  @tf.function
-  def loop_fn(iterator, num_steps):
-    """A loop function with multiple steps.
-
-    Args:
-      iterator: A nested structure of tf.data `Iterator` or
-        `DistributedIterator`.
-      num_steps: The number of steps in the loop. Must be a tf.Tensor.
-    """
-    if not isinstance(num_steps, tf.Tensor):
-      raise ValueError("`num_steps` should be an `tf.Tensor`. Python object "
-                       "may cause retracing.")
-
-    for _ in tf.range(num_steps):
-      step_fn(iterator)
-
-  return loop_fn
-
-
-def make_distributed_dataset(strategy, dataset_or_fn, *args, **kwargs):
-  """A helper function to create distributed dataset.
-
-  Args:
-    strategy: An instance of `tf.distribute.Strategy`.
-    dataset_or_fn: A instance of `tf.data.Dataset` or a function which takes an
-      `tf.distribute.InputContext` as input and returns a `tf.data.Dataset`. If
-      it is a function, it could optionally have an argument named
-      `input_context` which is `tf.distribute.InputContext` argument type.
-    *args: The list of arguments to be passed to dataset_or_fn.
-    **kwargs: Any keyword arguments to be passed.
-
-  Returns:
-    A distributed Dataset.
-  """
-  if strategy is None:
-    strategy = tf.distribute.get_strategy()
-
-  if isinstance(dataset_or_fn, tf.data.Dataset):
-    return strategy.experimental_distribute_dataset(dataset_or_fn)
-
-  if not callable(dataset_or_fn):
-    raise ValueError("`dataset_or_fn` should be either callable or an instance "
-                     "of `tf.data.Dataset`")
-
-  def dataset_fn(ctx):
-    """Wrapped dataset function for creating distributed dataset.."""
-
-    # If `dataset_or_fn` is a function and has `input_context` as argument
-    # names, pass `ctx` as the value of `input_context` when calling
-    # `dataset_or_fn`. Otherwise `ctx` will not be used when calling
-    # `dataset_or_fn`.
-    if six.PY3:
-      argspec = inspect.getfullargspec(dataset_or_fn)
-    else:
-      argspec = inspect.getargspec(dataset_or_fn)
-    args_names = argspec.args
-
-    if "input_context" in args_names:
-      kwargs["input_context"] = ctx
-    ds = dataset_or_fn(*args, **kwargs)
-    return ds
-
-  return strategy.experimental_distribute_datasets_from_function(dataset_fn)
-
-
-class SummaryManager(object):
-  """A class manages writing summaries."""
-
-  def __init__(self,
-               summary_writer,
-               summary_fn,
-               global_step=None,
-               summary_interval=None):
-    """Construct a summary manager object.
-
-    Args:
-      summary_writer: A `tf.summary.SummaryWriter` instance for writing
-        summaries.
-      summary_fn: A callable defined as `def summary_fn(name, tensor,
-        step=None)`, which describes the summary operation.
-      global_step: A `tf.Variable` instance for checking the current global step
-        value, in case users want to save summaries every N steps.
-      summary_interval: An integer, indicates the minimum step interval between
-        two summaries.
-    """
-    if summary_writer is not None:
-      self._summary_writer = summary_writer
-      self._enabled = True
-    else:
-      self._summary_writer = tf.summary.create_noop_writer()
-      self._enabled = False
-    self._summary_fn = summary_fn
-
-    if global_step is None:
-      self._global_step = tf.summary.experimental.get_step()
-    else:
-      self._global_step = global_step
-
-    if summary_interval is not None:
-      if self._global_step is None:
-        raise ValueError("`summary_interval` is not None, but no `global_step` "
-                         "can be obtained ")
-      self._last_summary_step = self._global_step.numpy()
-    self._summary_interval = summary_interval
-
-  @property
-  def summary_interval(self):
-    return self._summary_interval
-
-  @property
-  def summary_writer(self):
-    """Returns the underlying summary writer."""
-    return self._summary_writer
-
-  def flush(self):
-    """Flush the underlying summary writer."""
-    if self._enabled:
-      tf.summary.flush(self._summary_writer)
-
-  def write_summaries(self, items, always_write=True):
-    """Write a bulk of summaries.
-
-    Args:
-      items: a dictionary of `Tensors` for writing summaries.
-      always_write: An optional boolean. If `True`, the manager will always
-        write summaries unless the summaries have been written for the same
-        step. Otherwise the manager will only write the summaries if the
-        interval between summaries are larger than `summary_interval`.
-
-    Returns:
-      A boolean indicates whether the summaries are written or not.
-    """
-    # TODO(rxsang): Support writing summaries with nested structure, so users
-    # can split the summaries into different directories for nicer visualization
-    # in Tensorboard, like train and eval metrics.
-    if not self._enabled:
-      return False
-
-    if self._summary_interval is not None:
-      current_step = self._global_step.numpy()
-      if current_step == self._last_summary_step:
-        return False
-      if not always_write and current_step < (self._last_summary_step +
-                                              self._summary_interval):
-        return False
-      self._last_summary_step = current_step
-
-    with self._summary_writer.as_default():
-      for name, tensor in items.items():
-        self._summary_fn(name, tensor, step=self._global_step)
-    return True
-
-
-@six.add_metaclass(abc.ABCMeta)
-class Trigger(object):
-  """An abstract class representing a "trigger" for some event."""
-
-  @abc.abstractmethod
-  def __call__(self, value: float, force_trigger=False):
-    """Maybe trigger the event based on the given value.
-
-    Args:
-      value: the value for triggering.
-      force_trigger: Whether the trigger is forced triggered.
-
-    Returns:
-      `True` if the trigger is triggered on the given `value`, and
-      `False` otherwise.
-    """
-
-  @abc.abstractmethod
-  def reset(self):
-    """Reset states in the trigger."""
-
-
-class IntervalTrigger(Trigger):
-  """Triggers on every fixed interval."""
-
-  def __init__(self, interval, start=0):
-    """Constructs the IntervalTrigger.
-
-    Args:
-      interval: The triggering interval.
-      start: An initial value for the trigger.
-    """
-    self._interval = interval
-    self._last_trigger_value = start
-
-  def __call__(self, value, force_trigger=False):
-    """Maybe trigger the event based on the given value.
-
-    Args:
-      value: the value for triggering.
-      force_trigger: If True, the trigger will be forced triggered unless the
-        last trigger value is equal to `value`.
-
-    Returns:
-      `True` if the trigger is triggered on the given `value`, and
-      `False` otherwise.
-    """
-    if force_trigger and value != self._last_trigger_value:
-      self._last_trigger_value = value
-      return True
-
-    if self._interval and self._interval > 0:
-      if value >= self._last_trigger_value + self._interval:
-        self._last_trigger_value = value
-        return True
-    return False
-
-  def reset(self):
-    """See base class."""
-    self._last_trigger_value = 0
-
-
-class EpochHelper(object):
-  """A Helper class to handle epochs in Customized Training Loop."""
-
-  def __init__(self, epoch_steps, global_step):
-    """Constructs the EpochHelper.
-
-    Args:
-      epoch_steps: An integer indicates how many steps in an epoch.
-      global_step: A `tf.Variable` instance indicates the current global step.
-    """
-    self._epoch_steps = epoch_steps
-    self._global_step = global_step
-    self._current_epoch = None
-    self._epoch_start_step = None
-    self._in_epoch = False
-
-  def epoch_begin(self):
-    """Returns whether a new epoch should begin."""
-    if self._in_epoch:
-      return False
-    current_step = self._global_step.numpy()
-    self._epoch_start_step = current_step
-    self._current_epoch = current_step // self._epoch_steps
-    self._in_epoch = True
-    return True
-
-  def epoch_end(self):
-    """Returns whether the current epoch should end."""
-    if not self._in_epoch:
-      raise ValueError("`epoch_end` can only be called inside an epoch")
-    current_step = self._global_step.numpy()
-    epoch = current_step // self._epoch_steps
-
-    if epoch > self._current_epoch:
-      self._in_epoch = False
-      return True
-    return False
-
-  @property
-  def batch_index(self):
-    """Index of the next batch within the current epoch."""
-    return self._global_step.numpy() - self._epoch_start_step
-
-  @property
-  def current_epoch(self):
-    return self._current_epoch
--- a/official/vision/image_classification/resnet/resnet_ctl_imagenet_main.py
+++ b/official/vision/image_classification/resnet/resnet_ctl_imagenet_main.py
@@ -14,18 +14,16 @@
 # ==============================================================================
 """Runs a ResNet model on the ImageNet dataset using custom training loops."""

-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
 import math
+import os
+
 from absl import app
 from absl import flags
 from absl import logging
+import orbit
 import tensorflow as tf

 from official.modeling import performance
-from official.staging.training import controller
 from official.utils.flags import core as flags_core
 from official.utils.misc import distribution_utils
 from official.utils.misc import keras_utils
@@ -87,15 +85,6 @@ def get_num_train_iterations(flags_obj):
  return train_steps, train_epochs, eval_steps


-def _steps_to_run(steps_in_current_epoch, steps_per_epoch, steps_per_loop):
-  """Calculates steps to run on device."""
-  if steps_per_loop <= 0:
-    raise ValueError('steps_per_loop should be positive integer.')
-  if steps_per_loop == 1:
-    return steps_per_loop
-  return min(steps_per_loop, steps_per_epoch - steps_in_current_epoch)
-
-
 def run(flags_obj):
  """Run ResNet ImageNet training and eval loop using custom training loops.

@@ -121,7 +110,6 @@ def run(flags_obj):
          datasets_num_private_threads=flags_obj.datasets_num_private_threads)
    common.set_cudnn_batchnorm_mode()

-  # TODO(anj-s): Set data_format without using Keras.
  data_format = flags_obj.data_format
  if data_format is None:
    data_format = ('channels_first' if tf.config.list_physical_devices('GPU')
@@ -137,7 +125,14 @@ def run(flags_obj):

  per_epoch_steps, train_epochs, eval_steps = get_num_train_iterations(
      flags_obj)
-  steps_per_loop = min(flags_obj.steps_per_loop, per_epoch_steps)
+  if flags_obj.steps_per_loop is None:
+    steps_per_loop = per_epoch_steps
+  elif flags_obj.steps_per_loop > per_epoch_steps:
+    steps_per_loop = per_epoch_steps
+    logging.warn('Setting steps_per_loop to %d to respect epoch boundary.',
+                 steps_per_loop)
+  else:
+    steps_per_loop = flags_obj.steps_per_loop

  logging.info(
      'Training %d epochs, each epoch has %d steps, '
@@ -154,8 +149,8 @@ def run(flags_obj):

  eval_interval = flags_obj.epochs_between_evals * per_epoch_steps
  checkpoint_interval = (
-      per_epoch_steps if flags_obj.enable_checkpoint_and_export else None)
-  summary_interval = per_epoch_steps if flags_obj.enable_tensorboard else None
+      steps_per_loop * 5 if flags_obj.enable_checkpoint_and_export else None)
+  summary_interval = steps_per_loop if flags_obj.enable_tensorboard else None

  checkpoint_manager = tf.train.CheckpointManager(
      runnable.checkpoint,
@@ -164,20 +159,24 @@ def run(flags_obj):
      step_counter=runnable.global_step,
      checkpoint_interval=checkpoint_interval)

-  resnet_controller = controller.Controller(
+  resnet_controller = orbit.Controller(
      strategy,
-      runnable.train,
-      runnable.evaluate if not flags_obj.skip_eval else None,
+      runnable,
+      runnable if not flags_obj.skip_eval else None,
      global_step=runnable.global_step,
      steps_per_loop=steps_per_loop,
-      train_steps=per_epoch_steps * train_epochs,
      checkpoint_manager=checkpoint_manager,
      summary_interval=summary_interval,
-      eval_steps=eval_steps,
-      eval_interval=eval_interval)
+      eval_summary_dir=os.path.join(flags_obj.model_dir, 'eval'))

  time_callback.on_train_begin()
-  resnet_controller.train(evaluate=not flags_obj.skip_eval)
+  if not flags_obj.skip_eval:
+    resnet_controller.train_and_evaluate(
+        train_steps=per_epoch_steps * train_epochs,
+        eval_steps=eval_steps,
+        eval_interval=eval_interval)
+  else:
+    resnet_controller.train(steps=per_epoch_steps * train_epochs)
  time_callback.on_train_end()

  stats = build_stats(runnable, time_callback)

--- a/official/vision/image_classification/resnet/resnet_runnable.py
+++ b/official/vision/image_classification/resnet/resnet_runnable.py
@@ -14,33 +14,21 @@
 # ==============================================================================
 """Runs a ResNet model on the ImageNet dataset using custom training loops."""

-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
+import orbit
 import tensorflow as tf

 from official.modeling import performance
 from official.staging.training import grad_utils
-from official.staging.training import standard_runnable
-from official.staging.training import utils
 from official.utils.flags import core as flags_core
 from official.vision.image_classification.resnet import common
 from official.vision.image_classification.resnet import imagenet_preprocessing
 from official.vision.image_classification.resnet import resnet_model


-class ResnetRunnable(standard_runnable.StandardTrainable,
-                     standard_runnable.StandardEvaluable):
+class ResnetRunnable(orbit.StandardTrainer, orbit.StandardEvaluator):
  """Implements the training and evaluation APIs for Resnet model."""

  def __init__(self, flags_obj, time_callback, epoch_steps):
-    standard_runnable.StandardTrainable.__init__(self,
-                                                 flags_obj.use_tf_while_loop,
-                                                 flags_obj.use_tf_function)
-    standard_runnable.StandardEvaluable.__init__(self,
-                                                 flags_obj.use_tf_function)
-
    self.strategy = tf.distribute.get_strategy()
    self.flags_obj = flags_obj
    self.dtype = flags_core.get_tf_dtype(flags_obj)
@@ -107,11 +95,8 @@ class ResnetRunnable(standard_runnable.StandardTrainable,

    # Handling epochs.
    self.epoch_steps = epoch_steps
-    self.epoch_helper = utils.EpochHelper(epoch_steps, self.global_step)
-
-  def build_train_dataset(self):
-    """See base class."""
-    return utils.make_distributed_dataset(
+    self.epoch_helper = orbit.utils.EpochHelper(epoch_steps, self.global_step)
+    train_dataset = orbit.utils.make_distributed_dataset(
        self.strategy,
        self.input_fn,
        is_training=True,
@@ -122,10 +107,11 @@ class ResnetRunnable(standard_runnable.StandardTrainable,
        .datasets_num_private_threads,
        dtype=self.dtype,
        drop_remainder=True)
-
-  def build_eval_dataset(self):
-    """See base class."""
-    return utils.make_distributed_dataset(
+    orbit.StandardTrainer.__init__(self, train_dataset,
+                                   flags_obj.use_tf_while_loop,
+                                   flags_obj.use_tf_function)
+    if not flags_obj.skip_eval:
+      eval_dataset = orbit.utils.make_distributed_dataset(
          self.strategy,
          self.input_fn,
          is_training=False,
@@ -133,6 +119,8 @@ class ResnetRunnable(standard_runnable.StandardTrainable,
          batch_size=self.batch_size,
          parse_record_fn=imagenet_preprocessing.parse_record,
          dtype=self.dtype)
+      orbit.StandardEvaluator.__init__(self, eval_dataset,
+                                       flags_obj.use_tf_function)

  def train_loop_begin(self):
    """See base class."""